LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - csr2022.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 59 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 15 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             :  **********************************************************************
       5             :  *   Copyright (C) 2005-2016, International Business Machines
       6             :  *   Corporation and others.  All Rights Reserved.
       7             :  **********************************************************************
       8             :  */
       9             : 
      10             : #include "unicode/utypes.h"
      11             : 
      12             : #if !UCONFIG_NO_CONVERSION
      13             : 
      14             : #include "cmemory.h"
      15             : #include "cstring.h"
      16             : 
      17             : #include "csr2022.h"
      18             : #include "csmatch.h"
      19             : 
      20             : U_NAMESPACE_BEGIN
      21             : 
      22             : /**
      23             :  * Matching function shared among the 2022 detectors JP, CN and KR
      24             :  * Counts up the number of legal and unrecognized escape sequences in
      25             :  * the sample of text, and computes a score based on the total number &
      26             :  * the proportion that fit the encoding.
      27             :  * 
      28             :  * 
      29             :  * @param text the byte buffer containing text to analyse
      30             :  * @param textLen  the size of the text, in bytes.
      31             :  * @param escapeSequences the byte escape sequences to test for (escapeSequences_length entries).
      32             :  * @return match quality, in the range of 0-100.
      33             :  */
      34           0 : int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) const
      35             : {
      36             :     int32_t i, j;
      37             :     int32_t escN;
      38           0 :     int32_t hits   = 0;
      39           0 :     int32_t misses = 0;
      40           0 :     int32_t shifts = 0;
      41             :     int32_t quality;
      42             : 
      43           0 :     i = 0;
      44           0 :     while(i < textLen) {
      45           0 :         if(text[i] == 0x1B) {
      46           0 :             escN = 0;
      47           0 :             while(escN < escapeSequences_length) {
      48           0 :                 const uint8_t *seq = escapeSequences[escN];
      49           0 :                 int32_t seq_length = (int32_t)uprv_strlen((const char *) seq);
      50             : 
      51           0 :                 if (textLen-i >= seq_length) {
      52           0 :                     j = 1;
      53           0 :                     while(j < seq_length) {
      54           0 :                         if(seq[j] != text[i+j]) {
      55           0 :                             goto checkEscapes;
      56             :                         }
      57             : 
      58           0 :                         j += 1;
      59             :                     }
      60             : 
      61           0 :                     hits += 1;
      62           0 :                     i += seq_length-1;
      63           0 :                     goto scanInput;
      64             :                 }
      65             :                 // else we ran out of string to compare this time.
      66             : checkEscapes:
      67           0 :                 escN += 1;
      68             :             }
      69             : 
      70           0 :             misses += 1;
      71             :         }
      72             : 
      73           0 :         if( text[i]== 0x0e || text[i] == 0x0f){
      74           0 :             shifts += 1;
      75             :         }
      76             : 
      77             : scanInput:
      78           0 :         i += 1;
      79             :     }
      80             : 
      81           0 :     if (hits == 0) {
      82           0 :         return 0;
      83             :     }
      84             : 
      85             :     //
      86             :     // Initial quality is based on relative proportion of recognized vs.
      87             :     //   unrecognized escape sequences. 
      88             :     //   All good:  quality = 100;
      89             :     //   half or less good: quality = 0;
      90             :     //   linear in between.
      91           0 :     quality = (100*hits - 100*misses) / (hits + misses);
      92             : 
      93             :     // Back off quality if there were too few escape sequences seen.
      94             :     //   Include shifts in this computation, so that KR does not get penalized
      95             :     //   for having only a single Escape sequence, but many shifts.
      96           0 :     if (hits+shifts < 5) {
      97           0 :         quality -= (5-(hits+shifts))*10;
      98             :     }
      99             : 
     100           0 :     if (quality < 0) {
     101           0 :         quality = 0;
     102             :     }
     103             : 
     104           0 :     return quality;
     105             : }
     106             : 
     107             : 
     108             : static const uint8_t escapeSequences_2022JP[][5] = {
     109             :     {0x1b, 0x24, 0x28, 0x43, 0x00},   // KS X 1001:1992
     110             :     {0x1b, 0x24, 0x28, 0x44, 0x00},   // JIS X 212-1990
     111             :     {0x1b, 0x24, 0x40, 0x00, 0x00},   // JIS C 6226-1978
     112             :     {0x1b, 0x24, 0x41, 0x00, 0x00},   // GB 2312-80
     113             :     {0x1b, 0x24, 0x42, 0x00, 0x00},   // JIS X 208-1983
     114             :     {0x1b, 0x26, 0x40, 0x00, 0x00},   // JIS X 208 1990, 1997
     115             :     {0x1b, 0x28, 0x42, 0x00, 0x00},   // ASCII
     116             :     {0x1b, 0x28, 0x48, 0x00, 0x00},   // JIS-Roman
     117             :     {0x1b, 0x28, 0x49, 0x00, 0x00},   // Half-width katakana
     118             :     {0x1b, 0x28, 0x4a, 0x00, 0x00},   // JIS-Roman
     119             :     {0x1b, 0x2e, 0x41, 0x00, 0x00},   // ISO 8859-1
     120             :     {0x1b, 0x2e, 0x46, 0x00, 0x00}    // ISO 8859-7
     121             : };
     122             : 
     123             : #if !UCONFIG_ONLY_HTML_CONVERSION
     124             : static const uint8_t escapeSequences_2022KR[][5] = {
     125             :     {0x1b, 0x24, 0x29, 0x43, 0x00}   
     126             : };
     127             : 
     128             : static const uint8_t escapeSequences_2022CN[][5] = {
     129             :     {0x1b, 0x24, 0x29, 0x41, 0x00},   // GB 2312-80
     130             :     {0x1b, 0x24, 0x29, 0x47, 0x00},   // CNS 11643-1992 Plane 1
     131             :     {0x1b, 0x24, 0x2A, 0x48, 0x00},   // CNS 11643-1992 Plane 2
     132             :     {0x1b, 0x24, 0x29, 0x45, 0x00},   // ISO-IR-165
     133             :     {0x1b, 0x24, 0x2B, 0x49, 0x00},   // CNS 11643-1992 Plane 3
     134             :     {0x1b, 0x24, 0x2B, 0x4A, 0x00},   // CNS 11643-1992 Plane 4
     135             :     {0x1b, 0x24, 0x2B, 0x4B, 0x00},   // CNS 11643-1992 Plane 5
     136             :     {0x1b, 0x24, 0x2B, 0x4C, 0x00},   // CNS 11643-1992 Plane 6
     137             :     {0x1b, 0x24, 0x2B, 0x4D, 0x00},   // CNS 11643-1992 Plane 7
     138             :     {0x1b, 0x4e, 0x00, 0x00, 0x00},   // SS2
     139             :     {0x1b, 0x4f, 0x00, 0x00, 0x00},   // SS3
     140             : };
     141             : #endif
     142             : 
     143           0 : CharsetRecog_2022JP::~CharsetRecog_2022JP() {}
     144             : 
     145           0 : const char *CharsetRecog_2022JP::getName() const {
     146           0 :     return "ISO-2022-JP";
     147             : }
     148             : 
     149           0 : UBool CharsetRecog_2022JP::match(InputText *textIn, CharsetMatch *results) const {
     150           0 :     int32_t confidence = match_2022(textIn->fInputBytes, 
     151             :                                     textIn->fInputLen, 
     152             :                                     escapeSequences_2022JP, 
     153           0 :                                     UPRV_LENGTHOF(escapeSequences_2022JP));
     154           0 :     results->set(textIn, this, confidence);
     155           0 :     return (confidence > 0);
     156             : }
     157             : 
     158             : #if !UCONFIG_ONLY_HTML_CONVERSION
     159           0 : CharsetRecog_2022KR::~CharsetRecog_2022KR() {}
     160             : 
     161           0 : const char *CharsetRecog_2022KR::getName() const {
     162           0 :     return "ISO-2022-KR";
     163             : }
     164             : 
     165           0 : UBool CharsetRecog_2022KR::match(InputText *textIn, CharsetMatch *results) const {
     166           0 :     int32_t confidence = match_2022(textIn->fInputBytes, 
     167             :                                     textIn->fInputLen, 
     168             :                                     escapeSequences_2022KR, 
     169           0 :                                     UPRV_LENGTHOF(escapeSequences_2022KR));
     170           0 :     results->set(textIn, this, confidence);
     171           0 :     return (confidence > 0);
     172             : }
     173             : 
     174           0 : CharsetRecog_2022CN::~CharsetRecog_2022CN() {}
     175             : 
     176           0 : const char *CharsetRecog_2022CN::getName() const {
     177           0 :     return "ISO-2022-CN";
     178             : }
     179             : 
     180           0 : UBool CharsetRecog_2022CN::match(InputText *textIn, CharsetMatch *results) const {
     181           0 :     int32_t confidence = match_2022(textIn->fInputBytes,
     182             :                                     textIn->fInputLen,
     183             :                                     escapeSequences_2022CN,
     184           0 :                                     UPRV_LENGTHOF(escapeSequences_2022CN));
     185           0 :     results->set(textIn, this, confidence);
     186           0 :     return (confidence > 0);
     187             : }
     188             : #endif
     189             : 
     190           0 : CharsetRecog_2022::~CharsetRecog_2022() {
     191             :     // nothing to do
     192           0 : }
     193             : 
     194             : U_NAMESPACE_END
     195             : #endif

Generated by: LCOV version 1.13