LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - csrmbcs.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 195 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 37 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             :  **********************************************************************
       5             :  *   Copyright (C) 2005-2016, International Business Machines
       6             :  *   Corporation and others.  All Rights Reserved.
       7             :  **********************************************************************
       8             :  */
       9             : 
      10             : #include "unicode/utypes.h"
      11             : 
      12             : #if !UCONFIG_NO_CONVERSION
      13             : 
      14             : #include "cmemory.h"
      15             : #include "csmatch.h"
      16             : #include "csrmbcs.h"
      17             : 
      18             : #include <math.h>
      19             : 
      20             : U_NAMESPACE_BEGIN
      21             : 
      22             : #define min(x,y) (((x)<(y))?(x):(y))
      23             : 
      24             : static const uint16_t commonChars_sjis [] = {  // Common Shift_JIS double-byte values, in ascending order (searched with binarySearch).
      25             : // TODO:  This set of data comes from the character frequency-
      26             : //        of-occurrence analysis tool.  The data needs to be moved
      27             : //        into a resource and loaded from there.
      28             : 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
      29             : 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
      30             : 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
      31             : 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
      32             : 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
      33             : 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
      34             : 
      35             : static const uint16_t commonChars_euc_jp[] = {  // Common EUC-JP double-byte values, in ascending order (searched with binarySearch).
      36             : // TODO:  This set of data comes from the character frequency-
      37             : //        of-occurrence analysis tool.  The data needs to be moved
      38             : //        into a resource and loaded from there.
      39             : 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
      40             : 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
      41             : 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
      42             : 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
      43             : 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
      44             : 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
      45             : 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
      46             : 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
      47             : 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
      48             : 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
      49             : 
      50             : static const uint16_t commonChars_euc_kr[] = {  // Common EUC-KR double-byte values, in ascending order (searched with binarySearch).
      51             : // TODO:  This set of data comes from the character frequency-
      52             : //        of-occurrence analysis tool.  The data needs to be moved
      53             : //        into a resource and loaded from there.
      54             : 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
      55             : 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
      56             : 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
      57             : 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
      58             : 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
      59             : 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
      60             : 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
      61             : 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
      62             : 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
      63             : 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
      64             : 
      65             : static const uint16_t commonChars_big5[] = {  // Common Big5 double-byte values, in ascending order (searched with binarySearch).
      66             : // TODO:  This set of data comes from the character frequency-
      67             : //        of-occurrence analysis tool.  The data needs to be moved
      68             : //        into a resource and loaded from there.
      69             : 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
      70             : 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
      71             : 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
      72             : 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
      73             : 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
      74             : 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
      75             : 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
      76             : 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
      77             : 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
      78             : 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
      79             : 
      80             : static const uint16_t commonChars_gb_18030[] = {  // Common GB18030 double-byte values, in ascending order (searched with binarySearch).
      81             : // TODO:  This set of data comes from the character frequency-
      82             : //        of-occurrence analysis tool.  The data needs to be moved
      83             : //        into a resource and loaded from there.
      84             : 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
      85             : 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
      86             : 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
      87             : 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
      88             : 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
      89             : 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
      90             : 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
      91             : 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
      92             : 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
      93             : 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
      94             : 
      /**
       * Looks up a 16-bit code value in a table sorted in ascending order.
       *
       * @param array  Table to search; entries must be in ascending order.
       * @param len    Number of entries in array.
       * @param value  Code value to look for.
       * @return Index of a matching entry, or -1 when the value is absent
       *         (including when the table is null or empty).
       */
      static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
      {
          if (array == 0 || len <= 0) {
              return -1;
          }

          int32_t start = 0;
          int32_t end = len - 1;

          while (start <= end) {
              // start + (end - start) / 2 cannot overflow, unlike (start + end) / 2.
              const int32_t mid = start + (end - start) / 2;

              if (array[mid] == value) {
                  return mid;
              }

              if (array[mid] < value) {
                  start = mid + 1;
              } else {
                  end = mid - 1;
              }
          }

          return -1;
      }
     116             : 
     117           0 : IteratedChar::IteratedChar() : 
     118           0 : charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
     119             : {
     120             :     // nothing else to do.
     121           0 : }
     122             : 
     123             : /*void IteratedChar::reset()
     124             : {
     125             :     charValue = 0;
     126             :     index     = -1;
     127             :     nextIndex = 0;
     128             :     error     = FALSE;
     129             :     done      = FALSE;
     130             : }*/
     131             : 
     132           0 : int32_t IteratedChar::nextByte(InputText *det)
     133             : {
     134           0 :     if (nextIndex >= det->fRawLength) {
     135           0 :         done = TRUE;
     136             : 
     137           0 :         return -1;
     138             :     }
     139             : 
     140           0 :     return det->fRawInput[nextIndex++];
     141             : }
     142             : 
     143           0 : CharsetRecog_mbcs::~CharsetRecog_mbcs()
     144             : {
     145             :     // nothing to do.
     146           0 : }
     147             : 
     148           0 : int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
     149           0 :     int32_t singleByteCharCount = 0;
     150           0 :     int32_t doubleByteCharCount = 0;
     151           0 :     int32_t commonCharCount     = 0;
     152           0 :     int32_t badCharCount        = 0;
     153           0 :     int32_t totalCharCount      = 0;
     154           0 :     int32_t confidence          = 0;
     155           0 :     IteratedChar iter;
     156             : 
     157           0 :     while (nextChar(&iter, det)) {
     158           0 :         totalCharCount++;
     159             : 
     160           0 :         if (iter.error) {
     161           0 :             badCharCount++;
     162             :         } else {
     163           0 :             if (iter.charValue <= 0xFF) {
     164           0 :                 singleByteCharCount++;
     165             :             } else {
     166           0 :                 doubleByteCharCount++;
     167             : 
     168           0 :                 if (commonChars != 0) {
     169           0 :                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
     170           0 :                         commonCharCount += 1;
     171             :                     }
     172             :                 }
     173             :             }
     174             :         }
     175             : 
     176             : 
     177           0 :         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
     178             :             // Bail out early if the byte data is not matching the encoding scheme.
     179             :             // break detectBlock;
     180           0 :             return confidence;
     181             :         }
     182             :     }
     183             : 
     184           0 :     if (doubleByteCharCount <= 10 && badCharCount == 0) {
     185             :         // Not many multi-byte chars.
     186           0 :         if (doubleByteCharCount == 0 && totalCharCount < 10) {
     187             :             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
     188             :             // We don't have enough data to have any confidence.
     189             :             // Statistical analysis of single byte non-ASCII charcters would probably help here.
     190           0 :             confidence = 0;
     191             :         }
     192             :         else {
     193             :             //   ASCII or ISO file?  It's probably not our encoding,
     194             :             //   but is not incompatible with our encoding, so don't give it a zero.
     195           0 :             confidence = 10;
     196             :         }
     197             : 
     198           0 :         return confidence;
     199             :     }
     200             : 
     201             :     //
     202             :     //  No match if there are too many characters that don't fit the encoding scheme.
     203             :     //    (should we have zero tolerance for these?)
     204             :     //
     205           0 :     if (doubleByteCharCount < 20*badCharCount) {
     206           0 :         confidence = 0;
     207             : 
     208           0 :         return confidence;
     209             :     }
     210             : 
     211           0 :     if (commonChars == 0) {
     212             :         // We have no statistics on frequently occuring characters.
     213             :         //  Assess confidence purely on having a reasonable number of
     214             :         //  multi-byte characters (the more the better)
     215           0 :         confidence = 30 + doubleByteCharCount - 20*badCharCount;
     216             : 
     217           0 :         if (confidence > 100) {
     218           0 :             confidence = 100;
     219             :         }
     220             :     } else {
     221             :         //
     222             :         // Frequency of occurence statistics exist.
     223             :         //
     224             : 
     225           0 :         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
     226           0 :         double scaleFactor = 90.0 / maxVal;
     227           0 :         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
     228             : 
     229           0 :         confidence = min(confidence, 100);
     230             :     }
     231             : 
     232           0 :     if (confidence < 0) {
     233           0 :         confidence = 0;
     234             :     }
     235             : 
     236           0 :     return confidence;
     237             : }
     238             : 
     239           0 : CharsetRecog_sjis::~CharsetRecog_sjis()
     240             : {
     241             :     // nothing to do
     242           0 : }
     243             : 
     244           0 : UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
     245           0 :     it->index = it->nextIndex;
     246           0 :     it->error = FALSE;
     247             : 
     248           0 :     int32_t firstByte = it->charValue = it->nextByte(det);
     249             : 
     250           0 :     if (firstByte < 0) {
     251           0 :         return FALSE;
     252             :     }
     253             : 
     254           0 :     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
     255           0 :         return TRUE;
     256             :     }
     257             : 
     258           0 :     int32_t secondByte = it->nextByte(det);
     259           0 :     if (secondByte >= 0) {
     260           0 :         it->charValue = (firstByte << 8) | secondByte;
     261             :     }
     262             :     // else we'll handle the error later.
     263             : 
     264           0 :     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
     265             :         // Illegal second byte value.
     266           0 :         it->error = TRUE;
     267             :     }
     268             : 
     269           0 :     return TRUE;
     270             : }
     271             : 
     272           0 : UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
     273           0 :     int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
     274           0 :     results->set(det, this, confidence);
     275           0 :     return (confidence > 0);
     276             : }
     277             : 
     278           0 : const char *CharsetRecog_sjis::getName() const
     279             : {
     280           0 :     return "Shift_JIS";
     281             : }
     282             : 
     283           0 : const char *CharsetRecog_sjis::getLanguage() const
     284             : {
     285           0 :     return "ja";
     286             : }
     287             : 
     288           0 : CharsetRecog_euc::~CharsetRecog_euc()
     289             : {
     290             :     // nothing to do
     291           0 : }
     292             : 
     293           0 : UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
     294           0 :     int32_t firstByte  = 0;
     295           0 :     int32_t secondByte = 0;
     296           0 :     int32_t thirdByte  = 0;
     297             : 
     298           0 :     it->index = it->nextIndex;
     299           0 :     it->error = FALSE;
     300           0 :     firstByte = it->charValue = it->nextByte(det);
     301             : 
     302           0 :     if (firstByte < 0) {
     303             :         // Ran off the end of the input data
     304           0 :         return FALSE;
     305             :     }
     306             : 
     307           0 :     if (firstByte <= 0x8D) {
     308             :         // single byte char
     309           0 :         return TRUE;
     310             :     }
     311             : 
     312           0 :     secondByte = it->nextByte(det);
     313           0 :     if (secondByte >= 0) {
     314           0 :         it->charValue = (it->charValue << 8) | secondByte;
     315             :     }
     316             :     // else we'll handle the error later.
     317             : 
     318           0 :     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
     319             :         // Two byte Char
     320           0 :         if (secondByte < 0xA1) {
     321           0 :             it->error = TRUE;
     322             :         }
     323             : 
     324           0 :         return TRUE;
     325             :     }
     326             : 
     327           0 :     if (firstByte == 0x8E) {
     328             :         // Code Set 2.
     329             :         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
     330             :         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
     331             :         // We don't know which we've got.
     332             :         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
     333             :         //   bytes will look like a well formed 2 byte char.
     334           0 :         if (secondByte < 0xA1) {
     335           0 :             it->error = TRUE;
     336             :         }
     337             : 
     338           0 :         return TRUE;
     339             :     }
     340             : 
     341           0 :     if (firstByte == 0x8F) {
     342             :         // Code set 3.
     343             :         // Three byte total char size, two bytes of actual char value.
     344           0 :         thirdByte    = it->nextByte(det);
     345           0 :         it->charValue = (it->charValue << 8) | thirdByte;
     346             : 
     347           0 :         if (thirdByte < 0xa1) {
     348             :             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
     349           0 :             it->error = TRUE;
     350             :         }
     351             :     }
     352             : 
     353           0 :     return TRUE;
     354             : 
     355             : }
     356             : 
     357           0 : CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
     358             : {
     359             :     // nothing to do
     360           0 : }
     361             : 
     362           0 : const char *CharsetRecog_euc_jp::getName() const
     363             : {
     364           0 :     return "EUC-JP";
     365             : }
     366             : 
     367           0 : const char *CharsetRecog_euc_jp::getLanguage() const
     368             : {
     369           0 :     return "ja";
     370             : }
     371             : 
     372           0 : UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
     373             : {
     374           0 :     int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
     375           0 :     results->set(det, this, confidence);
     376           0 :     return (confidence > 0);
     377             : }
     378             : 
     379           0 : CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
     380             : {
     381             :     // nothing to do
     382           0 : }
     383             : 
     384           0 : const char *CharsetRecog_euc_kr::getName() const
     385             : {
     386           0 :     return "EUC-KR";
     387             : }
     388             : 
     389           0 : const char *CharsetRecog_euc_kr::getLanguage() const
     390             : {
     391           0 :     return "ko";
     392             : }
     393             : 
     394           0 : UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
     395             : {
     396           0 :     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
     397           0 :     results->set(det, this, confidence);
     398           0 :     return (confidence > 0);
     399             : }
     400             : 
     401           0 : CharsetRecog_big5::~CharsetRecog_big5()
     402             : {
     403             :     // nothing to do
     404           0 : }
     405             : 
     406           0 : UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
     407             : {
     408             :     int32_t firstByte;
     409             : 
     410           0 :     it->index = it->nextIndex;
     411           0 :     it->error = FALSE;
     412           0 :     firstByte = it->charValue = it->nextByte(det);
     413             : 
     414           0 :     if (firstByte < 0) {
     415           0 :         return FALSE;
     416             :     }
     417             : 
     418           0 :     if (firstByte <= 0x7F || firstByte == 0xFF) {
     419             :         // single byte character.
     420           0 :         return TRUE;
     421             :     }
     422             : 
     423           0 :     int32_t secondByte = it->nextByte(det);
     424           0 :     if (secondByte >= 0)  {
     425           0 :         it->charValue = (it->charValue << 8) | secondByte;
     426             :     }
     427             :     // else we'll handle the error later.
     428             : 
     429           0 :     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
     430           0 :         it->error = TRUE;
     431             :     }
     432             : 
     433           0 :     return TRUE;
     434             : }
     435             : 
     436           0 : const char *CharsetRecog_big5::getName() const
     437             : {
     438           0 :     return "Big5";
     439             : }
     440             : 
     441           0 : const char *CharsetRecog_big5::getLanguage() const
     442             : {
     443           0 :     return "zh";
     444             : }
     445             : 
     446           0 : UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
     447             : {
     448           0 :     int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
     449           0 :     results->set(det, this, confidence);
     450           0 :     return (confidence > 0);
     451             : }
     452             : 
     453           0 : CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
     454             : {
     455             :     // nothing to do
     456           0 : }
     457             : 
     458           0 : UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
     459           0 :     int32_t firstByte  = 0;
     460           0 :     int32_t secondByte = 0;
     461           0 :     int32_t thirdByte  = 0;
     462           0 :     int32_t fourthByte = 0;
     463             : 
     464           0 :     it->index = it->nextIndex;
     465           0 :     it->error = FALSE;
     466           0 :     firstByte = it->charValue = it->nextByte(det);
     467             : 
     468           0 :     if (firstByte < 0) {
     469             :         // Ran off the end of the input data
     470           0 :         return FALSE;
     471             :     }
     472             : 
     473           0 :     if (firstByte <= 0x80) {
     474             :         // single byte char
     475           0 :         return TRUE;
     476             :     }
     477             : 
     478           0 :     secondByte = it->nextByte(det);
     479           0 :     if (secondByte >= 0) {
     480           0 :         it->charValue = (it->charValue << 8) | secondByte;
     481             :     }
     482             :     // else we'll handle the error later.
     483             : 
     484           0 :     if (firstByte >= 0x81 && firstByte <= 0xFE) {
     485             :         // Two byte Char
     486           0 :         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
     487           0 :             return TRUE;
     488             :         }
     489             : 
     490             :         // Four byte char
     491           0 :         if (secondByte >= 0x30 && secondByte <= 0x39) {
     492           0 :             thirdByte = it->nextByte(det);
     493             : 
     494           0 :             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
     495           0 :                 fourthByte = it->nextByte(det);
     496             : 
     497           0 :                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
     498           0 :                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
     499             : 
     500           0 :                     return TRUE;
     501             :                 }
     502             :             }
     503             :         }
     504             : 
     505             :         // Something wasn't valid, or we ran out of data (-1).
     506           0 :         it->error = TRUE;
     507             :     }
     508             : 
     509           0 :     return TRUE;
     510             : }
     511             : 
     512           0 : const char *CharsetRecog_gb_18030::getName() const
     513             : {
     514           0 :     return "GB18030";
     515             : }
     516             : 
     517           0 : const char *CharsetRecog_gb_18030::getLanguage() const
     518             : {
     519           0 :     return "zh";
     520             : }
     521             : 
     522           0 : UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
     523             : {
     524           0 :     int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
     525           0 :     results->set(det, this, confidence);
     526           0 :     return (confidence > 0);
     527             : }
     528             : 
     529             : U_NAMESPACE_END
     530             : #endif

Generated by: LCOV version 1.13