LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - csrucode.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 96 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 22 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             :  **********************************************************************
       5             :  *   Copyright (C) 2005-2013, International Business Machines
       6             :  *   Corporation and others.  All Rights Reserved.
       7             :  **********************************************************************
       8             :  */
       9             : 
      10             : #include "unicode/utypes.h"
      11             : 
      12             : #if !UCONFIG_NO_CONVERSION
      13             : 
      14             : #include "csrucode.h"
      15             : #include "csmatch.h"
      16             : 
      17             : U_NAMESPACE_BEGIN
      18             : 
      19           0 : CharsetRecog_Unicode::~CharsetRecog_Unicode()
      20             : {
      21             :     // Intentionally empty: this destructor performs no cleanup of its own.
      22           0 : }
      23             : 
      24           0 : CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
      25             : {
      26             :     // Intentionally empty: this destructor performs no cleanup of its own.
      27           0 : }
      28             : 
      29           0 : const char *CharsetRecog_UTF_16_BE::getName() const
      30             : {   // Returns the constant name "UTF-16BE"; it is a string literal, so callers must not modify or free it.
      31           0 :     return "UTF-16BE";
      32             : }
      33             : 
      34             : // Heuristic confidence update for a single UTF-16 code unit. Crude, but better than nothing.
      35             : //   A code unit in 0x20..0xff, or a line feed (0x0a), raises the confidence by 10: a zero
      36             : //     high byte is typical of UTF-16 text, although it can also occur inside a UTF-32 code.
      37             : //   A NUL code unit lowers the confidence by 10: NULs should be rare in actual text but
      38             : //     appear commonly when the encoding is really UTF-32. The result is clamped to 0..100.
      39             : 
      40           0 : static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {
      41           0 :     if (codeUnit == 0x0a || (codeUnit >= 0x20 && codeUnit <= 0xff)) {
      42           0 :         confidence += 10;
      43           0 :     } else if (codeUnit == 0) {
      44           0 :         confidence -= 10;
      45             :     }
      46           0 :     if (confidence > 100) {
      47           0 :         confidence = 100;
      48           0 :     } else if (confidence < 0) {
      49           0 :         confidence = 0;
      50             :     }
      51           0 :     return confidence;
      52             : }
      53             : 
      54             : 
      55           0 : UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const
      56             : {   // Scores the raw input as big-endian UTF-16 and stores the score through results->set().
      57           0 :     const int32_t rawLength = textIn->fRawLength;
      58           0 :     const uint8_t *bytes = textIn->fRawInput;
      59           0 :     int32_t score = 10;
      60             :     // Sample at most the first 30 bytes, read as 16-bit code units with the high byte first.
      61           0 :     const int32_t sampleSize = (rawLength < 30) ? rawLength : 30;
      62           0 :     for (int32_t pos = 0; pos + 1 < sampleSize; pos += 2) {
      63           0 :         UChar unit = (bytes[pos] << 8) | bytes[pos + 1];
      64           0 :         if (pos == 0 && unit == 0xFEFF) {
      65           0 :             score = 100;    // leading byte order mark
      66           0 :             break;
      67             :         }
      68           0 :         score = adjustConfidence(unit, score);
      69           0 :         if (score == 0 || score == 100) {
      70             :             break;          // the score has reached one of its bounds
      71             :         }
      72             :     }
      73           0 :     if (sampleSize < 4 && score < 100) {
      74           0 :         score = 0;          // under 4 bytes is too little to judge unless a BOM was seen
      75             :     }
      76           0 :     results->set(textIn, this, score);
      77           0 :     return (score > 0);
      78             : }
      79             : 
      80           0 : CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
      81             : {
      82             :     // Intentionally empty: this destructor performs no cleanup of its own.
      83           0 : }
      84             : 
      85           0 : const char *CharsetRecog_UTF_16_LE::getName() const
      86             : {   // Returns the constant name "UTF-16LE"; it is a string literal, so callers must not modify or free it.
      87           0 :     return "UTF-16LE";
      88             : }
      89             : 
      90           0 : UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const
      91             : {   // Scores the raw input as little-endian UTF-16 and stores the score through results->set().
      92           0 :     const int32_t rawLength = textIn->fRawLength;
      93           0 :     const uint8_t *bytes = textIn->fRawInput;
      94           0 :     int32_t score = 10;
      95             :     // Sample at most the first 30 bytes, read as 16-bit code units with the low byte first.
      96           0 :     const int32_t sampleSize = (rawLength < 30) ? rawLength : 30;
      97           0 :     for (int32_t pos = 0; pos + 1 < sampleSize; pos += 2) {
      98           0 :         UChar unit = bytes[pos] | (bytes[pos + 1] << 8);
      99           0 :         if (pos == 0 && unit == 0xFEFF) {
     100           0 :             score = 100;     // UTF-16 BOM
     101           0 :             if (rawLength >= 4 && bytes[2] == 0 && bytes[3] == 0) {
     102           0 :                 score = 0;   // UTF-32 BOM
     103             :             }
     104           0 :             break;
     105             :         }
     106           0 :         score = adjustConfidence(unit, score);
     107           0 :         if (score == 0 || score == 100) {
     108             :             break;           // the score has reached one of its bounds
     109             :         }
     110             :     }
     111           0 :     if (sampleSize < 4 && score < 100) {
     112           0 :         score = 0;           // under 4 bytes is too little to judge unless a BOM was seen
     113             :     }
     114           0 :     results->set(textIn, this, score);
     115           0 :     return (score > 0);
     116             : }
     117             : 
     118           0 : CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
     119             : {
     120             :     // Intentionally empty: this destructor performs no cleanup of its own.
     121           0 : }
     122             : 
     123           0 : UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const
     124             : {   // Scores the raw input as UTF-32, decoding each 4-byte group with getChar().
     125           0 :     const uint8_t *input = textIn->fRawInput;
     126           0 :     int32_t limit = (textIn->fRawLength / 4) * 4;   // a trailing partial group is ignored
     127           0 :     int32_t numValid = 0;
     128           0 :     int32_t numInvalid = 0;
     129           0 :     bool hasBOM = false;
     130           0 :     int32_t confidence = 0;
     131             : 
     132           0 :     if (limit > 0 && getChar(input, 0) == 0x0000FEFF) {
     133           0 :         hasBOM = true;
     134             :     }
     135             : 
     136           0 :     for(int32_t i = 0; i < limit; i += 4) {
     137           0 :         int32_t ch = getChar(input, i);
     138             :         // Valid code points are 0..0x10FFFF inclusive, minus the surrogate range D800..DFFF.
     139           0 :         if (ch < 0 || ch > 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
     140           0 :             numInvalid += 1;
     141             :         } else {
     142           0 :             numValid += 1;
     143             :         }
     144             :     }
     145             : 
     146             : 
     147             :     // Cook up some sort of confidence score, based on presence of a BOM
     148             :     //    and the existence of valid and/or invalid multi-byte sequences.
     149           0 :     if (hasBOM && numInvalid==0) {
     150           0 :         confidence = 100;
     151           0 :     } else if (hasBOM && numValid > numInvalid*10) {
     152           0 :         confidence = 80;
     153           0 :     } else if (numValid > 3 && numInvalid == 0) {
     154           0 :         confidence = 100;
     155           0 :     } else if (numValid > 0 && numInvalid == 0) {
     156           0 :         confidence = 80;
     157           0 :     } else if (numValid > numInvalid*10) {
     158             :         // Probably corrupt UTF-32 data.  Valid sequences aren't likely by chance.
     159           0 :         confidence = 25;
     160             :     }
     161             :     // The score is always recorded; a match is reported only when it is nonzero.
     162           0 :     results->set(textIn, this, confidence);
     163           0 :     return (confidence > 0);
     164             : }
     165             : 
     166           0 : CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
     167             : {
     168             :     // Intentionally empty: this destructor performs no cleanup of its own.
     169           0 : }
     170             : 
     171           0 : const char *CharsetRecog_UTF_32_BE::getName() const
     172             : {   // Returns the constant name "UTF-32BE"; it is a string literal, so callers must not modify or free it.
     173           0 :     return "UTF-32BE";
     174             : }
     175             : 
     176           0 : int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const
     177             : {   // Combines input[index..index+3] into one 32-bit value, most significant byte first.
     178           0 :     return input[index + 3]       | input[index + 2] <<  8 |
     179           0 :            input[index + 1] << 16 | input[index + 0] << 24;
     180             : }
     181             : 
     182           0 : CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
     183             : {
     184             :     // Intentionally empty: this destructor performs no cleanup of its own.
     185           0 : }
     186             : 
     187           0 : const char *CharsetRecog_UTF_32_LE::getName() const
     188             : {   // Returns the constant name "UTF-32LE"; it is a string literal, so callers must not modify or free it.
     189           0 :     return "UTF-32LE";
     190             : }
     191             : 
     192           0 : int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
     193             : {   // Combines input[index..index+3] into one 32-bit value, least significant byte first.
     194           0 :     return input[index + 0]       | input[index + 1] <<  8 |
     195           0 :            input[index + 2] << 16 | input[index + 3] << 24;
     196             : }
     197             : 
     198             : U_NAMESPACE_END
     199             : #endif
     200             : 

Generated by: LCOV version 1.13