LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - csrutf8.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 50 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 4 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             :  **********************************************************************
       5             :  *   Copyright (C) 2005-2014, International Business Machines
       6             :  *   Corporation and others.  All Rights Reserved.
       7             :  **********************************************************************
       8             :  */
       9             : 
      10             : #include "unicode/utypes.h"
      11             : 
      12             : #if !UCONFIG_NO_CONVERSION
      13             : 
      14             : #include "csrutf8.h"
      15             : #include "csmatch.h"
      16             : 
      17             : U_NAMESPACE_BEGIN
      18             : 
      19           0 : CharsetRecog_UTF8::~CharsetRecog_UTF8()
      20             : {
      21             :     // nothing to do
      22           0 : }
      23             : 
      24           0 : const char *CharsetRecog_UTF8::getName() const
      25             : {
      26           0 :     return "UTF-8";
      27             : }
      28             : 
      29           0 : UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
      30           0 :     bool hasBOM = FALSE;
      31           0 :     int32_t numValid = 0;
      32           0 :     int32_t numInvalid = 0;
      33           0 :     const uint8_t *inputBytes = input->fRawInput;
      34             :     int32_t i;
      35           0 :     int32_t trailBytes = 0;
      36             :     int32_t confidence;
      37             : 
      38           0 :     if (input->fRawLength >= 3 && 
      39           0 :         inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) {
      40           0 :             hasBOM = TRUE;
      41             :     }
      42             : 
      43             :     // Scan for multi-byte sequences
      44           0 :     for (i=0; i < input->fRawLength; i += 1) {
      45           0 :         int32_t b = inputBytes[i];
      46             : 
      47           0 :         if ((b & 0x80) == 0) {
      48           0 :             continue;   // ASCII
      49             :         }
      50             : 
      51             :         // Hi bit on char found.  Figure out how long the sequence should be
      52           0 :         if ((b & 0x0E0) == 0x0C0) {
      53           0 :             trailBytes = 1;
      54           0 :         } else if ((b & 0x0F0) == 0x0E0) {
      55           0 :             trailBytes = 2;
      56           0 :         } else if ((b & 0x0F8) == 0xF0) {
      57           0 :             trailBytes = 3;
      58             :         } else {
      59           0 :             numInvalid += 1;
      60           0 :             continue;
      61             :         }
      62             : 
      63             :         // Verify that we've got the right number of trail bytes in the sequence
      64             :         for (;;) {
      65           0 :             i += 1;
      66             : 
      67           0 :             if (i >= input->fRawLength) {
      68           0 :                 break;
      69             :             }
      70             : 
      71           0 :             b = inputBytes[i];
      72             : 
      73           0 :             if ((b & 0xC0) != 0x080) {
      74           0 :                 numInvalid += 1;
      75           0 :                 break;
      76             :             }
      77             : 
      78           0 :             if (--trailBytes == 0) {
      79           0 :                 numValid += 1;
      80           0 :                 break;
      81             :             }
      82             :         }
      83             : 
      84             :     }
      85             : 
      86             :     // Cook up some sort of confidence score, based on presence of a BOM
      87             :     //    and the existence of valid and/or invalid multi-byte sequences.
      88           0 :     confidence = 0;
      89           0 :     if (hasBOM && numInvalid == 0) {
      90           0 :         confidence = 100;
      91           0 :     } else if (hasBOM && numValid > numInvalid*10) {
      92           0 :         confidence = 80;
      93           0 :     } else if (numValid > 3 && numInvalid == 0) {
      94           0 :         confidence = 100;
      95           0 :     } else if (numValid > 0 && numInvalid == 0) {
      96           0 :         confidence = 80;
      97           0 :     } else if (numValid == 0 && numInvalid == 0) {
      98             :         // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
      99             :         //              accepts ASCII with confidence = 10.
     100           0 :         confidence = 15;
     101           0 :     } else if (numValid > numInvalid*10) {
     102             :         // Probably corrupt utf-8 data.  Valid sequences aren't likely by chance.
     103           0 :         confidence = 25;
     104             :     }
     105             : 
     106           0 :     results->set(input, this, confidence);
     107           0 :     return (confidence > 0);
     108             : }
     109             : 
     110             : U_NAMESPACE_END
     111             : #endif

Generated by: LCOV version 1.13