LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - inputext.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 64 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 6 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             :  **********************************************************************
       5             :  *   Copyright (C) 2005-2016, International Business Machines
       6             :  *   Corporation and others.  All Rights Reserved.
       7             :  **********************************************************************
       8             :  */
       9             : 
      10             : #include "unicode/utypes.h"
      11             : 
      12             : #if !UCONFIG_NO_CONVERSION
      13             : 
      14             : #include "inputext.h"
      15             : 
      16             : #include "cmemory.h"
      17             : #include "cstring.h"
      18             : 
      19             : #include <string.h>
      20             : 
      21             : U_NAMESPACE_BEGIN
      22             : 
      23             : #define BUFFER_SIZE 8192
      24             : 
      25             : #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
      26             : #define DELETE_ARRAY(array) uprv_free((void *) (array))
      27             : 
      28           0 : InputText::InputText(UErrorCode &status)
      29           0 :     : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
      30             :                                                  //   removed if appropriate.
      31           0 :       fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
      32             :                                                  //   Value is percent, not absolute.
      33             :       fDeclaredEncoding(0),
      34             :       fRawInput(0),
      35           0 :       fRawLength(0)
      36             : {
      37           0 :     if (fInputBytes == NULL || fByteStats == NULL) {
      38           0 :         status = U_MEMORY_ALLOCATION_ERROR;
      39             :     }
      40           0 : }
      41             : 
      42           0 : InputText::~InputText()
      43             : {
      44           0 :     DELETE_ARRAY(fDeclaredEncoding);
      45           0 :     DELETE_ARRAY(fByteStats);
      46           0 :     DELETE_ARRAY(fInputBytes);
      47           0 : }
      48             : 
      49           0 : void InputText::setText(const char *in, int32_t len)
      50             : {
      51           0 :     fInputLen  = 0;
      52           0 :     fC1Bytes   = FALSE;
      53           0 :     fRawInput  = (const uint8_t *) in;
      54           0 :     fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
      55           0 : }
      56             : 
      57           0 : void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
      58             : {
      59           0 :     if(encoding) {
      60           0 :         if (len == -1) {
      61           0 :             len = (int32_t)uprv_strlen(encoding);
      62             :         }
      63             : 
      64           0 :         len += 1;     // to make place for the \0 at the end.
      65           0 :         uprv_free(fDeclaredEncoding);
      66           0 :         fDeclaredEncoding = NEW_ARRAY(char, len);
      67           0 :         uprv_strncpy(fDeclaredEncoding, encoding, len);
      68             :     }
      69           0 : }
      70             : 
      71           0 : UBool InputText::isSet() const 
      72             : {
      73           0 :     return fRawInput != NULL;
      74             : }
      75             : 
      76             : /**
      77             : *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
      78             : *               it by removing what appears to be html markup.
      79             : * 
      80             : * @internal
      81             : */
      82           0 : void InputText::MungeInput(UBool fStripTags) {
      83           0 :     int     srci = 0;
      84           0 :     int     dsti = 0;
      85             :     uint8_t b;
      86           0 :     bool    inMarkup = FALSE;
      87           0 :     int32_t openTags = 0;
      88           0 :     int32_t badTags  = 0;
      89             : 
      90             :     //
      91             :     //  html / xml markup stripping.
      92             :     //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
      93             :     //     discard everything within < brackets >
      94             :     //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
      95             :     //     guess as to whether the input was actually marked up at all.
      96             :     // TODO: Think about how this interacts with EBCDIC charsets that are detected.
      97           0 :     if (fStripTags) {
      98           0 :         for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
      99           0 :             b = fRawInput[srci];
     100             : 
     101           0 :             if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
     102           0 :                 if (inMarkup) {
     103           0 :                     badTags += 1;
     104             :                 }
     105             : 
     106           0 :                 inMarkup = TRUE;
     107           0 :                 openTags += 1;
     108             :             }
     109             : 
     110           0 :             if (! inMarkup) {
     111           0 :                 fInputBytes[dsti++] = b;
     112             :             }
     113             : 
     114           0 :             if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
     115           0 :                 inMarkup = FALSE;
     116             :             }
     117             :         }
     118             : 
     119           0 :         fInputLen = dsti;
     120             :     }
     121             : 
     122             :     //
     123             :     //  If it looks like this input wasn't marked up, or if it looks like it's
     124             :     //    essentially nothing but markup abandon the markup stripping.
     125             :     //    Detection will have to work on the unstripped input.
     126             :     //
     127           0 :     if (openTags<5 || openTags/5 < badTags || 
     128           0 :         (fInputLen < 100 && fRawLength>600))
     129             :     {
     130           0 :         int32_t limit = fRawLength;
     131             : 
     132           0 :         if (limit > BUFFER_SIZE) {
     133           0 :             limit = BUFFER_SIZE;
     134             :         }
     135             : 
     136           0 :         for (srci=0; srci<limit; srci++) {
     137           0 :             fInputBytes[srci] = fRawInput[srci];
     138             :         }
     139             : 
     140           0 :         fInputLen = srci;
     141             :     }
     142             : 
     143             :     //
     144             :     // Tally up the byte occurence statistics.
     145             :     // These are available for use by the various detectors.
     146             :     //
     147             : 
     148           0 :     uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
     149             : 
     150           0 :     for (srci = 0; srci < fInputLen; srci += 1) {
     151           0 :         fByteStats[fInputBytes[srci]] += 1;
     152             :     }
     153             : 
     154           0 :     for (int32_t i = 0x80; i <= 0x9F; i += 1) {
     155           0 :         if (fByteStats[i] != 0) {
     156           0 :             fC1Bytes = TRUE;
     157           0 :             break;
     158             :         }
     159             :     }
     160           0 : }
     161             : 
     162             : U_NAMESPACE_END
     163             : #endif
     164             : 

Generated by: LCOV version 1.13