Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : **********************************************************************
5 : * Copyright (C) 2005-2016, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : **********************************************************************
8 : */
9 :
10 : #include "unicode/utypes.h"
11 :
12 : #if !UCONFIG_NO_CONVERSION
13 :
14 : #include "inputext.h"
15 :
16 : #include "cmemory.h"
17 : #include "cstring.h"
18 :
19 : #include <string.h>
20 :
21 : U_NAMESPACE_BEGIN
22 :
// Capacity, in bytes, of the fInputBytes working buffer; at most this many
// input bytes are copied and examined.
#define BUFFER_SIZE 8192

// Allocate an uninitialized array of `count` elements of `type` via uprv_malloc().
// The whole expansion is parenthesized so that the cast binds before any
// operator the caller applies to the macro result (e.g. NEW_ARRAY(T, n)[0]).
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

// Release an array previously obtained from NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27 :
28 0 : InputText::InputText(UErrorCode &status)
29 0 : : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
30 : // removed if appropriate.
31 0 : fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
32 : // Value is percent, not absolute.
33 : fDeclaredEncoding(0),
34 : fRawInput(0),
35 0 : fRawLength(0)
36 : {
37 0 : if (fInputBytes == NULL || fByteStats == NULL) {
38 0 : status = U_MEMORY_ALLOCATION_ERROR;
39 : }
40 0 : }
41 :
42 0 : InputText::~InputText()
43 : {
44 0 : DELETE_ARRAY(fDeclaredEncoding);
45 0 : DELETE_ARRAY(fByteStats);
46 0 : DELETE_ARRAY(fInputBytes);
47 0 : }
48 :
49 0 : void InputText::setText(const char *in, int32_t len)
50 : {
51 0 : fInputLen = 0;
52 0 : fC1Bytes = FALSE;
53 0 : fRawInput = (const uint8_t *) in;
54 0 : fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
55 0 : }
56 :
57 0 : void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
58 : {
59 0 : if(encoding) {
60 0 : if (len == -1) {
61 0 : len = (int32_t)uprv_strlen(encoding);
62 : }
63 :
64 0 : len += 1; // to make place for the \0 at the end.
65 0 : uprv_free(fDeclaredEncoding);
66 0 : fDeclaredEncoding = NEW_ARRAY(char, len);
67 0 : uprv_strncpy(fDeclaredEncoding, encoding, len);
68 : }
69 0 : }
70 :
71 0 : UBool InputText::isSet() const
72 : {
73 0 : return fRawInput != NULL;
74 : }
75 :
76 : /**
77 : * MungeInput - after getting a set of raw input data to be analyzed, preprocess
78 : * it by removing what appears to be html markup.
79 : *
80 : * @internal
81 : */
82 0 : void InputText::MungeInput(UBool fStripTags) {
83 0 : int srci = 0;
84 0 : int dsti = 0;
85 : uint8_t b;
86 0 : bool inMarkup = FALSE;
87 0 : int32_t openTags = 0;
88 0 : int32_t badTags = 0;
89 :
90 : //
91 : // html / xml markup stripping.
92 : // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
93 : // discard everything within < brackets >
94 : // Count how many total '<' and illegal (nested) '<' occur, so we can make some
95 : // guess as to whether the input was actually marked up at all.
96 : // TODO: Think about how this interacts with EBCDIC charsets that are detected.
97 0 : if (fStripTags) {
98 0 : for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
99 0 : b = fRawInput[srci];
100 :
101 0 : if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
102 0 : if (inMarkup) {
103 0 : badTags += 1;
104 : }
105 :
106 0 : inMarkup = TRUE;
107 0 : openTags += 1;
108 : }
109 :
110 0 : if (! inMarkup) {
111 0 : fInputBytes[dsti++] = b;
112 : }
113 :
114 0 : if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
115 0 : inMarkup = FALSE;
116 : }
117 : }
118 :
119 0 : fInputLen = dsti;
120 : }
121 :
122 : //
123 : // If it looks like this input wasn't marked up, or if it looks like it's
124 : // essentially nothing but markup abandon the markup stripping.
125 : // Detection will have to work on the unstripped input.
126 : //
127 0 : if (openTags<5 || openTags/5 < badTags ||
128 0 : (fInputLen < 100 && fRawLength>600))
129 : {
130 0 : int32_t limit = fRawLength;
131 :
132 0 : if (limit > BUFFER_SIZE) {
133 0 : limit = BUFFER_SIZE;
134 : }
135 :
136 0 : for (srci=0; srci<limit; srci++) {
137 0 : fInputBytes[srci] = fRawInput[srci];
138 : }
139 :
140 0 : fInputLen = srci;
141 : }
142 :
143 : //
144 : // Tally up the byte occurence statistics.
145 : // These are available for use by the various detectors.
146 : //
147 :
148 0 : uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
149 :
150 0 : for (srci = 0; srci < fInputLen; srci += 1) {
151 0 : fByteStats[fInputBytes[srci]] += 1;
152 : }
153 :
154 0 : for (int32_t i = 0x80; i <= 0x9F; i += 1) {
155 0 : if (fByteStats[i] != 0) {
156 0 : fC1Bytes = TRUE;
157 0 : break;
158 : }
159 : }
160 0 : }
161 :
162 : U_NAMESPACE_END
163 : #endif
164 :
|