Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : **********************************************************************
5 : * Copyright (C) 2005-2016, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : **********************************************************************
8 : */
9 :
10 : #include "unicode/utypes.h"
11 :
12 : #if !UCONFIG_NO_CONVERSION
13 :
14 : #include "cmemory.h"
15 : #include "csmatch.h"
16 : #include "csrmbcs.h"
17 :
18 : #include <math.h>
19 :
20 : U_NAMESPACE_BEGIN
21 :
22 : #define min(x,y) (((x)<(y))?(x):(y)) // Smaller of x and y; each argument may be evaluated twice, so pass only side-effect-free expressions.
23 :
24 : static const uint16_t commonChars_sjis [] = {
25 : // Frequently occurring Shift_JIS double-byte codes, kept in ascending order because they are looked up with binarySearch().
26 : // TODO: This set of data comes from the character frequency-of-occurrence analysis tool.
27 : // The data needs to be moved into a resource and loaded from there.
28 : 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29 : 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30 : 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31 : 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32 : 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33 : 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34 :
35 : static const uint16_t commonChars_euc_jp[] = {
36 : // Frequently occurring EUC-JP double-byte codes, kept in ascending order because they are looked up with binarySearch().
37 : // TODO: This set of data comes from the character frequency-of-occurrence analysis tool.
38 : // The data needs to be moved into a resource and loaded from there.
39 : 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40 : 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41 : 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42 : 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43 : 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44 : 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45 : 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46 : 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47 : 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48 : 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49 :
50 : static const uint16_t commonChars_euc_kr[] = {
51 : // Frequently occurring EUC-KR double-byte codes, kept in ascending order because they are looked up with binarySearch().
52 : // TODO: This set of data comes from the character frequency-of-occurrence analysis tool.
53 : // The data needs to be moved into a resource and loaded from there.
54 : 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55 : 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56 : 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57 : 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58 : 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59 : 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60 : 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61 : 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62 : 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63 : 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64 :
65 : static const uint16_t commonChars_big5[] = {
66 : // Frequently occurring Big5 double-byte codes, kept in ascending order because they are looked up with binarySearch().
67 : // TODO: This set of data comes from the character frequency-of-occurrence analysis tool.
68 : // The data needs to be moved into a resource and loaded from there.
69 : 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70 : 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71 : 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72 : 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73 : 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74 : 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75 : 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76 : 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77 : 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78 : 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79 :
80 : static const uint16_t commonChars_gb_18030[] = {
81 : // Frequently occurring GB18030 double-byte codes, kept in ascending order because they are looked up with binarySearch().
82 : // TODO: This set of data comes from the character frequency-of-occurrence analysis tool.
83 : // The data needs to be moved into a resource and loaded from there.
84 : 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85 : 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86 : 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87 : 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88 : 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89 : 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90 : 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91 : 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92 : 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93 : 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94 :
// Looks up `value` in `array`, which must hold `len` entries sorted in
// ascending order. Returns the index of a matching entry, or -1 when the
// value is not present (including when len is 0).
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    while (lo <= hi) {
        const int32_t probe = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        // Discard the half of the range that cannot contain the value.
        if (candidate < value) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    return -1;
}
116 :
117 0 : IteratedChar::IteratedChar() :
118 0 : charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
119 : {
120 : // nothing else to do.
121 0 : }
122 :
123 : /*void IteratedChar::reset()
124 : {
125 : charValue = 0;
126 : index = -1;
127 : nextIndex = 0;
128 : error = FALSE;
129 : done = FALSE;
130 : }*/
131 :
132 0 : int32_t IteratedChar::nextByte(InputText *det)
133 : {
134 0 : if (nextIndex >= det->fRawLength) {
135 0 : done = TRUE;
136 :
137 0 : return -1;
138 : }
139 :
140 0 : return det->fRawInput[nextIndex++];
141 : }
142 :
143 0 : CharsetRecog_mbcs::~CharsetRecog_mbcs()
144 : {
145 : // nothing to do.
146 0 : }
147 :
148 0 : int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149 0 : int32_t singleByteCharCount = 0;
150 0 : int32_t doubleByteCharCount = 0;
151 0 : int32_t commonCharCount = 0;
152 0 : int32_t badCharCount = 0;
153 0 : int32_t totalCharCount = 0;
154 0 : int32_t confidence = 0;
155 0 : IteratedChar iter;
156 :
157 0 : while (nextChar(&iter, det)) {
158 0 : totalCharCount++;
159 :
160 0 : if (iter.error) {
161 0 : badCharCount++;
162 : } else {
163 0 : if (iter.charValue <= 0xFF) {
164 0 : singleByteCharCount++;
165 : } else {
166 0 : doubleByteCharCount++;
167 :
168 0 : if (commonChars != 0) {
169 0 : if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
170 0 : commonCharCount += 1;
171 : }
172 : }
173 : }
174 : }
175 :
176 :
177 0 : if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
178 : // Bail out early if the byte data is not matching the encoding scheme.
179 : // break detectBlock;
180 0 : return confidence;
181 : }
182 : }
183 :
184 0 : if (doubleByteCharCount <= 10 && badCharCount == 0) {
185 : // Not many multi-byte chars.
186 0 : if (doubleByteCharCount == 0 && totalCharCount < 10) {
187 : // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
188 : // We don't have enough data to have any confidence.
189 : // Statistical analysis of single byte non-ASCII charcters would probably help here.
190 0 : confidence = 0;
191 : }
192 : else {
193 : // ASCII or ISO file? It's probably not our encoding,
194 : // but is not incompatible with our encoding, so don't give it a zero.
195 0 : confidence = 10;
196 : }
197 :
198 0 : return confidence;
199 : }
200 :
201 : //
202 : // No match if there are too many characters that don't fit the encoding scheme.
203 : // (should we have zero tolerance for these?)
204 : //
205 0 : if (doubleByteCharCount < 20*badCharCount) {
206 0 : confidence = 0;
207 :
208 0 : return confidence;
209 : }
210 :
211 0 : if (commonChars == 0) {
212 : // We have no statistics on frequently occuring characters.
213 : // Assess confidence purely on having a reasonable number of
214 : // multi-byte characters (the more the better)
215 0 : confidence = 30 + doubleByteCharCount - 20*badCharCount;
216 :
217 0 : if (confidence > 100) {
218 0 : confidence = 100;
219 : }
220 : } else {
221 : //
222 : // Frequency of occurence statistics exist.
223 : //
224 :
225 0 : double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
226 0 : double scaleFactor = 90.0 / maxVal;
227 0 : confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
228 :
229 0 : confidence = min(confidence, 100);
230 : }
231 :
232 0 : if (confidence < 0) {
233 0 : confidence = 0;
234 : }
235 :
236 0 : return confidence;
237 : }
238 :
239 0 : CharsetRecog_sjis::~CharsetRecog_sjis()
240 : {
241 : // nothing to do
242 0 : }
243 :
244 0 : UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
245 0 : it->index = it->nextIndex;
246 0 : it->error = FALSE;
247 :
248 0 : int32_t firstByte = it->charValue = it->nextByte(det);
249 :
250 0 : if (firstByte < 0) {
251 0 : return FALSE;
252 : }
253 :
254 0 : if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
255 0 : return TRUE;
256 : }
257 :
258 0 : int32_t secondByte = it->nextByte(det);
259 0 : if (secondByte >= 0) {
260 0 : it->charValue = (firstByte << 8) | secondByte;
261 : }
262 : // else we'll handle the error later.
263 :
264 0 : if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
265 : // Illegal second byte value.
266 0 : it->error = TRUE;
267 : }
268 :
269 0 : return TRUE;
270 : }
271 :
272 0 : UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
273 0 : int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
274 0 : results->set(det, this, confidence);
275 0 : return (confidence > 0);
276 : }
277 :
278 0 : const char *CharsetRecog_sjis::getName() const
279 : {
280 0 : return "Shift_JIS";
281 : }
282 :
283 0 : const char *CharsetRecog_sjis::getLanguage() const
284 : {
285 0 : return "ja";
286 : }
287 :
288 0 : CharsetRecog_euc::~CharsetRecog_euc()
289 : {
290 : // nothing to do
291 0 : }
292 :
293 0 : UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
294 0 : int32_t firstByte = 0;
295 0 : int32_t secondByte = 0;
296 0 : int32_t thirdByte = 0;
297 :
298 0 : it->index = it->nextIndex;
299 0 : it->error = FALSE;
300 0 : firstByte = it->charValue = it->nextByte(det);
301 :
302 0 : if (firstByte < 0) {
303 : // Ran off the end of the input data
304 0 : return FALSE;
305 : }
306 :
307 0 : if (firstByte <= 0x8D) {
308 : // single byte char
309 0 : return TRUE;
310 : }
311 :
312 0 : secondByte = it->nextByte(det);
313 0 : if (secondByte >= 0) {
314 0 : it->charValue = (it->charValue << 8) | secondByte;
315 : }
316 : // else we'll handle the error later.
317 :
318 0 : if (firstByte >= 0xA1 && firstByte <= 0xFE) {
319 : // Two byte Char
320 0 : if (secondByte < 0xA1) {
321 0 : it->error = TRUE;
322 : }
323 :
324 0 : return TRUE;
325 : }
326 :
327 0 : if (firstByte == 0x8E) {
328 : // Code Set 2.
329 : // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
330 : // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
331 : // We don't know which we've got.
332 : // Treat it like EUC-JP. If the data really was EUC-TW, the following two
333 : // bytes will look like a well formed 2 byte char.
334 0 : if (secondByte < 0xA1) {
335 0 : it->error = TRUE;
336 : }
337 :
338 0 : return TRUE;
339 : }
340 :
341 0 : if (firstByte == 0x8F) {
342 : // Code set 3.
343 : // Three byte total char size, two bytes of actual char value.
344 0 : thirdByte = it->nextByte(det);
345 0 : it->charValue = (it->charValue << 8) | thirdByte;
346 :
347 0 : if (thirdByte < 0xa1) {
348 : // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
349 0 : it->error = TRUE;
350 : }
351 : }
352 :
353 0 : return TRUE;
354 :
355 : }
356 :
357 0 : CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
358 : {
359 : // nothing to do
360 0 : }
361 :
362 0 : const char *CharsetRecog_euc_jp::getName() const
363 : {
364 0 : return "EUC-JP";
365 : }
366 :
367 0 : const char *CharsetRecog_euc_jp::getLanguage() const
368 : {
369 0 : return "ja";
370 : }
371 :
372 0 : UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
373 : {
374 0 : int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
375 0 : results->set(det, this, confidence);
376 0 : return (confidence > 0);
377 : }
378 :
379 0 : CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
380 : {
381 : // nothing to do
382 0 : }
383 :
384 0 : const char *CharsetRecog_euc_kr::getName() const
385 : {
386 0 : return "EUC-KR";
387 : }
388 :
389 0 : const char *CharsetRecog_euc_kr::getLanguage() const
390 : {
391 0 : return "ko";
392 : }
393 :
394 0 : UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
395 : {
396 0 : int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
397 0 : results->set(det, this, confidence);
398 0 : return (confidence > 0);
399 : }
400 :
401 0 : CharsetRecog_big5::~CharsetRecog_big5()
402 : {
403 : // nothing to do
404 0 : }
405 :
406 0 : UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
407 : {
408 : int32_t firstByte;
409 :
410 0 : it->index = it->nextIndex;
411 0 : it->error = FALSE;
412 0 : firstByte = it->charValue = it->nextByte(det);
413 :
414 0 : if (firstByte < 0) {
415 0 : return FALSE;
416 : }
417 :
418 0 : if (firstByte <= 0x7F || firstByte == 0xFF) {
419 : // single byte character.
420 0 : return TRUE;
421 : }
422 :
423 0 : int32_t secondByte = it->nextByte(det);
424 0 : if (secondByte >= 0) {
425 0 : it->charValue = (it->charValue << 8) | secondByte;
426 : }
427 : // else we'll handle the error later.
428 :
429 0 : if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
430 0 : it->error = TRUE;
431 : }
432 :
433 0 : return TRUE;
434 : }
435 :
436 0 : const char *CharsetRecog_big5::getName() const
437 : {
438 0 : return "Big5";
439 : }
440 :
441 0 : const char *CharsetRecog_big5::getLanguage() const
442 : {
443 0 : return "zh";
444 : }
445 :
446 0 : UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
447 : {
448 0 : int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
449 0 : results->set(det, this, confidence);
450 0 : return (confidence > 0);
451 : }
452 :
453 0 : CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
454 : {
455 : // nothing to do
456 0 : }
457 :
458 0 : UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
459 0 : int32_t firstByte = 0;
460 0 : int32_t secondByte = 0;
461 0 : int32_t thirdByte = 0;
462 0 : int32_t fourthByte = 0;
463 :
464 0 : it->index = it->nextIndex;
465 0 : it->error = FALSE;
466 0 : firstByte = it->charValue = it->nextByte(det);
467 :
468 0 : if (firstByte < 0) {
469 : // Ran off the end of the input data
470 0 : return FALSE;
471 : }
472 :
473 0 : if (firstByte <= 0x80) {
474 : // single byte char
475 0 : return TRUE;
476 : }
477 :
478 0 : secondByte = it->nextByte(det);
479 0 : if (secondByte >= 0) {
480 0 : it->charValue = (it->charValue << 8) | secondByte;
481 : }
482 : // else we'll handle the error later.
483 :
484 0 : if (firstByte >= 0x81 && firstByte <= 0xFE) {
485 : // Two byte Char
486 0 : if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
487 0 : return TRUE;
488 : }
489 :
490 : // Four byte char
491 0 : if (secondByte >= 0x30 && secondByte <= 0x39) {
492 0 : thirdByte = it->nextByte(det);
493 :
494 0 : if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
495 0 : fourthByte = it->nextByte(det);
496 :
497 0 : if (fourthByte >= 0x30 && fourthByte <= 0x39) {
498 0 : it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
499 :
500 0 : return TRUE;
501 : }
502 : }
503 : }
504 :
505 : // Something wasn't valid, or we ran out of data (-1).
506 0 : it->error = TRUE;
507 : }
508 :
509 0 : return TRUE;
510 : }
511 :
512 0 : const char *CharsetRecog_gb_18030::getName() const
513 : {
514 0 : return "GB18030";
515 : }
516 :
517 0 : const char *CharsetRecog_gb_18030::getLanguage() const
518 : {
519 0 : return "zh";
520 : }
521 :
522 0 : UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
523 : {
524 0 : int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
525 0 : results->set(det, this, confidence);
526 0 : return (confidence > 0);
527 : }
528 :
529 : U_NAMESPACE_END
530 : #endif
|