Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #ifndef CharDistribution_h__
7 : #define CharDistribution_h__
8 :
9 : #include "nscore.h"
10 :
//Number of counted characters that must be exceeded before
//CharDistributionAnalysis::GotEnoughData() reports true.
#define ENOUGH_DATA_THRESHOLD 1024

//Base class for character-distribution analysis.
//
//Each two-byte character fed in is converted to an "order" by the virtual
//GetOrder(), looked up in a frequency-order table, and tallied as either a
//frequent character (frequency order below 512) or just a counted character.
//
//The table pointer, the table size and the typical distribution ratio are
//not set by Reset(); they are presumably filled in by the subclass
//constructors, which are defined outside this header.
class CharDistributionAnalysis
{
public:
  //Give the table fields neutral defaults before clearing the counters, so
  //that an instance whose subclass never installs a table is still safe to
  //feed: with a table size of 0 no lookup through the pointer is ever made.
  CharDistributionAnalysis()
    : mCharToFreqOrder(0)
    , mTableSize(0)
    , mTypicalDistributionRatio(0.0f)
  {
    Reset();
  }

  //The class has virtual functions, so it must be safe to destroy a derived
  //analyser through a pointer to this base.
  virtual ~CharDistributionAnalysis() {}

  //Feed a block of data and do distribution analysis.
  //This implementation intentionally does nothing.
  void HandleData(const char* aBuf, uint32_t aLen) {}

  //Feed a character with known length.
  //  aStr     - pointer to the first byte of the character
  //  aCharLen - length of the character in bytes
  //Only characters of exactly two bytes whose order is non-negative are
  //counted; everything else is ignored.
  void HandleOneChar(const char* aStr, uint32_t aCharLen)
  {
    //we only care about 2-bytes character in our distribution analysis
    if (aCharLen != 2)
      return;

    const int32_t order = GetOrder(aStr);
    if (order < 0)
      return;

    //order is valid
    mTotalChars++;

    //Orders beyond the end of the table still count towards the total, but
    //can never be classified as frequent.
    if (static_cast<uint32_t>(order) < mTableSize &&
        mCharToFreqOrder[order] < 512)
      mFreqChars++;
  }

  //Return confidence based on existing data. Defined outside this header.
  float GetConfidence(void);

  //Reset analyser, clear any state.
  //Only the counters, the done flag and the data threshold are cleared; the
  //frequency table, its size and the typical ratio are left untouched.
  void Reset()
  {
    mDone = false;
    mTotalChars = 0;
    mFreqChars = 0;
    mDataThreshold = 0;
  }

  //It is not necessary to receive all data to draw a conclusion. For charset
  //detection, a certain amount of data is enough.
  //Returns true once more than ENOUGH_DATA_THRESHOLD characters were counted.
  bool GotEnoughData() const {return mTotalChars > ENOUGH_DATA_THRESHOLD;}

protected:
  //We do not handle a character based on its original encoding string, but
  //convert this encoding string to a number, here called order.
  //This allows multiple encodings of a language to share one frequency table.
  //The base implementation rejects every character by returning -1.
  virtual int32_t GetOrder(const char* str) {return -1;}

  //If this flag is set to true, detection is done and conclusion has been made
  bool mDone;

  //The number of characters whose frequency order is less than 512
  uint32_t mFreqChars;

  //Total characters encountered.
  uint32_t mTotalChars;

  //Number of hi-byte characters needed to trigger detection
  uint32_t mDataThreshold;

  //Mapping table to get frequency order from char order (get from GetOrder())
  const int16_t *mCharToFreqOrder;

  //Size of above table
  uint32_t mTableSize;

  //This is a constant value that varies from language to language; it is
  //used in calculating confidence.
  float mTypicalDistributionRatio;
};
85 :
86 :
87 : class EUCTWDistributionAnalysis: public CharDistributionAnalysis
88 : {
89 : public:
90 : EUCTWDistributionAnalysis();
91 : protected:
92 :
93 : //for euc-TW encoding, we are interested
94 : // first byte range: 0xc4 -- 0xfe
95 : // second byte range: 0xa1 -- 0xfe
96 : //no validation needed here. State machine has done that
97 : int32_t GetOrder(const char* str)
98 : { if ((unsigned char)*str >= (unsigned char)0xc4)
99 : return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
100 : else
101 : return -1;
102 : }
103 : };
104 :
105 :
106 : class EUCKRDistributionAnalysis : public CharDistributionAnalysis
107 : {
108 : public:
109 : EUCKRDistributionAnalysis();
110 : protected:
111 : //for euc-KR encoding, we are interested
112 : // first byte range: 0xb0 -- 0xfe
113 : // second byte range: 0xa1 -- 0xfe
114 : //no validation needed here. State machine has done that
115 : int32_t GetOrder(const char* str)
116 : { if ((unsigned char)*str >= (unsigned char)0xb0)
117 : return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
118 : else
119 : return -1;
120 : }
121 : };
122 :
123 : class GB2312DistributionAnalysis : public CharDistributionAnalysis
124 : {
125 : public:
126 : GB2312DistributionAnalysis();
127 : protected:
128 : //for GB2312 encoding, we are interested
129 : // first byte range: 0xb0 -- 0xfe
130 : // second byte range: 0xa1 -- 0xfe
131 : //no validation needed here. State machine has done that
132 : int32_t GetOrder(const char* str)
133 : { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
134 : return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
135 : else
136 : return -1;
137 : }
138 : };
139 :
140 :
141 : class Big5DistributionAnalysis : public CharDistributionAnalysis
142 : {
143 : public:
144 : Big5DistributionAnalysis();
145 : protected:
146 : //for big5 encoding, we are interested
147 : // first byte range: 0xa4 -- 0xfe
148 : // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
149 : //no validation needed here. State machine has done that
150 : int32_t GetOrder(const char* str)
151 : { if ((unsigned char)*str >= (unsigned char)0xa4)
152 : if ((unsigned char)str[1] >= (unsigned char)0xa1)
153 : return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
154 : else
155 : return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
156 : else
157 : return -1;
158 : }
159 : };
160 :
161 : class SJISDistributionAnalysis : public CharDistributionAnalysis
162 : {
163 : public:
164 : SJISDistributionAnalysis();
165 : protected:
166 : //for sjis encoding, we are interested
167 : // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
168 : // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
169 : //no validation needed here. State machine has done that
170 0 : int32_t GetOrder(const char* str)
171 : {
172 : int32_t order;
173 0 : if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
174 0 : order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
175 0 : else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
176 0 : order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
177 : else
178 0 : return -1;
179 0 : order += (unsigned char)*(str+1) - 0x40;
180 0 : if ((unsigned char)str[1] > (unsigned char)0x7f)
181 0 : order--;
182 0 : return order;
183 : }
184 : };
185 :
186 : class EUCJPDistributionAnalysis : public CharDistributionAnalysis
187 : {
188 : public:
189 : EUCJPDistributionAnalysis();
190 : protected:
191 : //for euc-JP encoding, we are interested
192 : // first byte range: 0xa0 -- 0xfe
193 : // second byte range: 0xa1 -- 0xfe
194 : //no validation needed here. State machine has done that
195 0 : int32_t GetOrder(const char* str)
196 0 : { if ((unsigned char)*str >= (unsigned char)0xa0)
197 0 : return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
198 : else
199 0 : return -1;
200 : }
201 : };
202 :
203 : #endif //CharDistribution_h__
204 :
|