Line data Source code
1 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #include "nsLatin1Prober.h"
7 : #include <stdio.h>
8 :
// Character classes used by the Latin-1 prober. Each input byte is mapped to
// one of these classes; the values double as row/column indices into the
// class-transition model, so they must stay dense and start at zero.
#define UDF       0   // undefined byte value
#define OTH       1   // other (anything that is not a letter class below)
#define ASC       2   // ASCII capital letter
#define ASS       3   // ASCII small letter
#define ACV       4   // accented capital vowel
#define ACO       5   // accented capital, non-vowel
#define ASV       6   // accented small vowel
#define ASO       7   // accented small, non-vowel
#define CLASS_NUM 8   // number of classes (one past the highest class id)
18 :
19 : static const unsigned char Latin1_CharToClass[] =
20 : {
21 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
22 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
23 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
24 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
25 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
26 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
27 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
28 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
29 : OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
30 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
31 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
32 : ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
33 : OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
34 : ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
35 : ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
36 : ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
37 : OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
38 : OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
39 : UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
40 : OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
41 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
42 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
43 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
44 : OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
45 : ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
46 : ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
47 : ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
48 : ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
49 : ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
50 : ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
51 : ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
52 : ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
53 : };
54 :
55 :
/* Class-transition likelihood model.
 *
 * The table is a flattened 8x8 matrix: the row is the class of the previous
 * character and the column is the class of the current one. Each cell holds a
 * likelihood category:
 *   0 - illegal       (the sequence cannot occur)
 *   1 - very unlikely
 *   2 - normal
 *   3 - very likely
 */
static const unsigned char Latin1ClassModel[] =
{
  /* prev \ cur  UDF OTH ASC ASS ACV ACO ASV ASO */
  /* UDF */       0,  0,  0,  0,  0,  0,  0,  0,
  /* OTH */       0,  3,  3,  3,  3,  3,  3,  3,
  /* ASC */       0,  3,  3,  3,  3,  3,  3,  3,
  /* ASS */       0,  3,  3,  3,  1,  1,  3,  3,
  /* ACV */       0,  3,  3,  3,  1,  2,  1,  2,
  /* ACO */       0,  3,  3,  3,  3,  3,  3,  3,
  /* ASV */       0,  3,  1,  3,  1,  1,  1,  3,
  /* ASO */       0,  3,  1,  3,  1,  1,  3,  3
};
73 :
74 0 : void nsLatin1Prober::Reset(void)
75 : {
76 0 : mState = eDetecting;
77 0 : mLastCharClass = OTH;
78 0 : for (int i = 0; i < FREQ_CAT_NUM; i++)
79 0 : mFreqCounter[i] = 0;
80 0 : }
81 :
82 :
83 0 : nsProbingState nsLatin1Prober::HandleData(const char* aBuf, uint32_t aLen)
84 : {
85 0 : char *newBuf1 = 0;
86 0 : uint32_t newLen1 = 0;
87 :
88 0 : if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
89 0 : newBuf1 = (char*)aBuf;
90 0 : newLen1 = aLen;
91 : }
92 :
93 : unsigned char charClass;
94 : unsigned char freq;
95 0 : for (uint32_t i = 0; i < newLen1; i++)
96 : {
97 0 : charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
98 0 : freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
99 0 : if (freq == 0) {
100 0 : mState = eNotMe;
101 0 : break;
102 : }
103 0 : mFreqCounter[freq]++;
104 0 : mLastCharClass = charClass;
105 : }
106 :
107 0 : if (newBuf1 != aBuf)
108 0 : free(newBuf1);
109 :
110 0 : return mState;
111 : }
112 :
113 0 : float nsLatin1Prober::GetConfidence(void)
114 : {
115 0 : if (mState == eNotMe)
116 0 : return 0.01f;
117 :
118 : float confidence;
119 0 : uint32_t total = 0;
120 0 : for (int32_t i = 0; i < FREQ_CAT_NUM; i++)
121 0 : total += mFreqCounter[i];
122 :
123 0 : if(!total)
124 0 : confidence = 0.0f;
125 : else
126 : {
127 0 : confidence = mFreqCounter[3]*1.0f / total;
128 0 : confidence -= mFreqCounter[1]*20.0f/total;
129 : }
130 :
131 0 : if (confidence < 0.0f)
132 0 : confidence = 0.0f;
133 :
134 : // lower the confidence of latin1 so that other more accurate detector
135 : // can take priority.
136 0 : confidence *= 0.50f;
137 :
138 0 : return confidence;
139 : }
140 :
#ifdef DEBUG_chardet
// Debug helper, compiled only when DEBUG_chardet is defined: prints the
// current confidence and charset name of this prober to stdout.
void nsLatin1Prober::DumpStatus()
{
  printf(" Latin1Prober: %1.3f [%s]\r\n",
         static_cast<double>(GetConfidence()),
         GetCharSetName());
}
#endif
147 :
148 :
|