Line data Source code
1 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : // For Japanese encodings, observe these characteristics:
7 : // 1. kana characters (or hankaku?) often have a high frequency of appearance
8 : // 2. kana characters often occur in groups
9 : // 3. certain combinations of kana are never used in the Japanese language
10 :
11 : #include "nsEUCJPProber.h"
12 : #include "nsDebug.h"
13 :
14 0 : void nsEUCJPProber::Reset(void)
15 : {
16 0 : mCodingSM->Reset();
17 0 : mState = eDetecting;
18 0 : mContextAnalyser.Reset();
19 0 : mDistributionAnalyser.Reset();
20 0 : }
21 :
22 0 : nsProbingState nsEUCJPProber::HandleData(const char* aBuf, uint32_t aLen)
23 : {
24 0 : NS_ASSERTION(aLen, "HandleData called with empty buffer");
25 : nsSMState codingState;
26 :
27 0 : for (uint32_t i = 0; i < aLen; i++)
28 : {
29 0 : codingState = mCodingSM->NextState(aBuf[i]);
30 0 : if (codingState == eItsMe)
31 : {
32 0 : mState = eFoundIt;
33 0 : break;
34 : }
35 0 : if (codingState == eStart)
36 : {
37 0 : uint32_t charLen = mCodingSM->GetCurrentCharLen();
38 :
39 0 : if (i == 0)
40 : {
41 0 : mLastChar[1] = aBuf[0];
42 0 : mContextAnalyser.HandleOneChar(mLastChar, charLen);
43 0 : mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
44 : }
45 : else
46 : {
47 0 : mContextAnalyser.HandleOneChar(aBuf+i-1, charLen);
48 0 : mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
49 : }
50 : }
51 : }
52 :
53 0 : mLastChar[0] = aBuf[aLen-1];
54 :
55 0 : if (mState == eDetecting)
56 0 : if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
57 0 : mState = eFoundIt;
58 :
59 0 : return mState;
60 : }
61 :
62 0 : float nsEUCJPProber::GetConfidence(void)
63 : {
64 0 : float contxtCf = mContextAnalyser.GetConfidence();
65 0 : float distribCf = mDistributionAnalyser.GetConfidence();
66 :
67 0 : return (contxtCf > distribCf ? contxtCf : distribCf);
68 : }
69 :
|