LCOV - code coverage report
Current view: top level - extensions/universalchardet/src/base - CharDistribution.h (source / functions) Hit Total Coverage
Test: output.info Lines: 0 30 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 6 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2             : /* This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       5             : 
       6             : #ifndef CharDistribution_h__
       7             : #define CharDistribution_h__
       8             : 
       9             : #include "nscore.h"
      10             : 
      11             : #define ENOUGH_DATA_THRESHOLD 1024
      12             : //Base class: tallies two-byte characters and how many of them have a frequency order below 512.
      13             : class CharDistributionAnalysis
      14             : {
      15             : public:
      16           0 :   CharDistributionAnalysis() {Reset();}
      17             :   //The constructor above only calls Reset(), which does not set mCharToFreqOrder, mTableSize or mTypicalDistributionRatio.
      18             :   //Feed a block of data and do distribution analysis (the body here is empty, so this is a no-op).
      19             :   void HandleData(const char* aBuf, uint32_t aLen) {}
      20             : 
      21             :   //Feed one character of known byte length; aStr is passed to GetOrder() when aCharLen is 2.
      22           0 :   void HandleOneChar(const char* aStr, uint32_t aCharLen)
      23             :   {
      24             :     int32_t order;
      25             : 
      26             :     //Only 2-byte characters take part in the distribution analysis; any other length gets order -1.
      27           0 :     order = (aCharLen == 2) ? GetOrder(aStr) : -1;
      28             : 
      29           0 :     if (order >= 0)
      30             :     {
      31           0 :       mTotalChars++;
      32             :       //order is non-negative; it counts as frequent only if it is inside the table and maps to a value below 512
      33           0 :       if ((uint32_t)order < mTableSize)
      34             :       {
      35           0 :         if (512 > mCharToFreqOrder[order])
      36           0 :           mFreqChars++;
      37             :       }
      38             :     }
      39           0 :   }
      40             : 
      41             :   //Return confidence based on existing data (only declared here; the definition is not in this header).
      42             :   float GetConfidence(void);
      43             : 
      44             :   //Reset analyser: clears the done flag, both character counters and the data threshold.
      45           0 :   void      Reset()
      46             :   {
      47           0 :     mDone = false;
      48           0 :     mTotalChars = 0;
      49           0 :     mFreqChars = 0;
      50           0 :     mDataThreshold = 0;
      51           0 :   }
      52             : 
      53             :   //It is not necessary to receive all data to draw a conclusion. For charset detection,
      54             :   // a certain amount of data is enough: true once mTotalChars exceeds ENOUGH_DATA_THRESHOLD.
      55             :   bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}
      56             : 
      57             : protected:
      58             :   //A character is not handled by its raw encoded bytes; it is converted to a
      59             :   //number, here called order, so that multiple encodings of a language can share
      60             :   //one frequency table. This base implementation always returns -1.
      61           0 :   virtual int32_t GetOrder(const char* str) {return -1;}
      62             : 
      63             :   //If this flag is set to true, detection is done and a conclusion has been made (this header only clears it, in Reset())
      64             :   bool     mDone;
      65             : 
      66             :   //The number of characters whose frequency order is less than 512
      67             :   uint32_t mFreqChars;
      68             : 
      69             :   //Total characters encountered (only those for which GetOrder() gave a non-negative order are counted).
      70             :   uint32_t mTotalChars;
      71             : 
      72             :   //Number of hi-byte characters needed to trigger detection (set to 0 by Reset(); not read in this header)
      73             :   uint32_t mDataThreshold;
      74             : 
      75             :   //Mapping table to get frequency order from char order (obtained from GetOrder())
      76             :   const int16_t  *mCharToFreqOrder;
      77             : 
      78             :   //Size of the above table; HandleOneChar() compares the order against it before indexing the table
      79             :   uint32_t mTableSize;
      80             : 
      81             :   //This is a constant value that varies from language to language; it is used in
      82             :   //calculating confidence. See my paper for further detail.
      83             :   float    mTypicalDistributionRatio;
      84             : };
      85             : 
      86             : //Distribution analysis for EUC-TW: overrides GetOrder() with the EUC-TW byte-to-order mapping.
      87             : class EUCTWDistributionAnalysis: public CharDistributionAnalysis
      88             : {
      89             : public:
      90             :   EUCTWDistributionAnalysis();
      91             : protected:
      92             :   //Returns 94*(first - 0xc4) + (second - 0xa1), or -1 when the first byte is below 0xc4.
      93             :   //For euc-TW encoding, we are interested in
      94             :   //  first  byte range: 0xc4 -- 0xfe
      95             :   //  second byte range: 0xa1 -- 0xfe
      96             :   //The second byte is not checked here; the state machine has already validated it.
      97             :   int32_t GetOrder(const char* str)
      98             :   { if ((unsigned char)*str >= (unsigned char)0xc4)
      99             :       return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
     100             :     else
     101             :       return -1;
     102             :   }
     103             : };
     104             : 
     105             : //Distribution analysis for EUC-KR: overrides GetOrder() with the EUC-KR byte-to-order mapping.
     106             : class EUCKRDistributionAnalysis : public CharDistributionAnalysis
     107             : {
     108             : public:
     109             :   EUCKRDistributionAnalysis();
     110             : protected:
     111             :   //For euc-KR encoding, we are interested in
     112             :   //  first  byte range: 0xb0 -- 0xfe
     113             :   //  second byte range: 0xa1 -- 0xfe
     114             :   //Returns 94*(first - 0xb0) + (second - 0xa1), or -1 when the first byte is below 0xb0. The second byte is not checked here; the state machine has already validated it.
     115             :   int32_t GetOrder(const char* str)
     116             :   { if ((unsigned char)*str >= (unsigned char)0xb0)
     117             :       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
     118             :     else
     119             :       return -1;
     120             :   }
     121             : };
     122             : //Distribution analysis for GB2312: overrides GetOrder() with the GB2312 byte-to-order mapping.
     123             : class GB2312DistributionAnalysis : public CharDistributionAnalysis
     124             : {
     125             : public:
     126             :   GB2312DistributionAnalysis();
     127             : protected:
     128             :   //For GB2312 encoding, we are interested in
     129             :   //  first  byte range: 0xb0 -- 0xfe
     130             :   //  second byte range: 0xa1 -- 0xfe
     131             :   //Returns 94*(first - 0xb0) + (second - 0xa1), or -1 unless first >= 0xb0 and second >= 0xa1. Upper bounds are not checked here; the state machine has already validated them.
     132             :   int32_t GetOrder(const char* str)
     133             :   { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
     134             :       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
     135             :     else
     136             :       return -1;
     137             :   }
     138             : };
     139             : 
     140             : //Distribution analysis for Big5: overrides GetOrder() with the Big5 byte-to-order mapping.
     141             : class Big5DistributionAnalysis : public CharDistributionAnalysis
     142             : {
     143             : public:
     144             :   Big5DistributionAnalysis();
     145             : protected:
     146             :   //For big5 encoding, we are interested in
     147             :   //  first  byte range: 0xa4 -- 0xfe
     148             :   //  second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
     149             :   //Returns 157*(first - 0xa4) plus an offset derived from the second byte, or -1 when the first byte is below 0xa4. No further validation here; the state machine has done that.
     150             :   int32_t GetOrder(const char* str)
     151             :   { if ((unsigned char)*str >= (unsigned char)0xa4)
     152             :       if ((unsigned char)str[1] >= (unsigned char)0xa1)
     153             :         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63; //63 is the size of the 0x40 -- 0x7e range, so the upper range continues after it
     154             :       else //pairs with the inner if: second byte below 0xa1
     155             :         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
     156             :     else //pairs with the outer if: first byte below 0xa4
     157             :       return -1;
     158             :   }
     159             : };
     160             : //Distribution analysis for Shift_JIS: overrides GetOrder() with the SJIS byte-to-order mapping.
     161             : class SJISDistributionAnalysis : public CharDistributionAnalysis
     162             : {
     163             : public:
     164             :   SJISDistributionAnalysis();
     165             : protected:
     166             :   //For sjis encoding, we are interested in
     167             :   //  first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe (NOTE(review): the code below only accepts 0xe0 -- 0xef; confirm which is intended)
     168             :   //  second byte range: 0x40 -- 0x7e,  0x81 -- 0xfe
     169             :   //The second byte is not validated here; the state machine has done that. Returns -1 for any other first byte.
     170           0 :   int32_t GetOrder(const char* str)
     171             :   {
     172             :     int32_t order;
     173           0 :     if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
     174           0 :       order = 188 * ((unsigned char)str[0]-(unsigned char)0x81); //188 orders are reserved per first byte
     175           0 :     else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
     176           0 :       order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31); //31 is the number of first bytes in 0x81 -- 0x9f
     177             :     else
     178           0 :       return -1;
     179           0 :     order += (unsigned char)*(str+1) - 0x40;
     180           0 :     if ((unsigned char)str[1] > (unsigned char)0x7f) //second bytes above 0x7f are shifted down by one
     181           0 :       order--;
     182           0 :     return order;
     183             :   }
     184             : };
     185             : //Distribution analysis for EUC-JP: overrides GetOrder() with the EUC-JP byte-to-order mapping.
     186             : class EUCJPDistributionAnalysis : public CharDistributionAnalysis
     187             : {
     188             : public:
     189             :   EUCJPDistributionAnalysis();
     190             : protected:
     191             :   //For euc-JP encoding, we are interested in
     192             :   //  first  byte range: 0xa0 -- 0xfe (NOTE(review): the offset below uses 0xa1, so a first byte of 0xa0 gives a negative order for second bytes in the range below, which HandleOneChar() ignores; confirm which bound is intended)
     193             :   //  second byte range: 0xa1 -- 0xfe
     194             :   //Returns 94*(first - 0xa1) + (second - 0xa1), or -1 when the first byte is below 0xa0. The second byte is not checked here; the state machine has already validated it.
     195           0 :   int32_t GetOrder(const char* str)
     196           0 :   { if ((unsigned char)*str >= (unsigned char)0xa0)
     197           0 :       return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
     198             :     else
     199           0 :       return -1;
     200             :   }
     201             : };
     202             : 
     203             : #endif //CharDistribution_h__
     204             : 

Generated by: LCOV version 1.13