LCOV - code coverage report
Current view: top level - intl/unicharutil/util - IrishCasing.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 23 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 2 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2             : /* This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       5             : 
       6             : /******************************************************************************
       7             : 
       8             : This file provides a finite state machine to support Irish Gaelic uppercasing
       9             : rules.
      10             : 
      11             : The caller will need to iterate through a string, passing a State variable
      12             : along with the current character to each UpperCase call and checking the flags
      13             : that are returned:
      14             : 
      15             :   If aMarkPos is true, caller must remember the current index in the string as
      16             :   a possible target for a future action.
      17             : 
      18             :   If aAction is non-zero, then one or more characters from the marked index are
      19             :   to be modified:
      20             :     1  lowercase the marked letter
      21             :     2  lowercase the marked letter and its successor
      22             :     3  lowercase the marked letter, and delete its successor
      23             : 
      24             : 
      25             : ### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639,
      26             : ### comments 1 and 4:
      27             : 
      28             : v = [a,á,e,é,i,í,o,ó,u,ú]
      29             : V = [A,Á,E,É,I,Í,O,Ó,U,Ú]
      30             : 
      31             : bhf -> bhF
      32             : bhF -> bhF
      33             : bp  -> bP
      34             : bP  -> bP
      35             : dt  -> dT
      36             : dT  -> dT
      37             : gc  -> gC
      38             : gC  -> gC
      39             : h{V}  -> h{V}
      40             : mb  -> mB
      41             : mB  -> mB
      42             : n-{v} -> n{V}
      43             : n{V} -> n{V}
      44             : nd  -> nD
      45             : nD  -> nD
      46             : ng  -> nG
      47             : nG  -> nG
      48             : t-{v} -> t{V}
      49             : t{V} -> t{V}
      50             : ts{v} -> tS{V}
      51             : tS{v} -> tS{V}
      52             : tS{V} -> tS{V}
      53             : tsl  -> tSL
      54             : tSl  -> tSL
      55             : tSL  -> tSL
      56             : tsn  -> tSN
      57             : tSn  -> tSN
      58             : tSN  -> tSN
      59             : tsr  -> tSR
      60             : tSr  -> tSR
      61             : tSR  -> tSR
      62             : 
      63             : ### Create table of states and actions for each input class.
      64             : 
      65             : Start (non-word) state is #; generic in-word state is _, once we know there's
      66             : no special action to do in this word.
      67             : 
      68             :          #   _   b   bh  d   g   h   m   n   n-  t   t-  ts
      69             : input\state
      70             : b        b'  _   _   _   _   _   _   1   _   _   _   _   _
      71             : B        _   _   _   _   _   _   _   1   _   _   _   _   _
      72             : c        _   _   _   _   _   1   _   _   _   _   _   _   _
      73             : C        _   _   _   _   _   1   _   _   _   _   _   _   _
      74             : d        d'  _   _   _   _   _   _   _   1   _   _   _   _
      75             : D        _   _   _   _   _   _   _   _   1   _   _   _   _
      76             : f        _   _   _   2   _   _   _   _   _   _   _   _   _
      77             : F        _   _   _   2   _   _   _   _   _   _   _   _   _
      78             : g        g'  _   _   _   _   _   _   _   1   _   _   _   _
      79             : G        _   _   _   _   _   _   _   _   1   _   _   _   _
      80             : h        h'  _   bh  _   _   _   _   _   _   _   _   _   _
      81             : l        _   _   _   _   _   _   _   _   _   _   _   _   1
      82             : L        _   _   _   _   _   _   _   _   _   _   _   _   1
      83             : m        m'  _   _   _   _   _   _   _   _   _   _   _   _
      84             : n        n'  _   _   _   _   _   _   _   _   _   _   _   1
      85             : N        _   _   _   _   _   _   _   _   _   _   _   _   1
      86             : p        _   _   1   _   _   _   _   _   _   _   _   _   _
      87             : P        _   _   1   _   _   _   _   _   _   _   _   _   _
      88             : r        _   _   _   _   _   _   _   _   _   _   _   _   1
      89             : R        _   _   _   _   _   _   _   _   _   _   _   _   1
      90             : s        _   _   _   _   _   _   _   _   _   _   ts  _   _
      91             : S        _   _   _   _   _   _   _   _   _   _   ts  _   _
      92             : t        t'  _   _   _   1   _   _   _   _   _   _   _   _
      93             : T        _   _   _   _   1   _   _   _   _   _   _   _   _
      94             : vowel    _   _   _   _   _   _   _   _   _   1d  _   1d  1
      95             : Vowel    _   _   _   _   _   _   1   _   1   _   1   _   1
      96             : hyph     _   _   _   _   _   _   _   _   n-  _   t-  _   _
      97             : letter   _   _   _   _   _   _   _   _   _   _   _   _   _
      98             : other    #   #   #   #   #   #   #   #   #   #   #   #   #
      99             : 
     100             : Actions:
     101             :   1            lowercase one letter at start of word
     102             :   2            lowercase two letters at start of word
     103             :   1d           lowercase one letter at start of word, and delete next
     104             :                (and then go to state _, nothing further to do in this word)
     105             : 
     106             : else just go to the given state; suffix ' indicates mark start-of-word.
     107             : 
     108             : ### Consolidate identical states and classes:
     109             : 
     110             :          0   1   2   3   4   5   6   7   8   9   A   B
     111             :          #   _   b   bh  d   g   h   m   n [nt]- t   ts
     112             : input\state
     113             : b        b'  _   _   _   _   _   _   1   _   _   _   _
     114             : B        _   _   _   _   _   _   _   1   _   _   _   _
     115             : [cC]     _   _   _   _   _   1   _   _   _   _   _   _
     116             : d        d'  _   _   _   _   _   _   _   1   _   _   _
     117             : [DG]     _   _   _   _   _   _   _   _   1   _   _   _
     118             : [fF]     _   _   _   2   _   _   _   _   _   _   _   _
     119             : g        g'  _   _   _   _   _   _   _   1   _   _   _
     120             : h        h'  _   bh  _   _   _   _   _   _   _   _   _
     121             : [lLNrR]  _   _   _   _   _   _   _   _   _   _   _   1
     122             : m        m'  _   _   _   _   _   _   _   _   _   _   _
     123             : n        n'  _   _   _   _   _   _   _   _   _   _   1
     124             : [pP]     _   _   1   _   _   _   _   _   _   _   _   _
     125             : [sS]     _   _   _   _   _   _   _   _   _   _   ts  _
     126             : t        t'  _   _   _   1   _   _   _   _   _   _   _
     127             : T        _   _   _   _   1   _   _   _   _   _   _   _
     128             : vowel    _   _   _   _   _   _   _   _   _   1d  _   1
     129             : Vowel    _   _   _   _   _   _   1   _   1   _   1   1
     130             : hyph     _   _   _   _   _   _   _   _ [nt]- _ [nt]- _
     131             : letter   _   _   _   _   _   _   _   _   _   _   _   _
     132             : other    #   #   #   #   #   #   #   #   #   #   #   #
     133             : 
     134             : So we have 20 input classes, and 12 states.
     135             : 
     136             : State table array will contain bytes that encode action and new state:
     137             : 
     138             :   0x80  -  bit flag: mark start-of-word position
     139             :   0x40  -  currently unused
     140             :   0x30  -  action mask: 4 values
     141             :            0x00  -  do nothing
     142             :            0x10  -  lowercase one letter
     143             :            0x20  -  lowercase two letters
     144             :            0x30  -  lowercase one, delete one
     145             :   0x0F  -  next-state mask
     146             : ******************************************************************************/
     147             : 
     148             : #include "IrishCasing.h"
     149             : 
     150             : #include "nsUnicodeProperties.h"
     151             : #include "nsUnicharUtils.h"
     152             : 
     153             : namespace mozilla {
     154             : 
     155             : const uint8_t // Transition table indexed [input class][current state]; each byte is mark flag (0x80) | action (0x30) | next state (0x0F).
     156             : IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = {
     157             : //  #     _     b     bh    d     g     h     m     n     [nt]- t     ts
     158             :   { 0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // b
     159             :   { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // B
     160             :   { 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [cC] (after g: lowercase the marked g, then continue in state _, not #)
     161             :   { 0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // d
     162             :   { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // [DG]
     163             :   { 0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [fF]
     164             :   { 0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // g
     165             :   { 0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // h
     166             :   { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // [lLNrR]
     167             :   { 0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // m
     168             :   { 0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // n
     169             :   { 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [pP]
     170             :   { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, 0x01 }, // [sS]
     171             :   { 0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // t
     172             :   { 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // T
     173             :   { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01, 0x11 }, // vowel
     174             :   { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11, 0x11 }, // Vowel
     175             :   { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09, 0x01 }, // hyph
     176             :   { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // letter
     177             :   { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }  // other
     178             : };
     179             : 
     180             : #define HYPHEN          0x2010 // U+2010 HYPHEN (treated like ASCII '-' by GetClass)
     181             : #define NO_BREAK_HYPHEN 0x2011 // U+2011 NON-BREAKING HYPHEN
     182             : #define a_ACUTE         0x00e1 // lowercase acute-accented vowels: á
     183             : #define e_ACUTE         0x00e9 // é
     184             : #define i_ACUTE         0x00ed // í
     185             : #define o_ACUTE         0x00f3 // ó
     186             : #define u_ACUTE         0x00fa // ú
     187             : #define A_ACUTE         0x00c1 // uppercase acute-accented vowels: Á
     188             : #define E_ACUTE         0x00c9 // É
     189             : #define I_ACUTE         0x00cd // Í
     190             : #define O_ACUTE         0x00d3 // Ó
     191             : #define U_ACUTE         0x00da // Ú
     192             : 
     193             : const uint8_t IrishCasing::sLcClasses[26] = { // input class for each ASCII lowercase letter, indexed by (ch - 'a')
     194             :   kClass_vowel, kClass_b, kClass_cC, kClass_d, kClass_vowel, // a b c d e
     195             :   kClass_fF, kClass_g, kClass_h, kClass_vowel, kClass_letter, // f g h i j
     196             :   kClass_letter, kClass_lLNrR, kClass_m, kClass_n, kClass_vowel, // k l m n o
     197             :   kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_t, // p q r s t
     198             :   kClass_vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter, // u v w x y
     199             :   kClass_letter // z
     200             : };
     201             : 
     202             : const uint8_t IrishCasing::sUcClasses[26] = { // input class for each ASCII uppercase letter, indexed by (ch - 'A')
     203             :   kClass_Vowel, kClass_B, kClass_cC, kClass_DG, kClass_Vowel, // A B C D E
     204             :   kClass_fF, kClass_DG, kClass_letter, kClass_Vowel, kClass_letter, // F G H I J (H has no special uppercase rule)
     205             :   kClass_letter, kClass_lLNrR, kClass_letter, kClass_lLNrR, kClass_Vowel, // K L M N O (M is a plain letter; N shares a class with l/L/r/R)
     206             :   kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_T, // P Q R S T
     207             :   kClass_Vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter, // U V W X Y
     208             :   kClass_letter // Z
     209             : };
     210             : 
     211             : uint8_t // Returns the kClass_* input class of code point aCh; UpperCase uses it as the row index into sUppercaseStateTable.
     212           0 : IrishCasing::GetClass(uint32_t aCh)
     213             : {
     214             :   using mozilla::unicode::GetGenCategory;
     215           0 :   if (aCh >= 'a' && aCh <= 'z') { // ASCII lowercase: direct table lookup
     216           0 :     return sLcClasses[aCh - 'a'];
     217           0 :   } else if (aCh >= 'A' && aCh <= 'Z') { // ASCII uppercase: direct table lookup
     218           0 :     return sUcClasses[aCh - 'A'];
     219           0 :   } else if (GetGenCategory(aCh) == nsUGenCategory::kLetter) { // any other letter (ASCII letters were handled above)
     220           0 :     if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE || // acute-accented vowels are classified like the plain vowels
     221           0 :         aCh == o_ACUTE || aCh == u_ACUTE) {
     222           0 :       return kClass_vowel;
     223           0 :     } else if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE ||
     224           0 :                aCh == O_ACUTE || aCh == U_ACUTE) {
     225           0 :       return kClass_Vowel;
     226             :     } else {
     227           0 :       return kClass_letter; // a letter with no special role in the Irish rules
     228             :     }
     229           0 :   } else if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) { // ASCII hyphen-minus, U+2010 or U+2011
     230           0 :     return kClass_hyph;
     231             :   } else {
     232           0 :     return kClass_other; // not a letter or hyphen; the table's "other" row sends every state to 0 (#)
     233             :   }
     234             : }
     235             : 
     236             : uint32_t // Advances the state machine by one character and returns ToUpperCase(aCh); outputs tell the caller what to do at the marked index.
     237           0 : IrishCasing::UpperCase(uint32_t aCh, State& aState, // aState: in = current state, out = next state
     238             :                        bool& aMarkPos, uint8_t& aAction) // aMarkPos: out, true if caller should remember this index; aAction: out, 0 = none, 1-3 per the file header
     239             : {
     240           0 :   uint8_t cls = GetClass(aCh);
     241           0 :   uint8_t stateEntry = sUppercaseStateTable[cls][aState]; // packed byte: mark flag | action | next state
     242           0 :   aMarkPos = !!(stateEntry & kMarkPositionFlag); // !! collapses the masked bit to a bool
     243           0 :   aAction = (stateEntry & kActionMask) >> kActionShift;
     244           0 :   aState = State(stateEntry & kNextStateMask);
     245             : 
     246           0 :   return ToUpperCase(aCh); // the current character is always uppercased; only the outputs vary with state
     247             : }
     248             : 
     249             : } // namespace mozilla

Generated by: LCOV version 1.13