LCOV - output.info - intl/lwbrk/nsJISx4051LineBreaker.cpp

LCOV - code coverage report

Current view:	top level - intl/lwbrk - nsJISx4051LineBreaker.cpp (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	152	355	42.8 %
Date:	2017-07-14 16:53:18	Functions:	25	38	65.8 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2             : /* This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       5             : 
       6             : 
       7             : 
       8             : #include "nsJISx4051LineBreaker.h"
       9             : 
      10             : #include "jisx4051class.h"
      11             : #include "nsComplexBreaker.h"
      12             : #include "nsTArray.h"
      13             : #include "nsUnicodeProperties.h"
      14             : 
      15             : using namespace mozilla::unicode;
      16             : 
      17             : /*
      18             : 
      19             :    Simplification of Pair Table in JIS X 4051
      20             : 
      21             :    1. The Original Table - in 4.1.3
      22             : 
      23             :    In JIS X 4051, the pair table is defined as below
      24             : 
      25             :    Class of
      26             :    Leading    Class of Trailing Char Class
      27             :    Char
      28             : 
      29             :               1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20
      30             :                                                  *  #  *  #
      31             :         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E
      32             :         2        X  X  X  X  X                                               X
      33             :         3        X  X  X  X  X                                               X
      34             :         4        X  X  X  X  X                                               X
      35             :         5        X  X  X  X  X                                               X
      36             :         6        X  X  X  X  X                                               X
      37             :         7        X  X  X  X  X  X                                            X
      38             :         8        X  X  X  X  X                                X              E
      39             :         9        X  X  X  X  X                                               X
      40             :        10        X  X  X  X  X                                               X
      41             :        11        X  X  X  X  X                                               X
      42             :        12        X  X  X  X  X                                               X
      43             :        13        X  X  X  X  X                    X                          X
      44             :        14        X  X  X  X  X                          X                    X
      45             :        15        X  X  X  X  X        X                       X        X     X
      46             :        16        X  X  X  X  X                                   X     X     X
      47             :        17        X  X  X  X  X                                               E
      48             :        18        X  X  X  X  X                                X  X     X     X
      49             :        19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E
      50             :        20        X  X  X  X  X                                               E
      51             : 
      52             :    * Same Char
      53             :    # Other Char
      54             : 
      55             :    X Cannot Break
      56             : 
      57             :    The classes mean:
      58             :       1: Open parenthesis
      59             :       2: Close parenthesis
      60             :       3: Prohibit a line break before
      61             :       4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
      62             :       5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
      63             :       6: Full stop
      64             :       7: Non-breakable between same characters
      65             :       8: Prefix (e.g., "$", "NO.")
      66             :       9: Postfix (e.g., "%")
      67             :      10: Ideographic space
      68             :      11: Hiragana
      69             :      12: Japanese characters (except class 11)
      70             :      13: Subscript
      71             :      14: Ruby
      72             :      15: Numeric
      73             :      16: Alphabet
      74             :      17: Space for Western language
      75             :      18: Western characters (except class 17)
      76             :      19: Split line note (Warichu) begin quote
      77             :      20: Split line note (Warichu) end quote
      78             : 
      79             :    2. Simplified by removing the classes which we do not care about
      80             : 
      81             :    However, since we do not care about class 13 (Subscript), 14 (Ruby),
      82             :    16 (Alphabet), 19 (split line note begin quote), and 20 (split line note end
      83             :    quote), we can simplify this pair table into the following
      84             : 
      85             :    Class of
      86             :    Leading    Class of Trailing Char Class
      87             :    Char
      88             : 
      89             :               1  2  3  4  5  6  7  8  9 10 11 12 15 17 18
      90             : 
      91             :         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X
      92             :         2        X  X  X  X  X
      93             :         3        X  X  X  X  X
      94             :         4        X  X  X  X  X
      95             :         5        X  X  X  X  X
      96             :         6        X  X  X  X  X
      97             :         7        X  X  X  X  X  X
      98             :         8        X  X  X  X  X                    X
      99             :         9        X  X  X  X  X
     100             :        10        X  X  X  X  X
     101             :        11        X  X  X  X  X
     102             :        12        X  X  X  X  X
     103             :        15        X  X  X  X  X        X           X     X
     104             :        17        X  X  X  X  X
     105             :        18        X  X  X  X  X                    X     X
     106             : 
     107             :    3. Simplified by merging classes
     108             : 
     109             :    After the second simplification, the pair table has some duplication:
     110             :    a. classes 2, 3, 4, 5, 6 are the same - we can merge them
     111             :    b. classes 10, 11, 12, 17 are the same - we can merge them
     112             : 
     113             : 
     114             :    Class of
     115             :    Leading    Class of Trailing Char Class
     116             :    Char
     117             : 
     118             :               1 [a] 7  8  9 [b]15 18
     119             : 
     120             :         1     X  X  X  X  X  X  X  X
     121             :       [a]        X
     122             :         7        X  X
     123             :         8        X              X
     124             :         9        X
     125             :       [b]        X
     126             :        15        X        X     X  X
     127             :        18        X              X  X
     128             : 
     129             : 
     130             :    4. We add COMPLEX characters and make them breakable with all other classes
     131             :       except after class 1 and before class [a]
     132             : 
     133             :    Class of
     134             :    Leading    Class of Trailing Char Class
     135             :    Char
     136             : 
     137             :               1 [a] 7  8  9 [b]15 18 COMPLEX
     138             : 
     139             :         1     X  X  X  X  X  X  X  X  X
     140             :       [a]        X
     141             :         7        X  X
     142             :         8        X              X
     143             :         9        X
     144             :       [b]        X
     145             :        15        X        X     X  X
     146             :        18        X              X  X
     147             :   COMPLEX        X                    T
     148             : 
     149             :      T : need special handling
     150             : 
     151             : 
     152             :    5. However, we need two special classes for some punctuation/parentheses
     153             :       whose breaking rules are like those of character class (18), see bug 389056.
     154             :       We also need them for punctuation-like characters that behave the same as
     155             :       class 18 but are not letters in any language (e.g., '_').
     156             :       [c]. Based on open parenthesis class (1), but it is not breakable after
     157             :            character class (18) or numeric class (15).
     158             :       [d]. Based on close parenthesis (or punctuation) class (2), but it is not
     159             :            breakable before character class (18) or numeric class (15).
     160             : 
     161             :    Class of
     162             :    Leading    Class of Trailing Char Class
     163             :    Char
     164             : 
     165             :               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d]
     166             : 
     167             :         1     X  X  X  X  X  X  X  X  X       X    X
     168             :       [a]        X                            X    X
     169             :         7        X  X
     170             :         8        X              X
     171             :         9        X
     172             :       [b]        X                                 X
     173             :        15        X        X     X  X          X    X
     174             :        18        X              X  X          X    X
     175             :   COMPLEX        X                    T
     176             :       [c]     X  X  X  X  X  X  X  X  X       X    X
     177             :       [d]        X              X  X               X
     178             : 
     179             : 
     180             :    6. And Unicode has "NON-BREAK" characters. Lines should not be broken around
     181             :       them. JIS X 4051 has no such class, therefore we create [e].
     182             : 
     183             :    Class of
     184             :    Leading    Class of Trailing Char Class
     185             :    Char
     186             : 
     187             :               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
     188             : 
     189             :         1     X  X  X  X  X  X  X  X  X       X    X   X
     190             :       [a]        X                                 X   X
     191             :         7        X  X                                  X
     192             :         8        X              X                      X
     193             :         9        X                                     X
     194             :       [b]        X                                 X   X
     195             :        15        X        X     X  X          X    X   X
     196             :        18        X              X  X          X    X   X
     197             :   COMPLEX        X                    T                X
     198             :       [c]     X  X  X  X  X  X  X  X  X       X    X   X
     199             :       [d]        X              X  X               X   X
     200             :       [e]     X  X  X  X  X  X  X  X  X       X    X   X
     201             : 
     202             : 
     203             :    7. Now we use one bit to encode whether it is breakable (1 = cannot break),
     204             :       and use 2 bytes for one row, then the bit table will look like:
     205             : 
     206             :                  18    <-   1
     207             : 
     208             :        1  0000 1111 1111 1111  = 0x0FFF
     209             :       [a] 0000 1100 0000 0010  = 0x0C02
     210             :        7  0000 1000 0000 0110  = 0x0806
     211             :        8  0000 1000 0100 0010  = 0x0842
     212             :        9  0000 1000 0000 0010  = 0x0802
     213             :       [b] 0000 1100 0000 0010  = 0x0C02
     214             :       15  0000 1110 1101 0010  = 0x0ED2
     215             :       18  0000 1110 1100 0010  = 0x0EC2
     216             :  COMPLEX  0000 1001 0000 0010  = 0x0902
     217             :       [c] 0000 1111 1111 1111  = 0x0FFF
     218             :       [d] 0000 1100 1100 0010  = 0x0CC2
     219             :       [e] 0000 1111 1111 1111  = 0x0FFF
     220             : */
     221             : 
     222             : #define MAX_CLASSES 12
     223             : 
     224             : static const uint16_t gPair[MAX_CLASSES] = {
     225             :   0x0FFF,
     226             :   0x0C02,
     227             :   0x0806,
     228             :   0x0842,
     229             :   0x0802,
     230             :   0x0C02,
     231             :   0x0ED2,
     232             :   0x0EC2,
     233             :   0x0902,
     234             :   0x0FFF,
     235             :   0x0CC2,
     236             :   0x0FFF
     237             : };
     238             : 
     239             : 
     240             : /*
     241             : 
     242             :    8. And if the character is not far enough from the word start, the word end
     243             :       and another break point, we should not break in non-CJK languages.
     244             :       I.e., don't break around 15, 18, [c] and [d], but don't change
     245             :       that if they are related to [b].
     246             : 
     247             :    Class of
     248             :    Leading    Class of Trailing Char Class
     249             :    Char
     250             : 
     251             :               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
     252             : 
     253             :         1     X  X  X  X  X  X  X  X  X       X    X   X
     254             :       [a]        X              X  X          X    X   X
     255             :         7        X  X           X  X          X    X   X
     256             :         8        X              X  X          X    X   X
     257             :         9        X              X  X          X    X   X
     258             :       [b]        X                                 X   X
     259             :        15     X  X  X  X  X     X  X  X       X    X   X
     260             :        18     X  X  X  X  X     X  X  X       X    X   X
     261             :   COMPLEX        X              X  X  T       X    X   X
     262             :       [c]     X  X  X  X  X  X  X  X  X       X    X   X
     263             :       [d]     X  X  X  X  X     X  X  X       X    X   X
     264             :       [e]     X  X  X  X  X  X  X  X  X       X    X   X
     265             : 
     266             :                  18    <-   1
     267             : 
     268             :        1  0000 1111 1111 1111  = 0x0FFF
     269             :       [a] 0000 1110 1100 0010  = 0x0EC2
     270             :        7  0000 1110 1100 0110  = 0x0EC6
     271             :        8  0000 1110 1100 0010  = 0x0EC2
     272             :        9  0000 1110 1100 0010  = 0x0EC2
     273             :       [b] 0000 1100 0000 0010  = 0x0C02
     274             :       15  0000 1111 1101 1111  = 0x0FDF
     275             :       18  0000 1111 1101 1111  = 0x0FDF
     276             :  COMPLEX  0000 1111 1100 0010  = 0x0FC2
     277             :       [c] 0000 1111 1111 1111  = 0x0FFF
     278             :       [d] 0000 1111 1101 1111  = 0x0FDF
     279             :       [e] 0000 1111 1111 1111  = 0x0FFF
     280             : */
     281             : 
     282             : static const uint16_t gPairConservative[MAX_CLASSES] = {
     283             :   0x0FFF,
     284             :   0x0EC2,
     285             :   0x0EC6,
     286             :   0x0EC2,
     287             :   0x0EC2,
     288             :   0x0C02,
     289             :   0x0FDF,
     290             :   0x0FDF,
     291             :   0x0FC2,
     292             :   0x0FFF,
     293             :   0x0FDF,
     294             :   0x0FFF
     295             : };
     296             : 
     297             : 
     298             : /*
     299             : 
     300             :    9. Now we map the class to number
     301             : 
     302             :       0: 1
     303             :       1: [a]- 2, 3, 4, 5, 6
     304             :       2: 7
     305             :       3: 8
     306             :       4: 9
     307             :       5: [b]- 10, 11, 12, 17
     308             :       6: 15
     309             :       7: 18
     310             :       8: COMPLEX
     311             :       9: [c]
     312             :       A: [d]
     313             :       B: [e]
     314             : 
     315             :     and they mean:
     316             :       0: Open parenthesis
     317             :       1: Punctuation that prohibits break before
     318             :       2: Non-breakable between same classes
     319             :       3: Prefix
     320             :       4: Postfix
     321             :       5: Breakable character (Spaces and Most Japanese characters)
     322             :       6: Numeric
     323             :       7: Characters
     324             :       8: Need special handling characters (E.g., Thai)
     325             :       9: Open parenthesis like Character (See bug 389056)
     326             :       A: Close parenthesis (or punctuation) like Character (See bug 389056)
     327             :       B: Non breakable (See bug 390920)
     328             : 
     329             : */
     330             : 
     331             : #define CLASS_NONE                             INT8_MAX
     332             : 
     333             : #define CLASS_OPEN                             0x00
     334             : #define CLASS_CLOSE                            0x01
     335             : #define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
     336             : #define CLASS_PREFIX                           0x03
     337             : #define CLASS_POSTFFIX                         0x04
     338             : #define CLASS_BREAKABLE                        0x05
     339             : #define CLASS_NUMERIC                          0x06
     340             : #define CLASS_CHARACTER                        0x07
     341             : #define CLASS_COMPLEX                          0x08
     342             : #define CLASS_OPEN_LIKE_CHARACTER              0x09
     343             : #define CLASS_CLOSE_LIKE_CHARACTER             0x0A
     344             : #define CLASS_NON_BREAKABLE                    0x0B
     345             : 
     346             : #define U_NULL      char16_t(0x0000)
     347             : #define U_SLASH     char16_t('/')
     348             : #define U_SPACE     char16_t(' ')
     349             : #define U_HYPHEN    char16_t('-')
     350             : #define U_EQUAL     char16_t('=')
     351             : #define U_PERCENT   char16_t('%')
     352             : #define U_AMPERSAND char16_t('&')
     353             : #define U_SEMICOLON char16_t(';')
     354             : #define U_BACKSLASH char16_t('\\')
     355             : #define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
     356             : #define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
     357             : #define U_OPEN_GUILLEMET    char16_t(0x00AB)
     358             : 
     359             : #define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \
     360             :                                      (c) == U_SLASH || \
     361             :                                      (c) == U_PERCENT || \
     362             :                                      (c) == U_AMPERSAND || \
     363             :                                      (c) == U_SEMICOLON || \
     364             :                                      (c) == U_BACKSLASH || \
     365             :                                      (c) == U_OPEN_SINGLE_QUOTE || \
     366             :                                      (c) == U_OPEN_DOUBLE_QUOTE || \
     367             :                                      (c) == U_OPEN_GUILLEMET)
     368             : 
     369             : #define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
     370             : 
     371             : static inline int
     372         106 : GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
     373             : {
     374         106 :   return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f);
     375             : }
     376             : 
     377             : static inline int
     378           0 : IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
     379             : {
     380           0 :   return ((0xff66 <= (u)) && ((u) <= 0xff70));
     381             : }
     382             : 
     383             : static inline int
     384         109 : IS_CJK_CHAR(char32_t u)
     385             : {
     386           0 :   return ((0x1100 <= (u) && (u) <= 0x11ff) ||
     387           0 :           (0x2e80 <= (u) && (u) <= 0xd7ff) ||
     388           0 :           (0xf900 <= (u) && (u) <= 0xfaff) ||
     389         109 :           (0xff00 <= (u) && (u) <= 0xffef) ||
     390         109 :           (0x20000 <= (u) && (u) <= 0x2fffd));
     391             : }
     392             : 
     393             : static inline bool
     394         110 : IS_NONBREAKABLE_SPACE(char16_t u)
     395             : {
     396         110 :   return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE
     397             : }
     398             : 
     399             : static inline bool
     400         116 : IS_HYPHEN(char16_t u)
     401             : {
     402         116 :   return (u == U_HYPHEN ||
     403         116 :           u == 0x058A || // ARMENIAN HYPHEN
     404         116 :           u == 0x2010 || // HYPHEN
     405         232 :           u == 0x2012 || // FIGURE DASH
     406         116 :           u == 0x2013);  // EN DASH
     407             : }
     408             : 
     409             : static int8_t
     410         106 : GetClass(uint32_t u)
     411             : {
     412         106 :   if (u < 0x10000) {
     413         106 :     uint16_t h = u & 0xFF00;
     414         106 :     uint16_t l = u & 0x00ff;
     415             : 
     416             :     // Handle 3 range table first
     417         106 :     if (0x0000 == h) {
     418         106 :       return GETCLASSFROMTABLE(gLBClass00, l);
     419             :     }
     420           0 :     if (0x1700 == h) {
     421           0 :       return GETCLASSFROMTABLE(gLBClass17, l);
     422             :     }
     423           0 :     if (NS_NeedsPlatformNativeHandling(u)) {
     424           0 :       return CLASS_COMPLEX;
     425             :     }
     426           0 :     if (0x0E00 == h) {
     427           0 :       return GETCLASSFROMTABLE(gLBClass0E, l);
     428             :     }
     429           0 :     if (0x2000 == h) {
     430           0 :       return GETCLASSFROMTABLE(gLBClass20, l);
     431             :     }
     432           0 :     if (0x2100 == h) {
     433           0 :       return GETCLASSFROMTABLE(gLBClass21, l);
     434             :     }
     435           0 :     if (0x3000 == h) {
     436           0 :       return GETCLASSFROMTABLE(gLBClass30, l);
     437             :     }
     438           0 :     if (0xff00 == h) {
     439           0 :       if (l < 0x0060) { // Fullwidth ASCII variant
     440           0 :         return GETCLASSFROMTABLE(gLBClass00, (l+0x20));
     441             :       }
     442           0 :       if (l < 0x00a0) { // Halfwidth Katakana variants
     443           0 :         switch (l) {
     444           0 :         case 0x61: return GetClass(0x3002);
     445           0 :         case 0x62: return GetClass(0x300c);
     446           0 :         case 0x63: return GetClass(0x300d);
     447           0 :         case 0x64: return GetClass(0x3001);
     448           0 :         case 0x65: return GetClass(0x30fb);
     449           0 :         case 0x9e: return GetClass(0x309b);
     450           0 :         case 0x9f: return GetClass(0x309c);
     451             :         default:
     452           0 :           if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) {
     453           0 :             return CLASS_CLOSE; // jis x4051 class 3
     454             :           }
     455           0 :           return CLASS_BREAKABLE; // jis x4051 class 11
     456             :         }
     457             :       }
     458           0 :       if (l < 0x00e0) {
     459           0 :         return CLASS_CHARACTER; // Halfwidth Hangul variants
     460             :       }
     461           0 :       if (l < 0x00f0) {
     462             :         static char16_t NarrowFFEx[16] = {
     463             :           0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
     464             :           0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
     465             :         };
     466           0 :         return GetClass(NarrowFFEx[l - 0x00e0]);
     467             :       }
     468           0 :     } else if (0x3100 == h) {
     469           0 :       if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
     470             :                        // XXX: This is per UAX #14, but UAX #14 may change
     471             :                        // the line breaking rules about Kanbun and Bopomofo.
     472           0 :         return CLASS_BREAKABLE;
     473             :       }
     474           0 :       if (l >= 0xf0) { // Katakana small letters for Ainu
     475           0 :         return CLASS_CLOSE;
     476             :       }
     477           0 :     } else if (0x0300 == h) {
     478           0 :       if (0x4F == l || (0x5C <= l && l <= 0x62)) {
     479           0 :         return CLASS_NON_BREAKABLE;
     480             :       }
     481           0 :     } else if (0x0500 == h) {
     482             :       // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
     483           0 :       if (l == 0x8A) {
     484           0 :         return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
     485             :       }
     486           0 :     } else if (0x0F00 == h) {
     487           0 :       if (0x08 == l || 0x0C == l || 0x12 == l) {
     488           0 :         return CLASS_NON_BREAKABLE;
     489             :       }
     490           0 :     } else if (0x1800 == h) {
     491           0 :       if (0x0E == l) {
     492           0 :         return CLASS_NON_BREAKABLE;
     493             :       }
     494           0 :     } else if (0x1600 == h) {
     495           0 :       if (0x80 == l) { // U+1680 OGHAM SPACE MARK
     496           0 :         return CLASS_BREAKABLE;
     497             :       }
     498           0 :     } else if (u == 0xfeff) {
     499           0 :       return CLASS_NON_BREAKABLE;
     500             :     }
     501             :   }
     502             : 
     503             :   // Mapping for Unicode LineBreak.txt classes to the (simplified) set of
     504             :   // character classes used here.
     505             :   // XXX The mappings here were derived by comparing the Unicode LineBreak
     506             :   //     values of BMP characters to the classes our existing GetClass returns
     507             :   //     for the same codepoints; in cases where characters with the same
     508             :   //     LineBreak class mapped to various classes here, I picked what seemed
     509             :   //     the most prevalent equivalence.
     510             :   //     Some of these are unclear to me, but currently they are ONLY used
     511             :   //     for characters not handled by the old code above, so all the JISx405
     512             :   //     special cases should already be accounted for.
     513             :   static const int8_t sUnicodeLineBreakToClass[] = {
     514             :     /* UNKNOWN = 0,                       [XX] */ CLASS_CHARACTER,
     515             :     /* AMBIGUOUS = 1,                     [AI] */ CLASS_CHARACTER,
     516             :     /* ALPHABETIC = 2,                    [AL] */ CLASS_CHARACTER,
     517             :     /* BREAK_BOTH = 3,                    [B2] */ CLASS_CHARACTER,
     518             :     /* BREAK_AFTER = 4,                   [BA] */ CLASS_CHARACTER,
     519             :     /* BREAK_BEFORE = 5,                  [BB] */ CLASS_OPEN_LIKE_CHARACTER,
     520             :     /* MANDATORY_BREAK = 6,               [BK] */ CLASS_CHARACTER,
     521             :     /* CONTINGENT_BREAK = 7,              [CB] */ CLASS_CHARACTER,
     522             :     /* CLOSE_PUNCTUATION = 8,             [CL] */ CLASS_CHARACTER,
     523             :     /* COMBINING_MARK = 9,                [CM] */ CLASS_CHARACTER,
     524             :     /* CARRIAGE_RETURN = 10,              [CR] */ CLASS_BREAKABLE,
     525             :     /* EXCLAMATION = 11,                  [EX] */ CLASS_CHARACTER,
     526             :     /* GLUE = 12,                         [GL] */ CLASS_NON_BREAKABLE,
     527             :     /* HYPHEN = 13,                       [HY] */ CLASS_CHARACTER,
     528             :     /* IDEOGRAPHIC = 14,                  [ID] */ CLASS_BREAKABLE,
     529             :     /* INSEPARABLE = 15,                  [IN] */ CLASS_CLOSE_LIKE_CHARACTER,
     530             :     /* INFIX_NUMERIC = 16,                [IS] */ CLASS_CHARACTER,
     531             :     /* LINE_FEED = 17,                    [LF] */ CLASS_BREAKABLE,
     532             :     /* NONSTARTER = 18,                   [NS] */ CLASS_CLOSE_LIKE_CHARACTER,
     533             :     /* NUMERIC = 19,                      [NU] */ CLASS_CHARACTER,
     534             :     /* OPEN_PUNCTUATION = 20,             [OP] */ CLASS_CHARACTER,
     535             :     /* POSTFIX_NUMERIC = 21,              [PO] */ CLASS_CHARACTER,
     536             :     /* PREFIX_NUMERIC = 22,               [PR] */ CLASS_CHARACTER,
     537             :     /* QUOTATION = 23,                    [QU] */ CLASS_CHARACTER,
     538             :     /* COMPLEX_CONTEXT = 24,              [SA] */ CLASS_CHARACTER,
     539             :     /* SURROGATE = 25,                    [SG] */ CLASS_CHARACTER,
     540             :     /* SPACE = 26,                        [SP] */ CLASS_BREAKABLE,
     541             :     /* BREAK_SYMBOLS = 27,                [SY] */ CLASS_CHARACTER,
     542             :     /* ZWSPACE = 28,                      [ZW] */ CLASS_BREAKABLE,
     543             :     /* NEXT_LINE = 29,                    [NL] */ CLASS_CHARACTER,
     544             :     /* WORD_JOINER = 30,                  [WJ] */ CLASS_NON_BREAKABLE,
     545             :     /* H2 = 31,                           [H2] */ CLASS_BREAKABLE,
     546             :     /* H3 = 32,                           [H3] */ CLASS_BREAKABLE,
     547             :     /* JL = 33,                           [JL] */ CLASS_CHARACTER,
     548             :     /* JT = 34,                           [JT] */ CLASS_CHARACTER,
     549             :     /* JV = 35,                           [JV] */ CLASS_CHARACTER,
     550             :     /* CLOSE_PARENTHESIS = 36,            [CP] */ CLASS_CLOSE_LIKE_CHARACTER,
     551             :     /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE,
     552             :     /* HEBREW_LETTER = 38,                [HL] */ CLASS_CHARACTER,
     553             :     /* REGIONAL_INDICATOR = 39,           [RI] */ CLASS_CHARACTER,
     554             :     /* E_BASE = 40,                       [EB] */ CLASS_BREAKABLE,
     555             :     /* E_MODIFIER = 41,                   [EM] */ CLASS_CHARACTER,
     556             :     /* ZWJ = 42,                          [ZWJ]*/ CLASS_CHARACTER
     557             :   };
     558             : 
     559             : #if ENABLE_INTL_API
     560             :   static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass),
     561             :                 "Gecko vs ICU LineBreak class mismatch");
     562             : #endif
     563             : 
     564           0 :   auto cls = mozilla::unicode::GetLineBreakClass(u);
     565           0 :   MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass));
     566           0 :   return sUnicodeLineBreakToClass[cls];
     567             : }
     568             : 
     569             : static bool
     570          86 : GetPair(int8_t c1, int8_t c2)
     571             : {
                         // Tests bit number c2 of the row gPair[c1] and returns true when that
                         // bit is clear, false when it is set. Callers in this file assign the
                         // result to allowBreak, passing the class of the previous character as
                         // c1 and the class of the current character as c2.
                         // The assertions check only the upper bound of each class value.
     572          86 :   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
     573          86 :   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
     574             : 
     575          86 :   return (0 == ((gPair[c1] >> c2) & 0x0001));
     576             : }
     577             : 
     578             : static bool
     579          20 : GetPairConservative(int8_t c1, int8_t c2)
     580             : {
                         // Same lookup as GetPair(), but against the gPairConservative table:
                         // returns true when bit number c2 of gPairConservative[c1] is clear.
                         // Callers in this file use it instead of GetPair() whenever
                         // ContextState::UseConservativeBreaking() returns true.
     581          20 :   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
     582          20 :   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
     583             : 
     584          20 :   return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
     585             : }
     586             : 
     587           3 : nsJISx4051LineBreaker::nsJISx4051LineBreaker()
     588             : {
                         // Intentionally empty: no initialization is performed in the body.
     589           3 : }
     590             : 
     591           0 : nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
     592             : {
                         // Intentionally empty: no explicit cleanup is performed in the body.
     593           0 : }
     594             : 
     595          24 : NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)
                       // NOTE(review): presumably expands to the nsISupports boilerplate for this
                       // class with nsILineBreaker as its only interface -- confirm against the
                       // macro definition, which is not visible in this file.
     596             : 
     597             : class ContextState {
                       // Scanning state shared by ContextualAnalysis() and both
                       // GetJISx4051Breaks() overloads. It wraps exactly one text buffer (16-bit
                       // or 8-bit; the other pointer is null), tracks the current index and the
                       // index of the most recently recorded break, and remembers which special
                       // characters have been seen so far.
     598             : public:
                         // Wraps 16-bit text. mText is left null, which is how the other methods
                         // tell the two buffer kinds apart.
     599           3 :   ContextState(const char16_t* aText, uint32_t aLength)
     600           3 :     : mUniText(aText)
     601             :     , mText(nullptr)
     602           3 :     , mLength(aLength)
     603             :   {
     604           3 :     Init();
     605           3 :   }
     606             : 
                         // Wraps 8-bit text. mUniText is left null.
     607           1 :   ContextState(const uint8_t* aText, uint32_t aLength)
     608           1 :     : mUniText(nullptr)
     609             :     , mText(aText)
     610           1 :     , mLength(aLength)
     611             :   {
     612           1 :     Init();
     613           1 :   }
     614             : 
                         // Length of the wrapped text and the current scan position, both as
                         // indices into the wrapped buffer.
     615           0 :   uint32_t Length() const { return mLength; }
     616           0 :   uint32_t Index() const { return mIndex; }
     617             : 
     618             :   // This gets a single code unit of the text, without checking for surrogates
     619             :   // (in the case of a 16-bit text buffer). That's OK if we're only checking for
     620             :   // specific characters that are known to be BMP values.
     621           0 :   char16_t GetCodeUnitAt(uint32_t aIndex) const {
     622           0 :     MOZ_ASSERT(aIndex < mLength, "Out of range!");
     623           0 :     return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
     624             :   }
     625             : 
     626             :   // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs
     627             :   // as necessary. It must ONLY be called for 16-bit text, not 8-bit.
                         // A high surrogate that is not followed by a low surrogate (or that is
                         // the last code unit) is returned unchanged.
     628         335 :   char32_t GetUnicodeCharAt(uint32_t aIndex) const {
     629         335 :     MOZ_ASSERT(mUniText, "Only for 16-bit text!");
     630         335 :     MOZ_ASSERT(aIndex < mLength, "Out of range!");
     631         335 :     char32_t c = mUniText[aIndex];
     632         335 :     if (NS_IS_HIGH_SURROGATE(c) && aIndex + 1 < mLength &&
     633           0 :         NS_IS_LOW_SURROGATE(mUniText[aIndex + 1])) {
     634           0 :       c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]);
     635             :     }
     636         335 :     return c;
     637             :   }
     638             : 
                         // Moves the current index forward by one position.
     639         110 :   void AdvanceIndex() {
     640         110 :     ++mIndex;
     641         110 :   }
     642             : 
                         // Records the current index as the position of the most recent break.
     643           3 :   void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
     644             : 
     645             : // A word of western language should not be broken. But even if the word has
     646             : // only ASCII characters, non-natural context words should be broken, e.g.,
     647             : // URL and file path. For protecting the natural words, we should use
     648             : // conservative breaking rules at following conditions:
     649             : //   1. at near the start of word
     650             : //   2. at near the end of word
     651             : //   3. at near the latest broken point
     652             : // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters,
     653             : // which varies depending whether we are looking at a letter or a non-letter
     654             : // character: for non-letters, we use an extended "conservative" range.
     655             : 
     656             : #define CONSERVATIVE_RANGE_LETTER 2
     657             : #define CONSERVATIVE_RANGE_OTHER  6
     658             : 
                         // Returns true when the conservative pair table should be used for the
                         // position mIndex + aOffset. Always false for text that contains a CJK
                         // character. Otherwise true when the position is within the conservative
                         // range of the start of the text, the end of the text, the last recorded
                         // break, or (if the text contains one) a non-breakable space.
     659         111 :   bool UseConservativeBreaking(uint32_t aOffset = 0) const {
     660         111 :     if (mHasCJKChar)
     661           0 :       return false;
     662         111 :     uint32_t index = mIndex + aOffset;
     663             : 
     664             :     // If the character at index is a letter (rather than various punctuation
     665             :     // characters, etc) then we want a shorter "conservative" range
     666             :     uint32_t conservativeRangeStart, conservativeRangeEnd;
     667         301 :     if (index < mLength &&
     668         111 :         nsUGenCategory::kLetter ==
     669         111 :           (mText ? GetGenCategory(mText[index])
     670         111 :                  : GetGenCategory(GetUnicodeCharAt(index)))) {
     671             :       // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start
     672             :       // to get more balanced behavior (if we break off a 2-letter prefix,
     673             :       // that means the break will actually be three letters from start of
     674             :       // word, to include the hyphen; whereas a 2-letter suffix will be
     675             :       // broken only two letters from end of word).
     676          79 :       conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER;
     677          79 :       conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1;
     678             :     } else {
     679          32 :       conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER;
     680             :     }
     681             : 
                           // Near the start of the text, near the end of the text, or near the
                           // last recorded break.
     682         101 :     bool result = (index < conservativeRangeStart ||
     683         207 :                      mLength - index < conservativeRangeEnd ||
     684         207 :                      index - mLastBreakIndex < conservativeRangeStart);
     685         111 :     if (result || !mHasNonbreakableSpace)
     686         111 :       return result;
     687             : 
     688             :     // This text has a non-breakable space, we need to check whether the index
     689             :     // is near it.
     690             : 
     691             :     // Note that index is always larger than conservativeRange here.
                           // Looks at the conservativeRangeStart code units immediately before
                           // index.
     692           0 :     for (uint32_t i = index; index - conservativeRangeStart < i; --i) {
     693           0 :       if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1)))
     694           0 :         return true;
     695             :     }
     696             :     // Note that index is always less than mLength - conservativeRange.
                           // Looks at the conservativeRangeEnd - 1 code units immediately after
                           // index.
     697           0 :     for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) {
     698           0 :       if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i)))
     699           0 :         return true;
     700             :     }
     701           0 :     return false;
     702             :   }
     703             : 
                         // The Has*/NotifySeen* pairs below are sticky flags: once a Notify
                         // method has been called the matching getter keeps returning true; only
                         // Init() resets them.
     704           0 :   bool HasPreviousEqualsSign() const {
     705           0 :     return mHasPreviousEqualsSign;
     706             :   }
     707           1 :   void NotifySeenEqualsSign() {
     708           1 :     mHasPreviousEqualsSign = true;
     709           1 :   }
     710             : 
     711           4 :   bool HasPreviousSlash() const {
     712           4 :     return mHasPreviousSlash;
     713             :   }
     714           5 :   void NotifySeenSlash() {
     715           5 :     mHasPreviousSlash = true;
     716           5 :   }
     717             : 
     718           0 :   bool HasPreviousBackslash() const {
     719           0 :     return mHasPreviousBackslash;
     720             :   }
     721           0 :   void NotifySeenBackslash() {
     722           0 :     mHasPreviousBackslash = true;
     723           0 :   }
     724             : 
                         // Returns the value most recently passed to NotifyNonHyphenCharacter(),
                         // or U_NULL if it has not been called since Init().
     725           0 :   uint32_t GetPreviousNonHyphenCharacter() const {
     726           0 :     return mPreviousNonHyphenCharacter;
     727             :   }
     728         110 :   void NotifyNonHyphenCharacter(uint32_t ch) {
     729         110 :     mPreviousNonHyphenCharacter = ch;
     730         110 :   }
     731             : 
     732             : private:
                         // Resets every mutable member and pre-scans the whole text once to set
                         // mHasNonbreakableSpace and, for 16-bit text only, mHasCJKChar. Called
                         // from both constructors.
     733           4 :   void Init() {
     734           4 :     mIndex = 0;
     735           4 :     mLastBreakIndex = 0;
     736           4 :     mPreviousNonHyphenCharacter = U_NULL;
     737           4 :     mHasCJKChar = false;
     738           4 :     mHasNonbreakableSpace = false;
     739           4 :     mHasPreviousEqualsSign = false;
     740           4 :     mHasPreviousSlash = false;
     741           4 :     mHasPreviousBackslash = false;
     742             : 
     743           4 :     if (mText) {
     744             :       // 8-bit text: we only need to check for &nbsp;
     745           2 :       for (uint32_t i = 0; i < mLength; ++i) {
     746           1 :         if (IS_NONBREAKABLE_SPACE(mText[i])) {
     747           0 :           mHasNonbreakableSpace = true;
     748           0 :           break;
     749             :         }
     750             :       }
     751             :     } else {
     752             :       // 16-bit text: handle surrogates and check for CJK as well as &nbsp;
                             // The scan stops early as soon as both flags have been set.
     753         112 :       for (uint32_t i = 0; i < mLength; ++i) {
     754         109 :         char32_t u = GetUnicodeCharAt(i);
     755         109 :         if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) {
     756           0 :           mHasNonbreakableSpace = true;
     757           0 :           if (mHasCJKChar) {
     758           0 :             break;
     759             :           }
     760         109 :         } else if (!mHasCJKChar && IS_CJK_CHAR(u)) {
                                 // Integer 1 assigned to a bool member; equivalent to true.
     761           0 :           mHasCJKChar = 1;
     762           0 :           if (mHasNonbreakableSpace) {
     763           0 :             break;
     764             :           }
     765             :         }
     766         109 :         if (u > 0xFFFFu) {
     767           0 :           ++i; // step over trailing low surrogate
     768             :         }
     769             :       }
     770             :     }
     771           4 :   }
     772             : 
                         // Exactly one of these two pointers is non-null (see the constructors).
     773             :   const char16_t* const mUniText;
     774             :   const uint8_t* const mText;
     775             : 
                         // Current scan position; advanced only by AdvanceIndex().
     776             :   uint32_t mIndex;
     777             :   const uint32_t mLength;         // length of text
                         // Value of mIndex at the last NotifyBreakBefore() call (0 initially).
     778             :   uint32_t mLastBreakIndex;
     779             :   char32_t mPreviousNonHyphenCharacter; // The last character we have seen
     780             :                                          // which is not U_HYPHEN
     781             :   bool mHasCJKChar; // if the text has CJK character, this is true.
     782             :   bool mHasNonbreakableSpace; // if the text has a non-breakable space,
     783             :                                      // this is true.
     784             :   bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
     785             :   bool mHasPreviousSlash;      // True if we have seen a U_SLASH
     786             :   bool mHasPreviousBackslash;  // True if we have seen a U_BACKSLASH
     787             : };
     788             : 
     789             : static int8_t
     790           6 : ContextualAnalysis(char32_t prev, char32_t cur, char32_t next,
     791             :                    ContextState &aState)
     792             : {
                         // Chooses the line-break class for |cur| from its neighbours |prev| and
                         // |next| and from what aState has recorded so far. The callers in this
                         // file invoke it only when NEED_CONTEXTUAL_ANALYSIS(cur) is true and pass
                         // 0 / U_NULL for a missing neighbour. Whenever no contextual rule
                         // applies, the result is GetClass(cur).
                         // Side effects on aState: records |cur| as the previous non-hyphen
                         // character (non-hyphen branch only) and sets the slash / backslash
                         // "seen" flags.
     793             :   // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
                         // NOTE(review): ContextState as defined in this file has no UseJISX4051
                         // member, so the comment above looks stale -- confirm and update.
     794             : 
     795           6 :   if (IS_HYPHEN(cur)) {
     796             :     // If next character is hyphen, we don't need to break between them.
     797           0 :     if (IS_HYPHEN(next))
     798           0 :       return CLASS_CHARACTER;
     799             :     // If prev and next characters are numeric, it may be in Math context.
     800             :     // So, we should not break here.
     801           0 :     bool prevIsNum = IS_ASCII_DIGIT(prev);
     802           0 :     bool nextIsNum = IS_ASCII_DIGIT(next);
     803           0 :     if (prevIsNum && nextIsNum)
     804           0 :       return CLASS_NUMERIC;
     805             :     // If one side is numeric and the other is a character, or if both sides are
     806             :     // characters, the hyphen should be breakable.
                           // The conservative check is made for the position after the hyphen
                           // (offset 1). The "previous" side is the last non-hyphen character
                           // recorded in aState, which is not necessarily |prev|.
     807           0 :     if (!aState.UseConservativeBreaking(1)) {
     808           0 :       char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
     809           0 :       if (prevOfHyphen && next) {
     810           0 :         int8_t prevClass = GetClass(prevOfHyphen);
     811           0 :         int8_t nextClass = GetClass(next);
     812             :         bool prevIsNumOrCharOrClose =
     813           0 :           prevIsNum ||
     814           0 :           (prevClass == CLASS_CHARACTER &&
     815           0 :             !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
     816           0 :           prevClass == CLASS_CLOSE ||
     817           0 :           prevClass == CLASS_CLOSE_LIKE_CHARACTER;
     818             :         bool nextIsNumOrCharOrOpen =
     819           0 :           nextIsNum ||
     820           0 :           (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
     821           0 :           nextClass == CLASS_OPEN ||
     822           0 :           nextClass == CLASS_OPEN_LIKE_CHARACTER ||
     823           0 :           next == U_OPEN_SINGLE_QUOTE ||
     824           0 :           next == U_OPEN_DOUBLE_QUOTE ||
     825           0 :           next == U_OPEN_GUILLEMET;
     826           0 :         if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
     827           0 :           return CLASS_CLOSE;
     828             :         }
     829             :       }
     830             :     }
     831             :   } else {
     832           6 :     aState.NotifyNonHyphenCharacter(cur);
     833           6 :     if (cur == U_SLASH || cur == U_BACKSLASH) {
     834             :       // If this is immediately after same char, we should not break here.
                             // Note that this early return happens before the slash / backslash
                             // "seen" flag is set for this character.
     835           6 :       if (prev == cur)
     836           1 :         return CLASS_CHARACTER;
     837             :       // If this text has two or more (BACK)SLASHs, this may be file path or URL.
     838             :       // Make sure to compute shouldReturn before we notify on this slash.
     839          12 :       bool shouldReturn = !aState.UseConservativeBreaking() &&
     840           0 :         (cur == U_SLASH ?
     841           9 :          aState.HasPreviousSlash() : aState.HasPreviousBackslash());
     842             : 
     843           5 :       if (cur == U_SLASH) {
     844           5 :         aState.NotifySeenSlash();
     845             :       } else {
     846           0 :         aState.NotifySeenBackslash();
     847             :       }
     848             : 
     849           5 :       if (shouldReturn)
     850           3 :         return CLASS_OPEN;
     851           0 :     } else if (cur == U_PERCENT) {
     852             :       // If this is a part of the param of URL, we should break before.
                             // "Part of a param" is detected by another '%' exactly three code
                             // units before or after the current index.
     853           0 :       if (!aState.UseConservativeBreaking()) {
     854           0 :         if (aState.Index() >= 3 &&
     855           0 :             aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT)
     856           0 :           return CLASS_OPEN;
     857           0 :         if (aState.Index() + 3 < aState.Length() &&
     858           0 :             aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT)
     859           0 :           return CLASS_OPEN;
     860             :       }
     861           0 :     } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
     862             :       // If this may be a separator of params of URL, we should break after.
     863           0 :       if (!aState.UseConservativeBreaking(1) &&
     864           0 :           aState.HasPreviousEqualsSign())
     865           0 :         return CLASS_CLOSE;
     866           0 :     } else if (cur == U_OPEN_SINGLE_QUOTE ||
     867           0 :                cur == U_OPEN_DOUBLE_QUOTE ||
     868             :                cur == U_OPEN_GUILLEMET) {
     869             :       // for CJK usage, we treat these as openers to allow a break before them,
     870             :       // but otherwise treat them as normal characters because quote mark usage
     871             :       // in various Western languages varies too much; see bug #450088 discussion.
     872           0 :       if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
     873           0 :         return CLASS_OPEN;
     874             :     } else {
                             // Reached only for a character that no branch above handles.
     875           0 :       NS_ERROR("Forgot to handle the current character!");
     876             :     }
     877             :   }
     878           2 :   return GetClass(cur);
     879             : }
     880             : 
     881             : 
     882             : int32_t
     883           0 : nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
     884             :                                 uint32_t aPos, int8_t aDirection)
     885             : {
                         // Shared helper for Next() (aDirection = 1) and Prev() (aDirection = -1).
                         //  1. Expands [begin, end) around aPos up to the nearest NS_IsSpace()
                         //     character on each side (or the start / end of the text).
                         //  2. If none of the characters examined is a CJK character or needs
                         //     platform-native handling, or the break buffer cannot be allocated,
                         //     returns the whitespace-delimited boundary directly.
                         //  3. Otherwise computes JIS X 4051 breaks for [begin, end) and steps from
                         //     aPos in aDirection until it lands on a break position or leaves
                         //     the open interval (begin, end).
                         // Returns the resulting offset into aText.
     886           0 :   bool    textNeedsJISx4051 = false;
     887             :   int32_t begin, end;
     888             : 
                         // NOTE(review): the loop condition tests aText[begin - 1] but the body
                         // tests aText[begin], so the character at the final value of |begin| is
                         // never examined by either loop, and aText[aPos] is read even when
                         // aPos == aLen (which Prev()'s assertion permits). Confirm that this is
                         // intended and that aText[aLen] is readable.
     889           0 :   for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
     890           0 :     if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
     891           0 :       textNeedsJISx4051 = true;
     892             :     }
     893             :   }
     894           0 :   for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
     895           0 :     if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
     896           0 :       textNeedsJISx4051 = true;
     897             :     }
     898             :   }
     899             : 
     900             :   int32_t ret;
     901           0 :   AutoTArray<uint8_t, 2000> breakState;
     902           0 :   if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
     903             :     // No complex text character, do not try to do complex line break.
     904             :     // (This is required for serializers. See Bug #344816.)
     905             :     // Also fall back to this when out of memory.
     906           0 :     if (aDirection < 0) {
                             // When aPos already sits on the boundary, step one further back.
     907           0 :       ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
     908             :     } else {
     909           0 :       ret = end;
     910             :     }
     911             :   } else {
     912           0 :     GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,
     913           0 :                       breakState.Elements());
     914             : 
                           // breakState is indexed relative to |begin|; |ret| is an absolute
                           // offset into aText.
     915           0 :     ret = aPos;
     916           0 :     do {
     917           0 :       ret += aDirection;
     918           0 :     } while (begin < ret && ret < end && !breakState[ret - begin]);
     919             :   }
     920             : 
     921           0 :   return ret;
     922             : }
     923             : 
     924             : int32_t
     925           0 : nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,
     926             :                             uint32_t aPos)
     927             : {
                         // Returns the position WordMove() finds when moving forward from aPos,
                         // or NS_LINEBREAKER_NEED_MORE_TEXT when that position is not strictly
                         // inside the text (i.e. it is >= aLen).
                         // Asserts that aText is non-null and that aPos < aLen.
     928           0 :   NS_ASSERTION(aText, "aText shouldn't be null");
     929           0 :   NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
     930             : 
     931           0 :   int32_t nextPos = WordMove(aText, aLen, aPos, 1);
     932           0 :   return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
     933             : }
     934             : 
     935             : int32_t
     936           0 : nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,
     937             :                             uint32_t aPos)
     938             : {
                         // Returns the position WordMove() finds when moving backward from aPos,
                         // or NS_LINEBREAKER_NEED_MORE_TEXT when that position is not greater
                         // than 0.
                         // Asserts that aText is non-null and that 0 < aPos <= aLen.
     939           0 :   NS_ASSERTION(aText, "aText shouldn't be null");
     940           0 :   NS_ASSERTION(aLen >= aPos && aPos > 0,
     941             :                "Bad position passed to nsJISx4051LineBreaker::Prev");
     942             : 
     943           0 :   int32_t prevPos = WordMove(aText, aLen, aPos, -1);
     944           0 :   return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
     945             : }
     946             : 
     947             : void
     948           3 : nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
     949             :                                          uint8_t aWordBreak,
     950             :                                          uint8_t* aBreakBefore)
     951             : {
                         // 16-bit overload. Writes one entry of aBreakBefore for every code unit
                         // of aChars[0 .. aLength): nonzero when a break is allowed before that
                         // code unit. The first entry and the entry of every trailing low
                         // surrogate are always false.
                         // aWordBreak selects the rule: kWordBreak_Normal consults the pair
                         // tables, kWordBreak_BreakAll allows a break before every character
                         // after the first, and any other value allows no breaks.
                         // aBreakBefore must have room for aLength entries.
     952             :   uint32_t cur;
     953           3 :   int8_t lastClass = CLASS_NONE;
     954           3 :   ContextState state(aChars, aLength);
     955             : 
     956         112 :   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
     957         109 :     char32_t ch = state.GetUnicodeCharAt(cur);
                           // Number of 16-bit code units occupied by ch.
     958         109 :     uint32_t chLen = ch > 0xFFFFu ? 2 : 1;
     959             :     int8_t cl;
     960             : 
     961         109 :     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
     962             :       char32_t prev, next;
     963           6 :       if (cur > 0) {
     964             :         // not using state.GetUnicodeCharAt() here because we're looking back
     965             :         // rather than forward for possible surrogates
     966           6 :         prev = aChars[cur - 1];
     967           6 :         if (NS_IS_LOW_SURROGATE(prev) && cur > 1 &&
     968           0 :             NS_IS_HIGH_SURROGATE(aChars[cur - 2])) {
     969           0 :           prev = SURROGATE_TO_UCS4(aChars[cur - 2], prev);
     970             :         }
     971             :       } else {
     972           0 :         prev = 0;
     973             :       }
     974           6 :       if (cur + chLen < aLength) {
     975           6 :         next = state.GetUnicodeCharAt(cur + chLen);
     976             :       } else {
     977           0 :         next = 0;
     978             :       }
     979           6 :       cl = ContextualAnalysis(prev, ch, next, state);
     980             :     } else {
     981         103 :       if (ch == U_EQUAL)
     982           0 :         state.NotifySeenEqualsSign();
     983         103 :       state.NotifyNonHyphenCharacter(ch);
     984         103 :       cl = GetClass(ch);
     985             :     }
     986             : 
                           // No break is ever allowed before the first character.
     987         109 :     bool allowBreak = false;
     988         109 :     if (cur > 0) {
     989         106 :       NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
     990             :                    "Loop should have prevented adjacent complex chars here");
     991         106 :       if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
     992         212 :         allowBreak = (state.UseConservativeBreaking()) ?
     993         106 :           GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
     994           0 :       } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
     995           0 :         allowBreak = true;
     996             :       }
     997             :     }
     998         109 :     aBreakBefore[cur] = allowBreak;
     999         109 :     if (allowBreak)
    1000           3 :       state.NotifyBreakBefore();
    1001         109 :     lastClass = cl;
    1002         109 :     if (CLASS_COMPLEX == cl) {
                             // Collect the whole run of consecutive CLASS_COMPLEX characters
                             // starting at cur and hand it to NS_GetComplexLineBreaks() in one
                             // call.
    1003           0 :       uint32_t end = cur + chLen;
    1004             : 
    1005           0 :       while (end < aLength) {
    1006           0 :         char32_t c = state.GetUnicodeCharAt(end);
    1007           0 :         if (CLASS_COMPLEX != GetClass(c)) {
    1008           0 :           break;
    1009             :         }
    1010           0 :         ++end;
    1011           0 :         if (c > 0xFFFFU) { // it was a surrogate pair
    1012           0 :           ++end;
    1013             :         }
    1014             :       }
    1015             : 
    1016           0 :       NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
    1017             : 
    1018             :       // We have to consider word-break value again for complex characters
    1019           0 :       if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {
    1020             :         // Respect word-break property
    1021           0 :         for (uint32_t i = cur; i < end; i++)
    1022           0 :           aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);
    1023             :       }
    1024             : 
    1025             :       // restore breakability at chunk begin, which was always set to false
    1026             :       // by the complex line breaker
    1027           0 :       aBreakBefore[cur] = allowBreak;
    1028             : 
                             // Resume the outer loop at the first character after the run.
                             // NOTE(review): |state| is advanced only once per outer iteration,
                             // so after a run longer than one code unit state.Index() no longer
                             // equals cur -- confirm whether this is intended.
    1029           0 :       cur = end - 1;
    1030             :     }
    1031             : 
                           // NOTE(review): if a complex run ever started with a
                           // supplementary-plane character, cur would already be end - 1 here and
                           // the block below would write aBreakBefore[end] and skip the character
                           // at |end| -- confirm that GetClass() never returns CLASS_COMPLEX for
                           // such characters.
    1032         109 :     if (chLen == 2) {
    1033             :       // Supplementary-plane character: mark that we cannot break before the
    1034             :       // trailing low surrogate, and advance past it.
    1035           0 :       ++cur;
    1036           0 :       aBreakBefore[cur] = false;
    1037           0 :       state.AdvanceIndex();
    1038             :     }
    1039             :   }
    1040           3 : }
    1041             : 
    1042             : void
    1043           1 : nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
    1044             :                                          uint8_t aWordBreak,
    1045             :                                          uint8_t* aBreakBefore)
    1046             : {
                         // 8-bit overload. Writes one entry of aBreakBefore for every character
                         // of aChars[0 .. aLength): nonzero when a break is allowed before that
                         // character; the first entry is always false. Each byte is treated as
                         // one character, and unlike the 16-bit overload there is no surrogate or
                         // CLASS_COMPLEX handling.
                         // aWordBreak selects the rule: kWordBreak_Normal consults the pair
                         // tables, kWordBreak_BreakAll allows a break before every character
                         // after the first, and any other value allows no breaks.
    1047             :   uint32_t cur;
    1048           1 :   int8_t lastClass = CLASS_NONE;
    1049           1 :   ContextState state(aChars, aLength);
    1050             : 
    1051           2 :   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
    1052           1 :     char32_t ch = aChars[cur];
    1053             :     int8_t cl;
    1054             : 
    1055           1 :     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
                             // Missing neighbours at either end of the text are passed as U_NULL.
    1056           0 :       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
    1057             :                               ch,
    1058           0 :                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
    1059           0 :                               state);
    1060             :     } else {
    1061           1 :       if (ch == U_EQUAL)
    1062           1 :         state.NotifySeenEqualsSign();
    1063           1 :       state.NotifyNonHyphenCharacter(ch);
    1064           1 :       cl = GetClass(ch);
    1065             :     }
    1066             : 
                           // No break is ever allowed before the first character.
    1067           1 :     bool allowBreak = false;
    1068           1 :     if (cur > 0) {
    1069           0 :       if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
    1070           0 :         allowBreak = (state.UseConservativeBreaking()) ?
    1071           0 :           GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
    1072           0 :       } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
    1073           0 :         allowBreak = true;
    1074             :       }
    1075             :     }
    1076           1 :     aBreakBefore[cur] = allowBreak;
    1077           1 :     if (allowBreak)
    1078           0 :       state.NotifyBreakBefore();
    1079           1 :     lastClass = cl;
    1080             :   }
    1081           1 : }

Generated by: LCOV version 1.13