Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 :
7 :
8 : #include "nsJISx4051LineBreaker.h"
9 :
10 : #include "jisx4051class.h"
11 : #include "nsComplexBreaker.h"
12 : #include "nsTArray.h"
13 : #include "nsUnicodeProperties.h"
14 :
15 : using namespace mozilla::unicode;
16 :
17 : /*
18 :
19 : Simplification of Pair Table in JIS X 4051
20 :
21 : 1. The Original Table - in 4.1.3
22 :
23 : In JIS X 4051, the pair table is defined as below:
24 :
25 : Class of
26 : Leading Class of Trailing Char Class
27 : Char
28 :
29 : 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
30 : * # * #
31 : 1 X X X X X X X X X X X X X X X X X X X X X E
32 : 2 X X X X X X
33 : 3 X X X X X X
34 : 4 X X X X X X
35 : 5 X X X X X X
36 : 6 X X X X X X
37 : 7 X X X X X X X
38 : 8 X X X X X X E
39 : 9 X X X X X X
40 : 10 X X X X X X
41 : 11 X X X X X X
42 : 12 X X X X X X
43 : 13 X X X X X X X
44 : 14 X X X X X X X
45 : 15 X X X X X X X X X
46 : 16 X X X X X X X X
47 : 17 X X X X X E
48 : 18 X X X X X X X X X
49 : 19 X E E E E E X X X X X X X X X X X X E X E E
50 : 20 X X X X X E
51 :
52 : * Same Char
53 : # Other Char
54 :
55 : X Cannot Break
56 :
57 : The classes mean:
58 : 1: Open parenthesis
59 : 2: Close parenthesis
60 : 3: Prohibit a line break before
61 : 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
62 : 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
63 : 6: Full stop
64 : 7: Non-breakable between same characters
65 : 8: Prefix (e.g., "$", "NO.")
66 : 9: Postfix (e.g., "%")
67 : 10: Ideographic space
68 : 11: Hiragana
69 : 12: Japanese characters (except class 11)
70 : 13: Subscript
71 : 14: Ruby
72 : 15: Numeric
73 : 16: Alphabet
74 : 17: Space for Western language
75 : 18: Western characters (except class 17)
76 : 19: Split line note (Warichu) begin quote
77 : 20: Split line note (Warichu) end quote
78 :
79 : 2. Simplified by removing the classes which we do not care about
80 :
81 : However, since we do not care about class 13 (Subscript), 14 (Ruby),
82 : 16 (Alphabet), 19 (split line note begin quote), and 20 (split line note end
83 : quote), we can simplify this pair table into the following
84 :
85 : Class of
86 : Leading Class of Trailing Char Class
87 : Char
88 :
89 : 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18
90 :
91 : 1 X X X X X X X X X X X X X X X
92 : 2 X X X X X
93 : 3 X X X X X
94 : 4 X X X X X
95 : 5 X X X X X
96 : 6 X X X X X
97 : 7 X X X X X X
98 : 8 X X X X X X
99 : 9 X X X X X
100 : 10 X X X X X
101 : 11 X X X X X
102 : 12 X X X X X
103 : 15 X X X X X X X X
104 : 17 X X X X X
105 : 18 X X X X X X X
106 :
107 : 3. Simplified by merging classes
108 :
109 : After the second simplification, the pair table has some duplication:
110 : a. classes 2, 3, 4, 5 and 6 are the same - we can merge them
111 : b. classes 10, 11, 12 and 17 are the same - we can merge them
112 :
113 :
114 : Class of
115 : Leading Class of Trailing Char Class
116 : Char
117 :
118 : 1 [a] 7 8 9 [b]15 18
119 :
120 : 1 X X X X X X X X
121 : [a] X
122 : 7 X X
123 : 8 X X
124 : 9 X
125 : [b] X
126 : 15 X X X X
127 : 18 X X X
128 :
129 :
130 : 4. We add COMPLEX characters and make them breakable with all other classes
131 : except after class 1 and before class [a]
132 :
133 : Class of
134 : Leading Class of Trailing Char Class
135 : Char
136 :
137 : 1 [a] 7 8 9 [b]15 18 COMPLEX
138 :
139 : 1 X X X X X X X X X
140 : [a] X
141 : 7 X X
142 : 8 X X
143 : 9 X
144 : [b] X
145 : 15 X X X X
146 : 18 X X X
147 : COMPLEX X T
148 :
149 : T : need special handling
150 :
151 :
152 : 5. However, we need two special classes for some punctuation/parentheses
153 : whose breaking rules are like those of the character class (18); see bug 389056.
154 : We also need them for punctuation-like characters that behave the same as
155 : class 18 but are not letters in any language (e.g., '_').
156 : [c]. Based on open parenthesis class (1), but it is not breakable after
157 : character class (18) or numeric class (15).
158 : [d]. Based on close parenthesis (or punctuation) class (2), but it is not
159 : breakable before character class (18) or numeric class (15).
160 :
161 : Class of
162 : Leading Class of Trailing Char Class
163 : Char
164 :
165 : 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d]
166 :
167 : 1 X X X X X X X X X X X
168 : [a] X X X
169 : 7 X X
170 : 8 X X
171 : 9 X
172 : [b] X X
173 : 15 X X X X X X
174 : 18 X X X X X
175 : COMPLEX X T
176 : [c] X X X X X X X X X X X
177 : [d] X X X X
178 :
179 :
180 : 6. And Unicode has "NON-BREAK" characters. Lines should not be broken around
181 : them. But JIS X 4051 has no such class; therefore, we create [e].
182 :
183 : Class of
184 : Leading Class of Trailing Char Class
185 : Char
186 :
187 : 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
188 :
189 : 1 X X X X X X X X X X X X
190 : [a] X X X
191 : 7 X X X
192 : 8 X X X
193 : 9 X X
194 : [b] X X X
195 : 15 X X X X X X X
196 : 18 X X X X X X
197 : COMPLEX X T X
198 : [c] X X X X X X X X X X X X
199 : [d] X X X X X
200 : [e] X X X X X X X X X X X X
201 :
202 :
203 : 7. Now we use one bit to encode whether it is breakable, and use 2 bytes
204 : for one row, then the bit table will look like:
205 :
206 : 18 <- 1
207 :
208 : 1 0000 1111 1111 1111 = 0x0FFF
209 : [a] 0000 1100 0000 0010 = 0x0C02
210 : 7 0000 1000 0000 0110 = 0x0806
211 : 8 0000 1000 0100 0010 = 0x0842
212 : 9 0000 1000 0000 0010 = 0x0802
213 : [b] 0000 1100 0000 0010 = 0x0C02
214 : 15 0000 1110 1101 0010 = 0x0ED2
215 : 18 0000 1110 1100 0010 = 0x0EC2
216 : COMPLEX 0000 1001 0000 0010 = 0x0902
217 : [c] 0000 1111 1111 1111 = 0x0FFF
218 : [d] 0000 1100 1100 0010 = 0x0CC2
219 : [e] 0000 1111 1111 1111 = 0x0FFF
220 : */
221 :
// Number of simplified line-breaking classes, i.e. the number of rows (and
// meaningful bits per row) in the pair tables.
#define MAX_CLASSES 12

// Normal pair table. The row index is the class of the leading character;
// bit N of a row is set when a break is NOT allowed before a trailing
// character of class N (see GetPair()).
static const uint16_t gPair[MAX_CLASSES] = {
  0x0FFF,  // 0: class 1   (open parenthesis)
  0x0C02,  // 1: [a]       (classes 2, 3, 4, 5, 6)
  0x0806,  // 2: class 7   (non-breakable between same classes)
  0x0842,  // 3: class 8   (prefix)
  0x0802,  // 4: class 9   (postfix)
  0x0C02,  // 5: [b]       (classes 10, 11, 12, 17)
  0x0ED2,  // 6: class 15  (numeric)
  0x0EC2,  // 7: class 18  (characters)
  0x0902,  // 8: COMPLEX
  0x0FFF,  // 9: [c]       (open-parenthesis-like character)
  0x0CC2,  // A: [d]       (close-parenthesis-like character)
  0x0FFF   // B: [e]       (non-breakable)
};
238 :
239 :
240 : /*
241 :
242 : 8. And if the character is not far enough from the word start, word end and
243 : another break point, we should not break in non-CJK languages.
244 : I.e., don't break around 15, 18, [c] and [d], but don't change
245 : that if they are related to [b].
246 :
247 : Class of
248 : Leading Class of Trailing Char Class
249 : Char
250 :
251 : 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
252 :
253 : 1 X X X X X X X X X X X X
254 : [a] X X X X X X
255 : 7 X X X X X X X
256 : 8 X X X X X X
257 : 9 X X X X X X
258 : [b] X X X
259 : 15 X X X X X X X X X X X
260 : 18 X X X X X X X X X X X
261 : COMPLEX X X X T X X X
262 : [c] X X X X X X X X X X X X
263 : [d] X X X X X X X X X X X
264 : [e] X X X X X X X X X X X X
265 :
266 : 18 <- 1
267 :
268 : 1 0000 1111 1111 1111 = 0x0FFF
269 : [a] 0000 1110 1100 0010 = 0x0EC2
270 : 7 0000 1110 1100 0110 = 0x0EC6
271 : 8 0000 1110 1100 0010 = 0x0EC2
272 : 9 0000 1110 1100 0010 = 0x0EC2
273 : [b] 0000 1100 0000 0010 = 0x0C02
274 : 15 0000 1111 1101 1111 = 0x0FDF
275 : 18 0000 1111 1101 1111 = 0x0FDF
276 : COMPLEX 0000 1111 1100 0010 = 0x0FC2
277 : [c] 0000 1111 1111 1111 = 0x0FFF
278 : [d] 0000 1111 1101 1111 = 0x0FDF
279 : [e] 0000 1111 1111 1111 = 0x0FFF
280 : */
281 :
282 : static const uint16_t gPairConservative[MAX_CLASSES] = {
283 : 0x0FFF,
284 : 0x0EC2,
285 : 0x0EC6,
286 : 0x0EC2,
287 : 0x0EC2,
288 : 0x0C02,
289 : 0x0FDF,
290 : 0x0FDF,
291 : 0x0FC2,
292 : 0x0FFF,
293 : 0x0FDF,
294 : 0x0FFF
295 : };
296 :
297 :
298 : /*
299 :
300 : 9. Now we map the class to number
301 :
302 : 0: 1
303 : 1: [a]- 2, 3, 4, 5, 6
304 : 2: 7
305 : 3: 8
306 : 4: 9
307 : 5: [b]- 10, 11, 12, 17
308 : 6: 15
309 : 7: 18
310 : 8: COMPLEX
311 : 9: [c]
312 : A: [d]
313 : B: [e]
314 :
315 : and they mean:
316 : 0: Open parenthesis
317 : 1: Punctuation that prohibits break before
318 : 2: Non-breakable between same classes
319 : 3: Prefix
320 : 4: Postfix
321 : 5: Breakable character (Spaces and Most Japanese characters)
322 : 6: Numeric
323 : 7: Characters
324 : 8: Need special handling characters (E.g., Thai)
325 : 9: Open parentheses like Character (See bug 389056)
326 : A: Close parenthese (or punctuations) like Character (See bug 389056)
327 : B: Non breakable (See bug 390920)
328 :
329 : */
330 :
// Sentinel for "no class": the initial value of the previous-character class
// before any character has been classified.
#define CLASS_NONE                             INT8_MAX

// Simplified line-breaking classes. The values are row/bit indices into
// gPair and gPairConservative, so they must stay in sync with those tables.
#define CLASS_OPEN                             0x00
#define CLASS_CLOSE                            0x01
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
#define CLASS_PREFIX                           0x03
#define CLASS_POSTFFIX                         0x04
#define CLASS_BREAKABLE                        0x05
#define CLASS_NUMERIC                          0x06
#define CLASS_CHARACTER                        0x07
#define CLASS_COMPLEX                          0x08
#define CLASS_OPEN_LIKE_CHARACTER              0x09
#define CLASS_CLOSE_LIKE_CHARACTER             0x0A
#define CLASS_NON_BREAKABLE                    0x0B

// Individual UTF-16 code units that the breaker tests for by value.
#define U_NULL              char16_t(0x0000)
#define U_SLASH             char16_t('/')
#define U_SPACE             char16_t(' ')
#define U_HYPHEN            char16_t('-')
#define U_EQUAL             char16_t('=')
#define U_PERCENT           char16_t('%')
#define U_AMPERSAND         char16_t('&')
#define U_SEMICOLON         char16_t(';')
#define U_BACKSLASH         char16_t('\\')
#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
#define U_OPEN_GUILLEMET    char16_t(0x00AB)

// True for the characters whose class is decided by ContextualAnalysis()
// (looking at the neighbouring characters) rather than by GetClass() alone.
#define NEED_CONTEXTUAL_ANALYSIS(c) \
  (IS_HYPHEN(c) ||                  \
   (c) == U_SLASH ||                \
   (c) == U_PERCENT ||              \
   (c) == U_AMPERSAND ||            \
   (c) == U_SEMICOLON ||            \
   (c) == U_BACKSLASH ||            \
   (c) == U_OPEN_SINGLE_QUOTE ||    \
   (c) == U_OPEN_DOUBLE_QUOTE ||    \
   (c) == U_OPEN_GUILLEMET)

// True for U+0030..U+0039 ('0'..'9'). Note that |u| is evaluated twice.
#define IS_ASCII_DIGIT(u) ((u) >= 0x0030 && (u) <= 0x0039)
370 :
// Read the 4-bit class for index |l| from a packed table: each 32-bit word
// of |t| holds eight classes, the lowest nibble belonging to the lowest index.
static inline int
GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
{
  const uint32_t word = t[l >> 3];              // eight entries per word
  const unsigned shift = (l & 0x0007) << 2;     // four bits per entry
  return (word >> shift) & 0x000f;
}
376 :
// Returns non-zero when |u| lies in U+FF66..U+FF70, the halfwidth code units
// that GetClass() treats as JIS X 4051 class 3.
static inline int
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
{
  if (u < 0xff66) {
    return 0;
  }
  return u <= 0xff70;
}
382 :
// Returns non-zero when the code point |u| falls in one of the ranges this
// breaker treats as CJK:
//   U+1100..U+11FF, U+2E80..U+D7FF, U+F900..U+FAFF, U+FF00..U+FFEF,
//   U+20000..U+2FFFD.
static inline int
IS_CJK_CHAR(char32_t u)
{
  // The ranges are disjoint and ascending, so walk them from low to high.
  if (u < 0x1100) {
    return 0;
  }
  if (u <= 0x11ff) {
    return 1;
  }
  if (u < 0x2e80) {
    return 0;
  }
  if (u <= 0xd7ff) {
    return 1;
  }
  if (u < 0xf900) {
    return 0;
  }
  if (u <= 0xfaff) {
    return 1;
  }
  if (u < 0xff00) {
    return 0;
  }
  if (u <= 0xffef) {
    return 1;
  }
  return 0x20000 <= u && u <= 0x2fffd;
}
392 :
// Returns true when |u| is U+00A0 NO-BREAK SPACE or U+2007 FIGURE SPACE.
//
// The parameter is a full code point (char32_t) rather than a UTF-16 code
// unit: callers pass decoded code points, and a char16_t parameter would
// silently truncate supplementary-plane characters such as U+100A0 or
// U+12007 to 0x00A0 / 0x2007 and misreport them as no-break spaces.
// Narrower arguments (uint8_t, char16_t) still convert losslessly.
static inline bool
IS_NONBREAKABLE_SPACE(char32_t u)
{
  return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE
}
398 :
// Returns true when |u| is one of the hyphen-like characters that get
// contextual treatment: HYPHEN-MINUS, ARMENIAN HYPHEN, HYPHEN, FIGURE DASH
// or EN DASH.
//
// The parameter is a full code point (char32_t): this is invoked through
// NEED_CONTEXTUAL_ANALYSIS() with decoded code points, and a char16_t
// parameter would truncate supplementary-plane characters such as U+1002D or
// U+12010 to 0x002D / 0x2010 and misclassify them as hyphens.
static inline bool
IS_HYPHEN(char32_t u)
{
  return (u == 0x002D || // HYPHEN-MINUS (same value as U_HYPHEN)
          u == 0x058A || // ARMENIAN HYPHEN
          u == 0x2010 || // HYPHEN
          u == 0x2012 || // FIGURE DASH
          u == 0x2013);  // EN DASH
}
408 :
409 : static int8_t
410 106 : GetClass(uint32_t u)
411 : {
412 106 : if (u < 0x10000) {
413 106 : uint16_t h = u & 0xFF00;
414 106 : uint16_t l = u & 0x00ff;
415 :
416 : // Handle 3 range table first
417 106 : if (0x0000 == h) {
418 106 : return GETCLASSFROMTABLE(gLBClass00, l);
419 : }
420 0 : if (0x1700 == h) {
421 0 : return GETCLASSFROMTABLE(gLBClass17, l);
422 : }
423 0 : if (NS_NeedsPlatformNativeHandling(u)) {
424 0 : return CLASS_COMPLEX;
425 : }
426 0 : if (0x0E00 == h) {
427 0 : return GETCLASSFROMTABLE(gLBClass0E, l);
428 : }
429 0 : if (0x2000 == h) {
430 0 : return GETCLASSFROMTABLE(gLBClass20, l);
431 : }
432 0 : if (0x2100 == h) {
433 0 : return GETCLASSFROMTABLE(gLBClass21, l);
434 : }
435 0 : if (0x3000 == h) {
436 0 : return GETCLASSFROMTABLE(gLBClass30, l);
437 : }
438 0 : if (0xff00 == h) {
439 0 : if (l < 0x0060) { // Fullwidth ASCII variant
440 0 : return GETCLASSFROMTABLE(gLBClass00, (l+0x20));
441 : }
442 0 : if (l < 0x00a0) { // Halfwidth Katakana variants
443 0 : switch (l) {
444 0 : case 0x61: return GetClass(0x3002);
445 0 : case 0x62: return GetClass(0x300c);
446 0 : case 0x63: return GetClass(0x300d);
447 0 : case 0x64: return GetClass(0x3001);
448 0 : case 0x65: return GetClass(0x30fb);
449 0 : case 0x9e: return GetClass(0x309b);
450 0 : case 0x9f: return GetClass(0x309c);
451 : default:
452 0 : if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) {
453 0 : return CLASS_CLOSE; // jis x4051 class 3
454 : }
455 0 : return CLASS_BREAKABLE; // jis x4051 class 11
456 : }
457 : }
458 0 : if (l < 0x00e0) {
459 0 : return CLASS_CHARACTER; // Halfwidth Hangul variants
460 : }
461 0 : if (l < 0x00f0) {
462 : static char16_t NarrowFFEx[16] = {
463 : 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
464 : 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
465 : };
466 0 : return GetClass(NarrowFFEx[l - 0x00e0]);
467 : }
468 0 : } else if (0x3100 == h) {
469 0 : if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
470 : // XXX: This is per UAX #14, but UAX #14 may change
471 : // the line breaking rules about Kanbun and Bopomofo.
472 0 : return CLASS_BREAKABLE;
473 : }
474 0 : if (l >= 0xf0) { // Katakana small letters for Ainu
475 0 : return CLASS_CLOSE;
476 : }
477 0 : } else if (0x0300 == h) {
478 0 : if (0x4F == l || (0x5C <= l && l <= 0x62)) {
479 0 : return CLASS_NON_BREAKABLE;
480 : }
481 0 : } else if (0x0500 == h) {
482 : // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
483 0 : if (l == 0x8A) {
484 0 : return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
485 : }
486 0 : } else if (0x0F00 == h) {
487 0 : if (0x08 == l || 0x0C == l || 0x12 == l) {
488 0 : return CLASS_NON_BREAKABLE;
489 : }
490 0 : } else if (0x1800 == h) {
491 0 : if (0x0E == l) {
492 0 : return CLASS_NON_BREAKABLE;
493 : }
494 0 : } else if (0x1600 == h) {
495 0 : if (0x80 == l) { // U+1680 OGHAM SPACE MARK
496 0 : return CLASS_BREAKABLE;
497 : }
498 0 : } else if (u == 0xfeff) {
499 0 : return CLASS_NON_BREAKABLE;
500 : }
501 : }
502 :
503 : // Mapping for Unicode LineBreak.txt classes to the (simplified) set of
504 : // character classes used here.
505 : // XXX The mappings here were derived by comparing the Unicode LineBreak
506 : // values of BMP characters to the classes our existing GetClass returns
507 : // for the same codepoints; in cases where characters with the same
508 : // LineBreak class mapped to various classes here, I picked what seemed
509 : // the most prevalent equivalence.
510 : // Some of these are unclear to me, but currently they are ONLY used
511 : // for characters not handled by the old code above, so all the JISx405
512 : // special cases should already be accounted for.
513 : static const int8_t sUnicodeLineBreakToClass[] = {
514 : /* UNKNOWN = 0, [XX] */ CLASS_CHARACTER,
515 : /* AMBIGUOUS = 1, [AI] */ CLASS_CHARACTER,
516 : /* ALPHABETIC = 2, [AL] */ CLASS_CHARACTER,
517 : /* BREAK_BOTH = 3, [B2] */ CLASS_CHARACTER,
518 : /* BREAK_AFTER = 4, [BA] */ CLASS_CHARACTER,
519 : /* BREAK_BEFORE = 5, [BB] */ CLASS_OPEN_LIKE_CHARACTER,
520 : /* MANDATORY_BREAK = 6, [BK] */ CLASS_CHARACTER,
521 : /* CONTINGENT_BREAK = 7, [CB] */ CLASS_CHARACTER,
522 : /* CLOSE_PUNCTUATION = 8, [CL] */ CLASS_CHARACTER,
523 : /* COMBINING_MARK = 9, [CM] */ CLASS_CHARACTER,
524 : /* CARRIAGE_RETURN = 10, [CR] */ CLASS_BREAKABLE,
525 : /* EXCLAMATION = 11, [EX] */ CLASS_CHARACTER,
526 : /* GLUE = 12, [GL] */ CLASS_NON_BREAKABLE,
527 : /* HYPHEN = 13, [HY] */ CLASS_CHARACTER,
528 : /* IDEOGRAPHIC = 14, [ID] */ CLASS_BREAKABLE,
529 : /* INSEPARABLE = 15, [IN] */ CLASS_CLOSE_LIKE_CHARACTER,
530 : /* INFIX_NUMERIC = 16, [IS] */ CLASS_CHARACTER,
531 : /* LINE_FEED = 17, [LF] */ CLASS_BREAKABLE,
532 : /* NONSTARTER = 18, [NS] */ CLASS_CLOSE_LIKE_CHARACTER,
533 : /* NUMERIC = 19, [NU] */ CLASS_CHARACTER,
534 : /* OPEN_PUNCTUATION = 20, [OP] */ CLASS_CHARACTER,
535 : /* POSTFIX_NUMERIC = 21, [PO] */ CLASS_CHARACTER,
536 : /* PREFIX_NUMERIC = 22, [PR] */ CLASS_CHARACTER,
537 : /* QUOTATION = 23, [QU] */ CLASS_CHARACTER,
538 : /* COMPLEX_CONTEXT = 24, [SA] */ CLASS_CHARACTER,
539 : /* SURROGATE = 25, [SG] */ CLASS_CHARACTER,
540 : /* SPACE = 26, [SP] */ CLASS_BREAKABLE,
541 : /* BREAK_SYMBOLS = 27, [SY] */ CLASS_CHARACTER,
542 : /* ZWSPACE = 28, [ZW] */ CLASS_BREAKABLE,
543 : /* NEXT_LINE = 29, [NL] */ CLASS_CHARACTER,
544 : /* WORD_JOINER = 30, [WJ] */ CLASS_NON_BREAKABLE,
545 : /* H2 = 31, [H2] */ CLASS_BREAKABLE,
546 : /* H3 = 32, [H3] */ CLASS_BREAKABLE,
547 : /* JL = 33, [JL] */ CLASS_CHARACTER,
548 : /* JT = 34, [JT] */ CLASS_CHARACTER,
549 : /* JV = 35, [JV] */ CLASS_CHARACTER,
550 : /* CLOSE_PARENTHESIS = 36, [CP] */ CLASS_CLOSE_LIKE_CHARACTER,
551 : /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE,
552 : /* HEBREW_LETTER = 38, [HL] */ CLASS_CHARACTER,
553 : /* REGIONAL_INDICATOR = 39, [RI] */ CLASS_CHARACTER,
554 : /* E_BASE = 40, [EB] */ CLASS_BREAKABLE,
555 : /* E_MODIFIER = 41, [EM] */ CLASS_CHARACTER,
556 : /* ZWJ = 42, [ZWJ]*/ CLASS_CHARACTER
557 : };
558 :
559 : #if ENABLE_INTL_API
560 : static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass),
561 : "Gecko vs ICU LineBreak class mismatch");
562 : #endif
563 :
564 0 : auto cls = mozilla::unicode::GetLineBreakClass(u);
565 0 : MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass));
566 0 : return sUnicodeLineBreakToClass[cls];
567 : }
568 :
569 : static bool
570 86 : GetPair(int8_t c1, int8_t c2)
571 : {
572 86 : NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
573 86 : NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
574 :
575 86 : return (0 == ((gPair[c1] >> c2) & 0x0001));
576 : }
577 :
578 : static bool
579 20 : GetPairConservative(int8_t c1, int8_t c2)
580 : {
581 20 : NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
582 20 : NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
583 :
584 20 : return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
585 : }
586 :
587 3 : nsJISx4051LineBreaker::nsJISx4051LineBreaker()
588 : {
589 3 : }
590 :
591 0 : nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
592 : {
593 0 : }
594 :
595 24 : NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)
596 :
597 : class ContextState {
598 : public:
599 3 : ContextState(const char16_t* aText, uint32_t aLength)
600 3 : : mUniText(aText)
601 : , mText(nullptr)
602 3 : , mLength(aLength)
603 : {
604 3 : Init();
605 3 : }
606 :
607 1 : ContextState(const uint8_t* aText, uint32_t aLength)
608 1 : : mUniText(nullptr)
609 : , mText(aText)
610 1 : , mLength(aLength)
611 : {
612 1 : Init();
613 1 : }
614 :
615 0 : uint32_t Length() const { return mLength; }
616 0 : uint32_t Index() const { return mIndex; }
617 :
618 : // This gets a single code unit of the text, without checking for surrogates
619 : // (in the case of a 16-bit text buffer). That's OK if we're only checking for
620 : // specific characters that are known to be BMP values.
621 0 : char16_t GetCodeUnitAt(uint32_t aIndex) const {
622 0 : MOZ_ASSERT(aIndex < mLength, "Out of range!");
623 0 : return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
624 : }
625 :
626 : // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs
627 : // as necessary. It must ONLY be called for 16-bit text, not 8-bit.
628 335 : char32_t GetUnicodeCharAt(uint32_t aIndex) const {
629 335 : MOZ_ASSERT(mUniText, "Only for 16-bit text!");
630 335 : MOZ_ASSERT(aIndex < mLength, "Out of range!");
631 335 : char32_t c = mUniText[aIndex];
632 335 : if (NS_IS_HIGH_SURROGATE(c) && aIndex + 1 < mLength &&
633 0 : NS_IS_LOW_SURROGATE(mUniText[aIndex + 1])) {
634 0 : c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]);
635 : }
636 335 : return c;
637 : }
638 :
639 110 : void AdvanceIndex() {
640 110 : ++mIndex;
641 110 : }
642 :
643 3 : void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
644 :
645 : // A word of western language should not be broken. But even if the word has
646 : // only ASCII characters, non-natural context words should be broken, e.g.,
647 : // URL and file path. For protecting the natural words, we should use
648 : // conservative breaking rules at following conditions:
649 : // 1. at near the start of word
650 : // 2. at near the end of word
651 : // 3. at near the latest broken point
652 : // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters,
653 : // which varies depending whether we are looking at a letter or a non-letter
654 : // character: for non-letters, we use an extended "conservative" range.
655 :
656 : #define CONSERVATIVE_RANGE_LETTER 2
657 : #define CONSERVATIVE_RANGE_OTHER 6
658 :
659 111 : bool UseConservativeBreaking(uint32_t aOffset = 0) const {
660 111 : if (mHasCJKChar)
661 0 : return false;
662 111 : uint32_t index = mIndex + aOffset;
663 :
664 : // If the character at index is a letter (rather than various punctuation
665 : // characters, etc) then we want a shorter "conservative" range
666 : uint32_t conservativeRangeStart, conservativeRangeEnd;
667 301 : if (index < mLength &&
668 111 : nsUGenCategory::kLetter ==
669 111 : (mText ? GetGenCategory(mText[index])
670 111 : : GetGenCategory(GetUnicodeCharAt(index)))) {
671 : // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start
672 : // to get more balanced behavior (if we break off a 2-letter prefix,
673 : // that means the break will actually be three letters from start of
674 : // word, to include the hyphen; whereas a 2-letter suffix will be
675 : // broken only two letters from end of word).
676 79 : conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER;
677 79 : conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1;
678 : } else {
679 32 : conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER;
680 : }
681 :
682 101 : bool result = (index < conservativeRangeStart ||
683 207 : mLength - index < conservativeRangeEnd ||
684 207 : index - mLastBreakIndex < conservativeRangeStart);
685 111 : if (result || !mHasNonbreakableSpace)
686 111 : return result;
687 :
688 : // This text has no-breakable space, we need to check whether the index
689 : // is near it.
690 :
691 : // Note that index is always larger than conservativeRange here.
692 0 : for (uint32_t i = index; index - conservativeRangeStart < i; --i) {
693 0 : if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1)))
694 0 : return true;
695 : }
696 : // Note that index is always less than mLength - conservativeRange.
697 0 : for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) {
698 0 : if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i)))
699 0 : return true;
700 : }
701 0 : return false;
702 : }
703 :
704 0 : bool HasPreviousEqualsSign() const {
705 0 : return mHasPreviousEqualsSign;
706 : }
707 1 : void NotifySeenEqualsSign() {
708 1 : mHasPreviousEqualsSign = true;
709 1 : }
710 :
711 4 : bool HasPreviousSlash() const {
712 4 : return mHasPreviousSlash;
713 : }
714 5 : void NotifySeenSlash() {
715 5 : mHasPreviousSlash = true;
716 5 : }
717 :
718 0 : bool HasPreviousBackslash() const {
719 0 : return mHasPreviousBackslash;
720 : }
721 0 : void NotifySeenBackslash() {
722 0 : mHasPreviousBackslash = true;
723 0 : }
724 :
725 0 : uint32_t GetPreviousNonHyphenCharacter() const {
726 0 : return mPreviousNonHyphenCharacter;
727 : }
728 110 : void NotifyNonHyphenCharacter(uint32_t ch) {
729 110 : mPreviousNonHyphenCharacter = ch;
730 110 : }
731 :
732 : private:
733 4 : void Init() {
734 4 : mIndex = 0;
735 4 : mLastBreakIndex = 0;
736 4 : mPreviousNonHyphenCharacter = U_NULL;
737 4 : mHasCJKChar = false;
738 4 : mHasNonbreakableSpace = false;
739 4 : mHasPreviousEqualsSign = false;
740 4 : mHasPreviousSlash = false;
741 4 : mHasPreviousBackslash = false;
742 :
743 4 : if (mText) {
744 : // 8-bit text: we only need to check for
745 2 : for (uint32_t i = 0; i < mLength; ++i) {
746 1 : if (IS_NONBREAKABLE_SPACE(mText[i])) {
747 0 : mHasNonbreakableSpace = true;
748 0 : break;
749 : }
750 : }
751 : } else {
752 : // 16-bit text: handle surrogates and check for CJK as well as
753 112 : for (uint32_t i = 0; i < mLength; ++i) {
754 109 : char32_t u = GetUnicodeCharAt(i);
755 109 : if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) {
756 0 : mHasNonbreakableSpace = true;
757 0 : if (mHasCJKChar) {
758 0 : break;
759 : }
760 109 : } else if (!mHasCJKChar && IS_CJK_CHAR(u)) {
761 0 : mHasCJKChar = 1;
762 0 : if (mHasNonbreakableSpace) {
763 0 : break;
764 : }
765 : }
766 109 : if (u > 0xFFFFu) {
767 0 : ++i; // step over trailing low surrogate
768 : }
769 : }
770 : }
771 4 : }
772 :
773 : const char16_t* const mUniText;
774 : const uint8_t* const mText;
775 :
776 : uint32_t mIndex;
777 : const uint32_t mLength; // length of text
778 : uint32_t mLastBreakIndex;
779 : char32_t mPreviousNonHyphenCharacter; // The last character we have seen
780 : // which is not U_HYPHEN
781 : bool mHasCJKChar; // if the text has CJK character, this is true.
782 : bool mHasNonbreakableSpace; // if the text has no-breakable space,
783 : // this is true.
784 : bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
785 : bool mHasPreviousSlash; // True if we have seen a U_SLASH
786 : bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH
787 : };
788 :
789 : static int8_t
790 6 : ContextualAnalysis(char32_t prev, char32_t cur, char32_t next,
791 : ContextState &aState)
792 : {
793 : // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
794 :
795 6 : if (IS_HYPHEN(cur)) {
796 : // If next character is hyphen, we don't need to break between them.
797 0 : if (IS_HYPHEN(next))
798 0 : return CLASS_CHARACTER;
799 : // If prev and next characters are numeric, it may be in Math context.
800 : // So, we should not break here.
801 0 : bool prevIsNum = IS_ASCII_DIGIT(prev);
802 0 : bool nextIsNum = IS_ASCII_DIGIT(next);
803 0 : if (prevIsNum && nextIsNum)
804 0 : return CLASS_NUMERIC;
805 : // If one side is numeric and the other is a character, or if both sides are
806 : // characters, the hyphen should be breakable.
807 0 : if (!aState.UseConservativeBreaking(1)) {
808 0 : char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
809 0 : if (prevOfHyphen && next) {
810 0 : int8_t prevClass = GetClass(prevOfHyphen);
811 0 : int8_t nextClass = GetClass(next);
812 : bool prevIsNumOrCharOrClose =
813 0 : prevIsNum ||
814 0 : (prevClass == CLASS_CHARACTER &&
815 0 : !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
816 0 : prevClass == CLASS_CLOSE ||
817 0 : prevClass == CLASS_CLOSE_LIKE_CHARACTER;
818 : bool nextIsNumOrCharOrOpen =
819 0 : nextIsNum ||
820 0 : (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
821 0 : nextClass == CLASS_OPEN ||
822 0 : nextClass == CLASS_OPEN_LIKE_CHARACTER ||
823 0 : next == U_OPEN_SINGLE_QUOTE ||
824 0 : next == U_OPEN_DOUBLE_QUOTE ||
825 0 : next == U_OPEN_GUILLEMET;
826 0 : if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
827 0 : return CLASS_CLOSE;
828 : }
829 : }
830 : }
831 : } else {
832 6 : aState.NotifyNonHyphenCharacter(cur);
833 6 : if (cur == U_SLASH || cur == U_BACKSLASH) {
834 : // If this is immediately after same char, we should not break here.
835 6 : if (prev == cur)
836 1 : return CLASS_CHARACTER;
837 : // If this text has two or more (BACK)SLASHs, this may be file path or URL.
838 : // Make sure to compute shouldReturn before we notify on this slash.
839 12 : bool shouldReturn = !aState.UseConservativeBreaking() &&
840 0 : (cur == U_SLASH ?
841 9 : aState.HasPreviousSlash() : aState.HasPreviousBackslash());
842 :
843 5 : if (cur == U_SLASH) {
844 5 : aState.NotifySeenSlash();
845 : } else {
846 0 : aState.NotifySeenBackslash();
847 : }
848 :
849 5 : if (shouldReturn)
850 3 : return CLASS_OPEN;
851 0 : } else if (cur == U_PERCENT) {
852 : // If this is a part of the param of URL, we should break before.
853 0 : if (!aState.UseConservativeBreaking()) {
854 0 : if (aState.Index() >= 3 &&
855 0 : aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT)
856 0 : return CLASS_OPEN;
857 0 : if (aState.Index() + 3 < aState.Length() &&
858 0 : aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT)
859 0 : return CLASS_OPEN;
860 : }
861 0 : } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
862 : // If this may be a separator of params of URL, we should break after.
863 0 : if (!aState.UseConservativeBreaking(1) &&
864 0 : aState.HasPreviousEqualsSign())
865 0 : return CLASS_CLOSE;
866 0 : } else if (cur == U_OPEN_SINGLE_QUOTE ||
867 0 : cur == U_OPEN_DOUBLE_QUOTE ||
868 : cur == U_OPEN_GUILLEMET) {
869 : // for CJK usage, we treat these as openers to allow a break before them,
870 : // but otherwise treat them as normal characters because quote mark usage
871 : // in various Western languages varies too much; see bug #450088 discussion.
872 0 : if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
873 0 : return CLASS_OPEN;
874 : } else {
875 0 : NS_ERROR("Forgot to handle the current character!");
876 : }
877 : }
878 2 : return GetClass(cur);
879 : }
880 :
881 :
882 : int32_t
883 0 : nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
884 : uint32_t aPos, int8_t aDirection)
885 : {
886 0 : bool textNeedsJISx4051 = false;
887 : int32_t begin, end;
888 :
889 0 : for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
890 0 : if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
891 0 : textNeedsJISx4051 = true;
892 : }
893 : }
894 0 : for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
895 0 : if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
896 0 : textNeedsJISx4051 = true;
897 : }
898 : }
899 :
900 : int32_t ret;
901 0 : AutoTArray<uint8_t, 2000> breakState;
902 0 : if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
903 : // No complex text character, do not try to do complex line break.
904 : // (This is required for serializers. See Bug #344816.)
905 : // Also fall back to this when out of memory.
906 0 : if (aDirection < 0) {
907 0 : ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
908 : } else {
909 0 : ret = end;
910 : }
911 : } else {
912 0 : GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,
913 0 : breakState.Elements());
914 :
915 0 : ret = aPos;
916 0 : do {
917 0 : ret += aDirection;
918 0 : } while (begin < ret && ret < end && !breakState[ret - begin]);
919 : }
920 :
921 0 : return ret;
922 : }
923 :
924 : int32_t
925 0 : nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,
926 : uint32_t aPos)
927 : {
928 0 : NS_ASSERTION(aText, "aText shouldn't be null");
929 0 : NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
930 :
931 0 : int32_t nextPos = WordMove(aText, aLen, aPos, 1);
932 0 : return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
933 : }
934 :
935 : int32_t
936 0 : nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,
937 : uint32_t aPos)
938 : {
939 0 : NS_ASSERTION(aText, "aText shouldn't be null");
940 0 : NS_ASSERTION(aLen >= aPos && aPos > 0,
941 : "Bad position passed to nsJISx4051LineBreaker::Prev");
942 :
943 0 : int32_t prevPos = WordMove(aText, aLen, aPos, -1);
944 0 : return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
945 : }
946 :
947 : void
948 3 : nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
949 : uint8_t aWordBreak,
950 : uint8_t* aBreakBefore)
951 : {
952 : uint32_t cur;
953 3 : int8_t lastClass = CLASS_NONE;
954 3 : ContextState state(aChars, aLength);
955 :
956 112 : for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
957 109 : char32_t ch = state.GetUnicodeCharAt(cur);
958 109 : uint32_t chLen = ch > 0xFFFFu ? 2 : 1;
959 : int8_t cl;
960 :
961 109 : if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
962 : char32_t prev, next;
963 6 : if (cur > 0) {
964 : // not using state.GetUnicodeCharAt() here because we're looking back
965 : // rather than forward for possible surrogates
966 6 : prev = aChars[cur - 1];
967 6 : if (NS_IS_LOW_SURROGATE(prev) && cur > 1 &&
968 0 : NS_IS_HIGH_SURROGATE(aChars[cur - 2])) {
969 0 : prev = SURROGATE_TO_UCS4(aChars[cur - 2], prev);
970 : }
971 : } else {
972 0 : prev = 0;
973 : }
974 6 : if (cur + chLen < aLength) {
975 6 : next = state.GetUnicodeCharAt(cur + chLen);
976 : } else {
977 0 : next = 0;
978 : }
979 6 : cl = ContextualAnalysis(prev, ch, next, state);
980 : } else {
981 103 : if (ch == U_EQUAL)
982 0 : state.NotifySeenEqualsSign();
983 103 : state.NotifyNonHyphenCharacter(ch);
984 103 : cl = GetClass(ch);
985 : }
986 :
987 109 : bool allowBreak = false;
988 109 : if (cur > 0) {
989 106 : NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
990 : "Loop should have prevented adjacent complex chars here");
991 106 : if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
992 212 : allowBreak = (state.UseConservativeBreaking()) ?
993 106 : GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
994 0 : } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
995 0 : allowBreak = true;
996 : }
997 : }
998 109 : aBreakBefore[cur] = allowBreak;
999 109 : if (allowBreak)
1000 3 : state.NotifyBreakBefore();
1001 109 : lastClass = cl;
1002 109 : if (CLASS_COMPLEX == cl) {
1003 0 : uint32_t end = cur + chLen;
1004 :
1005 0 : while (end < aLength) {
1006 0 : char32_t c = state.GetUnicodeCharAt(end);
1007 0 : if (CLASS_COMPLEX != GetClass(c)) {
1008 0 : break;
1009 : }
1010 0 : ++end;
1011 0 : if (c > 0xFFFFU) { // it was a surrogate pair
1012 0 : ++end;
1013 : }
1014 : }
1015 :
1016 0 : NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
1017 :
1018 : // We have to consider word-break value again for complex characters
1019 0 : if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {
1020 : // Respect word-break property
1021 0 : for (uint32_t i = cur; i < end; i++)
1022 0 : aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);
1023 : }
1024 :
1025 : // restore breakability at chunk begin, which was always set to false
1026 : // by the complex line breaker
1027 0 : aBreakBefore[cur] = allowBreak;
1028 :
1029 0 : cur = end - 1;
1030 : }
1031 :
1032 109 : if (chLen == 2) {
1033 : // Supplementary-plane character: mark that we cannot break before the
1034 : // trailing low surrogate, and advance past it.
1035 0 : ++cur;
1036 0 : aBreakBefore[cur] = false;
1037 0 : state.AdvanceIndex();
1038 : }
1039 : }
1040 3 : }
1041 :
1042 : void
1043 1 : nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
1044 : uint8_t aWordBreak,
1045 : uint8_t* aBreakBefore)
1046 : {
1047 : uint32_t cur;
1048 1 : int8_t lastClass = CLASS_NONE;
1049 1 : ContextState state(aChars, aLength);
1050 :
1051 2 : for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
1052 1 : char32_t ch = aChars[cur];
1053 : int8_t cl;
1054 :
1055 1 : if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
1056 0 : cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
1057 : ch,
1058 0 : cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
1059 0 : state);
1060 : } else {
1061 1 : if (ch == U_EQUAL)
1062 1 : state.NotifySeenEqualsSign();
1063 1 : state.NotifyNonHyphenCharacter(ch);
1064 1 : cl = GetClass(ch);
1065 : }
1066 :
1067 1 : bool allowBreak = false;
1068 1 : if (cur > 0) {
1069 0 : if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
1070 0 : allowBreak = (state.UseConservativeBreaking()) ?
1071 0 : GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
1072 0 : } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
1073 0 : allowBreak = true;
1074 : }
1075 : }
1076 1 : aBreakBefore[cur] = allowBreak;
1077 1 : if (allowBreak)
1078 0 : state.NotifyBreakBefore();
1079 1 : lastClass = cl;
1080 : }
1081 1 : }
|