Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 :
7 : #include "nsSampleWordBreaker.h"
8 :
9 3 : nsSampleWordBreaker::nsSampleWordBreaker()
10 : {
11 3 : }
12 0 : nsSampleWordBreaker::~nsSampleWordBreaker()
13 : {
14 0 : }
15 :
// XPCOM boilerplate: NS_IMPL_ISUPPORTS is defined outside this file; it
// presumably supplies the nsISupports methods for nsSampleWordBreaker with
// nsIWordBreaker as the exposed interface -- confirm against the macro's
// definition.
16 24 : NS_IMPL_ISUPPORTS(nsSampleWordBreaker, nsIWordBreaker)
17 :
18 0 : bool nsSampleWordBreaker::BreakInBetween(
19 : const char16_t* aText1 , uint32_t aTextLen1,
20 : const char16_t* aText2 , uint32_t aTextLen2)
21 : {
22 0 : NS_PRECONDITION( nullptr != aText1, "null ptr");
23 0 : NS_PRECONDITION( nullptr != aText2, "null ptr");
24 :
25 0 : if(!aText1 || !aText2 || (0 == aTextLen1) || (0 == aTextLen2))
26 0 : return false;
27 :
28 0 : return GetClass(aText1[aTextLen1-1]) != GetClass(aText2[0]);
29 : }
30 :
31 :
// Character-range predicates for a single UTF-16 code unit `c`.
// Every expansion is wrapped in parentheses so that it behaves as one
// expression when combined with `!`, `&&`, `||`, etc. at the call site.

// True when none of the bits above the low 7 are set, i.e. c < 0x80.
#define IS_ASCII(c) (0 == ( 0xFF80 & (c)))
// The ASCII_IS_* tests are only meaningful once IS_ASCII(c) is known to hold.
#define ASCII_IS_ALPHA(c) ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z')))
#define ASCII_IS_DIGIT(c) (( '0' <= (c)) && ((c) <= '9'))
#define ASCII_IS_SPACE(c) (( ' ' == (c)) || ( '\t' == (c)) || ( '\r' == (c)) || ( '\n' == (c)))
// Everything below U+2E80 is treated as an alphabetical script.
#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80)

// we change the beginning of IS_HAN from 0x4e00 to 0x3400 to reflect Unicode 3.0
// Covers U+3400..U+9FFF and U+F900..U+FAFF.
#define IS_HAN(c) ((( 0x3400 <= (c)) && ((c) <= 0x9fff)) || (( 0xf900 <= (c)) && ((c) <= 0xfaff)))
#define IS_KATAKANA(c) (( 0x30A0 <= (c)) && ((c) <= 0x30FF))
#define IS_HIRAGANA(c) (( 0x3040 <= (c)) && ((c) <= 0x309F))
#define IS_HALFWIDTHKATAKANA(c) (( 0xFF60 <= (c)) && ((c) <= 0xFF9F))
// Look at the highest 9 bits: matches U+0E00..U+0E7F.
#define IS_THAI(c) (0x0E00 == (0xFF80 & (c) ))
44 :
45 : /* static */ nsWordBreakClass
46 0 : nsIWordBreaker::GetClass(char16_t c)
47 : {
48 : // begin of the hack
49 :
50 0 : if (IS_ALPHABETICAL_SCRIPT(c)) {
51 0 : if(IS_ASCII(c)) {
52 0 : if(ASCII_IS_SPACE(c)) {
53 0 : return kWbClassSpace;
54 0 : } else if(ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c)) {
55 0 : return kWbClassAlphaLetter;
56 : } else {
57 0 : return kWbClassPunct;
58 : }
59 0 : } else if(IS_THAI(c)) {
60 0 : return kWbClassThaiLetter;
61 0 : } else if (c == 0x00A0/*NBSP*/) {
62 0 : return kWbClassSpace;
63 : } else {
64 0 : return kWbClassAlphaLetter;
65 : }
66 : } else {
67 0 : if(IS_HAN(c)) {
68 0 : return kWbClassHanLetter;
69 0 : } else if(IS_KATAKANA(c)) {
70 0 : return kWbClassKatakanaLetter;
71 0 : } else if(IS_HIRAGANA(c)) {
72 0 : return kWbClassHiraganaLetter;
73 0 : } else if(IS_HALFWIDTHKATAKANA(c)) {
74 0 : return kWbClassHWKatakanaLetter;
75 : } else {
76 0 : return kWbClassAlphaLetter;
77 : }
78 : }
79 : return static_cast<nsWordBreakClass>(0);
80 : }
81 :
82 0 : nsWordRange nsSampleWordBreaker::FindWord(
83 : const char16_t* aText , uint32_t aTextLen,
84 : uint32_t aOffset)
85 : {
86 : nsWordRange range;
87 0 : NS_PRECONDITION( nullptr != aText, "null ptr");
88 0 : NS_PRECONDITION( 0 != aTextLen, "len = 0");
89 0 : NS_PRECONDITION( aOffset <= aTextLen, "aOffset > aTextLen");
90 :
91 0 : range.mBegin = aTextLen + 1;
92 0 : range.mEnd = aTextLen + 1;
93 :
94 0 : if(!aText || aOffset > aTextLen)
95 0 : return range;
96 :
97 0 : nsWordBreakClass c = GetClass(aText[aOffset]);
98 : uint32_t i;
99 : // Scan forward
100 0 : range.mEnd--;
101 0 : for(i = aOffset +1;i <= aTextLen; i++)
102 : {
103 0 : if( c != GetClass(aText[i]))
104 : {
105 0 : range.mEnd = i;
106 0 : break;
107 : }
108 : }
109 :
110 : // Scan backward
111 0 : range.mBegin = 0;
112 0 : for(i = aOffset ;i > 0; i--)
113 : {
114 0 : if( c != GetClass(aText[i-1]))
115 : {
116 0 : range.mBegin = i;
117 0 : break;
118 : }
119 : }
120 : if(kWbClassThaiLetter == c)
121 : {
122 : // need to call Thai word breaker from here
123 : // we should pass the whole Thai segment to the thai word breaker to find a shorter answer
124 : }
125 0 : return range;
126 : }
127 :
128 0 : int32_t nsSampleWordBreaker::NextWord(
129 : const char16_t* aText, uint32_t aLen, uint32_t aPos)
130 : {
131 : nsWordBreakClass c1, c2;
132 0 : uint32_t cur = aPos;
133 0 : if (cur == aLen)
134 0 : return NS_WORDBREAKER_NEED_MORE_TEXT;
135 0 : c1 = GetClass(aText[cur]);
136 :
137 0 : for(cur++; cur <aLen; cur++)
138 : {
139 0 : c2 = GetClass(aText[cur]);
140 0 : if(c2 != c1)
141 0 : break;
142 : }
143 : if(kWbClassThaiLetter == c1)
144 : {
145 : // need to call Thai word breaker from here
146 : // we should pass the whole Thai segment to the thai word breaker to find a shorter answer
147 : }
148 0 : if (cur == aLen)
149 0 : return NS_WORDBREAKER_NEED_MORE_TEXT;
150 0 : return cur;
151 : }
|