Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : *
6 : * Copyright (C) 2009-2012, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : *******************************************************************************
10 : * file name: filterednormalizer2.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2009dec10
16 : * created by: Markus W. Scherer
17 : */
18 :
19 : #include "unicode/utypes.h"
20 :
21 : #if !UCONFIG_NO_NORMALIZATION
22 :
23 : #include "unicode/normalizer2.h"
24 : #include "unicode/uniset.h"
25 : #include "unicode/unistr.h"
26 : #include "unicode/unorm.h"
27 : #include "cpputils.h"
28 :
29 : U_NAMESPACE_BEGIN
30 :
// Destructor, defined out of line with an empty body: nothing is released
// explicitly here.
31 0 : FilteredNormalizer2::~FilteredNormalizer2() {}
32 :
// Public entry point: writes the filtered normalization of src into dest and
// returns dest. After argument checking, the work is delegated to the
// four-argument overload.
//  - If errorCode indicates failure after uprv_checkCanGetBuffer(src, ...),
//    dest is set to bogus and returned.
//  - If dest and src are the same object, errorCode is set to
//    U_ILLEGAL_ARGUMENT_ERROR and dest is returned without being modified.
33 : UnicodeString &
34 0 : FilteredNormalizer2::normalize(const UnicodeString &src,
35 : UnicodeString &dest,
36 : UErrorCode &errorCode) const {
37 0 : uprv_checkCanGetBuffer(src, errorCode);
38 0 : if(U_FAILURE(errorCode)) {
39 0 : dest.setToBogus();
40 0 : return dest;
41 : }
// Aliasing is rejected: dest is emptied below while src still has to be read.
42 0 : if(&dest==&src) {
43 0 : errorCode=U_ILLEGAL_ARGUMENT_ERROR;
44 0 : return dest;
45 : }
// The four-argument overload only appends, so dest is emptied first.
46 0 : dest.remove();
// Start with USET_SPAN_SIMPLE, i.e. look for an in-filter span at the start.
47 0 : return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
48 : }
49 :
50 : // Internal: No argument checking, and appends to dest.
51 : // Pass as input spanCondition the one that is likely to yield a non-zero
52 : // span length at the start of src.
53 : // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
54 : // USET_SPAN_SIMPLE should be passed in for the start of src
55 : // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
56 : // an in-filter prefix.
// Walks src as a sequence of spans reported by set.span() and appends the
// result to dest:
//  - a span found with USET_SPAN_NOT_CONTAINED is copied to dest unchanged;
//  - a span found with any other condition is normalized by norm2 and the
//    normalized text is appended.
// spanCondition selects how the first span is searched for and is switched
// after every iteration, including iterations whose span is empty.
// The loop stops early if norm2.normalize() leaves errorCode in a failure
// state. Returns dest.
57 : UnicodeString &
58 0 : FilteredNormalizer2::normalize(const UnicodeString &src,
59 : UnicodeString &dest,
60 : USetSpanCondition spanCondition,
61 : UErrorCode &errorCode) const {
62 0 : UnicodeString tempDest; // Don't throw away destination buffer between iterations.
63 0 : for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
// The current span is [prevSpanLimit, spanLimit).
64 0 : int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
65 0 : int32_t spanLength=spanLimit-prevSpanLimit;
66 0 : if(spanCondition==USET_SPAN_NOT_CONTAINED) {
// Out-of-filter text is passed through verbatim.
67 0 : if(spanLength!=0) {
68 0 : dest.append(src, prevSpanLimit, spanLength);
69 : }
70 0 : spanCondition=USET_SPAN_SIMPLE;
71 : } else {
72 0 : if(spanLength!=0) {
73 : // Not norm2.normalizeSecondAndAppend() because we do not want
74 : // to modify the non-filter part of dest.
// The span is normalized into tempDest, which is then appended to dest.
75 0 : dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
76 0 : tempDest, errorCode));
77 0 : if(U_FAILURE(errorCode)) {
78 0 : break;
79 : }
80 : }
81 0 : spanCondition=USET_SPAN_NOT_CONTAINED;
82 : }
// Continue right after the span that was just handled.
83 0 : prevSpanLimit=spanLimit;
84 : }
85 0 : return dest;
86 : }
87 :
// Public entry point: forwards to the four-argument overload with
// doNormalize=TRUE, so that second is normalized (within the filter) while it
// is appended to first. Returns whatever that overload returns (first).
88 : UnicodeString &
89 0 : FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
90 : const UnicodeString &second,
91 : UErrorCode &errorCode) const {
92 0 : return normalizeSecondAndAppend(first, second, TRUE, errorCode);
93 : }
94 :
// Public entry point: forwards to the four-argument
// normalizeSecondAndAppend() with doNormalize=FALSE. In that mode the
// overload uses norm2.append() at the boundary and a plain
// UnicodeString::append() for the rest of second. Returns first.
95 : UnicodeString &
96 0 : FilteredNormalizer2::append(UnicodeString &first,
97 : const UnicodeString &second,
98 : UErrorCode &errorCode) const {
99 0 : return normalizeSecondAndAppend(first, second, FALSE, errorCode);
100 : }
101 :
// Shared implementation behind normalizeSecondAndAppend() (doNormalize is
// TRUE) and append() (doNormalize is FALSE). Appends second to first and
// returns first.
//  - Returns first untouched if errorCode indicates failure after the two
//    uprv_checkCanGetBuffer() calls.
//  - Sets U_ILLEGAL_ARGUMENT_ERROR and returns if first and second are the
//    same object.
//  - If first is empty: normalizes second into first (doNormalize), or simply
//    assigns second to first.
//  - Otherwise the in-filter prefix of second is merged with the in-filter
//    suffix of first through norm2, and the remainder of second is appended
//    afterwards (filtered-normalized if doNormalize, verbatim otherwise).
102 : UnicodeString &
103 0 : FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
104 : const UnicodeString &second,
105 : UBool doNormalize,
106 : UErrorCode &errorCode) const {
107 0 : uprv_checkCanGetBuffer(first, errorCode);
108 0 : uprv_checkCanGetBuffer(second, errorCode);
109 0 : if(U_FAILURE(errorCode)) {
110 0 : return first;
111 : }
112 0 : if(&first==&second) {
113 0 : errorCode=U_ILLEGAL_ARGUMENT_ERROR;
114 0 : return first;
115 : }
// Nothing to merge with: produce the result directly in first.
116 0 : if(first.isEmpty()) {
117 0 : if(doNormalize) {
118 0 : return normalize(second, first, errorCode);
119 : } else {
120 0 : return first=second;
121 : }
122 : }
123 : // merge the in-filter suffix of the first string with the in-filter prefix of the second
124 0 : int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
125 0 : if(prefixLimit!=0) {
126 0 : UnicodeString prefix(second.tempSubString(0, prefixLimit));
127 0 : int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
// suffixStart==0: the backward span reached the start of first, so norm2 is
// applied to first as a whole.
128 0 : if(suffixStart==0) {
129 0 : if(doNormalize) {
130 0 : norm2.normalizeSecondAndAppend(first, prefix, errorCode);
131 : } else {
132 0 : norm2.append(first, prefix, errorCode);
133 : }
134 : } else {
// Only the tail of first starting at suffixStart takes part in the merge: it
// is copied into middle, combined with prefix there, and written back over
// the tail of first. Presumably this keeps norm2 from touching the part of
// first before suffixStart.
135 0 : UnicodeString middle(first, suffixStart, INT32_MAX);
136 0 : if(doNormalize) {
137 0 : norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
138 : } else {
139 0 : norm2.append(middle, prefix, errorCode);
140 : }
141 0 : first.replace(suffixStart, INT32_MAX, middle);
142 : }
143 : }
// NOTE(review): errorCode is not re-checked after the norm2 calls above
// before the remainder is processed - confirm this is intended.
144 0 : if(prefixLimit<second.length()) {
145 0 : UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
146 0 : if(doNormalize) {
// rest starts right after an in-filter prefix, hence USET_SPAN_NOT_CONTAINED.
147 0 : normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
148 : } else {
149 0 : first.append(rest);
150 : }
151 : }
152 0 : return first;
153 : }
154 :
// Returns the result of norm2.getDecomposition(c, decomposition) if c is in
// the filter set. If c is not in the set, norm2 is not consulted
// (short-circuit &&), decomposition is not touched by this function, and a
// false value is returned.
155 : UBool
156 0 : FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
157 0 : return set.contains(c) && norm2.getDecomposition(c, decomposition);
158 : }
159 :
// Returns the result of norm2.getRawDecomposition(c, decomposition) if c is
// in the filter set. If c is not in the set, norm2 is not consulted
// (short-circuit &&), decomposition is not touched by this function, and a
// false value is returned.
160 : UBool
161 0 : FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
162 0 : return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
163 : }
164 :
// Returns norm2.composePair(a, b) only when both a and b are in the filter
// set; returns U_SENTINEL if either of them is outside the set.
165 : UChar32
166 0 : FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
167 0 : return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
168 : }
169 :
// Returns norm2.getCombiningClass(c) if c is in the filter set, and 0 for any
// code point outside the set.
170 : uint8_t
171 0 : FilteredNormalizer2::getCombiningClass(UChar32 c) const {
172 0 : return set.contains(c) ? norm2.getCombiningClass(c) : 0;
173 : }
174 :
// Returns TRUE if every span of s found with USET_SPAN_SIMPLE passes
// norm2.isNormalized(); spans found with USET_SPAN_NOT_CONTAINED are skipped
// without being inspected.
// Returns FALSE as soon as one span fails the check or errorCode indicates
// failure, and also when errorCode indicates failure right after
// uprv_checkCanGetBuffer(s, ...). An empty s yields TRUE.
175 : UBool
176 0 : FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
177 0 : uprv_checkCanGetBuffer(s, errorCode);
178 0 : if(U_FAILURE(errorCode)) {
179 0 : return FALSE;
180 : }
// The scan starts by looking for an in-filter span; the condition is
// switched after every iteration.
181 0 : USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
182 0 : for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
183 0 : int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
184 0 : if(spanCondition==USET_SPAN_NOT_CONTAINED) {
185 0 : spanCondition=USET_SPAN_SIMPLE;
186 : } else {
// Unlike in normalize(), the span is handed to norm2 even if it is empty.
187 0 : if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
188 0 : U_FAILURE(errorCode)
189 : ) {
190 0 : return FALSE;
191 : }
192 0 : spanCondition=USET_SPAN_NOT_CONTAINED;
193 : }
194 0 : prevSpanLimit=spanLimit;
195 : }
196 0 : return TRUE;
197 : }
198 :
// Combines norm2.quickCheck() over the spans of s found with
// USET_SPAN_SIMPLE; spans found with USET_SPAN_NOT_CONTAINED are skipped.
//  - If a span yields UNORM_NO, or errorCode indicates failure after a
//    norm2.quickCheck() call, that call's result is returned immediately.
//  - Otherwise UNORM_MAYBE is returned if at least one span yielded
//    UNORM_MAYBE, and UNORM_YES if none did (including for an empty s).
//  - Returns UNORM_MAYBE if errorCode indicates failure right after
//    uprv_checkCanGetBuffer(s, ...).
199 : UNormalizationCheckResult
200 0 : FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
201 0 : uprv_checkCanGetBuffer(s, errorCode);
202 0 : if(U_FAILURE(errorCode)) {
203 0 : return UNORM_MAYBE;
204 : }
// result starts as UNORM_YES and can only be downgraded to UNORM_MAYBE below.
205 0 : UNormalizationCheckResult result=UNORM_YES;
206 0 : USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
207 0 : for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
208 0 : int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
209 0 : if(spanCondition==USET_SPAN_NOT_CONTAINED) {
210 0 : spanCondition=USET_SPAN_SIMPLE;
211 : } else {
212 : UNormalizationCheckResult qcResult=
213 0 : norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
214 0 : if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
215 0 : return qcResult;
216 0 : } else if(qcResult==UNORM_MAYBE) {
217 0 : result=qcResult;
218 : }
219 0 : spanCondition=USET_SPAN_NOT_CONTAINED;
220 : }
221 0 : prevSpanLimit=spanLimit;
222 : }
223 0 : return result;
224 : }
225 :
// Returns an index into s. For each span found with USET_SPAN_SIMPLE,
// norm2.spanQuickCheckYes() is run on that span and its result, which is
// relative to the span, is offset by the span start. If that index stops
// short of the span limit, or errorCode indicates failure, it is returned
// right away. Spans found with USET_SPAN_NOT_CONTAINED are skipped.
// Returns s.length() if no span stopped early, and 0 if errorCode indicates
// failure right after uprv_checkCanGetBuffer(s, ...).
226 : int32_t
227 0 : FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
228 0 : uprv_checkCanGetBuffer(s, errorCode);
229 0 : if(U_FAILURE(errorCode)) {
230 0 : return 0;
231 : }
232 0 : USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
233 0 : for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
234 0 : int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
235 0 : if(spanCondition==USET_SPAN_NOT_CONTAINED) {
236 0 : spanCondition=USET_SPAN_SIMPLE;
237 : } else {
// Convert the span-relative result into an index into s.
238 : int32_t yesLimit=
239 : prevSpanLimit+
240 0 : norm2.spanQuickCheckYes(
241 0 : s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
242 0 : if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
243 0 : return yesLimit;
244 : }
245 0 : spanCondition=USET_SPAN_NOT_CONTAINED;
246 : }
247 0 : prevSpanLimit=spanLimit;
248 : }
249 0 : return s.length();
250 : }
251 :
// Returns a true value for every code point outside the filter set without
// consulting norm2; for code points in the set, returns
// norm2.hasBoundaryBefore(c).
252 : UBool
253 0 : FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
254 0 : return !set.contains(c) || norm2.hasBoundaryBefore(c);
255 : }
256 :
// Returns a true value for every code point outside the filter set without
// consulting norm2; for code points in the set, returns
// norm2.hasBoundaryAfter(c).
257 : UBool
258 0 : FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
259 0 : return !set.contains(c) || norm2.hasBoundaryAfter(c);
260 : }
261 :
// Returns a true value for every code point outside the filter set without
// consulting norm2; for code points in the set, returns norm2.isInert(c).
262 : UBool
263 0 : FilteredNormalizer2::isInert(UChar32 c) const {
264 0 : return !set.contains(c) || norm2.isInert(c);
265 : }
266 :
267 : U_NAMESPACE_END
268 :
269 : // C API ------------------------------------------------------------------- ***
270 :
271 : U_NAMESPACE_USE
272 :
// C API: allocates a FilteredNormalizer2 built from norm2 and filterSet and
// returns it cast to UNormalizer2 *.
//  - Returns NULL without doing anything if *pErrorCode already indicates
//    failure.
//  - Sets *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR and returns NULL if
//    filterSet is NULL.
//  - Sets *pErrorCode to U_MEMORY_ALLOCATION_ERROR if the allocation yields
//    NULL; NULL is then returned.
// NOTE(review): norm2 is dereferenced without a NULL check (unlike
// filterSet), and pErrorCode is dereferenced unchecked as well - confirm
// that callers guarantee both are non-NULL.
273 : U_CAPI UNormalizer2 * U_EXPORT2
274 0 : unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
275 0 : if(U_FAILURE(*pErrorCode)) {
276 0 : return NULL;
277 : }
278 0 : if(filterSet==NULL) {
279 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
280 0 : return NULL;
281 : }
// Both C handles are converted to their C++ types and passed to the
// constructor by reference.
282 : Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
283 0 : *UnicodeSet::fromUSet(filterSet));
284 0 : if(fn2==NULL) {
285 0 : *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
286 : }
287 0 : return (UNormalizer2 *)fn2;
288 : }
289 :
290 : #endif // !UCONFIG_NO_NORMALIZATION
|