Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : * Copyright (c) 1996-2014, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : ******************************************************************************
8 : * File unorm.cpp
9 : *
10 : * Created by: Vladimir Weinstein 12052000
11 : *
12 : * Modification history :
13 : *
14 : * Date Name Description
15 : * 02/01/01 synwee Added normalization quickcheck enum and method.
16 : * 02/12/01 synwee Commented out quickcheck until API has been approved
17 : * Added private method for doing FCD checks
18 : * 02/23/01 synwee Modified quickcheck and checkFCE to run through
19 : * string for codepoints < 0x300 for the normalization
20 : * mode NFC.
21 : * 05/25/01+ Markus Scherer total rewrite, implement all normalization here
22 : * instead of just wrappers around normlzr.cpp,
23 : * load unorm.dat, support Unicode 3.1 with
24 : * supplementary code points, etc.
25 : * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code
26 : */
27 :
28 : #include "unicode/utypes.h"
29 :
30 : #if !UCONFIG_NO_NORMALIZATION
31 :
32 : #include "unicode/udata.h"
33 : #include "unicode/ustring.h"
34 : #include "unicode/uiter.h"
35 : #include "unicode/unorm.h"
36 : #include "unicode/unorm2.h"
37 : #include "normalizer2impl.h"
38 : #include "unormimp.h"
39 : #include "uprops.h"
40 : #include "ustr_imp.h"
41 :
42 : U_NAMESPACE_USE
43 :
44 : /* quick check functions ---------------------------------------------------- */
45 :
46 : U_CAPI UNormalizationCheckResult U_EXPORT2
47 0 : unorm_quickCheck(const UChar *src,
48 : int32_t srcLength,
49 : UNormalizationMode mode,
50 : UErrorCode *pErrorCode) {
51 0 : const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
52 0 : return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
53 : }
54 :
55 : U_CAPI UNormalizationCheckResult U_EXPORT2
56 0 : unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
57 : UNormalizationMode mode, int32_t options,
58 : UErrorCode *pErrorCode) {
59 0 : const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
60 0 : if(options&UNORM_UNICODE_3_2) {
61 0 : FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
62 : return unorm2_quickCheck(
63 : reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
64 0 : src, srcLength, pErrorCode);
65 : } else {
66 0 : return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
67 : }
68 : }
69 :
70 : U_CAPI UBool U_EXPORT2
71 0 : unorm_isNormalized(const UChar *src, int32_t srcLength,
72 : UNormalizationMode mode,
73 : UErrorCode *pErrorCode) {
74 0 : const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
75 0 : return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
76 : }
77 :
78 : U_CAPI UBool U_EXPORT2
79 0 : unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
80 : UNormalizationMode mode, int32_t options,
81 : UErrorCode *pErrorCode) {
82 0 : const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
83 0 : if(options&UNORM_UNICODE_3_2) {
84 0 : FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
85 : return unorm2_isNormalized(
86 : reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
87 0 : src, srcLength, pErrorCode);
88 : } else {
89 0 : return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
90 : }
91 : }
92 :
93 : /* normalize() API ---------------------------------------------------------- */
94 :
95 : /** Public API for normalizing. */
96 : U_CAPI int32_t U_EXPORT2
97 0 : unorm_normalize(const UChar *src, int32_t srcLength,
98 : UNormalizationMode mode, int32_t options,
99 : UChar *dest, int32_t destCapacity,
100 : UErrorCode *pErrorCode) {
101 0 : const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
102 0 : if(options&UNORM_UNICODE_3_2) {
103 0 : FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
104 : return unorm2_normalize(
105 : reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
106 0 : src, srcLength, dest, destCapacity, pErrorCode);
107 : } else {
108 : return unorm2_normalize((const UNormalizer2 *)n2,
109 0 : src, srcLength, dest, destCapacity, pErrorCode);
110 : }
111 : }
112 :
113 :
114 : /* iteration functions ------------------------------------------------------ */
115 :
116 : static int32_t
117 0 : _iterate(UCharIterator *src, UBool forward,
118 : UChar *dest, int32_t destCapacity,
119 : const Normalizer2 *n2,
120 : UBool doNormalize, UBool *pNeededToNormalize,
121 : UErrorCode *pErrorCode) {
122 0 : if(U_FAILURE(*pErrorCode)) {
123 0 : return 0;
124 : }
125 0 : if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) {
126 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
127 0 : return 0;
128 : }
129 :
130 0 : if(pNeededToNormalize!=NULL) {
131 0 : *pNeededToNormalize=FALSE;
132 : }
133 0 : if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
134 0 : return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
135 : }
136 :
137 0 : UnicodeString buffer;
138 : UChar32 c;
139 0 : if(forward) {
140 : /* get one character and ignore its properties */
141 0 : buffer.append(uiter_next32(src));
142 : /* get all following characters until we see a boundary */
143 0 : while((c=uiter_next32(src))>=0) {
144 0 : if(n2->hasBoundaryBefore(c)) {
145 : /* back out the latest movement to stop at the boundary */
146 0 : src->move(src, -U16_LENGTH(c), UITER_CURRENT);
147 0 : break;
148 : } else {
149 0 : buffer.append(c);
150 : }
151 : }
152 : } else {
153 0 : while((c=uiter_previous32(src))>=0) {
154 : /* always write this character to the front of the buffer */
155 0 : buffer.insert(0, c);
156 : /* stop if this just-copied character is a boundary */
157 0 : if(n2->hasBoundaryBefore(c)) {
158 0 : break;
159 : }
160 : }
161 : }
162 :
163 0 : UnicodeString destString(dest, 0, destCapacity);
164 0 : if(buffer.length()>0 && doNormalize) {
165 0 : n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
166 0 : if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
167 0 : *pNeededToNormalize= destString!=buffer;
168 : }
169 0 : return destString.length();
170 : } else {
171 : /* just copy the source characters */
172 0 : return buffer.extract(dest, destCapacity, *pErrorCode);
173 : }
174 : }
175 :
176 : static int32_t
177 0 : unorm_iterate(UCharIterator *src, UBool forward,
178 : UChar *dest, int32_t destCapacity,
179 : UNormalizationMode mode, int32_t options,
180 : UBool doNormalize, UBool *pNeededToNormalize,
181 : UErrorCode *pErrorCode) {
182 0 : const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
183 0 : if(options&UNORM_UNICODE_3_2) {
184 0 : const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
185 0 : if(U_FAILURE(*pErrorCode)) {
186 0 : return 0;
187 : }
188 0 : FilteredNormalizer2 fn2(*n2, *uni32);
189 0 : return _iterate(src, forward, dest, destCapacity,
190 0 : &fn2, doNormalize, pNeededToNormalize, pErrorCode);
191 : }
192 0 : return _iterate(src, forward, dest, destCapacity,
193 0 : n2, doNormalize, pNeededToNormalize, pErrorCode);
194 : }
195 :
196 : U_CAPI int32_t U_EXPORT2
197 0 : unorm_previous(UCharIterator *src,
198 : UChar *dest, int32_t destCapacity,
199 : UNormalizationMode mode, int32_t options,
200 : UBool doNormalize, UBool *pNeededToNormalize,
201 : UErrorCode *pErrorCode) {
202 0 : return unorm_iterate(src, FALSE,
203 : dest, destCapacity,
204 : mode, options,
205 : doNormalize, pNeededToNormalize,
206 0 : pErrorCode);
207 : }
208 :
209 : U_CAPI int32_t U_EXPORT2
210 0 : unorm_next(UCharIterator *src,
211 : UChar *dest, int32_t destCapacity,
212 : UNormalizationMode mode, int32_t options,
213 : UBool doNormalize, UBool *pNeededToNormalize,
214 : UErrorCode *pErrorCode) {
215 0 : return unorm_iterate(src, TRUE,
216 : dest, destCapacity,
217 : mode, options,
218 : doNormalize, pNeededToNormalize,
219 0 : pErrorCode);
220 : }
221 :
222 : /* Concatenation of normalized strings -------------------------------------- */
223 :
224 : static int32_t
225 0 : _concatenate(const UChar *left, int32_t leftLength,
226 : const UChar *right, int32_t rightLength,
227 : UChar *dest, int32_t destCapacity,
228 : const Normalizer2 *n2,
229 : UErrorCode *pErrorCode) {
230 0 : if(U_FAILURE(*pErrorCode)) {
231 0 : return 0;
232 : }
233 0 : if(destCapacity<0 || (dest==NULL && destCapacity>0) ||
234 0 : left==NULL || leftLength<-1 || right==NULL || rightLength<-1) {
235 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
236 0 : return 0;
237 : }
238 :
239 : /* check for overlapping right and destination */
240 0 : if( dest!=NULL &&
241 0 : ((right>=dest && right<(dest+destCapacity)) ||
242 0 : (rightLength>0 && dest>=right && dest<(right+rightLength)))
243 : ) {
244 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
245 0 : return 0;
246 : }
247 :
248 : /* allow left==dest */
249 0 : UnicodeString destString;
250 0 : if(left==dest) {
251 0 : destString.setTo(dest, leftLength, destCapacity);
252 : } else {
253 0 : destString.setTo(dest, 0, destCapacity);
254 0 : destString.append(left, leftLength);
255 : }
256 0 : return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
257 0 : extract(dest, destCapacity, *pErrorCode);
258 : }
259 :
260 : U_CAPI int32_t U_EXPORT2
261 0 : unorm_concatenate(const UChar *left, int32_t leftLength,
262 : const UChar *right, int32_t rightLength,
263 : UChar *dest, int32_t destCapacity,
264 : UNormalizationMode mode, int32_t options,
265 : UErrorCode *pErrorCode) {
266 0 : const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
267 0 : if(options&UNORM_UNICODE_3_2) {
268 0 : const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
269 0 : if(U_FAILURE(*pErrorCode)) {
270 0 : return 0;
271 : }
272 0 : FilteredNormalizer2 fn2(*n2, *uni32);
273 : return _concatenate(left, leftLength, right, rightLength,
274 0 : dest, destCapacity, &fn2, pErrorCode);
275 : }
276 : return _concatenate(left, leftLength, right, rightLength,
277 0 : dest, destCapacity, n2, pErrorCode);
278 : }
279 :
280 : #endif /* #if !UCONFIG_NO_NORMALIZATION */
|