Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : *
6 : * Copyright (C) 2009-2013, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : *******************************************************************************
10 : * file name: normalizer2.h
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2009nov22
16 : * created by: Markus W. Scherer
17 : */
18 :
19 : #ifndef __NORMALIZER2_H__
20 : #define __NORMALIZER2_H__
21 :
22 : /**
23 : * \file
24 : * \brief C++ API: New API for Unicode Normalization.
25 : */
26 :
27 : #include "unicode/utypes.h"
28 :
29 : #if !UCONFIG_NO_NORMALIZATION
30 :
31 : #include "unicode/uniset.h"
32 : #include "unicode/unistr.h"
33 : #include "unicode/unorm2.h"
34 :
35 : U_NAMESPACE_BEGIN
36 :
37 : /**
38 : * Unicode normalization functionality for standard Unicode normalization or
39 : * for using custom mapping tables.
40 : * All instances of this class are unmodifiable/immutable.
41 : * Instances returned by getInstance() are singletons that must not be deleted by the caller.
42 : * The Normalizer2 class is not intended for public subclassing.
43 : *
44 : * The primary functions are to produce a normalized string and to detect whether
45 : * a string is already normalized.
46 : * The most commonly used normalization forms are those defined in
47 : * http://www.unicode.org/unicode/reports/tr15/
48 : * However, this API supports additional normalization forms for specialized purposes.
49 : * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
50 : * and can be used in implementations of UTS #46.
51 : *
52 : * Not only are the standard compose and decompose modes supplied,
53 : * but additional modes are provided as documented in the UNormalization2Mode enum.
54 : *
55 : * Some of the functions in this class identify normalization boundaries.
56 : * At a normalization boundary, the portions of the string
57 : * before it and starting from it do not interact and can be handled independently.
58 : *
59 : * spanQuickCheckYes() stops at a normalization boundary.
60 : * When the goal is a normalized string, then the text before the boundary
61 : * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
62 : *
63 : * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
64 : * a character is guaranteed to be at a normalization boundary,
65 : * regardless of context.
66 : * This is used for moving from one normalization boundary to the next
67 : * or preceding boundary, and for performing iterative normalization.
68 : *
69 : * Iterative normalization is useful when only a small portion of a
70 : * longer string needs to be processed.
71 : * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
72 : * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
73 : * (to process only the substring for which sort key bytes are computed).
74 : *
75 : * The set of normalization boundaries returned by these functions may not be
76 : * complete: There may be more boundaries that could be returned.
77 : * Different functions may return different boundaries.
78 : * @stable ICU 4.4
79 : */
80 20 : class U_COMMON_API Normalizer2 : public UObject {
81 : public:
82 : /**
83 : * Destructor.
84 : * @stable ICU 4.4
85 : */
86 : ~Normalizer2();
87 :
88 : /**
89 : * Returns a Normalizer2 instance for Unicode NFC normalization.
90 : * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
91 : * Returns an unmodifiable singleton instance. Do not delete it.
92 : * @param errorCode Standard ICU error code. Its input value must
93 : * pass the U_SUCCESS() test, or else the function returns
94 : * immediately. Check for U_FAILURE() on output or use with
95 : * function chaining. (See User Guide for details.)
96 : * @return the requested Normalizer2, if successful
97 : * @stable ICU 49
98 : */
99 : static const Normalizer2 *
100 : getNFCInstance(UErrorCode &errorCode);
101 :
102 : /**
103 : * Returns a Normalizer2 instance for Unicode NFD normalization.
104 : * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
105 : * Returns an unmodifiable singleton instance. Do not delete it.
106 : * @param errorCode Standard ICU error code. Its input value must
107 : * pass the U_SUCCESS() test, or else the function returns
108 : * immediately. Check for U_FAILURE() on output or use with
109 : * function chaining. (See User Guide for details.)
110 : * @return the requested Normalizer2, if successful
111 : * @stable ICU 49
112 : */
113 : static const Normalizer2 *
114 : getNFDInstance(UErrorCode &errorCode);
115 :
116 : /**
117 : * Returns a Normalizer2 instance for Unicode NFKC normalization.
118 : * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
119 : * Returns an unmodifiable singleton instance. Do not delete it.
120 : * @param errorCode Standard ICU error code. Its input value must
121 : * pass the U_SUCCESS() test, or else the function returns
122 : * immediately. Check for U_FAILURE() on output or use with
123 : * function chaining. (See User Guide for details.)
124 : * @return the requested Normalizer2, if successful
125 : * @stable ICU 49
126 : */
127 : static const Normalizer2 *
128 : getNFKCInstance(UErrorCode &errorCode);
129 :
130 : /**
131 : * Returns a Normalizer2 instance for Unicode NFKD normalization.
132 : * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
133 : * Returns an unmodifiable singleton instance. Do not delete it.
134 : * @param errorCode Standard ICU error code. Its input value must
135 : * pass the U_SUCCESS() test, or else the function returns
136 : * immediately. Check for U_FAILURE() on output or use with
137 : * function chaining. (See User Guide for details.)
138 : * @return the requested Normalizer2, if successful
139 : * @stable ICU 49
140 : */
141 : static const Normalizer2 *
142 : getNFKDInstance(UErrorCode &errorCode);
143 :
144 : /**
145 : * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
146 : * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
147 : * Returns an unmodifiable singleton instance. Do not delete it.
148 : * @param errorCode Standard ICU error code. Its input value must
149 : * pass the U_SUCCESS() test, or else the function returns
150 : * immediately. Check for U_FAILURE() on output or use with
151 : * function chaining. (See User Guide for details.)
152 : * @return the requested Normalizer2, if successful
153 : * @stable ICU 49
154 : */
155 : static const Normalizer2 *
156 : getNFKCCasefoldInstance(UErrorCode &errorCode);
157 :
158 : /**
159 : * Returns a Normalizer2 instance which uses the specified data file
160 : * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
161 : * and which composes or decomposes text according to the specified mode.
162 : * Returns an unmodifiable singleton instance. Do not delete it.
163 : *
164 : * Use packageName=NULL for data files that are part of ICU's own data.
165 : * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
166 : * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
167 : * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
168 : *
169 : * @param packageName NULL for ICU built-in data, otherwise application data package name
170 : * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
171 : * @param mode normalization mode (compose or decompose etc.)
172 : * @param errorCode Standard ICU error code. Its input value must
173 : * pass the U_SUCCESS() test, or else the function returns
174 : * immediately. Check for U_FAILURE() on output or use with
175 : * function chaining. (See User Guide for details.)
176 : * @return the requested Normalizer2, if successful
177 : * @stable ICU 4.4
178 : */
179 : static const Normalizer2 *
180 : getInstance(const char *packageName,
181 : const char *name,
182 : UNormalization2Mode mode,
183 : UErrorCode &errorCode);
184 :
185 : /**
186 : * Returns the normalized form of the source string as a new string object.
187 : * @param src source string
188 : * @param errorCode Standard ICU error code. Its input value must
189 : * pass the U_SUCCESS() test, or else the function returns
190 : * immediately. Check for U_FAILURE() on output or use with
191 : * function chaining. (See User Guide for details.)
192 : * @return normalized src, as produced by normalize(src, dest, errorCode) on a newly constructed dest
193 : * @stable ICU 4.4
194 : */
195 : UnicodeString
196 0 : normalize(const UnicodeString &src, UErrorCode &errorCode) const {
197 0 : UnicodeString result;
198 0 : normalize(src, result, errorCode);
199 0 : return result;
200 : }
201 : /**
202 : * Writes the normalized form of the source string to the destination string
203 : * (replacing its contents) and returns the destination string.
204 : * The source and destination strings must be different objects.
205 : * @param src source string
206 : * @param dest destination string; its contents are replaced with normalized src
207 : * @param errorCode Standard ICU error code. Its input value must
208 : * pass the U_SUCCESS() test, or else the function returns
209 : * immediately. Check for U_FAILURE() on output or use with
210 : * function chaining. (See User Guide for details.)
211 : * @return dest
212 : * @stable ICU 4.4
213 : */
214 : virtual UnicodeString &
215 : normalize(const UnicodeString &src,
216 : UnicodeString &dest,
217 : UErrorCode &errorCode) const = 0;
218 : /**
219 : * Appends the normalized form of the second string to the first string
220 : * (merging them at the boundary) and returns the first string.
221 : * The result is normalized if the first string was normalized.
222 : * The first and second strings must be different objects.
223 : * @param first string, should be normalized
224 : * @param second string, will be normalized
225 : * @param errorCode Standard ICU error code. Its input value must
226 : * pass the U_SUCCESS() test, or else the function returns
227 : * immediately. Check for U_FAILURE() on output or use with
228 : * function chaining. (See User Guide for details.)
229 : * @return first
230 : * @stable ICU 4.4
231 : */
232 : virtual UnicodeString &
233 : normalizeSecondAndAppend(UnicodeString &first,
234 : const UnicodeString &second,
235 : UErrorCode &errorCode) const = 0;
236 : /**
237 : * Appends the second string to the first string
238 : * (merging them at the boundary) and returns the first string.
239 : * The result is normalized if both the strings were normalized.
240 : * The first and second strings must be different objects.
241 : * @param first string, should be normalized
242 : * @param second string, should be normalized
243 : * @param errorCode Standard ICU error code. Its input value must
244 : * pass the U_SUCCESS() test, or else the function returns
245 : * immediately. Check for U_FAILURE() on output or use with
246 : * function chaining. (See User Guide for details.)
247 : * @return first
248 : * @stable ICU 4.4
249 : */
250 : virtual UnicodeString &
251 : append(UnicodeString &first,
252 : const UnicodeString &second,
253 : UErrorCode &errorCode) const = 0;
254 :
255 : /**
256 : * Gets the decomposition mapping of c.
257 : * Roughly equivalent to normalizing the string form of c
258 : * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster; unlike that, this function
259 : * returns FALSE and does not write a string
260 : * if c does not have a decomposition mapping in this instance's data.
261 : * This function is independent of the mode of the Normalizer2.
262 : * @param c code point
263 : * @param decomposition String object which will be set to c's
264 : * decomposition mapping, if there is one.
265 : * @return TRUE if c has a decomposition, otherwise FALSE
266 : * @stable ICU 4.6
267 : */
268 : virtual UBool
269 : getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
270 :
271 : /**
272 : * Gets the raw decomposition mapping of c.
273 : *
274 : * This is similar to the getDecomposition() method but returns the
275 : * raw decomposition mapping as specified in UnicodeData.txt or
276 : * (for custom data) in the mapping files processed by the gennorm2 tool.
277 : * By contrast, getDecomposition() returns the processed,
278 : * recursively-decomposed version of this mapping.
279 : *
280 : * When used on a standard NFKC Normalizer2 instance,
281 : * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
282 : *
283 : * When used on a standard NFC Normalizer2 instance,
284 : * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
285 : * in this case, the result contains either one or two code points (=1..4 char16_ts).
286 : *
287 : * This function is independent of the mode of the Normalizer2.
288 : * The default implementation returns FALSE.
289 : * @param c code point
290 : * @param decomposition String object which will be set to c's
291 : * raw decomposition mapping, if there is one.
292 : * @return TRUE if c has a decomposition, otherwise FALSE
293 : * @stable ICU 49
294 : */
295 : virtual UBool
296 : getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
297 :
298 : /**
299 : * Performs pairwise composition of a & b and returns the composite if there is one.
300 : *
301 : * Returns a composite code point c only if c has a two-way mapping to a+b.
302 : * In standard Unicode normalization, this means that
303 : * c has a canonical decomposition to a+b
304 : * and c does not have the Full_Composition_Exclusion property.
305 : *
306 : * This function is independent of the mode of the Normalizer2.
307 : * The default implementation returns a negative value.
308 : * @param a A (normalization starter) code point.
309 : * @param b Another code point.
310 : * @return The non-negative composite code point if there is one; otherwise a negative value.
311 : * @stable ICU 49
312 : */
313 : virtual UChar32
314 : composePair(UChar32 a, UChar32 b) const;
315 :
316 : /**
317 : * Gets the combining class of c.
318 : * The default implementation returns 0
319 : * but all standard implementations return the Unicode Canonical_Combining_Class value.
320 : * @param c code point
321 : * @return c's combining class
322 : * @stable ICU 49
323 : */
324 : virtual uint8_t
325 : getCombiningClass(UChar32 c) const;
326 :
327 : /**
328 : * Tests if the string is normalized.
329 : * Internally, in cases where the quickCheck() method would return "maybe"
330 : * (which is only possible for the two COMPOSE modes) this method
331 : * resolves to "yes" or "no" to provide a definitive result,
332 : * at the cost of doing more work in those cases.
333 : * @param s input string
334 : * @param errorCode Standard ICU error code. Its input value must
335 : * pass the U_SUCCESS() test, or else the function returns
336 : * immediately. Check for U_FAILURE() on output or use with
337 : * function chaining. (See User Guide for details.)
338 : * @return TRUE if s is normalized
339 : * @stable ICU 4.4
340 : */
341 : virtual UBool
342 : isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
343 :
344 : /**
345 : * Tests if the string is normalized.
346 : * For the two COMPOSE modes, the result could be "maybe" in cases that
347 : * would take a little more work to resolve definitively.
348 : * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
349 : * combination of quick check + normalization, to avoid
350 : * re-checking the "yes" prefix.
351 : * @param s input string
352 : * @param errorCode Standard ICU error code. Its input value must
353 : * pass the U_SUCCESS() test, or else the function returns
354 : * immediately. Check for U_FAILURE() on output or use with
355 : * function chaining. (See User Guide for details.)
356 : * @return UNormalizationCheckResult
357 : * @stable ICU 4.4
358 : */
359 : virtual UNormalizationCheckResult
360 : quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
361 :
362 : /**
363 : * Returns the end of the normalized substring of the input string.
364 : * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
365 : * the substring <code>UnicodeString(s, 0, end)</code>
366 : * will pass the quick check with a "yes" result.
367 : *
368 : * The returned end index is usually one or more characters before the
369 : * "no" or "maybe" character: The end index is at a normalization boundary.
370 : * (See the class documentation for more about normalization boundaries.)
371 : *
372 : * When the goal is a normalized string and most input strings are expected
373 : * to be normalized already, then call this method,
374 : * and if it returns a prefix shorter than the input string,
375 : * copy that prefix and use normalizeSecondAndAppend() for the remainder.
376 : * @param s input string
377 : * @param errorCode Standard ICU error code. Its input value must
378 : * pass the U_SUCCESS() test, or else the function returns
379 : * immediately. Check for U_FAILURE() on output or use with
380 : * function chaining. (See User Guide for details.)
381 : * @return "yes" span end index
382 : * @stable ICU 4.4
383 : */
384 : virtual int32_t
385 : spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
386 :
387 : /**
388 : * Tests if the character always has a normalization boundary before it,
389 : * regardless of context.
390 : * If true, then the character does not normalization-interact with
391 : * preceding characters.
392 : * In other words, a string containing this character can be normalized
393 : * by processing portions before this character and starting from this
394 : * character independently.
395 : * This is used for iterative normalization. See the class documentation for details.
396 : * @param c character to test
397 : * @return TRUE if c has a normalization boundary before it
398 : * @stable ICU 4.4
399 : */
400 : virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
401 :
402 : /**
403 : * Tests if the character always has a normalization boundary after it,
404 : * regardless of context.
405 : * If true, then the character does not normalization-interact with
406 : * following characters.
407 : * In other words, a string containing this character can be normalized
408 : * by processing portions up to this character and after this
409 : * character independently.
410 : * This is used for iterative normalization. See the class documentation for details.
411 : * Note that this operation may be significantly slower than hasBoundaryBefore().
412 : * @param c character to test
413 : * @return TRUE if c has a normalization boundary after it
414 : * @stable ICU 4.4
415 : */
416 : virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
417 :
418 : /**
419 : * Tests if the character is normalization-inert.
420 : * If true, then the character does not change, nor normalization-interact with
421 : * preceding or following characters.
422 : * In other words, a string containing this character can be normalized
423 : * by processing portions before this character and after this
424 : * character independently.
425 : * This is used for iterative normalization. See the class documentation for details.
426 : * Note that this operation may be significantly slower than hasBoundaryBefore().
427 : * @param c character to test
428 : * @return TRUE if c is normalization-inert
429 : * @stable ICU 4.4
430 : */
431 : virtual UBool isInert(UChar32 c) const = 0;
432 : };
433 :
434 : /**
435 : * Normalization filtered by a UnicodeSet.
436 : * Normalizes portions of the text contained in the filter set and leaves
437 : * portions not contained in the filter set unchanged.
438 : * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
439 : * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
440 : * This class implements all of (and only) the Normalizer2 API.
441 : * An instance of this class is unmodifiable/immutable but is constructed and
442 : * must be destructed by the owner.
443 : * @stable ICU 4.4
444 : */
445 : class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
446 : public:
447 : /**
448 : * Constructs a filtered normalizer wrapping any Normalizer2 instance
449 : * and a filter set.
450 : * Both are aliased and must not be modified or deleted while this object
451 : * is used.
452 : * The filter set should be frozen; otherwise the performance will suffer greatly.
453 : * @param n2 wrapped Normalizer2 instance
454 : * @param filterSet UnicodeSet which determines the characters to be normalized
455 : * @stable ICU 4.4
456 : */
457 0 : FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
458 0 : norm2(n2), set(filterSet) {} // binds the reference members only; neither argument is copied
459 :
460 : /**
461 : * Destructor.
462 : * @stable ICU 4.4
463 : */
464 : ~FilteredNormalizer2();
465 :
466 : /**
467 : * Writes the normalized form of the source string to the destination string
468 : * (replacing its contents) and returns the destination string.
469 : * The source and destination strings must be different objects.
470 : * @param src source string
471 : * @param dest destination string; its contents are replaced with normalized src
472 : * @param errorCode Standard ICU error code. Its input value must
473 : * pass the U_SUCCESS() test, or else the function returns
474 : * immediately. Check for U_FAILURE() on output or use with
475 : * function chaining. (See User Guide for details.)
476 : * @return dest
477 : * @stable ICU 4.4
478 : */
479 : virtual UnicodeString &
480 : normalize(const UnicodeString &src,
481 : UnicodeString &dest,
482 : UErrorCode &errorCode) const;
483 : /**
484 : * Appends the normalized form of the second string to the first string
485 : * (merging them at the boundary) and returns the first string.
486 : * The result is normalized if the first string was normalized.
487 : * The first and second strings must be different objects.
488 : * @param first string, should be normalized
489 : * @param second string, will be normalized
490 : * @param errorCode Standard ICU error code. Its input value must
491 : * pass the U_SUCCESS() test, or else the function returns
492 : * immediately. Check for U_FAILURE() on output or use with
493 : * function chaining. (See User Guide for details.)
494 : * @return first
495 : * @stable ICU 4.4
496 : */
497 : virtual UnicodeString &
498 : normalizeSecondAndAppend(UnicodeString &first,
499 : const UnicodeString &second,
500 : UErrorCode &errorCode) const;
501 : /**
502 : * Appends the second string to the first string
503 : * (merging them at the boundary) and returns the first string.
504 : * The result is normalized if both the strings were normalized.
505 : * The first and second strings must be different objects.
506 : * @param first string, should be normalized
507 : * @param second string, should be normalized
508 : * @param errorCode Standard ICU error code. Its input value must
509 : * pass the U_SUCCESS() test, or else the function returns
510 : * immediately. Check for U_FAILURE() on output or use with
511 : * function chaining. (See User Guide for details.)
512 : * @return first
513 : * @stable ICU 4.4
514 : */
515 : virtual UnicodeString &
516 : append(UnicodeString &first,
517 : const UnicodeString &second,
518 : UErrorCode &errorCode) const;
519 :
520 : /**
521 : * Gets the decomposition mapping of c.
522 : * For details see the base class documentation.
523 : *
524 : * This function is independent of the mode of the Normalizer2.
525 : * @param c code point
526 : * @param decomposition String object which will be set to c's
527 : * decomposition mapping, if there is one.
528 : * @return TRUE if c has a decomposition, otherwise FALSE
529 : * @stable ICU 4.6
530 : */
531 : virtual UBool
532 : getDecomposition(UChar32 c, UnicodeString &decomposition) const;
533 :
534 : /**
535 : * Gets the raw decomposition mapping of c.
536 : * For details see the base class documentation.
537 : *
538 : * This function is independent of the mode of the Normalizer2.
539 : * @param c code point
540 : * @param decomposition String object which will be set to c's
541 : * raw decomposition mapping, if there is one.
542 : * @return TRUE if c has a decomposition, otherwise FALSE
543 : * @stable ICU 49
544 : */
545 : virtual UBool
546 : getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
547 :
548 : /**
549 : * Performs pairwise composition of a & b and returns the composite if there is one.
550 : * For details see the base class documentation.
551 : *
552 : * This function is independent of the mode of the Normalizer2.
553 : * @param a A (normalization starter) code point.
554 : * @param b Another code point.
555 : * @return The non-negative composite code point if there is one; otherwise a negative value.
556 : * @stable ICU 49
557 : */
558 : virtual UChar32
559 : composePair(UChar32 a, UChar32 b) const;
560 :
561 : /**
562 : * Gets the combining class of c.
563 : * For details see the Normalizer2 base class documentation.
564 : * (In the base class, the default implementation returns 0 but all standard implementations return the Unicode Canonical_Combining_Class value.)
565 : * @param c code point
566 : * @return c's combining class
567 : * @stable ICU 49
568 : */
569 : virtual uint8_t
570 : getCombiningClass(UChar32 c) const;
571 :
572 : /**
573 : * Tests if the string is normalized.
574 : * For details see the Normalizer2 base class documentation.
575 : * @param s input string
576 : * @param errorCode Standard ICU error code. Its input value must
577 : * pass the U_SUCCESS() test, or else the function returns
578 : * immediately. Check for U_FAILURE() on output or use with
579 : * function chaining. (See User Guide for details.)
580 : * @return TRUE if s is normalized
581 : * @stable ICU 4.4
582 : */
583 : virtual UBool
584 : isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
585 : /**
586 : * Tests if the string is normalized.
587 : * For details see the Normalizer2 base class documentation.
588 : * @param s input string
589 : * @param errorCode Standard ICU error code. Its input value must
590 : * pass the U_SUCCESS() test, or else the function returns
591 : * immediately. Check for U_FAILURE() on output or use with
592 : * function chaining. (See User Guide for details.)
593 : * @return UNormalizationCheckResult
594 : * @stable ICU 4.4
595 : */
596 : virtual UNormalizationCheckResult
597 : quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
598 : /**
599 : * Returns the end of the normalized substring of the input string.
600 : * For details see the Normalizer2 base class documentation.
601 : * @param s input string
602 : * @param errorCode Standard ICU error code. Its input value must
603 : * pass the U_SUCCESS() test, or else the function returns
604 : * immediately. Check for U_FAILURE() on output or use with
605 : * function chaining. (See User Guide for details.)
606 : * @return "yes" span end index
607 : * @stable ICU 4.4
608 : */
609 : virtual int32_t
610 : spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
611 :
612 : /**
613 : * Tests if the character always has a normalization boundary before it,
614 : * regardless of context.
615 : * For details see the Normalizer2 base class documentation.
616 : * @param c character to test
617 : * @return TRUE if c has a normalization boundary before it
618 : * @stable ICU 4.4
619 : */
620 : virtual UBool hasBoundaryBefore(UChar32 c) const;
621 :
622 : /**
623 : * Tests if the character always has a normalization boundary after it,
624 : * regardless of context.
625 : * For details see the Normalizer2 base class documentation.
626 : * @param c character to test
627 : * @return TRUE if c has a normalization boundary after it
628 : * @stable ICU 4.4
629 : */
630 : virtual UBool hasBoundaryAfter(UChar32 c) const;
631 :
632 : /**
633 : * Tests if the character is normalization-inert.
634 : * For details see the Normalizer2 base class documentation.
635 : * @param c character to test
636 : * @return TRUE if c is normalization-inert
637 : * @stable ICU 4.4
638 : */
639 : virtual UBool isInert(UChar32 c) const;
640 : private:
641 : UnicodeString &
642 : normalize(const UnicodeString &src,
643 : UnicodeString &dest,
644 : USetSpanCondition spanCondition,
645 : UErrorCode &errorCode) const; // private overload taking an explicit span condition; defined outside this header
646 :
647 : UnicodeString &
648 : normalizeSecondAndAppend(UnicodeString &first,
649 : const UnicodeString &second,
650 : UBool doNormalize,
651 : UErrorCode &errorCode) const; // NOTE(review): doNormalize presumably selects normalizeSecondAndAppend() vs. append() behavior - confirm in the implementation
652 :
653 : const Normalizer2 &norm2; // wrapped normalizer; aliased reference to constructor argument n2
654 : const UnicodeSet &set; // filter set; aliased reference to constructor argument filterSet
655 : };
656 :
657 : U_NAMESPACE_END
658 :
659 : #endif // !UCONFIG_NO_NORMALIZATION
660 : #endif // __NORMALIZER2_H__
|