Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2012-2016, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * utf8collationiterator.h
9 : *
10 : * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #ifndef __UTF8COLLATIONITERATOR_H__
15 : #define __UTF8COLLATIONITERATOR_H__
16 :
17 : #include "unicode/utypes.h"
18 :
19 : #if !UCONFIG_NO_COLLATION
20 :
21 : #include "cmemory.h"
22 : #include "collation.h"
23 : #include "collationdata.h"
24 : #include "collationiterator.h"
25 : #include "normalizer2impl.h"
26 :
27 : U_NAMESPACE_BEGIN
28 :
29 : /**
30 : * UTF-8 collation element and character iterator.
31 : * Handles normalized UTF-8 text inline, given either an explicit length or a NUL terminator.
32 : * Unnormalized text is handled by a subclass.
33 : */
34 : class U_I18N_API UTF8CollationIterator : public CollationIterator {
35 : public:
36 0 : UTF8CollationIterator(const CollationData *d, UBool numeric,  // d, numeric: forwarded unchanged to the CollationIterator base
37 : const uint8_t *s, int32_t p, int32_t len)  // s: text bytes; p: initial value of pos; len: text length, <0 if NUL-terminated
38 0 : : CollationIterator(d, numeric),
39 0 : u8(s), pos(p), length(len) {}  // only the pointer is stored; the text itself is not copied here
40 : // NOTE(review): everything below is declared only; the definitions are not visible in this header.
41 : virtual ~UTF8CollationIterator();
42 : // NOTE(review): presumably repositions the iterator at newOffset -- confirm against the implementation.
43 : virtual void resetToOffset(int32_t newOffset);
44 : // NOTE(review): presumably reports the current offset into the text -- confirm.
45 : virtual int32_t getOffset() const;
46 : // NOTE(review): presumably returns the code point at the current position and moves past it -- confirm.
47 : virtual UChar32 nextCodePoint(UErrorCode &errorCode);
48 : // NOTE(review): presumably moves back one code point and returns it -- confirm.
49 : virtual UChar32 previousCodePoint(UErrorCode &errorCode);
50 :
51 : protected:
52 : /**
53 : * For byte sequences that are illegal in UTF-8, an error value may be returned
54 : * together with a bogus code point. The caller will ignore that code point.
55 : *
56 : * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
57 : * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
58 : *
59 : * Valid lead surrogates are returned from inside a normalized text segment,
60 : * where handleGetTrailSurrogate() will return the matching trail surrogate.
61 : */
62 : virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
63 : // NOTE(review): presumably tied to the length<0 (NUL-terminated) mode -- confirm.
64 : virtual UBool foundNULTerminator();
65 : // Returns TRUE according to the handleNextCE32() documentation above.
66 : virtual UBool forbidSurrogateCodePoints() const;
67 : // NOTE(review): presumably skips num code points forward -- confirm.
68 : virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
69 : // NOTE(review): presumably skips num code points backward -- confirm.
70 : virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
71 : // State shared with subclasses; all three members are set directly from constructor arguments.
72 : const uint8_t *u8;  // the text, exactly as passed to the constructor (s)
73 : int32_t pos;  // current index; starts at the constructor's p
74 : int32_t length; // <0 for NUL-terminated strings; otherwise the constructor's len
75 : };
76 :
77 : /**
78 : * Incrementally checks the input text for FCD and normalizes where necessary; see the State enum for the segment bookkeeping.
79 : */
80 : class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
81 : public:
82 0 : FCDUTF8CollationIterator(const CollationData *data, UBool numeric,  // all five arguments are forwarded to UTF8CollationIterator
83 : const uint8_t *s, int32_t p, int32_t len)
84 0 : : UTF8CollationIterator(data, numeric, s, p, len),
85 : state(CHECK_FWD), start(p),  // starts in forward checking with start == pos == p; limit is not set here (undefined in CHECK_FWD)
86 0 : nfcImpl(data->nfcImpl) {}  // data is dereferenced here, so it must not be NULL
87 : // NOTE(review): everything below is declared only; the definitions are not visible in this header.
88 : virtual ~FCDUTF8CollationIterator();
89 : // The next four redeclare the public virtuals of UTF8CollationIterator.
90 : virtual void resetToOffset(int32_t newOffset);
91 :
92 : virtual int32_t getOffset() const;
93 :
94 : virtual UChar32 nextCodePoint(UErrorCode &errorCode);
95 :
96 : virtual UChar32 previousCodePoint(UErrorCode &errorCode);
97 :
98 : protected:
99 : virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
100 : // Per the base class handleNextCE32() documentation: returns the trail surrogate matching a lead surrogate from a normalized segment.
101 : virtual UChar handleGetTrailSurrogate();
102 :
103 : virtual UBool foundNULTerminator();
104 :
105 : virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
106 :
107 : virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
108 :
109 : private:
110 : UBool nextHasLccc() const;  // NOTE(review): presumably tests the lead combining class (lccc) of the next code point -- confirm
111 : UBool previousHasTccc() const;  // NOTE(review): presumably tests the trail combining class (tccc) of the previous code point -- confirm
112 :
113 : /**
114 : * Switches to forward checking if possible.
115 : */
116 : void switchToForward();
117 :
118 : /**
119 : * Extends the FCD text segment forward or normalizes around pos.
120 : * @return TRUE if success
121 : */
122 : UBool nextSegment(UErrorCode &errorCode);
123 :
124 : /**
125 : * Switches to backward checking.
126 : */
127 : void switchToBackward();
128 :
129 : /**
130 : * Extends the FCD text segment backward or normalizes around pos.
131 : * @return TRUE if success
132 : */
133 : UBool previousSegment(UErrorCode &errorCode);
134 : // NOTE(review): presumably normalizes s into the 'normalized' member and returns TRUE on success -- confirm against the implementation.
135 : UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
136 : // Tells which of start/limit are meaningful and what pos indexes at any given time.
137 : enum State {
138 : /**
139 : * The input text [start..pos[ passes the FCD check.
140 : * Moving forward checks incrementally.
141 : * limit is undefined.
142 : */
143 : CHECK_FWD,
144 : /**
145 : * The input text [pos..limit[ passes the FCD check.
146 : * Moving backward checks incrementally.
147 : * start is undefined.
148 : */
149 : CHECK_BWD,
150 : /**
151 : * The input text [start..limit[ passes the FCD check.
152 : * pos tracks the current text index.
153 : */
154 : IN_FCD_SEGMENT,
155 : /**
156 : * The input text [start..limit[ failed the FCD check and was normalized.
157 : * pos tracks the current index in the normalized string.
158 : */
159 : IN_NORMALIZED
160 : };
161 :
162 : State state;  // which of the State cases above currently applies; CHECK_FWD after construction
163 :
164 : int32_t start;  // start of the current segment; undefined in CHECK_BWD
165 : int32_t limit;  // exclusive end of the current segment; undefined in CHECK_FWD and not set by the constructor
166 :
167 : const Normalizer2Impl &nfcImpl;  // bound to data->nfcImpl in the constructor
168 : UnicodeString normalized;  // NOTE(review): presumably the normalized string that pos indexes in IN_NORMALIZED -- confirm
169 : };
170 :
171 : U_NAMESPACE_END
172 :
173 : #endif // !UCONFIG_NO_COLLATION
174 : #endif // __UTF8COLLATIONITERATOR_H__
|