Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : **********************************************************************
5 : * Copyright (c) 2003-2011, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : **********************************************************************
8 : * Author: Alan Liu
9 : * Created: September 24 2003
10 : * Since: ICU 2.8
11 : **********************************************************************
12 : */
13 : #ifndef _RULEITER_H_
14 : #define _RULEITER_H_
15 :
16 : #include "unicode/uobject.h"
17 :
18 : U_NAMESPACE_BEGIN
19 :
20 : class UnicodeString;
21 : class ParsePosition;
22 : class SymbolTable;
23 :
24 : /**
25 : * An iterator that returns 32-bit code points. This class is deliberately
26 : * <em>not</em> related to any of the ICU character iterator classes
27 : * in order to minimize complexity.
28 : * @author Alan Liu
29 : * @since ICU 2.8
30 : */
31 : class RuleCharacterIterator : public UMemory {
32 :
33 : // TODO: Ideas for later. (Do not implement if not needed, lest the
34 : // code coverage numbers go down due to unused methods.)
35 : // 1. Add a copy constructor, operator==() method.
36 : // 2. Rather than return DONE, throw an exception if the end
37 : // is reached -- this is an alternate usage model, probably not useful.
38 :
39 : private:
40 : /**
41 : * Text being iterated.
42 : */
43 : const UnicodeString& text;
44 :
45 : /**
46 : * Position of iterator.
47 : */
48 : ParsePosition& pos;
49 :
50 : /**
51 : * Symbol table used to parse and dereference variables. May be 0.
52 : */
53 : const SymbolTable* sym;
54 :
55 : /**
56 : * Current variable expansion, or 0 if none.
57 : */
58 : const UnicodeString* buf;
59 :
60 : /**
61 : * Position within buf. Meaningless if buf == 0.
62 : */
63 : int32_t bufPos;
64 :
65 : public:
66 : /**
67 : * Value returned when there are no more characters to iterate.
68 : */
69 : enum { DONE = -1 };
70 :
71 : /**
72 : * Bitmask option to enable parsing of variable names. If (options &
73 : * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
74 : * its value. Variables are parsed using the SymbolTable API.
75 : */
76 : enum { PARSE_VARIABLES = 1 };
77 :
78 : /**
79 : * Bitmask option to enable parsing of escape sequences. If (options &
80 : * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
81 : * to its value. Escapes are parsed using Utility.unescapeAt().
82 : */
83 : enum { PARSE_ESCAPES = 2 };
84 :
85 : /**
86 : * Bitmask option to enable skipping of whitespace. If (options &
87 : * SKIP_WHITESPACE) != 0, then Pattern_White_Space characters will be silently
88 : * skipped, as if they were not present in the input.
89 : */
90 : enum { SKIP_WHITESPACE = 4 };
91 :
92 : /**
93 : * Constructs an iterator over the given text, starting at the given
94 : * position.
95 : * @param text the text to be iterated
96 : * @param sym the symbol table, or null if there is none. If sym is null,
97 : * then variables will not be dereferenced, even if the PARSE_VARIABLES
98 : * option is set.
99 : * @param pos upon input, the index of the next character to return. If a
100 : * variable has been dereferenced, then pos will <em>not</em> increment as
101 : * characters of the variable value are iterated.
102 : */
103 : RuleCharacterIterator(const UnicodeString& text, const SymbolTable* sym,
104 : ParsePosition& pos);
105 :
106 : /**
107 : * Returns true if this iterator has no more characters to return.
108 : */
109 : UBool atEnd() const;
110 :
111 : /**
112 : * Returns the next character using the given options, or DONE if there
113 : * are no more characters, and advance the position to the next
114 : * character.
115 : * @param options one or more of the following options, bitwise-OR-ed
116 : * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
117 : * @param isEscaped output parameter set to TRUE if the character
118 : * was escaped
119 : * @param ec input-output error code. An error will only be set by
120 : * this routine if options includes PARSE_VARIABLES and an unknown
121 : * variable name is seen, or if options includes PARSE_ESCAPES and
122 : * an invalid escape sequence is seen.
123 : * @return the current 32-bit code point, or DONE
124 : */
125 : UChar32 next(int32_t options, UBool& isEscaped, UErrorCode& ec);
126 :
127 : /**
128 : * Returns true if this iterator is currently within a variable expansion.
129 : */
130 : inline UBool inVariable() const;
131 :
132 : /**
133 : * An opaque object representing the position of a RuleCharacterIterator.
134 : */
135 : struct Pos : public UMemory {
136 : private:
137 : const UnicodeString* buf;
138 : int32_t pos;
139 : int32_t bufPos;
140 : friend class RuleCharacterIterator;
141 : };
142 :
143 : /**
144 : * Sets an object which, when later passed to setPos(), will
145 : * restore this iterator's position. Usage idiom:
146 : *
147 : * RuleCharacterIterator iterator = ...;
148 : * RuleCharacterIterator::Pos pos;
149 : * iterator.getPos(pos);
150 : * for (;;) {
151 : * iterator.getPos(pos);
152 : * int c = iterator.next(...);
153 : * ...
154 : * }
155 : * iterator.setPos(pos);
156 : *
157 : * @param p a position object to be set to this iterator's
158 : * current position.
159 : */
160 : void getPos(Pos& p) const;
161 :
162 : /**
163 : * Restores this iterator to the position it had when getPos()
164 : * set the given object.
165 : * @param p a position object previously set by getPos()
166 : */
167 : void setPos(const Pos& p);
168 :
169 : /**
170 : * Skips ahead past any ignored characters, as indicated by the given
171 : * options. This is useful in conjunction with the lookahead() method.
172 : *
173 : * Currently, this only has an effect for SKIP_WHITESPACE.
174 : * @param options one or more of the following options, bitwise-OR-ed
175 : * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
176 : */
177 : void skipIgnored(int32_t options);
178 :
179 : /**
180 : * Returns a string containing the remainder of the characters to be
181 : * returned by this iterator, without any option processing. If the
182 : * iterator is currently within a variable expansion, this will only
183 : * extend to the end of the variable expansion. This method is provided
184 : * so that iterators may interoperate with string-based APIs. The typical
185 : * sequence of calls is to call skipIgnored(), then call lookahead(), then
186 : * parse the string returned by lookahead(), then call jumpahead() to
187 : * resynchronize the iterator.
188 : * @param result a string to receive the characters to be returned
189 : * by future calls to next()
190 : * @param maxLookAhead The maximum to copy into the result; defaults to -1.
191 : * @return a reference to result
192 : */
193 : UnicodeString& lookahead(UnicodeString& result, int32_t maxLookAhead = -1) const;
194 :
195 : /**
196 : * Advances the position by the given number of 16-bit code units.
197 : * This is useful in conjunction with the lookahead() method.
198 : * @param count the number of 16-bit code units to jump over
199 : */
200 : void jumpahead(int32_t count);
201 :
202 : /**
203 : * Returns a string representation of this object, consisting of the
204 : * characters being iterated, with a '|' marking the current position.
205 : * Position within an expanded variable is <em>not</em> indicated.
206 : * @param result output parameter to receive a string
207 : * representation of this object (the declaration below is commented out)
208 : */
209 : // UnicodeString& toString(UnicodeString& result) const;
210 :
211 : private:
212 : /**
213 : * Returns the current 32-bit code point without parsing escapes, parsing
214 : * variables, or skipping whitespace.
215 : * @return the current 32-bit code point
216 : */
217 : UChar32 _current() const;
218 :
219 : /**
220 : * Advances the position by the given amount.
221 : * @param count the number of 16-bit code units to advance past
222 : */
223 : void _advance(int32_t count);
224 : };
225 :
226 0 : inline UBool RuleCharacterIterator::inVariable() const {
227 0 : return buf != nullptr; // buf points at the current variable expansion, or is null if there is none
228 : }
229 :
230 : U_NAMESPACE_END
231 :
232 : #endif // _RULEITER_H_
233 : //eof
|