Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : **********************************************************************
5 : * Copyright (c) 2001-2016, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : **********************************************************************
8 : * Date Name Description
9 : * 11/19/2001 aliu Creation.
10 : **********************************************************************
11 : */
12 :
13 : #include "unicode/uchar.h"
14 : #include "unicode/utf16.h"
15 : #include "patternprops.h"
16 : #include "util.h"
17 :
18 : U_NAMESPACE_BEGIN
19 :
20 : /**
21 : * Parse an integer at pos, either of the form \d+ or of the form
22 : * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
23 : * or octal format.
24 : * @param pos INPUT-OUTPUT parameter. On input, the first
25 : * character to parse. On output, the character after the last
26 : * parsed character.
27 : */
28 0 : int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
29 0 : int32_t count = 0;
30 0 : int32_t value = 0;
31 0 : int32_t p = pos;
32 0 : int8_t radix = 10;
33 :
34 0 : if (p < limit && rule.charAt(p) == 48 /*0*/) {
35 0 : if (p+1 < limit && (rule.charAt(p+1) == 0x78 /*x*/ || rule.charAt(p+1) == 0x58 /*X*/)) {
36 0 : p += 2;
37 0 : radix = 16;
38 : }
39 : else {
40 0 : p++;
41 0 : count = 1;
42 0 : radix = 8;
43 : }
44 : }
45 :
46 0 : while (p < limit) {
47 0 : int32_t d = u_digit(rule.charAt(p++), radix);
48 0 : if (d < 0) {
49 0 : --p;
50 0 : break;
51 : }
52 0 : ++count;
53 0 : int32_t v = (value * radix) + d;
54 0 : if (v <= value) {
55 : // If there are too many input digits, at some point
56 : // the value will go negative, e.g., if we have seen
57 : // "0x8000000" already and there is another '0', when
58 : // we parse the next 0 the value will go negative.
59 0 : return 0;
60 : }
61 0 : value = v;
62 : }
63 0 : if (count > 0) {
64 0 : pos = p;
65 : }
66 0 : return value;
67 : }
68 :
69 : /**
70 : * Parse a pattern string starting at offset pos. Keywords are
71 : * matched case-insensitively. Spaces may be skipped and may be
72 : * optional or required. Integer values may be parsed, and if
73 : * they are, they will be returned in the given array. If
74 : * successful, the offset of the next non-space character is
75 : * returned. On failure, -1 is returned.
76 : * @param pattern must only contain lowercase characters, which
77 : * will match their uppercase equivalents as well. A space
78 : * character matches one or more required spaces. A '~' character
79 : * matches zero or more optional spaces. A '#' character matches
80 : * an integer and stores it in parsedInts, which the caller must
81 : * ensure has enough capacity.
82 : * @param parsedInts array to receive parsed integers. Caller
83 : * must ensure that parsedInts.length is >= the number of '#'
84 : * signs in 'pattern'.
85 : * @return the position after the last character parsed, or -1 if
86 : * the parse failed
87 : */
88 0 : int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
89 : const UnicodeString& pattern, int32_t* parsedInts) {
90 : // TODO Update this to handle surrogates
91 : int32_t p;
92 0 : int32_t intCount = 0; // number of integers parsed
93 0 : for (int32_t i=0; i<pattern.length(); ++i) {
94 0 : UChar cpat = pattern.charAt(i);
95 : UChar c;
96 0 : switch (cpat) {
97 : case 32 /*' '*/:
98 0 : if (pos >= limit) {
99 0 : return -1;
100 : }
101 0 : c = rule.charAt(pos++);
102 0 : if (!PatternProps::isWhiteSpace(c)) {
103 0 : return -1;
104 : }
105 : // FALL THROUGH to skipWhitespace
106 : U_FALLTHROUGH;
107 : case 126 /*'~'*/:
108 0 : pos = skipWhitespace(rule, pos);
109 0 : break;
110 : case 35 /*'#'*/:
111 0 : p = pos;
112 0 : parsedInts[intCount++] = parseInteger(rule, p, limit);
113 0 : if (p == pos) {
114 : // Syntax error; failed to parse integer
115 0 : return -1;
116 : }
117 0 : pos = p;
118 0 : break;
119 : default:
120 0 : if (pos >= limit) {
121 0 : return -1;
122 : }
123 0 : c = (UChar) u_tolower(rule.charAt(pos++));
124 0 : if (c != cpat) {
125 0 : return -1;
126 : }
127 0 : break;
128 : }
129 : }
130 0 : return pos;
131 : }
132 :
133 : /**
134 : * Parse a Unicode identifier from the given string at the given
135 : * position. Return the identifier, or an empty string if there
136 : * is no identifier.
137 : * @param str the string to parse
138 : * @param pos INPUT-OUPUT parameter. On INPUT, pos is the
139 : * first character to examine. It must be less than str.length(),
140 : * and it must not point to a whitespace character. That is, must
141 : * have pos < str.length(). On
142 : * OUTPUT, the position after the last parsed character.
143 : * @return the Unicode identifier, or an empty string if there is
144 : * no valid identifier at pos.
145 : */
146 0 : UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
147 : // assert(pos < str.length());
148 0 : UnicodeString buf;
149 0 : int p = pos;
150 0 : while (p < str.length()) {
151 0 : UChar32 ch = str.char32At(p);
152 0 : if (buf.length() == 0) {
153 0 : if (u_isIDStart(ch)) {
154 0 : buf.append(ch);
155 : } else {
156 0 : buf.truncate(0);
157 0 : return buf;
158 : }
159 : } else {
160 0 : if (u_isIDPart(ch)) {
161 0 : buf.append(ch);
162 : } else {
163 0 : break;
164 : }
165 : }
166 0 : p += U16_LENGTH(ch);
167 : }
168 0 : pos = p;
169 0 : return buf;
170 : }
171 :
172 : /**
173 : * Parse an unsigned 31-bit integer at the given offset. Use
174 : * UCharacter.digit() to parse individual characters into digits.
175 : * @param text the text to be parsed
176 : * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the
177 : * offset within text at which to start parsing; it should point
178 : * to a valid digit. On exit, pos[0] is the offset after the last
179 : * parsed character. If the parse failed, it will be unchanged on
180 : * exit. Must be >= 0 on entry.
181 : * @param radix the radix in which to parse; must be >= 2 and <=
182 : * 36.
183 : * @return a non-negative parsed number, or -1 upon parse failure.
184 : * Parse fails if there are no digits, that is, if pos[0] does not
185 : * point to a valid digit on entry, or if the number to be parsed
186 : * does not fit into a 31-bit unsigned integer.
187 : */
188 0 : int32_t ICU_Utility::parseNumber(const UnicodeString& text,
189 : int32_t& pos, int8_t radix) {
190 : // assert(pos[0] >= 0);
191 : // assert(radix >= 2);
192 : // assert(radix <= 36);
193 0 : int32_t n = 0;
194 0 : int32_t p = pos;
195 0 : while (p < text.length()) {
196 0 : UChar32 ch = text.char32At(p);
197 0 : int32_t d = u_digit(ch, radix);
198 0 : if (d < 0) {
199 0 : break;
200 : }
201 0 : n = radix*n + d;
202 : // ASSUME that when a 32-bit integer overflows it becomes
203 : // negative. E.g., 214748364 * 10 + 8 => negative value.
204 0 : if (n < 0) {
205 0 : return -1;
206 : }
207 0 : ++p;
208 : }
209 0 : if (p == pos) {
210 0 : return -1;
211 : }
212 0 : pos = p;
213 0 : return n;
214 : }
215 :
216 : U_NAMESPACE_END
217 :
|