Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : *
6 : * Copyright (C) 2002-2012, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : *******************************************************************************
10 : * file name: uiter.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2002jan18
16 : * created by: Markus W. Scherer
17 : */
18 :
19 : #include "unicode/utypes.h"
20 : #include "unicode/ustring.h"
21 : #include "unicode/chariter.h"
22 : #include "unicode/rep.h"
23 : #include "unicode/uiter.h"
24 : #include "unicode/utf.h"
25 : #include "unicode/utf8.h"
26 : #include "unicode/utf16.h"
27 : #include "cstring.h"
28 :
29 : U_NAMESPACE_USE
30 :
/* TRUE if the integer n is even. */
#define IS_EVEN(n) (((n)&1)==0)
/*
 * TRUE if the address held in p is even (2-aligned).
 * The argument is parenthesized so that an expression such as p+1 is
 * evaluated as a pointer first and only then converted to an integer.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
33 :
34 : U_CDECL_BEGIN
35 :
36 : /* No-Op UCharIterator implementation for illegal input --------------------- */
37 :
38 : static int32_t U_CALLCONV
39 0 : noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
40 0 : return 0;
41 : }
42 :
43 : static int32_t U_CALLCONV
44 0 : noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
45 0 : return 0;
46 : }
47 :
48 : static UBool U_CALLCONV
49 0 : noopHasNext(UCharIterator * /*iter*/) {
50 0 : return FALSE;
51 : }
52 :
53 : static UChar32 U_CALLCONV
54 0 : noopCurrent(UCharIterator * /*iter*/) {
55 0 : return U_SENTINEL;
56 : }
57 :
58 : static uint32_t U_CALLCONV
59 0 : noopGetState(const UCharIterator * /*iter*/) {
60 0 : return UITER_NO_STATE;
61 : }
62 :
63 : static void U_CALLCONV
64 0 : noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
65 0 : *pErrorCode=U_UNSUPPORTED_ERROR;
66 0 : }
67 :
68 : static const UCharIterator noopIterator={
69 : 0, 0, 0, 0, 0, 0,
70 : noopGetIndex,
71 : noopMove,
72 : noopHasNext,
73 : noopHasNext,
74 : noopCurrent,
75 : noopCurrent,
76 : noopCurrent,
77 : NULL,
78 : noopGetState,
79 : noopSetState
80 : };
81 :
82 : /* UCharIterator implementation for simple strings -------------------------- */
83 :
84 : /*
85 : * This is an implementation of a code unit (UChar) iterator
86 : * for UChar * strings.
87 : *
88 : * The UCharIterator.context field holds a pointer to the string.
89 : */
90 :
91 : static int32_t U_CALLCONV
92 0 : stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
93 0 : switch(origin) {
94 : case UITER_ZERO:
95 0 : return 0;
96 : case UITER_START:
97 0 : return iter->start;
98 : case UITER_CURRENT:
99 0 : return iter->index;
100 : case UITER_LIMIT:
101 0 : return iter->limit;
102 : case UITER_LENGTH:
103 0 : return iter->length;
104 : default:
105 : /* not a valid origin */
106 : /* Should never get here! */
107 0 : return -1;
108 : }
109 : }
110 :
111 : static int32_t U_CALLCONV
112 0 : stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
113 : int32_t pos;
114 :
115 0 : switch(origin) {
116 : case UITER_ZERO:
117 0 : pos=delta;
118 0 : break;
119 : case UITER_START:
120 0 : pos=iter->start+delta;
121 0 : break;
122 : case UITER_CURRENT:
123 0 : pos=iter->index+delta;
124 0 : break;
125 : case UITER_LIMIT:
126 0 : pos=iter->limit+delta;
127 0 : break;
128 : case UITER_LENGTH:
129 0 : pos=iter->length+delta;
130 0 : break;
131 : default:
132 0 : return -1; /* Error */
133 : }
134 :
135 0 : if(pos<iter->start) {
136 0 : pos=iter->start;
137 0 : } else if(pos>iter->limit) {
138 0 : pos=iter->limit;
139 : }
140 :
141 0 : return iter->index=pos;
142 : }
143 :
144 : static UBool U_CALLCONV
145 0 : stringIteratorHasNext(UCharIterator *iter) {
146 0 : return iter->index<iter->limit;
147 : }
148 :
149 : static UBool U_CALLCONV
150 0 : stringIteratorHasPrevious(UCharIterator *iter) {
151 0 : return iter->index>iter->start;
152 : }
153 :
154 : static UChar32 U_CALLCONV
155 0 : stringIteratorCurrent(UCharIterator *iter) {
156 0 : if(iter->index<iter->limit) {
157 0 : return ((const UChar *)(iter->context))[iter->index];
158 : } else {
159 0 : return U_SENTINEL;
160 : }
161 : }
162 :
163 : static UChar32 U_CALLCONV
164 0 : stringIteratorNext(UCharIterator *iter) {
165 0 : if(iter->index<iter->limit) {
166 0 : return ((const UChar *)(iter->context))[iter->index++];
167 : } else {
168 0 : return U_SENTINEL;
169 : }
170 : }
171 :
172 : static UChar32 U_CALLCONV
173 0 : stringIteratorPrevious(UCharIterator *iter) {
174 0 : if(iter->index>iter->start) {
175 0 : return ((const UChar *)(iter->context))[--iter->index];
176 : } else {
177 0 : return U_SENTINEL;
178 : }
179 : }
180 :
181 : static uint32_t U_CALLCONV
182 0 : stringIteratorGetState(const UCharIterator *iter) {
183 0 : return (uint32_t)iter->index;
184 : }
185 :
186 : static void U_CALLCONV
187 0 : stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
188 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
189 : /* do nothing */
190 0 : } else if(iter==NULL) {
191 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
192 0 : } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
193 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
194 : } else {
195 0 : iter->index=(int32_t)state;
196 : }
197 0 : }
198 :
199 : static const UCharIterator stringIterator={
200 : 0, 0, 0, 0, 0, 0,
201 : stringIteratorGetIndex,
202 : stringIteratorMove,
203 : stringIteratorHasNext,
204 : stringIteratorHasPrevious,
205 : stringIteratorCurrent,
206 : stringIteratorNext,
207 : stringIteratorPrevious,
208 : NULL,
209 : stringIteratorGetState,
210 : stringIteratorSetState
211 : };
212 :
213 : U_CAPI void U_EXPORT2
214 0 : uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
215 0 : if(iter!=0) {
216 0 : if(s!=0 && length>=-1) {
217 0 : *iter=stringIterator;
218 0 : iter->context=s;
219 0 : if(length>=0) {
220 0 : iter->length=length;
221 : } else {
222 0 : iter->length=u_strlen(s);
223 : }
224 0 : iter->limit=iter->length;
225 : } else {
226 0 : *iter=noopIterator;
227 : }
228 : }
229 0 : }
230 :
231 : /* UCharIterator implementation for UTF-16BE strings ------------------------ */
232 :
233 : /*
234 : * This is an implementation of a code unit (UChar) iterator
235 : * for UTF-16BE strings, i.e., strings in byte-vectors where
236 : * each UChar is stored as a big-endian pair of bytes.
237 : *
238 : * The UCharIterator.context field holds a pointer to the string.
239 : * Everything works just like with a normal UChar iterator (uiter_setString),
240 : * except that UChars are assembled from byte pairs.
241 : */
242 :
243 : /* internal helper function */
244 : static inline UChar32
245 0 : utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
246 0 : const uint8_t *p=(const uint8_t *)iter->context;
247 0 : return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
248 : }
249 :
250 : static UChar32 U_CALLCONV
251 0 : utf16BEIteratorCurrent(UCharIterator *iter) {
252 : int32_t index;
253 :
254 0 : if((index=iter->index)<iter->limit) {
255 0 : return utf16BEIteratorGet(iter, index);
256 : } else {
257 0 : return U_SENTINEL;
258 : }
259 : }
260 :
261 : static UChar32 U_CALLCONV
262 0 : utf16BEIteratorNext(UCharIterator *iter) {
263 : int32_t index;
264 :
265 0 : if((index=iter->index)<iter->limit) {
266 0 : iter->index=index+1;
267 0 : return utf16BEIteratorGet(iter, index);
268 : } else {
269 0 : return U_SENTINEL;
270 : }
271 : }
272 :
273 : static UChar32 U_CALLCONV
274 0 : utf16BEIteratorPrevious(UCharIterator *iter) {
275 : int32_t index;
276 :
277 0 : if((index=iter->index)>iter->start) {
278 0 : iter->index=--index;
279 0 : return utf16BEIteratorGet(iter, index);
280 : } else {
281 0 : return U_SENTINEL;
282 : }
283 : }
284 :
285 : static const UCharIterator utf16BEIterator={
286 : 0, 0, 0, 0, 0, 0,
287 : stringIteratorGetIndex,
288 : stringIteratorMove,
289 : stringIteratorHasNext,
290 : stringIteratorHasPrevious,
291 : utf16BEIteratorCurrent,
292 : utf16BEIteratorNext,
293 : utf16BEIteratorPrevious,
294 : NULL,
295 : stringIteratorGetState,
296 : stringIteratorSetState
297 : };
298 :
299 : /*
300 : * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
301 : * i.e., before a pair of 0 bytes where the first 0 byte is at an even
302 : * offset from s.
303 : */
304 : static int32_t
305 0 : utf16BE_strlen(const char *s) {
306 0 : if(IS_POINTER_EVEN(s)) {
307 : /*
308 : * even-aligned, call u_strlen(s)
309 : * we are probably on a little-endian machine, but searching for UChar NUL
310 : * does not care about endianness
311 : */
312 0 : return u_strlen((const UChar *)s);
313 : } else {
314 : /* odd-aligned, search for pair of 0 bytes */
315 0 : const char *p=s;
316 :
317 0 : while(!(*p==0 && p[1]==0)) {
318 0 : p+=2;
319 : }
320 0 : return (int32_t)((p-s)/2);
321 : }
322 : }
323 :
324 : U_CAPI void U_EXPORT2
325 0 : uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
326 0 : if(iter!=NULL) {
327 : /* allow only even-length strings (the input length counts bytes) */
328 0 : if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
329 : /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
330 0 : length>>=1;
331 :
332 : if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
333 : /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
334 : uiter_setString(iter, (const UChar *)s, length);
335 : return;
336 : }
337 :
338 0 : *iter=utf16BEIterator;
339 0 : iter->context=s;
340 0 : if(length>=0) {
341 0 : iter->length=length;
342 : } else {
343 0 : iter->length=utf16BE_strlen(s);
344 : }
345 0 : iter->limit=iter->length;
346 : } else {
347 0 : *iter=noopIterator;
348 : }
349 : }
350 : }
351 :
352 : /* UCharIterator wrapper around CharacterIterator --------------------------- */
353 :
354 : /*
355 : * This is wrapper code around a C++ CharacterIterator to
356 : * look like a C UCharIterator.
357 : *
358 : * The UCharIterator.context field holds a pointer to the CharacterIterator.
359 : */
360 :
361 : static int32_t U_CALLCONV
362 0 : characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
363 0 : switch(origin) {
364 : case UITER_ZERO:
365 0 : return 0;
366 : case UITER_START:
367 0 : return ((CharacterIterator *)(iter->context))->startIndex();
368 : case UITER_CURRENT:
369 0 : return ((CharacterIterator *)(iter->context))->getIndex();
370 : case UITER_LIMIT:
371 0 : return ((CharacterIterator *)(iter->context))->endIndex();
372 : case UITER_LENGTH:
373 0 : return ((CharacterIterator *)(iter->context))->getLength();
374 : default:
375 : /* not a valid origin */
376 : /* Should never get here! */
377 0 : return -1;
378 : }
379 : }
380 :
381 : static int32_t U_CALLCONV
382 0 : characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
383 0 : switch(origin) {
384 : case UITER_ZERO:
385 0 : ((CharacterIterator *)(iter->context))->setIndex(delta);
386 0 : return ((CharacterIterator *)(iter->context))->getIndex();
387 : case UITER_START:
388 : case UITER_CURRENT:
389 : case UITER_LIMIT:
390 0 : return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
391 : case UITER_LENGTH:
392 0 : ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
393 0 : return ((CharacterIterator *)(iter->context))->getIndex();
394 : default:
395 : /* not a valid origin */
396 : /* Should never get here! */
397 0 : return -1;
398 : }
399 : }
400 :
401 : static UBool U_CALLCONV
402 0 : characterIteratorHasNext(UCharIterator *iter) {
403 0 : return ((CharacterIterator *)(iter->context))->hasNext();
404 : }
405 :
406 : static UBool U_CALLCONV
407 0 : characterIteratorHasPrevious(UCharIterator *iter) {
408 0 : return ((CharacterIterator *)(iter->context))->hasPrevious();
409 : }
410 :
411 : static UChar32 U_CALLCONV
412 0 : characterIteratorCurrent(UCharIterator *iter) {
413 : UChar32 c;
414 :
415 0 : c=((CharacterIterator *)(iter->context))->current();
416 0 : if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
417 0 : return c;
418 : } else {
419 0 : return U_SENTINEL;
420 : }
421 : }
422 :
423 : static UChar32 U_CALLCONV
424 0 : characterIteratorNext(UCharIterator *iter) {
425 0 : if(((CharacterIterator *)(iter->context))->hasNext()) {
426 0 : return ((CharacterIterator *)(iter->context))->nextPostInc();
427 : } else {
428 0 : return U_SENTINEL;
429 : }
430 : }
431 :
432 : static UChar32 U_CALLCONV
433 0 : characterIteratorPrevious(UCharIterator *iter) {
434 0 : if(((CharacterIterator *)(iter->context))->hasPrevious()) {
435 0 : return ((CharacterIterator *)(iter->context))->previous();
436 : } else {
437 0 : return U_SENTINEL;
438 : }
439 : }
440 :
441 : static uint32_t U_CALLCONV
442 0 : characterIteratorGetState(const UCharIterator *iter) {
443 0 : return ((CharacterIterator *)(iter->context))->getIndex();
444 : }
445 :
446 : static void U_CALLCONV
447 0 : characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
448 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
449 : /* do nothing */
450 0 : } else if(iter==NULL || iter->context==NULL) {
451 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
452 0 : } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
453 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
454 : } else {
455 0 : ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
456 : }
457 0 : }
458 :
459 : static const UCharIterator characterIteratorWrapper={
460 : 0, 0, 0, 0, 0, 0,
461 : characterIteratorGetIndex,
462 : characterIteratorMove,
463 : characterIteratorHasNext,
464 : characterIteratorHasPrevious,
465 : characterIteratorCurrent,
466 : characterIteratorNext,
467 : characterIteratorPrevious,
468 : NULL,
469 : characterIteratorGetState,
470 : characterIteratorSetState
471 : };
472 :
473 : U_CAPI void U_EXPORT2
474 0 : uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
475 0 : if(iter!=0) {
476 0 : if(charIter!=0) {
477 0 : *iter=characterIteratorWrapper;
478 0 : iter->context=charIter;
479 : } else {
480 0 : *iter=noopIterator;
481 : }
482 : }
483 0 : }
484 :
485 : /* UCharIterator wrapper around Replaceable --------------------------------- */
486 :
487 : /*
488 : * This is an implementation of a code unit (UChar) iterator
489 : * based on a Replaceable object.
490 : *
491 : * The UCharIterator.context field holds a pointer to the Replaceable.
492 : * UCharIterator.length and UCharIterator.index hold Replaceable.length()
493 : * and the iteration index.
494 : */
495 :
496 : static UChar32 U_CALLCONV
497 0 : replaceableIteratorCurrent(UCharIterator *iter) {
498 0 : if(iter->index<iter->limit) {
499 0 : return ((Replaceable *)(iter->context))->charAt(iter->index);
500 : } else {
501 0 : return U_SENTINEL;
502 : }
503 : }
504 :
505 : static UChar32 U_CALLCONV
506 0 : replaceableIteratorNext(UCharIterator *iter) {
507 0 : if(iter->index<iter->limit) {
508 0 : return ((Replaceable *)(iter->context))->charAt(iter->index++);
509 : } else {
510 0 : return U_SENTINEL;
511 : }
512 : }
513 :
514 : static UChar32 U_CALLCONV
515 0 : replaceableIteratorPrevious(UCharIterator *iter) {
516 0 : if(iter->index>iter->start) {
517 0 : return ((Replaceable *)(iter->context))->charAt(--iter->index);
518 : } else {
519 0 : return U_SENTINEL;
520 : }
521 : }
522 :
523 : static const UCharIterator replaceableIterator={
524 : 0, 0, 0, 0, 0, 0,
525 : stringIteratorGetIndex,
526 : stringIteratorMove,
527 : stringIteratorHasNext,
528 : stringIteratorHasPrevious,
529 : replaceableIteratorCurrent,
530 : replaceableIteratorNext,
531 : replaceableIteratorPrevious,
532 : NULL,
533 : stringIteratorGetState,
534 : stringIteratorSetState
535 : };
536 :
537 : U_CAPI void U_EXPORT2
538 0 : uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
539 0 : if(iter!=0) {
540 0 : if(rep!=0) {
541 0 : *iter=replaceableIterator;
542 0 : iter->context=rep;
543 0 : iter->limit=iter->length=rep->length();
544 : } else {
545 0 : *iter=noopIterator;
546 : }
547 : }
548 0 : }
549 :
550 : /* UCharIterator implementation for UTF-8 strings --------------------------- */
551 :
/*
 * Possible, and probably necessary only for an implementation for arbitrary
 * converters:
 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
 * This would require turning reservedFn into a close function and
 * introducing a uiter_close(iter).
 */
558 : */
559 :
560 : #define UITER_CNV_CAPACITY 16
561 :
562 : /*
563 : * Minimal implementation:
564 : * Maintain a single-UChar buffer for an additional surrogate.
565 : * The caller must not modify start and limit because they are used internally.
566 : *
567 : * Use UCharIterator fields as follows:
568 : * context pointer to UTF-8 string
569 : * length UTF-16 length of the string; -1 until lazy evaluation
570 : * start current UTF-8 index
571 : * index current UTF-16 index; may be -1="unknown" after setState()
572 : * limit UTF-8 length of the string
573 : * reservedField supplementary code point
574 : *
575 : * Since UCharIterator delivers 16-bit code units, the iteration can be
576 : * currently in the middle of the byte sequence for a supplementary code point.
577 : * In this case, reservedField will contain that code point and start will
578 : * point to after the corresponding byte sequence. The UTF-16 index will be
579 : * one less than what it would otherwise be corresponding to the UTF-8 index.
580 : * Otherwise, reservedField will be 0.
581 : */
582 :
583 : /*
584 : * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
585 : * Add implementations that do not call strlen() for iteration but check for NUL.
586 : */
587 :
588 : static int32_t U_CALLCONV
589 0 : utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
590 0 : switch(origin) {
591 : case UITER_ZERO:
592 : case UITER_START:
593 0 : return 0;
594 : case UITER_CURRENT:
595 0 : if(iter->index<0) {
596 : /* the current UTF-16 index is unknown after setState(), count from the beginning */
597 : const uint8_t *s;
598 : UChar32 c;
599 : int32_t i, limit, index;
600 :
601 0 : s=(const uint8_t *)iter->context;
602 0 : i=index=0;
603 0 : limit=iter->start; /* count up to the UTF-8 index */
604 0 : while(i<limit) {
605 0 : U8_NEXT_OR_FFFD(s, i, limit, c);
606 0 : index+=U16_LENGTH(c);
607 : }
608 :
609 0 : iter->start=i; /* just in case setState() did not get us to a code point boundary */
610 0 : if(i==iter->limit) {
611 0 : iter->length=index; /* in case it was <0 or wrong */
612 : }
613 0 : if(iter->reservedField!=0) {
614 0 : --index; /* we are in the middle of a supplementary code point */
615 : }
616 0 : iter->index=index;
617 : }
618 0 : return iter->index;
619 : case UITER_LIMIT:
620 : case UITER_LENGTH:
621 0 : if(iter->length<0) {
622 : const uint8_t *s;
623 : UChar32 c;
624 : int32_t i, limit, length;
625 :
626 0 : s=(const uint8_t *)iter->context;
627 0 : if(iter->index<0) {
628 : /*
629 : * the current UTF-16 index is unknown after setState(),
630 : * we must first count from the beginning to here
631 : */
632 0 : i=length=0;
633 0 : limit=iter->start;
634 :
635 : /* count from the beginning to the current index */
636 0 : while(i<limit) {
637 0 : U8_NEXT_OR_FFFD(s, i, limit, c);
638 0 : length+=U16_LENGTH(c);
639 : }
640 :
641 : /* assume i==limit==iter->start, set the UTF-16 index */
642 0 : iter->start=i; /* just in case setState() did not get us to a code point boundary */
643 0 : iter->index= iter->reservedField!=0 ? length-1 : length;
644 : } else {
645 0 : i=iter->start;
646 0 : length=iter->index;
647 0 : if(iter->reservedField!=0) {
648 0 : ++length;
649 : }
650 : }
651 :
652 : /* count from the current index to the end */
653 0 : limit=iter->limit;
654 0 : while(i<limit) {
655 0 : U8_NEXT_OR_FFFD(s, i, limit, c);
656 0 : length+=U16_LENGTH(c);
657 : }
658 0 : iter->length=length;
659 : }
660 0 : return iter->length;
661 : default:
662 : /* not a valid origin */
663 : /* Should never get here! */
664 0 : return -1;
665 : }
666 : }
667 :
668 : static int32_t U_CALLCONV
669 0 : utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
670 : const uint8_t *s;
671 : UChar32 c;
672 : int32_t pos; /* requested UTF-16 index */
673 : int32_t i; /* UTF-8 index */
674 : UBool havePos;
675 :
676 : /* calculate the requested UTF-16 index */
677 0 : switch(origin) {
678 : case UITER_ZERO:
679 : case UITER_START:
680 0 : pos=delta;
681 0 : havePos=TRUE;
682 : /* iter->index<0 (unknown) is possible */
683 0 : break;
684 : case UITER_CURRENT:
685 0 : if(iter->index>=0) {
686 0 : pos=iter->index+delta;
687 0 : havePos=TRUE;
688 : } else {
689 : /* the current UTF-16 index is unknown after setState(), use only delta */
690 0 : pos=0;
691 0 : havePos=FALSE;
692 : }
693 0 : break;
694 : case UITER_LIMIT:
695 : case UITER_LENGTH:
696 0 : if(iter->length>=0) {
697 0 : pos=iter->length+delta;
698 0 : havePos=TRUE;
699 : } else {
700 : /* pin to the end, avoid counting the length */
701 0 : iter->index=-1;
702 0 : iter->start=iter->limit;
703 0 : iter->reservedField=0;
704 0 : if(delta>=0) {
705 0 : return UITER_UNKNOWN_INDEX;
706 : } else {
707 : /* the current UTF-16 index is unknown, use only delta */
708 0 : pos=0;
709 0 : havePos=FALSE;
710 : }
711 : }
712 0 : break;
713 : default:
714 0 : return -1; /* Error */
715 : }
716 :
717 0 : if(havePos) {
718 : /* shortcuts: pinning to the edges of the string */
719 0 : if(pos<=0) {
720 0 : iter->index=iter->start=iter->reservedField=0;
721 0 : return 0;
722 0 : } else if(iter->length>=0 && pos>=iter->length) {
723 0 : iter->index=iter->length;
724 0 : iter->start=iter->limit;
725 0 : iter->reservedField=0;
726 0 : return iter->index;
727 : }
728 :
729 : /* minimize the number of U8_NEXT/PREV operations */
730 0 : if(iter->index<0 || pos<iter->index/2) {
731 : /* go forward from the start instead of backward from the current index */
732 0 : iter->index=iter->start=iter->reservedField=0;
733 0 : } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
734 : /*
735 : * if we have the UTF-16 index and length and the new position is
736 : * closer to the end than the current index,
737 : * then go backward from the end instead of forward from the current index
738 : */
739 0 : iter->index=iter->length;
740 0 : iter->start=iter->limit;
741 0 : iter->reservedField=0;
742 : }
743 :
744 0 : delta=pos-iter->index;
745 0 : if(delta==0) {
746 0 : return iter->index; /* nothing to do */
747 : }
748 : } else {
749 : /* move relative to unknown UTF-16 index */
750 0 : if(delta==0) {
751 0 : return UITER_UNKNOWN_INDEX; /* nothing to do */
752 0 : } else if(-delta>=iter->start) {
753 : /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
754 0 : iter->index=iter->start=iter->reservedField=0;
755 0 : return 0;
756 0 : } else if(delta>=(iter->limit-iter->start)) {
757 : /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
758 0 : iter->index=iter->length; /* may or may not be <0 (unknown) */
759 0 : iter->start=iter->limit;
760 0 : iter->reservedField=0;
761 0 : return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
762 : }
763 : }
764 :
765 : /* delta!=0 */
766 :
767 : /* move towards the requested position, pin to the edges of the string */
768 0 : s=(const uint8_t *)iter->context;
769 0 : pos=iter->index; /* could be <0 (unknown) */
770 0 : i=iter->start;
771 0 : if(delta>0) {
772 : /* go forward */
773 0 : int32_t limit=iter->limit;
774 0 : if(iter->reservedField!=0) {
775 0 : iter->reservedField=0;
776 0 : ++pos;
777 0 : --delta;
778 : }
779 0 : while(delta>0 && i<limit) {
780 0 : U8_NEXT_OR_FFFD(s, i, limit, c);
781 0 : if(c<=0xffff) {
782 0 : ++pos;
783 0 : --delta;
784 0 : } else if(delta>=2) {
785 0 : pos+=2;
786 0 : delta-=2;
787 : } else /* delta==1 */ {
788 : /* stop in the middle of a supplementary code point */
789 0 : iter->reservedField=c;
790 0 : ++pos;
791 0 : break; /* delta=0; */
792 : }
793 : }
794 0 : if(i==limit) {
795 0 : if(iter->length<0 && iter->index>=0) {
796 0 : iter->length= iter->reservedField==0 ? pos : pos+1;
797 0 : } else if(iter->index<0 && iter->length>=0) {
798 0 : iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
799 : }
800 : }
801 : } else /* delta<0 */ {
802 : /* go backward */
803 0 : if(iter->reservedField!=0) {
804 0 : iter->reservedField=0;
805 0 : i-=4; /* we stayed behind the supplementary code point; go before it now */
806 0 : --pos;
807 0 : ++delta;
808 : }
809 0 : while(delta<0 && i>0) {
810 0 : U8_PREV_OR_FFFD(s, 0, i, c);
811 0 : if(c<=0xffff) {
812 0 : --pos;
813 0 : ++delta;
814 0 : } else if(delta<=-2) {
815 0 : pos-=2;
816 0 : delta+=2;
817 : } else /* delta==-1 */ {
818 : /* stop in the middle of a supplementary code point */
819 0 : i+=4; /* back to behind this supplementary code point for consistent state */
820 0 : iter->reservedField=c;
821 0 : --pos;
822 0 : break; /* delta=0; */
823 : }
824 : }
825 : }
826 :
827 0 : iter->start=i;
828 0 : if(iter->index>=0) {
829 0 : return iter->index=pos;
830 : } else {
831 : /* we started with index<0 (unknown) so pos is bogus */
832 0 : if(i<=1) {
833 0 : return iter->index=i; /* reached the beginning */
834 : } else {
835 : /* we still don't know the UTF-16 index */
836 0 : return UITER_UNKNOWN_INDEX;
837 : }
838 : }
839 : }
840 :
841 : static UBool U_CALLCONV
842 0 : utf8IteratorHasNext(UCharIterator *iter) {
843 0 : return iter->start<iter->limit || iter->reservedField!=0;
844 : }
845 :
846 : static UBool U_CALLCONV
847 0 : utf8IteratorHasPrevious(UCharIterator *iter) {
848 0 : return iter->start>0;
849 : }
850 :
851 : static UChar32 U_CALLCONV
852 0 : utf8IteratorCurrent(UCharIterator *iter) {
853 0 : if(iter->reservedField!=0) {
854 0 : return U16_TRAIL(iter->reservedField);
855 0 : } else if(iter->start<iter->limit) {
856 0 : const uint8_t *s=(const uint8_t *)iter->context;
857 : UChar32 c;
858 0 : int32_t i=iter->start;
859 :
860 0 : U8_NEXT_OR_FFFD(s, i, iter->limit, c);
861 0 : if(c<=0xffff) {
862 0 : return c;
863 : } else {
864 0 : return U16_LEAD(c);
865 : }
866 : } else {
867 0 : return U_SENTINEL;
868 : }
869 : }
870 :
871 : static UChar32 U_CALLCONV
872 0 : utf8IteratorNext(UCharIterator *iter) {
873 : int32_t index;
874 :
875 0 : if(iter->reservedField!=0) {
876 0 : UChar trail=U16_TRAIL(iter->reservedField);
877 0 : iter->reservedField=0;
878 0 : if((index=iter->index)>=0) {
879 0 : iter->index=index+1;
880 : }
881 0 : return trail;
882 0 : } else if(iter->start<iter->limit) {
883 0 : const uint8_t *s=(const uint8_t *)iter->context;
884 : UChar32 c;
885 :
886 0 : U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
887 0 : if((index=iter->index)>=0) {
888 0 : iter->index=++index;
889 0 : if(iter->length<0 && iter->start==iter->limit) {
890 0 : iter->length= c<=0xffff ? index : index+1;
891 : }
892 0 : } else if(iter->start==iter->limit && iter->length>=0) {
893 0 : iter->index= c<=0xffff ? iter->length : iter->length-1;
894 : }
895 0 : if(c<=0xffff) {
896 0 : return c;
897 : } else {
898 0 : iter->reservedField=c;
899 0 : return U16_LEAD(c);
900 : }
901 : } else {
902 0 : return U_SENTINEL;
903 : }
904 : }
905 :
906 : static UChar32 U_CALLCONV
907 0 : utf8IteratorPrevious(UCharIterator *iter) {
908 : int32_t index;
909 :
910 0 : if(iter->reservedField!=0) {
911 0 : UChar lead=U16_LEAD(iter->reservedField);
912 0 : iter->reservedField=0;
913 0 : iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
914 0 : if((index=iter->index)>0) {
915 0 : iter->index=index-1;
916 : }
917 0 : return lead;
918 0 : } else if(iter->start>0) {
919 0 : const uint8_t *s=(const uint8_t *)iter->context;
920 : UChar32 c;
921 :
922 0 : U8_PREV_OR_FFFD(s, 0, iter->start, c);
923 0 : if((index=iter->index)>0) {
924 0 : iter->index=index-1;
925 0 : } else if(iter->start<=1) {
926 0 : iter->index= c<=0xffff ? iter->start : iter->start+1;
927 : }
928 0 : if(c<=0xffff) {
929 0 : return c;
930 : } else {
931 0 : iter->start+=4; /* back to behind this supplementary code point for consistent state */
932 0 : iter->reservedField=c;
933 0 : return U16_TRAIL(c);
934 : }
935 : } else {
936 0 : return U_SENTINEL;
937 : }
938 : }
939 :
940 : static uint32_t U_CALLCONV
941 0 : utf8IteratorGetState(const UCharIterator *iter) {
942 0 : uint32_t state=(uint32_t)(iter->start<<1);
943 0 : if(iter->reservedField!=0) {
944 0 : state|=1;
945 : }
946 0 : return state;
947 : }
948 :
949 : static void U_CALLCONV
950 0 : utf8IteratorSetState(UCharIterator *iter,
951 : uint32_t state,
952 : UErrorCode *pErrorCode)
953 : {
954 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
955 : /* do nothing */
956 0 : } else if(iter==NULL) {
957 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
958 0 : } else if(state==utf8IteratorGetState(iter)) {
959 : /* setting to the current state: no-op */
960 : } else {
961 0 : int32_t index=(int32_t)(state>>1); /* UTF-8 index */
962 0 : state&=1; /* 1 if in surrogate pair, must be index>=4 */
963 :
964 0 : if((state==0 ? index<0 : index<4) || iter->limit<index) {
965 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
966 : } else {
967 0 : iter->start=index; /* restore UTF-8 byte index */
968 0 : if(index<=1) {
969 0 : iter->index=index;
970 : } else {
971 0 : iter->index=-1; /* unknown UTF-16 index */
972 : }
973 0 : if(state==0) {
974 0 : iter->reservedField=0;
975 : } else {
976 : /* verified index>=4 above */
977 : UChar32 c;
978 0 : U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
979 0 : if(c<=0xffff) {
980 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
981 : } else {
982 0 : iter->reservedField=c;
983 : }
984 : }
985 : }
986 : }
987 0 : }
988 :
989 : static const UCharIterator utf8Iterator={
990 : 0, 0, 0, 0, 0, 0,
991 : utf8IteratorGetIndex,
992 : utf8IteratorMove,
993 : utf8IteratorHasNext,
994 : utf8IteratorHasPrevious,
995 : utf8IteratorCurrent,
996 : utf8IteratorNext,
997 : utf8IteratorPrevious,
998 : NULL,
999 : utf8IteratorGetState,
1000 : utf8IteratorSetState
1001 : };
1002 :
1003 : U_CAPI void U_EXPORT2
1004 0 : uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
1005 0 : if(iter!=0) {
1006 0 : if(s!=0 && length>=-1) {
1007 0 : *iter=utf8Iterator;
1008 0 : iter->context=s;
1009 0 : if(length>=0) {
1010 0 : iter->limit=length;
1011 : } else {
1012 0 : iter->limit=(int32_t)uprv_strlen(s);
1013 : }
1014 0 : iter->length= iter->limit<=1 ? iter->limit : -1;
1015 : } else {
1016 0 : *iter=noopIterator;
1017 : }
1018 : }
1019 0 : }
1020 :
1021 : /* Helper functions --------------------------------------------------------- */
1022 :
1023 : U_CAPI UChar32 U_EXPORT2
1024 0 : uiter_current32(UCharIterator *iter) {
1025 : UChar32 c, c2;
1026 :
1027 0 : c=iter->current(iter);
1028 0 : if(U16_IS_SURROGATE(c)) {
1029 0 : if(U16_IS_SURROGATE_LEAD(c)) {
1030 : /*
1031 : * go to the next code unit
1032 : * we know that we are not at the limit because c!=U_SENTINEL
1033 : */
1034 0 : iter->move(iter, 1, UITER_CURRENT);
1035 0 : if(U16_IS_TRAIL(c2=iter->current(iter))) {
1036 0 : c=U16_GET_SUPPLEMENTARY(c, c2);
1037 : }
1038 :
1039 : /* undo index movement */
1040 0 : iter->move(iter, -1, UITER_CURRENT);
1041 : } else {
1042 0 : if(U16_IS_LEAD(c2=iter->previous(iter))) {
1043 0 : c=U16_GET_SUPPLEMENTARY(c2, c);
1044 : }
1045 0 : if(c2>=0) {
1046 : /* undo index movement */
1047 0 : iter->move(iter, 1, UITER_CURRENT);
1048 : }
1049 : }
1050 : }
1051 0 : return c;
1052 : }
1053 :
1054 : U_CAPI UChar32 U_EXPORT2
1055 0 : uiter_next32(UCharIterator *iter) {
1056 : UChar32 c, c2;
1057 :
1058 0 : c=iter->next(iter);
1059 0 : if(U16_IS_LEAD(c)) {
1060 0 : if(U16_IS_TRAIL(c2=iter->next(iter))) {
1061 0 : c=U16_GET_SUPPLEMENTARY(c, c2);
1062 0 : } else if(c2>=0) {
1063 : /* unmatched first surrogate, undo index movement */
1064 0 : iter->move(iter, -1, UITER_CURRENT);
1065 : }
1066 : }
1067 0 : return c;
1068 : }
1069 :
1070 : U_CAPI UChar32 U_EXPORT2
1071 0 : uiter_previous32(UCharIterator *iter) {
1072 : UChar32 c, c2;
1073 :
1074 0 : c=iter->previous(iter);
1075 0 : if(U16_IS_TRAIL(c)) {
1076 0 : if(U16_IS_LEAD(c2=iter->previous(iter))) {
1077 0 : c=U16_GET_SUPPLEMENTARY(c2, c);
1078 0 : } else if(c2>=0) {
1079 : /* unmatched second surrogate, undo index movement */
1080 0 : iter->move(iter, 1, UITER_CURRENT);
1081 : }
1082 : }
1083 0 : return c;
1084 : }
1085 :
1086 : U_CAPI uint32_t U_EXPORT2
1087 0 : uiter_getState(const UCharIterator *iter) {
1088 0 : if(iter==NULL || iter->getState==NULL) {
1089 0 : return UITER_NO_STATE;
1090 : } else {
1091 0 : return iter->getState(iter);
1092 : }
1093 : }
1094 :
1095 : U_CAPI void U_EXPORT2
1096 0 : uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
1097 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1098 : /* do nothing */
1099 0 : } else if(iter==NULL) {
1100 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1101 0 : } else if(iter->setState==NULL) {
1102 0 : *pErrorCode=U_UNSUPPORTED_ERROR;
1103 : } else {
1104 0 : iter->setState(iter, state, pErrorCode);
1105 : }
1106 0 : }
1107 :
1108 : U_CDECL_END
|