Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : *
6 : * Copyright (C) 2005-2016, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : *******************************************************************************
10 : * file name: utext.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2005apr12
16 : * created by: Markus W. Scherer
17 : */
18 :
19 : #include "unicode/utypes.h"
20 : #include "unicode/ustring.h"
21 : #include "unicode/unistr.h"
22 : #include "unicode/chariter.h"
23 : #include "unicode/utext.h"
24 : #include "unicode/utf.h"
25 : #include "unicode/utf8.h"
26 : #include "unicode/utf16.h"
27 : #include "ustr_imp.h"
28 : #include "cmemory.h"
29 : #include "cstring.h"
30 : #include "uassert.h"
31 : #include "putilimp.h"
32 :
33 : U_NAMESPACE_USE
34 :
// I32_FLAG(bitIndex): an int32_t mask with only bit number bitIndex set.
// Used throughout this file to test, set, and clear individual bits of
// UText::providerProperties (for example UTEXT_PROVIDER_WRITABLE).
#define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex))
36 :
37 :
38 : static UBool
39 0 : utext_access(UText *ut, int64_t index, UBool forward) {
40 0 : return ut->pFuncs->access(ut, index, forward);
41 : }
42 :
43 :
44 :
45 : U_CAPI UBool U_EXPORT2
46 0 : utext_moveIndex32(UText *ut, int32_t delta) {
47 : UChar32 c;
48 0 : if (delta > 0) {
49 0 : do {
50 0 : if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
51 0 : return FALSE;
52 : }
53 0 : c = ut->chunkContents[ut->chunkOffset];
54 0 : if (U16_IS_SURROGATE(c)) {
55 0 : c = utext_next32(ut);
56 0 : if (c == U_SENTINEL) {
57 0 : return FALSE;
58 : }
59 : } else {
60 0 : ut->chunkOffset++;
61 : }
62 : } while(--delta>0);
63 :
64 0 : } else if (delta<0) {
65 0 : do {
66 0 : if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
67 0 : return FALSE;
68 : }
69 0 : c = ut->chunkContents[ut->chunkOffset-1];
70 0 : if (U16_IS_SURROGATE(c)) {
71 0 : c = utext_previous32(ut);
72 0 : if (c == U_SENTINEL) {
73 0 : return FALSE;
74 : }
75 : } else {
76 0 : ut->chunkOffset--;
77 : }
78 : } while(++delta<0);
79 : }
80 :
81 0 : return TRUE;
82 : }
83 :
84 :
85 : U_CAPI int64_t U_EXPORT2
86 0 : utext_nativeLength(UText *ut) {
87 0 : return ut->pFuncs->nativeLength(ut);
88 : }
89 :
90 :
91 : U_CAPI UBool U_EXPORT2
92 0 : utext_isLengthExpensive(const UText *ut) {
93 0 : UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
94 0 : return r;
95 : }
96 :
97 :
98 : U_CAPI int64_t U_EXPORT2
99 0 : utext_getNativeIndex(const UText *ut) {
100 0 : if(ut->chunkOffset <= ut->nativeIndexingLimit) {
101 0 : return ut->chunkNativeStart+ut->chunkOffset;
102 : } else {
103 0 : return ut->pFuncs->mapOffsetToNative(ut);
104 : }
105 : }
106 :
107 :
108 : U_CAPI void U_EXPORT2
109 0 : utext_setNativeIndex(UText *ut, int64_t index) {
110 0 : if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
111 : // The desired position is outside of the current chunk.
112 : // Access the new position. Assume a forward iteration from here,
113 : // which will also be optimimum for a single random access.
114 : // Reverse iterations may suffer slightly.
115 0 : ut->pFuncs->access(ut, index, TRUE);
116 0 : } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
117 : // utf-16 indexing.
118 0 : ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
119 : } else {
120 0 : ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
121 : }
122 : // The convention is that the index must always be on a code point boundary.
123 : // Adjust the index position if it is in the middle of a surrogate pair.
124 0 : if (ut->chunkOffset<ut->chunkLength) {
125 0 : UChar c= ut->chunkContents[ut->chunkOffset];
126 0 : if (U16_IS_TRAIL(c)) {
127 0 : if (ut->chunkOffset==0) {
128 0 : ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);
129 : }
130 0 : if (ut->chunkOffset>0) {
131 0 : UChar lead = ut->chunkContents[ut->chunkOffset-1];
132 0 : if (U16_IS_LEAD(lead)) {
133 0 : ut->chunkOffset--;
134 : }
135 : }
136 : }
137 : }
138 0 : }
139 :
140 :
141 :
142 : U_CAPI int64_t U_EXPORT2
143 0 : utext_getPreviousNativeIndex(UText *ut) {
144 : //
145 : // Fast-path the common case.
146 : // Common means current position is not at the beginning of a chunk
147 : // and the preceding character is not supplementary.
148 : //
149 0 : int32_t i = ut->chunkOffset - 1;
150 : int64_t result;
151 0 : if (i >= 0) {
152 0 : UChar c = ut->chunkContents[i];
153 0 : if (U16_IS_TRAIL(c) == FALSE) {
154 0 : if (i <= ut->nativeIndexingLimit) {
155 0 : result = ut->chunkNativeStart + i;
156 : } else {
157 0 : ut->chunkOffset = i;
158 0 : result = ut->pFuncs->mapOffsetToNative(ut);
159 0 : ut->chunkOffset++;
160 : }
161 0 : return result;
162 : }
163 : }
164 :
165 : // If at the start of text, simply return 0.
166 0 : if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
167 0 : return 0;
168 : }
169 :
170 : // Harder, less common cases. We are at a chunk boundary, or on a surrogate.
171 : // Keep it simple, use other functions to handle the edges.
172 : //
173 0 : utext_previous32(ut);
174 0 : result = UTEXT_GETNATIVEINDEX(ut);
175 0 : utext_next32(ut);
176 0 : return result;
177 : }
178 :
179 :
180 : //
181 : // utext_current32. Get the UChar32 at the current position.
182 : // UText iteration position is always on a code point boundary,
183 : // never on the trail half of a surrogate pair.
184 : //
185 : U_CAPI UChar32 U_EXPORT2
186 0 : utext_current32(UText *ut) {
187 : UChar32 c;
188 0 : if (ut->chunkOffset==ut->chunkLength) {
189 : // Current position is just off the end of the chunk.
190 0 : if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
191 : // Off the end of the text.
192 0 : return U_SENTINEL;
193 : }
194 : }
195 :
196 0 : c = ut->chunkContents[ut->chunkOffset];
197 0 : if (U16_IS_LEAD(c) == FALSE) {
198 : // Normal, non-supplementary case.
199 0 : return c;
200 : }
201 :
202 : //
203 : // Possible supplementary char.
204 : //
205 0 : UChar32 trail = 0;
206 0 : UChar32 supplementaryC = c;
207 0 : if ((ut->chunkOffset+1) < ut->chunkLength) {
208 : // The trail surrogate is in the same chunk.
209 0 : trail = ut->chunkContents[ut->chunkOffset+1];
210 : } else {
211 : // The trail surrogate is in a different chunk.
212 : // Because we must maintain the iteration position, we need to switch forward
213 : // into the new chunk, get the trail surrogate, then revert the chunk back to the
214 : // original one.
215 : // An edge case to be careful of: the entire text may end with an unpaired
216 : // leading surrogate. The attempt to access the trail will fail, but
217 : // the original position before the unpaired lead still needs to be restored.
218 0 : int64_t nativePosition = ut->chunkNativeLimit;
219 0 : int32_t originalOffset = ut->chunkOffset;
220 0 : if (ut->pFuncs->access(ut, nativePosition, TRUE)) {
221 0 : trail = ut->chunkContents[ut->chunkOffset];
222 : }
223 0 : UBool r = ut->pFuncs->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk
224 0 : U_ASSERT(r==TRUE);
225 0 : ut->chunkOffset = originalOffset;
226 0 : if(!r) {
227 0 : return U_SENTINEL;
228 : }
229 : }
230 :
231 0 : if (U16_IS_TRAIL(trail)) {
232 0 : supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
233 : }
234 0 : return supplementaryC;
235 :
236 : }
237 :
238 :
239 : U_CAPI UChar32 U_EXPORT2
240 0 : utext_char32At(UText *ut, int64_t nativeIndex) {
241 0 : UChar32 c = U_SENTINEL;
242 :
243 : // Fast path the common case.
244 0 : if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
245 0 : ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
246 0 : c = ut->chunkContents[ut->chunkOffset];
247 0 : if (U16_IS_SURROGATE(c) == FALSE) {
248 0 : return c;
249 : }
250 : }
251 :
252 :
253 0 : utext_setNativeIndex(ut, nativeIndex);
254 0 : if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
255 0 : c = ut->chunkContents[ut->chunkOffset];
256 0 : if (U16_IS_SURROGATE(c)) {
257 : // For surrogates, let current32() deal with the complications
258 : // of supplementaries that may span chunk boundaries.
259 0 : c = utext_current32(ut);
260 : }
261 : }
262 0 : return c;
263 : }
264 :
265 :
266 : U_CAPI UChar32 U_EXPORT2
267 0 : utext_next32(UText *ut) {
268 : UChar32 c;
269 :
270 0 : if (ut->chunkOffset >= ut->chunkLength) {
271 0 : if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
272 0 : return U_SENTINEL;
273 : }
274 : }
275 :
276 0 : c = ut->chunkContents[ut->chunkOffset++];
277 0 : if (U16_IS_LEAD(c) == FALSE) {
278 : // Normal case, not supplementary.
279 : // (A trail surrogate seen here is just returned as is, as a surrogate value.
280 : // It cannot be part of a pair.)
281 0 : return c;
282 : }
283 :
284 0 : if (ut->chunkOffset >= ut->chunkLength) {
285 0 : if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
286 : // c is an unpaired lead surrogate at the end of the text.
287 : // return it as it is.
288 0 : return c;
289 : }
290 : }
291 0 : UChar32 trail = ut->chunkContents[ut->chunkOffset];
292 0 : if (U16_IS_TRAIL(trail) == FALSE) {
293 : // c was an unpaired lead surrogate, not at the end of the text.
294 : // return it as it is (unpaired). Iteration position is on the
295 : // following character, possibly in the next chunk, where the
296 : // trail surrogate would have been if it had existed.
297 0 : return c;
298 : }
299 :
300 0 : UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
301 0 : ut->chunkOffset++; // move iteration position over the trail surrogate.
302 0 : return supplementary;
303 : }
304 :
305 :
306 : U_CAPI UChar32 U_EXPORT2
307 0 : utext_previous32(UText *ut) {
308 : UChar32 c;
309 :
310 0 : if (ut->chunkOffset <= 0) {
311 0 : if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
312 0 : return U_SENTINEL;
313 : }
314 : }
315 0 : ut->chunkOffset--;
316 0 : c = ut->chunkContents[ut->chunkOffset];
317 0 : if (U16_IS_TRAIL(c) == FALSE) {
318 : // Normal case, not supplementary.
319 : // (A lead surrogate seen here is just returned as is, as a surrogate value.
320 : // It cannot be part of a pair.)
321 0 : return c;
322 : }
323 :
324 0 : if (ut->chunkOffset <= 0) {
325 0 : if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
326 : // c is an unpaired trail surrogate at the start of the text.
327 : // return it as it is.
328 0 : return c;
329 : }
330 : }
331 :
332 0 : UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
333 0 : if (U16_IS_LEAD(lead) == FALSE) {
334 : // c was an unpaired trail surrogate, not at the end of the text.
335 : // return it as it is (unpaired). Iteration position is at c
336 0 : return c;
337 : }
338 :
339 0 : UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
340 0 : ut->chunkOffset--; // move iteration position over the lead surrogate.
341 0 : return supplementary;
342 : }
343 :
344 :
345 :
346 : U_CAPI UChar32 U_EXPORT2
347 0 : utext_next32From(UText *ut, int64_t index) {
348 0 : UChar32 c = U_SENTINEL;
349 :
350 0 : if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
351 : // Desired position is outside of the current chunk.
352 0 : if(!ut->pFuncs->access(ut, index, TRUE)) {
353 : // no chunk available here
354 0 : return U_SENTINEL;
355 : }
356 0 : } else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
357 : // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
358 0 : ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
359 : } else {
360 : // Desired position is in chunk, with non-UTF16 indexing.
361 0 : ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
362 : }
363 :
364 0 : c = ut->chunkContents[ut->chunkOffset++];
365 0 : if (U16_IS_SURROGATE(c)) {
366 : // Surrogates. Many edge cases. Use other functions that already
367 : // deal with the problems.
368 0 : utext_setNativeIndex(ut, index);
369 0 : c = utext_next32(ut);
370 : }
371 0 : return c;
372 : }
373 :
374 :
375 : U_CAPI UChar32 U_EXPORT2
376 0 : utext_previous32From(UText *ut, int64_t index) {
377 : //
378 : // Return the character preceding the specified index.
379 : // Leave the iteration position at the start of the character that was returned.
380 : //
381 : UChar32 cPrev; // The character preceding cCurr, which is what we will return.
382 :
383 : // Address the chunk containg the position preceding the incoming index
384 : // A tricky edge case:
385 : // We try to test the requested native index against the chunkNativeStart to determine
386 : // whether the character preceding the one at the index is in the current chunk.
387 : // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
388 : // requested index is on something other than the first position of the first char.
389 : //
390 0 : if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
391 : // Requested native index is outside of the current chunk.
392 0 : if(!ut->pFuncs->access(ut, index, FALSE)) {
393 : // no chunk available here
394 0 : return U_SENTINEL;
395 : }
396 0 : } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
397 : // Direct UTF-16 indexing.
398 0 : ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
399 : } else {
400 0 : ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
401 0 : if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) {
402 : // no chunk available here
403 0 : return U_SENTINEL;
404 : }
405 : }
406 :
407 : //
408 : // Simple case with no surrogates.
409 : //
410 0 : ut->chunkOffset--;
411 0 : cPrev = ut->chunkContents[ut->chunkOffset];
412 :
413 0 : if (U16_IS_SURROGATE(cPrev)) {
414 : // Possible supplementary. Many edge cases.
415 : // Let other functions do the heavy lifting.
416 0 : utext_setNativeIndex(ut, index);
417 0 : cPrev = utext_previous32(ut);
418 : }
419 0 : return cPrev;
420 : }
421 :
422 :
423 : U_CAPI int32_t U_EXPORT2
424 0 : utext_extract(UText *ut,
425 : int64_t start, int64_t limit,
426 : UChar *dest, int32_t destCapacity,
427 : UErrorCode *status) {
428 0 : return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
429 : }
430 :
431 :
432 :
433 : U_CAPI UBool U_EXPORT2
434 0 : utext_equals(const UText *a, const UText *b) {
435 0 : if (a==NULL || b==NULL ||
436 0 : a->magic != UTEXT_MAGIC ||
437 0 : b->magic != UTEXT_MAGIC) {
438 : // Null or invalid arguments don't compare equal to anything.
439 0 : return FALSE;
440 : }
441 :
442 0 : if (a->pFuncs != b->pFuncs) {
443 : // Different types of text providers.
444 0 : return FALSE;
445 : }
446 :
447 0 : if (a->context != b->context) {
448 : // Different sources (different strings)
449 0 : return FALSE;
450 : }
451 0 : if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
452 : // Different current position in the string.
453 0 : return FALSE;
454 : }
455 :
456 0 : return TRUE;
457 : }
458 :
459 : U_CAPI UBool U_EXPORT2
460 0 : utext_isWritable(const UText *ut)
461 : {
462 0 : UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
463 0 : return b;
464 : }
465 :
466 :
467 : U_CAPI void U_EXPORT2
468 0 : utext_freeze(UText *ut) {
469 : // Zero out the WRITABLE flag.
470 0 : ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
471 0 : }
472 :
473 :
474 : U_CAPI UBool U_EXPORT2
475 0 : utext_hasMetaData(const UText *ut)
476 : {
477 0 : UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
478 0 : return b;
479 : }
480 :
481 :
482 :
483 : U_CAPI int32_t U_EXPORT2
484 0 : utext_replace(UText *ut,
485 : int64_t nativeStart, int64_t nativeLimit,
486 : const UChar *replacementText, int32_t replacementLength,
487 : UErrorCode *status)
488 : {
489 0 : if (U_FAILURE(*status)) {
490 0 : return 0;
491 : }
492 0 : if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
493 0 : *status = U_NO_WRITE_PERMISSION;
494 0 : return 0;
495 : }
496 0 : int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
497 0 : return i;
498 : }
499 :
500 : U_CAPI void U_EXPORT2
501 0 : utext_copy(UText *ut,
502 : int64_t nativeStart, int64_t nativeLimit,
503 : int64_t destIndex,
504 : UBool move,
505 : UErrorCode *status)
506 : {
507 0 : if (U_FAILURE(*status)) {
508 0 : return;
509 : }
510 0 : if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
511 0 : *status = U_NO_WRITE_PERMISSION;
512 0 : return;
513 : }
514 0 : ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
515 : }
516 :
517 :
518 :
519 : U_CAPI UText * U_EXPORT2
520 0 : utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
521 0 : if (U_FAILURE(*status)) {
522 0 : return dest;
523 : }
524 0 : UText *result = src->pFuncs->clone(dest, src, deep, status);
525 0 : if (U_FAILURE(*status)) {
526 0 : return result;
527 : }
528 0 : if (result == NULL) {
529 0 : *status = U_MEMORY_ALLOCATION_ERROR;
530 0 : return result;
531 : }
532 0 : if (readOnly) {
533 0 : utext_freeze(result);
534 : }
535 0 : return result;
536 : }
537 :
538 :
539 :
540 : //------------------------------------------------------------------------------
541 : //
542 : // UText common functions implementation
543 : //
544 : //------------------------------------------------------------------------------
545 :
//
//  UText.flags bit masks. Each value is a single bit; they are combined with
//  bitwise OR in UText::flags.
//
enum {
    // Set when ICU allocated the UText struct itself on the heap.
    // Clear when the caller provided the storage for the UText.
    UTEXT_HEAP_ALLOCATED       = 0x1,

    // Set when ICU allocated the extra storage as a separate heap block.
    // Clear when there is no separate allocation: either no extra storage was
    // requested, or it is appended to the end of the main UText storage.
    UTEXT_EXTRA_HEAP_ALLOCATED = 0x2,

    // Set while the UText is open; clear when it is not.
    UTEXT_OPEN                 = 0x4
};
562 :
563 :
//
//  Extended form of a UText: a UText immediately followed by the start of its
//  extra storage. The purpose is to aid in computing the total size required
//  when a provider asks for a UText to be allocated with extra storage.
//  utext_setup() below uses sizeof(ExtendedUText) for that computation and
//  points pExtra at the `extension` member.
//

struct ExtendedUText {
    UText          ut;          // The UText proper; must come first.
    UAlignedMemory extension;   // First element of the extra storage that follows it.
};

// Template that utext_setup() copies into a freshly heap-allocated UText.
static const UText emptyText = UTEXT_INITIALIZER;
574 :
575 : U_CAPI UText * U_EXPORT2
576 0 : utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
577 0 : if (U_FAILURE(*status)) {
578 0 : return ut;
579 : }
580 :
581 0 : if (ut == NULL) {
582 : // We need to heap-allocate storage for the new UText
583 0 : int32_t spaceRequired = sizeof(UText);
584 0 : if (extraSpace > 0) {
585 0 : spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
586 : }
587 0 : ut = (UText *)uprv_malloc(spaceRequired);
588 0 : if (ut == NULL) {
589 0 : *status = U_MEMORY_ALLOCATION_ERROR;
590 0 : return NULL;
591 : } else {
592 0 : *ut = emptyText;
593 0 : ut->flags |= UTEXT_HEAP_ALLOCATED;
594 0 : if (spaceRequired>0) {
595 0 : ut->extraSize = extraSpace;
596 0 : ut->pExtra = &((ExtendedUText *)ut)->extension;
597 : }
598 : }
599 : } else {
600 : // We have been supplied with an already existing UText.
601 : // Verify that it really appears to be a UText.
602 0 : if (ut->magic != UTEXT_MAGIC) {
603 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
604 0 : return ut;
605 : }
606 : // If the ut is already open and there's a provider supplied close
607 : // function, call it.
608 0 : if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL) {
609 0 : ut->pFuncs->close(ut);
610 : }
611 0 : ut->flags &= ~UTEXT_OPEN;
612 :
613 : // If extra space was requested by our caller, check whether
614 : // sufficient already exists, and allocate new if needed.
615 0 : if (extraSpace > ut->extraSize) {
616 : // Need more space. If there is existing separately allocated space,
617 : // delete it first, then allocate new space.
618 0 : if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
619 0 : uprv_free(ut->pExtra);
620 0 : ut->extraSize = 0;
621 : }
622 0 : ut->pExtra = uprv_malloc(extraSpace);
623 0 : if (ut->pExtra == NULL) {
624 0 : *status = U_MEMORY_ALLOCATION_ERROR;
625 : } else {
626 0 : ut->extraSize = extraSpace;
627 0 : ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
628 : }
629 : }
630 : }
631 0 : if (U_SUCCESS(*status)) {
632 0 : ut->flags |= UTEXT_OPEN;
633 :
634 : // Initialize all remaining fields of the UText.
635 : //
636 0 : ut->context = NULL;
637 0 : ut->chunkContents = NULL;
638 0 : ut->p = NULL;
639 0 : ut->q = NULL;
640 0 : ut->r = NULL;
641 0 : ut->a = 0;
642 0 : ut->b = 0;
643 0 : ut->c = 0;
644 0 : ut->chunkOffset = 0;
645 0 : ut->chunkLength = 0;
646 0 : ut->chunkNativeStart = 0;
647 0 : ut->chunkNativeLimit = 0;
648 0 : ut->nativeIndexingLimit = 0;
649 0 : ut->providerProperties = 0;
650 0 : ut->privA = 0;
651 0 : ut->privB = 0;
652 0 : ut->privC = 0;
653 0 : ut->privP = NULL;
654 0 : if (ut->pExtra!=NULL && ut->extraSize>0)
655 0 : uprv_memset(ut->pExtra, 0, ut->extraSize);
656 :
657 : }
658 0 : return ut;
659 : }
660 :
661 :
662 : U_CAPI UText * U_EXPORT2
663 0 : utext_close(UText *ut) {
664 0 : if (ut==NULL ||
665 0 : ut->magic != UTEXT_MAGIC ||
666 0 : (ut->flags & UTEXT_OPEN) == 0)
667 : {
668 : // The supplied ut is not an open UText.
669 : // Do nothing.
670 0 : return ut;
671 : }
672 :
673 : // If the provider gave us a close function, call it now.
674 : // This will clean up anything allocated specifically by the provider.
675 0 : if (ut->pFuncs->close != NULL) {
676 0 : ut->pFuncs->close(ut);
677 : }
678 0 : ut->flags &= ~UTEXT_OPEN;
679 :
680 : // If we (the framework) allocated the UText or subsidiary storage,
681 : // delete it.
682 0 : if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
683 0 : uprv_free(ut->pExtra);
684 0 : ut->pExtra = NULL;
685 0 : ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
686 0 : ut->extraSize = 0;
687 : }
688 :
689 : // Zero out function table of the closed UText. This is a defensive move,
690 : // inteded to cause applications that inadvertantly use a closed
691 : // utext to crash with null pointer errors.
692 0 : ut->pFuncs = NULL;
693 :
694 0 : if (ut->flags & UTEXT_HEAP_ALLOCATED) {
695 : // This UText was allocated by UText setup. We need to free it.
696 : // Clear magic, so we can detect if the user messes up and immediately
697 : // tries to reopen another UText using the deleted storage.
698 0 : ut->magic = 0;
699 0 : uprv_free(ut);
700 0 : ut = NULL;
701 : }
702 0 : return ut;
703 : }
704 :
705 :
706 :
707 :
708 : //
709 : // invalidateChunk Reset a chunk to have no contents, so that the next call
710 : // to access will cause new data to load.
711 : // This is needed when copy/move/replace operate directly on the
712 : // backing text, potentially putting it out of sync with the
713 : // contents in the chunk.
714 : //
715 : static void
716 0 : invalidateChunk(UText *ut) {
717 0 : ut->chunkLength = 0;
718 0 : ut->chunkNativeLimit = 0;
719 0 : ut->chunkNativeStart = 0;
720 0 : ut->chunkOffset = 0;
721 0 : ut->nativeIndexingLimit = 0;
722 0 : }
723 :
//
// pinIndex   Pin a native index parameter into the range [0, limit].
//            The 64 bit value is pinned in place through the reference.
//            The pinned value truncated to 32 bits is returned as a convenience
//            for providers that don't need 64 bits.
//            A negative index always becomes 0, whatever the limit is.
//
static int32_t
pinIndex(int64_t &index, int64_t limit) {
    const int64_t pinned = (index < 0) ? 0
                         : (index > limit) ? limit
                         : index;
    index = pinned;
    return (int32_t)pinned;
}
738 :
739 :
740 : U_CDECL_BEGIN
741 :
742 : //
743 : // Pointer relocation function,
744 : // a utility used by shallow clone.
745 : // Adjust a pointer that refers to something within one UText (the source)
746 : // to refer to the same relative offset within a another UText (the target)
747 : //
748 0 : static void adjustPointer(UText *dest, const void **destPtr, const UText *src) {
749 : // convert all pointers to (char *) so that byte address arithmetic will work.
750 0 : char *dptr = (char *)*destPtr;
751 0 : char *dUText = (char *)dest;
752 0 : char *sUText = (char *)src;
753 :
754 0 : if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) {
755 : // target ptr was to something within the src UText's pExtra storage.
756 : // relocate it into the target UText's pExtra region.
757 0 : *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra);
758 0 : } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
759 : // target ptr was pointing to somewhere within the source UText itself.
760 : // Move it to the same offset within the target UText.
761 0 : *destPtr = dUText + (dptr-sUText);
762 : }
763 0 : }
764 :
765 :
766 : //
767 : // Clone. This is a generic copy-the-utext-by-value clone function that can be
768 : // used as-is with some utext types, and as a helper by other clones.
769 : //
770 : static UText * U_CALLCONV
771 0 : shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
772 0 : if (U_FAILURE(*status)) {
773 0 : return NULL;
774 : }
775 0 : int32_t srcExtraSize = src->extraSize;
776 :
777 : //
778 : // Use the generic text_setup to allocate storage if required.
779 : //
780 0 : dest = utext_setup(dest, srcExtraSize, status);
781 0 : if (U_FAILURE(*status)) {
782 0 : return dest;
783 : }
784 :
785 : //
786 : // flags (how the UText was allocated) and the pointer to the
787 : // extra storage must retain the values in the cloned utext that
788 : // were set up by utext_setup. Save them separately before
789 : // copying the whole struct.
790 : //
791 0 : void *destExtra = dest->pExtra;
792 0 : int32_t flags = dest->flags;
793 :
794 :
795 : //
796 : // Copy the whole UText struct by value.
797 : // Any "Extra" storage is copied also.
798 : //
799 0 : int sizeToCopy = src->sizeOfStruct;
800 0 : if (sizeToCopy > dest->sizeOfStruct) {
801 0 : sizeToCopy = dest->sizeOfStruct;
802 : }
803 0 : uprv_memcpy(dest, src, sizeToCopy);
804 0 : dest->pExtra = destExtra;
805 0 : dest->flags = flags;
806 0 : if (srcExtraSize > 0) {
807 0 : uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
808 : }
809 :
810 : //
811 : // Relocate any pointers in the target that refer to the UText itself
812 : // to point to the cloned copy rather than the original source.
813 : //
814 0 : adjustPointer(dest, &dest->context, src);
815 0 : adjustPointer(dest, &dest->p, src);
816 0 : adjustPointer(dest, &dest->q, src);
817 0 : adjustPointer(dest, &dest->r, src);
818 0 : adjustPointer(dest, (const void **)&dest->chunkContents, src);
819 :
820 : // The newly shallow-cloned UText does _not_ own the underlying storage for the text.
821 : // (The source for the clone may or may not have owned the text.)
822 :
823 0 : dest->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
824 :
825 0 : return dest;
826 : }
827 :
828 :
829 : U_CDECL_END
830 :
831 :
832 :
833 : //------------------------------------------------------------------------------
834 : //
835 : // UText implementation for UTF-8 char * strings (read-only)
836 : // Limitation: string length must be <= 0x7fffffff in length.
837 : // (length must fit in an int32_t variable)
838 : //
839 : // Use of UText data members:
840 : // context pointer to UTF-8 string
841 : // utext.b is the input string length (bytes).
842 : // utext.c Length scanned so far in string
843 : // (for optimizing finding length of zero terminated strings.)
844 : // utext.p pointer to the current buffer
845 : // utext.q pointer to the other buffer.
846 : //
847 : //------------------------------------------------------------------------------
848 :
// Chunk size: the nominal number of UChars held in one UTF8Buf chunk.
//    Must be less than 42 (256/6), because the maps from UChar indexes to
//    native indexes store their entries in single bytes (uint8_t).
//    Worst case there are six UTF-8 bytes per UChar:
//          obsolete 6 byte form  fd + 5 trails maps to fffd
//          obsolete 5 byte form  fc + 4 trails maps to fffd
//          non-shortest 4 byte forms map to fffd
//          normal supplementaries map to a pair of UTF-16 units,
//              two UTF-8 bytes per UTF-16 unit
//    The mapToUChars array size must allow for the worst case, 6.
//    This could be brought down to 4 by treating fd and fc as purely illegal
//    rather than as obsolete lead bytes, but that is not compatible with the
//    UTF-8 access macros.
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };
861 :
//
// UTF8Buf  Two of these structs will be set up in the UText's extra allocated space.
//          Each contains the UChar chunk buffer, the to and from native maps, and
//          header info.
//
//     Because backwards iteration fills the buffers starting at the end and
//     working towards the front, the filled part of the buffers may not begin
//     at the start of the available storage for the buffers.
//
//     The buffer is bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
//     the last character added being a supplementary, and thus requiring a
//     surrogate pair. Doing this is simpler than checking for the edge case.
//
//     NOTE(review): the comments here and below speak of one or two extra
//     slots, while the arrays are declared with UTF8_TEXT_CHUNK_SIZE+4.
//     Presumably the comments predate a size increase; confirm before relying
//     on the exact slack.
//

struct UTF8Buf {
    int32_t   bufNativeStart;                        // Native index of first char in UChar buf
    int32_t   bufNativeLimit;                        // Native index following last char in buf.
    int32_t   bufStartIdx;                           // First filled position in buf.
    int32_t   bufLimitIdx;                           // Limit of filled range in buf.
    int32_t   bufNILimit;                            // Limit of native indexing part of buf
    int32_t   toUCharsMapStart;                      // Native index corresponding to
                                                     //   mapToUChars[0].
                                                     //   Set to bufNativeStart when filling forwards.
                                                     //   Set to computed value when filling backwards.

    UChar     buf[UTF8_TEXT_CHUNK_SIZE+4];           // The UChar buffer.  Requires extra positions beyond
                                                     //   the chunk size, to allow for a surrogate at the end.
                                                     //   Length must be identical to the mapToNative array,
                                                     //   below, because of the way indexing works when the
                                                     //   array is filled backwards during a reverse
                                                     //   iteration.  Thus, the additional extra size.
    uint8_t   mapToNative[UTF8_TEXT_CHUNK_SIZE+4];   // Map UChar index in buf to
                                                     //   native offset from bufNativeStart.
                                                     //   Requires extra slots:
                                                     //   one for a supplementary starting in the last normal position,
                                                     //   and one for an entry for the buffer limit position.
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to
                                                     //   corresponding offset in filled part of buf.
    int32_t   align;
};
902 :
903 : U_CDECL_BEGIN
904 :
905 : //
906 : // utf8TextLength
907 : //
908 : // Get the length of the string. If we don't already know it,
909 : // we'll need to scan for the trailing nul.
910 : //
911 : static int64_t U_CALLCONV
912 0 : utf8TextLength(UText *ut) {
913 0 : if (ut->b < 0) {
914 : // Zero terminated string, and we haven't scanned to the end yet.
915 : // Scan it now.
916 0 : const char *r = (const char *)ut->context + ut->c;
917 0 : while (*r != 0) {
918 0 : r++;
919 : }
920 0 : if ((r - (const char *)ut->context) < 0x7fffffff) {
921 0 : ut->b = (int32_t)(r - (const char *)ut->context);
922 : } else {
923 : // Actual string was bigger (more than 2 gig) than we
924 : // can handle. Clip it to 2 GB.
925 0 : ut->b = 0x7fffffff;
926 : }
927 0 : ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
928 : }
929 0 : return ut->b;
930 : }
931 :
932 :
933 :
934 :
935 :
936 :
937 : static UBool U_CALLCONV
938 0 : utf8TextAccess(UText *ut, int64_t index, UBool forward) {
939 : //
940 : // Apologies to those who are allergic to goto statements.
941 : // Consider each goto to a labelled block to be the equivalent of
942 : // call the named block as if it were a function();
943 : // return;
944 : //
945 0 : const uint8_t *s8=(const uint8_t *)ut->context;
946 0 : UTF8Buf *u8b = NULL;
947 0 : int32_t length = ut->b; // Length of original utf-8
948 0 : int32_t ix= (int32_t)index; // Requested index, trimmed to 32 bits.
949 0 : int32_t mapIndex = 0;
950 0 : if (index<0) {
951 0 : ix=0;
952 0 : } else if (index > 0x7fffffff) {
953 : // Strings with 64 bit lengths not supported by this UTF-8 provider.
954 0 : ix = 0x7fffffff;
955 : }
956 :
957 : // Pin requested index to the string length.
958 0 : if (ix>length) {
959 0 : if (length>=0) {
960 0 : ix=length;
961 0 : } else if (ix>=ut->c) {
962 : // Zero terminated string, and requested index is beyond
963 : // the region that has already been scanned.
964 : // Scan up to either the end of the string or to the
965 : // requested position, whichever comes first.
966 0 : while (ut->c<ix && s8[ut->c]!=0) {
967 0 : ut->c++;
968 : }
969 : // TODO: support for null terminated string length > 32 bits.
970 0 : if (s8[ut->c] == 0) {
971 : // We just found the actual length of the string.
972 : // Trim the requested index back to that.
973 0 : ix = ut->c;
974 0 : ut->b = ut->c;
975 0 : length = ut->c;
976 0 : ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
977 : }
978 : }
979 : }
980 :
981 : //
982 : // Dispatch to the appropriate action for a forward iteration request.
983 : //
984 0 : if (forward) {
985 0 : if (ix==ut->chunkNativeLimit) {
986 : // Check for normal sequential iteration cases first.
987 0 : if (ix==length) {
988 : // Just reached end of string
989 : // Don't swap buffers, but do set the
990 : // current buffer position.
991 0 : ut->chunkOffset = ut->chunkLength;
992 0 : return FALSE;
993 : } else {
994 : // End of current buffer.
995 : // check whether other buffer already has what we need.
996 0 : UTF8Buf *altB = (UTF8Buf *)ut->q;
997 0 : if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
998 0 : goto swapBuffers;
999 : }
1000 : }
1001 : }
1002 :
1003 : // A random access. Desired index could be in either or niether buf.
1004 : // For optimizing the order of testing, first check for the index
1005 : // being in the other buffer. This will be the case for uses that
1006 : // move back and forth over a fairly limited range
1007 : {
1008 0 : u8b = (UTF8Buf *)ut->q; // the alternate buffer
1009 0 : if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
1010 : // Requested index is in the other buffer.
1011 0 : goto swapBuffers;
1012 : }
1013 0 : if (ix == length) {
1014 : // Requested index is end-of-string.
1015 : // (this is the case of randomly seeking to the end.
1016 : // The case of iterating off the end is handled earlier.)
1017 0 : if (ix == ut->chunkNativeLimit) {
1018 : // Current buffer extends up to the end of the string.
1019 : // Leave it as the current buffer.
1020 0 : ut->chunkOffset = ut->chunkLength;
1021 0 : return FALSE;
1022 : }
1023 0 : if (ix == u8b->bufNativeLimit) {
1024 : // Alternate buffer extends to the end of string.
1025 : // Swap it in as the current buffer.
1026 0 : goto swapBuffersAndFail;
1027 : }
1028 :
1029 : // Neither existing buffer extends to the end of the string.
1030 0 : goto makeStubBuffer;
1031 : }
1032 :
1033 0 : if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
1034 : // Requested index is in neither buffer.
1035 : goto fillForward;
1036 : }
1037 :
1038 : // Requested index is in this buffer.
1039 0 : u8b = (UTF8Buf *)ut->p; // the current buffer
1040 0 : mapIndex = ix - u8b->toUCharsMapStart;
1041 0 : U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
1042 0 : ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1043 0 : return TRUE;
1044 :
1045 : }
1046 : }
1047 :
1048 :
1049 : //
1050 : // Dispatch to the appropriate action for a
1051 : // Backwards Diretion iteration request.
1052 : //
1053 0 : if (ix==ut->chunkNativeStart) {
1054 : // Check for normal sequential iteration cases first.
1055 0 : if (ix==0) {
1056 : // Just reached the start of string
1057 : // Don't swap buffers, but do set the
1058 : // current buffer position.
1059 0 : ut->chunkOffset = 0;
1060 0 : return FALSE;
1061 : } else {
1062 : // Start of current buffer.
1063 : // check whether other buffer already has what we need.
1064 0 : UTF8Buf *altB = (UTF8Buf *)ut->q;
1065 0 : if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
1066 0 : goto swapBuffers;
1067 : }
1068 : }
1069 : }
1070 :
1071 : // A random access. Desired index could be in either or niether buf.
1072 : // For optimizing the order of testing,
1073 : // Most likely case: in the other buffer.
1074 : // Second most likely: in neither buffer.
1075 : // Unlikely, but must work: in the current buffer.
1076 0 : u8b = (UTF8Buf *)ut->q; // the alternate buffer
1077 0 : if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
1078 : // Requested index is in the other buffer.
1079 0 : goto swapBuffers;
1080 : }
1081 : // Requested index is start-of-string.
1082 : // (this is the case of randomly seeking to the start.
1083 : // The case of iterating off the start is handled earlier.)
1084 0 : if (ix==0) {
1085 0 : if (u8b->bufNativeStart==0) {
1086 : // Alternate buffer contains the data for the start string.
1087 : // Make it be the current buffer.
1088 0 : goto swapBuffersAndFail;
1089 : } else {
1090 : // Request for data before the start of string,
1091 : // neither buffer is usable.
1092 : // set up a zero-length buffer.
1093 0 : goto makeStubBuffer;
1094 : }
1095 : }
1096 :
1097 0 : if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
1098 : // Requested index is in neither buffer.
1099 : goto fillReverse;
1100 : }
1101 :
1102 : // Requested index is in this buffer.
1103 : // Set the utf16 buffer index.
1104 0 : u8b = (UTF8Buf *)ut->p;
1105 0 : mapIndex = ix - u8b->toUCharsMapStart;
1106 0 : ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1107 0 : if (ut->chunkOffset==0) {
1108 : // This occurs when the first character in the text is
1109 : // a multi-byte UTF-8 char, and the requested index is to
1110 : // one of the trailing bytes. Because there is no preceding ,
1111 : // character, this access fails. We can't pick up on the
1112 : // situation sooner because the requested index is not zero.
1113 0 : return FALSE;
1114 : } else {
1115 0 : return TRUE;
1116 : }
1117 :
1118 :
1119 :
1120 : swapBuffers:
1121 : // The alternate buffer (ut->q) has the string data that was requested.
1122 : // Swap the primary and alternate buffers, and set the
1123 : // chunk index into the new primary buffer.
1124 : {
1125 0 : u8b = (UTF8Buf *)ut->q;
1126 0 : ut->q = ut->p;
1127 0 : ut->p = u8b;
1128 0 : ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
1129 0 : ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1130 0 : ut->chunkNativeStart = u8b->bufNativeStart;
1131 0 : ut->chunkNativeLimit = u8b->bufNativeLimit;
1132 0 : ut->nativeIndexingLimit = u8b->bufNILimit;
1133 :
1134 : // Index into the (now current) chunk
1135 : // Use the map to set the chunk index. It's more trouble than it's worth
1136 : // to check whether native indexing can be used.
1137 0 : U_ASSERT(ix>=u8b->bufNativeStart);
1138 0 : U_ASSERT(ix<=u8b->bufNativeLimit);
1139 0 : mapIndex = ix - u8b->toUCharsMapStart;
1140 0 : U_ASSERT(mapIndex>=0);
1141 0 : U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
1142 0 : ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1143 :
1144 0 : return TRUE;
1145 : }
1146 :
1147 :
1148 : swapBuffersAndFail:
1149 : // We got a request for either the start or end of the string,
1150 : // with iteration continuing in the out-of-bounds direction.
1151 : // The alternate buffer already contains the data up to the
1152 : // start/end.
1153 : // Swap the buffers, then return failure, indicating that we couldn't
1154 : // make things correct for continuing the iteration in the requested
1155 : // direction. The position & buffer are correct should the
1156 : // user decide to iterate in the opposite direction.
1157 0 : u8b = (UTF8Buf *)ut->q;
1158 0 : ut->q = ut->p;
1159 0 : ut->p = u8b;
1160 0 : ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
1161 0 : ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1162 0 : ut->chunkNativeStart = u8b->bufNativeStart;
1163 0 : ut->chunkNativeLimit = u8b->bufNativeLimit;
1164 0 : ut->nativeIndexingLimit = u8b->bufNILimit;
1165 :
1166 : // Index into the (now current) chunk
1167 : // For this function (swapBuffersAndFail), the requested index
1168 : // will always be at either the start or end of the chunk.
1169 0 : if (ix==u8b->bufNativeLimit) {
1170 0 : ut->chunkOffset = ut->chunkLength;
1171 : } else {
1172 0 : ut->chunkOffset = 0;
1173 0 : U_ASSERT(ix == u8b->bufNativeStart);
1174 : }
1175 0 : return FALSE;
1176 :
1177 : makeStubBuffer:
1178 : // The user has done a seek/access past the start or end
1179 : // of the string. Rather than loading data that is likely
1180 : // to never be used, just set up a zero-length buffer at
1181 : // the position.
1182 0 : u8b = (UTF8Buf *)ut->q;
1183 0 : u8b->bufNativeStart = ix;
1184 0 : u8b->bufNativeLimit = ix;
1185 0 : u8b->bufStartIdx = 0;
1186 0 : u8b->bufLimitIdx = 0;
1187 0 : u8b->bufNILimit = 0;
1188 0 : u8b->toUCharsMapStart = ix;
1189 0 : u8b->mapToNative[0] = 0;
1190 0 : u8b->mapToUChars[0] = 0;
1191 0 : goto swapBuffersAndFail;
1192 :
1193 :
1194 :
1195 : fillForward:
1196 : {
1197 : // Move the incoming index to a code point boundary.
1198 0 : U8_SET_CP_START(s8, 0, ix);
1199 :
1200 : // Swap the UText buffers.
1201 : // We want to fill what was previously the alternate buffer,
1202 : // and make what was the current buffer be the new alternate.
1203 0 : UTF8Buf *u8b = (UTF8Buf *)ut->q;
1204 0 : ut->q = ut->p;
1205 0 : ut->p = u8b;
1206 :
1207 0 : int32_t strLen = ut->b;
1208 0 : UBool nulTerminated = FALSE;
1209 0 : if (strLen < 0) {
1210 0 : strLen = 0x7fffffff;
1211 0 : nulTerminated = TRUE;
1212 : }
1213 :
1214 0 : UChar *buf = u8b->buf;
1215 0 : uint8_t *mapToNative = u8b->mapToNative;
1216 0 : uint8_t *mapToUChars = u8b->mapToUChars;
1217 0 : int32_t destIx = 0;
1218 0 : int32_t srcIx = ix;
1219 0 : UBool seenNonAscii = FALSE;
1220 0 : UChar32 c = 0;
1221 :
1222 : // Fill the chunk buffer and mapping arrays.
1223 0 : while (destIx<UTF8_TEXT_CHUNK_SIZE) {
1224 0 : c = s8[srcIx];
1225 0 : if (c>0 && c<0x80) {
1226 : // Special case ASCII range for speed.
1227 : // zero is excluded to simplify bounds checking.
1228 0 : buf[destIx] = (UChar)c;
1229 0 : mapToNative[destIx] = (uint8_t)(srcIx - ix);
1230 0 : mapToUChars[srcIx-ix] = (uint8_t)destIx;
1231 0 : srcIx++;
1232 0 : destIx++;
1233 : } else {
1234 : // General case, handle everything.
1235 0 : if (seenNonAscii == FALSE) {
1236 0 : seenNonAscii = TRUE;
1237 0 : u8b->bufNILimit = destIx;
1238 : }
1239 :
1240 0 : int32_t cIx = srcIx;
1241 0 : int32_t dIx = destIx;
1242 0 : int32_t dIxSaved = destIx;
1243 0 : U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
1244 0 : if (c==0 && nulTerminated) {
1245 0 : srcIx--;
1246 0 : break;
1247 : }
1248 :
1249 0 : U16_APPEND_UNSAFE(buf, destIx, c);
1250 0 : do {
1251 0 : mapToNative[dIx++] = (uint8_t)(cIx - ix);
1252 0 : } while (dIx < destIx);
1253 :
1254 0 : do {
1255 0 : mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;
1256 0 : } while (cIx < srcIx);
1257 : }
1258 0 : if (srcIx>=strLen) {
1259 0 : break;
1260 : }
1261 :
1262 : }
1263 :
1264 : // store Native <--> Chunk Map entries for the end of the buffer.
1265 : // There is no actual character here, but the index position is valid.
1266 0 : mapToNative[destIx] = (uint8_t)(srcIx - ix);
1267 0 : mapToUChars[srcIx - ix] = (uint8_t)destIx;
1268 :
1269 : // fill in Buffer descriptor
1270 0 : u8b->bufNativeStart = ix;
1271 0 : u8b->bufNativeLimit = srcIx;
1272 0 : u8b->bufStartIdx = 0;
1273 0 : u8b->bufLimitIdx = destIx;
1274 0 : if (seenNonAscii == FALSE) {
1275 0 : u8b->bufNILimit = destIx;
1276 : }
1277 0 : u8b->toUCharsMapStart = u8b->bufNativeStart;
1278 :
1279 : // Set UText chunk to refer to this buffer.
1280 0 : ut->chunkContents = buf;
1281 0 : ut->chunkOffset = 0;
1282 0 : ut->chunkLength = u8b->bufLimitIdx;
1283 0 : ut->chunkNativeStart = u8b->bufNativeStart;
1284 0 : ut->chunkNativeLimit = u8b->bufNativeLimit;
1285 0 : ut->nativeIndexingLimit = u8b->bufNILimit;
1286 :
1287 : // For zero terminated strings, keep track of the maximum point
1288 : // scanned so far.
1289 0 : if (nulTerminated && srcIx>ut->c) {
1290 0 : ut->c = srcIx;
1291 0 : if (c==0) {
1292 : // We scanned to the end.
1293 : // Remember the actual length.
1294 0 : ut->b = srcIx;
1295 0 : ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1296 : }
1297 : }
1298 0 : return TRUE;
1299 : }
1300 :
1301 :
1302 : fillReverse:
1303 : {
1304 : // Move the incoming index to a code point boundary.
1305 : // Can only do this if the incoming index is somewhere in the interior of the string.
1306 : // If index is at the end, there is no character there to look at.
1307 0 : if (ix != ut->b) {
1308 : // Note: this function will only move the index back if it is on a trail byte
1309 : // and there is a preceding lead byte and the sequence from the lead
1310 : // through this trail could be part of a valid UTF-8 sequence
1311 : // Otherwise the index remains unchanged.
1312 0 : U8_SET_CP_START(s8, 0, ix);
1313 : }
1314 :
1315 : // Swap the UText buffers.
1316 : // We want to fill what was previously the alternate buffer,
1317 : // and make what was the current buffer be the new alternate.
1318 0 : UTF8Buf *u8b = (UTF8Buf *)ut->q;
1319 0 : ut->q = ut->p;
1320 0 : ut->p = u8b;
1321 :
1322 0 : UChar *buf = u8b->buf;
1323 0 : uint8_t *mapToNative = u8b->mapToNative;
1324 0 : uint8_t *mapToUChars = u8b->mapToUChars;
1325 0 : int32_t toUCharsMapStart = ix - sizeof(UTF8Buf::mapToUChars) + 1;
1326 : // Note that toUCharsMapStart can be negative. Happens when the remaining
1327 : // text from current position to the beginning is less than the buffer size.
1328 : // + 1 because mapToUChars must have a slot at the end for the bufNativeLimit entry.
1329 0 : int32_t destIx = UTF8_TEXT_CHUNK_SIZE+2; // Start in the overflow region
1330 : // at end of buffer to leave room
1331 : // for a surrogate pair at the
1332 : // buffer start.
1333 0 : int32_t srcIx = ix;
1334 0 : int32_t bufNILimit = destIx;
1335 : UChar32 c;
1336 :
1337 : // Map to/from Native Indexes, fill in for the position at the end of
1338 : // the buffer.
1339 : //
1340 0 : mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1341 0 : mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
1342 :
1343 : // Fill the chunk buffer
1344 : // Work backwards, filling from the end of the buffer towards the front.
1345 : //
1346 0 : while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
1347 0 : srcIx--;
1348 0 : destIx--;
1349 :
1350 : // Get last byte of the UTF-8 character
1351 0 : c = s8[srcIx];
1352 0 : if (c<0x80) {
1353 : // Special case ASCII range for speed.
1354 0 : buf[destIx] = (UChar)c;
1355 0 : U_ASSERT(toUCharsMapStart <= srcIx);
1356 0 : mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
1357 0 : mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1358 : } else {
1359 : // General case, handle everything non-ASCII.
1360 :
1361 0 : int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char
1362 :
1363 : // Get the full character from the UTF8 string.
1364 : // use code derived from tbe macros in utf8.h
1365 : // Leaves srcIx pointing at the first byte of the UTF-8 char.
1366 : //
1367 0 : c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3);
1368 : // leaves srcIx at first byte of the multi-byte char.
1369 :
1370 : // Store the character in UTF-16 buffer.
1371 0 : if (c<0x10000) {
1372 0 : buf[destIx] = (UChar)c;
1373 0 : mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1374 : } else {
1375 0 : buf[destIx] = U16_TRAIL(c);
1376 0 : mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1377 0 : buf[--destIx] = U16_LEAD(c);
1378 0 : mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1379 : }
1380 :
1381 : // Fill in the map from native indexes to UChars buf index.
1382 0 : do {
1383 0 : mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
1384 0 : } while (sIx >= srcIx);
1385 0 : U_ASSERT(toUCharsMapStart <= (srcIx+1));
1386 :
1387 : // Set native indexing limit to be the current position.
1388 : // We are processing a non-ascii, non-native-indexing char now;
1389 : // the limit will be here if the rest of the chars to be
1390 : // added to this buffer are ascii.
1391 0 : bufNILimit = destIx;
1392 : }
1393 : }
1394 0 : u8b->bufNativeStart = srcIx;
1395 0 : u8b->bufNativeLimit = ix;
1396 0 : u8b->bufStartIdx = destIx;
1397 0 : u8b->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2;
1398 0 : u8b->bufNILimit = bufNILimit - u8b->bufStartIdx;
1399 0 : u8b->toUCharsMapStart = toUCharsMapStart;
1400 :
1401 0 : ut->chunkContents = &buf[u8b->bufStartIdx];
1402 0 : ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1403 0 : ut->chunkOffset = ut->chunkLength;
1404 0 : ut->chunkNativeStart = u8b->bufNativeStart;
1405 0 : ut->chunkNativeLimit = u8b->bufNativeLimit;
1406 0 : ut->nativeIndexingLimit = u8b->bufNILimit;
1407 0 : return TRUE;
1408 : }
1409 :
1410 : }
1411 :
1412 :
1413 :
1414 : //
1415 : // This is a slightly modified copy of u_strFromUTF8,
1416 : // Inserts a Replacement Char rather than failing on invalid UTF-8
1417 : // Removes unnecessary features.
1418 : //
1419 : static UChar*
1420 0 : utext_strFromUTF8(UChar *dest,
1421 : int32_t destCapacity,
1422 : int32_t *pDestLength,
1423 : const char* src,
1424 : int32_t srcLength, // required. NUL terminated not supported.
1425 : UErrorCode *pErrorCode
1426 : )
1427 : {
1428 :
1429 0 : UChar *pDest = dest;
1430 0 : UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
1431 0 : UChar32 ch=0;
1432 0 : int32_t index = 0;
1433 0 : int32_t reqLength = 0;
1434 0 : uint8_t* pSrc = (uint8_t*) src;
1435 :
1436 :
1437 0 : while((index < srcLength)&&(pDest<pDestLimit)){
1438 0 : ch = pSrc[index++];
1439 0 : if(ch <=0x7f){
1440 0 : *pDest++=(UChar)ch;
1441 : }else{
1442 0 : ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
1443 0 : if(U_IS_BMP(ch)){
1444 0 : *(pDest++)=(UChar)ch;
1445 : }else{
1446 0 : *(pDest++)=U16_LEAD(ch);
1447 0 : if(pDest<pDestLimit){
1448 0 : *(pDest++)=U16_TRAIL(ch);
1449 : }else{
1450 0 : reqLength++;
1451 0 : break;
1452 : }
1453 : }
1454 : }
1455 : }
1456 : /* donot fill the dest buffer just count the UChars needed */
1457 0 : while(index < srcLength){
1458 0 : ch = pSrc[index++];
1459 0 : if(ch <= 0x7f){
1460 0 : reqLength++;
1461 : }else{
1462 0 : ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
1463 0 : reqLength+=U16_LENGTH(ch);
1464 : }
1465 : }
1466 :
1467 0 : reqLength+=(int32_t)(pDest - dest);
1468 :
1469 0 : if(pDestLength){
1470 0 : *pDestLength = reqLength;
1471 : }
1472 :
1473 : /* Terminate the buffer */
1474 0 : u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
1475 :
1476 0 : return dest;
1477 : }
1478 :
1479 :
1480 :
1481 : static int32_t U_CALLCONV
1482 0 : utf8TextExtract(UText *ut,
1483 : int64_t start, int64_t limit,
1484 : UChar *dest, int32_t destCapacity,
1485 : UErrorCode *pErrorCode) {
1486 0 : if(U_FAILURE(*pErrorCode)) {
1487 0 : return 0;
1488 : }
1489 0 : if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1490 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1491 0 : return 0;
1492 : }
1493 0 : int32_t length = ut->b;
1494 0 : int32_t start32 = pinIndex(start, length);
1495 0 : int32_t limit32 = pinIndex(limit, length);
1496 :
1497 0 : if(start32>limit32) {
1498 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1499 0 : return 0;
1500 : }
1501 :
1502 :
1503 : // adjust the incoming indexes to land on code point boundaries if needed.
1504 : // adjust by no more than three, because that is the largest number of trail bytes
1505 : // in a well formed UTF8 character.
1506 0 : const uint8_t *buf = (const uint8_t *)ut->context;
1507 : int i;
1508 0 : if (start32 < ut->chunkNativeLimit) {
1509 0 : for (i=0; i<3; i++) {
1510 0 : if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
1511 : break;
1512 : }
1513 0 : start32--;
1514 : }
1515 : }
1516 :
1517 0 : if (limit32 < ut->chunkNativeLimit) {
1518 0 : for (i=0; i<3; i++) {
1519 0 : if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
1520 : break;
1521 : }
1522 0 : limit32--;
1523 : }
1524 : }
1525 :
1526 : // Do the actual extract.
1527 0 : int32_t destLength=0;
1528 0 : utext_strFromUTF8(dest, destCapacity, &destLength,
1529 0 : (const char *)ut->context+start32, limit32-start32,
1530 0 : pErrorCode);
1531 0 : utf8TextAccess(ut, limit32, TRUE);
1532 0 : return destLength;
1533 : }
1534 :
1535 : //
1536 : // utf8TextMapOffsetToNative
1537 : //
1538 : // Map a chunk (UTF-16) offset to a native index.
1539 : static int64_t U_CALLCONV
1540 0 : utf8TextMapOffsetToNative(const UText *ut) {
1541 : //
1542 0 : UTF8Buf *u8b = (UTF8Buf *)ut->p;
1543 0 : U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength);
1544 0 : int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart;
1545 0 : U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit);
1546 0 : return nativeOffset;
1547 : }
1548 :
1549 : //
1550 : // Map a native index to the corrsponding chunk offset
1551 : //
1552 : static int32_t U_CALLCONV
1553 0 : utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
1554 0 : U_ASSERT(index64 <= 0x7fffffff);
1555 0 : int32_t index = (int32_t)index64;
1556 0 : UTF8Buf *u8b = (UTF8Buf *)ut->p;
1557 0 : U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
1558 0 : U_ASSERT(index<=ut->chunkNativeLimit);
1559 0 : int32_t mapIndex = index - u8b->toUCharsMapStart;
1560 0 : U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
1561 0 : int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1562 0 : U_ASSERT(offset>=0 && offset<=ut->chunkLength);
1563 0 : return offset;
1564 : }
1565 :
1566 : static UText * U_CALLCONV
1567 0 : utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
1568 : {
1569 : // First do a generic shallow clone. Does everything needed for the UText struct itself.
1570 0 : dest = shallowTextClone(dest, src, status);
1571 :
1572 : // For deep clones, make a copy of the string.
1573 : // The copied storage is owned by the newly created clone.
1574 : //
1575 : // TODO: There is an isssue with using utext_nativeLength().
1576 : // That function is non-const in cases where the input was NUL terminated
1577 : // and the length has not yet been determined.
1578 : // This function (clone()) is const.
1579 : // There potentially a thread safety issue lurking here.
1580 : //
1581 0 : if (deep && U_SUCCESS(*status)) {
1582 0 : int32_t len = (int32_t)utext_nativeLength((UText *)src);
1583 0 : char *copyStr = (char *)uprv_malloc(len+1);
1584 0 : if (copyStr == NULL) {
1585 0 : *status = U_MEMORY_ALLOCATION_ERROR;
1586 : } else {
1587 0 : uprv_memcpy(copyStr, src->context, len+1);
1588 0 : dest->context = copyStr;
1589 0 : dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1590 : }
1591 : }
1592 0 : return dest;
1593 : }
1594 :
1595 :
1596 : static void U_CALLCONV
1597 0 : utf8TextClose(UText *ut) {
1598 : // Most of the work of close is done by the generic UText framework close.
1599 : // All that needs to be done here is to delete the UTF8 string if the UText
1600 : // owns it. This occurs if the UText was created by cloning.
1601 0 : if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1602 0 : char *s = (char *)ut->context;
1603 0 : uprv_free(s);
1604 0 : ut->context = NULL;
1605 : }
1606 0 : }
1607 :
1608 : U_CDECL_END
1609 :
1610 :
1611 : static const struct UTextFuncs utf8Funcs =
1612 : {
1613 : sizeof(UTextFuncs),
1614 : 0, 0, 0, // Reserved alignment padding
1615 : utf8TextClone,
1616 : utf8TextLength,
1617 : utf8TextAccess,
1618 : utf8TextExtract,
1619 : NULL, /* replace*/
1620 : NULL, /* copy */
1621 : utf8TextMapOffsetToNative,
1622 : utf8TextMapIndexToUTF16,
1623 : utf8TextClose,
1624 : NULL, // spare 1
1625 : NULL, // spare 2
1626 : NULL // spare 3
1627 : };
1628 :
1629 :
1630 : static const char gEmptyString[] = {0};
1631 :
1632 : U_CAPI UText * U_EXPORT2
1633 0 : utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
1634 0 : if(U_FAILURE(*status)) {
1635 0 : return NULL;
1636 : }
1637 0 : if(s==NULL && length==0) {
1638 0 : s = gEmptyString;
1639 : }
1640 :
1641 0 : if(s==NULL || length<-1 || length>INT32_MAX) {
1642 0 : *status=U_ILLEGAL_ARGUMENT_ERROR;
1643 0 : return NULL;
1644 : }
1645 :
1646 0 : ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status);
1647 0 : if (U_FAILURE(*status)) {
1648 0 : return ut;
1649 : }
1650 :
1651 0 : ut->pFuncs = &utf8Funcs;
1652 0 : ut->context = s;
1653 0 : ut->b = (int32_t)length;
1654 0 : ut->c = (int32_t)length;
1655 0 : if (ut->c < 0) {
1656 0 : ut->c = 0;
1657 0 : ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1658 : }
1659 0 : ut->p = ut->pExtra;
1660 0 : ut->q = (char *)ut->pExtra + sizeof(UTF8Buf);
1661 0 : return ut;
1662 :
1663 : }
1664 :
1665 :
1666 :
1667 :
1668 :
1669 :
1670 :
1671 :
1672 : //------------------------------------------------------------------------------
1673 : //
1674 : // UText implementation wrapper for Replaceable (read/write)
1675 : //
1676 : // Use of UText data members:
1677 : // context pointer to Replaceable.
1678 : // p pointer to Replaceable if it is owned by the UText.
1679 : //
1680 : //------------------------------------------------------------------------------
1681 :
1682 :
1683 :
1684 : // minimum chunk size for this implementation: 3
1685 : // to allow for possible trimming for code point boundaries
1686 : enum { REP_TEXT_CHUNK_SIZE=10 };
1687 :
1688 : struct ReplExtra {
1689 : /*
1690 : * Chunk UChars.
1691 : * +1 to simplify filling with surrogate pair at the end.
1692 : */
1693 : UChar s[REP_TEXT_CHUNK_SIZE+1];
1694 : };
1695 :
1696 :
1697 : U_CDECL_BEGIN
1698 :
1699 : static UText * U_CALLCONV
1700 0 : repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
1701 : // First do a generic shallow clone. Does everything needed for the UText struct itself.
1702 0 : dest = shallowTextClone(dest, src, status);
1703 :
1704 : // For deep clones, make a copy of the Replaceable.
1705 : // The copied Replaceable storage is owned by the newly created UText clone.
1706 : // A non-NULL pointer in UText.p is the signal to the close() function to delete
1707 : // it.
1708 : //
1709 0 : if (deep && U_SUCCESS(*status)) {
1710 0 : const Replaceable *replSrc = (const Replaceable *)src->context;
1711 0 : dest->context = replSrc->clone();
1712 0 : dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1713 :
1714 : // with deep clone, the copy is writable, even when the source is not.
1715 0 : dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
1716 : }
1717 0 : return dest;
1718 : }
1719 :
1720 :
1721 : static void U_CALLCONV
1722 0 : repTextClose(UText *ut) {
1723 : // Most of the work of close is done by the generic UText framework close.
1724 : // All that needs to be done here is delete the Replaceable if the UText
1725 : // owns it. This occurs if the UText was created by cloning.
1726 0 : if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1727 0 : Replaceable *rep = (Replaceable *)ut->context;
1728 0 : delete rep;
1729 0 : ut->context = NULL;
1730 : }
1731 0 : }
1732 :
1733 :
1734 : static int64_t U_CALLCONV
1735 0 : repTextLength(UText *ut) {
1736 0 : const Replaceable *replSrc = (const Replaceable *)ut->context;
1737 0 : int32_t len = replSrc->length();
1738 0 : return len;
1739 : }
1740 :
1741 :
1742 : static UBool U_CALLCONV
1743 0 : repTextAccess(UText *ut, int64_t index, UBool forward) {
1744 0 : const Replaceable *rep=(const Replaceable *)ut->context;
1745 0 : int32_t length=rep->length(); // Full length of the input text (bigger than a chunk)
1746 :
1747 : // clip the requested index to the limits of the text.
1748 0 : int32_t index32 = pinIndex(index, length);
1749 0 : U_ASSERT(index<=INT32_MAX);
1750 :
1751 :
1752 : /*
1753 : * Compute start/limit boundaries around index, for a segment of text
1754 : * to be extracted.
1755 : * To allow for the possibility that our user gave an index to the trailing
1756 : * half of a surrogate pair, we must request one extra preceding UChar when
1757 : * going in the forward direction. This will ensure that the buffer has the
1758 : * entire code point at the specified index.
1759 : */
1760 0 : if(forward) {
1761 :
1762 0 : if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
1763 : // Buffer already contains the requested position.
1764 0 : ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1765 0 : return TRUE;
1766 : }
1767 0 : if (index32>=length && ut->chunkNativeLimit==length) {
1768 : // Request for end of string, and buffer already extends up to it.
1769 : // Can't get the data, but don't change the buffer.
1770 0 : ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
1771 0 : return FALSE;
1772 : }
1773 :
1774 0 : ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
1775 : // Going forward, so we want to have the buffer with stuff at and beyond
1776 : // the requested index. The -1 gets us one code point before the
1777 : // requested index also, to handle the case of the index being on
1778 : // a trail surrogate of a surrogate pair.
1779 0 : if(ut->chunkNativeLimit > length) {
1780 0 : ut->chunkNativeLimit = length;
1781 : }
1782 : // unless buffer ran off end, start is index-1.
1783 0 : ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
1784 0 : if(ut->chunkNativeStart < 0) {
1785 0 : ut->chunkNativeStart = 0;
1786 : }
1787 : } else {
1788 : // Reverse iteration. Fill buffer with data preceding the requested index.
1789 0 : if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
1790 : // Requested position already in buffer.
1791 0 : ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
1792 0 : return TRUE;
1793 : }
1794 0 : if (index32==0 && ut->chunkNativeStart==0) {
1795 : // Request for start, buffer already begins at start.
1796 : // No data, but keep the buffer as is.
1797 0 : ut->chunkOffset = 0;
1798 0 : return FALSE;
1799 : }
1800 :
1801 : // Figure out the bounds of the chunk to extract for reverse iteration.
1802 : // Need to worry about chunk not splitting surrogate pairs, and while still
1803 : // containing the data we need.
1804 : // Fix by requesting a chunk that includes an extra UChar at the end.
1805 : // If this turns out to be a lead surrogate, we can lop it off and still have
1806 : // the data we wanted.
1807 0 : ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE;
1808 0 : if (ut->chunkNativeStart < 0) {
1809 0 : ut->chunkNativeStart = 0;
1810 : }
1811 :
1812 0 : ut->chunkNativeLimit = index32 + 1;
1813 0 : if (ut->chunkNativeLimit > length) {
1814 0 : ut->chunkNativeLimit = length;
1815 : }
1816 : }
1817 :
1818 : // Extract the new chunk of text from the Replaceable source.
1819 0 : ReplExtra *ex = (ReplExtra *)ut->pExtra;
1820 : // UnicodeString with its buffer a writable alias to the chunk buffer
1821 0 : UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
1822 0 : rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);
1823 :
1824 0 : ut->chunkContents = ex->s;
1825 0 : ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
1826 0 : ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart);
1827 :
1828 : // Surrogate pairs from the input text must not span chunk boundaries.
1829 : // If end of chunk could be the start of a surrogate, trim it off.
1830 0 : if (ut->chunkNativeLimit < length &&
1831 0 : U16_IS_LEAD(ex->s[ut->chunkLength-1])) {
1832 0 : ut->chunkLength--;
1833 0 : ut->chunkNativeLimit--;
1834 0 : if (ut->chunkOffset > ut->chunkLength) {
1835 0 : ut->chunkOffset = ut->chunkLength;
1836 : }
1837 : }
1838 :
1839 : // if the first UChar in the chunk could be the trailing half of a surrogate pair,
1840 : // trim it off.
1841 0 : if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
1842 0 : ++(ut->chunkContents);
1843 0 : ++(ut->chunkNativeStart);
1844 0 : --(ut->chunkLength);
1845 0 : --(ut->chunkOffset);
1846 : }
1847 :
1848 : // adjust the index/chunkOffset to a code point boundary
1849 0 : U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset);
1850 :
1851 : // Use fast indexing for get/setNativeIndex()
1852 0 : ut->nativeIndexingLimit = ut->chunkLength;
1853 :
1854 0 : return TRUE;
1855 : }
1856 :
1857 :
1858 :
1859 : static int32_t U_CALLCONV
1860 0 : repTextExtract(UText *ut,
1861 : int64_t start, int64_t limit,
1862 : UChar *dest, int32_t destCapacity,
1863 : UErrorCode *status) {
1864 0 : const Replaceable *rep=(const Replaceable *)ut->context;
1865 0 : int32_t length=rep->length();
1866 :
1867 0 : if(U_FAILURE(*status)) {
1868 0 : return 0;
1869 : }
1870 0 : if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1871 0 : *status=U_ILLEGAL_ARGUMENT_ERROR;
1872 : }
1873 0 : if(start>limit) {
1874 0 : *status=U_INDEX_OUTOFBOUNDS_ERROR;
1875 0 : return 0;
1876 : }
1877 :
1878 0 : int32_t start32 = pinIndex(start, length);
1879 0 : int32_t limit32 = pinIndex(limit, length);
1880 :
1881 : // adjust start, limit if they point to trail half of surrogates
1882 0 : if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
1883 0 : U_IS_SUPPLEMENTARY(rep->char32At(start32))){
1884 0 : start32--;
1885 : }
1886 0 : if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
1887 0 : U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
1888 0 : limit32--;
1889 : }
1890 :
1891 0 : length=limit32-start32;
1892 0 : if(length>destCapacity) {
1893 0 : limit32 = start32 + destCapacity;
1894 : }
1895 0 : UnicodeString buffer(dest, 0, destCapacity); // writable alias
1896 0 : rep->extractBetween(start32, limit32, buffer);
1897 0 : repTextAccess(ut, limit32, TRUE);
1898 :
1899 0 : return u_terminateUChars(dest, destCapacity, length, status);
1900 : }
1901 :
1902 : static int32_t U_CALLCONV
1903 0 : repTextReplace(UText *ut,
1904 : int64_t start, int64_t limit,
1905 : const UChar *src, int32_t length,
1906 : UErrorCode *status) {
1907 0 : Replaceable *rep=(Replaceable *)ut->context;
1908 : int32_t oldLength;
1909 :
1910 0 : if(U_FAILURE(*status)) {
1911 0 : return 0;
1912 : }
1913 0 : if(src==NULL && length!=0) {
1914 0 : *status=U_ILLEGAL_ARGUMENT_ERROR;
1915 0 : return 0;
1916 : }
1917 0 : oldLength=rep->length(); // will subtract from new length
1918 0 : if(start>limit ) {
1919 0 : *status=U_INDEX_OUTOFBOUNDS_ERROR;
1920 0 : return 0;
1921 : }
1922 :
1923 0 : int32_t start32 = pinIndex(start, oldLength);
1924 0 : int32_t limit32 = pinIndex(limit, oldLength);
1925 :
1926 : // Snap start & limit to code point boundaries.
1927 0 : if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
1928 0 : start32>0 && U16_IS_LEAD(rep->charAt(start32-1)))
1929 : {
1930 0 : start32--;
1931 : }
1932 0 : if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) &&
1933 0 : U16_IS_TRAIL(rep->charAt(limit32)))
1934 : {
1935 0 : limit32++;
1936 : }
1937 :
1938 : // Do the actual replace operation using methods of the Replaceable class
1939 0 : UnicodeString replStr((UBool)(length<0), src, length); // read-only alias
1940 0 : rep->handleReplaceBetween(start32, limit32, replStr);
1941 0 : int32_t newLength = rep->length();
1942 0 : int32_t lengthDelta = newLength - oldLength;
1943 :
1944 : // Is the UText chunk buffer OK?
1945 0 : if (ut->chunkNativeLimit > start32) {
1946 : // this replace operation may have impacted the current chunk.
1947 : // invalidate it, which will force a reload on the next access.
1948 0 : invalidateChunk(ut);
1949 : }
1950 :
1951 : // set the iteration position to the end of the newly inserted replacement text.
1952 0 : int32_t newIndexPos = limit32 + lengthDelta;
1953 0 : repTextAccess(ut, newIndexPos, TRUE);
1954 :
1955 0 : return lengthDelta;
1956 : }
1957 :
1958 :
1959 : static void U_CALLCONV
1960 0 : repTextCopy(UText *ut,
1961 : int64_t start, int64_t limit,
1962 : int64_t destIndex,
1963 : UBool move,
1964 : UErrorCode *status)
1965 : {
1966 0 : Replaceable *rep=(Replaceable *)ut->context;
1967 0 : int32_t length=rep->length();
1968 :
1969 0 : if(U_FAILURE(*status)) {
1970 0 : return;
1971 : }
1972 0 : if (start>limit || (start<destIndex && destIndex<limit))
1973 : {
1974 0 : *status=U_INDEX_OUTOFBOUNDS_ERROR;
1975 0 : return;
1976 : }
1977 :
1978 0 : int32_t start32 = pinIndex(start, length);
1979 0 : int32_t limit32 = pinIndex(limit, length);
1980 0 : int32_t destIndex32 = pinIndex(destIndex, length);
1981 :
1982 : // TODO: snap input parameters to code point boundaries.
1983 :
1984 0 : if(move) {
1985 : // move: copy to destIndex, then replace original with nothing
1986 0 : int32_t segLength=limit32-start32;
1987 0 : rep->copy(start32, limit32, destIndex32);
1988 0 : if(destIndex32<start32) {
1989 0 : start32+=segLength;
1990 0 : limit32+=segLength;
1991 : }
1992 0 : rep->handleReplaceBetween(start32, limit32, UnicodeString());
1993 : } else {
1994 : // copy
1995 0 : rep->copy(start32, limit32, destIndex32);
1996 : }
1997 :
1998 : // If the change to the text touched the region in the chunk buffer,
1999 : // invalidate the buffer.
2000 0 : int32_t firstAffectedIndex = destIndex32;
2001 0 : if (move && start32<firstAffectedIndex) {
2002 0 : firstAffectedIndex = start32;
2003 : }
2004 0 : if (firstAffectedIndex < ut->chunkNativeLimit) {
2005 : // changes may have affected range covered by the chunk
2006 0 : invalidateChunk(ut);
2007 : }
2008 :
2009 : // Put iteration position at the newly inserted (moved) block,
2010 0 : int32_t nativeIterIndex = destIndex32 + limit32 - start32;
2011 0 : if (move && destIndex32>start32) {
2012 : // moved a block of text towards the end of the string.
2013 0 : nativeIterIndex = destIndex32;
2014 : }
2015 :
2016 : // Set position, reload chunk if needed.
2017 0 : repTextAccess(ut, nativeIterIndex, TRUE);
2018 : }
2019 :
2020 : static const struct UTextFuncs repFuncs =
2021 : {
2022 : sizeof(UTextFuncs),
2023 : 0, 0, 0, // Reserved alignment padding
2024 : repTextClone,
2025 : repTextLength,
2026 : repTextAccess,
2027 : repTextExtract,
2028 : repTextReplace,
2029 : repTextCopy,
2030 : NULL, // MapOffsetToNative,
2031 : NULL, // MapIndexToUTF16,
2032 : repTextClose,
2033 : NULL, // spare 1
2034 : NULL, // spare 2
2035 : NULL // spare 3
2036 : };
2037 :
2038 :
2039 : U_CAPI UText * U_EXPORT2
2040 0 : utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
2041 : {
2042 0 : if(U_FAILURE(*status)) {
2043 0 : return NULL;
2044 : }
2045 0 : if(rep==NULL) {
2046 0 : *status=U_ILLEGAL_ARGUMENT_ERROR;
2047 0 : return NULL;
2048 : }
2049 0 : ut = utext_setup(ut, sizeof(ReplExtra), status);
2050 0 : if(U_FAILURE(*status)) {
2051 0 : return ut;
2052 : }
2053 :
2054 0 : ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2055 0 : if(rep->hasMetaData()) {
2056 0 : ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
2057 : }
2058 :
2059 0 : ut->pFuncs = &repFuncs;
2060 0 : ut->context = rep;
2061 0 : return ut;
2062 : }
2063 :
2064 : U_CDECL_END
2065 :
2066 :
2067 :
2068 :
2069 :
2070 :
2071 :
2072 :
2073 : //------------------------------------------------------------------------------
2074 : //
2075 : // UText implementation for UnicodeString (read/write) and
2076 : // for const UnicodeString (read only)
2077 : // (same implementation, only the flags are different)
2078 : //
2079 : // Use of UText data members:
2080 : // context pointer to UnicodeString
2081 : // providerProperties UTEXT_PROVIDER_OWNS_TEXT is set IF this UText owns the string
2082 : // (deep clone); the string in context is then deleted on close().
2083 : //
2084 : //------------------------------------------------------------------------------
2085 :
2086 : U_CDECL_BEGIN
2087 :
2088 :
2089 : static UText * U_CALLCONV
2090 0 : unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
2091 : // First do a generic shallow clone. Does everything needed for the UText struct itself.
2092 0 : dest = shallowTextClone(dest, src, status);
2093 :
2094 : // For deep clones, make a copy of the UnicodeSring.
2095 : // The copied UnicodeString storage is owned by the newly created UText clone.
2096 : // A non-NULL pointer in UText.p is the signal to the close() function to delete
2097 : // the UText.
2098 : //
2099 0 : if (deep && U_SUCCESS(*status)) {
2100 0 : const UnicodeString *srcString = (const UnicodeString *)src->context;
2101 0 : dest->context = new UnicodeString(*srcString);
2102 0 : dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2103 :
2104 : // with deep clone, the copy is writable, even when the source is not.
2105 0 : dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2106 : }
2107 0 : return dest;
2108 : }
2109 :
2110 : static void U_CALLCONV
2111 0 : unistrTextClose(UText *ut) {
2112 : // Most of the work of close is done by the generic UText framework close.
2113 : // All that needs to be done here is delete the UnicodeString if the UText
2114 : // owns it. This occurs if the UText was created by cloning.
2115 0 : if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2116 0 : UnicodeString *str = (UnicodeString *)ut->context;
2117 0 : delete str;
2118 0 : ut->context = NULL;
2119 : }
2120 0 : }
2121 :
2122 :
2123 : static int64_t U_CALLCONV
2124 0 : unistrTextLength(UText *t) {
2125 0 : return ((const UnicodeString *)t->context)->length();
2126 : }
2127 :
2128 :
2129 : static UBool U_CALLCONV
2130 0 : unistrTextAccess(UText *ut, int64_t index, UBool forward) {
2131 0 : int32_t length = ut->chunkLength;
2132 0 : ut->chunkOffset = pinIndex(index, length);
2133 :
2134 : // Check whether request is at the start or end
2135 0 : UBool retVal = (forward && index<length) || (!forward && index>0);
2136 0 : return retVal;
2137 : }
2138 :
2139 :
2140 :
2141 : static int32_t U_CALLCONV
2142 0 : unistrTextExtract(UText *t,
2143 : int64_t start, int64_t limit,
2144 : UChar *dest, int32_t destCapacity,
2145 : UErrorCode *pErrorCode) {
2146 0 : const UnicodeString *us=(const UnicodeString *)t->context;
2147 0 : int32_t length=us->length();
2148 :
2149 0 : if(U_FAILURE(*pErrorCode)) {
2150 0 : return 0;
2151 : }
2152 0 : if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
2153 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2154 : }
2155 0 : if(start<0 || start>limit) {
2156 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2157 0 : return 0;
2158 : }
2159 :
2160 0 : int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
2161 0 : int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
2162 :
2163 0 : length=limit32-start32;
2164 0 : if (destCapacity>0 && dest!=NULL) {
2165 0 : int32_t trimmedLength = length;
2166 0 : if(trimmedLength>destCapacity) {
2167 0 : trimmedLength=destCapacity;
2168 : }
2169 0 : us->extract(start32, trimmedLength, dest);
2170 0 : t->chunkOffset = start32+trimmedLength;
2171 : } else {
2172 0 : t->chunkOffset = start32;
2173 : }
2174 0 : u_terminateUChars(dest, destCapacity, length, pErrorCode);
2175 0 : return length;
2176 : }
2177 :
2178 : static int32_t U_CALLCONV
2179 0 : unistrTextReplace(UText *ut,
2180 : int64_t start, int64_t limit,
2181 : const UChar *src, int32_t length,
2182 : UErrorCode *pErrorCode) {
2183 0 : UnicodeString *us=(UnicodeString *)ut->context;
2184 : int32_t oldLength;
2185 :
2186 0 : if(U_FAILURE(*pErrorCode)) {
2187 0 : return 0;
2188 : }
2189 0 : if(src==NULL && length!=0) {
2190 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2191 : }
2192 0 : if(start>limit) {
2193 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2194 0 : return 0;
2195 : }
2196 0 : oldLength=us->length();
2197 0 : int32_t start32 = pinIndex(start, oldLength);
2198 0 : int32_t limit32 = pinIndex(limit, oldLength);
2199 0 : if (start32 < oldLength) {
2200 0 : start32 = us->getChar32Start(start32);
2201 : }
2202 0 : if (limit32 < oldLength) {
2203 0 : limit32 = us->getChar32Start(limit32);
2204 : }
2205 :
2206 : // replace
2207 0 : us->replace(start32, limit32-start32, src, length);
2208 0 : int32_t newLength = us->length();
2209 :
2210 : // Update the chunk description.
2211 0 : ut->chunkContents = us->getBuffer();
2212 0 : ut->chunkLength = newLength;
2213 0 : ut->chunkNativeLimit = newLength;
2214 0 : ut->nativeIndexingLimit = newLength;
2215 :
2216 : // Set iteration position to the point just following the newly inserted text.
2217 0 : int32_t lengthDelta = newLength - oldLength;
2218 0 : ut->chunkOffset = limit32 + lengthDelta;
2219 :
2220 0 : return lengthDelta;
2221 : }
2222 :
2223 : static void U_CALLCONV
2224 0 : unistrTextCopy(UText *ut,
2225 : int64_t start, int64_t limit,
2226 : int64_t destIndex,
2227 : UBool move,
2228 : UErrorCode *pErrorCode) {
2229 0 : UnicodeString *us=(UnicodeString *)ut->context;
2230 0 : int32_t length=us->length();
2231 :
2232 0 : if(U_FAILURE(*pErrorCode)) {
2233 0 : return;
2234 : }
2235 0 : int32_t start32 = pinIndex(start, length);
2236 0 : int32_t limit32 = pinIndex(limit, length);
2237 0 : int32_t destIndex32 = pinIndex(destIndex, length);
2238 :
2239 0 : if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) {
2240 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2241 0 : return;
2242 : }
2243 :
2244 0 : if(move) {
2245 : // move: copy to destIndex, then remove original
2246 0 : int32_t segLength=limit32-start32;
2247 0 : us->copy(start32, limit32, destIndex32);
2248 0 : if(destIndex32<start32) {
2249 0 : start32+=segLength;
2250 : }
2251 0 : us->remove(start32, segLength);
2252 : } else {
2253 : // copy
2254 0 : us->copy(start32, limit32, destIndex32);
2255 : }
2256 :
2257 : // update chunk description, set iteration position.
2258 0 : ut->chunkContents = us->getBuffer();
2259 0 : if (move==FALSE) {
2260 : // copy operation, string length grows
2261 0 : ut->chunkLength += limit32-start32;
2262 0 : ut->chunkNativeLimit = ut->chunkLength;
2263 0 : ut->nativeIndexingLimit = ut->chunkLength;
2264 : }
2265 :
2266 : // Iteration position to end of the newly inserted text.
2267 0 : ut->chunkOffset = destIndex32+limit32-start32;
2268 0 : if (move && destIndex32>start32) {
2269 0 : ut->chunkOffset = destIndex32;
2270 : }
2271 :
2272 : }
2273 :
2274 : static const struct UTextFuncs unistrFuncs =
2275 : {
2276 : sizeof(UTextFuncs),
2277 : 0, 0, 0, // Reserved alignment padding
2278 : unistrTextClone,
2279 : unistrTextLength,
2280 : unistrTextAccess,
2281 : unistrTextExtract,
2282 : unistrTextReplace,
2283 : unistrTextCopy,
2284 : NULL, // MapOffsetToNative,
2285 : NULL, // MapIndexToUTF16,
2286 : unistrTextClose,
2287 : NULL, // spare 1
2288 : NULL, // spare 2
2289 : NULL // spare 3
2290 : };
2291 :
2292 :
2293 :
2294 : U_CDECL_END
2295 :
2296 :
2297 : U_CAPI UText * U_EXPORT2
2298 0 : utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
2299 0 : ut = utext_openConstUnicodeString(ut, s, status);
2300 0 : if (U_SUCCESS(*status)) {
2301 0 : ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2302 : }
2303 0 : return ut;
2304 : }
2305 :
2306 :
2307 :
2308 : U_CAPI UText * U_EXPORT2
2309 0 : utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
2310 0 : if (U_SUCCESS(*status) && s->isBogus()) {
2311 : // The UnicodeString is bogus, but we still need to detach the UText
2312 : // from whatever it was hooked to before, if anything.
2313 0 : utext_openUChars(ut, NULL, 0, status);
2314 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
2315 0 : return ut;
2316 : }
2317 0 : ut = utext_setup(ut, 0, status);
2318 : // note: use the standard (writable) function table for UnicodeString.
2319 : // The flag settings disable writing, so having the functions in
2320 : // the table is harmless.
2321 0 : if (U_SUCCESS(*status)) {
2322 0 : ut->pFuncs = &unistrFuncs;
2323 0 : ut->context = s;
2324 0 : ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2325 0 : ut->chunkContents = s->getBuffer();
2326 0 : ut->chunkLength = s->length();
2327 0 : ut->chunkNativeStart = 0;
2328 0 : ut->chunkNativeLimit = ut->chunkLength;
2329 0 : ut->nativeIndexingLimit = ut->chunkLength;
2330 : }
2331 0 : return ut;
2332 : }
2333 :
2334 : //------------------------------------------------------------------------------
2335 : //
2336 : // UText implementation for const UChar * strings
2337 : //
2338 : // Use of UText data members:
2339 : // context pointer to the (const UChar *) string
2340 : // a length. -1 if not yet known.
2341 : //
2342 : // TODO: support 64 bit lengths.
2343 : //
2344 : //------------------------------------------------------------------------------
2345 :
2346 : U_CDECL_BEGIN
2347 :
2348 :
2349 : static UText * U_CALLCONV
2350 0 : ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) {
2351 : // First do a generic shallow clone.
2352 0 : dest = shallowTextClone(dest, src, status);
2353 :
2354 : // For deep clones, make a copy of the string.
2355 : // The copied storage is owned by the newly created clone.
2356 : // A non-NULL pointer in UText.p is the signal to the close() function to delete
2357 : // it.
2358 : //
2359 0 : if (deep && U_SUCCESS(*status)) {
2360 0 : U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
2361 0 : int32_t len = (int32_t)utext_nativeLength(dest);
2362 :
2363 : // The cloned string IS going to be NUL terminated, whether or not the original was.
2364 0 : const UChar *srcStr = (const UChar *)src->context;
2365 0 : UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar));
2366 0 : if (copyStr == NULL) {
2367 0 : *status = U_MEMORY_ALLOCATION_ERROR;
2368 : } else {
2369 : int64_t i;
2370 0 : for (i=0; i<len; i++) {
2371 0 : copyStr[i] = srcStr[i];
2372 : }
2373 0 : copyStr[len] = 0;
2374 0 : dest->context = copyStr;
2375 0 : dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2376 : }
2377 : }
2378 0 : return dest;
2379 : }
2380 :
2381 :
2382 : static void U_CALLCONV
2383 0 : ucstrTextClose(UText *ut) {
2384 : // Most of the work of close is done by the generic UText framework close.
2385 : // All that needs to be done here is delete the string if the UText
2386 : // owns it. This occurs if the UText was created by cloning.
2387 0 : if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2388 0 : UChar *s = (UChar *)ut->context;
2389 0 : uprv_free(s);
2390 0 : ut->context = NULL;
2391 : }
2392 0 : }
2393 :
2394 :
2395 :
2396 : static int64_t U_CALLCONV
2397 0 : ucstrTextLength(UText *ut) {
2398 0 : if (ut->a < 0) {
2399 : // null terminated, we don't yet know the length. Scan for it.
2400 : // Access is not convenient for doing this
2401 : // because the current interation postion can't be changed.
2402 0 : const UChar *str = (const UChar *)ut->context;
2403 : for (;;) {
2404 0 : if (str[ut->chunkNativeLimit] == 0) {
2405 0 : break;
2406 : }
2407 0 : ut->chunkNativeLimit++;
2408 : }
2409 0 : ut->a = ut->chunkNativeLimit;
2410 0 : ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2411 0 : ut->nativeIndexingLimit = ut->chunkLength;
2412 0 : ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2413 : }
2414 0 : return ut->a;
2415 : }
2416 :
2417 :
2418 : static UBool U_CALLCONV
2419 0 : ucstrTextAccess(UText *ut, int64_t index, UBool forward) {
2420 0 : const UChar *str = (const UChar *)ut->context;
2421 :
2422 : // pin the requested index to the bounds of the string,
2423 : // and set current iteration position.
2424 0 : if (index<0) {
2425 0 : index = 0;
2426 0 : } else if (index < ut->chunkNativeLimit) {
2427 : // The request data is within the chunk as it is known so far.
2428 : // Put index on a code point boundary.
2429 0 : U16_SET_CP_START(str, 0, index);
2430 0 : } else if (ut->a >= 0) {
2431 : // We know the length of this string, and the user is requesting something
2432 : // at or beyond the length. Pin the requested index to the length.
2433 0 : index = ut->a;
2434 : } else {
2435 : // Null terminated string, length not yet known, and the requested index
2436 : // is beyond where we have scanned so far.
2437 : // Scan to 32 UChars beyond the requested index. The strategy here is
2438 : // to avoid fully scanning a long string when the caller only wants to
2439 : // see a few characters at its beginning.
2440 0 : int32_t scanLimit = (int32_t)index + 32;
2441 0 : if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression
2442 0 : scanLimit = INT32_MAX;
2443 : }
2444 :
2445 0 : int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;
2446 0 : for (; chunkLimit<scanLimit; chunkLimit++) {
2447 0 : if (str[chunkLimit] == 0) {
2448 : // We found the end of the string. Remember it, pin the requested index to it,
2449 : // and bail out of here.
2450 0 : ut->a = chunkLimit;
2451 0 : ut->chunkLength = chunkLimit;
2452 0 : ut->nativeIndexingLimit = chunkLimit;
2453 0 : if (index >= chunkLimit) {
2454 0 : index = chunkLimit;
2455 : } else {
2456 0 : U16_SET_CP_START(str, 0, index);
2457 : }
2458 :
2459 0 : ut->chunkNativeLimit = chunkLimit;
2460 0 : ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2461 0 : goto breakout;
2462 : }
2463 : }
2464 : // We scanned through the next batch of UChars without finding the end.
2465 0 : U16_SET_CP_START(str, 0, index);
2466 0 : if (chunkLimit == INT32_MAX) {
2467 : // Scanned to the limit of a 32 bit length.
2468 : // Forceably trim the overlength string back so length fits in int32
2469 : // TODO: add support for 64 bit strings.
2470 0 : ut->a = chunkLimit;
2471 0 : ut->chunkLength = chunkLimit;
2472 0 : ut->nativeIndexingLimit = chunkLimit;
2473 0 : if (index > chunkLimit) {
2474 0 : index = chunkLimit;
2475 : }
2476 0 : ut->chunkNativeLimit = chunkLimit;
2477 0 : ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2478 : } else {
2479 : // The endpoint of a chunk must not be left in the middle of a surrogate pair.
2480 : // If the current end is on a lead surrogate, back the end up by one.
2481 : // It doesn't matter if the end char happens to be an unpaired surrogate,
2482 : // and it's simpler not to worry about it.
2483 0 : if (U16_IS_LEAD(str[chunkLimit-1])) {
2484 0 : --chunkLimit;
2485 : }
2486 : // Null-terminated chunk with end still unknown.
2487 : // Update the chunk length to reflect what has been scanned thus far.
2488 : // That the full length is still unknown is (still) flagged by
2489 : // ut->a being < 0.
2490 0 : ut->chunkNativeLimit = chunkLimit;
2491 0 : ut->nativeIndexingLimit = chunkLimit;
2492 0 : ut->chunkLength = chunkLimit;
2493 : }
2494 :
2495 : }
2496 : breakout:
2497 0 : U_ASSERT(index<=INT32_MAX);
2498 0 : ut->chunkOffset = (int32_t)index;
2499 :
2500 : // Check whether request is at the start or end
2501 0 : UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0);
2502 0 : return retVal;
2503 : }
2504 :
2505 :
2506 :
2507 : static int32_t U_CALLCONV
2508 0 : ucstrTextExtract(UText *ut,
2509 : int64_t start, int64_t limit,
2510 : UChar *dest, int32_t destCapacity,
2511 : UErrorCode *pErrorCode)
2512 : {
2513 0 : if(U_FAILURE(*pErrorCode)) {
2514 0 : return 0;
2515 : }
2516 0 : if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
2517 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2518 0 : return 0;
2519 : }
2520 :
2521 : //const UChar *s=(const UChar *)ut->context;
2522 : int32_t si, di;
2523 :
2524 : int32_t start32;
2525 : int32_t limit32;
2526 :
2527 : // Access the start. Does two things we need:
2528 : // Pins 'start' to the length of the string, if it came in out-of-bounds.
2529 : // Snaps 'start' to the beginning of a code point.
2530 0 : ucstrTextAccess(ut, start, TRUE);
2531 0 : const UChar *s=ut->chunkContents;
2532 0 : start32 = ut->chunkOffset;
2533 :
2534 0 : int32_t strLength=(int32_t)ut->a;
2535 0 : if (strLength >= 0) {
2536 0 : limit32 = pinIndex(limit, strLength);
2537 : } else {
2538 0 : limit32 = pinIndex(limit, INT32_MAX);
2539 : }
2540 0 : di = 0;
2541 0 : for (si=start32; si<limit32; si++) {
2542 0 : if (strLength<0 && s[si]==0) {
2543 : // Just hit the end of a null-terminated string.
2544 0 : ut->a = si; // set string length for this UText
2545 0 : ut->chunkNativeLimit = si;
2546 0 : ut->chunkLength = si;
2547 0 : ut->nativeIndexingLimit = si;
2548 0 : strLength = si;
2549 0 : limit32 = si;
2550 0 : break;
2551 : }
2552 0 : U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */
2553 0 : if (di<destCapacity) {
2554 : // only store if there is space.
2555 0 : dest[di] = s[si];
2556 : } else {
2557 0 : if (strLength>=0) {
2558 : // We have filled the destination buffer, and the string length is known.
2559 : // Cut the loop short. There is no need to scan string termination.
2560 0 : di = limit32 - start32;
2561 0 : si = limit32;
2562 0 : break;
2563 : }
2564 : }
2565 0 : di++;
2566 : }
2567 :
2568 : // If the limit index points to a lead surrogate of a pair,
2569 : // add the corresponding trail surrogate to the destination.
2570 0 : if (si>0 && U16_IS_LEAD(s[si-1]) &&
2571 0 : ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si])))
2572 : {
2573 0 : if (di<destCapacity) {
2574 : // store only if there is space in the output buffer.
2575 0 : dest[di++] = s[si];
2576 : }
2577 0 : si++;
2578 : }
2579 :
2580 : // Put iteration position at the point just following the extracted text
2581 0 : if (si <= ut->chunkNativeLimit) {
2582 0 : ut->chunkOffset = si;
2583 : } else {
2584 0 : ucstrTextAccess(ut, si, TRUE);
2585 : }
2586 :
2587 : // Add a terminating NUL if space in the buffer permits,
2588 : // and set the error status as required.
2589 0 : u_terminateUChars(dest, destCapacity, di, pErrorCode);
2590 0 : return di;
2591 : }
2592 :
2593 : static const struct UTextFuncs ucstrFuncs =
2594 : {
2595 : sizeof(UTextFuncs),
2596 : 0, 0, 0, // Reserved alignment padding
2597 : ucstrTextClone,
2598 : ucstrTextLength,
2599 : ucstrTextAccess,
2600 : ucstrTextExtract,
2601 : NULL, // Replace
2602 : NULL, // Copy
2603 : NULL, // MapOffsetToNative,
2604 : NULL, // MapIndexToUTF16,
2605 : ucstrTextClose,
2606 : NULL, // spare 1
2607 : NULL, // spare 2
2608 : NULL, // spare 3
2609 : };
2610 :
2611 : U_CDECL_END
2612 :
2613 : static const UChar gEmptyUString[] = {0};
2614 :
2615 : U_CAPI UText * U_EXPORT2
2616 0 : utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) {
2617 0 : if (U_FAILURE(*status)) {
2618 0 : return NULL;
2619 : }
2620 0 : if(s==NULL && length==0) {
2621 0 : s = gEmptyUString;
2622 : }
2623 0 : if (s==NULL || length < -1 || length>INT32_MAX) {
2624 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
2625 0 : return NULL;
2626 : }
2627 0 : ut = utext_setup(ut, 0, status);
2628 0 : if (U_SUCCESS(*status)) {
2629 0 : ut->pFuncs = &ucstrFuncs;
2630 0 : ut->context = s;
2631 0 : ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2632 0 : if (length==-1) {
2633 0 : ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2634 : }
2635 0 : ut->a = length;
2636 0 : ut->chunkContents = s;
2637 0 : ut->chunkNativeStart = 0;
2638 0 : ut->chunkNativeLimit = length>=0? length : 0;
2639 0 : ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2640 0 : ut->chunkOffset = 0;
2641 0 : ut->nativeIndexingLimit = ut->chunkLength;
2642 : }
2643 0 : return ut;
2644 : }
2645 :
2646 :
2647 : //------------------------------------------------------------------------------
2648 : //
2649 : // UText implementation for text from ICU CharacterIterators
2650 : //
2651 : // Use of UText data members:
2652 : // context pointer to the CharacterIterator
2653 : // a length of the full text.
2654 : // p pointer to buffer 1
2655 : // b start index of local buffer 1 contents
2656 : // q pointer to buffer 2
2657 : // c start index of local buffer 2 contents
2658 : // r pointer to the character iterator if the UText owns it.
2659 : // Null otherwise.
2660 : //
2661 : //------------------------------------------------------------------------------
2662 : #define CIBufSize 16
2663 :
2664 : U_CDECL_BEGIN
2665 : static void U_CALLCONV
2666 0 : charIterTextClose(UText *ut) {
2667 : // Most of the work of close is done by the generic UText framework close.
2668 : // All that needs to be done here is delete the CharacterIterator if the UText
2669 : // owns it. This occurs if the UText was created by cloning.
2670 0 : CharacterIterator *ci = (CharacterIterator *)ut->r;
2671 0 : delete ci;
2672 0 : ut->r = NULL;
2673 0 : }
2674 :
2675 : static int64_t U_CALLCONV
2676 0 : charIterTextLength(UText *ut) {
2677 0 : return (int32_t)ut->a;
2678 : }
2679 :
2680 : static UBool U_CALLCONV
2681 0 : charIterTextAccess(UText *ut, int64_t index, UBool forward) {
2682 0 : CharacterIterator *ci = (CharacterIterator *)ut->context;
2683 :
2684 0 : int32_t clippedIndex = (int32_t)index;
2685 0 : if (clippedIndex<0) {
2686 0 : clippedIndex=0;
2687 0 : } else if (clippedIndex>=ut->a) {
2688 0 : clippedIndex=(int32_t)ut->a;
2689 : }
2690 0 : int32_t neededIndex = clippedIndex;
2691 0 : if (!forward && neededIndex>0) {
2692 : // reverse iteration, want the position just before what was asked for.
2693 0 : neededIndex--;
2694 0 : } else if (forward && neededIndex==ut->a && neededIndex>0) {
2695 : // Forward iteration, don't ask for something past the end of the text.
2696 0 : neededIndex--;
2697 : }
2698 :
2699 : // Find the native index of the start of the buffer containing what we want.
2700 0 : neededIndex -= neededIndex % CIBufSize;
2701 :
2702 0 : UChar *buf = NULL;
2703 0 : UBool needChunkSetup = TRUE;
2704 : int i;
2705 0 : if (ut->chunkNativeStart == neededIndex) {
2706 : // The buffer we want is already the current chunk.
2707 0 : needChunkSetup = FALSE;
2708 0 : } else if (ut->b == neededIndex) {
2709 : // The first buffer (buffer p) has what we need.
2710 0 : buf = (UChar *)ut->p;
2711 0 : } else if (ut->c == neededIndex) {
2712 : // The second buffer (buffer q) has what we need.
2713 0 : buf = (UChar *)ut->q;
2714 : } else {
2715 : // Neither buffer already has what we need.
2716 : // Load new data from the character iterator.
2717 : // Use the buf that is not the current buffer.
2718 0 : buf = (UChar *)ut->p;
2719 0 : if (ut->p == ut->chunkContents) {
2720 0 : buf = (UChar *)ut->q;
2721 : }
2722 0 : ci->setIndex(neededIndex);
2723 0 : for (i=0; i<CIBufSize; i++) {
2724 0 : buf[i] = ci->nextPostInc();
2725 0 : if (i+neededIndex > ut->a) {
2726 0 : break;
2727 : }
2728 : }
2729 : }
2730 :
2731 : // We have a buffer with the data we need.
2732 : // Set it up as the current chunk, if it wasn't already.
2733 0 : if (needChunkSetup) {
2734 0 : ut->chunkContents = buf;
2735 0 : ut->chunkLength = CIBufSize;
2736 0 : ut->chunkNativeStart = neededIndex;
2737 0 : ut->chunkNativeLimit = neededIndex + CIBufSize;
2738 0 : if (ut->chunkNativeLimit > ut->a) {
2739 0 : ut->chunkNativeLimit = ut->a;
2740 0 : ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
2741 : }
2742 0 : ut->nativeIndexingLimit = ut->chunkLength;
2743 0 : U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize);
2744 : }
2745 0 : ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
2746 0 : UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0);
2747 0 : return success;
2748 : }
2749 :
2750 : static UText * U_CALLCONV
2751 0 : charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) {
2752 0 : if (U_FAILURE(*status)) {
2753 0 : return NULL;
2754 : }
2755 :
2756 0 : if (deep) {
2757 : // There is no CharacterIterator API for cloning the underlying text storage.
2758 0 : *status = U_UNSUPPORTED_ERROR;
2759 0 : return NULL;
2760 : } else {
2761 0 : CharacterIterator *srcCI =(CharacterIterator *)src->context;
2762 0 : srcCI = srcCI->clone();
2763 0 : dest = utext_openCharacterIterator(dest, srcCI, status);
2764 0 : if (U_FAILURE(*status)) {
2765 0 : return dest;
2766 : }
2767 : // cast off const on getNativeIndex.
2768 : // For CharacterIterator based UTexts, this is safe, the operation is const.
2769 0 : int64_t ix = utext_getNativeIndex((UText *)src);
2770 0 : utext_setNativeIndex(dest, ix);
2771 0 : dest->r = srcCI; // flags that this UText owns the CharacterIterator
2772 : }
2773 0 : return dest;
2774 : }
2775 :
2776 : static int32_t U_CALLCONV
2777 0 : charIterTextExtract(UText *ut,
2778 : int64_t start, int64_t limit,
2779 : UChar *dest, int32_t destCapacity,
2780 : UErrorCode *status)
2781 : {
2782 0 : if(U_FAILURE(*status)) {
2783 0 : return 0;
2784 : }
2785 0 : if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
2786 0 : *status=U_ILLEGAL_ARGUMENT_ERROR;
2787 0 : return 0;
2788 : }
2789 0 : int32_t length = (int32_t)ut->a;
2790 0 : int32_t start32 = pinIndex(start, length);
2791 0 : int32_t limit32 = pinIndex(limit, length);
2792 0 : int32_t desti = 0;
2793 : int32_t srci;
2794 : int32_t copyLimit;
2795 :
2796 0 : CharacterIterator *ci = (CharacterIterator *)ut->context;
2797 0 : ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
2798 0 : srci = ci->getIndex();
2799 0 : copyLimit = srci;
2800 0 : while (srci<limit32) {
2801 0 : UChar32 c = ci->next32PostInc();
2802 0 : int32_t len = U16_LENGTH(c);
2803 0 : U_ASSERT(desti+len>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */
2804 0 : if (desti+len <= destCapacity) {
2805 0 : U16_APPEND_UNSAFE(dest, desti, c);
2806 0 : copyLimit = srci+len;
2807 : } else {
2808 0 : desti += len;
2809 0 : *status = U_BUFFER_OVERFLOW_ERROR;
2810 : }
2811 0 : srci += len;
2812 : }
2813 :
2814 0 : charIterTextAccess(ut, copyLimit, TRUE);
2815 :
2816 0 : u_terminateUChars(dest, destCapacity, desti, status);
2817 0 : return desti;
2818 : }
2819 :
2820 : static const struct UTextFuncs charIterFuncs =
2821 : {
2822 : sizeof(UTextFuncs),
2823 : 0, 0, 0, // Reserved alignment padding
2824 : charIterTextClone,
2825 : charIterTextLength,
2826 : charIterTextAccess,
2827 : charIterTextExtract,
2828 : NULL, // Replace
2829 : NULL, // Copy
2830 : NULL, // MapOffsetToNative,
2831 : NULL, // MapIndexToUTF16,
2832 : charIterTextClose,
2833 : NULL, // spare 1
2834 : NULL, // spare 2
2835 : NULL // spare 3
2836 : };
2837 : U_CDECL_END
2838 :
2839 :
2840 : U_CAPI UText * U_EXPORT2
2841 0 : utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
2842 0 : if (U_FAILURE(*status)) {
2843 0 : return NULL;
2844 : }
2845 :
2846 0 : if (ci->startIndex() > 0) {
2847 : // No support for CharacterIterators that do not start indexing from zero.
2848 0 : *status = U_UNSUPPORTED_ERROR;
2849 0 : return NULL;
2850 : }
2851 :
2852 : // Extra space in UText for 2 buffers of CIBufSize UChars each.
2853 0 : int32_t extraSpace = 2 * CIBufSize * sizeof(UChar);
2854 0 : ut = utext_setup(ut, extraSpace, status);
2855 0 : if (U_SUCCESS(*status)) {
2856 0 : ut->pFuncs = &charIterFuncs;
2857 0 : ut->context = ci;
2858 0 : ut->providerProperties = 0;
2859 0 : ut->a = ci->endIndex(); // Length of text
2860 0 : ut->p = ut->pExtra; // First buffer
2861 0 : ut->b = -1; // Native index of first buffer contents
2862 0 : ut->q = (UChar*)ut->pExtra+CIBufSize; // Second buffer
2863 0 : ut->c = -1; // Native index of second buffer contents
2864 :
2865 : // Initialize current chunk contents to be empty.
2866 : // First access will fault something in.
2867 : // Note: The initial nativeStart and chunkOffset must sum to zero
2868 : // so that getNativeIndex() will correctly compute to zero
2869 : // if no call to Access() has ever been made. They can't be both
2870 : // zero without Access() thinking that the chunk is valid.
2871 0 : ut->chunkContents = (UChar *)ut->p;
2872 0 : ut->chunkNativeStart = -1;
2873 0 : ut->chunkOffset = 1;
2874 0 : ut->chunkNativeLimit = 0;
2875 0 : ut->chunkLength = 0;
2876 0 : ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing
2877 : }
2878 0 : return ut;
2879 : }
|