Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : *
6 : * Copyright (C) 2005-2016, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : *******************************************************************************
10 : * file name: ucasemap.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2005may06
16 : * created by: Markus W. Scherer
17 : *
18 : * Case mapping service object and functions using it.
19 : */
20 :
21 : #include "unicode/utypes.h"
22 : #include "unicode/brkiter.h"
23 : #include "unicode/casemap.h"
24 : #include "unicode/edits.h"
25 : #include "unicode/ubrk.h"
26 : #include "unicode/uloc.h"
27 : #include "unicode/ustring.h"
28 : #include "unicode/ucasemap.h"
29 : #if !UCONFIG_NO_BREAK_ITERATION
30 : #include "unicode/utext.h"
31 : #endif
32 : #include "unicode/utf.h"
33 : #include "unicode/utf8.h"
34 : #include "unicode/utf16.h"
35 : #include "cmemory.h"
36 : #include "cstring.h"
37 : #include "uassert.h"
38 : #include "ucase.h"
39 : #include "ucasemap_imp.h"
40 : #include "ustr_imp.h"
41 :
42 : U_NAMESPACE_BEGIN
43 :
44 : namespace {
45 :
46 : // TODO: share with UTF-16? inline in ucasemap_imp.h?
47 0 : int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
48 : Edits *edits, UErrorCode &errorCode) {
49 0 : if (U_SUCCESS(errorCode)) {
50 0 : if (destIndex > destCapacity) {
51 0 : errorCode = U_BUFFER_OVERFLOW_ERROR;
52 0 : } else if (edits != NULL) {
53 0 : edits->copyErrorTo(errorCode);
54 : }
55 : }
56 0 : return destIndex;
57 : }
58 :
59 : } // namespace
60 :
61 : U_NAMESPACE_END
62 :
63 : U_NAMESPACE_USE
64 :
65 : /* UCaseMap service object -------------------------------------------------- */
66 :
67 0 : UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
68 : #if !UCONFIG_NO_BREAK_ITERATION
69 : iter(NULL),
70 : #endif
71 0 : caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
72 0 : ucasemap_setLocale(this, localeID, pErrorCode);
73 0 : }
74 :
75 0 : UCaseMap::~UCaseMap() {
76 : #if !UCONFIG_NO_BREAK_ITERATION
77 : delete iter;
78 : #endif
79 0 : }
80 :
81 : U_CAPI UCaseMap * U_EXPORT2
82 0 : ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
83 0 : if(U_FAILURE(*pErrorCode)) {
84 0 : return NULL;
85 : }
86 0 : UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
87 0 : if(csm==NULL) {
88 0 : *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
89 0 : return NULL;
90 0 : } else if (U_FAILURE(*pErrorCode)) {
91 0 : delete csm;
92 0 : return NULL;
93 : }
94 0 : return csm;
95 : }
96 :
97 : U_CAPI void U_EXPORT2
98 0 : ucasemap_close(UCaseMap *csm) {
99 0 : delete csm;
100 0 : }
101 :
102 : U_CAPI const char * U_EXPORT2
103 0 : ucasemap_getLocale(const UCaseMap *csm) {
104 0 : return csm->locale;
105 : }
106 :
107 : U_CAPI uint32_t U_EXPORT2
108 0 : ucasemap_getOptions(const UCaseMap *csm) {
109 0 : return csm->options;
110 : }
111 :
112 : U_CAPI void U_EXPORT2
113 0 : ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
114 0 : if(U_FAILURE(*pErrorCode)) {
115 0 : return;
116 : }
117 0 : if (locale != NULL && *locale == 0) {
118 0 : csm->locale[0] = 0;
119 0 : csm->caseLocale = UCASE_LOC_ROOT;
120 0 : return;
121 : }
122 :
123 0 : int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
124 0 : if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
125 0 : *pErrorCode=U_ZERO_ERROR;
126 : /* we only really need the language code for case mappings */
127 0 : length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
128 : }
129 0 : if(length==sizeof(csm->locale)) {
130 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
131 : }
132 0 : if(U_SUCCESS(*pErrorCode)) {
133 0 : csm->caseLocale=UCASE_LOC_UNKNOWN;
134 0 : csm->caseLocale = ucase_getCaseLocale(csm->locale);
135 : } else {
136 0 : csm->locale[0]=0;
137 0 : csm->caseLocale = UCASE_LOC_ROOT;
138 : }
139 : }
140 :
141 : U_CAPI void U_EXPORT2
142 0 : ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
143 0 : if(U_FAILURE(*pErrorCode)) {
144 0 : return;
145 : }
146 0 : csm->options=options;
147 : }
148 :
149 : /* UTF-8 string case mappings ----------------------------------------------- */
150 :
151 : /* TODO(markus): Move to a new, separate utf8case.cpp file. */
152 :
153 : /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
154 : static inline int32_t
155 0 : appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
156 : int32_t result, const UChar *s,
157 : int32_t cpLength, uint32_t options, icu::Edits *edits) {
158 : UChar32 c;
159 : int32_t length;
160 : UErrorCode errorCode;
161 :
162 : /* decode the result */
163 0 : if(result<0) {
164 : /* (not) original code point */
165 0 : if(edits!=NULL) {
166 0 : edits->addUnchanged(cpLength);
167 0 : if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
168 0 : return destIndex;
169 : }
170 : }
171 0 : c=~result;
172 0 : if(destIndex<destCapacity && c<=0x7f) { // ASCII slightly-fastpath
173 0 : dest[destIndex++]=(uint8_t)c;
174 0 : return destIndex;
175 : }
176 0 : length=cpLength;
177 : } else {
178 0 : if(result<=UCASE_MAX_STRING_LENGTH) {
179 : // string: "result" is the UTF-16 length
180 0 : errorCode=U_ZERO_ERROR;
181 0 : if(destIndex<destCapacity) {
182 0 : u_strToUTF8((char *)(dest+destIndex), destCapacity-destIndex, &length,
183 0 : s, result, &errorCode);
184 : } else {
185 0 : u_strToUTF8(NULL, 0, &length, s, result, &errorCode);
186 : }
187 0 : if(U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
188 0 : return -1;
189 : }
190 0 : if(length>(INT32_MAX-destIndex)) {
191 0 : return -1; // integer overflow
192 : }
193 0 : if(edits!=NULL) {
194 0 : edits->addReplace(cpLength, length);
195 : }
196 : // We might have an overflow, but we know the actual length.
197 0 : return destIndex+length;
198 0 : } else if(destIndex<destCapacity && result<=0x7f) { // ASCII slightly-fastpath
199 0 : dest[destIndex++]=(uint8_t)result;
200 0 : if(edits!=NULL) {
201 0 : edits->addReplace(cpLength, 1);
202 : }
203 0 : return destIndex;
204 : } else {
205 0 : c=result;
206 0 : length=U8_LENGTH(c);
207 0 : if(edits!=NULL) {
208 0 : edits->addReplace(cpLength, length);
209 : }
210 : }
211 : }
212 : // c>=0 single code point
213 0 : if(length>(INT32_MAX-destIndex)) {
214 0 : return -1; // integer overflow
215 : }
216 :
217 0 : if(destIndex<destCapacity) {
218 : /* append the result */
219 0 : UBool isError=FALSE;
220 0 : U8_APPEND(dest, destIndex, destCapacity, c, isError);
221 0 : if(isError) {
222 : /* overflow, nothing written */
223 0 : destIndex+=length;
224 : }
225 : } else {
226 : /* preflight */
227 0 : destIndex+=length;
228 : }
229 0 : return destIndex;
230 : }
231 :
/**
 * Appends a single byte to the destination.
 *
 * The byte is stored only if destIndex is below destCapacity; otherwise
 * only the index is advanced (preflighting).
 *
 * @param dest         destination buffer
 * @param destIndex    current write position
 * @param destCapacity capacity of dest
 * @param c            byte to append
 * @return destIndex+1, or -1 if that would overflow int32_t
 */
static inline int32_t
appendASCII(uint8_t *dest, int32_t destIndex, int32_t destCapacity, uint8_t c) {
    const bool fits=destIndex<destCapacity;
    if(!fits && destIndex==INT32_MAX) {
        return -1;  // integer overflow
    }
    if(fits) {
        dest[destIndex]=c;
    }
    return destIndex+1;
}
241 :
242 : // See unicode/utf8.h U8_APPEND_UNSAFE().
243 0 : static inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
244 0 : static inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
245 :
246 : static inline int32_t
247 0 : appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, UChar32 c) {
248 0 : U_ASSERT(0x370 <= c && c <= 0x3ff); // 2-byte UTF-8, main Greek block
249 0 : if(2>(INT32_MAX-destIndex)) {
250 0 : return -1; // integer overflow
251 : }
252 0 : int32_t limit=destIndex+2;
253 0 : if(limit<=destCapacity) {
254 0 : dest+=destIndex;
255 0 : dest[0]=getTwoByteLead(c);
256 0 : dest[1]=getTwoByteTrail(c);
257 : }
258 0 : return limit;
259 : }
260 :
/**
 * Appends the first two bytes of s.
 *
 * The bytes are stored only if both fit into the destination; otherwise
 * only the index is advanced (preflighting).
 *
 * @param dest         destination buffer
 * @param destIndex    current write position
 * @param destCapacity capacity of dest
 * @param s            source bytes; s[0] and s[1] are read when they fit
 * @return destIndex+2, or -1 if that would overflow int32_t
 */
static inline int32_t
appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, const char *s) {
    if((INT32_MAX-destIndex)<2) {
        return -1;  // integer overflow
    }
    const int32_t end=destIndex+2;
    if(end<=destCapacity) {
        uint8_t *out=dest+destIndex;
        out[0]=(uint8_t)s[0];
        out[1]=(uint8_t)s[1];
    }
    return end;
}
274 :
275 : static inline int32_t
276 0 : appendUnchanged(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
277 : const uint8_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
278 0 : if(length>0) {
279 0 : if(edits!=NULL) {
280 0 : edits->addUnchanged(length);
281 0 : if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
282 0 : return destIndex;
283 : }
284 : }
285 0 : if(length>(INT32_MAX-destIndex)) {
286 0 : return -1; // integer overflow
287 : }
288 0 : if((destIndex+length)<=destCapacity) {
289 0 : uprv_memcpy(dest+destIndex, s, length);
290 : }
291 0 : destIndex+=length;
292 : }
293 0 : return destIndex;
294 : }
295 :
296 : static UChar32 U_CALLCONV
297 0 : utf8_caseContextIterator(void *context, int8_t dir) {
298 0 : UCaseContext *csc=(UCaseContext *)context;
299 : UChar32 c;
300 :
301 0 : if(dir<0) {
302 : /* reset for backward iteration */
303 0 : csc->index=csc->cpStart;
304 0 : csc->dir=dir;
305 0 : } else if(dir>0) {
306 : /* reset for forward iteration */
307 0 : csc->index=csc->cpLimit;
308 0 : csc->dir=dir;
309 : } else {
310 : /* continue current iteration direction */
311 0 : dir=csc->dir;
312 : }
313 :
314 0 : if(dir<0) {
315 0 : if(csc->start<csc->index) {
316 0 : U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
317 0 : return c;
318 : }
319 : } else {
320 0 : if(csc->index<csc->limit) {
321 0 : U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
322 0 : return c;
323 : }
324 : }
325 0 : return U_SENTINEL;
326 : }
327 :
328 : /*
329 : * Case-maps [srcStart..srcLimit[ but takes
330 : * context [0..srcLength[ into account.
331 : */
332 : static int32_t
333 0 : _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
334 : uint8_t *dest, int32_t destCapacity,
335 : const uint8_t *src, UCaseContext *csc,
336 : int32_t srcStart, int32_t srcLimit,
337 : icu::Edits *edits,
338 : UErrorCode &errorCode) {
339 : /* case mapping loop */
340 0 : int32_t srcIndex=srcStart;
341 0 : int32_t destIndex=0;
342 0 : while(srcIndex<srcLimit) {
343 : int32_t cpStart;
344 0 : csc->cpStart=cpStart=srcIndex;
345 : UChar32 c;
346 0 : U8_NEXT(src, srcIndex, srcLimit, c);
347 0 : csc->cpLimit=srcIndex;
348 0 : if(c<0) {
349 : // Malformed UTF-8.
350 0 : destIndex=appendUnchanged(dest, destIndex, destCapacity,
351 0 : src+cpStart, srcIndex-cpStart, options, edits);
352 0 : if(destIndex<0) {
353 0 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
354 0 : return 0;
355 : }
356 0 : continue;
357 : }
358 : const UChar *s;
359 0 : c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
360 0 : destIndex = appendResult(dest, destIndex, destCapacity, c, s,
361 0 : srcIndex - cpStart, options, edits);
362 0 : if (destIndex < 0) {
363 0 : errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
364 0 : return 0;
365 : }
366 : }
367 :
368 0 : return destIndex;
369 : }
370 :
371 : #if !UCONFIG_NO_BREAK_ITERATION
372 :
373 : U_CFUNC int32_t U_CALLCONV
374 : ucasemap_internalUTF8ToTitle(
375 : int32_t caseLocale, uint32_t options, BreakIterator *iter,
376 : uint8_t *dest, int32_t destCapacity,
377 : const uint8_t *src, int32_t srcLength,
378 : icu::Edits *edits,
379 : UErrorCode &errorCode) {
380 : if(U_FAILURE(errorCode)) {
381 : return 0;
382 : }
383 :
384 : /* set up local variables */
385 : UCaseContext csc=UCASECONTEXT_INITIALIZER;
386 : csc.p=(void *)src;
387 : csc.limit=srcLength;
388 : int32_t destIndex=0;
389 : int32_t prev=0;
390 : UBool isFirstIndex=TRUE;
391 :
392 : /* titlecasing loop */
393 : while(prev<srcLength) {
394 : /* find next index where to titlecase */
395 : int32_t index;
396 : if(isFirstIndex) {
397 : isFirstIndex=FALSE;
398 : index=iter->first();
399 : } else {
400 : index=iter->next();
401 : }
402 : if(index==UBRK_DONE || index>srcLength) {
403 : index=srcLength;
404 : }
405 :
406 : /*
407 : * Unicode 4 & 5 section 3.13 Default Case Operations:
408 : *
409 : * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
410 : * #29, "Text Boundaries." Between each pair of word boundaries, find the first
411 : * cased character F. If F exists, map F to default_title(F); then map each
412 : * subsequent character C to default_lower(C).
413 : *
414 : * In this implementation, segment [prev..index[ into 3 parts:
415 : * a) uncased characters (copy as-is) [prev..titleStart[
416 : * b) first case letter (titlecase) [titleStart..titleLimit[
417 : * c) subsequent characters (lowercase) [titleLimit..index[
418 : */
419 : if(prev<index) {
420 : /* find and copy uncased characters [prev..titleStart[ */
421 : int32_t titleStart=prev;
422 : int32_t titleLimit=prev;
423 : UChar32 c;
424 : U8_NEXT(src, titleLimit, index, c);
425 : if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
426 : /* Adjust the titlecasing index (titleStart) to the next cased character. */
427 : for(;;) {
428 : titleStart=titleLimit;
429 : if(titleLimit==index) {
430 : /*
431 : * only uncased characters in [prev..index[
432 : * stop with titleStart==titleLimit==index
433 : */
434 : break;
435 : }
436 : U8_NEXT(src, titleLimit, index, c);
437 : if(UCASE_NONE!=ucase_getType(c)) {
438 : break; /* cased letter at [titleStart..titleLimit[ */
439 : }
440 : }
441 : destIndex=appendUnchanged(dest, destIndex, destCapacity,
442 : src+prev, titleStart-prev, options, edits);
443 : if(destIndex<0) {
444 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
445 : return 0;
446 : }
447 : }
448 :
449 : if(titleStart<titleLimit) {
450 : /* titlecase c which is from [titleStart..titleLimit[ */
451 : if(c>=0) {
452 : csc.cpStart=titleStart;
453 : csc.cpLimit=titleLimit;
454 : const UChar *s;
455 : c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
456 : destIndex=appendResult(dest, destIndex, destCapacity, c, s,
457 : titleLimit-titleStart, options, edits);
458 : } else {
459 : // Malformed UTF-8.
460 : destIndex=appendUnchanged(dest, destIndex, destCapacity,
461 : src+titleStart, titleLimit-titleStart, options, edits);
462 : }
463 : if(destIndex<0) {
464 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
465 : return 0;
466 : }
467 :
468 : /* Special case Dutch IJ titlecasing */
469 : if (titleStart+1 < index &&
470 : caseLocale == UCASE_LOC_DUTCH &&
471 : (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
472 : if (src[titleStart+1] == 0x006A) {
473 : destIndex=appendASCII(dest, destIndex, destCapacity, 0x004A);
474 : if(destIndex<0) {
475 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
476 : return 0;
477 : }
478 : if(edits!=NULL) {
479 : edits->addReplace(1, 1);
480 : }
481 : titleLimit++;
482 : } else if (src[titleStart+1] == 0x004A) {
483 : // Keep the capital J from getting lowercased.
484 : destIndex=appendUnchanged(dest, destIndex, destCapacity,
485 : src+titleStart+1, 1, options, edits);
486 : if(destIndex<0) {
487 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
488 : return 0;
489 : }
490 : titleLimit++;
491 : }
492 : }
493 :
494 : /* lowercase [titleLimit..index[ */
495 : if(titleLimit<index) {
496 : if((options&U_TITLECASE_NO_LOWERCASE)==0) {
497 : /* Normal operation: Lowercase the rest of the word. */
498 : destIndex+=
499 : _caseMap(
500 : caseLocale, options, ucase_toFullLower,
501 : dest+destIndex, destCapacity-destIndex,
502 : src, &csc,
503 : titleLimit, index,
504 : edits, errorCode);
505 : if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
506 : errorCode=U_ZERO_ERROR;
507 : }
508 : if(U_FAILURE(errorCode)) {
509 : return destIndex;
510 : }
511 : } else {
512 : /* Optionally just copy the rest of the word unchanged. */
513 : destIndex=appendUnchanged(dest, destIndex, destCapacity,
514 : src+titleLimit, index-titleLimit, options, edits);
515 : if(destIndex<0) {
516 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
517 : return 0;
518 : }
519 : }
520 : }
521 : }
522 : }
523 :
524 : prev=index;
525 : }
526 :
527 : return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
528 : }
529 :
530 : #endif
531 :
532 : U_NAMESPACE_BEGIN
533 : namespace GreekUpper {
534 :
535 0 : UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
536 0 : while (i < length) {
537 : UChar32 c;
538 0 : U8_NEXT(s, i, length, c);
539 0 : int32_t type = ucase_getTypeOrIgnorable(c);
540 0 : if ((type & UCASE_IGNORABLE) != 0) {
541 : // Case-ignorable, continue with the loop.
542 0 : } else if (type != UCASE_NONE) {
543 0 : return TRUE; // Followed by cased letter.
544 : } else {
545 0 : return FALSE; // Uncased and not case-ignorable.
546 : }
547 : }
548 0 : return FALSE; // Not followed by cased letter.
549 : }
550 :
551 : // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
552 0 : int32_t toUpper(uint32_t options,
553 : uint8_t *dest, int32_t destCapacity,
554 : const uint8_t *src, int32_t srcLength,
555 : Edits *edits,
556 : UErrorCode &errorCode) {
557 0 : int32_t destIndex=0;
558 0 : uint32_t state = 0;
559 0 : for (int32_t i = 0; i < srcLength;) {
560 0 : int32_t nextIndex = i;
561 : UChar32 c;
562 0 : U8_NEXT(src, nextIndex, srcLength, c);
563 0 : uint32_t nextState = 0;
564 0 : int32_t type = ucase_getTypeOrIgnorable(c);
565 0 : if ((type & UCASE_IGNORABLE) != 0) {
566 : // c is case-ignorable
567 0 : nextState |= (state & AFTER_CASED);
568 0 : } else if (type != UCASE_NONE) {
569 : // c is cased
570 0 : nextState |= AFTER_CASED;
571 : }
572 0 : uint32_t data = getLetterData(c);
573 0 : if (data > 0) {
574 0 : uint32_t upper = data & UPPER_MASK;
575 : // Add a dialytika to this iota or ypsilon vowel
576 : // if we removed a tonos from the previous vowel,
577 : // and that previous vowel did not also have (or gain) a dialytika.
578 : // Adding one only to the final vowel in a longer sequence
579 : // (which does not occur in normal writing) would require lookahead.
580 : // Set the same flag as for preserving an existing dialytika.
581 0 : if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
582 0 : (upper == 0x399 || upper == 0x3A5)) {
583 0 : data |= HAS_DIALYTIKA;
584 : }
585 0 : int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
586 0 : if ((data & HAS_YPOGEGRAMMENI) != 0) {
587 0 : numYpogegrammeni = 1;
588 : }
589 : // Skip combining diacritics after this Greek letter.
590 0 : int32_t nextNextIndex = nextIndex;
591 0 : while (nextIndex < srcLength) {
592 : UChar32 c2;
593 0 : U8_NEXT(src, nextNextIndex, srcLength, c2);
594 0 : uint32_t diacriticData = getDiacriticData(c2);
595 0 : if (diacriticData != 0) {
596 0 : data |= diacriticData;
597 0 : if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
598 0 : ++numYpogegrammeni;
599 : }
600 0 : nextIndex = nextNextIndex;
601 : } else {
602 0 : break; // not a Greek diacritic
603 : }
604 : }
605 0 : if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
606 0 : nextState |= AFTER_VOWEL_WITH_ACCENT;
607 : }
608 : // Map according to Greek rules.
609 0 : UBool addTonos = FALSE;
610 0 : if (upper == 0x397 &&
611 0 : (data & HAS_ACCENT) != 0 &&
612 0 : numYpogegrammeni == 0 &&
613 0 : (state & AFTER_CASED) == 0 &&
614 0 : !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
615 : // Keep disjunctive "or" with (only) a tonos.
616 : // We use the same "word boundary" conditions as for the Final_Sigma test.
617 0 : if (i == nextIndex) {
618 0 : upper = 0x389; // Preserve the precomposed form.
619 : } else {
620 0 : addTonos = TRUE;
621 : }
622 0 : } else if ((data & HAS_DIALYTIKA) != 0) {
623 : // Preserve a vowel with dialytika in precomposed form if it exists.
624 0 : if (upper == 0x399) {
625 0 : upper = 0x3AA;
626 0 : data &= ~HAS_EITHER_DIALYTIKA;
627 0 : } else if (upper == 0x3A5) {
628 0 : upper = 0x3AB;
629 0 : data &= ~HAS_EITHER_DIALYTIKA;
630 : }
631 : }
632 :
633 0 : UBool change = TRUE;
634 0 : if (edits != NULL) {
635 : // Find out first whether we are changing the text.
636 0 : U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block
637 0 : change = (i + 2) > nextIndex ||
638 0 : src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
639 : numYpogegrammeni > 0;
640 0 : int32_t i2 = i + 2;
641 0 : if ((data & HAS_EITHER_DIALYTIKA) != 0) {
642 0 : change |= (i2 + 2) > nextIndex ||
643 0 : src[i2] != (uint8_t)u8"\u0308"[0] ||
644 0 : src[i2 + 1] != (uint8_t)u8"\u0308"[1];
645 0 : i2 += 2;
646 : }
647 0 : if (addTonos) {
648 0 : change |= (i2 + 2) > nextIndex ||
649 0 : src[i2] != (uint8_t)u8"\u0301"[0] ||
650 0 : src[i2 + 1] != (uint8_t)u8"\u0301"[1];
651 0 : i2 += 2;
652 : }
653 0 : int32_t oldLength = nextIndex - i;
654 0 : int32_t newLength = (i2 - i) + numYpogegrammeni * 2; // 2 bytes per U+0399
655 0 : change |= oldLength != newLength;
656 0 : if (change) {
657 0 : if (edits != NULL) {
658 0 : edits->addReplace(oldLength, newLength);
659 : }
660 : } else {
661 0 : if (edits != NULL) {
662 0 : edits->addUnchanged(oldLength);
663 : }
664 : // Write unchanged text?
665 0 : change = (options & UCASEMAP_OMIT_UNCHANGED_TEXT) == 0;
666 : }
667 : }
668 :
669 0 : if (change) {
670 0 : destIndex=appendTwoBytes(dest, destIndex, destCapacity, upper);
671 0 : if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
672 0 : destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0308"); // restore or add a dialytika
673 : }
674 0 : if (destIndex >= 0 && addTonos) {
675 0 : destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0301");
676 : }
677 0 : while (destIndex >= 0 && numYpogegrammeni > 0) {
678 0 : destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0399");
679 0 : --numYpogegrammeni;
680 : }
681 0 : if(destIndex<0) {
682 0 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
683 0 : return 0;
684 : }
685 : }
686 0 : } else if(c>=0) {
687 : const UChar *s;
688 0 : c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
689 0 : destIndex = appendResult(dest, destIndex, destCapacity, c, s,
690 0 : nextIndex - i, options, edits);
691 0 : if (destIndex < 0) {
692 0 : errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
693 0 : return 0;
694 : }
695 : } else {
696 : // Malformed UTF-8.
697 0 : destIndex=appendUnchanged(dest, destIndex, destCapacity,
698 0 : src+i, nextIndex-i, options, edits);
699 0 : if(destIndex<0) {
700 0 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
701 0 : return 0;
702 : }
703 : }
704 0 : i = nextIndex;
705 0 : state = nextState;
706 : }
707 :
708 0 : return destIndex;
709 : }
710 :
711 : } // namespace GreekUpper
712 : U_NAMESPACE_END
713 :
714 : static int32_t U_CALLCONV
715 0 : ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
716 : uint8_t *dest, int32_t destCapacity,
717 : const uint8_t *src, int32_t srcLength,
718 : icu::Edits *edits,
719 : UErrorCode &errorCode) {
720 0 : UCaseContext csc=UCASECONTEXT_INITIALIZER;
721 0 : csc.p=(void *)src;
722 0 : csc.limit=srcLength;
723 : int32_t destIndex = _caseMap(
724 : caseLocale, options, ucase_toFullLower,
725 : dest, destCapacity,
726 : src, &csc, 0, srcLength,
727 0 : edits, errorCode);
728 0 : return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
729 : }
730 :
731 : static int32_t U_CALLCONV
732 0 : ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
733 : uint8_t *dest, int32_t destCapacity,
734 : const uint8_t *src, int32_t srcLength,
735 : icu::Edits *edits,
736 : UErrorCode &errorCode) {
737 : int32_t destIndex;
738 0 : if (caseLocale == UCASE_LOC_GREEK) {
739 : destIndex = GreekUpper::toUpper(options, dest, destCapacity,
740 0 : src, srcLength, edits, errorCode);
741 : } else {
742 0 : UCaseContext csc=UCASECONTEXT_INITIALIZER;
743 0 : csc.p=(void *)src;
744 0 : csc.limit=srcLength;
745 : destIndex = _caseMap(
746 : caseLocale, options, ucase_toFullUpper,
747 : dest, destCapacity,
748 : src, &csc, 0, srcLength,
749 0 : edits, errorCode);
750 : }
751 0 : return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
752 : }
753 :
754 : static int32_t U_CALLCONV
755 0 : ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
756 : uint8_t *dest, int32_t destCapacity,
757 : const uint8_t *src, int32_t srcLength,
758 : icu::Edits *edits,
759 : UErrorCode &errorCode) {
760 : /* case mapping loop */
761 0 : int32_t srcIndex = 0;
762 0 : int32_t destIndex = 0;
763 0 : while (srcIndex < srcLength) {
764 0 : int32_t cpStart = srcIndex;
765 : UChar32 c;
766 0 : U8_NEXT(src, srcIndex, srcLength, c);
767 0 : if(c<0) {
768 : // Malformed UTF-8.
769 0 : destIndex=appendUnchanged(dest, destIndex, destCapacity,
770 0 : src+cpStart, srcIndex-cpStart, options, edits);
771 0 : if(destIndex<0) {
772 0 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
773 0 : return 0;
774 : }
775 0 : continue;
776 : }
777 : const UChar *s;
778 0 : c = ucase_toFullFolding(c, &s, options);
779 0 : destIndex = appendResult(dest, destIndex, destCapacity, c, s,
780 0 : srcIndex - cpStart, options, edits);
781 0 : if (destIndex < 0) {
782 0 : errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
783 0 : return 0;
784 : }
785 : }
786 :
787 0 : return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
788 : }
789 :
790 : U_CFUNC int32_t
791 0 : ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
792 : uint8_t *dest, int32_t destCapacity,
793 : const uint8_t *src, int32_t srcLength,
794 : UTF8CaseMapper *stringCaseMapper,
795 : icu::Edits *edits,
796 : UErrorCode &errorCode) {
797 : int32_t destLength;
798 :
799 : /* check argument values */
800 0 : if(U_FAILURE(errorCode)) {
801 0 : return 0;
802 : }
803 0 : if( destCapacity<0 ||
804 0 : (dest==NULL && destCapacity>0) ||
805 0 : src==NULL ||
806 : srcLength<-1
807 : ) {
808 0 : errorCode=U_ILLEGAL_ARGUMENT_ERROR;
809 0 : return 0;
810 : }
811 :
812 : /* get the string length */
813 0 : if(srcLength==-1) {
814 0 : srcLength=(int32_t)uprv_strlen((const char *)src);
815 : }
816 :
817 : /* check for overlapping source and destination */
818 0 : if( dest!=NULL &&
819 0 : ((src>=dest && src<(dest+destCapacity)) ||
820 0 : (dest>=src && dest<(src+srcLength)))
821 : ) {
822 0 : errorCode=U_ILLEGAL_ARGUMENT_ERROR;
823 0 : return 0;
824 : }
825 :
826 0 : if(edits!=NULL) {
827 0 : edits->reset();
828 : }
829 : destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
830 0 : dest, destCapacity, src, srcLength, edits, errorCode);
831 0 : return u_terminateChars((char *)dest, destCapacity, destLength, &errorCode);
832 : }
833 :
834 : /* public API functions */
835 :
836 : U_CAPI int32_t U_EXPORT2
837 0 : ucasemap_utf8ToLower(const UCaseMap *csm,
838 : char *dest, int32_t destCapacity,
839 : const char *src, int32_t srcLength,
840 : UErrorCode *pErrorCode) {
841 : return ucasemap_mapUTF8(
842 0 : csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
843 : (uint8_t *)dest, destCapacity,
844 : (const uint8_t *)src, srcLength,
845 0 : ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
846 : }
847 :
848 : U_CAPI int32_t U_EXPORT2
849 0 : ucasemap_utf8ToUpper(const UCaseMap *csm,
850 : char *dest, int32_t destCapacity,
851 : const char *src, int32_t srcLength,
852 : UErrorCode *pErrorCode) {
853 : return ucasemap_mapUTF8(
854 0 : csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
855 : (uint8_t *)dest, destCapacity,
856 : (const uint8_t *)src, srcLength,
857 0 : ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
858 : }
859 :
860 : U_CAPI int32_t U_EXPORT2
861 0 : ucasemap_utf8FoldCase(const UCaseMap *csm,
862 : char *dest, int32_t destCapacity,
863 : const char *src, int32_t srcLength,
864 : UErrorCode *pErrorCode) {
865 : return ucasemap_mapUTF8(
866 0 : UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
867 : (uint8_t *)dest, destCapacity,
868 : (const uint8_t *)src, srcLength,
869 0 : ucasemap_internalUTF8Fold, NULL, *pErrorCode);
870 : }
871 :
872 : U_NAMESPACE_BEGIN
873 :
874 0 : int32_t CaseMap::utf8ToLower(
875 : const char *locale, uint32_t options,
876 : const char *src, int32_t srcLength,
877 : char *dest, int32_t destCapacity, Edits *edits,
878 : UErrorCode &errorCode) {
879 0 : return ucasemap_mapUTF8(
880 : ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
881 : (uint8_t *)dest, destCapacity,
882 : (const uint8_t *)src, srcLength,
883 0 : ucasemap_internalUTF8ToLower, edits, errorCode);
884 : }
885 :
886 0 : int32_t CaseMap::utf8ToUpper(
887 : const char *locale, uint32_t options,
888 : const char *src, int32_t srcLength,
889 : char *dest, int32_t destCapacity, Edits *edits,
890 : UErrorCode &errorCode) {
891 0 : return ucasemap_mapUTF8(
892 : ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
893 : (uint8_t *)dest, destCapacity,
894 : (const uint8_t *)src, srcLength,
895 0 : ucasemap_internalUTF8ToUpper, edits, errorCode);
896 : }
897 :
898 0 : int32_t CaseMap::utf8Fold(
899 : uint32_t options,
900 : const char *src, int32_t srcLength,
901 : char *dest, int32_t destCapacity, Edits *edits,
902 : UErrorCode &errorCode) {
903 : return ucasemap_mapUTF8(
904 : UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
905 : (uint8_t *)dest, destCapacity,
906 : (const uint8_t *)src, srcLength,
907 0 : ucasemap_internalUTF8Fold, edits, errorCode);
908 : }
909 :
910 : U_NAMESPACE_END
|