LCOV - code coverage report
Current view: top level - intl/icu/source/common - util.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 129 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 9 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : **********************************************************************
       5             : *   Copyright (c) 2001-2011, International Business Machines
       6             : *   Corporation and others.  All Rights Reserved.
       7             : **********************************************************************
       8             : *   Date        Name        Description
       9             : *   11/19/2001  aliu        Creation.
      10             : **********************************************************************
      11             : */
      12             : 
      13             : #include "unicode/unimatch.h"
      14             : #include "unicode/utf16.h"
      15             : #include "patternprops.h"
      16             : #include "util.h"
      17             : 
      18             : // UChar constants are spelled as numeric code points, not char literals, for EBCDIC compatibility
      19             : 
      20             : static const UChar BACKSLASH  = 0x005C; /*\*/
      21             : static const UChar UPPER_U    = 0x0055; /*U*/
      22             : static const UChar LOWER_U    = 0x0075; /*u*/
      23             : static const UChar APOSTROPHE = 0x0027; // '\''
      24             : static const UChar SPACE      = 0x0020; // ' '
      25             : 
      26             : // Digit characters "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ", indexed by digit value 0..35
      27             : static const UChar DIGITS[] = {
      28             :     48,49,50,51,52,53,54,55,56,57, // '0'..'9'
      29             :     65,66,67,68,69,70,71,72,73,74, // 'A'..'J'
      30             :     75,76,77,78,79,80,81,82,83,84, // 'K'..'T'
      31             :     85,86,87,88,89,90              // 'U'..'Z'
      32             : };
      33             : 
      34             : U_NAMESPACE_BEGIN
      35             : // Append n in the given radix (2..36) to result: '-' if negative, then at least minDigits digits, zero-padded.
      36           0 : UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
      37             :                                      int32_t radix, int32_t minDigits) {
      38           0 :     if (radix < 2 || radix > 36) {
      39             :         // Bogus radix: DIGITS only covers 0-9 and A-Z, so append '?' and stop
      40           0 :         return result.append((UChar)63/*?*/);
      41             :     }
      42             :     // Work on the unsigned magnitude: negating INT32_MIN as int32_t would overflow
      43           0 :     uint32_t mag = (n < 0) ? (0u - (uint32_t)n) : (uint32_t)n;
      44           0 :     if (n < 0) {
      45           0 :         result.append((UChar)45/*-*/);
      46             :     }
      47             :     // First determine the number of digits; r ends up as the place value of the leading digit
      48           0 :     uint32_t nn = mag;
      49           0 :     uint32_t r = 1;
      50           0 :     while (nn >= (uint32_t)radix) {
      51           0 :         nn /= (uint32_t)radix;
      52           0 :         r *= (uint32_t)radix; // cannot overflow: r * radix <= mag here
      53           0 :         --minDigits;
      54             :     }
      55             :     // Now generate the digits: zero padding first, then most significant digit downwards
      56           0 :     while (--minDigits > 0) {
      57           0 :         result.append(DIGITS[0]);
      58             :     }
      59           0 :     while (r > 0) {
      60           0 :         uint32_t digit = mag / r; // always < radix, so a valid DIGITS index
      61           0 :         result.append(DIGITS[digit]);
      62           0 :         mag -= digit * r;
      63           0 :         r /= (uint32_t)radix;
      64             :     }
      65           0 :     return result;
      66             : }
      67             : 
      68             : /**
      69             :  * Return true if c lies outside the printable ASCII range U+0020..U+007E.
      70             :  */
      71           0 : UBool ICU_Utility::isUnprintable(UChar32 c) {
      72           0 :     return !(c >= 0x20 && c <= 0x7E); // space through '~' count as printable
      73             : }
      74             : 
      75             : /**
      76             :  * If c is unprintable (see isUnprintable), append it to result as
      77             :  * \uxxxx (c fits in 16 bits) or \Uxxxxxxxx (any higher bit is set),
      78             :  * with uppercase hex digits, and return TRUE.  If c is printable
      79             :  * ASCII, leave result untouched and return FALSE.
      80             :  */
      81           0 : UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
      82           0 :     if (isUnprintable(c)) {
      83           0 :         result.append(BACKSLASH);
      84           0 :         if (c & ~0xFFFF) { // some bit above the low 16 is set: 8-digit form
      85           0 :             result.append(UPPER_U);
      86           0 :             result.append(DIGITS[0xF&(c>>28)]);
      87           0 :             result.append(DIGITS[0xF&(c>>24)]);
      88           0 :             result.append(DIGITS[0xF&(c>>20)]);
      89           0 :             result.append(DIGITS[0xF&(c>>16)]);
      90             :         } else {
      91           0 :             result.append(LOWER_U);
      92             :         }
      93           0 :         result.append(DIGITS[0xF&(c>>12)]); // low four hex digits, shared by both forms
      94           0 :         result.append(DIGITS[0xF&(c>>8)]);
      95           0 :         result.append(DIGITS[0xF&(c>>4)]);
      96           0 :         result.append(DIGITS[0xF&c]);
      97           0 :         return TRUE;
      98             :     }
      99           0 :     return FALSE;
     100             : }
     101             : 
     102             : /**
     103             :  * Returns the index of a character, ignoring quoted text.
     104             :  * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
     105             :  * found by a search for 'h'.
     106             :  */
     107             : // FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
     108             : /*
     109             : int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
     110             :                                int32_t start, int32_t limit,
     111             :                                UChar charToFind) {
     112             :     for (int32_t i=start; i<limit; ++i) {
     113             :         UChar c = text.charAt(i);
     114             :         if (c == BACKSLASH) {
     115             :             ++i;
     116             :         } else if (c == APOSTROPHE) {
     117             :             while (++i < limit
     118             :                    && text.charAt(i) != APOSTROPHE) {}
     119             :         } else if (c == charToFind) {
     120             :             return i;
     121             :         }
     122             :     }
     123             :     return -1;
     124             : }
     125             : */
     126             : 
     127             : /**
     128             :  * Find the end of the run of zero or more white space characters
     129             :  * (as recognized by PatternProps::skipWhiteSpace) that starts at pos.
     130             :  * @param str the string to scan; pos is not range-checked against it here
     131             :  * @param pos index to start from; updated only when advance is true
     132             :  * @param advance if true, store the returned index back into pos
     133             :  * @return index of the first non-white-space character at or after pos, or str.length() if none
     134             :  */
     135           0 : int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
     136             :                                     UBool advance) {
     137           0 :     int32_t p = pos;
     138           0 :     const UChar* s = str.getBuffer();
     139           0 :     p = (int32_t)(PatternProps::skipWhiteSpace(s + p, str.length() - p) - s); // pointer result converted back to an index
     140           0 :     if (advance) {
     141           0 :         pos = p;
     142             :     }
     143           0 :     return p;
     144             : }
     145             : 
     146             : /**
     147             :  * Skip over Pattern_White_Space in a Replaceable.
     148             :  * Skipping may be done in the forward or
     149             :  * reverse direction.  In either case, the leftmost index will be
     150             :  * inclusive, and the rightmost index will be exclusive.  That is,
     151             :  * given a range defined as [start, limit), the call
     152             :  * skipWhitespace(text, start, limit) will advance start past leading
     153             :  * whitespace, whereas the call skipWhitespace(text, limit, start),
     154             :  * will back up limit past trailing whitespace.
     155             :  * @param text the text to be analyzed
     156             :  * @param pos either the start or limit of a range of 'text', to skip
     157             :  * leading or trailing whitespace, respectively
     158             :  * @param stop either the limit or start of a range of 'text', to skip
     159             :  * leading or trailing whitespace, respectively
     160             :  * @return the new start or limit, depending on what was passed in to
     161             :  * 'pos'
     162             :  */
     163             : //?FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
     164             : //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
     165             : //?                                    int32_t pos, int32_t stop) {
     166             : //?    UChar32 c;
     167             : //?    UBool isForward = (stop >= pos);
     168             : //?
     169             : //?    if (!isForward) {
     170             : //?        --pos; // pos is a limit, so back up by one
     171             : //?    }
     172             : //?    
     173             : //?    while (pos != stop &&
     174             : //?           PatternProps::isWhiteSpace(c = text.char32At(pos))) {
     175             : //?        if (isForward) {
     176             : //?            pos += U16_LENGTH(c);
     177             : //?        } else {
     178             : //?            pos -= U16_LENGTH(c);
     179             : //?        }
     180             : //?    }
     181             : //?
     182             : //?    if (!isForward) {
     183             : //?        ++pos; // make pos back into a limit
     184             : //?    }
     185             : //?
     186             : //?    return pos;
     187             : //?}
     188             : 
     189             : /**
     190             :  * Parse a single non-whitespace character 'ch', optionally
     191             :  * preceded by whitespace.
     192             :  * @param id the string to be parsed
     193             :  * @param pos INPUT-OUTPUT parameter.  On input, pos is the
     194             :  * offset of the first character to be parsed.  On output, pos
     195             :  * is the index after the last parsed character.  If the parse
     196             :  * fails, pos will be unchanged.
     197             :  * @param ch the non-whitespace character to be parsed.
     198             :  * @return true if 'ch' is seen preceded by zero or more
     199             :  * whitespace characters.
     200             :  */
     201           0 : UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
     202           0 :     int32_t start = pos; // remembered so a failed parse can restore pos
     203           0 :     skipWhitespace(id, pos, TRUE);
     204           0 :     if (pos == id.length() ||
     205           0 :         id.charAt(pos) != ch) {
     206           0 :         pos = start;
     207           0 :         return FALSE;
     208             :     }
     209           0 :     ++pos; // consume ch
     210           0 :     return TRUE;
     211             : }
     212             : 
     213             : /**
     214             :  * Match the text of a Replaceable against a parsing pattern.
     215             :  * Pattern characters are matched literally and case-sensitively
     216             :  * except for the following special character:
     217             :  *
     218             :  * ~  zero or more Pattern_White_Space chars
     219             :  *
     220             :  * If the end of the pattern is reached with all matches along the way,
     221             :  * the index of the first unparsed character is returned.  If the text
     222             :  * runs out first (even when only a '~' remains), -1 is returned.
     223             :  * @param pat pattern that controls parsing
     224             :  * @param text text to be parsed, starting at index
     225             :  * @param index offset to first character to parse
     226             :  * @param limit offset after last character to parse
     227             :  * @return index after last parsed character, or -1 on parse failure.
     228             :  */
     229           0 : int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
     230             :                                   const Replaceable& text,
     231             :                                   int32_t index,
     232             :                                   int32_t limit) {
     233           0 :     int32_t ipat = 0;
     234             : 
     235             :     // empty pattern matches immediately
     236           0 :     if (ipat == pat.length()) {
     237           0 :         return index;
     238             :     }
     239             : 
     240           0 :     UChar32 cpat = pat.char32At(ipat);
     241             : 
     242           0 :     while (index < limit) {
     243           0 :         UChar32 c = text.char32At(index);
     244             : 
     245             :         // parse \s* (the '~' pattern character)
     246           0 :         if (cpat == 126 /*~*/) {
     247           0 :             if (PatternProps::isWhiteSpace(c)) {
     248           0 :                 index += U16_LENGTH(c);
     249           0 :                 continue;
     250             :             } else {
     251           0 :                 if (++ipat == pat.length()) {
     252           0 :                     return index; // success; c unparsed
     253             :                 }
     254             :                 // fall thru; process c again with next cpat
     255             :             }
     256             :         }
     257             : 
     258             :         // parse literal
     259           0 :         else if (c == cpat) {
     260           0 :             index += U16_LENGTH(c);
     261           0 :             ipat += U16_LENGTH(cpat);
     262           0 :             if (ipat == pat.length()) {
     263           0 :                 return index; // success; c parsed
     264             :             }
     265             :             // fall thru; get next cpat
     266             :         }
     267             : 
     268             :         // match failure of literal
     269             :         else {
     270           0 :             return -1;
     271             :         }
     272             : 
     273           0 :         cpat = pat.char32At(ipat); // ipat < pat.length() here; both success cases returned above
     274             :     }
     275             : 
     276           0 :     return -1; // text ended before end of pat
     277             : }
     278             : 
     279             : /**
     280             :  * Append a character to a rule that is being built up.  To flush
     281             :  * the quoteBuf to rule, make one final call with isLiteral == TRUE.
     282             :  * If there is no final character, pass in (UChar32)-1 as c.
     283             :  * @param rule the string to append the character to
     284             :  * @param c the character to append, or (UChar32)-1 if none.
     285             :  * @param isLiteral if true, then the given character is never quoted
     286             :  * or backslash-escaped (it is still \u-escaped if escapeUnprintable
     287             :  * is set and it is unprintable).  Usually a syntactic element such as > or $
     288             :  * @param escapeUnprintable if true, then unprintable characters
     289             :  * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will
     290             :  * appear outside of quotes.
     291             :  * @param quoteBuf a buffer which is used to build up quoted
     292             :  * substrings.  The caller should initially supply an empty buffer,
     293             :  * and thereafter should not modify the buffer.  The buffer should be
     294             :  * cleared out by, at the end, calling this method with a literal
     295             :  * character.
     296             :  */
     297           0 : void ICU_Utility::appendToRule(UnicodeString& rule,
     298             :                                UChar32 c,
     299             :                                UBool isLiteral,
     300             :                                UBool escapeUnprintable,
     301             :                                UnicodeString& quoteBuf) {
     302             :     // Literals, and unprintables that are about to be \u-escaped, are
     303             :     // emitted outside quotes (\u and \U are not recognized within
     304             :     // quotes), so first flush any pending quoted text in quoteBuf.
     305           0 :     if (isLiteral ||
     306           0 :         (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
     307           0 :         if (quoteBuf.length() > 0) {
     308             :             // We prefer backslash APOSTROPHE to double APOSTROPHE
     309             :             // (more readable, less similar to ") so if there are
     310             :             // double APOSTROPHEs at the ends, we pull them outside
     311             :             // of the quote.
     312             : 
     313             :             // If the first thing in the quoteBuf is APOSTROPHE
     314             :             // (doubled) then pull it out.
     315           0 :             while (quoteBuf.length() >= 2 &&
     316           0 :                    quoteBuf.charAt(0) == APOSTROPHE &&
     317           0 :                    quoteBuf.charAt(1) == APOSTROPHE) {
     318           0 :                 rule.append(BACKSLASH).append(APOSTROPHE);
     319           0 :                 quoteBuf.remove(0, 2);
     320             :             }
     321             :             // If the last thing in the quoteBuf is APOSTROPHE
     322             :             // (doubled) then remove and count it and add it after.
     323           0 :             int32_t trailingCount = 0;
     324           0 :             while (quoteBuf.length() >= 2 &&
     325           0 :                    quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
     326           0 :                    quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
     327           0 :                 quoteBuf.truncate(quoteBuf.length()-2);
     328           0 :                 ++trailingCount;
     329             :             }
     330           0 :             if (quoteBuf.length() > 0) { // whatever is left goes out as one quoted run
     331           0 :                 rule.append(APOSTROPHE);
     332           0 :                 rule.append(quoteBuf);
     333           0 :                 rule.append(APOSTROPHE);
     334           0 :                 quoteBuf.truncate(0);
     335             :             }
     336           0 :             while (trailingCount-- > 0) {
     337           0 :                 rule.append(BACKSLASH).append(APOSTROPHE);
     338             :             }
     339             :         }
     340           0 :         if (c != (UChar32)-1) {
     341             :             /* Since spaces are ignored during parsing, they are
     342             :              * emitted only for readability.  We emit one here
     343             :              * only if the rule is non-empty and does not already
     344             :              * end with a space.
     345             :              */
     346           0 :             if (c == SPACE) {
     347           0 :                 int32_t len = rule.length();
     348           0 :                 if (len > 0 && rule.charAt(len-1) != c) {
     349           0 :                     rule.append(c);
     350             :                 }
     351           0 :             } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) { // the call appends the escape itself when it returns TRUE
     352           0 :                 rule.append(c);
     353             :             }
     354             :         }
     355             :     }
     356             : 
     357             :     // Escape ' and '\' and don't begin a quote just for them
     358           0 :     else if (quoteBuf.length() == 0 &&
     359           0 :              (c == APOSTROPHE || c == BACKSLASH)) {
     360           0 :         rule.append(BACKSLASH);
     361           0 :         rule.append(c);
     362             :     }
     363             : 
     364             :     // Specials (printable ascii that isn't [0-9a-zA-Z]) and
     365             :     // whitespace need quoting.  Also append stuff to quotes if we are
     366             :     // building up a quoted substring already.
     367           0 :     else if (quoteBuf.length() > 0 ||
     368           0 :              (c >= 0x0021 && c <= 0x007E &&
     369           0 :               !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
     370           0 :                 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
     371           0 :                 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
     372           0 :              PatternProps::isWhiteSpace(c)) {
     373           0 :         quoteBuf.append(c);
     374             :         // Double ' within a quote
     375           0 :         if (c == APOSTROPHE) {
     376           0 :             quoteBuf.append(c);
     377             :         }
     378             :     }
     379             :     
     380             :     // Otherwise just append
     381             :     else {
     382           0 :         rule.append(c);
     383             :     }
     384           0 : }
     385             : // Append every element of text to rule, one text[i] at a time, via the single-character appendToRule.
     386           0 : void ICU_Utility::appendToRule(UnicodeString& rule,
     387             :                                const UnicodeString& text,
     388             :                                UBool isLiteral,
     389             :                                UBool escapeUnprintable,
     390             :                                UnicodeString& quoteBuf) {
     391           0 :     for (int32_t i=0; i<text.length(); ++i) { // NOTE(review): presumably UTF-16 code units, so a supplementary char arrives as two surrogates - confirm
     392           0 :         appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
     393             :     }
     394           0 : }
     395             : 
     396             : /**
     397             :  * Given a matcher reference, which may be null, append its
     398             :  * pattern as a literal to the given rule.  A null matcher is a no-op.
     399             :  */
     400           0 : void ICU_Utility::appendToRule(UnicodeString& rule,
     401             :                                const UnicodeMatcher* matcher,
     402             :                                UBool escapeUnprintable,
     403             :                                UnicodeString& quoteBuf) {
     404           0 :     if (matcher != NULL) {
     405           0 :         UnicodeString pat; // scratch string that toPattern() fills in
     406           0 :         appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
     407           0 :                      TRUE, escapeUnprintable, quoteBuf); // TRUE: the pattern text is passed as a literal
     408             :     }
     409           0 : }
     410             : 
     411             : U_NAMESPACE_END

Generated by: LCOV version 1.13