Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : *
6 : * Copyright (C) 2007, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : ******************************************************************************
10 : * file name: bmpset.h
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2007jan29
16 : * created by: Markus W. Scherer
17 : */
18 :
19 : #ifndef __BMPSET_H__
20 : #define __BMPSET_H__
21 :
22 : #include "unicode/utypes.h"
23 : #include "unicode/uniset.h"
24 :
25 : U_NAMESPACE_BEGIN
26 :
27 : /*
28 : * Helper class for frozen UnicodeSets, implements contains() and span()
29 : * optimized for BMP code points. Structured to be UTF-8-friendly.
30 : *
31 : * ASCII: Look up bytes.
32 : * 2-byte characters: Bits organized vertically.
33 : * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
34 : * with mixed for illegal ranges.
35 : * Supplementary characters: Call contains() on the parent set.
36 : */
37 : class BMPSet : public UMemory {
38 : public:
39 : BMPSet(const int32_t *parentList, int32_t parentListLength); // NOTE(review): presumably builds the lookup tables from the parent set's inversion list - confirm in the implementation file.
40 : BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength); // NOTE(review): presumably reuses otherBMPSet's tables with a different parent list - confirm in the implementation file.
41 : virtual ~BMPSet();
42 :
43 : virtual UBool contains(UChar32 c) const; // Membership test for the single code point c.
44 :
45 : /*
46 : * Spans the initial substring of s..limit for which each character c has spanCondition==contains(c).
47 : * Preconditions: s<limit, and spanCondition is 0 or 1.
48 : * @return The string pointer which limits the span.
49 : */
50 : const UChar *span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
51 :
52 : /*
53 : * Spans the trailing substring of s..limit for which each character c has spanCondition==contains(c).
54 : * Preconditions: s<limit, and spanCondition is 0 or 1.
55 : * @return The string pointer which starts the span.
56 : */
57 : const UChar *spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
58 :
59 : /*
60 : * UTF-8 variant: spans the initial substring for which each character c has spanCondition==contains(c).
61 : * Preconditions: length>0, and spanCondition is 0 or 1.
62 : * @return The string pointer which limits the span.
63 : */
64 : const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
65 :
66 : /*
67 : * UTF-8 variant: spans the trailing substring for which each character c has spanCondition==contains(c).
68 : * Preconditions: length>0, and spanCondition is 0 or 1.
69 : * @return The start of the span, as an int32_t (presumably an offset into s - TODO confirm).
70 : */
71 : int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
72 :
73 : private:
74 : void initBits(); // NOTE(review): presumably fills the bit tables declared below from the inversion list - confirm.
75 : void overrideIllegal(); // NOTE(review): presumably applies the contains(FFFD) overrides described on the tables below - confirm.
76 :
77 : /**
78 : * Same as UnicodeSet::findCodePoint(UChar32 c) const except that the
79 : * binary search is restricted for finding code points in a certain range.
80 : *
81 : * For restricting the search for finding in the range start..end,
82 : * pass in
83 : * lo=findCodePoint(start) and
84 : * hi=findCodePoint(end)
85 : * with 0<=lo<=hi<len.
86 : * findCodePoint(c) defaults to lo=0 and hi=len-1.
87 : *
88 : * @param c a character in a subrange of MIN_VALUE..MAX_VALUE
89 : * @param lo The lowest index to be returned.
90 : * @param hi The highest index to be returned.
91 : * @return the smallest integer i in the range lo..hi,
92 : * inclusive, such that c < list[i]
93 : */
94 : int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;
95 :
96 : inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const; // Defined inline after the class: the low bit of findCodePoint(c, lo, hi).
97 :
98 : /*
99 : * One byte per ASCII character, or trail byte in lead position (0xc0 entries in total).
100 : * 0 or 1 for ASCII characters.
101 : * The value for trail bytes is the result of contains(FFFD)
102 : * for faster validity checking at runtime.
103 : */
104 : UBool asciiBytes[0xc0];
105 :
106 : /*
107 : * One bit per code point from U+0000..U+07FF.
108 : * The bits are organized vertically; consecutive code points
109 : * correspond to the same bit positions in consecutive table words.
110 : * With code point parts
111 : * lead=c{10..6}
112 : * trail=c{5..0}
113 : * it is set.contains(c)==(table7FF[trail] bit lead)
114 : *
115 : * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
116 : * for faster validity checking at runtime.
117 : */
118 : uint32_t table7FF[64];
119 :
120 : /*
121 : * One bit per 64 BMP code points.
122 : * The bits are organized vertically; consecutive 64-code point blocks
123 : * correspond to the same bit position in consecutive table words.
124 : * With code point parts
125 : * lead=c{15..12}
126 : * t1=c{11..6}
127 : * test bits (lead+16) and lead in bmpBlockBits[t1].
128 : * If the upper bit is 0, then the lower bit indicates if contains(c)
129 : * for all code points in the 64-block.
130 : * If the upper bit is 1, then the block is mixed and set.contains(c)
131 : * must be called.
132 : *
133 : * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
134 : * the result of contains(FFFD) for faster validity checking at runtime.
135 : */
136 : uint32_t bmpBlockBits[64];
137 :
138 : /*
139 : * Inversion list indexes for restricted binary searches in
140 : * findCodePoint(), from
141 : * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
142 : * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
143 : * always looked up in the bit tables.
144 : * The last pair of indexes is for finding supplementary code points.
145 : */
146 : int32_t list4kStarts[18];
147 :
148 : /*
149 : * The inversion list of the parent set, for the slower contains() implementation
150 : * for mixed BMP blocks and for supplementary code points.
151 : * The list is terminated with list[listLength-1]=0x110000.
152 : */
153 : const int32_t *list; // NOTE(review): presumably borrowed from the parent set rather than owned here - confirm against the destructor.
154 : int32_t listLength; // Number of entries in list, including the terminating 0x110000.
155 : };
156 :
157 0 : inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const { // Membership test via the restricted search of list between indexes lo and hi.
158 0 : return (UBool)(findCodePoint(c, lo, hi) & 1); // Only the low bit of the found index is used: an odd index yields 1 (contained), an even index yields 0.
159 : }
160 :
161 : U_NAMESPACE_END
162 :
163 : #endif
|