Line data Source code
1 : /* GRAPHITE2 LICENSING
2 :
3 : Copyright 2011, SIL International
4 : All rights reserved.
5 :
6 : This library is free software; you can redistribute it and/or modify
7 : it under the terms of the GNU Lesser General Public License as published
8 : by the Free Software Foundation; either version 2.1 of License, or
9 : (at your option) any later version.
10 :
11 : This program is distributed in the hope that it will be useful,
12 : but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 : Lesser General Public License for more details.
15 :
16 : You should also have received a copy of the GNU Lesser General Public
17 : License along with this library in the file named "LICENSE".
18 : If not, write to the Free Software Foundation, 51 Franklin Street,
19 : Suite 500, Boston, MA 02110-1335, USA or visit their web page on the
20 : internet at http://www.fsf.org/licenses/lgpl.html.
21 :
22 : Alternatively, the contents of this file may be used under the terms of the
23 : Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public
24 : License, as published by the Free Software Foundation, either version 2
25 : of the License or (at your option) any later version.
26 : */
27 : #pragma once
28 :
29 : #include <cstdlib>
30 : #include "inc/Main.h"
31 :
32 : namespace graphite2 {
33 :
34 : typedef uint32 uchar_t;
35 :
36 : template <int N>
37 : struct _utf_codec
38 : {
39 : typedef uchar_t codeunit_t;
40 :
41 : static void put(codeunit_t * cp, const uchar_t , int8 & len) throw();
42 : static uchar_t get(const codeunit_t * cp, int8 & len) throw();
43 : static bool validate(const codeunit_t * s, const codeunit_t * e) throw();
44 : };
45 :
46 :
47 : template <>
48 : struct _utf_codec<32>
49 : {
50 : private:
51 : static const uchar_t limit = 0x110000;
52 : public:
53 : typedef uint32 codeunit_t;
54 :
55 : inline
56 0 : static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
57 : {
58 0 : *cp = usv; l = 1;
59 0 : }
60 :
61 : inline
62 0 : static uchar_t get(const codeunit_t * cp, int8 & l) throw()
63 : {
64 0 : if (cp[0] < limit) { l = 1; return cp[0]; }
65 0 : else { l = -1; return 0xFFFD; }
66 : }
67 :
68 : inline
69 : static bool validate(codeunit_t * s, codeunit_t * e) throw()
70 : {
71 : return e > s;
72 : }
73 : };
74 :
75 :
76 : template <>
77 : struct _utf_codec<16>
78 : {
79 : private:
80 : static const int32 lead_offset = 0xD800 - (0x10000 >> 10);
81 : static const int32 surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00;
82 : public:
83 : typedef uint16 codeunit_t;
84 :
85 : inline
86 : static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
87 : {
88 : if (usv < 0x10000) { l = 1; cp[0] = codeunit_t(usv); }
89 : else
90 : {
91 : cp[0] = codeunit_t(lead_offset + (usv >> 10));
92 : cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF));
93 : l = 2;
94 : }
95 : }
96 :
97 : inline
98 0 : static uchar_t get(const codeunit_t * cp, int8 & l) throw()
99 : {
100 0 : const uint32 uh = cp[0];
101 0 : l = 1;
102 :
103 0 : if (uh < 0xD800|| uh > 0xDFFF) { return uh; }
104 0 : const uint32 ul = cp[1];
105 0 : if (uh > 0xDBFF || ul < 0xDC00 || ul > 0xDFFF) { l = -1; return 0xFFFD; }
106 0 : ++l;
107 0 : return (uh<<10) + ul + surrogate_offset;
108 : }
109 :
110 : inline
111 0 : static bool validate(codeunit_t * s, codeunit_t * e) throw()
112 : {
113 0 : const ptrdiff_t n = e-s;
114 0 : if (n <= 0) return n == 0;
115 0 : const uint32 u = *(s+(n-1)); // Get the last codepoint
116 0 : return (u < 0xD800 || u > 0xDBFF);
117 : }
118 : };
119 :
120 :
121 : template <>
122 : struct _utf_codec<8>
123 : {
124 : private:
125 : static const int8 sz_lut[16];
126 : static const byte mask_lut[5];
127 : static const uchar_t limit = 0x110000;
128 :
129 : public:
130 : typedef uint8 codeunit_t;
131 :
132 : inline
133 0 : static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
134 : {
135 0 : if (usv < 0x80) {l = 1; cp[0] = usv; return; }
136 0 : if (usv < 0x0800) {l = 2; cp[0] = 0xC0 + (usv >> 6); cp[1] = 0x80 + (usv & 0x3F); return; }
137 0 : if (usv < 0x10000) {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F); cp[2] = 0x80 + (usv & 0x3F); return; }
138 0 : else {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; }
139 : }
140 :
141 : inline
142 0 : static uchar_t get(const codeunit_t * cp, int8 & l) throw()
143 : {
144 0 : const int8 seq_sz = sz_lut[*cp >> 4];
145 0 : uchar_t u = *cp & mask_lut[seq_sz];
146 0 : l = 1;
147 0 : bool toolong = false;
148 :
149 0 : switch(seq_sz) {
150 0 : case 4: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong = (u < 0x10); GR_FALLTHROUGH;
151 : // no break
152 0 : case 3: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20); GR_FALLTHROUGH;
153 : // no break
154 0 : case 2: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80); GR_FALLTHROUGH;
155 : // no break
156 0 : case 1: break;
157 0 : case 0: l = -1; return 0xFFFD;
158 : }
159 :
160 0 : if (l != seq_sz || toolong || u >= limit)
161 : {
162 0 : l = -l;
163 0 : return 0xFFFD;
164 : }
165 0 : return u;
166 : }
167 :
168 : inline
169 : static bool validate(codeunit_t * s, codeunit_t * e) throw()
170 : {
171 : const ptrdiff_t n = e-s;
172 : if (n <= 0) return n == 0;
173 : s += (n-1);
174 : if (*s < 0x80) return true;
175 : if (*s >= 0xC0) return false;
176 : if (n == 1) return true;
177 : if (*--s < 0x80) return true;
178 : if (*s >= 0xe0) return false;
179 : if (n == 2 || *s >= 0xC0) return true;
180 : if (*--s < 0x80) return true;
181 : if (*s >= 0xF0) return false;
182 : return true;
183 : }
184 :
185 : };
186 :
187 :
188 : template <typename C>
189 : class _utf_iterator
190 : {
191 : typedef _utf_codec<sizeof(C)*8> codec;
192 :
193 : C * cp;
194 : mutable int8 sl;
195 :
196 : public:
197 : typedef C codeunit_type;
198 : typedef uchar_t value_type;
199 : typedef uchar_t * pointer;
200 :
201 : class reference
202 : {
203 : const _utf_iterator & _i;
204 :
205 0 : reference(const _utf_iterator & i): _i(i) {}
206 : public:
207 0 : operator value_type () const throw () { return codec::get(_i.cp, _i.sl); }
208 0 : reference & operator = (const value_type usv) throw() { codec::put(_i.cp, usv, _i.sl); return *this; }
209 :
210 : friend class _utf_iterator;
211 : };
212 :
213 :
214 0 : _utf_iterator(const void * us=0) : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { }
215 :
216 0 : _utf_iterator & operator ++ () { cp += abs(sl); return *this; }
217 : _utf_iterator operator ++ (int) { _utf_iterator tmp(*this); operator++(); return tmp; }
218 :
219 0 : bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; }
220 0 : bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); }
221 :
222 0 : reference operator * () const throw() { return *this; }
223 : pointer operator ->() const throw() { return &operator *(); }
224 :
225 0 : operator codeunit_type * () const throw() { return cp; }
226 :
227 0 : bool error() const throw() { return sl < 1; }
228 : };
229 :
230 : template <typename C>
231 : struct utf
232 : {
233 : typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t;
234 :
235 : typedef _utf_iterator<C> iterator;
236 : typedef _utf_iterator<const C> const_iterator;
237 :
238 : inline
239 0 : static bool validate(codeunit_t * s, codeunit_t * e) throw() {
240 0 : return _utf_codec<sizeof(C)*8>::validate(s,e);
241 : }
242 : };
243 :
244 :
245 : typedef utf<uint32> utf32;
246 : typedef utf<uint16> utf16;
247 : typedef utf<uint8> utf8;
248 :
249 : } // namespace graphite2
|