Line data Source code
1 : /* Copyright 2013 Google Inc. All Rights Reserved.
2 :
3 : Distributed under MIT license.
4 : See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 : */
6 :
7 : /* Transformations on dictionary words. */
8 :
9 : #ifndef BROTLI_DEC_TRANSFORM_H_
10 : #define BROTLI_DEC_TRANSFORM_H_
11 :
12 : #include "./port.h"
13 : #include "./types.h"
14 :
15 : #if defined(__cplusplus) || defined(c_plusplus)
16 : extern "C" {
17 : #endif
18 :
19 : enum WordTransformType {
20 : kIdentity = 0,
21 : kOmitLast1 = 1,
22 : kOmitLast2 = 2,
23 : kOmitLast3 = 3,
24 : kOmitLast4 = 4,
25 : kOmitLast5 = 5,
26 : kOmitLast6 = 6,
27 : kOmitLast7 = 7,
28 : kOmitLast8 = 8,
29 : kOmitLast9 = 9,
30 : kUppercaseFirst = 10,
31 : kUppercaseAll = 11,
32 : kOmitFirst1 = 12,
33 : kOmitFirst2 = 13,
34 : kOmitFirst3 = 14,
35 : kOmitFirst4 = 15,
36 : kOmitFirst5 = 16,
37 : kOmitFirst6 = 17,
38 : kOmitFirst7 = 18,
39 : kOmitFirst8 = 19,
40 : kOmitFirst9 = 20
41 : };
42 :
43 : typedef struct {
44 : const uint8_t prefix_id;
45 : const uint8_t transform;
46 : const uint8_t suffix_id;
47 : } Transform;
48 :
49 : static const char kPrefixSuffix[208] =
50 : "\0 \0, \0 of the \0 of \0s \0.\0 and \0 in \0\"\0 to \0\">\0\n\0. \0]\0"
51 : " for \0 a \0 that \0\'\0 with \0 from \0 by \0(\0. The \0 on \0 as \0"
52 : " is \0ing \0\n\t\0:\0ed \0=\"\0 at \0ly \0,\0=\'\0.com/\0. This \0"
53 : " not \0er \0al \0ful \0ive \0less \0est \0ize \0\xc2\xa0\0ous ";
54 :
55 : enum {
56 : /* EMPTY = ""
57 : SP = " "
58 : DQUOT = "\""
59 : SQUOT = "'"
60 : CLOSEBR = "]"
61 : OPEN = "("
62 : SLASH = "/"
63 : NBSP = non-breaking space "\0xc2\xa0"
64 : */
65 : kPFix_EMPTY = 0,
66 : kPFix_SP = 1,
67 : kPFix_COMMASP = 3,
68 : kPFix_SPofSPtheSP = 6,
69 : kPFix_SPtheSP = 9,
70 : kPFix_eSP = 12,
71 : kPFix_SPofSP = 15,
72 : kPFix_sSP = 20,
73 : kPFix_DOT = 23,
74 : kPFix_SPandSP = 25,
75 : kPFix_SPinSP = 31,
76 : kPFix_DQUOT = 36,
77 : kPFix_SPtoSP = 38,
78 : kPFix_DQUOTGT = 43,
79 : kPFix_NEWLINE = 46,
80 : kPFix_DOTSP = 48,
81 : kPFix_CLOSEBR = 51,
82 : kPFix_SPforSP = 53,
83 : kPFix_SPaSP = 59,
84 : kPFix_SPthatSP = 63,
85 : kPFix_SQUOT = 70,
86 : kPFix_SPwithSP = 72,
87 : kPFix_SPfromSP = 79,
88 : kPFix_SPbySP = 86,
89 : kPFix_OPEN = 91,
90 : kPFix_DOTSPTheSP = 93,
91 : kPFix_SPonSP = 100,
92 : kPFix_SPasSP = 105,
93 : kPFix_SPisSP = 110,
94 : kPFix_ingSP = 115,
95 : kPFix_NEWLINETAB = 120,
96 : kPFix_COLON = 123,
97 : kPFix_edSP = 125,
98 : kPFix_EQDQUOT = 129,
99 : kPFix_SPatSP = 132,
100 : kPFix_lySP = 137,
101 : kPFix_COMMA = 141,
102 : kPFix_EQSQUOT = 143,
103 : kPFix_DOTcomSLASH = 146,
104 : kPFix_DOTSPThisSP = 152,
105 : kPFix_SPnotSP = 160,
106 : kPFix_erSP = 166,
107 : kPFix_alSP = 170,
108 : kPFix_fulSP = 174,
109 : kPFix_iveSP = 179,
110 : kPFix_lessSP = 184,
111 : kPFix_estSP = 190,
112 : kPFix_izeSP = 195,
113 : kPFix_NBSP = 200,
114 : kPFix_ousSP = 203
115 : };
116 :
117 : static const Transform kTransforms[] = {
118 : { kPFix_EMPTY, kIdentity, kPFix_EMPTY },
119 : { kPFix_EMPTY, kIdentity, kPFix_SP },
120 : { kPFix_SP, kIdentity, kPFix_SP },
121 : { kPFix_EMPTY, kOmitFirst1, kPFix_EMPTY },
122 : { kPFix_EMPTY, kUppercaseFirst, kPFix_SP },
123 : { kPFix_EMPTY, kIdentity, kPFix_SPtheSP },
124 : { kPFix_SP, kIdentity, kPFix_EMPTY },
125 : { kPFix_sSP, kIdentity, kPFix_SP },
126 : { kPFix_EMPTY, kIdentity, kPFix_SPofSP },
127 : { kPFix_EMPTY, kUppercaseFirst, kPFix_EMPTY },
128 : { kPFix_EMPTY, kIdentity, kPFix_SPandSP },
129 : { kPFix_EMPTY, kOmitFirst2, kPFix_EMPTY },
130 : { kPFix_EMPTY, kOmitLast1, kPFix_EMPTY },
131 : { kPFix_COMMASP, kIdentity, kPFix_SP },
132 : { kPFix_EMPTY, kIdentity, kPFix_COMMASP },
133 : { kPFix_SP, kUppercaseFirst, kPFix_SP },
134 : { kPFix_EMPTY, kIdentity, kPFix_SPinSP },
135 : { kPFix_EMPTY, kIdentity, kPFix_SPtoSP },
136 : { kPFix_eSP, kIdentity, kPFix_SP },
137 : { kPFix_EMPTY, kIdentity, kPFix_DQUOT },
138 : { kPFix_EMPTY, kIdentity, kPFix_DOT },
139 : { kPFix_EMPTY, kIdentity, kPFix_DQUOTGT },
140 : { kPFix_EMPTY, kIdentity, kPFix_NEWLINE },
141 : { kPFix_EMPTY, kOmitLast3, kPFix_EMPTY },
142 : { kPFix_EMPTY, kIdentity, kPFix_CLOSEBR },
143 : { kPFix_EMPTY, kIdentity, kPFix_SPforSP },
144 : { kPFix_EMPTY, kOmitFirst3, kPFix_EMPTY },
145 : { kPFix_EMPTY, kOmitLast2, kPFix_EMPTY },
146 : { kPFix_EMPTY, kIdentity, kPFix_SPaSP },
147 : { kPFix_EMPTY, kIdentity, kPFix_SPthatSP },
148 : { kPFix_SP, kUppercaseFirst, kPFix_EMPTY },
149 : { kPFix_EMPTY, kIdentity, kPFix_DOTSP },
150 : { kPFix_DOT, kIdentity, kPFix_EMPTY },
151 : { kPFix_SP, kIdentity, kPFix_COMMASP },
152 : { kPFix_EMPTY, kOmitFirst4, kPFix_EMPTY },
153 : { kPFix_EMPTY, kIdentity, kPFix_SPwithSP },
154 : { kPFix_EMPTY, kIdentity, kPFix_SQUOT },
155 : { kPFix_EMPTY, kIdentity, kPFix_SPfromSP },
156 : { kPFix_EMPTY, kIdentity, kPFix_SPbySP },
157 : { kPFix_EMPTY, kOmitFirst5, kPFix_EMPTY },
158 : { kPFix_EMPTY, kOmitFirst6, kPFix_EMPTY },
159 : { kPFix_SPtheSP, kIdentity, kPFix_EMPTY },
160 : { kPFix_EMPTY, kOmitLast4, kPFix_EMPTY },
161 : { kPFix_EMPTY, kIdentity, kPFix_DOTSPTheSP },
162 : { kPFix_EMPTY, kUppercaseAll, kPFix_EMPTY },
163 : { kPFix_EMPTY, kIdentity, kPFix_SPonSP },
164 : { kPFix_EMPTY, kIdentity, kPFix_SPasSP },
165 : { kPFix_EMPTY, kIdentity, kPFix_SPisSP },
166 : { kPFix_EMPTY, kOmitLast7, kPFix_EMPTY },
167 : { kPFix_EMPTY, kOmitLast1, kPFix_ingSP },
168 : { kPFix_EMPTY, kIdentity, kPFix_NEWLINETAB },
169 : { kPFix_EMPTY, kIdentity, kPFix_COLON },
170 : { kPFix_SP, kIdentity, kPFix_DOTSP },
171 : { kPFix_EMPTY, kIdentity, kPFix_edSP },
172 : { kPFix_EMPTY, kOmitFirst9, kPFix_EMPTY },
173 : { kPFix_EMPTY, kOmitFirst7, kPFix_EMPTY },
174 : { kPFix_EMPTY, kOmitLast6, kPFix_EMPTY },
175 : { kPFix_EMPTY, kIdentity, kPFix_OPEN },
176 : { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMASP },
177 : { kPFix_EMPTY, kOmitLast8, kPFix_EMPTY },
178 : { kPFix_EMPTY, kIdentity, kPFix_SPatSP },
179 : { kPFix_EMPTY, kIdentity, kPFix_lySP },
180 : { kPFix_SPtheSP, kIdentity, kPFix_SPofSP },
181 : { kPFix_EMPTY, kOmitLast5, kPFix_EMPTY },
182 : { kPFix_EMPTY, kOmitLast9, kPFix_EMPTY },
183 : { kPFix_SP, kUppercaseFirst, kPFix_COMMASP },
184 : { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOT },
185 : { kPFix_DOT, kIdentity, kPFix_OPEN },
186 : { kPFix_EMPTY, kUppercaseAll, kPFix_SP },
187 : { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOTGT },
188 : { kPFix_EMPTY, kIdentity, kPFix_EQDQUOT },
189 : { kPFix_SP, kIdentity, kPFix_DOT },
190 : { kPFix_DOTcomSLASH, kIdentity, kPFix_EMPTY },
191 : { kPFix_SPtheSP, kIdentity, kPFix_SPofSPtheSP },
192 : { kPFix_EMPTY, kUppercaseFirst, kPFix_SQUOT },
193 : { kPFix_EMPTY, kIdentity, kPFix_DOTSPThisSP },
194 : { kPFix_EMPTY, kIdentity, kPFix_COMMA },
195 : { kPFix_DOT, kIdentity, kPFix_SP },
196 : { kPFix_EMPTY, kUppercaseFirst, kPFix_OPEN },
197 : { kPFix_EMPTY, kUppercaseFirst, kPFix_DOT },
198 : { kPFix_EMPTY, kIdentity, kPFix_SPnotSP },
199 : { kPFix_SP, kIdentity, kPFix_EQDQUOT },
200 : { kPFix_EMPTY, kIdentity, kPFix_erSP },
201 : { kPFix_SP, kUppercaseAll, kPFix_SP },
202 : { kPFix_EMPTY, kIdentity, kPFix_alSP },
203 : { kPFix_SP, kUppercaseAll, kPFix_EMPTY },
204 : { kPFix_EMPTY, kIdentity, kPFix_EQSQUOT },
205 : { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOT },
206 : { kPFix_EMPTY, kUppercaseFirst, kPFix_DOTSP },
207 : { kPFix_SP, kIdentity, kPFix_OPEN },
208 : { kPFix_EMPTY, kIdentity, kPFix_fulSP },
209 : { kPFix_SP, kUppercaseFirst, kPFix_DOTSP },
210 : { kPFix_EMPTY, kIdentity, kPFix_iveSP },
211 : { kPFix_EMPTY, kIdentity, kPFix_lessSP },
212 : { kPFix_EMPTY, kUppercaseAll, kPFix_SQUOT },
213 : { kPFix_EMPTY, kIdentity, kPFix_estSP },
214 : { kPFix_SP, kUppercaseFirst, kPFix_DOT },
215 : { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOTGT },
216 : { kPFix_SP, kIdentity, kPFix_EQSQUOT },
217 : { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMA },
218 : { kPFix_EMPTY, kIdentity, kPFix_izeSP },
219 : { kPFix_EMPTY, kUppercaseAll, kPFix_DOT },
220 : { kPFix_NBSP, kIdentity, kPFix_EMPTY },
221 : { kPFix_SP, kIdentity, kPFix_COMMA },
222 : { kPFix_EMPTY, kUppercaseFirst, kPFix_EQDQUOT },
223 : { kPFix_EMPTY, kUppercaseAll, kPFix_EQDQUOT },
224 : { kPFix_EMPTY, kIdentity, kPFix_ousSP },
225 : { kPFix_EMPTY, kUppercaseAll, kPFix_COMMASP },
226 : { kPFix_EMPTY, kUppercaseFirst, kPFix_EQSQUOT },
227 : { kPFix_SP, kUppercaseFirst, kPFix_COMMA },
228 : { kPFix_SP, kUppercaseAll, kPFix_EQDQUOT },
229 : { kPFix_SP, kUppercaseAll, kPFix_COMMASP },
230 : { kPFix_EMPTY, kUppercaseAll, kPFix_COMMA },
231 : { kPFix_EMPTY, kUppercaseAll, kPFix_OPEN },
232 : { kPFix_EMPTY, kUppercaseAll, kPFix_DOTSP },
233 : { kPFix_SP, kUppercaseAll, kPFix_DOT },
234 : { kPFix_EMPTY, kUppercaseAll, kPFix_EQSQUOT },
235 : { kPFix_SP, kUppercaseAll, kPFix_DOTSP },
236 : { kPFix_SP, kUppercaseFirst, kPFix_EQDQUOT },
237 : { kPFix_SP, kUppercaseAll, kPFix_EQSQUOT },
238 : { kPFix_SP, kUppercaseFirst, kPFix_EQSQUOT },
239 : };
240 :
241 : static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
242 :
243 0 : static int ToUpperCase(uint8_t* p) {
244 0 : if (p[0] < 0xc0) {
245 0 : if (p[0] >= 'a' && p[0] <= 'z') {
246 0 : p[0] ^= 32;
247 : }
248 0 : return 1;
249 : }
250 : /* An overly simplified uppercasing model for utf-8. */
251 0 : if (p[0] < 0xe0) {
252 0 : p[1] ^= 32;
253 0 : return 2;
254 : }
255 : /* An arbitrary transform for three byte characters. */
256 0 : p[2] ^= 5;
257 0 : return 3;
258 : }
259 :
260 0 : static BROTLI_NOINLINE int TransformDictionaryWord(
261 : uint8_t* dst, const uint8_t* word, int len, int transform) {
262 0 : int idx = 0;
263 : {
264 0 : const char* prefix = &kPrefixSuffix[kTransforms[transform].prefix_id];
265 0 : while (*prefix) { dst[idx++] = (uint8_t)*prefix++; }
266 : }
267 : {
268 0 : const int t = kTransforms[transform].transform;
269 0 : int i = 0;
270 0 : int skip = t - (kOmitFirst1 - 1);
271 0 : if (skip > 0) {
272 0 : word += skip;
273 0 : len -= skip;
274 0 : } else if (t <= kOmitLast9) {
275 0 : len -= t;
276 : }
277 0 : while (i < len) { dst[idx++] = word[i++]; }
278 0 : if (t == kUppercaseFirst) {
279 0 : ToUpperCase(&dst[idx - len]);
280 0 : } else if (t == kUppercaseAll) {
281 0 : uint8_t* uppercase = &dst[idx - len];
282 0 : while (len > 0) {
283 0 : int step = ToUpperCase(uppercase);
284 0 : uppercase += step;
285 0 : len -= step;
286 : }
287 : }
288 : }
289 : {
290 0 : const char* suffix = &kPrefixSuffix[kTransforms[transform].suffix_id];
291 0 : while (*suffix) { dst[idx++] = (uint8_t)*suffix++; }
292 0 : return idx;
293 : }
294 : }
295 :
296 : #if defined(__cplusplus) || defined(c_plusplus)
297 : } /* extern "C" */
298 : #endif
299 :
300 : #endif /* BROTLI_DEC_TRANSFORM_H_ */
|