Line data Source code
1 : /* ***** BEGIN LICENSE BLOCK *****
2 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 : *
4 : * Copyright (C) 2002-2017 Németh László
5 : *
6 : * The contents of this file are subject to the Mozilla Public License Version
7 : * 1.1 (the "License"); you may not use this file except in compliance with
8 : * the License. You may obtain a copy of the License at
9 : * http://www.mozilla.org/MPL/
10 : *
11 : * Software distributed under the License is distributed on an "AS IS" basis,
12 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 : * for the specific language governing rights and limitations under the
14 : * License.
15 : *
16 : * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 : *
18 : * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 : * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 : * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 : * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 : * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either the GNU General Public License Version 2 or later (the "GPL"), or
26 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 : /*
38 : * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 : * And Contributors. All rights reserved.
40 : *
41 : * Redistribution and use in source and binary forms, with or without
42 : * modification, are permitted provided that the following conditions
43 : * are met:
44 : *
45 : * 1. Redistributions of source code must retain the above copyright
46 : * notice, this list of conditions and the following disclaimer.
47 : *
48 : * 2. Redistributions in binary form must reproduce the above copyright
49 : * notice, this list of conditions and the following disclaimer in the
50 : * documentation and/or other materials provided with the distribution.
51 : *
52 : * 3. All modifications to the source code must be clearly marked as
53 : * such. Binary redistributions based on modified source code
54 : * must be clearly marked as modified versions in the documentation
55 : * and/or other materials provided with the distribution.
56 : *
57 : * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 : * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 : * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 : * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 : * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 : * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 : * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 : * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 : * SUCH DAMAGE.
69 : */
70 :
71 : #ifndef CSUTIL_HXX_
72 : #define CSUTIL_HXX_
73 :
74 : #include "hunvisapi.h"
75 :
76 : // First some base level utility routines
77 :
78 : #include <fstream>
79 : #include <string>
80 : #include <vector>
81 : #include <string.h>
82 : #include "w_char.hxx"
83 : #include "htypes.hxx"
84 :
85 : #ifdef MOZILLA_CLIENT
86 : #include "nscore.h" // for mozalloc headers
87 : #endif
88 :
// Capitalization categories of a word.
// NOTE(review): the per-value descriptions are inferred from the macro names,
// and the int returned by get_captype()/get_captype_utf8() is presumably one
// of these values - confirm against csutil.cxx.
#define NOCAP 0       // no capital letters
#define INITCAP 1     // only the initial letter is capital
#define ALLCAP 2      // every letter is capital
#define HUHCAP 3      // mixed capitalization
#define HUHINITCAP 4  // mixed capitalization, initial letter capital
95 :
// Defaults: the character encoding name and the keyboard layout string.
// NOTE(review): '|' looks like a separator between keyboard rows (QWERTY
// layout) - confirm where SPELL_KEYSTRING is consumed.
#define SPELL_ENCODING "ISO8859-1"
#define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"
99 :
// Default morphological field tags. Every tag is three characters long:
// a two-character code followed by ':'.
// NOTE(review): the meanings in the trailing comments are read off the macro
// names only; confirm against the dictionary format documentation.
#define MORPH_STEM "st:"       // stem
#define MORPH_ALLOMORPH "al:"  // allomorph
#define MORPH_POS "po:"        // presumably part of speech
#define MORPH_DERI_PFX "dp:"   // derivational prefix
#define MORPH_INFL_PFX "ip:"   // inflectional prefix
#define MORPH_TERM_PFX "tp:"   // terminal prefix
#define MORPH_DERI_SFX "ds:"   // derivational suffix
#define MORPH_INFL_SFX "is:"   // inflectional suffix
#define MORPH_TERM_SFX "ts:"   // terminal suffix
#define MORPH_SURF_PFX "sp:"   // surface prefix
#define MORPH_FREQ "fr:"       // frequency
#define MORPH_PHON "ph:"       // phonetic form
#define MORPH_HYPH "hy:"       // hyphenation
#define MORPH_PART "pa:"       // part
#define MORPH_FLAG "fl:"       // flag
#define MORPH_HENTRY "_H:"     // hash entry reference
// Length of a field tag, excluding the terminating NUL. sizeof on the string
// literal gives the same size_t value strlen() would, but as a compile-time
// constant, and the parentheses keep the expansion safe inside any
// surrounding expression.
#define MORPH_TAG_LEN (sizeof(MORPH_STEM) - 1)
118 :
// Separator characters.
// NOTE(review): the roles below are inferred from the FLD/REC/ALT suffixes
// (field / record / alternative) - confirm at the points of use.
#define MSEP_FLD ' '   // between fields
#define MSEP_REC '\n'  // between records
#define MSEP_ALT '\v'  // between alternatives
122 :
// Default flag values.
// NOTE(review): DEFAULTFLAGS and FORBIDDENWORD deliberately or accidentally
// share the value 65510; presumably DEFAULTFLAGS marks the first reserved
// flag - confirm before changing either one.
#define DEFAULTFLAGS 65510
#define FORBIDDENWORD 65510
#define ONLYUPCASEFLAG 65511
127 :
128 : // fix long pathname problem of WIN32 by using w_char std::fstream::open override
129 : LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path,
130 : std::ios_base::openmode mode);
131 :
132 : // convert UTF-16 characters to UTF-8
133 : LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest,
134 : const std::vector<w_char>& src);
135 :
136 : // convert UTF-8 characters to UTF-16
137 : LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest,
138 : const std::string& src);
139 :
140 : // remove end of line char(s)
141 : LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s);
142 :
143 : // duplicate string
144 : LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s);
145 :
146 : // parse into tokens with char delimiter
147 : LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str,
148 : std::string::const_iterator& start);
149 :
150 : // replace pat by rep in word and return word
151 : LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
152 : const std::string& search,
153 : const std::string& replace);
154 :
155 : // append s to ends of every lines in text
156 : LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
157 : const std::string& apd);
158 :
159 : // tokenize into lines with new line
160 : LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text,
161 : char breakchar);
162 :
163 : // tokenize into lines with new line and uniq in place
164 : LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar);
165 :
166 : LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar);
167 :
168 : // reverse word
169 : LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);
170 :
171 : // reverse word
172 : LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);
173 :
174 : // remove duplicates
175 : LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list);
176 :
// Character encoding information: one case record for a single character of
// an 8-bit encoding. get_current_cs() hands out pointers to these records.
// Member order is part of the layout and must not change.
// NOTE(review): member meanings are inferred from their names.
struct cs_info {
  unsigned char ccase;   // case indicator of the character
  unsigned char clower;  // lowercase counterpart
  unsigned char cupper;  // uppercase counterpart
};
183 :
184 : LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl();
185 : LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
186 : LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
187 : int langnum);
188 : LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
189 : LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
190 : LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
191 : int langnum);
192 : LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);
193 :
194 : LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es);
195 :
196 : // get language identifiers of language codes
197 : LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang);
198 :
199 : // get characters of the given 8bit encoding with lower- and uppercase forms
200 : LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc);
201 :
202 : // convert std::string to all caps
203 : LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s,
204 : const struct cs_info* csconv);
205 :
206 : // convert null terminated string to all little
207 : LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s,
208 : const struct cs_info* csconv);
209 :
210 : // convert first letter of string to little
211 : LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s,
212 : const struct cs_info* csconv);
213 :
214 : // convert first letter of string to capital
215 : LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s,
216 : const struct cs_info* csconv);
217 :
218 : // convert first letter of UTF-8 string to capital
219 : LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
220 : mkinitcap_utf(std::vector<w_char>& u, int langnum);
221 :
222 : // convert UTF-8 string to little
223 : LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
224 : mkallsmall_utf(std::vector<w_char>& u, int langnum);
225 :
226 : // convert first letter of UTF-8 string to little
227 : LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
228 : mkinitsmall_utf(std::vector<w_char>& u, int langnum);
229 :
230 : // convert UTF-8 string to capital
231 : LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
232 : mkallcap_utf(std::vector<w_char>& u, int langnum);
233 :
234 : // get type of capitalization
235 : LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*);
236 :
237 : // get type of capitalization (UTF-8)
238 : LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum);
239 :
240 : // strip all ignored characters in the string
241 : LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf(
242 : std::string& word,
243 : const std::vector<w_char>& ignored_chars);
244 :
245 : // strip all ignored characters in the string
246 : LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars(
247 : std::string& word,
248 : const std::string& ignored_chars);
249 :
250 : LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line,
251 : std::string& out,
252 : int ln);
253 :
254 : LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line,
255 : std::string& out,
256 : std::vector<w_char>& out_utf16,
257 : int utf8,
258 : int ln);
259 :
260 : LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r);
261 :
262 : LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest,
263 : const std::string& morph,
264 : const std::string& var);
265 :
266 : // conversion function for protected memory
267 : LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source);
268 :
269 : // conversion function for protected memory
270 : LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
271 :
272 : // hash entry macros
273 0 : LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) {
274 : char* ret;
275 0 : if (!h->var)
276 0 : ret = NULL;
277 0 : else if (h->var & H_OPT_ALIASM)
278 0 : ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
279 : else
280 0 : ret = HENTRY_WORD(h) + h->blen + 1;
281 0 : return ret;
282 : }
283 :
284 : LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA(
285 : const struct hentry* h) {
286 : const char* ret;
287 : if (!h->var)
288 : ret = NULL;
289 : else if (h->var & H_OPT_ALIASM)
290 : ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
291 : else
292 : ret = HENTRY_WORD(h) + h->blen + 1;
293 : return ret;
294 : }
295 :
296 : // NULL-free version for warning-free OOo build
297 0 : LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2(
298 : const struct hentry* h) {
299 : const char* ret;
300 0 : if (!h->var)
301 0 : ret = "";
302 0 : else if (h->var & H_OPT_ALIASM)
303 0 : ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
304 : else
305 0 : ret = HENTRY_WORD(h) + h->blen + 1;
306 0 : return ret;
307 : }
308 :
309 0 : LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry* h,
310 : const char* p) {
311 0 : return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
312 : }
313 :
314 : #endif
|