Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
#include "nsHyphenator.h"
#include "nsIFile.h"
#include "nsUTF8Utils.h"
#include "nsUnicodeProperties.h"
#include "nsIURI.h"

#include "hyphen.h"

#include <stdlib.h>
13 :
14 0 : nsHyphenator::nsHyphenator(nsIURI *aURI)
15 0 : : mDict(nullptr)
16 : {
17 0 : nsCString uriSpec;
18 0 : nsresult rv = aURI->GetSpec(uriSpec);
19 0 : if (NS_FAILED(rv)) {
20 0 : return;
21 : }
22 0 : mDict = hnj_hyphen_load(uriSpec.get());
23 : #ifdef DEBUG
24 0 : if (mDict) {
25 0 : printf("loaded hyphenation patterns from %s\n", uriSpec.get());
26 : }
27 : #endif
28 : }
29 :
30 0 : nsHyphenator::~nsHyphenator()
31 : {
32 0 : if (mDict != nullptr) {
33 0 : hnj_hyphen_free((HyphenDict*)mDict);
34 0 : mDict = nullptr;
35 : }
36 0 : }
37 :
38 : bool
39 0 : nsHyphenator::IsValid()
40 : {
41 0 : return (mDict != nullptr);
42 : }
43 :
44 : nsresult
45 0 : nsHyphenator::Hyphenate(const nsAString& aString, nsTArray<bool>& aHyphens)
46 : {
47 0 : if (!aHyphens.SetLength(aString.Length(), mozilla::fallible)) {
48 0 : return NS_ERROR_OUT_OF_MEMORY;
49 : }
50 0 : memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool));
51 :
52 0 : bool inWord = false;
53 0 : uint32_t wordStart = 0, wordLimit = 0;
54 : uint32_t chLen;
55 0 : for (uint32_t i = 0; i < aString.Length(); i += chLen) {
56 0 : uint32_t ch = aString[i];
57 0 : chLen = 1;
58 :
59 0 : if (NS_IS_HIGH_SURROGATE(ch)) {
60 0 : if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) {
61 0 : ch = SURROGATE_TO_UCS4(ch, aString[i+1]);
62 0 : chLen = 2;
63 : } else {
64 0 : NS_WARNING("unpaired surrogate found during hyphenation");
65 : }
66 : }
67 :
68 0 : nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch);
69 0 : if (cat == nsUGenCategory::kLetter || cat == nsUGenCategory::kMark) {
70 0 : if (!inWord) {
71 0 : inWord = true;
72 0 : wordStart = i;
73 : }
74 0 : wordLimit = i + chLen;
75 0 : if (i + chLen < aString.Length()) {
76 0 : continue;
77 : }
78 : }
79 :
80 0 : if (inWord) {
81 : // Convert the word to utf-8 for libhyphen, lowercasing it as we go
82 : // so that it will match the (lowercased) patterns (bug 1105644).
83 0 : nsAutoCString utf8;
84 0 : const char16_t* const begin = aString.BeginReading();
85 0 : const char16_t *cur = begin + wordStart;
86 0 : const char16_t *end = begin + wordLimit;
87 0 : while (cur < end) {
88 0 : uint32_t ch = *cur++;
89 :
90 0 : if (NS_IS_HIGH_SURROGATE(ch)) {
91 0 : if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
92 0 : ch = SURROGATE_TO_UCS4(ch, *cur++);
93 : } else {
94 0 : ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
95 : }
96 0 : } else if (NS_IS_LOW_SURROGATE(ch)) {
97 0 : ch = 0xfffd; // unpaired surrogate
98 : }
99 :
100 : // XXX What about language-specific casing? Consider Turkish I/i...
101 : // In practice, it looks like the current patterns will not be
102 : // affected by this, as they treat dotted and undotted i similarly.
103 0 : ch = ToLowerCase(ch);
104 :
105 0 : if (ch < 0x80) { // U+0000 - U+007F
106 0 : utf8.Append(ch);
107 0 : } else if (ch < 0x0800) { // U+0100 - U+07FF
108 0 : utf8.Append(0xC0 | (ch >> 6));
109 0 : utf8.Append(0x80 | (0x003F & ch));
110 0 : } else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
111 0 : utf8.Append(0xE0 | (ch >> 12));
112 0 : utf8.Append(0x80 | (0x003F & (ch >> 6)));
113 0 : utf8.Append(0x80 | (0x003F & ch));
114 : } else {
115 0 : utf8.Append(0xF0 | (ch >> 18));
116 0 : utf8.Append(0x80 | (0x003F & (ch >> 12)));
117 0 : utf8.Append(0x80 | (0x003F & (ch >> 6)));
118 0 : utf8.Append(0x80 | (0x003F & ch));
119 : }
120 : }
121 :
122 0 : AutoTArray<char,200> utf8hyphens;
123 0 : utf8hyphens.SetLength(utf8.Length() + 5);
124 0 : char **rep = nullptr;
125 0 : int *pos = nullptr;
126 0 : int *cut = nullptr;
127 0 : int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict,
128 0 : utf8.BeginReading(), utf8.Length(),
129 : utf8hyphens.Elements(), nullptr,
130 0 : &rep, &pos, &cut);
131 0 : if (!err) {
132 : // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
133 : // from utf8 code unit indexing (which would match the utf8 input
134 : // string directly) to Unicode character indexing.
135 : // We then need to convert this to utf16 code unit offsets for Gecko.
136 0 : const char *hyphPtr = utf8hyphens.Elements();
137 0 : const char16_t *cur = begin + wordStart;
138 0 : const char16_t *end = begin + wordLimit;
139 0 : while (cur < end) {
140 0 : if (*hyphPtr & 0x01) {
141 0 : aHyphens[cur - begin] = true;
142 : }
143 0 : cur++;
144 0 : if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&
145 0 : NS_IS_HIGH_SURROGATE(*(cur-1)))
146 : {
147 0 : cur++;
148 : }
149 0 : hyphPtr++;
150 : }
151 : }
152 : }
153 :
154 0 : inWord = false;
155 : }
156 :
157 0 : return NS_OK;
158 : }
|