OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "content/renderer/hyphenator/hyphenator.h" | |
6 | |
7 #include "base/files/memory_mapped_file.h" | |
8 #include "base/logging.h" | |
9 #include "base/memory/scoped_ptr.h" | |
10 #include "base/strings/string_util.h" | |
11 #include "base/strings/utf_string_conversions.h" | |
12 #include "content/common/hyphenator_messages.h" | |
13 #include "content/public/renderer/render_thread.h" | |
14 #include "third_party/hyphen/hyphen.h" | |
15 #include "third_party/icu/source/common/unicode/uscript.h" | |
16 | |
17 namespace { | |
18 | |
19 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds | |
20 // only the length of converted UTF-16 characters. This class is used for | |
21 // creating a mapping from the position of a UTF-8 string to a position of a | |
22 // UTF-16 string without unnecessary conversions. Even though the following | |
23 // snippet produces the same mapping, it needs to convert same characters many | |
24 // times. This class incrementally counts the number of converted UTF-16 | |
25 // characters to avoid this problem. | |
26 // | |
27 // scoped_ptr<size_t[]> position(new size_t[text.length()]); | |
28 // for (size_t i = 0; i < text.length(); ++i) | |
29 // position[i] = UTF8ToUTF16(text.substr(0, i)).length(); | |
30 // | |
31 class UTF16TextLength { | |
32 public: | |
33 UTF16TextLength(); | |
34 ~UTF16TextLength(); | |
35 | |
36 // Returns the current position. | |
37 int utf16_length() const { return utf16_length_; } | |
38 | |
39 // Appends one UTF-8 character to this converter and advances the converted | |
40 // position. This converter increases the position by one when it finishes | |
41 // reading a BMP character and increases by two when it finish reading a | |
42 // non-BMP character. | |
43 void Append(char c); | |
44 | |
45 private: | |
46 // The length of the converted UTF-16 text. | |
47 int utf16_length_; | |
48 | |
49 // The buffer that stores UTF-8 characters being converted. | |
50 std::string utf8_text_; | |
51 | |
52 DISALLOW_COPY_AND_ASSIGN(UTF16TextLength); | |
53 }; | |
54 | |
55 UTF16TextLength::UTF16TextLength() | |
56 : utf16_length_(0) { | |
57 } | |
58 | |
59 UTF16TextLength::~UTF16TextLength() { | |
60 } | |
61 | |
62 void UTF16TextLength::Append(char c) { | |
63 // Append the given character and try converting the UTF-8 characters in this | |
64 // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint, | |
65 // get the number of UTF-16 characters representing this codepoint and advance | |
66 // the position. | |
67 int code = 0; | |
68 int index = 0; | |
69 utf8_text_.push_back(c); | |
70 U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()), | |
71 code); | |
72 if (code != U_SENTINEL) { | |
73 utf8_text_.clear(); | |
74 utf16_length_ += U16_LENGTH(code); | |
75 } | |
76 } | |
77 | |
78 // A class that encapsulates a hyphenation query. This class owns resources | |
79 // temporarily needed for hyphenating one word, and deletes them when it is | |
80 // deleted as listed in the following snippet. | |
81 // | |
82 // std::vector<int> hyphens; | |
83 // QUery query(UTF8ToUTF16("hyphenate")); | |
84 // query.Hyphenate(dict, &hyphens); | |
85 // | |
86 class Query { | |
87 public: | |
88 explicit Query(const string16& word); | |
89 ~Query(); | |
90 | |
91 // Hyphenates a word with the specified dictionary. This function hyphenates | |
92 // the word provided to its constructor and returns a list of hyphenation | |
93 // points, positions where we can insert hyphens. | |
94 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets); | |
95 | |
96 private: | |
97 // A word to be hyphenated. | |
98 std::string word_utf8_; | |
99 | |
100 // Return variables from the hyphen library. | |
101 scoped_ptr<char[]> hyphen_vector_; | |
102 char** rep_; | |
103 int* pos_; | |
104 int* cut_; | |
105 | |
106 DISALLOW_COPY_AND_ASSIGN(Query); | |
107 }; | |
108 | |
109 Query::Query(const string16& word) | |
110 : rep_(NULL), | |
111 pos_(NULL), | |
112 cut_(NULL) { | |
113 // Remove trailing punctuation characters. WebKit does not remove these | |
114 // characters when it hyphenates a word. These characters prevent the hyphen | |
115 // library from applying some rules, i.e. they prevent the library from adding | |
116 // hyphens. | |
117 DCHECK(!word.empty()); | |
118 const char16* data = word.data(); | |
119 int length = static_cast<int>(word.length()); | |
120 while (length > 0) { | |
121 int previous = length; | |
122 int code = 0; | |
123 U16_PREV(data, 0, previous, code); | |
124 UErrorCode error = U_ZERO_ERROR; | |
125 if (uscript_getScript(code, &error) != USCRIPT_COMMON) | |
126 break; | |
127 length = previous; | |
128 } | |
129 UTF16ToUTF8(word.c_str(), length, &word_utf8_); | |
130 // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a | |
131 // buffer of |word_.length()| + 5 as written in Line 112 of | |
132 // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>. | |
133 hyphen_vector_.reset(new char[word_utf8_.length() + 5]); | |
134 } | |
135 | |
136 Query::~Query() { | |
137 if (rep_) { | |
138 for (size_t i = 0; i < word_utf8_.length(); ++i) { | |
139 if (rep_[i]) | |
140 free(rep_[i]); | |
141 } | |
142 free(rep_); | |
143 } | |
144 if (pos_) | |
145 free(pos_); | |
146 if (cut_) | |
147 free(cut_); | |
148 } | |
149 | |
150 bool Query::Hyphenate(HyphenDict* dictionary, | |
151 std::vector<int>* hyphen_offsets) { | |
152 DCHECK(dictionary); | |
153 DCHECK(hyphen_offsets); | |
154 | |
155 int error_code = hnj_hyphen_hyphenate2(dictionary, | |
156 word_utf8_.data(), | |
157 static_cast<int>(word_utf8_.length()), | |
158 hyphen_vector_.get(), | |
159 NULL, | |
160 &rep_, | |
161 &pos_, | |
162 &cut_); | |
163 if (error_code) | |
164 return false; | |
165 | |
166 // WebKit needs hyphenation points counted in UTF-16 characters. On the other | |
167 // hand, the hyphen library returns hyphenation points counted in UTF-8 | |
168 // characters. We increamentally convert hyphenation points in UTF-8 | |
169 // characters to hyphenation points in UTF-16 characters and write the | |
170 // converted hyphenation points to the output vector. | |
171 UTF16TextLength text_length; | |
172 hyphen_offsets->clear(); | |
173 for (size_t i = 0; i < word_utf8_.length(); ++i) { | |
174 text_length.Append(word_utf8_[i]); | |
175 if (hyphen_vector_[i] & 1) | |
176 hyphen_offsets->push_back(text_length.utf16_length()); | |
177 } | |
178 return !hyphen_offsets->empty(); | |
179 } | |
180 | |
181 } // namespace | |
182 | |
183 namespace content { | |
184 | |
185 Hyphenator::Hyphenator(base::PlatformFile file) | |
186 : dictionary_(NULL), | |
187 dictionary_file_(base::FdopenPlatformFile(file, "r")), | |
188 result_(0) { | |
189 } | |
190 | |
191 Hyphenator::~Hyphenator() { | |
192 if (dictionary_) | |
193 hnj_hyphen_free(dictionary_); | |
194 } | |
195 | |
196 bool Hyphenator::Initialize() { | |
197 if (dictionary_) | |
198 return true; | |
199 | |
200 if (!dictionary_file_.get()) | |
201 return false; | |
202 dictionary_ = hnj_hyphen_load_file(dictionary_file_.get()); | |
203 return !!dictionary_; | |
204 } | |
205 | |
206 bool Hyphenator::Attach(RenderThread* thread, const string16& locale) { | |
207 if (!thread) | |
208 return false; | |
209 locale_.assign(locale); | |
210 thread->AddObserver(this); | |
211 return thread->Send(new HyphenatorHostMsg_OpenDictionary(locale)); | |
212 } | |
213 | |
214 bool Hyphenator::CanHyphenate(const string16& locale) { | |
215 return !locale_.compare(locale); | |
216 } | |
217 | |
218 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word, | |
219 size_t before_index) { | |
220 if (!Initialize() || word.empty()) | |
221 return 0; | |
222 | |
223 // Call the hyphen library to get all hyphenation points, i.e. positions where | |
224 // we can insert hyphens. When WebKit finds a line-break, it calls this | |
225 // function twice or more with the same word to find the best hyphenation | |
226 // point. To avoid calling the hyphen library twice or more with the same | |
227 // word, we cache the last query. | |
228 if (word_ != word) { | |
229 word_ = word; | |
230 Query query(word); | |
231 result_ = query.Hyphenate(dictionary_, &hyphen_offsets_); | |
232 } | |
233 if (!result_) | |
234 return 0; | |
235 for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin(); | |
236 it != hyphen_offsets_.rend(); ++it) { | |
237 if (static_cast<size_t>(*it) < before_index) | |
238 return *it; | |
239 } | |
240 return 0; | |
241 } | |
242 | |
243 bool Hyphenator::OnControlMessageReceived(const IPC::Message& message) { | |
244 bool handled = true; | |
245 IPC_BEGIN_MESSAGE_MAP(Hyphenator, message) | |
246 IPC_MESSAGE_HANDLER(HyphenatorMsg_SetDictionary, OnSetDictionary) | |
247 IPC_MESSAGE_UNHANDLED(handled = false) | |
248 IPC_END_MESSAGE_MAP() | |
249 return handled; | |
250 } | |
251 | |
252 void Hyphenator::OnSetDictionary(IPC::PlatformFileForTransit file) { | |
253 base::PlatformFile rule_file = | |
254 IPC::PlatformFileForTransitToPlatformFile(file); | |
255 if (rule_file == base::kInvalidPlatformFileValue) | |
256 return; | |
257 // Delete the current dictionary and save the given file to this object. We | |
258 // initialize the hyphen library the first time when WebKit actually | |
259 // hyphenates a word, i.e. when WebKit calls the ComputeLastHyphenLocation | |
260 // function. (WebKit does not always hyphenate words even when it calls the | |
261 // CanHyphenate function, e.g. WebKit does not have to hyphenate words when it | |
262 // does not have to break text into lines.) | |
263 if (dictionary_) { | |
264 hnj_hyphen_free(dictionary_); | |
265 dictionary_ = NULL; | |
266 } | |
267 dictionary_file_.Set(base::FdopenPlatformFile(rule_file, "r")); | |
268 } | |
269 | |
270 } // namespace content | |
OLD | NEW |