Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(63)

Side by Side Diff: content/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "content/renderer/hyphenator/hyphenator.h"
6
7 #include "base/file_util.h"
8 #include "base/logging.h"
9 #include "base/memory/scoped_ptr.h"
10 #include "base/string_util.h"
11 #include "base/utf_string_conversions.h"
12 #include "third_party/hyphen/hyphen.h"
13 #include "unicode/uscript.h"
14
15 namespace {
16
17 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds
18 // only the length of converted UTF-16 characters. This class is used for
19 // creating a mapping from the position of a UTF-8 string to a position of a
20 // UTF-16 string without unnecessary conversions. Even though the following
21 // snippet produces the same mapping, it needs to convert same characters many
22 // times. This class incrementally counts the number of converted UTF-16
23 // characters to avoid this problem.
24 //
25 // scoped_array<size_t> position(new size_t[text.length()]);
26 // for (size_t i = 0; i < text.length(); ++i)
27 // position[i] = UTF8ToUTF16(text.substr(0, i)).length();
28 //
29 class UTF16TextLength {
30 public:
31 UTF16TextLength();
32 ~UTF16TextLength();
33
34 // Returns the current position.
35 int utf16_length() const { return utf16_length_; }
36
37 // Appends one UTF-8 character to this converter and advances the converted
38 // position. This converter increases the position by one when it finishes
39 // reading a BMP character and increases by two when it finish reading a
40 // non-BMP character.
41 void Append(char c);
42
43 private:
44 // The length of the converted UTF-16 text.
45 int utf16_length_;
46
47 // The buffer that stores UTF-8 characters being converted.
48 std::string utf8_text_;
49
50 DISALLOW_COPY_AND_ASSIGN(UTF16TextLength);
51 };
52
53 UTF16TextLength::UTF16TextLength()
54 : utf16_length_(0) {
55 }
56
57 UTF16TextLength::~UTF16TextLength() {
58 }
59
60 void UTF16TextLength::Append(char c) {
61 // Append the given character and try converting the UTF-8 characters in this
62 // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint,
63 // get the number of UTF-16 characters representing this codepoint and advance
64 // the position.
65 int code = 0;
66 int index = 0;
67 utf8_text_.push_back(c);
68 U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()),
69 code);
70 if (code != U_SENTINEL) {
71 utf8_text_.clear();
72 utf16_length_ += U16_LENGTH(code);
73 }
74 }
75
76 // A class that encapsulates a hyphenation query. This class owns resources
77 // temporarily needed for hyphenating one word, and deletes them when it is
78 // deleted as listed in the following snippet.
79 //
80 // std::vector<int> hyphens;
81 // Query query(UTF8ToUTF16("hyphenate"));
82 // query.Hyphenate(dict, &hyphens);
83 //
84 class Query {
85 public:
86 explicit Query(const string16& word);
87 ~Query();
88
89 // Hyphenates a word with the specified dictionary. This function hyphenates
90 // the word provided to its constructor and returns a list of hyphenation
91 // points, positions where we can insert hyphens.
92 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);
93
94 private:
95 // A word to be hyphenated.
96 std::string word_utf8_;
97
98 // Return variables from the hyphen library.
99 scoped_array<char> hyphen_vector_;
100 char** rep_;
101 int* pos_;
102 int* cut_;
103
104 DISALLOW_COPY_AND_ASSIGN(Query);
105 };
106
107 Query::Query(const string16& word)
108 : rep_(NULL),
109 pos_(NULL),
110 cut_(NULL) {
111 // Remove trailing punctuation characters. WebKit does not remove these
112 // characters when it hyphenates a word. These characters prevent the hyphen
113 // library from applying some rules, i.e. they prevent the library from adding
114 // hyphens.
115 DCHECK(!word.empty());
116 const char16* data = word.data();
117 int length = static_cast<int>(word.length());
118 while (length > 0) {
119 int previous = length;
120 int code = 0;
121 U16_PREV(data, 0, previous, code);
122 UErrorCode error = U_ZERO_ERROR;
123 if (uscript_getScript(code, &error) != USCRIPT_COMMON)
124 break;
125 length = previous;
126 }
127 UTF16ToUTF8(word.c_str(), length, &word_utf8_);
128 // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a
129 // buffer of |word_.length()| + 5 as written in Line 112 of
130 // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.
131 hyphen_vector_.reset(new char[word_utf8_.length() + 5]);
132 }
133
134 Query::~Query() {
135 if (rep_) {
136 for (size_t i = 0; i < word_utf8_.length(); ++i) {
137 if (rep_[i])
138 free(rep_[i]);
139 }
140 free(rep_);
141 }
142 if (pos_)
143 free(pos_);
144 if (cut_)
145 free(cut_);
146 }
147
148 bool Query::Hyphenate(HyphenDict* dictionary,
149 std::vector<int>* hyphen_offsets) {
150 DCHECK(dictionary);
151 DCHECK(hyphen_offsets);
152
153 int error_code = hnj_hyphen_hyphenate2(dictionary,
154 word_utf8_.data(),
155 static_cast<int>(word_utf8_.length()),
156 hyphen_vector_.get(),
157 NULL,
158 &rep_,
159 &pos_,
160 &cut_);
161 if (error_code)
162 return false;
163
164 // WebKit needs hyphenation points counted in UTF-16 characters. On the other
165 // hand, the hyphen library returns hyphenation points counted in UTF-8
166 // characters. We increamentally convert hyphenation points in UTF-8
167 // characters to hyphenation points in UTF-16 characters and write the
168 // converted hyphenation points to the output vector.
169 UTF16TextLength text_length;
170 hyphen_offsets->clear();
171 for (size_t i = 0; i < word_utf8_.length(); ++i) {
172 text_length.Append(word_utf8_[i]);
173 if (hyphen_vector_[i] & 1)
174 hyphen_offsets->push_back(text_length.utf16_length());
175 }
176 return !hyphen_offsets->empty();
177 }
178
179 } // namespace
180
181 namespace content {
182
183 Hyphenator::Hyphenator(base::PlatformFile file)
184 : dictionary_(NULL),
185 rule_file_(file),
186 result_(0) {
187 }
188
189 Hyphenator::~Hyphenator() {
190 if (dictionary_)
191 hnj_hyphen_free(dictionary_);
192 }
193
194 bool Hyphenator::Initialize() {
195 if (dictionary_)
196 return true;
197
198 rule_map_.reset(new file_util::MemoryMappedFile);
199 if (!rule_map_->Initialize(rule_file_))
200 return false;
201
202 dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());
203 return !!dictionary_;
204 }
205
206 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,
207 size_t before_index) {
208 if (!dictionary_ || word.empty())
209 return 0;
210
211 // Call the hyphen library to get all hyphenation points, i.e. positions where
212 // we can insert hyphens. When WebKit finds a line-break, it calls this
213 // function twice or more with the same word to find the best hyphenation
214 // point. To avoid calling the hyphen library twice or more with the same
215 // word, we cache the last query.
216 if (word_ != word) {
217 word_ = word;
218 Query query(word);
219 result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);
220 }
221 if (!result_)
222 return 0;
223 for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin();
224 it != hyphen_offsets_.rend(); ++it) {
225 if (static_cast<size_t>(*it) < before_index)
226 return *it;
227 }
228 return 0;
229 }
230
231 } // namespace content
OLDNEW
« no previous file with comments | « content/renderer/hyphenator/hyphenator.h ('k') | content/renderer/hyphenator/hyphenator_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698