OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "content/renderer/hyphenator/hyphenator.h" |
| 6 |
| 7 #include "base/file_util.h" |
| 8 #include "base/logging.h" |
| 9 #include "base/memory/scoped_ptr.h" |
| 10 #include "base/string_util.h" |
| 11 #include "base/utf_string_conversions.h" |
| 12 #include "third_party/hyphen/hyphen.h" |
| 13 #include "unicode/uscript.h" |
| 14 |
| 15 namespace { |
| 16 |
| 17 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds |
| 18 // only the length of converted UTF-16 characters. This class is used for |
| 19 // creating a mapping from the position of a UTF-8 string to a position of a |
| 20 // UTF-16 string without unnecessary conversions. Even though the following |
| 21 // snippet produces the same mapping, it needs to convert the same characters many |
| 22 // times. This class incrementally counts the number of converted UTF-16 |
| 23 // characters to avoid this problem. |
| 24 // |
| 25 // scoped_array<size_t> position(new size_t[text.length()]); |
| 26 // for (size_t i = 0; i < text.length(); ++i) |
| 27 // position[i] = UTF8ToUTF16(text.substr(0, i)).length(); |
| 28 // |
| 29 class UTF16TextLength { |
| 30 public: |
| 31 UTF16TextLength(); |
| 32 ~UTF16TextLength(); |
| 33 |
| 34 // Returns the current position. |
| 35 int utf16_length() const { return utf16_length_; } |
| 36 |
| 37 // Appends one UTF-8 character to this converter and advances the converted |
| 38 // position. This converter increases the position by one when it finishes |
| 39 // reading a BMP character and increases by two when it finish reading a |
| 40 // non-BMP character. |
| 41 void Append(char c); |
| 42 |
| 43 private: |
| 44 // The length of the converted UTF-16 text. |
| 45 int utf16_length_; |
| 46 |
| 47 // The buffer that stores UTF-8 characters being converted. |
| 48 std::string utf8_text_; |
| 49 |
| 50 DISALLOW_COPY_AND_ASSIGN(UTF16TextLength); |
| 51 }; |
| 52 |
| 53 UTF16TextLength::UTF16TextLength() |
| 54 : utf16_length_(0) { |
| 55 } |
| 56 |
| 57 UTF16TextLength::~UTF16TextLength() { |
| 58 } |
| 59 |
| 60 void UTF16TextLength::Append(char c) { |
| 61 // Append the given character and try converting the UTF-8 characters in this |
| 62 // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint, |
| 63 // get the number of UTF-16 characters representing this codepoint and advance |
| 64 // the position. |
| 65 int code = 0; |
| 66 int index = 0; |
| 67 utf8_text_.push_back(c); |
| 68 U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()), |
| 69 code); |
| 70 if (code != U_SENTINEL) { |
| 71 utf8_text_.clear(); |
| 72 utf16_length_ += U16_LENGTH(code); |
| 73 } |
| 74 } |
| 75 |
| 76 // A class that encapsulates a hyphenation query. This class owns resources |
| 77 // temporarily needed for hyphenating one word, and deletes them when it is |
| 78 // deleted as listed in the following snippet. |
| 79 // |
| 80 // std::vector<int> hyphens; |
| 81 // Query query(UTF8ToUTF16("hyphenate")); |
| 82 // query.Hyphenate(dict, &hyphens); |
| 83 // |
| 84 class Query { |
| 85 public: |
| 86 explicit Query(const string16& word); |
| 87 ~Query(); |
| 88 |
| 89 // Hyphenates a word with the specified dictionary. This function hyphenates |
| 90 // the word provided to its constructor and returns a list of hyphenation |
| 91 // points, positions where we can insert hyphens. |
| 92 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets); |
| 93 |
| 94 private: |
| 95 // A word to be hyphenated. |
| 96 std::string word_utf8_; |
| 97 |
| 98 // Return variables from the hyphen library. |
| 99 scoped_array<char> hyphen_vector_; |
| 100 char** rep_; |
| 101 int* pos_; |
| 102 int* cut_; |
| 103 |
| 104 DISALLOW_COPY_AND_ASSIGN(Query); |
| 105 }; |
| 106 |
| 107 Query::Query(const string16& word) |
| 108 : rep_(NULL), |
| 109 pos_(NULL), |
| 110 cut_(NULL) { |
| 111 // Remove trailing punctuation characters. WebKit does not remove these |
| 112 // characters when it hyphenates a word. These characters prevent the hyphen |
| 113 // library from applying some rules, i.e. they prevent the library from adding |
| 114 // hyphens. |
| 115 DCHECK(!word.empty()); |
| 116 const char16* data = word.data(); |
| 117 int length = static_cast<int>(word.length()); |
| 118 while (length > 0) { |
| 119 int previous = length; |
| 120 int code = 0; |
| 121 U16_PREV(data, 0, previous, code); |
| 122 UErrorCode error = U_ZERO_ERROR; |
| 123 if (uscript_getScript(code, &error) != USCRIPT_COMMON) |
| 124 break; |
| 125 length = previous; |
| 126 } |
| 127 UTF16ToUTF8(word.c_str(), length, &word_utf8_); |
| 128 // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a |
| 129 // buffer of |word_.length()| + 5 as written in Line 112 of |
| 130 // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>. |
| 131 hyphen_vector_.reset(new char[word_utf8_.length() + 5]); |
| 132 } |
| 133 |
| 134 Query::~Query() { |
| 135 if (rep_) { |
| 136 for (size_t i = 0; i < word_utf8_.length(); ++i) { |
| 137 if (rep_[i]) |
| 138 free(rep_[i]); |
| 139 } |
| 140 free(rep_); |
| 141 } |
| 142 if (pos_) |
| 143 free(pos_); |
| 144 if (cut_) |
| 145 free(cut_); |
| 146 } |
| 147 |
| 148 bool Query::Hyphenate(HyphenDict* dictionary, |
| 149 std::vector<int>* hyphen_offsets) { |
| 150 DCHECK(dictionary); |
| 151 DCHECK(hyphen_offsets); |
| 152 |
| 153 int error_code = hnj_hyphen_hyphenate2(dictionary, |
| 154 word_utf8_.data(), |
| 155 static_cast<int>(word_utf8_.length()), |
| 156 hyphen_vector_.get(), |
| 157 NULL, |
| 158 &rep_, |
| 159 &pos_, |
| 160 &cut_); |
| 161 if (error_code) |
| 162 return false; |
| 163 |
| 164 // WebKit needs hyphenation points counted in UTF-16 characters. On the other |
| 165 // hand, the hyphen library returns hyphenation points counted in UTF-8 |
| 166 // characters. We increamentally convert hyphenation points in UTF-8 |
| 167 // characters to hyphenation points in UTF-16 characters and write the |
| 168 // converted hyphenation points to the output vector. |
| 169 UTF16TextLength text_length; |
| 170 hyphen_offsets->clear(); |
| 171 for (size_t i = 0; i < word_utf8_.length(); ++i) { |
| 172 text_length.Append(word_utf8_[i]); |
| 173 if (hyphen_vector_[i] & 1) |
| 174 hyphen_offsets->push_back(text_length.utf16_length()); |
| 175 } |
| 176 return !hyphen_offsets->empty(); |
| 177 } |
| 178 |
| 179 } // namespace |
| 180 |
| 181 namespace content { |
| 182 |
| 183 Hyphenator::Hyphenator(base::PlatformFile file) |
| 184 : dictionary_(NULL), |
| 185 rule_file_(file), |
| 186 result_(0) { |
| 187 } |
| 188 |
| 189 Hyphenator::~Hyphenator() { |
| 190 if (dictionary_) |
| 191 hnj_hyphen_free(dictionary_); |
| 192 } |
| 193 |
| 194 bool Hyphenator::Initialize() { |
| 195 if (dictionary_) |
| 196 return true; |
| 197 |
| 198 rule_map_.reset(new file_util::MemoryMappedFile); |
| 199 if (!rule_map_->Initialize(rule_file_)) |
| 200 return false; |
| 201 |
| 202 dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length()); |
| 203 return !!dictionary_; |
| 204 } |
| 205 |
| 206 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word, |
| 207 size_t before_index) { |
| 208 if (!dictionary_ || word.empty()) |
| 209 return 0; |
| 210 |
| 211 // Call the hyphen library to get all hyphenation points, i.e. positions where |
| 212 // we can insert hyphens. When WebKit finds a line-break, it calls this |
| 213 // function twice or more with the same word to find the best hyphenation |
| 214 // point. To avoid calling the hyphen library twice or more with the same |
| 215 // word, we cache the last query. |
| 216 if (word_ != word) { |
| 217 word_ = word; |
| 218 Query query(word); |
| 219 result_ = query.Hyphenate(dictionary_, &hyphen_offsets_); |
| 220 } |
| 221 if (!result_) |
| 222 return 0; |
| 223 for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin(); |
| 224 it != hyphen_offsets_.rend(); ++it) { |
| 225 if (static_cast<size_t>(*it) < before_index) |
| 226 return *it; |
| 227 } |
| 228 return 0; |
| 229 } |
| 230 |
| 231 } // namespace content |
OLD | NEW |