Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(486)

Unified Diff: content/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « content/renderer/hyphenator/hyphenator.h ('k') | content/renderer/hyphenator/hyphenator_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: content/renderer/hyphenator/hyphenator.cc
===================================================================
--- content/renderer/hyphenator/hyphenator.cc (revision 0)
+++ content/renderer/hyphenator/hyphenator.cc (revision 0)
@@ -0,0 +1,231 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "content/renderer/hyphenator/hyphenator.h"
+
+#include "base/file_util.h"
+#include "base/logging.h"
+#include "base/memory/scoped_ptr.h"
+#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
+#include "third_party/hyphen/hyphen.h"
+#include "unicode/uscript.h"
+
+namespace {
+
+// A class that consumes a UTF-8 string byte by byte and keeps only the length
+// of the UTF-16 text it would convert to. It is used for building a mapping
+// from positions in a UTF-8 string to positions in the equivalent UTF-16
+// string without unnecessary conversions. The following snippet produces the
+// same mapping, but it converts the same characters over and over again; this
+// class counts the converted UTF-16 characters incrementally to avoid that
+// problem.
+//
+//   scoped_array<size_t> position(new size_t[text.length()]);
+//   for (size_t i = 0; i < text.length(); ++i)
+//     position[i] = UTF8ToUTF16(text.substr(0, i)).length();
+//
+class UTF16TextLength {
+ public:
+ UTF16TextLength();
+ ~UTF16TextLength();
+
+ // Returns the length, in UTF-16 code units, of the text converted so far.
+ int utf16_length() const { return utf16_length_; }
+
+ // Appends one byte of UTF-8 text to this converter. The converted position
+ // only advances once the bytes received so far complete a character: by one
+ // when it finishes reading a BMP character and by two when it finishes
+ // reading a non-BMP character.
+ void Append(char c);
+
+ private:
+ // The length of the converted UTF-16 text.
+ int utf16_length_;
+
+ // Bytes of the UTF-8 character currently being read; cleared once decoded.
+ std::string utf8_text_;
+
+ DISALLOW_COPY_AND_ASSIGN(UTF16TextLength);
+};
+
+UTF16TextLength::UTF16TextLength()
+ : utf16_length_(0) {  // Nothing has been converted yet.
+}
+
+UTF16TextLength::~UTF16TextLength() {  // Empty: nothing is released by hand.
+}
+
+void UTF16TextLength::Append(char c) {
+  // Buffer the new byte and try to decode one code point from everything
+  // buffered so far. If U8_NEXT() reports U_SENTINEL the bytes are kept for
+  // the next call; otherwise the buffer is emptied and the length grows by
+  // the number of UTF-16 units that the code point needs.
+  utf8_text_.push_back(c);
+  const int buffered = static_cast<int>(utf8_text_.length());
+  int offset = 0;
+  int code_point = 0;
+  U8_NEXT(utf8_text_.data(), offset, buffered, code_point);
+  if (code_point == U_SENTINEL)
+    return;
+  utf8_text_.clear();
+  utf16_length_ += U16_LENGTH(code_point);
+}
+
+// A class that encapsulates a hyphenation query. This class owns the resources
+// temporarily needed for hyphenating one word and releases them when it is
+// destroyed. It is used as listed in the following snippet.
+//
+//   std::vector<int> hyphens;
+//   Query query(UTF8ToUTF16("hyphenate"));
+//   query.Hyphenate(dict, &hyphens);
+//
+class Query {
+ public:
+ explicit Query(const string16& word);
+ ~Query();
+
+ // Hyphenates the word provided to the constructor with the specified
+ // dictionary and writes its hyphenation points, positions where we can insert
+ // hyphens, to |hyphen_offsets|. Returns true if any point was found.
+ bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);
+
+ private:
+ // The UTF-8 form of the word to be hyphenated.
+ std::string word_utf8_;
+
+ // Return variables from the hyphen library, filled in by Hyphenate().
+ scoped_array<char> hyphen_vector_;  // Allocated by the constructor.
+ char** rep_;  // Freed by the destructor, like |pos_| and |cut_|.
+ int* pos_;
+ int* cut_;
+
+ DISALLOW_COPY_AND_ASSIGN(Query);
+};
+
+Query::Query(const string16& word)
+    : rep_(NULL),
+      pos_(NULL),
+      cut_(NULL) {
+  // WebKit leaves trailing punctuation attached to the words it hyphenates,
+  // and those characters keep the hyphen library from applying some of its
+  // rules. Walk back from the end of |word| and drop every code point whose
+  // script is USCRIPT_COMMON; only the remaining prefix is converted to UTF-8.
+  DCHECK(!word.empty());
+  const char16* const text = word.data();
+  int kept_length = static_cast<int>(word.length());
+  for (int cursor = kept_length; kept_length > 0; kept_length = cursor) {
+    int code_point = 0;
+    U16_PREV(text, 0, cursor, code_point);
+    UErrorCode status = U_ZERO_ERROR;
+    if (uscript_getScript(code_point, &status) != USCRIPT_COMMON)
+      break;
+  }
+  UTF16ToUTF8(word.c_str(), kept_length, &word_utf8_);
+
+  // hnj_hyphen_hyphenate2() writes its result into a caller-provided hyphen
+  // vector. Allocate |word_utf8_.length()| + 5 bytes for it, as written in
+  // Line 112 of <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.
+  const size_t vector_size = word_utf8_.length() + 5;
+  hyphen_vector_.reset(new char[vector_size]);
+}
+
+Query::~Query() {
+  // |rep_|, |pos_| and |cut_| are handed to hnj_hyphen_hyphenate2() as output
+  // pointers and are released here with free(). free() ignores NULL, so
+  // entries and arrays that were never set need no separate checks.
+  if (rep_) {
+    const size_t entries = word_utf8_.length();
+    for (size_t i = 0; i < entries; ++i)
+      free(rep_[i]);
+    free(rep_);
+  }
+  free(pos_);
+  free(cut_);
+}
+
+bool Query::Hyphenate(HyphenDict* dictionary,
+                      std::vector<int>* hyphen_offsets) {
+  DCHECK(dictionary);
+  DCHECK(hyphen_offsets);
+
+  // Reset the output up front so no return path leaves offsets of an earlier
+  // query in it. If the constructor stripped the whole word (it consisted of
+  // trailing punctuation only) there is nothing to ask the hyphen library.
+  hyphen_offsets->clear();
+  if (word_utf8_.empty())
+    return false;
+
+  const int error_code = hnj_hyphen_hyphenate2(
+      dictionary, word_utf8_.data(), static_cast<int>(word_utf8_.length()),
+      hyphen_vector_.get(), NULL, &rep_, &pos_, &cut_);
+  if (error_code)
+    return false;
+
+  // WebKit needs hyphenation points counted in UTF-16 characters, while the
+  // hyphen library returns them counted in UTF-8 characters. Replay the word
+  // through |text_length| and, for every entry of |hyphen_vector_| whose
+  // lowest bit is set, record the UTF-16 length converted up to that point.
+  UTF16TextLength text_length;
+  for (size_t i = 0; i < word_utf8_.length(); ++i) {
+    text_length.Append(word_utf8_[i]);
+    if (hyphen_vector_[i] & 1)
+      hyphen_offsets->push_back(text_length.utf16_length());
+  }
+  return !hyphen_offsets->empty();  // False when no point was found.
+}
+
+} // namespace
+
+namespace content {
+
+Hyphenator::Hyphenator(base::PlatformFile file)
+ : dictionary_(NULL),  // Loaded later by Initialize().
+ rule_file_(file),  // The file that Initialize() maps into memory.
+ result_(0) {  // No word has been hyphenated yet.
+}
+
+Hyphenator::~Hyphenator() {
+ if (dictionary_)  // Only non-NULL after Initialize() loaded a dictionary.
+ hnj_hyphen_free(dictionary_);
+}
+
+bool Hyphenator::Initialize() {
+  // An already loaded dictionary is reused; loading is attempted otherwise.
+  if (!dictionary_) {
+    // Map the rule file into memory and let the hyphen library load from it.
+    rule_map_.reset(new file_util::MemoryMappedFile);
+    if (!rule_map_->Initialize(rule_file_))
+      return false;
+    dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());
+  }
+  return dictionary_ != NULL;
+}
+
+size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,
+                                             size_t before_index) {
+  if (word.empty() || !dictionary_)
+    return 0;
+
+  // When WebKit finds a line break it calls this function two or more times
+  // with the same word to find the best hyphenation point. The hyphenation
+  // points of the last word are therefore kept in |hyphen_offsets_|, and the
+  // hyphen library is only called again when a different word arrives.
+  if (word_ != word) {
+    Query query(word);
+    result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);
+    word_ = word;
+  }
+
+  // Hyphenate() appends the offsets in ascending order, so walking them from
+  // the back finds the last hyphenation point in front of |before_index|.
+  for (size_t i = hyphen_offsets_.size(); result_ && i > 0; --i) {
+    const size_t offset = static_cast<size_t>(hyphen_offsets_[i - 1]);
+    if (offset < before_index)
+      return offset;
+  }
+  return 0;
+}
+
+} // namespace content
Property changes on: content\renderer\hyphenator\hyphenator.cc
___________________________________________________________________
Added: svn:eol-style
+ LF
« no previous file with comments | « content/renderer/hyphenator/hyphenator.h ('k') | content/renderer/hyphenator/hyphenator_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698