content/renderer/hyphenator/hyphenator.cc - Issue 9545017: Adds a hy-phen-ator.

Side by Side Diff: content/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "content/renderer/hyphenator/hyphenator.h"

	6

	7 #include "base/file_util.h"

	8 #include "base/logging.h"

	9 #include "base/memory/scoped_ptr.h"

	10 #include "base/string_util.h"

	11 #include "base/utf_string_conversions.h"

	12 #include "third_party/hyphen/hyphen.h"

	13 #include "unicode/uscript.h"

	14

	15 namespace {

	16

	17 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds

	18 // only the length of converted UTF-16 characters. This class is used for

	19 // creating a mapping from the position of a UTF-8 string to a position of a

	20 // UTF-16 string without unnecessary conversions. Even though the following

	21 // snippet produces the same mapping, it needs to convert the same characters many

	22 // times. This class incrementally counts the number of converted UTF-16

	23 // characters to avoid this problem.

	24 //

	25 // scoped_array<size_t> position(new size_t[text.length()]);

	26 // for (size_t i = 0; i < text.length(); ++i)

	27 // position[i] = UTF8ToUTF16(text.substr(0, i)).length();

	28 //

	29 class UTF16TextLength {

	30 public:

	31 UTF16TextLength();

	32 ~UTF16TextLength();

	33

	34 // Returns the current position.

	35 int utf16_length() const { return utf16_length_; }

	36

	37 // Appends one UTF-8 character to this converter and advances the converted

	38 // position. This converter increases the position by one when it finishes

	39 // reading a BMP character and increases by two when it finishes reading a

	40 // non-BMP character.

	41 void Append(char c);

	42

	43 private:

	44 // The length of the converted UTF-16 text.

	45 int utf16_length_;

	46

	47 // The buffer that stores UTF-8 characters being converted.

	48 std::string utf8_text_;

	49

	50 DISALLOW_COPY_AND_ASSIGN(UTF16TextLength);

	51 };

	52

	53 UTF16TextLength::UTF16TextLength()

	54 : utf16_length_(0) {

	55 }

	56

	57 UTF16TextLength::~UTF16TextLength() {

	58 }

	59

	60 void UTF16TextLength::Append(char c) {

	61 // Append the given character and try converting the UTF-8 characters in this

	62 // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint,

	63 // get the number of UTF-16 characters representing this codepoint and advance

	64 // the position.

	65 int code = 0;

	66 int index = 0;

	67 utf8_text_.push_back(c);

	68 U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()),

	69 code);

	70 if (code != U_SENTINEL) {

	71 utf8_text_.clear();

	72 utf16_length_ += U16_LENGTH(code);

	73 }

	74 }

	75

	76 // A class that encapsulates a hyphenation query. This class owns resources

	77 // temporarily needed for hyphenating one word, and deletes them when it is

	78 // destroyed, as shown in the following snippet.

	79 //

	80 // std::vector<int> hyphens;

	81 // Query query(UTF8ToUTF16("hyphenate"));

	82 // query.Hyphenate(dict, &hyphens);

	83 //

	84 class Query {

	85 public:

	86 explicit Query(const string16& word);

	87 ~Query();

	88

	89 // Hyphenates a word with the specified dictionary. This function hyphenates

	90 // the word provided to its constructor and returns a list of hyphenation

	91 // points, positions where we can insert hyphens.

	92 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);

	93

	94 private:

	95 // A word to be hyphenated.

	96 std::string word_utf8_;

	97

	98 // Return variables from the hyphen library.

	99 scoped_array<char> hyphen_vector_;

	100 char** rep_;

	101 int* pos_;

	102 int* cut_;

	103

	104 DISALLOW_COPY_AND_ASSIGN(Query);

	105 };

	106

	107 Query::Query(const string16& word)

	108 : rep_(NULL),

	109 pos_(NULL),

	110 cut_(NULL) {

	111 // Remove trailing punctuation characters. WebKit does not remove these

	112 // characters when it hyphenates a word. These characters prevent the hyphen

	113 // library from applying some rules, i.e. they prevent the library from adding

	114 // hyphens.

	115 DCHECK(!word.empty());

	116 const char16* data = word.data();

	117 int length = static_cast<int>(word.length());

	118 while (length > 0) {

	119 int previous = length;

	120 int code = 0;

	121 U16_PREV(data, 0, previous, code);

	122 UErrorCode error = U_ZERO_ERROR;

	123 if (uscript_getScript(code, &error) != USCRIPT_COMMON)

	124 break;

	125 length = previous;

	126 }

	127 UTF16ToUTF8(word.c_str(), length, &word_utf8_);

	128 // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a

	129 // buffer of |word_utf8_.length()| + 5 as written in Line 112 of

	130 // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.

	131 hyphen_vector_.reset(new char[word_utf8_.length() + 5]);

	132 }

	133

	134 Query::~Query() {

	135 if (rep_) {

	136 for (size_t i = 0; i < word_utf8_.length(); ++i) {

	137 if (rep_[i])

	138 free(rep_[i]);

	139 }

	140 free(rep_);

	141 }

	142 if (pos_)

	143 free(pos_);

	144 if (cut_)

	145 free(cut_);

	146 }

	147

	148 bool Query::Hyphenate(HyphenDict* dictionary,

	149 std::vector<int>* hyphen_offsets) {

	150 DCHECK(dictionary);

	151 DCHECK(hyphen_offsets);

	152

	153 int error_code = hnj_hyphen_hyphenate2(dictionary,

	154 word_utf8_.data(),

	155 static_cast<int>(word_utf8_.length()),

	156 hyphen_vector_.get(),

	157 NULL,

	158 &rep_,

	159 &pos_,

	160 &cut_);

	161 if (error_code)

	162 return false;

	163

	164 // WebKit needs hyphenation points counted in UTF-16 characters. On the other

	165 // hand, the hyphen library returns hyphenation points counted in UTF-8

	166 // characters. We incrementally convert hyphenation points in UTF-8

	167 // characters to hyphenation points in UTF-16 characters and write the

	168 // converted hyphenation points to the output vector.

	169 UTF16TextLength text_length;

	170 hyphen_offsets->clear();

	171 for (size_t i = 0; i < word_utf8_.length(); ++i) {

	172 text_length.Append(word_utf8_[i]);

	173 if (hyphen_vector_[i] & 1)

	174 hyphen_offsets->push_back(text_length.utf16_length());

	175 }

	176 return !hyphen_offsets->empty();

	177 }

	178

	179 } // namespace

	180

	181 namespace content {

	182

	183 Hyphenator::Hyphenator(base::PlatformFile file)

	184 : dictionary_(NULL),

	185 rule_file_(file),

	186 result_(0) {

	187 }

	188

	189 Hyphenator::~Hyphenator() {

	190 if (dictionary_)

	191 hnj_hyphen_free(dictionary_);

	192 }

	193

	194 bool Hyphenator::Initialize() {

	195 if (dictionary_)

	196 return true;

	197

	198 rule_map_.reset(new file_util::MemoryMappedFile);

	199 if (!rule_map_->Initialize(rule_file_))

	200 return false;

	201

	202 dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());

	203 return !!dictionary_;

	204 }

	205

	206 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,

	207 size_t before_index) {

	208 if (!dictionary_ || word.empty())

	209 return 0;

	210

	211 // Call the hyphen library to get all hyphenation points, i.e. positions where

	212 // we can insert hyphens. When WebKit finds a line-break, it calls this

	213 // function twice or more with the same word to find the best hyphenation

	214 // point. To avoid calling the hyphen library twice or more with the same

	215 // word, we cache the last query.

	216 if (word_ != word) {

	217 word_ = word;

	218 Query query(word);

	219 result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);

	220 }

	221 if (!result_)

	222 return 0;

	223 for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin();

	224 it != hyphen_offsets_.rend(); ++it) {

	225 if (static_cast<size_t>(*it) < before_index)

	226 return *it;

	227 }

	228 return 0;

	229 }

	230

	231 } // namespace content

OLD	NEW

« no previous file with comments | « content/renderer/hyphenator/hyphenator.h ('k') | content/renderer/hyphenator/hyphenator_unittest.cc » ('j') | no next file with comments »