content/renderer/hyphenator/hyphenator.cc - Issue 20860003: Remove hyphenation code from Chromium.

Side by Side Diff: content/renderer/hyphenator/hyphenator.cc

Issue 20860003: Remove hyphenation code from Chromium. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: rebase Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "content/renderer/hyphenator/hyphenator.h"

6

7 #include "base/files/memory_mapped_file.h"

8 #include "base/logging.h"

9 #include "base/memory/scoped_ptr.h"

10 #include "base/strings/string_util.h"

11 #include "base/strings/utf_string_conversions.h"

12 #include "content/common/hyphenator_messages.h"

13 #include "content/public/renderer/render_thread.h"

14 #include "third_party/hyphen/hyphen.h"

15 #include "third_party/icu/source/common/unicode/uscript.h"

16

17 namespace {

18

19 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds

20 // only the length of converted UTF-16 characters. This class is used for

21 // creating a mapping from the position of a UTF-8 string to a position of a

22 // UTF-16 string without unnecessary conversions. Even though the following

23 // snippet produces the same mapping, it needs to convert same characters many

24 // times. This class incrementally counts the number of converted UTF-16

25 // characters to avoid this problem.

26 //

27 // scoped_ptr<size_t[]> position(new size_t[text.length()]);

28 // for (size_t i = 0; i < text.length(); ++i)

29 // position[i] = UTF8ToUTF16(text.substr(0, i)).length();

30 //

31 class UTF16TextLength {

32 public:

33 UTF16TextLength();

34 ~UTF16TextLength();

35

36 // Returns the current position.

37 int utf16_length() const { return utf16_length_; }

38

39 // Appends one UTF-8 character to this converter and advances the converted

40 // position. This converter increases the position by one when it finishes

41 // reading a BMP character and increases by two when it finish reading a

42 // non-BMP character.

43 void Append(char c);

44

45 private:

46 // The length of the converted UTF-16 text.

47 int utf16_length_;

48

49 // The buffer that stores UTF-8 characters being converted.

50 std::string utf8_text_;

51

52 DISALLOW_COPY_AND_ASSIGN(UTF16TextLength);

53 };

54

55 UTF16TextLength::UTF16TextLength()

56 : utf16_length_(0) {

57 }

58

59 UTF16TextLength::~UTF16TextLength() {

60 }

61

62 void UTF16TextLength::Append(char c) {

63 // Append the given character and try converting the UTF-8 characters in this

64 // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint,

65 // get the number of UTF-16 characters representing this codepoint and advance

66 // the position.

67 int code = 0;

68 int index = 0;

69 utf8_text_.push_back(c);

70 U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()),

71 code);

72 if (code != U_SENTINEL) {

73 utf8_text_.clear();

74 utf16_length_ += U16_LENGTH(code);

75 }

76 }

77

78 // A class that encapsulates a hyphenation query. This class owns resources

79 // temporarily needed for hyphenating one word, and deletes them when it is

80 // deleted as listed in the following snippet.

81 //

82 // std::vector<int> hyphens;

83 // QUery query(UTF8ToUTF16("hyphenate"));

84 // query.Hyphenate(dict, &hyphens);

85 //

86 class Query {

87 public:

88 explicit Query(const string16& word);

89 ~Query();

90

91 // Hyphenates a word with the specified dictionary. This function hyphenates

92 // the word provided to its constructor and returns a list of hyphenation

93 // points, positions where we can insert hyphens.

94 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);

95

96 private:

97 // A word to be hyphenated.

98 std::string word_utf8_;

99

100 // Return variables from the hyphen library.

101 scoped_ptr<char[]> hyphen_vector_;

102 char** rep_;

103 int* pos_;

104 int* cut_;

105

106 DISALLOW_COPY_AND_ASSIGN(Query);

107 };

108

109 Query::Query(const string16& word)

110 : rep_(NULL),

111 pos_(NULL),

112 cut_(NULL) {

113 // Remove trailing punctuation characters. WebKit does not remove these

114 // characters when it hyphenates a word. These characters prevent the hyphen

115 // library from applying some rules, i.e. they prevent the library from adding

116 // hyphens.

117 DCHECK(!word.empty());

118 const char16* data = word.data();

119 int length = static_cast<int>(word.length());

120 while (length > 0) {

121 int previous = length;

122 int code = 0;

123 U16_PREV(data, 0, previous, code);

124 UErrorCode error = U_ZERO_ERROR;

125 if (uscript_getScript(code, &error) != USCRIPT_COMMON)

126 break;

127 length = previous;

128 }

129 UTF16ToUTF8(word.c_str(), length, &word_utf8_);

130 // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a

131 // buffer of \|word_.length()\| + 5 as written in Line 112 of

132 // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.

133 hyphen_vector_.reset(new char[word_utf8_.length() + 5]);

134 }

135

136 Query::~Query() {

137 if (rep_) {

138 for (size_t i = 0; i < word_utf8_.length(); ++i) {

139 if (rep_[i])

140 free(rep_[i]);

141 }

142 free(rep_);

143 }

144 if (pos_)

145 free(pos_);

146 if (cut_)

147 free(cut_);

148 }

149

150 bool Query::Hyphenate(HyphenDict* dictionary,

151 std::vector<int>* hyphen_offsets) {

152 DCHECK(dictionary);

153 DCHECK(hyphen_offsets);

154

155 int error_code = hnj_hyphen_hyphenate2(dictionary,

156 word_utf8_.data(),

157 static_cast<int>(word_utf8_.length()),

158 hyphen_vector_.get(),

159 NULL,

160 &rep_,

161 &pos_,

162 &cut_);

163 if (error_code)

164 return false;

165

166 // WebKit needs hyphenation points counted in UTF-16 characters. On the other

167 // hand, the hyphen library returns hyphenation points counted in UTF-8

168 // characters. We increamentally convert hyphenation points in UTF-8

169 // characters to hyphenation points in UTF-16 characters and write the

170 // converted hyphenation points to the output vector.

171 UTF16TextLength text_length;

172 hyphen_offsets->clear();

173 for (size_t i = 0; i < word_utf8_.length(); ++i) {

174 text_length.Append(word_utf8_[i]);

175 if (hyphen_vector_[i] & 1)

176 hyphen_offsets->push_back(text_length.utf16_length());

177 }

178 return !hyphen_offsets->empty();

179 }

180

181 } // namespace

182

183 namespace content {

184

185 Hyphenator::Hyphenator(base::PlatformFile file)

186 : dictionary_(NULL),

187 dictionary_file_(base::FdopenPlatformFile(file, "r")),

188 result_(0) {

189 }

190

191 Hyphenator::~Hyphenator() {

192 if (dictionary_)

193 hnj_hyphen_free(dictionary_);

194 }

195

196 bool Hyphenator::Initialize() {

197 if (dictionary_)

198 return true;

199

200 if (!dictionary_file_.get())

201 return false;

202 dictionary_ = hnj_hyphen_load_file(dictionary_file_.get());

203 return !!dictionary_;

204 }

205

206 bool Hyphenator::Attach(RenderThread* thread, const string16& locale) {

207 if (!thread)

208 return false;

209 locale_.assign(locale);

210 thread->AddObserver(this);

211 return thread->Send(new HyphenatorHostMsg_OpenDictionary(locale));

212 }

213

214 bool Hyphenator::CanHyphenate(const string16& locale) {

215 return !locale_.compare(locale);

216 }

217

218 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,

219 size_t before_index) {

220 if (!Initialize() \|\| word.empty())

221 return 0;

222

223 // Call the hyphen library to get all hyphenation points, i.e. positions where

224 // we can insert hyphens. When WebKit finds a line-break, it calls this

225 // function twice or more with the same word to find the best hyphenation

226 // point. To avoid calling the hyphen library twice or more with the same

227 // word, we cache the last query.

228 if (word_ != word) {

229 word_ = word;

230 Query query(word);

231 result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);

232 }

233 if (!result_)

234 return 0;

235 for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin();

236 it != hyphen_offsets_.rend(); ++it) {

237 if (static_cast<size_t>(*it) < before_index)

238 return *it;

239 }

240 return 0;

241 }

242

243 bool Hyphenator::OnControlMessageReceived(const IPC::Message& message) {

244 bool handled = true;

245 IPC_BEGIN_MESSAGE_MAP(Hyphenator, message)

246 IPC_MESSAGE_HANDLER(HyphenatorMsg_SetDictionary, OnSetDictionary)

247 IPC_MESSAGE_UNHANDLED(handled = false)

248 IPC_END_MESSAGE_MAP()

249 return handled;

250 }

251

252 void Hyphenator::OnSetDictionary(IPC::PlatformFileForTransit file) {

253 base::PlatformFile rule_file =

254 IPC::PlatformFileForTransitToPlatformFile(file);

255 if (rule_file == base::kInvalidPlatformFileValue)

256 return;

257 // Delete the current dictionary and save the given file to this object. We

258 // initialize the hyphen library the first time when WebKit actually

259 // hyphenates a word, i.e. when WebKit calls the ComputeLastHyphenLocation

260 // function. (WebKit does not always hyphenate words even when it calls the

261 // CanHyphenate function, e.g. WebKit does not have to hyphenate words when it

262 // does not have to break text into lines.)

263 if (dictionary_) {

264 hnj_hyphen_free(dictionary_);

265 dictionary_ = NULL;

266 }

267 dictionary_file_.Set(base::FdopenPlatformFile(rule_file, "r"));

268 }

269

270 } // namespace content

OLD	NEW

« no previous file with comments | « content/renderer/hyphenator/hyphenator.h ('k') | content/renderer/hyphenator/hyphenator_unittest.cc » ('j') | no next file with comments »