third_party/harfbuzz-ng/src/hb-ot-shape-normalize.cc - Issue 10915172: harfbuzz-ng roll

Unified Diff: third_party/harfbuzz-ng/src/hb-ot-shape-normalize.cc

Issue 10915172: harfbuzz-ng roll (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « third_party/harfbuzz-ng/src/hb-ot-shape-fallback-private.hh ('k') | third_party/harfbuzz-ng/src/hb-ot-shape-normalize-private.hh » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/harfbuzz-ng/src/hb-ot-shape-normalize.cc

diff --git a/third_party/harfbuzz-ng/src/hb-ot-shape-normalize.cc b/third_party/harfbuzz-ng/src/hb-ot-shape-normalize.cc

index 562ba881f47900c912a85a03bfc2e34f4d057877..18a3f3f441cf87248adbc23c6a647650899bdf92 100644

--- a/third_party/harfbuzz-ng/src/hb-ot-shape-normalize.cc

+++ b/third_party/harfbuzz-ng/src/hb-ot-shape-normalize.cc

@@ -62,96 +62,364 @@

* knowledge too. We need to provide assistance to the itemizer.

* - When a font does not support a character but supports its decomposition,

- * well, use the decomposition.

+ * well, use the decomposition (preferring the canonical decomposition, but

+ * falling back to the compatibility decomposition if necessary). The

+ * compatibility decomposition is really nice to have, for characters like

+ * ellipsis, or various-sized space characters.

- * - The Indic shaper requests decomposed output. This will handle splitting

- * matra for the Indic shaper.

+ * - The complex shapers can customize the compose and decompose functions to

+ * offload some of their requirements to the normalizer. For example, the

+ * Indic shaper may want to disallow recomposing of two matras.

+ *

+ * - We try compatibility decomposition if decomposing through canonical

+ * decomposition alone failed to find a sequence that the font supports.

+ * We don't try compatibility decomposition recursively during the canonical

+ * decomposition phase. This has minimal impact. There are only a handful

+ * of Greek letters that have canonical decompositions that include characters

+ * with compatibility decomposition. Those can be found using this command:

+ *

+ * egrep "`echo -n ';('; grep ';<' UnicodeData.txt | cut -d';' -f1 | tr '\n' '|'; echo ') '`" UnicodeData.txt

-static void

-output_glyph (hb_buffer_t *buffer, hb_codepoint_t glyph)

+static hb_bool_t

+decompose_func (hb_unicode_funcs_t *unicode,

+ hb_codepoint_t ab,

+ hb_codepoint_t *a,

+ hb_codepoint_t *b)

+ /* XXX FIXME, move these to complex shapers and propagate to normalizer.*/

+ switch (ab) {

+ case 0x0AC9 : return false;

+ case 0x0931 : return false;

+ case 0x0B94 : return false;

+ /* These ones have Unicode decompositions, but we do it

+ * this way to be close to what Uniscribe does. */

+ case 0x0DDA : *a = 0x0DD9; *b= 0x0DDA; return true;

+ case 0x0DDC : *a = 0x0DD9; *b= 0x0DDC; return true;

+ case 0x0DDD : *a = 0x0DD9; *b= 0x0DDD; return true;

+ case 0x0DDE : *a = 0x0DD9; *b= 0x0DDE; return true;

+ case 0x0F77 : *a = 0x0FB2; *b= 0x0F81; return true;

+ case 0x0F79 : *a = 0x0FB3; *b= 0x0F81; return true;

+ case 0x17BE : *a = 0x17C1; *b= 0x17BE; return true;

+ case 0x17BF : *a = 0x17C1; *b= 0x17BF; return true;

+ case 0x17C0 : *a = 0x17C1; *b= 0x17C0; return true;

+ case 0x17C4 : *a = 0x17C1; *b= 0x17C4; return true;

+ case 0x17C5 : *a = 0x17C1; *b= 0x17C5; return true;

+ case 0x1925 : *a = 0x1920; *b= 0x1923; return true;

+ case 0x1926 : *a = 0x1920; *b= 0x1924; return true;

+ case 0x1B3C : *a = 0x1B42; *b= 0x1B3C; return true;

+ case 0x1112E : *a = 0x11127; *b= 0x11131; return true;

+ case 0x1112F : *a = 0x11127; *b= 0x11132; return true;

+#if 0

+ case 0x0B57 : *a = 0xno decomp, -> RIGHT; return true;

+ case 0x1C29 : *a = 0xno decomp, -> LEFT; return true;

+ case 0xA9C0 : *a = 0xno decomp, -> RIGHT; return true;

+ case 0x111BF : *a = 0xno decomp, -> ABOVE; return true;

+#endif

+ }

+ return unicode->decompose (ab, a, b);

+static hb_bool_t

+compose_func (hb_unicode_funcs_t *unicode,

+ hb_codepoint_t a,

+ hb_codepoint_t b,

+ hb_codepoint_t *ab)

+ /* XXX, this belongs to indic normalizer. */

+ if (HB_UNICODE_GENERAL_CATEGORY_IS_MARK (unicode->general_category (a)))

+ return false;

+ /* XXX, add composition-exclusion exceptions to Indic shaper. */

+ if (a == 0x09AF && b == 0x09BC) { *ab = 0x09DF; return true; }

+ /* XXX, these belong to the Hebrew / default shaper. */

+ /* Hebrew presentation-form shaping.

+ * https://bugzilla.mozilla.org/show_bug.cgi?id=728866 */

+ // Hebrew presentation forms with dagesh, for characters 0x05D0..0x05EA;

+ // note that some letters do not have a dagesh presForm encoded

+ static const hb_codepoint_t sDageshForms[0x05EA - 0x05D0 + 1] = {

+ 0xFB30, // ALEF

+ 0xFB31, // BET

+ 0xFB32, // GIMEL

+ 0xFB33, // DALET

+ 0xFB34, // HE

+ 0xFB35, // VAV

+ 0xFB36, // ZAYIN

+ 0, // HET

+ 0xFB38, // TET

+ 0xFB39, // YOD

+ 0xFB3A, // FINAL KAF

+ 0xFB3B, // KAF

+ 0xFB3C, // LAMED

+ 0, // FINAL MEM

+ 0xFB3E, // MEM

+ 0, // FINAL NUN

+ 0xFB40, // NUN

+ 0xFB41, // SAMEKH

+ 0, // AYIN

+ 0xFB43, // FINAL PE

+ 0xFB44, // PE

+ 0, // FINAL TSADI

+ 0xFB46, // TSADI

+ 0xFB47, // QOF

+ 0xFB48, // RESH

+ 0xFB49, // SHIN

+ 0xFB4A // TAV

+ };

+ hb_bool_t found = unicode->compose (a, b, ab);

+ if (!found && (b & ~0x7F) == 0x0580) {

+ // special-case Hebrew presentation forms that are excluded from

+ // standard normalization, but wanted for old fonts

+ switch (b) {

+ case 0x05B4: // HIRIQ

+ if (a == 0x05D9) { // YOD

+ *ab = 0xFB1D;

+ found = true;

+ }

+ break;

+ case 0x05B7: // patah

+ if (a == 0x05F2) { // YIDDISH YOD YOD

+ *ab = 0xFB1F;

+ found = true;

+ } else if (a == 0x05D0) { // ALEF

+ *ab = 0xFB2E;

+ found = true;

+ }

+ break;

+ case 0x05B8: // QAMATS

+ if (a == 0x05D0) { // ALEF

+ *ab = 0xFB2F;

+ found = true;

+ }

+ break;

+ case 0x05B9: // HOLAM

+ if (a == 0x05D5) { // VAV

+ *ab = 0xFB4B;

+ found = true;

+ }

+ break;

+ case 0x05BC: // DAGESH

+ if (a >= 0x05D0 && a <= 0x05EA) {

+ *ab = sDageshForms[a - 0x05D0];

+ found = (*ab != 0);

+ } else if (a == 0xFB2A) { // SHIN WITH SHIN DOT

+ *ab = 0xFB2C;

+ found = true;

+ } else if (a == 0xFB2B) { // SHIN WITH SIN DOT

+ *ab = 0xFB2D;

+ found = true;

+ }

+ break;

+ case 0x05BF: // RAFE

+ switch (a) {

+ case 0x05D1: // BET

+ *ab = 0xFB4C;

+ found = true;

+ break;

+ case 0x05DB: // KAF

+ *ab = 0xFB4D;

+ found = true;

+ break;

+ case 0x05E4: // PE

+ *ab = 0xFB4E;

+ found = true;

+ break;

+ }

+ break;

+ case 0x05C1: // SHIN DOT

+ if (a == 0x05E9) { // SHIN

+ *ab = 0xFB2A;

+ found = true;

+ } else if (a == 0xFB49) { // SHIN WITH DAGESH

+ *ab = 0xFB2C;

+ found = true;

+ }

+ break;

+ case 0x05C2: // SIN DOT

+ if (a == 0x05E9) { // SHIN

+ *ab = 0xFB2B;

+ found = true;

+ } else if (a == 0xFB49) { // SHIN WITH DAGESH

+ *ab = 0xFB2D;

+ found = true;

+ }

+ break;

+ }

+ return found;

+static inline void

+set_glyph (hb_glyph_info_t &info, hb_font_t *font)

+ font->get_glyph (info.codepoint, 0, &info.glyph_index());

+static inline void

+output_char (hb_buffer_t *buffer, hb_codepoint_t unichar, hb_codepoint_t glyph)

{

- buffer->output_glyph (glyph);

+ buffer->cur().glyph_index() = glyph;

+ buffer->output_glyph (unichar);

_hb_glyph_info_set_unicode_props (&buffer->prev(), buffer->unicode);

}

-static bool

-decompose (hb_font_t *font, hb_buffer_t *buffer,

- bool shortest,

- hb_codepoint_t ab)

+static inline void

+next_char (hb_buffer_t *buffer, hb_codepoint_t glyph)

+ buffer->cur().glyph_index() = glyph;

+ buffer->next_glyph ();

+static inline void

+skip_char (hb_buffer_t *buffer)

+ buffer->skip_glyph ();

+/* Returns 0 if didn't decompose, number of resulting characters otherwise. */

+static inline unsigned int

+decompose (hb_font_t *font, hb_buffer_t *buffer, bool shortest, hb_codepoint_t ab)

{

- hb_codepoint_t a, b, glyph;

+ hb_codepoint_t a, b, a_glyph, b_glyph;

- if (!hb_unicode_decompose (buffer->unicode, ab, &a, &b) ||

- (b && !hb_font_get_glyph (font, b, 0, &glyph)))

- return false;

+ if (!decompose_func (buffer->unicode, ab, &a, &b) ||

+ (b && !font->get_glyph (b, 0, &b_glyph)))

+ return 0;

- bool has_a = hb_font_get_glyph (font, a, 0, &glyph);

+ bool has_a = font->get_glyph (a, 0, &a_glyph);

if (shortest && has_a) {

/* Output a and b */

- output_glyph (buffer, a);

- if (b)

- output_glyph (buffer, b);

- return true;

+ output_char (buffer, a, a_glyph);

+ if (likely (b)) {

+ output_char (buffer, b, b_glyph);

+ return 2;

+ }

+ return 1;

}

- if (decompose (font, buffer, shortest, a)) {

- if (b)

- output_glyph (buffer, b);

- return true;

+ unsigned int ret;

+ if ((ret = decompose (font, buffer, shortest, a))) {

+ if (b) {

+ output_char (buffer, b, b_glyph);

+ return ret + 1;

+ }

+ return ret;

}

if (has_a) {

- output_glyph (buffer, a);

- if (b)

- output_glyph (buffer, b);

- return true;

+ output_char (buffer, a, a_glyph);

+ if (likely (b)) {

+ output_char (buffer, b, b_glyph);

+ return 2;

+ }

+ return 1;

}

- return false;

+ return 0;

}

-static void

-decompose_current_glyph (hb_font_t *font, hb_buffer_t *buffer,

- bool shortest)

+/* Returns 0 if didn't decompose, number of resulting characters otherwise. */

+static inline unsigned int

+decompose_compatibility (hb_font_t *font, hb_buffer_t *buffer, hb_codepoint_t u)

{

- if (decompose (font, buffer, shortest, buffer->cur().codepoint))

- buffer->skip_glyph ();

- else

- buffer->next_glyph ();

+ unsigned int len, i;

+ hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN];

+ hb_codepoint_t glyphs[HB_UNICODE_MAX_DECOMPOSITION_LEN];

+ len = buffer->unicode->decompose_compatibility (u, decomposed);

+ if (!len)

+ return 0;

+ for (i = 0; i < len; i++)

+ if (!font->get_glyph (decomposed[i], 0, &glyphs[i]))

+ return 0;

+ for (i = 0; i < len; i++)

+ output_char (buffer, decomposed[i], glyphs[i]);

+ return len;

}

-static void

-decompose_single_char_cluster (hb_font_t *font, hb_buffer_t *buffer,

- bool will_recompose)

+/* Returns true if recomposition may be beneficial. */

+static inline bool

+decompose_current_character (hb_font_t *font, hb_buffer_t *buffer, bool shortest)

{

hb_codepoint_t glyph;

+ unsigned int len = 1;

+ /* Kind of a cute waterfall here... */

+ if (shortest && font->get_glyph (buffer->cur().codepoint, 0, &glyph))

+ next_char (buffer, glyph);

+ else if ((len = decompose (font, buffer, shortest, buffer->cur().codepoint)))

+ skip_char (buffer);

+ else if (!shortest && font->get_glyph (buffer->cur().codepoint, 0, &glyph))

+ next_char (buffer, glyph);

+ else if ((len = decompose_compatibility (font, buffer, buffer->cur().codepoint)))

+ skip_char (buffer);

+ else

+ next_char (buffer, glyph); /* glyph is initialized in earlier branches. */

+ /*

+ * A recomposition would only be useful if we decomposed into at least three

+ * characters...

+ */

+ return len > 2;

- /* If recomposing and font supports this, we're good to go */

- if (will_recompose && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph)) {

+static inline void

+handle_variation_selector_cluster (hb_font_t *font, hb_buffer_t *buffer, unsigned int end)

+ for (; buffer->idx < end - 1;) {

+ if (unlikely (buffer->unicode->is_variation_selector (buffer->cur(+1).codepoint))) {

+ /* The next two lines are some ugly lines... But work. */

+ font->get_glyph (buffer->cur().codepoint, buffer->cur(+1).codepoint, &buffer->cur().glyph_index());

+ buffer->replace_glyphs (2, 1, &buffer->cur().codepoint);

+ } else {

+ set_glyph (buffer->cur(), font);

+ buffer->next_glyph ();

+ }

+ if (likely (buffer->idx < end)) {

+ set_glyph (buffer->cur(), font);

buffer->next_glyph ();

- return;

}

- decompose_current_glyph (font, buffer, will_recompose);

}

-static void

-decompose_multi_char_cluster (hb_font_t *font, hb_buffer_t *buffer,

- unsigned int end)

+/* Returns true if recomposition may be beneficial. */

+static inline bool

+decompose_multi_char_cluster (hb_font_t *font, hb_buffer_t *buffer, unsigned int end)

{

/* TODO Currently if there's a variation-selector we give-up, it's just too hard. */

for (unsigned int i = buffer->idx; i < end; i++)

- if (unlikely (_hb_unicode_is_variation_selector (buffer->info[i].codepoint))) {

- while (buffer->idx < end)

- buffer->next_glyph ();

- return;

+ if (unlikely (buffer->unicode->is_variation_selector (buffer->info[i].codepoint))) {

+ handle_variation_selector_cluster (font, buffer, end);

+ return false;

}

while (buffer->idx < end)

- decompose_current_glyph (font, buffer, false);

+ decompose_current_character (font, buffer, false);

+ /* We can be smarter here and only return true if there are at least two ccc!=0 marks.

+ * But does not matter. */

+ return true;

+static inline bool

+decompose_cluster (hb_font_t *font, hb_buffer_t *buffer, bool short_circuit, unsigned int end)

+ if (likely (buffer->idx + 1 == end))

+ return decompose_current_character (font, buffer, short_circuit);

+ else

+ return decompose_multi_char_cluster (font, buffer, end);

}

static int

compare_combining_class (const hb_glyph_info_t *pa, const hb_glyph_info_t *pb)

{

@@ -161,12 +429,14 @@ compare_combining_class (const hb_glyph_info_t *pa, const hb_glyph_info_t *pb)

return a < b ? -1 : a == b ? 0 : +1;

}

void

_hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,

hb_ot_shape_normalization_mode_t mode)

{

- bool recompose = mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED;

- bool has_multichar_clusters = false;

+ bool short_circuit = mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED &&

+ mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT;

+ bool can_use_recompose = false;

unsigned int count;

/* We do a fairly straightforward yet custom normalization process in three

@@ -187,17 +457,12 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,

if (buffer->cur().cluster != buffer->info[end].cluster)

break;

- if (buffer->idx + 1 == end)

- decompose_single_char_cluster (font, buffer, recompose);

- else {

- decompose_multi_char_cluster (font, buffer, end);

- has_multichar_clusters = true;

- }

+ can_use_recompose = decompose_cluster (font, buffer, short_circuit, end) || can_use_recompose;

}

buffer->swap_buffers ();

- if (mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_FULL && !has_multichar_clusters)

+ if (mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_FULL && !can_use_recompose)

return; /* Done! */

@@ -228,7 +493,7 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,

}

- if (!recompose)

+ if (mode == HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED)

return;

/* Third round, recompose */

@@ -252,19 +517,23 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,

(starter == buffer->out_len - 1 ||

_hb_glyph_info_get_modified_combining_class (&buffer->prev()) < _hb_glyph_info_get_modified_combining_class (&buffer->cur())) &&

/* And compose. */

- hb_unicode_compose (buffer->unicode,

- buffer->out_info[starter].codepoint,

- buffer->cur().codepoint,

- &composed) &&

+ compose_func (buffer->unicode,

+ buffer->out_info[starter].codepoint,

+ buffer->cur().codepoint,

+ &composed) &&

/* And the font has glyph for the composite. */

- hb_font_get_glyph (font, composed, 0, &glyph))

+ font->get_glyph (composed, 0, &glyph))

{

- /* Composes. Modify starter and carry on. */

- buffer->out_info[starter].codepoint = composed;

- /* XXX update cluster */

+ /* Composes. */

+ buffer->next_glyph (); /* Copy to out-buffer. */

+ if (unlikely (buffer->in_error))

+ return;

+ buffer->merge_out_clusters (starter, buffer->out_len);

+ buffer->out_len--; /* Remove the second composable. */

+ buffer->out_info[starter].codepoint = composed; /* Modify starter and carry on. */

+ set_glyph (buffer->out_info[starter], font);

_hb_glyph_info_set_unicode_props (&buffer->out_info[starter], buffer->unicode);

- buffer->skip_glyph ();

continue;

}