| Index: icu51/source/common/normalizer2impl.cpp
|
| ===================================================================
|
| --- icu51/source/common/normalizer2impl.cpp (revision 0)
|
| +++ icu51/source/common/normalizer2impl.cpp (revision 0)
|
| @@ -0,0 +1,2073 @@
|
| +/*
|
| +*******************************************************************************
|
| +*
|
| +* Copyright (C) 2009-2012, International Business Machines
|
| +* Corporation and others. All Rights Reserved.
|
| +*
|
| +*******************************************************************************
|
| +* file name: normalizer2impl.cpp
|
| +* encoding: US-ASCII
|
| +* tab size: 8 (not used)
|
| +* indentation:4
|
| +*
|
| +* created on: 2009nov22
|
| +* created by: Markus W. Scherer
|
| +*/
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_NORMALIZATION
|
| +
|
| +#include "unicode/normalizer2.h"
|
| +#include "unicode/udata.h"
|
| +#include "unicode/ustring.h"
|
| +#include "unicode/utf16.h"
|
| +#include "cmemory.h"
|
| +#include "mutex.h"
|
| +#include "normalizer2impl.h"
|
| +#include "putilimp.h"
|
| +#include "uassert.h"
|
| +#include "uset_imp.h"
|
| +#include "utrie2.h"
|
| +#include "uvector.h"
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +// ReorderingBuffer -------------------------------------------------------- ***
|
| +
|
| +UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { // Attaches the buffer pointers to str's storage; FALSE + U_MEMORY_ALLOCATION_ERROR if getBuffer() fails.
|
| + int32_t length=str.length(); // read before getBuffer(); existing contents stay in front of the append position
|
| + start=str.getBuffer(destCapacity);
|
| + if(start==NULL) {
|
| + // getBuffer() already did str.setToBogus()
|
| + errorCode=U_MEMORY_ALLOCATION_ERROR;
|
| + return FALSE;
|
| + }
|
| + limit=start+length;
|
| + remainingCapacity=str.getCapacity()-length;
|
| + reorderStart=start;
|
| + if(start==limit) {
|
| + lastCC=0; // empty buffer: nothing to reorder against
|
| + } else {
|
| + setIterator();
|
| + lastCC=previousCC();
|
| + // Set reorderStart after the last code point with cc<=1 if there is one.
|
| + if(lastCC>1) {
|
| + while(previousCC()>1) {}
|
| + }
|
| + reorderStart=codePointLimit; // limit of the code point on which the backward scan stopped
|
| + }
|
| + return TRUE;
|
| +}
|
| +
|
| +UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { // TRUE if [start, limit) and [otherStart, otherLimit) hold the same code units.
|
| + int32_t length=(int32_t)(limit-start);
|
| + return
|
| + length==(int32_t)(otherLimit-otherStart) && // lengths first, so u_memcmp() only runs on equal-length ranges
|
| + 0==u_memcmp(start, otherStart, length);
|
| +}
|
| +
|
| +UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { // Adds c as a surrogate pair, in order or via insert(); FALSE if the buffer cannot grow.
|
| + if(remainingCapacity<2 && !resize(2, errorCode)) {
|
| + return FALSE;
|
| + }
|
| + if(lastCC<=cc || cc==0) { // c is already in order after the current last character: plain append
|
| + limit[0]=U16_LEAD(c);
|
| + limit[1]=U16_TRAIL(c);
|
| + limit+=2;
|
| + lastCC=cc;
|
| + if(cc<=1) {
|
| + reorderStart=limit;
|
| + }
|
| + } else {
|
| + insert(c, cc); // 0<cc<lastCC: c has to go before some earlier character
|
| + }
|
| + remainingCapacity-=2; // two code units were written on either path
|
| + return TRUE;
|
| +}
|
| +
|
| +UBool ReorderingBuffer::append(const UChar *s, int32_t length, // Appends the length code units at s, reordering them into the buffer when needed.
|
| + uint8_t leadCC, uint8_t trailCC, // cc of the first / last code point of s
|
| + UErrorCode &errorCode) { // returns FALSE (with errorCode set by resize()) if the buffer cannot grow
|
| + if(length==0) {
|
| + return TRUE;
|
| + }
|
| + if(remainingCapacity<length && !resize(length, errorCode)) {
|
| + return FALSE;
|
| + }
|
| + remainingCapacity-=length;
|
| + if(lastCC<=leadCC || leadCC==0) { // s is in order after the current contents: copy it as a whole
|
| + if(trailCC<=1) {
|
| + reorderStart=limit+length;
|
| + } else if(leadCC<=1) {
|
| + reorderStart=limit+1; // Ok if not a code point boundary.
|
| + }
|
| + const UChar *sLimit=s+length;
|
| + do { *limit++=*s++; } while(s!=sLimit);
|
| + lastCC=trailCC;
|
| + } else { // first code point must move backward: go code point by code point
|
| + int32_t i=0;
|
| + UChar32 c;
|
| + U16_NEXT(s, i, length, c);
|
| + insert(c, leadCC); // insert first code point
|
| + while(i<length) {
|
| + U16_NEXT(s, i, length, c);
|
| + if(i<length) {
|
| + // s must be in NFD, otherwise we need to use getCC().
|
| + leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
|
| + } else {
|
| + leadCC=trailCC; // last code point: the caller already supplied its cc
|
| + }
|
| + if(!append(c, leadCC, errorCode)) { return FALSE; } // stop on failure instead of continuing to write after a failed append
|
| + }
|
| + }
|
| + return TRUE;
|
| +}
|
| +
|
| +UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { // Appends c at the end without any reordering and restarts the reorder window after it.
|
| + int32_t cpLength=U16_LENGTH(c);
|
| + if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
|
| + return FALSE;
|
| + }
|
| + remainingCapacity-=cpLength;
|
| + if(cpLength==1) {
|
| + *limit++=(UChar)c;
|
| + } else {
|
| + limit[0]=U16_LEAD(c);
|
| + limit[1]=U16_TRAIL(c);
|
| + limit+=2;
|
| + }
|
| + lastCC=0;
|
| + reorderStart=limit; // nothing appended later may be moved in front of c
|
| + return TRUE;
|
| +}
|
| +
|
| +UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { // Bulk-appends [s, sLimit) without reordering; sLimit must not be NULL.
|
| + if(s==sLimit) {
|
| + return TRUE; // empty range: leaves lastCC and reorderStart untouched
|
| + }
|
| + int32_t length=(int32_t)(sLimit-s);
|
| + if(remainingCapacity<length && !resize(length, errorCode)) {
|
| + return FALSE;
|
| + }
|
| + u_memcpy(limit, s, length);
|
| + limit+=length;
|
| + remainingCapacity-=length;
|
| + lastCC=0;
|
| + reorderStart=limit;
|
| + return TRUE;
|
| +}
|
| +
|
| +void ReorderingBuffer::remove() { // Empties the buffer while keeping the allocated storage.
|
| + reorderStart=limit=start;
|
| + remainingCapacity=str.getCapacity(); // the whole capacity is free again
|
| + lastCC=0;
|
| +}
|
| +
|
| +void ReorderingBuffer::removeSuffix(int32_t suffixLength) { // Drops the last suffixLength code units, or everything if that covers the whole contents.
|
| + if(suffixLength<(limit-start)) {
|
| + limit-=suffixLength;
|
| + remainingCapacity+=suffixLength;
|
| + } else {
|
| + limit=start; // suffix is at least as long as the contents: same effect as remove()
|
| + remainingCapacity=str.getCapacity();
|
| + }
|
| + lastCC=0;
|
| + reorderStart=limit; // reordering restarts at the new end
|
| +}
|
| +
|
| +UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { // Grows str's buffer for at least appendLength more units; FALSE + U_MEMORY_ALLOCATION_ERROR on failure.
|
| + int32_t reorderStartIndex=(int32_t)(reorderStart-start); // remember positions as indexes; the pointers are rebuilt from the new start below
|
| + int32_t length=(int32_t)(limit-start);
|
| + str.releaseBuffer(length);
|
| + int32_t newCapacity=length+appendLength;
|
| + int32_t doubleCapacity=2*str.getCapacity();
|
| + if(newCapacity<doubleCapacity) {
|
| + newCapacity=doubleCapacity; // request at least double the current capacity
|
| + }
|
| + if(newCapacity<256) {
|
| + newCapacity=256; // and never request fewer than 256 units
|
| + }
|
| + start=str.getBuffer(newCapacity);
|
| + if(start==NULL) {
|
| + // getBuffer() already did str.setToBogus()
|
| + errorCode=U_MEMORY_ALLOCATION_ERROR;
|
| + return FALSE; // NOTE(review): limit and reorderStart are not updated on this path
|
| + }
|
| + reorderStart=start+reorderStartIndex;
|
| + limit=start+length;
|
| + remainingCapacity=str.getCapacity()-length; // recomputed from the actual capacity
|
| + return TRUE;
|
| +}
|
| +
|
| +void ReorderingBuffer::skipPrevious() { // Moves the backward iterator over one code point without looking up its cc.
|
| + codePointLimit=codePointStart;
|
| + UChar c=*--codePointStart;
|
| + if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { // step over the lead unit of a surrogate pair as well
|
| + --codePointStart;
|
| + }
|
| +}
|
| +
|
| +uint8_t ReorderingBuffer::previousCC() { // Steps the backward iterator over one code point and returns its cc; 0 once reorderStart is reached.
|
| + codePointLimit=codePointStart;
|
| + if(reorderStart>=codePointStart) {
|
| + return 0; // do not iterate in front of the reorder window
|
| + }
|
| + UChar32 c=*--codePointStart;
|
| + if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
|
| + return 0; // below the threshold: no data lookup is done
|
| + }
|
| +
|
| + UChar c2;
|
| + if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { // assemble a supplementary code point from a surrogate pair
|
| + --codePointStart;
|
| + c=U16_GET_SUPPLEMENTARY(c2, c);
|
| + }
|
| + return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
|
| +}
|
| +
|
| +// Inserts c somewhere before the last character.
|
| +// Requires 0<cc<lastCC which implies reorderStart<limit.
|
| +void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { // does not check or adjust remainingCapacity; the callers in this file do both
|
| + for(setIterator(), skipPrevious(); previousCC()>cc;) {} // skip the last character, then walk back while the cc is greater than c's
|
| + // insert c at codePointLimit, after the character with prevCC<=cc
|
| + UChar *q=limit;
|
| + UChar *r=limit+=U16_LENGTH(c);
|
| + do {
|
| + *--r=*--q; // shift the tail up by the length of c, copying from the end
|
| + } while(codePointLimit!=q);
|
| + writeCodePoint(q, c);
|
| + if(cc<=1) {
|
| + reorderStart=r; // r is now just after the inserted c
|
| + }
|
| +}
|
| +
|
| +// Normalizer2Impl --------------------------------------------------------- ***
|
| +
|
| +struct CanonIterData : public UMemory { // Type of the object stored in canonIterDataSingleton.fInstance.
|
| + CanonIterData(UErrorCode &errorCode);
|
| + ~CanonIterData();
|
| + void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
|
| + UTrie2 *trie; // enumerated by addCanonIterPropertyStarts()
|
| + UVector canonStartSets; // contains UnicodeSet *
|
| +};
|
| +
|
| +Normalizer2Impl::~Normalizer2Impl() { // Releases what load() opened plus the canonical-iterator singleton instance.
|
| + udata_close(memory);
|
| + utrie2_close(normTrie);
|
| + delete (CanonIterData *)canonIterDataSingleton.fInstance; // fInstance is stored untyped, hence the cast
|
| +}
|
| +
|
| +UBool U_CALLCONV
|
| +Normalizer2Impl::isAcceptable(void *context, // the Normalizer2Impl that load() passes as 'this'
|
| + const char * /* type */, const char * /*name*/,
|
| + const UDataInfo *pInfo) { // Callback for udata_openChoice(): TRUE only for "Nrm2" data, format version 2, matching this platform.
|
| + if(
|
| + pInfo->size>=20 &&
|
| + pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
|
| + pInfo->charsetFamily==U_CHARSET_FAMILY &&
|
| + pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
|
| + pInfo->dataFormat[1]==0x72 &&
|
| + pInfo->dataFormat[2]==0x6d &&
|
| + pInfo->dataFormat[3]==0x32 &&
|
| + pInfo->formatVersion[0]==2
|
| + ) {
|
| + Normalizer2Impl *me=(Normalizer2Impl *)context;
|
| + uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); // side effect: records the data version of the accepted file
|
| + return TRUE;
|
| + } else {
|
| + return FALSE;
|
| + }
|
| +}
|
| +
|
| +void
|
| +Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { // Opens the "nrm" data item and sets up the thresholds, trie and table pointers from it.
|
| + if(U_FAILURE(errorCode)) {
|
| + return;
|
| + }
|
| + memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
|
| + if(U_FAILURE(errorCode)) {
|
| + return;
|
| + }
|
| + const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
|
| + const int32_t *inIndexes=(const int32_t *)inBytes; // the data starts with an array of int32_t indexes
|
| + int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; // byte offset of the trie, counted in int32_t units
|
| + if(indexesLength<=IX_MIN_MAYBE_YES) {
|
| + errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
|
| + return;
|
| + }
|
| +
|
| + minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
|
| + minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
|
| +
|
| + minYesNo=inIndexes[IX_MIN_YES_NO];
|
| + minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
|
| + minNoNo=inIndexes[IX_MIN_NO_NO];
|
| + limitNoNo=inIndexes[IX_LIMIT_NO_NO];
|
| + minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
|
| +
|
| + int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
|
| + int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
|
| + normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
| + inBytes+offset, nextOffset-offset, NULL, // the trie occupies the bytes between the two offsets
|
| + &errorCode);
|
| + if(U_FAILURE(errorCode)) {
|
| + return;
|
| + }
|
| +
|
| + offset=nextOffset;
|
| + nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
|
| + maybeYesCompositions=(const uint16_t *)(inBytes+offset);
|
| + extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); // extraData begins that many 16-bit units after maybeYesCompositions
|
| +
|
| + // smallFCD: new in formatVersion 2
|
| + offset=nextOffset;
|
| + smallFCD=inBytes+offset;
|
| +
|
| + // Build tccc180[].
|
| + // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
|
| + uint8_t bits=0;
|
| + for(UChar c=0; c<0x180; bits>>=1) { // each pass handles 0x20 code points; c advances inside the body
|
| + if((c&0xff)==0) {
|
| + bits=smallFCD[c>>8]; // one byte per 0x100 code points
|
| + }
|
| + if(bits&1) { // bit set: look up every code point of this group of 0x20
|
| + for(int i=0; i<0x20; ++i, ++c) {
|
| + tccc180[c]=(uint8_t)getFCD16FromNormData(c); // the cast keeps only the low byte of the fcd16 value
|
| + }
|
| + } else {
|
| + uprv_memset(tccc180+c, 0, 0x20); // bit clear: the whole group gets 0
|
| + c+=0x20;
|
| + }
|
| + }
|
| +}
|
| +
|
| +uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { // Returns the tccc of the single code point stored in [cpStart, cpLimit).
|
| + UChar32 c;
|
| + if(cpStart==(cpLimit-1)) {
|
| + c=*cpStart; // one code unit
|
| + } else {
|
| + c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); // otherwise the range is read as a surrogate pair
|
| + }
|
| + uint16_t prevNorm16=getNorm16(c);
|
| + if(prevNorm16<=minYesNo) {
|
| + return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
|
| + } else {
|
| + return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
|
| + }
|
| +}
|
| +
|
| +U_CDECL_BEGIN
|
| +
|
| +static UBool U_CALLCONV
|
| +enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { // utrie2_enum() range callback; context is a const USetAdder *.
|
| + /* add the start code point to the USet */
|
| + const USetAdder *sa=(const USetAdder *)context;
|
| + sa->add(sa->set, start);
|
| + return TRUE; // always TRUE; the end and value of the range are ignored
|
| +}
|
| +
|
| +static uint32_t U_CALLCONV
|
| +segmentStarterMapper(const void * /*context*/, uint32_t value) { // utrie2_enum() value callback used by addCanonIterPropertyStarts().
|
| + return value&CANON_NOT_SEGMENT_STARTER; // keep only this flag of each trie value
|
| +}
|
| +
|
| +U_CDECL_END
|
| +
|
| +void
|
| +Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { // Adds range-start code points from normTrie and the Hangul block to sa's set.
|
| + /* add the start code point of each same-value range of each trie */
|
| + utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
|
| +
|
| + /* add Hangul LV syllables and LV+1 because of skippables */
|
| + for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { // steps of JAMO_T_COUNT starting at HANGUL_BASE
|
| + sa->add(sa->set, c);
|
| + sa->add(sa->set, c+1);
|
| + }
|
| + sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
|
| +}
|
| +
|
| +void
|
| +Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { // Does nothing unless ensureCanonIterData() succeeds.
|
| + /* add the start code point of each same-value range of the canonical iterator data trie */
|
| + if(ensureCanonIterData(errorCode)) {
|
| + // currently only used for the SEGMENT_STARTER property
|
| + utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,
|
| + segmentStarterMapper, enumPropertyStartsRange, sa); // values pass through segmentStarterMapper before ranges are reported
|
| + }
|
| +}
|
| +
|
| +const UChar *
|
| +Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
|
| + UChar32 minNeedDataCP, // code units below this value are skipped without a data lookup
|
| + ReorderingBuffer *buffer, // may be NULL, then the prefix is only skipped
|
| + UErrorCode &errorCode) const { // returns the first unit that is NUL or >=minNeedDataCP
|
| + // Make some effort to support NUL-terminated strings reasonably.
|
| + // Take the part of the fast quick check loop that does not look up
|
| + // data and check the first part of the string.
|
| + // After this prefix, determine the string length to simplify the rest
|
| + // of the code.
|
| + const UChar *prevSrc=src;
|
| + UChar c;
|
| + while((c=*src++)<minNeedDataCP && c!=0) {}
|
| + // Back out the last character for full processing.
|
| + // Copy this prefix.
|
| + if(--src!=prevSrc) {
|
| + if(buffer!=NULL) {
|
| + buffer->appendZeroCC(prevSrc, src, errorCode); // result not checked here; a failure is reported through errorCode
|
| + }
|
| + }
|
| + return src;
|
| +}
|
| +
|
| +// Dual functionality:
|
| +// buffer!=NULL: normalize
|
| +// buffer==NULL: isNormalized/spanQuickCheckYes
|
| +const UChar *
|
| +Normalizer2Impl::decompose(const UChar *src, const UChar *limit, // limit==NULL: src is NUL-terminated
|
| + ReorderingBuffer *buffer,
|
| + UErrorCode &errorCode) const { // returns how far processing got, or prevBoundary when the quick check fails
|
| + UChar32 minNoCP=minDecompNoCP;
|
| + if(limit==NULL) {
|
| + src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
|
| + if(U_FAILURE(errorCode)) {
|
| + return src;
|
| + }
|
| + limit=u_strchr(src, 0); // from here on work with an explicit limit
|
| + }
|
| +
|
| + const UChar *prevSrc;
|
| + UChar32 c=0;
|
| + uint16_t norm16=0;
|
| +
|
| + // only for quick check
|
| + const UChar *prevBoundary=src;
|
| + uint8_t prevCC=0;
|
| +
|
| + for(;;) {
|
| + // count code units below the minimum or with irrelevant data for the quick check
|
| + for(prevSrc=src; src!=limit;) {
|
| + if( (c=*src)<minNoCP ||
|
| + isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
|
| + ) {
|
| + ++src;
|
| + } else if(!U16_IS_SURROGATE(c)) {
|
| + break; // BMP code point that needs a closer look
|
| + } else {
|
| + UChar c2;
|
| + if(U16_IS_SURROGATE_LEAD(c)) {
|
| + if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
|
| + c=U16_GET_SUPPLEMENTARY(c, c2);
|
| + }
|
| + } else /* trail surrogate */ {
|
| + if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
|
| + --src; // back up to the lead unit skipped in the previous iteration
|
| + c=U16_GET_SUPPLEMENTARY(c2, c);
|
| + }
|
| + }
|
| + if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
|
| + src+=U16_LENGTH(c);
|
| + } else {
|
| + break;
|
| + }
|
| + }
|
| + }
|
| + // copy these code units all at once
|
| + if(src!=prevSrc) {
|
| + if(buffer!=NULL) {
|
| + if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
|
| + break;
|
| + }
|
| + } else {
|
| + prevCC=0;
|
| + prevBoundary=src;
|
| + }
|
| + }
|
| + if(src==limit) {
|
| + break;
|
| + }
|
| +
|
| + // Check one above-minimum, relevant code point.
|
| + src+=U16_LENGTH(c);
|
| + if(buffer!=NULL) {
|
| + if(!decompose(c, norm16, *buffer, errorCode)) {
|
| + break;
|
| + }
|
| + } else {
|
| + if(isDecompYes(norm16)) {
|
| + uint8_t cc=getCCFromYesOrMaybe(norm16);
|
| + if(prevCC<=cc || cc==0) { // still in canonical order
|
| + prevCC=cc;
|
| + if(cc<=1) {
|
| + prevBoundary=src;
|
| + }
|
| + continue;
|
| + }
|
| + }
|
| + return prevBoundary; // "no" or cc out of order
|
| + }
|
| + }
|
| + return src;
|
| +}
|
| +
|
| +// Decompose a short piece of text which is likely to contain characters that
|
| +// fail the quick check loop and/or where the quick check loop's overhead
|
| +// is unlikely to be amortized.
|
| +// Called by the compose() and makeFCD() implementations.
|
| +UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
|
| + ReorderingBuffer &buffer,
|
| + UErrorCode &errorCode) const { // returns FALSE as soon as decomposing one code point fails
|
| + while(src<limit) {
|
| + UChar32 c;
|
| + uint16_t norm16;
|
| + UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16); // sets c and norm16, advances src
|
| + if(!decompose(c, norm16, buffer, errorCode)) {
|
| + return FALSE;
|
| + }
|
| + }
|
| + return TRUE;
|
| +}
|
| +
|
| +UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, // norm16: the data value for c
|
| + ReorderingBuffer &buffer,
|
| + UErrorCode &errorCode) const { // Appends the decomposition of one code point to buffer; returns the result of that append.
|
| + // Only loops for 1:1 algorithmic mappings.
|
| + for(;;) {
|
| + // get the decomposition and the lead and trail cc's
|
| + if(isDecompYes(norm16)) {
|
| + // c does not decompose
|
| + return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
|
| + } else if(isHangul(norm16)) {
|
| + // Hangul syllable: decompose algorithmically
|
| + UChar jamos[3];
|
| + return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); // Hangul::decompose() fills jamos and returns the count
|
| + } else if(isDecompNoAlgorithmic(norm16)) {
|
| + c=mapAlgorithmic(c, norm16);
|
| + norm16=getNorm16(c); // go around again with the mapped code point
|
| + } else {
|
| + // c decomposes, get everything from the variable-length extra data
|
| + const uint16_t *mapping=getMapping(norm16);
|
| + uint16_t firstUnit=*mapping;
|
| + int32_t length=firstUnit&MAPPING_LENGTH_MASK;
|
| + uint8_t leadCC, trailCC;
|
| + trailCC=(uint8_t)(firstUnit>>8); // high byte of the first unit
|
| + if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
|
| + leadCC=(uint8_t)(*(mapping-1)>>8); // high byte of the word just before the mapping
|
| + } else {
|
| + leadCC=0;
|
| + }
|
| + return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
|
| + }
|
| + }
|
| +}
|
| +
|
| +const UChar *
|
| +Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { // Returns c's decomposition (in buffer or in the data), or NULL if c does not decompose.
|
| + const UChar *decomp=NULL;
|
| + uint16_t norm16;
|
| + for(;;) {
|
| + if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
|
| + // c does not decompose
|
| + return decomp; // NULL, or buffer if an algorithmic mapping was applied in an earlier iteration
|
| + } else if(isHangul(norm16)) {
|
| + // Hangul syllable: decompose algorithmically
|
| + length=Hangul::decompose(c, buffer);
|
| + return buffer;
|
| + } else if(isDecompNoAlgorithmic(norm16)) {
|
| + c=mapAlgorithmic(c, norm16);
|
| + decomp=buffer;
|
| + length=0;
|
| + U16_APPEND_UNSAFE(buffer, length, c); // write the mapped code point, then re-check it
|
| + } else {
|
| + // c decomposes, get everything from the variable-length extra data
|
| + const uint16_t *mapping=getMapping(norm16);
|
| + length=*mapping&MAPPING_LENGTH_MASK;
|
| + return (const UChar *)mapping+1; // points into the loaded data, not into buffer
|
| + }
|
| + }
|
| +}
|
| +
|
| +// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
|
| +// so that a raw mapping fits that consists of one unit ("rm0")
|
| +// plus all but the first two code units of the normal mapping.
|
| +// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
|
| +const UChar *
|
| +Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const { // Returns c's raw mapping, or NULL if c does not decompose.
|
| + // We do not loop in this method because an algorithmic mapping itself
|
| + // becomes a final result rather than having to be decomposed recursively.
|
| + uint16_t norm16;
|
| + if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
|
| + // c does not decompose
|
| + return NULL; // length is left unchanged on this path
|
| + } else if(isHangul(norm16)) {
|
| + // Hangul syllable: decompose algorithmically
|
| + Hangul::getRawDecomposition(c, buffer);
|
| + length=2;
|
| + return buffer;
|
| + } else if(isDecompNoAlgorithmic(norm16)) {
|
| + c=mapAlgorithmic(c, norm16);
|
| + length=0;
|
| + U16_APPEND_UNSAFE(buffer, length, c);
|
| + return buffer;
|
| + } else {
|
| + // c decomposes, get everything from the variable-length extra data
|
| + const uint16_t *mapping=getMapping(norm16);
|
| + uint16_t firstUnit=*mapping;
|
| + int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
|
| + if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
|
| + // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
|
| + // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
|
| + const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
|
| + uint16_t rm0=*rawMapping;
|
| + if(rm0<=MAPPING_LENGTH_MASK) {
|
| + length=rm0; // rm0 is a length; the raw mapping is the rm0 units before it
|
| + return (const UChar *)rawMapping-rm0;
|
| + } else {
|
| + // Copy the normal mapping and replace its first two code units with rm0.
|
| + buffer[0]=(UChar)rm0;
|
| + u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
|
| + length=mLength-1;
|
| + return buffer;
|
| + }
|
| + } else {
|
| + length=mLength; // no separate raw mapping: return the normal one
|
| + return (const UChar *)mapping+1;
|
| + }
|
| + }
|
| +}
|
| +
|
| +void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, // limit may be NULL for NUL-terminated src
|
| + UBool doDecompose, // TRUE: decompose src; FALSE: only merge at the boundary
|
| + UnicodeString &safeMiddle, // receives the buffer's reorderable suffix before anything is appended
|
| + ReorderingBuffer &buffer,
|
| + UErrorCode &errorCode) const {
|
| + buffer.copyReorderableSuffixTo(safeMiddle);
|
| + if(doDecompose) {
|
| + decompose(src, limit, &buffer, errorCode);
|
| + return;
|
| + }
|
| + // Just merge the strings at the boundary.
|
| + ForwardUTrie2StringIterator iter(normTrie, src, limit);
|
| + uint8_t firstCC, prevCC, cc;
|
| + firstCC=prevCC=cc=getCC(iter.next16());
|
| + while(cc!=0) { // advance to the first code point with cc==0
|
| + prevCC=cc;
|
| + cc=getCC(iter.next16());
|
| + };
|
| + if(limit==NULL) { // appendZeroCC() needs limit!=NULL
|
| + limit=u_strchr(iter.codePointStart, 0);
|
| + }
|
| +
|
| + if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) { // the leading cc!=0 part is appended with reordering
|
| + buffer.appendZeroCC(iter.codePointStart, limit, errorCode); // the rest is copied as is
|
| + }
|
| +}
|
| +
|
| +// Note: hasDecompBoundary() could be implemented as aliases to
|
| +// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
|
| +// at the cost of building the FCD trie for a decomposition normalizer.
|
| +UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { // before: TRUE tests for a boundary before c, FALSE for one after c
|
| + for(;;) { // loops only to follow algorithmic mappings
|
| + if(c<minDecompNoCP) {
|
| + return TRUE;
|
| + }
|
| + uint16_t norm16=getNorm16(c);
|
| + if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
|
| + return TRUE;
|
| + } else if(norm16>MIN_NORMAL_MAYBE_YES) {
|
| + return FALSE; // ccc!=0
|
| + } else if(isDecompNoAlgorithmic(norm16)) {
|
| + c=mapAlgorithmic(c, norm16);
|
| + } else {
|
| + // c decomposes, get everything from the variable-length extra data
|
| + const uint16_t *mapping=getMapping(norm16);
|
| + uint16_t firstUnit=*mapping;
|
| + if((firstUnit&MAPPING_LENGTH_MASK)==0) {
|
| + return FALSE; // mapping length is 0
|
| + }
|
| + if(!before) {
|
| + // decomp after-boundary: same as hasFCDBoundaryAfter(),
|
| + // fcd16<=1 || trailCC==0
|
| + if(firstUnit>0x1ff) {
|
| + return FALSE; // trailCC>1
|
| + }
|
| + if(firstUnit<=0xff) {
|
| + return TRUE; // trailCC==0
|
| + }
|
| + // if(trailCC==1) test leadCC==0, same as checking for before-boundary
|
| + }
|
| + // TRUE if leadCC==0 (hasFCDBoundaryBefore())
|
| + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
|
| + }
|
| + }
|
| +}
|
| +
|
| +/*
|
| + * Finds the recomposition result for
|
| + * a forward-combining "lead" character,
|
| + * specified with a pointer to its compositions list,
|
| + * and a backward-combining "trail" character.
|
| + *
|
| + * If the lead and trail characters combine, then this function returns
|
| + * the following "compositeAndFwd" value:
|
| + * Bits 21..1 composite character
|
| + * Bit 0 set if the composite is a forward-combining starter
|
| + * otherwise it returns -1.
|
| + *
|
| + * The compositions list has (trail, compositeAndFwd) pair entries,
|
| + * encoded as either pairs or triples of 16-bit units.
|
| + * The last entry has the high bit of its first unit set.
|
| + *
|
| + * The list is sorted by ascending trail characters (there are no duplicates).
|
| + * A linear search is used.
|
| + *
|
| + * See normalizer2impl.h for a more detailed description
|
| + * of the compositions list format.
|
| + */
|
| +int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
|
| + uint16_t key1, firstUnit;
|
| + if(trail<COMP_1_TRAIL_LIMIT) {
|
| + // trail character is 0..33FF
|
| + // result entry may have 2 or 3 units
|
| + key1=(uint16_t)(trail<<1);
|
| + while(key1>(firstUnit=*list)) {
|
| + list+=2+(firstUnit&COMP_1_TRIPLE); // skip a pair (2 units) or a triple (3 units)
|
| + }
|
| + if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
|
| + if(firstUnit&COMP_1_TRIPLE) {
|
| + return ((int32_t)list[1]<<16)|list[2];
|
| + } else {
|
| + return list[1];
|
| + }
|
| + }
|
| + } else {
|
| + // trail character is 3400..10FFFF
|
| + // result entry has 3 units
|
| + key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
|
| + (((trail>>COMP_1_TRAIL_SHIFT))&
|
| + ~COMP_1_TRIPLE));
|
| + uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
|
| + uint16_t secondUnit;
|
| + for(;;) {
|
| + if(key1>(firstUnit=*list)) {
|
| + list+=2+(firstUnit&COMP_1_TRIPLE);
|
| + } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
|
| + if(key2>(secondUnit=list[1])) {
|
| + if(firstUnit&COMP_1_LAST_TUPLE) {
|
| + break; // end of the list: no match
|
| + } else {
|
| + list+=3;
|
| + }
|
| + } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
|
| + return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
|
| + } else {
|
| + break;
|
| + }
|
| + } else {
|
| + break; // the list is sorted, so a larger first unit means no match
|
| + }
|
| + }
|
| + }
|
| + return -1;
|
| +}
|
| +
|
| +/**
|
| + * @param list some character's compositions list
|
| + * @param set recursively receives the composites from these compositions
|
| + */
|
| +void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
|
| + uint16_t firstUnit;
|
| + int32_t compositeAndFwd;
|
| + do {
|
| + firstUnit=*list;
|
| + if((firstUnit&COMP_1_TRIPLE)==0) {
|
| + compositeAndFwd=list[1]; // pair entry: one unit of composite data
|
| + list+=2;
|
| + } else {
|
| + compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; // triple entry: composite data spans two units
|
| + list+=3;
|
| + }
|
| + UChar32 composite=compositeAndFwd>>1;
|
| + if((compositeAndFwd&1)!=0) { // bit 0 set: the composite has its own compositions list, recurse into it
|
| + addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
|
| + }
|
| + set.add(composite);
|
| + } while((firstUnit&COMP_1_LAST_TUPLE)==0);
|
| +}
|
| +
|
| +/*
|
| + * Recomposes the buffer text starting at recomposeStartIndex
|
| + * (which is in NFD - decomposed and canonically ordered),
|
| + * and truncates the buffer contents.
|
| + *
|
| + * Note that recomposition never lengthens the text:
|
| + * Any character consists of either one or two code units;
|
| + * a composition may contain at most one more code unit than the original starter,
|
| + * while the combining mark that is removed has at least one code unit.
|
| + */
|
| +void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
|
| + UBool onlyContiguous) const { // onlyContiguous: a non-combining mark with cc!=0 blocks later composition (see the FCC branch below)
|
| + UChar *p=buffer.getStart()+recomposeStartIndex;
|
| + UChar *limit=buffer.getLimit();
|
| + if(p==limit) {
|
| + return;
|
| + }
|
| +
|
| + UChar *starter, *pRemove, *q, *r;
|
| + const uint16_t *compositionsList;
|
| + UChar32 c, compositeAndFwd;
|
| + uint16_t norm16;
|
| + uint8_t cc, prevCC;
|
| + UBool starterIsSupplementary;
|
| +
|
| + // Some of the following variables are not used until we have a forward-combining starter
|
| + // and are only initialized now to avoid compiler warnings.
|
| + compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
|
| + starter=NULL;
|
| + starterIsSupplementary=FALSE;
|
| + prevCC=0;
|
| +
|
| + for(;;) {
|
| + UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16); // sets c and norm16, advances p
|
| + cc=getCCFromYesOrMaybe(norm16);
|
| + if( // this character combines backward and
|
| + isMaybe(norm16) &&
|
| + // we have seen a starter that combines forward and
|
| + compositionsList!=NULL &&
|
| + // the backward-combining character is not blocked
|
| + (prevCC<cc || prevCC==0)
|
| + ) {
|
| + if(isJamoVT(norm16)) {
|
| + // c is a Jamo V/T, see if we can compose it with the previous character.
|
| + if(c<Hangul::JAMO_T_BASE) {
|
| + // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
|
| + UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
|
| + if(prev<Hangul::JAMO_L_COUNT) {
|
| + pRemove=p-1;
|
| + UChar syllable=(UChar)
|
| + (Hangul::HANGUL_BASE+
|
| + (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
|
| + Hangul::JAMO_T_COUNT);
|
| + UChar t;
|
| + if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
|
| + ++p;
|
| + syllable+=t; // The next character was a Jamo T.
|
| + }
|
| + *starter=syllable;
|
| + // remove the Jamo V/T
|
| + q=pRemove;
|
| + r=p;
|
| + while(r<limit) {
|
| + *q++=*r++;
|
| + }
|
| + limit=q;
|
| + p=pRemove;
|
| + }
|
| + }
|
| + /*
|
| + * No "else" for Jamo T:
|
| + * Since the input is in NFD, there are no Hangul LV syllables that
|
| + * a Jamo T could combine with.
|
| + * All Jamo Ts are combined above when handling Jamo Vs.
|
| + */
|
| + if(p==limit) {
|
| + break;
|
| + }
|
| + compositionsList=NULL;
|
| + continue;
|
| + } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
|
| + // The starter and the combining mark (c) do combine.
|
| + UChar32 composite=compositeAndFwd>>1;
|
| +
|
| + // Replace the starter with the composite, remove the combining mark.
|
| + pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
|
| + if(starterIsSupplementary) {
|
| + if(U_IS_SUPPLEMENTARY(composite)) {
|
| + // both are supplementary
|
| + starter[0]=U16_LEAD(composite);
|
| + starter[1]=U16_TRAIL(composite);
|
| + } else {
|
| + *starter=(UChar)composite;
|
| + // The composite is shorter than the starter,
|
| + // move the intermediate characters forward one.
|
| + starterIsSupplementary=FALSE;
|
| + q=starter+1;
|
| + r=q+1;
|
| + while(r<pRemove) {
|
| + *q++=*r++;
|
| + }
|
| + --pRemove;
|
| + }
|
| + } else if(U_IS_SUPPLEMENTARY(composite)) {
|
| + // The composite is longer than the starter,
|
| + // move the intermediate characters back one.
|
| + starterIsSupplementary=TRUE;
|
| + ++starter; // temporarily increment for the loop boundary
|
| + q=pRemove;
|
| + r=++pRemove;
|
| + while(starter<q) {
|
| + *--r=*--q;
|
| + }
|
| + *starter=U16_TRAIL(composite);
|
| + *--starter=U16_LEAD(composite); // undo the temporary increment
|
| + } else {
|
| + // both are on the BMP
|
| + *starter=(UChar)composite;
|
| + }
|
| +
|
| + /* remove the combining mark by moving the following text over it */
|
| + if(pRemove<p) {
|
| + q=pRemove;
|
| + r=p;
|
| + while(r<limit) {
|
| + *q++=*r++;
|
| + }
|
| + limit=q;
|
| + p=pRemove;
|
| + }
|
| + // Keep prevCC because we removed the combining mark.
|
| +
|
| + if(p==limit) {
|
| + break;
|
| + }
|
| + // Is the composite a starter that combines forward?
|
| + if(compositeAndFwd&1) {
|
| + compositionsList=
|
| + getCompositionsListForComposite(getNorm16(composite));
|
| + } else {
|
| + compositionsList=NULL;
|
| + }
|
| +
|
| + // We combined; continue with looking for compositions.
|
| + continue;
|
| + }
|
| + }
|
| +
|
| + // no combination this time
|
| + prevCC=cc;
|
| + if(p==limit) {
|
| + break;
|
| + }
|
| +
|
| + // If c did not combine, then check if it is a starter.
|
| + if(cc==0) {
|
| + // Found a new starter.
|
| + if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
|
| + // It may combine with something, prepare for it.
|
| + if(U_IS_BMP(c)) {
|
| + starterIsSupplementary=FALSE;
|
| + starter=p-1;
|
| + } else {
|
| + starterIsSupplementary=TRUE;
|
| + starter=p-2;
|
| + }
|
| + }
|
| + } else if(onlyContiguous) {
|
| + // FCC: no discontiguous compositions; any intervening character blocks.
|
| + compositionsList=NULL;
|
| + }
|
| + }
|
| + buffer.setReorderingLimit(limit); // hand the (possibly shorter) end of the text back to the buffer
|
| +}
|
| +
|
| +UChar32
|
| +Normalizer2Impl::composePair(UChar32 a, UChar32 b) const { // Returns the composite of a followed by b, or U_SENTINEL if they do not combine.
|
| + uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
|
| + const uint16_t *list;
|
| + if(isInert(norm16)) {
|
| + return U_SENTINEL;
|
| + } else if(norm16<minYesNoMappingsOnly) {
|
| + if(isJamoL(norm16)) {
|
| + b-=Hangul::JAMO_V_BASE;
|
| + if(0<=b && b<Hangul::JAMO_V_COUNT) { // b is a Jamo V: build the syllable arithmetically
|
| + return
|
| + (Hangul::HANGUL_BASE+
|
| + ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
|
| + Hangul::JAMO_T_COUNT);
|
| + } else {
|
| + return U_SENTINEL;
|
| + }
|
| + } else if(isHangul(norm16)) {
|
| + b-=Hangul::JAMO_T_BASE;
|
| + if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0!
|
| + return a+b;
|
| + } else {
|
| + return U_SENTINEL;
|
| + }
|
| + } else {
|
| + // 'a' has a compositions list in extraData
|
| + list=extraData+norm16;
|
| + if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
|
| + list+= // mapping pointer
|
| + 1+ // +1 to skip the first unit with the mapping length
|
| + (*list&MAPPING_LENGTH_MASK); // + mapping length
|
| + }
|
| + }
|
| + } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
|
| + return U_SENTINEL;
|
| + } else {
|
| + list=maybeYesCompositions+norm16-minMaybeYes;
|
| + }
|
| + if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
|
| + return U_SENTINEL;
|
| + }
|
| +#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
|
| + return combine(list, b)>>1; // with an arithmetic shift, combine()'s -1 stays -1
|
| +#else
|
| + int32_t compositeAndFwd=combine(list, b);
|
| + return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
|
| +#endif
|
| +}
|
| +
|
| +// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
|
| +// doCompose: normalize (the composed text is appended to buffer)
|
| +// !doCompose: isNormalized (buffer must be empty and initialized); returns FALSE as soon as a change would be needed
|
| +UBool // FALSE if the low-prefix copy fails or (for !doCompose) the text is not normalized; TRUE otherwise, including after a failed append breaks out of the loop
|
| +Normalizer2Impl::compose(const UChar *src, const UChar *limit,
|
| + UBool onlyContiguous,
|
| + UBool doCompose,
|
| + ReorderingBuffer &buffer,
|
| + UErrorCode &errorCode) const {
|
| + /*
|
| + * prevBoundary points to the last character before the current one
|
| + * that has a composition boundary before it with ccc==0 and quick check "yes".
|
| + * Keeping track of prevBoundary saves us looking for a composition boundary
|
| + * when we find a "no" or "maybe".
|
| + *
|
| + * When we back out from prevSrc back to prevBoundary,
|
| + * then we also remove those same characters (which had been simply copied
|
| + * or canonically-order-inserted) from the ReorderingBuffer.
|
| + * Therefore, at all times, the [prevBoundary..prevSrc[ source units
|
| + * must correspond 1:1 to destination units at the end of the destination buffer.
|
| + */
|
| + const UChar *prevBoundary=src;
|
| + UChar32 minNoMaybeCP=minCompNoMaybeCP;
|
| + if(limit==NULL) { // NUL-terminated input: copy the low prefix, then locate the terminator
|
| + src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
|
| + doCompose ? &buffer : NULL,
|
| + errorCode);
|
| + if(U_FAILURE(errorCode)) {
|
| + return FALSE;
|
| + }
|
| + if(prevBoundary<src) {
|
| + // Set prevBoundary to the last character in the prefix.
|
| + prevBoundary=src-1;
|
| + }
|
| + limit=u_strchr(src, 0);
|
| + }
|
| +
|
| + const UChar *prevSrc;
|
| + UChar32 c=0;
|
| + uint16_t norm16=0;
|
| +
|
| + // only for isNormalized
|
| + uint8_t prevCC=0;
|
| +
|
| + for(;;) {
|
| + // count code units below the minimum or with irrelevant data for the quick check
|
| + for(prevSrc=src; src!=limit;) {
|
| + if( (c=*src)<minNoMaybeCP ||
|
| + isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
|
| + ) {
|
| + ++src;
|
| + } else if(!U16_IS_SURROGATE(c)) {
|
| + break;
|
| + } else {
|
| + UChar c2;
|
| + if(U16_IS_SURROGATE_LEAD(c)) {
|
| + if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
|
| + c=U16_GET_SUPPLEMENTARY(c, c2);
|
| + }
|
| + } else /* trail surrogate */ {
|
| + if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
|
| + --src; // step back so that src points at the lead surrogate of the pair
|
| + c=U16_GET_SUPPLEMENTARY(c2, c);
|
| + }
|
| + }
|
| + if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
|
| + src+=U16_LENGTH(c);
|
| + } else {
|
| + break;
|
| + }
|
| + }
|
| + }
|
| + // copy these code units all at once
|
| + if(src!=prevSrc) {
|
| + if(doCompose) {
|
| + if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
|
| + break;
|
| + }
|
| + } else {
|
| + prevCC=0;
|
| + }
|
| + if(src==limit) {
|
| + break;
|
| + }
|
| + // Set prevBoundary to the last character in the quick check loop.
|
| + prevBoundary=src-1;
|
| + if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
|
| + U16_IS_LEAD(*(prevBoundary-1))
|
| + ) {
|
| + --prevBoundary;
|
| + }
|
| + // The start of the current character (c).
|
| + prevSrc=src;
|
| + } else if(src==limit) {
|
| + break;
|
| + }
|
| +
|
| + src+=U16_LENGTH(c);
|
| + /*
|
| + * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
|
| + * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
|
| + * or has ccc!=0.
|
| + * Check for Jamo V/T, then for regular characters.
|
| + * c is not a Hangul syllable or Jamo L because those have "yes" properties.
|
| + */
|
| + if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
|
| + UChar prev=*(prevSrc-1);
|
| + UBool needToDecompose=FALSE;
|
| + if(c<Hangul::JAMO_T_BASE) {
|
| + // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
|
| + prev=(UChar)(prev-Hangul::JAMO_L_BASE); // unsigned wrap-around makes the next test a range check
|
| + if(prev<Hangul::JAMO_L_COUNT) {
|
| + if(!doCompose) {
|
| + return FALSE;
|
| + }
|
| + UChar syllable=(UChar)
|
| + (Hangul::HANGUL_BASE+
|
| + (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
|
| + Hangul::JAMO_T_COUNT);
|
| + UChar t;
|
| + if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
|
| + ++src;
|
| + syllable+=t; // The next character was a Jamo T.
|
| + prevBoundary=src;
|
| + buffer.setLastChar(syllable);
|
| + continue;
|
| + }
|
| + // If we see L+V+x where x!=T then we drop to the slow path,
|
| + // decompose and recompose.
|
| + // This is to deal with NFKC finding normal L and V but a
|
| + // compatibility variant of a T. We need to either fully compose that
|
| + // combination here (which would complicate the code and may not work
|
| + // with strange custom data) or use the slow path -- or else our replacing
|
| + // two input characters (L+V) with one output character (LV syllable)
|
| + // would violate the invariant that [prevBoundary..prevSrc[ has the same
|
| + // length as what we appended to the buffer since prevBoundary.
|
| + needToDecompose=TRUE;
|
| + }
|
| + } else if(Hangul::isHangulWithoutJamoT(prev)) {
|
| + // c is a Jamo Trailing consonant,
|
| + // compose with previous Hangul LV that does not contain a Jamo T.
|
| + if(!doCompose) {
|
| + return FALSE;
|
| + }
|
| + buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
|
| + prevBoundary=src;
|
| + continue;
|
| + }
|
| + if(!needToDecompose) {
|
| + // The Jamo V/T did not compose into a Hangul syllable.
|
| + if(doCompose) {
|
| + if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
|
| + break;
|
| + }
|
| + } else {
|
| + prevCC=0;
|
| + }
|
| + continue;
|
| + }
|
| + }
|
| + /*
|
| + * Source buffer pointers:
|
| + *
|
| + *  all done      quick check   current char  not yet
|
| + *                "yes" but     (c)           processed
|
| + *                may combine
|
| + *                forward
|
| + * [-------------[-------------[-------------[-------------[
|
| + * |             |             |             |             |
|
| + * orig. src     prevBoundary  prevSrc       src           limit
|
| + *
|
| + *
|
| + * Destination buffer pointers inside the ReorderingBuffer:
|
| + *
|
| + *  all done      might take    not filled yet
|
| + *                characters for
|
| + *                reordering
|
| + * [-------------[-------------[-------------[
|
| + * |             |             |             |
|
| + * start         reorderStart  limit         |
|
| + *                             +remainingCap.+
|
| + */
|
| + if(norm16>=MIN_YES_YES_WITH_CC) {
|
| + uint8_t cc=(uint8_t)norm16; // cc!=0
|
| + if( onlyContiguous && // FCC
|
| + (doCompose ? buffer.getLastCC() : prevCC)==0 &&
|
| + prevBoundary<prevSrc &&
|
| + // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
|
| + // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
|
| + // passed the quick check "yes && ccc==0" test.
|
| + // Check whether the last character was a "yesYes" or a "yesNo".
|
| + // If a "yesNo", then we get its trailing ccc from its
|
| + // mapping and check for canonical order.
|
| + // All other cases are ok.
|
| + getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
|
| + ) {
|
| + // Fails FCD test, need to decompose and contiguously recompose.
|
| + if(!doCompose) {
|
| + return FALSE;
|
| + }
|
| + } else if(doCompose) {
|
| + if(!buffer.append(c, cc, errorCode)) {
|
| + break;
|
| + }
|
| + continue;
|
| + } else if(prevCC<=cc) {
|
| + prevCC=cc;
|
| + continue;
|
| + } else {
|
| + return FALSE; // isNormalized: combining classes are out of order
|
| + }
|
| + } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
|
| + return FALSE;
|
| + }
|
| +
|
| + /*
|
| + * Find appropriate boundaries around this character,
|
| + * decompose the source text from between the boundaries,
|
| + * and recompose it.
|
| + *
|
| + * We may need to remove the last few characters from the ReorderingBuffer
|
| + * to account for source text that was copied or appended
|
| + * but needs to take part in the recomposition.
|
| + */
|
| +
|
| + /*
|
| + * Find the last composition boundary in [prevBoundary..src[.
|
| + * It is either the decomposition of the current character (at prevSrc),
|
| + * or prevBoundary.
|
| + */
|
| + if(hasCompBoundaryBefore(c, norm16)) {
|
| + prevBoundary=prevSrc;
|
| + } else if(doCompose) {
|
| + buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
|
| + }
|
| +
|
| + // Find the next composition boundary in [src..limit[ -
|
| + // modifies src to point to the next starter.
|
| + src=(UChar *)findNextCompBoundary(src, limit);
|
| +
|
| + // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
|
| + int32_t recomposeStartIndex=buffer.length();
|
| + if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
|
| + break;
|
| + }
|
| + recompose(buffer, recomposeStartIndex, onlyContiguous);
|
| + if(!doCompose) {
|
| + if(!buffer.equals(prevBoundary, src)) {
|
| + return FALSE;
|
| + }
|
| + buffer.remove(); // isNormalized mode only used the buffer as scratch space for this segment
|
| + prevCC=0;
|
| + }
|
| +
|
| + // Move to the next starter. We never need to look back before this point again.
|
| + prevBoundary=src;
|
| + }
|
| + return TRUE;
|
| +}
|
| +
|
| +// Very similar to compose(): Make the same changes in both places if relevant.
|
| +// pQCResult==NULL: spanQuickCheckYes (returns at the first "maybe" or "no")
|
| +// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES); it is set to UNORM_MAYBE or UNORM_NO as found
|
| +const UChar * // Returns limit if the scan reaches the end, otherwise prevBoundary at the point where the scan stopped.
|
| +Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
|
| + UBool onlyContiguous,
|
| + UNormalizationCheckResult *pQCResult) const {
|
| + /*
|
| + * prevBoundary points to the last character before the current one
|
| + * that has a composition boundary before it with ccc==0 and quick check "yes".
|
| + */
|
| + const UChar *prevBoundary=src;
|
| + UChar32 minNoMaybeCP=minCompNoMaybeCP;
|
| + if(limit==NULL) {
|
| + UErrorCode errorCode=U_ZERO_ERROR; // local only: the result is not checked here (no buffer is passed)
|
| + src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
|
| + if(prevBoundary<src) {
|
| + // Set prevBoundary to the last character in the prefix.
|
| + prevBoundary=src-1;
|
| + }
|
| + limit=u_strchr(src, 0);
|
| + }
|
| +
|
| + const UChar *prevSrc;
|
| + UChar32 c=0;
|
| + uint16_t norm16=0;
|
| + uint8_t prevCC=0;
|
| +
|
| + for(;;) {
|
| + // count code units below the minimum or with irrelevant data for the quick check
|
| + for(prevSrc=src;;) {
|
| + if(src==limit) {
|
| + return src;
|
| + }
|
| + if( (c=*src)<minNoMaybeCP ||
|
| + isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
|
| + ) {
|
| + ++src;
|
| + } else if(!U16_IS_SURROGATE(c)) {
|
| + break;
|
| + } else {
|
| + UChar c2;
|
| + if(U16_IS_SURROGATE_LEAD(c)) {
|
| + if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
|
| + c=U16_GET_SUPPLEMENTARY(c, c2);
|
| + }
|
| + } else /* trail surrogate */ {
|
| + if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
|
| + --src; // step back so that src points at the lead surrogate of the pair
|
| + c=U16_GET_SUPPLEMENTARY(c2, c);
|
| + }
|
| + }
|
| + if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
|
| + src+=U16_LENGTH(c);
|
| + } else {
|
| + break;
|
| + }
|
| + }
|
| + }
|
| + if(src!=prevSrc) {
|
| + // Set prevBoundary to the last character in the quick check loop.
|
| + prevBoundary=src-1;
|
| + if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
|
| + U16_IS_LEAD(*(prevBoundary-1))
|
| + ) {
|
| + --prevBoundary;
|
| + }
|
| + prevCC=0;
|
| + // The start of the current character (c).
|
| + prevSrc=src;
|
| + }
|
| +
|
| + src+=U16_LENGTH(c);
|
| + /*
|
| + * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
|
| + * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
|
| + * or has ccc!=0.
|
| + */
|
| + if(isMaybeOrNonZeroCC(norm16)) {
|
| + uint8_t cc=getCCFromYesOrMaybe(norm16);
|
| + if( onlyContiguous && // FCC
|
| + cc!=0 &&
|
| + prevCC==0 &&
|
| + prevBoundary<prevSrc &&
|
| + // prevCC==0 && prevBoundary<prevSrc tell us that
|
| + // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
|
| + // passed the quick check "yes && ccc==0" test.
|
| + // Check whether the last character was a "yesYes" or a "yesNo".
|
| + // If a "yesNo", then we get its trailing ccc from its
|
| + // mapping and check for canonical order.
|
| + // All other cases are ok.
|
| + getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
|
| + ) {
|
| + // Fails FCD test. Falls through to the "no" result below.
|
| + } else if(prevCC<=cc || cc==0) {
|
| + prevCC=cc;
|
| + if(norm16<MIN_YES_YES_WITH_CC) {
|
| + if(pQCResult!=NULL) {
|
| + *pQCResult=UNORM_MAYBE; // quickCheck: record "maybe" and keep scanning for a definite "no"
|
| + } else {
|
| + return prevBoundary; // spanQuickCheckYes: the "yes" span ends here
|
| + }
|
| + }
|
| + continue;
|
| + }
|
| + }
|
| + if(pQCResult!=NULL) {
|
| + *pQCResult=UNORM_NO;
|
| + }
|
| + return prevBoundary;
|
| + }
|
| +}
|
| +
|
| +void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, // appends [src, limit[ to buffer, recomposing the text around the join
|
| + UBool doCompose,
|
| + UBool onlyContiguous,
|
| + UnicodeString &safeMiddle,
|
| + ReorderingBuffer &buffer,
|
| + UErrorCode &errorCode) const {
|
| + if(!buffer.isEmpty()) {
|
| + const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
|
| + if(src!=firstStarterInSrc) {
|
| + const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
|
| + buffer.getLimit());
|
| + int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
|
| + UnicodeString middle(lastStarterInDest, destSuffixLength);
|
| + buffer.removeSuffix(destSuffixLength);
|
| + safeMiddle=middle; // hands the removed destination suffix to the caller before src text is appended to middle
|
| + middle.append(src, (int32_t)(firstStarterInSrc-src));
|
| + const UChar *middleStart=middle.getBuffer();
|
| + compose(middleStart, middleStart+middle.length(), onlyContiguous,
|
| + TRUE, buffer, errorCode);
|
| + if(U_FAILURE(errorCode)) {
|
| + return;
|
| + }
|
| + src=firstStarterInSrc;
|
| + }
|
| + }
|
| + if(doCompose) {
|
| + compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
|
| + } else {
|
| + if(limit==NULL) { // appendZeroCC() needs limit!=NULL
|
| + limit=u_strchr(src, 0);
|
| + }
|
| + buffer.appendZeroCC(src, limit, errorCode); // !doCompose: the rest of src is copied without normalization
|
| + }
|
| +}
|
| +
|
| +/**
|
| + * Does c have a composition boundary before it?
|
| + * True if its decomposition begins with a character that has
|
| + * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
|
| + * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
|
| + * (isCompYesAndZeroCC()) so we need not decompose.
|
| + * The norm16 argument is used as the data for c; the loop repeats only for algorithmic mappings.
|
| + */
|
| +UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
|
| + for(;;) {
|
| + if(isCompYesAndZeroCC(norm16)) {
|
| + return TRUE;
|
| + } else if(isMaybeOrNonZeroCC(norm16)) {
|
| + return FALSE;
|
| + } else if(isDecompNoAlgorithmic(norm16)) {
|
| + c=mapAlgorithmic(c, norm16);
|
| + norm16=getNorm16(c); // re-test the mapped character on the next iteration
|
| + } else {
|
| + // c decomposes, get everything from the variable-length extra data
|
| + const uint16_t *mapping=getMapping(norm16);
|
| + uint16_t firstUnit=*mapping;
|
| + if((firstUnit&MAPPING_LENGTH_MASK)==0) {
|
| + return FALSE; // empty mapping: no boundary is reported
|
| + }
|
| + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
|
| + return FALSE; // non-zero leadCC
|
| + }
|
| + int32_t i=1; // skip over the firstUnit
|
| + UChar32 c; // NOTE: shadows the parameter c; receives the first code point of the mapping
|
| + U16_NEXT_UNSAFE(mapping, i, c);
|
| + return isCompYesAndZeroCC(getNorm16(c));
|
| + }
|
| + }
|
| +}
|
| +
|
| +UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const { // Does c have a composition boundary after it?
|
| + for(;;) {
|
| + uint16_t norm16=getNorm16(c);
|
| + if(isInert(norm16)) {
|
| + return TRUE;
|
| + } else if(norm16<=minYesNo) {
|
| + // Hangul: norm16==minYesNo
|
| + // Hangul LVT has a boundary after it.
|
| + // Hangul LV and non-inert yesYes characters combine forward.
|
| + return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
|
| + } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { // testInert selects the lower threshold minNoNo
|
| + return FALSE;
|
| + } else if(isDecompNoAlgorithmic(norm16)) {
|
| + c=mapAlgorithmic(c, norm16); // loop again with the mapped character
|
| + } else {
|
| + // c decomposes, get everything from the variable-length extra data.
|
| + // If testInert, then c must be a yesNo character which has lccc=0,
|
| + // otherwise it could be a noNo.
|
| + const uint16_t *mapping=getMapping(norm16);
|
| + uint16_t firstUnit=*mapping;
|
| + // TRUE if
|
| + // not MAPPING_NO_COMP_BOUNDARY_AFTER
|
| + // (which is set if
|
| + // c is not deleted, and
|
| + // it and its decomposition do not combine forward, and it has a starter)
|
| + // and if FCC then trailCC<=1
|
| + return
|
| + (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
|
| + (!onlyContiguous || firstUnit<=0x1ff); // the upper byte of firstUnit is the trail cc, so <=0x1ff means trailCC<=1
|
| + }
|
| + }
|
| +}
|
| +
|
| +const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { // iterates backward from p; returns the start of the first code point found with a composition boundary before it
|
| + BackwardUTrie2StringIterator iter(normTrie, start, p);
|
| + uint16_t norm16;
|
| + do { // NOTE(review): there is no explicit start-of-text test here; presumably the iterator yields a boundary value at start -- confirm
|
| + norm16=iter.previous16();
|
| + } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
|
| + // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
|
| + // but that's probably not worth the extra cost.
|
| + return iter.codePointStart;
|
| +}
|
| +
|
| +const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { // iterates forward from p; returns the start of the first code point found with a composition boundary before it
|
| + ForwardUTrie2StringIterator iter(normTrie, p, limit);
|
| + uint16_t norm16;
|
| + do { // NOTE(review): there is no explicit end-of-text test here; presumably the iterator yields a boundary value at limit -- confirm
|
| + norm16=iter.next16();
|
| + } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
|
| + return iter.codePointStart;
|
| +}
|
| +
|
| +// Note: normalizer2impl.cpp r30982 (2011-nov-27)
|
| +// still had getFCDTrie() which built and cached an FCD trie.
|
| +// That provided faster access to FCD data than getFCD16FromNormData()
|
| +// but required synchronization and consumed some 10kB of heap memory
|
| +// in any process that uses FCD (e.g., via collation).
|
| +// tccc180[] and smallFCD[] are intended to help with any loss of performance,
|
| +// at least for Latin & CJK.
|
| +
|
| +// Gets the FCD value from the regular normalization data: lccc in the upper byte, tccc in the lower byte.
|
| +uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
|
| + // Only loops for 1:1 algorithmic mappings.
|
| + for(;;) {
|
| + uint16_t norm16=getNorm16(c);
|
| + if(norm16<=minYesNo) {
|
| + // no decomposition or Hangul syllable, all zeros
|
| + return 0;
|
| + } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
|
| + // combining mark
|
| + norm16&=0xff;
|
| + return norm16|(norm16<<8); // the same cc value is used for both lccc and tccc
|
| + } else if(norm16>=minMaybeYes) {
|
| + return 0;
|
| + } else if(isDecompNoAlgorithmic(norm16)) {
|
| + c=mapAlgorithmic(c, norm16);
|
| + } else {
|
| + // c decomposes, get everything from the variable-length extra data
|
| + const uint16_t *mapping=getMapping(norm16);
|
| + uint16_t firstUnit=*mapping;
|
| + if((firstUnit&MAPPING_LENGTH_MASK)==0) {
|
| + // A character that is deleted (maps to an empty string) must
|
| + // get the worst-case lccc and tccc values because arbitrary
|
| + // characters on both sides will become adjacent.
|
| + return 0x1ff;
|
| + } else {
|
| + norm16=firstUnit>>8; // tccc
|
| + if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
|
| + norm16|=*(mapping-1)&0xff00; // lccc
|
| + }
|
| + return norm16;
|
| + }
|
| + }
|
| + }
|
| +}
|
| +
|
| +// Dual functionality:
|
| +// buffer!=NULL: normalize (text is appended to *buffer)
|
| +// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes (returns prevBoundary at the first FCD failure)
|
| +const UChar * // Returns the current src when the loop ends or the prefix copy fails; for buffer==NULL, prevBoundary on an FCD failure.
|
| +Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
|
| + ReorderingBuffer *buffer,
|
| + UErrorCode &errorCode) const {
|
| + // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
|
| + // Similar to the prevBoundary in the compose() implementation.
|
| + const UChar *prevBoundary=src;
|
| + int32_t prevFCD16=0;
|
| + if(limit==NULL) {
|
| + src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
|
| + if(U_FAILURE(errorCode)) {
|
| + return src;
|
| + }
|
| + if(prevBoundary<src) {
|
| + prevBoundary=src;
|
| + // We know that the previous character's lccc==0.
|
| + // Fetching the fcd16 value was deferred for this below-U+0300 code point.
|
| + prevFCD16=getFCD16(*(src-1));
|
| + if(prevFCD16>1) {
|
| + --prevBoundary;
|
| + }
|
| + }
|
| + limit=u_strchr(src, 0);
|
| + }
|
| +
|
| + // Note: In this function we use buffer->appendZeroCC() because we track
|
| + // the lead and trail combining classes here, rather than leaving it to
|
| + // the ReorderingBuffer.
|
| + // The exception is the call to decomposeShort() which uses the buffer
|
| + // in the normal way.
|
| +
|
| + const UChar *prevSrc;
|
| + UChar32 c=0;
|
| + uint16_t fcd16=0;
|
| +
|
| + for(;;) {
|
| + // count code units with lccc==0
|
| + for(prevSrc=src; src!=limit;) {
|
| + if((c=*src)<MIN_CCC_LCCC_CP) {
|
| + prevFCD16=~c; // deferred lookup: ~c is negative, which marks "FCD value not fetched yet"
|
| + ++src;
|
| + } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
|
| + prevFCD16=0;
|
| + ++src;
|
| + } else {
|
| + if(U16_IS_SURROGATE(c)) {
|
| + UChar c2;
|
| + if(U16_IS_SURROGATE_LEAD(c)) {
|
| + if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
|
| + c=U16_GET_SUPPLEMENTARY(c, c2);
|
| + }
|
| + } else /* trail surrogate */ {
|
| + if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
|
| + --src; // step back so that src points at the lead surrogate of the pair
|
| + c=U16_GET_SUPPLEMENTARY(c2, c);
|
| + }
|
| + }
|
| + }
|
| + if((fcd16=getFCD16FromNormData(c))<=0xff) { // <=0xff means the upper byte (lccc) is 0
|
| + prevFCD16=fcd16;
|
| + src+=U16_LENGTH(c);
|
| + } else {
|
| + break;
|
| + }
|
| + }
|
| + }
|
| + // copy these code units all at once
|
| + if(src!=prevSrc) {
|
| + if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
|
| + break;
|
| + }
|
| + if(src==limit) {
|
| + break;
|
| + }
|
| + prevBoundary=src;
|
| + // We know that the previous character's lccc==0.
|
| + if(prevFCD16<0) {
|
| + // Fetching the fcd16 value was deferred for this below-U+0300 code point.
|
| + UChar32 prev=~prevFCD16;
|
| + prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
|
| + if(prevFCD16>1) {
|
| + --prevBoundary;
|
| + }
|
| + } else {
|
| + const UChar *p=src-1;
|
| + if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
|
| + --p;
|
| + // Need to fetch the previous character's FCD value because
|
| + // prevFCD16 was just for the trail surrogate code point.
|
| + prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
|
| + // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
|
| + }
|
| + if(prevFCD16>1) {
|
| + prevBoundary=p;
|
| + }
|
| + }
|
| + // The start of the current character (c).
|
| + prevSrc=src;
|
| + } else if(src==limit) {
|
| + break;
|
| + }
|
| +
|
| + src+=U16_LENGTH(c);
|
| + // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
|
| + // Check for proper order, and decompose locally if necessary.
|
| + if((prevFCD16&0xff)<=(fcd16>>8)) {
|
| + // proper order: prev tccc <= current lccc
|
| + if((fcd16&0xff)<=1) {
|
| + prevBoundary=src;
|
| + }
|
| + if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
|
| + break;
|
| + }
|
| + prevFCD16=fcd16;
|
| + continue;
|
| + } else if(buffer==NULL) {
|
| + return prevBoundary; // quick check "no"
|
| + } else {
|
| + /*
|
| + * Back out the part of the source that we copied or appended
|
| + * already but is now going to be decomposed.
|
| + * prevSrc is set to after what was copied/appended.
|
| + */
|
| + buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
|
| + /*
|
| + * Find the part of the source that needs to be decomposed,
|
| + * up to the next safe boundary.
|
| + */
|
| + src=findNextFCDBoundary(src, limit);
|
| + /*
|
| + * The source text does not fulfill the conditions for FCD.
|
| + * Decompose and reorder a limited piece of the text.
|
| + */
|
| + if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
|
| + break;
|
| + }
|
| + prevBoundary=src;
|
| + prevFCD16=0;
|
| + }
|
| + }
|
| + return src;
|
| +}
|
| +
|
| +void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, // appends [src, limit[ to buffer, re-running makeFCD() on the text around the join
|
| + UBool doMakeFCD,
|
| + UnicodeString &safeMiddle,
|
| + ReorderingBuffer &buffer,
|
| + UErrorCode &errorCode) const {
|
| + if(!buffer.isEmpty()) {
|
| + const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
|
| + if(src!=firstBoundaryInSrc) {
|
| + const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
|
| + buffer.getLimit());
|
| + int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
|
| + UnicodeString middle(lastBoundaryInDest, destSuffixLength);
|
| + buffer.removeSuffix(destSuffixLength);
|
| + safeMiddle=middle; // hands the removed destination suffix to the caller before src text is appended to middle
|
| + middle.append(src, (int32_t)(firstBoundaryInSrc-src));
|
| + const UChar *middleStart=middle.getBuffer();
|
| + makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
|
| + if(U_FAILURE(errorCode)) {
|
| + return;
|
| + }
|
| + src=firstBoundaryInSrc;
|
| + }
|
| + }
|
| + if(doMakeFCD) {
|
| + makeFCD(src, limit, &buffer, errorCode);
|
| + } else {
|
| + if(limit==NULL) { // appendZeroCC() needs limit!=NULL
|
| + limit=u_strchr(src, 0);
|
| + }
|
| + buffer.appendZeroCC(src, limit, errorCode); // !doMakeFCD: the rest of src is copied without FCD processing
|
| + }
|
| +}
|
| +
|
| +const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { // returns p after the loop below; p does not go before start
|
| + while(start<p && previousFCD16(start, p)>0xff) {} // presumably previousFCD16() moves p back one code point; stops after a value with lccc==0 (<=0xff) -- confirm
|
| + return p;
|
| +}
|
| +
|
| +const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { // returns the start of the first code point whose FCD value is <=0xff, else the final p
|
| + while(p<limit) {
|
| + const UChar *codePointStart=p; // remembered because nextFCD16() presumably advances p past the code point -- confirm
|
| + if(nextFCD16(p, limit)<=0xff) {
|
| + return codePointStart;
|
| + }
|
| + }
|
| + return p;
|
| +}
|
| +
|
| +// CanonicalIterator data -------------------------------------------------- ***
|
| +
|
| +CanonIterData::CanonIterData(UErrorCode &errorCode) : // opens a new trie (initial and error values 0) and the canonStartSets vector; failures are reported via errorCode
|
| + trie(utrie2_open(0, 0, &errorCode)),
|
| + canonStartSets(uprv_deleteUObject, NULL, errorCode) {} // uprv_deleteUObject is passed as the vector's element deleter
|
| +
|
| +CanonIterData::~CanonIterData() { // closes the trie opened by the constructor
|
| + utrie2_close(trie);
|
| +}
|
| +
|
| +void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { // records origin as a character whose decomposition starts with decompLead
|
| + uint32_t canonValue=utrie2_get32(trie, decompLead);
|
| + if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
|
| + // origin is the first character whose decomposition starts with
|
| + // the character for which we are setting the value.
|
| + utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
|
| + } else {
|
| + // origin is not the first character, or it is U+0000.
|
| + UnicodeSet *set;
|
| + if((canonValue&CANON_HAS_SET)==0) {
|
| + set=new UnicodeSet;
|
| + if(set==NULL) {
|
| + errorCode=U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| + UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
|
| + canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); // the value field now holds the index the new set will get
|
| + utrie2_set32(trie, decompLead, canonValue, &errorCode);
|
| + canonStartSets.addElement(set, errorCode); // NOTE(review): the result is not checked; on failure the trie already refers to this index -- confirm callers stop on error
|
| + if(firstOrigin!=0) {
|
| + set->add(firstOrigin);
|
| + }
|
| + } else {
|
| + set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
|
| + }
|
| + set->add(origin);
|
| + }
|
| +}
|
| +
|
| +class CanonIterDataSingleton { // helper that builds the CanonIterData for a Normalizer2Impl through a SimpleSingleton
|
| +public:
|
| + CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
|
| + singleton(s), impl(ni), errorCode(ec) {}
|
| + CanonIterData *getInstance(UErrorCode &errorCode) {
|
| + void *duplicate;
|
| + CanonIterData *instance=
|
| + (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode);
|
| + delete (CanonIterData *)duplicate; // presumably an instance that the singleton did not adopt -- confirm against SimpleSingleton
|
| + return instance;
|
| + }
|
| + static void *createInstance(const void *context, UErrorCode &errorCode);
|
| + UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { // returning FALSE (on error) presumably stops the trie enumeration -- confirm
|
| + if(value!=0) {
|
| + impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode);
|
| + }
|
| + return U_SUCCESS(errorCode);
|
| + }
|
| +
|
| +private:
|
| + SimpleSingleton &singleton;
|
| + Normalizer2Impl &impl;
|
| + CanonIterData *newData; // assigned in createInstance(); not initialized by the constructor
|
| + UErrorCode &errorCode;
|
| +};
|
| +
|
| +U_CDECL_BEGIN
|
| +
|
| +// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
|
| +static UBool U_CALLCONV
|
| +enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { // context is the CanonIterDataSingleton passed to utrie2_enum()
|
| + return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);
|
| +}
|
| +
|
| +U_CDECL_END
|
| +
|
| +void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) { // builds and freezes a new CanonIterData; returns NULL on failure
|
| + CanonIterDataSingleton *me=(CanonIterDataSingleton *)context;
|
| + me->newData=new CanonIterData(errorCode);
|
| + if(me->newData==NULL) {
|
| + errorCode=U_MEMORY_ALLOCATION_ERROR;
|
| + return NULL;
|
| + }
|
| + if(U_SUCCESS(errorCode)) {
|
| + utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me); // the handler reports errors through me->errorCode, not the local parameter
|
| + utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
|
| + if(U_SUCCESS(errorCode)) {
|
| + return me->newData;
|
| + }
|
| + }
|
| + delete me->newData; // note: me->newData is not reset to NULL after this
|
| + return NULL;
|
| +}
|
| +
|
| +void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, // updates newData for every c in [start, end], all sharing norm16
|
| + CanonIterData &newData,
|
| + UErrorCode &errorCode) const {
|
| + if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
|
| + // Inert, or 2-way mapping (including Hangul syllable).
|
| + // We do not write a canonStartSet for any yesNo character.
|
| + // Composites from 2-way mappings are added at runtime from the
|
| + // starter's compositions list, and the other characters in
|
| + // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
|
| + // "maybe" characters.
|
| + return;
|
| + }
|
| + for(UChar32 c=start; c<=end; ++c) {
|
| + uint32_t oldValue=utrie2_get32(newData.trie, c);
|
| + uint32_t newValue=oldValue;
|
| + if(norm16>=minMaybeYes) {
|
| + // not a segment starter if it occurs in a decomposition or has cc!=0
|
| + newValue|=CANON_NOT_SEGMENT_STARTER;
|
| + if(norm16<MIN_NORMAL_MAYBE_YES) {
|
| + newValue|=CANON_HAS_COMPOSITIONS;
|
| + }
|
| + } else if(norm16<minYesNo) {
|
| + newValue|=CANON_HAS_COMPOSITIONS;
|
| + } else {
|
| + // c has a one-way decomposition
|
| + UChar32 c2=c;
|
| + uint16_t norm16_2=norm16;
|
| + while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { // follow algorithmic mappings until another kind of data is reached
|
| + c2=mapAlgorithmic(c2, norm16_2);
|
| + norm16_2=getNorm16(c2);
|
| + }
|
| + if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
|
| + // c decomposes, get everything from the variable-length extra data
|
| + const uint16_t *mapping=getMapping(norm16_2);
|
| + uint16_t firstUnit=*mapping;
|
| + int32_t length=firstUnit&MAPPING_LENGTH_MASK;
|
| + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
|
| + if(c==c2 && (*(mapping-1)&0xff)!=0) {
|
| + newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
|
| + }
|
| + }
|
| + // Skip empty mappings (no characters in the decomposition).
|
| + if(length!=0) {
|
| + ++mapping; // skip over the firstUnit
|
| + // add c to first code point's start set
|
| + int32_t i=0;
|
| + U16_NEXT_UNSAFE(mapping, i, c2);
|
| + newData.addToStartSet(c, c2, errorCode);
|
| + // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
|
| + // one-way mapping. A 2-way mapping is possible here after
|
| + // intermediate algorithmic mapping.
|
| + if(norm16_2>=minNoNo) {
|
| + while(i<length) {
|
| + U16_NEXT_UNSAFE(mapping, i, c2);
|
| + uint32_t c2Value=utrie2_get32(newData.trie, c2);
|
| + if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
|
| + utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
|
| + &errorCode);
|
| + }
|
| + }
|
| + }
|
| + }
|
| + } else {
|
| + // c decomposed to c2 algorithmically; c has cc==0
|
| + newData.addToStartSet(c, c2, errorCode);
|
| + }
|
| + }
|
| + if(newValue!=oldValue) { // write back only when a flag was added for c
|
| + utrie2_set32(newData.trie, c, newValue, &errorCode);
|
| + }
|
| + }
|
| +}
|
| +
|
| +UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {  // Requests the canonical-iterator data singleton; returns TRUE iff errorCode reports success afterwards.
|
| + // Logically const: Synchronized instantiation.
|
| + Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);  // drop constness so the singleton member and *this can be handed over as non-const
|
| + CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);  // temporary wrapper object; the returned instance is ignored, only errorCode is consulted below
|
| + return U_SUCCESS(errorCode);
|
| +}
|
| +
|
| +int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {  // Returns the 32-bit value stored for c in the CanonIterData trie, converted to int32_t.
|
| + return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);  // fInstance is dereferenced without a NULL check -- presumably callers run ensureCanonIterData() successfully first; confirm
|
| +}
|
| +
|
| +const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {  // Returns entry n of CanonIterData::canonStartSets as a const UnicodeSet reference.
|
| + return *(const UnicodeSet *)(
|
| + ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);  // no range or NULL check here; the only caller visible in this file passes the masked trie value when CANON_HAS_SET is set
|
| +}
|
| +
|
| +UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {  // TRUE when the canon-iterator trie value for c is non-negative as an int32_t.
|
| + return getCanonValue(c)>=0;  // presumably CANON_NOT_SEGMENT_STARTER is the sign bit, so a negative value means "not a segment starter" -- confirm in the header
|
| +}
|
| +
|
| +UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {  // Fills set from the canon-iterator data for c; returns FALSE (set untouched) if c has no such data, else TRUE.
|
| + int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;  // ignore the segment-starter flag; only the remaining bits matter here
|
| + if(canonValue==0) {
|
| + return FALSE;  // nothing recorded for c; note that set has not been cleared on this path
|
| + }
|
| + set.clear();  // from here on the previous contents of set are discarded
|
| + int32_t value=canonValue&CANON_VALUE_MASK;  // payload: either an index into canonStartSets or a single code point, depending on CANON_HAS_SET
|
| + if((canonValue&CANON_HAS_SET)!=0) {
|
| + set.addAll(getCanonStartSet(value));  // value selects a stored UnicodeSet
|
| + } else if(value!=0) {
|
| + set.add(value);  // value is itself the one code point to add
|
| + }
|
| + if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
|
| + uint16_t norm16=getNorm16(c);
|
| + if(norm16==JAMO_L) {  // special case: no compositions list is consulted, the range is computed arithmetically
|
| + UChar32 syllable=
|
| + (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);  // first code point of the block of JAMO_VT_COUNT syllables associated with this Jamo L
|
| + set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);  // add the whole inclusive range
|
| + } else {
|
| + addComposites(getCompositionsList(norm16), set);  // general case: add what addComposites() derives from the compositions list for norm16
|
| + }
|
| + }
|
| + return TRUE;
|
| +}
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| +// Normalizer2 data swapping ----------------------------------------------- ***
|
| +
|
| +U_NAMESPACE_USE
|
| +
|
| +U_CAPI int32_t U_EXPORT2
|
| +unorm2_swap(const UDataSwapper *ds,  // Swaps one Normalizer2 ("Nrm2") data item via ds; returns headerSize+total data size, or 0 on error.
|
| + const void *inData, int32_t length, void *outData,  // length<0 skips all size checks and the copy/swap work in this function; only the size is computed
|
| + UErrorCode *pErrorCode) {  // set to U_UNSUPPORTED_ERROR or U_INDEX_OUTOFBOUNDS_ERROR by the checks below
|
| + const UDataInfo *pInfo;
|
| + int32_t headerSize;
|
| +
|
| + const uint8_t *inBytes;
|
| + uint8_t *outBytes;
|
| +
|
| + const int32_t *inIndexes;
|
| + int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];  // local copy of only the leading indexes, up to and including IX_MIN_MAYBE_YES
|
| +
|
| + int32_t i, offset, nextOffset, size;
|
| +
|
| + /* udata_swapDataHeader checks the arguments */
|
| + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
|
| + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
| + return 0;
|
| + }
|
| +
|
| + /* check data format and format version */
|
| + pInfo=(const UDataInfo *)((const char *)inData+4);  // the UDataInfo is read from 4 bytes past the start of inData
|
| + if(!(
|
| + pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
|
| + pInfo->dataFormat[1]==0x72 &&
|
| + pInfo->dataFormat[2]==0x6d &&
|
| + pInfo->dataFormat[3]==0x32 &&
|
| + (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)  // only major format versions 1 and 2 are accepted
|
| + )) {
|
| + udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
|
| + pInfo->dataFormat[0], pInfo->dataFormat[1],
|
| + pInfo->dataFormat[2], pInfo->dataFormat[3],
|
| + pInfo->formatVersion[0]);
|
| + *pErrorCode=U_UNSUPPORTED_ERROR;
|
| + return 0;
|
| + }
|
| +
|
| + inBytes=(const uint8_t *)inData+headerSize;  // payload starts right after the header
|
| + outBytes=(uint8_t *)outData+headerSize;
|
| +
|
| + inIndexes=(const int32_t *)inBytes;  // the payload begins with the int32_t indexes
|
| +
|
| + if(length>=0) {
|
| + length-=headerSize;  // from here on, length counts payload bytes only
|
| + if(length<(int32_t)sizeof(indexes)) {  // must at least hold the indexes that are read below
|
| + udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
|
| + length);
|
| + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
| + return 0;
|
| + }
|
| + }
|
| +
|
| + /* read the first few indexes */
|
| + for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {  // note: when length<0 these reads are not bounds-checked by this function
|
| + indexes[i]=udata_readInt32(ds, inIndexes[i]);  // read through ds rather than directly from the input
|
| + }
|
| +
|
| + /* get the total length of the data */
|
| + size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
|
| +
|
| + if(length>=0) {
|
| + if(length<size) {
|
| + udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
|
| + length);
|
| + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
| + return 0;
|
| + }
|
| +
|
| + /* copy the data for inaccessible bytes */
|
| + if(inBytes!=outBytes) {  // skipped when input and output payload pointers are identical
|
| + uprv_memcpy(outBytes, inBytes, size);
|
| + }
|
| +
|
| + offset=0;  // NOTE(review): the section offsets below come from the data and are not range-checked against size here
|
| +
|
| + /* swap the int32_t indexes[] */
|
| + nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];  // the indexes occupy [0, trie offset)
|
| + ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
|
| + offset=nextOffset;
|
| +
|
| + /* swap the UTrie2 */
|
| + nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];  // the trie occupies [trie offset, extra-data offset)
|
| + utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
|
| + offset=nextOffset;
|
| +
|
| + /* swap the uint16_t extraData[] */
|
| + nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];  // extraData occupies [extra-data offset, smallFCD offset)
|
| + ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
|
| + offset=nextOffset;
|
| +
|
| + /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
|
| + nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];  // the next index gives the end of this section; its bytes are only covered by the memcpy above
|
| + offset=nextOffset;
|
| +
|
| + U_ASSERT(offset==size);  // the sections walked above are expected to account for the whole payload
|
| + }
|
| +
|
| + return headerSize+size;  // also the result when length<0, where nothing was copied or swapped in this function
|
| +}
|
| +
|
| +#endif // !UCONFIG_NO_NORMALIZATION
|
|
|
| Property changes on: icu51/source/common/normalizer2impl.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|