| Index: icu51/source/common/normlzr.cpp
|
| ===================================================================
|
| --- icu51/source/common/normlzr.cpp (revision 0)
|
| +++ icu51/source/common/normlzr.cpp (revision 0)
|
| @@ -0,0 +1,521 @@
|
| +/*
|
| + *************************************************************************
|
| + * COPYRIGHT:
|
| + * Copyright (c) 1996-2012, International Business Machines Corporation and
|
| + * others. All Rights Reserved.
|
| + *************************************************************************
|
| + */
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_NORMALIZATION
|
| +
|
| +#include "unicode/uniset.h"
|
| +#include "unicode/unistr.h"
|
| +#include "unicode/chariter.h"
|
| +#include "unicode/schriter.h"
|
| +#include "unicode/uchriter.h"
|
| +#include "unicode/normlzr.h"
|
| +#include "unicode/utf16.h"
|
| +#include "cmemory.h"
|
| +#include "normalizer2impl.h"
|
| +#include "uprops.h" // for uniset_getUnicode32Instance()
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
|
| +
|
| +//-------------------------------------------------------------------------
|
| +// Constructors and other boilerplate
|
| +//-------------------------------------------------------------------------
|
| +
|
| +Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : // iterate over the contents of str
|
| + UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
|
| + text(new StringCharacterIterator(str)), // heap iterator, deleted in ~Normalizer(); NOTE(review): result of new is not checked
|
| + currentIndex(0), nextIndex(0),
|
| + buffer(), bufferPos(0)
|
| +{
|
| + init(); // select fNorm2 for the requested mode (fOptions is 0 here)
|
| +}
|
| +
|
| +Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : // iterate over a UChar array
|
| + UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
|
| + text(new UCharCharacterIterator(str, length)), // heap iterator, deleted in ~Normalizer(); NOTE(review): result of new is not checked
|
| + currentIndex(0), nextIndex(0),
|
| + buffer(), bufferPos(0)
|
| +{
|
| + init(); // select fNorm2 for the requested mode (fOptions is 0 here)
|
| +}
|
| +
|
| +Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : // iterate over a private clone of iter
|
| + UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
|
| + text(iter.clone()), // the clone is deleted in ~Normalizer(); the caller's iterator is not retained
|
| + currentIndex(0), nextIndex(0),
|
| + buffer(), bufferPos(0)
|
| +{
|
| + init(); // select fNorm2 for the requested mode (fOptions is 0 here)
|
| +}
|
| +
|
| +Normalizer::Normalizer(const Normalizer &copy) : // copy constructor
|
| + UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
|
| + text(copy.text->clone()), // deep copy: each Normalizer deletes its own iterator
|
| + currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
|
| + buffer(copy.buffer), bufferPos(copy.bufferPos)
|
| +{
|
| + init(); // the normalizer pointers are not copied; rebuild them from fUMode/fOptions
|
| +}
|
| +
|
| +void
|
| +Normalizer::init() { // (re)selects fNorm2 from the current fUMode and fOptions
|
| + UErrorCode errorCode=U_ZERO_ERROR;
|
| + fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
|
| + if(U_SUCCESS(errorCode) && (fOptions&UNORM_UNICODE_3_2)) { // as in the static helpers: dereference fNorm2 only after a successful lookup
|
| + delete fFilteredNorm2; // discard the wrapper created by an earlier init() call, if any
|
| + fNorm2=fFilteredNorm2=
|
| + new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
|
| + }
|
| + if(U_FAILURE(errorCode)) {
|
| + errorCode=U_ZERO_ERROR; // clear the failure so the fallback lookup can run
|
| + fNorm2=Normalizer2Factory::getNoopInstance(errorCode); // fall back to the factory's no-op instance
|
| + }
|
| +}
|
| +
|
| +Normalizer::~Normalizer()
|
| +{
|
| + delete fFilteredNorm2; // only the wrapper allocated in init() is deleted; fNorm2 is not deleted separately
|
| + delete text; // iterator allocated or cloned by the constructors / setText()
|
| +}
|
| +
|
| +Normalizer*
|
| +Normalizer::clone() const
|
| +{
|
| + return new Normalizer(*this); // heap-allocated copy made with the copy constructor
|
| +}
|
| +
|
| +/**
|
| + * Generates a hash code by summing the text hash, mode, options, buffer hash and the three position fields.
|
| + */
|
| +int32_t Normalizer::hashCode() const
|
| +{
|
| + return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
|
| +}
|
| +
|
| +UBool Normalizer::operator==(const Normalizer& that) const // field-wise equality of two iterators
|
| +{
|
| + return
|
| + this==&that || // identical object: skip the field comparison
|
| + (fUMode==that.fUMode &&
|
| + fOptions==that.fOptions &&
|
| + *text==*that.text && // compares the iterators themselves, not the pointers
|
| + buffer==that.buffer &&
|
| + bufferPos==that.bufferPos &&
|
| + nextIndex==that.nextIndex); // NOTE(review): currentIndex is hashed by hashCode() but not compared here
|
| +}
|
| +
|
| +//-------------------------------------------------------------------------
|
| +// Static utility methods
|
| +//-------------------------------------------------------------------------
|
| +
|
| +void U_EXPORT2
|
| +Normalizer::normalize(const UnicodeString& source, // normalizes source into result; result may be the same object as source
|
| + UNormalizationMode mode, int32_t options,
|
| + UnicodeString& result,
|
| + UErrorCode &status) {
|
| + if(source.isBogus() || U_FAILURE(status)) {
|
| + result.setToBogus();
|
| + if(U_SUCCESS(status)) { // only true for a bogus source: report it as an argument error
|
| + status=U_ILLEGAL_ARGUMENT_ERROR;
|
| + }
|
| + } else {
|
| + UnicodeString localDest;
|
| + UnicodeString *dest;
|
| +
|
| + if(&source!=&result) {
|
| + dest=&result;
|
| + } else {
|
| + // the source and result strings are the same object, use a temporary one
|
| + dest=&localDest;
|
| + }
|
| + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
|
| + if(U_SUCCESS(status)) { // n2 is dereferenced only after a successful lookup
|
| + if(options&UNORM_UNICODE_3_2) { // option set: wrap n2 in a temporary FilteredNormalizer2 over the Unicode32 set
|
| + FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
|
| + normalize(source, *dest, status);
|
| + } else {
|
| + n2->normalize(source, *dest, status);
|
| + }
|
| + }
|
| + if(dest==&localDest && U_SUCCESS(status)) { // copy back only if the temporary was used and nothing failed
|
| + result=*dest;
|
| + }
|
| + }
|
| +}
|
| +
|
| +void U_EXPORT2
|
| +Normalizer::compose(const UnicodeString& source, // convenience wrapper around normalize()
|
| + UBool compat, int32_t options,
|
| + UnicodeString& result,
|
| + UErrorCode &status) {
|
| + normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); // compat selects NFKC, otherwise NFC
|
| +}
|
| +
|
| +void U_EXPORT2
|
| +Normalizer::decompose(const UnicodeString& source, // convenience wrapper around normalize()
|
| + UBool compat, int32_t options,
|
| + UnicodeString& result,
|
| + UErrorCode &status) {
|
| + normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); // compat selects NFKD, otherwise NFD
|
| +}
|
| +
|
| +UNormalizationCheckResult
|
| +Normalizer::quickCheck(const UnicodeString& source, // forwards to Normalizer2::quickCheck() for the given mode
|
| + UNormalizationMode mode, int32_t options,
|
| + UErrorCode &status) {
|
| + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
|
| + if(U_SUCCESS(status)) { // n2 is dereferenced only after a successful lookup
|
| + if(options&UNORM_UNICODE_3_2) { // option set: check through a temporary FilteredNormalizer2 over the Unicode32 set
|
| + return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
|
| + quickCheck(source, status);
|
| + } else {
|
| + return n2->quickCheck(source, status);
|
| + }
|
| + } else {
|
| + return UNORM_MAYBE; // value returned when status already indicates failure
|
| + }
|
| +}
|
| +
|
| +UBool
|
| +Normalizer::isNormalized(const UnicodeString& source, // forwards to Normalizer2::isNormalized() for the given mode
|
| + UNormalizationMode mode, int32_t options,
|
| + UErrorCode &status) {
|
| + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
|
| + if(U_SUCCESS(status)) { // n2 is dereferenced only after a successful lookup
|
| + if(options&UNORM_UNICODE_3_2) { // option set: check through a temporary FilteredNormalizer2 over the Unicode32 set
|
| + return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
|
| + isNormalized(source, status);
|
| + } else {
|
| + return n2->isNormalized(source, status);
|
| + }
|
| + } else {
|
| + return FALSE; // value returned when status already indicates failure
|
| + }
|
| +}
|
| +
|
| +UnicodeString & U_EXPORT2
|
| +Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, // result = left with right appended via Normalizer2::append()
|
| + UnicodeString &result,
|
| + UNormalizationMode mode, int32_t options,
|
| + UErrorCode &errorCode) {
|
| + if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
|
| + result.setToBogus();
|
| + if(U_SUCCESS(errorCode)) { // only true for a bogus input: report it as an argument error
|
| + errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
| + }
|
| + } else {
|
| + UnicodeString localDest;
|
| + UnicodeString *dest;
|
| +
|
| + if(&right!=&result) {
|
| + dest=&result;
|
| + } else {
|
| + // the right and result strings are the same object, use a temporary one
|
| + dest=&localDest;
|
| + }
|
| + *dest=left; // start from a copy of left; right is appended below
|
| + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
|
| + if(U_SUCCESS(errorCode)) { // n2 is dereferenced only after a successful lookup
|
| + if(options&UNORM_UNICODE_3_2) { // option set: append through a temporary FilteredNormalizer2 over the Unicode32 set
|
| + FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
|
| + append(*dest, right, errorCode);
|
| + } else {
|
| + n2->append(*dest, right, errorCode);
|
| + }
|
| + }
|
| + if(dest==&localDest && U_SUCCESS(errorCode)) { // copy back only if the temporary was used and nothing failed
|
| + result=*dest;
|
| + }
|
| + }
|
| + return result;
|
| +}
|
| +
|
| +//-------------------------------------------------------------------------
|
| +// Iteration API
|
| +//-------------------------------------------------------------------------
|
| +
|
| +/**
|
| + * Return the current character in the normalized text without advancing.
|
| + */
|
| +UChar32 Normalizer::current() {
|
| + if(bufferPos<buffer.length() || nextNormalize()) { // refill the buffer only when it is exhausted
|
| + return buffer.char32At(bufferPos);
|
| + } else {
|
| + return DONE; // buffer exhausted and nextNormalize() produced nothing
|
| + }
|
| +}
|
| +
|
| +/**
|
| + * Return the next character in the normalized text and advance
|
| + * the iteration position by one code point. If the end
|
| + * of the text has already been reached, {@link #DONE} is returned.
|
| + */
|
| +UChar32 Normalizer::next() {
|
| + if(bufferPos<buffer.length() || nextNormalize()) { // refill the buffer only when it is exhausted
|
| + UChar32 c=buffer.char32At(bufferPos);
|
| + bufferPos+=U16_LENGTH(c); // advance by the number of UTF-16 units c occupies
|
| + return c;
|
| + } else {
|
| + return DONE;
|
| + }
|
| +}
|
| +
|
| +/**
|
| + * Return the previous character in the normalized text and decrement
|
| + * the iteration position by one code point. If the beginning
|
| + * of the text has already been reached, {@link #DONE} is returned.
|
| + */
|
| +UChar32 Normalizer::previous() {
|
| + if(bufferPos>0 || previousNormalize()) { // load the preceding segment only when at the start of the buffer
|
| + UChar32 c=buffer.char32At(bufferPos-1); // code point containing the unit just before bufferPos
|
| + bufferPos-=U16_LENGTH(c); // step back by the number of UTF-16 units c occupies
|
| + return c;
|
| + } else {
|
| + return DONE;
|
| + }
|
| +}
|
| +
|
| +void Normalizer::reset() { // rewind to the start of the input and drop any buffered output
|
| + currentIndex=nextIndex=text->setToStart(); // both indices take the value returned by setToStart()
|
| + clearBuffer();
|
| +}
|
| +
|
| +void
|
| +Normalizer::setIndexOnly(int32_t index) { // reposition in the input text without normalizing anything
|
| + text->setIndex(index); // pins index
|
| + currentIndex=nextIndex=text->getIndex(); // read the position back from the iterator rather than trusting index
|
| + clearBuffer();
|
| +}
|
| +
|
| +/**
|
| + * Return the first character in the normalized text. This resets
|
| + * the <tt>Normalizer's</tt> position to the beginning of the text.
|
| + */
|
| +UChar32 Normalizer::first() {
|
| + reset();
|
| + return next(); // next() also advances past the character it returns
|
| +}
|
| +
|
| +/**
|
| + * Return the last character in the normalized text. This resets
|
| + * the <tt>Normalizer's</tt> position to be just before
|
| + * the input text corresponding to that normalized character.
|
| + */
|
| +UChar32 Normalizer::last() {
|
| + currentIndex=nextIndex=text->setToEnd(); // both indices take the value returned by setToEnd()
|
| + clearBuffer();
|
| + return previous(); // previous() steps back over the character it returns
|
| +}
|
| +
|
| +/**
|
| + * Retrieve the current iteration position in the input text that is
|
| + * being normalized. This method is useful in applications such as
|
| + * searching, where you need to be able to determine the position in
|
| + * the input text that corresponds to a given normalized output character.
|
| + * <p>
|
| + * <b>Note:</b> This method reports the position in the <em>input</em>, while
|
| + * {@link #next} and {@link #previous} iterate through characters in the
|
| + * <em>output</em>. This means that there is not necessarily a one-to-one
|
| + * correspondence between characters returned by <tt>next</tt> and
|
| + * <tt>previous</tt> and the indices passed to and returned from
|
| + * <tt>setIndex</tt> and {@link #getIndex}.
|
| + *
|
| + */
|
| +int32_t Normalizer::getIndex() const {
|
| + if(bufferPos<buffer.length()) { // buffered output not yet consumed
|
| + return currentIndex;
|
| + } else {
|
| + return nextIndex; // buffer fully consumed (or empty)
|
| + }
|
| +}
|
| +
|
| +/**
|
| + * Retrieve the index of the start of the input text. This is the begin index
|
| + * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
|
| + * over which this <tt>Normalizer</tt> is iterating.
|
| + */
|
| +int32_t Normalizer::startIndex() const {
|
| + return text->startIndex(); // delegated to the underlying iterator
|
| +}
|
| +
|
| +/**
|
| + * Retrieve the index of the end of the input text. This is the end index
|
| + * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
|
| + * over which this <tt>Normalizer</tt> is iterating.
|
| + */
|
| +int32_t Normalizer::endIndex() const {
|
| + return text->endIndex(); // delegated to the underlying iterator
|
| +}
|
| +
|
| +//-------------------------------------------------------------------------
|
| +// Property access methods
|
| +//-------------------------------------------------------------------------
|
| +
|
| +void
|
| +Normalizer::setMode(UNormalizationMode newMode) // change the normalization mode used by this iterator
|
| +{
|
| + fUMode = newMode;
|
| + init(); // re-select fNorm2 for the new mode; the buffer and indices are not touched here
|
| +}
|
| +
|
| +UNormalizationMode
|
| +Normalizer::getUMode() const // returns the mode last set by a constructor or setMode()
|
| +{
|
| + return fUMode;
|
| +}
|
| +
|
| +void
|
| +Normalizer::setOption(int32_t option, // option is a bit mask that is OR-ed into or cleared from fOptions
|
| + UBool value)
|
| +{
|
| + if (value) {
|
| + fOptions |= option; // turn the bit(s) on
|
| + } else {
|
| + fOptions &= (~option); // turn the bit(s) off
|
| + }
|
| + init(); // re-select fNorm2, since init() reads fOptions
|
| +}
|
| +
|
| +UBool
|
| +Normalizer::getOption(int32_t option) const // tests option bits against fOptions
|
| +{
|
| + return (fOptions & option) != 0; // true if any bit of option is set
|
| +}
|
| +
|
| +/**
|
| + * Set the input text over which this <tt>Normalizer</tt> will iterate.
|
| + * The iteration position is set to the beginning of the input text.
|
| + */
|
| +void
|
| +Normalizer::setText(const UnicodeString& newText,
|
| + UErrorCode &status)
|
| +{
|
| + if (U_FAILURE(status)) {
|
| + return; // leave the current text untouched when status already indicates failure
|
| + }
|
| + CharacterIterator *newIter = new StringCharacterIterator(newText);
|
| + if (newIter == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| + delete text; // the old iterator is released only after the new one exists
|
| + text = newIter;
|
| + reset();
|
| +}
|
| +
|
| +/**
|
| + * Set the input text over which this <tt>Normalizer</tt> will iterate.
|
| + * The iterator is cloned; the iteration position is set to the beginning of the text.
|
| + */
|
| +void
|
| +Normalizer::setText(const CharacterIterator& newText,
|
| + UErrorCode &status)
|
| +{
|
| + if (U_FAILURE(status)) {
|
| + return; // leave the current text untouched when status already indicates failure
|
| + }
|
| + CharacterIterator *newIter = newText.clone();
|
| + if (newIter == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| + delete text; // the old iterator is released only after the clone exists
|
| + text = newIter;
|
| + reset();
|
| +}
|
| +
|
| +void
|
| +Normalizer::setText(const UChar* newText, // replace the input with a UChar array of the given length
|
| + int32_t length,
|
| + UErrorCode &status)
|
| +{
|
| + if (U_FAILURE(status)) {
|
| + return; // leave the current text untouched when status already indicates failure
|
| + }
|
| + CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
|
| + if (newIter == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| + delete text; // the old iterator is released only after the new one exists
|
| + text = newIter;
|
| + reset(); // start iterating from the beginning of the new text
|
| +}
|
| +
|
| +/**
|
| + * Copies the text under iteration into the UnicodeString referred to by "result".
|
| + * @param result Receives a copy of the text under iteration.
|
| + */
|
| +void
|
| +Normalizer::getText(UnicodeString& result)
|
| +{
|
| + text->getText(result); // delegated to the underlying iterator
|
| +}
|
| +
|
| +//-------------------------------------------------------------------------
|
| +// Private utility methods
|
| +//-------------------------------------------------------------------------
|
| +
|
| +void Normalizer::clearBuffer() { // discard buffered normalized output
|
| + buffer.remove();
|
| + bufferPos=0;
|
| +}
|
| +
|
| +UBool
|
| +Normalizer::nextNormalize() { // normalizes the next input segment into buffer; FALSE at end of text, on error, or for empty output
|
| + clearBuffer();
|
| + currentIndex=nextIndex; // the new segment starts where the previous one ended
|
| + text->setIndex(nextIndex);
|
| + if(!text->hasNext()) {
|
| + return FALSE;
|
| + }
|
| + // Skip at least one character so we make progress.
|
| + UnicodeString segment(text->next32PostInc());
|
| + while(text->hasNext()) {
|
| + UChar32 c;
|
| + if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
|
| + text->move32(-1, CharacterIterator::kCurrent); // un-read c: it starts the following segment
|
| + break;
|
| + }
|
| + segment.append(c);
|
| + }
|
| + nextIndex=text->getIndex(); // input position just past the collected segment
|
| + UErrorCode errorCode=U_ZERO_ERROR;
|
| + fNorm2->normalize(segment, buffer, errorCode);
|
| + return U_SUCCESS(errorCode) && !buffer.isEmpty();
|
| +}
|
| +
|
| +UBool
|
| +Normalizer::previousNormalize() { // normalizes the preceding input segment into buffer; FALSE at start of text, on error, or for empty output
|
| + clearBuffer();
|
| + nextIndex=currentIndex; // the segment read backwards ends where the current one began
|
| + text->setIndex(currentIndex);
|
| + if(!text->hasPrevious()) {
|
| + return FALSE;
|
| + }
|
| + UnicodeString segment;
|
| + while(text->hasPrevious()) {
|
| + UChar32 c=text->previous32();
|
| + segment.insert(0, c); // prepend, because the text is read backwards
|
| + if(fNorm2->hasBoundaryBefore(c)) { // c starts a segment, so it is included before stopping
|
| + break;
|
| + }
|
| + }
|
| + currentIndex=text->getIndex(); // input position at the start of the collected segment
|
| + UErrorCode errorCode=U_ZERO_ERROR;
|
| + fNorm2->normalize(segment, buffer, errorCode);
|
| + bufferPos=buffer.length(); // position at the end of the buffer so previous() reads backwards
|
| + return U_SUCCESS(errorCode) && !buffer.isEmpty();
|
| +}
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| +#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
|
|
| Property changes on: icu51/source/common/normlzr.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|