| Index: icu51/source/common/uniset_props.cpp
|
| ===================================================================
|
| --- icu51/source/common/uniset_props.cpp (revision 0)
|
| +++ icu51/source/common/uniset_props.cpp (revision 0)
|
| @@ -0,0 +1,1304 @@
|
| +/*
|
| +*******************************************************************************
|
| +*
|
| +* Copyright (C) 1999-2012, International Business Machines
|
| +* Corporation and others. All Rights Reserved.
|
| +*
|
| +*******************************************************************************
|
| +* file name: uniset_props.cpp
|
| +* encoding: US-ASCII
|
| +* tab size: 8 (not used)
|
| +* indentation:4
|
| +*
|
| +* created on: 2004aug25
|
| +* created by: Markus W. Scherer
|
| +*
|
| +* Character property dependent functions moved here from uniset.cpp
|
| +*/
|
| +
|
| +#include "unicode/utypes.h"
|
| +#include "unicode/uniset.h"
|
| +#include "unicode/parsepos.h"
|
| +#include "unicode/uchar.h"
|
| +#include "unicode/uscript.h"
|
| +#include "unicode/symtable.h"
|
| +#include "unicode/uset.h"
|
| +#include "unicode/locid.h"
|
| +#include "unicode/brkiter.h"
|
| +#include "uset_imp.h"
|
| +#include "ruleiter.h"
|
| +#include "cmemory.h"
|
| +#include "ucln_cmn.h"
|
| +#include "util.h"
|
| +#include "uvector.h"
|
| +#include "uprops.h"
|
| +#include "propname.h"
|
| +#include "normalizer2impl.h"
|
| +#include "ucase.h"
|
| +#include "ubidi_props.h"
|
| +#include "uinvchar.h"
|
| +#include "uprops.h"
|
| +#include "charstr.h"
|
| +#include "cstring.h"
|
| +#include "mutex.h"
|
| +#include "umutex.h"
|
| +#include "uassert.h"
|
| +#include "hash.h"
|
| +
|
| +U_NAMESPACE_USE
|
| +
|
| +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) // element count of a real array (not of a pointer)
|
| +
|
| +// Initial capacity, in UChar32 elements, of the list that the pattern constructor allocates. Must be >= 0
|
| +// *** same as in uniset.cpp ! ***
|
| +#define START_EXTRA 16
|
| +
|
| +// Define UChar constants as numeric code units (not character literals) for EBCDIC compatibility
|
| +// Used #define to reduce private static exports and memory access time.
|
| +#define SET_OPEN ((UChar)0x005B) /*[*/
|
| +#define SET_CLOSE ((UChar)0x005D) /*]*/
|
| +#define HYPHEN ((UChar)0x002D) /*-*/
|
| +#define COMPLEMENT ((UChar)0x005E) /*^*/
|
| +#define COLON ((UChar)0x003A) /*:*/
|
| +#define BACKSLASH ((UChar)0x005C) /*\*/
|
| +#define INTERSECTION ((UChar)0x0026) /*&*/
|
| +#define UPPER_U ((UChar)0x0055) /*U*/
|
| +#define LOWER_U ((UChar)0x0075) /*u*/
|
| +#define OPEN_BRACE ((UChar)123) /*{ (decimal for 0x007B)*/
|
| +#define CLOSE_BRACE ((UChar)125) /*} (decimal for 0x007D)*/
|
| +#define UPPER_P ((UChar)0x0050) /*P*/
|
| +#define LOWER_P ((UChar)0x0070) /*p*/
|
| +#define UPPER_N ((UChar)78) /*N (decimal for 0x004E)*/
|
| +#define EQUALS ((UChar)0x003D) /*=*/
|
| +
|
| +//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
|
| +static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
|
| +//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
|
| +//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
|
| +//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
|
| +static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-] (the second unit is the closing bracket, not a brace)*/
|
| +
|
| +// Special property set IDs
|
| +static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
|
| +static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
|
| +static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
|
| +
|
| +// Unicode name property alias
|
| +#define NAME_PROP "na"
|
| +#define NAME_PROP_LENGTH 2 // number of chars in NAME_PROP
|
| +
|
| +/**
|
| + * Delimiter string used in patterns to close a category reference:
|
| + * ":]". Example: "[:Lu:]".
|
| + */
|
| +//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
|
| +
|
| +// Cached sets ------------------------------------------------------------- ***
|
| +
|
| +U_CDECL_BEGIN
|
| +static UBool U_CALLCONV uset_cleanup(); // forward declaration: named in cleanup registrations before its definition
|
| +U_CDECL_END
|
| +
|
| +// Builds a frozen UnicodeSet from an invariant-character pattern string for SimpleSingletonWrapper.
|
| +// Not a TriStateSingletonWrapper because we think the UnicodeSet constructor can only fail with
|
| +// an out-of-memory error if we have a correct pattern and the properties data is hardcoded and always available.
|
| +class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> {
|
| +public:
|
| + UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) : // the pattern pointer is stored, not copied
|
| + SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {}
|
| + UnicodeSet *getInstance(UErrorCode &errorCode) { // hands createInstance and fPattern to the base class
|
| + return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode);
|
| + }
|
| +private:
|
| + static void *createInstance(const void *context, UErrorCode &errorCode) { // context is used as the pattern string
|
| + UnicodeString pattern((const char *)context, -1, US_INV); // presumably invariant-character conversion; confirm in unistr.h
|
| + UnicodeSet *set=new UnicodeSet(pattern, errorCode);
|
| + if(set==NULL) {
|
| + errorCode=U_MEMORY_ALLOCATION_ERROR;
|
| + return NULL;
|
| + }
|
| + set->freeze(); // done without looking at errorCode; a pattern failure is left for the caller to check
|
| + ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
|
| + return set;
|
| + }
|
| +
|
| + const char *fPattern; // not owned; uset_cleanup passes NULL because it only calls deleteInstance()
|
| +};
|
| +
|
| +U_CDECL_BEGIN
|
| +
|
| +static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions() results, indexed by source (UPROPS_SRC_*)
|
| +
|
| +STATIC_SIMPLE_SINGLETON(uni32Singleton); // storage used by uniset_getUnicode32Instance() and uset_cleanup()
|
| +
|
| +//----------------------------------------------------------------
|
| +// Inclusions list
|
| +//----------------------------------------------------------------
|
| +
|
| +// USetAdder implementation: callbacks that forward to the UnicodeSet behind the USet pointer
|
| +// Does not use uset.h to reduce code dependencies
|
| +static void U_CALLCONV
|
| +_set_add(USet *set, UChar32 c) { // adds the single code point c
|
| + ((UnicodeSet *)set)->add(c);
|
| +}
|
| +// USetAdder callback: adds the code point range start..end to the UnicodeSet behind set.
|
| +static void U_CALLCONV
|
| +_set_addRange(USet *set, UChar32 start, UChar32 end) {
|
| + ((UnicodeSet *)set)->add(start, end);
|
| +}
|
| +// USetAdder callback: adds the string str to the UnicodeSet behind set.
|
| +static void U_CALLCONV
|
| +_set_addString(USet *set, const UChar *str, int32_t length) {
|
| + ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); // first argument is TRUE only for length<0; presumably the NUL-terminated flag of the aliasing constructor (confirm in unistr.h)
|
| +}
|
| +
|
| +/**
|
| + * Cleanup callback: frees every cached inclusions set and the uni32 singleton instance.
|
| + */
|
| +static UBool U_CALLCONV uset_cleanup(void) {
|
| +    // Walk the whole cache; deleting a NULL slot is a no-op, so empty slots
|
| +    // need no special case and every slot ends up NULL.
|
| +    for (int32_t src = UPROPS_SRC_NONE; src < UPROPS_SRC_COUNT; ++src) {
|
| +        delete INCLUSIONS[src];
|
| +        INCLUSIONS[src] = NULL;
|
| +    }
|
| +
|
| +    // The pattern argument is irrelevant for deletion, hence NULL.
|
| +    UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance();
|
| +    return TRUE;
|
| +}
|
| +
|
| +U_CDECL_END
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +/*
|
| +Reduce excessive reallocation, and make it easier to detect initialization
|
| +problems.
|
| +Usually you don't see smaller sets than this for Unicode 5.0.
|
| +*/
|
| +#define DEFAULT_INCLUSION_CAPACITY 3072
|
| +// Returns the cached set of property-start code points for the UPROPS_SRC_* value src, building it on first use.
|
| +const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
|
| +    // src indexes INCLUSIONS: reject an out-of-range value before the first array access.
|
| +    if (src < 0 || src >= UPROPS_SRC_COUNT) { status = U_INTERNAL_PROGRAM_ERROR; return NULL; }
|
| +    UBool needInit;
|
| +    UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);
|
| +    if (needInit) {
|
| +        UnicodeSet* incl = new UnicodeSet();
|
| +        USetAdder sa = {
|
| +            (USet *)incl,
|
| +            _set_add,
|
| +            _set_addRange,
|
| +            _set_addString,
|
| +            NULL, NULL // don't need remove() or removeRange()
|
| +        };
|
| +        if (incl != NULL) {
|
| +            incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
|
| +            switch(src) {
|
| +            case UPROPS_SRC_CHAR:
|
| +                uchar_addPropertyStarts(&sa, &status);
|
| +                break;
|
| +            case UPROPS_SRC_PROPSVEC:
|
| +                upropsvec_addPropertyStarts(&sa, &status);
|
| +                break;
|
| +            case UPROPS_SRC_CHAR_AND_PROPSVEC:
|
| +                uchar_addPropertyStarts(&sa, &status);
|
| +                upropsvec_addPropertyStarts(&sa, &status);
|
| +                break;
|
| +#if !UCONFIG_NO_NORMALIZATION
|
| +            case UPROPS_SRC_CASE_AND_NORM: {
|
| +                const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
|
| +                if(U_SUCCESS(status)) {
|
| +                    impl->addPropertyStarts(&sa, status);
|
| +                }
|
| +                ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
|
| +                break;
|
| +            }
|
| +            case UPROPS_SRC_NFC: {
|
| +                const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
|
| +                if(U_SUCCESS(status)) {
|
| +                    impl->addPropertyStarts(&sa, status);
|
| +                }
|
| +                break;
|
| +            }
|
| +            case UPROPS_SRC_NFKC: {
|
| +                const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
|
| +                if(U_SUCCESS(status)) {
|
| +                    impl->addPropertyStarts(&sa, status);
|
| +                }
|
| +                break;
|
| +            }
|
| +            case UPROPS_SRC_NFKC_CF: {
|
| +                const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
|
| +                if(U_SUCCESS(status)) {
|
| +                    impl->addPropertyStarts(&sa, status);
|
| +                }
|
| +                break;
|
| +            }
|
| +            case UPROPS_SRC_NFC_CANON_ITER: {
|
| +                const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
|
| +                if(U_SUCCESS(status)) {
|
| +                    impl->addCanonIterPropertyStarts(&sa, status);
|
| +                }
|
| +                break;
|
| +            }
|
| +#endif
|
| +            case UPROPS_SRC_CASE:
|
| +                ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
|
| +                break;
|
| +            case UPROPS_SRC_BIDI:
|
| +                ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);
|
| +                break;
|
| +            default:
|
| +                status = U_INTERNAL_PROGRAM_ERROR;
|
| +                break;
|
| +            }
|
| +            if (U_SUCCESS(status)) {
|
| +                incl->compact(); // Compact for caching
|
| +                umtx_lock(NULL);
|
| +                if (INCLUSIONS[src] == NULL) { // publish unless another thread filled the slot first
|
| +                    INCLUSIONS[src] = incl;
|
| +                    incl = NULL;
|
| +                    ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
|
| +                }
|
| +                umtx_unlock(NULL);
|
| +            }
|
| +            delete incl; // NULL if published; otherwise discards a failed or redundant set
|
| +        } else {
|
| +            status = U_MEMORY_ALLOCATION_ERROR;
|
| +        }
|
| +    }
|
| +    return INCLUSIONS[src]; // owned by the cache (freed in uset_cleanup); NULL if nothing was ever published for src
|
| +}
|
| +
|
| +// Cache some sets for other services -------------------------------------- ***
|
| +// Returns the UnicodeSet for the pattern [:age=3.2:], obtained through UnicodeSetSingleton on uni32Singleton.
|
| +U_CFUNC UnicodeSet *
|
| +uniset_getUnicode32Instance(UErrorCode &errorCode) {
|
| + return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode);
|
| +}
|
| +
|
| +// helper functions for matching of pattern syntax pieces ------------------ ***
|
| +// these functions are parallel to the PERL_OPEN etc. strings above
|
| +
|
| +// using these functions is not only faster than UnicodeString::compare() and
|
| +// caseCompare(), but they also make UnicodeSet work for simple patterns when
|
| +// no Unicode properties data is available - when caseCompare() fails
|
| +// TRUE if pattern has a backslash at pos followed by p or P, i.e. the opener of a Perl-style property.
|
| +static inline UBool
|
| +isPerlOpen(const UnicodeString &pattern, int32_t pos) {
|
| +    const UChar letter = (pattern.charAt(pos) == BACKSLASH) ? pattern.charAt(pos+1) : (UChar)0;
|
| +    return letter == LOWER_P || letter == UPPER_P;
|
| +}
|
| +
|
| +/*static inline UBool
|
| +isPerlClose(const UnicodeString &pattern, int32_t pos) {
|
| + return pattern.charAt(pos)==CLOSE_BRACE;
|
| +}*/
|
| +// TRUE if pattern has a backslash at pos followed by N, i.e. the opener of the \N name syntax.
|
| +static inline UBool
|
| +isNameOpen(const UnicodeString &pattern, int32_t pos) {
|
| +    return (pattern.charAt(pos) != BACKSLASH) ? (UBool)FALSE : (UBool)(pattern.charAt(pos+1) == UPPER_N);
|
| +}
|
| +// TRUE if pattern has [ at pos followed by a colon, i.e. the opener of a POSIX-style property.
|
| +static inline UBool
|
| +isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
|
| +    return !(pattern.charAt(pos) != SET_OPEN || pattern.charAt(pos+1) != COLON);
|
| +}
|
| +
|
| +/*static inline UBool
|
| +isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
|
| + return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
|
| +}*/
|
| +
|
| +// TODO memory debugging provided inside uniset.cpp
|
| +// could be made available here but probably obsolete with use of modern
|
| +// memory leak checker tools
|
| +#define _dbgct(me) // intentionally expands to nothing in this file
|
| +
|
| +//----------------------------------------------------------------
|
| +// Constructors &c
|
| +//----------------------------------------------------------------
|
| +
|
| +/**
|
| + * Constructs a set from the given pattern, ignoring white space in it
|
| + * (the two-argument applyPattern() is used). See the class description
|
| + * for the syntax of the pattern language.
|
| + * @param pattern a string specifying what characters are in the set
|
| + */
|
| +UnicodeSet::UnicodeSet(const UnicodeString& pattern,
|
| + UErrorCode& status) : // status: in/out error code; nothing is allocated or parsed if it is already a failure
|
| + len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
|
| + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
|
| + fFlags(0)
|
| +{
|
| + if(U_SUCCESS(status)){
|
| + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
|
| + /* test for NULL */
|
| + if(list == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR; // list stays NULL in this case
|
| + }else{
|
| + allocateStrings(status);
|
| + applyPattern(pattern, status); // checks status itself before parsing
|
| + }
|
| + }
|
| + _dbgct(this);
|
| +}
|
| +
|
| +//----------------------------------------------------------------
|
| +// Public API
|
| +//----------------------------------------------------------------
|
| +
|
| +UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
|
| +                                     UErrorCode& status) {
|
| +    // Same effect as applyPattern(pattern, USET_IGNORE_SPACE, NULL, status),
|
| +    // but this route does not depend on closeOver().
|
| +    ParsePosition parsePos(0);
|
| +    applyPatternIgnoreSpace(pattern, parsePos, NULL, status);
|
| +    if (U_SUCCESS(status)) {
|
| +        // The set must be the whole pattern: after it, only white space
|
| +        // may remain before the end of the string.
|
| +        int32_t tail = parsePos.getIndex();
|
| +        ICU_Utility::skipWhitespace(pattern, tail, TRUE);
|
| +        if (tail != pattern.length()) {
|
| +            status = U_ILLEGAL_ARGUMENT_ERROR;
|
| +        }
|
| +    }
|
| +    return *this;
|
| +}
|
| +// Parses pattern from pos (white space ignored) into this set and stores the rebuilt pattern; sets status on failure.
|
| +void
|
| +UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
|
| + ParsePosition& pos, // in/out: handed to the RuleCharacterIterator that walks the pattern
|
| + const SymbolTable* symbols, // may be NULL
|
| + UErrorCode& status) {
|
| + if (U_FAILURE(status)) {
|
| + return;
|
| + }
|
| + if (isFrozen()) {
|
| + status = U_NO_WRITE_PERMISSION; // a frozen set is not modified
|
| + return;
|
| + }
|
| + // Need to build the pattern in a temporary string because
|
| + // _applyPattern calls add() etc., which set pat to empty.
|
| + UnicodeString rebuiltPat;
|
| + RuleCharacterIterator chars(pattern, symbols, pos);
|
| + applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); // NULL caseClosure: no case option is passed
|
| + if (U_FAILURE(status)) return;
|
| + if (chars.inVariable()) {
|
| + // syntaxError(chars, "Extra chars in variable value");
|
| + status = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| + setPattern(rebuiltPat);
|
| +}
|
| +
|
| +/**
|
| + * Tells whether a UnicodeSet pattern appears to start at index pos of pattern:
|
| + * either a '[' that is not the last code unit, or something resembling a property pattern.
|
| + */
|
| +UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
|
| +    const UBool opensSet = (pos+1) < pattern.length() && pattern.charAt(pos) == SET_OPEN;
|
| +    // The property check runs only when no plain set opener was found.
|
| +    return opensSet || resemblesPropertyPattern(pattern, pos);
|
| +}
|
| +
|
| +//----------------------------------------------------------------
|
| +// Implementation: Pattern parsing
|
| +//----------------------------------------------------------------
|
| +
|
| +/**
|
| + * A small all-inline class that owns a lazily allocated UnicodeSet and deletes it
|
| + * in its destructor. Add operator->() etc. as needed. Copying is not handled.
|
| + */
|
| +class UnicodeSetPointer {
|
| + UnicodeSet* p; // owned; 0 until allocate() succeeds
|
| +public:
|
| + inline UnicodeSetPointer() : p(0) {}
|
| + inline ~UnicodeSetPointer() { delete p; }
|
| + inline UnicodeSet* pointer() { return p; } // ownership stays with this object
|
| + inline UBool allocate() { // creates the set only if there is none yet; FALSE if there is still no set afterwards
|
| + if (p == 0) {
|
| + p = new UnicodeSet();
|
| + }
|
| + return p != 0;
|
| + }
|
| +};
|
| +
|
| +/**
|
| + * Parse the pattern from the given RuleCharacterIterator and replace this
|
| + * set's contents with it. The iterator is advanced over the parsed pattern.
|
| + * @param chars iterator over the pattern characters. Upon return it is
|
| + * positioned at the first character after the parsed pattern, or at the
|
| + * end of the iteration if all characters are parsed.
|
| + * @param symbols symbol table to use to parse and dereference
|
| + * variables, or null if none.
|
| + * @param rebuiltPat receives the pattern that was parsed, rebuilt or
|
| + * copied from the input pattern, as appropriate.
|
| + * @param options bit mask; USET_IGNORE_SPACE, USET_CASE_INSENSITIVE and USET_ADD_CASE_MAPPINGS are examined.
|
| + * @param caseClosure member function invoked for the two case options; it is called unchecked, so it must be non-NULL then.
|
| + * @param ec in/out error code; U_MALFORMED_SET reports syntax errors. Nothing is done if it is already a failure.
|
| + */
|
| +void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
|
| + const SymbolTable* symbols,
|
| + UnicodeString& rebuiltPat,
|
| + uint32_t options,
|
| + UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
|
| + UErrorCode& ec) {
|
| + if (U_FAILURE(ec)) return;
|
| +
|
| + // Syntax characters: [ ] ^ - & { }
|
| +
|
| + // Recognized special forms for chars, sets: c-c s-s s&s
|
| +
|
| + int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
|
| + RuleCharacterIterator::PARSE_ESCAPES;
|
| + if ((options & USET_IGNORE_SPACE) != 0) {
|
| + opts |= RuleCharacterIterator::SKIP_WHITESPACE;
|
| + }
|
| +
|
| + UnicodeString patLocal, buf; // patLocal: pattern text rebuilt while parsing; buf: contents of the current {string}
|
| + UBool usePat = FALSE; // set once patLocal must be used instead of a generated pattern
|
| + UnicodeSetPointer scratch; // lazily allocated set used for inline nested sets and property patterns
|
| + RuleCharacterIterator::Pos backup;
|
| +
|
| + // mode: 0=before [, 1=between [...], 2=after ]
|
| + // lastItem: 0=none, 1=char, 2=set
|
| + int8_t lastItem = 0, mode = 0;
|
| + UChar32 lastChar = 0; // the pending character while lastItem == 1; not yet added to the set
|
| + UChar op = 0; // pending operator: 0, HYPHEN or INTERSECTION
|
| +
|
| + UBool invert = FALSE;
|
| +
|
| + clear(); // start from an empty set; everything parsed below is added to it
|
| +
|
| + while (mode != 2 && !chars.atEnd()) {
|
| + U_ASSERT((lastItem == 0 && op == 0) ||
|
| + (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
|
| + (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
|
| + op == INTERSECTION /*'&'*/)));
|
| +
|
| + UChar32 c = 0;
|
| + UBool literal = FALSE;
|
| + UnicodeSet* nested = 0; // alias - do not delete
|
| +
|
| + // -------- Check for property pattern
|
| +
|
| + // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
|
| + int8_t setMode = 0;
|
| + if (resemblesPropertyPattern(chars, opts)) {
|
| + setMode = 2;
|
| + }
|
| +
|
| + // -------- Parse '[' of opening delimiter OR nested set.
|
| + // If there is a nested set, use `setMode' to define how
|
| + // the set should be parsed. If the '[' is part of the
|
| + // opening delimiter for this pattern, parse special
|
| + // strings "[", "[^", "[-", and "[^-". Check for stand-in
|
| + // characters representing a nested set in the symbol
|
| + // table.
|
| +
|
| + else {
|
| + // Prepare to backup if necessary
|
| + chars.getPos(backup);
|
| + c = chars.next(opts, literal, ec);
|
| + if (U_FAILURE(ec)) return;
|
| +
|
| + if (c == 0x5B /*'['*/ && !literal) {
|
| + if (mode == 1) {
|
| + chars.setPos(backup); // backup
|
| + setMode = 1;
|
| + } else {
|
| + // Handle opening '[' delimiter
|
| + mode = 1;
|
| + patLocal.append((UChar) 0x5B /*'['*/);
|
| + chars.getPos(backup); // prepare to backup
|
| + c = chars.next(opts, literal, ec);
|
| + if (U_FAILURE(ec)) return;
|
| + if (c == 0x5E /*'^'*/ && !literal) {
|
| + invert = TRUE;
|
| + patLocal.append((UChar) 0x5E /*'^'*/);
|
| + chars.getPos(backup); // prepare to backup
|
| + c = chars.next(opts, literal, ec);
|
| + if (U_FAILURE(ec)) return;
|
| + }
|
| + // Fall through to handle special leading '-';
|
| + // otherwise restart loop for nested [], \p{}, etc.
|
| + if (c == HYPHEN /*'-'*/) {
|
| + literal = TRUE;
|
| + // Fall through to handle literal '-' below
|
| + } else {
|
| + chars.setPos(backup); // backup
|
| + continue;
|
| + }
|
| + }
|
| + } else if (symbols != 0) {
|
| + const UnicodeFunctor *m = symbols->lookupMatcher(c);
|
| + if (m != 0) {
|
| + const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
|
| + if (ms == NULL) {
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| + // casting away const, but `nested' won't be modified
|
| + // (important not to modify stored set)
|
| + nested = const_cast<UnicodeSet*>(ms);
|
| + setMode = 3;
|
| + }
|
| + }
|
| + }
|
| +
|
| + // -------- Handle a nested set. This either is inline in
|
| + // the pattern or represented by a stand-in that has
|
| + // previously been parsed and was looked up in the symbol
|
| + // table.
|
| +
|
| + if (setMode != 0) {
|
| + if (lastItem == 1) {
|
| + if (op != 0) {
|
| + // syntaxError(chars, "Char expected after operator");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| + add(lastChar, lastChar);
|
| + _appendToPat(patLocal, lastChar, FALSE);
|
| + lastItem = 0;
|
| + op = 0;
|
| + }
|
| +
|
| + if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
|
| + patLocal.append(op);
|
| + }
|
| +
|
| + if (nested == 0) {
|
| + // lazy allocation
|
| + if (!scratch.allocate()) {
|
| + ec = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| + nested = scratch.pointer();
|
| + }
|
| + switch (setMode) {
|
| + case 1:
|
| + nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec); // recursion for an inline nested set; ec is not checked right here
|
| + break;
|
| + case 2:
|
| + chars.skipIgnored(opts);
|
| + nested->applyPropertyPattern(chars, patLocal, ec);
|
| + if (U_FAILURE(ec)) return;
|
| + break;
|
| + case 3: // `nested' already parsed
|
| + nested->_toPattern(patLocal, FALSE);
|
| + break;
|
| + }
|
| +
|
| + usePat = TRUE;
|
| +
|
| + if (mode == 0) {
|
| + // Entire pattern is a category; leave parse loop
|
| + *this = *nested;
|
| + mode = 2;
|
| + break;
|
| + }
|
| +
|
| + switch (op) {
|
| + case HYPHEN: /*'-'*/
|
| + removeAll(*nested);
|
| + break;
|
| + case INTERSECTION: /*'&'*/
|
| + retainAll(*nested);
|
| + break;
|
| + case 0:
|
| + addAll(*nested);
|
| + break;
|
| + }
|
| +
|
| + op = 0;
|
| + lastItem = 2;
|
| +
|
| + continue;
|
| + }
|
| +
|
| + if (mode == 0) {
|
| + // syntaxError(chars, "Missing '['");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| +
|
| + // -------- Parse special (syntax) characters. If the
|
| + // current character is not special, or if it is escaped,
|
| + // then fall through and handle it below.
|
| +
|
| + if (!literal) {
|
| + switch (c) {
|
| + case 0x5D /*']'*/:
|
| + if (lastItem == 1) {
|
| + add(lastChar, lastChar);
|
| + _appendToPat(patLocal, lastChar, FALSE);
|
| + }
|
| + // Treat final trailing '-' as a literal
|
| + if (op == HYPHEN /*'-'*/) {
|
| + add(op, op);
|
| + patLocal.append(op);
|
| + } else if (op == INTERSECTION /*'&'*/) {
|
| + // syntaxError(chars, "Trailing '&'");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| + patLocal.append((UChar) 0x5D /*']'*/);
|
| + mode = 2;
|
| + continue;
|
| + case HYPHEN /*'-'*/:
|
| + if (op == 0) {
|
| + if (lastItem != 0) {
|
| + op = (UChar) c;
|
| + continue;
|
| + } else {
|
| + // No pending char or set (e.g. after a range): '-' is literal only if ']' follows
|
| + add(c, c);
|
| + c = chars.next(opts, literal, ec);
|
| + if (U_FAILURE(ec)) return;
|
| + if (c == 0x5D /*']'*/ && !literal) {
|
| + patLocal.append(HYPHEN_RIGHT_BRACE, 2);
|
| + mode = 2;
|
| + continue;
|
| + }
|
| + }
|
| + }
|
| + // syntaxError(chars, "'-' not after char or set");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + case INTERSECTION /*'&'*/:
|
| + if (lastItem == 2 && op == 0) {
|
| + op = (UChar) c;
|
| + continue;
|
| + }
|
| + // syntaxError(chars, "'&' not after set");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + case 0x5E /*'^'*/:
|
| + // syntaxError(chars, "'^' not after '['");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + case 0x7B /*'{'*/:
|
| + if (op != 0) {
|
| + // syntaxError(chars, "Missing operand after operator");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| + if (lastItem == 1) {
|
| + add(lastChar, lastChar);
|
| + _appendToPat(patLocal, lastChar, FALSE);
|
| + }
|
| + lastItem = 0;
|
| + buf.truncate(0);
|
| + {
|
| + UBool ok = FALSE;
|
| + while (!chars.atEnd()) {
|
| + c = chars.next(opts, literal, ec);
|
| + if (U_FAILURE(ec)) return;
|
| + if (c == 0x7D /*'}'*/ && !literal) {
|
| + ok = TRUE;
|
| + break;
|
| + }
|
| + buf.append(c);
|
| + }
|
| + if (buf.length() < 1 || !ok) {
|
| + // syntaxError(chars, "Invalid multicharacter string");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| + }
|
| + // We have new string. Add it to set and continue;
|
| + // we don't need to drop through to the further
|
| + // processing
|
| + add(buf);
|
| + patLocal.append((UChar) 0x7B /*'{'*/);
|
| + _appendToPat(patLocal, buf, FALSE);
|
| + patLocal.append((UChar) 0x7D /*'}'*/);
|
| + continue;
|
| + case SymbolTable::SYMBOL_REF:
|
| + // symbols nosymbols
|
| + // [a-$] error error (ambiguous)
|
| + // [a$] anchor anchor
|
| + // [a-$x] var "x"* literal '$'
|
| + // [a-$.] error literal '$'
|
| + // *We won't get here in the case of var "x"
|
| + {
|
| + chars.getPos(backup);
|
| + c = chars.next(opts, literal, ec);
|
| + if (U_FAILURE(ec)) return;
|
| + UBool anchor = (c == 0x5D /*']'*/ && !literal);
|
| + if (symbols == 0 && !anchor) {
|
| + c = SymbolTable::SYMBOL_REF;
|
| + chars.setPos(backup);
|
| + break; // literal '$'
|
| + }
|
| + if (anchor && op == 0) {
|
| + if (lastItem == 1) {
|
| + add(lastChar, lastChar);
|
| + _appendToPat(patLocal, lastChar, FALSE);
|
| + }
|
| + add(U_ETHER);
|
| + usePat = TRUE;
|
| + patLocal.append((UChar) SymbolTable::SYMBOL_REF);
|
| + patLocal.append((UChar) 0x5D /*']'*/);
|
| + mode = 2;
|
| + continue;
|
| + }
|
| + // syntaxError(chars, "Unquoted '$'");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| + default:
|
| + break;
|
| + }
|
| + }
|
| +
|
| + // -------- Parse literal characters. This includes both
|
| + // escaped chars ("\u4E01") and non-syntax characters
|
| + // ("a").
|
| +
|
| + switch (lastItem) {
|
| + case 0:
|
| + lastItem = 1;
|
| + lastChar = c;
|
| + break;
|
| + case 1:
|
| + if (op == HYPHEN /*'-'*/) {
|
| + if (lastChar >= c) {
|
| + // Don't allow redundant (a-a) or empty (b-a) ranges;
|
| + // these are most likely typos.
|
| + // syntaxError(chars, "Invalid range");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| + add(lastChar, c);
|
| + _appendToPat(patLocal, lastChar, FALSE);
|
| + patLocal.append(op);
|
| + _appendToPat(patLocal, c, FALSE);
|
| + lastItem = 0;
|
| + op = 0;
|
| + } else {
|
| + add(lastChar, lastChar);
|
| + _appendToPat(patLocal, lastChar, FALSE);
|
| + lastChar = c;
|
| + }
|
| + break;
|
| + case 2:
|
| + if (op != 0) {
|
| + // syntaxError(chars, "Set expected after operator");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| + lastChar = c;
|
| + lastItem = 1;
|
| + break;
|
| + }
|
| + }
|
| +
|
| + if (mode != 2) {
|
| + // syntaxError(chars, "Missing ']'");
|
| + ec = U_MALFORMED_SET;
|
| + return;
|
| + }
|
| +
|
| + chars.skipIgnored(opts);
|
| +
|
| + /**
|
| + * Handle global flags (invert, case insensitivity). If this
|
| + * pattern should be compiled case-insensitive, then we need
|
| + * to close over case BEFORE COMPLEMENTING. This makes
|
| + * patterns like /[^abc]/i work.
|
| + */
|
| + if ((options & USET_CASE_INSENSITIVE) != 0) {
|
| + (this->*caseClosure)(USET_CASE_INSENSITIVE); // caseClosure is used without a NULL check
|
| + }
|
| + else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
|
| + (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); // caseClosure is used without a NULL check
|
| + }
|
| + if (invert) {
|
| + complement();
|
| + }
|
| +
|
| + // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
|
| + // generated pattern.
|
| + if (usePat) {
|
| + rebuiltPat.append(patLocal);
|
| + } else {
|
| + _generatePattern(rebuiltPat, FALSE);
|
| + }
|
| + if (isBogus() && U_SUCCESS(ec)) {
|
| + // We likely ran out of memory. AHHH!
|
| + ec = U_MEMORY_ALLOCATION_ERROR;
|
| + }
|
| +}
|
| +
|
| +//----------------------------------------------------------------
|
| +// Property set implementation
|
| +//----------------------------------------------------------------
|
| +// Filter: TRUE if the numeric value of ch equals the double that context points to (exact == comparison).
|
| +static UBool numericValueFilter(UChar32 ch, void* context) {
|
| + return u_getNumericValue(ch) == *(double*)context;
|
| +}
|
| +// Filter: TRUE if the general category mask of ch shares a bit with the int32_t mask behind context.
|
| +static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
|
| +    const int32_t wantedMask = *static_cast<const int32_t*>(context);
|
| +    return (U_GET_GC_MASK(ch) & wantedMask) ? TRUE : FALSE;
|
| +}
|
| +// Filter: TRUE if ch has a nonzero age that is not later than the UVersionInfo behind context.
|
| +static UBool versionFilter(UChar32 ch, void* context) {
|
| +    static const UVersionInfo unassigned = { 0, 0, 0, 0 };
|
| +    UVersionInfo age;
|
| +    u_charAge(ch, age);
|
| +    if (uprv_memcmp(&age, &unassigned, sizeof(age)) <= 0) { return FALSE; }
|
| +    return uprv_memcmp(&age, context, sizeof(age)) <= 0;
|
| +}
|
| +
|
| +typedef struct { // context for intPropertyFilter
|
| + UProperty prop; // property to query
|
| + int32_t value; // value that the property must equal
|
| +} IntPropertyContext;
|
| +// Filter: TRUE if the integer value of property c->prop for ch equals c->value (context is an IntPropertyContext).
|
| +static UBool intPropertyFilter(UChar32 ch, void* context) {
|
| + IntPropertyContext* c = (IntPropertyContext*)context;
|
| + return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
|
| +}
|
| +// Filter: result of uscript_hasScript() for ch and the UScriptCode that context points to.
|
| +static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
|
| + return uscript_hasScript(ch, *(UScriptCode*)context);
|
| +}
|
| +
|
| +/**
|
| + * Generic filter-based scanning code for UCD property UnicodeSets: clears this set, then adds the runs of scanned code points that filter accepts.
|
| + */
|
| +void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
|
| + void* context, // passed unchanged to every filter call
|
| + int32_t src, // properties source whose inclusions set drives the scan
|
| + UErrorCode &status) {
|
| + if (U_FAILURE(status)) return;
|
| +
|
| + // Logically, walk through all Unicode characters, noting the start
|
| + // and end of each range for which (*filter)(c, context) is
|
| + // true. Add each range to a set.
|
| + //
|
| + // To improve performance, use an inclusions set which
|
| + // encodes information about character ranges that are known
|
| + // to have identical properties.
|
| + // getInclusions(src) contains exactly the first characters of
|
| + // same-value ranges for the given properties "source".
|
| + const UnicodeSet* inclusions = getInclusions(src, status);
|
| + if (U_FAILURE(status)) {
|
| + return;
|
| + }
|
| +
|
| + clear();
|
| +
|
| + UChar32 startHasProperty = -1; // first code point of the run currently open, or -1 if none is open
|
| + int32_t limitRange = inclusions->getRangeCount();
|
| +
|
| + for (int j=0; j<limitRange; ++j) {
|
| + // get current range
|
| + UChar32 start = inclusions->getRangeStart(j);
|
| + UChar32 end = inclusions->getRangeEnd(j);
|
| +
|
| + // for all the code points in the range, process
|
| + for (UChar32 ch = start; ch <= end; ++ch) {
|
| + // only add to this UnicodeSet on inflection points --
|
| + // where the hasProperty value changes to false
|
| + if ((*filter)(ch, context)) {
|
| + if (startHasProperty < 0) {
|
| + startHasProperty = ch;
|
| + }
|
| + } else if (startHasProperty >= 0) {
|
| + add(startHasProperty, ch-1); // the run ends just before the first rejected code point
|
| + startHasProperty = -1;
|
| + }
|
| + }
|
| + }
|
| + if (startHasProperty >= 0) {
|
| + add((UChar32)startHasProperty, (UChar32)0x10FFFF); // a run still open after the last inclusion extends to U+10FFFF
|
| + }
|
| + if (isBogus() && U_SUCCESS(status)) {
|
| + // We likely ran out of memory. AHHH!
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + }
|
| +}
|
| +// Copies src to dst without leading or trailing spaces and with each run of spaces reduced to one.
|
| +static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
|
| +    /* Note: we use ' ' in compiler code page */
|
| +    const int32_t maxLength = dstCapacity - 1; /* keep one char free for the terminating zero */
|
| +    int32_t length = 0;
|
| +    for (; *src != 0; ++src) {
|
| +        // A space is dropped when it would start the output or follow another space.
|
| +        if (*src == ' ' && (length == 0 || dst[length-1] == ' ')) {
|
| +            continue;
|
| +        }
|
| +        if (length >= maxLength) return FALSE; /* dst is too small */
|
| +        dst[length++] = *src;
|
| +    }
|
| +    if (length > 0 && dst[length-1] == ' ') --length; /* at most one trailing space can remain */
|
| +    dst[length] = 0;
|
| +    return TRUE;
|
| +}
|
| +
|
| +//----------------------------------------------------------------
|
| +// Property set API
|
| +//----------------------------------------------------------------
|
| +
|
| +#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} // returns *this, so it is only usable inside UnicodeSet members returning UnicodeSet&
|
| +// Replaces this set with the code points whose property prop has the given value; a frozen set is left unchanged.
|
| +UnicodeSet&
|
| +UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
|
| + if (U_FAILURE(ec) || isFrozen()) return *this; // NOTE(review): frozen is a silent no-op here, while applyPatternIgnoreSpace sets U_NO_WRITE_PERMISSION
|
| +
|
| + if (prop == UCHAR_GENERAL_CATEGORY_MASK) { // value is a mask of general categories
|
| + applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
|
| + } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { // value is a UScriptCode
|
| + UScriptCode script = (UScriptCode)value;
|
| + applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
|
| + } else { // any other property: exact match on its integer value
|
| + IntPropertyContext c = {prop, value};
|
| + applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
|
| + }
|
| + return *this;
|
| +}
|
| +
|
| +/**
|
| + * Modify this set according to a property name and value given as
|
| + * strings, for example ("gc", "Cf").
|
| + *
|
| + * @param prop  a property alias. When value is empty, prop is instead tried
|
| + *              as a General_Category (mask) value, then as a Script value,
|
| + *              then as a binary property name, and finally against the
|
| + *              special names held in ANY, ASCII and ASSIGNED.
|
| + * @param value a property value alias; a number for the combining class
|
| + *              properties (ccc, lccc, tccc) and for Numeric_Value; a
|
| + *              character name for Name; a version string for Age; a script
|
| + *              name for Script_Extensions. May be empty (see prop).
|
| + * @param ec    in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when prop
|
| + *              or value is not recognized or not supported, and to
|
| + *              U_MEMORY_ALLOCATION_ERROR when the set ends up bogus.
|
| + * @return *this. Nothing is done if ec already indicates a failure or
|
| + *         if this set is frozen.
|
| + */
|
| +UnicodeSet&
|
| +UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
|
| +                               const UnicodeString& value,
|
| +                               UErrorCode& ec) {
|
| +    if (U_FAILURE(ec) || isFrozen()) return *this;
|
| +
|
| +    // prop and value used to be converted to char * using the default
|
| +    // converter instead of the invariant conversion.
|
| +    // This should not be necessary because all Unicode property and value
|
| +    // names use only invariant characters.
|
| +    // If there are any variant characters, then we won't find them anyway.
|
| +    // Checking first avoids assertion failures in the conversion.
|
| +    if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
|
| +        !uprv_isInvariantUString(value.getBuffer(), value.length())
|
| +    ) {
|
| +        FAIL(ec);
|
| +    }
|
| +    CharString pname, vname;
|
| +    pname.appendInvariantChars(prop, ec);
|
| +    vname.appendInvariantChars(value, ec);
|
| +    if (U_FAILURE(ec)) return *this;
|
| +
|
| +    UProperty p;
|
| +    int32_t v;
|
| +    UBool mustNotBeEmpty = FALSE, invert = FALSE;
|
| +
|
| +    if (value.length() > 0) {
|
| +        p = u_getPropertyEnum(pname.data());
|
| +        if (p == UCHAR_INVALID_CODE) FAIL(ec);
|
| +
|
| +        // Treat gc as gcm
|
| +        if (p == UCHAR_GENERAL_CATEGORY) {
|
| +            p = UCHAR_GENERAL_CATEGORY_MASK;
|
| +        }
|
| +
|
| +        if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
|
| +            (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
|
| +            (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
|
| +            v = u_getPropertyValueEnum(p, vname.data());
|
| +            if (v == UCHAR_INVALID_CODE) {
|
| +                // Handle numeric CCC
|
| +                if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
|
| +                    p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
|
| +                    p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
|
| +                    char* end;
|
| +                    double ccc = uprv_strtod(vname.data(), &end);
|
| +                    // Range-check before converting: casting a NaN or a
|
| +                    // double that does not fit into int32_t is undefined
|
| +                    // behavior. Combining classes lie in 0..255, and a NaN
|
| +                    // fails both comparisons, so it is rejected here too.
|
| +                    if (*end != 0 || !(0 <= ccc && ccc <= 255)) {
|
| +                        // trailing junk, NaN, negative or too large
|
| +                        FAIL(ec);
|
| +                    }
|
| +                    v = (int32_t) ccc;
|
| +                    if (v != ccc) {
|
| +                        // non-integral value
|
| +                        FAIL(ec);
|
| +                    }
|
| +                    // If the resultant set is empty then the numeric value
|
| +                    // was invalid.
|
| +                    mustNotBeEmpty = TRUE;
|
| +                } else {
|
| +                    FAIL(ec);
|
| +                }
|
| +            }
|
| +        }
|
| +
|
| +        else {
|
| +
|
| +            switch (p) {
|
| +            case UCHAR_NUMERIC_VALUE:
|
| +                {
|
| +                    char* end;
|
| +                    // Named so that it does not shadow the value parameter.
|
| +                    double numericValue = uprv_strtod(vname.data(), &end);
|
| +                    if (*end != 0) {
|
| +                        FAIL(ec);
|
| +                    }
|
| +                    applyFilter(numericValueFilter, &numericValue, UPROPS_SRC_CHAR, ec);
|
| +                    return *this;
|
| +                }
|
| +            case UCHAR_NAME:
|
| +                {
|
| +                    // Must munge name, since u_charFromName() does not do
|
| +                    // 'loose' matching.
|
| +                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
|
| +                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
|
| +                    UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
|
| +                    if (U_SUCCESS(ec)) {
|
| +                        // The set becomes exactly the one named character.
|
| +                        clear();
|
| +                        add(ch);
|
| +                        return *this;
|
| +                    } else {
|
| +                        FAIL(ec);
|
| +                    }
|
| +                }
|
| +            case UCHAR_UNICODE_1_NAME:
|
| +                // ICU 49 deprecates the Unicode_1_Name property APIs.
|
| +                FAIL(ec);
|
| +            case UCHAR_AGE:
|
| +                {
|
| +                    // Must munge name, since u_versionFromString() does not do
|
| +                    // 'loose' matching.
|
| +                    char buf[128];
|
| +                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
|
| +                    UVersionInfo version;
|
| +                    u_versionFromString(version, buf);
|
| +                    applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
|
| +                    return *this;
|
| +                }
|
| +            case UCHAR_SCRIPT_EXTENSIONS:
|
| +                // The value of scx is looked up as a Script value name.
|
| +                v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
|
| +                if (v == UCHAR_INVALID_CODE) {
|
| +                    FAIL(ec);
|
| +                }
|
| +                // fall through to calling applyIntPropertyValue()
|
| +                break;
|
| +            default:
|
| +                // p is a non-binary, non-enumerated property that we
|
| +                // don't support (yet).
|
| +                FAIL(ec);
|
| +            }
|
| +        }
|
| +    }
|
| +
|
| +    else {
|
| +        // value is empty.  Interpret as General Category, Script, or
|
| +        // Binary property.
|
| +        p = UCHAR_GENERAL_CATEGORY_MASK;
|
| +        v = u_getPropertyValueEnum(p, pname.data());
|
| +        if (v == UCHAR_INVALID_CODE) {
|
| +            p = UCHAR_SCRIPT;
|
| +            v = u_getPropertyValueEnum(p, pname.data());
|
| +            if (v == UCHAR_INVALID_CODE) {
|
| +                p = u_getPropertyEnum(pname.data());
|
| +                if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
|
| +                    v = 1;
|
| +                } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
|
| +                    set(MIN_VALUE, MAX_VALUE);
|
| +                    return *this;
|
| +                } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
|
| +                    set(0, 0x7F);
|
| +                    return *this;
|
| +                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
|
| +                    // [:Assigned:]=[:^Cn:]
|
| +                    p = UCHAR_GENERAL_CATEGORY_MASK;
|
| +                    v = U_GC_CN_MASK;
|
| +                    invert = TRUE;
|
| +                } else {
|
| +                    FAIL(ec);
|
| +                }
|
| +            }
|
| +        }
|
| +    }
|
| +
|
| +    applyIntPropertyValue(p, v, ec);
|
| +    if(invert) {
|
| +        complement();
|
| +    }
|
| +
|
| +    if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
|
| +        // mustNotBeEmpty is set to true if an empty set indicates
|
| +        // invalid input.
|
| +        ec = U_ILLEGAL_ARGUMENT_ERROR;
|
| +    }
|
| +
|
| +    if (isBogus() && U_SUCCESS(ec)) {
|
| +        // We likely ran out of memory. AHHH!
|
| +        ec = U_MEMORY_ALLOCATION_ERROR;
|
| +    }
|
| +    return *this;
|
| +}
|
| +
|
| +//----------------------------------------------------------------
|
| +// Property set patterns
|
| +//----------------------------------------------------------------
|
| +
|
| +/**
|
| + * Quick check whether a property set pattern appears to begin at
|
| + * offset pos of pattern. Only the opening delimiter is examined, using
|
| + * isPOSIXOpen(), isPerlOpen() and isNameOpen().
|
| + * @return TRUE if one of the opening delimiters is found at pos and at
|
| + *         least 5 characters remain; FALSE otherwise.
|
| + */
|
| +UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
|
| +                                           int32_t pos) {
|
| +    // Anything shorter than 5 characters cannot be a property pattern.
|
| +    if ((pos+5) > pattern.length()) {
|
| +        return FALSE;
|
| +    }
|
| +
|
| +    // Try each opening form in turn: POSIX style, Perl style, name style.
|
| +    if (isPOSIXOpen(pattern, pos)) {
|
| +        return TRUE;
|
| +    }
|
| +    if (isPerlOpen(pattern, pos)) {
|
| +        return TRUE;
|
| +    }
|
| +    if (isNameOpen(pattern, pos)) {
|
| +        return TRUE;
|
| +    }
|
| +    return FALSE;
|
| +}
|
| +
|
| +/**
|
| + * Peek at the next two characters of the iterator and report whether
|
| + * they look like the opening of a property pattern: '[' followed by
|
| + * ':', or a backslash followed by 'N', 'p' or 'P'. The iterator position
|
| + * is saved on entry and restored before returning.
|
| + * @param chars iterator over the pattern characters; unchanged on return.
|
| + * @param iterOpts RuleCharacterIterator options. PARSE_ESCAPES is always
|
| + * masked out, and SKIP_WHITESPACE is masked out for the second character.
|
| + * @return TRUE if the opening matched and no error occurred while reading.
|
| + */
|
| +UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
|
| +                                           int32_t iterOpts) {
|
| +    // NOTE: isLiteral always comes back FALSE, because escapes are not parsed.
|
| +    UBool matched = FALSE, isLiteral;
|
| +    UErrorCode status = U_ZERO_ERROR;
|
| +    iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
|
| +
|
| +    // Remember where we started so that the iterator can be rewound.
|
| +    RuleCharacterIterator::Pos start;
|
| +    chars.getPos(start);
|
| +
|
| +    const UChar32 lead = chars.next(iterOpts, isLiteral, status);
|
| +    if (lead == 0x5B /*'['*/ || lead == 0x5C /*'\\'*/) {
|
| +        const UChar32 follow = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
|
| +                                          isLiteral, status);
|
| +        if (lead == 0x5B /*'['*/) {
|
| +            matched = (follow == 0x3A /*':'*/);
|
| +        } else {
|
| +            switch (follow) {
|
| +            case 0x4E /*'N'*/:
|
| +            case 0x70 /*'p'*/:
|
| +            case 0x50 /*'P'*/:
|
| +                matched = TRUE;
|
| +                break;
|
| +            default:
|
| +                break;
|
| +            }
|
| +        }
|
| +    }
|
| +
|
| +    chars.setPos(start);
|
| +    return matched && U_SUCCESS(status);
|
| +}
|
| +
|
| +/**
|
| + * Parse one property pattern -- [:pat:], \p{pat}, \P{pat} or \N{pat} --
|
| + * that starts at ppos in pattern, and apply it to this set through
|
| + * applyPropertyAlias().
|
| + * @param pattern the text containing the property pattern.
|
| + * @param ppos on entry, the index of the opening delimiter. It is moved
|
| + * past the closing delimiter only if ec indicates success at the end.
|
| + * @param ec in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR on a
|
| + * syntax error.
|
| + * @return *this
|
| + */
|
| +UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
|
| +                                             ParsePosition& ppos,
|
| +                                             UErrorCode &ec) {
|
| +    if (U_FAILURE(ec)) return *this;
|
| +
|
| +    int32_t cursor = ppos.getIndex();
|
| +
|
| +    // The shortest possible pattern, e.g. \p{L}, has 5 characters.
|
| +    if ((cursor+5) > pattern.length()) {
|
| +        FAIL(ec);
|
| +    }
|
| +
|
| +    UBool isPosix = FALSE;  // TRUE for [:pat:], FALSE for \p{pat} \P{pat} \N{pat}
|
| +    UBool isName = FALSE;   // TRUE for \N{pat} only
|
| +    UBool negate = FALSE;   // TRUE for [:^pat:] and \P{pat}
|
| +
|
| +    // Step over the opening delimiter: [: with an optional complement
|
| +    // sign, or \p, \P, \N followed by an opening brace.
|
| +    if (isPOSIXOpen(pattern, cursor)) {
|
| +        isPosix = TRUE;
|
| +        cursor += 2;
|
| +        cursor = ICU_Utility::skipWhitespace(pattern, cursor);
|
| +        if (cursor < pattern.length() && pattern.charAt(cursor) == COMPLEMENT) {
|
| +            negate = TRUE;
|
| +            ++cursor;
|
| +        }
|
| +    } else if (isPerlOpen(pattern, cursor) || isNameOpen(pattern, cursor)) {
|
| +        const UChar letter = pattern.charAt(cursor+1);
|
| +        isName = (letter == UPPER_N);
|
| +        negate = (letter == UPPER_P);
|
| +        cursor += 2;
|
| +        cursor = ICU_Utility::skipWhitespace(pattern, cursor);
|
| +        if (cursor == pattern.length() || pattern.charAt(cursor) != OPEN_BRACE) {
|
| +            // Syntax error; "\p" or "\P" not followed by "{"
|
| +            FAIL(ec);
|
| +        }
|
| +        ++cursor;
|
| +    } else {
|
| +        // None of the opening delimiters is present.
|
| +        FAIL(ec);
|
| +    }
|
| +
|
| +    // Locate the closing delimiter that belongs to the opening one:
|
| +    // :] for the POSIX form, } for the others.
|
| +    const int32_t closeAt = isPosix ? pattern.indexOf(POSIX_CLOSE, 2, cursor)
|
| +                                    : pattern.indexOf(CLOSE_BRACE, cursor);
|
| +    if (closeAt < 0) {
|
| +        // Syntax error; the pattern is never closed.
|
| +        FAIL(ec);
|
| +    }
|
| +
|
| +    // An '=' before the closing delimiter separates property and value,
|
| +    // as in \p{gc=Cf} or \p{GeneralCategory=Format}. It has no such
|
| +    // meaning inside \N{...}.
|
| +    UnicodeString propName, valueName;
|
| +    const int32_t equalsAt = pattern.indexOf(EQUALS, cursor);
|
| +    if (equalsAt >= 0 && equalsAt < closeAt && !isName) {
|
| +        pattern.extractBetween(cursor, equalsAt, propName);
|
| +        pattern.extractBetween(equalsAt+1, closeAt, valueName);
|
| +    } else if (isName) {
|
| +        // \N{name}: the text between the braces is the value, and the
|
| +        // property is the fixed NAME_PROP. This costs a lookup of
|
| +        // NAME_PROP back to UCHAR_NAME in applyPropertyAlias(); an API
|
| +        // taking (UProperty, char*) would avoid it.
|
| +        pattern.extractBetween(cursor, closeAt, valueName);
|
| +        propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
|
| +    } else {
|
| +        // No '=': the whole text is handed over as the property name.
|
| +        pattern.extractBetween(cursor, closeAt, propName);
|
| +    }
|
| +
|
| +    applyPropertyAlias(propName, valueName, ec);
|
| +    if (U_FAILURE(ec)) {
|
| +        return *this;
|
| +    }
|
| +
|
| +    if (negate) {
|
| +        complement();
|
| +    }
|
| +
|
| +    // The parse succeeded: report the index just past the closing
|
| +    // delimiter, which is 2 characters long for :] and 1 for }.
|
| +    ppos.setIndex(closeAt + (isPosix ? 2 : 1));
|
| +    return *this;
|
| +}
|
| +
|
| +/**
|
| + * Parse a property pattern from a rule character iterator by running
|
| + * the string-based applyPropertyPattern() on the iterator's lookahead
|
| + * text.
|
| + * @param chars iterator over the pattern characters. On success it is
|
| + * advanced by the number of characters that were parsed; on failure it
|
| + * is not moved.
|
| + * @param rebuiltPat receives, appended at its end, the portion of the
|
| + * input that was parsed.
|
| + * @param ec in/out error code; set to U_MALFORMED_SET when the parse
|
| + * reports success but consumed nothing.
|
| + */
|
| +void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
|
| +                                      UnicodeString& rebuiltPat,
|
| +                                      UErrorCode& ec) {
|
| +    if (U_FAILURE(ec)) {
|
| +        return;
|
| +    }
|
| +
|
| +    UnicodeString remaining;
|
| +    chars.lookahead(remaining);
|
| +
|
| +    ParsePosition parsed(0);
|
| +    applyPropertyPattern(remaining, parsed, ec);
|
| +    if (U_SUCCESS(ec)) {
|
| +        const int32_t consumed = parsed.getIndex();
|
| +        if (consumed == 0) {
|
| +            // syntaxError(chars, "Invalid property pattern");
|
| +            ec = U_MALFORMED_SET;
|
| +        } else {
|
| +            chars.jumpahead(consumed);
|
| +            rebuiltPat.append(remaining, 0, consumed);
|
| +        }
|
| +    }
|
| +}
|
| +
|
| +U_NAMESPACE_END
|
|
|
| Property changes on: icu51/source/common/uniset_props.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|