icu51/source/common/uniset_props.cpp - Issue 20882002: Check in the pristine copy of ICU 51.2

Unified Diff: icu51/source/common/uniset_props.cpp

Issue 20882002: Check in the pristine copy of ICU 51.2 (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu51/source/common/uniset_props.cpp

===================================================================

--- icu51/source/common/uniset_props.cpp (revision 0)

+++ icu51/source/common/uniset_props.cpp (revision 0)

@@ -0,0 +1,1304 @@

+/*

+*******************************************************************************

+* file name: uniset_props.cpp

+* encoding: US-ASCII

+* tab size: 8 (not used)

+* indentation:4

+* created on: 2004aug25

+* created by: Markus W. Scherer

+* Character property dependent functions moved here from uniset.cpp

+*/

+#include "unicode/utypes.h"

+#include "unicode/uniset.h"

+#include "unicode/parsepos.h"

+#include "unicode/uchar.h"

+#include "unicode/uscript.h"

+#include "unicode/symtable.h"

+#include "unicode/uset.h"

+#include "unicode/locid.h"

+#include "unicode/brkiter.h"

+#include "uset_imp.h"

+#include "ruleiter.h"

+#include "cmemory.h"

+#include "ucln_cmn.h"

+#include "util.h"

+#include "uvector.h"

+#include "uprops.h"

+#include "propname.h"

+#include "normalizer2impl.h"

+#include "ucase.h"

+#include "ubidi_props.h"

+#include "uinvchar.h"

+#include "uprops.h"

+#include "charstr.h"

+#include "cstring.h"

+#include "mutex.h"

+#include "umutex.h"

+#include "uassert.h"

+#include "hash.h"

+U_NAMESPACE_USE

+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

+// initial storage. Must be >= 0

+// *** same as in uniset.cpp ! ***

+#define START_EXTRA 16

+// Define UChar constants using hex for EBCDIC compatibility

+// Used #define to reduce private static exports and memory access time.

+#define SET_OPEN ((UChar)0x005B) /*[*/

+#define SET_CLOSE ((UChar)0x005D) /*]*/

+#define HYPHEN ((UChar)0x002D) /*-*/

+#define COMPLEMENT ((UChar)0x005E) /*^*/

+#define COLON ((UChar)0x003A) /*:*/

+#define BACKSLASH ((UChar)0x005C) /*\*/

+#define INTERSECTION ((UChar)0x0026) /*&*/

+#define UPPER_U ((UChar)0x0055) /*U*/

+#define LOWER_U ((UChar)0x0075) /*u*/

+#define OPEN_BRACE ((UChar)123) /*{*/

+#define CLOSE_BRACE ((UChar)125) /*}*/

+#define UPPER_P ((UChar)0x0050) /*P*/

+#define LOWER_P ((UChar)0x0070) /*p*/

+#define UPPER_N ((UChar)78) /*N*/

+#define EQUALS ((UChar)0x003D) /*=*/

+//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"

+static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"

+//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"

+//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"

+//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"

+static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/

+// Special property set IDs

+static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]

+static const char ASCII[] = "ASCII"; // [\u0000-\u007F]

+static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

+// Unicode name property alias

+#define NAME_PROP "na"

+#define NAME_PROP_LENGTH 2

+/**

+ * Delimiter string used in patterns to close a category reference:

+ * ":]". Example: "[:Lu:]".

+ */

+//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */

+// Cached sets ------------------------------------------------------------- ***

+U_CDECL_BEGIN

+static UBool U_CALLCONV uset_cleanup();

+U_CDECL_END

+// Not a TriStateSingletonWrapper because we think the UnicodeSet constructor

+// can only fail with an out-of-memory error

+// if we have a correct pattern and the properties data is hardcoded and always available.

+class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> { // singleton wrapper whose instance is a frozen UnicodeSet built from a char* pattern

+public:

+ UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) :

+ SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {} // stores the pattern pointer only; the string is not copied

+ UnicodeSet *getInstance(UErrorCode &errorCode) {

+ return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode); // fPattern is handed to createInstance() as its context

+ }

+private:

+ static void *createInstance(const void *context, UErrorCode &errorCode) { // context is the pattern string supplied by getInstance()

+ UnicodeString pattern((const char *)context, -1, US_INV); // NUL-terminated (-1), converted with the invariant-character conversion

+ UnicodeSet *set=new UnicodeSet(pattern, errorCode);

+ if(set==NULL) {

+ errorCode=U_MEMORY_ALLOCATION_ERROR;

+ return NULL;

+ }

+ set->freeze(); // NOTE(review): errorCode from the constructor is not inspected here; only allocation failure is handled

+ ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); // uset_cleanup() calls deleteInstance() on this singleton

+ return set;

+ }

+ const char *fPattern; // pattern text; not owned by this object

+};

+U_CDECL_BEGIN

+static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()

+STATIC_SIMPLE_SINGLETON(uni32Singleton);

+//----------------------------------------------------------------

+// Inclusions list

+//----------------------------------------------------------------

+// USetAdder implementation

+// Does not use uset.h to reduce code dependencies

+static void U_CALLCONV

+_set_add(USet *set, UChar32 c) {

+ ((UnicodeSet *)set)->add(c);

+static void U_CALLCONV

+_set_addRange(USet *set, UChar32 start, UChar32 end) {

+ ((UnicodeSet *)set)->add(start, end);

+static void U_CALLCONV

+_set_addString(USet *set, const UChar *str, int32_t length) {

+ ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));

+/**

+ * Cleanup function for UnicodeSet

+ */

+static UBool U_CALLCONV uset_cleanup(void) {

+ int32_t i;

+ for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {

+ if (INCLUSIONS[i] != NULL) {

+ delete INCLUSIONS[i];

+ INCLUSIONS[i] = NULL;

+ }

+ UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance();

+ return TRUE;

+U_CDECL_END

+U_NAMESPACE_BEGIN

+/*

+Reduce excessive reallocation, and make it easier to detect initialization

+problems.

+Usually you don't see smaller sets than this for Unicode 5.0.

+*/

+#define DEFAULT_INCLUSION_CAPACITY 3072

+const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {

+ UBool needInit;

+ UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);

+ if (needInit) {

+ UnicodeSet* incl = new UnicodeSet();

+ USetAdder sa = {

+ (USet *)incl,

+ _set_add,

+ _set_addRange,

+ _set_addString,

+ NULL, // don't need remove()

+ NULL // don't need removeRange()

+ };

+ if (incl != NULL) {

+ incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);

+ switch(src) {

+ case UPROPS_SRC_CHAR:

+ uchar_addPropertyStarts(&sa, &status);

+ break;

+ case UPROPS_SRC_PROPSVEC:

+ upropsvec_addPropertyStarts(&sa, &status);

+ break;

+ case UPROPS_SRC_CHAR_AND_PROPSVEC:

+ uchar_addPropertyStarts(&sa, &status);

+ upropsvec_addPropertyStarts(&sa, &status);

+ break;

+#if !UCONFIG_NO_NORMALIZATION

+ case UPROPS_SRC_CASE_AND_NORM: {

+ const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);

+ if(U_SUCCESS(status)) {

+ impl->addPropertyStarts(&sa, status);

+ }

+ ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);

+ break;

+ }

+ case UPROPS_SRC_NFC: {

+ const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);

+ if(U_SUCCESS(status)) {

+ impl->addPropertyStarts(&sa, status);

+ }

+ break;

+ }

+ case UPROPS_SRC_NFKC: {

+ const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);

+ if(U_SUCCESS(status)) {

+ impl->addPropertyStarts(&sa, status);

+ }

+ break;

+ }

+ case UPROPS_SRC_NFKC_CF: {

+ const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);

+ if(U_SUCCESS(status)) {

+ impl->addPropertyStarts(&sa, status);

+ }

+ break;

+ }

+ case UPROPS_SRC_NFC_CANON_ITER: {

+ const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);

+ if(U_SUCCESS(status)) {

+ impl->addCanonIterPropertyStarts(&sa, status);

+ }

+ break;

+ }

+#endif

+ case UPROPS_SRC_CASE:

+ ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);

+ break;

+ case UPROPS_SRC_BIDI:

+ ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);

+ break;

+ default:

+ status = U_INTERNAL_PROGRAM_ERROR;

+ break;

+ }

+ if (U_SUCCESS(status)) {

+ // Compact for caching

+ incl->compact();

+ umtx_lock(NULL);

+ if (INCLUSIONS[src] == NULL) {

+ INCLUSIONS[src] = incl;

+ incl = NULL;

+ ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);

+ }

+ umtx_unlock(NULL);

+ }

+ delete incl;

+ } else {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ }

+ return INCLUSIONS[src];

+// Cache some sets for other services -------------------------------------- ***

+U_CFUNC UnicodeSet *

+uniset_getUnicode32Instance(UErrorCode &errorCode) {

+ return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode);

+// helper functions for matching of pattern syntax pieces ------------------ ***

+// these functions are parallel to the PERL_OPEN etc. strings above

+// using these functions is not only faster than UnicodeString::compare() and

+// caseCompare(), but they also make UnicodeSet work for simple patterns when

+// no Unicode properties data is available - when caseCompare() fails

+static inline UBool

+isPerlOpen(const UnicodeString &pattern, int32_t pos) {

+ UChar c;

+ return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);

+/*static inline UBool

+isPerlClose(const UnicodeString &pattern, int32_t pos) {

+ return pattern.charAt(pos)==CLOSE_BRACE;

+}*/

+static inline UBool

+isNameOpen(const UnicodeString &pattern, int32_t pos) {

+ return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;

+static inline UBool

+isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {

+ return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;

+/*static inline UBool

+isPOSIXClose(const UnicodeString &pattern, int32_t pos) {

+ return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;

+}*/

+// TODO memory debugging provided inside uniset.cpp

+// could be made available here but probably obsolete with use of modern

+// memory leak checker tools

+#define _dbgct(me)

+//----------------------------------------------------------------

+// Constructors &c

+//----------------------------------------------------------------

+/**

+ * Constructs a set from the given pattern, ignoring pattern

+ * white space. See the class description for the syntax of the

+ * pattern language.

+ * @param pattern a string specifying what characters are in the set

+ */

+UnicodeSet::UnicodeSet(const UnicodeString& pattern,

+ UErrorCode& status) :

+ len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),

+ bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),

+ fFlags(0)

+ if(U_SUCCESS(status)){

+ list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);

+ /* test for NULL */

+ if(list == NULL) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ }else{

+ allocateStrings(status);

+ applyPattern(pattern, status);

+ }

+ _dbgct(this);

+//----------------------------------------------------------------

+// Public API

+//----------------------------------------------------------------

+UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,

+ UErrorCode& status) {

+ // Equivalent to

+ // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);

+ // but without dependency on closeOver().

+ ParsePosition pos(0);

+ applyPatternIgnoreSpace(pattern, pos, NULL, status);

+ if (U_FAILURE(status)) return *this;

+ int32_t i = pos.getIndex();

+ // Skip over trailing whitespace

+ ICU_Utility::skipWhitespace(pattern, i, TRUE);

+ if (i != pattern.length()) {

+ status = U_ILLEGAL_ARGUMENT_ERROR;

+ }

+ return *this;

+void

+UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,

+ ParsePosition& pos,

+ const SymbolTable* symbols,

+ UErrorCode& status) {

+ if (U_FAILURE(status)) {

+ return;

+ }

+ if (isFrozen()) {

+ status = U_NO_WRITE_PERMISSION;

+ return;

+ }

+ // Need to build the pattern in a temporary string because

+ // _applyPattern calls add() etc., which set pat to empty.

+ UnicodeString rebuiltPat;

+ RuleCharacterIterator chars(pattern, symbols, pos);

+ applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status);

+ if (U_FAILURE(status)) return;

+ if (chars.inVariable()) {

+ // syntaxError(chars, "Extra chars in variable value");

+ status = U_MALFORMED_SET;

+ return;

+ }

+ setPattern(rebuiltPat);

+/**

+ * Return true if the given position, in the given pattern, appears

+ * to be the start of a UnicodeSet pattern.

+ */

+UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {

+ return ((pos+1) < pattern.length() &&

+ pattern.charAt(pos) == (UChar)91/*[*/) ||

+ resemblesPropertyPattern(pattern, pos);

+//----------------------------------------------------------------

+// Implementation: Pattern parsing

+//----------------------------------------------------------------

+/**

+ * A small all-inline class to manage a UnicodeSet pointer. Add

+ * operator->() etc. as needed.

+ */

+class UnicodeSetPointer {

+ UnicodeSet* p; // owned set; 0 until allocate() succeeds, deleted by the destructor

+public:

+ inline UnicodeSetPointer() : p(0) {}

+ inline ~UnicodeSetPointer() { delete p; } // NOTE(review): copying is not disabled, so a copy would delete p twice

+ inline UnicodeSet* pointer() { return p; } // returns the held set without giving up ownership; may be 0

+ inline UBool allocate() { // creates the set on first use; returns FALSE only if no set could be allocated

+ if (p == 0) {

+ p = new UnicodeSet();

+ }

+ return p != 0;

+ }

+};

+/**

+ * Parse the pattern from the given RuleCharacterIterator. The

+ * iterator is advanced over the parsed pattern.

+ * @param chars iterator over the pattern characters. Upon return

+ * it will be advanced to the first character after the parsed

+ * pattern, or the end of the iteration if all characters are

+ * parsed.

+ * @param symbols symbol table to use to parse and dereference

+ * variables, or null if none.

+ * @param rebuiltPat the pattern that was parsed, rebuilt or

+ * copied from the input pattern, as appropriate.

+ * @param options a bit mask of zero or more of the following:

+ * USET_IGNORE_SPACE, USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS.

+ */

+void UnicodeSet::applyPattern(RuleCharacterIterator& chars,

+ const SymbolTable* symbols,

+ UnicodeString& rebuiltPat,

+ uint32_t options,

+ UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),

+ UErrorCode& ec) {

+ if (U_FAILURE(ec)) return;

+ // Syntax characters: [ ] ^ - & { }

+ // Recognized special forms for chars, sets: c-c s-s s&s

+ int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |

+ RuleCharacterIterator::PARSE_ESCAPES;

+ if ((options & USET_IGNORE_SPACE) != 0) {

+ opts |= RuleCharacterIterator::SKIP_WHITESPACE;

+ }

+ UnicodeString patLocal, buf;

+ UBool usePat = FALSE;

+ UnicodeSetPointer scratch;

+ RuleCharacterIterator::Pos backup;

+ // mode: 0=before [, 1=between [...], 2=after ]

+ // lastItem: 0=none, 1=char, 2=set

+ int8_t lastItem = 0, mode = 0;

+ UChar32 lastChar = 0;

+ UChar op = 0;

+ UBool invert = FALSE;

+ clear();

+ while (mode != 2 && !chars.atEnd()) {

+ U_ASSERT((lastItem == 0 && op == 0) ||

+ (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||

+ (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||

+ op == INTERSECTION /*'&'*/)));

+ UChar32 c = 0;

+ UBool literal = FALSE;

+ UnicodeSet* nested = 0; // alias - do not delete

+ // -------- Check for property pattern

+ // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed

+ int8_t setMode = 0;

+ if (resemblesPropertyPattern(chars, opts)) {

+ setMode = 2;

+ }

+ // -------- Parse '[' of opening delimiter OR nested set.

+ // If there is a nested set, use `setMode' to define how

+ // the set should be parsed. If the '[' is part of the

+ // opening delimiter for this pattern, parse special

+ // strings "[", "[^", "[-", and "[^-". Check for stand-in

+ // characters representing a nested set in the symbol

+ // table.

+ else {

+ // Prepare to backup if necessary

+ chars.getPos(backup);

+ c = chars.next(opts, literal, ec);

+ if (U_FAILURE(ec)) return;

+ if (c == 0x5B /*'['*/ && !literal) {

+ if (mode == 1) {

+ chars.setPos(backup); // backup

+ setMode = 1;

+ } else {

+ // Handle opening '[' delimiter

+ mode = 1;

+ patLocal.append((UChar) 0x5B /*'['*/);

+ chars.getPos(backup); // prepare to backup

+ c = chars.next(opts, literal, ec);

+ if (U_FAILURE(ec)) return;

+ if (c == 0x5E /*'^'*/ && !literal) {

+ invert = TRUE;

+ patLocal.append((UChar) 0x5E /*'^'*/);

+ chars.getPos(backup); // prepare to backup

+ c = chars.next(opts, literal, ec);

+ if (U_FAILURE(ec)) return;

+ }

+ // Fall through to handle special leading '-';

+ // otherwise restart loop for nested [], \p{}, etc.

+ if (c == HYPHEN /*'-'*/) {

+ literal = TRUE;

+ // Fall through to handle literal '-' below

+ } else {

+ chars.setPos(backup); // backup

+ continue;

+ }

+ } else if (symbols != 0) {

+ const UnicodeFunctor *m = symbols->lookupMatcher(c);

+ if (m != 0) {

+ const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);

+ if (ms == NULL) {

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ // casting away const, but `nested' won't be modified

+ // (important not to modify stored set)

+ nested = const_cast<UnicodeSet*>(ms);

+ setMode = 3;

+ }

+ // -------- Handle a nested set. This either is inline in

+ // the pattern or represented by a stand-in that has

+ // previously been parsed and was looked up in the symbol

+ // table.

+ if (setMode != 0) {

+ if (lastItem == 1) {

+ if (op != 0) {

+ // syntaxError(chars, "Char expected after operator");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ add(lastChar, lastChar);

+ _appendToPat(patLocal, lastChar, FALSE);

+ lastItem = 0;

+ op = 0;

+ }

+ if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {

+ patLocal.append(op);

+ }

+ if (nested == 0) {

+ // lazy allocation

+ if (!scratch.allocate()) {

+ ec = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ nested = scratch.pointer();

+ }

+ switch (setMode) {

+ case 1:

+ nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);

+ break;

+ case 2:

+ chars.skipIgnored(opts);

+ nested->applyPropertyPattern(chars, patLocal, ec);

+ if (U_FAILURE(ec)) return;

+ break;

+ case 3: // `nested' already parsed

+ nested->_toPattern(patLocal, FALSE);

+ break;

+ }

+ usePat = TRUE;

+ if (mode == 0) {

+ // Entire pattern is a category; leave parse loop

+ *this = *nested;

+ mode = 2;

+ break;

+ }

+ switch (op) {

+ case HYPHEN: /*'-'*/

+ removeAll(*nested);

+ break;

+ case INTERSECTION: /*'&'*/

+ retainAll(*nested);

+ break;

+ case 0:

+ addAll(*nested);

+ break;

+ }

+ op = 0;

+ lastItem = 2;

+ continue;

+ }

+ if (mode == 0) {

+ // syntaxError(chars, "Missing '['");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ // -------- Parse special (syntax) characters. If the

+ // current character is not special, or if it is escaped,

+ // then fall through and handle it below.

+ if (!literal) {

+ switch (c) {

+ case 0x5D /*']'*/:

+ if (lastItem == 1) {

+ add(lastChar, lastChar);

+ _appendToPat(patLocal, lastChar, FALSE);

+ }

+ // Treat final trailing '-' as a literal

+ if (op == HYPHEN /*'-'*/) {

+ add(op, op);

+ patLocal.append(op);

+ } else if (op == INTERSECTION /*'&'*/) {

+ // syntaxError(chars, "Trailing '&'");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ patLocal.append((UChar) 0x5D /*']'*/);

+ mode = 2;

+ continue;

+ case HYPHEN /*'-'*/:

+ if (op == 0) {

+ if (lastItem != 0) {

+ op = (UChar) c;

+ continue;

+ } else {

+ // Treat final trailing '-' as a literal

+ add(c, c);

+ c = chars.next(opts, literal, ec);

+ if (U_FAILURE(ec)) return;

+ if (c == 0x5D /*']'*/ && !literal) {

+ patLocal.append(HYPHEN_RIGHT_BRACE, 2);

+ mode = 2;

+ continue;

+ }

+ // syntaxError(chars, "'-' not after char or set");

+ ec = U_MALFORMED_SET;

+ return;

+ case INTERSECTION /*'&'*/:

+ if (lastItem == 2 && op == 0) {

+ op = (UChar) c;

+ continue;

+ }

+ // syntaxError(chars, "'&' not after set");

+ ec = U_MALFORMED_SET;

+ return;

+ case 0x5E /*'^'*/:

+ // syntaxError(chars, "'^' not after '['");

+ ec = U_MALFORMED_SET;

+ return;

+ case 0x7B /*'{'*/:

+ if (op != 0) {

+ // syntaxError(chars, "Missing operand after operator");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ if (lastItem == 1) {

+ add(lastChar, lastChar);

+ _appendToPat(patLocal, lastChar, FALSE);

+ }

+ lastItem = 0;

+ buf.truncate(0);

+ {

+ UBool ok = FALSE;

+ while (!chars.atEnd()) {

+ c = chars.next(opts, literal, ec);

+ if (U_FAILURE(ec)) return;

+ if (c == 0x7D /*'}'*/ && !literal) {

+ ok = TRUE;

+ break;

+ }

+ buf.append(c);

+ }

+ if (buf.length() < 1 || !ok) {

+ // syntaxError(chars, "Invalid multicharacter string");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ // We have new string. Add it to set and continue;

+ // we don't need to drop through to the further

+ // processing

+ add(buf);

+ patLocal.append((UChar) 0x7B /*'{'*/);

+ _appendToPat(patLocal, buf, FALSE);

+ patLocal.append((UChar) 0x7D /*'}'*/);

+ continue;

+ case SymbolTable::SYMBOL_REF:

+ // symbols nosymbols

+ // [a-$] error error (ambiguous)

+ // [a$] anchor anchor

+ // [a-$x] var "x"* literal '$'

+ // [a-$.] error literal '$'

+ // *We won't get here in the case of var "x"

+ {

+ chars.getPos(backup);

+ c = chars.next(opts, literal, ec);

+ if (U_FAILURE(ec)) return;

+ UBool anchor = (c == 0x5D /*']'*/ && !literal);

+ if (symbols == 0 && !anchor) {

+ c = SymbolTable::SYMBOL_REF;

+ chars.setPos(backup);

+ break; // literal '$'

+ }

+ if (anchor && op == 0) {

+ if (lastItem == 1) {

+ add(lastChar, lastChar);

+ _appendToPat(patLocal, lastChar, FALSE);

+ }

+ add(U_ETHER);

+ usePat = TRUE;

+ patLocal.append((UChar) SymbolTable::SYMBOL_REF);

+ patLocal.append((UChar) 0x5D /*']'*/);

+ mode = 2;

+ continue;

+ }

+ // syntaxError(chars, "Unquoted '$'");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ default:

+ break;

+ }

+ // -------- Parse literal characters. This includes both

+ // escaped chars ("\u4E01") and non-syntax characters

+ // ("a").

+ switch (lastItem) {

+ case 0:

+ lastItem = 1;

+ lastChar = c;

+ break;

+ case 1:

+ if (op == HYPHEN /*'-'*/) {

+ if (lastChar >= c) {

+ // Don't allow redundant (a-a) or empty (b-a) ranges;

+ // these are most likely typos.

+ // syntaxError(chars, "Invalid range");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ add(lastChar, c);

+ _appendToPat(patLocal, lastChar, FALSE);

+ patLocal.append(op);

+ _appendToPat(patLocal, c, FALSE);

+ lastItem = 0;

+ op = 0;

+ } else {

+ add(lastChar, lastChar);

+ _appendToPat(patLocal, lastChar, FALSE);

+ lastChar = c;

+ }

+ break;

+ case 2:

+ if (op != 0) {

+ // syntaxError(chars, "Set expected after operator");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ lastChar = c;

+ lastItem = 1;

+ break;

+ }

+ if (mode != 2) {

+ // syntaxError(chars, "Missing ']'");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ chars.skipIgnored(opts);

+ /**

+ * Handle global flags (invert, case insensitivity). If this

+ * pattern should be compiled case-insensitive, then we need

+ * to close over case BEFORE COMPLEMENTING. This makes

+ * patterns like /[^abc]/i work.

+ */

+ if ((options & USET_CASE_INSENSITIVE) != 0) {

+ (this->*caseClosure)(USET_CASE_INSENSITIVE);

+ }

+ else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {

+ (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);

+ }

+ if (invert) {

+ complement();

+ }

+ // Use the rebuilt pattern (patLocal) only if necessary. Prefer the

+ // generated pattern.

+ if (usePat) {

+ rebuiltPat.append(patLocal);

+ } else {

+ _generatePattern(rebuiltPat, FALSE);

+ }

+ if (isBogus() && U_SUCCESS(ec)) {

+ // We likely ran out of memory. AHHH!

+ ec = U_MEMORY_ALLOCATION_ERROR;

+ }

+//----------------------------------------------------------------

+// Property set implementation

+//----------------------------------------------------------------

+static UBool numericValueFilter(UChar32 ch, void* context) {

+ return u_getNumericValue(ch) == *(double*)context;

+static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {

+ int32_t value = *(int32_t*)context;

+ return (U_GET_GC_MASK((UChar32) ch) & value) != 0;

+static UBool versionFilter(UChar32 ch, void* context) {

+ static const UVersionInfo none = { 0, 0, 0, 0 };

+ UVersionInfo v;

+ u_charAge(ch, v);

+ UVersionInfo* version = (UVersionInfo*)context;

+ return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;

+typedef struct { // context passed through applyFilter() to intPropertyFilter()

+ UProperty prop; // property queried with u_getIntPropertyValue()

+ int32_t value; // value the property must equal for a code point to match

+} IntPropertyContext;

+static UBool intPropertyFilter(UChar32 ch, void* context) {

+ IntPropertyContext* c = (IntPropertyContext*)context;

+ return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;

+static UBool scriptExtensionsFilter(UChar32 ch, void* context) {

+ return uscript_hasScript(ch, *(UScriptCode*)context);

+/**

+ * Generic filter-based scanning code for UCD property UnicodeSets.

+ */

+void UnicodeSet::applyFilter(UnicodeSet::Filter filter,

+ void* context,

+ int32_t src,

+ UErrorCode &status) {

+ if (U_FAILURE(status)) return;

+ // Logically, walk through all Unicode characters, noting the start

+ // and end of each range for which (*filter)(c, context) is

+ // true. Add each range to a set.

+ //

+ // To improve performance, use an inclusions set which

+ // encodes information about character ranges that are known

+ // to have identical properties.

+ // getInclusions(src) contains exactly the first characters of

+ // same-value ranges for the given properties "source".

+ const UnicodeSet* inclusions = getInclusions(src, status);

+ if (U_FAILURE(status)) {

+ return;

+ }

+ clear();

+ UChar32 startHasProperty = -1;

+ int32_t limitRange = inclusions->getRangeCount();

+ for (int j=0; j<limitRange; ++j) {

+ // get current range

+ UChar32 start = inclusions->getRangeStart(j);

+ UChar32 end = inclusions->getRangeEnd(j);

+ // for all the code points in the range, process

+ for (UChar32 ch = start; ch <= end; ++ch) {

+ // only add to this UnicodeSet on inflection points --

+ // where the hasProperty value changes to false

+ if ((*filter)(ch, context)) {

+ if (startHasProperty < 0) {

+ startHasProperty = ch;

+ }

+ } else if (startHasProperty >= 0) {

+ add(startHasProperty, ch-1);

+ startHasProperty = -1;

+ }

+ if (startHasProperty >= 0) {

+ add((UChar32)startHasProperty, (UChar32)0x10FFFF);

+ }

+ if (isBogus() && U_SUCCESS(status)) {

+ // We likely ran out of memory. AHHH!

+ status = U_MEMORY_ALLOCATION_ERROR;

+ }

+static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {

+ /* Note: we use ' ' in compiler code page */

+ int32_t j = 0;

+ char ch;

+ --dstCapacity; /* make room for term. zero */

+ while ((ch = *src++) != 0) {

+ if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {

+ continue;

+ }

+ if (j >= dstCapacity) return FALSE;

+ dst[j++] = ch;

+ }

+ if (j > 0 && dst[j-1] == ' ') --j;

+ dst[j] = 0;

+ return TRUE;

+//----------------------------------------------------------------

+// Property set API

+//----------------------------------------------------------------

+#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}

+UnicodeSet&

+UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {

+ if (U_FAILURE(ec) || isFrozen()) return *this;

+ if (prop == UCHAR_GENERAL_CATEGORY_MASK) {

+ applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);

+ } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {

+ UScriptCode script = (UScriptCode)value;

+ applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);

+ } else {

+ IntPropertyContext c = {prop, value};

+ applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);

+ }

+ return *this;

+UnicodeSet&

+UnicodeSet::applyPropertyAlias(const UnicodeString& prop,

+ const UnicodeString& value,

+ UErrorCode& ec) {

+ if (U_FAILURE(ec) || isFrozen()) return *this;

+ // prop and value used to be converted to char * using the default

+ // converter instead of the invariant conversion.

+ // This should not be necessary because all Unicode property and value

+ // names use only invariant characters.

+ // If there are any variant characters, then we won't find them anyway.

+ // Checking first avoids assertion failures in the conversion.

+ if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||

+ !uprv_isInvariantUString(value.getBuffer(), value.length())

+ ) {

+ FAIL(ec);

+ }

+ CharString pname, vname;

+ pname.appendInvariantChars(prop, ec);

+ vname.appendInvariantChars(value, ec);

+ if (U_FAILURE(ec)) return *this;

+ UProperty p;

+ int32_t v;

+ UBool mustNotBeEmpty = FALSE, invert = FALSE;

+ if (value.length() > 0) {

+ p = u_getPropertyEnum(pname.data());

+ if (p == UCHAR_INVALID_CODE) FAIL(ec);

+ // Treat gc as gcm

+ if (p == UCHAR_GENERAL_CATEGORY) {

+ p = UCHAR_GENERAL_CATEGORY_MASK;

+ }

+ if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||

+ (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||

+ (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {

+ v = u_getPropertyValueEnum(p, vname.data());

+ if (v == UCHAR_INVALID_CODE) {

+ // Handle numeric CCC

+ if (p == UCHAR_CANONICAL_COMBINING_CLASS ||

+ p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||

+ p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {

+ char* end;

+ double value = uprv_strtod(vname.data(), &end);

+ v = (int32_t) value;

+ if (v != value || v < 0 || *end != 0) {

+ // non-integral or negative value, or trailing junk

+ FAIL(ec);

+ }

+ // If the resultant set is empty then the numeric value

+ // was invalid.

+ mustNotBeEmpty = TRUE;

+ } else {

+ FAIL(ec);

+ }

+ else {

+ switch (p) {

+ case UCHAR_NUMERIC_VALUE:

+ {

+ char* end;

+ double value = uprv_strtod(vname.data(), &end);

+ if (*end != 0) {

+ FAIL(ec);

+ }

+ applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);

+ return *this;

+ }

+ case UCHAR_NAME:

+ {

+ // Must munge name, since u_charFromName() does not do

+ // 'loose' matching.

+ char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength

+ if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);

+ UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);

+ if (U_SUCCESS(ec)) {

+ clear();

+ add(ch);

+ return *this;

+ } else {

+ FAIL(ec);

+ }

+ case UCHAR_UNICODE_1_NAME:

+ // ICU 49 deprecates the Unicode_1_Name property APIs.

+ FAIL(ec);

+ case UCHAR_AGE:

+ {

+ // Must munge name, since u_versionFromString() does not do

+ // 'loose' matching.

+ char buf[128];

+ if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);

+ UVersionInfo version;

+ u_versionFromString(version, buf);

+ applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);

+ return *this;

+ }

+ case UCHAR_SCRIPT_EXTENSIONS:

+ v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());

+ if (v == UCHAR_INVALID_CODE) {

+ FAIL(ec);

+ }

+ // fall through to calling applyIntPropertyValue()

+ break;

+ default:

+ // p is a non-binary, non-enumerated property that we

+ // don't support (yet).

+ FAIL(ec);

+ }

+ else {

+ // value is empty. Interpret as General Category, Script, or

+ // Binary property.

+ p = UCHAR_GENERAL_CATEGORY_MASK;

+ v = u_getPropertyValueEnum(p, pname.data());

+ if (v == UCHAR_INVALID_CODE) {

+ p = UCHAR_SCRIPT;

+ v = u_getPropertyValueEnum(p, pname.data());

+ if (v == UCHAR_INVALID_CODE) {

+ p = u_getPropertyEnum(pname.data());

+ if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {

+ v = 1;

+ } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {

+ set(MIN_VALUE, MAX_VALUE);

+ return *this;

+ } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {

+ set(0, 0x7F);

+ return *this;

+ } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {

+ // [:Assigned:]=[:^Cn:]

+ p = UCHAR_GENERAL_CATEGORY_MASK;

+ v = U_GC_CN_MASK;

+ invert = TRUE;

+ } else {

+ FAIL(ec);

+ }

+ applyIntPropertyValue(p, v, ec);

+ if(invert) {

+ complement();

+ }

+ if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {

+ // mustNotBeEmpty is set to true if an empty set indicates

+ // invalid input.

+ ec = U_ILLEGAL_ARGUMENT_ERROR;

+ }

+ if (isBogus() && U_SUCCESS(ec)) {

+ // We likely ran out of memory. AHHH!

+ ec = U_MEMORY_ALLOCATION_ERROR;

+ }

+ return *this;

+//----------------------------------------------------------------

+// Property set patterns

+//----------------------------------------------------------------

+/**

+ * Return true if the given position, in the given pattern, appears

+ * to be the start of a property set pattern.

+ *

+ * This is a cheap prefix test only: it checks how much text remains at

+ * pos and then delegates to the isPOSIXOpen(), isPerlOpen() and

+ * isNameOpen() helpers. Nothing after the opening delimiter is examined

+ * here.

+ *

+ * @param pattern the pattern text to inspect

+ * @param pos offset into pattern at which a property pattern might start

+ * @return FALSE if fewer than 5 characters remain at pos; otherwise TRUE

+ * exactly when one of the three open-delimiter helpers accepts pos

+ */

+UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,

+ int32_t pos) {

+ // Patterns are at least 5 characters long (e.g. \p{L}), so anything shorter cannot match

+ if ((pos+5) > pattern.length()) {

+ return FALSE;

+ }

+ // Look for an opening [:, [:^, \p, or \P; isNameOpen() also admits the \N{...} name form

+ return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);

+/**

+ * Return true if the given iterator appears to point at a

+ * property pattern. Regardless of the result, return with the

+ * iterator unchanged.

+ *

+ * At most two characters are read. A leading '[' must be followed by

+ * ':'; a leading backslash must be followed by 'N', 'p' or 'P'. Escape

+ * parsing is switched off for both reads, and whitespace skipping is

+ * additionally switched off for the second read. The position saved on

+ * entry is restored before returning.

+ *

+ * @param chars iterator over the pattern characters. Upon return

+ * it will be unchanged.

+ * @param iterOpts RuleCharacterIterator options

+ * @return TRUE only if the two-character prefix matched and neither

+ * read reported an error

+ */

+UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,

+ int32_t iterOpts) {

+ // NOTE: literal will always be FALSE, because we don't parse escapes.

+ UBool result = FALSE, literal;

+ UErrorCode ec = U_ZERO_ERROR; // local status; errors only turn the answer into FALSE

+ iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; // a backslash must be seen as a raw character

+ RuleCharacterIterator::Pos pos;

+ chars.getPos(pos); // remember where we started so the probe can be undone

+ UChar32 c = chars.next(iterOpts, literal, ec);

+ if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {

+ UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,

+ literal, ec);

+ result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :

+ (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);

+ }

+ chars.setPos(pos); // restore the iterator whatever the outcome

+ return result && U_SUCCESS(ec);

+/**

+ * Parse the given property pattern at the given parse position.

+ *

+ * Recognized forms are the POSIX-style [:pat:] and [:^pat:], and the

+ * brace-delimited \p{pat}, \P{pat} and \N{pat}. The text between the

+ * delimiters is split at the first '=' (if one occurs before the close

+ * delimiter and the form is not \N) into a property name and a value

+ * name, which are handed to applyPropertyAlias(). For \N{pat} the text

+ * becomes the value and NAME_PROP is used as the property name. The

+ * [:^ and \P forms complement the resulting set.

+ *

+ * Syntax errors are reported through the FAIL(ec) macro, which is

+ * defined outside this function (presumably it sets ec and returns

+ * *this; confirm at its definition).

+ *

+ * @param pattern text containing the property pattern

+ * @param ppos on entry, the index of the opening delimiter; it is moved

+ * to just past the close delimiter only when the parse succeeds

+ * @param ec in/out error code; if it already indicates failure on entry

+ * the function returns immediately

+ * @return a reference to this set

+ */

+UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,

+ ParsePosition& ppos,

+ UErrorCode &ec) {

+ int32_t pos = ppos.getIndex();

+ UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}

+ UBool isName = FALSE; // true for \N{pat}, o/w false

+ UBool invert = FALSE; // true for [:^pat:] and \P{pat}

+ if (U_FAILURE(ec)) return *this;

+ // Minimum length is 5 characters, e.g. \p{L}

+ if ((pos+5) > pattern.length()) {

+ FAIL(ec);

+ }

+ // On entry, ppos should point at the opening delimiter.

+ // Look for an opening [:, [:^, \p, \P, or \N

+ if (isPOSIXOpen(pattern, pos)) {

+ posix = TRUE;

+ pos += 2; // step over the two-character opener

+ pos = ICU_Utility::skipWhitespace(pattern, pos);

+ if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { // optional complement marker (the [:^ form)

+ ++pos;

+ invert = TRUE;

+ }

+ } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {

+ UChar c = pattern.charAt(pos+1); // the letter after the backslash selects the form

+ invert = (c == UPPER_P);

+ isName = (c == UPPER_N);

+ pos += 2; // step over the backslash and the letter

+ pos = ICU_Utility::skipWhitespace(pattern, pos);

+ if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {

+ // Syntax error; "\p", "\P" or "\N" not followed by "{"

+ FAIL(ec);

+ }

+ } else {

+ // Open delimiter not seen

+ FAIL(ec);

+ }

+ // Look for the matching close delimiter, either :] or }

+ int32_t close;

+ if (posix) {

+ close = pattern.indexOf(POSIX_CLOSE, 2, pos); // search for the two-character close delimiter from pos onward

+ } else {

+ close = pattern.indexOf(CLOSE_BRACE, pos);

+ }

+ if (close < 0) {

+ // Syntax error; close delimiter missing

+ FAIL(ec);

+ }

+ // Look for an '=' sign. If this is present, we will parse a

+ // medium \p{gc=Cf} or long \p{GeneralCategory=Format}

+ // pattern.

+ int32_t equals = pattern.indexOf(EQUALS, pos); // may lie beyond close; checked below

+ UnicodeString propName, valueName;

+ if (equals >= 0 && equals < close && !isName) {

+ // Equals seen; parse medium/long pattern

+ pattern.extractBetween(pos, equals, propName);

+ pattern.extractBetween(equals+1, close, valueName);

+ }

+ else {

+ // Handle case where no '=' is seen, and \N{}

+ pattern.extractBetween(pos, close, propName);

+ // Handle \N{name}

+ if (isName) {

+ // This is a little inefficient since it means we have to

+ // parse NAME_PROP back to UCHAR_NAME even though we already

+ // know it's UCHAR_NAME. If we refactor the API to

+ // support args of (UProperty, char*) then we can remove

+ // NAME_PROP and make this a little more efficient.

+ valueName = propName;

+ propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);

+ }

+ applyPropertyAlias(propName, valueName, ec);

+ if (U_SUCCESS(ec)) {

+ if (invert) {

+ complement();

+ }

+ // Move to the limit position after the close delimiter if the

+ // parse succeeded.

+ ppos.setIndex(close + (posix ? 2 : 1)); // the POSIX close is two characters wide, the brace one

+ }

+ return *this;

+/**

+ * Parse a property pattern.

+ *

+ * The text obtained from chars.lookahead() is parsed from index 0 by

+ * the (UnicodeString, ParsePosition, UErrorCode) overload. Only when

+ * that parse succeeds and consumes at least one character is the

+ * iterator advanced and rebuiltPat extended.

+ *

+ * @param chars iterator over the pattern characters. Upon successful

+ * return it will be advanced to the first character after the parsed

+ * pattern, or the end of the iteration if all characters are

+ * parsed. On failure this function does not advance it.

+ * @param rebuiltPat the pattern that was parsed, rebuilt or

+ * copied from the input pattern, as appropriate. The consumed portion

+ * of the lookahead text is appended to it.

+ * @param ec in/out error code; the function does nothing if it already

+ * indicates failure, and sets U_MALFORMED_SET if the parse reported

+ * success but consumed no characters

+ */

+void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,

+ UnicodeString& rebuiltPat,

+ UErrorCode& ec) {

+ if (U_FAILURE(ec)) return;

+ UnicodeString pattern;

+ chars.lookahead(pattern); // presumably fills pattern with the text ahead of the iterator -- confirm in ruleiter.h

+ ParsePosition pos(0);

+ applyPropertyPattern(pattern, pos, ec);

+ if (U_FAILURE(ec)) return;

+ if (pos.getIndex() == 0) {

+ // Nothing was consumed. Formerly: syntaxError(chars, "Invalid property pattern");

+ ec = U_MALFORMED_SET;

+ return;

+ }

+ chars.jumpahead(pos.getIndex()); // advance the iterator by the amount the parse consumed

+ rebuiltPat.append(pattern, 0, pos.getIndex());

+U_NAMESPACE_END

Property changes on: icu51/source/common/uniset_props.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu51/source/common/uniset_closure.cpp ('k') | icu51/source/common/unisetspan.h » ('j') | no next file with comments »