/**
 * Copy-assignment: makes this key a byte-for-byte copy of `other`.
 *
 * - Self-assignment is a no-op.
 * - A bogus source makes this key bogus as well.
 * - A source without a byte buffer resets this key to the empty state.
 * - Otherwise the buffer is (re)sized through ensureCapacity() and the
 *   source bytes and hash code are copied over.
 *
 * @param other the key to copy from
 * @return a reference to this key
 */
const CollationKey&
CollationKey::operator=(const CollationKey& other)
{
    if (this == &other) {
        return *this;
    }

    if (other.isBogus()) {
        return setToBogus();
    }

    if (other.fBytes == NULL) {
        // Source has no storage: become an empty, valid key.
        fCount = 0;
        fBogus = FALSE;
        fHashCode = kEmptyHashCode;
        return *this;
    }

    ensureCapacity(other.fCount);
    if (isBogus()) {
        // Still (or newly) bogus after the resize attempt: nothing to copy into.
        return *this;
    }

    fHashCode = other.fHashCode;
    // NOTE(review): the copy length is this->fCount, so this relies on
    // ensureCapacity() having set fCount to other.fCount -- confirm.
    uprv_memcpy(fBytes, other.fBytes, fCount);
    return *this;
}
/**
 * Builds a collator from a BSON collation specification.
 *
 * Returns a null CollatorInterface pointer (meaning simple binary comparison)
 * when the spec's locale is CollationSpec::kSimpleBinaryComparison and no
 * other fields are present. Otherwise returns an ICU-backed collator, or a
 * non-OK status describing why the spec could not be turned into one.
 */
StatusWith<std::unique_ptr<CollatorInterface>> CollatorFactoryICU::makeFromBSON(
    const BSONObj& spec) {
    // Step 1: pull the locale ID out of the spec; propagate parse failures as-is.
    auto localeIdResult = parseLocaleID(spec);
    if (!localeIdResult.isOK()) {
        return localeIdResult.getStatus();
    }
    auto& localeId = localeIdResult.getValue();

    // Step 2: the "simple" locale maps to a null collator, but only if it is
    // the sole field in the spec.
    if (localeId == CollationSpec::kSimpleBinaryComparison) {
        if (spec.nFields() <= 1) {
            return {nullptr};
        }
        return {ErrorCodes::FailedToParse,
                str::stream() << "If " << CollationSpec::kLocaleField << "="
                              << CollationSpec::kSimpleBinaryComparison
                              << ", no other fields should be present in: "
                              << spec};
    }

    // Step 3: turn the locale ID into an icu::Locale.
    auto locale = icu::Locale::createFromName(localeId.c_str());
    if (locale.isBogus()) {
        return {ErrorCodes::BadValue,
                str::stream() << "Field '" << CollationSpec::kLocaleField
                              << "' is not valid in: "
                              << spec};
    }

    // Step 4: ask ICU for a collator for that locale.
    UErrorCode icuStatus = U_ZERO_ERROR;
    std::unique_ptr<icu::Collator> collator(icu::Collator::createInstance(locale, icuStatus));
    if (U_FAILURE(icuStatus)) {
        icu::ErrorCode icuError;
        icuError.set(icuStatus);
        return {ErrorCodes::OperationFailed,
                str::stream() << "Failed to create collator: " << icuError.errorName()
                              << ". Collation spec: "
                              << spec};
    }

    // Step 5: validate the requested locale ID against the collator ICU produced.
    Status localeCheck = validateLocaleID(spec, localeId, *collator);
    if (!localeCheck.isOK()) {
        return localeCheck;
    }

    // Step 6: build the CollationSpec from the options in `spec`, falling back
    // to the collator's defaults. locale.getName() is used as the locale ID
    // because it is canonicalized and includes options.
    auto specResult = parseToCollationSpec(spec, locale.getName(), collator.get());
    if (!specResult.isOK()) {
        return specResult.getStatus();
    }

    auto result = stdx::make_unique<CollatorInterfaceICU>(std::move(specResult.getValue()),
                                                          std::move(collator));
    return {std::move(result)};
}
/** * Generic filter-based scanning code for UCD property UnicodeSets. */ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, void* context, int32_t src, UErrorCode &status) { // Walk through all Unicode characters, noting the start // and end of each range for which filter.contain(c) is // true. Add each range to a set. // // To improve performance, use the INCLUSIONS set, which // encodes information about character ranges that are known // to have identical properties. INCLUSIONS contains // only the first characters of such ranges. // // TODO Where possible, instead of scanning over code points, // use internal property data to initialize UnicodeSets for // those properties. Scanning code points is slow. if (U_FAILURE(status)) return; const UnicodeSet* inclusions = getInclusions(src, status); if (U_FAILURE(status)) { return; } clear(); UChar32 startHasProperty = -1; int32_t limitRange = inclusions->getRangeCount(); for (int j=0; j<limitRange; ++j) { // get current range UChar32 start = inclusions->getRangeStart(j); UChar32 end = inclusions->getRangeEnd(j); // for all the code points in the range, process for (UChar32 ch = start; ch <= end; ++ch) { // only add to this UnicodeSet on inflection points -- // where the hasProperty value changes to false if ((*filter)(ch, context)) { if (startHasProperty < 0) { startHasProperty = ch; } } else if (startHasProperty >= 0) { add(startHasProperty, ch-1); startHasProperty = -1; } } } if (startHasProperty >= 0) { add((UChar32)startHasProperty, (UChar32)0x10FFFF); } if (isBogus() && U_SUCCESS(status)) { // We likely ran out of memory. AHHH! status = U_MEMORY_ALLOCATION_ERROR; } }
/**
 * Closes this set over the given case attribute.
 *
 * With USET_CASE_INSENSITIVE the full case closure of every code point and
 * string is added (strings are replaced by their folded forms / closures).
 * With USET_ADD_CASE_MAPPINGS the lower/title/upper/fold mappings of every
 * code point and string are added. Any other attribute value leaves the set
 * unchanged. Frozen or bogus sets are never modified.
 *
 * The work is done on a copy (`foldSet`) that is assigned back at the end.
 *
 * @param attribute bit mask; only USET_CASE_INSENSITIVE and
 *                  USET_ADD_CASE_MAPPINGS are examined
 * @return a reference to this set
 */
UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
    if (isFrozen() || isBogus()) {
        return *this;
    }
    if (!(attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS))) {
        return *this;
    }

    const UCaseProps *csp = ucase_getSingleton();

    // Start with the input set to guarantee inclusion.
    UnicodeSet foldSet(*this);
    UnicodeString str;
    USetAdder sa = {
        foldSet.toUSet(),
        _set_add,
        _set_addRange,
        _set_addString,
        NULL, // don't need remove()
        NULL // don't need removeRange()
    };

    // USET_CASE: remove strings because the strings will actually be reduced (folded);
    //            therefore, start with no strings and add only those needed.
    // The string vector is treated as nullable further down (strings != NULL),
    // so guard the copy's vector the same way instead of dereferencing it blindly.
    if ((attribute & USET_CASE_INSENSITIVE) && foldSet.strings != NULL) {
        foldSet.strings->removeAllElements();
    }

    int32_t n = getRangeCount();
    UChar32 result;
    const UChar *full;
    int32_t locCache = 0;

    for (int32_t i=0; i<n; ++i) {
        UChar32 start = getRangeStart(i);
        UChar32 end   = getRangeEnd(i);

        if (attribute & USET_CASE_INSENSITIVE) {
            // full case closure
            for (UChar32 cp=start; cp<=end; ++cp) {
                ucase_addCaseClosure(csp, cp, &sa);
            }
        } else {
            // add case mappings
            // (does not add long s for regular s, or Kelvin for k, for example)
            for (UChar32 cp=start; cp<=end; ++cp) {
                result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
                addCaseMapping(foldSet, result, full, str);

                result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
                addCaseMapping(foldSet, result, full, str);

                result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
                addCaseMapping(foldSet, result, full, str);

                result = ucase_toFullFolding(csp, cp, &full, 0);
                addCaseMapping(foldSet, result, full, str);
            }
        }
    }

    if (strings != NULL && strings->size() > 0) {
        if (attribute & USET_CASE_INSENSITIVE) {
            for (int32_t j=0; j<strings->size(); ++j) {
                str = *(const UnicodeString *) strings->elementAt(j);
                str.foldCase();
                if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
                    foldSet.add(str); // does not map to code points: add the folded string itself
                }
            }
        } else {
            Locale root("");
#if !UCONFIG_NO_BREAK_ITERATION
            // Title-casing needs a word break iterator; if it cannot be
            // created, the string mappings are skipped altogether.
            UErrorCode status = U_ZERO_ERROR;
            BreakIterator *bi = BreakIterator::createWordInstance(root, status);
            if (U_SUCCESS(status)) {
#endif
                const UnicodeString *pStr;

                for (int32_t j=0; j<strings->size(); ++j) {
                    pStr = (const UnicodeString *) strings->elementAt(j);
                    (str = *pStr).toLower(root);
                    foldSet.add(str);
#if !UCONFIG_NO_BREAK_ITERATION
                    (str = *pStr).toTitle(bi, root);
                    foldSet.add(str);
#endif
                    (str = *pStr).toUpper(root);
                    foldSet.add(str);
                    (str = *pStr).foldCase();
                    foldSet.add(str);
                }
#if !UCONFIG_NO_BREAK_ITERATION
            }
            delete bi;
#endif
        }
    }

    *this = foldSet;
    return *this;
}
/**
 * Sets this set to the code points selected by a property name / value pair
 * given as strings.
 *
 * If `value` is non-empty, `prop` is looked up as a property name and `value`
 * as one of its values (binary, int and mask properties; numeric canonical
 * combining classes; Numeric_Value, Name, Age and Script_Extensions are handled
 * specially). If `value` is empty, `prop` is interpreted as a General_Category
 * value, a Script value, a binary property name, or one of the special names
 * compared against ANY, ASCII and ASSIGNED.
 *
 * @param prop  property name (or, with empty `value`, a property value name)
 * @param value property value name; may be empty
 * @param ec    in/out error code. Nothing happens if it already indicates
 *              failure or if the set is frozen. Set to
 *              U_ILLEGAL_ARGUMENT_ERROR for unrecognised or invalid input and
 *              to U_MEMORY_ALLOCATION_ERROR if the set ends up bogus.
 * @return a reference to this set
 */
UnicodeSet&
UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                               const UnicodeString& value,
                               UErrorCode& ec) {
    if (U_FAILURE(ec) || isFrozen()) return *this;

    // prop and value used to be converted to char * using the default
    // converter instead of the invariant conversion.
    // This should not be necessary because all Unicode property and value
    // names use only invariant characters.
    // If there are any variant characters, then we won't find them anyway.
    // Checking first avoids assertion failures in the conversion.
    if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
        !uprv_isInvariantUString(value.getBuffer(), value.length())
    ) {
        FAIL(ec);
    }
    CharString pname, vname;
    pname.appendInvariantChars(prop, ec);
    vname.appendInvariantChars(value, ec);
    if (U_FAILURE(ec)) return *this;

    UProperty p;
    int32_t v;
    UBool mustNotBeEmpty = FALSE, invert = FALSE;

    if (value.length() > 0) {
        p = u_getPropertyEnum(pname.data());
        if (p == UCHAR_INVALID_CODE) FAIL(ec);

        // Treat gc as gcm
        if (p == UCHAR_GENERAL_CATEGORY) {
            p = UCHAR_GENERAL_CATEGORY_MASK;
        }

        if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
            (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
            (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
            v = u_getPropertyValueEnum(p, vname.data());
            if (v == UCHAR_INVALID_CODE) {
                // Handle numeric CCC
                if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
                    char* end;
                    const double ccc = uprv_strtod(vname.data(), &end);
                    // Range-check BEFORE converting: casting a double that is
                    // NaN or outside the int32_t range to int32_t is undefined
                    // behaviour. The negated comparison also rejects NaN,
                    // because every comparison with NaN is false.
                    if (*end != 0 ||
                        !(ccc >= 0 && ccc <= 2147483647.0 /* INT32_MAX */) ||
                        (v = (int32_t) ccc) != ccc) {
                        // non-integral or negative value, out of range,
                        // or trailing junk
                        FAIL(ec);
                    }
                    // If the resultant set is empty then the numeric value
                    // was invalid.
                    mustNotBeEmpty = TRUE;
                } else {
                    FAIL(ec);
                }
            }
        }

        else {

            switch (p) {
            case UCHAR_NUMERIC_VALUE:
                {
                    char* end;
                    double numericValue = uprv_strtod(vname.data(), &end);
                    if (*end != 0) {
                        FAIL(ec);
                    }
                    applyFilter(numericValueFilter, &numericValue, UPROPS_SRC_CHAR, ec);
                    return *this;
                }
            case UCHAR_NAME:
                {
                    // Must munge name, since u_charFromName() does not do
                    // 'loose' matching.
                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
                    UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
                    if (U_SUCCESS(ec)) {
                        clear();
                        add(ch);
                        return *this;
                    } else {
                        FAIL(ec);
                    }
                }
            case UCHAR_UNICODE_1_NAME:
                // ICU 49 deprecates the Unicode_1_Name property APIs.
                FAIL(ec);
            case UCHAR_AGE:
                {
                    // Must munge name, since u_versionFromString() does not do
                    // 'loose' matching.
                    char buf[128];
                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
                    UVersionInfo version;
                    u_versionFromString(version, buf);
                    applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
                    return *this;
                }
            case UCHAR_SCRIPT_EXTENSIONS:
                v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
                if (v == UCHAR_INVALID_CODE) {
                    FAIL(ec);
                }
                // fall through to calling applyIntPropertyValue()
                break;
            default:
                // p is a non-binary, non-enumerated property that we
                // don't support (yet).
                FAIL(ec);
            }
        }
    }

    else {
        // value is empty.  Interpret as General Category, Script, or
        // Binary property.
        p = UCHAR_GENERAL_CATEGORY_MASK;
        v = u_getPropertyValueEnum(p, pname.data());
        if (v == UCHAR_INVALID_CODE) {
            p = UCHAR_SCRIPT;
            v = u_getPropertyValueEnum(p, pname.data());
            if (v == UCHAR_INVALID_CODE) {
                p = u_getPropertyEnum(pname.data());
                if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
                    v = 1;
                } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
                    set(MIN_VALUE, MAX_VALUE);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
                    set(0, 0x7F);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
                    // [:Assigned:]=[:^Cn:]
                    p = UCHAR_GENERAL_CATEGORY_MASK;
                    v = U_GC_CN_MASK;
                    invert = TRUE;
                } else {
                    FAIL(ec);
                }
            }
        }
    }

    applyIntPropertyValue(p, v, ec);
    if(invert) {
        complement();
    }

    if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
        // mustNotBeEmpty is set to true if an empty set indicates
        // invalid input.
        ec = U_ILLEGAL_ARGUMENT_ERROR;
    }

    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
    return *this;
}
/**
 * Parse the pattern from the given RuleCharacterIterator.  The
 * iterator is advanced over the parsed pattern.
 *
 * This set is cleared first and then rebuilt from the pattern.  Nested
 * sets are parsed by recursive calls on a scratch set; property patterns
 * are delegated to applyPropertyPattern().
 *
 * @param chars iterator over the pattern characters.  Upon return
 * it will be advanced to the first character after the parsed
 * pattern, or the end of the iteration if all characters are
 * parsed.
 * @param symbols symbol table to use to parse and dereference
 * variables, or null if none.
 * @param rebuiltPat the pattern that was parsed, rebuilt or
 * copied from the input pattern, as appropriate.  The result is
 * appended to this string.
 * @param options a bit mask of zero or more of the following:
 * IGNORE_SPACE, CASE.  The bits examined here are USET_IGNORE_SPACE,
 * USET_CASE_INSENSITIVE and USET_ADD_CASE_MAPPINGS.
 * @param caseClosure member function invoked on this set with
 * USET_CASE_INSENSITIVE or USET_ADD_CASE_MAPPINGS when the
 * corresponding option bit is set; it runs before any complement.
 * @param ec in/out error code.  Nothing happens if it already
 * indicates failure.  Set to U_MALFORMED_SET on a syntax error and to
 * U_MEMORY_ALLOCATION_ERROR if the scratch set cannot be allocated or
 * this set is bogus at the end of parsing.
 */
void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                              const SymbolTable* symbols,
                              UnicodeString& rebuiltPat,
                              uint32_t options,
                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
                              UErrorCode& ec) {
    if (U_FAILURE(ec)) return;

    // Syntax characters: [ ] ^ - & { }

    // Recognized special forms for chars, sets: c-c s-s s&s

    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
                   RuleCharacterIterator::PARSE_ESCAPES;
    if ((options & USET_IGNORE_SPACE) != 0) {
        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    }

    // patLocal accumulates the rebuilt pattern text; buf collects the
    // contents of a {multicharacter string}.
    UnicodeString patLocal, buf;
    // Set when the rebuilt text (patLocal) must be used instead of a
    // pattern generated from the final set contents.
    UBool usePat = FALSE;
    // Lazily allocated set used to parse nested sets and property patterns.
    UnicodeSetPointer scratch;
    RuleCharacterIterator::Pos backup;

    // mode: 0=before [, 1=between [...], 2=after ]
    // lastItem: 0=none, 1=char, 2=set
    int8_t lastItem = 0, mode = 0;
    // The pending single character when lastItem == 1.
    UChar32 lastChar = 0;
    // The pending operator: 0 (none), HYPHEN or INTERSECTION.
    UChar op = 0;

    UBool invert = FALSE;

    clear();

    while (mode != 2 && !chars.atEnd()) {
        // Loop invariant: an operator is only ever pending after an operand,
        // and '&' only after a set.
        U_ASSERT((lastItem == 0 && op == 0) ||
                 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
                 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
                                    op == INTERSECTION /*'&'*/)));

        UChar32 c = 0;
        UBool literal = FALSE;
        UnicodeSet* nested = 0; // alias - do not delete

        // -------- Check for property pattern

        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
        int8_t setMode = 0;
        if (resemblesPropertyPattern(chars, opts)) {
            setMode = 2;
        }

        // -------- Parse '[' of opening delimiter OR nested set.
        // If there is a nested set, use `setMode' to define how
        // the set should be parsed.  If the '[' is part of the
        // opening delimiter for this pattern, parse special
        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
        // characters representing a nested set in the symbol
        // table.

        else {
            // Prepare to backup if necessary
            chars.getPos(backup);
            c = chars.next(opts, literal, ec);
            if (U_FAILURE(ec)) return;

            if (c == 0x5B /*'['*/ && !literal) {
                if (mode == 1) {
                    // A '[' inside an open set starts a nested set; rewind so
                    // the recursive call sees the '[' itself.
                    chars.setPos(backup); // backup
                    setMode = 1;
                } else {
                    // Handle opening '[' delimiter
                    mode = 1;
                    patLocal.append((UChar) 0x5B /*'['*/);
                    chars.getPos(backup); // prepare to backup
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    if (c == 0x5E /*'^'*/ && !literal) {
                        invert = TRUE;
                        patLocal.append((UChar) 0x5E /*'^'*/);
                        chars.getPos(backup); // prepare to backup
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                    }
                    // Fall through to handle special leading '-';
                    // otherwise restart loop for nested [], \p{}, etc.
                    if (c == HYPHEN /*'-'*/) {
                        literal = TRUE;
                        // Fall through to handle literal '-' below
                    } else {
                        chars.setPos(backup); // backup
                        continue;
                    }
                }
            } else if (symbols != 0) {
                // Not a '[': see whether the character is a stand-in for a
                // previously parsed set stored in the symbol table.
                const UnicodeFunctor *m = symbols->lookupMatcher(c);
                if (m != 0) {
                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
                    if (ms == NULL) {
                        // The stand-in refers to something that is not a UnicodeSet.
                        ec = U_MALFORMED_SET;
                        return;
                    }
                    // casting away const, but `nested' won't be modified
                    // (important not to modify stored set)
                    nested = const_cast<UnicodeSet*>(ms);
                    setMode = 3;
                }
            }
        }

        // -------- Handle a nested set.  This either is inline in
        // the pattern or represented by a stand-in that has
        // previously been parsed and was looked up in the symbol
        // table.

        if (setMode != 0) {
            if (lastItem == 1) {
                if (op != 0) {
                    // syntaxError(chars, "Char expected after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                // Flush the pending single character before the nested set.
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastItem = 0;
                op = 0;
            }

            if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
                patLocal.append(op);
            }

            if (nested == 0) {
                // lazy allocation
                if (!scratch.allocate()) {
                    ec = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                nested = scratch.pointer();
            }
            switch (setMode) {
            case 1:
                // Inline nested set: parse it recursively into `nested'.
                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
                break;
            case 2:
                // Property pattern.
                chars.skipIgnored(opts);
                nested->applyPropertyPattern(chars, patLocal, ec);
                if (U_FAILURE(ec)) return;
                break;
            case 3: // `nested' already parsed
                nested->_toPattern(patLocal, FALSE);
                break;
            }

            usePat = TRUE;

            if (mode == 0) {
                // Entire pattern is a category; leave parse loop
                *this = *nested;
                mode = 2;
                break;
            }

            // Combine the nested set with this set according to the pending operator.
            switch (op) {
            case HYPHEN: /*'-'*/
                removeAll(*nested);
                break;
            case INTERSECTION: /*'&'*/
                retainAll(*nested);
                break;
            case 0:
                addAll(*nested);
                break;
            }

            op = 0;
            lastItem = 2;

            continue;
        }

        if (mode == 0) {
            // syntaxError(chars, "Missing '['");
            ec = U_MALFORMED_SET;
            return;
        }

        // -------- Parse special (syntax) characters.  If the
        // current character is not special, or if it is escaped,
        // then fall through and handle it below.

        if (!literal) {
            switch (c) {
            case 0x5D /*']'*/:
                // Closing delimiter: flush any pending character.
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                // Treat final trailing '-' as a literal
                if (op == HYPHEN /*'-'*/) {
                    add(op, op);
                    patLocal.append(op);
                } else if (op == INTERSECTION /*'&'*/) {
                    // syntaxError(chars, "Trailing '&'");
                    ec = U_MALFORMED_SET;
                    return;
                }
                patLocal.append((UChar) 0x5D /*']'*/);
                mode = 2;
                continue;
            case HYPHEN /*'-'*/:
                if (op == 0) {
                    if (lastItem != 0) {
                        // '-' after a char or set becomes the pending operator.
                        op = (UChar) c;
                        continue;
                    } else {
                        // Treat final trailing '-' as a literal
                        // (only accepted when ']' follows immediately)
                        add(c, c);
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x5D /*']'*/ && !literal) {
                            patLocal.append(HYPHEN_RIGHT_BRACE, 2);
                            mode = 2;
                            continue;
                        }
                    }
                }
                // syntaxError(chars, "'-' not after char or set");
                ec = U_MALFORMED_SET;
                return;
            case INTERSECTION /*'&'*/:
                if (lastItem == 2 && op == 0) {
                    op = (UChar) c;
                    continue;
                }
                // syntaxError(chars, "'&' not after set");
                ec = U_MALFORMED_SET;
                return;
            case 0x5E /*'^'*/:
                // syntaxError(chars, "'^' not after '['");
                ec = U_MALFORMED_SET;
                return;
            case 0x7B /*'{'*/:
                if (op != 0) {
                    // syntaxError(chars, "Missing operand after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                lastItem = 0;
                buf.truncate(0);
                {
                    // Collect characters up to the matching unescaped '}'.
                    UBool ok = FALSE;
                    while (!chars.atEnd()) {
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x7D /*'}'*/ && !literal) {
                            ok = TRUE;
                            break;
                        }
                        buf.append(c);
                    }
                    if (buf.length() < 1 || !ok) {
                        // syntaxError(chars, "Invalid multicharacter string");
                        ec = U_MALFORMED_SET;
                        return;
                    }
                }
                // We have new string. Add it to set and continue;
                // we don't need to drop through to the further
                // processing
                add(buf);
                patLocal.append((UChar) 0x7B /*'{'*/);
                _appendToPat(patLocal, buf, FALSE);
                patLocal.append((UChar) 0x7D /*'}'*/);
                continue;
            case SymbolTable::SYMBOL_REF:
                //         symbols  nosymbols
                // [a-$]   error    error (ambiguous)
                // [a$]    anchor   anchor
                // [a-$x]  var "x"* literal '$'
                // [a-$.]  error    literal '$'
                // *We won't get here in the case of var "x"
                {
                    chars.getPos(backup);
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    // '$' immediately before the closing ']' is an anchor.
                    UBool anchor = (c == 0x5D /*']'*/ && !literal);
                    if (symbols == 0 && !anchor) {
                        c = SymbolTable::SYMBOL_REF;
                        chars.setPos(backup);
                        break; // literal '$'
                    }
                    if (anchor && op == 0) {
                        if (lastItem == 1) {
                            add(lastChar, lastChar);
                            _appendToPat(patLocal, lastChar, FALSE);
                        }
                        add(U_ETHER);
                        usePat = TRUE;
                        patLocal.append((UChar) SymbolTable::SYMBOL_REF);
                        patLocal.append((UChar) 0x5D /*']'*/);
                        mode = 2;
                        continue;
                    }
                    // syntaxError(chars, "Unquoted '$'");
                    ec = U_MALFORMED_SET;
                    return;
                }
            default:
                break;
            }
        }

        // -------- Parse literal characters.  This includes both
        // escaped chars ("\u4E01") and non-syntax characters
        // ("a").

        switch (lastItem) {
        case 0:
            // Nothing pending: remember this character.
            lastItem = 1;
            lastChar = c;
            break;
        case 1:
            if (op == HYPHEN /*'-'*/) {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
                    // these are most likely typos.
                    // syntaxError(chars, "Invalid range");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, c);
                _appendToPat(patLocal, lastChar, FALSE);
                patLocal.append(op);
                _appendToPat(patLocal, c, FALSE);
                lastItem = 0;
                op = 0;
            } else {
                // Two characters in a row: flush the earlier one and keep
                // the new one pending.
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastChar = c;
            }
            break;
        case 2:
            if (op != 0) {
                // syntaxError(chars, "Set expected after operator");
                ec = U_MALFORMED_SET;
                return;
            }
            lastChar = c;
            lastItem = 1;
            break;
        }
    }

    if (mode != 2) {
        // syntaxError(chars, "Missing ']'");
        ec = U_MALFORMED_SET;
        return;
    }

    chars.skipIgnored(opts);

    /**
     * Handle global flags (invert, case insensitivity).  If this
     * pattern should be compiled case-insensitive, then we need
     * to close over case BEFORE COMPLEMENTING.  This makes
     * patterns like /[^abc]/i work.
     */
    if ((options & USET_CASE_INSENSITIVE) != 0) {
        (this->*caseClosure)(USET_CASE_INSENSITIVE);
    }
    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
        (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
    }
    if (invert) {
        complement();
    }

    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
    // generated pattern.
    if (usePat) {
        rebuiltPat.append(patLocal);
    } else {
        _generatePattern(rebuiltPat, FALSE);
    }
    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
}