/**
 * Builds a NumberingSystem from explicitly supplied parameters.
 *
 * @param radix_in          the radix of the numbering system; values below 2
 *                          are rejected.
 * @param isAlgorithmic_in  whether the system is algorithmic. When it is not,
 *                          desc_in must contain exactly radix_in code points.
 * @param desc_in           the description string stored on the new object.
 * @param status            ICU error code. Nothing is done if it already
 *                          indicates failure on entry. Set to
 *                          U_ILLEGAL_ARGUMENT_ERROR when an argument is
 *                          rejected, or U_MEMORY_ALLOCATION_ERROR when the
 *                          object cannot be allocated.
 * @return a newly allocated NumberingSystem with no name set, or NULL on any
 *         failure.
 */
NumberingSystem* U_EXPORT2
NumberingSystem::createInstance(int32_t radix_in, UBool isAlgorithmic_in, const UnicodeString & desc_in, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }

    if (radix_in < 2) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    // A non-algorithmic description must supply one code point per digit value.
    if (!isAlgorithmic_in && desc_in.countChar32() != radix_in) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    NumberingSystem *ns = new NumberingSystem();
    // Guard against a failed allocation before touching the object.
    // NOTE(review): this assumes the class's operator new reports failure by
    // returning NULL rather than throwing -- confirm against UMemory.
    if (ns == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    ns->setRadix(radix_in);
    ns->setDesc(desc_in);
    ns->setAlgorithmic(isAlgorithmic_in);
    ns->setName(NULL);
    return ns;
}
void StaticUnicodeSetsTest::assertInSet(const UnicodeString &localeName, const UnicodeString &setName, const UnicodeSet &set, const UnicodeString &str) { if (str.countChar32(0, str.length()) != 1) { // Ignore locale strings with more than one code point (usually a bidi mark) return; } assertInSet(localeName, setName, set, str.char32At(0)); }
/**
 * Searches subject for the first match of an ICU regular expression.
 *
 * Compiled patterns are kept in s_patternCacheMap, keyed by the string
 * "<pattern>:<flags>".
 *
 * @param pattern  the regular expression, converted from UTF-8.
 * @param subject  the text to search, converted from UTF-8.
 * @param matches  when referenced, reset to an empty array and then filled
 *                 with one entry per group (group 0 first). Each entry is the
 *                 matched text, or a (text, offset) pair when flags contains
 *                 k_UREGEX_OFFSET_CAPTURE; the offset is counted in code
 *                 points from the start of subject.
 * @param flags    the low 32 bits are passed to RegexPattern::compile.
 * @return false if ICU reports an error, otherwise 1 when a match was found
 *         and 0 when it was not.
 */
Variant f_icu_match(CStrRef pattern, CStrRef subject,
                    VRefParam matches /* = null */,
                    int64_t flags /* = 0 */) {
  UErrorCode status = U_ZERO_ERROR;

  if (matches.isReferenced()) {
    matches = Array();
  }

  // The cache key is the pattern and the flags joined by ':'.
  StringBuffer keyBuilder;
  keyBuilder.append(pattern);
  keyBuilder.append(':');
  keyBuilder.append(flags);
  String cacheKey = keyBuilder.detach();

  // Look the compiled pattern up in the cache, compiling and publishing it
  // on a miss.
  PatternStringMap::accessor cacheEntry;
  const RegexPattern* compiled;
  if (!s_patternCacheMap.find(cacheEntry, cacheKey.get())) {
    // Only the first 32 bits carry ICU-specific flags.
    compiled = RegexPattern::compile(
      UnicodeString::fromUTF8(pattern.data()), (flags & 0xFFFFFFFF), status);
    if (U_FAILURE(status)) {
      return false;
    }

    if (!s_patternCacheMap.insert(
          cacheEntry, StringData::GetStaticString(cacheKey.get()))) {
      // The key was already present: discard our copy and use the cached one.
      delete compiled;
      compiled = cacheEntry->second;
    } else {
      cacheEntry->second = compiled;
    }
  } else {
    compiled = cacheEntry->second;
  }

  // Bind the compiled pattern to the subject text.
  UnicodeString subjectText = UnicodeString::fromUTF8(subject.data());
  boost::scoped_ptr<RegexMatcher> matcher(
    compiled->matcher(subjectText, status));
  if (U_FAILURE(status)) {
    return false;
  }

  // The result is 0 or 1; groups are only collected when the caller passed
  // a reference to receive them.
  int matched = 0;
  if (!matcher->find()) {
    return matched;
  }
  matched = 1;
  if (!matches.isReferenced()) {
    return matched;
  }

  const bool wantOffsets = (flags & k_UREGEX_OFFSET_CAPTURE) != 0;
  const int32_t groupTotal = matcher->groupCount();
  for (int32_t groupIndex = 0; groupIndex <= groupTotal; groupIndex++) {
    UnicodeString captured = matcher->group(groupIndex, status);
    if (U_FAILURE(status)) {
      return false;
    }

    // Convert the captured text back to UTF-8.
    std::string utf8;
    captured.toUTF8String(utf8);
    String match = String(utf8);

    if (!wantOffsets) {
      matches->append(match);
      continue;
    }

    // start() yields an index into the UnicodeString, which is normally
    // counted in 16-bit code units; translate it to a code point count.
    int32_t unitOffset = matcher->start(groupIndex, status);
    if (U_FAILURE(status)) {
      return false;
    }
    int32_t start = subjectText.countChar32(0, unitOffset);
    matches->append(CREATE_VECTOR2(match, start));
  }

  return matched;
}
/// \brief Parse a string containing index indicators into an IndexedString.
///
/// Recognised markup, as implemented below:
///  - "@[name]" opens a named range,
///  - "@[]"     closes the most recently opened range,
///  - "@[["     is an escape that produces a literal "@[".
///
/// All markup is removed from the resulting text. Each closed range is
/// recorded as a Needle built from the code point counts of the cleaned text
/// at the points where the range was opened and closed.
///
/// \param String the text to parse.
/// \return the parsed IndexedString, or an unassigned Maybe if String is
///         bogus, an indicator has no terminating "]", or a closing indicator
///         has no matching opening indicator.
///
seec::Maybe<IndexedString> IndexedString::from(UnicodeString const &String)
{
  if (String.isBogus())
    return seec::Maybe<IndexedString>();

  UnicodeString const NeedleStart("@[");
  UnicodeString const NeedleEscape("@[[");
  UnicodeString const NeedleEnd("]");

  UnicodeString CleanedString; // String with index indicators removed.
  std::multimap<UnicodeString, Needle> Needles;

  // Names and start offsets of the ranges that are currently open.
  std::vector<std::pair<UnicodeString, int32_t>> IndexStack;

  int32_t SearchFrom = 0; // Current offset in String.
  int32_t FoundStart;     // Position of matched index indicator.

  while ((FoundStart = String.indexOf(NeedleStart, SearchFrom)) != -1) {
    // Copy all the literal string data.
    CleanedString.append(String, SearchFrom, FoundStart - SearchFrom);

    // Check if this is an escape sequence.
    if (String.compare(FoundStart, NeedleEscape.length(), NeedleEscape) == 0) {
      CleanedString.append(NeedleStart);
      SearchFrom = FoundStart + NeedleEscape.length();
      continue;
    }

    // Find the end of this sequence. The search must begin after the
    // indicator that was just found: starting at SearchFrom would let a
    // literal "]" in the preceding text be mistaken for the terminator.
    int32_t const AfterStart = FoundStart + NeedleStart.length();
    int32_t const FoundEnd = String.indexOf(NeedleEnd, AfterStart);
    if (FoundEnd == -1)
      return seec::Maybe<IndexedString>();

    if (FoundEnd == AfterStart) {
      // This is a closing sequence.
      if (IndexStack.size() == 0)
        return seec::Maybe<IndexedString>();

      // Pop the starting details of the last-opened sequence.
      auto const Start = IndexStack.back();
      IndexStack.pop_back();

      // Store the needle for this sequence.
      Needles.insert(std::make_pair(Start.first,
                                    Needle(Start.second,
                                           CleanedString.countChar32())));
    }
    else {
      // This is an opening sequence: the name lies between "@[" and "]".
      int32_t const NameLength = FoundEnd - AfterStart;

      IndexStack.emplace_back(UnicodeString(String, AfterStart, NameLength),
                              CleanedString.countChar32());
    }

    SearchFrom = FoundEnd + NeedleEnd.length();
  }

  // Copy all remaining literal data.
  CleanedString.append(String, SearchFrom, String.length() - SearchFrom);

  return IndexedString(std::move(CleanedString), std::move(Needles));
}
/**
 * Dumb recursive implementation of permutation.
 * TODO: optimize
 * @param source    the string to find permutations for
 * @param skipZeros when true, a code point whose canonical combining class is
 *                  zero is never chosen as the leading code point unless it
 *                  is already at index 0
 * @param result    receives the permutations: each one is put under its own
 *                  text as key, with a heap-allocated UnicodeString copy as
 *                  value
 * @param status    ICU error code; nothing is done if it already indicates
 *                  failure on entry, and it is set to
 *                  U_MEMORY_ALLOCATION_ERROR when an allocation fails
 */
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return;
    }
    //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));

    // optimization:
    // if zero or one code point, just return a set with it.
    // length() is tested first (a single code point occupies at most two
    // code units) to keep from counting code points all the time.
    if (source.length() <= 2 && source.countChar32() <= 1) {
        UnicodeString *toPut = new UnicodeString(source);
        /* test for NULL */
        if (toPut == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        result->put(source, toPut, status);
        return;
    }

    // otherwise iterate through the string, and recursively permute all the other characters
    Hashtable subpermute(status);
    if(U_FAILURE(status)) {
        return;
    }
    subpermute.setValueDeleter(uprv_deleteUObject);

    UChar32 cp;
    for (int32_t i = 0; i < source.length(); i += U16_LENGTH(cp)) {
        cp = source.char32At(i);

        // optimization:
        // if the character is canonical combining class zero,
        // don't permute it.
        // This is decided before copying source so that a skipped code point
        // costs no string copy.
        if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
            continue;
        }

        subpermute.removeAll();

        // see what the permutations of the characters before and after this one are.
        // replace() is destructive, so it is applied to a copy of source.
        UnicodeString subPermuteString = source;
        permute(subPermuteString.replace(i, U16_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status);
        /* Test for buffer overflows */
        if(U_FAILURE(status)) {
            return;
        }

        // prefix this character to all of them
        int32_t el = UHASH_FIRST;
        const UHashElement *ne = subpermute.nextElement(el);
        while (ne != NULL) {
            UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);
            UnicodeString *chStr = new UnicodeString(cp);
            //test for NULL
            if (chStr == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            chStr->append(*permRes);
            //if (PROGRESS) printf("  Piece: %s\n", UToS(*chStr));
            result->put(*chStr, chStr, status);
            // Once a put has failed there is no point allocating further
            // strings; stop here.
            // NOTE(review): whether chStr is released on a failed put depends
            // on how the caller configured result -- confirm.
            if (U_FAILURE(status)) {
                return;
            }
            ne = subpermute.nextElement(el);
        }
    }
}