void FStringConverter::ConvertString(const TCHAR* Source, const int32 SourceStartIndex, const int32 SourceLen, icu::UnicodeString& Destination, const bool ShouldNullTerminate) { if (SourceLen > 0) { UErrorCode ICUStatus = U_ZERO_ERROR; ucnv_reset(ICUConverter); // Get the internal buffer of the string, we're going to use it as scratch space const int32_t DestinationCapacityUChars = SourceLen * 2; UChar* InternalStringBuffer = Destination.getBuffer(DestinationCapacityUChars); // Perform the conversion into the string buffer const int32_t SourceSizeBytes = SourceLen * sizeof(TCHAR); const int32_t DestinationLength = ucnv_toUChars(ICUConverter, InternalStringBuffer, DestinationCapacityUChars, reinterpret_cast<const char*>(Source + SourceStartIndex), SourceSizeBytes, &ICUStatus); // Optionally null terminate the string if (ShouldNullTerminate) { InternalStringBuffer[DestinationLength] = 0; } // Size it back down to the correct size and release our lock on the string buffer Destination.releaseBuffer(DestinationLength); check(U_SUCCESS(ICUStatus)); } else { Destination.remove(); } }
/**
 * UnicodeString flavour of uspoof_check(): forwards the string's UTF-16
 * buffer and length, together with the remaining arguments, unchanged.
 *
 * @param sc        Spoof checker passed through to uspoof_check().
 * @param text      String whose buffer and length are checked.
 * @param position  Passed through to uspoof_check().
 * @param status    Passed through to uspoof_check().
 * @return          Whatever uspoof_check() returns.
 */
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
                          const icu::UnicodeString &text,
                          int32_t *position,
                          UErrorCode *status) {
    const UChar *chars = text.getBuffer();
    const int32_t charCount = text.length();
    return uspoof_check(sc, chars, charCount, position, status);
}
/**
 * Returns the length, as reported by icu::UnicodeString::length(), that the given
 * range of a native string has once converted with ConvertString.
 *
 * @param Source              Native string to measure.
 * @param InSourceStartIndex  Index of the first TCHAR of the range.
 * @param InSourceLength      Number of TCHARs in the range; values <= 0 yield 0.
 */
int32 GetUnicodeStringLengthImpl(const TCHAR* Source, const int32 InSourceStartIndex, const int32 InSourceLength)
{
    // Nothing to convert, so nothing to measure
    if (InSourceLength <= 0)
    {
        return 0;
    }

    const icu::UnicodeString Converted = ConvertString(Source, InSourceStartIndex, InSourceLength);
    return Converted.length();
}
/**
 * Writes the content of a UnicodeString to a Unitex file (system filesystem or filespace).
 *
 * The data is handed to WriteUnitexFile as two buffers: a single U+FEFF
 * byte-order mark first, then the string's UTF-16 code units.
 *
 * @param filename  Name of the file to write, passed through to WriteUnitexFile.
 * @param uString   Text to write after the byte-order mark.
 * @return          1 when WriteUnitexFile returns 0, otherwise 0.
 */
UNITEX_FUNC int UNITEX_CALL WriteUnicodeUnitexFile(const char*filename, icu::UnicodeString const& uString) {
    UChar byteOrderMark = 0xfeff;
    const int32_t unitCount = uString.length();
    const UChar * units = uString.getBuffer();

    // Both sizes are given in bytes, not in code units.
    const bool written =
        WriteUnitexFile(filename,
                        &byteOrderMark, sizeof(UChar),
                        units, unitCount * sizeof(UChar)) == 0;
    return written;
}
/**
 * UnicodeString flavour of uspoof_areConfusable(): forwards the UTF-16 buffer
 * and length of both strings.
 *
 * @param sc      Spoof checker passed through to uspoof_areConfusable().
 * @param s1      First string of the pair.
 * @param s2      Second string of the pair.
 * @param status  Passed through to uspoof_areConfusable().
 * @return        Whatever uspoof_areConfusable() returns.
 */
U_CAPI int32_t U_EXPORT2
uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
                                  const icu::UnicodeString &s1,
                                  const icu::UnicodeString &s2,
                                  UErrorCode *status) {
    return uspoof_areConfusable(sc,
                                s1.getBuffer(), s1.length(),
                                s2.getBuffer(), s2.length(),
                                status);
}
/**
 * Converts a range of an ICU UnicodeString into a native FString using ICUConverter.
 *
 * @param Source            String to read from.
 * @param SourceStartIndex  Index of the first UChar of Source to convert.
 * @param SourceLen         Number of UChars to convert; a value <= 0 just empties Destination.
 * @param Destination       Receives the converted, null-terminated text; previous content is replaced.
 */
void FStringConverter::ConvertString(const icu::UnicodeString& Source, const int32 SourceStartIndex, const int32 SourceLen, FString& Destination)
{
    // Guard on the requested range as well as on the source itself, matching the TCHAR -> UnicodeString
    // overload which keys off SourceLen. An empty request yields an empty FString.
    if (Source.length() <= 0 || SourceLen <= 0)
    {
        Destination.Empty();
        return;
    }

    UErrorCode ICUStatus = U_ZERO_ERROR;

    ucnv_reset(ICUConverter);

    // Get the internal buffer of the string, we're going to use it as scratch space
    TArray<TCHAR>& InternalStringBuffer = Destination.GetCharArray();

    // Work out the maximum size required and resize the buffer so it can hold enough data
    const int32_t DestinationCapacityBytes = UCNV_GET_MAX_BYTES_FOR_STRING(SourceLen, ucnv_getMaxCharSize(ICUConverter));
    const int32 DestinationCapacityTCHARs = DestinationCapacityBytes / sizeof(TCHAR);
    InternalStringBuffer.SetNumUninitialized(DestinationCapacityTCHARs);

    // Perform the conversion into the string buffer
    const int32_t DestinationSizeBytes = ucnv_fromUChars(ICUConverter, reinterpret_cast<char*>(InternalStringBuffer.GetData()), DestinationCapacityBytes, Source.getBuffer() + SourceStartIndex, SourceLen, &ICUStatus);
    const int32 DestinationSizeTCHARs = DestinationSizeBytes / sizeof(TCHAR);

    if (U_SUCCESS(ICUStatus))
    {
        // Size the array first (it includes the null) so the terminator slot is guaranteed to exist,
        // then null terminate the FString
        InternalStringBuffer.SetNum(DestinationSizeTCHARs + 1, /*bAllowShrinking*/false);
        InternalStringBuffer[DestinationSizeTCHARs] = 0;
    }
    else
    {
        // On failure the reported size is not a valid index into the array and the array holds
        // uninitialised scratch data, so neither is used
        Destination.Empty();
    }

    check(U_SUCCESS(ICUStatus));
}
bool ustring_from_char(icu::UnicodeString& ret, const String& str, UErrorCode &error) { int32_t capacity = str.size() + 1; UChar *utf16 = ret.getBuffer(capacity); int32_t utf16_len = 0; error = U_ZERO_ERROR; u_strFromUTF8WithSub(utf16, ret.getCapacity(), &utf16_len, str.c_str(), str.size(), U_SENTINEL /* no substitution */, nullptr, &error); ret.releaseBuffer(utf16_len); if (U_FAILURE(error)) { ret.setToBogus(); return false; } return true; }
// -------------------------------------------------------------------------- void processor::on_start_tag_type ( icu::UnicodeString const& type ) // -------------------------------------------------------------------------- { element_info info; if (!m_character_data.isEmpty()) { character_data(m_character_data); m_character_data.remove(); } info.type = type; info.child_counter = 0; if (m_element_info.empty()) { if (m_validating && !m_document_type.m_root_type.isEmpty() && type != m_document_type.m_root_type) { std::string msg; msg += "Root element type does not match the document type.\n"; msg += "Document type name: "; m_document_type.m_root_type.toUTF8String(msg); msg += "\nRoot element type: "; type.toUTF8String(msg); throw semantic_error(msg); } info.xmlns[""] = uri(); info.base = m_base_iri; info.space = false; } else { if (m_validating) { throw not_implemented("Element validity checking."); } ++m_element_info.top().child_counter; info.xmlns = m_element_info.top().xmlns; info.base = m_element_info.top().base; info.lang = m_element_info.top().lang; info.space = m_element_info.top().space; } m_element_info.push(info); m_sax_attrs.clear(); }
static bool ustring_from_char(icu::UnicodeString& ret, const String& str, UErrorCode &error) { error = U_ZERO_ERROR; ret = u16(str, error, U_SENTINEL); if (U_FAILURE(error)) { ret.setToBogus(); return false; } return true; }
void ConvertString(const FString& Source, icu::UnicodeString& Destination, const bool ShouldNullTerminate) { if (Source.Len() > 0) { FStringConverter StringConverter; StringConverter.ConvertString(Source, Destination, ShouldNullTerminate); } else { Destination.remove(); } }
void ConvertString(const TCHAR* Source, const int32 SourceStartIndex, const int32 SourceLen, icu::UnicodeString& Destination, const bool ShouldNullTerminate) { if (SourceLen > 0) { FStringConverter StringConverter; StringConverter.ConvertString(Source, SourceStartIndex, SourceLen, Destination, ShouldNullTerminate); } else { Destination.remove(); } }
/**
 * Prints one line to stdout for the code point range start..end:
 *
 *   XXXX[..YYYY] ; <status name>[ ; <mapping code points>]
 *
 * A single code point is printed instead of a range when start==end. The
 * mapping column is emitted when status is MAPPED or DEVIATION, or when the
 * mapping is not empty; its code points are printed in hex, separated by spaces.
 */
static void
printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) {
    if(start!=end) {
        printf("%04lX..%04lX ", (long)start, (long)end);
    } else {
        printf("%04lX ", (long)start);
    }

    printf("; %s", statusNames[status]);

    if(status==MAPPED || status==DEVIATION || !mapping.isEmpty()) {
        printf(" ;");
        const UChar *units=mapping.getBuffer();
        const int32_t unitCount=mapping.length();
        // U16_NEXT advances the index itself, by one or two code units.
        for(int32_t index=0; index<unitCount;) {
            UChar32 codePoint;
            U16_NEXT(units, index, unitCount, codePoint);
            printf(" %04lX", (long)codePoint);
        }
    }

    puts("");
}
// -------------------------------------------------------------------------- void processor::on_end_tag ( icu::UnicodeString const& type ) // -------------------------------------------------------------------------- { if (!m_character_data.isEmpty()) { character_data(m_character_data); m_character_data.remove(); } if (type != m_element_info.top().type) { std::string msg, tree; msg += "STag-ETag name mismatch.\n"; msg += "ETag name: "; type.toUTF8String(msg); while (!m_element_info.empty()) { std::string tmp; m_element_info.top().type.toUTF8String(tmp); m_element_info.pop(); tree = "/" + tmp + tree; } msg += "\nElement tree: " + tree; throw semantic_error(msg); } element_end(); if (!m_element_info.empty()) { m_element_info.pop(); } // Update current element's variables. if (!m_element_info.empty()) { m_element.assign(m_element_info.top().type, m_element_info.top().xmlns); m_attributes.clear(); m_base_iri = m_element_info.top().base; m_language = m_element_info.top().lang; m_preserve_space = m_element_info.top().space; } }
static int toIDNA2003(const UStringPrepProfile *prep, UChar32 c, icu::UnicodeString &destString) { UChar src[2]; int32_t srcLength=0; U16_APPEND_UNSAFE(src, srcLength, c); UChar *dest; int32_t destLength; dest=destString.getBuffer(32); if(dest==NULL) { return FALSE; } UErrorCode errorCode=U_ZERO_ERROR; destLength=usprep_prepare(prep, src, srcLength, dest, destString.getCapacity(), USPREP_DEFAULT, NULL, &errorCode); destString.releaseBuffer(destLength); if(errorCode==U_STRINGPREP_PROHIBITED_ERROR) { return -1; } else { // Returns FALSE=0 for U_STRINGPREP_UNASSIGNED_ERROR and processing errors, // TRUE=1 if c is valid or mapped. return U_SUCCESS(errorCode); } }
/**
 * Feeds the code points of `u` to `out` together with source spans, normalizing
 * the part of the string that is not already normalized.
 *
 * The prefix reported by normalizer->spanQuickCheckYes() is forwarded code
 * point by code point, each with a one-wide span [k, k+1) where k counts the
 * code points emitted so far. The remainder of the string is handed to
 * IcuNormalizeByChunks, which delivers its output to `out`.
 *
 * @param u           String to normalize.
 * @param normalizer  Normalizer used for the quick check and the chunked normalization.
 * @param out         Receiver of the (code point, span) pairs.
 */
void alignedNormalizeUnicodeString(icu::UnicodeString const& u, IcuNormalizer2Ptr normalizer,
                                   ITakeAlignedChars& out) {
  // TODO: test
  UErrorCode err = U_ZERO_ERROR;
  int nfcPrefixLen = normalizer->spanQuickCheckYes(u, err);
  assert(U_SUCCESS(err));
  assert(u.length() >= 0 && nfcPrefixLen >= 0);

  TokenSpan span;
  span.first = 0;
  icu::StringCharacterIterator it(u);
  // getIndex() is in UTF-16 code units (as is nfcPrefixLen), while the span
  // advances by one per code point.
  while (it.getIndex() < nfcPrefixLen) {
    assert(it.hasNext());
    Unicode c = it.next32PostInc();
    span.second = span.first + 1;
    out.takeWithSpan(c, span);
    ++span.first;
  }

  icu::UnicodeString remainder(u.tempSubString(nfcPrefixLen));
  CharsFromUnicodeStringImpl chars(remainder);
  // TODO: docs say normalizeSecondAndAppend
  IcuNormalizeByChunks<CharsFromUnicodeStringImpl> norm(chars, normalizer);
  norm.takeAllWithSpan(out);
}
// -------------------------------------------------------------------------- icu::UnicodeString processor::normalize_enum ( icu::UnicodeString const& value ) // -------------------------------------------------------------------------- { icu::UnicodeString normalized; int32_t pos; bool space_before = false; bool leading = true; for (pos=0; pos<value.length(); ++pos) { if (value[pos] == ' ') { space_before = true; } else { if (space_before) { if (leading) { leading = false; } else { normalized += ' '; } space_before = false; } normalized += value[pos]; } } return normalized; }
/// Creates a Java string from the UTF-16 buffer and length of `value` via
/// env->NewString() and returns it.
jobject operator()(icu::UnicodeString const& value) const
{
    jobject const javaString = env->NewString(value.getBuffer(), value.length());
    return javaString;
}
/// Returns the past-the-end iterator for `s`: an iterator over `s` positioned
/// at index s.length().
inline
cxxopts::UnicodeStringIterator
end(const icu::UnicodeString& s)
{
  const auto past_last = s.length();
  return cxxopts::UnicodeStringIterator(&s, past_last);
}
/// Builds a QString from the UTF-16 buffer of an ICU string. The buffer is
/// reinterpreted as QChar data and the length is passed explicitly.
QString
EnabledLocalesModel::unicodeStringToQString( const icu::UnicodeString& sourceStr )
{
    const QChar* const chars = reinterpret_cast<const QChar*>( sourceStr.getBuffer() );
    const auto count = sourceStr.length();
    return QString( chars, count );
}
/**
 * Runs the checks enabled in the checker's fChecks mask against the identifier `id`.
 *
 * @param sc        Spoof checker; validated with SpoofImpl::validateThis().
 * @param id        Identifier to check.
 * @param position  If not NULL, set to 0 before returning (not touched when
 *                  validateThis() fails).
 * @param status    ICU error code, read and updated by the helpers called here.
 * @return          Bitwise OR of the USPOOF_* flags of the checks that failed;
 *                  when USPOOF_AUX_INFO is enabled together with
 *                  USPOOF_RESTRICTION_LEVEL, the identifier's restriction level
 *                  is OR-ed in as well. 0 if validateThis() returns NULL.
 */
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc, const icu::UnicodeString &id, int32_t *position, UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }
    int32_t result = 0;

    // Obtained lazily; always handed back at cleanupAndReturn (possibly still NULL).
    IdentifierInfo *identifierInfo = NULL;
    if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
        identifierInfo = This->getIdentifierInfo(*status);
        if (U_FAILURE(*status)) {
            goto cleanupAndReturn;
        }
        identifierInfo->setIdentifier(id, *status);
        identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
    }

    if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
        URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
        if (idRestrictionLevel > This->fRestrictionLevel) {
            result |= USPOOF_RESTRICTION_LEVEL;
        }
        if (This->fChecks & USPOOF_AUX_INFO) {
            result |= idRestrictionLevel;
        }
    }

    if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
        const UnicodeSet *numerics = identifierInfo->getNumerics();
        if (numerics->size() > 1) {
            result |= USPOOF_MIXED_NUMBERS;
        }

        // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
        //       We have no easy way to do the same in C.
        // if (checkResult != null) {
        //     checkResult.numerics = numerics;
        // }
    }

    // Every code point of the identifier must be in the allowed set.
    if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
        int32_t i;
        UChar32 c;
        int32_t length = id.length();
        for (i=0; i<length ;) {
            c = id.char32At(i);
            i += U16_LENGTH(c);
            if (!This->fAllowedCharsSet->contains(c)) {
                result |= USPOOF_CHAR_LIMIT;
                break;
            }
        }
    }

    if (This->fChecks &
        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // These are the checks that need to be done on NFD input
        UnicodeString nfdText;
        gNfdNormalizer->normalize(id, nfdText, *status);
        int32_t nfdLength = nfdText.length();

        if (This->fChecks & USPOOF_INVISIBLE) {

            // scan for more than one occurrence of the same non-spacing mark
            // in a sequence of non-spacing marks.
            int32_t i;
            UChar32 c;
            UChar32 firstNonspacingMark = 0;
            UBool haveMultipleMarks = FALSE;
            UnicodeSet marksSeenSoFar;   // Set of combining marks in a single combining sequence.

            for (i=0; i<nfdLength ;) {
                c = nfdText.char32At(i);
                i += U16_LENGTH(c);
                if (u_charType(c) != U_NON_SPACING_MARK) {
                    // End of a mark sequence: reset the per-sequence state.
                    firstNonspacingMark = 0;
                    if (haveMultipleMarks) {
                        marksSeenSoFar.clear();
                        haveMultipleMarks = FALSE;
                    }
                    continue;
                }
                if (firstNonspacingMark == 0) {
                    // A single mark cannot repeat; the set is only populated from the second mark on.
                    firstNonspacingMark = c;
                    continue;
                }
                if (!haveMultipleMarks) {
                    marksSeenSoFar.add(firstNonspacingMark);
                    haveMultipleMarks = TRUE;
                }
                if (marksSeenSoFar.contains(c)) {
                    // report the error, and stop scanning.
                    // No need to find more than the first failure.
                    result |= USPOOF_INVISIBLE;
                    break;
                }
                marksSeenSoFar.add(c);
            }
        }

        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // The basic test is the same for both whole and mixed script confusables.
            // Compute the set of scripts that every input character has a confusable in.
            // For this computation an input character is always considered to be
            // confusable with itself in its own script.
            //
            // If the number of such scripts is two or more, and the input consisted of
            // characters all from a single script, we have a whole script confusable.
            // (The two scripts will be the original script and the one that is confusable)
            //
            // If the number of such scripts >= one, and the original input contained characters from
            // more than one script, we have a mixed script confusable.  (We can transform
            // some of the characters, and end up with a visually similar string all in
            // one script.)

            // The identifier info may not have been set up above if neither
            // USPOOF_RESTRICTION_LEVEL nor USPOOF_MIXED_NUMBERS is enabled.
            if (identifierInfo == NULL) {
                identifierInfo = This->getIdentifierInfo(*status);
                if (U_FAILURE(*status)) {
                    goto cleanupAndReturn;
                }
                identifierInfo->setIdentifier(id, *status);
            }

            int32_t scriptCount = identifierInfo->getScriptCount();

            ScriptSet scripts;
            This->wholeScriptCheck(nfdText, &scripts, *status);
            int32_t confusableScriptCount = scripts.countMembers();
            //printf("confusableScriptCount = %d\n", confusableScriptCount);

            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2 &&
                scriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }

            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1 &&
                scriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }

cleanupAndReturn:
    This->releaseIdentifierInfo(identifierInfo);
    if (position != NULL) {
        *position = 0;
    }
    return result;
}
/**
 * Converts a whole ICU UnicodeString into a native FString.
 * Convenience overload of the ranged conversion, using start index 0 and Source.length().
 *
 * @param Source       String to convert.
 * @param Destination  Receives the converted text.
 */
void FStringConverter::ConvertString(const icu::UnicodeString& Source, FString& Destination)
{
    const int32 WholeLength = Source.length();
    ConvertString(Source, 0, WholeLength, Destination);
}
// -------------------------------------------------------------------------- void processor::on_reference ( icu::UnicodeString const& name, bool attvalue ) // -------------------------------------------------------------------------- { if (!attvalue && m_ref_history.empty() && m_auto_replace_general) { if (!m_character_data.isEmpty()) { character_data(m_character_data); m_character_data.remove(); } reference(name); } // Pre-defined entities. if (name == "lt") { icu::UnicodeString entity = "<"; m_buffers.emplace("<", new io::uistring(entity, false)); if (attvalue) { parse_included_attvalue(); } else { parse_content(); } m_buffers.pop(); return; } else if (name == "gt") { icu::UnicodeString entity = ">"; m_buffers.emplace(">", new io::uistring(entity, false)); if (attvalue) { parse_included_attvalue(); } else { parse_content(); } m_buffers.pop(); return; } else if (name == "amp") { icu::UnicodeString entity = "&"; m_buffers.emplace("&", new io::uistring(entity, false)); if (attvalue) { parse_included_attvalue(); } else { parse_content(); } m_buffers.pop(); return; } else if (name == "apos") { icu::UnicodeString entity = "'"; m_buffers.emplace("'", new io::uistring(entity, false)); if (attvalue) { parse_included_attvalue(); } else { parse_content(); } m_buffers.pop(); return; } else if (name == "quot") { icu::UnicodeString entity = '"'; m_buffers.emplace(""", new io::uistring(entity, false)); if (attvalue) { parse_included_attvalue(); } else { parse_content(); } m_buffers.pop(); return; } std::pair<std::set<icu::UnicodeString>::iterator, bool> hist; std::map<icu::UnicodeString, general_entity_declaration*>::iterator it; // Look for the entity. it = m_dtd.general_entities.find(name); if (it == m_dtd.general_entities.end()) { std::string msg; msg += "Reference to undeclared general entity '"; name.toUTF8String(msg); msg += "'."; throw semantic_error(msg); } // Unparsed entity references are forbidden. 
if (it->second->unparsed) { std::string msg; msg += "Reference to an unparsed general entity '"; name.toUTF8String(msg); msg += "'."; throw semantic_error(msg); } // Check for recursive references. hist = m_ref_history.insert(name); if (!hist.second) { std::string msg; msg += "Recursive reference to general entity '"; name.toUTF8String(msg); msg += "'."; throw semantic_error(msg); } // Process the entity. bool const state = m_parsing_entity; std::string nameutf8 = "&"; name.toUTF8String(nameutf8); nameutf8 += ';'; m_parsing_entity = true; if (!it->second->id.sys.isBogus() || !it->second->id.pub.isBogus()) { if (attvalue) { std::string msg; msg += "Reference to an external parsed general entity '"; name.toUTF8String(msg); msg += "' in attribute value."; throw semantic_error(msg); } io::input* input = nullptr; std::string encoding; resolve_id(it->second->id, input, encoding); if (input == nullptr) { if (m_validating) { throw runtime_error( "Could not dereference external parsed general entity." ); } } else { size_t const size = m_buffers.size(); try { m_buffers.emplace(nameutf8, *input, true, encoding); if (attvalue) { parse_included_attvalue(); } else { parse_content(); } m_buffers.pop(); } catch (...) { if (size < m_buffers.size()) { m_buffers.pop(); } delete input; m_ref_history.erase(hist.first); throw; } delete input; } } else { m_buffers.emplace( nameutf8, new io::uistring(it->second->text_or_notation, false) ); if (attvalue) { parse_included_attvalue(); } else { parse_content(); } m_buffers.pop(); } m_parsing_entity = state; m_ref_history.erase(hist.first); }
// --------------------------------------------------------------------------
// Handles a parameter entity reference (%name;).
//
// name         Entity name without the surrounding '%' and ';'.
// entityvalue  True when the reference occurs inside an entity value; the
//              replacement text is then parsed with
//              parse_included_entityvalue() instead of parse_ext_subset().
//
// A reference that is not inside an entity value, while no entity is being
// parsed and no other reference is being expanded, is also recorded in
// m_document_type.m_subset.
//
// An undeclared entity throws semantic_error when validating; otherwise
// m_dtd_stop is set and the function returns. A recursive reference throws
// semantic_error. An external entity that cannot be dereferenced throws
// runtime_error when validating and sets m_dtd_stop otherwise.
// --------------------------------------------------------------------------
void
processor::on_pe_reference
  ( icu::UnicodeString const& name,
    bool entityvalue )
// --------------------------------------------------------------------------
{
    icu::UnicodeString text;
    std::map<icu::UnicodeString, parameter_entity_declaration*>::iterator it;
    std::pair<std::set<icu::UnicodeString>::iterator, bool> hist;

    // Record top-level references as part of the document type subset.
    if (!m_parsing_entity && !entityvalue && m_ref_history.empty()) {
        dtd_element e;
        e.type = dtd_element::parameter_reference;
        e.text = new icu::UnicodeString(name);
        m_document_type.m_subset.emplace_back(std::move(e));
    }

    // Look for the entity.
    it = m_dtd.parameter_entities.find(name);
    if (it == m_dtd.parameter_entities.end()) {
        if (m_validating) {
            std::string msg;
            msg += "Reference to undeclared parameter entity '";
            name.toUTF8String(msg);
            msg += "'.";
            throw semantic_error(msg);
        } else {
            m_dtd_stop = true;
            return;
        }
    }

    // Check for recursive references.
    hist = m_ref_history.insert(name);
    if (!hist.second) {
        std::string msg;
        msg += "Recursive reference to parameter entity '";
        name.toUTF8String(msg);
        msg += "'.";
        throw semantic_error(msg);
    }

    // Process the entity.
    bool const state = m_parsing_entity;
    std::string nameutf8 = "%";
    name.toUTF8String(nameutf8);
    nameutf8 += ';';
    m_parsing_entity = true;

    if (it->second->external) {
        io::input* input = nullptr;
        std::string encoding;
        resolve_id(it->second->id, input, encoding);

        if (input == nullptr) {
            if (m_validating) {
                throw runtime_error(
                    "Could not dereference external parameter entity."
                );
            } else {
                m_dtd_stop = true;
            }
        } else {
            // Remember the stack depth so the catch block only pops a buffer
            // that was actually pushed.
            size_t const size = m_buffers.size();
            try {
                if (entityvalue) {
                    m_buffers.emplace(nameutf8, *input, true, encoding);
                    parse_included_entityvalue();
                    m_buffers.pop();
                } else {
                    m_buffers.emplace(nameutf8, *input, true, encoding, true);
                    parse_ext_subset();
                    m_buffers.pop();
                }
            } catch (...) {
                if (size < m_buffers.size()) {
                    m_buffers.pop();
                }
                delete input;
                m_ref_history.erase(hist.first);
                throw;
            }
            delete input;
        }
    } else {
        // The literal value is stored as public ID.
        if (entityvalue) {
            m_buffers.emplace(
                nameutf8, new io::uistring(it->second->id.pub, false)
            );
            parse_included_entityvalue();
            m_buffers.pop();
        } else {
            m_buffers.emplace(
                nameutf8, new io::uistring(it->second->id.pub, false), false
            );
            parse_ext_subset();
            m_buffers.pop();
        }
    }

    // Restore the state saved before the expansion.
    m_parsing_entity = state;
    m_ref_history.erase(hist.first);
}
/**
 * Returns the native string length of a whole ICU UnicodeString.
 * Convenience overload of the ranged version, using start index 0 and Source.length().
 *
 * @param Source  String to measure.
 */
int32 GetNativeStringLength(const icu::UnicodeString& Source)
{
    const int32 WholeLength = Source.length();
    return GetNativeStringLength(Source, 0, WholeLength);
}
/**
 * <true, 4> specialization: the length of the range is its number of code points,
 * as counted by icu::UnicodeString::countChar32().
 *
 * @param Source              String to measure.
 * @param InSourceStartIndex  Index of the first UChar of the range.
 * @param InSourceLength      Number of UChars in the range; 0 yields 0 without consulting Source.
 */
int32 GetNativeStringLengthImpl<true, 4>(const icu::UnicodeString& Source, const int32 InSourceStartIndex, const int32 InSourceLength)
{
    if (InSourceLength == 0)
    {
        return 0;
    }

    return Source.countChar32(InSourceStartIndex, InSourceLength);
}
// -------------------------------------------------------------------------- void processor::on_attribute ( icu::UnicodeString const& name, icu::UnicodeString&& value ) // -------------------------------------------------------------------------- { // Check for duplicates. if (!m_sax_attrs.emplace(name, std::move(value)).second) { std::string msg, tree; msg += "Duplicate attribute on an element.\n"; msg += "Attribute name: "; name.toUTF8String(msg); while (!m_element_info.empty()) { std::string tmp; m_element_info.top().type.toUTF8String(tmp); m_element_info.pop(); tree = "/" + tmp + tree; } msg += "\nElement tree: " + tree; throw semantic_error(msg); } /* icu::UnicodeString normalized; UChar32 chr; int32_t const size = value.countChar32(); for (int32_t i=0; i<size; i=value.moveIndex32(i, 1)) { chr = value.char32At(i); if (chr == 0x20 || chr == 0x0D || chr == 0x0A || chr == 0x09) { normalized += 0x20; } else if (chr == '&') { int32_t end = value.indexOf(';', i); icu::UnicodeString name{value, i+1, end-i-1}; i = end; if (name[0] == '#') { normalized += dereference_character(name.tempSubString(1)); } else { if (on_reference(name, true)) { icu::UnicodeString text; m_buffers.top().pipe(text); normalized += normalize_attvalue(text); m_buffers.pop(); } } } else { normalized += chr; } } return normalized; */ if (m_validating) { throw not_implemented("Attribute validity checking."); } }
/// Hashes an ICU string by returning its own hashCode(), converted to size_t.
size_t
hash<icu::UnicodeString>::operator()(const icu::UnicodeString& x) const
{
    return static_cast<size_t>(x.hashCode());
}
/// Empties `str` by truncating it to length zero.
inline void
clearString(icu::UnicodeString& str)
{
    const int32_t keepLength = 0;
    str.truncate(keepLength);
}