// Builds a new String of the same length as the input in which every
// character has been passed through the single-character normalizeSpaces()
// overload (presumably mapping space-like characters to a plain space; see
// that overload for the exact rules).
//
// characters: first character of the input run.
// length:     number of characters to read from `characters`.
String Font::normalizeSpaces(const UChar* characters, unsigned length)
{
    StringBuilder builder;
    // The output has exactly one character per input character, so the final
    // size is known before the loop starts.
    builder.reserveCapacity(length);

    const UChar* const end = characters + length;
    for (const UChar* cursor = characters; cursor != end; ++cursor)
        builder.append(normalizeSpaces(*cursor));

    return builder.toString();
}
// Returns a String of `length` characters in which each input character has
// been replaced by the result of the single-character normalizeSpaces()
// overload. The result is written directly into the new string's storage
// rather than going through an intermediate builder.
//
// characters: first character of the input run.
// length:     number of characters to read from `characters`.
String Font::normalizeSpaces(const UChar* characters, unsigned length)
{
    // `destination` is handed to createUninitialized(), which is expected to
    // point it at the writable storage of the string it returns.
    UChar* destination;
    String result = String::createUninitialized(length, destination);

    unsigned index = 0;
    while (index < length) {
        destination[index] = normalizeSpaces(characters[index]);
        ++index;
    }

    return result;
}
void TokenizerEn::splitSpecialSymbols(const wstring& i_word, vector<wstring>& o_content, vector<size_t>& o_positions, bool isSentenceEnd) const { o_content.clear(); o_positions.clear(); // i_word should not contain spaces wstring result = i_word; //splitting punctuations static const boost::wregex punktPattern(L"(\\w|[[.period.]])((?!\"|[[.period.]])[[:punct:]]+)(?=\\s|$)"); static const boost::wregex apostrophPairPattern(L"(^|\\s)(')([^\"'\\s]+)(')(\\s|$|\\W|[[:punct:]])"); static const boost::wregex n_tPattern(L"n't"); static const boost::wregex modalPattern(L"'(s|m|(re)|(ll)|d|(ve))(?=\\W|$)"); static const boost::wregex rightApostrophPattern(L"'(\\W)"); static const boost::wregex wonnaPattern(L"\\b(won|gon)(na)\\b"); static const boost::wregex bracketsPattern(L"([\\(\\[\\{\\)\\]\\}\"])"); static const boost::wregex specialCharPattern(L"([&%$])"); static const wstring apostrophPairReplacement(L"$1 $2 $3 $4 $5"); static const wstring n_tReplacement(L" n't"); static const wstring modalReplacement(L" '$1"); static const wstring rightApostrophReplacement(L" '$1"); static const wstring wonnaReplacement(L"$1 $2"); static const wstring bracketsReplacement(L" $1 "); static const wstring specialCharReplacement(L" $1 "); static const wstring replacement = L"$1 $2 "; result = boost::regex_replace(result, punktPattern, replacement); //left apostroph result = boost::regex_replace(result, apostrophPairPattern, apostrophPairReplacement), // children's --> children 's; won't --> wo n't etc result = boost::regex_replace(result, n_tPattern, n_tReplacement); result = boost::regex_replace(result, modalPattern, modalReplacement); result = boost::regex_replace(result, rightApostrophPattern, rightApostrophReplacement); // split special words // gonna --> gon na wonna --> won na result = boost::regex_replace(result, wonnaPattern, wonnaReplacement); // split brackets and quotations result = boost::regex_replace(result, bracketsPattern, bracketsReplacement); // split special characters 
result = boost::regex_replace(result, specialCharPattern, specialCharReplacement); if(isSentenceEnd) { result = splitEndingPeriod(result); } result = normalizeSpaces(result); o_content = Tools::Split(result, L" "); size_t pos = 0; o_positions.resize(o_content.size()); for(size_t index = 0; index < o_content.size(); index++) { o_positions[index] = pos; pos += o_content[index].length(); } }
// Returns `string` with every character passed through the single-character
// normalizeSpaces() overload. When no character is changed by that overload
// the original String is returned as-is; otherwise a new String is built
// from the normalized characters.
String Font::normalizeSpaces(const String& string)
{
    const UChar* source = string.characters();
    const unsigned length = string.length();

    // Scratch storage for the normalized characters; it is only turned into a
    // String if at least one character actually differs from the input.
    Vector<UChar, 256> normalized(length);
    bool changed = false;

    for (unsigned index = 0; index < length; ++index) {
        const UChar before = source[index];
        normalized[index] = normalizeSpaces(before);
        if (normalized[index] != before)
            changed = true;
    }

    if (!changed)
        return string;

    return String(normalized.data(), length);
}