Esempio n. 1
0
// Produces a String whose code units are the input code units, each passed
// through the single-character normalizeSpaces() overload.
//   characters - first of `length` UChar code units to read.
//   length     - number of code units to process.
String Font::normalizeSpaces(const UChar* characters, unsigned length)
{
    StringBuilder output;
    // The result has exactly one code unit per input code unit, so size the
    // builder up front.
    output.reserveCapacity(length);

    const UChar* const end = characters + length;
    for (const UChar* cursor = characters; cursor != end; ++cursor)
        output.append(normalizeSpaces(*cursor));

    return output.toString();
}
Esempio n. 2
0
// Produces a String of `length` code units in which each input code unit has
// been passed through the single-character normalizeSpaces() overload.
//   characters - first of `length` UChar code units to read.
//   length     - number of code units to process.
String Font::normalizeSpaces(const UChar* characters, unsigned length)
{
    // createUninitialized() hands back the string's writable storage through
    // `destination`; every slot is filled by the loop below.
    UChar* destination;
    String result = String::createUninitialized(length, destination);

    unsigned index = 0;
    while (index < length) {
        destination[index] = normalizeSpaces(characters[index]);
        ++index;
    }

    return result;
}
Esempio n. 3
0
void TokenizerEn::splitSpecialSymbols(const wstring& i_word, vector<wstring>& o_content, vector<size_t>& o_positions, bool isSentenceEnd) const
{
    o_content.clear();
    o_positions.clear();
    // i_word should not contain spaces
    wstring result = i_word;
    //splitting punctuations
    static const boost::wregex punktPattern(L"(\\w|[[.period.]])((?!\"|[[.period.]])[[:punct:]]+)(?=\\s|$)");
    static const boost::wregex apostrophPairPattern(L"(^|\\s)(')([^\"'\\s]+)(')(\\s|$|\\W|[[:punct:]])");
    static const boost::wregex n_tPattern(L"n't");
    static const boost::wregex modalPattern(L"'(s|m|(re)|(ll)|d|(ve))(?=\\W|$)");
    static const boost::wregex rightApostrophPattern(L"'(\\W)");
    static const boost::wregex wonnaPattern(L"\\b(won|gon)(na)\\b");
    static const boost::wregex bracketsPattern(L"([\\(\\[\\{\\)\\]\\}\"])");
    static const boost::wregex specialCharPattern(L"([&%$])");

    static const wstring apostrophPairReplacement(L"$1 $2 $3 $4 $5");
    static const wstring n_tReplacement(L" n't");
    static const wstring modalReplacement(L" '$1");
    static const wstring rightApostrophReplacement(L" '$1");
    static const wstring wonnaReplacement(L"$1 $2");
    static const wstring bracketsReplacement(L" $1 ");
    static const wstring specialCharReplacement(L" $1 ");

    static const wstring replacement = L"$1 $2 ";
    result = boost::regex_replace(result, punktPattern, replacement);
    //left apostroph
    result = boost::regex_replace(result, apostrophPairPattern, apostrophPairReplacement),
    // children's --> children 's; won't --> wo n't etc
    result = boost::regex_replace(result, n_tPattern, n_tReplacement);
    result = boost::regex_replace(result, modalPattern, modalReplacement);
    result = boost::regex_replace(result, rightApostrophPattern, rightApostrophReplacement);
    // split special words
    // gonna --> gon na wonna --> won na
    result = boost::regex_replace(result, wonnaPattern, wonnaReplacement);
    // split brackets and quotations
    result = boost::regex_replace(result, bracketsPattern, bracketsReplacement);
    // split special characters
    result = boost::regex_replace(result, specialCharPattern, specialCharReplacement);
    if(isSentenceEnd)
    {
        result = splitEndingPeriod(result);
    }
    result = normalizeSpaces(result);
    o_content = Tools::Split(result, L" ");
    size_t pos = 0;
    o_positions.resize(o_content.size());
    for(size_t index = 0; index < o_content.size(); index++) {
        o_positions[index] = pos;
        pos += o_content[index].length();
    }
}
Esempio n. 4
0
// Returns `string` with every code unit passed through the single-character
// normalizeSpaces() overload. When no code unit is changed by that pass, the
// input string itself is returned instead of a newly built one.
String Font::normalizeSpaces(const String& string)
{
    const UChar* source = string.characters();
    const unsigned length = string.length();
    // Scratch storage for the normalized code units (inline capacity 256).
    Vector<UChar, 256> normalized(length);
    bool changed = false;

    for (unsigned index = 0; index < length; ++index) {
        const UChar before = source[index];
        const UChar after = normalizeSpaces(before);
        normalized[index] = after;
        if (after != before)
            changed = true;
    }

    if (!changed)
        return string;
    return String(normalized.data(), length);
}