bool LexicsCutter::_ReadInnormativeWords(const std::string& fileName)
{
    char szLine[1024];

    FILE* file = fopen(fileName.c_str(), "rb");
    if (!file)
        return false;

    while (!feof(file))
    {
        szLine[0] = 0x0;
        fgets(szLine, 1020, file);

        std::string line;
        if (!_ProcessLine(szLine, line))
            continue;

        // Create word vector of vectors
        LC_WordVector vw;
        std::string ch;
        unsigned int pos = 0;
        while (ReadUTF8(line, ch, pos))
        {
            LC_LetterSet vl;

            // Initialize letter set with letter read
            vl.insert(ch);

            // Find letter analogs and push them onto the vector
            LC_AnalogMap::iterator itr = _analogMap.find(ch);
            if (itr != _analogMap.end())
                // Analogs present, iterate
                for (LC_AnalogVector::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); ++itr2)
                    vl.insert(*itr2);

            // Add letter vector to word vector
            vw.push_back(vl);
        }

        // Push new word to words list
        _wordList.push_back(vw);
    }
    fclose(file);
    return true;
}
Beispiel #2
0
bool LexicsCutter::Compare_Word(std::string& str, unsigned int pos, LC_WordVector word)
{
   std::string lchar_prev;
    std::string lchar;

   // read first letter of the word into lchar_prev
   ReadUTF8(str, lchar, pos);

    // okay, here we go, comparing word
    // first letter is already okay, we do begin from second and go on
    LC_WordVector::iterator i = word.begin();
    i++;
    while (i != word.end())
    {
        // get letter from word, return false if the string is shorter
        if (!ReadUTF8(str, lchar, pos)) return(false);
        // check, if the letter is in the set
        LC_LetterSet ls = *i;
        if (ls.count(lchar) == 0)
       {
           // letter is not in set, but we must check, if it is not space or repeat
           if ( (!(IgnoreMiddleSpaces && (lchar == " "))) &&
               (!(IgnoreLetterRepeat && (lchar == lchar_prev))) )
           {
               // no checks viable
               return(false);
           }
       }
       else
       {
           // next word letter
           i++;
       }
       // set previous string letter to compare if needed (this check can really conserve time)
       if (IgnoreLetterRepeat) lchar_prev = lchar;
   }

    return(true);
}
bool LexicsCutter::_CompareWord(const std::string& str, unsigned int pos, LC_WordVector word) const
{
    std::string chPrev;
    std::string ch;

    // Read first letter of the word into lchar_prev
    ReadUTF8(str, ch, pos);

    // Compare word
    // First letter is already okay, we do begin from second and go on
    LC_WordVector::iterator i = word.begin();
    ++i;
    while (i != word.end())
    {
        // Get letter from word, return false if the string is shorter
        if (!ReadUTF8(str, ch, pos))
            return false;
        // Check, if the letter is in the set
        LC_LetterSet ls = *i;
        if (ls.count(ch) == 0)
        {
            // Letter is not in set, but we must check, if it is not space or repeat
            if ((!(_ignoreMiddleSpaces && (ch == " "))) &&
                (!(_ignoreLetterRepeat && (ch == chPrev))))
                // No checks viable
                return false;
        }
        else
            // Next word letter
            ++i;

        // Set previous string letter to compare if needed (this check can really conserve time)
        if (_ignoreLetterRepeat)
            chPrev = ch;
    }
    return true;
}
Beispiel #4
0
bool LexicsCutter::Read_Innormative_Words(std::string& FileName)
{
    FILE *ma_file;
    char line[1024];
    unsigned int pos;
    std::string line_s;
    std::string lchar;

    ma_file = fopen(FileName.c_str(), "rb");

    if (!ma_file)
    {
        sLog.outError("Chat lexics cutter disabled. Reason: LexicsCutterWordsFile file does not exist in the server directory.");
        return false;
    }

    while (!feof(ma_file))
    {
        line[0] = 0x0;
        fgets(line, 1020, ma_file);

        // check for UTF8 prefix and comment
        if (strlen(line) >= 3)
        {
            if (line[0] == '\xEF' && line[1] == '\xBB' && line[2] == '\xBF')
            {
                strncpy(&line[0], &line[3], strlen(line) - 3);
            }
        }

        if (strlen(line) >= 2)
        {
            if (line[0] == '/' && line[1] == '/') continue;
        }

        // check for empty string
        line_s = line;
        line_s = trim(line_s, "\x0A\x0D\x20");
        if (line_s == "") continue;

        // process line without CR/LF
        line_s = line;
        line_s = trim(line_s, "\x0A\x0D");

        // create word vector of vectors
        LC_WordVector vw;
        pos = 0;
        while (ReadUTF8(line_s, lchar, pos))
        {
            // create letter set
            LC_LetterSet vl;

            // initialize letter set with letter read
            vl.insert(lchar);

            // find letter analogs and push them onto the vector
            LC_AnalogMap::iterator itr = AnalogMap.find(lchar);
            if (itr != AnalogMap.end())
            {
                // analogs present, iterate
                for (LC_AnalogVector::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++)
                {
                    vl.insert(*itr2);
                }
            }

            // add letter vector to word vector
            vw.push_back(vl);
        }

        // push new word to words list
        WordList.push_back(vw);
    }

    fclose(ma_file);

    return true;
}
Beispiel #5
0
bool LexicsCutter::ReadInnormativeWords(std::string& fileName)
{
    char line[1024];
    unsigned int pos;
    std::string line_s;
    std::string lchar;

    FILE* file = fopen(fileName.c_str(), "rb");
    if (!file)
        return false;

    while (!feof(file))
    {
        line[0] = 0x0;
        fgets(line, 1020, file);

        // check for UTF8 prefix and comment
        if (strlen(line) >= 3)
            if (line[0] == '\xEF' && line[1] == '\xBB' && line[2] == '\xBF')
                strncpy(&line[0], &line[3], strlen(line) - 3);

        if (strlen(line) >= 2)
            if (line[0] == '/' && line[1] == '/')
                continue;

        // check for empty string
        line_s = line; 
        line_s = trim(line_s, "\x0A\x0D\x20");
        if (line_s == "")
            continue;

        // process line without CR/LF
        line_s = line; 
        line_s = trim(line_s, "\x0A\x0D");
    
        // create word vector of vectors
        LC_WordVector vw;
        pos = 0;
        while (ReadUTF8(line_s, lchar, pos))
        {
            // create letter set
            LC_LetterSet vl;

            // initialize letter set with letter read
            vl.insert(lchar);

            // find letter analogs and push them onto the vector
            LC_AnalogMap::iterator itr = m_AnalogMap.find(lchar);
            if (itr != m_AnalogMap.end())
                // analogs present, iterate
                for (LC_AnalogVector::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++)
                    vl.insert(*itr2);

            // add letter vector to word vector
            vw.push_back(vl);
        }

        // push new word to words list
        m_WordList.push_back(vw);
    }
    fclose(file);
    return true;
}