Exemplo n.º 1
0
/**
 * Appends characters of p_rszLine, starting at index p_riPos, to p_rszString
 * until a character contained in p_crszEndChars is met or the line ends.
 * p_riPos is advanced past every appended character and is left on the
 * terminating character (or at the end of the line).
 *
 * @return true if a terminating character was found before the end of the
 *         line, false if the line was exhausted first.
 */
bool CRealTextParser::GetString(wstring& p_rszLine, unsigned int& p_riPos, wstring& p_rszString, const wstring& p_crszEndChars)
{
    for (; p_riPos < p_rszLine.length(); ++p_riPos) {
        const wchar_t chCurrent = p_rszLine.at(p_riPos);
        if (p_crszEndChars.find(chCurrent) != wstring::npos) {
            // Stopped on a terminator that is still inside the line.
            return true;
        }
        p_rszString += chCurrent;
    }

    return false;
}
Exemplo n.º 2
0
/**
 * Moves the text that precedes the next '<' from the front of p_rszLine
 * into p_rszString.
 *
 * When the line is empty or already starts with '<' there is no text to
 * extract: the call succeeds with an empty p_rszString only if
 * m_bTryToIgnoreErrors is set. Otherwise leading whitespace is skipped and
 * everything up to (not including) '<' is appended to p_rszString; on
 * success the consumed part is removed from p_rszLine.
 *
 * @return false if no text is available (and errors are not ignored), if
 *         only whitespace remains, or if no '<' follows the text.
 */
bool CRealTextParser::ExtractString(wstring& p_rszLine, wstring& p_rszString)
{
	const bool bNoLeadingText = p_rszLine.empty() || p_rszLine.at(0) == '<';
	if (bNoLeadingText)
	{
		if (!m_bTryToIgnoreErrors)
			return false;

		p_rszString = L"";
		return true;
	}

	unsigned int iCursor = 0;

	// Short-circuit keeps the original order: GetString only runs if
	// SkipSpaces found a non-space character.
	const bool bExtracted = SkipSpaces(p_rszLine, iCursor)
		&& GetString(p_rszLine, iCursor, p_rszString, L"<");
	if (!bExtracted)
		return false;

	// Drop the consumed prefix; the line now starts at the '<'.
	p_rszLine.erase(0, iCursor);
	return true;
}
Exemplo n.º 3
0
// Injects a string entry built from wstr under the given id.
// An id of 0 means "use the next free string id" (getNextId(ET_STRING)).
// The characters are copied into a temporary array of 16-bit units, each of
// which is passed through SwapEndian, followed by a zero terminator.
void XDBF::injectStringEntry(wstring wstr, unsigned long long id)
{
    if (id == 0)
        id = getNextId(ET_STRING);

    const size_t charCount = wstr.length();

    // scratch copy, so the caller's string is never byte-swapped in place
    unsigned short *buffer = new unsigned short[charCount + 1];
    buffer[charCount] = 0;

    unsigned short *cursor = buffer;
    for (wstring::const_iterator it = wstr.begin(); it != wstr.end(); ++it, ++cursor)
    {
        *cursor = (unsigned short)*it;
        SwapEndian(cursor, 1, 2);
    }

    // hand the prepared bytes to the generic entry injector
    injectEntry_private(ET_STRING, (char*)buffer, WSTRING_BYTES(charCount), id);

    delete[] buffer;
}
Exemplo n.º 4
0
/**
 * Splits a comma-separated list of integers (e.g. L"1,0,2") into a vector.
 *
 * Every field, including empty ones, produces one element; each field is
 * converted with _wtoi, so an empty or non-numeric field yields 0. The
 * result therefore always holds (number of commas + 1) elements.
 *
 * @param str  comma-separated text
 * @return     the parsed numbers, in order of appearance
 */
vector<int> CJPWordsVector::makeTone(wstring str)
{
	// Resulting integer array
	vector<int> tone;
	// Fields are separated by an ASCII comma
	const wchar_t flag = L',';
	// Index of the first character of the field being read
	size_t start = 0;

	for (size_t end = 0; end < str.size(); end++)
	{
		if (str.at(end) == flag)
		{
			// Convert the field [start, end) and store it
			tone.push_back(_wtoi(str.substr(start, end - start).c_str()));
			// The next field starts right after the separator. `end` must NOT
			// be advanced here as well: the loop increment already steps past
			// the separator, and skipping one more character would miss a
			// separator that follows immediately (L"1,,2" must give 1, 0, 2).
			start = end + 1;
		}
	}

	// Whatever follows the last separator is the final field
	tone.push_back(_wtoi(str.substr(start).c_str()));
	return tone;
}
Exemplo n.º 5
0
/**
 * Normalises `path` in place and reports whether it names something that
 * exists.
 *
 *  - every '/' becomes '\'
 *  - every run of backslashes is collapsed to a single one
 *  - an existing directory gets a trailing '\' appended (if missing)
 *  - otherwise a trailing '\' is removed and the path is tested as a file
 *
 * NOTE(review): collapsing also affects a leading "\\" (UNC prefix) -
 * confirm that UNC paths are never passed in.
 *
 * @param path  path to normalise; modified in place
 * @return true if the normalised path is an existing directory or file;
 *         false otherwise, including for an empty path
 */
bool FixPath(wstring& path) {
	// An empty path cannot name anything, and the code below inspects the
	// last character, which does not exist for an empty string.
	if (path.empty())
		return false;

	// Fix unix paths
	std::replace( path.begin(), path.end(), L'/', L'\\' );

	// Remove double slashes. Resuming the search at the position just fixed
	// handles runs of three or more without rescanning from the start.
	for (size_t p = path.find(L"\\\\"); p != wstring::npos; p = path.find(L"\\\\", p))
		path.replace(p, 2, L"\\");

	// Are we pointing at a real destination?
	if (DirectoryExists(path)) {
		if (path[path.length() - 1] != L'\\')
			path += L'\\';
		return true;
	}

	if (path[path.length() - 1] == L'\\') {
		// It says it's a directory but it's not, so it must be a file
		path.erase(path.length() - 1);
	}

	return FileExists(path);
}
Exemplo n.º 6
0
// Returns the zero-based position of the first element of kCommonCharTable
// that equals |c|, or -1 when the table does not contain it.
int GetCommonCharIndex(wchar_t c) {
  size_t index = 0;
  while (index < kCommonCharTable.size()) {
    if (kCommonCharTable.at(index) == c)
      return static_cast<int>(index);
    ++index;
  }

  return -1;
}
Exemplo n.º 7
0
/**
 * Returns a copy of p_crszString in which every character has been passed
 * through towlower(). The input is not modified.
 */
wstring CRealTextParser::StringToLower(const wstring& p_crszString)
{
    wstring szResult;
    for (wstring::const_iterator it = p_crszString.begin(); it != p_crszString.end(); ++it) {
        szResult += towlower(*it);
    }
    return szResult;
}
Exemplo n.º 8
0
/**
 * add a suffix with _basicFeatureListId and _featureListid to trie
 */
void SuffixModelTrie::updateSuffix(wstring _suffix, int _basicFeatureListId, int _featureListId)
{
	SuffixModelNode* currentNode = root;
	for (int i = (int) _suffix.length() - 1; i >= 0; --i)
	{
		SuffixModelNode* tmpNode = currentNode->findChildNode(_suffix.at(i));
		if (tmpNode == NULL)
		{
			tmpNode = new SuffixModelNode(_suffix.at(i));
			numberOfNodes++;
			currentNode->addChildNode(tmpNode);
		}
		tmpNode->updateFeature(_basicFeatureListId, _featureListId);
		updateFeatureId(_basicFeatureListId, _featureListId);
		currentNode = tmpNode;
	}
}
Exemplo n.º 9
0
/**
 * Advances p_riPos over consecutive whitespace characters (as classified by
 * iswspace) in p_rszLine.
 *
 * @return true if p_riPos ends on a non-whitespace character inside the
 *         line, false if the end of the line was reached.
 */
bool CRealTextParser::SkipSpaces(wstring& p_rszLine, unsigned int& p_riPos)
{
    const wstring::size_type nLength = p_rszLine.length();

    for (; p_riPos < nLength; ++p_riPos) {
        if (!iswspace(p_rszLine.at(p_riPos))) {
            return true;
        }
    }

    return false;
}
Exemplo n.º 10
0
void MorphologicalDictionary::getMorphologicalPrediction(const wstring & lower_word,
        shared_ptr<vector<shared_ptr<Morphology> > > result)
{
    prediction_count++;

    bool debug = false;
    int l = lower_word.length();
    if (debug)
    {
        wcout << "Prediction: Word = " << lower_word << " length = " << l << endl;
    }
    shared_ptr<SuffixNode> current_node = suffix_root;
    for (int i = l - 1; i >= 0; --i)
    {
        wchar_t character = lower_word.at(i);
        map<wchar_t, shared_ptr<SuffixNode> >::iterator scn_iter
            = current_node->children.find(character);
        if (scn_iter != current_node->children.end())
        {
            current_node = scn_iter->second;
        }
        else
        {
            for (vector<int>::iterator
                    m_iter = current_node->suffix_trie_model_ids.begin();
                    m_iter != current_node->suffix_trie_model_ids.end(); ++m_iter)
            {
                int suffix_model_id = *m_iter;
                shared_ptr<SuffixModel> suffix_model = suffix_models.at(suffix_model_id);
                if (suffix_model->feature_list_id <= 0)
                {
                    continue;
                }

                shared_ptr<Morphology> morphology = std::make_shared<Morphology>();
                morphology->lemma_id = 0;
                morphology->suffix_length = l - 1 - i;
                // lemma
                shared_ptr<wstring> lemma = make_shared<wstring>(lower_word, 0, i + 1);
                lemma->append(suffix_model->lemma_suffix);
                morphology->lemma = lemma;
                morphology->word = make_shared<wstring>(lower_word);

                // feature from current model elements
                for (vector<int>::iterator
                        f_iter = id_feature_list.at(suffix_model->feature_list_id).begin();
                        f_iter != id_feature_list.at(suffix_model->feature_list_id).end();
                        ++f_iter)
                {
                    morphology->features.push_back(id_short_feature.at(*f_iter));
                    morphology->descriptions.push_back(id_long_feature.at(*f_iter));
                }
                result->push_back(morphology);
            }
            break;
        }
    }
}
Exemplo n.º 11
0
// Replaces, in place, every occurrence of cOld in src with cNew.
void base_string::_Replace(wstring &src, const wchar_t cOld, const wchar_t cNew)
{
	for (wstring::iterator it = src.begin(); it != src.end(); ++it)
	{
		if (*it == cOld)
			*it = cNew;
	}
}
Exemplo n.º 12
0
/**
 * Converts a timecode of the form [[[dd:]hh:]mm:]ss[.xyz] to milliseconds.
 *
 * The string is scanned from right to left; both '.' and ':' act as field
 * separators. Fields are converted with _wtoi, so empty or non-numeric
 * fields count as 0.
 *
 * Fixes over the previous revision (see the RealText time format reference;
 * NOTE(review): confirm against real-world .rt files):
 *  - the right-most field is treated as a fraction of a second only when
 *    the timecode contains a '.'; "1:30" is 1 min 30 s, not 1.3 s
 *  - a fraction is normalised to exactly three digits (".5" -> 500 ms,
 *    ".1234" -> 123 ms)
 *  - the step from hours to days multiplies by 24 instead of 60
 *
 * @param p_crszTimecode  timecode text, e.g. L"90", L"1:30", L"0:01:30.5"
 * @return the timecode in milliseconds
 */
int CRealTextParser::GetTimecode(const wstring& p_crszTimecode)
{
	const int iMillisecsPerHour = 60 * 60 * 1000;

	int iTimecode(0);

	// Without a decimal point the right-most field holds whole seconds.
	int iMultiplier(p_crszTimecode.find(L'.') == wstring::npos ? 1000 : 1);

	wstring szCurrentPart;

	for (int i = (int)p_crszTimecode.length() - 1; i >= 0; --i)
	{
		const wchar_t chCurrent = p_crszTimecode.at(i);

		if (chCurrent != L'.' && chCurrent != L':')
		{
			// Scanning backwards, so characters are prepended.
			szCurrentPart.insert(szCurrentPart.begin(), chCurrent);
			continue;
		}

		if (iMultiplier == 1)
		{
			// Fraction of a second: pad or cut to millisecond precision.
			szCurrentPart.resize(3, L'0');
		}

		iTimecode += iMultiplier * ::_wtoi(szCurrentPart.c_str());

		// Scale for the next field to the left:
		// ms -> s (x1000), s -> min -> h (x60 each), h -> days (x24).
		if (iMultiplier == 1)
		{
			iMultiplier = 1000;
		}
		else if (iMultiplier == iMillisecsPerHour)
		{
			iMultiplier *= 24;
		}
		else
		{
			iMultiplier *= 60;
		}

		szCurrentPart = L"";
	}

	// Left-most field (no separator precedes it).
	iTimecode += iMultiplier * ::_wtoi(szCurrentPart.c_str());

	return iTimecode;
}
Exemplo n.º 13
0
void SuffixModelTrie::updateSuffix(wstring _suffix, vector<MorphologicalInfo> minfos)
{
	SuffixModelNode* currentNode = root;
	for (int i = (int) _suffix.length() - 1; i >= 0; --i)
	{
		SuffixModelNode* tmpNode = currentNode->findChildNode(_suffix.at(i));
		if (tmpNode == NULL)
		{
			tmpNode = new SuffixModelNode(_suffix.at(i));
			numberOfNodes++;
			currentNode->addChildNode(tmpNode);
		}
		for (int j = 0; j < (int) minfos.size(); ++j)
		{
			tmpNode->updateFeature(minfos.at(j).basicFeatureListId, minfos.at(j).featureListId);
			updateFeatureId(minfos.at(j).basicFeatureListId, minfos.at(j).featureListId);
		}
		currentNode = tmpNode;
	}
}
Exemplo n.º 14
0
// Sums getCharWidth() over the characters s[begin] .. s[end - 1].
// Returns 0 when begin >= end. Characters are read with at(), so an index
// outside the string raises std::out_of_range.
float XFont::getSubStringWidth(const wstring &s, int begin, int end)
{
    float total = 0;

    int index = begin;
    while (index < end)
    {
        total += getCharWidth(s.at(index));
        ++index;
    }

    return total;
}
Exemplo n.º 15
0
/**
 * Writes `s` to `out` with HTML-significant characters replaced by entities.
 *
 *  - any character above 127 becomes a decimal numeric reference (&#NNN;)
 *  - '"' becomes &quot; when `quote` is set
 *  - '<' and '>' become &lt; / &gt; when `tags` is set
 *  - '&' always becomes &amp;
 *  - every other character is written as a single byte
 *
 * @param s      text to encode
 * @param out    stream receiving the encoded text
 * @param quote  whether double quotes are escaped
 * @param tags   whether angle brackets are escaped
 */
void zpt::html::entities_encode(wstring s, ostream& out, bool quote, bool tags) {
	ostringstream oss;
	for (size_t i = 0; i != s.length(); i++) {
		const wchar_t ch = s[i];
		// Test the whole code unit. Narrowing it to unsigned char first (as
		// the previous revision did) looked only at the low byte, so e.g.
		// U+0141 was written out as a plain 'A' instead of "&#321;".
		if (static_cast<unsigned long>(ch) > 127UL) {
			oss << "&#" << dec << static_cast<int>(ch) << ";";
		} else if (ch == L'"' && quote) {
			oss << "&quot;";
		} else if (ch == L'<' && tags) {
			oss << "&lt;";
		} else if (ch == L'>' && tags) {
			oss << "&gt;";
		} else if (ch == L'&') {
			oss << "&amp;";
		} else {
			// ch is within 0..127 here, so the narrowing is lossless
			oss << static_cast<char>(ch);
		}
	}
	out << oss.str();
}
Exemplo n.º 16
0
// Replaces every occurrence of |c| in |str| with |replace_with|, in place.
// Does nothing when both characters are equal (which would otherwise never
// terminate, since each hit would be found again).
void ReplaceChar(wstring& str, const wchar_t c, const wchar_t replace_with) {
  if (c == replace_with)
    return;

  for (size_t hit = str.find_first_of(c, 0); hit != wstring::npos;
       hit = str.find_first_of(c, hit)) {
    str.at(hit) = replace_with;
  }
}
Exemplo n.º 17
0
// HACK: This shouldn't be here but there isn't really anywhere else to put
// it right now unless we want to create a utility class inside common.
//
// Narrows a wide string one character at a time: each character is reduced
// to an unsigned short and then to a char. No encoding conversion takes
// place, so anything outside the single-byte range is truncated.
string
wstring2string( wstring ws )
{
    string narrow;
    for ( wstring::const_iterator it = ws.begin(); it != ws.end(); ++it )
    {
        const unsigned short unit = *it;
        narrow += (char)unit;
    }
    return narrow;
}
Exemplo n.º 18
0
#if defined(VERBOSE)
// Prints the characters s[first] .. s[last - 1] as 4-digit hex values.
// A line break precedes every character whose index is a multiple of 16;
// all other characters are preceded by a single space.
static void ConT4_Dump_wstring_range(const wstring& s, size_t first, size_t last)
{
	for (size_t i = first; i < last; i++)
	{
		if (i % 16 == 0)
			printf("\n");
		else
			printf(" ");
		unsigned int c = s.at(i);
		printf("%04X", c);
	}
}
#endif

// Debug helper: prints msg, the length of s and a hex dump of s to stdout.
// Compiles to an empty function unless VERBOSE is defined. When
// ABRIDGE_LARGE_DUMP is also defined, only the first and the last 64
// characters are dumped (without printing any character twice).
//
// Changes over the previous revision: the length is printed with a format
// that matches its type, indices are size_t (no signed/unsigned mixing, no
// underflow for strings shorter than 64 characters), and the second loop no
// longer relies on a loop variable that is out of scope in standard C++.
static void ConT4_Dump_wstring(const string& msg, const wstring& s)
{
#if defined(VERBOSE)
	const size_t length = s.length();
	printf("%s (%lu)", msg.c_str(), (unsigned long)length);

#if defined(ABRIDGE_LARGE_DUMP)
	const size_t edge = 4 * 16;

	// head: at most the first `edge` characters
	ConT4_Dump_wstring_range(s, 0, length < edge ? length : edge);

	if (length > edge)
	{
		printf("\n. . . ABRIDGE_LARGE_DUMP is defined.");

		// tail: the last `edge` characters, but never before the end of the head
		const size_t tail = (length - edge > edge) ? length - edge : edge;
		ConT4_Dump_wstring_range(s, tail, length);
	}
#else
	ConT4_Dump_wstring_range(s, 0, length);
#endif
	printf("\n");
#endif
}
Exemplo n.º 19
0
/**
 * predict MorphologicalInfo by suffix
 */
vector<MorphologicalInfo> SuffixModelTrie::getMorphologicalPredictionBySuffix(wstring _word)
{
	vector<MorphologicalInfo> result = vector<MorphologicalInfo>();
	SuffixModelNode* currentNode = root;
	int suffixLength = 0;
	for (int i = (int) _word.length() - 1; i >= 0; --i)
	{
		SuffixModelNode* tmpNode = currentNode->findChildNode(_word.at(i));
		if (tmpNode == NULL)
		{
			break;
		}
		currentNode = tmpNode;
		suffixLength++;
        //wcout << _word.at(i) << " : " << currentNode->getFeatureFrequencyMap().size() << endl;
	}
	if (suffixLength == 0)
	{
		return result;
	}
    //wcout << "Suffix length = " << suffixLength << endl;
	map<int, int> _featureFrequencyMap = currentNode->getFeatureFrequencyMap();
    //wcout << "_featureFrequencyMap's size = " << _featureFrequencyMap.size() << endl;
	map<int, int>::iterator iter;
    //@TODO : \u043f\u0435\u0440\u0440\u0441\u0441\u043e\u043d//here was cyrrilic symbols: перрссон
	for (iter = _featureFrequencyMap.begin(); iter != _featureFrequencyMap.end(); ++iter)
	{
		int _featureId = iter->first;
		int _frequency = iter->second;
		int _basicFeatureListId = _featureId / 1000;
		int _featureListId = _featureId % 1000;
		wstring _initial_form = suffixLength < (int) _word.length() ? L"-" + _word.substr(_word.length() - suffixLength) : _word;
		MorphologicalInfo _morphologicalInfo;
		_morphologicalInfo.basicFeatureListId = _basicFeatureListId;
		_morphologicalInfo.featureListId = _featureListId;
		_morphologicalInfo.frequency = _frequency;
		_morphologicalInfo.initial_form = _initial_form;
		_morphologicalInfo.lemmaId = 0;
		_morphologicalInfo.suffix_length = suffixLength;
		result.push_back(_morphologicalInfo);
	}
	return result;
}
Exemplo n.º 20
0
/**
 * Copies inString into a newly allocated host handle as a sequence of
 * uint16 values (no terminator is written).
 *
 * The handle is sized for length * sizeof(uint16) bytes. The previous
 * revision requested only `length` bytes and then wrote `length` uint16
 * values, i.e. it wrote past the end of the allocation.
 * NOTE(review): code that reads these handles back must interpret
 * GetSize() as a byte count - confirm against the matching reader.
 *
 * Each wchar_t is stored through a uint16 pointer, so on platforms where
 * wchar_t is wider than uint16 the value is narrowed.
 *
 * @param inString   text to store
 * @param outHandle  receives the new handle, or NULL on failure
 * @return kNoErr on success; errPlugInHostInsufficient if the string is
 *         empty, too long for an int32 byte count, or the handle could not
 *         be allocated or locked
 */
OSErr StringToHandle(const wstring & inString, Handle & outHandle)
{
	outHandle = NULL;

	const size_t s = inString.length();

	// Nothing to store, or the byte count would not fit the int32 that
	// New() takes.
	if (s == 0 || s > (size_t)0x7FFFFFFF / sizeof(uint16))
		return errPlugInHostInsufficient;

	outHandle = sPSHandle->New((int32)(s * sizeof(uint16)));
	if (outHandle == NULL)
		return errPlugInHostInsufficient;

	Boolean oldLock = FALSE;
	uint16 * p = NULL;
	sPSHandle->SetLock(outHandle, true, reinterpret_cast<char**>(&p), &oldLock);
	if (p == NULL)
	{
		// Could not get at the memory: give the handle back.
		sPSHandle->Dispose(outHandle);
		outHandle = NULL;
		return errPlugInHostInsufficient;
	}

	// p is advanced while copying; keep the start for the unlock call.
	Ptr originalP = (Ptr)p;
	for (size_t a = 0; a < s; a++, p++)
		*p = inString.at(a);
	sPSHandle->SetLock(outHandle, false, &originalP, &oldLock);

	return kNoErr;
}
Exemplo n.º 21
0
// Returns the width of `text` when rendered with `font`.
// Without `snap` this simply forwards to font->getStringWidth(). With
// `snap`, each character's width is floored individually before summing.
float FontHelper::getStringWidth(XFont *font, const wstring &text, bool snap)
{
    if (!snap)
    {
        return font->getStringWidth(text);
    }

    float total = 0;
    for (wstring::const_iterator it = text.begin(); it != text.end(); ++it)
    {
        wchar_t ch = *it;
        total += math<float>::floor(font->getCharWidth(ch));
    }

    return total;
}
Exemplo n.º 22
0
// Encodes `str` as UCS-2 little-endian: two bytes per character, low byte
// first. The returned std::string is a byte buffer of exactly
// str.size() * 2 bytes and may contain embedded zero bytes.
//
// The bytes are assembled arithmetically, so the result does not depend on
// the host byte order or on sizeof(wchar_t). (The previous revision copied
// raw wchar_t memory on little-endian hosts, which yields four bytes per
// character where wchar_t is 32 bits wide, and on big-endian hosts it never
// shifted the high byte down, so that byte was always written as 0.)
//
// Characters above 0xFFFF cannot be represented in UCS-2; only their low
// 16 bits are kept.
static
string MakeUCS2LE(const wstring& str)
{
    string result;
    result.reserve(str.size() * 2);

    for (wstring::size_type i = 0; i < str.size(); ++i) {
        const unsigned long code = static_cast<unsigned long>(str[i]);

        result += static_cast<char>(static_cast<unsigned char>(code & 0xFF));
        result += static_cast<char>(static_cast<unsigned char>((code >> 8) & 0xFF));
    }

    return result;
}
Exemplo n.º 23
0
/**
 * Looks up variants of lower_word in which a single 'е' is replaced by 'ё'.
 *
 * For every position holding 'е', a copy of the word with 'ё' at that
 * position is passed to getMorphologyPo (po = false); if the variant is at
 * least two characters long and starts with "по", it is looked up a second
 * time with po = true. Results are appended to `result`. The member counter
 * e_count is incremented once per variant.
 */
void MorphologicalDictionary::getMorphologyE(const wstring & lower_word,
        shared_ptr<vector<shared_ptr<Morphology> > > result)
{
    bool debug = false;
    const size_t length = lower_word.length();

    for (size_t pos = 0; pos < length; ++pos)
    {
        if (lower_word.at(pos) != L'е')
        {
            continue;
        }

        e_count++;

        // variant with exactly this one letter swapped
        wstring variant(lower_word);
        variant[pos] = L'ё';

        if (debug)
        {
            wcout << "E:getMorphologyPo: " << variant << endl;
        }
        this->getMorphologyPo(variant, false, result);
        if (debug)
        {
            wcout << "E:getMorphologyPo: ok" << variant << endl;
        }

        // words starting with "по" get a second lookup
        const bool starts_with_po = length >= 2 && variant.at(0) == L'п' && variant.at(1) == L'о';
        if (starts_with_po)
        {
            if (debug)
            {
                wcout << "E:getMorphologyPo:Po: " << variant << endl;
            }
            this->getMorphologyPo(variant, true, result);
        }
    }

    if (debug)
    {
        wcout << "getMorphologyE >> ok" << endl;
    }
}
Exemplo n.º 24
0
// Adds the characters of `text` to `sequence`, left to right, starting at
// (x, y). Each character advances x by its width as reported by the font.
// With `snap`, the start position and every advance are floored.
void FontHelper::drawText(XFont *font, XFontSequence *sequence, const wstring &text, float x, float y, bool snap)
{
    if (snap)
    {
        x = math<float>::floor(x);
        y = math<float>::floor(y);
    }

    font->beginSequence(sequence, 2);

    for (wstring::const_iterator it = text.begin(); it != text.end(); ++it)
    {
        wchar_t ch = *it;
        font->addSequenceCharacter(ch, x, y);

        float advance = font->getCharWidth(ch);
        if (snap)
        {
            advance = math<float>::floor(advance);
        }
        x += advance;
    }

    font->endSequence();
}
Exemplo n.º 25
0
// Lays the characters of `text` out along `path`, starting at distance
// `offset`. Each character is centred on the path position half its width
// past the running offset, translated to that point and rotated about Z by
// the path's sampled angle there; characters the font cannot look up still
// consume their width. Returns the offset reached after the last character.
float FontHelper::drawTextOnPath(XFont *font, XFontSequence *sequence, const wstring &text, FollowablePath *path, float offset)
{
    float point[3];

    float cursor = offset;
    float baselineShift = font->getMaxDescent();
    float angleSampleSize = font->getSize() / 2;

    FontMatrix *matrix = font->getMatrix();
    font->beginSequence(sequence, 2);

    for (wstring::const_iterator it = text.begin(); it != text.end(); ++it)
    {
        wchar_t ch = *it;
        float halfWidth = 0.5f * font->getCharWidth(ch);

        // centre of the character along the path
        float centre = cursor + halfWidth;

        int glyph = font->lookup(ch);
        if (glyph > -1)
        {
            path->pos2Point(centre, point);
            float angle = path->pos2SampledAngle(centre, angleSampleSize);

            matrix->setTranslation(point[0], point[1], 0);
            matrix->rotateZ(angle);
            font->addTransformedEntity2D(glyph, -halfWidth, baselineShift);
        }

        cursor = centre + halfWidth;
    }

    font->endSequence();

    return cursor;
}
Exemplo n.º 26
0
// Adds `text` to `sequence` line by line, as described by `wrapper`:
// line j covers wrapper->lengths[j] characters starting at
// wrapper->offsets[j]. Every line starts at x; the first line is placed at
// y plus the font's maximum ascent and each following line lineHeight lower.
void FontHelper::drawWrappedText(XFont *font, XFontSequence *sequence, const wstring &text, WordWrapper *wrapper, float x, float y, float lineHeight)
{
    float lineY = y + font->getMaxAscent();

    font->beginSequence(sequence, 2);

    for (int line = 0; line < wrapper->size; line++, lineY += lineHeight)
    {
        float first = wrapper->offsets[line];
        float count = wrapper->lengths[line];
        float penX = x;

        int index = first;
        while (index < first + count)
        {
            wchar_t c = text.at(index);
            font->addSequenceCharacter(c, penX, lineY);
            penX += font->getCharWidth(c);
            index++;
        }
    }

    font->endSequence();
}
Exemplo n.º 27
0
/**
 * Parses a complete RealText document into m_RealText.
 *
 * The input is consumed from the front, one tag or one run of text at a
 * time. Tags and text are accumulated in listTags; whenever a <time> tag is
 * met (and once more at the end of the input) the accumulated tags are
 * rendered with RenderTags() and, if the rendered line is not empty, stored
 * in m_RealText.m_mapLines under a (start, end) timecode pair.
 *
 * @param p_szFile  the document text (taken by value and consumed)
 * @return false as soon as ExtractTag() or ExtractTextTag() fails,
 *         true once the whole input has been consumed
 */
bool CRealTextParser::ParseRealText(wstring p_szFile)
{
    // Start/end timecodes of the <time> tags seen so far, in document order.
    vector<int> vStartTimecodes;
    vector<int> vEndTimecodes;
    // Set when the most recent <time> tag had no usable "end" value.
    bool bPrevEndTimeMissing = false;
    // Tags (including text tags) that make up the line being assembled.
    list<Tag> listTags;
    // <b>/<i>/<font> tags that are currently open; re-applied after <clear>.
    list<Tag> listPreviousOpenTags;

    while (p_szFile.length() > 0) {
        if (p_szFile.at(0) == '<') {
            Tag oTag;
            if (!ExtractTag(p_szFile, oTag)) {
                return false;
            }

            // Comments carry no content.
            if (oTag.m_bComment) {
                continue;
            }

            if (oTag.m_szName == L"time") {
                int iStartTimecode = GetTimecode(oTag.m_mapAttributes[L"begin"]);
                int iEndTimecode = GetTimecode(oTag.m_mapAttributes[L"end"]);

                //FilterReduntantTags(listTags);
                // Everything collected so far belongs to the previous <time> tag.
                wstring szLine = RenderTags(listTags);

                if (bPrevEndTimeMissing) {
                    // The previous subtitle had no end time: it lasts until this tag's start.
                    pair<int, int> pairTimecodes(vStartTimecodes.back(), iStartTimecode);

                    // Fix issues where the next time code isn't valid end time code for the previous subtitle
                    if (pairTimecodes.first >= pairTimecodes.second) {
                        pairTimecodes.second = pairTimecodes.first + m_iDefaultSubtitleDurationInMillisecs;
                    }

                    if (szLine.length() > 0) {
                        m_RealText.m_mapLines[pairTimecodes] = szLine;
                    }

                    bPrevEndTimeMissing = false;
                } else if (!vStartTimecodes.empty() && !vEndTimecodes.empty()) {
                    // The previous subtitle had an explicit end time.
                    pair<int, int> pairTimecodes(vStartTimecodes.back(), vEndTimecodes.back());

                    if (szLine.length() > 0) {
                        m_RealText.m_mapLines[pairTimecodes] = szLine;
                    }

                }

                // Remember this tag's timecodes for the next flush.
                vStartTimecodes.push_back(iStartTimecode);
                if (iEndTimecode <= 0) {
                    bPrevEndTimeMissing = true;
                } else {
                    vEndTimecodes.push_back(iEndTimecode);
                }
            } else if (oTag.m_szName == L"b" || oTag.m_szName == L"i" || oTag.m_szName == L"font") {
                // Track which formatting tags are open so <clear> can restore them.
                if (oTag.m_bOpen) {
                    listPreviousOpenTags.push_back(oTag);
                }

                if (oTag.m_bClose) {
                    PopTag(listPreviousOpenTags, oTag.m_szName);
                }

                listTags.push_back(oTag);
            } else if (oTag.m_szName == L"clear") {
                // Drop the accumulated content ...
                listTags.clear();

                // ... but keep the formatting tags that are still open
                listTags.insert(listTags.end(), listPreviousOpenTags.begin(), listPreviousOpenTags.end());
            } else if (oTag.m_szName == L"window") {
                if (oTag.m_bOpen) {
                    m_RealText.m_WindowTag = oTag;
                }

                // Ignore close
            } else if (oTag.m_szName == L"center") {
                m_RealText.m_bCenter = true;
            } else if (oTag.m_szName == L"required") {
                // Ignore
            } else if (oTag.m_szName == L"") {
                // Ignore
            } else {
                // assume formatting tag (handled later)
                listTags.push_back(oTag);
            }
        } else {
            // Plain text up to the next tag.
            Tag oTextTag;
            if (!ExtractTextTag(p_szFile, oTextTag)) {
                return false;
            }

            listTags.push_back(oTextTag);
        }
    }

    // Handle final line
    //FilterReduntantTags(listTags);
    wstring szLine = RenderTags(listTags);

    if (bPrevEndTimeMissing) {
        // No following <time> tag to take the end from: use the default duration.
        pair<int, int> pairTimecodes(vStartTimecodes.back(), vStartTimecodes.back() + m_iDefaultSubtitleDurationInMillisecs);

        if (szLine.length() > 0) {
            m_RealText.m_mapLines[pairTimecodes] = szLine;
        }

        bPrevEndTimeMissing = false;
    } else if (!vStartTimecodes.empty() && !vEndTimecodes.empty()) {
        pair<int, int> pairTimecodes(vStartTimecodes.back(), vEndTimecodes.back());

        if (szLine.length() > 0) {
            m_RealText.m_mapLines[pairTimecodes] = szLine;
        }

    }

    return true;
}
Exemplo n.º 28
0
void MorphologicalDictionary::getMorphologyPo(const wstring & lower_word, bool po, shared_ptr<vector<shared_ptr<Morphology> > > result)
{
    morphology_count++;

    bool debug = false;
    size_t l = lower_word.length();
    shared_ptr<MNode> current_node = root;
    wstring current_prefix = po ? L"по" : L"";
    size_t begin_index = po ? 2 : 0;
    for (size_t i = begin_index; i <= l; ++i)
    {
        if (debug)
        {
            wcout << "Current_prefix = " << current_prefix << endl;
            wcout << "Number of models = " << current_node->lemmaId_MNodeModel.size() << endl;
        }

        wstring suffix = lower_word.substr(i, l - i);

        map<wstring, shared_ptr<vector<shared_ptr<MNodeItem> > > >::iterator s_iter
            = current_node->suffix_MNodeItem.find(suffix);
        if (s_iter != current_node->suffix_MNodeItem.end())
        {
            if (debug)
            {
                wcout << "Found suffix: " << suffix << endl;
            }
            shared_ptr<vector<shared_ptr<MNodeItem> > > items = s_iter->second;
            if (debug)
            {
                wcout << "items->size() = " << items->size() << endl;
            }
            for (vector<shared_ptr<MNodeItem> >::iterator mn_iter = items->begin();
                    mn_iter != items->end(); ++mn_iter)
            {
                shared_ptr<MNodeItem> item = *mn_iter;
                if (item->feature_list_id <= 0 || item->po != po)
                {
                    continue;
                }
                shared_ptr<Morphology> morphology = std::make_shared<Morphology>();
                morphology->lemma_id = item->lemma_id;
                morphology->lemma = lemmas.at(item->lemma_id);
                morphology->word = make_shared<wstring>(lower_word);
                morphology->suffix_length = l - i;

                //morphology->features
                if (debug)
                {
                    wcout << "item->feature_list_id = " << item->feature_list_id << endl;
                }
                for (vector<int>::iterator
                        f_iter = id_feature_list.at(item->feature_list_id).begin();
                        f_iter != id_feature_list.at(item->feature_list_id).end();
                        ++f_iter)
                {
                    if (debug)
                    {
                        wcout << "*f_iter = " << *f_iter << endl;
                    }
                    morphology->features.push_back(id_short_feature.at(*f_iter));
                    morphology->descriptions.push_back(id_long_feature.at(*f_iter));
                    if (debug)
                    {
                        wcout << "f_iter ok " << endl;
                    }
                }
                result->push_back(morphology);
            }
        }
        // go to child node
        if (i == l)
        {
            break;
        }
        map<wchar_t, shared_ptr<MNode> >::iterator child_iterator = current_node->children.find(lower_word.at(i));
        if (child_iterator == current_node->children.end())
        {
            break;
        }
        current_node = child_iterator->second;
        current_prefix.push_back(lower_word.at(i));
    }
    if (debug)
    {
        wcout << "getMorphologyPo >> OK" << endl;
    }
}
Exemplo n.º 29
0
void MorphologicalDictionary::getMorphologicalInfoListByRules(const wstring & lower_word, shared_ptr<vector<shared_ptr<Morphology> > > result)
{
    rule_count++;

    bool debug = false;

    set<pair<wstring, int> > resultSet; // avoid duplications
    resultSet.clear();

    size_t lw = lower_word.length();
    for (vector<MRule>::iterator iter = ruleSet.morphologyRules.begin();
            iter != ruleSet.morphologyRules.end(); ++iter)
    {
        MRule rule = *iter;
        size_t lws = rule.word_suffix.length();
        size_t lls = rule.lemma_suffix.length();

        // check length
        if (lw < lws || lw - lws + lls < ruleSet.min_lemma_length)
        {
            continue;
        }

        // check suffix
        int i1 = lw - 1;
        int i2 = lws - 1;
        bool ok = true;
        while (i2 >= 0)
        {
            if (lower_word.at(i1) != rule.word_suffix.at(i2))
            {
                ok = false;
                break;
            }
            i1--;
            i2--;
        }
        if (!ok)
        {
            continue;
        }

        // lemma
        shared_ptr<wstring> lemma = std::make_shared<wstring>();
        lemma->clear();
        for (size_t i = 0; i < lw - lws; ++i)
        {
            lemma->push_back(lower_word.at(i));
        }
        for (size_t i = 0; i < lls; ++i)
        {
            lemma->push_back(rule.lemma_suffix.at(i));
        }

        if (debug)
        {
            wcout << "Found, lemma = " << endl;
        }

        // check <lemma, word_feature_list_id>
        pair<wstring, int> pp(*lemma, rule.word_feature_list_id);
        if (resultSet.find(pp) != resultSet.end())
        {
            if (debug)
            {
                wcout << "### Duplication: " << *lemma << " - " << rule.word_feature_list_id << endl;
            }
            continue;
        }
        resultSet.insert(pp);

        // get feature_list_id of all word forms of lemma
        shared_ptr<map<int, shared_ptr<set<int> > > > lemmaId_wordFLIDs = make_shared<map<int, shared_ptr<set<int> > > >();
        this->getFeatureListOfLemma(lemma, lemmaId_wordFLIDs);

        // check feature_list_id
        for (map<int, shared_ptr<set<int> > >::iterator l_iter = lemmaId_wordFLIDs->begin();
                l_iter != lemmaId_wordFLIDs->end(); ++l_iter)
        {
            shared_ptr<set<int> > ss = l_iter->second;
            if (ss->find(rule.lemma_feature_list_id) != ss->end() &&
                    ss->find(rule.word_feature_list_id) == ss->end())
            {
                if (debug)
                {
                    wcout << ">>> FOUND" << endl;
                    wcout << "rule.word_suffix = " << rule.word_suffix << endl;
                    wcout << "rule.word_feature_list_id = " << rule.word_feature_list_id << endl;
                    wcout << "rule.lemma_suffix = " << rule.lemma_suffix << endl;
                    wcout << "rule.lemma_feature_list_id = " << rule.lemma_feature_list_id << endl;
                    wcout << endl;
                }
                shared_ptr<Morphology> morphology = std::make_shared<Morphology>();
                morphology->lemma_id = l_iter->first;
                morphology->lemma = lemma;
                morphology->word = make_shared<wstring>(lower_word);
                morphology->suffix_length = lws;

                // features
                for (vector<int>::iterator
                        f_iter = id_feature_list.at(rule.word_feature_list_id).begin();
                        f_iter != id_feature_list.at(rule.word_feature_list_id).end();
                        ++f_iter)
                {
                    morphology->features.push_back(id_short_feature.at(*f_iter));
                    morphology->descriptions.push_back(id_long_feature.at(*f_iter));
                }
                result->push_back(morphology);
            }
        }
    }
}
Exemplo n.º 30
0
/**
 * Collects all morphologies of `word` into `result`.
 *
 * The word is lower-cased with tools->charToLowerCase() and then looked up
 * in this order:
 *  1. getMorphologyPo with po = false
 *  2. getMorphologyPo with po = true, if the word has at least two
 *     characters and starts with "по"
 *  3. getMorphologyE, if useE is set and the word contains 'е' but no 'ё'
 *  4. getMorphologicalInfoListByRules, if useRules is set
 *  5. getMorphologicalPrediction, if usePrediction is set and nothing has
 *     been found by the previous steps
 */
void MorphologicalDictionary::getMorphology(const wstring & word,
        shared_ptr<vector<shared_ptr<Morphology> > > result)
{
    bool debug = false;
    const size_t length = word.length();

    // Lower-case the word, counting 'е' and 'ё' on the way. These counters
    // are local to this call.
    wstring lower_word;
    int plain_e_letters = 0;
    int dotted_e_letters = 0;
    for (wstring::const_iterator it = word.begin(); it != word.end(); ++it)
    {
        wchar_t lower_ch = tools->charToLowerCase(*it);
        if (lower_ch == L'е')
        {
            plain_e_letters++;
        }
        else if (lower_ch == L'ё')
        {
            dotted_e_letters++;
        }
        lower_word.push_back(lower_ch);
    }
    if (debug)
    {
        wcout << "lower_word = " << lower_word << endl;
    }

    // dictionary lookup
    this->getMorphologyPo(lower_word, false, result);

    // second lookup for words starting with "по"
    const bool starts_with_po = length >= 2 && lower_word.at(0) == L'п' && lower_word.at(1) == L'о';
    if (starts_with_po)
    {
        this->getMorphologyPo(lower_word, true, result);
    }

    // 'е' written in place of 'ё'
    if (useE && plain_e_letters > 0 && dotted_e_letters == 0)
    {
        if (debug)
        {
            wcout << "E: " << endl;
        }
        this->getMorphologyE(lower_word, result);
    }

    // morphology by rules (English)
    if (useRules)
    {
        if (debug)
        {
            wcout << "Rules (English): " << endl;
        }
        this->getMorphologicalInfoListByRules(lower_word, result);
    }

    // morphology prediction (Russian) - only as a last resort
    if (usePrediction && result->empty())
    {
        if (debug)
        {
            wcout << "Prediction (Russian): " << endl;
        }
        this->getMorphologicalPrediction(lower_word, result);
    }
}