// Copies characters from p_rszLine, beginning at index p_riPos, onto the end of
// p_rszString until a character contained in p_crszEndChars is reached or the
// line runs out. p_riPos is advanced past every copied character, so on return
// it indexes the terminating character (or equals the line length).
// Returns true when the scan stopped before the end of the line.
bool CRealTextParser::GetString(wstring& p_rszLine, unsigned int& p_riPos, wstring& p_rszString, const wstring& p_crszEndChars)
{
    for (; p_riPos < p_rszLine.length(); ++p_riPos) {
        const wchar_t chCurrent = p_rszLine.at(p_riPos);
        if (p_crszEndChars.find(chCurrent) != wstring::npos) {
            break;
        }
        p_rszString += chCurrent;
    }

    return p_riPos < p_rszLine.length();
}
// Takes the text in front of the next '<' off the beginning of p_rszLine and
// appends it to p_rszString; leading whitespace is skipped first.
// An empty line, or one that starts with '<', has no text to extract: with
// m_bTryToIgnoreErrors set this yields an empty p_rszString and true,
// otherwise false. false is also returned when the line holds only whitespace
// or has no '<' after the text; p_rszLine is not modified in those cases.
bool CRealTextParser::ExtractString(wstring& p_rszLine, wstring& p_rszString)
{
    const bool bNothingToExtract = p_rszLine.empty() || p_rszLine.at(0) == '<';
    if (bNothingToExtract) {
        if (!m_bTryToIgnoreErrors) {
            return false;
        }
        p_rszString = L"";
        return true;
    }

    unsigned int iPos = 0;
    const bool bTextFound = SkipSpaces(p_rszLine, iPos)
                            && GetString(p_rszLine, iPos, p_rszString, L"<");
    if (!bTextFound) {
        return false;
    }

    // Drop everything that was consumed; the line now starts at the '<'.
    p_rszLine = p_rszLine.substr(iPos);
    return true;
}
void XDBF::injectStringEntry(wstring wstr, unsigned long long id) { // if no id was provided, then we need to get the next available one if (id == 0) id = getNextId(ET_STRING); // create a character array to hold the data to write, we need to // make a copy so that we can reverse the endian of the wstring unsigned short *dataToWrite = new unsigned short[wstr.length() + 1]; // copy the characters to the array for (int i = 0; i < wstr.length(); i++) { dataToWrite[i] = (unsigned short)wstr.at(i); SwapEndian(&dataToWrite[i], 1, 2); } dataToWrite[wstr.length()] = 0; // inject the new string entry injectEntry_private(ET_STRING, (char*)dataToWrite, WSTRING_BYTES(wstr.length()), id); // give the memory back delete[] dataToWrite; }
vector<int> CJPWordsVector::makeTone(wstring str) { //目标整形数组 vector<int> tone; //分割符为英文逗号 wchar_t flag = ','; //起始位置 size_t start = 0; //结束位置 size_t end = 0 ; for( ; end<str.size(); end++ ) { //当前符号位分隔符 if(str.at( end )== flag) { //截取子串 wstring numStr = str.substr(start,end-start); //转换成数字加入数组 int num = _wtoi(numStr.c_str()); tone.push_back(num); //跳过分割符 end++; //移动开始位置 start = end; } } //截取子串 wstring numStr = str.substr(start,end-start); //转换成数字加入数组 int num = _wtoi(numStr.c_str()); tone.push_back(num); return tone; }
bool FixPath(wstring& path) { // Fix unix paths std::replace( path.begin(), path.end(), L'/', L'\\' ); // Remove double slashes while(true) { size_t p = path.find(L"\\\\"); if (p == string::npos) break; path.replace(p, 2, L"\\"); } // Are we pointing at a real destination? if (DirectoryExists(path)) { if (path[path.length()-1] != L'\\') path += L'\\'; return true; } else if (path.at(path.length() - 1) == L'\\') { // It says its a directory but it's not, must be a file path = path.substr(0, path.length() - 1); } return FileExists(path); }
// Returns the zero-based position of the first element of kCommonCharTable
// that equals c, or -1 when the table does not contain c.
int GetCommonCharIndex(wchar_t c)
{
    size_t index = 0;
    while (index < kCommonCharTable.size() && !(kCommonCharTable.at(index) == c))
        ++index;

    if (index < kCommonCharTable.size())
        return static_cast<int>(index);
    return -1;
}
// Returns a copy of p_crszString in which every character has been passed
// through towlower.
wstring CRealTextParser::StringToLower(const wstring& p_crszString)
{
    wstring szResult;
    for (wstring::const_iterator it = p_crszString.begin(); it != p_crszString.end(); ++it) {
        szResult += towlower(*it);
    }
    return szResult;
}
/** * add a suffix with _basicFeatureListId and _featureListid to trie */ void SuffixModelTrie::updateSuffix(wstring _suffix, int _basicFeatureListId, int _featureListId) { SuffixModelNode* currentNode = root; for (int i = (int) _suffix.length() - 1; i >= 0; --i) { SuffixModelNode* tmpNode = currentNode->findChildNode(_suffix.at(i)); if (tmpNode == NULL) { tmpNode = new SuffixModelNode(_suffix.at(i)); numberOfNodes++; currentNode->addChildNode(tmpNode); } tmpNode->updateFeature(_basicFeatureListId, _featureListId); updateFeatureId(_basicFeatureListId, _featureListId); currentNode = tmpNode; } }
// Advances p_riPos over whitespace (as classified by iswspace) in p_rszLine.
// Returns true when p_riPos ends up on a non-space character, false when the
// end of the line was reached.
bool CRealTextParser::SkipSpaces(wstring& p_rszLine, unsigned int& p_riPos)
{
    for (; p_riPos < p_rszLine.length(); ++p_riPos) {
        if (!iswspace(p_rszLine.at(p_riPos))) {
            return true;
        }
    }
    return false;
}
void MorphologicalDictionary::getMorphologicalPrediction(const wstring & lower_word, shared_ptr<vector<shared_ptr<Morphology> > > result) { prediction_count++; bool debug = false; int l = lower_word.length(); if (debug) { wcout << "Prediction: Word = " << lower_word << " length = " << l << endl; } shared_ptr<SuffixNode> current_node = suffix_root; for (int i = l - 1; i >= 0; --i) { wchar_t character = lower_word.at(i); map<wchar_t, shared_ptr<SuffixNode> >::iterator scn_iter = current_node->children.find(character); if (scn_iter != current_node->children.end()) { current_node = scn_iter->second; } else { for (vector<int>::iterator m_iter = current_node->suffix_trie_model_ids.begin(); m_iter != current_node->suffix_trie_model_ids.end(); ++m_iter) { int suffix_model_id = *m_iter; shared_ptr<SuffixModel> suffix_model = suffix_models.at(suffix_model_id); if (suffix_model->feature_list_id <= 0) { continue; } shared_ptr<Morphology> morphology = std::make_shared<Morphology>(); morphology->lemma_id = 0; morphology->suffix_length = l - 1 - i; // lemma shared_ptr<wstring> lemma = make_shared<wstring>(lower_word, 0, i + 1); lemma->append(suffix_model->lemma_suffix); morphology->lemma = lemma; morphology->word = make_shared<wstring>(lower_word); // feature from current model elements for (vector<int>::iterator f_iter = id_feature_list.at(suffix_model->feature_list_id).begin(); f_iter != id_feature_list.at(suffix_model->feature_list_id).end(); ++f_iter) { morphology->features.push_back(id_short_feature.at(*f_iter)); morphology->descriptions.push_back(id_long_feature.at(*f_iter)); } result->push_back(morphology); } break; } } }
// Replaces, in place, every occurrence of cOld in src with cNew.
void base_string::_Replace(wstring &src, const wchar_t cOld, const wchar_t cNew)
{
    for (wstring::iterator it = src.begin(); it != src.end(); ++it)
    {
        if (*it == cOld)
            *it = cNew;
    }
}
int CRealTextParser::GetTimecode(const wstring& p_crszTimecode) { int iTimecode(0); int iMultiplier(1); // Exception: if the timecode doesn't contain any separators, assume the time code is in seconds (and change multiplier to reflect that) if (p_crszTimecode.find_first_of('.') == wstring::npos && p_crszTimecode.find_first_of(':') == wstring::npos) iMultiplier = 1000; wstring szCurrentPart; for (int i = p_crszTimecode.length() - 1; i >= 0; --i) { if (p_crszTimecode.at(i) == '.' || p_crszTimecode.at(i) == ':') { if (iMultiplier == 1) { while (szCurrentPart.length() < 3) szCurrentPart += L"0"; } iTimecode += iMultiplier * ::_wtoi(szCurrentPart.c_str()); if (iMultiplier == 1) { iMultiplier = 1000; } else { iMultiplier *= 60; } szCurrentPart = L""; } else { szCurrentPart = p_crszTimecode.substr(i, 1) + szCurrentPart; } } iTimecode += iMultiplier * ::_wtoi(szCurrentPart.c_str()); return iTimecode; }
void SuffixModelTrie::updateSuffix(wstring _suffix, vector<MorphologicalInfo> minfos) { SuffixModelNode* currentNode = root; for (int i = (int) _suffix.length() - 1; i >= 0; --i) { SuffixModelNode* tmpNode = currentNode->findChildNode(_suffix.at(i)); if (tmpNode == NULL) { tmpNode = new SuffixModelNode(_suffix.at(i)); numberOfNodes++; currentNode->addChildNode(tmpNode); } for (int j = 0; j < (int) minfos.size(); ++j) { tmpNode->updateFeature(minfos.at(j).basicFeatureListId, minfos.at(j).featureListId); updateFeatureId(minfos.at(j).basicFeatureListId, minfos.at(j).featureListId); } currentNode = tmpNode; } }
// Sums getCharWidth over the characters of s with indices in [begin, end).
// Indices are checked by wstring::at, so an index outside s throws.
float XFont::getSubStringWidth(const wstring &s, int begin, int end)
{
    float total = 0;
    int index = begin;
    while (index < end)
    {
        total += getCharWidth(s.at(index));
        ++index;
    }
    return total;
}
// Writes s to out with HTML-significant characters replaced by entities.
//   s     - text to encode
//   out   - stream that receives the encoded text
//   quote - when true, '"' is written as &quot;
//   tags  - when true, '<' and '>' are written as &lt; and &gt;
// '&' is always written as &amp;. Every character above 127 is written as a
// decimal numeric reference (&#NNN;); all other characters are written
// unchanged as single bytes.
void zpt::html::entities_encode(wstring s, ostream& out, bool quote, bool tags)
{
    ostringstream oss;
    for (size_t i = 0; i != s.length(); i++)
    {
        const wchar_t c = s[i];
        // Compare the whole code unit. Looking only at its low byte would let
        // wide characters such as U+0122 through as a raw '"'.
        const unsigned long code = static_cast<unsigned long>(c);

        if (code > 127)
        {
            oss << "&#" << dec << code << ";";
        }
        else if (c == L'"' && quote)
        {
            oss << "&quot;";
        }
        else if (c == L'<' && tags)
        {
            oss << "&lt;";
        }
        else if (c == L'>' && tags)
        {
            oss << "&gt;";
        }
        else if (c == L'&')
        {
            oss << "&amp;";
        }
        else
        {
            oss << static_cast<char>(c);
        }
    }
    oss << flush;
    out << oss.str();
}
// Replaces, in place, every occurrence of c in str with replace_with.
// Does nothing when both characters are the same.
void ReplaceChar(wstring& str, const wchar_t c, const wchar_t replace_with)
{
    if (c == replace_with)
        return;

    // Each search resumes at the previous hit, which by then holds
    // replace_with and therefore cannot match again.
    for (size_t hit = str.find_first_of(c, 0); hit != wstring::npos; hit = str.find_first_of(c, hit))
        str.at(hit) = replace_with;
}
// HACK: This shouldn't be here but there isn't really anywhere else to put
// it right now unless we want to create a utility class inside common.
//
// Narrows ws one character at a time: each wide character is converted to
// unsigned short and then to char, so only its low bits survive. No
// character-set conversion is performed.
string wstring2string( wstring ws )
{
    string narrow;
    for ( wstring::const_iterator it = ws.begin(); it != ws.end(); ++it )
    {
        const unsigned short unit = *it;
        narrow += (char)unit;
    }
    return narrow;
}
// Debug helper: prints msg, the length of s, and the characters of s as
// four-digit hexadecimal values, 16 per output line.
// Compiled to a no-op unless VERBOSE is defined. When ABRIDGE_LARGE_DUMP is
// also defined, only the first 64 characters and (for strings longer than 64)
// the characters from index max(length - 64, 64) onwards are printed, with a
// marker line in between.
static void ConT4_Dump_wstring(const string& msg, const wstring& s)
{
#if defined(VERBOSE)
    const size_t length = s.length();
    // size_t has no portable printf conversion before C99's %zu, so print
    // the length as unsigned long.
    printf("%s (%lu)", msg.c_str(), static_cast<unsigned long>(length));
#if defined(ABRIDGE_LARGE_DUMP)
    const size_t window = 4 * 16;

    // head of the string
    const size_t head_end = length < window ? length : window;
    for (size_t i = 0; i < head_end; i++)
    {
        if (i % 16 == 0) printf("\n"); else printf(" ");
        unsigned int c = s.at(i);
        printf("%04X", c);
    }

    if (length > window)
        printf("\n. . . ABRIDGE_LARGE_DUMP is defined.");

    // Tail of the string, never overlapping the head. Computed without
    // subtracting from a length that may be smaller than the window.
    const size_t tail_begin = (length > 2 * window) ? length - window : window;
    for (size_t i = tail_begin; i < length; i++)
    {
        if (i % 16 == 0) printf("\n"); else printf(" ");
        unsigned int c = s.at(i);
        printf("%04X", c);
    }
#else
    for (size_t i = 0; i < length; i++)
    {
        if (i % 16 == 0) printf("\n"); else printf(" ");
        unsigned int c = s.at(i);
        printf("%04X", c);
    }
#endif
    printf("\n");
#else
    // Nothing is printed in non-verbose builds.
    (void)msg;
    (void)s;
#endif
}
/** * predict MorphologicalInfo by suffix */ vector<MorphologicalInfo> SuffixModelTrie::getMorphologicalPredictionBySuffix(wstring _word) { vector<MorphologicalInfo> result = vector<MorphologicalInfo>(); SuffixModelNode* currentNode = root; int suffixLength = 0; for (int i = (int) _word.length() - 1; i >= 0; --i) { SuffixModelNode* tmpNode = currentNode->findChildNode(_word.at(i)); if (tmpNode == NULL) { break; } currentNode = tmpNode; suffixLength++; //wcout << _word.at(i) << " : " << currentNode->getFeatureFrequencyMap().size() << endl; } if (suffixLength == 0) { return result; } //wcout << "Suffix length = " << suffixLength << endl; map<int, int> _featureFrequencyMap = currentNode->getFeatureFrequencyMap(); //wcout << "_featureFrequencyMap's size = " << _featureFrequencyMap.size() << endl; map<int, int>::iterator iter; //@TODO : \u043f\u0435\u0440\u0440\u0441\u0441\u043e\u043d//here was cyrrilic symbols: перрссон for (iter = _featureFrequencyMap.begin(); iter != _featureFrequencyMap.end(); ++iter) { int _featureId = iter->first; int _frequency = iter->second; int _basicFeatureListId = _featureId / 1000; int _featureListId = _featureId % 1000; wstring _initial_form = suffixLength < (int) _word.length() ? L"-" + _word.substr(_word.length() - suffixLength) : _word; MorphologicalInfo _morphologicalInfo; _morphologicalInfo.basicFeatureListId = _basicFeatureListId; _morphologicalInfo.featureListId = _featureListId; _morphologicalInfo.frequency = _frequency; _morphologicalInfo.initial_form = _initial_form; _morphologicalInfo.lemmaId = 0; _morphologicalInfo.suffix_length = suffixLength; result.push_back(_morphologicalInfo); } return result; }
// Copies inString into a newly allocated handle as a sequence of uint16
// values (one per character, no terminator).
//   inString  - text to copy; each character is narrowed to uint16
//   outHandle - receives the new handle, or NULL on any failure
// Returns kNoErr on success and errPlugInHostInsufficient when the string is
// empty, too long for an int32 byte count, or the handle cannot be allocated
// or locked.
// NOTE(review): the handle is now sized in bytes (2 per character) so that the
// writes below stay inside it; confirm that readers of this handle derive the
// character count from size / sizeof(uint16).
OSErr StringToHandle(const wstring & inString, Handle & outHandle)
{
    outHandle = NULL;

    const size_t s = inString.length();
    if (s == 0)
        return errPlugInHostInsufficient;

    // The byte count must fit the int32 argument of New().
    const size_t maxUnits = static_cast<size_t>(0x7FFFFFFF) / sizeof(uint16);
    if (s > maxUnits)
        return errPlugInHostInsufficient;

    // s characters are stored as s uint16 values, i.e. s * sizeof(uint16) bytes.
    outHandle = sPSHandle->New((int32)(s * sizeof(uint16)));
    if (outHandle == NULL)
        return errPlugInHostInsufficient;

    Boolean oldLock = FALSE;
    uint16 * p = NULL;
    sPSHandle->SetLock(outHandle, true, reinterpret_cast<char**>(&p), &oldLock);
    if (p == NULL)
    {
        sPSHandle->Dispose(outHandle);
        outHandle = NULL;
        return errPlugInHostInsufficient;
    }

    // Remember the start of the block; p is advanced while copying.
    Ptr originalP = (Ptr)p;
    for (size_t a = 0; a < s; a++, p++)
        *p = inString.at(a);

    sPSHandle->SetLock(outHandle, false, &originalP, &oldLock);
    return kNoErr;
}
// Returns the width of text for the given font.
// With snap set, the width of every character is floored individually before
// being added up; otherwise the font's own getStringWidth is used.
float FontHelper::getStringWidth(XFont *font, const wstring &text, bool snap)
{
    if (!snap)
    {
        return font->getStringWidth(text);
    }

    float total = 0;
    const int count = text.size();
    for (int index = 0; index < count; ++index)
    {
        const wchar_t ch = text.at(index);
        total += math<float>::floor(font->getCharWidth(ch));
    }
    return total;
}
// Encodes str as UCS-2 little-endian: two bytes per character, low byte
// first. The bytes are computed arithmetically, so the result does not depend
// on the host byte order or on sizeof(wchar_t). Only the low 16 bits of each
// character are kept; characters outside that range cannot be represented in
// UCS-2.
static string MakeUCS2LE(const wstring& str)
{
    string result;
    result.reserve(str.size() * 2);

    for (wstring::size_type i = 0; i < str.size(); ++i)
    {
        const unsigned long unit = static_cast<unsigned long>(str[i]);
        result.push_back(static_cast<char>(unit & 0xFF));
        // The high byte has to be shifted down before it is stored in a char.
        result.push_back(static_cast<char>((unit >> 8) & 0xFF));
    }

    return result;
}
void MorphologicalDictionary::getMorphologyE(const wstring & lower_word, shared_ptr<vector<shared_ptr<Morphology> > > result) { bool debug = false; size_t l = lower_word.length(); for (size_t i = 0; i < l; ++i) { if (lower_word.at(i) == L'е') { e_count++; wstring e_word(lower_word); e_word[i] = L'ё'; // get morphology if (debug) { wcout << "E:getMorphologyPo: " << e_word << endl; } this->getMorphologyPo(e_word, false, result); if (debug) { wcout << "E:getMorphologyPo: ok" << e_word << endl; } // "по" if (l >= 2 && e_word.at(0) == L'п' && e_word.at(1) == L'о') { if (debug) { wcout << "E:getMorphologyPo:Po: " << e_word << endl; } this->getMorphologyPo(e_word, true, result); } } } if (debug) { wcout << "getMorphologyE >> ok" << endl; } }
// Adds the characters of text to a font sequence, left to right, starting at
// (x, y). Each character advances the pen by its getCharWidth.
// With snap set, the start position and every advance are floored.
void FontHelper::drawText(XFont *font, XFontSequence *sequence, const wstring &text, float x, float y, bool snap)
{
    const int count = text.size();

    float penX = x;
    float penY = y;
    if (snap)
    {
        penX = math<float>::floor(penX);
        penY = math<float>::floor(penY);
    }

    font->beginSequence(sequence, 2);

    for (int index = 0; index < count; ++index)
    {
        const wchar_t ch = text.at(index);
        font->addSequenceCharacter(ch, penX, penY);

        const float advance = font->getCharWidth(ch);
        if (snap)
        {
            penX += math<float>::floor(advance);
        }
        else
        {
            penX += advance;
        }
    }

    font->endSequence();
}
float FontHelper::drawTextOnPath(XFont *font, XFontSequence *sequence, const wstring &text, FollowablePath *path, float offset) { float res[3]; int len = text.size(); float offsetX = offset; float offsetY = font->getMaxDescent(); float sampleSize = font->getSize() / 2; FontMatrix *matrix = font->getMatrix(); font->beginSequence(sequence, 2); for (int i = 0; i < len; i++) { wchar_t ch = text.at(i); float half = 0.5f * font->getCharWidth(ch); offsetX += half; int cc = font->lookup(ch); if (cc > -1) { path->pos2Point(offsetX, res); float theta = path->pos2SampledAngle(offsetX, sampleSize); matrix->setTranslation(res[0], res[1], 0); matrix->rotateZ(theta); font->addTransformedEntity2D(cc, -half, offsetY); } offsetX += half; } font->endSequence(); return offsetX; }
// Adds text to a font sequence as the lines described by wrapper.
// Line j covers the characters starting at wrapper->offsets[j] and spanning
// wrapper->lengths[j]. Every line starts at x; the first baseline is
// y + font->getMaxAscent() and each following one is lineHeight further down.
void FontHelper::drawWrappedText(XFont *font, XFontSequence *sequence, const wstring &text, WordWrapper *wrapper, float x, float y, float lineHeight)
{
    float baseline = y + font->getMaxAscent();

    font->beginSequence(sequence, 2);

    for (int line = 0; line < wrapper->size; line++)
    {
        // Kept as float, exactly as they are read, so the loop bounds compare
        // the same way as before.
        const float first = wrapper->offsets[line];
        const float count = wrapper->lengths[line];

        float penX = x;
        int index = first;
        while (index < first + count)
        {
            const wchar_t c = text.at(index);
            font->addSequenceCharacter(c, penX, baseline);
            penX += font->getCharWidth(c);
            ++index;
        }

        baseline += lineHeight;
    }

    font->endSequence();
}
bool CRealTextParser::ParseRealText(wstring p_szFile) { vector<int> vStartTimecodes; vector<int> vEndTimecodes; bool bPrevEndTimeMissing = false; list<Tag> listTags; list<Tag> listPreviousOpenTags; while (p_szFile.length() > 0) { if (p_szFile.at(0) == '<') { Tag oTag; if (!ExtractTag(p_szFile, oTag)) { return false; } if (oTag.m_bComment) { continue; } if (oTag.m_szName == L"time") { int iStartTimecode = GetTimecode(oTag.m_mapAttributes[L"begin"]); int iEndTimecode = GetTimecode(oTag.m_mapAttributes[L"end"]); //FilterReduntantTags(listTags); wstring szLine = RenderTags(listTags); if (bPrevEndTimeMissing) { pair<int, int> pairTimecodes(vStartTimecodes.back(), iStartTimecode); // Fix issues where the next time code isn't valid end time code for the previous subtitle if (pairTimecodes.first >= pairTimecodes.second) { pairTimecodes.second = pairTimecodes.first + m_iDefaultSubtitleDurationInMillisecs; } if (szLine.length() > 0) { m_RealText.m_mapLines[pairTimecodes] = szLine; } bPrevEndTimeMissing = false; } else if (!vStartTimecodes.empty() && !vEndTimecodes.empty()) { pair<int, int> pairTimecodes(vStartTimecodes.back(), vEndTimecodes.back()); if (szLine.length() > 0) { m_RealText.m_mapLines[pairTimecodes] = szLine; } } vStartTimecodes.push_back(iStartTimecode); if (iEndTimecode <= 0) { bPrevEndTimeMissing = true; } else { vEndTimecodes.push_back(iEndTimecode); } } else if (oTag.m_szName == L"b" || oTag.m_szName == L"i" || oTag.m_szName == L"font") { if (oTag.m_bOpen) { listPreviousOpenTags.push_back(oTag); } if (oTag.m_bClose) { PopTag(listPreviousOpenTags, oTag.m_szName); } listTags.push_back(oTag); } else if (oTag.m_szName == L"clear") { listTags.clear(); // set existing tags listTags.insert(listTags.end(), listPreviousOpenTags.begin(), listPreviousOpenTags.end()); } else if (oTag.m_szName == L"window") { if (oTag.m_bOpen) { m_RealText.m_WindowTag = oTag; } // Ignore close } else if (oTag.m_szName == L"center") { m_RealText.m_bCenter = true; } else if (oTag.m_szName 
== L"required") { // Ignore } else if (oTag.m_szName == L"") { // Ignore } else { // assume formating tag (handled later) listTags.push_back(oTag); } } else { Tag oTextTag; if (!ExtractTextTag(p_szFile, oTextTag)) { return false; } listTags.push_back(oTextTag); } } // Handle final line //FilterReduntantTags(listTags); wstring szLine = RenderTags(listTags); if (bPrevEndTimeMissing) { pair<int, int> pairTimecodes(vStartTimecodes.back(), vStartTimecodes.back() + m_iDefaultSubtitleDurationInMillisecs); if (szLine.length() > 0) { m_RealText.m_mapLines[pairTimecodes] = szLine; } bPrevEndTimeMissing = false; } else if (!vStartTimecodes.empty() && !vEndTimecodes.empty()) { pair<int, int> pairTimecodes(vStartTimecodes.back(), vEndTimecodes.back()); if (szLine.length() > 0) { m_RealText.m_mapLines[pairTimecodes] = szLine; } } return true; }
void MorphologicalDictionary::getMorphologyPo(const wstring & lower_word, bool po, shared_ptr<vector<shared_ptr<Morphology> > > result) { morphology_count++; bool debug = false; size_t l = lower_word.length(); shared_ptr<MNode> current_node = root; wstring current_prefix = po ? L"по" : L""; size_t begin_index = po ? 2 : 0; for (size_t i = begin_index; i <= l; ++i) { if (debug) { wcout << "Current_prefix = " << current_prefix << endl; wcout << "Number of models = " << current_node->lemmaId_MNodeModel.size() << endl; } wstring suffix = lower_word.substr(i, l - i); map<wstring, shared_ptr<vector<shared_ptr<MNodeItem> > > >::iterator s_iter = current_node->suffix_MNodeItem.find(suffix); if (s_iter != current_node->suffix_MNodeItem.end()) { if (debug) { wcout << "Found suffix: " << suffix << endl; } shared_ptr<vector<shared_ptr<MNodeItem> > > items = s_iter->second; if (debug) { wcout << "items->size() = " << items->size() << endl; } for (vector<shared_ptr<MNodeItem> >::iterator mn_iter = items->begin(); mn_iter != items->end(); ++mn_iter) { shared_ptr<MNodeItem> item = *mn_iter; if (item->feature_list_id <= 0 || item->po != po) { continue; } shared_ptr<Morphology> morphology = std::make_shared<Morphology>(); morphology->lemma_id = item->lemma_id; morphology->lemma = lemmas.at(item->lemma_id); morphology->word = make_shared<wstring>(lower_word); morphology->suffix_length = l - i; //morphology->features if (debug) { wcout << "item->feature_list_id = " << item->feature_list_id << endl; } for (vector<int>::iterator f_iter = id_feature_list.at(item->feature_list_id).begin(); f_iter != id_feature_list.at(item->feature_list_id).end(); ++f_iter) { if (debug) { wcout << "*f_iter = " << *f_iter << endl; } morphology->features.push_back(id_short_feature.at(*f_iter)); morphology->descriptions.push_back(id_long_feature.at(*f_iter)); if (debug) { wcout << "f_iter ok " << endl; } } result->push_back(morphology); } } // go to child node if (i == l) { break; } map<wchar_t, 
shared_ptr<MNode> >::iterator child_iterator = current_node->children.find(lower_word.at(i)); if (child_iterator == current_node->children.end()) { break; } current_node = child_iterator->second; current_prefix.push_back(lower_word.at(i)); } if (debug) { wcout << "getMorphologyPo >> OK" << endl; } }
void MorphologicalDictionary::getMorphologicalInfoListByRules(const wstring & lower_word, shared_ptr<vector<shared_ptr<Morphology> > > result) { rule_count++; bool debug = false; set<pair<wstring, int> > resultSet; // avoid duplications resultSet.clear(); size_t lw = lower_word.length(); for (vector<MRule>::iterator iter = ruleSet.morphologyRules.begin(); iter != ruleSet.morphologyRules.end(); ++iter) { MRule rule = *iter; size_t lws = rule.word_suffix.length(); size_t lls = rule.lemma_suffix.length(); // check length if (lw < lws || lw - lws + lls < ruleSet.min_lemma_length) { continue; } // check suffix int i1 = lw - 1; int i2 = lws - 1; bool ok = true; while (i2 >= 0) { if (lower_word.at(i1) != rule.word_suffix.at(i2)) { ok = false; break; } i1--; i2--; } if (!ok) { continue; } // lemma shared_ptr<wstring> lemma = std::make_shared<wstring>(); lemma->clear(); for (size_t i = 0; i < lw - lws; ++i) { lemma->push_back(lower_word.at(i)); } for (size_t i = 0; i < lls; ++i) { lemma->push_back(rule.lemma_suffix.at(i)); } if (debug) { wcout << "Found, lemma = " << endl; } // check <lemma, word_feature_list_id> pair<wstring, int> pp(*lemma, rule.word_feature_list_id); if (resultSet.find(pp) != resultSet.end()) { if (debug) { wcout << "### Duplication: " << *lemma << " - " << rule.word_feature_list_id << endl; } continue; } resultSet.insert(pp); // get feature_list_id of all word forms of lemma shared_ptr<map<int, shared_ptr<set<int> > > > lemmaId_wordFLIDs = make_shared<map<int, shared_ptr<set<int> > > >(); this->getFeatureListOfLemma(lemma, lemmaId_wordFLIDs); // check feature_list_id for (map<int, shared_ptr<set<int> > >::iterator l_iter = lemmaId_wordFLIDs->begin(); l_iter != lemmaId_wordFLIDs->end(); ++l_iter) { shared_ptr<set<int> > ss = l_iter->second; if (ss->find(rule.lemma_feature_list_id) != ss->end() && ss->find(rule.word_feature_list_id) == ss->end()) { if (debug) { wcout << ">>> FOUND" << endl; wcout << "rule.word_suffix = " << rule.word_suffix << endl; wcout 
<< "rule.word_feature_list_id = " << rule.word_feature_list_id << endl; wcout << "rule.lemma_suffix = " << rule.lemma_suffix << endl; wcout << "rule.lemma_feature_list_id = " << rule.lemma_feature_list_id << endl; wcout << endl; } shared_ptr<Morphology> morphology = std::make_shared<Morphology>(); morphology->lemma_id = l_iter->first; morphology->lemma = lemma; morphology->word = make_shared<wstring>(lower_word); morphology->suffix_length = lws; // features for (vector<int>::iterator f_iter = id_feature_list.at(rule.word_feature_list_id).begin(); f_iter != id_feature_list.at(rule.word_feature_list_id).end(); ++f_iter) { morphology->features.push_back(id_short_feature.at(*f_iter)); morphology->descriptions.push_back(id_long_feature.at(*f_iter)); } result->push_back(morphology); } } } }
void MorphologicalDictionary::getMorphology(const wstring & word, shared_ptr<vector<shared_ptr<Morphology> > > result) { bool debug = false; // convert word to lower-cased size_t l = word.length(); wstring lower_word; // lower_word.clear(); // wstring lower_word = word; int e_count = 0; int ee_count = 0; for (size_t i = 0; i < l; ++ i) { wchar_t lower_ch = tools->charToLowerCase(word.at(i)); if (lower_ch == L'е') { e_count++; } else if (lower_ch == L'ё') { ee_count++; } lower_word.push_back(lower_ch); // lower_word[i] = towlower(lower_word.c_str()[i]); } if (debug) { wcout << "lower_word = " << lower_word << endl; } // get morphology this->getMorphologyPo(lower_word, false, result); // "по" if (l >= 2 && lower_word.at(0) == L'п' && lower_word.at(1) == L'о') { this->getMorphologyPo(lower_word, true, result); } if (useE && e_count > 0 && ee_count == 0) { if (debug) { wcout << "E: " << endl; } this->getMorphologyE(lower_word, result); } // morphology by rules (English) if (useRules) { if (debug) { wcout << "Rules (English): " << endl; } this->getMorphologicalInfoListByRules(lower_word, result); } // morphology prediction (Russian) if (usePrediction && result->empty()) { if (debug) { wcout << "Prediction (Russian): " << endl; } this->getMorphologicalPrediction(lower_word, result); } }