/**
 * Collects the normalizations of @p source found among the tokens of
 * @p bowText.
 *
 * A token is retained only when the (first, second) pair returned by
 * getStartEnd() for it equals (1, source.size()+1). For every retained
 * token, its lemma is stored as key and the symbolic value of its category
 * in the "MACRO" property manager of language @p lang is stored as value.
 *
 * @param source  the string whose normalizations are looked for
 * @param bowText the tokens to examine
 * @param lang    the language used to select the "MACRO" property manager
 * @return a multimap lemma -> symbolic macro category of the retained tokens
 */
multimap<LimaString,string> extractNormalization(const LimaString& source,const BoWText& bowText,MediaId lang)
{
  // Kept as one expression so that the lifetime of any intermediate object
  // is exactly the one of the original chained call.
  const Common::PropertyCode::PropertyManager& macroManager =
    static_cast<const Common::MediaticData::LanguageData&>(
      MediaticData::single().mediaData(lang))
    .getPropertyCodeManager()
    .getPropertyManager("MACRO");

  multimap<LimaString,string> normalizations;

  // Historical note: an earlier (disabled) variant took the lemma of the
  // token directly when bowText held exactly one token. Now every token
  // going from the beginning to the end of the source is taken.

  // Second member of the pair expected from getStartEnd() for a retained token.
  const int expectedEnd = int(source.size()+1);

  for (BoWText::const_iterator token = bowText.begin();
       token != bowText.end();
       ++token)
  {
    const pair<int,int> span = getStartEnd(*token);
    if (span.first != 1 || span.second != expectedEnd)
    {
      // does not match the whole source: ignore this token
      continue;
    }

    normalizations.insert(make_pair(
        (*token)->getLemma(),
        macroManager.getPropertySymbolicValue((*token)->getCategory())));
  }

  return normalizations;
}
/**
 * Runs the orthographic-alternatives step on every vertex of the
 * "AnalysisGraph" data stored in @p analysis.
 *
 * For each vertex that has morphosyntactic data attached:
 *  - in confident mode, vertices that already hold data are left untouched;
 *  - the accented alternatives listed by the token's dictionary entry are
 *    turned into alternatives through createAlternative();
 *  - in confident mode, processing of the vertex stops there if data is now
 *    present;
 *  - otherwise setOrthographicAlternatives() is called on the token.
 *
 * @param analysis the analysis content holding the "AnalysisGraph" data
 * @return SUCCESS_ID
 */
LimaStatusCode OrthographicAlternatives::process(
  AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  MORPHOLOGINIT;
  LINFO << "MorphologicalAnalysis: starting process OrthographicAlternatives";

  StringsPool& pool=Common::LinguisticData::LinguisticData::changeable().stringsPool(m_language);

  AnalysisGraph* graphData=static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));
  LinguisticGraph* graph=graphData->getGraph();

  LinguisticGraphVertexIt vertexIt,vertexEnd;
  VertexDataPropertyMap vertexData=get(vertex_data,*graph);
  VertexTokenPropertyMap vertexToken=get(vertex_token,*graph);

  boost::tie(vertexIt,vertexEnd)=vertices(*graph);
  for (; vertexIt!=vertexEnd; ++vertexIt)
  {
    LDEBUG << "processing vertex " << *vertexIt;

    MorphoSyntacticData* data=vertexData[*vertexIt];
    Token* currentToken=vertexToken[*vertexIt];

    // nothing to do on vertices without morphosyntactic data
    if (data==0)
    {
      continue;
    }

    // confident mode: a token that already has linguistic infos is skipped
    if (m_confidentMode && (data->size()>0))
    {
      continue;
    }

    // orthographic alternatives listed by the dictionary entry of the token
    LDEBUG << "processing alternatives from dico";
    DictionaryEntry* dicoEntry=currentToken->dictionaryEntry();
    dicoEntry->reset();
    if (dicoEntry->hasAccented())
    {
      // an empty string returned by nextAccented() ends the enumeration
      for (LimaString accented = dicoEntry->nextAccented();
           accented.size() > 0;
           accented = dicoEntry->nextAccented())
      {
        createAlternative(currentToken,data,accented,m_dictionary,pool);
      }
    }

    // confident mode: stop here if the step above produced linguistic infos
    if (m_confidentMode && (data->size() > 0))
    {
      continue;
    }

    // still no linguistic infos: try the lowercased / unmarked string
    LDEBUG << "set unmark alternatives";
    setOrthographicAlternatives(
      currentToken,
      data,
      m_dictionary,
      m_charChart,
      pool);
  }

  LINFO << "MorphologicalAnalysis: ending process OrthographicAlternatives";
  TimeUtils::logElapsedTime("OrthographicAlternatives");
  return SUCCESS_ID;
}
void EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives( Token* token, MorphoSyntacticData* tokenData, FsaStringsPool& sp) { // try to find simple Uncapitalization MORPHOLOGINIT; // FIXME Conditions below could be process unit parameters const LimaString& tokenStr=token->stringForm(); if (token->status().getAlphaCapital() == T_CAPITAL || token->status().getAlphaCapital() == T_CAPITAL_1ST || token->status().getAlphaCapital() == T_CAPITAL_SMALL || token->status().isAlphaConcatAbbrev() || token->status().isAlphaHyphen() || token->status().isAlphaPossessive() || tokenStr.toUpper() == tokenStr) { return; } std::vector<std::string> suggestions = m_enchantDictionary->suggest(tokenStr.toUtf8().constData()); for (std::vector<std::string>::const_iterator it = suggestions.begin(); it != suggestions.end();it++) { LimaString correction = LimaString::fromUtf8((*it).c_str()); // FIXME Conditions below could be process unit parameters if ( correction.size() > 1 && correction != tokenStr ) { DictionaryEntry* entry = new DictionaryEntry(m_dictionary->getEntry(correction)); MorphoSyntacticDataHandler lingInfosHandler(*tokenData, SPELLING_ALTERNATIVE); if (!entry->isEmpty()) { LINFO << "EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives correcting" << tokenStr << "into" << correction; // add orthographic alternative to Token; StringsPoolIndex idx=sp[correction]; token->addOrthographicAlternatives(idx); if (entry->hasLingInfos()) { entry->parseLingInfos(&lingInfosHandler); } } else { delete entry; } } } }