/** @addtogroup ResourceConfiguration
 * - <b><group name="..." class="SentenceBoundsFinder"></b>
 */
// NOTE(review): the class name in the doxygen entry above is
// "SentenceBoundsFinder" although it documents StopList::init -- it looks
// copy-pasted; confirm the intended class name.

/**
 * Loads the stop list from a file.
 *
 * The file name is the value of the 'file' parameter of the configuration
 * group, appended to the resources path of MediaticData. Each line read is
 * converted from UTF-8 to a LimaString and inserted into this stop list.
 *
 * @param unitConfiguration configuration group providing the 'file' parameter
 * @param manager           unused
 * @throws InvalidConfiguration if the 'file' parameter is missing or if the
 *         file cannot be opened
 */
void StopList::init(
  Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration,
  Manager* manager)
{
  LIMA_UNUSED(manager);
  DUMPERLOGINIT;

  const string& resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath();

  // Build the full path of the stop list file from the configuration.
  string stopListFileName;
  try
  {
    stopListFileName=resourcesPath+"/"+unitConfiguration.getParamsValueAtKey("file");
  }
  catch (Common::XMLConfigurationFiles::NoSuchParam& )
  {
    LERROR << "No param 'file' in StopList configuration group ! ";
    throw InvalidConfiguration();
  }

  std::ifstream stopListFile(stopListFileName.c_str(), std::ifstream::binary);
  if (stopListFile.fail())
  {
    LERROR << "invalid file " << stopListFileName;
    throw InvalidConfiguration();
  }

  // The first line is read before the loop; every following line is read
  // in the loop increment.
  LimaString wword = Common::Misc::utf8stdstring2limastring(Common::Misc::readLine(stopListFile));
  LDEBUG << "Loading stop list file: " << stopListFileName;

  // NOTE(review): loading stops at the first empty string, so an empty line
  // in the middle of the file would end the loading early -- presumably
  // readLine returns an empty string at end of file; confirm.
  for (;
       !wword.isEmpty();
       wword = Common::Misc::utf8stdstring2limastring(Common::Misc::readLine(stopListFile)))
  {
    insert(wword);
  }
}
//********************************************************************** // find the numeric form of type of expression from a text form //********************************************************************** void setType(Rule& r, const LimaString& s, const std::vector<LimaString>& activeEntityGroups) { LimaString str; //std::string::size_type i(findSpecialCharacter(s,CHAR_POS_TR,0)); int i(findSpecialCharacter(s,CHAR_POS_TR,0)); if (i != -1) { // there are linguistic properties r.setLinguisticProperties(static_cast<LinguisticCode>(s.mid(i+1).toInt())); str=s.left(i); } else { r.setLinguisticProperties(static_cast<LinguisticCode>(0)); str=s; } Common::MediaticData::EntityType type= resolveEntityName(str,activeEntityGroups); if (type.isNull()) { std::ostringstream oss; oss << "type [" << str.toUtf8().data() << "] not recognized"; throw UnknownTypeException(oss.str()); } r.setType(type); }
/**
 * Writes a readable form of a constraint on a QDebug stream.
 *
 * The pieces are streamed in this order:
 *  - "=>" for the EXECUTE_IF_SUCCESS actions, "=<" for the
 *    EXECUTE_IF_FAILURE actions, "+" for any other action;
 *  - "[", followed by the index and "," unless the index is
 *    Constraint::noindex;
 *  - "!" if the constraint is negative;
 *  - the function name, "," and the action string;
 *  - "," and the complement if the complement is not empty;
 *  - "]".
 *
 * If the function name cannot be obtained from the
 * ConstraintFunctionManager, an error is logged and the output is still
 * produced with whatever name and complement were left in the locals.
 */
QDebug& operator << (QDebug& os, const Constraint& c)
{
  string name;
  LimaString extra;
  if (! ConstraintFunctionManager::single().
      getFunctionName(c.m_functionAddr, name, extra))
  {
    AULOGINIT;
    // NOTE(review): "availale" is a typo, kept to leave the log text unchanged
    LERROR << "constraint function " << c.m_functionAddr << " not availale" << LENDL;
  }

  // marker telling when the function is executed
  switch (c.action())
  {
    case EXECUTE_IF_FAILURE:
    case EXECUTE_IF_FAILURE_REVERSE:
      os << "=<";
      break;
    case EXECUTE_IF_SUCCESS:
    case EXECUTE_IF_SUCCESS_REVERSE:
      os << "=>";
      break;
    default:
      os << "+";
      break;
  }

  // the index is only written when there is one
  os << "[";
  if (c.index() != Constraint::noindex)
  {
    os << c.index()<< ",";
  }

  if (c.m_negative)
  {
    os << "!";
  }

  os << name << "," << c.actionString();
  if (! extra.isEmpty())
  {
    os << "," << Common::Misc::limastring2utf8stdstring(extra);
  }
  os << "]";

  return os;
}
/**
 * Builds the internal string form of this constraint.
 *
 * The string is made of, in order: the begin marker, the index, a
 * separator, the action as an integer, a separator, the negation marker
 * (only if the constraint is negative), the function name and, if the
 * complement is not empty, a separator followed by the complement.
 *
 * If the function name cannot be obtained from the
 * ConstraintFunctionManager, an error is logged and the string is still
 * built with whatever name and complement were left in the locals.
 *
 * @return the string form, converted from UTF-8 to a LimaString
 */
LimaString Constraint::str() const
{
  string name;
  LimaString extra;
  if (! ConstraintFunctionManager::single().
      getFunctionName(m_functionAddr, name, extra))
  {
    AULOGINIT;
    // NOTE(review): "availale" is a typo, kept to leave the log text unchanged
    LERROR << "constraint function " << m_functionAddr << " not availale" << LENDL;
  }

  ostringstream buffer;

  // header: begin marker, index, separator, action, separator
  buffer << static_cast<unsigned char>(CHAR_BEGIN_CONSTRAINT_INTERNAL);
  buffer << m_index;
  buffer << static_cast<unsigned char>(CHAR_SEP_CONSTRAINT_INTERNAL);
  buffer << static_cast<uint64_t>(m_action);
  buffer << static_cast<unsigned char>(CHAR_SEP_CONSTRAINT_INTERNAL);

  if (m_negative)
  {
    // NOTE(review): unlike the other markers, this one is streamed without
    // a conversion to unsigned char -- confirm that this is intended.
    buffer << CHAR_NEGATIVE_CONSTRAINT_INTERNAL;
  }

  buffer << name;

  if (! extra.isEmpty())
  {
    buffer << static_cast<unsigned char>(CHAR_SEP_CONSTRAINT_INTERNAL);
    buffer << Common::Misc::limastring2utf8stdstring(extra);
  }

  return Common::Misc::utf8stdstring2limastring(buffer.str());
}
/**
 * Adds orthographic alternatives to the tokens of the analysis graph.
 *
 * For each vertex of the graph stored under "AnalysisGraph" that has
 * morphosyntactic data:
 *  - in confident mode, the vertex is skipped if its data is not empty;
 *  - an alternative is created for each accented form given by the
 *    dictionary entry of the token;
 *  - in confident mode, the vertex is then skipped if its data is no
 *    longer empty;
 *  - otherwise setOrthographicAlternatives() is called on the token.
 *
 * @param analysis the analysis content holding the "AnalysisGraph" data
 * @return SUCCESS_ID
 */
LimaStatusCode OrthographicAlternatives::process(
  AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  MORPHOLOGINIT;
  LINFO << "MorphologicalAnalysis: starting process OrthographicAlternatives";

  StringsPool& sp=Common::LinguisticData::LinguisticData::changeable().stringsPool(m_language);

  // NOTE(review): the result of getData() is used without a null check --
  // presumably "AnalysisGraph" is always present at this stage; confirm.
  AnalysisGraph* tokenList=static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));
  LinguisticGraph* g=tokenList->getGraph();

  VertexDataPropertyMap dataMap=get(vertex_data,*g);
  VertexTokenPropertyMap tokenMap=get(vertex_token,*g);

  LinguisticGraphVertexIt it,itEnd;
  boost::tie(it,itEnd)=vertices(*g);
  for (; it!=itEnd; ++it)
  {
    LDEBUG << "processing vertex " << *it;
    MorphoSyntacticData* currentTokenData=dataMap[*it];
    Token* tok=tokenMap[*it];

    // vertices without morphosyntactic data are ignored
    if (currentTokenData==0)
    {
      continue;
    }

    // in confident mode, a token that already has ling infos is skipped
    if ( m_confidentMode && (currentTokenData->size()>0) )
    {
      continue;
    }

    // create the alternatives for the accented forms given by the
    // dictionary entry of the token
    {
      LDEBUG << "processing alternatives from dico";
      // NOTE(review): tok and its dictionary entry are dereferenced without
      // a null check -- presumably both are always set here; confirm.
      DictionaryEntry* entry=tok->dictionaryEntry();
      entry->reset();
      if (entry->hasAccented())
      {
        for (LimaString oa = entry->nextAccented();
             !oa.isEmpty();
             oa = entry->nextAccented())
        {
          createAlternative(tok,currentTokenData,oa,m_dictionary,sp);
        }
      }
    }

    // same test as above, evaluated again after the alternatives have been
    // created (createAlternative is given currentTokenData)
    if ( m_confidentMode && (currentTokenData->size()>0) )
    {
      continue;
    }

    // still no ling infos: try the lowercased and unmarked string
    LDEBUG << "set unmark alternatives";
    setOrthographicAlternatives(
      tok,
      currentTokenData,
      m_dictionary,
      m_charChart,
      sp);
  }

  LINFO << "MorphologicalAnalysis: ending process OrthographicAlternatives";
  TimeUtils::logElapsedTime("OrthographicAlternatives");
  return SUCCESS_ID;
}
void EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives( Token* token, MorphoSyntacticData* tokenData, FsaStringsPool& sp) { // try to find simple Uncapitalization MORPHOLOGINIT; // FIXME Conditions below could be process unit parameters const LimaString& tokenStr=token->stringForm(); if (token->status().getAlphaCapital() == T_CAPITAL || token->status().getAlphaCapital() == T_CAPITAL_1ST || token->status().getAlphaCapital() == T_CAPITAL_SMALL || token->status().isAlphaConcatAbbrev() || token->status().isAlphaHyphen() || token->status().isAlphaPossessive() || tokenStr.toUpper() == tokenStr) { return; } std::vector<std::string> suggestions = m_enchantDictionary->suggest(tokenStr.toUtf8().constData()); for (std::vector<std::string>::const_iterator it = suggestions.begin(); it != suggestions.end();it++) { LimaString correction = LimaString::fromUtf8((*it).c_str()); // FIXME Conditions below could be process unit parameters if ( correction.size() > 1 && correction != tokenStr ) { DictionaryEntry* entry = new DictionaryEntry(m_dictionary->getEntry(correction)); MorphoSyntacticDataHandler lingInfosHandler(*tokenData, SPELLING_ALTERNATIVE); if (!entry->isEmpty()) { LINFO << "EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives correcting" << tokenStr << "into" << correction; // add orthographic alternative to Token; StringsPoolIndex idx=sp[correction]; token->addOrthographicAlternatives(idx); if (entry->hasLingInfos()) { entry->parseLingInfos(&lingInfosHandler); } } else { delete entry; } } } }
/**
 * Extracts the normalizations of a source string from a BoWText.
 *
 * Only the elements of the BoWText for which getStartEnd() returns the pair
 * (1, source.size()+1) are kept. For each of them, the lemma is associated
 * with the symbolic value of its category in the "MACRO" property manager
 * of the language.
 *
 * An older variant, which took the lemma of the single element when the
 * BoWText had exactly one element, used to be kept here as commented-out
 * code; it is not part of the behavior.
 *
 * @param source  the string that was analyzed
 * @param bowText the result of the analysis
 * @param lang    the language giving the property code manager
 * @return the (lemma, macro category symbol) pairs of the kept elements
 */
multimap<LimaString,string> extractNormalization(const LimaString& source,
                                                 const BoWText& bowText,
                                                 MediaId lang)
{
  const Common::PropertyCode::PropertyManager& macroManager =
    static_cast<const Common::MediaticData::LanguageData&>(
      MediaticData::single().mediaData(lang))
    .getPropertyCodeManager().getPropertyManager("MACRO");

  // second member of the getStartEnd() pair expected for a kept element
  const int expectedEnd = int(source.size()+1);

  multimap<LimaString,string> normalizations;
  const BoWText::const_iterator bowEnd = bowText.end();
  for (BoWText::const_iterator bowItr = bowText.begin();
       bowItr != bowEnd; ++bowItr)
  {
    const pair<int,int> posLen = getStartEnd(*bowItr);
    if (posLen.first != 1 || posLen.second != expectedEnd)
    {
      continue;
    }
    normalizations.insert(make_pair(
      (*bowItr)->getLemma(),
      macroManager.getPropertySymbolicValue((*bowItr)->getCategory())));
  }
  return normalizations;
}
/**
 * Ordering predicate on LimaString.
 *
 * @return true if s1.compare(s2) is negative, false otherwise
 */
bool operator()(const LimaString & s1, const LimaString & s2) const
{
  return 0 > s1.compare(s2);
}
// utility function for conversion of limastring to int uint64_t LimaStringToInt(LimaString s) { return s.toUInt(); }