Exemplo n.º 1
0
// Collects the normalizations found in bowText for the string `source`.
//
// For every BoW token whose (start,end) pair, as computed by getStartEnd(),
// equals (1, source.size()+1), the result receives one entry mapping the
// token's lemma to the symbolic value of its category in the "MACRO"
// property of language `lang`.
// NOTE(review): positions are presumably 1-based, so the test selects the
// tokens spanning the whole source string — confirm against getStartEnd().
//
// An earlier variant (previously kept here as commented-out code) returned
// the lemma of the sole token directly when bowText held exactly one token;
// the span test below is now applied in every case.
multimap<LimaString,string> extractNormalization(const LimaString& source,const BoWText& bowText,MediaId lang)
{
  const Common::MediaticData::LanguageData& languageData =
    static_cast<const Common::MediaticData::LanguageData&>(MediaticData::single().mediaData(lang));
  const Common::PropertyCode::PropertyManager& macroManager =
    languageData.getPropertyCodeManager().getPropertyManager("MACRO");

  // End position a token must report to be kept; invariant over the loop.
  const int expectedEnd = int(source.size()+1);

  multimap<LimaString,string> normalizations;
  for (BoWText::const_iterator tokenIt = bowText.begin(); tokenIt != bowText.end(); ++tokenIt)
  {
    const pair<int,int> span = getStartEnd(*tokenIt);
    if (span.first != 1 || span.second != expectedEnd)
    {
      // Token does not have the required (start,end) pair: ignore it.
      continue;
    }
    normalizations.insert(make_pair(
                            (*tokenIt)->getLemma(),
                            macroManager.getPropertySymbolicValue((*tokenIt)->getCategory())));
  }
  return normalizations;
}
Exemplo n.º 2
0
// Adds orthographic alternatives to the tokens of the "AnalysisGraph" stored
// in `analysis`.
//
// For each vertex of the graph that carries morphosyntactic data:
//   1. unless confident mode is on and the data is already non-empty, an
//      alternative is created for every accented form listed by the token's
//      dictionary entry;
//   2. unless confident mode is on and the data is (now) non-empty,
//      setOrthographicAlternatives() is called with the character chart.
// Vertices without morphosyntactic data are skipped.
//
// Always returns SUCCESS_ID.
// NOTE(review): the pointer returned by analysis.getData("AnalysisGraph") is
// used without a null check — confirm callers guarantee it is present.
LimaStatusCode OrthographicAlternatives::process(
  AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  MORPHOLOGINIT;
  LINFO << "MorphologicalAnalysis: starting process OrthographicAlternatives";

  StringsPool& stringsPool=Common::LinguisticData::LinguisticData::changeable().stringsPool(m_language);
  AnalysisGraph* analysisGraph=static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));
  LinguisticGraph* graph=analysisGraph->getGraph();
  VertexDataPropertyMap vertexData=get(vertex_data,*graph);
  VertexTokenPropertyMap vertexToken=get(vertex_token,*graph);

  LinguisticGraphVertexIt vertexIt,vertexEnd;
  for (boost::tie(vertexIt,vertexEnd)=vertices(*graph); vertexIt!=vertexEnd; ++vertexIt)
  {
    LDEBUG << "processing vertex " << *vertexIt;
    MorphoSyntacticData* morphoData=vertexData[*vertexIt];
    Token* token=vertexToken[*vertexIt];
    if (morphoData==0)
    {
      // Nothing to enrich on this vertex.
      continue;
    }

    // Confident mode: a token that already has linguistic infos is left alone.
    if (m_confidentMode && (morphoData->size()>0))
    {
      continue;
    }

    // First source of alternatives: the accented forms listed by the
    // token's dictionary entry.
    LDEBUG << "processing alternatives from dico";
    DictionaryEntry* dicoEntry=token->dictionaryEntry();
    dicoEntry->reset();
    if (dicoEntry->hasAccented())
    {
      // nextAccented() yields an empty string once the list is exhausted.
      for (LimaString accented=dicoEntry->nextAccented();
           accented.size()>0;
           accented=dicoEntry->nextAccented())
      {
        createAlternative(token,morphoData,accented,m_dictionary,stringsPool);
      }
    }

    // Checked a second time after the dictionary alternatives have been
    // processed: in confident mode, stop here if the data is non-empty.
    if (m_confidentMode && (morphoData->size()>0))
    {
      continue;
    }

    // Still no linguistic infos: try the lowercased / unmarked forms.
    LDEBUG << "set unmark alternatives";
    setOrthographicAlternatives(
      token,
      morphoData,
      m_dictionary,
      m_charChart,
      stringsPool);
  }

  LINFO << "MorphologicalAnalysis: ending process OrthographicAlternatives";
  TimeUtils::logElapsedTime("OrthographicAlternatives");
  return SUCCESS_ID;
}
// Adds spelling corrections suggested by the Enchant dictionary as
// orthographic alternatives of `token`.
//
// token     : token to correct; receives the string-pool index of each
//             accepted correction.
// tokenData : morphosyntactic data of the token; receives, through a
//             MorphoSyntacticDataHandler tagged SPELLING_ALTERNATIVE, the
//             linguistic infos of each accepted correction.
// sp        : strings pool used to obtain the index of a correction.
//
// Nothing is done for tokens whose status or form matches one of the
// exclusion conditions below. A suggestion is accepted when it is longer than
// one character, differs from the token string and has a non-empty entry in
// m_dictionary.
void EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives(
  Token* token,
  MorphoSyntacticData* tokenData,
  FsaStringsPool& sp)
{
  MORPHOLOGINIT;
  // FIXME Conditions below could be process unit parameters
  const LimaString& tokenStr=token->stringForm();
  if (token->status().getAlphaCapital() == T_CAPITAL
    || token->status().getAlphaCapital() == T_CAPITAL_1ST
    || token->status().getAlphaCapital() == T_CAPITAL_SMALL
    || token->status().isAlphaConcatAbbrev()
    || token->status().isAlphaHyphen()
    || token->status().isAlphaPossessive()
    || tokenStr.toUpper() == tokenStr)
  {
    // Capitalized, abbreviated, hyphenated, possessive or all-uppercase
    // tokens are not submitted to the spell checker.
    return;
  }
  std::vector<std::string> suggestions = m_enchantDictionary->suggest(tokenStr.toUtf8().constData());
  for (std::vector<std::string>::const_iterator it = suggestions.begin(); it != suggestions.end(); ++it)
  {
    LimaString correction = LimaString::fromUtf8((*it).c_str());
    // FIXME Conditions below could be process unit parameters
    if ( correction.size() > 1 && correction != tokenStr )
    {
      // Automatic storage: the entry is only needed within this iteration.
      // It was previously heap-allocated and freed only when empty, which
      // leaked one DictionaryEntry per accepted correction.
      DictionaryEntry entry(m_dictionary->getEntry(correction));
      MorphoSyntacticDataHandler lingInfosHandler(*tokenData, SPELLING_ALTERNATIVE);

      if (!entry.isEmpty())
      {
        LINFO << "EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives correcting" << tokenStr << "into" << correction;
        // add orthographic alternative to Token;
        StringsPoolIndex idx=sp[correction];
        token->addOrthographicAlternatives(idx);

        if (entry.hasLingInfos())
        {
          entry.parseLingInfos(&lingInfosHandler);
        }
      }
    }
  }
}