Exemplo n.º 1
0
/** @addtogroup ResourceConfiguration
 * - <b>&lt;group name="..." class="StopList"&gt;</b>
 *   - file : name of the stop list file, appended to the resources path
 */
/**
 * Fills the stop list from the file named by the "file" parameter of the
 * configuration group. The file name is appended to the global resources
 * path. Lines are read and inserted one by one until readLine yields an
 * empty string.
 *
 * @param unitConfiguration configuration group; must contain a "file" param
 * @param manager unused
 * @throw InvalidConfiguration if the "file" param is missing or the file
 *        cannot be opened
 */
void StopList::init(
  Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration,
  Manager* manager)
{
  LIMA_UNUSED(manager);
  DUMPERLOGINIT;

  const string& resourcesDir=Common::MediaticData::MediaticData::single().getResourcesPath();
  string listPath;
  try
  {
    listPath = resourcesDir + "/" + unitConfiguration.getParamsValueAtKey("file");
  }
  catch (Common::XMLConfigurationFiles::NoSuchParam& )
  {
    LERROR << "No param 'file' in StopList configuration group ! ";
    throw InvalidConfiguration();
  }

  std::ifstream input(listPath.c_str(), std::ifstream::binary);
  if (!input)
  {
    LERROR << "invalid file " << listPath;
    throw InvalidConfiguration();
  }

  // The first line is fetched before the debug trace, as in the historical
  // implementation, so the order of side effects is preserved.
  LimaString entry = Common::Misc::utf8stdstring2limastring(Common::Misc::readLine(input));
  LDEBUG << "Loading stop list file: " << listPath;
  for (; !entry.isEmpty();
       entry = Common::Misc::utf8stdstring2limastring(Common::Misc::readLine(input)))
  {
    insert(entry);
  }
}
Exemplo n.º 2
0
//**********************************************************************
// Resolve the textual type of an expression and store it in the rule.
//
// If s contains the CHAR_POS_TR special character, the text after it is
// parsed as an integer and stored as the rule's linguistic properties, and
// only the text before it is used as the type name. Otherwise the linguistic
// properties are set to 0 and the whole string is the type name.
//
// Throws UnknownTypeException when the type name cannot be resolved among
// the active entity groups.
//**********************************************************************
void setType(Rule& r,
             const LimaString& s,
             const std::vector<LimaString>& activeEntityGroups) 
{
  const int separatorPos = findSpecialCharacter(s,CHAR_POS_TR,0);

  LimaString typeName;
  if (separatorPos == -1) {
    // no linguistic properties attached to the type name
    r.setLinguisticProperties(static_cast<LinguisticCode>(0));
    typeName = s;
  }
  else {
    r.setLinguisticProperties(
      static_cast<LinguisticCode>(s.mid(separatorPos+1).toInt()));
    typeName = s.left(separatorPos);
  }

  Common::MediaticData::EntityType resolved =
    resolveEntityName(typeName,activeEntityGroups);
  if (! resolved.isNull()) {
    r.setType(resolved);
    return;
  }

  std::ostringstream message;
  message << "type [" << typeName.toUtf8().data() << "] not recognized";
  throw UnknownTypeException(message.str());
}
Exemplo n.º 3
0
/**
 * Streams a readable form of a constraint to a QDebug stream.
 *
 * Emits an action marker ("=>" for the success actions, "=<" for the failure
 * actions, "+" otherwise), then, between square brackets: the index (unless
 * it is Constraint::noindex), an optional "!" for negative constraints, the
 * function name, the action string and, when not empty, the complement.
 * An error is logged if the function name cannot be retrieved.
 */
QDebug& operator << (QDebug& os, const Constraint& c) {
  string functionName;
  LimaString complement;
  if (! ConstraintFunctionManager::single().getFunctionName(c.m_functionAddr,
                                                            functionName,
                                                            complement)) {
    AULOGINIT;
    LERROR << "constraint function "
           << c.m_functionAddr << " not availale" << LENDL;
  }

  switch (c.action()) {
    case EXECUTE_IF_SUCCESS:
    case EXECUTE_IF_SUCCESS_REVERSE:
      os << "=>";
      break;
    case EXECUTE_IF_FAILURE:
    case EXECUTE_IF_FAILURE_REVERSE:
      os << "=<";
      break;
    default:
      os << "+";
      break;
  }

  // the opening bracket is always written; the index only when there is one
  os << "[";
  if (c.index() != Constraint::noindex) {
    os << c.index() << ",";
  }

  if (c.m_negative) {
    os << "!";
  }
  os << functionName << "," << c.actionString();

  if (! complement.isEmpty()) {
    os << "," << Common::Misc::limastring2utf8stdstring(complement);
  }
  os << "]";
  return os;
}
Exemplo n.º 4
0
/**
 * Builds the internal string form of the constraint:
 * begin marker, index, separator, action (as an integer), separator,
 * optional negation marker, function name and, when the complement is not
 * empty, a separator followed by the complement.
 * An error is logged if the function name cannot be retrieved.
 *
 * @return the serialized constraint converted to a LimaString
 */
LimaString Constraint::str() const {
  string functionName;
  LimaString complement;
  const bool found = ConstraintFunctionManager::single().
    getFunctionName(m_functionAddr, functionName, complement);
  if (! found) {
    AULOGINIT;
    LERROR << "constraint function "
           << m_functionAddr << " not availale" << LENDL;
  }

  ostringstream serialized;
  serialized << static_cast<unsigned char>(CHAR_BEGIN_CONSTRAINT_INTERNAL);
  serialized << m_index;
  serialized << static_cast<unsigned char>(CHAR_SEP_CONSTRAINT_INTERNAL);
  serialized << static_cast<uint64_t>(m_action);
  serialized << static_cast<unsigned char>(CHAR_SEP_CONSTRAINT_INTERNAL);
  if (m_negative) {
    // written without a cast, exactly as the historical code did
    serialized << CHAR_NEGATIVE_CONSTRAINT_INTERNAL;
  }
  serialized << functionName;
  if (! complement.isEmpty()) {
    serialized << static_cast<unsigned char>(CHAR_SEP_CONSTRAINT_INTERNAL);
    serialized << Common::Misc::limastring2utf8stdstring(complement);
  }
  return Common::Misc::utf8stdstring2limastring(serialized.str());
}
Exemplo n.º 5
0
/**
 * Visits every vertex of the graph stored under "AnalysisGraph" in the
 * analysis content and, for each vertex that has morphosyntactic data:
 *  - skips it when confident mode is on and the data is not empty;
 *  - creates one alternative per accented form given by the token's
 *    dictionary entry;
 *  - skips the rest when confident mode is on and data is now available;
 *  - otherwise calls setOrthographicAlternatives for the token.
 *
 * @param analysis analysis content holding the "AnalysisGraph" data
 * @return SUCCESS_ID
 */
LimaStatusCode OrthographicAlternatives::process(
  AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  MORPHOLOGINIT;
  LINFO << "MorphologicalAnalysis: starting process OrthographicAlternatives";

  StringsPool& sp=Common::LinguisticData::LinguisticData::changeable().stringsPool(m_language);
  AnalysisGraph* tokenList=static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));
  LinguisticGraph* g=tokenList->getGraph();
  VertexDataPropertyMap dataMap=get(vertex_data,*g);
  VertexTokenPropertyMap tokenMap=get(vertex_token,*g);

  LinguisticGraphVertexIt it,itEnd;
  for (boost::tie(it,itEnd)=vertices(*g); it!=itEnd; ++it)
  {
    LDEBUG << "processing vertex " << *it;
    MorphoSyntacticData* currentTokenData=dataMap[*it];
    Token* tok=tokenMap[*it];

    // vertices without morphosyntactic data are left untouched
    if (currentTokenData==0)
    {
      continue;
    }

    // confident mode: a token that already has ling infos is not processed
    if (m_confidentMode && (currentTokenData->size()>0))
    {
      continue;
    }

    // alternatives listed by the dictionary entry of the token
    LDEBUG << "processing alternatives from dico";
    DictionaryEntry* entry=tok->dictionaryEntry();
    entry->reset();
    if (entry->hasAccented())
    {
      for (LimaString accented = entry->nextAccented();
           accented.size() > 0;
           accented = entry->nextAccented())
      {
        createAlternative(tok,currentTokenData,accented,m_dictionary,sp);
      }
    }

    // confident mode: the alternatives above may have added ling infos
    if (m_confidentMode && (currentTokenData->size()>0))
    {
      continue;
    }

    // still no ling infos: try the lowercased / unmarked forms
    LDEBUG << "set unmark alternatives";
    setOrthographicAlternatives(tok,
                                currentTokenData,
                                m_dictionary,
                                m_charChart,
                                sp);
  }

  LINFO << "MorphologicalAnalysis: ending process OrthographicAlternatives";
  TimeUtils::logElapsedTime("OrthographicAlternatives");
  return SUCCESS_ID;
}
/**
 * Adds spelling alternatives suggested by the Enchant dictionary to a token.
 *
 * Nothing is done for tokens whose status is capitalized (T_CAPITAL,
 * T_CAPITAL_1ST, T_CAPITAL_SMALL), concatenated abbreviation, hyphenated or
 * possessive, nor for tokens whose string equals its upper-cased form.
 *
 * For every suggestion longer than one character and different from the
 * token string, the suggestion is looked up in m_dictionary. When the entry
 * is not empty, the suggestion is registered as an orthographic alternative
 * of the token and its linguistic infos, if any, are parsed into tokenData
 * as SPELLING_ALTERNATIVE.
 *
 * @param token     token to correct; receives the alternatives
 * @param tokenData morphosyntactic data filled from the dictionary entries
 * @param sp        strings pool used to index the accepted corrections
 */
void EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives(
  Token* token,
  MorphoSyntacticData* tokenData,
  FsaStringsPool& sp)
{
  MORPHOLOGINIT;
  // FIXME Conditions below could be process unit parameters
  const LimaString& tokenStr=token->stringForm();
  if (token->status().getAlphaCapital() == T_CAPITAL
    || token->status().getAlphaCapital() == T_CAPITAL_1ST
    || token->status().getAlphaCapital() == T_CAPITAL_SMALL
    || token->status().isAlphaConcatAbbrev()
    || token->status().isAlphaHyphen()
    || token->status().isAlphaPossessive()
    || tokenStr.toUpper() == tokenStr)
  {
    return;
  }
  std::vector<std::string> suggestions = m_enchantDictionary->suggest(tokenStr.toUtf8().constData());
  for (std::vector<std::string>::const_iterator it = suggestions.begin(); it != suggestions.end(); ++it)
  {
    LimaString correction = LimaString::fromUtf8(it->c_str());
    // FIXME Conditions below could be process unit parameters
    if ( correction.size() > 1 && correction != tokenStr )
    {
      // Automatic storage: the entry was previously allocated with new and
      // only deleted when empty, so every accepted correction leaked it.
      DictionaryEntry entry(m_dictionary->getEntry(correction));
      MorphoSyntacticDataHandler lingInfosHandler(*tokenData, SPELLING_ALTERNATIVE);

      if (!entry.isEmpty())
      {
        LINFO << "EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives correcting" << tokenStr << "into" << correction;
        // add orthographic alternative to Token;
        StringsPoolIndex idx=sp[correction];
        token->addOrthographicAlternatives(idx);

        if (entry.hasLingInfos())
        {
          entry.parseLingInfos(&lingInfosHandler);
        }
      }
    }
  }
}
Exemplo n.º 7
0
/**
 * Collects the normalizations of a source string found in a BoW text.
 *
 * A BoW token is kept only when the (start,end) pair returned by getStartEnd
 * is (1, source.size()+1). For each kept token, its lemma is associated with
 * the symbolic value of its category in the "MACRO" property manager of the
 * given language.
 *
 * @param source  the string whose normalizations are searched
 * @param bowText the BoW tokens to inspect
 * @param lang    media id used to get the "MACRO" property manager
 * @return a multimap from lemma to macro category symbolic value
 */
multimap<LimaString,string> extractNormalization(const LimaString& source,const BoWText& bowText,MediaId lang)
{
  const Common::PropertyCode::PropertyManager& macroManager = static_cast<const Common::MediaticData::LanguageData&>(MediaticData::single().mediaData(lang)).getPropertyCodeManager().getPropertyManager("MACRO");

  // Note: a former variant took the single BoW token directly when the text
  // held only one; every token going from the beginning to the end of the
  // source is now considered, whatever the size of the text.
  const int expectedStart = 1;
  const int expectedEnd = int(source.size()+1);

  multimap<LimaString,string> normalizations;
  for (BoWText::const_iterator token=bowText.begin(); token!=bowText.end(); ++token)
  {
    pair<int,int> bounds=getStartEnd(*token);
    if (bounds.first!=expectedStart || bounds.second!=expectedEnd)
    {
      continue;
    }
    normalizations.insert(make_pair(
                            (*token)->getLemma(),
                            macroManager.getPropertySymbolicValue((*token)->getCategory())));
  }
  return normalizations;
}
Exemplo n.º 8
0
 /** Returns true when s1.compare(s2) is negative, i.e. s1 orders before s2. */
 bool operator()(const LimaString & s1,
   const LimaString & s2) const
 {
   const int order = s1.compare(s2);
   return order < 0;
 }
Exemplo n.º 9
0
// Utility function converting a LimaString to an unsigned integer.
// The whole 64-bit range of the return type is parsed: the previous
// implementation used the 32-bit toUInt(), which yields 0 for any value
// above UINT_MAX. As before, 0 is returned when the conversion fails.
uint64_t LimaStringToInt(LimaString s)
{
  return static_cast<uint64_t>(s.toULongLong());
}