std::string EventParagraph::toString(std::string parentURI, uint64_t index, bool main) const { LIMA_UNUSED(parentURI); LIMA_UNUSED(index); LIMA_UNUSED(main); std::ostringstream oss; oss << "<rdf:rdf xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"" << " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"" << std::endl; oss << "<dc:creator>CEA LIST Specific entities extraction service</dc:creator>"; oss << "</rdf:rdf>"; // return oss.str(); std::string out; for(std::vector<Entity *>::const_iterator iT=m_evententities.second.begin();iT!=m_evententities.second.end();iT++) { } for(std::map<Common::MediaticData::EntityType,std::vector<Entity *> >::const_iterator iTm=m_otherentities.begin();iTm!=m_otherentities.end();iTm++) { for(std::vector<Entity *>::const_iterator iT=(*iTm).second.begin();iT!=(*iTm).second.end();iT++) { } } return out; }
bool XMLPropertyHandler::startElement(const QString & namespaceURI, const QString & name, const QString & qName, const QXmlAttributes & attributes) { LIMA_UNUSED(namespaceURI); LIMA_UNUSED(qName); PROPERTYCODELOGINIT; const QString& stringName = name; if (stringName == "property") { m_currentProp=PROP; m_properties.push_back(PropertyDescription()); m_properties.back().name = attributes.value("name").toUtf8().data(); LDEBUG << "read property " << m_properties.back().name; } else if (stringName == "subproperty") { m_currentProp=SUBPROP; m_subproperties.push_back(SubPropertyDescription()); m_subproperties.back().name = attributes.value("name").toUtf8().data(); m_subproperties.back().parentName = attributes.value("parent").toUtf8().data(); LDEBUG << "read subproperty " << m_subproperties.back().name << " of parent property " << m_subproperties.back().parentName; } else if (stringName == "value") { string value=attributes.value("name").toUtf8().data(); LDEBUG << "read value " << value; if (m_currentProp == PROP) { m_properties.back().values.push_back(value); } else if (m_currentProp == SUBPROP) { m_subproperties.back().values.back().second.push_back(value); } else { LERROR << "Don't know what to do with value " << value << " !"; } } else if (stringName == "subvalues") { string value=attributes.value("value").toUtf8().data(); LDEBUG << "read subvalues " << value; if (m_currentProp == SUBPROP) { m_subproperties.back().values.push_back(make_pair(value,vector<string>())); } else { LERROR << "Don't know what to do with subvalues " << value << " !"; } } return true; }
bool ExampleLoader::XMLHandler::startElement(const QString & namespaceURI, const QString & eltName, const QString & qName, const QXmlAttributes & attributes) { LIMA_UNUSED(namespaceURI); LIMA_UNUSED(qName); //PROCESSORSLOGINIT; //LDEBUG << "ExampleLoader::XMLHandler start element " << eltName; if (eltName=="w") { LinguisticCode posInt=static_cast<const Common::MediaticData::LanguageData&>(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertyValue(attributes.value("pos").toStdString()); m_tagIndex[attributes.value("p").toInt()] = posInt; } return true; }
bool XMLPropertyHandler::endElement(const QString & namespaceURI, const QString & name, const QString & qName) { LIMA_UNUSED(namespaceURI); LIMA_UNUSED(qName); const QString& stringName = name; if (m_currentProp==PROP && stringName == "property") { m_currentProp=NONE; } else if (m_currentProp==SUBPROP && stringName == "subproperty") { m_currentProp=NONE; } return true; }
/** @addtogroup ResourceConfiguration
 * - <b><group name="..." class="SentenceBoundsFinder"></b>
 */
// NOTE(review): the class attribute above names SentenceBoundsFinder although
// this is StopList::init -- looks like a copy/paste leftover; confirm the
// class name used to register this resource.
/**
 * Loads the stop list from the file named by the 'file' parameter of the
 * configuration group, resolved against the resources path.
 *
 * Lines are read one at a time and inserted into the list; reading stops at
 * the first line that comes back empty.
 * NOTE(review): this presumably also stops at a blank line in the middle of
 * the file, depending on what Common::Misc::readLine returns -- confirm.
 *
 * @param unitConfiguration configuration group; must contain the 'file' param
 * @param manager           unused
 * @throw InvalidConfiguration if the 'file' parameter is missing or the file
 *        cannot be opened
 */
void StopList::init(
  Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration,
  Manager* manager)
{
  LIMA_UNUSED(manager);
  DUMPERLOGINIT;

  const string& resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath();

  string stopListFileName;
  try
  {
    stopListFileName=resourcesPath+"/"+unitConfiguration.getParamsValueAtKey("file");
  }
  catch (Common::XMLConfigurationFiles::NoSuchParam& )
  {
    LERROR << "No param 'file' in StopList configuration group ! ";
    throw InvalidConfiguration();
  }

  std::ifstream stopListFile(stopListFileName.c_str(), std::ifstream::binary);
  if (!stopListFile)
  {
    LERROR << "invalid file " << stopListFileName;
    throw InvalidConfiguration();
  }

  LimaString entry = Common::Misc::utf8stdstring2limastring(Common::Misc::readLine(stopListFile));
  LDEBUG << "Loading stop list file: " << stopListFileName;
  for ( ; !entry.isEmpty();
        entry = Common::Misc::utf8stdstring2limastring(Common::Misc::readLine(stopListFile)))
  {
    insert(entry);
  }
}
bool SpecificEntitiesLoader::XMLHandler::endElement(const QString & namespaceURI, const QString & eltName, const QString & qName) { LIMA_UNUSED(namespaceURI); LIMA_UNUSED(qName); //LOGINIT("LP::SpecificEntities"); //LDEBUG << "SpecificEntitiesLoader::XMLHandler end element " << toString(eltName); string name=toString(eltName); if (name=="specific_entity") { LOGINIT("LP::SpecificEntities"); LDEBUG << "SpecificEntitiesLoader::XMLHandler add SE " << m_type << "," << m_position << "," << m_length << "," << m_graph; addSpecificEntity(m_analysis, m_graph, m_string, m_type, m_position, m_length); } // no more current element m_currentElement=""; return true; }
/**
 * Writes the replacement text accumulated for a tag and resets the
 * accumulators.
 *
 * The content of @p tag is written to @p output. If @p endSentenceTag is not
 * empty and occurs in the content of @p tagValue, the first character of the
 * replacement (when there is one) is turned into a '.'. Both @p tag and
 * @p tagValue are emptied before returning.
 *
 * @param status         unused
 * @param output         stream receiving the replacement text
 * @param wide           unused
 * @param endSentenceTag text identifying an end-of-sentence tag; empty to
 *                       disable the '.' substitution
 * @param tag            replacement characters accumulated for the tag
 * @param tagValue       raw text accumulated for the tag
 */
void TextFormater::putTag(std::ostream& status, std::ostream& output, bool wide, const std::string& endSentenceTag, std::ostringstream& tag, std::ostringstream& tagValue) const
{
  LIMA_UNUSED(status);
  LIMA_UNUSED(wide);
  LILOGINIT;

  std::string replacement = tag.str();

  const bool endsSentence =
    !endSentenceTag.empty()
    && tagValue.str().find(endSentenceTag) != std::string::npos;
  if (endsSentence && !replacement.empty())
  {
    replacement[0] = '.';
  }

  output << replacement;

  tag.str("");
  tagValue.str("");
}
bool SpecificEntitiesLoader::XMLHandler:: startElement(const QString & namespaceURI, const QString & eltName, const QString & qName, const QXmlAttributes & attributes) { LIMA_UNUSED(namespaceURI); LIMA_UNUSED(qName); LIMA_UNUSED(attributes); //LOGINIT("LP::SpecificEntities"); //LDEBUG << "SpecificEntitiesLoader::XMLHandler start element " << toString(eltName); m_currentElement=toString(eltName); if (m_currentElement=="specific_entity") { // clear stored values m_string=""; m_type=""; m_position=0; m_length=0; } return true; }
/**
 * Writes the filler replacing a tag that followed a white space, then
 * forwards the character read after the tag.
 *
 * The filler is made of nb/2 fill characters, then @p nbNewLines newlines,
 * then the remaining nb - nb/2 fill characters. If @p endSentenceTag is not
 * empty and occurs in the content of @p tagValue, the first character of the
 * filler (when there is one) is turned into a '.'.
 *
 * @p tagValue is emptied once it has been examined, as putTag() does, so
 * that an end-of-sentence tag does not keep matching for every later tag.
 *
 * @p nb and @p nbNewLines are received by value: the caller's counters are
 * not modified by this method.
 *
 * @param status         unused
 * @param output         stream receiving the filler
 * @param wide           unused
 * @param endSentenceTag text identifying an end-of-sentence tag; empty to
 *                       disable the '.' substitution
 * @param tagValue       raw text accumulated for the tag; emptied on return
 * @param carLuTarget    stream receiving @p carLu after the filler is written
 * @param carLu          character read just after the tag
 * @param fillChar       character used as filler
 * @param nb             number of fill characters to write
 * @param nbNewLines     number of newlines to write in the middle of the filler
 */
void TextFormater::putWhites(std::ostream& status, std::ostream& output, bool wide, const std::string& endSentenceTag, std::ostringstream& tagValue, std::ostream& carLuTarget, char carLu, char fillChar, uint64_t nb, uint64_t nbNewLines) const
{
  LIMA_UNUSED(status);
  LIMA_UNUSED(wide);

  // Half of the filler before the newlines, the rest (one more when nb is
  // odd) after them.
  const uint64_t leading = nb / 2;
  std::string whites(leading, fillChar);
  whites.append(nbNewLines, '\n');
  whites.append(nb - leading, fillChar);

  if ( (endSentenceTag != "")
       && (tagValue.str().find(endSentenceTag,0) != std::string::npos)
       && (whites.length() > 0) )
  {
    whites[0] = '.';
  }
  output << whites;

  // The tag has been handled: its raw text must not leak into the next one.
  tagValue.str("");

  carLuTarget << carLu;
}
/**
 * Returns, as a decimal string, the position of the token attached to
 * vertex @p v of @p graph.
 *
 * @param graph    analysis graph holding the vertex; must not be null
 * @param v        vertex whose token is looked up
 * @param analysis unused
 * @return the token position formatted as text, or an empty string when the
 *         vertex has no token
 */
std::string FeatureStoredData::getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, LinguisticGraphVertex v, AnalysisContent &analysis) const
{
  // Terminated explicitly so the statement is well formed whether or not the
  // macro expansion carries its own semicolon.
  LIMA_UNUSED(analysis);

  Token* token=get(vertex_token,*(graph->getGraph()),v);
  if (token==0)
  {
    return "";
  }
  std::ostringstream oss;
  oss << token->position();
  return oss.str();
}
/**
 * Records a semantic relation of type m_semanticRelationType between two
 * vertices.
 *
 * The relation is stored in the "SemanticRelationData" entry of
 * @p analysis; the entry is created and registered when it does not exist
 * yet.
 * NOTE(review): the newly allocated SemanticRelationData is presumably owned
 * by @p analysis once passed to setData() -- confirm.
 *
 * @param anagraph unused
 * @param vertex1  first vertex of the relation
 * @param vertex2  second vertex of the relation
 * @param analysis analysis content holding the semantic relation data
 * @return the value returned by SemanticRelationData::relation()
 */
bool CreateSemanticRelation::operator()(const LinguisticAnalysisStructure::AnalysisGraph& anagraph,
                                        const LinguisticGraphVertex& vertex1,
                                        const LinguisticGraphVertex& vertex2,
                                        AnalysisContent& analysis ) const
{
  LIMA_UNUSED(anagraph);

  SemanticRelationData* relations=static_cast<SemanticRelationData*>(analysis.getData("SemanticRelationData"));
  if (!relations)
  {
    // First relation for this analysis: create and register the container.
    relations=new SemanticRelationData();
    analysis.setData("SemanticRelationData",relations);
  }

  return relations->relation(vertex1,vertex2,m_semanticRelationType);
}
/** This method replaces all the tags in the UTF-8 encoded input stream by spaces or underscores in the output stream. Underscores are written when the tag is adjacent to a word (directly or by another) tag. Spaces are written otherwise. If an unclosed tag finishes the input stream, the result is written but an error message is written in <I>status</I> and a non zero-value is returned. If an opening '<' is found inside a tag, it is ignored (replaced) but an error message is written in <I>status</I> and a non zero-value will be returned. If a closing '>' is found outside a tag, it is ignored (replaced) but an error message is written in <I>status</I> and a non zero-value will be returned. This method does not modify its receiver and thus is constant. Throws a runtime_error in case of an unknown internal state It does not record the removed tags. Should be done in another method; @param std::ostream& status the stream where errors and warning will be written @param std::istream& input the input stream. Should contain a valid SGML file @param std::ostream& output the output stream. The result stream where un-tagged results are written @return int 0 if there is no error nor warning ; non-zero otherwise. 
*/ int TextFormater::untaggingWithSpaces(std::ostream& status, std::istream& input, std::ostream& output, bool wide, const std::string& endSentenceTag) const { LIMA_UNUSED(wide); LILOGINIT; setlocale(LC_CTYPE,"fr_FR.UTF-8"); size_t nb = 0; // the number of chars in the current tag when we cannot know // if it is bond to the following text (tag after a white space) size_t nbNewLines = 0; // the number of newlines in the current tag when we cannot know // if it is bond to the following text (tag after a white space) size_t position=0; // position in the input stream; enum RetVal {SUCCESS, INVALID_OPENING_TAG_CHAR, INVALID_CLOSING_TAG_CHAR, DUPLICATED_OPENING_TAG_CHAR, DUPLICATED_CLOSING_TAG_CHAR, UNCLOSED_OPENING_TAG_CHAR}; RetVal retVal = SUCCESS; enum Etat {TEXT, DEBCOL, FINCOL, BLANC, DEBBLANC, FINBLANC, BEGENTITY, ENTITY}; Etat etat = BLANC; char carLu; char carLu2; std::string s; std::ostringstream txt; std::ostringstream tag; std::ostringstream tagValue; std::ostringstream entity; while (input.good()) { input.get(carLu); LDEBUG << carLu; if (input.eof()) continue; switch (etat) { case TEXT: switch (carLu) { case '<': output << txt.str(); txt.str(""); tag << '_'; tag << ' '; tagValue << carLu; LDEBUG << "TEXT-> DEBCOL" << LENDL; etat = DEBCOL; break; case '>': txt << carLu; status << "Invalid '>' character at " << position << std::endl; retVal = INVALID_CLOSING_TAG_CHAR; break; case '&': output << txt.str(); txt.str(""); entity << carLu; LDEBUG << "TEXT -> BEGENTITY;" << LENDL; etat = BEGENTITY; break; case ' ':case '\t':case '\n': output << txt.str();; output << carLu; txt.str(""); LDEBUG << "TEXT-> BLANC" << LENDL; etat = BLANC; break; default: txt << carLu; break; } break; case BEGENTITY: switch (carLu) { case 'A':;case 'B':;case 'C':;case 'D':;case 'E':;case 'F':;case 'G':; case 'H':;case 'I':;case 'J':;case 'K':;case 'L':;case 'M':;case 'N':; case 'O':;case 'P':;case 'Q':;case 'R':;case 'S':;case 'T':;case 'U':; case 'V':;case 'W':;case 'X':;case 'Y':;case 
'Z':; case 'a':;case 'b':;case 'c':;case 'd':;case 'e':;case 'f':;case 'g':; case 'h':;case 'i':;case 'j':;case 'k':;case 'l':;case 'm':;case 'n':; case 'o':;case 'p':;case 'q':;case 'r':;case 's':;case 't':;case 'u':; case 'v':;case 'w':;case 'x':;case 'y':;case 'z': entity << carLu; LDEBUG << "BEGENTITY-> ENTITY" << LENDL; etat = ENTITY; break; case '<': output.put('_'); output.put(' '); entity.str(""); tag << '_'; tag << ' '; LDEBUG << "BEGENTITY-> DEBCOL" << LENDL; etat = DEBCOL; break; case ' ':case '\t':case '\n': // output.put('_'); output.put(' '); output << carLu; entity.str(""); LDEBUG << "BEGENTITY-> BLANC" << LENDL; etat = BLANC; break; default: // output.put('_'); output.put(' '); output << carLu; entity.str(""); LDEBUG << "BEGENTITY-> TEXT" << LENDL; etat = TEXT; break; } break; case ENTITY: switch (carLu) { case 'A':;case 'B':;case 'C':;case 'D':;case 'E':;case 'F':;case 'G':; case 'H':;case 'I':;case 'J':;case 'K':;case 'L':;case 'M':;case 'N':; case 'O':;case 'P':;case 'Q':;case 'R':;case 'S':;case 'T':;case 'U':; case 'V':;case 'W':;case 'X':;case 'Y':;case 'Z':; case 'a':;case 'b':;case 'c':;case 'd':;case 'e':;case 'f':;case 'g':; case 'h':;case 'i':;case 'j':;case 'k':;case 'l':;case 'm':;case 'n':; case 'o':;case 'p':;case 'q':;case 'r':;case 's':;case 't':;case 'u':; case 'v':;case 'w':;case 'x':;case 'y':;case 'z': entity << carLu; break; case '<': for (uint64_t i = 0; i < entity.str().size(); i++) { // output << '_'; output << ' '; } entity.str(""); // tag << '_'; tag << ' '; LDEBUG << "ENTITY-> DEBCOL" << LENDL; etat = DEBCOL; break; case ' ':case '\t':case '\n': for (uint64_t i = 0; i < entity.str().size(); i++) { // output << '_'; output << ' '; } output << carLu; entity.str(""); LDEBUG << "ENTITY-> BLANC" << LENDL; etat = BLANC; break; case ';': for (uint64_t i = 0; i < entity.str().size()+1; i++) { // output << '_'; output << ' '; } entity.str(""); LDEBUG << "ENTITY-> TEXT" << LENDL; etat = TEXT; break; default: for (uint64_t i = 0; i 
< entity.str().size(); i++) { // output << '_'; output << ' '; } output << carLu; entity.str(""); LDEBUG << "ENTITY-> TEXT" << LENDL; etat = TEXT; break; } break; case DEBCOL: tagValue << carLu; switch (carLu) { case '<': // tag << '_'; tag << ' '; status << "Invalid '<' character at " << position << std::endl; retVal = DUPLICATED_OPENING_TAG_CHAR; break; case '>': // tag << '_'; tag << ' '; LDEBUG << "DEBCOL-> FINCOL" << LENDL; etat = FINCOL; break; case ' ':case '\t':case '\n': tag << carLu; break; default: LDEBUG << "Looking at " << carLu << LENDL; char buf[MB_LEN_MAX]; buf[0] = carLu; input.rdbuf()-> sgetn(buf+1, 9); wchar_t mbc; int transRes = mbtowc(&mbc, buf, MB_LEN_MAX); LDEBUG << "transres value is " << transRes << LENDL; if (transRes > 1) { LDEBUG << "Got a multibyte char inside tag: " << mbc << LENDL; for (int i = 1; i < transRes; i++) { input.get(carLu); } } // tag << '_'; tag << ' '; break; } break; case FINCOL: switch (carLu) { case '<': // tag << '_'; tag << ' '; tagValue << carLu; LDEBUG << "FINCOL-> DEBCOL" << LENDL; etat = DEBCOL; break; case '>': // tag << '_'; tag << ' '; tagValue << carLu; status << "Invalid '>' character at " << position << std::endl; retVal = DUPLICATED_CLOSING_TAG_CHAR; break; case ' ':case '\t':case '\n': putTag(status, output, wide, endSentenceTag, tag, tagValue); output << carLu; LDEBUG << "FINCOL-> BLANC" << LENDL; etat = BLANC; break; case '&': putTag(status, output, wide, endSentenceTag, tag, tagValue); entity << carLu; LDEBUG << "FINCOL -> BEGENTITY;" << LENDL; etat = BEGENTITY; break; default: putTag(status, output, wide, endSentenceTag, tag, tagValue); txt << carLu; tagValue << carLu; LDEBUG << "FINCOL-> TEXT" << LENDL; etat = TEXT; break; } break; case BLANC: switch (carLu) { case '&': entity << carLu; LDEBUG << "BLANC -> BEGENTITY;" << LENDL; etat = BEGENTITY; break; case '<': nb = 1; nbNewLines = 0; tagValue << carLu; LDEBUG << "BLANC-> DEBBLANC" << LENDL; etat = DEBBLANC; break; case '>': output << '>'; status 
<< "Invalid '>' character at " << position << std::endl; retVal = INVALID_CLOSING_TAG_CHAR; LDEBUG << "BLANC-> TEXT" << LENDL; etat = TEXT; break; case ' ':case '\t':case '\n': output << carLu; break; default: txt << carLu; LDEBUG << "BLANC-> TEXT" << LENDL; etat = TEXT; break; } break; case DEBBLANC: tagValue << carLu; switch (carLu) { case '<': nb++; status << "Duplicated '<' character at " << position << std::endl; retVal = DUPLICATED_OPENING_TAG_CHAR; break; case '>': nb++; LDEBUG << "DEBBLANC-> FINBLANC" << LENDL; etat = FINBLANC; break; case '\n': nbNewLines++; break; default: LDEBUG << "Looking at " << carLu << LENDL; char buf[MB_LEN_MAX]; buf[0] = carLu; std::streamsize got = input.rdbuf()-> sgetn(buf+1, 9); for (std::streamsize i = 0; i < got ; i++) input.rdbuf()-> sungetc(); for (uint64_t i = 0; i<MB_LEN_MAX; i++) LDEBUG << buf[i]; LDEBUG << LENDL; wchar_t mbc; int transRes = mbtowc(&mbc, buf, MB_LEN_MAX); LDEBUG << "transres is " << transRes << LENDL; if (transRes > 1) { LDEBUG << "Got a multibyte char inside tag: " << mbc << LENDL; for (int i = 1; i < transRes; i++) { input.get(carLu); } } nb++; break; } break; case FINBLANC: switch (carLu) { case '<': nb++; tagValue << carLu; LDEBUG << "FINBLANC-> DEBBLANC" << LENDL; etat = DEBBLANC; break; case '>': status << "Duplicated '>' character at " << position << std::endl; retVal = DUPLICATED_CLOSING_TAG_CHAR; putWhites(status, output, wide,endSentenceTag, tagValue, output, carLu, ' ', nb, nbNewLines); LDEBUG << "FINBLANC-> TEXT" << LENDL; etat = TEXT; break; case ' ':case '\t':case '\n': putWhites(status, output, wide,endSentenceTag, tagValue, output, carLu, ' ', nb, nbNewLines); LDEBUG << "FINBLANC-> BLANC" << LENDL; etat = BLANC; break; case '&': // putWhites(status, output, wide,endSentenceTag, tagValue, // entity, carLu, '_', nb, nbNewLines); putWhites(status, output, wide,endSentenceTag, tagValue, entity, carLu, ' ', nb, nbNewLines); LDEBUG << "FINBLANC -> BEGENTITY;" << LENDL; etat = BEGENTITY; break; 
default: // putWhites(status, output, wide,endSentenceTag, tagValue, // txt, carLu, '_', nb, nbNewLines); putWhites(status, output, wide,endSentenceTag, tagValue, txt, carLu, ' ', nb, nbNewLines); LDEBUG << "FINBLANC-> TEXT" << LENDL; etat = TEXT; break; } break; default: throw std::runtime_error((std::string("unknown state %d.\n", int(etat))).c_str()); } ++position; } if ( (etat == DEBCOL) || (etat == DEBBLANC) ) { status << "Unclosed tag at EOF (" << position << ")" << std::endl; retVal = UNCLOSED_OPENING_TAG_CHAR; if (etat == DEBCOL) { output << tag.str(); } else { // s = std::string(nb/2, '_'); s = std::string(nb/2, ' '); s.append(nbNewLines, '\n'); // if (nb%2 == 0) s.append(nb/2, '_'); if (nb%2 == 0) s.append(nb/2, ' '); // else s.append(nb/2 + 1, '_'); else s.append(nb/2 + 1, ' '); output << s; nb = 0; nbNewLines = 0; } } else if ((etat == FINCOL) || (etat == FINBLANC)) { output << tag.str(); } else if (etat == TEXT) { output << txt.str(); } else {} // BLANC nothing to do return int(retVal); }