PAmazonItem TAmazonItem::New(const PXmlDoc& XmlDoc){
  // Builds a new Amazon item from its parsed XML description:
  // scalar fields (item-id, title), the author list, and the
  // cross-sell ("customers also bought") item-id list.
  PAmazonItem Item=PAmazonItem(new TAmazonItem());
  // scalar fields
  Item->ItemId=XmlDoc->GetTagTok("AmazonItem|ItemId")->GetTokStr(false);
  Item->TitleStr=XmlDoc->GetTagTok("AmazonItem|Title")->GetTokStr(false);
  // author names
  TXmlTokV AuthorTokV;
  XmlDoc->GetTagTokV("AmazonItem|Authors|Name", AuthorTokV);
  for (int TokN=0; TokN<AuthorTokV.Len(); TokN++){
    Item->AuthorNmV.Add(AuthorTokV[TokN]->GetTokStr(false));
  }
  // cross-sell item-ids
  TXmlTokV XSellTokV;
  XmlDoc->GetTagTokV("AmazonItem|XSell|ItemId", XSellTokV);
  for (int TokN=0; TokN<XSellTokV.Len(); TokN++){
    Item->NextItemIdV.Add(XSellTokV[TokN]->GetTokStr(false));
  }
  return Item;
}
/////////////////////////////////////////////////
// SkyGrid-Document
// Converts a directory of SkyGrid news XML files into one binary file of
// serialized TSkyGridBinDoc records.
//   InXmlFPath ... directory containing the input XML files
//   OutBinFNm  ... output binary file name
//   MxDocs     ... maximal number of documents to process (-1 == no limit)
void TSkyGridBinDoc::SaveBinDocV(
 const TStr& InXmlFPath, const TStr& OutBinFNm, const int& MxDocs){
 printf("Processing SkyGrid-News-Xml files from '%s'...\n", InXmlFPath.CStr());
 TFOut SOut(OutBinFNm);
 // recursively traverse the input directory
 TFFile FFile(InXmlFPath, true); TStr FNm;
 int Docs=0; int DateDocs=0; uint64 PrevTm=0;
 while (FFile.Next(FNm)){
  if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
  //printf("  Processing '%s' ...", FNm.CStr());
  // parse one news item; all fields live under <item><content>
  PXmlDoc XmlDoc=TXmlDoc::LoadTxt(FNm);
  PXmlTok ContentTok=XmlDoc->GetTagTok("item|content");
  TStr SwIdStr=ContentTok->GetTagTok("swid")->GetArgVal("value");
  TStr UrlStr=ContentTok->GetTagTok("url")->GetTokStr(false);
  TStr TitleStr=ContentTok->GetTagTok("title")->GetTokStr(false);
  TStr FetchedValStr=ContentTok->GetTagTok("fetched")->GetArgVal("value");
  TXmlTokV EntityTokV; ContentTok->GetTagTokV("annotations|entity", EntityTokV);
  TStr BodyStr=ContentTok->GetTagTok("body")->GetTokStr(false);
  // extract date -- assumes the swid value starts with YYYYMMDD
  // (TODO confirm against the SkyGrid feed format)
  TStr DateStr=SwIdStr.GetSubStr(0, 7);
  TStr YearStr=DateStr.GetSubStr(0, 3);
  TStr MonthStr=DateStr.GetSubStr(4, 5);
  TStr DayStr=DateStr.GetSubStr(6, 7);
  TTm DateTm(YearStr.GetInt(), MonthStr.GetInt(), DayStr.GetInt());
  uint64 Tm=TTm::GetMSecsFromTm(DateTm);
  // extract entities: count occurrences per entity name, preferring the
  // canonical form when present, otherwise the surface text
  TStrIntH EntNmToFqH;
  for (int EntityTokN=0; EntityTokN<EntityTokV.Len(); EntityTokN++){
   PXmlTok EntityTok=EntityTokV[EntityTokN];
   if (!EntityTok->IsTag("entity")){continue;}
   TStr CanonicalNm=EntityTok->GetArgVal("canonical", "");
   TStr TextStr=EntityTok->GetArgVal("text", "");
   TStr TypeNm=EntityTok->GetArgVal("type", "");
   TStr EntNm=CanonicalNm.Empty() ? TextStr : CanonicalNm;
   EntNmToFqH.AddDat(EntNm)++;
  }
  // sort (frequency, entity-name) pairs by decreasing frequency
  TIntStrPrV FqEntNmPrV; EntNmToFqH.GetDatKeyPrV(FqEntNmPrV);
  FqEntNmPrV.Sort(false);
  // extract headline: first ~250 characters of the body, trimmed back to
  // the last word boundary, with an ellipsis appended
  TChA HeadlineChA=BodyStr.GetSubStr(0, 250);
  while ((HeadlineChA.Len()>0)&&(HeadlineChA.LastCh()!=' ')){
   HeadlineChA.Trunc(HeadlineChA.Len()-1);}
  HeadlineChA+="...";
  // create document
  TSkyGridBinDoc Doc(SwIdStr, Tm, TitleStr, HeadlineChA, FqEntNmPrV);
  // save document
  Doc.Save(SOut);
  // screen log: reset the per-day counter whenever the date changes
  if (PrevTm!=Tm){
   if (PrevTm!=0){printf("\n");}
   PrevTm=Tm; DateDocs=0;
  }
  Docs++; DateDocs++;
  printf("  %s [Day:%d / All:%d]\r", DateStr.CStr(), DateDocs, Docs);
 }
 printf("\nDone.\n");
}
// Loads the Reuters-21578 corpus (.SGM files in FPath) into a bag-of-words
// document base. Train/test membership follows the LEWISSPLIT attribute,
// restricted to documents with TOPICS=="YES".
//   MxDocs               ... maximal number of documents (-1 == no limit)
//   SwSetTypeNm          ... stop-word set name
//   StemmerTypeNm        ... stemmer name
//   MxNGramLen/MnNGramFq ... n-gram parameters; (1,1) disables the n-gram base
//   SaveDocP             ... keep the raw document text in the base
PBowDocBs TBowFl::LoadReuters21578Txt(
 const TStr& FPath, const int& MxDocs, const TStr& SwSetTypeNm,
 const TStr& StemmerTypeNm, const int& MxNGramLen, const int& MnNGramFq,
 const bool& SaveDocP, const PNotify& Notify){
 // prepare stop-words
 PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
 // prepare stemmer
 PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
 // create ngrams -- only when something beyond plain unigrams is requested
 PNGramBs NGramBs;
 if (!((MxNGramLen==1)&&(MnNGramFq==1))){
  NGramBs=TNGramBs::GetNGramBsFromReuters21578(
   FPath, MxDocs, MxNGramLen, MnNGramFq, SwSet, Stemmer);
 }
 // create document-base
 PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
 // traverse directory of SGML files (each holds many <REUTERS> documents)
 TFFile FFile(FPath, ".SGM", false); TStr FNm;
 int Docs=0;
 while (FFile.Next(FNm)){
  printf("Processing file '%s'\n", FNm.CStr());
  TIntH DocWIdToFqH(100); // NOTE(review): declared but never used below
  TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
  for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
   Docs++; if (Docs%100==0){printf("%d\r", Docs);}
   if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
   // get document-name from the NEWID attribute
   PXmlDoc Doc=LDocV[LDocN];
   PXmlTok DocTok=Doc->GetTok();
   TStr DocNm=DocTok->GetArgVal("NEWID");
   // get document-categories (<D> entries under <TOPICS>)
   TStrV CatNmV;
   TXmlTokV TopicsTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopicsTokV);
   for (int TopicsTokN=0; TopicsTokN<TopicsTokV.Len(); TopicsTokN++){
    TStr CatNm=TopicsTokV[TopicsTokN]->GetTokStr(false);
    CatNmV.Add(CatNm);
   }
   // get document-contents (whole <TEXT> element)
   PXmlTok DocStrTok=Doc->GetTagTok("REUTERS|TEXT");
   TStr DocStr=DocStrTok->GetTokStr(false);
   // add document to bow
   int DId=BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP);
   // train & test split per the ModLewis convention
   if ((DocTok->GetArgVal("LEWISSPLIT")=="TRAIN")&&(DocTok->GetArgVal("TOPICS")=="YES")){
    BowDocBs->AddTrainDId(DId);}
   if ((DocTok->GetArgVal("LEWISSPLIT")=="TEST")&&(DocTok->GetArgVal("TOPICS")=="YES")){
    BowDocBs->AddTestDId(DId);}
  }
  if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
 }
 // return results
 BowDocBs->AssertOk();
 return BowDocBs;
}
// Converts the Reuters-21578 corpus (.SGM files in InFPath) into a single
// .cpd file of serialized TCpDoc records.
//   MxDocs ... maximal number of documents to convert (-1 == no limit)
void TCpDoc::SaveReuters21578ToCpd(
 const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){
 // create output file
 PSOut SOut=TFOut::New(OutCpdFNm);
 // traverse directory with .sgm files
 TFFile FFile(InFPath, ".SGM", false); TStr FNm;
 int Docs=0;
 while (FFile.Next(FNm)){
  printf("Processing file '%s'\n", FNm.CStr());
  TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
  for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
   Docs++; if (Docs%100==0){printf("%d\r", Docs);}
   if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
   // create reuters document
   PCpDoc CpDoc=TCpDoc::New();
   // load xml document
   PXmlDoc Doc=LDocV[LDocN];
   PXmlTok DocTok=Doc->GetTok();
   // document id (NEWID attribute)
   CpDoc->DocNm=DocTok->GetArgVal("NEWID");
   // date
   CpDoc->DateStr=Doc->GetTagTok("REUTERS|DATE")->GetTokStr(false);
   // document title (optional element)
   PXmlTok TitleTok=Doc->GetTagTok("REUTERS|TEXT|TITLE");
   if (!TitleTok.Empty()){
    CpDoc->TitleStr=TitleTok->GetTokStr(false);}
   // dateline (optional element)
   PXmlTok DatelineTok=Doc->GetTagTok("REUTERS|TEXT|DATELINE");
   if (!DatelineTok.Empty()){
    CpDoc->DatelineStr=DatelineTok->GetTokStr(false);}
   // get text string
   TStr TextStr;
   PXmlTok BodyTok=Doc->GetTagTok("REUTERS|TEXT|BODY");
   if (!BodyTok.Empty()){
    TextStr=BodyTok->GetTokStr(false);
   } else {
    // if <BODY> doesn't exist, take the whole <TEXT>
    PXmlTok TextTok=Doc->GetTagTok("REUTERS|TEXT");
    if (!TextTok.Empty()){
     TextStr=TextTok->GetTokStr(false);
    }
   }
   // whole text is stored as a single paragraph
   CpDoc->ParStrV.Add(TextStr, 1);
   // topic categories (<D> entries under <TOPICS>)
   TXmlTokV TopCatTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopCatTokV);
   for (int TokN=0; TokN<TopCatTokV.Len(); TokN++){
    TStr CatNm=TopCatTokV[TokN]->GetTokStr(false);
    CpDoc->TopCdNmV.Add(CatNm);
   }
   // save cpd document
   CpDoc->Save(*SOut);
  }
  if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
 }
}
TIntV GetEmailIds(const PXmlTok& QueryXml, const TStr& TagPath) {
  // Collects the distinct integer "id" attribute values of all tokens
  // found under TagPath; tokens without a valid id (default -1) are skipped.
  TXmlTokV IdTokV;
  QueryXml->GetTagTokV(TagPath, IdTokV);
  TIntV ResIdV;
  for (int TokN = 0; TokN < IdTokV.Len(); TokN++) {
    const TInt IdVal = IdTokV[TokN]->GetIntArgVal("id", -1);
    if (IdVal == -1) { continue; }
    ResIdV.AddUnique(IdVal);
  }
  return ResIdV;
}
void GetKeywords(const PXmlTok& QueryXml, const TStr& TagPath, TStrV& KeywordsV, TStrV& IgnoreKeywordsV) { TXmlTokV KwsXmlV; QueryXml->GetTagTokV(TagPath, KwsXmlV); for (int KwInd = 0; KwInd < KwsXmlV.Len(); KwInd++) { TStr Kw = KwsXmlV[KwInd]->GetTokStr(false); int hide = KwsXmlV[KwInd]->GetIntArgVal("hide", 0); if (hide) IgnoreKeywordsV.Add(Kw); else KeywordsV.Add(Kw); } }
TStr TDzsBsDoc::GetDataTokVStr(const TXmlTokV& TokV, const TStr& SepStr){
  // Joins the data strings of all tokens in TokV into one string,
  // placing SepStr between consecutive entries.
  TChA JoinChA;
  if (TokV.Len()>0){
    JoinChA+=GetDataTokStr(TokV[0]);
    for (int TokN=1; TokN<TokV.Len(); TokN++){
      JoinChA+=SepStr;
      JoinChA+=GetDataTokStr(TokV[TokN]);
    }
  }
  return JoinChA;
}
// Loads a sentence-aligned parallel corpus from a TEI-style XML file:
// the TEI/teiHeader/text/body wrappers are skipped, then one <div>
// document is parsed at a time and every link of type "1:1" contributes
// one sentence pair. Stops once MxSents sentences are read (-1 == all).
PTransCorpus TTransCorpus::LoadAC(const TStr& InXmlFNm, const int& MxSents) {
    // prepare prset structures
    PTransCorpus TransCorpus = TTransCorpus::New();
    // we load xml by skiping first tags
    PSIn XmlSIn=TFIn::New(InXmlFNm);
    TXmlDoc::SkipTopTag(XmlSIn); // ignore TEI
    // the teiHeader is read as a full document and discarded
    printf("Ignoring: %s\n", TXmlDoc::LoadTxt(XmlSIn)->GetTok()->GetTagNm().CStr()); // ignore teiHeader
    TXmlDoc::SkipTopTag(XmlSIn); // ignore text
    TXmlDoc::SkipTopTag(XmlSIn); // ignore body
    PXmlDoc XmlDoc;
    int XmlDocs = 0, SentId = 0;;
    forever{
        // load xml tree (one <div> per iteration)
        XmlDocs++;
        printf("%7d Sentences \r", SentId);
        XmlDoc=TXmlDoc::LoadTxt(XmlSIn);
        // stop if at the last tag
        if (!XmlDoc->IsOk()) {
            /*printf("Error: %s\n", XmlDoc->GetMsgStr().CStr());*/
            break;
        }
        // extract documents from xml-trees
        PXmlTok TopTok=XmlDoc->GetTok();
        if (TopTok->IsTag("div")){
            // extract document Id
            // NOTE(review): DocNm is extracted but not used below
            TStr DocNm = TopTok->GetArgVal("n");
            // and paragraphs: every alignment link inside linkGrp
            TXmlTokV LinkTokV;
            TopTok->GetTagTokV("linkGrp|link", LinkTokV);
            for (int LinkTokN = 0; LinkTokN < LinkTokV.Len(); LinkTokN++) {
                PXmlTok LinkTok = LinkTokV[LinkTokN];
                TStr LinkType = LinkTok->GetArgVal("type");
                // skip if paragraph for one language is empty
                // (only one-to-one aligned sentence pairs are kept)
                if (LinkType == "1:1") {
                    TXmlTokV S1TokV; LinkTok->GetTagTokV("s1", S1TokV);
                    TXmlTokV S2TokV; LinkTok->GetTagTokV("s2", S2TokV);
                    IAssert(S1TokV.Len() == 1);
                    IAssert(S2TokV.Len() == 1);
                    TStr ParaStr1 = S1TokV[0]->GetTagTokStr("");
                    TStr ParaStr2 = S2TokV[0]->GetTagTokStr("");
                    TransCorpus->AddSentenceNoTrans(SentId, ParaStr1, ParaStr2);
                    SentId++;
                }
            }
        } else {
            printf("Unknow tag: %s\n", TopTok->GetTagNm().CStr());
        }
        if ((MxSents != -1) && (TransCorpus->GetSentences() > MxSents)) { break; }
    }
    printf("\n");
    // finish
    return TransCorpus;
}
// Parses one Reuters Corpus Volume 1 news-item XML file and returns its
// fields through the output parameters. Byline/dateline are optional and
// default to "". The metadata code lists are split by their class
// attribute into topic, geographic and industry codes; any other class
// aborts via Fail.
void TCpDoc::LoadReuters2000DocFromXml(const TStr& FNm,
 TStr& DocId, TStr& DateStr, TStr& TitleStr, TStr& HeadlineStr,
 TStr& BylineStr, TStr& DatelineStr, TStrV& ParStrV,
 TStrV& TopCdNmV, TStrV& GeoCdNmV, TStrV& IndCdNmV){
 PXmlDoc Doc=TXmlDoc::LoadTxt(FNm);
 // get text strings
 // general document data
 DocId=Doc->GetTagTok("newsitem")->GetArgVal("itemid");
 DateStr=Doc->GetTagTok("newsitem")->GetArgVal("date");
 TitleStr=Doc->GetTagTok("newsitem|title")->GetTokStr(false);
 HeadlineStr=Doc->GetTagTok("newsitem|headline")->GetTokStr(false);
 // optional byline
 BylineStr="";
 PXmlTok BylineTok;
 if (Doc->IsTagTok("newsitem|byline", BylineTok)){
  BylineStr=BylineTok->GetTokStr(false);}
 // optional dateline
 DatelineStr="";
 PXmlTok DatelineTok;
 if (Doc->IsTagTok("newsitem|dateline", DatelineTok)){
  DatelineStr=DatelineTok->GetTokStr(false);}
 // text paragraphs: one entry per <p> element
 ParStrV.Clr();
 TXmlTokV ParTokV; Doc->GetTagTokV("newsitem|text|p", ParTokV);
 for (int ParTokN=0; ParTokN<ParTokV.Len(); ParTokN++){
  TStr ParStr=ParTokV[ParTokN]->GetTokStr(false);
  ParStrV.Add(ParStr);
 }
 // codes: each <codes> group is dispatched on its class attribute
 TopCdNmV.Clr(); GeoCdNmV.Clr(); IndCdNmV.Clr();
 TXmlTokV CdsTokV; Doc->GetTagTokV("newsitem|metadata|codes", CdsTokV);
 for (int CdsTokN=0; CdsTokN<CdsTokV.Len(); CdsTokN++){
  PXmlTok CdsTok=CdsTokV[CdsTokN];
  TXmlTokV CdTokV; CdsTok->GetTagTokV("code", CdTokV);
  if (CdsTok->GetArgVal("class")=="bip:topics:1.0"){
   // topic codes
   for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){
    TStr CdNm=CdTokV[CdTokN]->GetArgVal("code");
    TopCdNmV.Add(CdNm);
   }
  } else if (CdsTok->GetArgVal("class")=="bip:countries:1.0"){
   // geographic (country) codes
   for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){
    TStr CdNm=CdTokV[CdTokN]->GetArgVal("code");
    GeoCdNmV.Add(CdNm);
   }
  } else if (CdsTok->GetArgVal("class")=="bip:industries:1.0"){
   // industry codes
   for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){
    TStr CdNm=CdTokV[CdTokN]->GetArgVal("code");
    IndCdNmV.Add(CdNm);
   }
  } else {
   // unexpected code class -- abort
   Fail;
  }
 }
}
///////////////////////////////////////////////// // Expression-Help PExpHelp TExpHelp::LoadXml(const PSIn& SIn){ // create expression help PExpHelp ExpHelp=TExpHelp::New(); // load xml with expression help PXmlDoc Doc=TXmlDoc::LoadTxt(SIn); // retrieve objects TXmlTokV ObjTokV; Doc->GetTagTokV("ExpHelp|Obj", ObjTokV); for (int ObjTokN=0; ObjTokN<ObjTokV.Len(); ObjTokN++){ PXmlTok ObjTok=ObjTokV[ObjTokN]; // type TStr TypeStr=ObjTok->GetTagTok("Type")->GetTokStr(false); // category TStr CatNm=ObjTok->GetTagTok("Cat")->GetTokStr(false); // header TStr HdNm=ObjTok->GetTagTok("Head|Name")->GetTokStr(false); TStr HdTypeStr=ObjTok->GetTagTok("Head|Type")->GetTokStr(false); TStr HdDescStr=ObjTok->GetTagTok("Head|Desc")->GetTokStr(false); PExpHelpItem HdItem= TExpHelpItem::New(HdNm, HdTypeStr, HdDescStr, ""); // arguments TXmlTokV ArgTokV; ObjTok->GetTagTokV("Args|Arg", ArgTokV); TExpHelpItemV ArgItemV; for (int ArgTokN=0; ArgTokN<ArgTokV.Len(); ArgTokN++){ PXmlTok ArgTok=ArgTokV[ArgTokN]; // argument TStr ArgNm=ArgTok->GetTagTok("Name")->GetTokStr(false); TStr ArgTypeStr=ArgTok->GetTagTok("Type")->GetTokStr(false); TStr ArgDescStr=ArgTok->GetTagTok("Desc")->GetTokStr(false); TStr ArgDfValStr=ArgTok->GetTagTok("Default")->GetTokStr(false); PExpHelpItem ArgItem= TExpHelpItem::New(ArgNm, ArgTypeStr, ArgDescStr, ArgDfValStr); ArgItemV.Add(ArgItem); } // create & add object TExpHelpObjType Type=TExpHelpObj::GetObjTypeFromStr(TypeStr); PExpHelpObj Obj=TExpHelpObj::New(Type, CatNm, HdItem, ArgItemV); ExpHelp->AddObj(Obj); } // return result return ExpHelp; }
// Loads a translation-memory corpus from all TMX files in InTmxFPath.
// Each file is first cleaned into a temporary "<file>.xml" copy, then every
// <tu> translation unit (expected to hold exactly two <tuv> variants) is
// matched against OrgLang / RefTransLang via the xml:lang attribute and
// added as one sentence pair.
PTransCorpus TTransCorpus::LoadTMX(const TStr& InTmxFPath,
        const TStr& OrgLang, const TStr& RefTransLang) {
    // prepare prset structures
    PTransCorpus TransCorpus = TTransCorpus::New();
    // iterate over all the TMX files
    TFFile TmxFNms(InTmxFPath, "tmx", false);
    TStr TmxFNm; int SentId = 0;
    while (TmxFNms.Next(TmxFNm)) {
        printf("Loading %s ...\n", TmxFNm.CStr());
        // we load xml by skiping first tag...
        TStr CleanTmxFNm = TmxFNm + ".xml";
        CleanTmx(TmxFNm, CleanTmxFNm);
        PSIn XmlSIn = TFIn::New(CleanTmxFNm);
        PXmlDoc XmlDoc = TXmlDoc::LoadTxt(XmlSIn);
        // skip this file on parse error
        if (!XmlDoc->IsOk()) {
            printf(" error: %s\n", XmlDoc->GetMsgStr().CStr());
            continue;
        }
        // extract sentences from xml-trees
        TXmlTokV TuTokV; XmlDoc->GetTagTokV("tmx|body|tu", TuTokV);
        const int TuToks = TuTokV.Len();
        for (int TuTokN = 0; TuTokN < TuToks; TuTokN++) {
            if (TuTokN % 100 == 0) { printf(" %d / %d\r", TuTokN, TuToks); }
            // each translation unit must contain exactly two variants
            TXmlTokV TuvTokV; TuTokV[TuTokN]->GetTagTokV("tuv", TuvTokV);
            IAssert(TuvTokV.Len() == 2);
            TStr OrgSent, RefTransSent;
            for (int TuvTokN = 0; TuvTokN < TuvTokV.Len(); TuvTokN++) {
                TStr Lang = TuvTokV[TuvTokN]->GetStrArgVal("xml:lang", "");
                // segment text is RTF-cleaned before use
                TStr Sent = CleanRtf(TuvTokV[TuvTokN]->GetTagTok("seg")->GetTokStr(false));
                if (Lang == OrgLang) { OrgSent = Sent; }
                else if (Lang == RefTransLang) { RefTransSent = Sent; }
            }
            TransCorpus->AddSentenceNoTrans(SentId, OrgSent, RefTransSent);
            SentId++;
        }
        printf(" %d / %d\n", TuToks, TuToks);
    }
    // finish
    return TransCorpus;
}
// Loads the ASFA thesaurus ("asfa_xml_20060522.xml" in FPath) into a
// light-weight ontology: first registers the English language, the two
// term types (descriptor/non-descriptor) and the five thesaurus link
// types, then makes two passes over the <CONCEPT> elements -- one to
// create all terms, one to create the links between them.
PLwOnto TLwOnto::LoadAsfaVoc(const TStr& FPath){
 // normalize path
 TStr NrFPath=TStr::GetNrFPath(FPath);
 // create ontology
 PLwOnto LwOnto=TLwOnto::New();
 // create language object
 int EnLangId=LwOnto->GetLangBs()->AddLang("EN", "English");
 // create term-types
 {PLwTermType D_TermType=TLwTermType::New(0, "Descriptor", EnLangId);
 PLwTermType ND_TermType=TLwTermType::New(1, "Non-descriptor", EnLangId);
 LwOnto->GetTermTypeBs()->AddTermType(D_TermType);
 LwOnto->GetTermTypeBs()->AddTermType(ND_TermType);}
 // create link-types (standard thesaurus relations)
 {PLwLinkType BT_LinkType=TLwLinkType::New(0, "BT", EnLangId, "Broader-Term");
 PLwLinkType NT_LinkType=TLwLinkType::New(1, "NT", EnLangId, "Narrower-Term");
 PLwLinkType RT_LinkType=TLwLinkType::New(2, "RT", EnLangId, "Related-Term");
 PLwLinkType UF_LinkType=TLwLinkType::New(3, "UF", EnLangId, "Used-For");
 PLwLinkType USE_LinkType=TLwLinkType::New(4, "USE", EnLangId, "Used-By");
 LwOnto->GetLinkTypeBs()->AddLinkType(BT_LinkType);
 LwOnto->GetLinkTypeBs()->AddLinkType(NT_LinkType);
 LwOnto->GetLinkTypeBs()->AddLinkType(RT_LinkType);
 LwOnto->GetLinkTypeBs()->AddLinkType(UF_LinkType);
 LwOnto->GetLinkTypeBs()->AddLinkType(USE_LinkType);}
 // load ontology file
 TStr AsfaOntoFNm=NrFPath+"asfa_xml_20060522.xml";
 printf("Loading '%s' ...", AsfaOntoFNm.CStr());
 PXmlDoc AsfaXmlDoc=TXmlDoc::LoadTxt(AsfaOntoFNm);
 IAssert(AsfaXmlDoc->IsOk());
 TXmlTokV ConceptXmlTokV;
 AsfaXmlDoc->GetTagTokV("THESAURUS|CONCEPT", ConceptXmlTokV);
 printf(" Done.\n");
 // create terms (pass 1)
 {printf("Creating terms ...");
 for (int ConceptN=0; ConceptN<ConceptXmlTokV.Len(); ConceptN++){
  PXmlTok ConceptXmlTok=ConceptXmlTokV[ConceptN];
  // term-name: non-descriptor takes precedence over descriptor
  TStr TermNm;
  if (ConceptXmlTok->IsSubTag("NON-DESCRIPTOR")){
   TermNm=ConceptXmlTok->GetTagTokStr("NON-DESCRIPTOR");}
  else if (ConceptXmlTok->IsSubTag("DESCRIPTOR")){
   TermNm=ConceptXmlTok->GetTagTokStr("DESCRIPTOR");}
  // term-type
  TStr TermTypeNm=ConceptXmlTok->GetTagTokStr("TYP");
  int TermTypeId=LwOnto->GetTermTypeBs()->GetTermTypeId(TermTypeNm, EnLangId);
  // description: optional scope-note, whitespace-normalized
  TStr DescStr;
  if (ConceptXmlTok->IsSubTag("SN")){
   DescStr=ConceptXmlTok->GetTagTokStr("SN");
   DescStr.ChangeChAll('\r', ' ');
   DescStr.ChangeChAll('\n', ' ');
   // NOTE(review): both arguments are a single space here, which is a
   // no-op -- presumably meant to collapse double spaces; verify source
   DescStr.ChangeStrAll(" ", " ");
   DescStr.ToTrunc();
  }
  // create term (-1 == auto-assigned term-id)
  PLwTerm Term=TLwTerm::New(-1, TermNm, EnLangId, TermTypeId, DescStr);
  LwOnto->GetTermBs()->AddTermGetTermId(Term);
 }
 printf(" Done. (%d)\n", LwOnto->GetTermBs()->GetTerms());}
 // create links (pass 2, now that all terms exist)
 {printf("Creating links ...");
 for (int ConceptN=0; ConceptN<ConceptXmlTokV.Len(); ConceptN++){
  PXmlTok ConceptXmlTok=ConceptXmlTokV[ConceptN];
  // source-term-name
  TStr TermNm1;
  if (ConceptXmlTok->IsSubTag("NON-DESCRIPTOR")){
   TermNm1=ConceptXmlTok->GetTagTokStr("NON-DESCRIPTOR");}
  else if (ConceptXmlTok->IsSubTag("DESCRIPTOR")){
   TermNm1=ConceptXmlTok->GetTagTokStr("DESCRIPTOR");}
  int TermId1=LwOnto->GetTermBs()->GetTermId(TermNm1, EnLangId);
  // links: every sub-tag whose name is a registered link type
  for (int SubTokN=0; SubTokN<ConceptXmlTok->GetSubToks(); SubTokN++){
   PXmlTok SubTok=ConceptXmlTok->GetSubTok(SubTokN);
   if (SubTok->IsTag()){
    TStr LinkTypeNm=SubTok->GetTagNm();
    if (LwOnto->GetLinkTypeBs()->IsLinkType(LinkTypeNm, EnLangId)){
     // destination-term-name
     TStr TermNm2=ConceptXmlTok->GetTagTokStr(LinkTypeNm);
     int TermId2=LwOnto->GetTermBs()->GetTermId(TermNm2, EnLangId);
     int LinkTypeId=LwOnto->GetLinkTypeBs()->GetLinkTypeId(LinkTypeNm, EnLangId);
     LwOnto->GetLinkBs()->AddLink(TermId1, LinkTypeId, TermId2);
    }
   }
  }
 }
 printf(" Done. (%d)\n", LwOnto->GetLinkBs()->GetLinks());}
 // return ontology
 return LwOnto;
}
// Loads a Cyc knowledge base from the four XML dump files in FPath
// (lexicon, taxonomy, relevance, kb). Each dump is a stream of XML
// documents terminated by an <end/> tag; edges are inserted in both
// directions ("#$pred" forward, "~#$pred" backward).
//
// Bug fix: in the lexicon pass the statement `PrevCycLStr;` was a no-op,
// so PrevCycLStr was never updated and the diagnostic printed on a
// malformed document always showed a stale/empty CycL string. It now
// mirrors PrevCycWStr and tracks the last successfully read value.
PCycBs TCycBs::LoadCycXmlDump(const TStr& FPath){
 // file-names
 TStr NrFPath=TStr::GetNrFPath(FPath);
 TStr CycLexiconFNm=NrFPath+"lexicon-dump.xml";
 TStr CycTaxonomyFNm=NrFPath+"taxonomy-dump.xml";
 TStr CycRelevanceFNm=NrFPath+"relevance-dump.xml";
 TStr CycKBaseFNm=NrFPath+"kb-dump.xml";
 // create cyc-base
 PCycBs CycBs=TCycBs::New();
 // lexicon: <word string="..." cycl="..."/> records
 {printf("Processing Lexicon %s ...\n", CycLexiconFNm.CStr());
 PSIn CycLexiconSIn=TFIn::New(CycLexiconFNm);
 PXmlDoc XmlDoc; int XmlDocs=0;
 // last successfully parsed record, reported on parse failure
 TStr PrevCycWStr; TStr PrevCycLStr;
 forever{
  // statistics
  XmlDocs++; if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
  // load xml-tree
  XmlDoc=TXmlDoc::LoadTxt(CycLexiconSIn);
  if (!XmlDoc->IsOk()){
   printf("%s - %s\n", PrevCycWStr.CStr(), PrevCycLStr.CStr());
   Fail;
  }
  // extract fields from xml-tree
  PXmlTok TopTok=XmlDoc->GetTok();
  if (TopTok->IsTag("end")){break;}
  IAssert(TopTok->IsTag("word"));
  TStr CycWStr=TopTok->GetArgVal("string");
  TStr CycLStr=TopTok->GetArgVal("cycl");
  // remember both fields for the failure diagnostic
  // (was: `PrevCycLStr;` -- a no-op that left the value stale)
  PrevCycWStr=CycWStr; PrevCycLStr=CycLStr;
  // insert data: word-name edges in both directions
  CycBs->AddEdge(CycLStr, "#$nameString", CycWStr);
  CycBs->AddEdge(CycWStr, "~#$nameString", CycLStr);
 }
 printf("%d Docs\nDone.\n", XmlDocs);}
 // taxonomy: <term cycl="..."> with <isa>/<genl> children
 {printf("Processing Taxonomy %s ...\n", CycTaxonomyFNm.CStr());
 PSIn CycTaxonomySIn=TFIn::New(CycTaxonomyFNm);
 PXmlDoc XmlDoc; int XmlDocs=0;
 TStr PrevSrcCycLStr;
 forever{
  // statistics
  XmlDocs++; if (XmlDocs%1000==0){ printf("%d Docs\r", XmlDocs);}
  // load xml-tree
  XmlDoc=TXmlDoc::LoadTxt(CycTaxonomySIn);
  if (!XmlDoc->IsOk()){ printf("%s\n", PrevSrcCycLStr.CStr()); Fail; }
  // extract fields from xml-tree
  PXmlTok TopTok=XmlDoc->GetTok();
  if (TopTok->IsTag("end")){break;}
  IAssert(TopTok->IsTag("term"));
  TStr SrcCycLStr=TopTok->GetArgVal("cycl");
  PrevSrcCycLStr=SrcCycLStr;
  for (int SubTokN=0; SubTokN<TopTok->GetSubToks(); SubTokN++){
   PXmlTok SubTok=TopTok->GetSubTok(SubTokN);
   TStr DstCycLStr=SubTok->GetTagNm();
   if (SubTok->IsTag("isa")){
    DstCycLStr=SubTok->GetArgVal("value");
    CycBs->AddEdge(SrcCycLStr, "#$isa", DstCycLStr);
    CycBs->AddEdge(DstCycLStr, "~#$isa", SrcCycLStr);
   } else if (SubTok->IsTag("genl")){
    DstCycLStr=SubTok->GetArgVal("value");
    CycBs->AddEdge(SrcCycLStr, "#$genls", DstCycLStr);
    CycBs->AddEdge(DstCycLStr, "~#$genls", SrcCycLStr);
   } else {
    // only isa/genl children are expected in the taxonomy dump
    Fail;
   }
  }
 }
 printf("%d Docs\nDone.\n", XmlDocs);}
 // relevance: per-term boolean flags ("T" == set)
 {printf("Processing Relevance %s ...\n", CycRelevanceFNm.CStr());
 PSIn CycRelevanceSIn=TFIn::New(CycRelevanceFNm);
 PXmlDoc XmlDoc; int XmlDocs=0;
 TStr PrevCycStr;
 forever{
  // statistics
  XmlDocs++; if (XmlDocs%1000==0){ printf("%d Docs\r", XmlDocs);}
  // load xml-tree
  XmlDoc=TXmlDoc::LoadTxt(CycRelevanceSIn);
  if (!XmlDoc->IsOk()){ printf("%s\n", PrevCycStr.CStr()); Fail; }
  // extract fields from xml-tree
  PXmlTok TopTok=XmlDoc->GetTok();
  if (TopTok->IsTag("end")){break;}
  IAssert(TopTok->IsTag("term"));
  TStr CycStr=TopTok->GetArgVal("cyc");
  PrevCycStr=CycStr;
  //IAssert(CycBs->IsVNm(CycStr));
  // only flag vertices already created by the earlier passes
  // NOTE(review): the cvf*/cfv* flag-name prefixes are inconsistent but
  // match the enum declared elsewhere -- do not "fix" without renaming it
  if (CycBs->IsVNm(CycStr)){
   if (TopTok->GetArgVal("thcl")=="T"){
    CycBs->GetVrtx(CycStr).SetFlag(cvfHumanRelevant, true);}
   if (TopTok->GetArgVal("irrel")=="T"){
    CycBs->GetVrtx(CycStr).SetFlag(cfvHumanIrrelevant, true);}
   if (TopTok->GetArgVal("clarifying")=="T"){
    CycBs->GetVrtx(CycStr).SetFlag(cfvHumanClarifying, true);}
   if ((TopTok->GetArgVal("thcl")=="T")||(TopTok->GetArgVal("clarifying")=="T")){
    CycBs->GetVrtx(CycStr).SetFlag(cvfHumanOk, true);}
  } else {
   //printf("%s\n", CycStr.CStr());
  }
 }
 printf("%d Docs\nDone.\n", XmlDocs);}
 // knowledge-base: <sentence cycl="..."> with <arg cycl="..."/> children
 {printf("Processing KBase %s ...\n", CycKBaseFNm.CStr());
 PSIn CycKBaseSIn=TFIn::New(CycKBaseFNm);
 PXmlDoc XmlDoc; int XmlDocs=0;
 TStr PrevCycLStr; TStrV PrevArgCycLStrV;
 TStrIntH HdCycLToFq; // frequency of "head-predicate - arity" strings
 forever{
  // statistics
  XmlDocs++; if (XmlDocs%1000==0){ printf("%d Docs\r", XmlDocs);}
  //if (XmlDocs>10000){break;}
  // load xml-tree
  XmlDoc=TXmlDoc::LoadTxt(CycKBaseSIn);
  if (!XmlDoc->IsOk()){
   printf("%s\n", PrevCycLStr.CStr());
   for (int ArgN=0; ArgN<PrevArgCycLStrV.Len(); ArgN++){
    printf(" [%s]", PrevArgCycLStrV[ArgN].CStr());}
   printf("\n");
   Fail;
  }
  // extract fields from xml-tree
  PXmlTok TopTok=XmlDoc->GetTok();
  if (TopTok->IsTag("end")){break;}
  IAssert(TopTok->IsTag("sentence"));
  TStr CycLStr=TopTok->GetArgVal("cycl");
  TXmlTokV ArgXmlTokV; XmlDoc->GetTagTokV("sentence|arg", ArgXmlTokV);
  TStrV ArgCycLStrV;
  for (int ArgN=0; ArgN<ArgXmlTokV.Len(); ArgN++){
   PXmlTok Tok=ArgXmlTokV[ArgN];
   IAssert(Tok->IsTag("arg"));
   if (Tok->IsArg("cycl")){
    TStr ArgCycLStr=Tok->GetArgVal("cycl");
    ArgCycLStrV.Add(ArgCycLStr);
   } else {
    // placeholder for arguments without a cycl attribute
    ArgCycLStrV.Add("Empty");
   }
  }
  PrevCycLStr=CycLStr; PrevArgCycLStrV=ArgCycLStrV;
  // count "predicate - arity" combinations for the statistics dump below
  if (ArgCycLStrV.Len()>0){
   HdCycLToFq.AddDat(ArgCycLStrV[0]+" - "+TInt::GetStr(ArgCycLStrV.Len()-1))++;}
  // insert: only binary relations other than isa/genls/termOfUnit
  // (those are covered by the taxonomy pass or deliberately skipped)
  if (ArgCycLStrV.Len()==3){
   TStr PredNm=ArgCycLStrV[0];
   if ((PredNm!="#$isa")&&(PredNm!="#$termOfUnit")&&(PredNm!="#$genls")){
    TStr BackLinkPredNm=TStr("~")+PredNm;
    TStr Arg1=ArgCycLStrV[1];
    TStr Arg2=ArgCycLStrV[2];
    CycBs->AddEdge(Arg1, PredNm, Arg2);
    CycBs->AddEdge(Arg2, BackLinkPredNm, Arg1);
   }
  }
 }
 // output top cycl relations
 {TFOut CycLSOut("CycKB-CycLFq.Stat.Txt");
 FILE* fCycL=CycLSOut.GetFileId();
 TIntStrPrV FqCycLStrPrV; HdCycLToFq.GetDatKeyPrV(FqCycLStrPrV);
 FqCycLStrPrV.Sort(false);
 for (int CycLN=0; CycLN<FqCycLStrPrV.Len(); CycLN++){
  // NOTE(review): "1+frequency" looks like it may have been meant as
  // "1+CycLN" (a rank); kept as-is pending confirmation
  fprintf(fCycL, "%6d. %s\n",
   1+FqCycLStrPrV[CycLN].Val1, FqCycLStrPrV[CycLN].Val2.CStr());
 }}
 printf("%d Docs\nDone.\n", XmlDocs);}
 // return cyc-base
 return CycBs;
}