// Builds a TAmazonItem from a parsed XML document.
// Fields are read from these tag paths:
//   AmazonItem|ItemId        -> ItemId
//   AmazonItem|Title         -> TitleStr
//   AmazonItem|Authors|Name  -> AuthorNmV   (one entry per matching tag)
//   AmazonItem|XSell|ItemId  -> NextItemIdV (one entry per matching tag)
// NOTE(review): the ItemId and Title tokens are dereferenced without a check,
// so both tags are presumably mandatory in the input - confirm with callers.
PAmazonItem TAmazonItem::New(const PXmlDoc& XmlDoc){
  PAmazonItem Item=PAmazonItem(new TAmazonItem());

  // single-valued fields
  Item->ItemId=XmlDoc->GetTagTok("AmazonItem|ItemId")->GetTokStr(false);
  Item->TitleStr=XmlDoc->GetTagTok("AmazonItem|Title")->GetTokStr(false);

  // author names, in document order
  TXmlTokV AuthorTokV;
  XmlDoc->GetTagTokV("AmazonItem|Authors|Name", AuthorTokV);
  const int Authors=AuthorTokV.Len();
  for (int TokN=0; TokN<Authors; TokN++){
    const TStr AuthorNm=AuthorTokV[TokN]->GetTokStr(false);
    Item->AuthorNmV.Add(AuthorNm);
  }

  // cross-sell item ids, in document order
  TXmlTokV XSellTokV;
  XmlDoc->GetTagTokV("AmazonItem|XSell|ItemId", XSellTokV);
  const int XSells=XSellTokV.Len();
  for (int TokN=0; TokN<XSells; TokN++){
    const TStr XSellItemId=XSellTokV[TokN]->GetTokStr(false);
    Item->NextItemIdV.Add(XSellItemId);
  }

  return Item;
}
void TDzsBsDoc::GetDocParts( const TStr& FNm, const PXmlDoc& XmlDoc, const TStr& FPath, const TStr& WebAlias, bool& Ok, TStr& IdStr, TStr& TitleStr, TStr& DataStr, int& YearN){ Ok=false; if (!XmlDoc->IsOk()){return;} // id IdStr=FNm; IdStr.ChangeStr(FPath, WebAlias); // PXmlTok IdTok; // if (XmlDoc->IsTagTok("term|metadata|identifier", IdTok)){ // IdStr=IdTok->GetTokStr(false);} // else {return;} // title PXmlTok TitleTok; if (XmlDoc->IsTagTok("term|metadata|title", TitleTok)){ TitleStr=TitleTok->GetTokStr(false);} else {return;} // timedata TXmlTokV TimeDataTokV; XmlDoc->GetTagTokV("term|data|frame|timedata|fromyear", TimeDataTokV); TStr TimeDataStr=TXmlTok::GetTokVStr(TimeDataTokV, false); if (TimeDataStr.IsInt(YearN)){} else {YearN=0;} // locdata TXmlTokV LocDataTokV; XmlDoc->GetTagTokV("term|data|frame|locdata", LocDataTokV); TStr LocDataStr=TXmlTok::GetTokVStr(LocDataTokV, true); // pages TXmlTokV PageTokV; XmlDoc->GetTagTokV("term|data|frame|page", PageTokV); DataStr=GetDataTokVStr(PageTokV, "\n")+" "+LocDataStr; // character-set transformation TitleStr=THtmlLxChDef::GetCSZFromWin1250(TitleStr); DataStr=THtmlLxChDef::GetCSZFromWin1250(DataStr); // success Ok=true; }
// Loads a translation corpus from all "tmx" files found via InTmxFPath.
// Each file is first passed through CleanTmx into "<name>.xml" and that copy
// is parsed; a file that fails to parse is reported and skipped.
// Every tmx|body|tu element must hold exactly two tuv children (asserted).
// The seg text of the tuv whose xml:lang equals OrgLang becomes the original
// sentence and the one matching RefTransLang becomes the reference
// translation; a sentence with no matching tuv stays empty.
// Sentence ids are consecutive across all files, starting at 0.
PTransCorpus TTransCorpus::LoadTMX(const TStr& InTmxFPath,
    const TStr& OrgLang, const TStr& RefTransLang) {
  PTransCorpus Corpus = TTransCorpus::New();

  TFFile TmxFiles(InTmxFPath, "tmx", false);
  TStr FNm;
  int NextSentId = 0;
  while (TmxFiles.Next(FNm)) {
    printf("Loading %s ...\n", FNm.CStr());

    // parse the cleaned copy rather than the raw TMX file
    TStr CleanFNm = FNm + ".xml";
    CleanTmx(FNm, CleanFNm);
    PSIn CleanSIn = TFIn::New(CleanFNm);
    PXmlDoc XmlDoc = TXmlDoc::LoadTxt(CleanSIn);
    if (!XmlDoc->IsOk()) {
      // report and move on to the next file
      printf(" error: %s\n", XmlDoc->GetMsgStr().CStr());
      continue;
    }

    // one translation unit per sentence pair
    TXmlTokV UnitTokV;
    XmlDoc->GetTagTokV("tmx|body|tu", UnitTokV);
    const int Units = UnitTokV.Len();
    for (int UnitN = 0; UnitN < Units; UnitN++) {
      if (UnitN % 100 == 0) { printf(" %d / %d\r", UnitN, Units); }

      TXmlTokV VariantTokV;
      UnitTokV[UnitN]->GetTagTokV("tuv", VariantTokV);
      IAssert(VariantTokV.Len() == 2);

      TStr OrgSent, RefTransSent;
      for (int VariantN = 0; VariantN < VariantTokV.Len(); VariantN++) {
        PXmlTok VariantTok = VariantTokV[VariantN];
        TStr Lang = VariantTok->GetStrArgVal("xml:lang", "");
        // the segment is cleaned for every variant, whatever its language
        TStr Sent = CleanRtf(VariantTok->GetTagTok("seg")->GetTokStr(false));
        if (Lang == OrgLang) {
          OrgSent = Sent;
        } else if (Lang == RefTransLang) {
          RefTransSent = Sent;
        }
      }

      Corpus->AddSentenceNoTrans(NextSentId, OrgSent, RefTransSent);
      NextSentId++;
    }
    printf(" %d / %d\n", Units, Units);
  }

  return Corpus;
}
// Builds a Cyc base from four XML dump files located in FPath:
//   lexicon-dump.xml   : <word string=.. cycl=..>  -> "#$nameString" edges
//   taxonomy-dump.xml  : <term cycl=..> with <isa value=..> / <genl value=..>
//                        children                  -> "#$isa" / "#$genls" edges
//   relevance-dump.xml : <term cyc=.. thcl=.. irrel=.. clarifying=..>
//                                                  -> vertex flags
//   kb-dump.xml        : <sentence cycl=..> with <arg cycl=..> children
//                                                  -> binary-predicate edges
// Every edge is added together with a back-link whose predicate name is
// prefixed with "~".
// Each file is read as a sequence of XML documents from one input stream;
// reading stops at a document whose top tag is "end". A document that fails
// to parse prints the last successfully read record and then hits Fail.
// Side effect: writes head-predicate statistics to "CycKB-CycLFq.Stat.Txt"
// (relative path) and prints progress to stdout.
PCycBs TCycBs::LoadCycXmlDump(const TStr& FPath){
  // file-names
  TStr NrFPath=TStr::GetNrFPath(FPath);
  TStr CycLexiconFNm=NrFPath+"lexicon-dump.xml";
  TStr CycTaxonomyFNm=NrFPath+"taxonomy-dump.xml";
  TStr CycRelevanceFNm=NrFPath+"relevance-dump.xml";
  TStr CycKBaseFNm=NrFPath+"kb-dump.xml";

  // create cyc-base
  PCycBs CycBs=TCycBs::New();

  // lexicon
  {printf("Processing Lexicon %s ...\n", CycLexiconFNm.CStr());
  PSIn CycLexiconSIn=TFIn::New(CycLexiconFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycWStr; TStr PrevCycLStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycLexiconSIn);
    if (!XmlDoc->IsOk()){
      // show the last good record to help locate the broken one
      printf("%s - %s\n", PrevCycWStr.CStr(), PrevCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("word"));
    TStr CycWStr=TopTok->GetArgVal("string");
    TStr CycLStr=TopTok->GetArgVal("cycl");
    // remember both fields for the failure diagnostic above
    // (the CycL term was previously never stored, so it always printed empty)
    PrevCycWStr=CycWStr; PrevCycLStr=CycLStr;
    // insert data
    CycBs->AddEdge(CycLStr, "#$nameString", CycWStr);
    CycBs->AddEdge(CycWStr, "~#$nameString", CycLStr);
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // taxonomy
  {printf("Processing Taxonomy %s ...\n", CycTaxonomyFNm.CStr());
  PSIn CycTaxonomySIn=TFIn::New(CycTaxonomyFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevSrcCycLStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycTaxonomySIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevSrcCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr SrcCycLStr=TopTok->GetArgVal("cycl");
    PrevSrcCycLStr=SrcCycLStr;
    // only "isa" and "genl" children are accepted; anything else is fatal
    for (int SubTokN=0; SubTokN<TopTok->GetSubToks(); SubTokN++){
      PXmlTok SubTok=TopTok->GetSubTok(SubTokN);
      TStr DstCycLStr=SubTok->GetTagNm();
      if (SubTok->IsTag("isa")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$isa", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$isa", SrcCycLStr);
      } else
      if (SubTok->IsTag("genl")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$genls", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$genls", SrcCycLStr);
      } else {
        Fail;
      }
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // relevance
  {printf("Processing Relevance %s ...\n", CycRelevanceFNm.CStr());
  PSIn CycRelevanceSIn=TFIn::New(CycRelevanceFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycRelevanceSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr CycStr=TopTok->GetArgVal("cyc");
    PrevCycStr=CycStr;
    // terms that are not already vertices of the base are silently skipped
    // NOTE(review): the flag constants mix the prefixes cvf/cfv; kept as
    // written here - presumably they match the enum declaration, verify.
    if (CycBs->IsVNm(CycStr)){
      if (TopTok->GetArgVal("thcl")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanRelevant, true);}
      if (TopTok->GetArgVal("irrel")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanIrrelevant, true);}
      if (TopTok->GetArgVal("clarifying")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanClarifying, true);}
      if ((TopTok->GetArgVal("thcl")=="T")||(TopTok->GetArgVal("clarifying")=="T")){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanOk, true);}
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // knowledge-base
  {printf("Processing KBase %s ...\n", CycKBaseFNm.CStr());
  PSIn CycKBaseSIn=TFIn::New(CycKBaseFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycLStr; TStrV PrevArgCycLStrV;
  TStrIntH HdCycLToFq;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycKBaseSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycLStr.CStr());
      for (int ArgN=0; ArgN<PrevArgCycLStrV.Len(); ArgN++){
        printf(" [%s]", PrevArgCycLStrV[ArgN].CStr());}
      printf("\n");
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("sentence"));
    TStr CycLStr=TopTok->GetArgVal("cycl");
    TXmlTokV ArgXmlTokV; XmlDoc->GetTagTokV("sentence|arg", ArgXmlTokV);
    TStrV ArgCycLStrV;
    for (int ArgN=0; ArgN<ArgXmlTokV.Len(); ArgN++){
      PXmlTok Tok=ArgXmlTokV[ArgN];
      IAssert(Tok->IsTag("arg"));
      if (Tok->IsArg("cycl")){
        TStr ArgCycLStr=Tok->GetArgVal("cycl");
        ArgCycLStrV.Add(ArgCycLStr);
      } else {
        // placeholder keeps argument positions aligned
        ArgCycLStrV.Add("Empty");
      }
    }
    PrevCycLStr=CycLStr;
    PrevArgCycLStrV=ArgCycLStrV;
    // count sentences per "<head> - <number of remaining args>" key
    if (ArgCycLStrV.Len()>0){
      HdCycLToFq.AddDat(ArgCycLStrV[0]+" - "+TInt::GetStr(ArgCycLStrV.Len()-1))++;}
    // insert: only binary predicates, and not the ones named below
    if (ArgCycLStrV.Len()==3){
      TStr PredNm=ArgCycLStrV[0];
      if ((PredNm!="#$isa")&&(PredNm!="#$termOfUnit")&&(PredNm!="#$genls")){
        TStr BackLinkPredNm=TStr("~")+PredNm;
        TStr Arg1=ArgCycLStrV[1];
        TStr Arg2=ArgCycLStrV[2];
        CycBs->AddEdge(Arg1, PredNm, Arg2);
        CycBs->AddEdge(Arg2, BackLinkPredNm, Arg1);
      }
    }
  }
  // output top cycl relations (scoped so the file object is destroyed first)
  {TFOut CycLSOut("CycKB-CycLFq.Stat.Txt"); FILE* fCycL=CycLSOut.GetFileId();
  TIntStrPrV FqCycLStrPrV; HdCycLToFq.GetDatKeyPrV(FqCycLStrPrV);
  FqCycLStrPrV.Sort(false);
  for (int CycLN=0; CycLN<FqCycLStrPrV.Len(); CycLN++){
    // NOTE(review): the printed number is the stored count plus one; kept as
    // in the original - confirm whether a rank (1+CycLN) was intended.
    fprintf(fCycL, "%6d. %s\n",
     1+FqCycLStrPrV[CycLN].Val1, FqCycLStrPrV[CycLN].Val2.CStr());
  }}
  printf("%d Docs\nDone.\n", XmlDocs);}

  // return cyc-base
  return CycBs;
}