/// Creates an Amazon item from its XML description.
///
/// Reads the item id and title from the "AmazonItem|ItemId" and
/// "AmazonItem|Title" tags, then copies every "AmazonItem|Authors|Name"
/// token into AuthorNmV and every "AmazonItem|XSell|ItemId" token into
/// NextItemIdV, in the order GetTagTokV returns them.
///
/// @param XmlDoc parsed XML document to read the AmazonItem tags from
/// @return newly allocated item filled from XmlDoc
PAmazonItem TAmazonItem::New(const PXmlDoc& XmlDoc){
  PAmazonItem Item=PAmazonItem(new TAmazonItem());

  // scalar fields: item-id and title
  // NOTE(review): the GetTagTok results are dereferenced unchecked, so both
  // tags are presumably required to be present - confirm against callers
  Item->ItemId=XmlDoc->GetTagTok("AmazonItem|ItemId")->GetTokStr(false);
  Item->TitleStr=XmlDoc->GetTagTok("AmazonItem|Title")->GetTokStr(false);

  // author names
  TXmlTokV AuthorTokV;
  XmlDoc->GetTagTokV("AmazonItem|Authors|Name", AuthorTokV);
  for (int AuthorN=0; AuthorN<AuthorTokV.Len(); AuthorN++){
    TStr AuthorNm=AuthorTokV[AuthorN]->GetTokStr(false);
    Item->AuthorNmV.Add(AuthorNm);
  }

  // cross-sell item-ids
  TXmlTokV XSellTokV;
  XmlDoc->GetTagTokV("AmazonItem|XSell|ItemId", XSellTokV);
  for (int XSellN=0; XSellN<XSellTokV.Len(); XSellN++){
    TStr NextItemId=XSellTokV[XSellN]->GetTokStr(false);
    Item->NextItemIdV.Add(NextItemId);
  }

  return Item;
}
/// Loads aligned sentence pairs from a single XML file into a new corpus.
///
/// The stream is positioned past the outer wrapper tags (labelled TEI,
/// teiHeader, text and body in the inline comments) and then read one
/// top-level element at a time. For every "div" element, each
/// "linkGrp|link" token whose "type" argument equals "1:1" contributes one
/// sentence pair taken from its "s1" and "s2" sub-tokens.
///
/// @param InXmlFNm name of the XML file to read
/// @param MxSents  when not -1, reading stops once the corpus reports more
///                 than MxSents sentences (checked after each top-level
///                 element, so the limit can be overshot)
/// @return corpus holding the loaded sentence pairs
PTransCorpus TTransCorpus::LoadAC(const TStr& InXmlFNm, const int& MxSents) {
    PTransCorpus TransCorpus = TTransCorpus::New();

    // open the file and step over the wrapper tags
    PSIn XmlSIn = TFIn::New(InXmlFNm);
    TXmlDoc::SkipTopTag(XmlSIn); // TEI
    // the header element is loaded as a whole and only its tag name reported
    printf("Ignoring: %s\n", TXmlDoc::LoadTxt(XmlSIn)->GetTok()->GetTagNm().CStr());
    TXmlDoc::SkipTopTag(XmlSIn); // text
    TXmlDoc::SkipTopTag(XmlSIn); // body

    int SentId = 0;
    for (;;) {
        // progress report, then the next top-level element
        printf("%7d Sentences \r", SentId);
        PXmlDoc XmlDoc = TXmlDoc::LoadTxt(XmlSIn);
        // a document that fails to load marks the end of the input
        if (!XmlDoc->IsOk()) { break; }

        PXmlTok TopTok = XmlDoc->GetTok();
        if (!TopTok->IsTag("div")) {
            printf("Unknow tag: %s\n", TopTok->GetTagNm().CStr());
        } else {
            // document id; read here but not used further in this function
            TStr DocNm = TopTok->GetArgVal("n");
            // walk over all links of the document
            TXmlTokV LinkTokV;
            TopTok->GetTagTokV("linkGrp|link", LinkTokV);
            for (int LinkN = 0; LinkN < LinkTokV.Len(); LinkN++) {
                PXmlTok LinkTok = LinkTokV[LinkN];
                // only one-to-one links are loaded
                if (LinkTok->GetArgVal("type") != "1:1") { continue; }
                TXmlTokV FirstTokV;
                LinkTok->GetTagTokV("s1", FirstTokV);
                TXmlTokV SecondTokV;
                LinkTok->GetTagTokV("s2", SecondTokV);
                IAssert(FirstTokV.Len() == 1);
                IAssert(SecondTokV.Len() == 1);
                TStr FirstStr = FirstTokV[0]->GetTagTokStr("");
                TStr SecondStr = SecondTokV[0]->GetTagTokStr("");
                TransCorpus->AddSentenceNoTrans(SentId, FirstStr, SecondStr);
                SentId++;
            }
        }

        // optional cap on the number of loaded sentences
        if ((MxSents != -1) && (TransCorpus->GetSentences() > MxSents)) { break; }
    }
    printf("\n");

    return TransCorpus;
}
/// Loads sentence pairs from all TMX files found under a path.
///
/// Each file enumerated by TFFile(InTmxFPath, "tmx", false) is first passed
/// through CleanTmx into a sibling file named with an added ".xml" suffix,
/// which is then parsed. Files that fail to parse are reported and skipped.
/// Every "tmx|body|tu" token must hold exactly two "tuv" tokens; the
/// "seg" text of each is cleaned with CleanRtf and assigned to the original
/// or the reference-translation side according to its "xml:lang" argument.
///
/// @param InTmxFPath   path searched for TMX files
/// @param OrgLang      language code identifying the original sentence
/// @param RefTransLang language code identifying the reference translation
/// @return corpus holding the loaded sentence pairs
PTransCorpus TTransCorpus::LoadTMX(const TStr& InTmxFPath, const TStr& OrgLang, const TStr& RefTransLang) {
    PTransCorpus TransCorpus = TTransCorpus::New();

    // sentence ids keep counting across files
    int SentId = 0;
    TFFile TmxFNms(InTmxFPath, "tmx", false);
    TStr TmxFNm;
    while (TmxFNms.Next(TmxFNm)) {
        printf("Loading %s ...\n", TmxFNm.CStr());

        // parse the cleaned copy rather than the raw TMX file
        TStr CleanTmxFNm = TmxFNm + ".xml";
        CleanTmx(TmxFNm, CleanTmxFNm);
        PSIn XmlSIn = TFIn::New(CleanTmxFNm);
        PXmlDoc XmlDoc = TXmlDoc::LoadTxt(XmlSIn);
        if (!XmlDoc->IsOk()) {
            // report and move on to the next file
            printf(" error: %s\n", XmlDoc->GetMsgStr().CStr());
            continue;
        }

        // one translation unit per sentence pair
        TXmlTokV TuTokV;
        XmlDoc->GetTagTokV("tmx|body|tu", TuTokV);
        const int TuToks = TuTokV.Len();
        for (int TuTokN = 0; TuTokN < TuToks; TuTokN++) {
            if (TuTokN % 100 == 0) { printf(" %d / %d\r", TuTokN, TuToks); }

            TXmlTokV TuvTokV;
            TuTokV[TuTokN]->GetTagTokV("tuv", TuvTokV);
            IAssert(TuvTokV.Len() == 2);

            // a side stays empty when neither variant carries its language
            TStr OrgSent;
            TStr RefTransSent;
            for (int TuvTokN = 0; TuvTokN < TuvTokV.Len(); TuvTokN++) {
                PXmlTok TuvTok = TuvTokV[TuvTokN];
                TStr Lang = TuvTok->GetStrArgVal("xml:lang", "");
                TStr Sent = CleanRtf(TuvTok->GetTagTok("seg")->GetTokStr(false));
                if (Lang == OrgLang) {
                    OrgSent = Sent;
                } else if (Lang == RefTransLang) {
                    RefTransSent = Sent;
                }
            }

            TransCorpus->AddSentenceNoTrans(SentId, OrgSent, RefTransSent);
            SentId++;
        }
        printf(" %d / %d\n", TuToks, TuToks);
    }

    return TransCorpus;
}
void TDzsBsDoc::GetDocParts( const TStr& FNm, const PXmlDoc& XmlDoc, const TStr& FPath, const TStr& WebAlias, bool& Ok, TStr& IdStr, TStr& TitleStr, TStr& DataStr, int& YearN){ Ok=false; if (!XmlDoc->IsOk()){return;} // id IdStr=FNm; IdStr.ChangeStr(FPath, WebAlias); // PXmlTok IdTok; // if (XmlDoc->IsTagTok("term|metadata|identifier", IdTok)){ // IdStr=IdTok->GetTokStr(false);} // else {return;} // title PXmlTok TitleTok; if (XmlDoc->IsTagTok("term|metadata|title", TitleTok)){ TitleStr=TitleTok->GetTokStr(false);} else {return;} // timedata TXmlTokV TimeDataTokV; XmlDoc->GetTagTokV("term|data|frame|timedata|fromyear", TimeDataTokV); TStr TimeDataStr=TXmlTok::GetTokVStr(TimeDataTokV, false); if (TimeDataStr.IsInt(YearN)){} else {YearN=0;} // locdata TXmlTokV LocDataTokV; XmlDoc->GetTagTokV("term|data|frame|locdata", LocDataTokV); TStr LocDataStr=TXmlTok::GetTokVStr(LocDataTokV, true); // pages TXmlTokV PageTokV; XmlDoc->GetTagTokV("term|data|frame|page", PageTokV); DataStr=GetDataTokVStr(PageTokV, "\n")+" "+LocDataStr; // character-set transformation TitleStr=THtmlLxChDef::GetCSZFromWin1250(TitleStr); DataStr=THtmlLxChDef::GetCSZFromWin1250(DataStr); // success Ok=true; }
/// Creates a document object from a parsed XML document.
///
/// Delegates the extraction to GetDocParts and wraps the resulting id,
/// title, text and year into a new TDzsBsDoc.
///
/// @param FNm      file name the document was loaded from
/// @param XmlDoc   parsed XML document
/// @param FPath    forwarded to GetDocParts
/// @param WebAlias forwarded to GetDocParts
/// @return the new document, or NULL when XmlDoc is not ok or GetDocParts
///         reports failure
PDzsBsDoc TDzsBsDoc::GetDzsBsDoc(
 const TStr& FNm, const PXmlDoc& XmlDoc,
 const TStr& FPath, const TStr& WebAlias){
  // nothing to extract from a broken document
  if (!XmlDoc->IsOk()){return NULL;}

  // pull the individual parts out of the xml-tree
  TStr IdStr;
  TStr TitleStr;
  TStr DataStr;
  int YearN=0;
  bool Ok=false;
  GetDocParts(FNm, XmlDoc, FPath, WebAlias, Ok, IdStr, TitleStr, DataStr, YearN);
  if (!Ok){return NULL;}

  // disabled debug output
  // printf("---------------------\n");
  // printf("%s\n", TitleStr.CStr());
  // printf("%s\n", IdStr.CStr());
  // printf("%s\n", DataStr.CStr());
  // printf("%s\n", TInt::GetStr(YearN));
  return TDzsBsDoc::New(IdStr, TitleStr, DataStr, YearN);
}
void TSAppSrvFun::Exec(const TStrKdV& FldNmValPrV, const PSAppSrvRqEnv& RqEnv) { const PNotify& Notify = RqEnv->GetWebSrv()->GetNotify(); PHttpResp HttpResp; try { // log the call if (NotifyOnRequest) Notify->OnStatus(TStr::Fmt("RequestStart %s", FunNm.CStr())); TTmStopWatch StopWatch(true); // execute the actual function, according to the type PSIn BodySIn; TStr ContTypeVal; if (GetFunOutType() == saotXml) { PXmlDoc ResXmlDoc = ExecXml(FldNmValPrV, RqEnv); TStr ResXmlStr; ResXmlDoc->SaveStr(ResXmlStr); BodySIn = TMIn::New(XmlHdStr + ResXmlStr); ContTypeVal = THttp::TextXmlFldVal; } else if (GetFunOutType() == saotJSon) { TStr ResStr = ExecJSon(FldNmValPrV, RqEnv); BodySIn = TMIn::New(ResStr); ContTypeVal = THttp::AppJSonFldVal; } else { BodySIn = ExecSIn(FldNmValPrV, RqEnv, ContTypeVal); } if (ReportResponseSize) Notify->OnStatusFmt("Response size: %.1f KB", BodySIn->Len() / (double) TInt::Kilo); // log finish of the call if (NotifyOnRequest) Notify->OnStatus(TStr::Fmt("RequestFinish %s [request took %d ms]", FunNm.CStr(), StopWatch.GetMSecInt())); // prepare response HttpResp = THttpResp::New(THttp::OkStatusCd, ContTypeVal, false, BodySIn); } catch (PExcept Except) { // known internal error Notify->OnStatusFmt("Exception: %s", Except->GetMsgStr().CStr()); Notify->OnStatusFmt("Location: %s", Except->GetLocStr().CStr()); TStr ResStr, ContTypeVal = THttp::TextPlainFldVal; if (GetFunOutType() == saotXml) { PXmlTok TopTok = TXmlTok::New("error"); TopTok->AddSubTok(TXmlTok::New("message", Except->GetMsgStr())); TopTok->AddSubTok(TXmlTok::New("location", Except->GetLocStr())); PXmlDoc ErrorXmlDoc = TXmlDoc::New(TopTok); ResStr = XmlHdStr + ErrorXmlDoc->SaveStr(); ContTypeVal = THttp::TextXmlFldVal; } else if (GetFunOutType() == saotJSon) { PJsonVal ResVal = TJsonVal::NewObj(); ResVal->AddToObj("message", Except->GetMsgStr()); ResVal->AddToObj("location", Except->GetLocStr()); ResStr = TJsonVal::NewObj("error", ResVal)->SaveStr(); ContTypeVal = THttp::AppJSonFldVal; } // 
prepare response HttpResp = THttpResp::New(THttp::InternalErrStatusCd, ContTypeVal, false, TMIn::New(ResStr)); } catch (...) { // unknown internal error TStr ResStr, ContTypeVal = THttp::TextPlainFldVal; if (GetFunOutType() == saotXml) { PXmlDoc ErrorXmlDoc = TXmlDoc::New(TXmlTok::New("error")); ResStr = XmlHdStr + ErrorXmlDoc->SaveStr(); ContTypeVal = THttp::TextXmlFldVal; } else if (GetFunOutType() == saotJSon) { ResStr = TJsonVal::NewObj("error", "Unknown")->SaveStr(); ContTypeVal = THttp::AppJSonFldVal; } // prepare response HttpResp = THttpResp::New(THttp::InternalErrStatusCd, ContTypeVal, false, TMIn::New(ResStr)); } if (LogRqToFile) LogReqRes(FldNmValPrV, HttpResp); // send response RqEnv->GetWebSrv()->SendHttpResp(RqEnv->GetSockId(), HttpResp); }
void TSAppSrv::OnHttpRq(const uint64& SockId, const PHttpRq& HttpRq) { // last appropriate error code, start with bad request int ErrStatusCd = THttp::BadRqStatusCd; try { // check http-request correctness - return if error EAssertR(HttpRq->IsOk(), "Bad HTTP request!"); // check url correctness - return if error PUrl RqUrl = HttpRq->GetUrl(); EAssertR(RqUrl->IsOk(), "Bad request URL!"); // extract function name PUrl HttpRqUrl = HttpRq->GetUrl(); TStr FunNm = HttpRqUrl->GetPathSeg(0); // check if we have the function registered if (FunNm == "favicon.ico") { PHttpResp HttpResp = THttpResp::New(THttp::OkStatusCd, THttp::ImageIcoFldVal, false, Favicon.GetSIn()); SendHttpResp(SockId, HttpResp); return; } else if (!FunNm.Empty() && !FunNmToFunH.IsKey(FunNm)) { ErrStatusCd = THttp::ErrNotFoundStatusCd; GetNotify()->OnStatusFmt("[AppSrv] Unknown function '%s'!", FunNm.CStr()); TExcept::Throw("Unknown function '" + FunNm + "'!"); } // extract parameters TStrKdV FldNmValPrV; PUrlEnv HttpRqUrlEnv = HttpRq->GetUrlEnv(); const int Keys = HttpRqUrlEnv->GetKeys(); for (int KeyN = 0; KeyN < Keys; KeyN++) { TStr KeyNm = HttpRqUrlEnv->GetKeyNm(KeyN); const int Vals = HttpRqUrlEnv->GetVals(KeyN); for (int ValN = 0; ValN < Vals; ValN++) { TStr Val = HttpRqUrlEnv->GetVal(KeyN, ValN); FldNmValPrV.Add(TStrKd(KeyNm, Val)); } } // report call if (ShowParamP) { GetNotify()->OnStatus(" " + HttpRq->GetUrl()->GetUrlStr()); } // request parsed well, from now on it's internal error ErrStatusCd = THttp::InternalErrStatusCd; // processed requested function if (!FunNm.Empty()) { // prepare request environment PSAppSrvRqEnv RqEnv = TSAppSrvRqEnv::New(this, SockId, HttpRq, FunNmToFunH); // retrieve function PSAppSrvFun SrvFun = FunNmToFunH.GetDat(FunNm); // call function SrvFun->Exec(FldNmValPrV, RqEnv); } else { // internal SAppSrv call if (!ListFunP) { // we are not allowed to list functions ErrStatusCd = THttp::ErrNotFoundStatusCd; TExcept::Throw("Unknown page"); } // prepare a list of registered 
functions PXmlTok TopTok = TXmlTok::New("registered-functions"); int KeyId = FunNmToFunH.FFirstKeyId(); while (FunNmToFunH.FNextKeyId(KeyId)) { PXmlTok FunTok = TXmlTok::New("function"); FunTok->AddArg("name", FunNmToFunH.GetKey(KeyId)); TopTok->AddSubTok(FunTok); } TStr ResXmlStr; TXmlDoc::New(TopTok)->SaveStr(ResXmlStr); PSIn BodySIn = TMIn::New(TSAppSrvFun::XmlHdStr + ResXmlStr); // prepare response PHttpResp HttpResp = THttpResp::New(THttp::OkStatusCd, THttp::TextXmlFldVal, false, BodySIn); // send response SendHttpResp(SockId, HttpResp); } } catch (PExcept Except) { // known internal error PXmlTok TopTok = TXmlTok::New("error"); TopTok->AddSubTok(TXmlTok::New("message", Except->GetMsgStr())); TopTok->AddSubTok(TXmlTok::New("location", Except->GetLocStr())); PXmlDoc ErrorXmlDoc = TXmlDoc::New(TopTok); TStr ResXmlStr; ErrorXmlDoc->SaveStr(ResXmlStr); // prepare response PHttpResp HttpResp = THttpResp::New(ErrStatusCd, THttp::TextHtmlFldVal, false, TMIn::New(TSAppSrvFun::XmlHdStr + ResXmlStr)); // send response SendHttpResp(SockId, HttpResp); } catch (...) { // unknown internal error PXmlDoc ErrorXmlDoc = TXmlDoc::New(TXmlTok::New("error")); TStr ResXmlStr; ErrorXmlDoc->SaveStr(ResXmlStr); // prepare response PHttpResp HttpResp = THttpResp::New(ErrStatusCd, THttp::TextHtmlFldVal, false, TMIn::New(TSAppSrvFun::XmlHdStr + ResXmlStr)); // send response SendHttpResp(SockId, HttpResp); } }
void TSAppSrv::OnHttpRq(const int& SockId, const PHttpRq& HttpRq) { PHttpResp HttpResp; try { // check http-request correctness - return if error EAssertR(HttpRq->IsOk(), "Bad HTTP request!"); // check url correctness - return if error PUrl RqUrl = HttpRq->GetUrl(); EAssertR(RqUrl->IsOk(), "Bad request URL!"); // extract function name PUrl HttpRqUrl = HttpRq->GetUrl(); TStr FunNm = HttpRqUrl->GetPathSeg(0); EAssertR(FunNmToFunH.IsKey(FunNm) || FunNm.Empty(), "Unknown function '" + FunNm + "' !"); // extract parameters TStrKdV FldNmValPrV; PUrlEnv HttpRqUrlEnv = HttpRq->GetUrlEnv(); const int Keys = HttpRqUrlEnv->GetKeys(); for (int KeyN = 0; KeyN < Keys; KeyN++) { TStr KeyNm = HttpRqUrlEnv->GetKeyNm(KeyN); const int Vals = HttpRqUrlEnv->GetVals(KeyN); for (int ValN = 0; ValN < Vals; ValN++) { TStr Val = HttpRqUrlEnv->GetVal(KeyN, ValN); FldNmValPrV.Add(TStrKd(KeyNm, Val)); } } // log the call TStr TimeNow = TTm::GetCurLocTm().GetWebLogDateTimeStr(true); GetNotify()->OnStatus(TStr::Fmt("[%s] Request %s", TimeNow.CStr(), FunNm.CStr())); // prepare request environment PSAppSrvRqEnv RqEnv = TSAppSrvRqEnv::New(this, SockId, HttpRq); PSIn BodySIn; TStr ContTypeVal; if (!FunNm.Empty()) { // call function PSAppSrvFun SrvFun = FunNmToFunH.GetDat(FunNm); if (SrvFun->GetFunOutType() == saotXml) { PXmlDoc ResXmlDoc = SrvFun->Exec(FldNmValPrV, RqEnv); TStr ResXmlStr; ResXmlDoc->SaveStr(ResXmlStr); //ResXmlDoc->SaveTxt(TFile::GetUniqueFNm("test.xml")); BodySIn = TMIn::New(XmlHdStr + ResXmlStr); ContTypeVal = THttp::TextXmlFldVal; } else if (SrvFun->GetFunOutType() == saotJSon) { TStr ResStr = SrvFun->ExecJSon(FldNmValPrV, RqEnv); BodySIn = TMIn::New(ResStr); //ContTypeVal = THttp::TextHtmlFldVal; ContTypeVal = THttp::AppJSonFldVal; } else { BodySIn = SrvFun->ExecCustom(FldNmValPrV, RqEnv, ContTypeVal); } } else { PXmlTok TopTok = TXmlTok::New("registered-functions"); int KeyId = FunNmToFunH.FFirstKeyId(); while (FunNmToFunH.FNextKeyId(KeyId)) { PXmlTok FunTok = 
TXmlTok::New("function"); FunTok->AddArg("name", FunNmToFunH.GetKey(KeyId)); TopTok->AddSubTok(FunTok); } PXmlDoc ResXmlDoc = TXmlDoc::New(TopTok); TStr ResXmlStr; ResXmlDoc->SaveStr(ResXmlStr); BodySIn = TMIn::New(XmlHdStr + ResXmlStr); ContTypeVal = THttp::TextXmlFldVal; } // prepare response HttpResp = THttpResp::New(THttp::OkStatusCd, ContTypeVal, false, BodySIn); // send response } catch (PExcept Except) { PXmlTok TopTok = TXmlTok::New("error"); TopTok->AddSubTok(TXmlTok::New("message", Except->GetMsgStr())); TopTok->AddSubTok(TXmlTok::New("location", Except->GetLocStr())); PXmlDoc ErrorXmlDoc = TXmlDoc::New(TopTok); TStr ResXmlStr; ErrorXmlDoc->SaveStr(ResXmlStr); HttpResp = THttpResp::New(THttp::OkStatusCd, THttp::TextHtmlFldVal, false, TMIn::New(XmlHdStr + ResXmlStr)); } catch (...) { PXmlDoc ErrorXmlDoc = TXmlDoc::New(TXmlTok::New("error")); TStr ResXmlStr; ErrorXmlDoc->SaveStr(ResXmlStr); HttpResp = THttpResp::New(THttp::OkStatusCd, THttp::TextHtmlFldVal, false, TMIn::New(XmlHdStr + ResXmlStr)); } SendHttpResp(SockId, HttpResp); }
/// Builds a Cyc base from four XML dump files located in one directory.
///
/// The files lexicon-dump.xml, taxonomy-dump.xml, relevance-dump.xml and
/// kb-dump.xml are read from FPath (normalised with TStr::GetNrFPath). Each
/// file is consumed as a sequence of top-level XML elements and reading of
/// a file stops at an element tagged "end". A top-level element that fails
/// to load prints the previously processed record and aborts through Fail.
///
/// Per file:
///  - lexicon: each "word" element adds a "#$nameString" edge from its
///    "cycl" argument to its "string" argument plus the "~#$nameString"
///    back edge;
///  - taxonomy: each "term" element adds "#$isa" / "#$genls" edges (and
///    "~"-prefixed back edges) for its "isa" / "genl" sub-elements; any
///    other sub-element aborts through Fail;
///  - relevance: each "term" element sets vertex flags according to its
///    "thcl", "irrel" and "clarifying" arguments, for terms already known
///    to the base;
///  - knowledge base: each "sentence" element with exactly three "arg"
///    values adds a predicate edge and its back edge, skipping "#$isa",
///    "#$termOfUnit" and "#$genls"; head/arity frequencies are written to
///    the file "CycKB-CycLFq.Stat.Txt".
///
/// @param FPath directory containing the four dump files
/// @return the populated Cyc base
PCycBs TCycBs::LoadCycXmlDump(const TStr& FPath){
  // file-names
  TStr NrFPath=TStr::GetNrFPath(FPath);
  TStr CycLexiconFNm=NrFPath+"lexicon-dump.xml";
  TStr CycTaxonomyFNm=NrFPath+"taxonomy-dump.xml";
  TStr CycRelevanceFNm=NrFPath+"relevance-dump.xml";
  TStr CycKBaseFNm=NrFPath+"kb-dump.xml";

  // create cyc-base
  PCycBs CycBs=TCycBs::New();

  // lexicon
  {printf("Processing Lexicon %s ...\n", CycLexiconFNm.CStr());
  PSIn CycLexiconSIn=TFIn::New(CycLexiconFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  // last successfully read entry, reported when the next one fails to load
  TStr PrevCycWStr; TStr PrevCycLStr;
  forever{
    // statistics
    XmlDocs++; if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycLexiconSIn);
    if (!XmlDoc->IsOk()){
      printf("%s - %s\n", PrevCycWStr.CStr(), PrevCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("word"));
    TStr CycWStr=TopTok->GetArgVal("string");
    TStr CycLStr=TopTok->GetArgVal("cycl");
    // remember both fields; the CycL part used to be a no-op expression
    // statement, which left the diagnostic above with an empty CycL string
    PrevCycWStr=CycWStr; PrevCycLStr=CycLStr;
    // insert data: forward edge and "~"-prefixed back edge
    CycBs->AddEdge(CycLStr, "#$nameString", CycWStr);
    CycBs->AddEdge(CycWStr, "~#$nameString", CycLStr);
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // taxonomy
  {printf("Processing Taxonomy %s ...\n", CycTaxonomyFNm.CStr());
  PSIn CycTaxonomySIn=TFIn::New(CycTaxonomyFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevSrcCycLStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycTaxonomySIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevSrcCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr SrcCycLStr=TopTok->GetArgVal("cycl");
    PrevSrcCycLStr=SrcCycLStr;
    for (int SubTokN=0; SubTokN<TopTok->GetSubToks(); SubTokN++){
      PXmlTok SubTok=TopTok->GetSubTok(SubTokN);
      TStr DstCycLStr=SubTok->GetTagNm();
      if (SubTok->IsTag("isa")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$isa", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$isa", SrcCycLStr);
      } else
      if (SubTok->IsTag("genl")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$genls", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$genls", SrcCycLStr);
      } else {
        // only "isa" and "genl" sub-elements are accepted
        Fail;
      }
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // relevance
  {printf("Processing Relevance %s ...\n", CycRelevanceFNm.CStr());
  PSIn CycRelevanceSIn=TFIn::New(CycRelevanceFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycRelevanceSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr CycStr=TopTok->GetArgVal("cyc");
    PrevCycStr=CycStr;
    // terms not present in the base are skipped silently
    //IAssert(CycBs->IsVNm(CycStr));
    if (CycBs->IsVNm(CycStr)){
      if (TopTok->GetArgVal("thcl")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanRelevant, true);}
      if (TopTok->GetArgVal("irrel")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanIrrelevant, true);}
      if (TopTok->GetArgVal("clarifying")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanClarifying, true);}
      // "ok" is set when the term is either relevant or clarifying
      if ((TopTok->GetArgVal("thcl")=="T")||(TopTok->GetArgVal("clarifying")=="T")){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanOk, true);}
    } else {
      //printf("%s\n", CycStr.CStr());
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // knowledge-base
  {printf("Processing KBase %s ...\n", CycKBaseFNm.CStr());
  PSIn CycKBaseSIn=TFIn::New(CycKBaseFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycLStr; TStrV PrevArgCycLStrV;
  // frequency of each "head - arity" combination seen in the dump
  TStrIntH HdCycLToFq;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    //if (XmlDocs>10000){break;}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycKBaseSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycLStr.CStr());
      for (int ArgN=0; ArgN<PrevArgCycLStrV.Len(); ArgN++){
        printf(" [%s]", PrevArgCycLStrV[ArgN].CStr());}
      printf("\n");
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("sentence"));
    TStr CycLStr=TopTok->GetArgVal("cycl");
    TXmlTokV ArgXmlTokV; XmlDoc->GetTagTokV("sentence|arg", ArgXmlTokV);
    TStrV ArgCycLStrV;
    for (int ArgN=0; ArgN<ArgXmlTokV.Len(); ArgN++){
      PXmlTok Tok=ArgXmlTokV[ArgN];
      IAssert(Tok->IsTag("arg"));
      if (Tok->IsArg("cycl")){
        TStr ArgCycLStr=Tok->GetArgVal("cycl");
        ArgCycLStrV.Add(ArgCycLStr);
      } else {
        // placeholder keeps argument positions aligned
        ArgCycLStrV.Add("Empty");
      }
    }
    PrevCycLStr=CycLStr;
    PrevArgCycLStrV=ArgCycLStrV;
    if (ArgCycLStrV.Len()>0){
      HdCycLToFq.AddDat(ArgCycLStrV[0]+" - "+TInt::GetStr(ArgCycLStrV.Len()-1))++;}
    // insert binary predicates only; isa/genls are skipped here
    if (ArgCycLStrV.Len()==3){
      TStr PredNm=ArgCycLStrV[0];
      if ((PredNm!="#$isa")&&(PredNm!="#$termOfUnit")&&(PredNm!="#$genls")){
        TStr BackLinkPredNm=TStr("~")+PredNm;
        TStr Arg1=ArgCycLStrV[1];
        TStr Arg2=ArgCycLStrV[2];
        CycBs->AddEdge(Arg1, PredNm, Arg2);
        CycBs->AddEdge(Arg2, BackLinkPredNm, Arg1);
      }
    }
  }
  // output top cycl relations
  // NOTE(review): the printed number is 1 + the stored frequency - confirm
  // whether that offset is intended
  {TFOut CycLSOut("CycKB-CycLFq.Stat.Txt"); FILE* fCycL=CycLSOut.GetFileId();
  TIntStrPrV FqCycLStrPrV; HdCycLToFq.GetDatKeyPrV(FqCycLStrPrV);
  FqCycLStrPrV.Sort(false);
  for (int CycLN=0; CycLN<FqCycLStrPrV.Len(); CycLN++){
    fprintf(fCycL, "%6d. %s\n",
     1+FqCycLStrPrV[CycLN].Val1, FqCycLStrPrV[CycLN].Val2.CStr());
  }}
  printf("%d Docs\nDone.\n", XmlDocs);}

  // return cyc-base
  return CycBs;
}