// Gathers the integer "id" attribute of every tag token found under TagPath
// in QueryXml. Tokens whose "id" comes back as the fallback value -1 are
// left out, and each id is stored at most once, in first-seen order.
TIntV GetEmailIds(const PXmlTok& QueryXml, const TStr& TagPath) {
    // fetch all tokens matching the requested tag path
    TXmlTokV IdTokV;
    QueryXml->GetTagTokV(TagPath, IdTokV);

    TIntV ResultV;
    const int IdToks = IdTokV.Len();
    for (int IdTokN = 0; IdTokN < IdToks; ++IdTokN) {
        // -1 is the fallback passed to GetIntArgVal, so it marks "no usable id"
        TInt Id = IdTokV[IdTokN]->GetIntArgVal("id", -1);
        if (Id != -1) {
            ResultV.AddUnique(Id);
        }
    }
    return ResultV;
}
void GetKeywords(const PXmlTok& QueryXml, const TStr& TagPath, TStrV& KeywordsV, TStrV& IgnoreKeywordsV) { TXmlTokV KwsXmlV; QueryXml->GetTagTokV(TagPath, KwsXmlV); for (int KwInd = 0; KwInd < KwsXmlV.Len(); KwInd++) { TStr Kw = KwsXmlV[KwInd]->GetTokStr(false); int hide = KwsXmlV[KwInd]->GetIntArgVal("hide", 0); if (hide) IgnoreKeywordsV.Add(Kw); else KeywordsV.Add(Kw); } }
// Loads aligned sentence pairs from the XML file InXmlFNm into a new corpus.
//
// The input is read as a stream: the opening tags of the outer elements are
// stepped over and the remaining top-level elements are parsed one at a time.
// For every "div" element, each "linkGrp|link" token whose "type" attribute is
// "1:1" must contain exactly one "s1" and one "s2" tag (enforced by IAssert);
// their strings are added to the corpus via AddSentenceNoTrans under a running
// sentence id that starts at 0. Any other top-level tag is reported on stdout.
//
// Parameters:
//   InXmlFNm - name of the XML file to read.
//   MxSents  - sentence limit; -1 disables the limit.
// Returns the corpus holding the loaded sentence pairs.
//
// NOTE(review): the limit is tested once per top-level element and with '>',
// so the returned corpus can hold more than MxSents sentences. Left unchanged
// because callers may depend on it - confirm the intended semantics.
PTransCorpus TTransCorpus::LoadAC(const TStr& InXmlFNm, const int& MxSents) {
    // prepare result structures
    PTransCorpus TransCorpus = TTransCorpus::New();
    // we load xml by skipping the first tags
    PSIn XmlSIn = TFIn::New(InXmlFNm);
    TXmlDoc::SkipTopTag(XmlSIn); // ignore TEI
    // the next element is parsed as a whole and dropped; only its tag name is
    // printed (ignore teiHeader)
    printf("Ignoring: %s\n", TXmlDoc::LoadTxt(XmlSIn)->GetTok()->GetTagNm().CStr());
    TXmlDoc::SkipTopTag(XmlSIn); // ignore text
    TXmlDoc::SkipTopTag(XmlSIn); // ignore body
    PXmlDoc XmlDoc;
    int SentId = 0;
    forever {
        // progress indicator, overwritten in place thanks to '\r'
        printf("%7d Sentences \r", SentId);
        // load the next xml tree from the stream
        XmlDoc = TXmlDoc::LoadTxt(XmlSIn);
        // stop if at the last tag
        if (!XmlDoc->IsOk()) {
            /*printf("Error: %s\n", XmlDoc->GetMsgStr().CStr());*/
            break;
        }
        // extract documents from xml-trees
        PXmlTok TopTok = XmlDoc->GetTok();
        if (TopTok->IsTag("div")) {
            // document id; not used further in this function, but the lookup
            // is kept because its behavior on a missing "n" attribute is not
            // visible from here
            TStr DocNm = TopTok->GetArgVal("n");
            // and the sentence links
            TXmlTokV LinkTokV;
            TopTok->GetTagTokV("linkGrp|link", LinkTokV);
            for (int LinkTokN = 0; LinkTokN < LinkTokV.Len(); LinkTokN++) {
                PXmlTok LinkTok = LinkTokV[LinkTokN];
                TStr LinkType = LinkTok->GetArgVal("type");
                // keep only one-to-one alignments; every other link type is skipped
                if (LinkType == "1:1") {
                    TXmlTokV S1TokV;
                    LinkTok->GetTagTokV("s1", S1TokV);
                    TXmlTokV S2TokV;
                    LinkTok->GetTagTokV("s2", S2TokV);
                    // a 1:1 link is expected to carry exactly one sentence per side
                    IAssert(S1TokV.Len() == 1);
                    IAssert(S2TokV.Len() == 1);
                    TStr ParaStr1 = S1TokV[0]->GetTagTokStr("");
                    TStr ParaStr2 = S2TokV[0]->GetTagTokStr("");
                    TransCorpus->AddSentenceNoTrans(SentId, ParaStr1, ParaStr2);
                    SentId++;
                }
            }
        } else {
            printf("Unknow tag: %s\n", TopTok->GetTagNm().CStr());
        }
        // honor the optional sentence limit (see NOTE(review) in the header)
        if ((MxSents != -1) && (TransCorpus->GetSentences() > MxSents)) {
            break;
        }
    }
    // terminate the '\r' progress line
    printf("\n");
    // finish
    return TransCorpus;
}