void TCpDoc::LoadReuters2000DocFromXml(const TStr& FNm, TStr& DocId, TStr& DateStr, TStr& TitleStr, TStr& HeadlineStr, TStr& BylineStr, TStr& DatelineStr, TStrV& ParStrV, TStrV& TopCdNmV, TStrV& GeoCdNmV, TStrV& IndCdNmV){ PXmlDoc Doc=TXmlDoc::LoadTxt(FNm); // get text strings // general document data DocId=Doc->GetTagTok("newsitem")->GetArgVal("itemid"); DateStr=Doc->GetTagTok("newsitem")->GetArgVal("date"); TitleStr=Doc->GetTagTok("newsitem|title")->GetTokStr(false); HeadlineStr=Doc->GetTagTok("newsitem|headline")->GetTokStr(false); BylineStr=""; PXmlTok BylineTok; if (Doc->IsTagTok("newsitem|byline", BylineTok)){ BylineStr=BylineTok->GetTokStr(false);} DatelineStr=""; PXmlTok DatelineTok; if (Doc->IsTagTok("newsitem|dateline", DatelineTok)){ DatelineStr=DatelineTok->GetTokStr(false);} // text paragraphs ParStrV.Clr(); TXmlTokV ParTokV; Doc->GetTagTokV("newsitem|text|p", ParTokV); for (int ParTokN=0; ParTokN<ParTokV.Len(); ParTokN++){ TStr ParStr=ParTokV[ParTokN]->GetTokStr(false); ParStrV.Add(ParStr); } // codes TopCdNmV.Clr(); GeoCdNmV.Clr(); IndCdNmV.Clr(); TXmlTokV CdsTokV; Doc->GetTagTokV("newsitem|metadata|codes", CdsTokV); for (int CdsTokN=0; CdsTokN<CdsTokV.Len(); CdsTokN++){ PXmlTok CdsTok=CdsTokV[CdsTokN]; TXmlTokV CdTokV; CdsTok->GetTagTokV("code", CdTokV); if (CdsTok->GetArgVal("class")=="bip:topics:1.0"){ for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){ TStr CdNm=CdTokV[CdTokN]->GetArgVal("code"); TopCdNmV.Add(CdNm); } } else if (CdsTok->GetArgVal("class")=="bip:countries:1.0"){ for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){ TStr CdNm=CdTokV[CdTokN]->GetArgVal("code"); GeoCdNmV.Add(CdNm); } } else if (CdsTok->GetArgVal("class")=="bip:industries:1.0"){ for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){ TStr CdNm=CdTokV[CdTokN]->GetArgVal("code"); IndCdNmV.Add(CdNm); } } else { Fail; } } }
void TDzsBsDoc::GetDocParts( const TStr& FNm, const PXmlDoc& XmlDoc, const TStr& FPath, const TStr& WebAlias, bool& Ok, TStr& IdStr, TStr& TitleStr, TStr& DataStr, int& YearN){ Ok=false; if (!XmlDoc->IsOk()){return;} // id IdStr=FNm; IdStr.ChangeStr(FPath, WebAlias); // PXmlTok IdTok; // if (XmlDoc->IsTagTok("term|metadata|identifier", IdTok)){ // IdStr=IdTok->GetTokStr(false);} // else {return;} // title PXmlTok TitleTok; if (XmlDoc->IsTagTok("term|metadata|title", TitleTok)){ TitleStr=TitleTok->GetTokStr(false);} else {return;} // timedata TXmlTokV TimeDataTokV; XmlDoc->GetTagTokV("term|data|frame|timedata|fromyear", TimeDataTokV); TStr TimeDataStr=TXmlTok::GetTokVStr(TimeDataTokV, false); if (TimeDataStr.IsInt(YearN)){} else {YearN=0;} // locdata TXmlTokV LocDataTokV; XmlDoc->GetTagTokV("term|data|frame|locdata", LocDataTokV); TStr LocDataStr=TXmlTok::GetTokVStr(LocDataTokV, true); // pages TXmlTokV PageTokV; XmlDoc->GetTagTokV("term|data|frame|page", PageTokV); DataStr=GetDataTokVStr(PageTokV, "\n")+" "+LocDataStr; // character-set transformation TitleStr=THtmlLxChDef::GetCSZFromWin1250(TitleStr); DataStr=THtmlLxChDef::GetCSZFromWin1250(DataStr); // success Ok=true; }