Exemple #1
0
void TCpDoc::LoadReuters2000DocFromXml(const TStr& FNm,
 TStr& DocId, TStr& DateStr, TStr& TitleStr,
 TStr& HeadlineStr, TStr& BylineStr, TStr& DatelineStr,
 TStrV& ParStrV,
 TStrV& TopCdNmV, TStrV& GeoCdNmV, TStrV& IndCdNmV){
  PXmlDoc Doc=TXmlDoc::LoadTxt(FNm);
  // get text strings
  // general document data
  DocId=Doc->GetTagTok("newsitem")->GetArgVal("itemid");
  DateStr=Doc->GetTagTok("newsitem")->GetArgVal("date");
  TitleStr=Doc->GetTagTok("newsitem|title")->GetTokStr(false);
  HeadlineStr=Doc->GetTagTok("newsitem|headline")->GetTokStr(false);
  BylineStr=""; PXmlTok BylineTok;
  if (Doc->IsTagTok("newsitem|byline", BylineTok)){
    BylineStr=BylineTok->GetTokStr(false);}
  DatelineStr=""; PXmlTok DatelineTok;
  if (Doc->IsTagTok("newsitem|dateline", DatelineTok)){
    DatelineStr=DatelineTok->GetTokStr(false);}
  // text paragraphs
  ParStrV.Clr(); TXmlTokV ParTokV; Doc->GetTagTokV("newsitem|text|p", ParTokV);
  for (int ParTokN=0; ParTokN<ParTokV.Len(); ParTokN++){
    TStr ParStr=ParTokV[ParTokN]->GetTokStr(false);
    ParStrV.Add(ParStr);
  }
  // codes
  TopCdNmV.Clr(); GeoCdNmV.Clr(); IndCdNmV.Clr();
  TXmlTokV CdsTokV; Doc->GetTagTokV("newsitem|metadata|codes", CdsTokV);
  for (int CdsTokN=0; CdsTokN<CdsTokV.Len(); CdsTokN++){
    PXmlTok CdsTok=CdsTokV[CdsTokN];
    TXmlTokV CdTokV; CdsTok->GetTagTokV("code", CdTokV);
    if (CdsTok->GetArgVal("class")=="bip:topics:1.0"){
      for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){
        TStr CdNm=CdTokV[CdTokN]->GetArgVal("code");
        TopCdNmV.Add(CdNm);
      }
    } else
    if (CdsTok->GetArgVal("class")=="bip:countries:1.0"){
      for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){
        TStr CdNm=CdTokV[CdTokN]->GetArgVal("code");
        GeoCdNmV.Add(CdNm);
      }
    } else
    if (CdsTok->GetArgVal("class")=="bip:industries:1.0"){
      for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){
        TStr CdNm=CdTokV[CdTokN]->GetArgVal("code");
        IndCdNmV.Add(CdNm);
      }
    } else {
      Fail;
    }
  }
}
Exemple #2
0
void TDzsBsDoc::GetDocParts(
 const TStr& FNm, const PXmlDoc& XmlDoc,
 const TStr& FPath, const TStr& WebAlias,
 bool& Ok, TStr& IdStr, TStr& TitleStr, TStr& DataStr, int& YearN){
  Ok=false;
  if (!XmlDoc->IsOk()){return;}
  // id
  IdStr=FNm;
  IdStr.ChangeStr(FPath, WebAlias);
//  PXmlTok IdTok;
//  if (XmlDoc->IsTagTok("term|metadata|identifier", IdTok)){
//    IdStr=IdTok->GetTokStr(false);}
//  else {return;}
  // title
  PXmlTok TitleTok;
  if (XmlDoc->IsTagTok("term|metadata|title", TitleTok)){
    TitleStr=TitleTok->GetTokStr(false);}
  else {return;}
  // timedata
  TXmlTokV TimeDataTokV;
  XmlDoc->GetTagTokV("term|data|frame|timedata|fromyear", TimeDataTokV);
  TStr TimeDataStr=TXmlTok::GetTokVStr(TimeDataTokV, false);
  if (TimeDataStr.IsInt(YearN)){} else {YearN=0;}
  // locdata
  TXmlTokV LocDataTokV;
  XmlDoc->GetTagTokV("term|data|frame|locdata", LocDataTokV);
  TStr LocDataStr=TXmlTok::GetTokVStr(LocDataTokV, true);
  // pages
  TXmlTokV PageTokV; XmlDoc->GetTagTokV("term|data|frame|page", PageTokV);
  DataStr=GetDataTokVStr(PageTokV, "\n")+" "+LocDataStr;
  // character-set transformation
  TitleStr=THtmlLxChDef::GetCSZFromWin1250(TitleStr);
  DataStr=THtmlLxChDef::GetCSZFromWin1250(DataStr);
  // success
  Ok=true;
}