예제 #1
0
// Converts ACM TechNews HTML files found under InFPath into Cpd documents
// that are serialized to OutCpdFNm.
//   InFPath   - directory traversed (with the recursion flag set) for
//               .Html/.Htm files
//   OutCpdFNm - name of the output file receiving the saved documents
//   MxDocs    - maximal number of documents to save; -1 means no limit
// Each <SPAN CLASS="title"> element followed by a non-empty <P> paragraph
// produces one document whose name is its 1-based sequence number.
void TCpDoc::SaveAcmTechNewsToCpd(
 const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // processing html files
  TFFile FFile(TStrV()+InFPath, TStrV()+".Html"+".Htm", "", true);
  TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    // once the limit is reached there is no point in opening further files
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    printf("Processing file '%s'\r", FNm.CStr());
    PSIn SIn=TFIn::New(FNm);
    THtmlLx Lx(SIn);
    while (Lx.GetSym()!=hsyEof){
      // Docs counts documents already saved, so '>=' stops at exactly MxDocs
      if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
      // only <SPAN CLASS="title"> elements start a news item
      Lx.MoveToBTagOrEof("<SPAN>");
      if (Lx.GetArg("CLASS")!="title"){continue;}
      Lx.MoveToBTagOrEof("<A>");
      // NOTE(review): the title text is read and then immediately cleared, so
      // every document is saved with an empty title - confirm this is intended
      TStr TitleStr=Lx.GetStrToETag("<A>", false); TitleStr="";
      Lx.MoveToETagOrEof("<SPAN>");
      // the paragraph following the title holds the document text
      Lx.MoveToBTagOrEof("<P>");
      TStr ParStr=Lx.GetStrToETag("<P>", false);
      if (!ParStr.Empty()){
        Docs++;
        PCpDoc CpDoc=TCpDoc::New(TInt::GetStr(Docs), TitleStr, ParStr);
        CpDoc->Save(*SOut);
      }
    }
  }
  printf("\n");
}
예제 #2
0
파일: phrase.cpp 프로젝트: bergloman/qminer
// Builds an n-gram base from the Reuters-21578 .SGM files located in FPath.
// The whole document set is re-read once per pass until the n-gram base
// reports that it is finished.
//   MxDocs - per-pass document limit; -1 disables the limit
//   MxNGramLen, MnNGramFq, SwSet, Stemmer - passed on to TNGramBs::New
// Returns the finished n-gram base.
PNGramBs TNGramBs::GetNGramBsFromReuters21578(
 const TStr& FPath, const int& MxDocs,
 const int& MxNGramLen, const int& MnNGramFq,
 const PSwSet& SwSet, const PStemmer& Stemmer){
  PNGramBs Result=TNGramBs::New(MxNGramLen, MnNGramFq, SwSet, Stemmer);
  const bool LimitedP=(MxDocs!=-1);
  // one iteration of this loop is one pass over the document set
  while (!Result->IsFinished()){
    TFFile SgmFFile(FPath, ".SGM", false);
    TStr SgmFNm;
    int DocCount=0;
    while (SgmFFile.Next(SgmFNm)){
      // a single .SGM file yields a vector of documents
      TXmlDocV XmlDocV;
      TXmlDoc::LoadTxt(SgmFNm, XmlDocV);
      const int XmlDocs=XmlDocV.Len();
      for (int XmlDocN=0; XmlDocN<XmlDocs; ++XmlDocN){
        // the counter is advanced before the limit test
        ++DocCount;
        if (LimitedP&&(DocCount>=MxDocs)){break;}
        PXmlDoc XmlDoc=XmlDocV[XmlDocN];
        PXmlTok RootTok=XmlDoc->GetTok();
        // text of the REUTERS|TEXT element feeds the n-gram update
        PXmlTok TextTok=XmlDoc->GetTagTok("REUTERS|TEXT");
        TStr HtmlStr=TextTok->GetTokStr(false);
        _UpdateNGramBsFromHtmlStr(Result, HtmlStr, SwSet, Stemmer);
      }
      // leave the file loop as well when the limit ended the document loop
      if (LimitedP&&(DocCount>=MxDocs)){break;}
    }
    // close the current pass
    Result->ConcPass();
  }
  return Result;
}
예제 #3
0
파일: phrase.cpp 프로젝트: bergloman/qminer
// Builds an n-gram base from the files found under FPath, treating every
// file's text as an html string. The file set is traversed once per pass
// until the n-gram base reports that it is finished.
//   RecurseDirP - passed to the directory traversal as its recursion flag
//   MxDocs      - per-pass file limit; -1 disables the limit
//   MxNGramLen, MnNGramFq, SwSet, Stemmer - passed on to TNGramBs::New
// Returns the finished n-gram base.
PNGramBs TNGramBs::GetNGramBsFromHtmlFPathV(
 const TStr& FPath, const bool& RecurseDirP, const int& MxDocs,
 const int& MxNGramLen, const int& MnNGramFq,
 const PSwSet& SwSet, const PStemmer& Stemmer){
  PNGramBs Result=TNGramBs::New(MxNGramLen, MnNGramFq, SwSet, Stemmer);
  // one iteration of this loop is one pass over the file set
  while (!Result->IsFinished()){
    TFFile HtmlFFile(FPath, "", RecurseDirP);
    TStr HtmlFNm;
    int FilesSeen=0;
    while (HtmlFFile.Next(HtmlFNm)){
      // count the file first, then stop once the count exceeds the limit
      FilesSeen++;
      if ((MxDocs!=-1)&&(FilesSeen>MxDocs)){break;}
      // names that do not exist as files are skipped
      if (!TFile::Exists(HtmlFNm)){continue;}
      // load the text and feed it to the n-gram base
      TStr HtmlStr=TStr::LoadTxt(HtmlFNm);
      _UpdateNGramBsFromHtmlStr(Result, HtmlStr, SwSet, Stemmer);
    }
    // close the current pass
    Result->ConcPass();
  }
  return Result;
}
예제 #4
0
파일: flx.cpp 프로젝트: Accio/snap
void TFile::DelWc(const TStr& WcStr, const bool& RecurseDirP){
  // collect file-names
  TStrV FNmV;
  TFFile FFile(WcStr, RecurseDirP); TStr FNm;
  while (FFile.Next(FNm)){
    FNmV.Add(FNm);}
  // delete files
  for (int FNmN=0; FNmN<FNmV.Len(); FNmN++){
    Del(FNmV[FNmN], false);}
}
예제 #5
0
파일: skygrid.cpp 프로젝트: Accio/snap
/////////////////////////////////////////////////
// SkyGrid-Document
void TSkyGridBinDoc::SaveBinDocV(
 const TStr& InXmlFPath, const TStr& OutBinFNm, const int& MxDocs){
  printf("Processing SkyGrid-News-Xml files from '%s'...\n", InXmlFPath.CStr());
  TFOut SOut(OutBinFNm);
  TFFile FFile(InXmlFPath, true); TStr FNm;
  int Docs=0; int DateDocs=0; uint64 PrevTm=0;
  while (FFile.Next(FNm)){
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    //printf("  Processing '%s' ...", FNm.CStr());
    PXmlDoc XmlDoc=TXmlDoc::LoadTxt(FNm);
    PXmlTok ContentTok=XmlDoc->GetTagTok("item|content");
    TStr SwIdStr=ContentTok->GetTagTok("swid")->GetArgVal("value");
    TStr UrlStr=ContentTok->GetTagTok("url")->GetTokStr(false);
    TStr TitleStr=ContentTok->GetTagTok("title")->GetTokStr(false);
    TStr FetchedValStr=ContentTok->GetTagTok("fetched")->GetArgVal("value");
    TXmlTokV EntityTokV; ContentTok->GetTagTokV("annotations|entity", EntityTokV);
    TStr BodyStr=ContentTok->GetTagTok("body")->GetTokStr(false);
    // extract date
    TStr DateStr=SwIdStr.GetSubStr(0, 7);
    TStr YearStr=DateStr.GetSubStr(0, 3);
    TStr MonthStr=DateStr.GetSubStr(4, 5);
    TStr DayStr=DateStr.GetSubStr(6, 7);
    TTm DateTm(YearStr.GetInt(), MonthStr.GetInt(), DayStr.GetInt());
    uint64 Tm=TTm::GetMSecsFromTm(DateTm);
    // extract entities
    TStrIntH EntNmToFqH;
    for (int EntityTokN=0; EntityTokN<EntityTokV.Len(); EntityTokN++){
      PXmlTok EntityTok=EntityTokV[EntityTokN];
      if (!EntityTok->IsTag("entity")){continue;}
      TStr CanonicalNm=EntityTok->GetArgVal("canonical", "");
      TStr TextStr=EntityTok->GetArgVal("text", "");
      TStr TypeNm=EntityTok->GetArgVal("type", "");
      TStr EntNm=CanonicalNm.Empty() ? TextStr : CanonicalNm;
      EntNmToFqH.AddDat(EntNm)++;
    }
    TIntStrPrV FqEntNmPrV; EntNmToFqH.GetDatKeyPrV(FqEntNmPrV); FqEntNmPrV.Sort(false);
    // extract headline
    TChA HeadlineChA=BodyStr.GetSubStr(0, 250);
    while ((HeadlineChA.Len()>0)&&(HeadlineChA.LastCh()!=' ')){
      HeadlineChA.Trunc(HeadlineChA.Len()-1);}
    HeadlineChA+="...";
    // create document
    TSkyGridBinDoc Doc(SwIdStr, Tm, TitleStr, HeadlineChA, FqEntNmPrV);
    // save document
    Doc.Save(SOut);
    // screen log
    if (PrevTm!=Tm){
      if (PrevTm!=0){printf("\n");}
      PrevTm=Tm; DateDocs=0;
    }
    Docs++; DateDocs++;
    printf("  %s [Day:%d / All:%d]\r", DateStr.CStr(), DateDocs, Docs);
  }
  printf("\nDone.\n");
}
예제 #6
0
파일: bowflx.cpp 프로젝트: Accio/snap
// Loads the Reuters-21578 collection from the .SGM files in FPath into a
// bag-of-words document base.
//   FPath         - directory with the .SGM files (recursion flag is false)
//   MxDocs        - document limit; -1 means no limit
//   SwSetTypeNm   - stop-word set type name given to TSwSet::GetSwSet
//   StemmerTypeNm - stemmer type name given to TStemmer::GetStemmer
//   MxNGramLen, MnNGramFq - n-gram settings; when both equal 1 no n-gram
//                   base is created
//   SaveDocP      - forwarded to AddHtmlDoc
//   Notify        - not used by this function
// Documents whose TOPICS attribute is "YES" are registered as training or
// testing documents according to their LEWISSPLIT attribute.
// Returns the document base after calling AssertOk() on it.
PBowDocBs TBowFl::LoadReuters21578Txt(
 const TStr& FPath, const int& MxDocs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq, const bool& SaveDocP,
 const PNotify& Notify){
  // prepare stop-words
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  // prepare stemmer
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // create ngrams
  PNGramBs NGramBs;
  if (!((MxNGramLen==1)&&(MnNGramFq==1))){
    NGramBs=TNGramBs::GetNGramBsFromReuters21578(
     FPath, MxDocs,
     MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  // traverse directory
  TFFile FFile(FPath, ".SGM", false); TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    printf("Processing file '%s'\n", FNm.CStr());
    TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
    for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
      Docs++; if (Docs%100==0){printf("%d\r", Docs);}
      // NOTE(review): the counter is incremented before this '>=' test, so at
      // most MxDocs-1 documents are added; GetNGramBsFromReuters21578 applies
      // the same test, so the two must be changed together if at all
      if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
      // get document-name
      PXmlDoc Doc=LDocV[LDocN];
      PXmlTok DocTok=Doc->GetTok();
      TStr DocNm=DocTok->GetArgVal("NEWID");
      // get document-categories
      TStrV CatNmV;
      TXmlTokV TopicsTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopicsTokV);
      for (int TopicsTokN=0; TopicsTokN<TopicsTokV.Len(); TopicsTokN++){
        TStr CatNm=TopicsTokV[TopicsTokN]->GetTokStr(false);
        CatNmV.Add(CatNm);
      }
      // get document-contents
      PXmlTok DocStrTok=Doc->GetTagTok("REUTERS|TEXT");
      TStr DocStr=DocStrTok->GetTokStr(false);
      // add document to bow
      int DId=BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP);
      // train & test data: the split attribute is read once; the TOPICS
      // attribute is only consulted for TRAIN/TEST documents, as before
      TStr SplitStr=DocTok->GetArgVal("LEWISSPLIT");
      const bool TrainP=(SplitStr=="TRAIN");
      const bool TestP=(SplitStr=="TEST");
      if ((TrainP||TestP)&&(DocTok->GetArgVal("TOPICS")=="YES")){
        if (TrainP){
          BowDocBs->AddTrainDId(DId);
        } else {
          BowDocBs->AddTestDId(DId);
        }
      }
    }
    // propagate the limit break from the document loop to the file loop
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
  }
  // return results
  BowDocBs->AssertOk();
  return BowDocBs;
}
예제 #7
0
void TCpDoc::SaveReuters21578ToCpd(
 const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // traverse directory with .sgm files
  TFFile FFile(InFPath, ".SGM", false); TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    printf("Processing file '%s'\n", FNm.CStr());
    TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
    for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
      Docs++; if (Docs%100==0){printf("%d\r", Docs);}
      if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
      // create reuters document
      PCpDoc CpDoc=TCpDoc::New();
      // load xml document
      PXmlDoc Doc=LDocV[LDocN];
      PXmlTok DocTok=Doc->GetTok();
      // document id
      CpDoc->DocNm=DocTok->GetArgVal("NEWID");
      // date
      CpDoc->DateStr=Doc->GetTagTok("REUTERS|DATE")->GetTokStr(false);
      // document title
      PXmlTok TitleTok=Doc->GetTagTok("REUTERS|TEXT|TITLE");
      if (!TitleTok.Empty()){
        CpDoc->TitleStr=TitleTok->GetTokStr(false);}
      // dateline
      PXmlTok DatelineTok=Doc->GetTagTok("REUTERS|TEXT|DATELINE");
      if (!DatelineTok.Empty()){
        CpDoc->DatelineStr=DatelineTok->GetTokStr(false);}
      // get text string
      TStr TextStr;
      PXmlTok BodyTok=Doc->GetTagTok("REUTERS|TEXT|BODY");
      if (!BodyTok.Empty()){
        TextStr=BodyTok->GetTokStr(false);
      } else {
        // if <BODY> doesn't exist, take the whole <TEXT>
        PXmlTok TextTok=Doc->GetTagTok("REUTERS|TEXT");
        if (!TextTok.Empty()){
          TextStr=TextTok->GetTokStr(false);
        }
      }
      CpDoc->ParStrV.Add(TextStr, 1);
      // topic categories
      TXmlTokV TopCatTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopCatTokV);
      for (int TokN=0; TokN<TopCatTokV.Len(); TokN++){
        TStr CatNm=TopCatTokV[TokN]->GetTokStr(false);
        CpDoc->TopCdNmV.Add(CatNm);
      }
      // save cpd document
      CpDoc->Save(*SOut);
    }
    if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
  }
}
예제 #8
0
void TFFile::GetFNmV(
 const TStr& FPath, const TStrV& FExtV, const bool& RecurseP, TStrV& FNmV){
  // prepare file-directory traversal
  TStrV FPathV; FPathV.Add(FPath);
  TFFile FFile(FPathV, FExtV, "", RecurseP); TStr FNm;
  // traverse directory
  FNmV.Clr();
  while (FFile.Next(FNm)){
    FNmV.Add(FNm);
  }
}
// Creates an alignment-pair base and adds one alignment pair for every
// ".xml" file found in FPath (recursion flag is false).
//   MxSents - forwarded unchanged to TAlignPair::LoadAcXml for each file
// Returns the populated base.
PAlignPairBs TAlignPairBs::LoadAcXml(const TStr& FPath, const int& MxSents) {
    PAlignPairBs Result = TAlignPairBs::New();
    TFFile XmlFFile(FPath, ".xml", false);
    // each file contributes exactly one alignment pair
    for (TStr XmlFNm; XmlFFile.Next(XmlFNm); ) {
        PAlignPair LoadedPair = TAlignPair::LoadAcXml(XmlFNm, MxSents);
        Result->AddAlignPair(LoadedPair);
    }
    return Result;
}
예제 #10
0
void FData::NewDataFile(int _Type, int _Size)
{
    /*
      Creates a buffer for a new file.

      When no file of type _Type exists yet, a new entry is appended to
      Files, tagged with _Type, and given a freshly allocated buffer of
      _Size bytes. UpdateMaxFileType() and DefSerieMain() run in either case.
    */

    const bool AlreadyPresent = ExistFileType(_Type);
    if(!AlreadyPresent)
    {
        // append an empty entry and tag it with the requested type
        Files.push_back(FFile());
        Files.back().Type = _Type;

        // look the entry up by type, then attach its buffer
        const int Id = GetId(_Type);
        Files[Id].Buffer = new char[_Size];
        Files[Id].SizeBuffer = _Size;
    }

    UpdateMaxFileType();
    DefSerieMain();

}
예제 #11
0
// Builds the Dzs base from the files under FPath: every file yielded by the
// traversal is turned into a document and added to the base.
//   FPath    - root path of the traversal; also passed to GetDzsBsDoc
//   WebAlias - passed to GetDzsBsDoc
// Files whose upper-cased extension equals ".XML" are loaded as XML, all
// others as HTML.
TDzsBs::TDzsBs(const TStr& FPath, const TStr& WebAlias):
  TBook("Dzs-Base", "Dzs", "Lexicon"),
  DzsBsDocV(), Bix(TBix::New()){
  // traversal roots
  TStrV FPathV;
  FPathV.Add(FPath);
  // accepted extensions
  TStrV FExtV;
  FExtV.Add("xml");
  FExtV.Add("html");
  FExtV.Add("htm");
  FExtV.Add("txt");
  // NOTE(review): other call sites in this code base pass a base wild-card
  // string before the recursion flag - confirm which overload 'true' selects
  TFFile DocFFile(FPathV, FExtV, true);
  TStr DocFNm;
  // FilesSeen is the 1-based count shown as progress
  for (int FilesSeen=1; DocFFile.Next(DocFNm); FilesSeen++){
    printf("%d\r", FilesSeen);
    const bool XmlP=(DocFNm.GetFExt().GetUc()==".XML");
    PDzsBsDoc NewDoc;
    if (!XmlP){
      PHtmlDoc HtmlDoc=THtmlDoc::LoadTxt(DocFNm, hdtAll, false);
      NewDoc=TDzsBsDoc::GetDzsBsDoc(DocFNm, HtmlDoc, FPath, WebAlias);
    } else {
      PXmlDoc XmlDoc=TXmlDoc::LoadTxt(DocFNm);
      NewDoc=TDzsBsDoc::GetDzsBsDoc(DocFNm, XmlDoc, FPath, WebAlias);
    }
    AddDoc(NewDoc);
  }
}
예제 #12
0
파일: bowfl.cpp 프로젝트: Accio/snap
/////////////////////////////////////////////////
// BagOfWords-Files
// Adds the files found under FPath to an existing bag-of-words document
// base as html documents.
//   BowDocBs    - document base that receives the documents
//   FPath       - root path of the traversal
//   NewDIdV     - cleared, then filled with the ids of the added documents
//   RecurseDirP - passed to the traversal as its recursion flag
//   MxDocs      - file limit; -1 means no limit
//   SaveDocP    - forwarded to AddHtmlDoc
//   Notify      - receives status messages
// The document name is the lower-cased file name with the normalized root
// path removed; the part before its first path separator, if any, becomes
// the document's single category.
void TBowFl::LoadHtmlTxt(
 PBowDocBs BowDocBs, const TStr& FPath, TIntV& NewDIdV,
 const bool& RecurseDirP, const int& MxDocs,
 const bool& SaveDocP, const PNotify& Notify) {
  // lower-cased normalized root path, used to shorten document names
  TStr LcNrFPath=TStr::GetNrFPath(FPath).GetLc();
  Notify->OnStatus("Creating Bow from file-path " + FPath + " ...");
  TFFile HtmlFFile(FPath, "", RecurseDirP);
  TStr HtmlFNm;
  int Docs=0;
  NewDIdV.Clr();
  while (HtmlFFile.Next(HtmlFNm)){
    // count first, then stop once the count exceeds the limit
    Docs++;
    if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    Notify->OnStatus(TStr::Fmt("%d\r", Docs));
    // names that do not exist as files are skipped
    if (!TFile::Exists(HtmlFNm)){continue;}
    // document name relative to the root path
    TStr DocNm=HtmlFNm.GetLc();
    if (DocNm.IsPrefix(LcNrFPath)){
      DocNm=DocNm.GetSubStr(LcNrFPath.Len(), DocNm.Len()-1);
    }
    // category: text before the separator; '/' is tried before '\\'
    char SepCh=0;
    if (DocNm.IsChIn('/')){
      SepCh='/';
    } else if (DocNm.IsChIn('\\')){
      SepCh='\\';
    }
    TStr CatNm;
    if (SepCh!=0){
      TStr RestStr;
      DocNm.SplitOnCh(CatNm, SepCh, RestStr);
    }
    TStrV CatNmV;
    if (!CatNm.Empty()){
      CatNmV.Add(CatNm);
    }
    // load the content and register the document
    TStr DocStr=TStr::LoadTxt(HtmlFNm);
    const int DId=BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP);
    NewDIdV.Add(DId);
  }
  Notify->OnStatus(TStr::Fmt("%d", Docs));
  // final status and consistency check
  Notify->OnStatus("Done.");
  BowDocBs->AssertOk();
}
예제 #13
0
// Converts the html/htm/xml files under InHtmlFPath into Cpd documents that
// are serialized to OutCpdFNm.
//   InHtmlFPath - root path of the traversal
//   OutCpdFNm   - name of the output file
//   (unnamed bool) - recursion request; ignored, the traversal is always
//                 created with its recursion flag set to true
//   MxDocs      - maximal number of documents to save; -1 means no limit
void TCpDoc::SaveHtmlToCpd(
 const TStr& InHtmlFPath, const TStr& OutCpdFNm,
 const bool& /*RecurseDirP*/, const int& MxDocs){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // prepare file-directory traversal
  TStrV FPathV; FPathV.Add(InHtmlFPath);
  TStrV FExtV; FExtV.Add("html"); FExtV.Add("htm"); FExtV.Add("xml");
  TFFile FFile(FPathV, FExtV, "", true); TStr FNm;
  // traverse files
  printf("Processing '%s' ...\n", InHtmlFPath.CStr());
  int Docs=0;
  while (FFile.Next(FNm)){
    // Docs counts documents already saved; testing it (rather than the
    // traversal's file index) stops at exactly MxDocs documents regardless
    // of whether that index is 0- or 1-based
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    Docs++; if (FFile.GetFNmN()%10==0){printf("%d\r", Docs);}
    // load html
    PCpDoc CpDoc=TCpDoc::LoadHtmlDoc(FNm);
    // save cpd document
    CpDoc->Save(*SOut);
  }
  printf("%d\nDone.\n", Docs);
}
예제 #14
0
// Creates a Prolog base and loads into it every tuple parsed from the files
// matching the wild-card FNmWc.
// Each parsed tuple is additionally indexed by its (functor id, arity) pair.
// Returns the populated base.
PPlBs TPlBs::LoadTxtPl(const TStr& FNmWc){
  PPlBs Result=TPlBs::New();
  printf("Loading Prolog from '%s'...\n", FNmWc.CStr());
  TFFile PlFFile(FNmWc);
  for (TStr PlFNm; PlFFile.Next(PlFNm); ){
    printf("  ...loading '%s'\n", PlFNm.CStr());
    // lexer over the current file
    PSIn SIn=TFIn::New(PlFNm);
    TILx Lx(SIn, TFSet()|/*iloList|*/iloCmtAlw|iloCsSens|iloUniStr|iloExcept);
    // read one tuple per iteration until end-of-file
    for (Lx.GetSym(TFSet(TupExpect)|syEof); Lx.Sym!=syEof;
     Lx.GetSym(TFSet(TupExpect)|syEof)){
      TPlVal TupVal=TPlBs::ParseTup(Lx, TFSet()|syPeriod, Result);
      const int TupId=TupVal.GetTupId();
      int FuncId=Result->TupV[TupId]->GetFuncId();
      int Arity=Result->TupV[TupId]->GetVals();
      // register the tuple under its functor/arity key
      Result->FuncIdArityPrToTupIdVH.
       AddDat(TIntPr(FuncId, Arity)).Add(TupId);
    }
  }
  printf("Done.\n");
  return Result;
}
예제 #15
0
// Converts the Reuters-2000 .XML files under InFPath (recursion flag set)
// into Cpd documents that are serialized to OutCpdFNm.
//   InFPath   - root path of the traversal
//   OutCpdFNm - name of the output file
//   MxDocs    - maximal number of files to convert; -1 means no limit
// Progress is printed for every 100th file.
void TCpDoc::SaveReuters2000ToCpd(
 const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // processing xml files
  TFFile FFile(InFPath, ".XML", true); TStr FNm;
  while (FFile.Next(FNm)){
    if ((MxDocs!=-1)&&(FFile.GetFNmN()>=MxDocs)){break;}
    if ((1+FFile.GetFNmN())%100==0){
      printf("Processing file '%s' (%d)\r", FNm.CStr(), 1+FFile.GetFNmN());}
    // get document; the loader is handed the file name, so the file is not
    // parsed separately here
    // NOTE(review): assumes LoadReuters2000DocFromXml reads the file itself -
    // confirm against its definition
    PCpDoc CpDoc=TCpDoc::New();
    LoadReuters2000DocFromXml(FNm,
     CpDoc->DocNm, CpDoc->DateStr, CpDoc->TitleStr,
     CpDoc->HeadlineStr, CpDoc->BylineStr, CpDoc->DatelineStr,
     CpDoc->ParStrV,
     CpDoc->TopCdNmV, CpDoc->GeoCdNmV, CpDoc->IndCdNmV);
    // save cpd document
    CpDoc->Save(*SOut);
  }
  printf("\n");
}
예제 #16
0
void TCpDoc::SaveAsfaToCpd(const TStr& InFPath, const TStr& OutCpdFNm){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // traverse files
  TStrH AccessionIdH;
  TFFile FFile(TStr::GetNrFPath(InFPath)+"*.Asfa"); TStr AsfaFNm;
  while (FFile.Next(AsfaFNm)){
    printf("Processing file '%s'\n", AsfaFNm.CStr());
    PSIn SIn=TFIn::New(AsfaFNm);
    TILx Lx(SIn, TFSet(iloRetEoln, iloExcept));
    Lx.GetSym(syLn, syEof);
    while (Lx.Sym!=syEof){
      // Query Line
      TStr QueryLnStr=Lx.Str;
      TStrV QueryStrV; QueryLnStr.SplitOnAllCh('\t', QueryStrV, false);
      IAssert(QueryStrV[0]=="Query");
      // RecordNo Line
      Lx.GetSym(syLn); TStr RecNoLnStr=Lx.Str;
      TStrV RecNoStrV; RecNoLnStr.SplitOnAllCh('\t', RecNoStrV, false);
      IAssert(RecNoStrV[0]=="RecordNo");
      //int RecN=RecNoStrV[1].GetInt();
      // fields (format: Short-Name Tab Long-Name Tab Value-String)
      TStr TitleStr, AbstractStr, PublicationYearStr, AccessionId;
      TStrV AuthorNmV; TStrV TermNmV1, TermNmV2;
      while (true){
        Lx.GetSym(syLn); TStr FldLnStr=Lx.Str;
        TStrV FldStrV; FldLnStr.SplitOnAllCh('\t', FldStrV, false);
        if (FldStrV[0]=="----"){
          if (!AccessionIdH.IsKey(AccessionId)){
            AccessionIdH.AddKey(AccessionId);
            // create & save cpd document
            PCpDoc CpDoc=TCpDoc::New();
            CpDoc->DocNm=AccessionId;
            CpDoc->DateStr=PublicationYearStr;
            CpDoc->TitleStr=TitleStr;
            CpDoc->ParStrV.Add(AbstractStr);
            CpDoc->TopCdNmV=TermNmV1;
            CpDoc->GeoCdNmV=TermNmV2;
            CpDoc->IndCdNmV=AuthorNmV;
            CpDoc->Save(*SOut);
          } else {/*printf("[%s]", AccessionId.CStr());*/}
          break;
        } else
        if (FldStrV[0]=="TI"){
          TitleStr=FldStrV[2];
        } else if (FldStrV[0]=="TI"){
          TitleStr=FldStrV[2];
        } else if (FldStrV[0]=="AU"){
          FldStrV[2].SplitOnAllCh(';', AuthorNmV);
          for (int StrN=0; StrN<AuthorNmV.Len(); StrN++){AuthorNmV[StrN].ToTrunc();}
        } else if (FldStrV[0]=="AB"){
          AbstractStr=FldStrV[2];
        } else if (FldStrV[0]=="PY"){
          PublicationYearStr=FldStrV[2];
        } else if (FldStrV[0]=="DE"){
          FldStrV[2].SplitOnAllCh(';', TermNmV1);
          for (int StrN=0; StrN<TermNmV1.Len(); StrN++){TermNmV1[StrN].ToTrunc();}
        } else if (FldStrV[0]=="CL"){
          FldStrV[2].SplitOnAllCh(';', TermNmV2);
          for (int StrN=0; StrN<TermNmV2.Len(); StrN++){TermNmV2[StrN].ToTrunc();}
        } else if (FldStrV[0]=="AN"){
          AccessionId=FldStrV[2];
        }
      }
      printf("%d\r", AccessionIdH.Len());
      Lx.GetSym(syLn, syEof);
    }
  }
}