Beispiel #1
0
void TBowFl::LoadLnDocTxt(PBowDocBs BowDocBs, const TStr& LnDocFNm,
 TIntV& NewDIdV, const bool& NamedP, const int& MxDocs, const bool& SaveDocP) {
  // open line-doc file
  NewDIdV.Clr(); TFIn FIn(LnDocFNm); char Ch=' '; int Docs=0;
  while (!FIn.Eof()){
    Docs++; if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    printf("%d\r", Docs);
    // document name
    TChA DocNm;
    Ch=FIn.GetCh();
    if (NamedP){
      while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
        DocNm+=Ch; Ch=FIn.GetCh();}
      DocNm.Trunc();
      if (DocNm.Empty()){Docs--; continue;}
    } else {
        DocNm = TInt::GetStr(Docs);
    }
    // categories
    TStrV CatNmV;
    forever {
      while ((!FIn.Eof())&&(Ch==' ')){Ch=FIn.GetCh();}
      if (Ch=='!'){
        if (!FIn.Eof()){Ch=FIn.GetCh();}
        TChA CatNm;
        while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
          CatNm+=Ch; Ch=FIn.GetCh();}
        if (!CatNm.Empty()){CatNmV.Add(CatNm);}
      } else {
        break;
      }
    }
    // document text
    TChA DocChA;
    while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')){
      DocChA+=Ch; Ch=FIn.GetCh();}
    // skip empty documents (empty lines)
    if (DocNm.Empty()&&DocChA.Empty()){
      continue;}
    // add document to document-base
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocChA, SaveDocP));
  }
  // return document-base
  BowDocBs->AssertOk();
  printf("\n");
}
Beispiel #2
0
/////////////////////////////////////////////////
// BagOfWords-Files
void TBowFl::LoadHtmlTxt(
 PBowDocBs BowDocBs, const TStr& FPath, TIntV& NewDIdV,
 const bool& RecurseDirP, const int& MxDocs,
 const bool& SaveDocP, const PNotify& Notify) {
  // prepare file-directory traversal
  TStr LcNrFPath=TStr::GetNrFPath(FPath).GetLc();
  Notify->OnStatus("Creating Bow from file-path " + FPath + " ...");
  TFFile FFile(FPath, "", RecurseDirP);
  // traverse files
  TStr FNm; int Docs=0; NewDIdV.Clr();
  while (FFile.Next(FNm)){
    Docs++; if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    Notify->OnStatus(TStr::Fmt("%d\r", Docs));
    // prepare document-name
    if (TFile::Exists(FNm)) { //B:
        TStr DocNm=FNm.GetLc();
        if (DocNm.IsPrefix(LcNrFPath)){
          DocNm=DocNm.GetSubStr(LcNrFPath.Len(), DocNm.Len()-1);}
        // categories
        TStrV CatNmV; TStr CatNm;
        if (DocNm.IsChIn('/')){
          TStr Str; DocNm.SplitOnCh(CatNm, '/', Str);
        } else if (DocNm.IsChIn('\\')){
          TStr Str; DocNm.SplitOnCh(CatNm, '\\', Str);
        }
        if (!CatNm.Empty()){
          CatNmV.Add(CatNm);}
        // load document-content
        TStr DocStr=TStr::LoadTxt(FNm);
        // add document to bow
        NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP));
    }
  }
  Notify->OnStatus(TStr::Fmt("%d", Docs));
  // return results
  Notify->OnStatus("Done.");
  BowDocBs->AssertOk();
}