// Loads documents from a line-oriented text file into a bag-of-words
// document-base; each line of the file describes one document:
//
//   [<name>] {!<category>}* <text...>
//
// Parameters:
//   BowDocBs - document-base receiving the documents (via AddHtmlDoc).
//   LnDocFNm - name of the line-document file to read.
//   NewDIdV  - cleared on entry; receives the value returned by AddHtmlDoc
//              for every document added (presumably the new document id).
//   NamedP   - when true, the first space-delimited token of a line is the
//              document name and lines without a name are skipped; when
//              false, the running document counter is used as the name.
//   MxDocs   - maximal number of documents to read; -1 means no limit.
//   SaveDocP - forwarded unchanged to AddHtmlDoc.
//
// Progress (the running document counter) is printed to stdout.
void TBowFl::LoadLnDocTxt(PBowDocBs BowDocBs, const TStr& LnDocFNm,
 TIntV& NewDIdV, const bool& NamedP, const int& MxDocs, const bool& SaveDocP) {
  // open line-doc file
  NewDIdV.Clr();
  TFIn FIn(LnDocFNm);
  char Ch=' ';
  int Docs=0;
  while (!FIn.Eof()){
    Docs++;
    // Stop only once the limit has been exceeded, so that exactly MxDocs
    // documents are processed. The previous test (Docs>=MxDocs) stopped one
    // document early and disagreed with LoadHtmlTxt, which uses '>'.
    if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    printf("%d\r", Docs);
    // document name
    TChA DocNm;
    Ch=FIn.GetCh();
    if (NamedP){
      // the name runs up to the first space or line terminator
      while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
        DocNm+=Ch; Ch=FIn.GetCh();}
      DocNm.Trunc();
      // a line without a name (e.g. an empty line) does not count as a document
      if (DocNm.Empty()){Docs--; continue;}
    } else {
      DocNm = TInt::GetStr(Docs);
    }
    // categories: zero or more space-separated tokens, each prefixed with '!'
    TStrV CatNmV;
    forever {
      while ((!FIn.Eof())&&(Ch==' ')){Ch=FIn.GetCh();}
      if (Ch=='!'){
        if (!FIn.Eof()){Ch=FIn.GetCh();}
        TChA CatNm;
        while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
          CatNm+=Ch; Ch=FIn.GetCh();}
        if (!CatNm.Empty()){CatNmV.Add(CatNm);}
      } else {
        break;
      }
    }
    // document text: the remainder of the line
    // NOTE(review): if FIn.Eof() turns true as soon as the last character of
    // the file has been read, the final character of an unterminated last
    // line is never appended -- confirm against TFIn before relying on it.
    TChA DocChA;
    while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')){
      DocChA+=Ch; Ch=FIn.GetCh();}
    // skip empty documents (empty lines)
    // NOTE(review): DocNm appears to be always non-empty at this point (named
    // lines with an empty name were skipped above, unnamed documents get the
    // counter as name), so this check looks unreachable -- verify the intent.
    if (DocNm.Empty()&&DocChA.Empty()){
      continue;}
    // add document to document-base
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocChA, SaveDocP));
  }
  // validate document-base and terminate the progress line
  BowDocBs->AssertOk();
  printf("\n");
}
///////////////////////////////////////////////// // BagOfWords-Files void TBowFl::LoadHtmlTxt( PBowDocBs BowDocBs, const TStr& FPath, TIntV& NewDIdV, const bool& RecurseDirP, const int& MxDocs, const bool& SaveDocP, const PNotify& Notify) { // prepare file-directory traversal TStr LcNrFPath=TStr::GetNrFPath(FPath).GetLc(); Notify->OnStatus("Creating Bow from file-path " + FPath + " ..."); TFFile FFile(FPath, "", RecurseDirP); // traverse files TStr FNm; int Docs=0; NewDIdV.Clr(); while (FFile.Next(FNm)){ Docs++; if ((MxDocs!=-1)&&(Docs>MxDocs)){break;} Notify->OnStatus(TStr::Fmt("%d\r", Docs)); // prepare document-name if (TFile::Exists(FNm)) { //B: TStr DocNm=FNm.GetLc(); if (DocNm.IsPrefix(LcNrFPath)){ DocNm=DocNm.GetSubStr(LcNrFPath.Len(), DocNm.Len()-1);} // categories TStrV CatNmV; TStr CatNm; if (DocNm.IsChIn('/')){ TStr Str; DocNm.SplitOnCh(CatNm, '/', Str); } else if (DocNm.IsChIn('\\')){ TStr Str; DocNm.SplitOnCh(CatNm, '\\', Str); } if (!CatNm.Empty()){ CatNmV.Add(CatNm);} // load document-content TStr DocStr=TStr::LoadTxt(FNm); // add document to bow NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP)); } } Notify->OnStatus(TStr::Fmt("%d", Docs)); // return results Notify->OnStatus("Done."); BowDocBs->AssertOk(); }