///////////////////////////////////////////////// // Find-File TFFile::TFFile(const TStr& FNmWc, const bool& _RecurseP): FPathV(), FExtV(), FBaseWc(), CsImpP(false), RecurseP(_RecurseP), FPathN(0-1), FFileDesc(TFFileDesc::New()), SubFFile(), CurFNm(), CurFNmN(0-1){ // prepare file-base-name wild-card FBaseWc=FNmWc.GetFBase(); if (!CsImpP){FBaseWc.ToUc();} // get & assign file-name TStr FPath=FNmWc.GetFPath(); FPathV.Add(TStr::GetNrFPath(FPath)); }
// Loads a sentence-aligned corpus from two directories of line-aligned files.
//   InOrgFPath:   directory searched (via TFFile with "txt", false) for the
//                 original-language files.
//   InTransFPath: directory holding the translations; each translation file
//                 must have the same base name as its original (asserted).
// Returns a new corpus in which every aligned line pair is added as one
// sentence; sentence ids are assigned consecutively from 0 across all files.
//
// Per-file line handling (both files are read in lock-step):
//   - an empty line on either side starts a skip region,
//   - a line starting with '<' on either side is treated as a markup tag: it
//     must start with '<' on both sides (otherwise the file is abandoned as
//     misaligned), it is not stored, and it ends any skip region,
//   - any other line pair outside a skip region is stored as a sentence.
// NOTE(review): the name suggests the Europarl corpus layout - confirm.
PTransCorpus TTransCorpus::LoadEP(const TStr& InOrgFPath, const TStr& InTransFPath) {
  // prepare result structures
  PTransCorpus TransCorpus = TTransCorpus::New();
  // iterate over all the files
  TStr NrmInTransFPath = TStr::GetNrAbsFPath(InTransFPath);
  TFFile OrgFNms(InOrgFPath, "txt", false); TStr OrgFNm;
  int SentId = 0;
  while (OrgFNms.Next(OrgFNm)) {
    // get name of the file with aligned sentences
    TStr TransFNm = NrmInTransFPath + OrgFNm.GetFBase();
    IAssertR(TFile::Exists(TransFNm), TransFNm);
    // load file
    printf("Loading %s and %s ...\r", OrgFNm.CStr(), TransFNm.CStr());
    TLnRet OrgLnRet(TFIn::New(OrgFNm));
    TLnRet TransLnRet(TFIn::New(TransFNm));
    TStr OrgLn, TransLn; int LnN = 1; bool EmptyLnP = false;
    while (OrgLnRet.NextLn(OrgLn)) {
      if (!TransLnRet.NextLn(TransLn)) {
        printf("\nEarly stop in line (%s:%s)[%d]\n",
          OrgLn.CStr(), TransLn.CStr(), LnN);
        break; // translation file finished first, let's stop
      }
      // The emptiness test must look at the translation LINE (TransLn), not
      // the translation file name: the branches below index OrgLn[0] and
      // TransLn[0], which is only valid when both lines are non-empty.
      if (OrgLn.Empty() || TransLn.Empty()) {
        // skip the empty line and everything up to the next tag
        EmptyLnP = true;
      } else if (OrgLn[0] == '<' || TransLn[0] == '<') {
        if (TransLn[0] != OrgLn[0]) {
          printf("\nError in line (%s:%s)[%d]\n",
            OrgLn.CStr(), TransLn.CStr(), LnN);
          break; // we stop, lines not aligned anymore ...
        }
        // a tag ends the skip region; the tag itself is not stored
        EmptyLnP = false;
      } else if (!EmptyLnP) {
        // aligned sentence!
        TransCorpus->AddSentenceNoTrans(SentId,
          OrgLn.ToTrunc(), TransLn.ToTrunc());
        SentId++;
      }
      LnN++;
    }
  }
  printf("\nDone!\n");
  // finish
  return TransCorpus;
}
void TCpDoc::SaveTBsToCpd( const TStr& InTBsFNm, const TStr& OutCpdFNm, const int& /*MxDocs*/){ // open input text-base TStr TxtBsNm=InTBsFNm.GetFBase(); TStr TxtBsFPath=InTBsFNm.GetFPath(); PTxtBs TxtBs=TTxtBs::New(TxtBsNm, TxtBsFPath, faRdOnly); // create output file PSOut SOut=TFOut::New(OutCpdFNm); // traverse input documents TBlobPt TrvBlobPt=TxtBs->FFirstDocId(); TBlobPt DocId; int DocN=0; TStr DocNm; TStr DocStr; while (TxtBs->FNextDocId(TrvBlobPt, DocId)){ DocN++; if (DocN%100==0){printf("%d docs\r", DocN);} // get document data TxtBs->GetDocNmStr(DocId, DocNm, DocStr); // create cpd document PCpDoc CpDoc=TCpDoc::New(); CpDoc->DocNm=DocNm; CpDoc->ParStrV.Add(DocStr, 1); // save cpd document CpDoc->Save(*SOut); } }
// Builds a bag-of-words document base from a text-base.
//   TBsFNm:        text-base file name; split into base name and path and
//                  opened read-only.
//   MxDocs:        maximal number of documents to load; -1 means no limit.
//   SwSetTypeNm:   name passed to TSwSet::GetSwSet to obtain the stop-words.
//   StemmerTypeNm: name passed to TStemmer::GetStemmer to obtain the stemmer.
//   MxNGramLen, MnNGramFq: n-gram settings; an n-gram base is built only
//                  when they are not both 1, otherwise NGramBs stays
//                  default-constructed.
// Returns the document base after AssertOk() has been called on it.
// Progress is printed to stdout every 100 documents.
PBowDocBs TBowFl::LoadTBsTxt(
 const TStr& TBsFNm, const int& MxDocs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq){
  // prepare stop-words
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  // prepare stemmer
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // create ngrams
  // NOTE(review): GetNGramBsFromTBs receives the same MxDocs - confirm it
  // applies the limit the same way as the loop below.
  PNGramBs NGramBs;
  if (!((MxNGramLen==1)&&(MnNGramFq==1))){
    NGramBs=TNGramBs::GetNGramBsFromTBs(
     TBsFNm, MxDocs, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  // open input text-base
  TStr TxtBsNm=TBsFNm.GetFBase();
  TStr TxtBsFPath=TBsFNm.GetFPath();
  PTxtBs TxtBs=TTxtBs::New(TxtBsNm, TxtBsFPath, faRdOnly);
  // traverse documents; Docs counts the documents loaded so far
  TBlobPt TxtBsTrvBlobPt=TxtBs->FFirstDocId(); TBlobPt TxtBsDocId; int Docs=0;
  while (TxtBs->FNextDocId(TxtBsTrvBlobPt, TxtBsDocId)){
    // check the limit BEFORE counting the current document, so that exactly
    // MxDocs documents are loaded (counting first would stop one short)
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    Docs++; if (Docs%100==0){printf("%d\r", Docs);}
    // get document data
    TStr DocNm; TStr DocStr;
    TxtBs->GetDocNmStr(TxtBsDocId, DocNm, DocStr);
    // add document to bow
    BowDocBs->AddHtmlDoc(DocNm, TStrV(), DocStr, false);
  }
  // return results
  BowDocBs->AssertOk();
  return BowDocBs;
}