Beispiel #1
0
/////////////////////////////////////////////////
// Find-File
// Constructs a file finder from a single wild-card file name.
//   FNmWc     - file name; its base part (GetFBase) becomes the wild-card and its
//               path part (GetFPath) becomes the only entry added to FPathV
//   _RecurseP - copied into RecurseP
// CsImpP is initialised to false, so the wild-card is always upper-cased here.
// Both position counters (FPathN, CurFNmN) start at -1.
TFFile::TFFile(const TStr& FNmWc, const bool& _RecurseP):
  FPathV(), FExtV(), FBaseWc(),
  CsImpP(false), RecurseP(_RecurseP), FPathN(-1),
  FFileDesc(TFFileDesc::New()), SubFFile(), CurFNm(), CurFNmN(-1){
  // wild-card: base name of the pattern, upper-cased when CsImpP is not set
  FBaseWc=FNmWc.GetFBase();
  if (!CsImpP){
    FBaseWc.ToUc();
  }
  // search path: path part of the pattern passed through TStr::GetNrFPath
  const TStr NrFPath=TStr::GetNrFPath(FNmWc.GetFPath());
  FPathV.Add(NrFPath);
}
// Loads a line-aligned parallel corpus from two directories.
//   InOrgFPath   - directory enumerated through TFFile(InOrgFPath, "txt", false);
//                  every file it yields is treated as an original-language file
//   InTransFPath - directory holding the translated files; the file matching an
//                  original file is looked up under the same base name
// Returns a new corpus to which every aligned sentence pair was added through
// AddSentenceNoTrans, with sentence ids running from 0 across all files.
//
// Per file, the two inputs are read line by line in lock-step:
//   - if either line is empty, the pair is skipped and all following sentence
//     pairs are skipped too, until the next tag line;
//   - if either line starts with '<', it is treated as an XML tag: both lines must
//     start with the same character, otherwise processing of this file stops;
//   - any other pair is added as an aligned sentence (both sides passed
//     through ToTrunc()).
// A translated file that ends before the original also stops processing of that
// file. In both stop cases loading continues with the next file.
PTransCorpus TTransCorpus::LoadEP(const TStr& InOrgFPath, const TStr& InTransFPath) {
    // prepare result structures
    PTransCorpus TransCorpus = TTransCorpus::New();
    // iterate over all the files
    TStr NrmInTransFPath = TStr::GetNrAbsFPath(InTransFPath);
    TFFile OrgFNms(InOrgFPath, "txt", false); TStr OrgFNm;
    int SentId = 0;
    while (OrgFNms.Next(OrgFNm)) {
        // get name of the file with aligned sentences
        TStr TransFNm = NrmInTransFPath + OrgFNm.GetFBase();
        IAssertR(TFile::Exists(TransFNm), TransFNm);
        // load file
        printf("Loading %s and %s ...\r", OrgFNm.CStr(), TransFNm.CStr());
        TLnRet OrgLnRet(TFIn::New(OrgFNm));
        TLnRet TransLnRet(TFIn::New(TransFNm));
        TStr OrgLn, TransLn; int LnN = 1; bool EmptyLnP = false;
        while (OrgLnRet.NextLn(OrgLn)) {
            if (!TransLnRet.NextLn(TransLn)) {
                printf("\nEarly stop in line (%s:%s)[%d]\n", OrgLn.CStr(), TransLn.CStr(), LnN);
                break; // translated file finished first, let's stop
            }
            // The emptiness check must look at the translated LINE, not the
            // translated file name: otherwise an empty translated line reaches
            // TransLn[0] below and indexes an empty string.
            if (OrgLn.Empty() || TransLn.Empty()) {
                // skip the empty line and everything up to the next XML tag
                EmptyLnP = true;
            } else if (OrgLn[0] == '<' || TransLn[0] == '<') {
                if (TransLn[0] != OrgLn[0]) {
                    printf("\nError in line (%s:%s)[%d]\n", OrgLn.CStr(), TransLn.CStr(), LnN);
                    break; // we stop, lines not aligned anymore ...
                }
                // a tag line ends the skipping started by an empty line
                EmptyLnP = false;
                // the XML tag itself is not stored
            } else if (!EmptyLnP) {
                // aligned sentence!
                TransCorpus->AddSentenceNoTrans(SentId, 
                    OrgLn.ToTrunc(), TransLn.ToTrunc()); 
                SentId++;
            }
            LnN++;
        }         
    }
    printf("\nDone!\n");
    // finish
    return TransCorpus;
}
Beispiel #3
0
void TCpDoc::SaveTBsToCpd(
 const TStr& InTBsFNm, const TStr& OutCpdFNm, const int& /*MxDocs*/){
  // open input text-base
  TStr TxtBsNm=InTBsFNm.GetFBase();
  TStr TxtBsFPath=InTBsFNm.GetFPath();
  PTxtBs TxtBs=TTxtBs::New(TxtBsNm, TxtBsFPath, faRdOnly);
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // traverse input documents
  TBlobPt TrvBlobPt=TxtBs->FFirstDocId(); TBlobPt DocId;
  int DocN=0; TStr DocNm; TStr DocStr;
  while (TxtBs->FNextDocId(TrvBlobPt, DocId)){
    DocN++; if (DocN%100==0){printf("%d docs\r", DocN);}
    // get document data
    TxtBs->GetDocNmStr(DocId, DocNm, DocStr);
    // create cpd document
    PCpDoc CpDoc=TCpDoc::New();
    CpDoc->DocNm=DocNm;
    CpDoc->ParStrV.Add(DocStr, 1);
    // save cpd document
    CpDoc->Save(*SOut);
  }
}
Beispiel #4
0
// Builds a bag-of-words document base from the documents of a text-base.
//   TBsFNm        - text-base file name (base name and path are split off and
//                   passed to TTxtBs::New, opened with faRdOnly)
//   MxDocs        - document limit; -1 disables the limit
//   SwSetTypeNm   - stop-word set type name, resolved via TSwSet::GetSwSet
//   StemmerTypeNm - stemmer type name, resolved via TStemmer::GetStemmer
//   MxNGramLen, MnNGramFq - n-gram settings; an n-gram base is built from the
//                   text-base unless both are equal to 1
// Returns the document base after AssertOk() has been called on it.
PBowDocBs TBowFl::LoadTBsTxt(
 const TStr& TBsFNm, const int& MxDocs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq){
  // resolve stop-word set and stemmer from their type names
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // n-gram base stays default-constructed when both n-gram settings are 1
  PNGramBs NGramBs;
  const bool BuildNGramsP=(MxNGramLen!=1)||(MnNGramFq!=1);
  if (BuildNGramsP){
    NGramBs=TNGramBs::GetNGramBsFromTBs(
     TBsFNm, MxDocs,
     MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // target document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  // source text-base
  TStr SrcTBsNm=TBsFNm.GetFBase();
  TStr SrcTBsFPath=TBsFNm.GetFPath();
  PTxtBs TxtBs=TTxtBs::New(SrcTBsNm, SrcTBsFPath, faRdOnly);
  // walk over the text-base documents
  TBlobPt TrvPt=TxtBs->FFirstDocId();
  TBlobPt CurDocId;
  int Docs=0;
  while (TxtBs->FNextDocId(TrvPt, CurDocId)){
    // progress report after every 100th document
    ++Docs;
    if ((Docs%100)==0){
      printf("%d\r", Docs);
    }
    // NOTE(review): the counter is incremented before this check, so the loop
    // stops before adding the MxDocs-th document - confirm this matches the
    // limit handling inside GetNGramBsFromTBs before changing it.
    const bool LimitHitP=(MxDocs!=-1)&&(Docs>=MxDocs);
    if (LimitHitP){
      break;
    }
    // fetch name and content of the current document
    TStr DocNm;
    TStr DocStr;
    TxtBs->GetDocNmStr(CurDocId, DocNm, DocStr);
    // add it to the bag-of-words base as an html document
    BowDocBs->AddHtmlDoc(DocNm, TStrV(), DocStr, false);
  }
  // consistency check, then hand back the result
  BowDocBs->AssertOk();
  return BowDocBs;
}