void TNmObjBs::LoadNrWordBs(const PSIn& SIn){ if (SIn.Empty()){return;} TILx Lx(SIn, TFSet(iloCmtAlw, iloRetEoln, iloExcept)); // traverse lines Lx.GetSym(syQStr, syEoln, syEof); while (Lx.Sym!=syEof){ if (Lx.Sym==syQStr){ // get normalized word TStr NrWordStr=Lx.Str; // get inflected words Lx.GetSym(syColon); Lx.GetSym(syQStr, syEoln); while (Lx.Sym!=syEoln){ // get inflected word TStr WordStr=Lx.Str; // test if inflected word already exists if (WordStrToNrH.IsKey(WordStr)){ printf("Word already normalized (%s)", WordStr.CStr());} // add inflected word and corresponding normalized word WordStrToNrH.AddDat(WordStr, NrWordStr); //printf("'%s' ->'%s'\n", WordStr.CStr(), NrWordStr.CStr()); Lx.GetSym(syQStr, syEoln); } Lx.GetSym(syQStr, syEoln, syEof); } else if (Lx.Sym==syEoln){ // empty line Lx.GetSym(syQStr, syEoln, syEof); } else { Fail; } } }
// Builds a translation corpus from up to three line-aligned text files.
//   InOrgFNm      - file with original sentences (empty name = not given)
//   InTransFNm    - file with machine translations (empty name = not given)
//   InRefTransFNm - file with reference translations (empty name = not given)
// The given files are read in lockstep, one line per sentence; sentence ids
// start at 1. Loading stops as soon as any given file runs out of lines.
// A warning is printed for every file that is not given. When none is given,
// an empty corpus is returned.
// When both the original and the machine-translation files are given, the
// reference file must be given too (checked with IAssert).
PTransCorpus TTransCorpus::LoadTxt(const TStr& InOrgFNm,
    const TStr& InTransFNm, const TStr& InRefTransFNm) {
  // open files
  PSIn OrgSIn = !InOrgFNm.Empty() ? TFIn::New(InOrgFNm) : PSIn();
  PSIn TransSIn = !InTransFNm.Empty() ? TFIn::New(InTransFNm) : PSIn();
  PSIn RefTransSIn = !InRefTransFNm.Empty() ? TFIn::New(InRefTransFNm) : PSIn();
  // check which are given
  const bool IsOrgP = !OrgSIn.Empty();
  const bool IsTransP = !TransSIn.Empty();
  const bool IsRefTransP = !RefTransSIn.Empty();
  // print warnings
  if (!IsOrgP) { printf("No original sentences!\n"); }
  if (!IsTransP) { printf("No machine translation sentences!\n"); }
  if (!IsRefTransP) { printf("No reference translation sentences!\n"); }
  PTransCorpus TransCorpus = TTransCorpus::New();
  // with no input at all, none of the readers below could ever end the
  // loop, so it would keep adding empty sentences forever
  if (!IsOrgP && !IsTransP && !IsRefTransP) { return TransCorpus; }
  // traverse the files and add sentences
  TLnRet OrgLnRet(OrgSIn), TransLnRet(TransSIn), RefTransLnRet(RefTransSIn);
  TStr OrgLnStr, TransLnStr, RefTransLnStr;
  int LnN = 1;
  forever {
    // try to read next line, otherwise break
    if (IsOrgP && !OrgLnRet.NextLn(OrgLnStr)) { break; }
    if (IsTransP && !TransLnRet.NextLn(TransLnStr)) { break; }
    if (IsRefTransP && !RefTransLnRet.NextLn(RefTransLnStr)) { break; }
    // print progress
    if (LnN % 100 == 0) { printf(" %7d Sentences\r", LnN); }
    // add sentence and translation(s) to the corpus
    if (!IsOrgP) {
      TransCorpus->AddSentenceNoOrg(LnN, TransLnStr, RefTransLnStr);
    } else if (!IsTransP) {
      TransCorpus->AddSentenceNoTrans(LnN, OrgLnStr, RefTransLnStr);
    } else {
      IAssert(IsRefTransP);
      TransCorpus->AddSentence(LnN, OrgLnStr, TransLnStr, RefTransLnStr);
    }
    // next sentence
    LnN++;
  }
  // terminate the '\r'-overwritten progress line
  printf("\n");
  // finish
  return TransCorpus;
}
void TNmObjBs::LoadNmObjTypeBs(const PSIn& SIn){ if (SIn.Empty()){return;} TILx Lx(SIn, TFSet(iloCmtAlw, iloRetEoln, iloExcept)); // traverse lines Lx.GetSym(syQStr, syIdStr, syEoln, syEof); while (Lx.Sym!=syEof){ if ((Lx.Sym==syQStr)||(Lx.Sym==syIdStr)){ TVec<TStrV> NmObjWordStrVV; TB32Set NmObjAttrSet; while ((Lx.Sym==syQStr)||(Lx.Sym==syIdStr)){ if (Lx.Sym==syQStr){ // named-object word-string TStr WordStrVStr=Lx.Str; TStrV WordStrV; WordStrVStr.SplitOnWs(WordStrV); NmObjWordStrVV.Add(WordStrV); } else if (Lx.Sym==syIdStr){ // named-object attribute TNmObjAttr NmObjAttr=TNmObjBs::GetNmObjTypeFromStr(Lx.Str); NmObjAttrSet.Incl(NmObjAttr); } else { Fail; } Lx.GetSym(syQStr, syIdStr, syEoln, syEof); } // assign 'defined' attribute if 'not ignore' if (!NmObjAttrSet.In(noaIgnore)){ NmObjAttrSet.Incl(noaDefined);} // assign attribute-sets to word-vectors for (int NmObjN=0; NmObjN<NmObjWordStrVV.Len(); NmObjN++){ WordStrVToNmObjAttrSetH.AddDat(NmObjWordStrVV[NmObjN])|=NmObjAttrSet; } // assign aliases {for (int NmObjN=1; NmObjN<NmObjWordStrVV.Len(); NmObjN++){ NmObjWordStrVToNrH.AddDat(NmObjWordStrVV[NmObjN], NmObjWordStrVV[0]); }} // get eoln } else if (Lx.Sym==syEoln){ // empty line Lx.GetSym(syQStr, syEoln, syEof); } else { Fail; } } }
void TNmObjBs::LoadCustSwSet(const PSIn& SIn){ if (SIn.Empty()){return;} TILx Lx(SIn, TFSet(iloCmtAlw, iloRetEoln, iloExcept)); // traverse lines Lx.GetSym(syLn, syEof); while (Lx.Sym!=syEof){ // get stop-phrase string TStr WordStrVStr=Lx.Str; // split phrase to words TStrV WordStrV; WordStrVStr.SplitOnWs(WordStrV); if (!WordStrV.Empty()){ // define phrase as stop-word WordStrVToNmObjAttrSetH.AddDat(WordStrV).Incl(noaIgnore); } // get next symbol Lx.GetSym(syLn, syEof); } }