Пример #1
0
void TNmObjBs::LoadNrWordBs(const PSIn& SIn){
  if (SIn.Empty()){return;}
  TILx Lx(SIn, TFSet(iloCmtAlw, iloRetEoln, iloExcept));
  // traverse lines
  Lx.GetSym(syQStr, syEoln, syEof);
  while (Lx.Sym!=syEof){
    if (Lx.Sym==syQStr){
      // get normalized word
      TStr NrWordStr=Lx.Str;
      // get inflected words
      Lx.GetSym(syColon);
      Lx.GetSym(syQStr, syEoln);
      while (Lx.Sym!=syEoln){
        // get inflected word
        TStr WordStr=Lx.Str;
        // test if inflected word already exists
        if (WordStrToNrH.IsKey(WordStr)){
          printf("Word already normalized (%s)", WordStr.CStr());}
         // add inflected word and corresponding normalized word
        WordStrToNrH.AddDat(WordStr, NrWordStr);
        //printf("'%s' ->'%s'\n", WordStr.CStr(), NrWordStr.CStr());
        Lx.GetSym(syQStr, syEoln);
      }
      Lx.GetSym(syQStr, syEoln, syEof);
    } else
    if (Lx.Sym==syEoln){
      // empty line
      Lx.GetSym(syQStr, syEoln, syEof);
    } else {
      Fail;
    }
  }
}
// Builds a translation corpus from up to three parallel text files that are
// read in lock-step, one line per sentence.
//   InOrgFNm      - file with the original sentences (empty name = not given)
//   InTransFNm    - file with the machine translations (empty name = not given)
//   InRefTransFNm - file with the reference translations (empty name = not given)
// Sentences are numbered from 1 in reading order. Reading stops as soon as any
// of the given files has no further line. A warning is printed to stdout for
// every file that is not given.
// Returns the corpus; when no file is given at all, the freshly created corpus
// is returned without any sentence added.
PTransCorpus TTransCorpus::LoadTxt(const TStr& InOrgFNm, 
        const TStr& InTransFNm, const TStr& InRefTransFNm) {

    // open only the files whose names were supplied
    PSIn OrgSIn = !InOrgFNm.Empty() ? TFIn::New(InOrgFNm) : PSIn();
    PSIn TransSIn = !InTransFNm.Empty() ? TFIn::New(InTransFNm) : PSIn();
    PSIn RefTransSIn = !InRefTransFNm.Empty() ? TFIn::New(InRefTransFNm) : PSIn();
    // check which are given
    const bool IsOrgP = !OrgSIn.Empty();
    const bool IsTransP = !TransSIn.Empty();
    const bool IsRefTransP = !RefTransSIn.Empty();
    // print warnings
    if (!IsOrgP) { printf("No original sentences!\n"); }
    if (!IsTransP) { printf("No machine translation sentences!\n"); }
    if (!IsRefTransP) { printf("No reference translation sentences!\n"); }
    PTransCorpus TransCorpus = TTransCorpus::New();
    // With no input at all, none of the end-of-input checks in the loop below
    // could ever fire and the loop would add empty sentences forever.
    if (!IsOrgP && !IsTransP && !IsRefTransP) { return TransCorpus; }
    // traverse the files and add sentences
    TLnRet OrgLnRet(OrgSIn), TransLnRet(TransSIn), RefTransLnRet(RefTransSIn);
    TStr OrgLnStr, TransLnStr, RefTransLnStr; int LnN = 1;
    forever {
        // try to read the next line from every given file, otherwise stop
        if (IsOrgP && !OrgLnRet.NextLn(OrgLnStr)) { break; }
        if (IsTransP && !TransLnRet.NextLn(TransLnStr)) { break; }
        if (IsRefTransP && !RefTransLnRet.NextLn(RefTransLnStr)) { break; }
        // print progress
        if (LnN % 100 == 0) { printf("  %7d Sentences\r", LnN); }
        // add sentence and translation(s) to the corpus
        if (!IsOrgP) { 
            TransCorpus->AddSentenceNoOrg(LnN, TransLnStr, RefTransLnStr);
        } else if (!IsTransP) { 
            TransCorpus->AddSentenceNoTrans(LnN, OrgLnStr, RefTransLnStr);
        } else {
            // original and machine translation given: reference is required
            IAssert(IsRefTransP);
            TransCorpus->AddSentence(LnN, OrgLnStr, TransLnStr, RefTransLnStr);
        }
        // next sentence
        LnN++;
    } printf("\n");
    // finish
    return TransCorpus;
}
Пример #3
0
// Loads named-object definitions from the text in SIn.
// Every non-empty line is a sequence of quoted strings and identifiers:
//   - a quoted string is a named-object phrase; it is split on whitespace
//     into a word vector,
//   - an identifier names an attribute, translated by GetNmObjTypeFromStr.
// All attributes found on a line are OR-ed into the attribute set of every
// phrase on that line (noaDefined is added unless noaIgnore is present), and
// every phrase after the first is registered as an alias of the first one.
// Empty lines are skipped; a missing input stream leaves the tables untouched.
void TNmObjBs::LoadNmObjTypeBs(const PSIn& SIn){
  if (SIn.Empty()){return;}
  TILx Lx(SIn, TFSet(iloCmtAlw, iloRetEoln, iloExcept));
  // traverse lines
  Lx.GetSym(syQStr, syIdStr, syEoln, syEof);
  while (Lx.Sym!=syEof){
    if ((Lx.Sym==syQStr)||(Lx.Sym==syIdStr)){
      TVec<TStrV> NmObjWordStrVV;
      TB32Set NmObjAttrSet;
      // collect phrases and attributes up to the end of the line (or file)
      while ((Lx.Sym==syQStr)||(Lx.Sym==syIdStr)){
        if (Lx.Sym==syQStr){
          // named-object word-string
          TStr WordStrVStr=Lx.Str;
          TStrV WordStrV; WordStrVStr.SplitOnWs(WordStrV);
          NmObjWordStrVV.Add(WordStrV);
        } else
        if (Lx.Sym==syIdStr){
          // named-object attribute
          TNmObjAttr NmObjAttr=TNmObjBs::GetNmObjTypeFromStr(Lx.Str);
          NmObjAttrSet.Incl(NmObjAttr);
        } else {
          Fail;
        }
        Lx.GetSym(syQStr, syIdStr, syEoln, syEof);
      }
      // assign 'defined' attribute if 'not ignore'
      if (!NmObjAttrSet.In(noaIgnore)){
        NmObjAttrSet.Incl(noaDefined);}
      // assign attribute-sets to word-vectors
      for (int NmObjN=0; NmObjN<NmObjWordStrVV.Len(); NmObjN++){
        WordStrVToNmObjAttrSetH.AddDat(NmObjWordStrVV[NmObjN])|=NmObjAttrSet;
      }
      // assign aliases: every further phrase maps to the first phrase
      {for (int NmObjN=1; NmObjN<NmObjWordStrVV.Len(); NmObjN++){
        NmObjWordStrVToNrH.AddDat(NmObjWordStrVV[NmObjN], NmObjWordStrVV[0]);
      }}
      // the end-of-line symbol (if any) is consumed by the branch below on
      // the next pass of the outer loop
    } else
    if (Lx.Sym==syEoln){
      // end of a line or empty line: fetch the first symbol of the next line;
      // identifiers are accepted here as well, matching the line-start branch
      // above and the expected set used for the first line
      Lx.GetSym(syQStr, syIdStr, syEoln, syEof);
    } else {
      Fail;
    }
  }
}
Пример #4
0
void TNmObjBs::LoadCustSwSet(const PSIn& SIn){
  if (SIn.Empty()){return;}
  TILx Lx(SIn, TFSet(iloCmtAlw, iloRetEoln, iloExcept));
  // traverse lines
  Lx.GetSym(syLn, syEof);
  while (Lx.Sym!=syEof){
    // get stop-phrase string
    TStr WordStrVStr=Lx.Str;
    // split phrase to words
    TStrV WordStrV; WordStrVStr.SplitOnWs(WordStrV);
    if (!WordStrV.Empty()){
      // define phrase as stop-word
      WordStrVToNmObjAttrSetH.AddDat(WordStrV).Incl(noaIgnore);
    }
    // get next symbol
    Lx.GetSym(syLn, syEof);
  }
}