void LEX2DCT::LoeTekstifailist( const TAGS2DCT& tags ) { lexArr.Start(500,500); CPFSFile in; if(in.Open(FSTSTR("lex.txt"), FSTSTR("rb"))==false) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili lex.txt avamisega"); CFSAString rida; for(int reaNr=1; in.ReadLine(&rida)==true; reaNr++) { LEXINF* lexInf=lexArr.AddPlaceHolder(); // sõna [ N] tag1=prob1 ... tagN=probN rida.Trim();// white space eest-tagant maha rida+=' '; // tühik lõppu int pos1=(int)rida.Find(' '), pos2, pos3; if(pos1<=0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Jamane rida andmefailis lex.txt", (const char*)rida); if(rida[pos1+5]!=']' || rida[pos1+6]!=' ' || rida[pos1+7]=='\0') throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Jamane rida andmefailis lex.txt", (const char*)rida); CFSAString tagStr, probStr; //CFSWString wTagStr; lexInf->str=rida.Mid(0,pos1); sscanf(((const char*)rida)+pos1+2, "%d", &(lexInf->n)); lexInf->tagIdxProb=new LEXINF::LEXINFEL[lexInf->n]; pos1+=7; for(int i=0; i<lexInf->n; i++) { if((pos2=(int)rida.Find('=', pos1))<=0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Jamane rida andmefailis lex.txt", (const char*)rida); if((pos3=(int)rida.Find(' ',pos2))<=0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Jamane rida andmefailis lex.txt", (const char*)rida); tagStr=rida.Mid(pos1, pos2-pos1); if((lexInf->tagIdxProb[i].tagIdx=tags.GetIdx(&tagStr))<0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Tundmatu ühestamismärgendandmefailis lex.txt", (const char*)rida); sscanf((const char*)rida+pos2+1, "%e", &(lexInf->tagIdxProb[i].tagProb)); pos1=pos3+1; } for(int i=1; i<lexInf->n; i++) { assert(lexInf->tagIdxProb[i-1].tagIdx<lexInf->tagIdxProb[i].tagIdx); } printf("%d\r", lexArr.idxLast); } printf("%d\n", lexArr.idxLast); printf("Leksikoni järjestamine..."); lexArr.Sort(); printf("OK\n"); }
void TAGS2DCT::LoeTekstifailist(void) { CFSFileName fileName(FSTSTR("taglist.txt")); CPFSFile in; if(in.Open(fileName, FSTSTR("rb"))==false) throw VEAD(ERR_X_TYKK, ERR_OPN, __FILE__, __LINE__," ", "Ei suuda avada faili taglist.txt"); TMPLPTRARRAYBIN<PCFSAString,CFSAString>::Start(100,10); CFSAString rida; PCFSAString tagStr; // Loeme märgendite loendi mällu while(in.ReadLine(&rida)==true) { tagStr=rida.Mid(4); tagStr.Trim(); if(TMPLPTRARRAYBIN<PCFSAString,CFSAString>::AddClone(tagStr)==NULL) throw VEAD(ERR_HMM_MOOTOR, ERR_NOMEM, __FILE__, __LINE__," "); } in.Close(); printf("Märgendite järjestamine..."); // Garanteerime järjestatuse TMPLPTRARRAYBIN<PCFSAString,CFSAString>::Sort(); // Kontrollime veel üle, et ikka tõesti järjestatud for(int i=1; i<idxLast; i++) { if(*(operator[](i-1)) >= *(operator[](i))) throw VEAD(ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__," ", "Jama märgendite järjekorraga andmefailis taglist.txt"); } printf("OK\n"); gramm1.Start(idxLast); CFSFileName fileName2(FSTSTR("margcnt.txt")); if(in.Open(fileName2, FSTSTR("rb"))==false) throw VEAD(ERR_X_TYKK, ERR_OPN, __FILE__, __LINE__," ", "Ei suuda avada faili margcnt.txt"); for(int i=0; i<idxLast; i++) { if(in.ReadLine(&rida)==false) throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," "); int tyhikuPos=rida.Find(' '); if(tyhikuPos<=0) throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," "); CFSAString tag(rida.Left(tyhikuPos)); if(tag!=*(operator[](i))) throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," "); int nKorda; if(sscanf(((const char*)rida)+tyhikuPos, "%d", &nKorda)!=1) throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," "); gramm1.Obj(i)=nKorda; } }
/// Converts an 8-bit string into a wide string.
///
/// For every code page other than PFSCP_HTMLEXT the conversion is delegated
/// to FSStrAtoW(). For PFSCP_HTMLEXT the input has to be 7-bit ASCII:
///  - ordinary characters are copied through;
///  - when autosgml is set, "&#NNN;" and "&#xHHH;" / "&#XHHH;" are decoded
///    numerically (the value must not exceed 0xFFFF);
///  - any other "&name;" is looked up in the sgml2uc table.
/// When ignoramp is set, an '&' that has no closing ';', is longer than
/// sgml_stringi_max_pikkus, or is missing from the table is copied through
/// as an ordinary character instead of raising an error.
///
/// @param wStr        Receives the result; it is emptied first.
/// @param aStr        Input string.
/// @param koodiTabel  Code page of the input.
/// @throws VEAD if the entity table has not been loaded, the input holds a
///              non-ASCII byte, a numeric reference is malformed or too
///              large, or (with ignoramp unset) an entity cannot be resolved.
void CONV_HTML_UC2::ConvToUc(
    CFSWString& wStr,
    const CFSAString& aStr,
    const PFSCODEPAGE koodiTabel
    )
{
    wStr.Empty();
    if(koodiTabel!=PFSCP_HTMLEXT)
    {
        // Plain code-page conversion: do it in one go and be done
        wStr = FSStrAtoW(aStr, koodiTabel);
        return;
    }
    assert(koodiTabel==PFSCP_HTMLEXT);

    // From here on the entity table read from file is used
    if(sgml2uc.idxLast<=0)
        throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                   "SGML olemite tabel mallu lugemata");

    int len=aStr.GetLength();
    for(int pos=0; pos < len; pos++)
    {
        if((aStr[pos] & (~0x7F))!=0) // must be 7-bit ASCII
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "String peab koosnema ASCII (7bitistest) koodidest", (const char*)aStr+pos);

        if(aStr[pos]!='&') // does not start an entity: copy through
        {
            wStr += ((FSWCHAR)(aStr[pos])) & 0x7F;
            continue;
        }

        // Might start an entity of the form &blah;
        int semiPos=(int)aStr.Find(";", pos+1);
        if(semiPos<0) // ampersand without a terminating semicolon
        {
            if(ignoramp==true)
            {
                wStr += ((FSWCHAR)(aStr[pos])) & 0x7F;
                continue;
            }
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "Ampersandi tagant semi puudu", (const char*)aStr+pos);
        }

        if(autosgml==true && aStr[pos+1]=='#') // numeric reference &#[x|X]12345;
        {
            int code=0, cursor=pos+2;
            if(aStr[cursor]=='x' || aStr[cursor]=='X') // hexadecimal digits
            {
                cursor++;
                cursor+=STRSOUP::UnsignedStr2Hex<int, char>(&code, ((const char*)aStr)+cursor);
            }
            else // decimal digits
            {
                cursor+=STRSOUP::UnsignedStr2Num<int, char>(&code, ((const char*)aStr)+cursor);
            }
            // The digits must run right up to the ';'.
            // NOTE(review): cursor<=0 looks like it was meant to test the
            // helper's return value rather than the running index - confirm.
            if(cursor<=0 || aStr[cursor]!=';')
                throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                           "Vigane SGML olem", (const char*)aStr+pos);
            if(code>0xFFFF)
                throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                           "Vigane SGML olem (peab mahtuma 2 baidi peale)", (const char*)aStr+pos);
            wStr += (WCHAR)code;
            pos=semiPos;
            continue;
        }

        if(semiPos-pos+1 > sgml_stringi_max_pikkus) // too long to be in the table
        {
            if(ignoramp==true)
            {
                wStr += ((FSWCHAR)(aStr[pos])) & 0x7F;
                continue;
            }
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "Puudub SGML olemite tabelist", (const char*)aStr+pos);
        }

        // Cut "&bla;" out of the input and look it up
        CFSAString szSymbol=aStr.Mid(pos, semiPos-pos+1);
        SGML_UC* rec=sgml2uc.Get(&szSymbol);
        if(rec==NULL) // not in the table
        {
            if(ignoramp==true)
            {
                wStr += ((FSWCHAR)(aStr[pos])) & 0x7F;
                continue;
            }
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "Puudub SGML olemite tabelist", (const char*)szSymbol);
        }
        wStr += rec->uc;
        pos=semiPos; // resume after the ';'
    }
}
void MKLASSID2DCT::Run( DCTMETASTRCT& meta, ///< Sõnastiku struktuurihoidla const TAGS2DCT& tags ///< Ühestamismärgendite massiiv ) { CPFSFile in; if(in.Open(FSTSTR("klassid.txt"), FSTSTR("rb"))==false) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt avamisega"); CFSAString rida; // tõsta kohe sõnastikku ümber... int n, reaNr, kokkuRidu; long pos=meta.Tell(); meta.Add(DCTELEMID_T3M_KLASSID, pos); if(in.ReadLine(&rida)==false) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); sscanf((const char*)rida, "%d", &kokkuRidu); if(meta.WriteUnsigned<UB4, int>(kokkuRidu)==false) // mitmsusklasside arv throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__, " ", "Jama pakitud sõnastikku kirjutamisega"); for(reaNr=1; in.ReadLine(&rida)==true; reaNr++) { printf("%06d:%06d\r", kokkuRidu, reaNr); int tyhik, vordus; rida += " "; if((tyhik=(int)rida.Find(' '))<0) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); if(sscanf((const char*)rida, "%d", &n)!=1) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); meta.WriteUnsigned<UB1, int>(n); // jooksva mitmesusklassi suurus for(int i=0; i<n; i++) { if((vordus=(int)rida.Find('=', tyhik+1))<=tyhik+1) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega", (const char*)rida); CFSAString tagStr=rida.Mid(tyhik+1, vordus-tyhik-1); int tagIdx=tags.GetIdx(&tagStr); if(tagIdx<0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Tundmatu ühestamismärgend andmefailis klassid.txt ", (const char*)rida); if(meta.WriteUnsigned<UB1,int>(tagIdx)==false) throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__, " ", "Jama pakitud sõnastikku kirjutamisega"); UKAPROB tagProb; if(sscanf(((const char*)rida)+vordus+1, "%e", &tagProb)!=1) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", 
"Puuduv tõenäosus failis klassid.txt ", (const char*)rida); if(meta.WriteBuffer(&tagProb,sizeof(UKAPROB))==false) throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__, " ", "Jama pakitud sõnastikku kirjutamisega"); if((tyhik=(int)rida.Find(' ', tyhik+1))<=0) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); if(vordus >= tyhik) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); } } printf("\n"); }