void LEX2DCT::LoeTekstifailist(
    const TAGS2DCT& tags
    )
    {
    lexArr.Start(500,500);
    CPFSFile in;
    if(in.Open(FSTSTR("lex.txt"), FSTSTR("rb"))==false)
        throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__,
                " ", "Jama andmefaili lex.txt avamisega");
    CFSAString rida;
    for(int reaNr=1; in.ReadLine(&rida)==true; reaNr++)
        {        
        LEXINF* lexInf=lexArr.AddPlaceHolder();
        // sõna [  N] tag1=prob1 ... tagN=probN
        rida.Trim();// white space eest-tagant maha
        rida+=' ';  // tühik lõppu
        int pos1=(int)rida.Find(' '), pos2, pos3;
        if(pos1<=0)
            throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__,
                " ", "Jamane rida andmefailis lex.txt", (const char*)rida);
        if(rida[pos1+5]!=']' || rida[pos1+6]!=' ' || rida[pos1+7]=='\0')
            throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__,
                " ", "Jamane rida andmefailis lex.txt", (const char*)rida);
        CFSAString tagStr, probStr;
        //CFSWString wTagStr;
        lexInf->str=rida.Mid(0,pos1);
        sscanf(((const char*)rida)+pos1+2, "%d", &(lexInf->n));
        lexInf->tagIdxProb=new LEXINF::LEXINFEL[lexInf->n];

        pos1+=7;
        for(int i=0; i<lexInf->n; i++)
            {
            if((pos2=(int)rida.Find('=', pos1))<=0)
                 throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__,
                    " ", "Jamane rida andmefailis lex.txt", (const char*)rida);
            if((pos3=(int)rida.Find(' ',pos2))<=0)
                 throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__,
                    " ", "Jamane rida andmefailis lex.txt", (const char*)rida);

            tagStr=rida.Mid(pos1, pos2-pos1);
            if((lexInf->tagIdxProb[i].tagIdx=tags.GetIdx(&tagStr))<0)
                throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__,
                    " ", "Tundmatu ühestamismärgendandmefailis lex.txt", (const char*)rida);

            sscanf((const char*)rida+pos2+1, "%e", &(lexInf->tagIdxProb[i].tagProb));
            pos1=pos3+1;
            }
        for(int i=1; i<lexInf->n; i++)
            {
            assert(lexInf->tagIdxProb[i-1].tagIdx<lexInf->tagIdxProb[i].tagIdx);
            }
        printf("%d\r", lexArr.idxLast);
        }
    printf("%d\n", lexArr.idxLast);
    printf("Leksikoni järjestamine...");
    lexArr.Sort();
    printf("OK\n");
    }
void TAGS2DCT::LoeTekstifailist(void)
    {
    CFSFileName fileName(FSTSTR("taglist.txt"));
    CPFSFile in;
    if(in.Open(fileName, FSTSTR("rb"))==false)
        throw VEAD(ERR_X_TYKK, ERR_OPN, __FILE__, __LINE__," ", "Ei suuda avada faili taglist.txt");
    TMPLPTRARRAYBIN<PCFSAString,CFSAString>::Start(100,10);
    CFSAString rida;
    PCFSAString tagStr;

    // Loeme märgendite loendi mällu
    while(in.ReadLine(&rida)==true)
        {
        tagStr=rida.Mid(4);
        tagStr.Trim();
        if(TMPLPTRARRAYBIN<PCFSAString,CFSAString>::AddClone(tagStr)==NULL)
            throw VEAD(ERR_HMM_MOOTOR, ERR_NOMEM, __FILE__, __LINE__," ");
        }
    in.Close();
    printf("Märgendite järjestamine...");
    // Garanteerime järjestatuse
    TMPLPTRARRAYBIN<PCFSAString,CFSAString>::Sort();
    // Kontrollime veel üle, et ikka tõesti järjestatud
    for(int i=1; i<idxLast; i++)
        {
        if(*(operator[](i-1)) >= *(operator[](i)))
            throw VEAD(ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__," ",
                "Jama märgendite järjekorraga andmefailis taglist.txt");
        }
    printf("OK\n");

    gramm1.Start(idxLast);
    CFSFileName fileName2(FSTSTR("margcnt.txt"));
    if(in.Open(fileName2, FSTSTR("rb"))==false)
        throw VEAD(ERR_X_TYKK, ERR_OPN, __FILE__, __LINE__," ", "Ei suuda avada faili margcnt.txt");

    for(int i=0; i<idxLast; i++)
        {
        if(in.ReadLine(&rida)==false)
             throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," ");
        int tyhikuPos=rida.Find(' ');
        if(tyhikuPos<=0)
            throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," ");
        CFSAString tag(rida.Left(tyhikuPos));
        if(tag!=*(operator[](i)))
            throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," ");
        int nKorda;
        if(sscanf(((const char*)rida)+tyhikuPos, "%d", &nKorda)!=1)
            throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," ");
        gramm1.Obj(i)=nKorda;
        }        
    }
// Example #3
// 0
/// Converts a narrow string to a wide string, optionally expanding SGML/HTML
/// entities.
///
/// For any code page other than PFSCP_HTMLEXT the conversion is delegated to
/// FSStrAtoW(). For PFSCP_HTMLEXT the input must be 7-bit ASCII; plain
/// characters are copied, numeric references (&#123; / &#x1F; -- only when
/// autosgml is set) are decoded, and named entities (&name;) are looked up in
/// the sgml2uc table.
///
/// @param wStr        Output; emptied first, then filled with the result.
/// @param aStr        Input string.
/// @param koodiTabel  Code page selecting the conversion mode.
/// @throws VEAD if the entity table is empty, the input contains a non-ASCII
///              byte, a numeric reference is malformed or exceeds 0xFFFF, or
///              (unless ignoramp is set) an '&' has no terminating ';' or
///              names an entity that is not in the table.
void CONV_HTML_UC2::ConvToUc(
    CFSWString& wStr,
    const CFSAString& aStr,
    const PFSCODEPAGE koodiTabel
)
{
    wStr.Empty();
    if(koodiTabel!=PFSCP_HTMLEXT) // ordinary code page: table-driven conversion and done
    {
        wStr = FSStrAtoW(aStr, koodiTabel);
        return;
    }
    assert(koodiTabel==PFSCP_HTMLEXT); // from here on use the entity table read from file
    if(sgml2uc.idxLast<=0)
        throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                   "SGML olemite tabel mallu lugemata");
    int l, n=aStr.GetLength();
    for(l=0; l < n; l++)
    {
        if((aStr[l] & (~0x7F))!=0) // must be 7-bit ASCII
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "String peab koosnema ASCII (7bitistest) koodidest", (const char*)aStr+l);
        if(aStr[l]!='&') // does not start an SGML entity...
        {
tryki:      // also the landing point for an '&' that is to be copied literally
            wStr += ((FSWCHAR)(aStr[l])) & 0x7F; // ...copied as is
            continue;
        }
        // May start an SGML entity - &blah;
        int lSemiPos=(int)aStr.Find(";", l+1);
        if(lSemiPos<0) // this ampersand has no terminating semicolon
        {
            if(ignoramp==true)
                goto tryki;
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "Ampersandi tagant semi puudu", (const char*)aStr+l);
        }
        if(autosgml==true && aStr[l+1]=='#') // decode &#[{x|X}]12345; references
        {
            // lSemiPos>=l+2 here (aStr[l+1] is '#'), so aStr[l+2] is in range.
            int tmp=0, j=l+2, nDigits;
            if(aStr[j]=='x' || aStr[j]=='X')    // hexadecimal: &#x12345; / &#X12345;
            {
                j++;
                nDigits=STRSOUP::UnsignedStr2Hex<int, char>(&tmp, ((const char*)aStr)+j);
            }
            else                                // decimal: &#12345;
            {
                nDigits=STRSOUP::UnsignedStr2Num<int, char>(&tmp, ((const char*)aStr)+j);
            }
            // NOTE(review): assumes the helpers return the number of characters
            // consumed (that is how the result was already being used). Testing
            // the count itself rejects empty references such as "&#;" / "&#x;",
            // which previously slipped through and appended a NUL character.
            if(nDigits<=0 || aStr[j+nDigits]!=';')
                throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                           "Vigane SGML olem", (const char*)aStr+l);
            if(tmp>0xFFFF) // must fit into 2 bytes
                throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                           "Vigane SGML olem (peab mahtuma 2 baidi peale)", (const char*)aStr+l);
            wStr += (WCHAR)tmp;
            l=lSemiPos; // resume after the ';' (the loop increment steps past it)
            continue;
        }
        if(lSemiPos-l+1 > sgml_stringi_max_pikkus) // longer than any entity in the table
        {
            if(ignoramp==true)
                goto tryki;
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "Puudub SGML olemite tabelist", (const char*)aStr+l);
        }
        CFSAString szSymbol=aStr.Mid(l, lSemiPos-l+1); // cut "&bla;" out of the input
        SGML_UC* rec;
        if((rec=sgml2uc.Get(&szSymbol))==NULL) // not found in the lookup table
        {
            if(ignoramp==true)
                goto tryki;
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "Puudub SGML olemite tabelist", (const char*)szSymbol);
        }
        wStr += rec->uc;
        l=lSemiPos; // resume after the ';'
    }
}
/// Copies the ambiguity-class table from klassid.txt into the packed
/// dictionary.
///
/// The first line of klassid.txt holds the number of rows; it is written as a
/// UB4. Every following line has the form
///     N tag1=prob1 ... tagN=probN
/// and is written as: N (UB1), then for each item the tag index (UB1) followed
/// by the raw bytes of the probability (UKAPROB).
/// The current write position is registered in @p meta under
/// DCTELEMID_T3M_KLASSID before anything is written.
///
/// @throws VEAD if klassid.txt cannot be opened or read, a line is malformed,
///              a tag is unknown, or writing to the dictionary fails.
void MKLASSID2DCT::Run(
    DCTMETASTRCT& meta,     ///< Dictionary structure store
    const TAGS2DCT& tags    ///< Array of disambiguation tags
    )
    {
    CPFSFile in;
    if(in.Open(FSTSTR("klassid.txt"), FSTSTR("rb"))==false)
        throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__,
                " ", "Jama andmefaili klassid.txt avamisega");
    CFSAString rida;

    // copy straight into the dictionary...
    int n, reaNr, kokkuRidu;
    long pos=meta.Tell();
    meta.Add(DCTELEMID_T3M_KLASSID, pos);
    if(in.ReadLine(&rida)==false)
        throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__,
                " ", "Jama andmefaili klassid.txt lugemisega");
    // The row count must parse; otherwise an uninitialised value would be written.
    if(sscanf((const char*)rida, "%d", &kokkuRidu)!=1)
        throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__,
                " ", "Jama andmefaili klassid.txt lugemisega");
    if(meta.WriteUnsigned<UB4, int>(kokkuRidu)==false) // number of ambiguity classes
        throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__,
                " ", "Jama pakitud sõnastikku kirjutamisega");
    for(reaNr=1; in.ReadLine(&rida)==true; reaNr++)
        {
        printf("%06d:%06d\r", kokkuRidu, reaNr); // progress indicator
        int tyhik, vordus;
        rida += " "; // sentinel space so the last item is terminated too
        if((tyhik=(int)rida.Find(' '))<0)
            throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__,
                    " ", "Jama andmefaili  klassid.txt  lugemisega");
        if(sscanf((const char*)rida, "%d", &n)!=1)
            throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__,
                    " ", "Jama andmefaili  klassid.txt  lugemisega");
        // size of the current ambiguity class; checked like every other write
        if(meta.WriteUnsigned<UB1, int>(n)==false)
            throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__,
                    " ", "Jama pakitud sõnastikku kirjutamisega");
        for(int i=0; i<n; i++)
            {
            // tyhik is the space preceding the current "tag=prob" item;
            // the '=' must leave room for a non-empty tag.
            if((vordus=(int)rida.Find('=', tyhik+1))<=tyhik+1)
                throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__,
                        " ", "Jama andmefaili  klassid.txt  lugemisega", (const char*)rida);

            CFSAString tagStr=rida.Mid(tyhik+1, vordus-tyhik-1);
            int tagIdx=tags.GetIdx(&tagStr);
            if(tagIdx<0)
                throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__,
                    " ", "Tundmatu ühestamismärgend andmefailis  klassid.txt  ", 
                                                                            (const char*)rida);
            if(meta.WriteUnsigned<UB1,int>(tagIdx)==false)
                throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__,
                        " ", "Jama pakitud sõnastikku kirjutamisega");

            UKAPROB tagProb;
            if(sscanf(((const char*)rida)+vordus+1, "%e", &tagProb)!=1)
                throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__,
                    " ", "Puuduv tõenäosus failis  klassid.txt  ", 
                                                                            (const char*)rida);

            if(meta.WriteBuffer(&tagProb,sizeof(UKAPROB))==false)
                throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__,
                        " ", "Jama pakitud sõnastikku kirjutamisega");

            // advance to the space that terminates this item
            if((tyhik=(int)rida.Find(' ', tyhik+1))<=0)
                throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__,
                        " ", "Jama andmefaili  klassid.txt  lugemisega");

            // the '=' found above must belong to this item, not a later one
            if(vordus >= tyhik)
                throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__,
                        " ", "Jama andmefaili  klassid.txt  lugemisega");
            }
        }
    printf("\n");
    }