/// Writes an integer as the next value of the JSON output.
///
/// Separator handling is delegated to Comma(); the number is formatted with
/// "%zd" and emitted through Text(). Afterwards the comma state of the current
/// nesting level is set to COMMA_VAL.
/// @param iInt Integer to write.
void CJSONWriter::IntVal(INTPTR iInt)
{
    Comma();

    CFSAString szNumber;
    szNumber.Format("%zd", iInt);
    Text(szNumber);

    m_Comma[GetLevel()] = COMMA_VAL;
}
void CJSONWriter::FloatVal(double dFloat) { Comma(); CFSAString szStr; szStr.Format("%f", dFloat); Text(szStr); m_Comma[GetLevel()]=COMMA_VAL; }
void TAGS2DCT::LoeTekstifailist(void) { CFSFileName fileName(FSTSTR("taglist.txt")); CPFSFile in; if(in.Open(fileName, FSTSTR("rb"))==false) throw VEAD(ERR_X_TYKK, ERR_OPN, __FILE__, __LINE__," ", "Ei suuda avada faili taglist.txt"); TMPLPTRARRAYBIN<PCFSAString,CFSAString>::Start(100,10); CFSAString rida; PCFSAString tagStr; // Loeme märgendite loendi mällu while(in.ReadLine(&rida)==true) { tagStr=rida.Mid(4); tagStr.Trim(); if(TMPLPTRARRAYBIN<PCFSAString,CFSAString>::AddClone(tagStr)==NULL) throw VEAD(ERR_HMM_MOOTOR, ERR_NOMEM, __FILE__, __LINE__," "); } in.Close(); printf("Märgendite järjestamine..."); // Garanteerime järjestatuse TMPLPTRARRAYBIN<PCFSAString,CFSAString>::Sort(); // Kontrollime veel üle, et ikka tõesti järjestatud for(int i=1; i<idxLast; i++) { if(*(operator[](i-1)) >= *(operator[](i))) throw VEAD(ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__," ", "Jama märgendite järjekorraga andmefailis taglist.txt"); } printf("OK\n"); gramm1.Start(idxLast); CFSFileName fileName2(FSTSTR("margcnt.txt")); if(in.Open(fileName2, FSTSTR("rb"))==false) throw VEAD(ERR_X_TYKK, ERR_OPN, __FILE__, __LINE__," ", "Ei suuda avada faili margcnt.txt"); for(int i=0; i<idxLast; i++) { if(in.ReadLine(&rida)==false) throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," "); int tyhikuPos=rida.Find(' '); if(tyhikuPos<=0) throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," "); CFSAString tag(rida.Left(tyhikuPos)); if(tag!=*(operator[](i))) throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," "); int nKorda; if(sscanf(((const char*)rida)+tyhikuPos, "%d", &nKorda)!=1) throw VEAD(ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__," "); gramm1.Obj(i)=nKorda; } }
void LEX2DCT::LoeTekstifailist( const TAGS2DCT& tags ) { lexArr.Start(500,500); CPFSFile in; if(in.Open(FSTSTR("lex.txt"), FSTSTR("rb"))==false) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili lex.txt avamisega"); CFSAString rida; for(int reaNr=1; in.ReadLine(&rida)==true; reaNr++) { LEXINF* lexInf=lexArr.AddPlaceHolder(); // sõna [ N] tag1=prob1 ... tagN=probN rida.Trim();// white space eest-tagant maha rida+=' '; // tühik lõppu int pos1=(int)rida.Find(' '), pos2, pos3; if(pos1<=0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Jamane rida andmefailis lex.txt", (const char*)rida); if(rida[pos1+5]!=']' || rida[pos1+6]!=' ' || rida[pos1+7]=='\0') throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Jamane rida andmefailis lex.txt", (const char*)rida); CFSAString tagStr, probStr; //CFSWString wTagStr; lexInf->str=rida.Mid(0,pos1); sscanf(((const char*)rida)+pos1+2, "%d", &(lexInf->n)); lexInf->tagIdxProb=new LEXINF::LEXINFEL[lexInf->n]; pos1+=7; for(int i=0; i<lexInf->n; i++) { if((pos2=(int)rida.Find('=', pos1))<=0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Jamane rida andmefailis lex.txt", (const char*)rida); if((pos3=(int)rida.Find(' ',pos2))<=0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Jamane rida andmefailis lex.txt", (const char*)rida); tagStr=rida.Mid(pos1, pos2-pos1); if((lexInf->tagIdxProb[i].tagIdx=tags.GetIdx(&tagStr))<0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Tundmatu ühestamismärgendandmefailis lex.txt", (const char*)rida); sscanf((const char*)rida+pos2+1, "%e", &(lexInf->tagIdxProb[i].tagProb)); pos1=pos3+1; } for(int i=1; i<lexInf->n; i++) { assert(lexInf->tagIdxProb[i-1].tagIdx<lexInf->tagIdxProb[i].tagIdx); } printf("%d\r", lexArr.idxLast); } printf("%d\n", lexArr.idxLast); printf("Leksikoni järjestamine..."); lexArr.Sort(); printf("OK\n"); }
void CJSONWriter::Text(const CFSAString &szStr) { for (INTPTR ip=0; ip<szStr.GetLength(); ip++) { if (szStr[ip]=='\\') m_Stream.WriteText("\\\\"); else if (szStr[ip]=='\"') m_Stream.WriteText("\\\""); else if (szStr[ip]=='\n') m_Stream.WriteText("\\n"); else if (szStr[ip]=='\r') m_Stream.WriteText("\\r"); else if (szStr[ip]=='\t') m_Stream.WriteText("\\t"); else if (szStr[ip]=='\b') m_Stream.WriteText("\\b"); else if (szStr[ip]=='\f') m_Stream.WriteText("\\f"); else m_Stream.WriteChar(szStr[ip]); } }
// Handler for the start of a JSON value located at key path szKey.
//
// Opens the matching structure in m_Writer and adjusts m_iCollectData:
//   ""                            -> start the root object
//   "/paragraphs"                 -> key + array start, m_iCollectData--
//   "/paragraphs/N"               -> object start, m_iCollectData++
//   "/paragraphs/N/sentences"     -> key + array start, m_iCollectData--
//   "/paragraphs/N/sentences/M"   -> m_iCollectData++ only
// Any other path is ignored.
void OnValReadStart(const CFSAString &szKey)
{
    if (szKey.IsEmpty()) {
        m_Writer.ObjectStart();
        return;
    }

    if (szKey=="/paragraphs") {
        m_Writer.Key("paragraphs");
        m_Writer.ArrayStart();
        m_iCollectData--;
        return;
    }

    if (KeyMatch(szKey, "/paragraphs/%d")) {
        m_Writer.ObjectStart();
        m_iCollectData++;
        return;
    }

    if (KeyMatch(szKey, "/paragraphs/%d/sentences")) {
        m_Writer.Key("sentences");
        m_Writer.ArrayStart();
        m_iCollectData--;
        return;
    }

    if (KeyMatch(szKey, "/paragraphs/%d/sentences/%d")) {
        m_iCollectData++;
    }
}
void OnValReadEnd(const CFSAString &szKey, CFSVar &Data) { if (szKey.IsEmpty()) { SubKeys("paragraphs", Data); m_Writer.ObjectEnd(); } else if (szKey=="/paragraphs") { m_Writer.ArrayEnd(); m_iCollectData++; } else if (KeyMatch(szKey, "/paragraphs/%d")) { SubKeys("sentences", Data); m_Writer.ObjectEnd(); m_iCollectData--; } else if (KeyMatch(szKey, "/paragraphs/%d/sentences")) { m_Writer.ArrayEnd(); m_iCollectData++; } else if (KeyMatch(szKey, "/paragraphs/%d/sentences/%d")) { if (Data.KeyExist("words")) { CFSVar &Words=Data["words"]; CFSArray<CMorphInfos> WordsAnalysis; for (INTPTR ip=0; ip<Words.GetSize(); ip++) { const CFSVar &Word=Words[ip]; CMorphInfos Analysis; Analysis.m_szWord=Word["text"].GetWString(); const CFSVar &VarAnalysis=Word["analysis"]; for (INTPTR ip2=0; ip2<VarAnalysis.GetSize(); ip2++) { const CFSVar &VarAnalysis1=VarAnalysis[ip2]; CMorphInfo Analysis1; Analysis1.m_szRoot=VarAnalysis1["root"].GetWString(); Analysis1.m_szEnding=VarAnalysis1["ending"].GetWString(); Analysis1.m_szClitic=VarAnalysis1["clitic"].GetWString(); Analysis1.m_cPOS=VarAnalysis1["partofspeech"].GetWString()[0]; Analysis1.m_szForm=VarAnalysis1["form"].GetWString(); Analysis.m_MorphInfo.AddItem(Analysis1); } WordsAnalysis.AddItem(Analysis); } WordsAnalysis=m_Disambiguator.Disambiguate(WordsAnalysis); RT_ASSERT(Words.GetSize()==WordsAnalysis.GetSize()); for (INTPTR ip=0; ip<Words.GetSize(); ip++) { const CMorphInfos &Analysis=WordsAnalysis[ip]; CFSVar VarAnalysis; VarAnalysis.Cast(CFSVar::VAR_ARRAY); for (INTPTR ipRes=0; ipRes<Analysis.m_MorphInfo.GetSize(); ipRes++) { const CMorphInfo &Analysis1=Analysis.m_MorphInfo[ipRes]; CFSVar VarAnalysis1; VarAnalysis1["root"]=Analysis1.m_szRoot; VarAnalysis1["ending"]=Analysis1.m_szEnding; VarAnalysis1["clitic"]=Analysis1.m_szClitic; VarAnalysis1["partofspeech"]=CFSWString(Analysis1.m_cPOS); VarAnalysis1["form"]=Analysis1.m_szForm; VarAnalysis[ipRes]=VarAnalysis1; } Words[ip]["analysis"]=VarAnalysis; } } m_Writer.Val(Data); 
m_iCollectData--; } }
/// Converts an 8-bit string into a wide string.
///
/// For every code page except PFSCP_HTMLEXT the work is delegated to
/// FSStrAtoW(). For PFSCP_HTMLEXT the input has to be 7-bit ASCII and is
/// processed character by character:
///  - anything but '&' is copied as is;
///  - with autosgml set, "&#NNN;" and "&#xHHH;" are decoded numerically
///    (the value must not exceed 0xFFFF);
///  - any other "&name;" is looked up in the sgml2uc table.
/// When ignoramp is set, an '&' without a terminating ';', an over-long
/// candidate or an unknown name is copied as a plain '&' instead of raising
/// an error.
///
/// @param wStr       Result; emptied first.
/// @param aStr       Input string.
/// @param koodiTabel Code page of the input.
/// @throws VEAD on an empty entity table, non-ASCII input or a bad entity.
void CONV_HTML_UC2::ConvToUc(
    CFSWString& wStr,
    const CFSAString& aStr,
    const PFSCODEPAGE koodiTabel
    )
{
    wStr.Empty();
    if(koodiTabel!=PFSCP_HTMLEXT)
    {
        // Ordinary code pages are handled by the library conversion.
        wStr = FSStrAtoW(aStr, koodiTabel);
        return;
    }
    assert(koodiTabel==PFSCP_HTMLEXT);

    // From here on the table loaded from file is used.
    if(sgml2uc.idxLast<=0)
    {
        throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                   "SGML olemite tabel mallu lugemata");
    }

    const int len=aStr.GetLength();
    for(int pos=0; pos<len; pos++)
    {
        if((aStr[pos] & (~0x7F))!=0) // input must be 7-bit ASCII
        {
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "String peab koosnema ASCII (7bitistest) koodidest",
                       (const char*)aStr+pos);
        }

        if(aStr[pos]!='&') // not the start of an entity - copy unchanged
        {
            wStr += ((FSWCHAR)(aStr[pos])) & 0x7F;
            continue;
        }

        // Possibly the start of an entity of the form &blah;
        const int semiPos=(int)aStr.Find(";", pos+1);
        if(semiPos<0) // ampersand without a terminating semicolon
        {
            if(ignoramp==true)
            {
                wStr += ((FSWCHAR)(aStr[pos])) & 0x7F;
                continue;
            }
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "Ampersandi tagant semi puudu", (const char*)aStr+pos);
        }

        if(autosgml==true && aStr[pos+1]=='#') // numeric reference &#[x|X]12345;
        {
            int code=0;
            int digitPos=pos+2;
            if(aStr[digitPos]=='x' || aStr[digitPos]=='X')
            {
                // hexadecimal form
                digitPos++;
                digitPos+=STRSOUP::UnsignedStr2Hex<int, char>(&code, ((const char*)aStr)+digitPos);
            }
            else
            {
                // decimal form
                digitPos+=STRSOUP::UnsignedStr2Num<int, char>(&code, ((const char*)aStr)+digitPos);
            }

            // The digits have to run right up to the semicolon.
            if(digitPos<=0 || aStr[digitPos]!=';')
            {
                throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                           "Vigane SGML olem", (const char*)aStr+pos);
            }
            if(code>0xFFFF) // has to fit into two bytes
            {
                throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                           "Vigane SGML olem (peab mahtuma 2 baidi peale)",
                           (const char*)aStr+pos);
            }

            wStr += (WCHAR)code;
            pos=semiPos;
            continue;
        }

        if(semiPos-pos+1 > sgml_stringi_max_pikkus) // longer than any table entry
        {
            if(ignoramp==true)
            {
                wStr += ((FSWCHAR)(aStr[pos])) & 0x7F;
                continue;
            }
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "Puudub SGML olemite tabelist", (const char*)aStr+pos);
        }

        // Cut "&bla;" out of the input and look it up.
        CFSAString szSymbol=aStr.Mid(pos, semiPos-pos+1);
        SGML_UC* rec=sgml2uc.Get(&szSymbol);
        if(rec==NULL) // not present in the table
        {
            if(ignoramp==true)
            {
                wStr += ((FSWCHAR)(aStr[pos])) & 0x7F;
                continue;
            }
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "Puudub SGML olemite tabelist", (const char*)szSymbol);
        }

        wStr += rec->uc;
        pos=semiPos;
    }
}
/// Converts a wide string into an 8-bit string.
///
/// For every code page except PFSCP_HTMLEXT the work is delegated to
/// FSStrWtoA(). For PFSCP_HTMLEXT:
///  - 7-bit characters are copied as is, except that '&' is written as the
///    entity "&amp;" when ignoramp is false (so that the result can be read
///    back by ConvToUc, which treats '&' as the start of an entity);
///  - other characters are replaced by their entity from the uc2sgml table;
///  - characters missing from the table are written as "&#NNN;" when
///    autosgml is set and cause an error otherwise.
///
/// @param aStr       Result; emptied first.
/// @param koodiTabel Target code page.
/// @param wStr       Input string.
/// @throws VEAD on an empty entity table, or on a character without an
///         entity while autosgml is false.
void CONV_HTML_UC2::ConvFromUc(
    CFSAString& aStr,
    const PFSCODEPAGE koodiTabel,
    const CFSWString& wStr
    )
{
    aStr.Empty();
    if(koodiTabel!=PFSCP_HTMLEXT)
    {
        // Ordinary code pages are handled by the library conversion.
        aStr = FSStrWtoA(wStr, koodiTabel);
        return;
    }
    assert(koodiTabel==PFSCP_HTMLEXT);

    // From here on the table loaded from file is used.
    if(sgml2uc.idxLast<=0)
        throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                   "SGML olemite tabel mallu lugemata");

    FSWCHAR wc;
    for(int i=0; (wc=wStr[i])!=0; i++)
    {
        if((wc & (~0x7F))==0) // 7-bit ASCII code
        {
            if(ignoramp==false && wc==(FSWCHAR)'&')
                aStr += "&amp;";               // ampersand becomes an entity
            else
                aStr += (char)(wc & 0x7F);     // other ASCII codes are copied
            continue;
        }

        // Not ASCII - should be present in the entity table.
        SGML_UC* rec;
        if((rec=uc2sgml.Get((const FSWCHAR*)wStr+i))!=NULL)
        {
            aStr += rec->sgml;
            continue;
        }

        // Fall back to a decimal numeric reference. It is built before the
        // autosgml test because the error message below quotes it.
        int olemiAlgusPos=aStr.GetLength();
        aStr+="&#";
        STRSOUP::UnsignedNum2Str<int, CFSAString, char, 10>((unsigned int)(wStr[i]), aStr);
        aStr+=';';
        if(autosgml==false)
        {
            throw VEAD(ERR_X_TYKK, ERR_ARGVAL, __FILE__,__LINE__, "$Revision: 557 $",
                       "UniCode'i kood programmi SGML olemite tabelist puudu, 10ndkood",
                       (const char*)aStr+olemiAlgusPos);
        }
    }
}
/// Reads one JSON value starting at the current character m_cCh.
///
/// OnValReadStart() is called before and OnValReadEnd() after the value has
/// been read. Arrays and objects are read recursively; the key path of a
/// child is szKeyPath + "/" + index-or-key. Children are stored into the
/// returned value only while m_iCollectData is positive.
///
/// @param szKeyPath Key path of the value being read.
/// @return The value that was read.
/// @throws CJSONException on premature end of input or a syntax error.
CFSVar CJSONReader::ReadVal(const CFSAString &szKeyPath)
{
    OnValReadStart(szKeyPath);
    CFSVar Result;

    if (m_cCh=='[') {
        Result.Cast(CFSVar::VAR_ARRAY);
        GetChar(true);
        for (INTPTR ipIndex=0; ; ipIndex++) {
            if (m_cCh==0) {
                throw CJSONException(FSTSTR("Unexpetcted EOF"));
            }
            if (m_cCh==']') {
                GetChar(true);
                break;
            }
            // Every element but the first has to be preceded by a comma.
            if (ipIndex>0) {
                if (m_cCh!=',') {
                    throw CJSONException(FSTSTR("Missing ',' in array"));
                }
                GetChar(true);
            }

            CFSAString szIndex;
            szIndex.Format("%zd", ipIndex);
            CFSVar Element=ReadVal(szKeyPath+"/"+szIndex);
            if (m_iCollectData>0) {
                Result[ipIndex]=Element;
            }
        }
    }
    else if (m_cCh=='{') {
        Result.Cast(CFSVar::VAR_MAP);
        GetChar(true);
        for (INTPTR ipCount=0; ; ipCount++) {
            if (m_cCh==0) {
                throw CJSONException(FSTSTR("Unexpetcted EOF"));
            }
            if (m_cCh=='}') {
                GetChar(true);
                break;
            }
            // Every member but the first has to be preceded by a comma.
            if (ipCount>0) {
                if (m_cCh!=',') {
                    throw CJSONException(FSTSTR("Missing ',' in map"));
                }
                GetChar(true);
            }

            // A key is either a quoted string or bare text starting with a letter.
            CFSAString szKey;
            if (m_cCh=='\"' || m_cCh=='\'') {
                szKey=ReadString();
            }
            else if (FSIsLetter(m_cCh)) {
                szKey=ReadText();
            }
            else {
                throw CJSONException(FSTSTR("Expected key"));
            }

            if (m_cCh!=':') {
                throw CJSONException(FSTSTR("Expected ':'"));
            }
            GetChar(true);

            CFSVar Member=ReadVal(szKeyPath+"/"+szKey);
            if (m_iCollectData>0) {
                Result[szKey]=Member;
            }
        }
    }
    else if (m_cCh=='\"' || m_cCh=='\'') {
        Result=ReadString();
    }
    else if ((m_cCh>='0' && m_cCh<='9') || FSStrChr("-+.", m_cCh)) {
        Result=ReadNumber();
    }
    else if (FSIsLetter(m_cCh)) {
        Result=ReadConst();
    }
    else if (!m_cCh) {
        // End of input: the value stays empty.
    }
    else {
        throw CJSONException(FSTSTR("Unknown value type"));
    }

    OnValReadEnd(szKeyPath, Result);
    return Result;
}
void MKLASSID2DCT::Run( DCTMETASTRCT& meta, ///< Sõnastiku struktuurihoidla const TAGS2DCT& tags ///< Ühestamismärgendite massiiv ) { CPFSFile in; if(in.Open(FSTSTR("klassid.txt"), FSTSTR("rb"))==false) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt avamisega"); CFSAString rida; // tõsta kohe sõnastikku ümber... int n, reaNr, kokkuRidu; long pos=meta.Tell(); meta.Add(DCTELEMID_T3M_KLASSID, pos); if(in.ReadLine(&rida)==false) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); sscanf((const char*)rida, "%d", &kokkuRidu); if(meta.WriteUnsigned<UB4, int>(kokkuRidu)==false) // mitmsusklasside arv throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__, " ", "Jama pakitud sõnastikku kirjutamisega"); for(reaNr=1; in.ReadLine(&rida)==true; reaNr++) { printf("%06d:%06d\r", kokkuRidu, reaNr); int tyhik, vordus; rida += " "; if((tyhik=(int)rida.Find(' '))<0) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); if(sscanf((const char*)rida, "%d", &n)!=1) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); meta.WriteUnsigned<UB1, int>(n); // jooksva mitmesusklassi suurus for(int i=0; i<n; i++) { if((vordus=(int)rida.Find('=', tyhik+1))<=tyhik+1) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega", (const char*)rida); CFSAString tagStr=rida.Mid(tyhik+1, vordus-tyhik-1); int tagIdx=tags.GetIdx(&tagStr); if(tagIdx<0) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", "Tundmatu ühestamismärgend andmefailis klassid.txt ", (const char*)rida); if(meta.WriteUnsigned<UB1,int>(tagIdx)==false) throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__, " ", "Jama pakitud sõnastikku kirjutamisega"); UKAPROB tagProb; if(sscanf(((const char*)rida)+vordus+1, "%e", &tagProb)!=1) throw VEAD( ERR_X_TYKK, ERR_ROTTEN, __FILE__, __LINE__, " ", 
"Puuduv tõenäosus failis klassid.txt ", (const char*)rida); if(meta.WriteBuffer(&tagProb,sizeof(UKAPROB))==false) throw VEAD( ERR_HMM_MOOTOR, ERR_WRITE, __FILE__, __LINE__, " ", "Jama pakitud sõnastikku kirjutamisega"); if((tyhik=(int)rida.Find(' ', tyhik+1))<=0) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); if(vordus >= tyhik) throw VEAD( ERR_HMM_MOOTOR, ERR_RD, __FILE__, __LINE__, " ", "Jama andmefaili klassid.txt lugemisega"); } } printf("\n"); }
void CJSONWriter::Text(const CFSAString &szStr) { for (INTPTR ip=0; ip<szStr.GetLength(); ip++) { if (szStr[ip]=='\"') m_Stream.WriteChar('\\'); m_Stream.WriteChar(szStr[ip]); } }