// Writes this time's four components into `toDictionary`, one entry per
// component, under the milliseconds / seconds / minutes / hours keys.
//
// toDictionary: destination dictionary; receives one value per key.
void CTime::serialize(CDictionary& toDictionary) const
{
    // Entries are written from the smallest unit to the largest.
    toDictionary.setValueForKey(cMillisecondsKey, milliseconds());
    toDictionary.setValueForKey(cSecondsKey, seconds());
    toDictionary.setValueForKey(cMinutesKey, minutes());
    toDictionary.setValueForKey(cHoursKey, hours());
}
//对字符串用最大匹配法(正向或逆向)处理 string SegmentSentence(string s1) { string s2 = ""; //用s2存放分词结果 while (!s1.empty()) { int len = (int)s1.length(); // 取输入串长度 if (len > MaxWordLength) // 如果输入串长度大于最大词长 { len = MaxWordLength; // 只在最大词长范围内进行处理 } //string w = s1.substr(0, len); // (正向用)将输入串左边等于最大词长长度串取出作为候选词 string w = s1.substr(s1.length() - len, len); //逆向用 int n = WordDic.FindWord(w); // 在词典中查找相应的词 while (len > 2 && n == 0) // 如果不是词 { len -= 2; // 从候选词右边减掉一个汉字,将剩下的部分作为候选词 //w = w.substr(0, len); //正向用 w = s1.substr(s1.length() - len, len); //逆向用 n = WordDic.FindWord(w); } //s2 += w + Separator; // (正向用)将匹配得到的词连同词界标记加到输出串末尾 w = w + Separator; // (逆向用) s2 = w + s2; // (逆向用) //s1 = s1.substr(w.length(), s1.length()); //(正向用)从s1-w处开始 s1 = s1.substr(0, s1.length() - len); // (逆向用) } return s2; }
/*
 * Spells out `num` in words by looking up dictionary keys built from its
 * digits (as produced by Num2Digits(num, power)).
 *
 * The logic inside the loop may look convoluted and somewhat hard to maintain.
 * That was a deliberate trade-off in favour of a small dictionary; it can be
 * made more maintainable if desired.
 *
 * Keys requested from the dictionary D:
 *   "d<pos><digit>"  - digit in a "units" position (pos % 3 == 0)
 *   "<n>"            - tens (n = digit*10), teens (10..19), hundreds (digit*100)
 *   "d<pos>0"        - group name requested after a teen consumed the units digit
 * Every translation is followed by a single space.
 */
string Translate(int num,int power)
{
    string result;
    char key[100];

    vector<int> digits = Num2Digits(num, power);
    vector<int>::iterator cur = digits.begin();
    const vector<int>::iterator last = digits.end();

    // Position of the current digit counted from the least significant one.
    int position = digits.size() - 1;

    for (; cur != last; ++cur, --position)
    {
        const int slot = position % 3;

        if (slot == 0)
        {
            // Units position: always translated, even for a zero digit.
            sprintf(key, "d%d%d", position, *cur);
            result += D.GetTranslate(key) + " ";
        }
        else if (slot == 1)
        {
            // Tens position: a zero digit contributes nothing.
            if (*cur)
            {
                if (*cur == 1)
                {
                    // 10..19: consume the following units digit as well.
                    const int tens = 10 * (*cur);
                    ++cur;
                    --position;
                    const int teen = tens + (*cur);

                    sprintf(key, "%d", teen);
                    result += D.GetTranslate(key) + " ";

                    // The units digit was swallowed, so ask for the group
                    // key with digit 0 for the (now current) units position.
                    sprintf(key, "d%d0", position);
                    result += D.GetTranslate(key) + " ";
                }
                else
                {
                    sprintf(key, "%d", *cur * 10);
                    result += D.GetTranslate(key) + " ";
                }
            }
        }
        else if (slot == 2)
        {
            // Hundreds position: a zero digit contributes nothing.
            if (*cur)
            {
                sprintf(key, "%d", *cur * 100);
                result += D.GetTranslate(key) + " ";
            }
        }
    }

    return result;
}
// Builds the word-bigram graph: for every pair (current word, following word)
// found in aWord, computes a smoothed cost and stores it in aBinaryWordNet.
//
// Side effects: sets m_nWordCount and (re)allocates m_npWordPosMapTable, which
// maps a word index to its encoded position row*MAX_SENTENCE_LEN+col.
//
//CDynamicArray &aWord: the words array
//CDynamicArray &aBinaryWordNet: the net between words (output)
//double dSmoothingPara: the parameter of data smoothing (the "a" in the formula below)
//CDictionary &DictBinary: the binary (word-pair) dictionary
//CDictionary &DictCore: the Core dictionary
// Returns: always true.
bool CSegment::BiGraphGenerate(CDynamicArray &aWord, CDynamicArray &aBinaryWordNet,double dSmoothingPara,CDictionary &DictBinary,CDictionary &DictCore)
{
    PARRAY_CHAIN pTail,pCur,pNextWords;//Temp buffer
    unsigned int nWordIndex=0,nTwoWordsFreq=0,nCurWordIndex,nNextWordIndex;
    //nWordIndex: the index number of current word
    double dCurFreqency,dValue,dTemp;
    char sTwoWords[WORD_MAXLENGTH];
    m_nWordCount=aWord.GetTail(&pTail);//Get tail element and return the words count
    if(m_npWordPosMapTable)
    {//free the table left over from a previous call
        delete [] m_npWordPosMapTable;
        m_npWordPosMapTable=0;
    }
    if(m_nWordCount>0)//Word count is greater than 0
    {
        m_npWordPosMapTable=new int[m_nWordCount];//Record the position of possible words
        memset(m_npWordPosMapTable,0,m_nWordCount*sizeof(int));
    }
    // First pass: fill the position map, one entry per list node in list order.
    // NOTE(review): assumes the list is empty when m_nWordCount is 0 (the table
    // is null in that case) -- confirm against CDynamicArray.
    pCur=aWord.GetHead();
    while(pCur!=NULL)//Set the position map of words
    {
        m_npWordPosMapTable[nWordIndex++]=pCur->row*MAX_SENTENCE_LEN+pCur->col;
        pCur=pCur->next;
    }

    // Second pass: for each word, visit every word that starts where it ends.
    pCur=aWord.GetHead();
    while(pCur!=NULL)//
    {
        if(pCur->nPOS>=0)//It's not an unknown word: its frequency is stored in value
            dCurFreqency=pCur->value;
        else//Unknown word: look the frequency up in the core dictionary
            dCurFreqency=DictCore.GetFrequency(pCur->sWord,2);
        aWord.GetElement(pCur->col,-1,pCur,&pNextWords);//Get next words which begin with pCur->col
        while(pNextWords&&pNextWords->row==pCur->col)//Next words
        {
            // Build the pair key "<current><WORD_SEGMENTER><next>".
            // NOTE(review): strcpy/strcat are unchecked; the key must fit in
            // WORD_MAXLENGTH bytes -- confirm the word length limits.
            strcpy(sTwoWords,pCur->sWord);
            strcat(sTwoWords,WORD_SEGMENTER);
            strcat(sTwoWords,pNextWords->sWord);
            nTwoWordsFreq=DictBinary.GetFrequency(sTwoWords,3);
            //Two linked Words frequency
            dTemp=(double)1/MAX_FREQUENCE;
            //Smoothing
            dValue=-log(dSmoothingPara*(1+dCurFreqency)/(MAX_FREQUENCE+80000)+(1-dSmoothingPara)*((1-dTemp)*nTwoWordsFreq/(1+dCurFreqency)+dTemp));
            //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
            if(pCur->nPOS<0)//Unknown words: P(Wi|Ci);while known words:1
                dValue+=pCur->value;

            //Get the position index of current word in the position map table
            nCurWordIndex=BinarySearch(pCur->row*MAX_SENTENCE_LEN+pCur->col,m_npWordPosMapTable,m_nWordCount);
            nNextWordIndex=BinarySearch(pNextWords->row*MAX_SENTENCE_LEN+pNextWords->col,m_npWordPosMapTable,m_nWordCount);
            aBinaryWordNet.SetElement(nCurWordIndex,nNextWordIndex,dValue,pCur->nPOS);
            pNextWords=pNextWords->next;//Get next word
        }
        pCur=pCur->next;
    }
    return true;
}
void CTime::deserialize(const CDictionary& fromDictionary) { UInt32 ms = fromDictionary.valueAsUInt32ForKey(cMillisecondsKey); UInt32 s = fromDictionary.valueAsUInt32ForKey(cSecondsKey); UInt32 m = fromDictionary.valueAsUInt32ForKey(cMinutesKey); UInt32 h = fromDictionary.valueAsUInt32ForKey(cHoursKey); mData = h * MILLISECONDS_IN_HOUR + m * MILLISECONDS_IN_MINUTE + s * MILLISECONDS_IN_SECOND + ms; }
//POS tagging with Hidden Markov Model bool CSpan::POSTagging(PWORD_RESULT pWordItems,CDictionary &dictCore,CDictionary &dictUnknown) { //pWordItems: Items; nItemCount: the count of items;core dictionary and unknown recognition dictionary int i=0,j,nStartPos; Reset(false); while(i>-1&&pWordItems[i].sWord[0]!=0) { nStartPos=i;//Start Position i=GetFrom(pWordItems,nStartPos,dictCore,dictUnknown); GetBestPOS(); switch(m_tagType) { case TT_NORMAL://normal POS tagging j=1; while(m_nBestTag[j]!=-1&&j<m_nCurLength) {//Store the best POS tagging pWordItems[j+nStartPos-1].nHandle=m_nBestTag[j]; //Let 。be 0 if(pWordItems[j+nStartPos-1].dValue>0&&dictCore.IsExist(pWordItems[j+nStartPos-1].sWord,-1))//Exist and update its frequncy as a POS value pWordItems[j+nStartPos-1].dValue=LOG_MAX_FRQUENCE-log((double)dictCore.GetFrequency(pWordItems[j+nStartPos-1].sWord,m_nBestTag[j])+1); j+=1; } break; case TT_PERSON://Person recognition /*clock_t lStart,lEnd; lStart=clock(); */ SplitPersonPOS(dictUnknown); //lEnd=clock(); //printf("SplitPersonPOS=%f\n",(double)(lEnd-lStart)*1000/CLOCKS_PER_SEC); //Spit Persons POS //lStart=clock(); PersonRecognize(dictUnknown); //lEnd=clock(); //printf("PersonRecognize=%f\n",(double)(lEnd-lStart)/CLOCKS_PER_SEC); //Person Recognition with the person recognition dictionary break; case TT_PLACE://Place name recognition PlaceRecognize(dictCore,dictUnknown); break; case TT_TRANS://Transliteration TransRecognize(dictCore,dictUnknown); break; default: break; } Reset(); } return true; }
// Splits a candidate Chinese person name into surname, optional second
// surname and given name, and decides whether the split looks plausible.
//
// sPersonName: input name, 3..8 bytes long (anything else is rejected).
// sSurname:    output; receives the first 4, 2 or 0 bytes of the name. It is
//              written at index 4, so the buffer must hold at least 5 bytes.
// sSurname2:   output; second surname, or empty string when there is none.
// sGivenName:  output; the rest of the name after the surname(s).
// personDict:  dictionary queried with handle 1 for surnames and handle 2 for
//              the first given-name character.
// Returns: true when the name is accepted, false otherwise. The output
//          buffers may already have been written when false is returned.
bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict)
{
    int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType,nFreqGiven;
    char sTemp[3];
    if(nLen<3||nLen>8)//Not a traditional Chinese person name
        return false;
    // Reject names containing anything other than CT_CHINESE / CT_OTHER
    // characters. The scan advances 2 bytes at a time (presumably a
    // double-byte encoding -- TODO confirm).
    while(i<nLen)//Must not include non-Chinese chars
    {
        nCharType=charType((unsigned char*)sPersonName+i);
        if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER)
            return false;
        i+=2;
    }
    sSurname2[0]=0;//init
    // Try a 4-byte surname first, then a 2-byte one, then give up (length 0).
    strncpy(sSurname,sPersonName,nSurNameLen);
    sSurname[nSurNameLen]=0;
    if(!personDict.IsExist(sSurname,1))
    {
        nSurNameLen=2;
        sSurname[nSurNameLen]=0;
        if(!personDict.IsExist(sSurname,1))
        {
            nSurNameLen=0;
            sSurname[nSurNameLen]=0;
        }
    }
    strcpy(sGivenName,sPersonName+nSurNameLen);
    if(nLen>6)
    {
        strncpy(sTemp,sPersonName+nSurNameLen,2);
        sTemp[2]=0;//Get the second possible surname
        if(personDict.IsExist(sTemp,1))
        {//Hong Kong women's name: Surname+surname+given name
            strcpy(sSurname2,sTemp);
            strcpy(sGivenName,sPersonName+nSurNameLen+2);
        }
    }
    nFreq=personDict.GetFrequency(sSurname,1);
    strncpy(sTemp,sGivenName,2);
    sTemp[2]=0;
    nFreqGiven=personDict.GetFrequency(sTemp,2);
    // A 4-byte surname is always accepted here. Otherwise reject when:
    //  - no surname was found and the name is longer than 4 bytes, or
    //  - the given name is longer than 4 bytes, or
    //  - the name has >=3 foreign-name characters while the surname and the
    //    first given-name character are both much rarer than two reference
    //    entries of the dictionary, or
    //  - the surname is rare (<10) and every given-name character is a
    //    foreign-name character.
    if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)>=3&&nFreq<personDict.GetFrequency("张",1)/40&&nFreqGiven<personDict.GetFrequency("华",2)/20)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2)))
        return false;
    if(nLen==4&&m_uPerson.IsGivenName(sPersonName))
    {//A 4-byte string that is itself a given name: not "single surname + given name"
        return false;
    }
    return true;
}
int main(int argc, char *argv[]) { SetConsoleOutputCP(1251); SetConsoleCP(1251); if (argc != 1) { cout << "The command has no additional parameters" << endl; return 0; } string englishWord; CDictionary dictionary; CTranslator translator(dictionary); while (getline(cin, englishWord) && englishWord != EXIT_LINE) { if (!englishWord.empty()) { ConvertToLowercase(englishWord); translator.DoDialogWithUser(englishWord); } } string answer; if (!dictionary.IsNewWordsListEmpty()) { cout << "Do you want to save new words? YES/NO" << endl; while (getline(cin, answer)) { if (answer == "YES") { dictionary.DictionarySave(); break; } else if (answer == "NO") { break; } else { cout << "Try again" << endl; } } } return 0; }
// Sums, over the words [nStartPos, nStartPos+nLength), the value
//   log(contextFreq(tag)+1) - log(wordFreq(word, tag)+1)
// where `tag` is the word's best tag, contextFreq comes from m_context and
// wordFreq comes from `dict`.
//
// nStartPos: index of the first word in m_sWords / m_nBestTag.
// nLength:   number of words to include.
// dict:      dictionary that supplies the word-with-tag frequency.
// Returns the accumulated sum. (A context-transition term existed in earlier
// code but is disabled; only the per-word term is used.)
ELEMENT_TYPE CSpan::ComputePossibility(int nStartPos,int nLength,CDictionary &dict)
{
    ELEMENT_TYPE dTotal=0;
    ELEMENT_TYPE dWordTerm; // contribution of a single word
    int nWordFreq;          // frequency of the word carrying its best tag
    const int nStopPos=nStartPos+nLength;

    for(int nWord=nStartPos;nWord<nStopPos;nWord++)
    {
        nWordFreq=dict.GetFrequency(m_sWords[nWord],m_nBestTag[nWord]);
        dWordTerm=log((double)(m_context.GetFrequency(0,m_nBestTag[nWord])+1))-log((double)(nWordFreq+1));
        dTotal+=dWordTerm;
    }
    return dTotal;
}
int main(int argc, char *argv[]) { SetConsoleOutputCP(1251); SetConsoleCP(1251); if (argc != 2) { cout << "Not enough parameters. The correct command line format:\ndictionary.exe <file name>" << "\n"; return 1; } else { ifstream inputFile(argv[1]); string inputWord; CDictionary dictionary; if (inputFile.is_open()) { dictionary.FillDictionary(inputFile); } cout << "Для перевода введите слово на английском. Для выхода введите '...'\n"; ofstream outputFile; outputFile.open(argv[1], ios::app); getline(cin, inputWord); while (inputWord != "...") { if (dictionary.IsKnowWord(inputWord)) { dictionary.PrintTranslation(inputWord); } else if (!inputWord.empty()) { dictionary.SaveTranslationForNewWordOnUserDemand(inputWord); } getline(cin, inputWord); } dictionary.SaveNewWordsOnUserDemand(outputFile); } return 0; }
// Loads one run of words from pWordItems, starting at item nIndex, into the
// span buffers (slot numbering starts at 1):
//   m_sWords[i]         - the word text
//   m_nWordPosition[i+1]- running byte offset after word i
//   m_nTags[i][*]       - candidate tags, terminated by -1
//   m_dFrequency[i][*]  - cost associated with each candidate tag
// The run ends at the first word that has exactly one candidate tag, at the
// end of the items, or when MAX_WORDS_PER_SENTENCE slots have been used.
// m_nCurLength is set to the number of slots in use and m_nStartPos to the
// offset after the last loaded word.
//
// pWordItems:  zero-terminated item array (ends at the first empty sWord).
// nIndex:      index of the first item to load.
// dictCore:    the core dictionary.
// dictUnknown: the unknown-word recognition dictionary.
// Returns: index of the item at which the next run should start, or -1 when
//          the end of the items was reached.
int CSpan::GetFrom(PWORD_RESULT pWordItems,int nIndex,CDictionary &dictCore, CDictionary &dictUnknown)
{
    int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
    int nFreq=0,j,nRetPos=0,nWordsIndex=0;
    bool bSplit=false;//Need to split in Transliteration recognition
    int i=1;
    nWordsIndex=i+nIndex-1;
    for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
    {
        if(m_tagType==TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
        {
            strcpy(m_sWords[i],pWordItems[nWordsIndex].sWord);//store current word
            m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
        }
        else
        {
            // The word exists in dictUnknown with handle 44: it is split over
            // two consecutive slots -- its first 2 bytes now, the remainder on
            // the next iteration (nWordsIndex is not advanced while bSplit).
            if(!bSplit)
            {
                strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord,2);//store the first 2 bytes
                m_sWords[i][2]=0;
                bSplit=true;
            }
            else
            {
                unsigned int nLen=strlen(pWordItems[nWordsIndex].sWord+2);
                strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);//store the remainder
                m_sWords[i][nLen]=0;
                bSplit=false;
            }
            m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
        }
        //Record the position of current word
        m_nStartPos=m_nWordPosition[i+1];
        //Move the Start POS to the ending
        if(m_tagType!=TT_NORMAL)
        {
            //Get the POSs from the unknown recognition dictionary
            dictUnknown.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
            for(j=0;j<nCount;j++)
            {//Get the POS set of the current word in the unknown dictionary
                m_nTags[i][j]=aPOS[j];
                m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(m_context.GetFrequency(0,aPOS[j])+1));
            }
            //Get the POS set of the current word in the core dictionary
            //We ignore the POS in the core dictionary and recognize them as other (0).
            //We add their frequency to get the possibility as POS 0
            dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
            nFreq=0;
            for(int k=0;k<nCount;k++)
            {
                nFreq+=aFreq[k];
            }
            if(nCount>0)
            {
                // Append one extra candidate with tag 0 after the unknown-dictionary ones.
                m_nTags[i][j]=0;
                //m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
                m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(m_context.GetFrequency(0,0)+1));
                j++;
            }
        }
        else//For normal POS tagging
        {
            j=0;
            if(pWordItems[nWordsIndex].nHandle>0)
            {//The word has only one POS value
             //Its POS and frequency were recorded in the item.
                m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
                m_dFrequency[i][j]=pWordItems[nWordsIndex].dValue-LOG_MAX_FRQUENCE+log((double)(m_context.GetFrequency(0,m_nTags[i][j])+1));
                if(m_dFrequency[i][j]<0)//Values less than 0 are not permitted
                    m_dFrequency[i][j]=0;
                j++;
            }
            else
            {//nHandle<=0: the remaining information is retrieved from the Core Dictionary
                if(pWordItems[nWordsIndex].nHandle<0)
                {//Negative handle: the tag was recorded (negated) in the item.
                    if(pWordItems[nWordsIndex].nHandle==-'t'*256-'t')//tt
                    {
                        // A "tt" item yields two candidates, 'n''r' and 'n''s',
                        // weighted by dRatio and 1-dRatio respectively.
                        char sWordOrg[100],sPostfix[10];
                        double dRatio=0.6925;//The ratio which transliteration as a person name
                        PostfixSplit(pWordItems[nWordsIndex].sWord,sWordOrg,sPostfix);
                        if(sPostfix[0]!=0)
                            dRatio=0.01;
                        m_nTags[i][j]='n'*256+'r';
                        m_dFrequency[i][j]=-log(dRatio)+pWordItems[nWordsIndex].dValue;
                        //m_dFrequency[i][j]=log(dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
                        //P(W|R)=P(WRT)/P(RT)=P(R)*P(W|T)/P(R|T)
                        j++;
                        m_nTags[i][j]='n'*256+'s';
                        m_dFrequency[i][j]=-log(1-dRatio)+pWordItems[nWordsIndex].dValue;
                        //m_dFrequency[i][j]=log(1-dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
                        j++;
                    }
                    else//Unknown words such as Chinese person name or place name
                    {
                        m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
//                      m_dFrequency[i][j++]=(double)(1+pWordItems[nWordsIndex].nFrequency)/(double)(m_context.GetFrequency(0,aPOS[j])+1);
                        m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
                    }
                }
                dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
                // NOTE(review): the loop starts at the current j, so when
                // candidates were already added above, core entries with an
                // index below j are not copied -- confirm this is intended.
                for(;j<nCount;j++)
                {//Get the POS set of the current word in the core dictionary
                    m_nTags[i][j]=aPOS[j];
                    m_dFrequency[i][j]=-log((double)1+aFreq[j])+log((double)m_context.GetFrequency(0,m_nTags[i][j])+1);
                }
            }
        }
        if(j==0)
        {//We do not know the POS, so we have to guess it from lexical knowledge
            GuessPOS(i,&j);//Guess the POS of current word
        }
        m_nTags[i][j]=-1;//Set the ending POS
        if(j==1)//No ambiguity
        {//Exactly one candidate: the run ends here
            i++;
            m_sWords[i][0]=0;
            break;
        }
        if(!bSplit)
        {nWordsIndex++;}
    }
    if(pWordItems[nWordsIndex].sWord[0]==0)
        nRetPos=-1;//Reaching ending
    if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
    {//The last loaded word is still ambiguous: append a virtual ending slot
     //with a single tag (101 for unknown-word modes, 1 for normal tagging)
        if(m_tagType!=TT_NORMAL)
            m_nTags[i][0]=101;
        else
            m_nTags[i][0]=1;
        m_dFrequency[i][0]=0;
        m_sWords[i][0]=0;//Set virtual ending
        m_nTags[i++][1]=-1;
    }
    m_nCurLength=i;//The current word count
    if(nRetPos!=-1)
        return nWordsIndex+1;//Next start position
    return -1;//Reaching ending
}
// Finds person names in the current span by matching the best-tag sequence
// against a fixed list of role patterns. Each tag t is turned into the letter
// 't'+'A' (so tag 1 is 'B', tag 2 is 'C', ...), and the resulting string is
// scanned left to right. Every accepted match is appended to
// m_nUnknownWords / m_dWordsPossibility (m_nUnknownIndex is advanced).
//
// personDict: the person recognition dictionary.
// Returns: always true.
bool CSpan::PersonRecognize(CDictionary &personDict)
{
    char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
    // The three tables below are parallel: pattern text, pattern weight and
    // pattern length. An entry of length 0 terminates the list.
    char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE", "BG", "BXD","BZ", "CDCD","CD","EE", "FB", "Y","XD",""};
    double dFactor[]={0.0011,0.0011,0.0011,0.0011,0.7614,0.0011,0.2055, 0.0160,0.0011,0.0011,0,0.0160,0.0011, 0.0160,0.0011,0.0011,0 };
    //About the weights (counts quoted from the original author):
    /*
        Given Name: 486     0.0160
        Surname+postfix:484 0.0160
        m_lPerson2Num:6265  0.2055
        m_lPerson3Num: 23184 0.7614
        m_lPerson4Num:32    0.0011
    */
    //The person recognition pattern set (role letters):
    //BBCD: surname + surname + given-name char 1 + given-name char 2
    //BBE:  surname + surname + single-char given name
    //BBZ:  surname + surname + two-char given name that forms a word
    //BCD:  surname + given-name char 1 + given-name char 2
    //BE:   surname + single-char given name
    //BEE:  surname + single char + single char
    //BG:   surname + suffix
    //BXD:  surname + (surname and first given-name char forming a word) + last given-name char
    //BZ:   surname + two-char given name that forms a word
    //B:    surname
    //CD:   given-name char 1 + given-name char 2
    //EE:   single char + single char
    //FB:   prefix + surname
    //XD:   (surname and first given-name char forming a word) + last given-name char
    //Y:    surname + single-char given name forming a word
    int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};

    int i;
    for(i=1;m_nBestTag[i]>-1;i++)//Convert the tag sequence to a string
        sPOS[i]=m_nBestTag[i]+'A';
    sPOS[i]=0;
    int j=1,k,nPos;//Find the proper pattern from the first POS
    int nLittleFreqCount;//Counter for the person name roles with little frequency
    bool bMatched=false;
    while(j<i)
    {
        bMatched=false;
        for(k=0;!bMatched&&nPatternLen[k]>0;k++)
        {
            // The pattern must match at j, and neither the word before the
            // match nor the word right after it may be the "·" separator.
            if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(m_sWords[j-1],"·")!=0&&strcmp(m_sWords[j+nPatternLen[k]],"·")!=0)
            {//Find the proper pattern k
                if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
                {//Rule 1 for exclusion: prefix + surname followed by a given-name
                 //or suffix role: the (prefix + surname) rule does not apply
                    continue;
                }
    /*          Disabled rules kept from the original source:
                if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
                {//Rule 2 for exclusion: for surname+single+single / single+single,
                 //the rule fails when the two E characters differ
                    continue;
                }

                if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
                {//Rule 3 for exclusion: a lone surname not followed by a suffix
                    continue;
                }
    */          //Get the possible name
                nPos=j;//Record the person position in the tag sequence
                sPersonName[0]=0;
                nLittleFreqCount=0;//Record the number of roles with little frequency
                while(nPos<j+nPatternLen[k])
                {//Get the possible person name
                 //
                    if(m_nBestTag[nPos]<4&&personDict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos])<LITTLE_FREQUENCY)
                        nLittleFreqCount++;//The counter increase
                    strcat(sPersonName,m_sWords[nPos]);
                    nPos+=1;
                }
                // NOTE: the exclusions below that modify j do so while the
                // pattern loop keeps running, so the remaining patterns are
                // tried at the new j.
                if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
                {//Exclusion of foreign names
                 //Rule 2 for exclusion: when every character is a foreign-name
                 //character, the (given 1 + given 2) rule does not apply
                    j+=nPatternLen[k]-1;
                    continue;
                }
                if(strcmp(sPatterns[k],"CDCD")==0)
                {//Rule for exclusion
                 //The CDCD pattern is itself an exclusion rule and never yields a name.
                 //Rule 3 for exclusion: it only takes effect (skips the matched
                 //words) when the text contains foreign-name characters;
                 //otherwise the exclusion does not apply.
                    if(GetForeignCharCount(sPersonName)>0)
                        j+=nPatternLen[k]-1;
                    continue;
                }
                if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
                {//Given 1 + given 2 made only of foreign-name characters: skip
                    j+=nPatternLen[k]-1;
                    continue;
                }
                if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
                //All roles (or three of them) appear with low frequency: ignore the match
                    continue;
                m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[j];
                m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[j+nPatternLen[k]];
                m_dWordsPossibility[m_nUnknownIndex]=log(dFactor[k])+ComputePossibility(j,nPatternLen[k],personDict);
                //Multiply by the pattern factor (added in the log domain)
                m_nUnknownIndex+=1;
                j+=nPatternLen[k];
                bMatched=true;
            }
        }
        if(!bMatched)//Not matched, add j by 1
            j+=1;
    }
    return true;
}
int main(int argc, char **argv) { if (argc != 4) PrintUsage(); if ( !strcmp (argv[1], "-h") || !strcmp (argv[1], "-help") || !strcmp (argv[1], "/h") || !strcmp (argv[1], "/help") ) PrintUsage(); string Action = argv[1]; if ( (Action != "ToTxt") && (Action != "FromTxt") ) PrintUsage(); string FileName = argv[2]; CDictionary Dict; if (Action == "FromTxt") { if (access (FileName.c_str(), 04) != 0) { fprintf (stderr, "Cannot read %s\n",FileName.c_str()); return 1; }; if (!Dict.LoadOnlyConstants(argv[3])) { fprintf (stderr, "Cannot load an empty dictionary from %s\n",argv[3]); return 1; }; Dict.m_bShouldSaveComments = true; string Messages; bool bResult = Dict.ImportFromText(FileName,false, iceSkip,1, Messages); fprintf (stderr, "%s", Messages.c_str() ); if (bResult) if (Dict.Save()) return 0; return 1; } else { if (!Dict.Load(argv[3]) || !Dict.ReadUnitComments()) { fprintf (stderr, "Cannot load dictionary from %s\n",argv[3]); return 1; }; FILE * fp = fopen (FileName.c_str(),"wb"); if (!fp) { fprintf (stderr, "Cannot write to %s\n",FileName.c_str()); return 1; }; CTempArticle A; A.m_pRoss = &Dict; for (WORD i = 0; i < Dict.m_Units.size(); i++) { fprintf (fp,"============\r\n"); fprintf (fp,"%s", Dict.GetUnitTextHeader(i).c_str()); try { A.ReadFromDictionary(i, false, true); if (!A.ArticleToText()) { fprintf (fp,"Error! Cannot get the entry No %i\r\n", i); return 1; }; fprintf (fp,"%s",A.GetArticleStr().c_str()); } catch (...) { fprintf (fp,"Error! Cannot get the entry No %i\r\n", i); return 1; } }; fclose(fp); return 0; }; }
// Forwards to dictionary.AddWord(w, val, false).
// NOTE(review): the meaning of the trailing `false` flag is defined by
// AddWord, which is not visible here.
void InjectWord(unsigned w, unsigned val) { dictionary.AddWord(w, val, false); }
// Splits, in place, every word whose best tag is 21 or 22 into two words.
// The span is scanned from the last word down to index 1; for each hit the
// following entries of m_sWords / m_nBestTag / m_nWordPosition are shifted up
// by one slot and m_nCurLength grows by one.
//   tag 21 -> first part gets tag 11, last part gets tag 1
//   tag 22 -> first part gets tag 1, 4 or 3 (see below), last part gets tag 12
//
// unlistDict: dictionary used to decide where to cut the word.
// Returns: always true.
//
// NOTE(review): the part buffers are 50 bytes and all copies are unchecked;
// words shorter than 2 bytes would make `nLenWord-2` wrap (unsigned) --
// confirm the callers' guarantees.
bool CSpan::SplitPersonPOS(CDictionary &unlistDict)
{//Split the word with POS 21 and 22
    int i=m_nCurLength-1,j;
    unsigned int nLenWord,nLenPart;
    char sFirstPart[50],sLastPart[50];
    int nFirstPOS,nLastPOS;
    for(;i>0;i--)
    {
        if(m_nBestTag[i]==21||m_nBestTag[i]==22)
        {//Find the POS which need to split
            for(j=m_nCurLength-1;j>i;j--)
            {//Move the POS and words up by one slot to make room
                strcpy(m_sWords[j+1],m_sWords[j]);
                m_nBestTag[j+1]=m_nBestTag[j];
                m_nWordPosition[j+1]=m_nWordPosition[j];
            }
            m_nCurLength+=1;//The length increment
            //Generate new segment words and POS
            if(m_nBestTag[i]==21)
            {//Combination by Previous and first component
                nLenWord=strlen(m_sWords[i]);
                // Last part: the trailing 4 bytes if the dictionary knows
                // them, otherwise the trailing 2 bytes.
                if(nLenWord>4)//Get first component
                {
                    strcpy(sLastPart,m_sWords[i]+nLenWord-4);
                    if(!unlistDict.IsExist(sLastPart,-1))
                        strcpy(sLastPart,m_sWords[i]+nLenWord-2);
                }
                else
                {
                    strcpy(sLastPart,m_sWords[i]+nLenWord-2);
                }
                nLenPart=strlen(sLastPart);
                if(nLenPart<nLenWord)
                {//Get first part
                    strncpy(sFirstPart,m_sWords[i],nLenWord-nLenPart);
                    sFirstPart[nLenWord-nLenPart]=0;
                }
                else
                {// The last part would cover the whole word: cut 2 bytes from the end instead
                    strncpy(sFirstPart,m_sWords[i],nLenWord-2);
                    sFirstPart[nLenWord-2]=0;
                    strncpy(sLastPart,m_sWords[i]+nLenWord-2,2);
                    sLastPart[2]=0;
                }
                nFirstPOS=11;
                nLastPOS=1;
            }
            else
            {//Combination by Next word and last component
                nLenWord=strlen(m_sWords[i]);
                // First part: the leading 4 bytes if the dictionary knows
                // them, otherwise the leading 2 bytes.
                if(nLenWord>4)//Get last component
                {
                    strncpy(sFirstPart,m_sWords[i],4);
                    sFirstPart[4]=0;
                    if(!unlistDict.IsExist(sFirstPart,-1))
                        sFirstPart[2]=0;
                }
                else
                {
                    strncpy(sFirstPart,m_sWords[i],2);
                    sFirstPart[2]=0;
                }
                nLenPart=strlen(sFirstPart);
                if(nLenPart<nLenWord)
                {//Get last part
                    strncpy(sLastPart,m_sWords[i]+nLenPart,nLenWord-nLenPart);
                    sLastPart[nLenWord-nLenPart]=0;
                }
                else
                {// The first part would cover the whole word: cut after 2 bytes instead
                    strncpy(sFirstPart,m_sWords[i],2);
                    sFirstPart[2]=0;
                    strncpy(sLastPart,m_sWords[i]+2,nLenWord-2);
                    sLastPart[nLenWord-2]=0;
                }
                // Choose the tag of the first part from the dictionary and the
                // neighbouring words.
                // NOTE(review): m_sWords[i-2] is read with i possibly equal to
                // 1, i.e. index -1 -- confirm this cannot happen.
                if(unlistDict.IsExist(sFirstPart,1)&&m_nBestTag[i-1]==5)
                    //first part is a known surname and the previous tag is 5
                    nFirstPOS=1;
                else if(unlistDict.IsExist(m_sWords[i-1],1)&&!unlistDict.IsExist(m_sWords[i-2],1))
                    nFirstPOS=4;
                else
                    nFirstPOS=3;
                nLastPOS=12;
            }
            // Store the two parts in slots i and i+1.
            strcpy(m_sWords[i],sFirstPart);
            m_nBestTag[i]=nFirstPOS;
            strcpy(m_sWords[i+1],sLastPart);
            m_nBestTag[i+1]=nLastPOS;
            m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(sFirstPart);
        }
    }
    return true;
}
// Fixture constructor: writes two word/translation pairs into the stream `ss`,
// one item per line (no trailing newline after the last one), and loads the
// dictionary from that stream.
Dictionary_()
{
    ss << "cat" << endl;
    ss << "кошка" << endl;
    ss << "dog" << endl;
    ss << "собака";

    dictionary.LoadDictionary(ss);
}
// Spells out Num via Translate(Num, 8) and appends the comment text stored
// under the key "Ru comment about hex system".
// NOTE(review): the power passed is 8 while the comment key mentions "hex" --
// confirm the key is the intended one.
string TranslateIn8Power(int Num)
{
    string text = Translate(Num, 8);
    text += Comments.GetTranslate("Ru comment about hex system");
    return text;
}
// Forwards to dictionary.BeginTransaction().
void BeginDictionaryTransaction() { dictionary.BeginTransaction(); }
void CViewport::LoadDictionary(void) { CSV::CSVDocument doc; CSV::CSVDocument::row_index_type row_count; //Parse from the document try { row_count = doc.load_file("captionmod/dictionary.csv"); } catch(std::exception &err) { Sys_ErrorEx("%s\n%s", "LoadDictionary: ", err.what()); } if(row_count < 2) return; IScheme *ischeme = scheme()->GetIScheme(GetScheme()); if(!ischeme) return; Color defaultColor = ischeme->GetColor("BaseText", Color(255, 255, 255, 200)); //Initialize the dictionary hashtable m_StringsHashTable.SetSize(2048); for (int i = 0; i < m_StringsHashTable.Count(); i++) m_StringsHashTable[i].next = NULL; EmptyDictionaryHash(); int nRowCount = row_count; //parse the dictionary line by line... for (int i = 1;i < nRowCount; ++i) { CSV::CSVDocument::row_type row = doc.get_row(i); if(row.size() < 1) continue; const char *title = row[0].c_str(); if(!title || !title[0]) continue; CDictionary *Dict = new CDictionary; Dict->Load(row, defaultColor, ischeme); m_Dictionary.AddToTail(Dict); AddDictionaryHash(Dict, Dict->m_szTitle); } //Link the dictionaries for(int i = 0; i < m_Dictionary.Count(); ++i) { CDictionary *Dict = m_Dictionary[i]; if(Dict->m_szNext[0]) { Dict->m_pNext = FindDictionary(Dict->m_szNext); } } }
void HandleCreate(const CDictionary& dict) { super::HandleCreate(dict); CRGBA rgbaTint(255,255,255,255); dict.GetRGBA("tint", &rgbaTint); for ( uint32 iButtonImage = 0 ; iButtonImage < 9 ; ++iButtonImage ) { m_apImages[iButtonImage]->SetShader(GetTheme()->GetButtonImage(iButtonImage)); m_apImages[iButtonImage]->SetTint(rgbaTint); } if ( dict.Exists("font") ) { m_pText->SetFont(dict.GetString("font")); } else { m_pText->SetFont(GetTheme()->GetButtonTextFont()); } if ( dict.Exists("text") ) { m_pText->SetText(dict.GetString("text").c_str()); } if ( dict.Exists("text_shadow") ) { bool bShadow = dict.GetBool("text_shadow"); int nStyle = m_pText->GetStyle(); m_pText->SetStyle(IText::Style(bShadow ? (nStyle | IText::eStyleShadowed) : (nStyle & ~IText::eStyleShadowed))); } if ( dict.Exists("text_tint") ) { m_pText->SetTint(dict.GetRGBA("text_tint")); } if ( dict.Exists("onpress") ) { m_sOnPress = dict.GetString("onpress"); } if ( dict.Exists("onunpress") ) { m_sOnUnpress = dict.GetString("onunpress"); } }
// Calls ClearAll() on `dictionary` and then on `markov`.
void BootstrapDB() { dictionary.ClearAll(); markov.ClearAll(); }
// Finds transliterated names in the current span from the best-tag sequence.
// A run is triggered by a tag of 1/11/21 or 2/12/22 and extended over the
// tags that may follow it; accepted runs are appended to m_nUnknownWords /
// m_dWordsPossibility (m_nUnknownIndex is advanced).
//
// dictCore:  the core dictionary (used to reject single known words).
// transDict: dictionary passed to ComputePossibility for the run.
// Returns: always true.
bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
{
    char sPOS[MAX_WORDS_PER_SENTENCE]="Z";// not used below
    int nStart=1,nEnd=1,i=1;
    while(m_nBestTag[i]>-1)
    {
        if(m_nBestTag[i]==1||m_nBestTag[i]==11||m_nBestTag[i]==21)//1,11,21 Trigger the recognition
        {
            // Run shape: start tag T repeated, then T+1, then T+2, then 30.
            nStart=i;
            nEnd=nStart+1;
            while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
                nEnd++;
            while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
                nEnd++;
            while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
                nEnd++;
            while(m_nBestTag[nEnd]==30)//tag 30
                nEnd++;
        }
        else if(m_nBestTag[i]==2||m_nBestTag[i]==12||m_nBestTag[i]==22)//2,12,22 Trigger the recognition
        {
            // Run shape: start tag T repeated, then T+1, then 30.
            nStart=i;
            nEnd=nStart+1;
            while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
                nEnd++;
            while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//3,13,23
                nEnd++;
            while(m_nBestTag[nEnd]==30)//tag 30
                nEnd++;
        }
        // Accept the run [nStart, nEnd) when its first word is not all digits and
        //  - it spans more than 2 words, or
        //  - it spans exactly 2 words and either the last tag is not 30 or the
        //    first word is longer than 2 bytes, or
        //  - it is a single word longer than 2 bytes that the core dictionary
        //    does not contain.
        if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
        {
            m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
            m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
            // NOTE(review): the length passed is nEnd-nStart+1, one more than
            // the number of words in [nStart, nEnd) -- confirm intended.
            m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
            nStart=nEnd;// marks the run as consumed (nEnd>nStart is now false)
        }

        // Continue after the examined run, or step one word if there was none.
        if(i<nEnd)
            i=nEnd;
        else
            i=i+1;
    }
    return true;
}
// Spells out Num via Translate(Num, 10) and appends the comment text stored
// under the key "Ru comment about dec system".
string TranslateIn10Power(int Num)
{
    string text = Translate(Num, 10);
    text += Comments.GetTranslate("Ru comment about dec system");
    return text;
}
// Generates the word net for a sentence, i.e. lists every possible word as an
// edge (start atom, end atom) of m_segGraph.
//   1. The sentence is cut into atoms (AtomSegment).
//   2. Every atom gets an initial edge (i, i+1).
//   3. For every start atom, successive atoms are concatenated and each
//      concatenation found in the core dictionary is added as an edge (i, j).
//
// sSentence:     the sentence to process.
// dictCore:      the core dictionary.
// bOriginalFreq: true  -> edges store raw frequencies together with the word text;
//                false -> edges store a cost derived from log(frequency).
// Returns: always true.
bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq)
{
//Generate the word net from the sentence, that is, list all the possible words
    unsigned int i=0,j,nLen=strlen(sSentence);
    char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
    int nWordIndex=0,nHandleTemp,k,nPOS;
    int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
    double dValue=0;
    m_nAtomCount=0;
    m_segGraph.SetEmpty();//Set segmentation graph empty
    AtomSegment(sSentence);
    //Atomic Segmentation

    for(i=0;i<m_nAtomCount;i++)//Init the cost array
    {
        if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char
        {
            if(!bOriginalFreq)//Not original frequency
                m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum value
            else
                m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with frequency 0
        }
        else//Other atom
        {
            strcpy(sWord,m_sAtom[i]);//init the word
            dValue=MAX_FREQUENCE;
            // Numbers and letter strings are replaced by placeholder words
            // and given a negative POS code.
            switch(m_nAtomPOS[i])
            {
            case CT_INDEX:
            case CT_NUM:
                nPOS=-27904;//-('m'*256)
                strcpy(sWord,"未##数");
                dValue=0;
                break;
            case CT_DELIMITER:
                nPOS=30464;//'w'*256;
                break;
            case CT_LETTER:
                nPOS=-'n'*256-'x';//
                dValue=0;
                strcpy(sWord,"未##串");
                break;
            case CT_SINGLE://e.g. 12021-2129-3121
                // Made only of digits, '+' and '-': treat as a number,
                // otherwise as a letter string.
                if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
                {
                    nPOS=-27904;//-('m'*256)
                    strcpy(sWord,"未##数");
                }
                else
                {
                    nPOS=-'n'*256-'x';//
                    strcpy(sWord,"未##串");
                }
                dValue=0;
                break;
            default:
                nPOS=m_nAtomPOS[i];//'?'*256;
                break;
            }
            if(!bOriginalFreq)//Not original frequency
                m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with minimum
            else
                m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with dValue
        }
    }
    i=0;
    while(i<m_nAtomCount)//All the words
    {
        strcpy(sWord,m_sAtom[i]);//Get the current atom
        j=i+1;
        // NOTE(review): m_sAtom[i+1] is read even when i is the last atom --
        // confirm the array has a valid entry there.
        if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split these two atoms
            j+=1;
        while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
        {//The j bound controls the end of the string
         //Look the word up in the dictionary
            if(strcmp(sWordMatch,sWord)==0)//found the current word
            {
                nTotalFreq=0;
                dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
                for(k=0;k<nMatchCount;k++)//Add the frequency
                {
                    nTotalFreq+=nMatchFreq[k];
                }
                //Adding a rule to exclude some words from being formed:
                //a 4-byte word starting with the year/month character, preceded
                //by a number atom, and ending with one of the listed characters.
                if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
                {//e.g. "within 1 year", "end of 1999"
                    if(CC_Find("末内中底前间初",sWord+2))
                        break;
                }
                if(nMatchCount==1)//The possible word has only one POS, store it
                {
                    if(!bOriginalFreq)//Not original frequency
                        m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]);
                    else
                        m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
                }
                else
                {//Several POS: store POS 0
                    if(!bOriginalFreq)//Not original frequency
                        m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0);
                    else
                        m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
                }
            }
            // Extend the candidate with the next atom.
            // NOTE(review): strcat is unchecked against WORD_MAXLENGTH.
            strcat(sWord,m_sAtom[j++]);
        }
        i+=1;//Start from i++;
    }
    return true;
}
// Forwards to dictionary.EndTransaction().
void EndDictionaryTransaction() { dictionary.EndTransaction(); }