//Unknown word recognition //pWordSegResult:word Segmentation result;graphOptimum: The optimized segmentation graph //graphSeg: The original segmentation graph bool CUnknowWord::Recognition(PWORD_RESULT pWordSegResult, CDynamicArray &graphOptimum,CSegGraph &graphSeg,CDictionary &dictCore) { int nStartPos=0,j=0,nAtomStart,nAtomEnd,nPOSOriginal; ELEMENT_TYPE dValue; m_roleTag.POSTagging(pWordSegResult,dictCore,m_dict); //Tag the segmentation with unknown recognition roles according the core dictionary and unknown recognition dictionary for(int i=0;i<m_roleTag.m_nUnknownIndex;i++) { while((unsigned int)j<graphSeg.m_nAtomCount&&nStartPos<m_roleTag.m_nUnknownWords[i][0]) { nStartPos+=graphSeg.m_nAtomLength[j++]; } nAtomStart=j; while((unsigned int)j<graphSeg.m_nAtomCount&&nStartPos<m_roleTag.m_nUnknownWords[i][1]) { nStartPos+=graphSeg.m_nAtomLength[j++]; } nAtomEnd=j; if(nAtomStart<nAtomEnd) { graphOptimum.GetElement(nAtomStart,nAtomEnd,&dValue,&nPOSOriginal); if(dValue>m_roleTag.m_dWordsPossibility[i])//Set the element with less frequency graphOptimum.SetElement(nAtomStart,nAtomEnd,m_roleTag.m_dWordsPossibility[i],m_nPOS,m_sUnknownFlags); } } return true; }
//CDynamicArray &aWord: the words array //CDynamicArray &aWordBinaryNet:the net between words //double dSmoothingPara: the parameter of data smoothing //CDictionary &DictBinary: the binary dictionary //CDictionary &DictCore: the Core dictionary bool CSegment::BiGraphGenerate(CDynamicArray &aWord, CDynamicArray &aBinaryWordNet,double dSmoothingPara,CDictionary &DictBinary,CDictionary &DictCore) { PARRAY_CHAIN pTail,pCur,pNextWords;//Temp buffer unsigned int nWordIndex=0,nTwoWordsFreq=0,nCurWordIndex,nNextWordIndex; //nWordIndex: the index number of current word double dCurFreqency,dValue,dTemp; char sTwoWords[WORD_MAXLENGTH]; m_nWordCount=aWord.GetTail(&pTail);//Get tail element and return the words count if(m_npWordPosMapTable) {//free buffer delete [] m_npWordPosMapTable; m_npWordPosMapTable=0; } if(m_nWordCount>0)//Word count is greater than 0 { m_npWordPosMapTable=new int[m_nWordCount];//Record the position of possible words memset(m_npWordPosMapTable,0,m_nWordCount*sizeof(int)); } pCur=aWord.GetHead(); while(pCur!=NULL)//Set the position map of words { m_npWordPosMapTable[nWordIndex++]=pCur->row*MAX_SENTENCE_LEN+pCur->col; pCur=pCur->next; } pCur=aWord.GetHead(); while(pCur!=NULL)// { if(pCur->nPOS>=0)//It's not an unknown words dCurFreqency=pCur->value; else//Unknown words dCurFreqency=DictCore.GetFrequency(pCur->sWord,2); aWord.GetElement(pCur->col,-1,pCur,&pNextWords);//Get next words which begin with pCur->col while(pNextWords&&pNextWords->row==pCur->col)//Next words { //Current words frequency strcpy(sTwoWords,pCur->sWord); strcat(sTwoWords,WORD_SEGMENTER); strcat(sTwoWords,pNextWords->sWord); nTwoWordsFreq=DictBinary.GetFrequency(sTwoWords,3); //Two linked Words frequency dTemp=(double)1/MAX_FREQUENCE; //Smoothing dValue=-log(dSmoothingPara*(1+dCurFreqency)/(MAX_FREQUENCE+80000)+(1-dSmoothingPara)*((1-dTemp)*nTwoWordsFreq/(1+dCurFreqency)+dTemp)); //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1 if(pCur->nPOS<0)//Unknown words: P(Wi|Ci);while known words:1 dValue+=pCur->value; //Get the position index of current word in the position map table nCurWordIndex=BinarySearch(pCur->row*MAX_SENTENCE_LEN+pCur->col,m_npWordPosMapTable,m_nWordCount); nNextWordIndex=BinarySearch(pNextWords->row*MAX_SENTENCE_LEN+pNextWords->col,m_npWordPosMapTable,m_nWordCount); aBinaryWordNet.SetElement(nCurWordIndex,nNextWordIndex,dValue,pCur->nPOS); pNextWords=pNextWords->next;//Get next word } pCur=pCur->next; } return true; }