bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq)
{//Generate the word net from sSentence, i.e. list all the possible words
    unsigned int i=0,j,nLen=strlen(sSentence);
    char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
    int nWordIndex=0,nHandleTemp,k,nPOS;
    int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
    double dValue=0;
    m_nAtomCount=0;
    m_segGraph.SetEmpty();//Set the segmentation graph empty
    AtomSegment(sSentence);//Atomic segmentation
    for(i=0;i<m_nAtomCount;i++)//Init the cost array
    {
        if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese character
        {
            if(!bOriginalFreq)//Not the original frequency
                m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum cost
            else
                m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//keep the original frequency and store the atom itself
        }
        else//Other atom types
        {
            strcpy(sWord,m_sAtom[i]);//init the word
            dValue=MAX_FREQUENCE;
            switch(m_nAtomPOS[i])
            {
            case CT_INDEX:
            case CT_NUM:
                nPOS=-27904;//'m'*256
                strcpy(sWord,"未##数");
                dValue=0;
                break;
            case CT_DELIMITER:
                nPOS=30464;//'w'*256
                break;
            case CT_LETTER:
                nPOS=-'n'*256-'x';
                dValue=0;
                strcpy(sWord,"未##串");
                break;
            case CT_SINGLE://12021-2129-3121
                if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
                {
                    nPOS=-27904;//'m'*256
                    strcpy(sWord,"未##数");
                }
                else
                {
                    nPOS=-'n'*256-'x';
                    strcpy(sWord,"未##串");
                }
                dValue=0;
                break;
            default:
                nPOS=m_nAtomPOS[i];
                break;
            }
            if(!bOriginalFreq)//Not the original frequency
                m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with the minimum cost
            else
                m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//store the original value and the recognized word
        }
    }
    i=0;
    while(i<m_nAtomCount)//Enumerate all possible words
    {
        strcpy(sWord,m_sAtom[i]);//Get the current atom
        j=i+1;
        if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份
            j+=1;
        while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
        {//The condition j<=m_nAtomCount controls the end of the string
            //Look the word up in the core dictionary
            if(strcmp(sWordMatch,sWord)==0)//The current word is in the dictionary
            {
                nTotalFreq=0;
                dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
                for(k=0;k<nMatchCount;k++)//Add up the frequencies
                {
                    nTotalFreq+=nMatchFreq[k];
                }
                //A rule to exclude certain words from being formed
                if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
                {//e.g. 1年内, 1999年末
                    if(CC_Find("末内中底前间初",sWord+2))
                        break;
                }
                if(nMatchCount==1)//The possible word has only one POS, store it
                {
                    if(!bOriginalFreq)//Not the original frequency
                        m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]);
                    else
                        m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
                }
                else//The possible word has multiple POSs
                {
                    if(!bOriginalFreq)//Not the original frequency
                        m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0);
                    else
                        m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
                }
            }
            strcat(sWord,m_sAtom[j++]);
        }
        i+=1;//Move to the next atom
    }
    return true;
}
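//Illustrative sketch (not part of the original source): for a dictionary word with
//total frequency nTotalFreq, GenerateWordNet above weights the corresponding edge
//with a smoothed negative log frequency, -log(nTotalFreq+1)+log(MAX_FREQUENCE),
//so more frequent words get cheaper edges in the segmentation graph. The helper
//below only restates that formula in isolation; its name is hypothetical and it
//assumes MAX_FREQUENCE is the frequency ceiling defined elsewhere in this project.
static double IllustrateSegEdgeCost(int nTotalFreq)
{
    return -log((double)(nTotalFreq+1))+log((double)MAX_FREQUENCE);
}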
int CSpan::GetFrom(PWORD_RESULT pWordItems,int nIndex,CDictionary &dictCore,CDictionary &dictUnknown)
{
    int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
    int nFreq=0,j,nRetPos=0,nWordsIndex=0;
    bool bSplit=false;//Whether the word needs to be split during transliteration recognition
    int i=1;
    nWordsIndex=i+nIndex-1;
    for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
    {
        if(m_tagType==TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
        {
            strcpy(m_sWords[i],pWordItems[nWordsIndex].sWord);//store the current word
            m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
        }
        else
        {
            if(!bSplit)
            {
                strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord,2);//store the first part of the current word
                m_sWords[i][2]=0;
                bSplit=true;
            }
            else
            {
                unsigned int nLen=strlen(pWordItems[nWordsIndex].sWord+2);
                strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);//store the remainder of the current word
                m_sWords[i][nLen]=0;
                bSplit=false;
            }
            m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
        }
        //Record the position of the current word: move the start position to its ending
        m_nStartPos=m_nWordPosition[i+1];
        if(m_tagType!=TT_NORMAL)
        {
            //Get the POSs from the unknown-word recognition dictionary
            dictUnknown.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
            for(j=0;j<nCount;j++)
            {//The POS set of the current word in the unknown-word dictionary
                m_nTags[i][j]=aPOS[j];
                m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(m_context.GetFrequency(0,aPOS[j])+1));
            }
            //Get the POS set of the current word in the core dictionary.
            //We ignore the POSs in the core dictionary, treat them all as "other" (0)
            //and add up their frequencies to estimate the probability of POS 0.
            dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
            nFreq=0;
            for(int k=0;k<nCount;k++)
            {
                nFreq+=aFreq[k];
            }
            if(nCount>0)
            {
                m_nTags[i][j]=0;
                //m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
                m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(m_context.GetFrequency(0,0)+1));
                j++;
            }
        }
        else//Normal POS tagging
        {
            j=0;
            if(pWordItems[nWordsIndex].nHandle>0)
            {//The word has only one POS value;
             //its POS and frequency were already recorded in the word item
                m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
                m_dFrequency[i][j]=pWordItems[nWordsIndex].dValue-LOG_MAX_FRQUENCE+log((double)(m_context.GetFrequency(0,m_nTags[i][j])+1));
                if(m_dFrequency[i][j]<0)//Do not permit values less than 0
                    m_dFrequency[i][j]=0;
                j++;
            }
            else
            {//nHandle<=0: an unknown-word candidate, or a word with several POSs in the core dictionary
                if(pWordItems[nWordsIndex].nHandle<0)
                {//Unknown-word candidate: its POS and frequency were recorded in the word item
                    if(pWordItems[nWordsIndex].nHandle==-'t'*256-'t')//tt: transliteration
                    {
                        char sWordOrg[100],sPostfix[10];
                        double dRatio=0.6925;//The ratio of transliterations that are person names
                        PostfixSplit(pWordItems[nWordsIndex].sWord,sWordOrg,sPostfix);
                        if(sPostfix[0]!=0)
                            dRatio=0.01;
                        m_nTags[i][j]='n'*256+'r';
                        m_dFrequency[i][j]=-log(dRatio)+pWordItems[nWordsIndex].dValue;
                        //m_dFrequency[i][j]=log(dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
                        //P(W|R)=P(WRT)/P(RT)=P(R)*P(W|T)/P(R|T)
                        j++;
                        m_nTags[i][j]='n'*256+'s';
                        m_dFrequency[i][j]=-log(1-dRatio)+pWordItems[nWordsIndex].dValue;
                        //m_dFrequency[i][j]=log(1-dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
                        j++;
                    }
                    else//Other unknown words such as Chinese person names or place names
                    {
                        m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
                        //m_dFrequency[i][j++]=(double)(1+pWordItems[nWordsIndex].nFrequency)/(double)(m_context.GetFrequency(0,aPOS[j])+1);
                        m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
                    }
                }
                dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
                for(;j<nCount;j++)
                {//The POS set of the current word in the core dictionary
                    m_nTags[i][j]=aPOS[j];
                    m_dFrequency[i][j]=-log((double)1+aFreq[j])+log((double)m_context.GetFrequency(0,m_nTags[i][j])+1);
                }
            }
        }
        if(j==0)
        {//The POS is unknown, so guess it according to lexical knowledge
            GuessPOS(i,&j);//Guess the POS of the current word
        }
        m_nTags[i][j]=-1;//Set the ending POS
        if(j==1)//No ambiguity, so we can break out of the loop
        {
            i++;
            m_sWords[i][0]=0;
            break;
        }
        if(!bSplit)
        {
            nWordsIndex++;
        }
    }
    if(pWordItems[nWordsIndex].sWord[0]==0)
        nRetPos=-1;//Reached the end
    if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
    {//Set the end for words like "张/华/平"
        if(m_tagType!=TT_NORMAL)
            m_nTags[i][0]=101;
        else
            m_nTags[i][0]=1;
        m_dFrequency[i][0]=0;
        m_sWords[i][0]=0;//Set the virtual ending
        m_nTags[i++][1]=-1;
    }
    m_nCurLength=i;//The current word count
    if(nRetPos!=-1)
        return nWordsIndex+1;//Next start position
    return -1;//Reached the end
}
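//Illustrative sketch (not part of the original source): the tag costs that GetFrom
//fills into m_dFrequency approximate -log P(word|tag) with add-one smoothing, i.e.
//-log(freq(word,tag)+1)+log(freq(tag)+1), where the tag frequency comes from the
//context model via m_context.GetFrequency(0,nPOS). The helper below restates the
//formula with hypothetical parameter names; it is not part of CSpan.
static double IllustrateTagCost(int nWordTagFreq,int nTagFreq)
{
    return -log((double)(nWordTagFreq+1))+log((double)(nTagFreq+1));
}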