bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict) { int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType,nFreqGiven; char sTemp[3]; if(nLen<3||nLen>8)//Not a traditional Chinese person name return false; while(i<nLen)//No Including non-CHinese char { nCharType=charType((unsigned char*)sPersonName+i); if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER) return false; i+=2; } sSurname2[0]=0;//init strncpy(sSurname,sPersonName,nSurNameLen); sSurname[nSurNameLen]=0; if(!personDict.IsExist(sSurname,1)) { nSurNameLen=2; sSurname[nSurNameLen]=0; if(!personDict.IsExist(sSurname,1)) { nSurNameLen=0; sSurname[nSurNameLen]=0; } } strcpy(sGivenName,sPersonName+nSurNameLen); if(nLen>6) { strncpy(sTemp,sPersonName+nSurNameLen,2); sTemp[2]=0;//Get the second possible surname if(personDict.IsExist(sTemp,1)) {//Hongkong women's name: Surname+surname+given name strcpy(sSurname2,sTemp); strcpy(sGivenName,sPersonName+nSurNameLen+2); } } nFreq=personDict.GetFrequency(sSurname,1); strncpy(sTemp,sGivenName,2); sTemp[2]=0; nFreqGiven=personDict.GetFrequency(sTemp,2); if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)>=3&&nFreq<personDict.GetFrequency("张",1)/40&&nFreqGiven<personDict.GetFrequency("华",2)/20)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2))) return false; if(nLen==4&&m_uPerson.IsGivenName(sPersonName)) {//Single Surname+given name return false; } return true; }
//POS tagging with Hidden Markov Model bool CSpan::POSTagging(PWORD_RESULT pWordItems,CDictionary &dictCore,CDictionary &dictUnknown) { //pWordItems: Items; nItemCount: the count of items;core dictionary and unknown recognition dictionary int i=0,j,nStartPos; Reset(false); while(i>-1&&pWordItems[i].sWord[0]!=0) { nStartPos=i;//Start Position i=GetFrom(pWordItems,nStartPos,dictCore,dictUnknown); GetBestPOS(); switch(m_tagType) { case TT_NORMAL://normal POS tagging j=1; while(m_nBestTag[j]!=-1&&j<m_nCurLength) {//Store the best POS tagging pWordItems[j+nStartPos-1].nHandle=m_nBestTag[j]; //Let 。be 0 if(pWordItems[j+nStartPos-1].dValue>0&&dictCore.IsExist(pWordItems[j+nStartPos-1].sWord,-1))//Exist and update its frequncy as a POS value pWordItems[j+nStartPos-1].dValue=LOG_MAX_FRQUENCE-log((double)dictCore.GetFrequency(pWordItems[j+nStartPos-1].sWord,m_nBestTag[j])+1); j+=1; } break; case TT_PERSON://Person recognition /*clock_t lStart,lEnd; lStart=clock(); */ SplitPersonPOS(dictUnknown); //lEnd=clock(); //printf("SplitPersonPOS=%f\n",(double)(lEnd-lStart)*1000/CLOCKS_PER_SEC); //Spit Persons POS //lStart=clock(); PersonRecognize(dictUnknown); //lEnd=clock(); //printf("PersonRecognize=%f\n",(double)(lEnd-lStart)/CLOCKS_PER_SEC); //Person Recognition with the person recognition dictionary break; case TT_PLACE://Place name recognition PlaceRecognize(dictCore,dictUnknown); break; case TT_TRANS://Transliteration TransRecognize(dictCore,dictUnknown); break; default: break; } Reset(); } return true; }
// Split every word whose best tag is 21 or 22 into two entries.
//
//   unlistDict : dictionary used to choose the split point and the tag of
//                the first part
//
// The span arrays (m_sWords, m_nBestTag, m_nWordPosition) are edited in place:
// the entries after the split word are shifted up by one, m_nCurLength grows
// by one, and the word is replaced by its two parts.
//   tag 21 -> first part tagged 11, last part tagged 1
//   tag 22 -> first part tagged 1, 4 or 3, last part tagged 12
// Always returns true.
//
// NOTE(review): assumes a word tagged 21/22 is at least 2 bytes long and that
// the arrays have room for one more entry per split - confirm with callers.
bool CSpan::SplitPersonPOS(CDictionary &unlistDict)
{
    int i = m_nCurLength - 1, j;
    unsigned int nLenWord, nLenPart;
    char sFirstPart[50], sLastPart[50];
    int nFirstPOS, nLastPOS;

    // Walk backwards so that entries shifted by a split are not revisited.
    for (; i > 0; i--)
    {
        if (m_nBestTag[i] == 21 || m_nBestTag[i] == 22)
        {
            // Open a free slot at i+1 by moving the following entries up.
            for (j = m_nCurLength - 1; j > i; j--)
            {
                strcpy(m_sWords[j + 1], m_sWords[j]);
                m_nBestTag[j + 1] = m_nBestTag[j];
                m_nWordPosition[j + 1] = m_nWordPosition[j];
            }
            m_nCurLength += 1;

            if (m_nBestTag[i] == 21)
            {
                // Tag 21: the tail of the word is split off.
                nLenWord = strlen(m_sWords[i]);
                if (nLenWord > 4)
                {
                    // Prefer a 4-byte tail if the dictionary knows it,
                    // otherwise fall back to a 2-byte tail.
                    strcpy(sLastPart, m_sWords[i] + nLenWord - 4);
                    if (!unlistDict.IsExist(sLastPart, -1))
                        strcpy(sLastPart, m_sWords[i] + nLenWord - 2);
                }
                else
                {
                    strcpy(sLastPart, m_sWords[i] + nLenWord - 2);
                }

                nLenPart = strlen(sLastPart);
                if (nLenPart < nLenWord)
                {
                    // Everything before the tail is the first part.
                    strncpy(sFirstPart, m_sWords[i], nLenWord - nLenPart);
                    sFirstPart[nLenWord - nLenPart] = 0;
                }
                else
                {
                    // The tail covers the whole word: force a 2-byte tail.
                    strncpy(sFirstPart, m_sWords[i], nLenWord - 2);
                    sFirstPart[nLenWord - 2] = 0;
                    strncpy(sLastPart, m_sWords[i] + nLenWord - 2, 2);
                    sLastPart[2] = 0;
                }
                nFirstPOS = 11;
                nLastPOS = 1;
            }
            else
            {
                // Tag 22: the head of the word is split off.
                nLenWord = strlen(m_sWords[i]);
                if (nLenWord > 4)
                {
                    // Prefer a 4-byte head if the dictionary knows it,
                    // otherwise truncate it to 2 bytes.
                    strncpy(sFirstPart, m_sWords[i], 4);
                    sFirstPart[4] = 0;
                    if (!unlistDict.IsExist(sFirstPart, -1))
                        sFirstPart[2] = 0;
                }
                else
                {
                    strncpy(sFirstPart, m_sWords[i], 2);
                    sFirstPart[2] = 0;
                }

                nLenPart = strlen(sFirstPart);
                if (nLenPart < nLenWord)
                {
                    // Everything after the head is the last part.
                    strncpy(sLastPart, m_sWords[i] + nLenPart, nLenWord - nLenPart);
                    sLastPart[nLenWord - nLenPart] = 0;
                }
                else
                {
                    // The head covers the whole word: force a 2-byte head.
                    strncpy(sFirstPart, m_sWords[i], 2);
                    sFirstPart[2] = 0;
                    strncpy(sLastPart, m_sWords[i] + 2, nLenWord - 2);
                    sLastPart[nLenWord - 2] = 0;
                }

                // Choose the tag of the head from its left context.
                if (unlistDict.IsExist(sFirstPart, 1) && m_nBestTag[i - 1] == 5)
                {
                    // e.g. "Xiao Chen said:" - head is known under handle 1
                    // and the previous word carries tag 5.
                    nFirstPOS = 1;
                }
                else if (unlistDict.IsExist(m_sWords[i - 1], 1)
                         && (i < 2 || !unlistDict.IsExist(m_sWords[i - 2], 1)))
                {
                    // Previous word is known under handle 1 but the one before
                    // it is not. When i == 1 there is no word at i-2; treat it
                    // as "not known" instead of reading m_sWords[-1].
                    nFirstPOS = 4;
                }
                else
                {
                    nFirstPOS = 3;
                }
                nLastPOS = 12;
            }

            // Store the two parts in slots i and i+1.
            strcpy(m_sWords[i], sFirstPart);
            m_nBestTag[i] = nFirstPOS;
            strcpy(m_sWords[i + 1], sLastPart);
            m_nBestTag[i + 1] = nLastPOS;
            m_nWordPosition[i + 1] = m_nWordPosition[i] + strlen(sFirstPart);
        }
    }
    return true;
}
// Scan the best-tag sequence for runs that look like a transliterated name
// and record each accepted run as an unknown word.
//
//   dictCore  : core dictionary, used to reject single words it already knows
//   transDict : dictionary passed to ComputePossibility() for scoring
//
// For every accepted run [nStart, nEnd) the byte positions of its start and
// end are appended to m_nUnknownWords and its score to m_dWordsPossibility
// (m_nUnknownIndex is advanced). Always returns true.
//
// NOTE(review): m_nUnknownIndex is not range-checked here - confirm the
// capacity of m_nUnknownWords / m_dWordsPossibility with the class definition.
bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
{
    int nStart = 1, nEnd = 1, i = 1;

    while (m_nBestTag[i] > -1)
    {
        if (m_nBestTag[i] == 1 || m_nBestTag[i] == 11 || m_nBestTag[i] == 21)
        {
            // Run opened by tag T in {1,11,21}: T* (T+1)* (T+2)* 30*
            nStart = i;
            nEnd = nStart + 1;
            while (m_nBestTag[nEnd] == m_nBestTag[nStart])
                nEnd++;
            while (m_nBestTag[nEnd] == m_nBestTag[nStart] + 1)
                nEnd++;
            while (m_nBestTag[nEnd] == m_nBestTag[nStart] + 2)
                nEnd++;
            while (m_nBestTag[nEnd] == 30)
                nEnd++;
        }
        else if (m_nBestTag[i] == 2 || m_nBestTag[i] == 12 || m_nBestTag[i] == 22)
        {
            // Run opened by tag T in {2,12,22}: T* (T+1)* 30*
            nStart = i;
            nEnd = nStart + 1;
            while (m_nBestTag[nEnd] == m_nBestTag[nStart])
                nEnd++;
            while (m_nBestTag[nEnd] == m_nBestTag[nStart] + 1)
                nEnd++;
            while (m_nBestTag[nEnd] == 30)
                nEnd++;
        }

        // Accept a non-empty, non-numeric run when it is
        //  - three or more words long, or
        //  - two words long and either not ending in tag 30 or starting with
        //    a word longer than 2 bytes, or
        //  - a single word longer than 2 bytes that the core dictionary
        //    does not contain.
        if (nEnd > nStart
            && !IsAllNum((unsigned char *)m_sWords[nStart])
            && (nEnd > nStart + 2
                || (nEnd == nStart + 2
                    && (m_nBestTag[nEnd - 1] != 30 || strlen(m_sWords[nStart]) > 2))
                || (nEnd == nStart + 1
                    && strlen(m_sWords[nStart]) > 2
                    && !dictCore.IsExist(m_sWords[nStart], -1))))
        {
            m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[nStart];
            m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[nEnd];
            m_dWordsPossibility[m_nUnknownIndex++] = ComputePossibility(nStart, nEnd - nStart + 1, transDict);
            nStart = nEnd; // mark the run as consumed
        }

        // Continue after the run, or simply step to the next word.
        if (i < nEnd)
            i = nEnd;
        else
            i = i + 1;
    }
    return true;
}
int CSpan::GetFrom(PWORD_RESULT pWordItems,int nIndex,CDictionary &dictCore, CDictionary &dictUnknown) { int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD]; int nFreq=0,j,nRetPos=0,nWordsIndex=0; bool bSplit=false;//Need to split in Transliteration recognition int i=1; nWordsIndex=i+nIndex-1; for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++) { if(m_tagType==TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44)) { strcpy(m_sWords[i],pWordItems[nWordsIndex].sWord);//store current word m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]); } else { if(!bSplit) { strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord,2);//store current word m_sWords[i][2]=0; bSplit=true; } else { unsigned int nLen=strlen(pWordItems[nWordsIndex].sWord+2); strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);//store current word m_sWords[i][nLen]=0; bSplit=false; } m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]); } //Record the position of current word m_nStartPos=m_nWordPosition[i+1]; //Move the Start POS to the ending if(m_tagType!=TT_NORMAL) { //Get the POSs from the unknown recognition dictionary dictUnknown.GetHandle(m_sWords[i],&nCount,aPOS,aFreq); for(j=0;j<nCount;j++) {//Get the POS set of sCurWord in the unknown dictionary m_nTags[i][j]=aPOS[j]; m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(m_context.GetFrequency(0,aPOS[j])+1)); } //Get the POS set of sCurWord in the core dictionary //We ignore the POS in the core dictionary and recognize them as other (0). 
//We add their frequency to get the possibility as POS 0 dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq); nFreq=0; for(int k=0;k<nCount;k++) { nFreq+=aFreq[k]; } if(nCount>0) { m_nTags[i][j]=0; //m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1); m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(m_context.GetFrequency(0,0)+1)); j++; } } else//For normal POS tagging { j=0; //Get the POSs from the unknown recognition dictionary if(pWordItems[nWordsIndex].nHandle>0) {//The word has is only one POS value //We have record its POS and nFrequncy in the items. m_nTags[i][j]=pWordItems[nWordsIndex].nHandle; m_dFrequency[i][j]=pWordItems[nWordsIndex].dValue-LOG_MAX_FRQUENCE+log((double)(m_context.GetFrequency(0,m_nTags[i][j])+1)); if(m_dFrequency[i][j]<0)//Not permit the value less than 0 m_dFrequency[i][j]=0; j++; } else {//The word has multiple POSs, we should retrieve the information from Core Dictionary if(pWordItems[nWordsIndex].nHandle<0) {//The word has is only one POS value //We have record its POS and nFrequncy in the items. 
if(pWordItems[nWordsIndex].nHandle==-'t'*256-'t')//tt { char sWordOrg[100],sPostfix[10]; double dRatio=0.6925;//The ratio which transliteration as a person name PostfixSplit(pWordItems[nWordsIndex].sWord,sWordOrg,sPostfix); if(sPostfix[0]!=0) dRatio=0.01; m_nTags[i][j]='n'*256+'r'; m_dFrequency[i][j]=-log(dRatio)+pWordItems[nWordsIndex].dValue; //m_dFrequency[i][j]=log(dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE); //P(W|R)=P(WRT)/P(RT)=P(R)*P(W|T)/P(R|T) j++; m_nTags[i][j]='n'*256+'s'; m_dFrequency[i][j]=-log(1-dRatio)+pWordItems[nWordsIndex].dValue; //m_dFrequency[i][j]=log(1-dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE); j++; } else//Unknown words such as Chinese person name or place name { m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle; // m_dFrequency[i][j++]=(double)(1+pWordItems[nWordsIndex].nFrequency)/(double)(m_context.GetFrequency(0,aPOS[j])+1); m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue; } } dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq); for(;j<nCount;j++) {//Get the POS set of sCurWord in the unknown dictionary m_nTags[i][j]=aPOS[j]; m_dFrequency[i][j]=-log((double)1+aFreq[j])+log((double)m_context.GetFrequency(0,m_nTags[i][j])+1); } } } if(j==0) {//We donot know the POS, so we have to guess them according lexical knowledge GuessPOS(i,&j);//Guess the POS of current word } m_nTags[i][j]=-1;//Set the ending POS if(j==1)//No ambuguity {//No ambuguity, so we can break from the loop i++; m_sWords[i][0]=0; break; } if(!bSplit) {nWordsIndex++;} } if(pWordItems[nWordsIndex].sWord[0]==0) nRetPos=-1;//Reaching ending if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0 {//Set end for words like "张/华/平" if(m_tagType!=TT_NORMAL) m_nTags[i][0]=101; else m_nTags[i][0]=1; m_dFrequency[i][0]=0; m_sWords[i][0]=0;//Set virtual ending m_nTags[i++][1]=-1; } m_nCurLength=i;//The current word count if(nRetPos!=-1) return nWordsIndex+1;//Next start position 
return -1;//Reaching ending }