//CDynamicArray &aWord: the words array //CDynamicArray &aWordBinaryNet:the net between words //double dSmoothingPara: the parameter of data smoothing //CDictionary &DictBinary: the binary dictionary //CDictionary &DictCore: the Core dictionary bool CSegment::BiGraphGenerate(CDynamicArray &aWord, CDynamicArray &aBinaryWordNet,double dSmoothingPara,CDictionary &DictBinary,CDictionary &DictCore) { PARRAY_CHAIN pTail,pCur,pNextWords;//Temp buffer unsigned int nWordIndex=0,nTwoWordsFreq=0,nCurWordIndex,nNextWordIndex; //nWordIndex: the index number of current word double dCurFreqency,dValue,dTemp; char sTwoWords[WORD_MAXLENGTH]; m_nWordCount=aWord.GetTail(&pTail);//Get tail element and return the words count if(m_npWordPosMapTable) {//free buffer delete [] m_npWordPosMapTable; m_npWordPosMapTable=0; } if(m_nWordCount>0)//Word count is greater than 0 { m_npWordPosMapTable=new int[m_nWordCount];//Record the position of possible words memset(m_npWordPosMapTable,0,m_nWordCount*sizeof(int)); } pCur=aWord.GetHead(); while(pCur!=NULL)//Set the position map of words { m_npWordPosMapTable[nWordIndex++]=pCur->row*MAX_SENTENCE_LEN+pCur->col; pCur=pCur->next; } pCur=aWord.GetHead(); while(pCur!=NULL)// { if(pCur->nPOS>=0)//It's not an unknown words dCurFreqency=pCur->value; else//Unknown words dCurFreqency=DictCore.GetFrequency(pCur->sWord,2); aWord.GetElement(pCur->col,-1,pCur,&pNextWords);//Get next words which begin with pCur->col while(pNextWords&&pNextWords->row==pCur->col)//Next words { //Current words frequency strcpy(sTwoWords,pCur->sWord); strcat(sTwoWords,WORD_SEGMENTER); strcat(sTwoWords,pNextWords->sWord); nTwoWordsFreq=DictBinary.GetFrequency(sTwoWords,3); //Two linked Words frequency dTemp=(double)1/MAX_FREQUENCE; //Smoothing dValue=-log(dSmoothingPara*(1+dCurFreqency)/(MAX_FREQUENCE+80000)+(1-dSmoothingPara)*((1-dTemp)*nTwoWordsFreq/(1+dCurFreqency)+dTemp)); //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1 if(pCur->nPOS<0)//Unknown words: P(Wi|Ci);while known words:1 
dValue+=pCur->value; //Get the position index of current word in the position map table nCurWordIndex=BinarySearch(pCur->row*MAX_SENTENCE_LEN+pCur->col,m_npWordPosMapTable,m_nWordCount); nNextWordIndex=BinarySearch(pNextWords->row*MAX_SENTENCE_LEN+pNextWords->col,m_npWordPosMapTable,m_nWordCount); aBinaryWordNet.SetElement(nCurWordIndex,nNextWordIndex,dValue,pCur->nPOS); pNextWords=pNextWords->next;//Get next word } pCur=pCur->next; } return true; }
bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict) { int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType,nFreqGiven; char sTemp[3]; if(nLen<3||nLen>8)//Not a traditional Chinese person name return false; while(i<nLen)//No Including non-CHinese char { nCharType=charType((unsigned char*)sPersonName+i); if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER) return false; i+=2; } sSurname2[0]=0;//init strncpy(sSurname,sPersonName,nSurNameLen); sSurname[nSurNameLen]=0; if(!personDict.IsExist(sSurname,1)) { nSurNameLen=2; sSurname[nSurNameLen]=0; if(!personDict.IsExist(sSurname,1)) { nSurNameLen=0; sSurname[nSurNameLen]=0; } } strcpy(sGivenName,sPersonName+nSurNameLen); if(nLen>6) { strncpy(sTemp,sPersonName+nSurNameLen,2); sTemp[2]=0;//Get the second possible surname if(personDict.IsExist(sTemp,1)) {//Hongkong women's name: Surname+surname+given name strcpy(sSurname2,sTemp); strcpy(sGivenName,sPersonName+nSurNameLen+2); } } nFreq=personDict.GetFrequency(sSurname,1); strncpy(sTemp,sGivenName,2); sTemp[2]=0; nFreqGiven=personDict.GetFrequency(sTemp,2); if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)>=3&&nFreq<personDict.GetFrequency("张",1)/40&&nFreqGiven<personDict.GetFrequency("华",2)/20)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2))) return false; if(nLen==4&&m_uPerson.IsGivenName(sPersonName)) {//Single Surname+given name return false; } return true; }
//POS tagging with Hidden Markov Model bool CSpan::POSTagging(PWORD_RESULT pWordItems,CDictionary &dictCore,CDictionary &dictUnknown) { //pWordItems: Items; nItemCount: the count of items;core dictionary and unknown recognition dictionary int i=0,j,nStartPos; Reset(false); while(i>-1&&pWordItems[i].sWord[0]!=0) { nStartPos=i;//Start Position i=GetFrom(pWordItems,nStartPos,dictCore,dictUnknown); GetBestPOS(); switch(m_tagType) { case TT_NORMAL://normal POS tagging j=1; while(m_nBestTag[j]!=-1&&j<m_nCurLength) {//Store the best POS tagging pWordItems[j+nStartPos-1].nHandle=m_nBestTag[j]; //Let 。be 0 if(pWordItems[j+nStartPos-1].dValue>0&&dictCore.IsExist(pWordItems[j+nStartPos-1].sWord,-1))//Exist and update its frequncy as a POS value pWordItems[j+nStartPos-1].dValue=LOG_MAX_FRQUENCE-log((double)dictCore.GetFrequency(pWordItems[j+nStartPos-1].sWord,m_nBestTag[j])+1); j+=1; } break; case TT_PERSON://Person recognition /*clock_t lStart,lEnd; lStart=clock(); */ SplitPersonPOS(dictUnknown); //lEnd=clock(); //printf("SplitPersonPOS=%f\n",(double)(lEnd-lStart)*1000/CLOCKS_PER_SEC); //Spit Persons POS //lStart=clock(); PersonRecognize(dictUnknown); //lEnd=clock(); //printf("PersonRecognize=%f\n",(double)(lEnd-lStart)/CLOCKS_PER_SEC); //Person Recognition with the person recognition dictionary break; case TT_PLACE://Place name recognition PlaceRecognize(dictCore,dictUnknown); break; case TT_TRANS://Transliteration TransRecognize(dictCore,dictUnknown); break; default: break; } Reset(); } return true; }
// Scores the tagged words in [nStartPos, nStartPos+nLength).
//
// For each position the term
//     log(context frequency of the tag + 1) - log(frequency of the word with that tag + 1)
// is added to the result. The context-transition weighting that once
// accompanied this term is disabled and does not contribute.
//
// Parameters:
//   nStartPos - first index into m_sWords / m_nBestTag
//   nLength   - number of consecutive positions to score
//   dict      - dictionary supplying the word/tag frequencies
// Returns: the sum of the per-position terms (0 when nLength <= 0).
ELEMENT_TYPE CSpan::ComputePossibility(int nStartPos,int nLength,CDictionary &dict)
{
    ELEMENT_TYPE dTotal=0;
    const int nEnd=nStartPos+nLength;
    int nIdx=nStartPos;
    while(nIdx<nEnd)
    {
        //Frequency of the word carrying this POS
        const int nWordTagFreq=dict.GetFrequency(m_sWords[nIdx],m_nBestTag[nIdx]);
        //Possibility term for the POS at this position
        const ELEMENT_TYPE dTerm=log((double)(m_context.GetFrequency(0,m_nBestTag[nIdx])+1))-log((double)(nWordTagFreq+1));
        dTotal+=dTerm;
        ++nIdx;
    }
    return dTotal;
}
// Recognises person names in the current best tag sequence.
//
// The tags in m_nBestTag (from index 1 up to the first negative entry) are
// turned into a string of role letters ('A' + tag) and scanned left to right
// against a fixed set of role patterns. Each accepted match is recorded in
// m_nUnknownWords / m_dWordsPossibility and m_nUnknownIndex is advanced.
//
// Parameters:
//   personDict - person-name dictionary used for frequency lookups
// Returns: always true.
bool CSpan::PersonRecognize(CDictionary &personDict)
{
    char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];

    //The person recognition pattern set (role letters):
    //  BBCD: surname + surname + given-name char 1 + given-name char 2
    //  BBE:  surname + surname + single-char given name
    //  BBZ:  surname + surname + two-char given name that forms a word
    //  BCD:  surname + given-name char 1 + given-name char 2
    //  BE:   surname + single-char given name
    //  BEE:  surname + single-char name + single-char name (repeated character)
    //  BG:   surname + suffix
    //  BXD:  surname + (surname and first given-name char forming a word) + last given-name char
    //  BZ:   surname + two-char given name that forms a word
    //  CD:   given-name char 1 + given-name char 2
    //  EE:   single-char name + single-char name
    //  FB:   prefix + surname
    //  XD:   (surname and first given-name char forming a word) + last given-name char
    //  Y:    surname and single-char given name forming a word
    //The tables are constant, so they are built once instead of on every call.
    static const char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE",
                                       "BG",  "BXD","BZ", "CDCD","CD","EE",
                                       "FB",  "Y","XD",""};
    //Weight per pattern, parallel to sPatterns. Source figures:
    //  Given name: 486 -> 0.0160;  Surname+postfix: 484 -> 0.0160
    //  m_lPerson2Num: 6265 -> 0.2055;  m_lPerson3Num: 23184 -> 0.7614
    //  m_lPerson4Num: 32 -> 0.0011
    static const double dFactor[]={0.0011,0.0011,0.0011,0.0011,0.7614,0.0011,0.2055,
                                   0.0160,0.0011,0.0011,0,0.0160,0.0011,
                                   0.0160,0.0011,0.0011,0};
    //Length of each pattern; the trailing 0 terminates the table
    static const int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};

    //Convert the tag sequence to a role string. The index is bounded so the
    //terminator always fits inside sPOS even if no negative tag is found.
    int i;
    for(i=1;i<MAX_WORDS_PER_SENTENCE-1&&m_nBestTag[i]>-1;i++)
        sPOS[i]=m_nBestTag[i]+'A';
    sPOS[i]=0;

    int j=1,k,nPos;//Find the proper pattern from the first POS
    int nLittleFreqCount;//Counter for the person name roles with little frequency
    bool bMatched=false;
    while(j<i)
    {
        bMatched=false;
        for(k=0;!bMatched&&nPatternLen[k]>0;k++)
        {
            //The pattern must match at j, and neither neighbouring word may be "·"
            if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(m_sWords[j-1],"·")!=0&&strcmp(m_sWords[j+nPatternLen[k]],"·")!=0)
            {//Find the proper pattern k
                if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
                {//Rule 1 for exclusion: prefix + surname followed by a given-name
                 //or suffix role: the (prefix + surname) rule does not apply
                    continue;
                }
                //Two further exclusion rules existed here and are disabled:
                //  - BEE/EE required the two single-name words to be identical
                //  - a lone surname "B" required the next tag to be 12 (suffix)

                //Get the possible name
                sPersonName[0]=0;
                nLittleFreqCount=0;//Number of roles with little frequency
                for(nPos=j;nPos<j+nPatternLen[k];nPos++)
                {
                    if(m_nBestTag[nPos]<4&&personDict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos])<LITTLE_FREQUENCY)
                        nLittleFreqCount++;
                    //Bounded append: the original used strcat and could overrun
                    //sPersonName when the matched words were long
                    strncat(sPersonName,m_sWords[nPos],sizeof(sPersonName)-1-strlen(sPersonName));
                }

                //NOTE(review): the exclusion branches below advance j and then
                //"continue" to the NEXT pattern k at the new position, not
                //back to pattern 0. Kept exactly as in the original.
                if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
                {//Rule 2 for exclusion: every character is a foreign-name
                 //character and the first word is rare as a surname
                    j+=nPatternLen[k]-1;
                    continue;
                }
                if(strcmp(sPatterns[k],"CDCD")==0)
                {//CDCD is itself an exclusion pattern (never recorded as a name).
                 //Rule 3 for exclusion: the span is skipped only when it contains
                 //foreign-name characters; otherwise scanning continues at j
                    if(GetForeignCharCount(sPersonName)>0)
                        j+=nPatternLen[k]-1;
                    continue;
                }
                if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
                {//A bare given-name pair made only of foreign-name characters
                    j+=nPatternLen[k]-1;
                    continue;
                }
                if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
                    //All roles (or three of them) appear with low frequency: ignore
                    continue;

                //Record the recognised name: start/end word positions and score
                m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[j];
                m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[j+nPatternLen[k]];
                m_dWordsPossibility[m_nUnknownIndex]=log(dFactor[k])+ComputePossibility(j,nPatternLen[k],personDict);
                //Multiply by the pattern factor (added in log space)
                m_nUnknownIndex+=1;
                j+=nPatternLen[k];
                bMatched=true;
            }
        }
        if(!bMatched)//Not matched, add j by 1
            j+=1;
    }
    return true;
}