/********************************************************************* * * Func Name : IsForeign * * Description: Decide whether the word is not a Non-fereign word * * Parameters : sWord: the word * * Returns : the index value * Author : Kevin Zhang * History : * 1.create 2002-1-26 *********************************************************************/ bool IsForeign(char *sWord) { int nForeignCount=GetForeignCharCount(sWord),nCharCount=strlen(sWord); if(nCharCount>2||nForeignCount>=1*nCharCount/2) return true; return false; }
/********************************************************************* * * Func Name : IsAllForeign * * Description: Decide whether the word is not a Non-fereign word * * Parameters : sWord: the word * * Returns : the index value * Author : Kevin Zhang * History : * 1.create 2002-3-25 *********************************************************************/ bool IsAllForeign(char *sWord) { unsigned int nForeignCount=(unsigned int)GetForeignCharCount(sWord); if(2*nForeignCount==strlen(sWord)) return true; return false; }
bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict) { int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType,nFreqGiven; char sTemp[3]; if(nLen<3||nLen>8)//Not a traditional Chinese person name return false; while(i<nLen)//No Including non-CHinese char { nCharType=charType((unsigned char*)sPersonName+i); if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER) return false; i+=2; } sSurname2[0]=0;//init strncpy(sSurname,sPersonName,nSurNameLen); sSurname[nSurNameLen]=0; if(!personDict.IsExist(sSurname,1)) { nSurNameLen=2; sSurname[nSurNameLen]=0; if(!personDict.IsExist(sSurname,1)) { nSurNameLen=0; sSurname[nSurNameLen]=0; } } strcpy(sGivenName,sPersonName+nSurNameLen); if(nLen>6) { strncpy(sTemp,sPersonName+nSurNameLen,2); sTemp[2]=0;//Get the second possible surname if(personDict.IsExist(sTemp,1)) {//Hongkong women's name: Surname+surname+given name strcpy(sSurname2,sTemp); strcpy(sGivenName,sPersonName+nSurNameLen+2); } } nFreq=personDict.GetFrequency(sSurname,1); strncpy(sTemp,sGivenName,2); sTemp[2]=0; nFreqGiven=personDict.GetFrequency(sTemp,2); if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)>=3&&nFreq<personDict.GetFrequency("张",1)/40&&nFreqGiven<personDict.GetFrequency("华",2)/20)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2))) return false; if(nLen==4&&m_uPerson.IsGivenName(sPersonName)) {//Single Surname+given name return false; } return true; }
/*
 * Scan the best role-tag sequence of the current span for person-name
 * patterns and record each accepted candidate as an unknown word.
 *
 * The tags in m_nBestTag[1..] (terminated by a negative value) are turned
 * into letters ('A' + tag) and matched, position by position, against the
 * pattern table below. For each accepted match the method stores
 *   m_nUnknownWords[m_nUnknownIndex][0..1] - start / end word positions
 *   m_dWordsPossibility[m_nUnknownIndex]   - log(pattern factor) +
 *                                            ComputePossibility(...)
 * and increments m_nUnknownIndex.
 *
 * Parameters:
 *   personDict - person-name dictionary used for role frequencies.
 *
 * Returns:
 *   Always true.
 */
bool CSpan::PersonRecognize(CDictionary &personDict)
{
    char sPOS[MAX_WORDS_PER_SENTENCE] = "z", sPersonName[100];

    // The person recognition pattern set (role letters):
    //   BBCD: surname + surname + given-name char 1 + given-name char 2
    //   BBE : surname + surname + single-char given name
    //   BBZ : surname + surname + two-char given name that forms a word
    //   BCD : surname + given-name char 1 + given-name char 2
    //   BE  : surname + single-char given name
    //   BEE : surname + single char + single char (e.g. "Han Leilei")
    //   BG  : surname + suffix
    //   BXD : surname + (surname joined with first given-name char as a
    //         word) + last given-name char
    //   BZ  : surname + two-char given name that forms a word
    //   B   : surname
    //   CD  : given-name char 1 + given-name char 2
    //   EE  : single char + single char
    //   FB  : prefix + surname
    //   XD  : (surname joined with first given-name char as a word) +
    //         last given-name char
    //   Y   : surname joined with single-char given name as a word
    // The table is terminated by the empty pattern (length 0).
    char sPatterns[][5] = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE",
                            "BG", "BXD", "BZ", "CDCD", "CD", "EE",
                            "FB", "Y", "XD", "" };

    // Per-pattern prior factor. About the parameters:
    //   Given Name:      486    0.0160
    //   Surname+postfix: 484    0.0160
    //   m_lPerson2Num:   6265   0.2055
    //   m_lPerson3Num:   23184  0.7614
    //   m_lPerson4Num:   32     0.0011
    // The 0 for "CDCD" is never passed to log(): that pattern is always
    // rejected below.
    double dFactor[] = { 0.0011, 0.0011, 0.0011, 0.0011, 0.7614, 0.0011, 0.2055,
                         0.0160, 0.0011, 0.0011, 0, 0.0160, 0.0011,
                         0.0160, 0.0011, 0.0011, 0 };

    int nPatternLen[] = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };

    // Convert the tag sequence to a string of role letters; sPOS[0] keeps
    // the placeholder 'z' so that indices line up with m_nBestTag.
    int i;
    for (i = 1; m_nBestTag[i] > -1; i++)
        sPOS[i] = m_nBestTag[i] + 'A';
    sPOS[i] = 0;

    int j = 1, k, nPos;      // j: current position in the tag sequence
    int nLittleFreqCount;    // number of roles in the candidate with little frequency
    bool bMatched = false;

    while (j < i)
    {
        bMatched = false;
        for (k = 0; !bMatched && nPatternLen[k] > 0; k++)
        {
            // The pattern must match at j and must not be adjacent to the
            // separator dot used inside transliterated names.
            if (strncmp(sPatterns[k], sPOS + j, nPatternLen[k]) == 0
                && strcmp(m_sWords[j - 1], "·") != 0
                && strcmp(m_sWords[j + nPatternLen[k]], "·") != 0)
            {
                if (strcmp(sPatterns[k], "FB") == 0
                    && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
                {
                    // Rule 1 for exclusion: prefix + surname followed by a
                    // given-name role or suffix -- the (prefix + surname)
                    // rule does not apply.
                    continue;
                }

                // Disabled rules kept for reference:
                // if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
                // {//Rule 2 for exclusion: surname+single+single / single+single:
                //  //if the two E words differ, the rule does not apply (e.g. "Han Leilei")
                //     continue;
                // }
                // if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
                // {//Rule 3 for exclusion: if the surname is not followed by a
                //  //suffix, the rule does not apply
                //     continue;
                // }

                // Build the candidate name from the matched words and count
                // the roles that have little frequency.
                nPos = j;
                sPersonName[0] = 0;
                nLittleFreqCount = 0;
                while (nPos < j + nPatternLen[k])
                {
                    // Only role tags below 4 (letters A-D) are counted.
                    if (m_nBestTag[nPos] < 4
                        && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < LITTLE_FREQUENCY)
                        nLittleFreqCount++;

                    // Bounded append: up to four words are concatenated into a
                    // fixed 100-byte buffer, so never write past its end. The
                    // result is identical to strcat whenever the name fits.
                    strncat(sPersonName, m_sWords[nPos],
                            sizeof(sPersonName) - strlen(sPersonName) - 1);
                    nPos += 1;
                }

                // NOTE: the exclusions below advance j and then `continue` the
                // pattern loop, so the remaining patterns are tried at the new
                // j before the outer loop adds 1. This ordering is preserved
                // from the original implementation.

                if (IsAllForeign(sPersonName)
                    && personDict.GetFrequency(m_sWords[j], 1) < LITTLE_FREQUENCY)
                {
                    // Rule 2 for exclusion: the candidate consists entirely of
                    // foreign-name characters and the first word is rare with
                    // dictionary handle 1 -- skip the span.
                    j += nPatternLen[k] - 1;
                    continue;
                }

                if (strcmp(sPatterns[k], "CDCD") == 0)
                {
                    // (given1 + given2 + given1 + given2) is itself an exclusion
                    // rule and never yields a name. Rule 3 for exclusion: when
                    // the candidate contains foreign-name characters the whole
                    // span is skipped; otherwise only this pattern is rejected.
                    if (GetForeignCharCount(sPersonName) > 0)
                        j += nPatternLen[k] - 1;
                    continue;
                }

                if (strcmp(sPatterns[k], "CD") == 0 && IsAllForeign(sPersonName))
                {
                    // A bare given name made only of foreign-name characters.
                    j += nPatternLen[k] - 1;
                    continue;
                }

                // All roles (or three of them) appear with little frequency:
                // ignore the candidate (typical for transliterated names).
                if (nLittleFreqCount == nPatternLen[k] || nLittleFreqCount == 3)
                    continue;

                // Accept: record the span and its log-possibility, which is
                // the pattern factor multiplied in (added in log space).
                m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[j];
                m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[j + nPatternLen[k]];
                m_dWordsPossibility[m_nUnknownIndex] =
                    log(dFactor[k]) + ComputePossibility(j, nPatternLen[k], personDict);
                m_nUnknownIndex += 1;
                j += nPatternLen[k];
                bMatched = true;
            }
        }
        if (!bMatched) // Not matched, add j by 1
            j += 1;
    }
    return true;
}