// Scan the best-tag sequence (1-based, terminated by a negative tag) for
// candidate spans and record each accepted span as an unknown word.
//
// dictCore  - consulted only for single-word candidates: a candidate that
//             dictCore.IsExist(word,-1) reports as present is rejected.
// transDict - dictionary handed to ComputePossibility() to score the span.
//
// A span [nStart,nEnd) is opened at a word tagged 1/11/21 or 2/12/22 and is
// extended over consecutive runs of increasing tags, then over tag 30.
// Accepted spans are appended to m_nUnknownWords / m_dWordsPossibility and
// m_nUnknownIndex is advanced. Always returns true.
bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
{
	int nStart=1,nEnd=1,i=1;
	while(m_nBestTag[i]>-1)
	{
		if(m_nBestTag[i]==1||m_nBestTag[i]==11||m_nBestTag[i]==21)
		{//Tags 1,11,21 trigger the recognition
			nStart=i;
			nEnd=nStart+1;
			while(m_nBestTag[nEnd]==m_nBestTag[nStart])//same tag: 1,11,21
				nEnd++;
			while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//next tag: 2,12,22
				nEnd++;
			while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//next tag: 3,13,23
				nEnd++;
			while(m_nBestTag[nEnd]==30)//trailing run of tag 30
				nEnd++;
		}
		else if(m_nBestTag[i]==2||m_nBestTag[i]==12||m_nBestTag[i]==22)
		{//Tags 2,12,22 also trigger the recognition
			nStart=i;
			nEnd=nStart+1;
			while(m_nBestTag[nEnd]==m_nBestTag[nStart])//same tag: 2,12,22
				nEnd++;
			while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//next tag: 3,13,23
				nEnd++;
			while(m_nBestTag[nEnd]==30)//trailing run of tag 30
				nEnd++;
		}
		//nStart/nEnd keep their previous values when the current word is not a
		//trigger; after a recorded span nStart==nEnd, so nothing is re-recorded.
		//Acceptance: the first word is not all-numeric, and either
		//  - the span covers more than two words, or
		//  - it covers two words and the second is not tagged 30, or the first
		//    word is longer than 2 bytes, or
		//  - it covers one word longer than 2 bytes that dictCore does not hold.
		if(nEnd>nStart
			&&!IsAllNum((unsigned char *)m_sWords[nStart])
			&&(nEnd>nStart+2
				||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))
				||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
		{
			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
			//NOTE(review): the length passed is nEnd-nStart+1, i.e. one more than
			//the number of words in [nStart,nEnd) - confirm this is intended.
			m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
			nStart=nEnd;
		}
		if(i<nEnd)
			i=nEnd;//Jump past the span that was just examined
		else
			i=i+1;
	}
	return true;
}
// Walk the best-tag sequence (1-based, terminated by a negative tag) and
// record every span that starts at a word tagged 1 or 2 as an unknown word.
//
// dictCore  - not used by this function.
// placeDict - dictionary handed to ComputePossibility() to score the span.
//
// A span opened at tag t (1 or 2) is extended over consecutive runs of tags
// t, t+1, ... up to 4, in that order. Each span is appended to
// m_nUnknownWords / m_dWordsPossibility. Always returns true.
bool CSpan::PlaceRecognize(CDictionary &dictCore,CDictionary &placeDict)
{
	int nFirst=1;     //first word of the current span
	int nPastLast=1;  //one past the last word of the current span
	int nCursor=1;    //word currently being examined

	while(m_nBestTag[nCursor]>-1)
	{
		const int nTrigger=m_nBestTag[nCursor];
		if(nTrigger==1||nTrigger==2)
		{
			nFirst=nCursor;
			nPastLast=nFirst+1;
			//Swallow the run of each tag from the trigger tag up to 4
			for(int nTag=nTrigger;nTag<=4;nTag++)
			{
				while(m_nBestTag[nPastLast]==nTag)
					nPastLast++;
			}
		}

		if(nPastLast>nFirst)
		{//A span is pending: store its boundaries and its score
			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nFirst];
			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nPastLast];
			m_dWordsPossibility[m_nUnknownIndex]=ComputePossibility(nFirst,nPastLast-nFirst+1,placeDict);
			m_nUnknownIndex+=1;
			nFirst=nPastLast;//Mark the span as consumed
		}

		//Continue after the span, or simply with the next word
		nCursor=(nCursor<nPastLast)?nPastLast:nCursor+1;
	}
	return true;
}
//Sort the segmentation and POS result according its possibility bool CResult::Sort() { ELEMENT_TYPE dPossibility[MAX_SEGMENT_NUM],dTemp; int nIndex[MAX_SEGMENT_NUM],nTemp;//Index memset(dPossibility,0,sizeof(dPossibility)); //Init the possibility for(int i=0;i<m_Seg.m_nSegmentCount;i++) {//Computing the possibility dPossibility[i]=ComputePossibility(m_Seg.m_pWordSeg[i]); nIndex[i]=i;//Record the index } //Sort with Bubble sort algorithm for(int i=0;i<m_Seg.m_nSegmentCount;i++) for(int j=i+1;j<m_Seg.m_nSegmentCount;j++) { if(dPossibility[i]<dPossibility[j]) {//Swap the possition and value nTemp=nIndex[i]; dTemp=dPossibility[i]; nIndex[i]=nIndex[j]; dPossibility[i]=dPossibility[j]; nIndex[j]=nTemp; dPossibility[j]=dTemp; } } for(int i=0;i<m_Seg.m_nSegmentCount;i++) {//Adjust the segmentation and POS result and store them in the final result array //Store them according their possibility ascendly Adjust(m_Seg.m_pWordSeg[nIndex[i]],m_pResult[i]); m_dResultPossibility[i]=dPossibility[i]; } return true; }
// Match the best-tag sequence against a fixed set of person-name role
// patterns and record every accepted match as an unknown word.
//
// personDict - dictionary used for role frequencies (GetFrequency) and handed
//              to ComputePossibility() to score the match.
//
// Each tag t is mapped to the letter 'A'+t, so the tag sequence becomes a
// string that the patterns below are compared against. Accepted matches are
// appended to m_nUnknownWords / m_dWordsPossibility with the pattern's prior
// (log(dFactor[k])) added to the score. Always returns true.
bool CSpan::PersonRecognize(CDictionary &personDict)
{
	char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];

	//The person recognition pattern set (letters are 'A'+tag):
	//BBCD: surname + surname + given-name char 1 + given-name char 2
	//BBE:  surname + surname + single-char given name
	//BBZ:  surname + surname + two-char given name that forms a word
	//BCD:  surname + given-name char 1 + given-name char 2
	//BE:   surname + single-char given name
	//BEE:  surname + single-char name + single-char name (e.g. "Han Lei-lei")
	//BG:   surname + suffix
	//BXD:  surname + (surname and first given-name char forming a word) + last given-name char
	//BZ:   surname + two-char given name that forms a word
	//CD:   given-name char 1 + given-name char 2
	//EE:   single-char name + single-char name
	//FB:   prefix + surname
	//XD:   (surname and first given-name char forming a word) + last given-name char
	//Y:    surname and single-char given name forming a word
	//The tables are constant, so they are built once instead of on every call.
	//The empty pattern / length 0 terminates the list.
	static const char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE",
		"BG", "BXD","BZ", "CDCD","CD","EE",
		"FB", "Y","XD",""};
	//Prior of each pattern. Source statistics:
	//  Given name:       486   -> 0.0160
	//  Surname+postfix:  484   -> 0.0160
	//  m_lPerson2Num:    6265  -> 0.2055
	//  m_lPerson3Num:    23184 -> 0.7614
	//  m_lPerson4Num:    32    -> 0.0011
	//CDCD has factor 0 but is never scored: that pattern only excludes.
	static const double dFactor[]={0.0011,0.0011,0.0011,0.0011,0.7614,0.0011,0.2055,
		0.0160,0.0011,0.0011,0,0.0160,0.0011,
		0.0160,0.0011,0.0011,0 };
	static const int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};

	int i;
	for(i=1;m_nBestTag[i]>-1;i++)//Convert the tag sequence to a string
		sPOS[i]=m_nBestTag[i]+'A';
	sPOS[i]=0;

	int j=1,k,nPos;//Find the proper pattern from the first POS
	int nLittleFreqCount;//Counter for the person name roles with little frequency
	bool bMatched=false;
	while(j<i)
	{
		bMatched=false;
		for(k=0;!bMatched&&nPatternLen[k]>0;k++)
		{
			//The pattern must match at j, and neither the word before the match
			//nor the word after it may be the separator "·".
			if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(m_sWords[j-1],"·")!=0&&strcmp(m_sWords[j+nPatternLen[k]],"·")!=0)
			{//Found a candidate pattern k
				if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
				{//Exclusion rule 1: prefix + surname followed by E, C or G:
				 //the (prefix + surname) rule does not apply
					continue;
				}
				//Two further exclusion rules are disabled in the original source:
				//one rejecting BEE/EE when the two E words differ, and one
				//rejecting a lone B not followed by tag 12.

				//Build the candidate name and count its low-frequency roles.
				//The name buffer is fixed-size, so words are appended only while
				//they fit completely; an oversized candidate is cut at a word
				//boundary instead of overrunning sPersonName.
				sPersonName[0]=0;
				nLittleFreqCount=0;
				size_t nNameLen=0;
				bool bNameTruncated=false;
				for(nPos=j;nPos<j+nPatternLen[k];nPos++)
				{
					if(m_nBestTag[nPos]<4&&personDict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos])<LITTLE_FREQUENCY)
						nLittleFreqCount++;//The counter increases
					const size_t nWordLen=strlen(m_sWords[nPos]);
					if(!bNameTruncated&&nWordLen<sizeof(sPersonName)-nNameLen)
					{
						strcpy(sPersonName+nNameLen,m_sWords[nPos]);
						nNameLen+=nWordLen;
					}
					else
						bNameTruncated=true;
				}

				//In the exclusions below, "continue" moves on to the next pattern
				//index with the already advanced j; the pattern scan is not
				//restarted from the first pattern.
				if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
				{//Exclusion rule 2: the candidate consists only of foreign-name
				 //characters and its first word is rare with tag 1
					j+=nPatternLen[k]-1;
					continue;
				}
				if(strcmp(sPatterns[k],"CDCD")==0)
				{//CDCD is itself an exclusion pattern and is never recorded.
				 //Exclusion rule 3: skip over it only when it contains
				 //foreign-name characters; otherwise leave j unchanged.
					if(GetForeignCharCount(sPersonName)>0)
						j+=nPatternLen[k]-1;
					continue;
				}
				if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
				{//A CD candidate made only of foreign-name characters is skipped
					j+=nPatternLen[k]-1;
					continue;
				}
				if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
					//Every role (or three of them) has little frequency: ignore
					continue;

				m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[j];
				m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[j+nPatternLen[k]];
				//Adding log(dFactor[k]) multiplies the possibility by the prior
				m_dWordsPossibility[m_nUnknownIndex]=log(dFactor[k])+ComputePossibility(j,nPatternLen[k],personDict);
				m_nUnknownIndex+=1;
				j+=nPatternLen[k];
				bMatched=true;
			}
		}
		if(!bMatched)//Not matched, move on to the next word
			j+=1;
	}
	return true;
}