/********************************************************************* * * Func Name : GetCharCount * * Description: Get the count of char which is in sWord and in sCharSet * * Parameters : sWord: the word * * Returns : COUNT * Author : Kevin Zhang * History : * 1.create 2002-5-21 2. corrected by Wang Zhifu 2004-2-18 *********************************************************************/ int GetCharCount(char *sCharSet,char *sWord) { unsigned int k=0; char tchar[3]; int nCount=0; tchar[2]=0; while(k < strlen(sWord)) { tchar[0]=sWord[k]; tchar[1]=0; if(sWord[k]<0) { tchar[1]=sWord[k+1]; k+=1; } k+=1; // 这里有改动,原语句为 // if((tchar[0]<0&&CC_Find(sCharSet, tchar))||strchr(sCharSet,tchar[0])) // 改为:因为后面部分经常为true!!!应该加入限制条件! // corrected by Wang Zhifu if(((tchar[0]<0&&(CC_Find(sCharSet, tchar)!=NULL)))||(tchar[0]>0&&(strchr(sCharSet,tchar[0])!=NULL))) nCount++; } return nCount; }
bool PostfixSplit(char *sWord, char *sWordRet, char *sPostfix) { char sSinglePostfix[]=POSTFIX_SINGLE; char sMultiPostfix[][9]=POSTFIX_MUTIPLE; unsigned int nPostfixLen=0,nWordLen=strlen(sWord); int i=0; while(sMultiPostfix[i][0]!=0&&strncmp(sWord+nWordLen-strlen(sMultiPostfix[i]),sMultiPostfix[i],strlen(sMultiPostfix[i]))!=0) {//Try to get the postfix of an address i++; } strcpy(sPostfix,sMultiPostfix[i]); nPostfixLen=strlen(sMultiPostfix[i]);//Get the length of place postfix if(nPostfixLen==0) { sPostfix[2]=0; strncpy(sPostfix,sWord+nWordLen-2,2); if(CC_Find(sSinglePostfix,sPostfix)) nPostfixLen=2; } strncpy(sWordRet,sWord,nWordLen-nPostfixLen); sWordRet[nWordLen-nPostfixLen]=0;//Get the place name which have erasing the postfix sPostfix[nPostfixLen]=0; return true; }
/********************************************************************* * * Func Name : GetMaxMatch * * Description: Get the max match to the word * * * Parameters : nHandle: the only handle which will be attached to the word * Returns : success or fail * Author : Kevin Zhang * History : * 1.create 2002-1-21 *********************************************************************/ bool CDictionary::GetMaxMatch(char *sWord, char *sWordRet,int *npHandleRet) { char sWordGet[WORD_MAXLENGTH-2],sFirstChar[3]; int nPos,nFoundPos,nTemp; PWORD_CHAIN pCur; *npHandleRet=-1; if(!PreProcessing(sWord, &nPos,sWordGet)) return false; if (nPos < 0) return false; sWordRet[0]=0; strncpy(sFirstChar,sWord,strlen(sWord)-strlen(sWordGet));//Get the first char sFirstChar[strlen(sWord)-strlen(sWordGet)]=0;//Set the end flag FindInOriginalTable(nPos,sWordGet,-1,&nFoundPos); nTemp=nFoundPos;//Check its previous position if(nFoundPos==-1) nTemp=0; while(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)!=m_IndexTable[nPos].pWordItemHead[nTemp].sWord) {//Get the next nTemp+=1; } if(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)==m_IndexTable[nPos].pWordItemHead[nTemp].sWord) { strcpy(sWordRet,sFirstChar); strcat(sWordRet,m_IndexTable[nPos].pWordItemHead[nTemp].sWord); *npHandleRet=m_IndexTable[nPos].pWordItemHead[nTemp].nHandle; return true; }//Cannot get the item and retrieve the modified data if exists //Operation in the index table and its items if(m_pModifyTable&&m_pModifyTable[nPos].pWordItemHead)//Exists pCur=m_pModifyTable[nPos].pWordItemHead; else pCur=NULL; while(pCur!=NULL&&strcmp(pCur->data.sWord,sWordGet)<=0&&CC_Find(pCur->data.sWord,sWordGet)!=pCur->data.sWord)// { pCur=pCur->next; } if(pCur!=NULL&&CC_Find(pCur->data.sWord,sWordGet)!=pCur->data.sWord) {//Get it strcpy(sWordRet,sFirstChar); strcat(sWordRet,pCur->data.sWord); *npHandleRet=pCur->data.nHandle; return true; } return false; }
/********************************************************************* * * Func Name : IsForeign * * Description: Decide whether the word is Chinese Num word * * Parameters : sWord: the word * * Returns : the index value * Author : Kevin Zhang * History : * 1.create 2002-1-26 *********************************************************************/ bool IsAllChineseNum(char *sWord) {//百分之五点六的人早上八点十八分起床 unsigned int k; char tchar[3]; char ChineseNum[]="零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点";// char sPrefix[]="几数第上成"; for(k = 0; k < strlen(sWord); k+=2) { strncpy(tchar,sWord+k,2) ; tchar[2]='\0'; if(strncmp(sWord+k,"分之",4)==0)//百分之五 { k+=2; continue; } if(!CC_Find(ChineseNum, tchar)&&!(k==0&&CC_Find(sPrefix, tchar))) return false; } return true; }
bool CSegment::IsYearTime(char *sNum) { //Judge whether the sNum is a num genearating year unsigned int nLen=strlen(sNum); char sTemp[3]; strncpy(sTemp,sNum,2); sTemp[2]=0; if(IsAllSingleByte((unsigned char *)sNum)&&(nLen>=3||nLen==2&&sNum[0]>'4'))//1992年, 90年 return true; if(IsAllNum((unsigned char *)sNum)&&(nLen>=6||nLen==4&&CC_Find("56789",sTemp))) return true; if(GetCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖",sNum)==(int)nLen/2&&nLen>=3) return true; if(nLen==8&&GetCharCount("千仟零○",sNum)==2)//二仟零二年 return true; return false; }
/*********************************************************************
 *
 *  Func Name  : GetCharCount
 *
 *  Description: Count the characters of sWord that occur in sCharSet.
 *               A byte with the high bit set starts a two-byte character,
 *               any other byte is a single-byte character.
 *
 *  Parameters : sCharSet: the set of characters to look for
 *               sWord   : the word to scan
 *
 *  Returns    : number of characters of sWord that were found in sCharSet
 *  Author     : Kevin Zhang
 *  History    :
 *              1.create 2002-5-21
 *********************************************************************/
int GetCharCount(char *sCharSet,char *sWord)
{
	const unsigned int nLen=strlen(sWord);//sWord is not modified, so measure it once
	unsigned int k=0;
	char tchar[3];
	int nCount=0;
	tchar[2]=0;
	while(k<nLen)
	{
		//High-bit test instead of "<0": independent of the signedness of plain char
		const bool bDoubleByte=(sWord[k]&0x80)!=0;
		tchar[0]=sWord[k];
		tchar[1]=0;
		if(bDoubleByte)
		{//Take the trail byte as well; for a truncated last character this is the terminator
			tchar[1]=sWord[k+1];
			k+=1;
		}
		k+=1;
		//Fixed: the single-byte strchr lookup used to run for double-byte
		//characters too, where the lead byte alone matches any set member that
		//shares it. Each lookup is now limited to characters of its own width.
		if(bDoubleByte)
		{
			if(CC_Find(sCharSet,tchar)!=NULL)
				nCount++;
		}
		else if(strchr(sCharSet,tchar[0])!=NULL)
			nCount++;
	}
	return nCount;
}
/********************************************************************* * * Func Name : IsNumExist * * Description: Judge whether there is Num Char in the string * * * Parameters : sSentence: the original sentence which includes Chinese or Non-Chinese char * * Returns : the end of the sub-sentence * Author : Wangzhifu * History : * 1.create 2004-3-2 *********************************************************************/ bool IsNumExist(char *sWord) { unsigned int k; char tchar[3]; char ChineseNum[]="零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟";// for(k = 0; k < strlen(sWord); k++) { tchar[0]=sWord[k]; if(tchar[0]<0) { tchar[1]=sWord[k++]; tchar[2]='\0'; if(CC_Find(ChineseNum, tchar)) return true; } else { if((tchar[0]>'0'-1)&&(tchar[0]<'9'+1)) return true; } } return false; }
bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq) { //Gernerate the word net from the sLine, that's list all the possible word unsigned int i=0,j,nLen=strlen(sSentence); char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH]; int nWordIndex=0,nHandleTemp,k,nPOS; int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount; double dValue=0; m_nAtomCount=0; m_segGraph.SetEmpty();//Set segmentation graph empty AtomSegment(sSentence); //Atomic Segmentation for(i=0;i<m_nAtomCount;i++)//Init the cost array { if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char { if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum value else m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value } else//Other atom { strcpy(sWord,m_sAtom[i]);//init the word dValue=MAX_FREQUENCE; switch(m_nAtomPOS[i]) { case CT_INDEX: case CT_NUM: nPOS=-27904;//'m'*256 strcpy(sWord,"未##数"); dValue=0; break; case CT_DELIMITER: nPOS=30464;//'w'*256; break; case CT_LETTER: nPOS=-'n'*256-'x';// dValue=0; strcpy(sWord,"未##串"); break; case CT_SINGLE://12021-2129-3121 if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i])) { nPOS=-27904;//'m'*256 strcpy(sWord,"未##数"); } else { nPOS=-'n'*256-'x';// strcpy(sWord,"未##串"); } dValue=0; break; default: nPOS=m_nAtomPOS[i];//'?'*256; break; } if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with minimum else m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with minimum } } i=0; while(i<m_nAtomCount)//All the word { strcpy(sWord,m_sAtom[i]);//Get the current atom j=i+1; if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份 j+=1; while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp)) {//Add a condition to control the end of string //retrieve the dictionary with the word 
if(strcmp(sWordMatch,sWord)==0)//find the current word { nTotalFreq=0; dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq); for(k=0;k<nMatchCount;k++)//Add the frequency { nTotalFreq+=nMatchFreq[k]; } //Adding a rule to exclude some words to be formed. if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0)) {//1年内、1999年末 if(CC_Find("末内中底前间初",sWord+2)) break; } if(nMatchCount==1)//The possible word has only one POS, store it { if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]); else m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord); } else { if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0); else m_segGraph.SetElement(i,j,nTotalFreq,0,sWord); } } strcat(sWord,m_sAtom[j++]); } i+=1;//Start from i++; } return true; }
//Guess the POS of No. nIndex word item bool CSpan::GuessPOS(int nIndex,int *pSubIndex) { int j=0,i=nIndex,nCharType; unsigned int nLen; switch(m_tagType) { case TT_NORMAL: break; case TT_PERSON: j=0; if(CC_Find("××",m_sWords[nIndex])) { m_nTags[i][j]=6; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,6)+1); } else { m_nTags[i][j]=0; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1); nLen=strlen(m_sWords[nIndex]); if(nLen>=4) { m_nTags[i][j]=0; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1); m_nTags[i][j]=11; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8); m_nTags[i][j]=12; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8); m_nTags[i][j]=13; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8); } else if(nLen==2) { m_nTags[i][j]=0; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1); nCharType=charType((unsigned char *)m_sWords[nIndex]); if(nCharType==CT_OTHER||nCharType==CT_CHINESE) { m_nTags[i][j]=1; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1); m_nTags[i][j]=2; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1); m_nTags[i][j]=3; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1); m_nTags[i][j]=4; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1); } m_nTags[i][j]=11; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8); m_nTags[i][j]=12; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8); m_nTags[i][j]=13; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8); } } break; case TT_PLACE: j=0; m_nTags[i][j]=0; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1); nLen=strlen(m_sWords[nIndex]); if(nLen>=4) { m_nTags[i][j]=11; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8); m_nTags[i][j]=12; 
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8); m_nTags[i][j]=13; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8); } else if(nLen==2) { m_nTags[i][j]=0; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1); nCharType=charType((unsigned char *)m_sWords[nIndex]); if(nCharType==CT_OTHER||nCharType==CT_CHINESE) { m_nTags[i][j]=1; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1); m_nTags[i][j]=2; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1); m_nTags[i][j]=3; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1); m_nTags[i][j]=4; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1); } m_nTags[i][j]=11; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8); m_nTags[i][j]=12; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8); m_nTags[i][j]=13; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8); } break; case TT_TRANS: j=0; nLen=strlen(m_sWords[nIndex]); m_nTags[i][j]=0; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1); if(!IsAllChinese((unsigned char *)m_sWords[nIndex])) { if(IsAllLetter((unsigned char *)m_sWords[nIndex])) { m_nTags[i][j]=1; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1); m_nTags[i][j]=11; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)+1); /* } if(IsAllNum((unsigned char *)m_sWords[nIndex])||IsAllLetter((unsigned char *)m_sWords[nIndex])) { */ m_nTags[i][j]=2; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1); m_nTags[i][j]=3; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1); m_nTags[i][j]=12; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*2+1); m_nTags[i][j]=13; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*2+1); } m_nTags[i][j]=41; 
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8); m_nTags[i][j]=42; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8); m_nTags[i][j]=43; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8); } else if(nLen>=4) { m_nTags[i][j]=41; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8); m_nTags[i][j]=42; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8); m_nTags[i][j]=43; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8); } else if(nLen==2) { nCharType=charType((unsigned char *)m_sWords[nIndex]); if(nCharType==CT_OTHER||nCharType==CT_CHINESE) { m_nTags[i][j]=1; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)*2+1); m_nTags[i][j]=2; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1); m_nTags[i][j]=3; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1); m_nTags[i][j]=30; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,30)*8+1); m_nTags[i][j]=11; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*4+1); m_nTags[i][j]=12; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*4+1); m_nTags[i][j]=13; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*4+1); m_nTags[i][j]=21; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,21)*2+1); m_nTags[i][j]=22; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,22)*2+1); m_nTags[i][j]=23; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,23)*2+1); } m_nTags[i][j]=41; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8); m_nTags[i][j]=42; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8); m_nTags[i][j]=43; m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8); } break; default: break; } *pSubIndex=j; return true; }
//Generate Word according the segmentation route bool CSegment::GenerateWord(int **nSegRoute, int nIndex) { unsigned int i=0,k=0; int j,nStartVertex,nEndVertex,nPOS; char sAtom[WORD_MAXLENGTH],sNumCandidate[100],sCurWord[100]; ELEMENT_TYPE fValue; while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1]) { nStartVertex=nSegRoute[nIndex][i]; j=nStartVertex;//Set the start vertex nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex nPOS=0; m_graphSeg.m_segGraph.GetElement(nStartVertex,nEndVertex,&fValue,&nPOS); sAtom[0]=0; while(j<nEndVertex) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending strcpy(sNumCandidate,sAtom); while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate))) {//Merge all seperate continue num into one number //sAtom[0]!=0: add in 2002-5-9 strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate); //Save them in the result segmentation i++;//Skip to next atom now sAtom[0]=0; while(j<nSegRoute[nIndex][i+1]) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } //add a judge for out of memery, //del, not nessasseray for ordinary text file,becasuse no word's lenth can larger than100 //so remain the same // if(strlen(sNumCandidate)+strlen(sAtom)<100) strcat(sNumCandidate,sAtom); // else // break; } unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord); if(nLen==4&&CC_Find("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",m_pWordSeg[nIndex][k].sWord[0])) {//Only one word strcpy(sCurWord,m_pWordSeg[nIndex][k].sWord);//Record current word i--; } else if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop { strcpy(m_pWordSeg[nIndex][k].sWord,sAtom); //Save them in the result segmentation strcpy(sCurWord,sAtom);//Record current word } else {//It is a num 
if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-'&&m_pWordSeg[nIndex][k].sWord[1]==0)//The delimiter "--" { nPOS=30464;//'w'*256;Set the POS with 'w' i--;//Not num, back to previous word } else {//Adding time suffix char sInitChar[3]; unsigned int nCharIndex=0;//Get first char sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; if(sInitChar[nCharIndex]<0) { nCharIndex+=1; sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; } nCharIndex+=1; sInitChar[nCharIndex]='\0'; if(k>0&&(abs(m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(m_pWordSeg[nIndex][k-1].nHandle)==29696)&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex)) {//3-4月 //27904='m'*256 //Split the sInitChar from the original word strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex); m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue; m_pWordSeg[nIndex][k+1].nHandle=27904; m_pWordSeg[nIndex][k].sWord[nCharIndex]=0; m_pWordSeg[nIndex][k].dValue=0; m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256; m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle,m_pWordSeg[nIndex][k].sWord); nStartVertex+=1; k+=1; } nLen=strlen(m_pWordSeg[nIndex][k].sWord); if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0) {//2001年 strcat(m_pWordSeg[nIndex][k].sWord,sAtom); strcpy(sCurWord,"未##时"); nPOS=-29696;//'t'*256;//Set the POS with 'm' } else if(strcmp(sAtom,"年")==0) { if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&& {//1998年, strcat(m_pWordSeg[nIndex][k].sWord,sAtom); strcpy(sCurWord,"未##时"); nPOS=-29696;//Set the POS with 't' } else { strcpy(sCurWord,"未##数"); nPOS=-27904;//Set the POS with 'm' i--;//Can not be a time word } } else { //早晨/t 五点/t if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0) { strcpy(sCurWord,"未##时"); 
nPOS=-29696;//Set the POS with 't' } else { // if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/') // { /// strcpy(sCurWord,"未##数"); // nPOS=-27904;//'m'*256;Set the POS with 'm' // } // else if(nLen>strlen(sInitChar)) // {//Get rid of . example 1. // if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/') // m_pWordSeg[nIndex][k].sWord[nLen-1]=0; // else // m_pWordSeg[nIndex][k].sWord[nLen-2]=0; // strcpy(sCurWord,"未##数"); // nPOS=-27904;//'m'*256;Set the POS with 'm' // i--; // } //here's bug in it +...... do not del . //2004-3-2 fixed by Wangzhifu // if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/') //2004_06_18 修改,见04_06_18新华网语料bug.txt if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.') { strcpy(sCurWord,"未##数"); nPOS=-27904;//'m'*256;Set the POS with 'm' } else if(nLen>strlen(sInitChar)) {//Get rid of . example 1. //,but if" +......" do not del .//fixed at 2004-3-2 by Wang Zhifu char TempWord[100]; strcpy(TempWord,m_pWordSeg[nIndex][k].sWord); TempWord[nLen-2]=0;//去掉最后一个字符! 
if(IsNumExist(TempWord)) { if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/') m_pWordSeg[nIndex][k].sWord[nLen-1]=0; else m_pWordSeg[nIndex][k].sWord[nLen-2]=0; strcpy(sCurWord,"未##数"); nPOS=-27904;//'m'*256;Set the POS with 'm' i--; } else { nPOS=-'n'*256-'x';// strcpy(sCurWord,"未##串"); } } } // end of fixed line; i--;//Not num, back to previous word } } fValue=0; nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter } m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS,sCurWord); //Generate optimum segmentation graph according the segmentation result i++;//Skip to next atom k++;//Accept next word } m_pWordSeg[nIndex][k].sWord[0]=0; m_pWordSeg[nIndex][k].nHandle=-1;//Set ending return true; }
/********************************************************************* * * Func Name : IsAllNum * * Description: Judge the string is all made up of Num Char * * * Parameters : sSentence: the original sentence which includes Chinese or Non-Chinese char * * Returns : the end of the sub-sentence * Author : Kevin Zhang * History : * 1.create 2002-1-24 *********************************************************************/ bool IsAllNum(unsigned char *sString) { unsigned int nLen=strlen((const char *)sString),i=0; char sChar[3]; sChar[2]=0; if(i<nLen)//Get prefix such as + - { sChar[0]=sString[i++]; if(sChar[0]<0)//Get first char sChar[1]=sString[i++]; else sChar[1]=0; if(!strstr("±+—-+",sChar)) { i=0; } } while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186) { i+=2; } if(i<nLen)//Get middle delimiter such as . { sChar[0]=sString[i++]; if(sChar[0]<0)//Get first char sChar[1]=sString[i++]; else sChar[1]=0; if(CC_Find("∶·./",sChar)||sChar[0]=='.'||sChar[0]=='/') {//98.1% while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186) { i+=2; } } else { i-=strlen(sChar); } } if(i>=nLen) return true; while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1) {//single byte number char i+=1; } if(i<nLen)//Get middle delimiter such as . { sChar[0]=sString[i++]; if(sChar[0]<0)//Get first char sChar[1]=sString[i++]; else sChar[1]=0; if(CC_Find("∶·./",sChar)||sChar[0]=='.'||sChar[0]=='/') {//98.1% while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1) { i+=1; } } else { i-=strlen(sChar); } } if(i<nLen)//Get middle delimiter such as . { sChar[0]=sString[i++]; if(sChar[0]<0)//Get first char sChar[1]=sString[i++]; else sChar[1]=0; if(!CC_Find("百千万亿佰仟%‰",sChar)&&sChar[0]!='%') i-=strlen(sChar); } if(i>=nLen) return true; return false; }
//Paragraph Segment and POS Tagging bool CResult::ParagraphProcessing(char *sParagraph,char *sResult) { char *sSentence,sChar[3]; char *sSentenceResult; unsigned int nLen=strlen(sParagraph)+13; sSentence=new char[nLen];//malloc buffer sSentenceResult=new char[nLen*3];//malloc buffer sSentence[0]=0; unsigned int nPosIndex=0,nParagraphLen=strlen(sParagraph),nSentenceIndex=0; sChar[2]=0; sResult[0]=0;//Init the result bool bFirstIgnore=true; strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag while(nPosIndex<nParagraphLen) {//Find a whole sentence which separated by ! . \n \r sChar[0]=sParagraph[nPosIndex];//Get a char sChar[1]=0; if(sParagraph[nPosIndex]<0) {//double byte char nPosIndex+=1; sChar[1]=sParagraph[nPosIndex]; } nPosIndex+=1; /* #define SEPERATOR_C_SENTENCE "。!?:;…" #define SEPERATOR_C_SUB_SENTENCE "、,()“”‘’" #define SEPERATOR_E_SENTENCE "!?:;" #define SEPERATOR_E_SUB_SENTENCE ",()\042'" #define SEPERATOR_LINK "\n\r " */ if(CC_Find(SEPERATOR_C_SENTENCE,sChar)||CC_Find(SEPERATOR_C_SUB_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar)||strstr(SEPERATOR_E_SUB_SENTENCE,sChar)||strstr(SEPERATOR_LINK,sChar)) {//Reach end of a sentence.Get a whole sentence if(!strstr(SEPERATOR_LINK,sChar))//Not link seperator { strcat(sSentence,sChar); } if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0) { if(!strstr(SEPERATOR_C_SUB_SENTENCE,sChar)&&!strstr(SEPERATOR_E_SUB_SENTENCE,sChar)) strcat(sSentence,SENTENCE_END);//Add sentence ending flag Processing(sSentence,1);//Processing and output the result of current sentence. 
Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result //bFirstIgnore=true; strcat(sResult,sSentenceResult);//Store in the result buffer } if(strstr(SEPERATOR_LINK,sChar))//Link the result with the SEPERATOR_LINK { strcat(sResult,sChar); strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag //sSentence[0]=0;//New sentence, and begin new segmentation //bFirstIgnore=false; } else if(strstr(SEPERATOR_C_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar)) { strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag //sSentence[0]=0;//New sentence, and begin new segmentation //bFirstIgnore=false; } else { strcpy(sSentence,sChar);//reset current sentence, and add the previous end at begin position } } else //Other chars and store in the sentence buffer strcat(sSentence,sChar); } if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0) { strcat(sSentence,SENTENCE_END);//Add sentence ending flag Processing(sSentence,1);//Processing and output the result of current sentence. Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result strcat(sResult,sSentenceResult);//Store in the result buffer } delete [] sSentence;//FREE sentence buffer delete [] sSentenceResult;//free buffer return true; }
//Adjust the result with some rules bool CResult::Adjust(PWORD_RESULT pItem,PWORD_RESULT pItemRet) { int i=0,j=0; unsigned int nLen; char sSurName[10],sSurName2[10],sGivenName[10]; bool bProcessed=false;//Have been processed while(pItem[i].sWord[0]!=0) { nLen=strlen(pItem[i].sWord); bProcessed=false; //Rule1: adjust person name if(pItem[i].nHandle==28274&&ChineseNameSplit(pItem[i].sWord,sSurName,sSurName2,sGivenName,m_uPerson.m_dict)&&strcmp(pItem[i].sWord,"叶利钦")!=0)//'nr' {//Divide name into surname and given name if(sSurName[0]) { strcpy(pItemRet[j].sWord,sSurName); pItemRet[j++].nHandle=28274; } if(sSurName2[0]) { strcpy(pItemRet[j].sWord,sSurName2); pItemRet[j++].nHandle=28274; } if(sGivenName[0]) { strcpy(pItemRet[j].sWord,sGivenName); pItemRet[j++].nHandle=28274; } bProcessed=true; } //Rule2 for overlap words ABB 一段段、一片片 else if(pItem[i].nHandle==27904&&strlen(pItem[i+1].sWord)==2&&strcmp(pItem[i+1].sWord,pItem[i+2].sWord)==0) {//(pItem[i+1].nHandle/256=='q'||pItem[i+1].nHandle/256=='a')&& strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); strcat(pItemRet[j].sWord,pItem[i+2].sWord); pItemRet[j].nHandle=27904; j+=1; i+=2; bProcessed=true; } //Rule3 for overlap words AA else if(nLen==2&&strcmp(pItem[i].sWord,pItem[i+1].sWord)==0) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); //24832=='a'*256 pItemRet[j].nHandle=24832;//a if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256 { pItemRet[j].nHandle=30208; } if(pItem[i].nHandle/256=='n'||pItem[i+1].nHandle/256=='n')//30208='v'8256 { pItemRet[j].nHandle='n'*256; } i+=1; if(strlen(pItem[i+1].sWord)==2) {//AAB:洗/洗/脸、蒙蒙亮 if((pItemRet[j].nHandle==30208&&pItem[i+1].nHandle/256=='n')|| (pItemRet[j].nHandle==24832&&pItem[i+1].nHandle/256=='a') ) { strcat(pItemRet[j].sWord,pItem[i+1].sWord); i+=1; } } j+=1; bProcessed=true; } //Rule 4: AAB 洗/洗澡 else 
if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&(pItem[i].nHandle/256=='v'||pItem[i].nHandle==24832))//v,a { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); //24832=='a'*256 pItemRet[j].nHandle=24832;//'a' if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256 { pItemRet[j].nHandle=30208; } i+=1; j+=1; bProcessed=true; } else if(pItem[i].nHandle/256=='u'&&pItem[i].nHandle%256)//uj,ud,uv,uz,ul,ug->u pItem[i].nHandle='u'*256; else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&strncmp(pItem[i+1].sWord+2,pItem[i+2].sWord,2)==0) {//AABB 朴朴素素 枝枝叶叶 strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); strcat(pItemRet[j].sWord,pItem[i+2].sWord); pItemRet[j].nHandle=pItem[i+1].nHandle; i+=2; j+=1; bProcessed=true; } else if(pItem[i].nHandle==28275)//PostFix { if(m_uPlace.m_dict.IsExist(pItem[i+1].sWord,4)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); pItemRet[j].nHandle=28275; i+=1; j+=1; bProcessed=true; } else if(strlen(pItem[i+1].sWord)==2&&CC_Find("队",pItem[i+1].sWord)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); pItemRet[j].nHandle=28276; i+=1; j+=1; bProcessed=true; } else if(strlen(pItem[i+1].sWord)==2&&CC_Find("语文字杯",pItem[i+1].sWord)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); pItemRet[j].nHandle=28282; i+=1; j+=1; bProcessed=true; } else if(strlen(pItem[i+1].sWord)==2&&CC_Find("裔",pItem[i+1].sWord)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); pItemRet[j].nHandle=28160; i+=1; j+=1; bProcessed=true; } } else if(pItem[i].nHandle==30208||pItem[i].nHandle==28160)//v { if(strlen(pItem[i+1].sWord)==2&&CC_Find("员",pItem[i+1].sWord)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); 
pItemRet[j].nHandle=28160; i+=1; j+=1; bProcessed=true; } } else if(pItem[i].nHandle==28280) {//www/nx ./w sina/nx; EIM/nx -601/m strcpy(pItemRet[j].sWord,pItem[i].sWord); pItemRet[j].nHandle=28280; while(pItem[i+1].nHandle==28280||strstr("..",pItem[i+1].sWord)||(pItem[i+1].nHandle==27904&&IsAllNum((unsigned char *)pItem[i+1].sWord))) { strcat(pItemRet[j].sWord,pItem[i+1].sWord); i+=1; } j+=1; bProcessed=true; } if(!bProcessed) {//If not processed,that's mean: not need to adjust; //just copy to the final result strcpy(pItemRet[j].sWord,pItem[i].sWord); pItemRet[j++].nHandle=pItem[i].nHandle; } i++; } pItemRet[j].sWord[0]=0;//Set ending return true; }
//Generate Word according the segmentation route bool CSegment::GenerateWord(int **nSegRoute, int nIndex) { unsigned int i=0,k=0; int j,nStartVertex,nEndVertex,nPOS; char sAtom[WORD_MAXLENGTH],sNumCandidate[100]; ELEMENT_TYPE fValue; while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1]) { nStartVertex=nSegRoute[nIndex][i]; j=nStartVertex;//Set the start vertex nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex nPOS=0; m_graphSeg.m_segGraph.GetElementInfo(nStartVertex,nEndVertex,fValue,nPOS); sAtom[0]=0; while(j < nEndVertex) { //Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending strcpy(sNumCandidate,sAtom); while(sAtom[0]!=0 && (IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate))) { //Merge all seperate continue num into one number //sAtom[0]!=0: add in 2002-5-9 strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate); //Save them in the result segmentation i++;//Skip to next atom now sAtom[0]=0; while(j<nSegRoute[nIndex][i+1]) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } strcat(sNumCandidate,sAtom); } if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop { strcpy(m_pWordSeg[nIndex][k].sWord,sAtom); //Save them in the result segmentation } else {//It is a num if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-')//The delimiter "--" { nPOS=30464;//'w'*256;Set the POS with 'w' i--;//Not num, back to previous word } else {//Adding time suffix char sInitChar[3]; unsigned int nCharIndex=0;//Get first char sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; if(sInitChar[nCharIndex]<0) { nCharIndex+=1; sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; } nCharIndex+=1; sInitChar[nCharIndex]='\0'; 
if(k>0&&m_pWordSeg[nIndex][k-1].nHandle==27904&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex)) {//3-4月 //27904='m'*256 //Split the sInitChar from the original word strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex); m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue; m_pWordSeg[nIndex][k+1].nHandle=27904; m_pWordSeg[nIndex][k].sWord[nCharIndex]=0; m_pWordSeg[nIndex][k].dValue=0; m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256; m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle); nStartVertex+=1; k+=1; } unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord); if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0) {//2001年 strcat(m_pWordSeg[nIndex][k].sWord,sAtom); nPOS=29696;//'t'*256;//Set the POS with 'm' } else if(strcmp(sAtom,"年")==0) { if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&& {//1998年, strcat(m_pWordSeg[nIndex][k++].sWord,sAtom); nPOS='t'*256;//Set the POS with 'm' } else i--;//Can not be a time word } else { //早晨/t 五点/t if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0) { nPOS='t'*256;//Set the POS with 'm' } else { if(m_pWordSeg[nIndex][k].sWord[0]!='.') nPOS='m'*256;//Set the POS with 'm' if(nLen>1&&m_pWordSeg[nIndex][k].sWord[nLen-1]=='.') {//Get rid of . example 1. 
m_pWordSeg[nIndex][k].sWord[nLen-1]=0; i--; } } i--;//Not num, back to previous word } } fValue=0; nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter } m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS); //Generate optimum segmentation graph according the segmentation result i++;//Skip to next atom k++;//Accept next word } m_pWordSeg[nIndex][k].sWord[0]=0; m_pWordSeg[nIndex][k].nHandle=-1;//Set ending return true; }