//Generate Word according the segmentation route bool CSegment::GenerateWord(int **nSegRoute, int nIndex) { unsigned int i=0,k=0; int j,nStartVertex,nEndVertex,nPOS; char sAtom[WORD_MAXLENGTH],sNumCandidate[100],sCurWord[100]; ELEMENT_TYPE fValue; while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1]) { nStartVertex=nSegRoute[nIndex][i]; j=nStartVertex;//Set the start vertex nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex nPOS=0; m_graphSeg.m_segGraph.GetElement(nStartVertex,nEndVertex,&fValue,&nPOS); sAtom[0]=0; while(j<nEndVertex) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending strcpy(sNumCandidate,sAtom); while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate))) {//Merge all seperate continue num into one number //sAtom[0]!=0: add in 2002-5-9 strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate); //Save them in the result segmentation i++;//Skip to next atom now sAtom[0]=0; while(j<nSegRoute[nIndex][i+1]) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } //add a judge for out of memery, //del, not nessasseray for ordinary text file,becasuse no word's lenth can larger than100 //so remain the same // if(strlen(sNumCandidate)+strlen(sAtom)<100) strcat(sNumCandidate,sAtom); // else // break; } unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord); if(nLen==4&&CC_Find("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",m_pWordSeg[nIndex][k].sWord[0])) {//Only one word strcpy(sCurWord,m_pWordSeg[nIndex][k].sWord);//Record current word i--; } else if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop { strcpy(m_pWordSeg[nIndex][k].sWord,sAtom); //Save them in the result segmentation strcpy(sCurWord,sAtom);//Record current word } else {//It is a num if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-'&&m_pWordSeg[nIndex][k].sWord[1]==0)//The delimiter "--" { nPOS=30464;//'w'*256;Set the POS with 'w' i--;//Not num, back to previous word } else {//Adding time suffix char sInitChar[3]; unsigned int nCharIndex=0;//Get first char sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; if(sInitChar[nCharIndex]<0) { nCharIndex+=1; sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; } nCharIndex+=1; sInitChar[nCharIndex]='\0'; if(k>0&&(abs(m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(m_pWordSeg[nIndex][k-1].nHandle)==29696)&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex)) {//3-4月 //27904='m'*256 //Split the sInitChar from the original word strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex); m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue; m_pWordSeg[nIndex][k+1].nHandle=27904; m_pWordSeg[nIndex][k].sWord[nCharIndex]=0; m_pWordSeg[nIndex][k].dValue=0; m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256; m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle,m_pWordSeg[nIndex][k].sWord); nStartVertex+=1; k+=1; } nLen=strlen(m_pWordSeg[nIndex][k].sWord); if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0) {//2001年 strcat(m_pWordSeg[nIndex][k].sWord,sAtom); strcpy(sCurWord,"未##时"); nPOS=-29696;//'t'*256;//Set the POS with 'm' } else if(strcmp(sAtom,"年")==0) { if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&& {//1998年, strcat(m_pWordSeg[nIndex][k].sWord,sAtom); strcpy(sCurWord,"未##时"); nPOS=-29696;//Set the POS with 't' } else { strcpy(sCurWord,"未##数"); nPOS=-27904;//Set the POS with 'm' i--;//Can not be a time word } } else { //早晨/t 五点/t if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0) { strcpy(sCurWord,"未##时"); nPOS=-29696;//Set the POS with 't' } else { // if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/') // { /// strcpy(sCurWord,"未##数"); // nPOS=-27904;//'m'*256;Set the POS with 'm' // } // else if(nLen>strlen(sInitChar)) // {//Get rid of . example 1. // if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/') // m_pWordSeg[nIndex][k].sWord[nLen-1]=0; // else // m_pWordSeg[nIndex][k].sWord[nLen-2]=0; // strcpy(sCurWord,"未##数"); // nPOS=-27904;//'m'*256;Set the POS with 'm' // i--; // } //here's bug in it +...... do not del . //2004-3-2 fixed by Wangzhifu // if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/') //2004_06_18 修改,见04_06_18新华网语料bug.txt if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.') { strcpy(sCurWord,"未##数"); nPOS=-27904;//'m'*256;Set the POS with 'm' } else if(nLen>strlen(sInitChar)) {//Get rid of . example 1. //,but if" +......" do not del .//fixed at 2004-3-2 by Wang Zhifu char TempWord[100]; strcpy(TempWord,m_pWordSeg[nIndex][k].sWord); TempWord[nLen-2]=0;//去掉最后一个字符! if(IsNumExist(TempWord)) { if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/') m_pWordSeg[nIndex][k].sWord[nLen-1]=0; else m_pWordSeg[nIndex][k].sWord[nLen-2]=0; strcpy(sCurWord,"未##数"); nPOS=-27904;//'m'*256;Set the POS with 'm' i--; } else { nPOS=-'n'*256-'x';// strcpy(sCurWord,"未##串"); } } } // end of fixed line; i--;//Not num, back to previous word } } fValue=0; nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter } m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS,sCurWord); //Generate optimum segmentation graph according the segmentation result i++;//Skip to next atom k++;//Accept next word } m_pWordSeg[nIndex][k].sWord[0]=0; m_pWordSeg[nIndex][k].nHandle=-1;//Set ending return true; }
bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq) { //Gernerate the word net from the sLine, that's list all the possible word unsigned int i=0,j,nLen=strlen(sSentence); char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH]; int nWordIndex=0,nHandleTemp,k,nPOS; int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount; double dValue=0; m_nAtomCount=0; m_segGraph.SetEmpty();//Set segmentation graph empty AtomSegment(sSentence); //Atomic Segmentation for(i=0;i<m_nAtomCount;i++)//Init the cost array { if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char { if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum value else m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value } else//Other atom { strcpy(sWord,m_sAtom[i]);//init the word dValue=MAX_FREQUENCE; switch(m_nAtomPOS[i]) { case CT_INDEX: case CT_NUM: nPOS=-27904;//'m'*256 strcpy(sWord,"未##数"); dValue=0; break; case CT_DELIMITER: nPOS=30464;//'w'*256; break; case CT_LETTER: nPOS=-'n'*256-'x';// dValue=0; strcpy(sWord,"未##串"); break; case CT_SINGLE://12021-2129-3121 if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i])) { nPOS=-27904;//'m'*256 strcpy(sWord,"未##数"); } else { nPOS=-'n'*256-'x';// strcpy(sWord,"未##串"); } dValue=0; break; default: nPOS=m_nAtomPOS[i];//'?'*256; break; } if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with minimum else m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with minimum } } i=0; while(i<m_nAtomCount)//All the word { strcpy(sWord,m_sAtom[i]);//Get the current atom j=i+1; if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份 j+=1; while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp)) {//Add a condition to control the end of string //retrieve the dictionary with the word if(strcmp(sWordMatch,sWord)==0)//find the current word { nTotalFreq=0; dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq); for(k=0;k<nMatchCount;k++)//Add the frequency { nTotalFreq+=nMatchFreq[k]; } //Adding a rule to exclude some words to be formed. if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0)) {//1年内、1999年末 if(CC_Find("末内中底前间初",sWord+2)) break; } if(nMatchCount==1)//The possible word has only one POS, store it { if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]); else m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord); } else { if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0); else m_segGraph.SetElement(i,j,nTotalFreq,0,sWord); } } strcat(sWord,m_sAtom[j++]); } i+=1;//Start from i++; } return true; }
//Generate Word according the segmentation route bool CSegment::GenerateWord(int **nSegRoute, int nIndex) { unsigned int i=0,k=0; int j,nStartVertex,nEndVertex,nPOS; char sAtom[WORD_MAXLENGTH],sNumCandidate[100]; ELEMENT_TYPE fValue; while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1]) { nStartVertex=nSegRoute[nIndex][i]; j=nStartVertex;//Set the start vertex nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex nPOS=0; m_graphSeg.m_segGraph.GetElementInfo(nStartVertex,nEndVertex,fValue,nPOS); sAtom[0]=0; while(j < nEndVertex) { //Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending strcpy(sNumCandidate,sAtom); while(sAtom[0]!=0 && (IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate))) { //Merge all seperate continue num into one number //sAtom[0]!=0: add in 2002-5-9 strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate); //Save them in the result segmentation i++;//Skip to next atom now sAtom[0]=0; while(j<nSegRoute[nIndex][i+1]) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } strcat(sNumCandidate,sAtom); } if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop { strcpy(m_pWordSeg[nIndex][k].sWord,sAtom); //Save them in the result segmentation } else {//It is a num if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-')//The delimiter "--" { nPOS=30464;//'w'*256;Set the POS with 'w' i--;//Not num, back to previous word } else {//Adding time suffix char sInitChar[3]; unsigned int nCharIndex=0;//Get first char sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; if(sInitChar[nCharIndex]<0) { nCharIndex+=1; sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; } nCharIndex+=1; sInitChar[nCharIndex]='\0'; if(k>0&&m_pWordSeg[nIndex][k-1].nHandle==27904&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex)) {//3-4月 //27904='m'*256 //Split the sInitChar from the original word strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex); m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue; m_pWordSeg[nIndex][k+1].nHandle=27904; m_pWordSeg[nIndex][k].sWord[nCharIndex]=0; m_pWordSeg[nIndex][k].dValue=0; m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256; m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle); nStartVertex+=1; k+=1; } unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord); if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0) {//2001年 strcat(m_pWordSeg[nIndex][k].sWord,sAtom); nPOS=29696;//'t'*256;//Set the POS with 'm' } else if(strcmp(sAtom,"年")==0) { if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&& {//1998年, strcat(m_pWordSeg[nIndex][k++].sWord,sAtom); nPOS='t'*256;//Set the POS with 'm' } else i--;//Can not be a time word } else { //早晨/t 五点/t if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0) { nPOS='t'*256;//Set the POS with 'm' } else { if(m_pWordSeg[nIndex][k].sWord[0]!='.') nPOS='m'*256;//Set the POS with 'm' if(nLen>1&&m_pWordSeg[nIndex][k].sWord[nLen-1]=='.') {//Get rid of . example 1. m_pWordSeg[nIndex][k].sWord[nLen-1]=0; i--; } } i--;//Not num, back to previous word } } fValue=0; nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter } m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS); //Generate optimum segmentation graph according the segmentation result i++;//Skip to next atom k++;//Accept next word } m_pWordSeg[nIndex][k].sWord[0]=0; m_pWordSeg[nIndex][k].nHandle=-1;//Set ending return true; }