//Generate Word according the segmentation route bool CSegment::GenerateWord(int **nSegRoute, int nIndex) { unsigned int i=0,k=0; int j,nStartVertex,nEndVertex,nPOS; char sAtom[WORD_MAXLENGTH],sNumCandidate[100],sCurWord[100]; ELEMENT_TYPE fValue; while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1]) { nStartVertex=nSegRoute[nIndex][i]; j=nStartVertex;//Set the start vertex nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex nPOS=0; m_graphSeg.m_segGraph.GetElement(nStartVertex,nEndVertex,&fValue,&nPOS); sAtom[0]=0; while(j<nEndVertex) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending strcpy(sNumCandidate,sAtom); while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate))) {//Merge all seperate continue num into one number //sAtom[0]!=0: add in 2002-5-9 strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate); //Save them in the result segmentation i++;//Skip to next atom now sAtom[0]=0; while(j<nSegRoute[nIndex][i+1]) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } //add a judge for out of memery, //del, not nessasseray for ordinary text file,becasuse no word's lenth can larger than100 //so remain the same // if(strlen(sNumCandidate)+strlen(sAtom)<100) strcat(sNumCandidate,sAtom); // else // break; } unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord); if(nLen==4&&CC_Find("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",m_pWordSeg[nIndex][k].sWord[0])) {//Only one word strcpy(sCurWord,m_pWordSeg[nIndex][k].sWord);//Record current word i--; } else if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop { strcpy(m_pWordSeg[nIndex][k].sWord,sAtom); //Save them in the result segmentation strcpy(sCurWord,sAtom);//Record current word } else {//It is a num if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-'&&m_pWordSeg[nIndex][k].sWord[1]==0)//The delimiter "--" { nPOS=30464;//'w'*256;Set the POS with 'w' i--;//Not num, back to previous word } else {//Adding time suffix char sInitChar[3]; unsigned int nCharIndex=0;//Get first char sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; if(sInitChar[nCharIndex]<0) { nCharIndex+=1; sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; } nCharIndex+=1; sInitChar[nCharIndex]='\0'; if(k>0&&(abs(m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(m_pWordSeg[nIndex][k-1].nHandle)==29696)&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex)) {//3-4月 //27904='m'*256 //Split the sInitChar from the original word strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex); m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue; m_pWordSeg[nIndex][k+1].nHandle=27904; m_pWordSeg[nIndex][k].sWord[nCharIndex]=0; m_pWordSeg[nIndex][k].dValue=0; m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256; m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle,m_pWordSeg[nIndex][k].sWord); nStartVertex+=1; k+=1; } nLen=strlen(m_pWordSeg[nIndex][k].sWord); if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0) {//2001年 strcat(m_pWordSeg[nIndex][k].sWord,sAtom); strcpy(sCurWord,"未##时"); nPOS=-29696;//'t'*256;//Set the POS with 'm' } else if(strcmp(sAtom,"年")==0) { if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&& {//1998年, strcat(m_pWordSeg[nIndex][k].sWord,sAtom); strcpy(sCurWord,"未##时"); nPOS=-29696;//Set the POS with 't' } else { strcpy(sCurWord,"未##数"); nPOS=-27904;//Set the POS with 'm' i--;//Can not be a time word } } else { //早晨/t 五点/t if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0) { strcpy(sCurWord,"未##时"); nPOS=-29696;//Set the POS with 't' } else { // if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/') // { /// strcpy(sCurWord,"未##数"); // nPOS=-27904;//'m'*256;Set the POS with 'm' // } // else if(nLen>strlen(sInitChar)) // {//Get rid of . example 1. // if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/') // m_pWordSeg[nIndex][k].sWord[nLen-1]=0; // else // m_pWordSeg[nIndex][k].sWord[nLen-2]=0; // strcpy(sCurWord,"未##数"); // nPOS=-27904;//'m'*256;Set the POS with 'm' // i--; // } //here's bug in it +...... do not del . //2004-3-2 fixed by Wangzhifu // if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/') //2004_06_18 修改,见04_06_18新华网语料bug.txt if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.') { strcpy(sCurWord,"未##数"); nPOS=-27904;//'m'*256;Set the POS with 'm' } else if(nLen>strlen(sInitChar)) {//Get rid of . example 1. //,but if" +......" do not del .//fixed at 2004-3-2 by Wang Zhifu char TempWord[100]; strcpy(TempWord,m_pWordSeg[nIndex][k].sWord); TempWord[nLen-2]=0;//去掉最后一个字符! if(IsNumExist(TempWord)) { if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/') m_pWordSeg[nIndex][k].sWord[nLen-1]=0; else m_pWordSeg[nIndex][k].sWord[nLen-2]=0; strcpy(sCurWord,"未##数"); nPOS=-27904;//'m'*256;Set the POS with 'm' i--; } else { nPOS=-'n'*256-'x';// strcpy(sCurWord,"未##串"); } } } // end of fixed line; i--;//Not num, back to previous word } } fValue=0; nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter } m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS,sCurWord); //Generate optimum segmentation graph according the segmentation result i++;//Skip to next atom k++;//Accept next word } m_pWordSeg[nIndex][k].sWord[0]=0; m_pWordSeg[nIndex][k].nHandle=-1;//Set ending return true; }
//Generate Word according the segmentation route bool CSegment::GenerateWord(int **nSegRoute, int nIndex) { unsigned int i=0,k=0; int j,nStartVertex,nEndVertex,nPOS; char sAtom[WORD_MAXLENGTH],sNumCandidate[100]; ELEMENT_TYPE fValue; while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1]) { nStartVertex=nSegRoute[nIndex][i]; j=nStartVertex;//Set the start vertex nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex nPOS=0; m_graphSeg.m_segGraph.GetElementInfo(nStartVertex,nEndVertex,fValue,nPOS); sAtom[0]=0; while(j < nEndVertex) { //Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending strcpy(sNumCandidate,sAtom); while(sAtom[0]!=0 && (IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate))) { //Merge all seperate continue num into one number //sAtom[0]!=0: add in 2002-5-9 strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate); //Save them in the result segmentation i++;//Skip to next atom now sAtom[0]=0; while(j<nSegRoute[nIndex][i+1]) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } strcat(sNumCandidate,sAtom); } if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop { strcpy(m_pWordSeg[nIndex][k].sWord,sAtom); //Save them in the result segmentation } else {//It is a num if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-')//The delimiter "--" { nPOS=30464;//'w'*256;Set the POS with 'w' i--;//Not num, back to previous word } else {//Adding time suffix char sInitChar[3]; unsigned int nCharIndex=0;//Get first char sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; if(sInitChar[nCharIndex]<0) { nCharIndex+=1; sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; } nCharIndex+=1; sInitChar[nCharIndex]='\0'; if(k>0&&m_pWordSeg[nIndex][k-1].nHandle==27904&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex)) {//3-4月 //27904='m'*256 //Split the sInitChar from the original word strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex); m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue; m_pWordSeg[nIndex][k+1].nHandle=27904; m_pWordSeg[nIndex][k].sWord[nCharIndex]=0; m_pWordSeg[nIndex][k].dValue=0; m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256; m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle); nStartVertex+=1; k+=1; } unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord); if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0) {//2001年 strcat(m_pWordSeg[nIndex][k].sWord,sAtom); nPOS=29696;//'t'*256;//Set the POS with 'm' } else if(strcmp(sAtom,"年")==0) { if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&& {//1998年, strcat(m_pWordSeg[nIndex][k++].sWord,sAtom); nPOS='t'*256;//Set the POS with 'm' } else i--;//Can not be a time word } else { //早晨/t 五点/t if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0) { nPOS='t'*256;//Set the POS with 'm' } else { if(m_pWordSeg[nIndex][k].sWord[0]!='.') nPOS='m'*256;//Set the POS with 'm' if(nLen>1&&m_pWordSeg[nIndex][k].sWord[nLen-1]=='.') {//Get rid of . example 1. m_pWordSeg[nIndex][k].sWord[nLen-1]=0; i--; } } i--;//Not num, back to previous word } } fValue=0; nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter } m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS); //Generate optimum segmentation graph according the segmentation result i++;//Skip to next atom k++;//Accept next word } m_pWordSeg[nIndex][k].sWord[0]=0; m_pWordSeg[nIndex][k].nHandle=-1;//Set ending return true; }