bool CSegment::IsYearTime(char *sNum) { //Judge whether the sNum is a num genearating year unsigned int nLen=strlen(sNum); char sTemp[3]; strncpy(sTemp,sNum,2); sTemp[2]=0; if(IsAllSingleByte((unsigned char *)sNum)&&(nLen>=3||nLen==2&&sNum[0]>'4'))//1992年, 90年 return true; if(IsAllNum((unsigned char *)sNum)&&(nLen>=6||nLen==4&&CC_Find("56789",sTemp))) return true; if(GetCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖",sNum)==(int)nLen/2&&nLen>=3) return true; if(nLen==8&&GetCharCount("千仟零○",sNum)==2)//二仟零二年 return true; return false; }
// Walk m_nBestTag (starting at index 1, stopping at the first tag that is not
// greater than -1) and record candidate unknown-word spans in
// m_nUnknownWords / m_dWordsPossibility. Always returns true.
//   dictCore  - dictionary consulted to reject single-word spans it already contains
//   transDict - dictionary handed to ComputePossibility for scoring a span
bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
{
    int nStart=1,nEnd=1;
    int nCur=1;
    while(m_nBestTag[nCur]>-1)
    {
        const int nTag=m_nBestTag[nCur];
        // Tags 1/11/21 open a span that may run through tag, tag+1, tag+2 and 30.
        const bool bOpensLong=(nTag==1||nTag==11||nTag==21);
        // Tags 2/12/22 open a span that may run through tag, tag+1 and 30.
        const bool bOpensShort=(nTag==2||nTag==12||nTag==22);

        if(bOpensLong||bOpensShort)
        {
            nStart=nCur;
            nEnd=nStart+1;
            while(m_nBestTag[nEnd]==nTag)
                nEnd++;
            while(m_nBestTag[nEnd]==nTag+1)
                nEnd++;
            if(bOpensLong)
            {
                while(m_nBestTag[nEnd]==nTag+2)
                    nEnd++;
            }
            while(m_nBestTag[nEnd]==30)
                nEnd++;
        }

        // Decide whether [nStart,nEnd) is worth recording. When no span was
        // opened in this pass, nStart/nEnd still hold their previous values.
        bool bAccept=false;
        if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart]))
        {
            const int nSpan=nEnd-nStart;
            if(nSpan>2)
            {
                bAccept=true;
            }
            else if(nSpan==2)
            {
                // Two words: the second must not carry tag 30 unless the first
                // word is longer than 2 bytes.
                bAccept=(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2);
            }
            else
            {
                // One word: it must be longer than 2 bytes and absent from dictCore.
                bAccept=(strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1));
            }
        }

        if(bAccept)
        {
            m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
            m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
            m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
            nStart=nEnd;//Mark the span as consumed
        }

        // Jump past the span if there is one ahead, otherwise advance by one tag.
        nCur=(nCur<nEnd)?nEnd:nCur+1;
    }
    return true;
}
bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq) { //Gernerate the word net from the sLine, that's list all the possible word unsigned int i=0,j,nLen=strlen(sSentence); char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH]; int nWordIndex=0,nHandleTemp,k,nPOS; int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount; double dValue=0; m_nAtomCount=0; m_segGraph.SetEmpty();//Set segmentation graph empty AtomSegment(sSentence); //Atomic Segmentation for(i=0;i<m_nAtomCount;i++)//Init the cost array { if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char { if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum value else m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value } else//Other atom { strcpy(sWord,m_sAtom[i]);//init the word dValue=MAX_FREQUENCE; switch(m_nAtomPOS[i]) { case CT_INDEX: case CT_NUM: nPOS=-27904;//'m'*256 strcpy(sWord,"未##数"); dValue=0; break; case CT_DELIMITER: nPOS=30464;//'w'*256; break; case CT_LETTER: nPOS=-'n'*256-'x';// dValue=0; strcpy(sWord,"未##串"); break; case CT_SINGLE://12021-2129-3121 if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i])) { nPOS=-27904;//'m'*256 strcpy(sWord,"未##数"); } else { nPOS=-'n'*256-'x';// strcpy(sWord,"未##串"); } dValue=0; break; default: nPOS=m_nAtomPOS[i];//'?'*256; break; } if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with minimum else m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with minimum } } i=0; while(i<m_nAtomCount)//All the word { strcpy(sWord,m_sAtom[i]);//Get the current atom j=i+1; if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份 j+=1; while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp)) {//Add a condition to control the end of string //retrieve the dictionary with the word 
if(strcmp(sWordMatch,sWord)==0)//find the current word { nTotalFreq=0; dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq); for(k=0;k<nMatchCount;k++)//Add the frequency { nTotalFreq+=nMatchFreq[k]; } //Adding a rule to exclude some words to be formed. if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0)) {//1年内、1999年末 if(CC_Find("末内中底前间初",sWord+2)) break; } if(nMatchCount==1)//The possible word has only one POS, store it { if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]); else m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord); } else { if(!bOriginalFreq)//Not original frequency m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0); else m_segGraph.SetElement(i,j,nTotalFreq,0,sWord); } } strcat(sWord,m_sAtom[j++]); } i+=1;//Start from i++; } return true; }
//Generate Word according the segmentation route bool CSegment::GenerateWord(int **nSegRoute, int nIndex) { unsigned int i=0,k=0; int j,nStartVertex,nEndVertex,nPOS; char sAtom[WORD_MAXLENGTH],sNumCandidate[100],sCurWord[100]; ELEMENT_TYPE fValue; while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1]) { nStartVertex=nSegRoute[nIndex][i]; j=nStartVertex;//Set the start vertex nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex nPOS=0; m_graphSeg.m_segGraph.GetElement(nStartVertex,nEndVertex,&fValue,&nPOS); sAtom[0]=0; while(j<nEndVertex) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending strcpy(sNumCandidate,sAtom); while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate))) {//Merge all seperate continue num into one number //sAtom[0]!=0: add in 2002-5-9 strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate); //Save them in the result segmentation i++;//Skip to next atom now sAtom[0]=0; while(j<nSegRoute[nIndex][i+1]) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } //add a judge for out of memery, //del, not nessasseray for ordinary text file,becasuse no word's lenth can larger than100 //so remain the same // if(strlen(sNumCandidate)+strlen(sAtom)<100) strcat(sNumCandidate,sAtom); // else // break; } unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord); if(nLen==4&&CC_Find("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",m_pWordSeg[nIndex][k].sWord[0])) {//Only one word strcpy(sCurWord,m_pWordSeg[nIndex][k].sWord);//Record current word i--; } else if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop { strcpy(m_pWordSeg[nIndex][k].sWord,sAtom); //Save them in the result segmentation strcpy(sCurWord,sAtom);//Record current word } else {//It is a num 
if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-'&&m_pWordSeg[nIndex][k].sWord[1]==0)//The delimiter "--" { nPOS=30464;//'w'*256;Set the POS with 'w' i--;//Not num, back to previous word } else {//Adding time suffix char sInitChar[3]; unsigned int nCharIndex=0;//Get first char sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; if(sInitChar[nCharIndex]<0) { nCharIndex+=1; sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; } nCharIndex+=1; sInitChar[nCharIndex]='\0'; if(k>0&&(abs(m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(m_pWordSeg[nIndex][k-1].nHandle)==29696)&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex)) {//3-4月 //27904='m'*256 //Split the sInitChar from the original word strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex); m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue; m_pWordSeg[nIndex][k+1].nHandle=27904; m_pWordSeg[nIndex][k].sWord[nCharIndex]=0; m_pWordSeg[nIndex][k].dValue=0; m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256; m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle,m_pWordSeg[nIndex][k].sWord); nStartVertex+=1; k+=1; } nLen=strlen(m_pWordSeg[nIndex][k].sWord); if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0) {//2001年 strcat(m_pWordSeg[nIndex][k].sWord,sAtom); strcpy(sCurWord,"未##时"); nPOS=-29696;//'t'*256;//Set the POS with 'm' } else if(strcmp(sAtom,"年")==0) { if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&& {//1998年, strcat(m_pWordSeg[nIndex][k].sWord,sAtom); strcpy(sCurWord,"未##时"); nPOS=-29696;//Set the POS with 't' } else { strcpy(sCurWord,"未##数"); nPOS=-27904;//Set the POS with 'm' i--;//Can not be a time word } } else { //早晨/t 五点/t if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0) { strcpy(sCurWord,"未##时"); 
nPOS=-29696;//Set the POS with 't' } else { // if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/') // { /// strcpy(sCurWord,"未##数"); // nPOS=-27904;//'m'*256;Set the POS with 'm' // } // else if(nLen>strlen(sInitChar)) // {//Get rid of . example 1. // if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/') // m_pWordSeg[nIndex][k].sWord[nLen-1]=0; // else // m_pWordSeg[nIndex][k].sWord[nLen-2]=0; // strcpy(sCurWord,"未##数"); // nPOS=-27904;//'m'*256;Set the POS with 'm' // i--; // } //here's bug in it +...... do not del . //2004-3-2 fixed by Wangzhifu // if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/') //2004_06_18 修改,见04_06_18新华网语料bug.txt if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.') { strcpy(sCurWord,"未##数"); nPOS=-27904;//'m'*256;Set the POS with 'm' } else if(nLen>strlen(sInitChar)) {//Get rid of . example 1. //,but if" +......" do not del .//fixed at 2004-3-2 by Wang Zhifu char TempWord[100]; strcpy(TempWord,m_pWordSeg[nIndex][k].sWord); TempWord[nLen-2]=0;//去掉最后一个字符! 
if(IsNumExist(TempWord)) { if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/') m_pWordSeg[nIndex][k].sWord[nLen-1]=0; else m_pWordSeg[nIndex][k].sWord[nLen-2]=0; strcpy(sCurWord,"未##数"); nPOS=-27904;//'m'*256;Set the POS with 'm' i--; } else { nPOS=-'n'*256-'x';// strcpy(sCurWord,"未##串"); } } } // end of fixed line; i--;//Not num, back to previous word } } fValue=0; nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter } m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS,sCurWord); //Generate optimum segmentation graph according the segmentation result i++;//Skip to next atom k++;//Accept next word } m_pWordSeg[nIndex][k].sWord[0]=0; m_pWordSeg[nIndex][k].nHandle=-1;//Set ending return true; }
//Adjust the result with some rules bool CResult::Adjust(PWORD_RESULT pItem,PWORD_RESULT pItemRet) { int i=0,j=0; unsigned int nLen; char sSurName[10],sSurName2[10],sGivenName[10]; bool bProcessed=false;//Have been processed while(pItem[i].sWord[0]!=0) { nLen=strlen(pItem[i].sWord); bProcessed=false; //Rule1: adjust person name if(pItem[i].nHandle==28274&&ChineseNameSplit(pItem[i].sWord,sSurName,sSurName2,sGivenName,m_uPerson.m_dict)&&strcmp(pItem[i].sWord,"叶利钦")!=0)//'nr' {//Divide name into surname and given name if(sSurName[0]) { strcpy(pItemRet[j].sWord,sSurName); pItemRet[j++].nHandle=28274; } if(sSurName2[0]) { strcpy(pItemRet[j].sWord,sSurName2); pItemRet[j++].nHandle=28274; } if(sGivenName[0]) { strcpy(pItemRet[j].sWord,sGivenName); pItemRet[j++].nHandle=28274; } bProcessed=true; } //Rule2 for overlap words ABB 一段段、一片片 else if(pItem[i].nHandle==27904&&strlen(pItem[i+1].sWord)==2&&strcmp(pItem[i+1].sWord,pItem[i+2].sWord)==0) {//(pItem[i+1].nHandle/256=='q'||pItem[i+1].nHandle/256=='a')&& strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); strcat(pItemRet[j].sWord,pItem[i+2].sWord); pItemRet[j].nHandle=27904; j+=1; i+=2; bProcessed=true; } //Rule3 for overlap words AA else if(nLen==2&&strcmp(pItem[i].sWord,pItem[i+1].sWord)==0) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); //24832=='a'*256 pItemRet[j].nHandle=24832;//a if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256 { pItemRet[j].nHandle=30208; } if(pItem[i].nHandle/256=='n'||pItem[i+1].nHandle/256=='n')//30208='v'8256 { pItemRet[j].nHandle='n'*256; } i+=1; if(strlen(pItem[i+1].sWord)==2) {//AAB:洗/洗/脸、蒙蒙亮 if((pItemRet[j].nHandle==30208&&pItem[i+1].nHandle/256=='n')|| (pItemRet[j].nHandle==24832&&pItem[i+1].nHandle/256=='a') ) { strcat(pItemRet[j].sWord,pItem[i+1].sWord); i+=1; } } j+=1; bProcessed=true; } //Rule 4: AAB 洗/洗澡 else 
if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&(pItem[i].nHandle/256=='v'||pItem[i].nHandle==24832))//v,a { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); //24832=='a'*256 pItemRet[j].nHandle=24832;//'a' if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256 { pItemRet[j].nHandle=30208; } i+=1; j+=1; bProcessed=true; } else if(pItem[i].nHandle/256=='u'&&pItem[i].nHandle%256)//uj,ud,uv,uz,ul,ug->u pItem[i].nHandle='u'*256; else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&strncmp(pItem[i+1].sWord+2,pItem[i+2].sWord,2)==0) {//AABB 朴朴素素 枝枝叶叶 strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); strcat(pItemRet[j].sWord,pItem[i+2].sWord); pItemRet[j].nHandle=pItem[i+1].nHandle; i+=2; j+=1; bProcessed=true; } else if(pItem[i].nHandle==28275)//PostFix { if(m_uPlace.m_dict.IsExist(pItem[i+1].sWord,4)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); pItemRet[j].nHandle=28275; i+=1; j+=1; bProcessed=true; } else if(strlen(pItem[i+1].sWord)==2&&CC_Find("队",pItem[i+1].sWord)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); pItemRet[j].nHandle=28276; i+=1; j+=1; bProcessed=true; } else if(strlen(pItem[i+1].sWord)==2&&CC_Find("语文字杯",pItem[i+1].sWord)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); pItemRet[j].nHandle=28282; i+=1; j+=1; bProcessed=true; } else if(strlen(pItem[i+1].sWord)==2&&CC_Find("裔",pItem[i+1].sWord)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); pItemRet[j].nHandle=28160; i+=1; j+=1; bProcessed=true; } } else if(pItem[i].nHandle==30208||pItem[i].nHandle==28160)//v { if(strlen(pItem[i+1].sWord)==2&&CC_Find("员",pItem[i+1].sWord)) { strcpy(pItemRet[j].sWord,pItem[i].sWord); strcat(pItemRet[j].sWord,pItem[i+1].sWord); 
pItemRet[j].nHandle=28160; i+=1; j+=1; bProcessed=true; } } else if(pItem[i].nHandle==28280) {//www/nx ./w sina/nx; EIM/nx -601/m strcpy(pItemRet[j].sWord,pItem[i].sWord); pItemRet[j].nHandle=28280; while(pItem[i+1].nHandle==28280||strstr("..",pItem[i+1].sWord)||(pItem[i+1].nHandle==27904&&IsAllNum((unsigned char *)pItem[i+1].sWord))) { strcat(pItemRet[j].sWord,pItem[i+1].sWord); i+=1; } j+=1; bProcessed=true; } if(!bProcessed) {//If not processed,that's mean: not need to adjust; //just copy to the final result strcpy(pItemRet[j].sWord,pItem[i].sWord); pItemRet[j++].nHandle=pItem[i].nHandle; } i++; } pItemRet[j].sWord[0]=0;//Set ending return true; }
//Generate Word according the segmentation route bool CSegment::GenerateWord(int **nSegRoute, int nIndex) { unsigned int i=0,k=0; int j,nStartVertex,nEndVertex,nPOS; char sAtom[WORD_MAXLENGTH],sNumCandidate[100]; ELEMENT_TYPE fValue; while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1]) { nStartVertex=nSegRoute[nIndex][i]; j=nStartVertex;//Set the start vertex nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex nPOS=0; m_graphSeg.m_segGraph.GetElementInfo(nStartVertex,nEndVertex,fValue,nPOS); sAtom[0]=0; while(j < nEndVertex) { //Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending strcpy(sNumCandidate,sAtom); while(sAtom[0]!=0 && (IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate))) { //Merge all seperate continue num into one number //sAtom[0]!=0: add in 2002-5-9 strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate); //Save them in the result segmentation i++;//Skip to next atom now sAtom[0]=0; while(j<nSegRoute[nIndex][i+1]) {//Generate the word according the segmentation route strcat(sAtom,m_graphSeg.m_sAtom[j]); j++; } strcat(sNumCandidate,sAtom); } if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop { strcpy(m_pWordSeg[nIndex][k].sWord,sAtom); //Save them in the result segmentation } else {//It is a num if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-')//The delimiter "--" { nPOS=30464;//'w'*256;Set the POS with 'w' i--;//Not num, back to previous word } else {//Adding time suffix char sInitChar[3]; unsigned int nCharIndex=0;//Get first char sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; if(sInitChar[nCharIndex]<0) { nCharIndex+=1; sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex]; } nCharIndex+=1; sInitChar[nCharIndex]='\0'; 
if(k>0&&m_pWordSeg[nIndex][k-1].nHandle==27904&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex)) {//3-4月 //27904='m'*256 //Split the sInitChar from the original word strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex); m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue; m_pWordSeg[nIndex][k+1].nHandle=27904; m_pWordSeg[nIndex][k].sWord[nCharIndex]=0; m_pWordSeg[nIndex][k].dValue=0; m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256; m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle); nStartVertex+=1; k+=1; } unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord); if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0) {//2001年 strcat(m_pWordSeg[nIndex][k].sWord,sAtom); nPOS=29696;//'t'*256;//Set the POS with 'm' } else if(strcmp(sAtom,"年")==0) { if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&& {//1998年, strcat(m_pWordSeg[nIndex][k++].sWord,sAtom); nPOS='t'*256;//Set the POS with 'm' } else i--;//Can not be a time word } else { //早晨/t 五点/t if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0) { nPOS='t'*256;//Set the POS with 'm' } else { if(m_pWordSeg[nIndex][k].sWord[0]!='.') nPOS='m'*256;//Set the POS with 'm' if(nLen>1&&m_pWordSeg[nIndex][k].sWord[nLen-1]=='.') {//Get rid of . example 1. 
m_pWordSeg[nIndex][k].sWord[nLen-1]=0; i--; } } i--;//Not num, back to previous word } } fValue=0; nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter } m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS); //Generate optimum segmentation graph according the segmentation result i++;//Skip to next atom k++;//Accept next word } m_pWordSeg[nIndex][k].sWord[0]=0; m_pWordSeg[nIndex][k].nHandle=-1;//Set ending return true; }