コード例 #1
0
ファイル: Utility.cpp プロジェクト: kiichi7/Search-Engine
/*********************************************************************
 *
 *  Func Name  : GetCharCount
 *
 *  Description: Get the count of char which is in sWord and in sCharSet
 *
 *  Parameters : sWord: the word
 * 
 *  Returns    : COUNT
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-5-21
                2. corrected by Wang Zhifu 2004-2-18
 *********************************************************************/
int GetCharCount(char *sCharSet,char *sWord)
{
  unsigned int  k=0; 
  char tchar[3];
  int nCount=0;
  tchar[2]=0;
  while(k < strlen(sWord))
  {
     tchar[0]=sWord[k];
 	 tchar[1]=0;
	 if(sWord[k]<0)
	 {
		 tchar[1]=sWord[k+1];
		 k+=1;
	 }
	 k+=1;

//   这里有改动,原语句为
//	 if((tchar[0]<0&&CC_Find(sCharSet, tchar))||strchr(sCharSet,tchar[0]))
//   改为:因为后面部分经常为true!!!应该加入限制条件!
//   corrected by Wang Zhifu
	 if(((tchar[0]<0&&(CC_Find(sCharSet, tchar)!=NULL)))||(tchar[0]>0&&(strchr(sCharSet,tchar[0])!=NULL)))
          nCount++;

  }
  return nCount;
}
コード例 #2
0
ファイル: Utility.cpp プロジェクト: ProgramLeague/FreeICTCLAS
bool PostfixSplit(char *sWord, char *sWordRet, char *sPostfix)
{
	char sSinglePostfix[]=POSTFIX_SINGLE;
	char sMultiPostfix[][9]=POSTFIX_MUTIPLE;
	unsigned int nPostfixLen=0,nWordLen=strlen(sWord);
	int i=0;

	while(sMultiPostfix[i][0]!=0&&strncmp(sWord+nWordLen-strlen(sMultiPostfix[i]),sMultiPostfix[i],strlen(sMultiPostfix[i]))!=0)
	{//Try to get the postfix of an address
		i++;
	}
	strcpy(sPostfix,sMultiPostfix[i]);
	nPostfixLen=strlen(sMultiPostfix[i]);//Get the length of place postfix

	if(nPostfixLen==0)
	{
		sPostfix[2]=0;
		strncpy(sPostfix,sWord+nWordLen-2,2);
		if(CC_Find(sSinglePostfix,sPostfix))
			nPostfixLen=2;
	}
	
	strncpy(sWordRet,sWord,nWordLen-nPostfixLen);
	sWordRet[nWordLen-nPostfixLen]=0;//Get the place name which have erasing the postfix
	sPostfix[nPostfixLen]=0;
    return true;
}
コード例 #3
0
/*********************************************************************
 *
 *  Func Name  : GetMaxMatch
 *
 *  Description: Get the max match to the word
 *              
 *
 *  Parameters : nHandle: the only handle which will be attached to the word

 *  Returns    : success or fail
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-21
 *********************************************************************/
bool CDictionary::GetMaxMatch(char *sWord, char *sWordRet,int *npHandleRet)
{
   char sWordGet[WORD_MAXLENGTH-2],sFirstChar[3];
   int nPos,nFoundPos,nTemp;
   PWORD_CHAIN pCur;
   *npHandleRet=-1;
   if(!PreProcessing(sWord, &nPos,sWordGet))
	   return false;
   if (nPos < 0) return false;
   sWordRet[0]=0;
   strncpy(sFirstChar,sWord,strlen(sWord)-strlen(sWordGet));//Get the first char
   sFirstChar[strlen(sWord)-strlen(sWordGet)]=0;//Set the end flag
   FindInOriginalTable(nPos,sWordGet,-1,&nFoundPos);
   nTemp=nFoundPos;//Check its previous position
   if(nFoundPos==-1)
		nTemp=0;
   while(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)!=m_IndexTable[nPos].pWordItemHead[nTemp].sWord)
   {//Get the next
	   nTemp+=1;
   }
   if(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)==m_IndexTable[nPos].pWordItemHead[nTemp].sWord)
   {
	   strcpy(sWordRet,sFirstChar);
	   strcat(sWordRet,m_IndexTable[nPos].pWordItemHead[nTemp].sWord);
	   *npHandleRet=m_IndexTable[nPos].pWordItemHead[nTemp].nHandle;
	   return true;
   }//Cannot get the item and retrieve the modified data if exists
    //Operation in the index table and its items 
   if(m_pModifyTable&&m_pModifyTable[nPos].pWordItemHead)//Exists 
	   pCur=m_pModifyTable[nPos].pWordItemHead;
   else
	   pCur=NULL;
   while(pCur!=NULL&&strcmp(pCur->data.sWord,sWordGet)<=0&&CC_Find(pCur->data.sWord,sWordGet)!=pCur->data.sWord)//
   {
	   pCur=pCur->next;
   }
   if(pCur!=NULL&&CC_Find(pCur->data.sWord,sWordGet)!=pCur->data.sWord)
   {//Get it
	   strcpy(sWordRet,sFirstChar);
	   strcat(sWordRet,pCur->data.sWord);
	   *npHandleRet=pCur->data.nHandle;
	   return true;
   }
   return false;
}
コード例 #4
0
ファイル: Utility.cpp プロジェクト: ProgramLeague/FreeICTCLAS
/*********************************************************************
 *
 *  Func Name  : IsForeign
 *
 *  Description: Decide whether the word is Chinese Num word
 *
 *  Parameters : sWord: the word
 *
 *  Returns    : the index value
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-26
 *********************************************************************/
bool IsAllChineseNum(char *sWord)
{//百分之五点六的人早上八点十八分起床
  unsigned int  k; 
  char tchar[3];
  char ChineseNum[]="零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点";//
  char sPrefix[]="几数第上成";
  for(k = 0; k < strlen(sWord); k+=2)
  {
     strncpy(tchar,sWord+k,2) ;
     tchar[2]='\0';
	 if(strncmp(sWord+k,"分之",4)==0)//百分之五
	 {
		k+=2;
		continue;
	 }

	 if(!CC_Find(ChineseNum, tchar)&&!(k==0&&CC_Find(sPrefix, tchar)))
		 return false;
  }
  return true;
}
コード例 #5
0
ファイル: Segment.cpp プロジェクト: tedzhang/SearchMonkey
bool CSegment::IsYearTime(char *sNum)
{
	//Judge whether the sNum is a num genearating year
	unsigned int nLen=strlen(sNum);
	char sTemp[3];
	strncpy(sTemp,sNum,2);
	sTemp[2]=0;
	if(IsAllSingleByte((unsigned char *)sNum)&&(nLen>=3||nLen==2&&sNum[0]>'4'))//1992年, 90年
		return true;
	if(IsAllNum((unsigned char *)sNum)&&(nLen>=6||nLen==4&&CC_Find("56789",sTemp)))
		return true;
	if(GetCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖",sNum)==(int)nLen/2&&nLen>=3)
		return true;
	if(nLen==8&&GetCharCount("千仟零○",sNum)==2)//二仟零二年
		return true;
	return false;
}
コード例 #6
0
ファイル: Utility.cpp プロジェクト: ProgramLeague/FreeICTCLAS
/*********************************************************************
 *
 *  Func Name  : GetCharCount
 *
 *  Description: Get the count of char which is in sWord and in sCharSet
 *
 *  Parameters : sWord: the word
 * 
 *  Returns    : COUNT
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-5-21
 *********************************************************************/
int GetCharCount(char *sCharSet,char *sWord)
{
  unsigned int  k=0; 
  char tchar[3];
  int nCount=0;
  tchar[2]=0;
  while(k < strlen(sWord))
  {
     tchar[0]=sWord[k];
 	 tchar[1]=0;
	 if(sWord[k]<0)
	 {
		 tchar[1]=sWord[k+1];
		 k+=1;
	 }
	 k+=1;
	 if((tchar[0]<0&&CC_Find(sCharSet, tchar))||strchr(sCharSet,tchar[0]))
          nCount++;
  }
  return nCount;
}
コード例 #7
0
ファイル: Utility.cpp プロジェクト: kiichi7/Search-Engine
/*********************************************************************
 *
 *  Func Name  : IsNumExist
 *
 *  Description: Judge whether there is Num Char in the string
 *              
 *
 *  Parameters : sSentence: the original sentence which includes Chinese or Non-Chinese char
 *    
 *  Returns    : the end of the sub-sentence
 *  Author     : Wangzhifu
 *  History    : 
 *              1.create 2004-3-2
 *********************************************************************/
bool IsNumExist(char *sWord)
{
  unsigned int  k; 
  char tchar[3];
  char ChineseNum[]="零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟";//
  for(k = 0; k < strlen(sWord); k++)
  {
     tchar[0]=sWord[k];
	 if(tchar[0]<0)
	 {
		 tchar[1]=sWord[k++];
		 tchar[2]='\0';
		 if(CC_Find(ChineseNum, tchar))
			 return true;
	 }
	 else
	 {
		 if((tchar[0]>'0'-1)&&(tchar[0]<'9'+1))
			 return true;
	 }
  }	 
  return false;
}
bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool	bOriginalFreq)
{
//Gernerate the word net from the sLine, that's list all the possible word
	unsigned int i=0,j,nLen=strlen(sSentence);
	char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
	int nWordIndex=0,nHandleTemp,k,nPOS;
	int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
	double dValue=0;
	m_nAtomCount=0;
	m_segGraph.SetEmpty();//Set segmentation graph empty

	AtomSegment(sSentence);
	//Atomic Segmentation

    for(i=0;i<m_nAtomCount;i++)//Init the cost array
    {
		if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char
		{
			if(!bOriginalFreq)//Not original frequency
				m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum value
			else
				m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value
		}
		else//Other atom
		{
			strcpy(sWord,m_sAtom[i]);//init the word 
			dValue=MAX_FREQUENCE;
			switch(m_nAtomPOS[i])
			{
			case CT_INDEX:
			case CT_NUM:
				nPOS=-27904;//'m'*256
				strcpy(sWord,"未##数");
				dValue=0;
				break;
			case CT_DELIMITER:
				nPOS=30464;//'w'*256;
				break;
			case CT_LETTER:
				nPOS=-'n'*256-'x';//
				dValue=0;
				strcpy(sWord,"未##串");
				break;
			case CT_SINGLE://12021-2129-3121
				if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
				{
					nPOS=-27904;//'m'*256
					strcpy(sWord,"未##数");
				}
				else
				{
					nPOS=-'n'*256-'x';//
					strcpy(sWord,"未##串");
				}
				dValue=0;
				break;
			default:
				nPOS=m_nAtomPOS[i];//'?'*256;
				break;
			}
			if(!bOriginalFreq)//Not original frequency
				m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with minimum
			else
				m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with minimum
		}
    }
	i=0;
	while(i<m_nAtomCount)//All the word
	{
	  strcpy(sWord,m_sAtom[i]);//Get the current atom
	  j=i+1;
	  if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份
		  j+=1;
	  while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
	  {//Add a condition to control the end of string
	   //retrieve the dictionary with the word
          if(strcmp(sWordMatch,sWord)==0)//find the current word
		  {
			  nTotalFreq=0;
			  dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
			  for(k=0;k<nMatchCount;k++)//Add the frequency
			  {
				 nTotalFreq+=nMatchFreq[k];
			  }
			  //Adding a rule to exclude some words to be formed.
			  if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
			  {//1年内、1999年末
			     if(CC_Find("末内中底前间初",sWord+2))
				     break;
			  }
			  if(nMatchCount==1)//The possible word has only one POS, store it
			  {
				if(!bOriginalFreq)//Not original frequency
					m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]);
				else
					m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
			  }
			  else 
			  {
					if(!bOriginalFreq)//Not original frequency
						m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0);
					else
						m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
			  }
		  }
		  strcat(sWord,m_sAtom[j++]);
	  }
	  i+=1;//Start from i++;
	}
	return true;
}
コード例 #9
0
ファイル: Span.cpp プロジェクト: tedzhang/SearchMonkey
//Guess the POS of No. nIndex word item
bool CSpan::GuessPOS(int nIndex,int *pSubIndex)
{
	int j=0,i=nIndex,nCharType;
	unsigned int nLen;
	switch(m_tagType)
	{
	case TT_NORMAL:
		break;
	case TT_PERSON:
		j=0;
		if(CC_Find("××",m_sWords[nIndex]))
		{
			m_nTags[i][j]=6;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,6)+1);
		}
		else
		{
			m_nTags[i][j]=0;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
			nLen=strlen(m_sWords[nIndex]);
			if(nLen>=4)
			{
				m_nTags[i][j]=0;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
				m_nTags[i][j]=11;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
				m_nTags[i][j]=12;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
				m_nTags[i][j]=13;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
			}
			else if(nLen==2)
			{
				m_nTags[i][j]=0;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
				nCharType=charType((unsigned char *)m_sWords[nIndex]);
				if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
				{
					m_nTags[i][j]=1;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
					m_nTags[i][j]=2;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
					m_nTags[i][j]=3;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
					m_nTags[i][j]=4;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
				}
					m_nTags[i][j]=11;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
					m_nTags[i][j]=12;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
					m_nTags[i][j]=13;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
			}
		}
		break;
	case TT_PLACE:
		j=0;
		m_nTags[i][j]=0;
		m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
		nLen=strlen(m_sWords[nIndex]);
		if(nLen>=4)
		{
			m_nTags[i][j]=11;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
			m_nTags[i][j]=12;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
			m_nTags[i][j]=13;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
		}
		else if(nLen==2)
		{
			m_nTags[i][j]=0;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
			nCharType=charType((unsigned char *)m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				m_nTags[i][j]=1;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
				m_nTags[i][j]=2;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
				m_nTags[i][j]=3;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
				m_nTags[i][j]=4;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
			}
				m_nTags[i][j]=11;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
				m_nTags[i][j]=12;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
				m_nTags[i][j]=13;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
		}
		break;
	case TT_TRANS:
		j=0;
		nLen=strlen(m_sWords[nIndex]);
		
		m_nTags[i][j]=0;
		m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);

		if(!IsAllChinese((unsigned char *)m_sWords[nIndex]))
		{
			if(IsAllLetter((unsigned char *)m_sWords[nIndex]))
			{
				m_nTags[i][j]=1;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
				m_nTags[i][j]=11;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)+1);
/*			}
			if(IsAllNum((unsigned char *)m_sWords[nIndex])||IsAllLetter((unsigned char *)m_sWords[nIndex]))
			{
*/				m_nTags[i][j]=2;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
				m_nTags[i][j]=3;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
				m_nTags[i][j]=12;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*2+1);
				m_nTags[i][j]=13;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*2+1);
			}
			m_nTags[i][j]=41;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
			m_nTags[i][j]=42;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
			m_nTags[i][j]=43;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
		}
		else if(nLen>=4)
		{
			m_nTags[i][j]=41;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
			m_nTags[i][j]=42;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
			m_nTags[i][j]=43;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
		}
		else if(nLen==2)
		{
			nCharType=charType((unsigned char *)m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				m_nTags[i][j]=1;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)*2+1);
				m_nTags[i][j]=2;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
				m_nTags[i][j]=3;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
				m_nTags[i][j]=30;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,30)*8+1);
				m_nTags[i][j]=11;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*4+1);
				m_nTags[i][j]=12;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*4+1);
				m_nTags[i][j]=13;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*4+1);
				m_nTags[i][j]=21;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,21)*2+1);
				m_nTags[i][j]=22;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,22)*2+1);
				m_nTags[i][j]=23;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,23)*2+1);
			}
				m_nTags[i][j]=41;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
				m_nTags[i][j]=42;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
				m_nTags[i][j]=43;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
		}
		break;
	default:
		break;
	}
	*pSubIndex=j;
	return true;
}
コード例 #10
0
ファイル: Segment.cpp プロジェクト: kiichi7/Search-Engine
//Generate Word according the segmentation route
bool CSegment::GenerateWord(int **nSegRoute, int nIndex)
{
	unsigned int i=0,k=0;
	int j,nStartVertex,nEndVertex,nPOS;
	char sAtom[WORD_MAXLENGTH],sNumCandidate[100],sCurWord[100];
	ELEMENT_TYPE fValue;
	while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1])
	{
		nStartVertex=nSegRoute[nIndex][i];
		j=nStartVertex;//Set the start vertex
		nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex
		nPOS=0;
		m_graphSeg.m_segGraph.GetElement(nStartVertex,nEndVertex,&fValue,&nPOS);
		sAtom[0]=0;
		while(j<nEndVertex)
		{//Generate the word according the segmentation route
			strcat(sAtom,m_graphSeg.m_sAtom[j]);
			j++;
		}
		m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending
		strcpy(sNumCandidate,sAtom);
		while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate)))
		{//Merge all seperate continue num into one number
		 //sAtom[0]!=0: add in 2002-5-9
			strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate);
			//Save them in the result segmentation
			i++;//Skip to next atom now 
			sAtom[0]=0;
			
			while(j<nSegRoute[nIndex][i+1])
			{//Generate the word according the segmentation route
				strcat(sAtom,m_graphSeg.m_sAtom[j]);
				j++;
			}
			//add a judge for out of memery, 
			//del, not nessasseray for ordinary text file,becasuse no word's lenth can larger than100
			//so remain the same
//			if(strlen(sNumCandidate)+strlen(sAtom)<100)
				strcat(sNumCandidate,sAtom);
//			else 
//				break;
		}
		unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord);
		if(nLen==4&&CC_Find("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",m_pWordSeg[nIndex][k].sWord[0]))
		{//Only one word
			strcpy(sCurWord,m_pWordSeg[nIndex][k].sWord);//Record current word
			i--;
		}
		else if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop
		{
			strcpy(m_pWordSeg[nIndex][k].sWord,sAtom);
			//Save them in the result segmentation
			strcpy(sCurWord,sAtom);//Record current word
		}
		else
		{//It is a num
			if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-'&&m_pWordSeg[nIndex][k].sWord[1]==0)//The delimiter "--"
			{
				nPOS=30464;//'w'*256;Set the POS with 'w'
				i--;//Not num, back to previous word
			}
			else
			{//Adding time suffix

				char sInitChar[3];
				unsigned int nCharIndex=0;//Get first char
				sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
				if(sInitChar[nCharIndex]<0)
				{
					nCharIndex+=1;
					sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
				}
				nCharIndex+=1;
				sInitChar[nCharIndex]='\0';
				if(k>0&&(abs(m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(m_pWordSeg[nIndex][k-1].nHandle)==29696)&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex))
				{//3-4月                                 //27904='m'*256
				   //Split the sInitChar from the original word
					strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex);
					m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue;
					m_pWordSeg[nIndex][k+1].nHandle=27904;
					m_pWordSeg[nIndex][k].sWord[nCharIndex]=0;
					m_pWordSeg[nIndex][k].dValue=0;
					m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256;
					m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle,m_pWordSeg[nIndex][k].sWord);
					nStartVertex+=1;
					k+=1;
				}
				nLen=strlen(m_pWordSeg[nIndex][k].sWord);
				if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0)
				{//2001年
					strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
					strcpy(sCurWord,"未##时");
					nPOS=-29696;//'t'*256;//Set the POS with 'm'
				}
				else if(strcmp(sAtom,"年")==0)
				{
					 if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&&
					 {//1998年,
						strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
						strcpy(sCurWord,"未##时");
						nPOS=-29696;//Set the POS with 't'
					 }
					 else
					 {
						strcpy(sCurWord,"未##数");
						nPOS=-27904;//Set the POS with 'm'
						i--;//Can not be a time word
					 }
				}
     			else
				{
					//早晨/t  五点/t 
					if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0)
					{
						strcpy(sCurWord,"未##时");
						nPOS=-29696;//Set the POS with 't'
					}	
					else 
					{
//						if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/')
//						{
///							strcpy(sCurWord,"未##数");
//							nPOS=-27904;//'m'*256;Set the POS with 'm'
//						}
//						else if(nLen>strlen(sInitChar))
//						{//Get rid of . example 1.
//							if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/')
//								m_pWordSeg[nIndex][k].sWord[nLen-1]=0;
//							else
//								m_pWordSeg[nIndex][k].sWord[nLen-2]=0;
//							strcpy(sCurWord,"未##数");
//							nPOS=-27904;//'m'*256;Set the POS with 'm'
//							i--;
//						}
//here's bug in it +...... do not del .
//2004-3-2 fixed by Wangzhifu
//						if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/')
//2004_06_18 修改,见04_06_18新华网语料bug.txt
						if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.')
						{
							strcpy(sCurWord,"未##数");
							nPOS=-27904;//'m'*256;Set the POS with 'm'
						}
						else if(nLen>strlen(sInitChar))
						{//Get rid of . example 1.
							//,but if" +......" do not del .//fixed at 2004-3-2 by Wang Zhifu
							char TempWord[100];
							strcpy(TempWord,m_pWordSeg[nIndex][k].sWord);
							TempWord[nLen-2]=0;//去掉最后一个字符!
							if(IsNumExist(TempWord))
							{
								if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/')
									m_pWordSeg[nIndex][k].sWord[nLen-1]=0;
								else
									m_pWordSeg[nIndex][k].sWord[nLen-2]=0;
								strcpy(sCurWord,"未##数");								
								nPOS=-27904;//'m'*256;Set the POS with 'm'
								i--;
							}
							else
							{
								nPOS=-'n'*256-'x';//
								strcpy(sCurWord,"未##串");
							}
						}
					}
// end of fixed line;

					i--;//Not num, back to previous word
				}
			}
			fValue=0;
			nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter
		}
		m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word
		m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word
		m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS,sCurWord);
		//Generate optimum segmentation graph according the segmentation result
		i++;//Skip to next atom
		k++;//Accept next word
	}
	m_pWordSeg[nIndex][k].sWord[0]=0;
	m_pWordSeg[nIndex][k].nHandle=-1;//Set ending
	return true;
}
コード例 #11
0
ファイル: Utility.cpp プロジェクト: ProgramLeague/FreeICTCLAS
/*********************************************************************
 *
 *  Func Name  : IsAllNum
 *
 *  Description: Judge the string is all made up of Num Char
 *              
 *
 *  Parameters : sSentence: the original sentence which includes Chinese or Non-Chinese char
 *    
 *  Returns    : the end of the sub-sentence
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllNum(unsigned char *sString)
{

	unsigned int nLen=strlen((const char *)sString),i=0;
	char sChar[3];
	sChar[2]=0;
	if(i<nLen)//Get prefix such as + -
	{
		sChar[0]=sString[i++];
		if(sChar[0]<0)//Get first char
			sChar[1]=sString[i++];
		else
			sChar[1]=0;
		if(!strstr("±+—-+",sChar))
		{
			i=0;
		}
	}
	while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186)
	{
		i+=2;
	}
	if(i<nLen)//Get middle delimiter such as .
	{
		sChar[0]=sString[i++];
		if(sChar[0]<0)//Get first char
			sChar[1]=sString[i++];
		else
			sChar[1]=0;
		if(CC_Find("∶·./",sChar)||sChar[0]=='.'||sChar[0]=='/')
		{//98.1%
			while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186)
			{
				i+=2;
			}
		}	
		else
		{
			i-=strlen(sChar);
		}
	}

	if(i>=nLen)
		return true;
	while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1)
	{//single byte number char
		i+=1;
	}
	if(i<nLen)//Get middle delimiter such as .
	{
		sChar[0]=sString[i++];
		if(sChar[0]<0)//Get first char
			sChar[1]=sString[i++];
		else
			sChar[1]=0;
		if(CC_Find("∶·./",sChar)||sChar[0]=='.'||sChar[0]=='/')
		{//98.1%
			while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1)
			{
				i+=1;
			}
		}	
		else
		{
			i-=strlen(sChar);
		}
	}
	if(i<nLen)//Get middle delimiter such as .
	{
		sChar[0]=sString[i++];
		if(sChar[0]<0)//Get first char
			sChar[1]=sString[i++];
		else
			sChar[1]=0;
		if(!CC_Find("百千万亿佰仟%‰",sChar)&&sChar[0]!='%')
			i-=strlen(sChar);
	}
	if(i>=nLen)
		return true;
	return false;
}
コード例 #12
0
ファイル: Result.cpp プロジェクト: Dashboard-X/WebSearch3.1
//Paragraph Segment and POS Tagging
bool CResult::ParagraphProcessing(char *sParagraph,char *sResult)
{
	char *sSentence,sChar[3];
	char *sSentenceResult;
	unsigned int nLen=strlen(sParagraph)+13;
	sSentence=new char[nLen];//malloc buffer
	sSentenceResult=new char[nLen*3];//malloc buffer
	sSentence[0]=0;
	unsigned int nPosIndex=0,nParagraphLen=strlen(sParagraph),nSentenceIndex=0;
	sChar[2]=0;
	sResult[0]=0;//Init the result
	bool bFirstIgnore=true;
	strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
	while(nPosIndex<nParagraphLen)
	{//Find a whole sentence which separated by ! . \n \r
		sChar[0]=sParagraph[nPosIndex];//Get a char
		sChar[1]=0;
		if(sParagraph[nPosIndex]<0)
		{//double byte char
			nPosIndex+=1;
			sChar[1]=sParagraph[nPosIndex];
		}
		nPosIndex+=1;
/*
#define  SEPERATOR_C_SENTENCE "。!?:;…"
#define  SEPERATOR_C_SUB_SENTENCE "、,()“”‘’"
#define  SEPERATOR_E_SENTENCE "!?:;"
#define  SEPERATOR_E_SUB_SENTENCE ",()\042'"
#define  SEPERATOR_LINK "\n\r  "
*/
		if(CC_Find(SEPERATOR_C_SENTENCE,sChar)||CC_Find(SEPERATOR_C_SUB_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar)||strstr(SEPERATOR_E_SUB_SENTENCE,sChar)||strstr(SEPERATOR_LINK,sChar))
		{//Reach end of a sentence.Get a whole sentence
			if(!strstr(SEPERATOR_LINK,sChar))//Not link seperator
			{
				strcat(sSentence,sChar);
			}
			if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0)
			{
				if(!strstr(SEPERATOR_C_SUB_SENTENCE,sChar)&&!strstr(SEPERATOR_E_SUB_SENTENCE,sChar))
					strcat(sSentence,SENTENCE_END);//Add sentence ending flag

				Processing(sSentence,1);//Processing and output the result of current sentence.
				Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result
				//bFirstIgnore=true;
				strcat(sResult,sSentenceResult);//Store in the result buffer
			}
			if(strstr(SEPERATOR_LINK,sChar))//Link the result with the SEPERATOR_LINK
			{
				strcat(sResult,sChar);
				strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag

				//sSentence[0]=0;//New sentence, and begin new segmentation
				//bFirstIgnore=false;
			}
			else if(strstr(SEPERATOR_C_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar))
			{
				strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
				//sSentence[0]=0;//New sentence, and begin new segmentation
				//bFirstIgnore=false;
			}
			else
			{
				strcpy(sSentence,sChar);//reset current sentence, and add the previous end at begin position
			}
		}
		else //Other chars and store in the sentence buffer
			strcat(sSentence,sChar);
	}
	if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0)
	{
		strcat(sSentence,SENTENCE_END);//Add sentence ending flag
		Processing(sSentence,1);//Processing and output the result of current sentence.
		Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result
		strcat(sResult,sSentenceResult);//Store in the result buffer
	}
	delete []  sSentence;//FREE sentence buffer 	
	delete []  sSentenceResult;//free buffer
	return true;
}
コード例 #13
0
ファイル: Result.cpp プロジェクト: Dashboard-X/WebSearch3.1
//Adjust the result with some rules
bool CResult::Adjust(PWORD_RESULT pItem,PWORD_RESULT pItemRet)
{
	int i=0,j=0;
	unsigned int nLen;
	char sSurName[10],sSurName2[10],sGivenName[10];
	bool bProcessed=false;//Have been processed
	while(pItem[i].sWord[0]!=0)
	{
		nLen=strlen(pItem[i].sWord);
		bProcessed=false;
		
		//Rule1: adjust person name
		if(pItem[i].nHandle==28274&&ChineseNameSplit(pItem[i].sWord,sSurName,sSurName2,sGivenName,m_uPerson.m_dict)&&strcmp(pItem[i].sWord,"叶利钦")!=0)//'nr'
		{//Divide name into surname and given name
			
			if(sSurName[0])
			{
				strcpy(pItemRet[j].sWord,sSurName);
				pItemRet[j++].nHandle=28274;
			}
			if(sSurName2[0])
			{
				strcpy(pItemRet[j].sWord,sSurName2);
				pItemRet[j++].nHandle=28274;
			}
			if(sGivenName[0])
			{
				strcpy(pItemRet[j].sWord,sGivenName);
				pItemRet[j++].nHandle=28274;
			}
			bProcessed=true;
		}
		//Rule2 for overlap words ABB 一段段、一片片
		else if(pItem[i].nHandle==27904&&strlen(pItem[i+1].sWord)==2&&strcmp(pItem[i+1].sWord,pItem[i+2].sWord)==0)
		{//(pItem[i+1].nHandle/256=='q'||pItem[i+1].nHandle/256=='a')&&
			strcpy(pItemRet[j].sWord,pItem[i].sWord);
			strcat(pItemRet[j].sWord,pItem[i+1].sWord);
			strcat(pItemRet[j].sWord,pItem[i+2].sWord);
			pItemRet[j].nHandle=27904;
			j+=1;
			i+=2;
			bProcessed=true;
		}
		//Rule3 for overlap words AA
		else if(nLen==2&&strcmp(pItem[i].sWord,pItem[i+1].sWord)==0)
		{
			strcpy(pItemRet[j].sWord,pItem[i].sWord);
			strcat(pItemRet[j].sWord,pItem[i+1].sWord);
		     //24832=='a'*256
			pItemRet[j].nHandle=24832;//a
			if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
			{
				pItemRet[j].nHandle=30208;
			}
			if(pItem[i].nHandle/256=='n'||pItem[i+1].nHandle/256=='n')//30208='v'8256
			{
				pItemRet[j].nHandle='n'*256;
			}			
			i+=1;
			if(strlen(pItem[i+1].sWord)==2)
			{//AAB:洗/洗/脸、蒙蒙亮
				if((pItemRet[j].nHandle==30208&&pItem[i+1].nHandle/256=='n')||
				   (pItemRet[j].nHandle==24832&&pItem[i+1].nHandle/256=='a')
				   )
				{
					strcat(pItemRet[j].sWord,pItem[i+1].sWord);
					i+=1;
				}
			}
			j+=1;
			bProcessed=true;
		}

		//Rule 4: AAB 洗/洗澡
		else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&(pItem[i].nHandle/256=='v'||pItem[i].nHandle==24832))//v,a
		{
			strcpy(pItemRet[j].sWord,pItem[i].sWord);
			strcat(pItemRet[j].sWord,pItem[i+1].sWord);
		     //24832=='a'*256
			pItemRet[j].nHandle=24832;//'a'
			if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
			{
				pItemRet[j].nHandle=30208;
			}

			i+=1;
			j+=1;
			bProcessed=true;
		}
		else if(pItem[i].nHandle/256=='u'&&pItem[i].nHandle%256)//uj,ud,uv,uz,ul,ug->u
			pItem[i].nHandle='u'*256;
		else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&strncmp(pItem[i+1].sWord+2,pItem[i+2].sWord,2)==0)
		{//AABB 朴朴素素 枝枝叶叶
				strcpy(pItemRet[j].sWord,pItem[i].sWord);
				strcat(pItemRet[j].sWord,pItem[i+1].sWord);
				strcat(pItemRet[j].sWord,pItem[i+2].sWord);
				pItemRet[j].nHandle=pItem[i+1].nHandle;
				i+=2;
				j+=1;
				bProcessed=true;
		}
		else if(pItem[i].nHandle==28275)//PostFix
		{
			if(m_uPlace.m_dict.IsExist(pItem[i+1].sWord,4))
			{
				strcpy(pItemRet[j].sWord,pItem[i].sWord);
				strcat(pItemRet[j].sWord,pItem[i+1].sWord);
				pItemRet[j].nHandle=28275;
				i+=1;
				j+=1;
				bProcessed=true;
			}
			else if(strlen(pItem[i+1].sWord)==2&&CC_Find("队",pItem[i+1].sWord))
			{
				strcpy(pItemRet[j].sWord,pItem[i].sWord);
				strcat(pItemRet[j].sWord,pItem[i+1].sWord);
				pItemRet[j].nHandle=28276;
				i+=1;
				j+=1;
				bProcessed=true;
			}
			else if(strlen(pItem[i+1].sWord)==2&&CC_Find("语文字杯",pItem[i+1].sWord))
			{
				strcpy(pItemRet[j].sWord,pItem[i].sWord);
				strcat(pItemRet[j].sWord,pItem[i+1].sWord);
				pItemRet[j].nHandle=28282;
				i+=1;
				j+=1;
				bProcessed=true;
			}
			else if(strlen(pItem[i+1].sWord)==2&&CC_Find("裔",pItem[i+1].sWord))
			{
				strcpy(pItemRet[j].sWord,pItem[i].sWord);
				strcat(pItemRet[j].sWord,pItem[i+1].sWord);
				pItemRet[j].nHandle=28160;
				i+=1;
				j+=1;
				bProcessed=true;
			}
		}
		else if(pItem[i].nHandle==30208||pItem[i].nHandle==28160)//v
		{
			if(strlen(pItem[i+1].sWord)==2&&CC_Find("员",pItem[i+1].sWord))
			{
				strcpy(pItemRet[j].sWord,pItem[i].sWord);
				strcat(pItemRet[j].sWord,pItem[i+1].sWord);
				pItemRet[j].nHandle=28160;
				i+=1;
				j+=1;
				bProcessed=true;
			}
		}
		else if(pItem[i].nHandle==28280)
		{//www/nx ./w sina/nx; EIM/nx  -601/m 
			strcpy(pItemRet[j].sWord,pItem[i].sWord);
			pItemRet[j].nHandle=28280;
			while(pItem[i+1].nHandle==28280||strstr("..",pItem[i+1].sWord)||(pItem[i+1].nHandle==27904&&IsAllNum((unsigned char *)pItem[i+1].sWord)))
			{
				strcat(pItemRet[j].sWord,pItem[i+1].sWord);
				i+=1;
			}
			j+=1;
			bProcessed=true;
		}

		if(!bProcessed)
		{//If not processed,that's mean: not need to adjust;
		 //just copy to the final result
			strcpy(pItemRet[j].sWord,pItem[i].sWord);
			pItemRet[j++].nHandle=pItem[i].nHandle;
		}
		i++;
	}
	pItemRet[j].sWord[0]=0;//Set ending
	return true;
}
コード例 #14
0
ファイル: Segment.cpp プロジェクト: tedzhang/SearchMonkey
//Generate Word according the segmentation route
bool CSegment::GenerateWord(int **nSegRoute, int nIndex)
{
	unsigned int i=0,k=0;
	int j,nStartVertex,nEndVertex,nPOS;
	char sAtom[WORD_MAXLENGTH],sNumCandidate[100];
	ELEMENT_TYPE fValue;
	while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1])
	{
		nStartVertex=nSegRoute[nIndex][i];
		j=nStartVertex;//Set the start vertex
		nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex
		nPOS=0;
		m_graphSeg.m_segGraph.GetElementInfo(nStartVertex,nEndVertex,fValue,nPOS);
		sAtom[0]=0;

		while(j < nEndVertex)
		{
			//Generate the word according the segmentation route
			strcat(sAtom,m_graphSeg.m_sAtom[j]);
			j++;
		}
		m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending
		strcpy(sNumCandidate,sAtom);
		while(sAtom[0]!=0 && (IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate)))
		{
			//Merge all seperate continue num into one number
		    //sAtom[0]!=0: add in 2002-5-9
			strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate);
			//Save them in the result segmentation
			i++;//Skip to next atom now 
			sAtom[0]=0;
			
			while(j<nSegRoute[nIndex][i+1])
			{//Generate the word according the segmentation route
				strcat(sAtom,m_graphSeg.m_sAtom[j]);
				j++;
			}
			strcat(sNumCandidate,sAtom);
		}
		if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop
		{
			strcpy(m_pWordSeg[nIndex][k].sWord,sAtom);
			//Save them in the result segmentation
		}
		else
		{//It is a num
			if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-')//The delimiter "--"
			{
				nPOS=30464;//'w'*256;Set the POS with 'w'
				i--;//Not num, back to previous word
			}
			else
			{//Adding time suffix

				char sInitChar[3];
				unsigned int nCharIndex=0;//Get first char
				sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
				if(sInitChar[nCharIndex]<0)
				{
					nCharIndex+=1;
					sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
				}
				nCharIndex+=1;
				sInitChar[nCharIndex]='\0';
				if(k>0&&m_pWordSeg[nIndex][k-1].nHandle==27904&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex))
				{//3-4月                                 //27904='m'*256
				   //Split the sInitChar from the original word
					strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex);
					m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue;
					m_pWordSeg[nIndex][k+1].nHandle=27904;
					m_pWordSeg[nIndex][k].sWord[nCharIndex]=0;
					m_pWordSeg[nIndex][k].dValue=0;
					m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256;
					m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle);
					nStartVertex+=1;
					k+=1;
				}
				unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord);
				if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0)
				{//2001年
					strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
					nPOS=29696;//'t'*256;//Set the POS with 'm'
				}
				else if(strcmp(sAtom,"年")==0)
				{
					 if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&&
					 {//1998年,
						strcat(m_pWordSeg[nIndex][k++].sWord,sAtom);
						nPOS='t'*256;//Set the POS with 'm'
					 }
					 else
					    i--;//Can not be a time word
				}
     			else
				{
					//早晨/t  五点/t 
					if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0)
					{
						nPOS='t'*256;//Set the POS with 'm'
					}	
					else 
					{
						if(m_pWordSeg[nIndex][k].sWord[0]!='.')
							nPOS='m'*256;//Set the POS with 'm'
						if(nLen>1&&m_pWordSeg[nIndex][k].sWord[nLen-1]=='.')
						{//Get rid of . example 1.
							m_pWordSeg[nIndex][k].sWord[nLen-1]=0;
							i--;
						}
					}
					i--;//Not num, back to previous word
				}
			}
			fValue=0;
			nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter
	}
		m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word
		m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word
		m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS);
		//Generate optimum segmentation graph according the segmentation result
		i++;//Skip to next atom
		k++;//Accept next word
	}
	m_pWordSeg[nIndex][k].sWord[0]=0;
	m_pWordSeg[nIndex][k].nHandle=-1;//Set ending
	return true;
}