Пример #1
0
bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict)
{
	int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType,nFreqGiven;
	char sTemp[3];
	if(nLen<3||nLen>8)//Not a traditional Chinese person name
		return false;
	while(i<nLen)//No Including non-CHinese char
	{
		nCharType=charType((unsigned char*)sPersonName+i);
		if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER)
			return false;
		i+=2;
	}
	sSurname2[0]=0;//init 
	strncpy(sSurname,sPersonName,nSurNameLen);	
	sSurname[nSurNameLen]=0;
	if(!personDict.IsExist(sSurname,1))
	{
		nSurNameLen=2;
		sSurname[nSurNameLen]=0;
		if(!personDict.IsExist(sSurname,1))
		{
			nSurNameLen=0;
			sSurname[nSurNameLen]=0;
		}
	}
	strcpy(sGivenName,sPersonName+nSurNameLen);
	if(nLen>6)
	{
		strncpy(sTemp,sPersonName+nSurNameLen,2);
		sTemp[2]=0;//Get the second possible surname
		if(personDict.IsExist(sTemp,1))
		{//Hongkong women's name: Surname+surname+given name
			strcpy(sSurname2,sTemp);
			strcpy(sGivenName,sPersonName+nSurNameLen+2);
		}
	}
	nFreq=personDict.GetFrequency(sSurname,1);
	strncpy(sTemp,sGivenName,2);
	sTemp[2]=0;
	nFreqGiven=personDict.GetFrequency(sTemp,2);
	if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)>=3&&nFreq<personDict.GetFrequency("张",1)/40&&nFreqGiven<personDict.GetFrequency("华",2)/20)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2)))
		return false;
	if(nLen==4&&m_uPerson.IsGivenName(sPersonName))
	{//Single Surname+given name
		return false;
	}
	return true;
}
Пример #2
0
/**
 * POS tagging with Hidden Markov Model.
 *
 * Walks pWordItems from index 0 until an item with an empty sWord is reached.
 * Each round GetFrom() loads the next run of words and returns the index of
 * the following run (or -1 at the end), GetBestPOS() is run (presumably
 * filling m_nBestTag), and the result is handled according to m_tagType.
 *
 * @param pWordItems  word items, terminated by an item whose sWord is empty;
 *                    in TT_NORMAL mode nHandle and dValue are updated in place
 * @param dictCore    core dictionary
 * @param dictUnknown unknown-word recognition dictionary
 * @return always true
 */
bool CSpan::POSTagging(PWORD_RESULT pWordItems,CDictionary &dictCore,CDictionary &dictUnknown)
{
	int i=0,j,nStartPos,nItem;
	Reset(false);
	while(i>-1&&pWordItems[i].sWord[0]!=0)
	{
		nStartPos=i;//Start Position
		i=GetFrom(pWordItems,nStartPos,dictCore,dictUnknown);
		GetBestPOS();
		switch(m_tagType)
		{
		case TT_NORMAL://normal POS tagging
			//Store the best POS tagging.
			//The bound is tested before m_nBestTag[j] is read so the element
			//at index m_nCurLength is never touched.
			for(j=1;j<m_nCurLength&&m_nBestTag[j]!=-1;j++)
			{
				nItem=j+nStartPos-1;//Words are stored from index 1, items from nStartPos
				pWordItems[nItem].nHandle=m_nBestTag[j];
				//Items whose dValue is not positive are left untouched (original note: "Let 。be 0").
				//Exist and update its frequency as a POS value
				if(pWordItems[nItem].dValue>0&&dictCore.IsExist(pWordItems[nItem].sWord,-1))
					pWordItems[nItem].dValue=LOG_MAX_FRQUENCE-log((double)dictCore.GetFrequency(pWordItems[nItem].sWord,m_nBestTag[j])+1);
			}
			break;
		case TT_PERSON://Person recognition
			//Split persons' POS first, then recognize with the person recognition dictionary
			SplitPersonPOS(dictUnknown);
			PersonRecognize(dictUnknown);
			break;
		case TT_PLACE://Place name recognition
			PlaceRecognize(dictCore,dictUnknown);
			break;
		case TT_TRANS://Transliteration
			TransRecognize(dictCore,dictUnknown);
			break;
		default:
			break;
		}
		Reset();
	}
	return true;
}
Пример #3
0
/**
 * Split every word whose best tag is 21 or 22 into two words.
 *
 * Tag 21 ("combination by previous and first component"): the tail of the word
 * (4 bytes if that tail is in unlistDict, otherwise 2 bytes) becomes a new word
 * tagged 1, and the remaining head is tagged 11.
 * Tag 22 ("combination by next word and last component"): the head of the word
 * (4 bytes if in unlistDict, otherwise 2 bytes) is tagged 1, 4 or 3 depending
 * on its neighbours, and the remaining tail becomes a new word tagged 12.
 *
 * Words after the split position are shifted up by one slot and m_nCurLength
 * grows by one for every split.
 *
 * NOTE(review): assumes every split word is at least 2 bytes long and that
 * both parts fit in 50-byte buffers; neither is checked here. It also assumes
 * the member arrays have room for one more entry per split — confirm against
 * their declarations.
 *
 * @param unlistDict unknown-word dictionary used for the component lookups
 * @return always true
 */
bool CSpan::SplitPersonPOS(CDictionary &unlistDict)
{//Split the word with POS 21 and 22
	int i=m_nCurLength-1,j;
	unsigned int nLenWord,nLenPart;
	char sFirstPart[50],sLastPart[50];
	int nFirstPOS,nLastPOS;
	for(;i>0;i--)
	{
		if(m_nBestTag[i]!=21&&m_nBestTag[i]!=22)
			continue;//Nothing to split at this position

		//Move the POS and words behind position i up by one slot
		for(j=m_nCurLength-1;j>i;j--)
		{
			strcpy(m_sWords[j+1],m_sWords[j]);
			m_nBestTag[j+1]=m_nBestTag[j];
			m_nWordPosition[j+1]=m_nWordPosition[j];
		}
		m_nCurLength+=1;//The length increment

		//Generate new segment words and POS
		nLenWord=strlen(m_sWords[i]);
		if(m_nBestTag[i]==21)
		{//Combination by Previous and first component
			if(nLenWord>4)
			{//Prefer a 4-byte tail if the dictionary knows it, else a 2-byte tail
				strcpy(sLastPart,m_sWords[i]+nLenWord-4);
				if(!unlistDict.IsExist(sLastPart,-1))
					strcpy(sLastPart,m_sWords[i]+nLenWord-2);
			}
			else
			{
				strcpy(sLastPart,m_sWords[i]+nLenWord-2);
			}
			nLenPart=strlen(sLastPart);
			if(nLenPart<nLenWord)
			{//Get first part: everything in front of the tail
				strncpy(sFirstPart,m_sWords[i],nLenWord-nLenPart);
				sFirstPart[nLenWord-nLenPart]=0;
			}
			else
			{//The tail covers the whole word: fall back to a 2-byte tail
				strncpy(sFirstPart,m_sWords[i],nLenWord-2);
				sFirstPart[nLenWord-2]=0;
				strncpy(sLastPart,m_sWords[i]+nLenWord-2,2);
				sLastPart[2]=0;
			}
			nFirstPOS=11;
			nLastPOS=1;
		}
		else
		{//Combination by Next word and last component
			if(nLenWord>4)
			{//Prefer a 4-byte head if the dictionary knows it, else a 2-byte head
				strncpy(sFirstPart,m_sWords[i],4);
				sFirstPart[4]=0;
				if(!unlistDict.IsExist(sFirstPart,-1))
					sFirstPart[2]=0;
			}
			else
			{
				strncpy(sFirstPart,m_sWords[i],2);
				sFirstPart[2]=0;
			}
			nLenPart=strlen(sFirstPart);
			if(nLenPart<nLenWord)
			{//Get last part: everything behind the head
				strncpy(sLastPart,m_sWords[i]+nLenPart,nLenWord-nLenPart);
				sLastPart[nLenWord-nLenPart]=0;
			}
			else
			{//The head covers the whole word: fall back to a 2-byte head
				strncpy(sFirstPart,m_sWords[i],2);
				sFirstPart[2]=0;
				strncpy(sLastPart,m_sWords[i]+2,nLenWord-2);
				sLastPart[nLenWord-2]=0;
			}
			if(unlistDict.IsExist(sFirstPart,1)&&m_nBestTag[i-1]==5)
				//e.g. "小陈说:" (original author's example)
				nFirstPOS=1;
			//The word two positions back only exists for i>=2; when i==1 there
			//is no such word, so it is treated as "not in the dictionary"
			//instead of reading m_sWords[-1].
			else if(unlistDict.IsExist(m_sWords[i-1],1)&&(i<2||!unlistDict.IsExist(m_sWords[i-2],1)))
				nFirstPOS=4;
			else
				nFirstPOS=3;
			nLastPOS=12;
		}

		//Store both halves; the second one starts right after the first
		strcpy(m_sWords[i],sFirstPart);
		m_nBestTag[i]=nFirstPOS;
		strcpy(m_sWords[i+1],sLastPart);
		m_nBestTag[i+1]=nLastPOS;
		m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(sFirstPart);
	}
	return true;
}
Пример #4
0
/**
 * Scan m_nBestTag (from index 1 until the first negative tag) for runs of
 * tags that form a candidate unknown word and record each accepted run.
 *
 * A run starts at a trigger tag:
 *  - 1, 11 or 21: followed by any number of the same tag, then tag+1, then
 *    tag+2, then 30;
 *  - 2, 12 or 22: followed by any number of the same tag, then tag+1, then 30.
 *
 * An accepted run is appended to m_nUnknownWords / m_dWordsPossibility and
 * m_nUnknownIndex is advanced.
 * NOTE(review): m_nUnknownIndex is not range-checked here — confirm the
 * arrays are large enough for the worst case.
 *
 * @param dictCore  core dictionary; a single-word run is only accepted when
 *                  the word is not already in it
 * @param transDict dictionary passed to ComputePossibility()
 * @return always true
 */
bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
{
	int nStart=1,nEnd=1,i=1;
	while(m_nBestTag[i]>-1)
	{
		const int nTag=m_nBestTag[i];
		const bool bHeadTag=(nTag==1||nTag==11||nTag==21);//1,11,21 trigger the recognition
		const bool bMiddleTag=(nTag==2||nTag==12||nTag==22);//2,12,22 trigger it as well
		if(bHeadTag||bMiddleTag)
		{
			nStart=i;
			nEnd=nStart+1;
			while(m_nBestTag[nEnd]==nTag)//Same tag as the trigger
				nEnd++;
			while(m_nBestTag[nEnd]==nTag+1)//2,12,22 after a head; 3,13,23 after a middle
				nEnd++;
			if(bHeadTag)
			{
				while(m_nBestTag[nEnd]==nTag+2)//3,13,23
					nEnd++;
			}
			while(m_nBestTag[nEnd]==30)//Trailing tag 30
				nEnd++;
		}

		//Accept the run [nStart,nEnd) when it is not a pure number and is either
		//  - longer than two words, or
		//  - two words, unless the second is tagged 30 and the first is at most 2 bytes, or
		//  - one word longer than 2 bytes that the core dictionary does not know.
		//When no trigger fired, nStart/nEnd still hold the previous run, which
		//is re-tested against the same data (same behaviour as the original).
		if(nEnd>nStart
			&&!IsAllNum((unsigned char *)m_sWords[nStart])
			&&(nEnd>nStart+2
				||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))
				||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
		{
			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
			m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
			nStart=nEnd;//Mark the run as consumed
		}

		//Jump behind the scanned run, or advance by one when nothing was scanned
		if(i<nEnd)
			i=nEnd;
		else
			i=i+1;
	}
	return true;
}
Пример #5
0
//Load the next run of words from pWordItems, starting at item nIndex, into
//m_sWords / m_nWordPosition / m_nTags / m_dFrequency (slots start at index 1).
//For every word the candidate tags and their scores are collected; the tag
//list of each word is terminated with -1.
//The run ends when an item with an empty sWord is reached, when
//MAX_WORDS_PER_SENTENCE slots are used, or right after a word that has exactly
//one candidate tag.
//m_nCurLength is set to the number of slots in use (including the end marker).
//Returns the index of the item where the next run starts, or -1 when the
//terminating empty item has been reached.
int CSpan::GetFrom(PWORD_RESULT pWordItems,int nIndex,CDictionary &dictCore, CDictionary &dictUnknown)
{
	int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
	int nFreq=0,j,nRetPos=0,nWordsIndex=0;
	bool bSplit=false;//Need to split in Transliteration recognition 
    int i=1;
	nWordsIndex=i+nIndex-1;//Slot 1 corresponds to item nIndex
	for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
	{
		if(m_tagType==TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
        {
			strcpy(m_sWords[i],pWordItems[nWordsIndex].sWord);//store current word
   		    m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
		}
		else
		{
			//The word exists in dictUnknown with handle 44: it is stored as two
			//slots over two consecutive iterations (first 2 bytes, then the rest).
			if(!bSplit)
			{
				strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord,2);//store the first 2 bytes of the current word
				m_sWords[i][2]=0;
				bSplit=true;
			}
			else
			{
				unsigned int nLen=strlen(pWordItems[nWordsIndex].sWord+2);
				strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);//store the rest of the current word
				m_sWords[i][nLen]=0;
				bSplit=false;
			}
   		    m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
		}
		//Record the position of current word
		m_nStartPos=m_nWordPosition[i+1];
		//Move the Start POS to the ending
		if(m_tagType!=TT_NORMAL)
		{
			//Get the POSs from the unknown recognition dictionary
			dictUnknown.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
			for(j=0;j<nCount;j++) 
			{//Get the POS set of sCurWord in the unknown dictionary
				m_nTags[i][j]=aPOS[j];
				//Score: -log(1+word frequency)+log(1+frequency of the tag in m_context)
   				m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(m_context.GetFrequency(0,aPOS[j])+1));
			}
			//Get the POS set of sCurWord in the core dictionary
			//We ignore the POS in the core dictionary and recognize them as other (0).
			//We add their frequency to get the possibility as POS 0
			dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
			nFreq=0;
			for(int k=0;k<nCount;k++) 
			{
				nFreq+=aFreq[k];
			}
			if(nCount>0)
			{
				//j still points behind the tags taken from the unknown dictionary
				m_nTags[i][j]=0;
				//m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
				m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(m_context.GetFrequency(0,0)+1));
				j++;
			}
		}
		else//For normal POS tagging
		{
			j=0;
			//Use the tag recorded in the item when there is one
			if(pWordItems[nWordsIndex].nHandle>0)
			{//The word has only one POS value
			 //We have recorded its POS and frequency in the items.
				m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
				m_dFrequency[i][j]=pWordItems[nWordsIndex].dValue-LOG_MAX_FRQUENCE+log((double)(m_context.GetFrequency(0,m_nTags[i][j])+1));
				if(m_dFrequency[i][j]<0)//Not permit the value less than 0
					m_dFrequency[i][j]=0;
				j++;
			}
			else
			{//The word has multiple POSs, we should retrieve the information from Core Dictionary 
				
				if(pWordItems[nWordsIndex].nHandle<0)
				{//Negative handle: the negated value (or the special "tt" code) identifies the word kind
					if(pWordItems[nWordsIndex].nHandle==-'t'*256-'t')//tt
					{
						//A "tt" item gets two candidate tags, "nr" and "ns",
						//weighted by dRatio and 1-dRatio.
						char sWordOrg[100],sPostfix[10];
						double dRatio=0.6925;//The ratio which transliteration as a person name 
						PostfixSplit(pWordItems[nWordsIndex].sWord,sWordOrg,sPostfix);
						if(sPostfix[0]!=0)//A postfix was split off: use a much smaller ratio
								dRatio=0.01;
						m_nTags[i][j]='n'*256+'r';
						m_dFrequency[i][j]=-log(dRatio)+pWordItems[nWordsIndex].dValue;
						//m_dFrequency[i][j]=log(dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
						//P(W|R)=P(WRT)/P(RT)=P(R)*P(W|T)/P(R|T)
						j++;
						m_nTags[i][j]='n'*256+'s';
						m_dFrequency[i][j]=-log(1-dRatio)+pWordItems[nWordsIndex].dValue;
						//m_dFrequency[i][j]=log(1-dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
						j++;
					}
					else//Unknown words such as Chinese person name or place name
					{
						m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
   					//	m_dFrequency[i][j++]=(double)(1+pWordItems[nWordsIndex].nFrequency)/(double)(m_context.GetFrequency(0,aPOS[j])+1);
						m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
					}
				}
				dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
				//NOTE(review): j may already be 1 or 2 here, and the same j indexes
				//aPOS/aFreq, so the first j core-dictionary entries are skipped and
				//the total is capped at nCount — confirm this is intended.
				for(;j<nCount;j++) 
				{//Get the POS set of sCurWord in the core dictionary
					m_nTags[i][j]=aPOS[j];
   					m_dFrequency[i][j]=-log((double)1+aFreq[j])+log((double)m_context.GetFrequency(0,m_nTags[i][j])+1);
				}
			}
		}
		if(j==0)
		{//We do not know the POS, so we have to guess them according to lexical knowledge
			GuessPOS(i,&j);//Guess the POS of current word
		}
		m_nTags[i][j]=-1;//Set the ending POS 
		if(j==1)//No ambiguity
		{//No ambiguity, so we can break from the loop
			//NOTE(review): if this happens on the first half of a split word,
			//bSplit is still true and nWordsIndex has not advanced, so the
			//returned index skips the second half — confirm intended.
			i++;
			m_sWords[i][0]=0;
			break;
		}
		if(!bSplit)//Stay on the same item while its second half is pending
		{nWordsIndex++;}
	}
	if(pWordItems[nWordsIndex].sWord[0]==0)
		nRetPos=-1;//Reaching ending

	//The last stored word has more than one candidate tag: append a virtual
	//ending slot so the sequence is closed by a single-tag word.
	if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
	{//Set end for words like "张/华/平"
		if(m_tagType!=TT_NORMAL)
		       m_nTags[i][0]=101;
		else
		       m_nTags[i][0]=1;
		
		m_dFrequency[i][0]=0;
	    m_sWords[i][0]=0;//Set virtual ending
		m_nTags[i++][1]=-1;
	}
	m_nCurLength=i;//The current word count
	if(nRetPos!=-1)
		return nWordsIndex+1;//Next start position
	return -1;//Reaching ending
}