Ejemplo n.º 1
0
// Scans the role tags in m_nBestTag (from index 1 up to the first negative
// tag) for candidate spans and records the accepted ones as unknown words.
// Judging by the identifier names these are transliterated names; the code
// itself only relies on the numeric role tags.
//
// A word tagged 1/11/21 or 2/12/22 opens a candidate [nStart,nEnd). The span
// is then extended over runs of the opening tag, the following role tag(s) of
// the same family, and finally over any words tagged 30.
//
// dictCore  : queried with IsExist(word,-1) to reject single-word candidates.
// transDict : forwarded to ComputePossibility for scoring.
// Returns   : always true.
bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
{
  int nStart=1,nEnd=1,i=1;
  while(m_nBestTag[i]>-1)
  {
	  const int nTag=m_nBestTag[i];
	  const bool bHeadRole=(nTag==1||nTag==11||nTag==21);   //first role of a family
	  const bool bMiddleRole=(nTag==2||nTag==12||nTag==22); //second role of a family
	  if(bHeadRole||bMiddleRole)
	  {
		nStart=i;
		nEnd=nStart+1;
		//A head role may be followed by roles +1 and +2 (2/3, 12/13, 22/23);
		//a middle role only by role +1 (3, 13, 23).
		const int nLastRole=bHeadRole?nTag+2:nTag+1;
		for(int nRole=nTag;nRole<=nLastRole;nRole++)
		{
			while(m_nBestTag[nEnd]==nRole)
				nEnd++;
		}
		while(m_nBestTag[nEnd]==30)//trailing words tagged 30
			nEnd++;
	  }

	  //nStart/nEnd keep their previous values when the current word is not a
	  //trigger, so a rejected candidate is simply tested (and rejected) again.
	  //Accept a non-numeric candidate when it has
	  //  - three or more words, or
	  //  - two words, unless the second is tagged 30 and the first is short, or
	  //  - one word longer than 2 bytes that dictCore does not contain.
	  if(nEnd>nStart
	     &&!IsAllNum((unsigned char *)m_sWords[nStart])
	     &&(nEnd>nStart+2
	        ||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))
	        ||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
	  {
			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
			//NOTE(review): the length passed is nEnd-nStart+1 although the span
			//covers nEnd-nStart words - confirm this is what ComputePossibility expects.
			m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
			nStart=nEnd;//mark the span as consumed
	  }

	  //Jump past the span just examined, otherwise step one word forward
	  i=(i<nEnd)?nEnd:i+1;
  }
  return true;
}
Ejemplo n.º 2
0
// Walks m_nBestTag from index 1 up to the first negative tag and records
// every span that starts on a word tagged 1 or 2 as an unknown word.
// Starting from the opening tag, the span is extended over consecutive runs
// of each role up to 4 (1,2,3,4 or 2,3,4).
//
// dictCore  : not used by this function.
// placeDict : forwarded to ComputePossibility for scoring.
// Returns   : always true.
bool CSpan::PlaceRecognize(CDictionary &dictCore,CDictionary &placeDict)
{
  int nBegin=1;
  int nStop=1;
  int nCur=1;
  while(m_nBestTag[nCur]>-1)
  {
	  const int nTag=m_nBestTag[nCur];
	  if(nTag==1||nTag==2)
	  {//Tag 1 or 2 opens a candidate span
		nBegin=nCur;
		nStop=nBegin+1;
		for(int nRole=nTag;nRole<=4;nRole++)
		{//Swallow the run of each role in ascending order
			while(m_nBestTag[nStop]==nRole)
				nStop++;
		}
	  }

	  if(nStop>nBegin)
	  {//A fresh span is pending: store its boundaries and its score
			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nBegin];
			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nStop];
			m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nBegin,nStop-nBegin+1,placeDict);
			nBegin=nStop;//mark the span as consumed
	  }

	  //Continue after the recorded span, or with the next word
	  nCur=(nCur<nStop)?nStop:nCur+1;
  }
  return true;
}
Ejemplo n.º 3
0
//Sort the segmentation and POS result according its possibility
bool CResult::Sort()
{
	ELEMENT_TYPE dPossibility[MAX_SEGMENT_NUM],dTemp;
	int nIndex[MAX_SEGMENT_NUM],nTemp;//Index

	memset(dPossibility,0,sizeof(dPossibility));
	//Init the possibility
	
	for(int i=0;i<m_Seg.m_nSegmentCount;i++)
	{//Computing the possibility
		dPossibility[i]=ComputePossibility(m_Seg.m_pWordSeg[i]);
		nIndex[i]=i;//Record the index
	}
	
	//Sort with Bubble sort algorithm
	for(int i=0;i<m_Seg.m_nSegmentCount;i++)
		for(int j=i+1;j<m_Seg.m_nSegmentCount;j++)
		{
			if(dPossibility[i]<dPossibility[j])
			{//Swap the possition and value
				nTemp=nIndex[i];
				dTemp=dPossibility[i];
				nIndex[i]=nIndex[j];
				dPossibility[i]=dPossibility[j];
				nIndex[j]=nTemp;
				dPossibility[j]=dTemp;
			}
		}
	
	for(int i=0;i<m_Seg.m_nSegmentCount;i++)
	{//Adjust the segmentation and POS result and store them in the final result array
	 //Store them according their possibility ascendly
		Adjust(m_Seg.m_pWordSeg[nIndex[i]],m_pResult[i]);	
		m_dResultPossibility[i]=dPossibility[i];	
	}
	return true;
}
Ejemplo n.º 4
0
// Recognizes person names by matching the role-tag sequence against a fixed
// list of role patterns and records every accepted match as an unknown word.
//
// The tags in m_nBestTag (index 1 up to the first negative tag) are turned
// into a string, one letter per word ('A'+tag), and each position is tried
// against sPatterns in order. A match is dropped by the exclusion rules
// below; otherwise its word positions and its score
// (log(dFactor[k]) + ComputePossibility) are appended to the unknown-word
// tables.
//
// personDict : used for role frequencies and forwarded to ComputePossibility.
// Returns    : always true.
bool CSpan::PersonRecognize(CDictionary &personDict)
{
  char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
                          //0     1    2    3    4   5   
  char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE",
						 "BG",  "BXD","BZ", "CDCD","CD","EE", 
						 "FB", "Y","XD",""};
  //Weight of each pattern, same order as sPatterns
  double dFactor[]={0.0011,0.0011,0.0011,0.0011,0.7614,0.0011,0.2055,
						 0.0160,0.0011,0.0011,0,0.0160,0.0011,
						 0.0160,0.0011,0.0011,0 };
  //About parameter:
/*
	Given Name: 486     0.0160
	Surname+postfix:484 0.0160
	m_lPerson2Num:6265   0.2055
	m_lPerson3Num: 23184 0.7614
	m_lPerson4Num:32     0.0011
  */
  //The person recognition patterns set
  //BBCD: surname + surname + given-name char 1 + given-name char 2
  //BBE:  surname + surname + single-char given name
  //BBZ:  surname + surname + two-char given name that forms a word
  //BCD:  surname + given-name char 1 + given-name char 2
  //BE:   surname + single-char given name
  //BEE:  surname + single-char given name + single-char given name (e.g. "Han Leilei")
  //BG:   surname + suffix
  //BXD:  surname + (surname joined with the first given-name char as a word) + last given-name char
  //BZ:   surname + two-char given name that forms a word
  //B:    surname
  //CD:   given-name char 1 + given-name char 2
  //EE:   single-char given name + single-char given name
  //FB:   prefix + surname
  //XD:   (surname joined with the first given-name char as a word) + last given-name char
  //Y:    surname and single-char given name forming one word
  int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};//0 terminates the list

  int i;
  for(i=1;m_nBestTag[i]>-1;i++)//Convert to string from POS
	sPOS[i]=m_nBestTag[i]+'A';
  sPOS[i]=0;
  int j=1,k,nPos;//Find the proper pattern from the first POS
  int nLittleFreqCount;//Counter for the person name roles with little frequency
  bool bMatched=false;   
  while(j<i)
  {
	bMatched=false;   
	for(k=0;!bMatched&&nPatternLen[k]>0;k++)
	{
		//The pattern must match at j and must not touch a "·" separator on either side
		if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(m_sWords[j-1],"·")!=0&&strcmp(m_sWords[j+nPatternLen[k]],"·")!=0)
		{//Find the proper pattern k
			if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
			{//Rule 1 for exclusion: prefix + surname followed by a given-name role
			 //or a suffix: the (prefix + surname) rule does not apply
				continue;
			}
/*			if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
			{//Rule 2 for exclusion: (surname +) single name + single name: if the two
			 //words tagged EE differ, the rule does not apply. Example: "Han Leilei"
				continue;
			}

			if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
			{//Rule 3 for exclusion: if the surname is not followed by a suffix, the rule
			 //does not apply. Examples: "Chairman Jiang", "Aunt Liu"
				continue;
			}
*/			//Get the possible name
			nPos=j;//Record the person position in the tag sequence
			sPersonName[0]=0;
			nLittleFreqCount=0;//Record the number of roles with little frequency
			bool bNameFits=true;//false once a word no longer fits into sPersonName
			while(nPos<j+nPatternLen[k])
			{//Get the possible person name
				if(m_nBestTag[nPos]<4&&personDict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos])<LITTLE_FREQUENCY)
					nLittleFreqCount++;//The counter increase
				//Append whole words only while they fit, so the fixed-size
				//buffer cannot overflow and no multi-byte character is split
				if(bNameFits&&strlen(sPersonName)+strlen(m_sWords[nPos])<sizeof(sPersonName))
					strcat(sPersonName,m_sWords[nPos]);
				else
					bNameFits=false;
				nPos+=1;
			}
			if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
			{//Exclusion foreign name
			 //Rule 2 for exclusion: if every character is one used in foreign names,
			 //the (given-name char 1 + given-name char 2) rule does not apply.
			 //j is advanced here and the scan goes on with the next pattern at the new j.
				j+=nPatternLen[k]-1;
				continue;
			}
			if(strcmp(sPatterns[k],"CDCD")==0)
			{//Rule for exclusion
			 //The (char 1 + char 2 + char 1 + char 2) pattern is itself an exclusion
			 //rule, e.g. a transliterated name inside "the soprano Dilbar performs".
			 //Rule 3 for exclusion: it applies when the name contains characters used
			 //in foreign names; otherwise the exclusion does not apply, e.g.
			 //"the sisters Heiniu and Bainiu took first place".
				if(GetForeignCharCount(sPersonName)>0)
					j+=nPatternLen[k]-1;
				continue;
			}
			if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
			{//A two-char given name made only of foreign-name characters is skipped
				j+=nPatternLen[k]-1;
				continue;
			}
			if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
			//Examples: "Mahathir"; "Little Zayed and his Chinese nanny Hu Cailing
			//were invited by Ambassador Hua Liming"
			//All roles (or three of them) appear with little frequency, so ignore the match
				continue;
			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[j];
			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[j+nPatternLen[k]];
			m_dWordsPossibility[m_nUnknownIndex]=log(dFactor[k])+ComputePossibility(j,nPatternLen[k],personDict);
			//Multiply the factor (added in log space)
			m_nUnknownIndex+=1;
			j+=nPatternLen[k];
			bMatched=true;
		}
	}
    if(!bMatched)//Not matched, add j by 1
		j+=1;
  }
  return true;
}