예제 #1
0
파일: LemWord.cpp 프로젝트: eamosov/lspl
// returns the end point of the graphematical descriptors
int CLemWord::ProcessGraphematicalDescriptors(const char* LineStr)
{
	size_t MorphSignPos = GetMorphSignPosition(LineStr);
	if (MorphSignPos == -1)
		MorphSignPos = strlen(LineStr);

    m_GraDescrs = parse_gra_descriptors(string (LineStr, MorphSignPos).c_str(), m_UnparsedGraphemDescriptorsStr);

	m_bSpace =			HasDes(OSpc) 
					||	HasDes(OEOLN) 
					||	((BYTE)m_strWord[0] == StupidSymbol1)
					||	( ((BYTE)m_strWord[0] == '_') && (m_strWord.length() == 1));

    

	if (HasDes (OUpLw))
		m_Register = UpLow;
	else
	if (HasDes (OUp))
		m_Register = UpUp;
	else
	if (HasDes (OLw))
		m_Register = LowLow;		
	else
		m_Register = AnyRegister;		

	m_bComma = (m_strWord.length() == 1)  && (m_strWord[0] == ',');	
	m_bDash = (m_strWord.length() == 1)  && (m_strWord[0] == '-');	

	bool bRomanNumber = is_roman_number(m_strWord.c_str(), m_strWord.length() );
	size_t hyphen_occur = m_strWord.find("-");
	if ((hyphen_occur != string::npos) && (hyphen_occur!=0))
	{
		// "ѕавла I-го" 
		// "I-го" - одно слово
		bRomanNumber = is_roman_number(m_strWord.c_str(), hyphen_occur);
	};
	if (bRomanNumber)
        m_GraDescrs |= _QM(ORoman);

    m_bWord  = !bRomanNumber && (HasDes(ORLE) || HasDes(OLLE)); 

	return (int)MorphSignPos;

}
예제 #2
0
파일: PlmLine.cpp 프로젝트: deNULL/seman
// Fills this object from one text line.
//
// Line layout, as consumed below:
//   <word> <file position> <token length> <graphematical descriptors>
//   [<3-char morph sign + common gram code> <lemma> <gram codes> <paradigm id> <homonym weight>]
//
// LineStr       - the line to parse (taken by value; it is trimmed locally).
// bStartLine    - when true the line carries no word form: the word is set to
//                 "" and the homonym flag is cleared.
// pRusGramTab   - passed through to SetGramCodes.
//
// Returns false if the two leading integers cannot be read or if the
// morphological part is present but incomplete; true otherwise.
// Note: members assigned before the failing step keep their new values.
bool CPlmLine :: LoadPlmLineFromString (string LineStr, bool bStartLine, const CAgramtab* pRusGramTab) 
{
	m_bToDelete = false;
	m_bQuoteMarks = false;

	if (!bStartLine)
	{
		// A leading space marks a homonym line; check it before trimming.
		m_bHomonym = (LineStr[0] == ' ');

		Trim(LineStr);
		SetWord(::GetWordForm(LineStr) );
	}
	else
	{
		m_bHomonym = false;
		SetWord( "" );
	}

	const size_t LineLength = LineStr.length();

	// Skip the first token: advance to the first whitespace character.
	// The scan starts at 1 because the first char can be a space (if this line
	// contains a homonym).  The start is clamped to the line length so that an
	// empty line never makes c_str()+i point past the terminating NUL.
	size_t i = (LineLength < 1) ? LineLength : 1;
	while (i < LineLength && isspace((BYTE) LineStr[i]) == 0)
		i++;

	if (sscanf (LineStr.c_str()+i, "%i %i", &m_FilePosition, &m_TokenLengthInFile) != 2)
		return false;

	// Pass all numbers: skip digits, whitespace and minus signs.
	for (; i < LineLength; i++)
	{
		const BYTE ch = (BYTE) LineStr[i];
		if (isdigit(ch) == 0 && isspace(ch) == 0 && ch != '-')
			break;
	}

	// GetMorphSignPosition works on the tail of the line, so its result is
	// relative to i; -1 means there is no morphological part.
	const int RelMorphSignPos = GetMorphSignPosition(LineStr.c_str()+i);
	const size_t MorphSignPos = (RelMorphSignPos == -1)
		? LineLength
		: i + static_cast<size_t>(RelMorphSignPos); // absolute offset in LineStr

	// Prepend a space, because callers often search for a descriptor together
	// with its leading space (the original note gives " ЛЕ" as an example).
	m_GraphDescr = " " + LineStr.substr (i, MorphSignPos - i);

	if (MorphSignPos != LineLength)
	{
		StringTokenizer tok(LineStr.c_str()+MorphSignPos," ");

		// First token: one morph-sign character followed by a 2-char common gram code.
		if (!tok() ) return false;
		string MorphSign  = tok.val();
		if (MorphSign.length() != 3) return false;
		m_MorphSign = MorphSign[0];
		m_CommonGramCode = MorphSign.substr(1);

		if (!tok() ) return false;
		m_Lemma = tok.val();
		if (m_Lemma.empty())  return false;

		if (!tok() ) return false;
		SetGramCodes ( tok.val(), pRusGramTab);

		if (!tok() ) return false;
		m_ParadigmId = tok.val();

		if (!tok() ) return false;
		m_HomoWeight = tok.val();
	}

	// The token type is the first one whose name init_flag finds in the descriptors.
	m_TokenType = OTHER_TOKEN_TYPE;
	for (int k=(int)RLE; k < OTHER_TOKEN_TYPE; k++)
		if (init_flag (m_GraphDescr, TokenTypeToString((MainTokenTypeEnum)k).c_str() ))
		{
			m_TokenType = (MainTokenTypeEnum)k;
			break;
		}

	if (init_flag (m_GraphDescr, "Aa"))
		m_Register = UpLow;
	else
	if (init_flag (m_GraphDescr, "AA"))
		m_Register = UpUp;
	else
	{
		// The result is deliberately ignored: LowLow is the default register.
		// NOTE(review): the call is kept because init_flag may modify
		// m_GraphDescr - confirm against its definition.
		init_flag (m_GraphDescr, "aa");
		m_Register = LowLow;
	}

	m_bFirstUpperAlpha =  (m_Register == UpUp) || (m_Register == UpLow);

	m_bFI1 = init_flag (m_GraphDescr, "FAM1");
	m_bFI2 = init_flag (m_GraphDescr, "FAM2");
	m_bName = init_flag (m_GraphDescr, "NAM?");
	m_bSent2 = init_flag (m_GraphDescr, "SENT_END");

	// Keep the position as size_t so the comparison with string::npos does not
	// depend on a narrowing round-trip through int.
	const size_t hyphen_occur = m_Word.find("-");
	m_bHyphenWord = (hyphen_occur != string::npos) && ( (m_TokenType == RLE) ||(m_TokenType == LLE));

	m_bOborot1 = (m_GraphDescr.find("EXPR1") != string::npos);
	m_bOborot2 = (m_GraphDescr.find("EXPR2") != string::npos);

	bool bRomanNumber = is_roman_number(m_Word.c_str(), m_Word.length());
	if ((hyphen_occur != string::npos) && (hyphen_occur!=0))
	{
		// A token such as "I-го" (Roman numeral plus a hyphenated ending) is
		// one word: only the part before the hyphen is tested.
		bRomanNumber = is_roman_number(m_Word.c_str(), hyphen_occur);
	}
	if (bRomanNumber)
	{
		// Roman numbers carry no morphological interpretation.
		m_TokenType = ROMAN_NUM;
		m_CommonGramCode = "";
		m_MorphSign = 0;
		m_ParadigmId = "";
	}

	Trim(m_GraphDescr);
	return true;
}