Ejemplo n.º 1
0
// Returns the length of the longest prefix that `w` shares with one of its
// other cyclic shifts or with a cyclic shift of its inverse. The result is
// never less than 1.
//
// The search stops early as soon as the best length found so far reaches one
// sixth of `w.size()`. When `w` and its inverse have the same least cyclic
// shift, only the cyclic shifts of `w` itself are examined.
CWord::size_type LongestPieceFixedStart(const CWord& w) {
  CWord::size_type result = 1;

  // Pass 1: compare w with every cyclic shift of itself that differs from w.
  auto shifted = w;
  shifted.CyclicLeftShift();
  while (shifted != w) {
    result = std::max(result, CommonPrefixLength(w, shifted));
    if (result * 6 >= w.size()) {
      return result;
    }
    shifted.CyclicLeftShift();
  }

  // The inverse is computed once and reused as the loop sentinel below
  // instead of being rebuilt on every iteration.
  const auto inverse = w.Inverse();
  if (LeastCyclicShift(inverse) == LeastCyclicShift(w)) {
    return result;
  }

  // Pass 2: compare w with every cyclic shift of its inverse. The unshifted
  // inverse is handled first, without the one-sixth early exit.
  auto shifted_inverse = inverse;
  result = std::max(result, CommonPrefixLength(w, shifted_inverse));
  shifted_inverse.CyclicLeftShift();

  while (shifted_inverse != inverse) {
    result = std::max(result, CommonPrefixLength(w, shifted_inverse));
    if (result * 6 >= w.size()) {
      return result;
    }
    shifted_inverse.CyclicLeftShift();
  }

  return result;
}
Ejemplo n.º 2
0
// Attaches `article` to the word obtained from GetWordForMultiWord() for the
// sequence `ws`, records the found article and stores `ws` in m_wordSequences.
//
// If that word covers exactly one source word and has only unknown-POS
// homonyms, its first homonym is reused; otherwise a new lower-cased homonym
// is created and added to the word.
void CMultiWordCreator::AddQuoteMultiWord(CWordSequence& ws, const TArticleRef& article)
{
    SWordHomonymNum homRef;
    Wtroka text;
    CWord* multiWord = GetWordForMultiWord(ws, text, homRef);

    const bool reuseFirstHomonym =
        multiWord->m_SourceWords.Size() == 1 && multiWord->HasOnlyUnknownPOS();

    if (!reuseFirstHomonym) {
        // Build a fresh homonym spanning the whole sequence.
        multiWord->m_SourceWords.SetPair(ws.FirstWord(), ws.LastWord());
        if (text.size() == 0)
            text = multiWord->m_txt;
        TMorph::ToLower(text);
        CHomonym* created = new CHomonym(TMorph::GetMainLanguage(), text);
        created->SetSourceWordSequence(&ws);
        created->PutArticle(article);
        homRef.m_HomNum = multiWord->AddRusHomonym(created);
    } else {
        // Reuse the first existing homonym of the word.
        size_t firstId = multiWord->IterHomonyms().GetID();
        CHomonym& existing = multiWord->GetRusHomonym(firstId);
        existing.SetSourceWordSequence(&ws);
        existing.PutArticle(article);
        homRef.m_HomNum = firstId;
    }

    // Register the article under its keyword type and title, taken from the
    // aux dictionary when the reference is valid there, from gzt otherwise.
    if (article.AuxDic().IsValid()) {
        const article_t* auxArticle = GlobalDictsHolder->GetAuxArticle(article.AuxDic());
        YASSERT(auxArticle != NULL);
        AddFoundArticle(auxArticle->get_kw_type(), auxArticle->get_title(), homRef);
    } else {
        YASSERT(!article.Gzt().Empty());
        AddFoundArticle(article.Gzt().GetType(), article.Gzt().GetTitle(), homRef);
    }

    m_wordSequences.push_back(&ws);
}
Ejemplo n.º 3
0
bool CAsmFile::ParseBlockCMT()
{
		// 扫描注释.
	SetCurLine(0);
	CWord* pWord = NULL;
	bool bCmt = false;
	do 
	{
		// 寻找 / *
		while ((pWord = GetAllNextWord()) != NULL)
		{
			string& str = pWord->GetWord();
			if (str == "/*")
			{
				bCmt = true;
				pWord->SetAttrib(EWORD_COMMENT);
				break;
			}
		}
		// 标记注释和结束符
		while ((pWord = GetAllNextWord()) != NULL)
		{
			string& str = pWord->GetWord();
			pWord->SetAttrib(EWORD_COMMENT);
			if (str == "*/")
			{
				bCmt = false;
				break;
			}
		}
	} while (pWord != NULL);
	return (bCmt == false);
}
Ejemplo n.º 4
0
// Returns the text of `word`: the joined text from `allWords` for a multiword,
// the word's own text otherwise. A sentence-initial word that does not have at
// least two upper-case letters is lower-cased, which removes plain title-case.
inline Wtroka ExtractText(const CWord& word, const CWordVector& allWords) {
    Wtroka text;
    if (word.IsMultiWord())
        text = allWords.ToString(word);
    else
        text = word.GetText();

    // Keep the original case unless the word merely opens a sentence.
    if (!word.IsFirstInSentence() || word.HasAtLeastTwoUpper())
        return text;

    TMorph::ToLower(text);
    return text;
}
Ejemplo n.º 5
0
// Returns the number of leading letters on which `u` and `v` agree.
//
// Both words are taken by value because the comparison consumes them from the
// front. Counting stops at the first mismatch or as soon as either word runs
// out of letters, so the front of an empty word is never read and the result
// never exceeds the length of the shorter word.
CWord::size_type CommonPrefixLength(CWord u, CWord v) {
  CWord::size_type result = 0;
  while (!u.Empty() && !v.Empty() && u.GetFront() == v.GetFront()) {
    ++result;
    u.PopFront();
    v.PopFront();
  }
  return result;
}
Ejemplo n.º 6
0
// Returns the smaller (by operator<) of the least cyclic permutation of `w`
// and the least cyclic permutation of its inverse. When neither compares
// smaller, the one computed from the inverse is returned.
// Precondition (asserted): `w` is empty, or its last letter is not the
// inverse of its first letter.
CWord ConjugationInverseNormalForm(const CWord& w) {
  assert(w.Empty() || w.GetBack().Inverse() != w.GetFront());
  auto direct_form = LeastCyclicPermutation(w);
  auto inverse_form = LeastCyclicPermutation(w.Inverse());
  return (direct_form < inverse_form) ? direct_form : inverse_form;
}
bool CAnalyticFormBuilder::HasDeclinableSynNounInInstrumentalis(const CWord& _W) const
{
    for (CWord::SHomIt it = _W.IterHomonyms(); it.Ok(); ++it)
        if (IsSynNoun(*it) && it->HasGrammem(gInstrumental) && !it->HasGrammem(gNominative))
            return true;
    return false;
}
bool CAnalyticFormBuilder::HasShortParticipleOrAdj(const CWord& _W) const
{
    for (CWord::SHomIt it = _W.IterHomonyms(); it.Ok(); ++it)
        if (it->IsShortAdjectiveOrParticiple())
            return true;
    return false;
}
Ejemplo n.º 9
0
bool CWord::HasCommonHomonyms(const CWord& w) const
{
    if (!IsDictionary() || !w.IsDictionary()) {
        SHomIt it1 = IterHomonyms();
        SHomIt it2 = w.IterHomonyms();
        if (it1.Ok() && it2.Ok() && TMorph::IdentifSimilar(it1->Lemma, it2->Lemma))
            return true;
    }

    for (SHomIt it1 = IterHomonyms(); it1.Ok(); ++it1)
        for (SHomIt it2 = w.IterHomonyms(); it2.Ok(); ++it2)
            if (it1->Lemma == it2->Lemma)
                return true;

    return false;
}
bool CAnalyticFormBuilder::HasCompar(const CWord& W)
{
    for (CWord::SHomIt it = W.IterHomonyms(); it.Ok(); ++it)
        if (it->IsFullAdjective() && it->HasGrammem(gComparative))
            return true;

    return false;
}
bool CAnalyticFormBuilder::HasInfinitive(const CWord& W)
{
    for (CWord::SHomIt it = W.IterHomonyms(); it.Ok(); ++it)
        if (it->HasGrammem(gInfinitive))
            return true;

    return false;
}
Ejemplo n.º 12
0
// Renders the sentence as text in `encoding`, wrapping each word range from
// `wp` in <font color="sColor"> ... </font>. An opening tag is emitted before
// the first word of a range and a closing tag after its last word.
// The result is passed through StripString() before being returned.
Stroka CSentence::ToHTMLColorString(const yvector<CWordsPair>& wp, const Stroka& sColor, ECharset encoding) const
{
    Stroka html;
    for (size_t wordIdx = 0; wordIdx < m_words.size(); ++wordIdx) {
        const int position = static_cast<int>(wordIdx);
        CWord* current = m_words[wordIdx].Get();

        // One opening tag per range that starts at this word.
        for (size_t k = 0; k < wp.size(); ++k) {
            if (wp[k].FirstWord() == position)
                html += Substitute("<font color=\"$0\">", sColor);
        }

        // No separating space before punctuation, except for brackets.
        const bool needsSpace = wordIdx > 0
            && (!current->IsPunct() || current->IsOpenBracket() || current->IsCloseBracket());
        if (needsSpace)
            html += " ";

        html += NStr::Encode(current->GetOriginalText(), encoding);

        // One closing tag per range that ends at this word.
        for (size_t k = 0; k < wp.size(); ++k) {
            if (wp[k].LastWord() == position)
                html += "</font>";
        }
    }
    return StripString(html);
}
bool CAnalyticFormBuilder::HasPredik(const CWord& W)
{
    for (CWord::SHomIt it = W.IterHomonyms(); it.Ok(); ++it)
        if (!it->Grammems.IsIndeclinable()) //  "бух","спасибо" не строятся с анал. формами
            if (it->HasGrammem(gPraedic))
                return true;

    return false;
}
Ejemplo n.º 14
0
bool CAsmFile::ParseKeyword()
{
	SetCurLine(0);
	CWord* pWord = NULL;

	do 
	{
		// bool bCmt = false;
		while ((pWord = GetLineNextWord()) != NULL)
		{
			if (pWord->GetAttrib() == EWORD_INIT)
			{
				pWord->SetAttrib(EWORD_KEYWORD);
				break;
			}
		}
	} while (GetNextLine());
	return true;
}
Ejemplo n.º 15
0
// Returns the bit set of tags allowed for `word`.
//
// The set comes from m_TagDict; a word with no entry (lookup yields 0) is
// given every bit. When _ENGLISH_TAGS_H is defined, the tags suggested by
// getPossibleTagsBySuffix() and PENN_TAG_MUST_SEE are OR-ed in as well.
// The result is never 0 (asserted).
unsigned long long TARGET_LANGUAGE::CTagger::getPossibleTagsForWord( const CWord &word ) {
   // Plain local: the value is returned by copy, so no static storage is needed.
   unsigned long long possible_tags = m_TagDict.lookup(word);
   // All bits set, spelled with the variable's own type.
   if (possible_tags==0) possible_tags = ~0ULL;
#ifdef _ENGLISH_TAGS_H
   possible_tags |= getPossibleTagsBySuffix( word.str() );
   possible_tags |= PENN_TAG_MUST_SEE ;
#endif
   assert(possible_tags!=0);
   return possible_tags;
}
Ejemplo n.º 16
0
// return found (or newly made) homonym id
static int FindOrMakeMultiwordHomonym(const CWordSequence& ws, CWord& word, TKeyWordType kwtype, THomonymGrammems grammems, THomonymPtr& res) {
    // use the content of CWordSequence.m_Lemmas as new homonym lemma
    Wtroka str = ws.GetLemma();
    if (!str)
        str = word.GetLowerText();
    TMorph::ToLower(str);

    int homId = -1;
    // search if we already have a multiword homonym with such text
    if (!FindBestHomonym(word, str, kwtype, grammems, homId)) {
        // create new homonym, if there is no ready one
        res = new CHomonym(TMorph::GetMainLanguage(), str);
        if (ws.HasAuxArticle())
            res->PutAuxArticle(ws.GetAuxArticleIndex());
        else
            res->PutGztArticle(ws.GetGztArticle());
        homId = word.AddRusHomonym(res);
    }
    return homId;
}
Ejemplo n.º 17
0
bool CAsmFile::ParseLineCMT()
{
	SetCurLine(0);
	CWord* pWord = NULL;

	do 
	{
		bool bCmt = false;
		while ((pWord = GetLineNextWord()) != NULL)
		{
			string& str = pWord->GetWord();
			if (str == "@")
			{
				bCmt = true;
			}
			if (bCmt)
			{
				pWord->SetAttrib(EWORD_COMMENT);
			}
		}
	} while (GetNextLine());
	SetCurLine(0);

	do 
	{
		bool bCmt = false;
		while ((pWord = GetLineNextWord()) != NULL)
		{
			string& str = pWord->GetWord();
			if (str == "//")
			{
				bCmt = true;
			}
			if (bCmt)
			{
				pWord->SetAttrib(EWORD_COMMENT);
			}
		}
	} while (GetNextLine());
	return true;
}
Ejemplo n.º 18
0
// Returns the maximum of LongestPieceFixedStart() over the cyclic shifts of
// `w`, starting with `w` itself (never less than 1). The scan stops early
// once the maximum reaches one sixth of `w.size()`.
CWord::size_type LongestPiece(const CWord& w) {
  CWord::size_type best = 1;
  auto rotation = w;
  do {
    const CWord::size_type candidate = LongestPieceFixedStart(rotation);
    if (best < candidate) {
      best = candidate;
    }
    if (best * 6 >= w.size()) {
      break;
    }
    rotation.CyclicLeftShift();
  } while (rotation != w);
  return best;
}
bool CAnalyticFormBuilder::AllHomonymsArePredicates(const CWord& W) const
{
    const TGramBitSet VerbGerundPraedic(gVerb, gGerund, gPraedic);
    for (CWord::SHomIt it = W.IterHomonyms(); it.Ok(); ++it) {
        if (it->HasAnyOfPOS(VerbGerundPraedic) || it->IsShortAdjectiveOrParticiple())
             continue;
        //verbs that can not be predicates or an.f.
        if (!TAnalyticFormPredicates::Has(it->Lemma))
            return false;
     }
    return true;
}
Ejemplo n.º 20
0
// Tries every pair of cyclic shifts of `u` and `v` and finds the pair with
// the longest common prefix.
//
// Returns (shift of u, shift of v, prefix length) for the first pair that
// achieves the maximum. When no letters match at all the result is
// (u.size(), v.size(), 0). A match is limited to the current shifts and does
// not wrap around the end of either word.
std::tuple<unsigned short, unsigned short, unsigned short> LongestCommonSubwordCyclic(CWord u, CWord v) {
  unsigned short best_length = 0;
  unsigned short best_u_shift = u.size();
  unsigned short best_v_shift = v.size();

  for (unsigned short u_shift = 0; u_shift < u.size(); ++u_shift) {
    for (unsigned short v_shift = 0; v_shift < v.size(); ++v_shift) {
      // Work on copies so the current shifts stay intact.
      auto u_rest = u;
      auto v_rest = v;
      unsigned short matched = 0;
      while (!u_rest.Empty() && !v_rest.Empty() && u_rest.GetFront() == v_rest.GetFront()) {
        ++matched;
        u_rest.PopFront();
        v_rest.PopFront();
      }
      if (matched > best_length) {
        best_length = matched;
        best_u_shift = u_shift;
        best_v_shift = v_shift;
      }
      v.CyclicLeftShift();
    }
    // v has been shifted v.size() times here, i.e. through one full cycle.
    u.CyclicLeftShift();
  }

  return std::make_tuple(best_u_shift, best_v_shift, best_length);
}
bool CAnalyticFormBuilder::HasAnalyticalBe(const CWord& _W) const
{
    //NB! в настоящем варианте парсера нет словаря оборотов, поэтому проверка слова на оборот временно отключена
    // если мы попали на оборот(например, "может быть"), тогда не будем строить здесь анал. форму.
    //if (_W.IsInOborot()) return false;

    // "быто" предсказывается как "быть"
    //if (_W.m_bPredicted) return false;
    //NB! пока предсказание отсутствует в парсере

    //if ( _W.HasPOS(UnknownPOS) ) return false;
    if (_W.HasUnknownPOS()) return false;

    for (CWord::SHomIt it = _W.IterHomonyms(); it.Ok(); ++it) {
        bool is_verb = it->IsPersonalVerb();
        if (is_verb && !it->IsPresentTense() && it->Lemma == kByt)
            return true;
        if ((is_verb || it->HasGrammem(gInfinitive)) && it->Lemma == kStat)
            return true;
    }
    return false;
}
Ejemplo n.º 22
0
// Updates, for training round `round`, the score of every local feature fired
// by the word at position `index` of the word sequence `outout`.
//
// method : eAdd adds 1 to each feature score, any other value subtracts 1.
// outout : the sequence of words; must contain position `index`.
// index  : position of the current word; features involving the previous
//          word are only updated when index > 0.
// round  : passed through unchanged to every updateScore() call.
void CFeatureHandle::updateLocalFeatureVector(SCORE_UPDATE method, const CStringVector* outout, int index, int round) {
   // about words
   CWord word = outout->at(index);
   CWord last_word = index>0 ? outout->at(index-1) : g_emptyWord;
   CTwoWords two_word;
   two_word.allocate(word.str(), last_word.str());
   CStringVector chars;
   chars.clear(); getCharactersFromUTF8String(word.str(), &chars);
   // about length (both lengths are capped at LENGTH_MAX-1)
   int length = getUTF8StringLength(word.str()); if (length > LENGTH_MAX-1) length = LENGTH_MAX-1;
   int last_length = getUTF8StringLength(last_word.str()); if (last_length > LENGTH_MAX-1) last_length = LENGTH_MAX-1;
   // about chars
   CWord first_char = getFirstCharFromUTF8String(word.str());
   CWord last_char = getLastCharFromUTF8String(word.str());
   CWord first_char_last_word = index>0 ? getFirstCharFromUTF8String(last_word.str()) : g_emptyWord;
   CWord last_char_last_word = index>0 ? getLastCharFromUTF8String(last_word.str()) : g_emptyWord;
   CWord two_char = index>0 ? last_char_last_word.str() + first_char.str() : g_emptyWord;
   CTwoWords first_and_last_char, lastword_firstchar, currentword_lastchar, firstcharlastword_word, lastword_lastchar;
   first_and_last_char.allocate(first_char.str(), last_char.str());
   if (index>0) {
      lastword_firstchar.allocate(last_word.str(), first_char.str());
      currentword_lastchar.allocate(word.str(), last_char_last_word.str());
      firstcharlastword_word.allocate(first_char_last_word.str(), first_char.str());
      lastword_lastchar.allocate(last_char_last_word.str(), last_char.str());
   }

   SCORE_TYPE amount = ( (method==eAdd) ? 1 : -1 ) ;

   // features of the current word alone and of the (word, previous word) pair
   m_weights.m_mapSeenWords.updateScore(word, amount, round);
   m_weights.m_mapLastWordByWord.updateScore(two_word, amount, round);
   if (length==1) m_weights.m_mapOneCharWord.updateScore(first_char, amount, round);
   else {
      m_weights.m_mapFirstAndLastChars.updateScore(first_and_last_char, amount, round);
      // Every pair of adjacent characters. The bound is written as j+1 < size
      // so that an empty `chars` yields no iterations; `size()-1` would wrap
      // around in unsigned arithmetic and index out of bounds.
      for (int j=0; j+1<static_cast<int>(chars.size()); j++) {
         m_weights.m_mapConsecutiveChars.updateScore(chars[j]+chars[j+1], amount, round);
      }
      m_weights.m_mapLengthByFirstChar.updateScore(std::make_pair(first_char, length), amount, round);
      m_weights.m_mapLengthByLastChar.updateScore(std::make_pair(last_char, length), amount, round);
   }
   // features that involve the previous word
   if (index>0) {
      m_weights.m_mapSeparateChars.updateScore(two_char, amount, round);

      m_weights.m_mapLastWordFirstChar.updateScore(lastword_firstchar, amount, round);
      m_weights.m_mapCurrentWordLastChar.updateScore(currentword_lastchar, amount, round);

      m_weights.m_mapFirstCharLastWordByWord.updateScore(firstcharlastword_word, amount, round);
      m_weights.m_mapLastWordByLastChar.updateScore(lastword_lastchar, amount, round);

      m_weights.m_mapLengthByLastWord.updateScore(std::make_pair(last_word, length), amount, round);
      m_weights.m_mapLastLengthByWord.updateScore(std::make_pair(word, last_length), amount, round);
   }
}
Ejemplo n.º 23
0
static bool FindBestHomonym(const CWord& word, TWtringBuf lemma, TKeyWordType kwtype, THomonymGrammems grammems, int& homId) {
    // search if we already have a multiword homonym with such text
    bool found = false;
    bool foundSameKwType = false;
    for (CWord::SHomIt it = word.IterHomonyms(); it.Ok(); ++it)
        if (it->CHomonymBase::GetLemma() == lemma) {
            if (it->HasKWType(kwtype, KW_DICT)) {
                if (it->Grammems == grammems) {
                    homId = it.GetID();     // prefer homonyms with same kwtype and same grammems
                    return true;
                } else if (!foundSameKwType) {
                    homId = it.GetID(); // prefer homonyms with same kwtype
                    found = true;
                    foundSameKwType = true;
                }
            } else  if (!found) {
                homId = it.GetID();   // otherwise, return the first one having @lemma text
                found = true;
            }
        }

    return found;
}
Ejemplo n.º 24
0
// Updates, for training round `round`, the weight of every local tagging
// feature fired by the (word, tag) pair at position `index` of `sentence`.
//
// method   : eAdd adds 1 to each feature weight, any other value subtracts 1.
// sentence : sequence of pairs; `.first` is used as the word string and
//            `.second` is used to construct the CTag.
// index    : position of the current word; must be a valid position because
//            sentence->at(index) is called unconditionally.
// round    : passed through unchanged to every updateCurrent() call.
//
// NOTE(review): chars[0] is read unconditionally below, so the word at
// `index` is presumably never empty - confirm against callers.
void CTagger :: updateLocalFeatureVector( SCORE_UPDATE method , const CTwoStringVector * sentence , unsigned long index , unsigned long round ) {
    // about words
    CWord word = sentence->at( index ).first ;
    CWord last_word = index > 0 ? sentence->at( index - 1 ).first : g_emptyWord ;
    CWord next_word = index < sentence->size() - 1 ? sentence->at( index + 1 ).first : g_emptyWord ;
    CStringVector chars , last_chars ;
    chars.clear() ;
    getCharactersFromUTF8String( sentence->at(index).first , &chars ) ;
    last_chars.clear() ;
    if ( index > 0 ) getCharactersFromUTF8String( sentence->at( index - 1 ).first , &last_chars ) ;
    // about length (counted in characters; the LENGTH_MAX cap is disabled here)
    int length = chars.size() ; //if ( length > LENGTH_MAX-1 ) length = LENGTH_MAX-1 ;
    int last_length = last_chars.size() ; //if ( last_length > LENGTH_MAX-1 ) last_length = LENGTH_MAX-1 ;
    // about chars
    CWord first_char = chars[ 0 ];
    CWord last_char = chars[ chars.size() - 1 ];
    CWord first_char_last_word = index > 0 ? last_chars[ 0 ] : g_emptyWord;
    CWord last_char_last_word = index > 0 ? last_chars[ last_chars.size() - 1 ] : g_emptyWord;
    CWord first_char_next_word = index + 1 < sentence->size() ? getFirstCharFromUTF8String( sentence->at( index + 1 ).first ) : g_emptyWord ;
    // Last two characters before the current word: taken from the previous
    // word when it has at least two characters, otherwise completed with the
    // last character of the word before it (when index > 1).
    CWord last_twochar_last_word = last_chars.size() > 1 ? last_chars[ last_chars.size() - 2 ] + last_chars[ last_chars.size() - 1]
                                   : ( index > 1 ? getLastCharFromUTF8String(sentence->at(index-2).first) + last_chars[ 0 ] : g_emptyWord );
    // First two characters from the current word on, completed with the first
    // character of the next word when the current word has only one.
    CWord first_twochar = chars.size() > 1 ? chars[ 0 ] + chars [ 1 ] : ( index + 1 <sentence->size() ? chars[ 0 ] + getFirstCharFromUTF8String( sentence->at( index + 1 ).first ) : g_emptyWord );
    CWord currentword_lasttwochar = index > 1 ? last_twochar_last_word.str() + word.str() : g_emptyWord ;
    CWord lastword_firsttwochar = index > 0 && index+1 < sentence->size() ? last_word.str() + first_twochar.str() : g_emptyWord ;

    CWord two_char = index > 0 ? last_char_last_word.str() + first_char.str() : g_emptyWord ;
    CWord lastword_firstchar = index > 0 ? last_word.str() + first_char.str() : g_emptyWord ;
    CWord currentword_lastchar = index > 0 ? last_char_last_word.str() + word.str() : g_emptyWord ;
    CWord three_char = length == 1 ? last_char_last_word.str() + word.str() + first_char_next_word.str() : g_emptyWord ;

    // NOTE(review): next_word, two_char, currentword_lasttwochar,
    // lastword_firsttwochar and two_word are not referenced by any update below.
    CTwoWords two_word ;

    // about tags
    const CTag tag( sentence->at(index).second ) ;
    const CTag last_tag = index > 0 ? CTag( sentence->at( index-1 ).second) : CTag::SENTENCE_BEGIN ;
    const CTag second_last_tag = index > 1 ? CTag( sentence->at( index-2 ).second) : CTag::SENTENCE_BEGIN ;
    const CTagSet<CTag, 2> tag_bigram(encodeTags(tag, last_tag));
    const CTagSet<CTag, 3> tag_trigram(encodeTags(tag, last_tag, second_last_tag));
    CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
    CTwoTaggedWords wt12;

    // about the char categories: the dictionary entry for the character with
    // the bit of the current tag OR-ed in.
    // NOTE(review): `1<<tag.code()` shifts an int; this presumably relies on
    // tag codes staying below the width of int - confirm.
    long int first_char_cat = m_weights->m_mapCharTagDictionary.lookup(first_char) | (1<<tag.code()) ;
    long int last_char_cat = m_weights->m_mapCharTagDictionary.lookup(last_char) | (1<<tag.code()) ;
    SCORE_TYPE amount = method == eAdd ? 1 : -1 ;

    // tag unigram / bigram / trigram features
    m_weights->m_mapCurrentTag[ std::make_pair(word, tag) ].updateCurrent( amount , round ) ;
    m_weights->m_mapLastTagByTag[ tag_bigram ].updateCurrent( amount , round ) ;
    m_weights->m_mapLastTwoTagsByTag[ tag_trigram ].updateCurrent( amount , round ) ;
    // features that involve the previous word; each one fires only when the
    // word it is keyed on has at most two characters
    if ( index > 0 ) {
        if ( last_length <= 2 ) m_weights->m_mapTagByLastWord[ std::make_pair(last_word, tag) ].updateCurrent( amount , round ) ;
        if ( length <= 2 ) m_weights->m_mapLastTagByWord[ std::make_pair(word, last_tag) ].updateCurrent( amount , round ) ;
        if ( length <= 2 ) m_weights->m_mapTagByWordAndPrevChar[ std::make_pair(currentword_lastchar, tag) ].updateCurrent( amount , round ) ;
        if ( last_length <= 2 ) m_weights->m_mapTagByWordAndNextChar[ std::make_pair(lastword_firstchar, last_tag) ].updateCurrent( amount , round ) ;
    }
    if ( length == 1 ) {
        // one-character word: a single context feature, and only when the
        // word has both a left and a right neighbour
        if ( index > 0 && index < sentence->size() - 1 )
            m_weights->m_mapTagOfOneCharWord[ std::make_pair(three_char, tag) ].updateCurrent( amount , round ) ;
    }
    else {
        // longer word: character-level features
        m_weights->m_mapTagByFirstChar[ std::make_pair(first_char, tag) ].updateCurrent( amount , round ) ;
        m_weights->m_mapTagByLastChar[ std::make_pair(last_char, tag) ].updateCurrent( amount , round ) ;                    //
        m_weights->m_mapTagByFirstCharCat[ std::make_pair(first_char_cat, tag) ].updateCurrent( amount , round ) ;
        m_weights->m_mapTagByLastCharCat[ std::make_pair(last_char_cat, tag) ].updateCurrent( amount , round ) ;
        for ( int j = 0 ; j < chars.size() ; ++ j ) {
            // interior characters only
            if ( j > 0 && j < chars.size() - 1 )
                m_weights->m_mapTagByChar[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount , round ) ;
            // every character but the first, paired with the first character
            if ( j > 0 ) {
                wt1.load(chars[j], tag);
                wt2.load(first_char);
                wt12.allocate(wt1, wt2);
                m_weights->m_mapTaggedCharByFirstChar[ wt12 ].updateCurrent( amount , round ) ;
                if ( chars[j] == chars[j-1] ) m_weights->m_mapRepeatedCharByTag[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount , round ) ; //
            }
            // every character but the last, paired with the last character
            if (j<chars.size()-1) {
                wt1.load(chars[j], tag);
                wt2.load(last_char);
                wt12.allocate(wt1, wt2);
                m_weights->m_mapTaggedCharByLastChar[ wt12 ].updateCurrent(amount, round);
            }
        }
    }

}
Ejemplo n.º 25
0
/////////////////////////////////////////////////////////////////////////////
//                   CFString::ExtractWordsFromString
// Walks the rasters of the line *Comingline and splits them into words, which
// are appended to m_arWords. A raster whose first alternative code is found
// by strchr() in SeparatorsWord (" ") ends the current word; any other
// retained raster starts a new word or is added to the open one.
// Also fills m_rectBaseLine and m_rectString from the line attributes.
// Count->Words is incremented for every new word; Count->Strings is
// decremented when no raster of the line was retained.
void CFString::ExtractWordsFromString( CSTR_line* Comingline , PageElementCount* Count)
{
 CSTR_rast_attr	 rast_attr;
 CSTR_rast       rast;
 RecVersions     vers;
 CSTR_line	      line;
 CSTR_attr       line_attr;
 CWord*          CurrentWord;
 const char*     SeparatorsWord=" ";
 const char*     result=NULL;
 int16_t             FlagWord;
 Bool            FlagString,FlagCapDrop;

 line        = *Comingline;
 FlagWord    = 0;      // non-zero while a word is open
 FlagString  = FALSE;  // becomes TRUE once any raster is retained
 FlagCapDrop = FALSE;

 CSTR_GetLineAttr( line , &line_attr );
 if(line_attr.Flags & CSTR_STR_CapDrop) // drop cap - cf. the first letter in fairy tales
    FlagCapDrop = TRUE;

 SetRect(&m_rectBaseLine, line_attr.bs1, line_attr.bs2,
                        line_attr.bs3, line_attr.bs4); //don't used now
 // string rectangle, shifted by TemplateOffset
 SetRect(&m_rectString, line_attr.col - TemplateOffset.x,
                      line_attr.row - TemplateOffset.y,
	                  line_attr.col - TemplateOffset.x + line_attr.wid,
                      line_attr.row - TemplateOffset.y + line_attr.hei);

 #ifdef alDebug //obsolete option
 {
  RECT rect;
  SetRect(&rect,line_attr.col, line_attr.row,
                         line_attr.col + line_attr.wid,
					     line_attr.row + line_attr.hei);
  pInputArray->push_back(rect);
 }
 #endif

 rast =  CSTR_GetFirstRaster( line );            // the first raster is fictitious
 rast =  CSTR_GetNextRaster( rast , CSTR_f_all );

 while( rast )
 {
	 CSTR_GetCollection( rast,&vers );
     CSTR_GetAttr( rast,&rast_attr );
     //#define NEG_HALF_SPACE  0x1e
     //#define POS_HALF_SPACE  0x1f
     //#define REGULAR_SPACE   0x20

     if((rast_attr.flg&CSTR_f_dust) ||  (rast_attr.flg&CSTR_f_fict ) || // dust, fictitious rasters and
	    ((rast_attr.flg&CSTR_f_space) &&
	    (!vers.Alt[0].Code || vers.Alt[0].Code == 0x1e ||
	    vers.Alt[0].Code == 0x1f))) // half-spaces are discarded.
      goto next_raster;

      FlagString = TRUE;
      // NOTE(review): strchr() also matches the terminating '\0', so a
      // retained raster with code 0 is treated as a separator as well.
	  result = strchr( SeparatorsWord, vers.Alt[0].Code );
	  if( FlagWord ){   // word openly
	    if(result)      // word is broken
	       FlagWord =0;
	    else{           // word lasts
    	   assert( CurrentWord );
           CurrentWord->AddLetter2Word( &rast , Count ,&FlagCapDrop );
        }
      }
	  else{             // expect new word
        if(!result){  // beginning word
          ++m_wWordsCount;
          ++Count->Words;
	      FlagWord = 1;
          m_arWords.push_back( new CWord() );
          CurrentWord = m_arWords[m_wWordsCount-1];
          assert( CurrentWord );

          // for the courier font spec the two low bits of the font value are cleared
   	      if(rast_attr.font_spec == CSTR_fs_courier)
             CurrentWord->m_wFontNumber     = rast_attr.font & 0xFC;
          else
             CurrentWord->m_wFontNumber     = rast_attr.font;

		  CurrentWord->m_wFontPointSize  = rast_attr.keg;
          CurrentWord->AddLetter2Word( &rast , Count ,&FlagCapDrop );
          // A drop cap gets point size 14 and the word is closed right away,
          // so the following raster starts a new word.
          if(FlagCapDrop){
            CurrentWord->m_wFontPointSize = 14;
		    FlagCapDrop = FALSE;
		    FlagWord    = 0;
          }
        }
      }
      next_raster:
	   rast = CSTR_GetNextRaster( rast , CSTR_f_all );
 }

 // nothing was retained from this line: take it back out of the string count
 if(FlagString == FALSE)
   Count->Strings--;
}
Ejemplo n.º 26
0
// Registers the word sequence `ws` as a (multi)word with the article attached
// to it and returns the reference to the homonym that now carries the article.
//
// ws            : the sequence; its gzt or aux article supplies the article
//                 title, keyword type and output grammems.
// takeOnwership : when true, `ws` is stored in m_wordSequences and, if it has
//                 no lemmas, the resulting homonym is normalized.
// newPos        : passed to MergeGrammems() together with the article grammems.
// searchAreaWP  : passed through to AddFoundArticle().
SWordHomonymNum CMultiWordCreator::AddMultiWordInt(CWordSequence* ws, bool takeOnwership,
                                                   const TGramBitSet& newPos, const CWordsPair& searchAreaWP)
{
    SWordHomonymNum wh = ws->GetMainWord();
    Wtroka stmp;
    SWordHomonymNum newWH;
    CWord* pNewWord = GetWordForMultiWord(*ws, stmp, newWH);

    pNewWord->m_SourceWords.SetPair(ws->FirstWord(), ws->LastWord());

    TGramBitSet art_grammems;      // output grammems of article
    Wtroka article_title;
    TKeyWordType article_type = NULL;

    // Collect title, keyword type and grammems from whichever article is attached.
    if (ws->HasGztArticle()) {
        const TGztArticle& gzt_article = ws->GetGztArticle();
        article_title = gzt_article.GetTitle();
        article_type = gzt_article.GetType();
        const NGzt::TMessage* lemma = gzt_article.GetLemmaInfo();
        if (lemma != NULL)
            art_grammems = gzt_article.GetLemmaOutputGrammems(*lemma);
    } else if (ws->HasAuxArticle()) {
        // NOTE(review): pArt is dereferenced without a null check here.
        const article_t* pArt = GlobalDictsHolder->GetAuxArticle(ws->GetAuxArticleIndex());
        art_grammems = pArt->get_new_pos();
        article_title = pArt->get_title();
        article_type = pArt->get_kw_type();
    }

    // Grammems for the resulting homonym: those of the sequence if it has any
    // (taking the POS from the main word when they carry no forms), otherwise
    // those of the main word when HasToAddGrammemsFromMainWord() allows it.
    THomonymGrammems newGram;
    if (!ws->GetGrammems().Empty()) {
        newGram = ws->GetGrammems();
        if (!newGram.HasForms() && wh.IsValid())
            newGram.SetPOS(m_Words[wh].Grammems.GetPOS());
    } else if (wh.IsValid() && HasToAddGrammemsFromMainWord(*ws))
        newGram = m_Words[wh].Grammems;
    MergeGrammems(newGram, art_grammems, newPos);

    // A real multiword (more than one source word, or no valid main word)
    // gets its homonym from FindOrMakeMultiwordHomonym(); pNewHom stays
    // empty when an existing homonym was found instead of a new one.
    THomonymPtr pNewHom;
    if (pNewWord->IsMultiWord() && (pNewWord->GetSourcePair().Size() != 1 || !wh.IsValid())) {
        newWH.m_HomNum = FindOrMakeMultiwordHomonym(*ws, *pNewWord, article_type, newGram, pNewHom);
        YASSERT(newWH.IsValid());
    }

    // No new homonym was created: attach the article to an existing one.
    if (pNewHom.Get() == NULL) {
        if (!pNewWord->IsMultiWord()) {
            if (wh.IsValid())
                newWH = wh;
            else {
                // just take the first homonym
                newWH.m_bOriginalWord = true;
                newWH.m_WordNum = pNewWord->GetSourcePair().FirstWord();
                newWH.m_HomNum = pNewWord->IterHomonyms().GetID();
            }
        }
        YASSERT(newWH.IsValid());
        // It often happens that we are forced to clone absolutely identical
        // homonyms that differ only in the aux_dic articles attached to them.
        // With geo_thesaurus.cxx this produces a huge number of homonyms
        // (more than 50 for the surname "Petrov"). So when the articles do not
        // differ in the SOSTAV (composition) field, the assigned grammems, the
        // POS and the KWType, the homonyms are not cloned and the additional
        // articles are written to CHomonym::m_KWtype2Articles instead. This
        // happens in CWord::PutArticleIndex.
        // If the articles found for one and the same homonym are considered
        // indistinguishable, then ws has the same main word for all of them,
        // and that word is given the first article encountered among the
        // indistinguishable ones.
        // For example, the articles "_petrova_2" and "_petrova_3" are the same
        // for us (they differ only in GEO_CHAST (geo part), which does not
        // matter to the parser), so there is no reason to multiply homonyms.
        bool bCloneAnyway = (!newGram.Empty() && !(m_Words[newWH].Grammems == newGram)) ||
                            !GlobalDictsHolder->BuiltinKWTypes().IsGeo(article_type);

        if (ws->HasAuxArticle())
            newWH.m_HomNum = m_Words.GetWord(newWH).PutAuxArticle(newWH.m_HomNum, ws->GetAuxArticleIndex(), bCloneAnyway);
        else
            newWH.m_HomNum = m_Words.GetWord(newWH).PutGztArticle(newWH.m_HomNum, ws->GetGztArticle(), bCloneAnyway);
    }
    YASSERT(newWH.IsValid());

    // Record the article and link the homonym back to its word sequence.
    AddFoundArticle(article_type, article_title, newWH, searchAreaWP);
    CHomonym& h = m_Words[newWH];
    h.SetSourceWordSequence(ws);
    if (!newGram.Empty())
        h.SetGrammems(newGram);

    if (takeOnwership) {
        if (!ws->HasLemmas())
            NormalizeMultiWordHomonym(pNewWord, &h);
        m_wordSequences.push_back(ws);
    }

    return newWH;
}