/**
 * Score the local features of one word of a partial analysis.
 *
 * Looks up, in m_weights, the weight of every local feature that fires for
 * the index-th word of `item` and returns their sum. The feature templates
 * used here are the same ones that updateLocalFeatureVector() trains, so the
 * two functions must be kept in sync.
 *
 * @param sentence  the input sentence; word boundaries taken from `item` are
 *                  used as inclusive [start, end] positions into it
 * @param item      the state that supplies word boundaries and tags
 * @param index     which word of `item` to score
 * @return the summed score, read with m_nScoreIndex
 */
SCORE_TYPE CTagger::getLocalScore( const CStringVector * sentence, CStateItem * item , unsigned long index ) {
   // about the word spans: the current word and, when there is one, the
   // previous word. The large constants are placeholders for "no previous
   // word"; every use of them is guarded by index > 0 or start > 0.
   const unsigned long start = item->getWordStart( index ) ;
   const unsigned long end = item->getWordEnd( index ) ;
   const unsigned long length = item->getWordLength( index ) ;
   const unsigned long last_start = index > 0 ? item->getWordStart( index-1 ) : 999999 ;
   const unsigned long last_length = index > 0 ? item->getWordLength( index-1 ) : 99999 ;

   // about the words
   const CWord &word = m_WordCache.find( start , end , sentence ) ;
   const CWord &last_word = index > 0 ? m_WordCache.find( last_start , start-1 , sentence ) : g_emptyWord ;

   // about the chars
   const CWord &first_char = m_WordCache.find( start , start , sentence ) ;
   const CWord &last_char = m_WordCache.find( end , end , sentence ) ;

   // NOTE(review): the next eight lookups are not used by any feature below.
   // They are kept because m_WordCache.find may populate the cache as a side
   // effect -- confirm that it is safe before deleting them.
   const CWord &first_char_last_word = index > 0 ? m_WordCache.find( last_start , last_start , sentence ) : g_emptyWord ;
   const CWord &last_char_last_word = index > 0 ? m_WordCache.find( start-1 , start-1 , sentence ) : g_emptyWord ;
   const CWord &first_char_next_word = end+1 < sentence->size() ? m_WordCache.find( end+1 , end+1 , sentence ) : g_emptyWord ;
   const CWord &first_twochar = start+1 < sentence->size() ? m_WordCache.find( start , start+1 , sentence ) : g_emptyWord ;
   const CWord &last_twochar_last_word = start > 1 ? m_WordCache.find( start-2 , start-1 , sentence ) : g_emptyWord ;
   const CWord &two_char = index > 0 ? m_WordCache.find( start-1 , start , sentence ) : g_emptyWord ;
   const CWord &currentword_lasttwochar = start > 1 ? m_WordCache.find( start-2 , end , sentence ) : g_emptyWord ;
   const CWord &lastword_firsttwochar = index > 0 && start+1 < sentence->size() ? m_WordCache.find( last_start , start+1 , sentence ) : g_emptyWord ;

   // Spans that cross a word boundary and are used as features.
   const CWord &lastword_firstchar = index > 0 ? m_WordCache.find( last_start , start , sentence ) : g_emptyWord ;
   const CWord &currentword_lastchar = index > 0 ? m_WordCache.find( start-1 , end , sentence ) : g_emptyWord ;
   // A one-character word together with its left and right neighbours.
   // end+1 < size() is used instead of end < size()-1 so that the unsigned
   // subtraction cannot wrap around.
   const bool has_both_neighbours = start > 0 && end+1 < sentence->size() ;
   const CWord &three_char = length == 1 && has_both_neighbours ? m_WordCache.find( start-1 , end+1 , sentence ) : g_emptyWord ;

   // about the tags
   const CTag tag = item->getTag(index);
   const CTag last_tag = index>0 ? item->getTag(index-1) : CTag::SENTENCE_BEGIN;
   const CTag second_last_tag = index>1 ? item->getTag(index-2) : CTag::SENTENCE_BEGIN;
   const CTagSet<CTag, 2> tag_bigram(encodeTags(tag, last_tag));
   const CTagSet<CTag, 3> tag_trigram(encodeTags(tag, last_tag, second_last_tag));

   // Scratch objects for the (tagged char, char) features. They stay static,
   // as in the original, so they are constructed only once; the price is that
   // concurrent calls would share them.
   static CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
   static CTwoTaggedWords wt12;

   // about the char categories
   // NOTE(review): the shift is evaluated in int; verify that tag.code()
   // always stays below the width of int.
   long int first_char_cat = m_weights->m_mapCharTagDictionary.lookup(first_char) | (1<<tag.code()) ;
   long int last_char_cat = m_weights->m_mapCharTagDictionary.lookup(last_char) | (1<<tag.code()) ;

   // Word/tag unigram, tag bigram and tag trigram.
   SCORE_TYPE nReturn = m_weights->m_mapCurrentTag.getScore( std::make_pair(word, tag) , m_nScoreIndex ) ;
   nReturn += m_weights->m_mapLastTagByTag.getScore( tag_bigram , m_nScoreIndex ) ;
   nReturn += m_weights->m_mapLastTwoTagsByTag.getScore( tag_trigram , m_nScoreIndex ) ;

   // Features that look at the previous word; only words of at most two
   // characters take part.
   if ( start > 0 ) {
      if ( last_length <= 2 )
         nReturn += m_weights->m_mapTagByLastWord.getScore( std::make_pair(last_word, tag) , m_nScoreIndex ) ;
      if ( length <= 2 )
         nReturn += m_weights->m_mapLastTagByWord.getScore( std::make_pair(word, last_tag) , m_nScoreIndex ) ;
      if ( length <= 2 )
         nReturn += m_weights->m_mapTagByWordAndPrevChar.getScore( std::make_pair(currentword_lastchar, tag) , m_nScoreIndex ) ;
      if ( last_length <= 2 )
         nReturn += m_weights->m_mapTagByWordAndNextChar.getScore( std::make_pair(lastword_firstchar, last_tag) , m_nScoreIndex ) ;
   }

   if ( length == 1 ) {
      if ( has_both_neighbours )
         nReturn += m_weights->m_mapTagOfOneCharWord.getScore( std::make_pair(three_char, tag) , m_nScoreIndex ) ;
   }
   else {
      nReturn += m_weights->m_mapTagByFirstChar.getScore( std::make_pair(first_char, tag) , m_nScoreIndex ) ;
      nReturn += m_weights->m_mapTagByLastChar.getScore( std::make_pair(last_char, tag) , m_nScoreIndex ) ;
      nReturn += m_weights->m_mapTagByFirstCharCat.getScore( std::make_pair(first_char_cat, tag) , m_nScoreIndex ) ;
      nReturn += m_weights->m_mapTagByLastCharCat.getScore( std::make_pair(last_char_cat, tag) , m_nScoreIndex ) ;

      // Per-character features. j is unsigned like length, and the character
      // is looked up once per iteration instead of once per use.
      for ( unsigned long j = 0 ; j < length ; ++ j ) {
         const CWord &current_char = m_WordCache.find( start+j , start+j , sentence ) ;
         const bool is_first = ( j == 0 ) ;
         const bool is_last = ( j + 1 == length ) ;

         // Characters strictly inside the word.
         if ( !is_first && !is_last )
            nReturn += m_weights->m_mapTagByChar.getScore( std::make_pair(current_char, tag) , m_nScoreIndex ) ;

         if ( !is_first ) {
            // Every non-initial character paired with the word's first character.
            wt1.load( current_char , tag ) ;
            wt2.load( first_char ) ;
            wt12.refer( &wt1 , &wt2 ) ;
            nReturn += m_weights->m_mapTaggedCharByFirstChar.getScore( wt12 , m_nScoreIndex ) ;
            // A character that repeats the one before it.
            if ( current_char == m_WordCache.find( start+j-1 , start+j-1 , sentence ) )
               nReturn += m_weights->m_mapRepeatedCharByTag.getScore( std::make_pair(current_char, tag) , m_nScoreIndex ) ;
         }

         if ( !is_last ) {
            // Every non-final character paired with the word's last character.
            wt1.load( current_char , tag ) ;
            wt2.load( last_char ) ;
            wt12.refer( &wt1 , &wt2 ) ;
            nReturn += m_weights->m_mapTaggedCharByLastChar.getScore( wt12 , m_nScoreIndex ) ;
         }
      }
   }

   return nReturn;
}
/**
 * Update the weights of the local features of one word of a tagged sentence.
 *
 * For the word at position `index`, every feature that getLocalScore() would
 * read is adjusted by +1 when `method` is eAdd and by -1 otherwise. The
 * feature templates here must stay in sync with getLocalScore().
 *
 * @param method    eAdd to reward the features, anything else to penalise them
 * @param sentence  sequence of (word, tag) string pairs
 * @param index     position of the word within `sentence`
 * @param round     passed through to updateCurrent() (presumably the training
 *                  round used for averaging -- confirm against the weight class)
 */
void CTagger :: updateLocalFeatureVector( SCORE_UPDATE method , const CTwoStringVector * sentence , unsigned long index , unsigned long round ) {
   // index + 1 < size() is used throughout instead of index < size() - 1 so
   // that the unsigned subtraction cannot wrap around.
   const bool has_next = index + 1 < sentence->size() ;

   // about words
   CWord word = sentence->at( index ).first ;
   CWord last_word = index > 0 ? sentence->at( index - 1 ).first : g_emptyWord ;
   // NOTE(review): next_word is not used by any feature below; kept in case
   // constructing the CWord has a side effect -- confirm before deleting.
   CWord next_word = has_next ? sentence->at( index + 1 ).first : g_emptyWord ;

   // Split the current and the previous word into characters.
   CStringVector chars , last_chars ;
   chars.clear() ;
   getCharactersFromUTF8String( sentence->at(index).first , &chars ) ;
   last_chars.clear() ;
   if ( index > 0 )
      getCharactersFromUTF8String( sentence->at( index - 1 ).first , &last_chars ) ;

   // about length
   int length = chars.size() ; //if ( length > LENGTH_MAX-1 ) length = LENGTH_MAX-1 ;
   int last_length = last_chars.size() ; //if ( last_length > LENGTH_MAX-1 ) last_length = LENGTH_MAX-1 ;

   // about chars
   // NOTE(review): chars[0] below is undefined if the word string yields no
   // characters; this function assumes every word is non-empty.
   CWord first_char = chars[ 0 ];
   CWord last_char = chars[ chars.size() - 1 ];
   CWord first_char_last_word = index > 0 ? last_chars[ 0 ] : g_emptyWord;
   CWord last_char_last_word = index > 0 ? last_chars[ last_chars.size() - 1 ] : g_emptyWord;
   CWord first_char_next_word = has_next ? getFirstCharFromUTF8String( sentence->at( index + 1 ).first ) : g_emptyWord ;
   // NOTE(review): the six values from last_twochar_last_word through
   // two_char are not used by any feature below; kept for the same reason
   // as next_word.
   CWord last_twochar_last_word = last_chars.size() > 1
      ? last_chars[ last_chars.size() - 2 ] + last_chars[ last_chars.size() - 1]
      : ( index > 1 ? getLastCharFromUTF8String(sentence->at(index-2).first) + last_chars[ 0 ] : g_emptyWord );
   CWord first_twochar = chars.size() > 1
      ? chars[ 0 ] + chars [ 1 ]
      : ( has_next ? chars[ 0 ] + getFirstCharFromUTF8String( sentence->at( index + 1 ).first ) : g_emptyWord );
   CWord currentword_lasttwochar = index > 1 ? last_twochar_last_word.str() + word.str() : g_emptyWord ;
   CWord lastword_firsttwochar = index > 0 && has_next ? last_word.str() + first_twochar.str() : g_emptyWord ;
   CWord two_char = index > 0 ? last_char_last_word.str() + first_char.str() : g_emptyWord ;

   // Strings that cross a word boundary and are used as features.
   CWord lastword_firstchar = index > 0 ? last_word.str() + first_char.str() : g_emptyWord ;
   CWord currentword_lastchar = index > 0 ? last_char_last_word.str() + word.str() : g_emptyWord ;
   CWord three_char = length == 1 ? last_char_last_word.str() + word.str() + first_char_next_word.str() : g_emptyWord ;

   // about tags
   const CTag tag( sentence->at(index).second ) ;
   const CTag last_tag = index > 0 ? CTag( sentence->at( index-1 ).second) : CTag::SENTENCE_BEGIN ;
   const CTag second_last_tag = index > 1 ? CTag( sentence->at( index-2 ).second) : CTag::SENTENCE_BEGIN ;
   const CTagSet<CTag, 2> tag_bigram(encodeTags(tag, last_tag));
   const CTagSet<CTag, 3> tag_trigram(encodeTags(tag, last_tag, second_last_tag));
   CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
   CTwoTaggedWords wt12;

   // about the char categories
   // NOTE(review): the shift is evaluated in int; verify that tag.code()
   // always stays below the width of int.
   long int first_char_cat = m_weights->m_mapCharTagDictionary.lookup(first_char) | (1<<tag.code()) ;
   long int last_char_cat = m_weights->m_mapCharTagDictionary.lookup(last_char) | (1<<tag.code()) ;

   SCORE_TYPE amount = method == eAdd ? 1 : -1 ;

   // Word/tag unigram, tag bigram and tag trigram.
   m_weights->m_mapCurrentTag[ std::make_pair(word, tag) ].updateCurrent( amount , round ) ;
   m_weights->m_mapLastTagByTag[ tag_bigram ].updateCurrent( amount , round ) ;
   m_weights->m_mapLastTwoTagsByTag[ tag_trigram ].updateCurrent( amount , round ) ;

   // Features that look at the previous word; only words of at most two
   // characters take part.
   if ( index > 0 ) {
      if ( last_length <= 2 )
         m_weights->m_mapTagByLastWord[ std::make_pair(last_word, tag) ].updateCurrent( amount , round ) ;
      if ( length <= 2 )
         m_weights->m_mapLastTagByWord[ std::make_pair(word, last_tag) ].updateCurrent( amount , round ) ;
      if ( length <= 2 )
         m_weights->m_mapTagByWordAndPrevChar[ std::make_pair(currentword_lastchar, tag) ].updateCurrent( amount , round ) ;
      if ( last_length <= 2 )
         m_weights->m_mapTagByWordAndNextChar[ std::make_pair(lastword_firstchar, last_tag) ].updateCurrent( amount , round ) ;
   }

   if ( length == 1 ) {
      if ( index > 0 && has_next )
         m_weights->m_mapTagOfOneCharWord[ std::make_pair(three_char, tag) ].updateCurrent( amount , round ) ;
   }
   else {
      m_weights->m_mapTagByFirstChar[ std::make_pair(first_char, tag) ].updateCurrent( amount , round ) ;
      m_weights->m_mapTagByLastChar[ std::make_pair(last_char, tag) ].updateCurrent( amount , round ) ;
      // Both character-category features are read by getLocalScore(), so both
      // are updated here.
      m_weights->m_mapTagByFirstCharCat[ std::make_pair(first_char_cat, tag) ].updateCurrent( amount , round ) ;
      m_weights->m_mapTagByLastCharCat[ std::make_pair(last_char_cat, tag) ].updateCurrent( amount , round ) ;

      // Per-character features. The index is unsigned like the size, and
      // j + 1 < num_chars replaces j < size() - 1 to avoid wrap-around.
      const unsigned long num_chars = chars.size() ;
      for ( unsigned long j = 0 ; j < num_chars ; ++ j ) {
         const bool is_first = ( j == 0 ) ;
         const bool is_last = ( j + 1 == num_chars ) ;

         // Characters strictly inside the word.
         if ( !is_first && !is_last )
            m_weights->m_mapTagByChar[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount , round ) ;

         if ( !is_first ) {
            // Every non-initial character paired with the word's first character.
            wt1.load(chars[j], tag);
            wt2.load(first_char);
            wt12.allocate(wt1, wt2);
            m_weights->m_mapTaggedCharByFirstChar[ wt12 ].updateCurrent( amount , round ) ;
            // A character that repeats the one before it.
            if ( chars[j] == chars[j-1] )
               m_weights->m_mapRepeatedCharByTag[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount , round ) ;
         }

         if ( !is_last ) {
            // Every non-final character paired with the word's last character.
            wt1.load(chars[j], tag);
            wt2.load(last_char);
            wt12.allocate(wt1, wt2);
            m_weights->m_mapTaggedCharByLastChar[ wt12 ].updateCurrent(amount, round);
         }
      }
   }
}