// Sums the weights of the features that describe the character at char_index
// being appended to the word at position `index` of `item`.
//
//   sentence      - not referenced directly in this body; presumably consumed
//                   by find_or_replace_word_cache (defined elsewhere) -- confirm
//   item          - supplies the start position and the tag of word `index`
//   index         - position of the word that is being extended
//   char_index    - position of the appended character; must be > 0 because
//                   the preceding character is looked up as well
//   amount, round - forwarded unchanged to every getOrUpdateScore call
//
// Returns the sum of the values returned by the getOrUpdateScore calls.
SCORE_TYPE CTagger::getOrUpdateAppendScore( const CStringVector *sentence, const CSubStateItem *item, unsigned long index, unsigned long char_index, SCORE_TYPE amount, unsigned long round ) {
   assert( char_index > 0 );

   const unsigned long word_start = item->getWordStart( index );

   // character n-grams around the appended character
   const CWord &appended_char  = find_or_replace_word_cache( char_index, char_index );
   const CWord &prev_and_char  = find_or_replace_word_cache( char_index-1, char_index );
   const CWord &word_head_char = find_or_replace_word_cache( word_start, word_start );
   // Only read by the disabled repeated-character feature below; the lookup
   // is kept because it goes through the word cache.
   const CWord &preceding_char = find_or_replace_word_cache( char_index-1, char_index-1 );

   // tag of the word being extended
   const CTag &word_tag = item->getTag( index );

   // scratch objects reused across calls
   static CTaggedWord<CTag, TAG_SEPARATOR> tagged_char, head_char;
   static CTwoTaggedWords tagged_char_and_head;
   static CTwoWords first_char_and_char; // only needed by the disabled feature at the end

   SCORE_TYPE total = 0;

   // appended character + tag
   total += m_weights->m_mapTagByChar.getOrUpdateScore( std::make_pair(appended_char, word_tag), m_nScoreIndex, amount, round );

   // tagged appended character + first character of the word
   tagged_char.load( appended_char, word_tag );
   head_char.load( word_head_char );
   refer_or_allocate( tagged_char_and_head, tagged_char, head_char );
   total += m_weights->m_mapTaggedCharByFirstChar.getOrUpdateScore( tagged_char_and_head, m_nScoreIndex, amount, round );

   // disabled:
   // if (appended_char == preceding_char)
   //    total += m_weights->m_mapRepeatedCharByTag.getOrUpdateScore( std::make_pair(appended_char, word_tag), m_nScoreIndex, amount, round );

   // character bigram, without and with the tag
   total += m_weights->m_mapConsecutiveChars.getOrUpdateScore( prev_and_char, m_nScoreIndex, amount, round );
   total += m_weights->m_mapTaggedConsecutiveChars.getOrUpdateScore( std::make_pair(prev_and_char, word_tag), m_nScoreIndex, amount, round );

   // disabled:
   // refer_or_allocate(first_char_and_char, word_head_char, appended_char);
   // total += m_weights->m_mapFirstCharAndChar.getOrUpdateScore( first_char_and_char, m_nScoreIndex, amount, round );

   return total;
}
// Sums the weights of the features built around the boundary in front of
// word `index` of `item`.
//
// Naming used below: suffix _0 is the word at `index`, _1 the word at
// index-1 and _2 the word at index-2. `index` may equal item->size(); in that
// case there is no word _0 (first_char_0 is g_emptyWord, tag_0 is g_beginTag)
// and only the features of the preceding words are scored. For index==0 there
// is no word _1 and the "last word" section is skipped.
//
//   sentence      - not referenced directly in this body; presumably consumed
//                   by find_or_replace_word_cache (defined elsewhere) -- confirm
//   item          - supplies word boundaries, lengths and tags
//   index         - position of the word that starts after the boundary
//   amount        - 0 makes the composite keys refer to the looked-up words,
//                   any other value makes them allocate their own copies;
//                   also forwarded to every getOrUpdateScore call
//   round         - forwarded to every getOrUpdateScore call
//
// Returns the sum of the values returned by the getOrUpdateScore calls.
SCORE_TYPE CTagger::getOrUpdateSeparateScore( const CStringVector *sentence, const CSubStateItem *item, unsigned long index, SCORE_TYPE amount, unsigned long round ) {
   // The total has to start from zero on every call. It used to be a
   // function-local static that was only assigned inside the index>0 branch,
   // so a call with index==0 added its features on top of whatever the
   // previous call had returned.
   SCORE_TYPE nReturn = 0;

   // about the words
   assert(amount!=0||index==item->size()-1||index==item->size());
   const unsigned long start_0 = index==item->size() ? 0 : item->getWordStart( index ) ;

   const unsigned long start_1 = index > 0 ? item->getWordStart( index-1 ) : 0 ;
   const unsigned long end_1 = index > 0 ? item->getWordEnd( index-1 ) : 0 ;
   assert(index==item->size()||index==0 || end_1 == start_0-1);
   unsigned long length_1 = index > 0 ? item->getWordLength( index-1 ) : 0;

   const unsigned long start_2 = index > 1 ? item->getWordStart( index-2 ) : 0 ;
   const unsigned long end_2 = index > 1 ? item->getWordEnd( index-2 ) : 0 ;
   assert(index<2 || end_2 == start_1-1);
   unsigned long length_2 = index > 1 ? item->getWordLength( index-2 ) : 0;

   const CWord &word_1 = index>0 ? find_or_replace_word_cache( start_1, end_1 ) : g_emptyWord;
   const CWord &word_2 = index>1 ? find_or_replace_word_cache( start_2, end_2 ) : g_emptyWord;

   // about the length: lengths are capped at LENGTH_MAX from here on
   if (length_1>LENGTH_MAX) length_1 = LENGTH_MAX;
   if (length_2>LENGTH_MAX) length_2 = LENGTH_MAX;

   // about the chars
   const CWord &first_char_0 = index<item->size() ? find_or_replace_word_cache( start_0, start_0 ) : g_emptyWord ;
   const CWord &first_char_1 = index>0 ? find_or_replace_word_cache( start_1, start_1 ) : g_emptyWord;
   const CWord &last_char_1 = index>0 ? find_or_replace_word_cache( end_1, end_1 ) : g_emptyWord;
   const CWord &last_char_2 = index>1 ? find_or_replace_word_cache( end_2, end_2 ) : g_emptyWord;
   const CWord &two_char = index>0&&index<item->size() ? find_or_replace_word_cache( end_1, start_0 ) : g_emptyWord ;
   const CWord &word_1_first_char_0 = index>0&&index<item->size() ? find_or_replace_word_cache( start_1, start_0 ) : g_emptyWord;
   const CWord &word_1_last_char_2 = index>1 ? find_or_replace_word_cache( end_2, end_1 ) : g_emptyWord;
   const CWord &three_char = ( length_1==1 && index>1 && index<item->size() ) ? find_or_replace_word_cache( end_2, start_0 ) : g_emptyWord;

   // two-word keys; kept static so that referred/allocated state persists
   // exactly as before
   static CTwoWords word_2_word_1, first_char_1_last_char_1, first_char_0_first_char_1, last_char_1_last_char_2 ;
   if (amount==0&&index>0) {
      word_2_word_1.refer( &word_1 , &word_2 ) ;
      first_char_1_last_char_1.refer( &first_char_1 , &last_char_1 ) ;
      first_char_0_first_char_1.refer( &first_char_0 , &first_char_1 ) ;
      last_char_1_last_char_2.refer( &last_char_1 , &last_char_2 ) ;
   }
   else {
      word_2_word_1.allocate( word_1, word_2 ) ;
      first_char_1_last_char_1.allocate( first_char_1, last_char_1 ) ;
      first_char_0_first_char_1.allocate( first_char_0, first_char_1 ) ;
      last_char_1_last_char_2.allocate( last_char_1, last_char_2 ) ;
   }

   // about the tags
   const CTag &tag_0 = index<item->size() ? item->getTag( index ) : g_beginTag;
   const CTag &tag_1 = index>0 ? item->getTag(index-1) : g_beginTag;
   const CTag &tag_2 = index>1 ? item->getTag(index-2) : g_beginTag;

   static CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
   static CTwoTaggedWords wt12;

   // dictionary lookup of the character OR-ed with the bit of the current tag
   unsigned long long first_char_cat_0 = m_weights->m_mapCharTagDictionary.lookup(first_char_0) | (static_cast<unsigned long long>(1)<<tag_0.code()) ;
   unsigned long long last_char_cat_1 = m_weights->m_mapCharTagDictionary.lookup(last_char_1) | (static_cast<unsigned long long>(1)<<tag_1.code()) ;

   static CTagSet<CTag, 2> tag_0_tag_1, tag_0_tag_2, tag_1_tag_2;
   static CTagSet<CTag, 3> tag_0_tag_1_tag_2;
   tag_0_tag_1.load( encodeTags(tag_0, tag_1) );
   tag_0_tag_2.load( encodeTags(tag_0, tag_2) );
   tag_1_tag_2.load( encodeTags(tag_1, tag_2) );
   tag_0_tag_1_tag_2.load( encodeTags(tag_0, tag_1, tag_2) );

   // adding scores with features for last word
   if (index>0) {
      nReturn += m_weights->m_mapSeenWords.getOrUpdateScore( word_1 , m_nScoreIndex , amount , round ) ;
      if (index>1) nReturn += m_weights->m_mapLastWordByWord.getOrUpdateScore( word_2_word_1 , m_nScoreIndex , amount , round ) ;

      if ( length_1 == 1 ) {
         nReturn += m_weights->m_mapOneCharWord.getOrUpdateScore( word_1 , m_nScoreIndex , amount , round ) ;
      }
      else {
         nReturn += m_weights->m_mapFirstAndLastChars.getOrUpdateScore( first_char_1_last_char_1 , m_nScoreIndex , amount , round ) ;
         nReturn += m_weights->m_mapLengthByFirstChar.getOrUpdateScore( std::make_pair(first_char_1, length_1) , m_nScoreIndex , amount , round ) ;
         nReturn += m_weights->m_mapLengthByLastChar.getOrUpdateScore( std::make_pair(last_char_1, length_1) , m_nScoreIndex , amount , round ) ;
         // disabled:
         // nReturn += m_weights->m_mapLengthByTagAndFirstChar.getOrUpdateScore( std::make_pair(first_char_1, (length_1<<CTag::SIZE)|tag_1.code()) , m_nScoreIndex , amount , round ) ;
         // nReturn += m_weights->m_mapLengthByTagAndLastChar.getOrUpdateScore( std::make_pair(last_char_1, (length_1<<CTag::SIZE)|tag_1.code()) , m_nScoreIndex , amount , round ) ;
      }

      if (index>1) {
         nReturn += m_weights->m_mapCurrentWordLastChar.getOrUpdateScore( word_1_last_char_2 , m_nScoreIndex , amount , round ) ;
         nReturn += m_weights->m_mapLastWordByLastChar.getOrUpdateScore( last_char_1_last_char_2 , m_nScoreIndex , amount , round ) ;
         nReturn += m_weights->m_mapLengthByLastWord.getOrUpdateScore( std::make_pair(word_2, length_1) , m_nScoreIndex , amount , round ) ;
         nReturn += m_weights->m_mapLastLengthByWord.getOrUpdateScore( std::make_pair(word_1, length_2), m_nScoreIndex , amount , round ) ;
      }

      nReturn += m_weights->m_mapCurrentTag.getOrUpdateScore( std::make_pair(word_1, tag_1) , m_nScoreIndex , amount , round ) ;

      if ( length_1 <= 2 ) nReturn += m_weights->m_mapLastTagByWord.getOrUpdateScore( std::make_pair(word_1, tag_2) , m_nScoreIndex , amount , round ) ;

      if (index>1) {
         if ( length_1 <= 2 ) nReturn += m_weights->m_mapTagByWordAndPrevChar.getOrUpdateScore( std::make_pair(word_1_last_char_2, tag_1) , m_nScoreIndex , amount , round ) ;
         if ( length_1 == 1 && index<item->size() ) nReturn += m_weights->m_mapTagOfOneCharWord.getOrUpdateScore( std::make_pair(three_char, tag_1) , m_nScoreIndex , amount , round ) ;
      }

      nReturn += m_weights->m_mapTagByLastChar.getOrUpdateScore( std::make_pair(last_char_1, tag_1) , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapTagByLastCharCat.getOrUpdateScore( std::make_pair(last_char_cat_1, tag_1) , m_nScoreIndex , amount , round ) ;

      // every character of word_1 except its last one, paired with the last
      // character. NOTE(review): length_1 has already been capped at
      // LENGTH_MAX here, so characters past the cap are not visited --
      // confirm that this is intended.
      for (unsigned long j=0; j<length_1-1; ++j) {
         wt1.load(find_or_replace_word_cache(start_1+j, start_1+j), tag_1);
         wt2.load(last_char_1);
         if (amount==0) { wt12.refer(&wt1, &wt2); } else { wt12.allocate(wt1, wt2); }
         nReturn += m_weights->m_mapTaggedCharByLastChar.getOrUpdateScore(wt12, m_nScoreIndex, amount, round) ;
      }
   }

   // all about the current word
   nReturn += m_weights->m_mapLastTagByTag.getOrUpdateScore( tag_0_tag_1, m_nScoreIndex , amount , round ) ;

   if (index>0) nReturn += m_weights->m_mapTag0Tag1Size1.getOrUpdateScore( std::make_pair( tag_0_tag_1, length_1 ), m_nScoreIndex , amount , round ) ;
   if (index>0) nReturn += m_weights->m_mapTag1Tag2Size1.getOrUpdateScore( std::make_pair( tag_1_tag_2, length_1 ), m_nScoreIndex , amount , round ) ;
   if (index>0) nReturn += m_weights->m_mapTag0Tag1Tag2Size1.getOrUpdateScore( std::make_pair( tag_0_tag_1_tag_2, length_1 ), m_nScoreIndex , amount , round ) ;

   // also fires for index==0, where length_1 is 0 and word_1 is g_emptyWord
   if ( length_1 <= 2 ) nReturn += m_weights->m_mapTagByLastWord.getOrUpdateScore( std::make_pair(word_1, tag_0) , m_nScoreIndex , amount , round ) ;

   if ( index > 0 ) {
      nReturn += m_weights->m_mapLastTwoTagsByTag.getOrUpdateScore( tag_0_tag_1_tag_2, m_nScoreIndex , amount , round ) ;
   }

   if (index<item->size()) {
      if ( index>0 ) {
         nReturn += m_weights->m_mapSeparateChars.getOrUpdateScore( two_char , m_nScoreIndex , amount , round ) ;

         nReturn += m_weights->m_mapLastWordFirstChar.getOrUpdateScore( word_1_first_char_0 , m_nScoreIndex , amount , round ) ;

         nReturn += m_weights->m_mapFirstCharLastWordByWord.getOrUpdateScore( first_char_0_first_char_1 , m_nScoreIndex , amount , round ) ;

         if ( length_1 <= 2 ) nReturn += m_weights->m_mapTagByWordAndNextChar.getOrUpdateScore( std::make_pair(word_1_first_char_0, tag_1) , m_nScoreIndex , amount , round ) ;

         // disabled:
         // nReturn += m_weights->m_mapSepCharAndNextChar.getOrUpdateScore( find_or_replace_word_cache(start_0, start_0==sentence->size()-1?start_0:start_0+1) , m_nScoreIndex , amount , round ) ;
      }

      nReturn += m_weights->m_mapTagByFirstChar.getOrUpdateScore( std::make_pair(first_char_0, tag_0) , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapTagByFirstCharCat.getOrUpdateScore( std::make_pair(first_char_cat_0, tag_0) , m_nScoreIndex , amount , round ) ;

      nReturn += m_weights->m_mapFirstCharBy2Tags.getOrUpdateScore( std::make_pair(first_char_0, tag_0_tag_1) , m_nScoreIndex , amount , round ) ;
      if (index>0) nReturn += m_weights->m_mapFirstCharBy3Tags.getOrUpdateScore( std::make_pair(first_char_0, tag_0_tag_1_tag_2) , m_nScoreIndex , amount , round ) ;

      nReturn += m_weights->m_mapTagByChar.getOrUpdateScore( std::make_pair(first_char_0, tag_0), m_nScoreIndex , amount , round ) ;

      if (index>0) {
         wt1.load(last_char_1, tag_1);
         wt2.load(first_char_0, tag_0);
         if (amount==0) { wt12.refer(&wt1, &wt2); } else { wt12.allocate(wt1, wt2); }
         nReturn += m_weights->m_mapTaggedSeparateChars.getOrUpdateScore( wt12, m_nScoreIndex , amount , round ) ;
      }

      if (index>0) nReturn += m_weights->m_mapTagWordTag.getOrUpdateScore( std::make_pair( word_1, tag_0_tag_2 ), m_nScoreIndex, amount, round);
      if (index>1) nReturn += m_weights->m_mapWordTagTag.getOrUpdateScore( std::make_pair( word_2, tag_0_tag_1 ), m_nScoreIndex, amount, round);
   }

   // The original also carried a commented-out experiment that scored
   // character unigram/bigram/trigram windows centred on end_1
   // (m_mapCharUnigram, m_mapCharBigram, m_mapCharTrigram and the matching
   // m_mapCharCat* maps, keyed with encodeCharInfoAndPosition). It was
   // disabled there and is not reproduced here.

   return nReturn;
}
// Sums the weights of the features that describe the character at char_index
// being appended to the word at position `index` of `item`.
//
//   sentence      - not referenced directly in this body; presumably consumed
//                   by find_or_replace_word_cache (defined elsewhere) -- confirm
//   item          - supplies the start position and the tag of word `index`
//   index         - position of the word that is being extended
//   char_index    - position of the appended character; must be > 0 because
//                   the preceding character is looked up as well
//   amount, round - forwarded unchanged to every getOrUpdateScore call
//
// Returns the sum of the values returned by the getOrUpdateScore calls.
//
// Compared with the previous revision of this function, values that only fed
// commented-out features are no longer computed on every call: the word
// length, prev_char_index, the tags of the two preceding words, the tag
// bigram/trigram sets and an unused loop counter.
SCORE_TYPE CTagger::getOrUpdateAppendScore( const CStringVector *sentence, const CSubStateItem *item, unsigned long index, unsigned long char_index, SCORE_TYPE amount, unsigned long round ) {
   assert(char_index>0);

   const unsigned long start = item->getWordStart( index ) ;

   // tag of the word being extended
   const CTag &tag_0 = item->getTag(index);

   // character n-grams around the appended character
   const CWord &char_unigram = find_or_replace_word_cache( char_index, char_index );
   const CWord &char_bigram = find_or_replace_word_cache( char_index-1, char_index );
   const CWord &first_char = find_or_replace_word_cache( start, start );
   // Not read by any active feature. The lookup is kept because it goes
   // through the word cache and may populate or refresh an entry there.
   // NOTE(review): drop it if the cache lookup is known to be side-effect free.
   const CWord &prev_char = find_or_replace_word_cache( char_index-1, char_index-1 );

   // scratch objects reused across calls
   static CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
   static CTwoTaggedWords wt12;

   // adding scores with features
   SCORE_TYPE nReturn = 0;

   nReturn += m_weights->m_mapTagByChar.getOrUpdateScore( std::make_pair(char_unigram, tag_0), m_nScoreIndex , amount , round ) ;

   wt1.load(char_unigram, tag_0);
   wt2.load(first_char);
   refer_or_allocate(wt12, wt1, wt2);
   nReturn += m_weights->m_mapTaggedCharByFirstChar.getOrUpdateScore( wt12, m_nScoreIndex, amount, round ) ;

   nReturn += m_weights->m_mapConsecutiveChars.getOrUpdateScore( char_bigram, m_nScoreIndex, amount, round ) ;

   nReturn += m_weights->m_mapTaggedConsecutiveChars.getOrUpdateScore( std::make_pair(char_bigram, tag_0), m_nScoreIndex, amount, round ) ;

   // Features that were already commented out in the original and are left
   // out here (re-enabling one requires recomputing its inputs):
   //   m_mapLastTagByTag        keyed by the (tag_0, tag_1) set
   //   m_mapLastTwoTagsByTag    keyed by the (tag_0, tag_1, tag_2) set, index>0 only
   //   m_mapRepeatedCharByTag   keyed by (char_unigram, tag_0) when char_unigram == prev_char
   //   m_mapAppCharAndNextChar  keyed by the appended character plus the next one
   //   m_mapFirstCharAndChar    keyed by (first_char, char_unigram)
   //   m_mapPartialWord         keyed by the word from start to char_index
   //   m_mapCharUnigram / m_mapCharBigram / m_mapCharTrigram and the matching
   //   m_mapCharCat* maps over a window centred on char_index-1

   return nReturn;
}
// Sums getScore() lookups for the tag-related features of the word at
// position `index` of `item`.
//
//   sentence - the character sequence; passed to every m_WordCache.find call
//              and used for the sentence-boundary checks
//   item     - supplies word boundaries, lengths and tags
//   index    - position of the word to score
//
// Returns the sum of the values returned by the getScore calls.
//
// Changes against the previous revision:
//   * the declarations of currentword_lastchar and currentword_lasttwochar
//     had lost their "&curr" prefix to a character-encoding error and are
//     restored;
//   * the character category keys are built with a 64-bit shift, as
//     getOrUpdateLocalScore does for the same maps, instead of `1<<code`
//     on a plain int;
//   * the per-character loop uses the word length read once at the top
//     instead of calling item->getWordLength(index) on every test.
SCORE_TYPE CTagger::getLocalScore( const CStringVector * sentence, CStateItem * item , unsigned long index ) {
   // about the words
   const unsigned long int start = item->getWordStart( index ) ;
   const unsigned long int end = item->getWordEnd( index ) ;
   const unsigned long int length = item->getWordLength( index ) ;
   // large placeholder values when there is no previous word
   const unsigned long int last_start = index > 0 ? item->getWordStart( index-1 ) : 999999 ;
   const unsigned long int last_length = index > 0 ? item->getWordLength( index-1 ) : 99999 ;
   const CWord &word = m_WordCache.find( start , end , sentence ) ;
   const CWord &last_word = index > 0 ? m_WordCache.find( last_start , start-1 , sentence ) : g_emptyWord ;

   // about the chars
   const CWord &first_char = m_WordCache.find( start , start , sentence ) ;
   const CWord &last_char = m_WordCache.find( end , end , sentence ) ;
   const CWord &lastword_firstchar = index > 0 ? m_WordCache.find( last_start , start , sentence ) : g_emptyWord ;
   const CWord &currentword_lastchar = index > 0 ? m_WordCache.find( start-1 , end , sentence) : g_emptyWord ;
   const CWord &three_char = length == 1 && start > 0 && end < sentence->size()-1 ? m_WordCache.find( start-1 , end+1 , sentence ) : g_emptyWord ;

   // The following values are not read by any feature below. They are kept
   // because each lookup goes through the word cache.
   // NOTE(review): remove them if m_WordCache.find is known to be a pure
   // memoising lookup.
   const CWord &first_char_last_word = index > 0 ? m_WordCache.find( last_start , last_start , sentence ) : g_emptyWord ;
   const CWord &last_char_last_word = index > 0 ? m_WordCache.find( start-1 , start-1 , sentence) : g_emptyWord;
   const CWord &first_char_next_word = end+1 < sentence->size() ? m_WordCache.find( end+1 , end+1 , sentence) : g_emptyWord ;
   const CWord &first_twochar = start+1 < sentence->size() ? m_WordCache.find( start , start+1 , sentence ) : g_emptyWord ;
   const CWord &last_twochar_last_word = start>1 ? m_WordCache.find( start-2 , start-1 , sentence ) : g_emptyWord ;
   const CWord &two_char = index > 0 ? m_WordCache.find( start-1 , start, sentence) : g_emptyWord;
   const CWord &currentword_lasttwochar = start > 1 ? m_WordCache.find( start-2 , end , sentence ) : g_emptyWord ;
   const CWord &lastword_firsttwochar = index > 0 && start+1 < sentence->size() ? m_WordCache.find( last_start , start+1 , sentence ) : g_emptyWord ;
   CTwoWords two_word;

   // about the tags
   const CTag tag = item->getTag(index);
   const CTag last_tag = index>0 ? item->getTag(index-1) : CTag::SENTENCE_BEGIN;
   const CTag second_last_tag = index>1 ? item->getTag(index-2) : CTag::SENTENCE_BEGIN;
   const CTagSet<CTag, 2> tag_bigram(encodeTags(tag, last_tag));
   const CTagSet<CTag, 3> tag_trigram(encodeTags(tag, last_tag, second_last_tag));

   static CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
   static CTwoTaggedWords wt12;

   // dictionary lookup of the character OR-ed with the bit of the current
   // tag; the shift is done in 64 bits so that tag codes of 31 and above do
   // not overflow an int
   unsigned long long first_char_cat = m_weights->m_mapCharTagDictionary.lookup(first_char) | (static_cast<unsigned long long>(1)<<tag.code()) ;
   unsigned long long last_char_cat = m_weights->m_mapCharTagDictionary.lookup(last_char) | (static_cast<unsigned long long>(1)<<tag.code()) ;

   SCORE_TYPE nReturn = m_weights->m_mapCurrentTag.getScore( std::make_pair(word, tag) , m_nScoreIndex ) ;
   nReturn += m_weights->m_mapLastTagByTag.getScore( tag_bigram , m_nScoreIndex ) ;
   nReturn += m_weights->m_mapLastTwoTagsByTag.getScore( tag_trigram , m_nScoreIndex ) ;

   if ( start > 0 ) {
      if ( last_length <= 2 ) nReturn += m_weights->m_mapTagByLastWord.getScore( std::make_pair(last_word, tag) , m_nScoreIndex ) ;
      if ( length <= 2 ) nReturn += m_weights->m_mapLastTagByWord.getScore( std::make_pair(word, last_tag) , m_nScoreIndex ) ;
      if ( length <= 2 ) nReturn += m_weights->m_mapTagByWordAndPrevChar.getScore( std::make_pair(currentword_lastchar, tag) , m_nScoreIndex ) ;
      if ( last_length <= 2 ) nReturn += m_weights->m_mapTagByWordAndNextChar.getScore( std::make_pair(lastword_firstchar, last_tag) , m_nScoreIndex) ;
   }

   if ( length == 1 ) {
      if ( start > 0 && end < sentence->size()-1 ) nReturn += m_weights->m_mapTagOfOneCharWord.getScore( std::make_pair(three_char, tag) , m_nScoreIndex ) ;
   }
   else {
      nReturn += m_weights->m_mapTagByFirstChar.getScore( std::make_pair(first_char, tag) , m_nScoreIndex ) ;
      nReturn += m_weights->m_mapTagByLastChar.getScore( std::make_pair(last_char, tag) , m_nScoreIndex ) ;
      nReturn += m_weights->m_mapTagByFirstCharCat.getScore( std::make_pair(first_char_cat, tag) , m_nScoreIndex ) ;
      nReturn += m_weights->m_mapTagByLastCharCat.getScore( std::make_pair(last_char_cat, tag) , m_nScoreIndex ) ;

      for ( unsigned long j = 0 ; j < length ; ++j ) {

         // inner characters only
         if ( j > 0 && j < length-1 )
            nReturn += m_weights->m_mapTagByChar.getScore( std::make_pair(m_WordCache.find(start+j, start+j, sentence), tag) , m_nScoreIndex );

         // every character but the first, paired with the first character
         if ( j > 0 ) {
            wt1.load( m_WordCache.find(start+j, start+j, sentence), tag );
            wt2.load( first_char );
            wt12.refer(&wt1, &wt2);
            nReturn += m_weights->m_mapTaggedCharByFirstChar.getScore( wt12, m_nScoreIndex );

            if ( m_WordCache.find(start+j, start+j, sentence) == m_WordCache.find(start+j-1, start+j-1, sentence))
               nReturn += m_weights->m_mapRepeatedCharByTag.getScore( std::make_pair(m_WordCache.find(start+j, start+j, sentence), tag) , m_nScoreIndex );
         }

         // every character but the last, paired with the last character
         if ( j < length-1 ) {
            wt1.load( m_WordCache.find(start+j, start+j, sentence), tag );
            wt2.load( last_char );
            wt12.refer(&wt1, &wt2);
            nReturn += m_weights->m_mapTaggedCharByLastChar.getScore( wt12 , m_nScoreIndex );
         }
      }
   }

   return nReturn;
}
// Sums the weights of all word-level and tag-level features of the word at
// position `index` of `item`.
//
//   sentence - the character sequence; passed to every word-cache call and
//              used for the sentence-boundary checks
//   item     - supplies word boundaries, lengths and tags
//   index    - position of the word to score
//   amount   - 0 selects m_WordCache.find and makes composite keys refer to
//              the looked-up words; any other value selects
//              m_WordCache.replace and makes composite keys allocate their
//              own copies. Also forwarded to every getOrUpdateScore call.
//   round    - forwarded to every getOrUpdateScore call
//
// Returns the sum of the values returned by the getOrUpdateScore calls.
//
// Change against the previous revision: the declaration of
// currentword_lastchar had lost its "&curr" prefix to a character-encoding
// error and is restored. Plain counters and positions are ordinary locals
// now instead of function-local statics; the feature set and the order of
// evaluation are unchanged.
SCORE_TYPE CTagger::getOrUpdateLocalScore( const CStringVector *sentence, const CStateItem *item, unsigned long index, SCORE_TYPE amount, unsigned long round ) {
   // about the words
   const unsigned long start = item->getWordStart( index ) ;
   const unsigned long end = item->getWordEnd( index ) ;
   // word_length is the un-normalised version; use word_length instead of
   // item->getWordLength() because the length can include " ".
   const unsigned long word_length = item->getWordLength( index ) ;
   unsigned long length = word_length ;
   const unsigned long last_start = index > 0 ? item->getWordStart( index-1 ) : 0 ;
   unsigned long last_length = index > 0 ? item->getWordLength( index-1 ) : 0 ;

   const CWord &word = amount==0 ? m_WordCache.find( start , end , sentence )
                                 : m_WordCache.replace( start , end , sentence ) ;
   const CWord &last_word = index > 0 ? ( amount==0 ? m_WordCache.find( last_start , start-1 , sentence )
                                                    : m_WordCache.replace( last_start , start-1 , sentence ) )
                                      : g_emptyWord ;

   // about the length: the lengths used as feature values are capped
   if( length > LENGTH_MAX-1 ) length = LENGTH_MAX-1 ;
   if( last_length > LENGTH_MAX-1 ) last_length = LENGTH_MAX-1 ;

   // about the chars
   const CWord &first_char = amount==0 ? m_WordCache.find( start , start , sentence )
                                       : m_WordCache.replace( start , start , sentence ) ;
   const CWord &last_char = amount==0 ? m_WordCache.find( end , end , sentence )
                                      : m_WordCache.replace( end , end , sentence ) ;
   const CWord &first_char_last_word = index > 0 ? ( amount==0 ? m_WordCache.find( last_start , last_start , sentence )
                                                               : m_WordCache.replace( last_start , last_start , sentence ) )
                                                 : g_emptyWord ;
   const CWord &last_char_last_word = index > 0 ? ( amount==0 ? m_WordCache.find( start-1 , start-1 , sentence)
                                                              : m_WordCache.replace( start-1 , start-1 , sentence) )
                                                : g_emptyWord ;
   const CWord &two_char = index > 0 ? ( amount == 0 ? m_WordCache.find( start-1 , start, sentence)
                                                     : m_WordCache.replace( start-1 , start, sentence) )
                                     : g_emptyWord ;
   const CWord &lastword_firstchar = index > 0 ? ( amount==0 ? m_WordCache.find( last_start , start , sentence )
                                                             : m_WordCache.replace( last_start , start , sentence ) )
                                               : g_emptyWord ;
   const CWord &currentword_lastchar = index > 0 ? ( amount==0 ? m_WordCache.find( start-1 , end , sentence)
                                                               : m_WordCache.replace( start-1 , end , sentence) )
                                                 : g_emptyWord ;
   const CWord &three_char = ( length == 1 && start > 0 && end < sentence->size()-1 )
                                ? ( amount==0 ? m_WordCache.find( start-1 , end+1 , sentence )
                                              : m_WordCache.replace( start-1 , end+1 , sentence ) )
                                : g_emptyWord ;

   // two-word keys; kept static so that referred/allocated state persists
   // exactly as before
   static CTwoWords two_word , first_and_last_char , firstchars_twoword , lastchars_twoword ;
   if (amount==0) {
      two_word.refer( &word , &last_word ) ;
      first_and_last_char.refer( &first_char , &last_char ) ;
      firstchars_twoword.refer( &first_char_last_word , &first_char ) ;
      lastchars_twoword.refer( &last_char_last_word , &last_char ) ;
   }
   else {
      two_word.allocate( word, last_word ) ;
      first_and_last_char.allocate( first_char, last_char ) ;
      firstchars_twoword.allocate( first_char_last_word, first_char ) ;
      lastchars_twoword.allocate( last_char_last_word, last_char ) ;
   }

   // about the tags
   const CTag &tag = item->getTag( index ) ;
   const CTag &last_tag = index>0 ? item->getTag( index-1 ) : CTag(CTag::SENTENCE_BEGIN) ;
   const CTag &second_last_tag = index>1 ? item->getTag(index-2) : CTag(CTag::SENTENCE_BEGIN) ;

   static CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
   static CTwoTaggedWords wt12;

   // dictionary lookup of the character OR-ed with the bit of the current tag
   unsigned long long first_char_cat = m_weights->m_mapCharTagDictionary.lookup(first_char) | (static_cast<unsigned long long>(1)<<tag.code()) ;
   unsigned long long last_char_cat = m_weights->m_mapCharTagDictionary.lookup(last_char) | (static_cast<unsigned long long>(1)<<tag.code()) ;

   // adding scores with features
   SCORE_TYPE nReturn = m_weights->m_mapSeenWords.getOrUpdateScore( word , m_nScoreIndex , amount , round ) ;
   nReturn += m_weights->m_mapLastWordByWord.getOrUpdateScore( two_word , m_nScoreIndex , amount , round ) ;

   if ( length == 1 ) {
      nReturn += m_weights->m_mapOneCharWord.getOrUpdateScore( word , m_nScoreIndex , amount , round ) ;
   }
   else {
      nReturn += m_weights->m_mapFirstAndLastChars.getOrUpdateScore( first_and_last_char , m_nScoreIndex , amount , round ) ;

      nReturn += m_weights->m_mapLengthByFirstChar.getOrUpdateScore( std::make_pair(first_char, length) , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapLengthByLastChar.getOrUpdateScore( std::make_pair(last_char, length) , m_nScoreIndex , amount , round ) ;

      // every pair of adjacent characters inside the word
      for (unsigned long j=0; j<word_length-1; ++j)
         nReturn += m_weights->m_mapConsecutiveChars.getOrUpdateScore(
                       amount==0 ? m_WordCache.find(start+j, start+j+1, sentence)
                                 : m_WordCache.replace(start+j, start+j+1, sentence) ,
                       m_nScoreIndex, amount, round ) ;
   }

   if ( start > 0 ) {
      nReturn += m_weights->m_mapSeparateChars.getOrUpdateScore( two_char , m_nScoreIndex , amount , round ) ;

      nReturn += m_weights->m_mapCurrentWordLastChar.getOrUpdateScore( currentword_lastchar , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapLastWordFirstChar.getOrUpdateScore( lastword_firstchar , m_nScoreIndex , amount , round ) ;

      nReturn += m_weights->m_mapFirstCharLastWordByWord.getOrUpdateScore( firstchars_twoword , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapLastWordByLastChar.getOrUpdateScore( lastchars_twoword , m_nScoreIndex , amount , round ) ;

      nReturn += m_weights->m_mapLengthByLastWord.getOrUpdateScore( std::make_pair(last_word, length) , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapLastLengthByWord.getOrUpdateScore( std::make_pair(word, last_length), m_nScoreIndex , amount , round ) ;
   }

   nReturn += m_weights->m_mapCurrentTag.getOrUpdateScore( std::make_pair(word, tag) , m_nScoreIndex , amount , round ) ;
   nReturn += m_weights->m_mapLastTagByTag.getOrUpdateScore( CTagSet<CTag, 2>(encodeTags( tag, last_tag )), m_nScoreIndex , amount , round ) ;
   nReturn += m_weights->m_mapLastTwoTagsByTag.getOrUpdateScore( CTagSet<CTag, 3>(encodeTags( tag, last_tag, second_last_tag )), m_nScoreIndex , amount , round ) ;

   if ( start > 0 ) {
      if ( last_length <= 2 ) nReturn += m_weights->m_mapTagByLastWord.getOrUpdateScore( std::make_pair(last_word, tag) , m_nScoreIndex , amount , round ) ;
      if ( length <= 2 ) nReturn += m_weights->m_mapLastTagByWord.getOrUpdateScore( std::make_pair(word, last_tag) , m_nScoreIndex , amount , round ) ;
      if ( length <= 2 ) nReturn += m_weights->m_mapTagByWordAndPrevChar.getOrUpdateScore( std::make_pair(currentword_lastchar, tag) , m_nScoreIndex , amount , round ) ;
      if ( last_length <= 2 ) nReturn += m_weights->m_mapTagByWordAndNextChar.getOrUpdateScore( std::make_pair(lastword_firstchar, last_tag) , m_nScoreIndex , amount , round ) ;
   }

   if ( length == 1 ) {
      if ( start > 0 && end < sentence->size()-1 ) nReturn += m_weights->m_mapTagOfOneCharWord.getOrUpdateScore( std::make_pair(three_char, tag) , m_nScoreIndex , amount , round ) ;
   }
   else {
      nReturn += m_weights->m_mapTagByFirstChar.getOrUpdateScore( std::make_pair(first_char, tag) , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapTagByLastChar.getOrUpdateScore( std::make_pair(last_char, tag) , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapTagByFirstCharCat.getOrUpdateScore( std::make_pair(first_char_cat, tag) , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapTagByLastCharCat.getOrUpdateScore( std::make_pair(last_char_cat, tag) , m_nScoreIndex , amount , round ) ;

      for ( unsigned long j = 0 ; j < word_length ; ++j ) {

         // inner characters only
         if ( j > 0 && j < word_length-1 )
            nReturn += m_weights->m_mapTagByChar.getOrUpdateScore(
                          std::make_pair( amount==0 ? m_WordCache.find(start+j, start+j, sentence)
                                                    : m_WordCache.replace(start+j, start+j, sentence), tag),
                          m_nScoreIndex , amount , round ) ;

         // every character but the first, paired with the first character
         if ( j > 0 ) {
            if (amount==0) {
               wt1.load( m_WordCache.find(start+j, start+j, sentence) , tag );
               wt2.load(first_char);
               wt12.refer(&wt1, &wt2);
            }
            else {
               wt1.load( m_WordCache.replace(start+j, start+j, sentence) , tag );
               wt2.load(first_char);
               wt12.allocate(wt1, wt2);
            }
            nReturn += m_weights->m_mapTaggedCharByFirstChar.getOrUpdateScore(wt12, m_nScoreIndex, amount, round) ;

            // the comparison itself always uses find(), as before
            if ( m_WordCache.find(start+j, start+j, sentence) == m_WordCache.find(start+j-1, start+j-1, sentence))
               nReturn += m_weights->m_mapRepeatedCharByTag.getOrUpdateScore(
                             std::make_pair( amount==0 ? m_WordCache.find(start+j, start+j, sentence)
                                                       : m_WordCache.replace(start+j, start+j, sentence), tag),
                             m_nScoreIndex, amount, round) ;
         }

         // every character but the last, paired with the last character
         if ( j < word_length-1 ) {
            if (amount==0) {
               wt1.load( m_WordCache.find(start+j, start+j, sentence) , tag );
               wt2.load(last_char);
               wt12.refer(&wt1, &wt2);
            }
            else {
               wt1.load( m_WordCache.replace(start+j, start+j, sentence) , tag );
               wt2.load(last_char);
               wt12.allocate(wt1, wt2);
            }
            nReturn += m_weights->m_mapTaggedCharByLastChar.getOrUpdateScore(wt12, m_nScoreIndex, amount, round) ;
         }
      }
   }

   return nReturn;
}
// getOrUpdateSeparateScore
//
// Sums the values returned by getOrUpdateScore() on the m_weights feature maps
// that involve the boundary in front of word `index`: features of the word
// that has just been completed (index-1), its predecessor (index-2), and the
// first character / tag of the word starting at `index`.
//
// Parameters:
//    sentence - not referenced directly in this body; presumably consumed by
//               find_or_replace_word_cache (TODO confirm against its definition).
//    item     - the state item providing word starts/ends/lengths and tags.
//    index    - index of the word being started; may be 0 (no previous word)
//               or item->size() (no current word, only previous-word features).
//    amount   - forwarded to every getOrUpdateScore() call; amount==0 takes the
//               `refer` path for the compound keys, otherwise `allocate`.
//    round    - forwarded unchanged to every getOrUpdateScore() call.
//
// Returns the accumulated sum of all getOrUpdateScore() results for this call.
SCORE_TYPE CTagger::getOrUpdateSeparateScore( const CStringVector *sentence, const CSubStateItem *item, unsigned long index, SCORE_TYPE amount, unsigned long round ) {
   static SCORE_TYPE nReturn ;
   static unsigned long start_0;
   static unsigned long start_1, end_1, length_1;
   static unsigned long start_2, end_2, length_2;

   // about the words
   assert(amount!=0||index==item->size()-1||index==item->size());
   start_0 = index==item->size() ? 0 : item->getWordStart( index ) ;

   start_1 = index > 0 ? item->getWordStart( index-1 ) : 0 ;
   end_1 = index > 0 ? item->getWordEnd( index-1 ) : 0 ;
   assert(index==item->size()||index==0 || end_1 == start_0-1);
   length_1 = index > 0 ? item->getWordLength( index-1 ) : 0;

   start_2 = index > 1 ? item->getWordStart( index-2 ) : 0 ;
   end_2 = index > 1 ? item->getWordEnd( index-2 ) : 0 ;
   assert(index<2 || end_2 == start_1-1);
   length_2 = index > 1 ? item->getWordLength( index-2 ) : 0;

   const CWord &word_1 = index>0 ? find_or_replace_word_cache( start_1, end_1 ) : g_emptyWord;
   const CWord &word_2 = index>1 ? find_or_replace_word_cache( start_2, end_2 ) : g_emptyWord;

   // about the length: clamp both lengths to at most LENGTH_MAX-1
   if( length_1 > LENGTH_MAX-1 ) length_1 = LENGTH_MAX-1 ;
   if( length_2 > LENGTH_MAX-1 ) length_2 = LENGTH_MAX-1 ;

   // about the chars
   const CWord &first_char_0 = index<item->size() ? find_or_replace_word_cache( start_0, start_0 ) : g_emptyWord ;
   const CWord &first_char_1 = index>0 ? find_or_replace_word_cache( start_1, start_1 ) : g_emptyWord;

   const CWord &last_char_1 = index>0 ? find_or_replace_word_cache( end_1, end_1 ) : g_emptyWord;
   const CWord &last_char_2 = index>1 ? find_or_replace_word_cache( end_2, end_2 ) : g_emptyWord;
   const CWord &two_char = index>0&&index<item->size() ? find_or_replace_word_cache( end_1, start_0 ) : g_emptyWord ;
   const CWord &word_1_first_char_0 = index>0&&index<item->size() ? find_or_replace_word_cache( start_1, start_0 ) : g_emptyWord;
   const CWord &word_1_last_char_2 = index>1 ? find_or_replace_word_cache( end_2, end_1 ) : g_emptyWord;
   const CWord &three_char = ( length_1==1 && index>1 && index<item->size() ) ? find_or_replace_word_cache( end_2, start_0 ) : g_emptyWord;

   // compound word keys: referenced when only reading a previous-word score
   // (amount==0 and index>0), otherwise allocated
   static CTwoWords word_2_word_1, first_char_1_last_char_1, first_char_0_first_char_1, last_char_1_last_char_2 ;
   if (amount==0&&index>0) {
      word_2_word_1.refer( &word_1 , &word_2 ) ;
      first_char_1_last_char_1.refer( &first_char_1 , &last_char_1 ) ;
      first_char_0_first_char_1.refer( &first_char_0 , &first_char_1 ) ;
      last_char_1_last_char_2.refer( &last_char_1 , &last_char_2 ) ;
   }
   else {
      word_2_word_1.allocate( word_1, word_2 ) ;
      first_char_1_last_char_1.allocate( first_char_1, last_char_1 ) ;
      first_char_0_first_char_1.allocate( first_char_0, first_char_1 ) ;
      last_char_1_last_char_2.allocate( last_char_1, last_char_2 ) ;
   }

   // about the tags; g_beginTag stands in for positions that have no word
   const CTag &tag_0 = index<item->size() ? item->getTag( index ) : g_beginTag;
   const CTag &tag_1 = index>0 ? item->getTag(index-1) : g_beginTag;
   const CTag &tag_2 = index>1 ? item->getTag(index-2) : g_beginTag;

   static CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
   static CTwoTaggedWords wt12;

   // dictionary lookup result for the character, OR-ed with the bit of the actual tag
   unsigned long long first_char_cat_0 = m_weights->m_mapCharTagDictionary.lookup(first_char_0) | (static_cast<unsigned long long>(1)<<tag_0.code()) ;
   unsigned long long last_char_cat_1 = m_weights->m_mapCharTagDictionary.lookup(last_char_1) | (static_cast<unsigned long long>(1)<<tag_1.code()) ;

   static CTagSet<CTag, 2> tag_0_tag_1, tag_0_tag_2;
   static CTagSet<CTag, 3> tag_0_tag_1_tag_2;
   tag_0_tag_1.load( encodeTags(tag_0, tag_1) );
   tag_0_tag_2.load( encodeTags(tag_0, tag_2) ); // only used by the disabled features at the end
   tag_0_tag_1_tag_2.load( encodeTags(tag_0, tag_1, tag_2) );

   // nReturn is static, so it must be reset on every call: when index==0 the
   // previous-word block below is skipped and the first feature is added with +=,
   // which would otherwise accumulate onto the value left by the previous call.
   nReturn = 0;

   // adding scores with features for last word
   if (index>0) {
      nReturn += m_weights->m_mapSeenWords.getOrUpdateScore( word_1 , m_nScoreIndex , amount , round ) ;
      if (index>1) nReturn += m_weights->m_mapLastWordByWord.getOrUpdateScore( word_2_word_1 , m_nScoreIndex , amount , round ) ;

      if ( length_1 == 1 ) {
         nReturn += m_weights->m_mapOneCharWord.getOrUpdateScore( word_1 , m_nScoreIndex , amount , round ) ;
      }
      else {
         nReturn += m_weights->m_mapFirstAndLastChars.getOrUpdateScore( first_char_1_last_char_1 , m_nScoreIndex , amount , round ) ;

         nReturn += m_weights->m_mapLengthByFirstChar.getOrUpdateScore( std::make_pair(first_char_1, length_1) , m_nScoreIndex , amount , round ) ;
         nReturn += m_weights->m_mapLengthByLastChar.getOrUpdateScore( std::make_pair(last_char_1, length_1) , m_nScoreIndex , amount , round ) ;
      }

      if (index>1) {
         nReturn += m_weights->m_mapCurrentWordLastChar.getOrUpdateScore( word_1_last_char_2 , m_nScoreIndex , amount , round ) ;
         nReturn += m_weights->m_mapLastWordByLastChar.getOrUpdateScore( last_char_1_last_char_2 , m_nScoreIndex , amount , round ) ;

         nReturn += m_weights->m_mapLengthByLastWord.getOrUpdateScore( std::make_pair(word_2, length_1) , m_nScoreIndex , amount , round ) ;
         nReturn += m_weights->m_mapLastLengthByWord.getOrUpdateScore( std::make_pair(word_1, length_2), m_nScoreIndex , amount , round ) ;
      }

      nReturn += m_weights->m_mapCurrentTag.getOrUpdateScore( std::make_pair(word_1, tag_1) , m_nScoreIndex , amount , round ) ;

      if ( length_1 <= 2 ) nReturn += m_weights->m_mapLastTagByWord.getOrUpdateScore( std::make_pair(word_1, tag_2) , m_nScoreIndex , amount , round ) ;

      if (index>1) {
         if ( length_1 <= 2 ) nReturn += m_weights->m_mapTagByWordAndPrevChar.getOrUpdateScore( std::make_pair(word_1_last_char_2, tag_1) , m_nScoreIndex , amount , round ) ;
         if ( length_1 == 1 && index<item->size() ) nReturn += m_weights->m_mapTagOfOneCharWord.getOrUpdateScore( std::make_pair(three_char, tag_1) , m_nScoreIndex , amount , round ) ;
      }

      nReturn += m_weights->m_mapTagByLastChar.getOrUpdateScore( std::make_pair(last_char_1, tag_1) , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapTagByLastCharCat.getOrUpdateScore( std::make_pair(last_char_cat_1, tag_1) , m_nScoreIndex , amount , round ) ;

      // Pair every character of the previous word except the last one with the
      // last character. length_1 was clamped above, so at most LENGTH_MAX-2
      // characters are visited. `j+1<length_1` is the same bound as
      // `j<length_1-1` for length_1>=1 but cannot wrap around if length_1 is 0.
      for (unsigned long j=0; j+1<length_1; ++j) {
         wt1.load(find_or_replace_word_cache(start_1+j, start_1+j), tag_1);
         wt2.load(last_char_1);
         if (amount==0) { wt12.refer(&wt1, &wt2); } else { wt12.allocate(wt1, wt2); }
         nReturn += m_weights->m_mapTaggedCharByLastChar.getOrUpdateScore(wt12, m_nScoreIndex, amount, round) ;
      }
   }

   // all about the current word
   nReturn += m_weights->m_mapLastTagByTag.getOrUpdateScore( tag_0_tag_1, m_nScoreIndex , amount , round ) ;

   // NOTE: also fires for index==0, where length_1 is 0 and word_1 is g_emptyWord
   if ( length_1 <= 2 ) nReturn += m_weights->m_mapTagByLastWord.getOrUpdateScore( std::make_pair(word_1, tag_0) , m_nScoreIndex , amount , round ) ;

   if ( index > 0 ) {
      nReturn += m_weights->m_mapLastTwoTagsByTag.getOrUpdateScore( tag_0_tag_1_tag_2, m_nScoreIndex , amount , round ) ;
   }

   if (index<item->size()) {
      if ( index>0 ) {
         nReturn += m_weights->m_mapSeparateChars.getOrUpdateScore( two_char , m_nScoreIndex , amount , round ) ;

         nReturn += m_weights->m_mapLastWordFirstChar.getOrUpdateScore( word_1_first_char_0 , m_nScoreIndex , amount , round ) ;

         nReturn += m_weights->m_mapFirstCharLastWordByWord.getOrUpdateScore( first_char_0_first_char_1 , m_nScoreIndex , amount , round ) ;

         if ( length_1 <= 2 ) nReturn += m_weights->m_mapTagByWordAndNextChar.getOrUpdateScore( std::make_pair(word_1_first_char_0, tag_1) , m_nScoreIndex , amount , round ) ;
      }

      nReturn += m_weights->m_mapTagByFirstChar.getOrUpdateScore( std::make_pair(first_char_0, tag_0) , m_nScoreIndex , amount , round ) ;
      nReturn += m_weights->m_mapTagByFirstCharCat.getOrUpdateScore( std::make_pair(first_char_cat_0, tag_0) , m_nScoreIndex , amount , round ) ;

      nReturn += m_weights->m_mapTagByChar.getOrUpdateScore( std::make_pair(first_char_0, tag_0), m_nScoreIndex , amount , round ) ;

      if (index>0) {
         wt1.load(last_char_1, tag_1);
         wt2.load(first_char_0, tag_0);
         if (amount==0) { wt12.refer(&wt1, &wt2); } else { wt12.allocate(wt1, wt2); }
         nReturn += m_weights->m_mapTaggedSeparateChars.getOrUpdateScore( wt12, m_nScoreIndex , amount , round ) ;
      }
   }

   // disabled features, kept for reference
//   if (index>0) nReturn += m_weights->m_mapTagWordTag.getOrUpdateScore( std::make_pair(word_1, tag_0_tag_2) , m_nScoreIndex , amount , round ) ;
//   if (index>1) nReturn += m_weights->m_mapWordTagTag.getOrUpdateScore( std::make_pair(word_2, tag_0_tag_1) , m_nScoreIndex , amount , round ) ;

   return nReturn;
}