void CTagger::updateScores(const CTwoStringVector* tagged, const CTwoStringVector* correct, unsigned long round) { static int i , j ; static CStateItem item ; static CStringVector raw; if ( *tagged != *correct ) { // get raw sentence from tagged output raw.clear(); for (i=0; i<tagged->size(); ++i) getCharactersFromUTF8String(tagged->at(i).first, &raw); buildStateItem( &raw, tagged, &item ); for (i=0; i<tagged->size(); ++i) getOrUpdateLocalScore(&raw, &item, i, -1, round); buildStateItem( &raw, correct, &item ); for (i=0; i<correct->size(); ++i) getOrUpdateLocalScore(&raw, &item, i, 1, round); } if ( round > m_nNumberOfCurrentTrainingExample ) { m_nNumberOfCurrentTrainingExample = round ; // Updates that are common for all example for ( i=0; i<correct->size(); ++i ) { const CWord &word = correct->at(i).first ; unsigned long tag = CTag( correct->at(i).second ).code() ; CStringVector chars; chars.clear(); getCharactersFromUTF8String(correct->at(i).first, &chars); m_weights->m_mapWordFrequency[word]++; if (m_weights->m_mapWordFrequency[word]>m_weights->m_nMaxWordFrequency) m_weights->m_nMaxWordFrequency = m_weights->m_mapWordFrequency[word]; m_weights->m_mapTagDictionary.add(word, tag); for ( j = 0 ; j < chars.size() ; ++j ) m_weights->m_mapCharTagDictionary.add(chars[j], tag) ; if ( !m_weights->m_Knowledge || (!m_weights->m_Knowledge->isFWorCD(chars[0])&&!m_weights->m_Knowledge->isFWorCD(chars[chars.size()-1]))) m_weights->setMaxLengthByTag( tag , chars.size() ) ; } } }
void CTagger::updateScores(const CTwoStringVector* tagged, const CTwoStringVector* correct, unsigned long round) { static int i , j ; if ( *tagged != *correct ) { for (i=0; i<tagged->size(); ++i) updateLocalFeatureVector(eSubtract, tagged, i, round); for (i=0; i<correct->size(); ++i) updateLocalFeatureVector(eAdd, correct, i, round); } if (round>m_nNumberOfCurrentTrainingExample) { // // Updates that are common for all example // m_nNumberOfCurrentTrainingExample = round; for (i=0; i<correct->size(); ++i) { CWord word = correct->at(i).first; CTag tag(correct->at(i).second); CStringVector chars; chars.clear(); getCharactersFromUTF8String(correct->at(i).first, &chars); m_weights->m_mapWordFrequency[word]++; m_weights->m_mapTagDictionary.add(word, tag); if (m_weights->m_mapWordFrequency[word]>m_weights->m_nMaxWordFrequency) m_weights->m_nMaxWordFrequency = m_weights->m_mapWordFrequency[word]; for ( j = 0 ; j < chars.size() ; ++j ) m_weights->m_mapCharTagDictionary.add(chars[j], tag) ; } } }
bool CTagger::train( const CStringVector * sentence , const CTwoStringVector * correct) { ++m_nTrainingRound ; buildStateItem( sentence, correct, &m_goldState); // Updates that are common for all example for ( unsigned i=0; i<correct->size(); ++i ) { const CWord &word = correct->at(i).first ; unsigned long tag = CTag( correct->at(i).second ).code() ; static CStringVector chars; chars.clear(); getCharactersFromUTF8String(correct->at(i).first, &chars); m_weights->m_mapWordFrequency[word]++; if (m_weights->m_mapWordFrequency[word]>m_weights->m_nMaxWordFrequency) m_weights->m_nMaxWordFrequency = m_weights->m_mapWordFrequency[word]; m_weights->m_mapTagDictionary.add(word, tag); for ( unsigned j=0 ; j<chars.size() ; ++j ) { m_weights->m_mapCharTagDictionary.add(chars[j], tag) ; } if ( PENN_TAG_CLOSED[tag] ) { m_weights->m_mapCanStart.add(chars[0], tag); } if ( !m_weights->m_Knowledge || (!m_weights->m_Knowledge->isFWorCD(chars[0])&& !m_weights->m_Knowledge->isFWorCD(chars[chars.size()-1]))) m_weights->setMaxLengthByTag( tag , chars.size() ) ; } tag( sentence, NULL, NULL, 1, NULL ); return m_bTrainingError; }
void process(const std::string &sInputFile, const std::string &sOutputFile, unsigned long nMaxSentSize) { CDoc2Snt doc2snt(sInputFile, nMaxSentSize); CSentenceWriter writer(sOutputFile); CStringVector sent; while (doc2snt.getSentence(sent)) { if (sent.size()>0 && sent.back()=="\n") sent.pop_back(); writer.writeSentence(&sent, ""); sent.clear(); } }
// Adds to or subtracts from the weights of every feature that fires for the
// word at position `index` of a segmented sentence.
//
//   method - eAdd applies +1 to each feature; any other value applies -1
//   outout - the segmented sentence (one word per element)
//   index  - position of the word whose features are updated
//   round  - round number forwarded to every updateScore call
void CFeatureHandle::updateLocalFeatureVector(SCORE_UPDATE method, const CStringVector* outout, int index, int round) {
   // about words
   CWord word = outout->at(index);
   CWord last_word = index>0 ? outout->at(index-1) : g_emptyWord;
   CTwoWords two_word;
   two_word.allocate(word.str(), last_word.str());
   CStringVector chars;
   getCharactersFromUTF8String(word.str(), &chars);

   // about length; both lengths are capped at LENGTH_MAX-1
   int length = getUTF8StringLength(word.str());
   if (length > LENGTH_MAX-1) length = LENGTH_MAX-1;
   int last_length = getUTF8StringLength(last_word.str());
   if (last_length > LENGTH_MAX-1) last_length = LENGTH_MAX-1;

   // about chars
   CWord first_char = getFirstCharFromUTF8String(word.str());
   CWord last_char = getLastCharFromUTF8String(word.str());
   CWord first_char_last_word = index>0 ? getFirstCharFromUTF8String(last_word.str()) : g_emptyWord;
   CWord last_char_last_word = index>0 ? getLastCharFromUTF8String(last_word.str()) : g_emptyWord;
   CWord two_char = index>0 ? last_char_last_word.str() + first_char.str() : g_emptyWord;
   CTwoWords first_and_last_char, lastword_firstchar, currentword_lastchar, firstcharlastword_word, lastword_lastchar;
   first_and_last_char.allocate(first_char.str(), last_char.str());
   if (index>0) {
      lastword_firstchar.allocate(last_word.str(), first_char.str());
      currentword_lastchar.allocate(word.str(), last_char_last_word.str());
      firstcharlastword_word.allocate(first_char_last_word.str(), first_char.str());
      lastword_lastchar.allocate(last_char_last_word.str(), last_char.str());
   }

   SCORE_TYPE amount = ( (method==eAdd) ? 1 : -1 ) ;

   m_weights.m_mapSeenWords.updateScore(word, amount, round);
   m_weights.m_mapLastWordByWord.updateScore(two_word, amount, round);

   if (length==1)
      m_weights.m_mapOneCharWord.updateScore(first_char, amount, round);
   else {
      m_weights.m_mapFirstAndLastChars.updateScore(first_and_last_char, amount, round);
      // Bound written as j+1 < size(): the unsigned expression size()-1 wraps
      // around when chars is empty, which would index past the end.
      for (unsigned j=0; j+1<chars.size(); ++j) {
         m_weights.m_mapConsecutiveChars.updateScore(chars[j]+chars[j+1], amount, round);
      }
      m_weights.m_mapLengthByFirstChar.updateScore(std::make_pair(first_char, length), amount, round);
      m_weights.m_mapLengthByLastChar.updateScore(std::make_pair(last_char, length), amount, round);
   }

   // features that involve the previous word
   if (index>0) {
      m_weights.m_mapSeparateChars.updateScore(two_char, amount, round);
      m_weights.m_mapLastWordFirstChar.updateScore(lastword_firstchar, amount, round);
      m_weights.m_mapCurrentWordLastChar.updateScore(currentword_lastchar, amount, round);
      m_weights.m_mapFirstCharLastWordByWord.updateScore(firstcharlastword_word, amount, round);
      m_weights.m_mapLastWordByLastChar.updateScore(lastword_lastchar, amount, round);
      m_weights.m_mapLengthByLastWord.updateScore(std::make_pair(last_word, length), amount, round);
      m_weights.m_mapLastLengthByWord.updateScore(std::make_pair(word, last_length), amount, round);
   }
}
bool CTagger::train( const CStringVector * sentence_input , const CTwoStringVector * correct) { ++m_nTrainingRound ; static CStringVector sentence; m_weights->m_rules.record( correct, &sentence ); buildStateItem( &sentence, correct, &m_goldState); // for (int i=0; i<sentence.size(); ++i) // std::cout << m_weights->m_rules.canSeparate(i) << std::endl; static unsigned total_size, local_size; total_size=0; // Updates that are common for all example for ( unsigned i=0; i<correct->size(); ++i ) { const CWord &word = correct->at(i).first ; unsigned long tag = CTag( correct->at(i).second ).code() ; static CStringVector chars; static unsigned j; chars.clear(); getCharactersFromUTF8String(correct->at(i).first, &chars); local_size = chars.size(); m_weights->m_mapWordFrequency[word]++; if (m_weights->m_mapWordFrequency[word]>m_weights->m_nMaxWordFrequency) m_weights->m_nMaxWordFrequency = m_weights->m_mapWordFrequency[word]; m_weights->m_mapTagDictionary.add(word, tag); for ( j=0 ; j<local_size; ++j ) { m_weights->m_mapCharTagDictionary.add(chars[j], tag) ; } if ( PENN_TAG_CLOSED[tag] || tag==PENN_TAG_CD ) { m_weights->m_mapCanStart.add(chars[0], tag); } // if ( !m_weights->m_Knowledge || // (!m_weights->m_Knowledge->isFWorCD(chars[0])&& // !m_weights->m_Knowledge->isFWorCD(chars[chars.size()-1]))) bool bNoSep=false; for ( j=total_size+1; j<total_size+local_size; ++j) if (!m_weights->m_rules.canSeparate(j)) bNoSep = true; if (!bNoSep) m_weights->setMaxLengthByTag( tag , local_size ) ; total_size += chars.size(); } work( &sentence, NULL, NULL, 1, NULL ); return m_bTrainingError; }
// Sums the local scores of every position of a tagged sentence.
//
// The raw character sequence is rebuilt from the words of `tagged`, a state
// item is built from it, and getOrUpdateLocalScore is called once per word.
//
//   tagged - the tagged sentence to score
//
// Returns the sum of the per-position values.
SCORE_TYPE CTagger::getGlobalScore(const CTwoStringVector* tagged) {
   // Kept static so the same objects are reused on every call.
   static CStateItem item;
   static CStringVector raw;

   const int numWords = static_cast<int>( tagged->size() );

   raw.clear();
   for ( int pos = 0; pos < numWords; ++pos )
      getCharactersFromUTF8String( tagged->at(pos).first, &raw );

   buildStateItem( &raw, tagged, &item );

   SCORE_TYPE total = 0;
   for ( int pos = 0; pos < numWords; ++pos )
      total += getOrUpdateLocalScore( &raw, &item, pos );
   return total;
}
// Adds to or subtracts from the weights of every feature that fires for the
// tagged word at position `index` of `sentence`.
//
//   method   - eAdd applies +1 to each feature; any other value applies -1
//   sentence - tagged sentence; .first is the word, .second its tag string
//   index    - position of the word whose features are updated
//   round    - round number forwarded to every updateCurrent call
void CTagger :: updateLocalFeatureVector( SCORE_UPDATE method , const CTwoStringVector * sentence , unsigned long index , unsigned long round ) {
   const bool hasPrev = index > 0 ;
   const bool hasPrevPrev = index > 1 ;
   const bool hasNext = index + 1 < sentence->size() ;

   // NOTE(review): several locals below are never read afterwards. They are
   // kept because constructing them may have side effects that are not
   // visible from this function -- confirm before removing.

   // about words
   CWord word = sentence->at( index ).first ;
   CWord last_word = hasPrev ? sentence->at( index - 1 ).first : g_emptyWord ;
   CWord next_word = hasNext ? sentence->at( index + 1 ).first : g_emptyWord ;

   CStringVector chars , last_chars ;
   getCharactersFromUTF8String( sentence->at( index ).first , &chars ) ;
   if ( hasPrev )
      getCharactersFromUTF8String( sentence->at( index - 1 ).first , &last_chars ) ;

   // about length
   int length = chars.size() ;
   //if ( length > LENGTH_MAX-1 ) length = LENGTH_MAX-1 ;
   int last_length = last_chars.size() ;
   //if ( last_length > LENGTH_MAX-1 ) last_length = LENGTH_MAX-1 ;

   // about chars
   CWord first_char = chars[ 0 ] ;
   CWord last_char = chars[ chars.size() - 1 ] ;
   CWord first_char_last_word = hasPrev ? last_chars[ 0 ] : g_emptyWord ;
   CWord last_char_last_word = hasPrev ? last_chars[ last_chars.size() - 1 ] : g_emptyWord ;
   CWord first_char_next_word = hasNext
                                ? getFirstCharFromUTF8String( sentence->at( index + 1 ).first )
                                : g_emptyWord ;

   // Last two characters before the current word: taken from the previous
   // word when it has at least two characters, otherwise from the word before
   // it plus the previous word's first character.
   CWord last_twochar_last_word =
      last_chars.size() > 1
      ? last_chars[ last_chars.size() - 2 ] + last_chars[ last_chars.size() - 1 ]
      : ( hasPrevPrev
          ? getLastCharFromUTF8String( sentence->at( index - 2 ).first ) + last_chars[ 0 ]
          : g_emptyWord ) ;

   // First two characters from the current word onwards, borrowing the first
   // character of the next word when the current word has a single character.
   CWord first_twochar =
      chars.size() > 1
      ? chars[ 0 ] + chars[ 1 ]
      : ( hasNext
          ? chars[ 0 ] + getFirstCharFromUTF8String( sentence->at( index + 1 ).first )
          : g_emptyWord ) ;

   CWord currentword_lasttwochar = hasPrevPrev ? last_twochar_last_word.str() + word.str() : g_emptyWord ;
   CWord lastword_firsttwochar = hasPrev && hasNext ? last_word.str() + first_twochar.str() : g_emptyWord ;
   CWord two_char = hasPrev ? last_char_last_word.str() + first_char.str() : g_emptyWord ;
   CWord lastword_firstchar = hasPrev ? last_word.str() + first_char.str() : g_emptyWord ;
   CWord currentword_lastchar = hasPrev ? last_char_last_word.str() + word.str() : g_emptyWord ;
   CWord three_char = length == 1
                      ? last_char_last_word.str() + word.str() + first_char_next_word.str()
                      : g_emptyWord ;
   CTwoWords two_word ;

   // about tags
   const CTag tag( sentence->at( index ).second ) ;
   const CTag last_tag = index > 0 ? CTag( sentence->at( index-1 ).second) : CTag::SENTENCE_BEGIN ;
   const CTag second_last_tag = index > 1 ? CTag( sentence->at( index-2 ).second) : CTag::SENTENCE_BEGIN ;
   const CTagSet<CTag, 2> tag_bigram( encodeTags( tag, last_tag ) ) ;
   const CTagSet<CTag, 3> tag_trigram( encodeTags( tag, last_tag, second_last_tag ) ) ;
   CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2 ;
   CTwoTaggedWords wt12 ;

   // about the char categories
   long int first_char_cat = m_weights->m_mapCharTagDictionary.lookup(first_char) | (1<<tag.code()) ;
   long int last_char_cat = m_weights->m_mapCharTagDictionary.lookup(last_char) | (1<<tag.code()) ;

   SCORE_TYPE amount = method == eAdd ? 1 : -1 ;

   // tag unigram / bigram / trigram features
   m_weights->m_mapCurrentTag[ std::make_pair(word, tag) ].updateCurrent( amount , round ) ;
   m_weights->m_mapLastTagByTag[ tag_bigram ].updateCurrent( amount , round ) ;
   m_weights->m_mapLastTwoTagsByTag[ tag_trigram ].updateCurrent( amount , round ) ;

   // features involving the previous word; each fires only for words of at
   // most two characters
   if ( hasPrev ) {
      const bool prevIsShort = last_length <= 2 ;
      const bool wordIsShort = length <= 2 ;
      if ( prevIsShort )
         m_weights->m_mapTagByLastWord[ std::make_pair(last_word, tag) ].updateCurrent( amount , round ) ;
      if ( wordIsShort ) {
         m_weights->m_mapLastTagByWord[ std::make_pair(word, last_tag) ].updateCurrent( amount , round ) ;
         m_weights->m_mapTagByWordAndPrevChar[ std::make_pair(currentword_lastchar, tag) ].updateCurrent( amount , round ) ;
      }
      if ( prevIsShort )
         m_weights->m_mapTagByWordAndNextChar[ std::make_pair(lastword_firstchar, last_tag) ].updateCurrent( amount , round ) ;
   }

   // single-character word: only the three-character context feature applies
   if ( length == 1 ) {
      if ( hasPrev && hasNext )
         m_weights->m_mapTagOfOneCharWord[ std::make_pair(three_char, tag) ].updateCurrent( amount , round ) ;
      return ;
   }

   // character-level features
   m_weights->m_mapTagByFirstChar[ std::make_pair(first_char, tag) ].updateCurrent( amount , round ) ;
   m_weights->m_mapTagByLastChar[ std::make_pair(last_char, tag) ].updateCurrent( amount , round ) ;
   // m_weights->m_mapTagByFirstCharCat[ std::make_pair(first_char_cat, tag) ].updateCurrent( amount , round ) ;
   m_weights->m_mapTagByLastCharCat[ std::make_pair(last_char_cat, tag) ].updateCurrent( amount , round ) ;

   const int numChars = length ;
   for ( int j = 0 ; j < numChars ; ++j ) {
      const bool notFirst = j > 0 ;
      const bool notLast = j + 1 < numChars ;

      // characters strictly inside the word
      if ( notFirst && notLast )
         m_weights->m_mapTagByChar[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount , round ) ;

      if ( notFirst ) {
         // tagged character paired with the first character of the word
         wt1.load(chars[j], tag) ;
         wt2.load(first_char) ;
         wt12.allocate(wt1, wt2) ;
         m_weights->m_mapTaggedCharByFirstChar[ wt12 ].updateCurrent( amount , round ) ;
         // character identical to the one before it
         if ( chars[j] == chars[j-1] )
            m_weights->m_mapRepeatedCharByTag[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount , round ) ;
      }

      if ( notLast ) {
         // tagged character paired with the last character of the word
         wt1.load(chars[j], tag) ;
         wt2.load(last_char) ;
         wt12.allocate(wt1, wt2) ;
         m_weights->m_mapTaggedCharByLastChar[ wt12 ].updateCurrent( amount , round ) ;
      }
   }
}