bool CTagger::train( const CStringVector * sentence , const CTwoStringVector * correct) {
   ++m_nTrainingRound;
   buildStateItem( sentence, correct, &m_goldState);
   // Updates that are common for all examples
   for ( unsigned i=0; i<correct->size(); ++i ) {
      const CWord &word = correct->at(i).first;
      unsigned long tag = CTag( correct->at(i).second ).code();
      static CStringVector chars;
      chars.clear();
      getCharactersFromUTF8String(correct->at(i).first, &chars);
      m_weights->m_mapWordFrequency[word]++;
      if (m_weights->m_mapWordFrequency[word]>m_weights->m_nMaxWordFrequency)
         m_weights->m_nMaxWordFrequency = m_weights->m_mapWordFrequency[word];
      m_weights->m_mapTagDictionary.add(word, tag);
      for ( unsigned j=0; j<chars.size(); ++j ) {
         m_weights->m_mapCharTagDictionary.add(chars[j], tag);
      }
      if ( PENN_TAG_CLOSED[tag] ) {
         m_weights->m_mapCanStart.add(chars[0], tag);
      }
      if ( !m_weights->m_Knowledge ||
           (!m_weights->m_Knowledge->isFWorCD(chars[0]) &&
            !m_weights->m_Knowledge->isFWorCD(chars[chars.size()-1])))
         m_weights->setMaxLengthByTag( tag, chars.size() );
   }
   tag( sentence, NULL, NULL, 1, NULL );
   return m_bTrainingError;
}
void ServiceController::Start(const CStringVector& ar) {
   LPCTSTR *p = 0;
   if (!ar.empty()) {
      p = (LPCTSTR*)alloca(ar.size()*sizeof(LPCTSTR));
      for (size_t i=0; i<ar.size(); ++i)
         p[i] = ar[i];
   }
   Win32Check(::StartService(m_handle, (DWORD)ar.size(), p));
}
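// Usage sketch (not part of the original source): it assumes a ServiceController
// that wraps an already-opened SC_HANDLE in m_handle, a CStringVector of CString
// arguments, and a Win32Check helper that throws when ::StartService returns FALSE.
// The argument strings below are hypothetical.
void StartLoggingService(ServiceController &controller) {
   CStringVector args;
   args.push_back("--verbose");               // example service argument
   args.push_back("--log=C:\\svc\\run.log");  // example service argument
   controller.Start(args);   // forwarded as the argc/argv pair of ::StartService
}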
void CTagger::updateScores(const CTwoStringVector* tagged, const CTwoStringVector* correct, unsigned long round) {
   static int i, j;
   static CStateItem item;
   static CStringVector raw;
   if ( *tagged != *correct ) {
      // get raw sentence from tagged output
      raw.clear();
      for (i=0; i<tagged->size(); ++i)
         getCharactersFromUTF8String(tagged->at(i).first, &raw);
      buildStateItem( &raw, tagged, &item );
      for (i=0; i<tagged->size(); ++i)
         getOrUpdateLocalScore(&raw, &item, i, -1, round);
      buildStateItem( &raw, correct, &item );
      for (i=0; i<correct->size(); ++i)
         getOrUpdateLocalScore(&raw, &item, i, 1, round);
   }
   if ( round > m_nNumberOfCurrentTrainingExample ) {
      m_nNumberOfCurrentTrainingExample = round;
      // Updates that are common for all examples
      for ( i=0; i<correct->size(); ++i ) {
         const CWord &word = correct->at(i).first;
         unsigned long tag = CTag( correct->at(i).second ).code();
         CStringVector chars;
         chars.clear();
         getCharactersFromUTF8String(correct->at(i).first, &chars);
         m_weights->m_mapWordFrequency[word]++;
         if (m_weights->m_mapWordFrequency[word]>m_weights->m_nMaxWordFrequency)
            m_weights->m_nMaxWordFrequency = m_weights->m_mapWordFrequency[word];
         m_weights->m_mapTagDictionary.add(word, tag);
         for ( j = 0; j < chars.size(); ++j )
            m_weights->m_mapCharTagDictionary.add(chars[j], tag);
         if ( !m_weights->m_Knowledge ||
              (!m_weights->m_Knowledge->isFWorCD(chars[0]) && !m_weights->m_Knowledge->isFWorCD(chars[chars.size()-1])))
            m_weights->setMaxLengthByTag( tag, chars.size() );
      }
   }
}
bool CTagger::train( const CStringVector * sentence_input , const CTwoStringVector * correct) {
   ++m_nTrainingRound;
   static CStringVector sentence;
   m_weights->m_rules.record( correct, &sentence );
   buildStateItem( &sentence, correct, &m_goldState);
   // for (int i=0; i<sentence.size(); ++i)
   //    std::cout << m_weights->m_rules.canSeparate(i) << std::endl;
   static unsigned total_size, local_size;
   total_size = 0;
   // Updates that are common for all examples
   for ( unsigned i=0; i<correct->size(); ++i ) {
      const CWord &word = correct->at(i).first;
      unsigned long tag = CTag( correct->at(i).second ).code();
      static CStringVector chars;
      static unsigned j;
      chars.clear();
      getCharactersFromUTF8String(correct->at(i).first, &chars);
      local_size = chars.size();
      m_weights->m_mapWordFrequency[word]++;
      if (m_weights->m_mapWordFrequency[word]>m_weights->m_nMaxWordFrequency)
         m_weights->m_nMaxWordFrequency = m_weights->m_mapWordFrequency[word];
      m_weights->m_mapTagDictionary.add(word, tag);
      for ( j=0; j<local_size; ++j ) {
         m_weights->m_mapCharTagDictionary.add(chars[j], tag);
      }
      if ( PENN_TAG_CLOSED[tag] || tag==PENN_TAG_CD ) {
         m_weights->m_mapCanStart.add(chars[0], tag);
      }
      // if ( !m_weights->m_Knowledge ||
      //      (!m_weights->m_Knowledge->isFWorCD(chars[0])&&
      //       !m_weights->m_Knowledge->isFWorCD(chars[chars.size()-1])))
      bool bNoSep = false;
      for ( j=total_size+1; j<total_size+local_size; ++j)
         if (!m_weights->m_rules.canSeparate(j))
            bNoSep = true;
      if (!bNoSep)
         m_weights->setMaxLengthByTag( tag, local_size );
      total_size += chars.size();
   }
   work( &sentence, NULL, NULL, 1, NULL );
   return m_bTrainingError;
}
void CTagger::updateScores(const CTwoStringVector* tagged, const CTwoStringVector* correct, unsigned long round) {
   static int i, j;
   if ( *tagged != *correct ) {
      for (i=0; i<tagged->size(); ++i)
         updateLocalFeatureVector(eSubtract, tagged, i, round);
      for (i=0; i<correct->size(); ++i)
         updateLocalFeatureVector(eAdd, correct, i, round);
   }
   if (round>m_nNumberOfCurrentTrainingExample) {
      //
      // Updates that are common for all examples
      //
      m_nNumberOfCurrentTrainingExample = round;
      for (i=0; i<correct->size(); ++i) {
         CWord word = correct->at(i).first;
         CTag tag(correct->at(i).second);
         CStringVector chars;
         chars.clear();
         getCharactersFromUTF8String(correct->at(i).first, &chars);
         m_weights->m_mapWordFrequency[word]++;
         m_weights->m_mapTagDictionary.add(word, tag);
         if (m_weights->m_mapWordFrequency[word]>m_weights->m_nMaxWordFrequency)
            m_weights->m_nMaxWordFrequency = m_weights->m_mapWordFrequency[word];
         for ( j = 0; j < chars.size(); ++j )
            m_weights->m_mapCharTagDictionary.add(chars[j], tag);
      }
   }
}
void process(const std::string &sInputFile, const std::string &sOutputFile, unsigned long nMaxSentSize) {
   CDoc2Snt doc2snt(sInputFile, nMaxSentSize);
   CSentenceWriter writer(sOutputFile);
   CStringVector sent;
   while (doc2snt.getSentence(sent)) {
      if (sent.size()>0 && sent.back()=="\n")
         sent.pop_back();
      writer.writeSentence(&sent, "");
      sent.clear();
   }
}
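// Usage sketch (not part of the original source): a hypothetical driver that
// splits one raw document into sentences with a maximum sentence size of 256;
// the file names are examples only.
int main() {
   const std::string sInput("input.txt");
   const std::string sOutput("output.snt");
   process(sInput, sOutput, 256);   // writes the extracted sentences to sOutput
   return 0;
}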
// line 1 - TOOL (start of tool definition)
// line 2 - tool material (1-HSS, 2-CARBIDE, 3-COATED CAR, 4-CERAMIC, 5-BORZON, 10-UNKNOWN)
// line 3 - tool comment
// line 4 - tool name (geometry reference for backplot)
// line 5 - tool manufacturer
// line 6 - chuck designation
// line 7 - tool_no, tool_type, rad_type, dia, crad, thds, tip_angle,
//          dia_off, len_off, feed, plunge, retract, rpm, coolant, n_flutes
// line 8 - Drilling attributes (see tool_type in line 7 above)
// line 8 - cycle, peck1, peck2, peck_clr, chip_brk, dwell, shldr_angle, root_dia (tap), bore_shift
// line 8 - Milling attributes (see tool_type in line 7 above)
// line 8 - cut_able, rgh_x, rgh_z, fin_x, fin_z, tip_dia, root_dia (thd mill), thd_angle
// line 9 - pilot_dia, flute_len, oa_len, shldr_len, arbor_dia, hldr_dia, hldr_len, spindle_ccw, sfm, fpt, metric
HRESULT CDBStepNC::LoadMasterCAMTool(CString filename) {
   CString contents = ReadAFile(filename);
   if (contents.GetLength() < 1)
      return E_INVALIDARG;
   CStringVector lines = CStringVector::Tokenize(contents, "\n");
   CStringVector columns;
   columns.push_back("toolid");
   columns.push_back("name");
   columns.push_back("tooltypeid");
   columns.push_back("materialid");
   columns.push_back("number_of_teeth");
   columns.push_back("hand_of_cut");
   columns.push_back("coolant_through_tool");
   columns.push_back("cutting_edge_length");
   columns.push_back("flute_length");
   columns.push_back("overall_length");
   columns.push_back("shoulder_length");
   columns.push_back("tip_diameter");
   columns.push_back("tool_tip_half_angle");
   columns.push_back("sfm");
   columns.push_back("fpt");
   columns.push_back("metric");
   CStringVector values;
   values.resize(16);
   for (int i=40, j=1; i < lines.size(); i+=10, j++) {
      CStringVector items1 = CStringVector::Tokenize(lines[i+1], " ");
      CStringVector items2 = CStringVector::Tokenize(lines[i+2], " ");
      CStringVector items3 = CStringVector::Tokenize(lines[i+3], " ");
      CStringVector items7 = CStringVector::Tokenize(lines[i+7], " ");
      CStringVector items8 = CStringVector::Tokenize(lines[i+8], " ");
      CStringVector items9 = CStringVector::Tokenize(lines[i+9], " ");
      values[0]  = StrFormat("%d", j);                              // toolid
      values[1]  = lines[i+3].Mid(lines[i+3].Find("-")+1).Trim();   // name
      values[2]  = items7[3].Trim();                                // tooltypeid
      values[3]  = items2[2].Trim();                                // materialid
      values[4]  = items7[16].Trim();                               // number_of_teeth
      values[5]  = (items9[9] == "1") ? "LEFT" : "RIGHT";           // hand_of_cut
      values[6]  = items7[15];                                      // coolant_through_tool
      values[7]  = items9[3];                                       // cutting_edge_length
      values[8]  = items9[3];                                       // flute_length
      values[9]  = items7[10];                                      // overall_length
      values[10] = items9[5];                                       // shoulder_length
      values[11] = items7[5];                                       // tip_diameter
      values[12] = items7[8];                                       // tool_tip_half_angle
      values[13] = items9[10];                                      // sfm
      values[14] = items9[11];                                      // fpt
      values[15] = items9[12].Trim();                               // metric
      InsertRow("milling_cutter", columns, values);
   }
   return S_OK;
}
HRESULT CDBStepNC::InsertRow(CString table, CStringVector & columns, CStringVector & values) {
   CCommand< CDynamicStringAccessor > sqlInsertCommand;
   int i;
   CString tszSQL;
   HRESULT hr;
   if (values.size() != columns.size())
      return E_INVALIDARG;
   if (values.size() == 0 || columns.size() == 0)
      return E_INVALIDARG;
   tszSQL.Format("INSERT INTO %s (", table);
   for (i=0; i<columns.size(); i++) {
      if (i>0)
         tszSQL += " ,";
      tszSQL.AppendFormat("%s", columns[i]);
   }
   tszSQL.AppendFormat(") VALUES (");
   for (i=0; i<values.size(); i++) {
      if (i>0)
         tszSQL += " ,";
      tszSQL.AppendFormat("'%s'", values[i]);
   }
   tszSQL.AppendFormat(")");
   hr = sqlInsertCommand.Open( m_session, tszSQL );
   return hr;
}
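// Usage sketch (not part of the original source): assumes a CDBStepNC whose
// m_session is already open and a hypothetical "tool_vendor" table with two text
// columns. InsertRow only wraps values in single quotes, so values containing
// quote characters would need escaping before this call.
HRESULT AddVendorRow(CDBStepNC &db) {
   CStringVector columns, values;
   columns.push_back("vendor_name");
   columns.push_back("catalog_no");
   values.push_back("Acme Tooling");   // example data only
   values.push_back("AT-1000");
   return db.InsertRow("tool_vendor", columns, values);   // builds and executes the INSERT
}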
void CFeatureHandle::updateLocalFeatureVector(SCORE_UPDATE method, const CStringVector* output, int index, int round) {
   // about words
   CWord word = output->at(index);
   CWord last_word = index>0 ? output->at(index-1) : g_emptyWord;
   CTwoWords two_word;
   two_word.allocate(word.str(), last_word.str());
   CStringVector chars;
   chars.clear();
   getCharactersFromUTF8String(word.str(), &chars);
   // about length
   int length = getUTF8StringLength(word.str());
   if (length > LENGTH_MAX-1) length = LENGTH_MAX-1;
   int last_length = getUTF8StringLength(last_word.str());
   if (last_length > LENGTH_MAX-1) last_length = LENGTH_MAX-1;
   // about chars
   CWord first_char = getFirstCharFromUTF8String(word.str());
   CWord last_char = getLastCharFromUTF8String(word.str());
   CWord first_char_last_word = index>0 ? getFirstCharFromUTF8String(last_word.str()) : g_emptyWord;
   CWord last_char_last_word = index>0 ? getLastCharFromUTF8String(last_word.str()) : g_emptyWord;
   CWord two_char = index>0 ? last_char_last_word.str() + first_char.str() : g_emptyWord;
   CTwoWords first_and_last_char, lastword_firstchar, currentword_lastchar, firstcharlastword_word, lastword_lastchar;
   first_and_last_char.allocate(first_char.str(), last_char.str());
   if (index>0) {
      lastword_firstchar.allocate(last_word.str(), first_char.str());
      currentword_lastchar.allocate(word.str(), last_char_last_word.str());
      firstcharlastword_word.allocate(first_char_last_word.str(), first_char.str());
      lastword_lastchar.allocate(last_char_last_word.str(), last_char.str());
   }
   SCORE_TYPE amount = ( (method==eAdd) ? 1 : -1 );
   m_weights.m_mapSeenWords.updateScore(word, amount, round);
   m_weights.m_mapLastWordByWord.updateScore(two_word, amount, round);
   if (length==1)
      m_weights.m_mapOneCharWord.updateScore(first_char, amount, round);
   else {
      m_weights.m_mapFirstAndLastChars.updateScore(first_and_last_char, amount, round);
      for (int j=0; j<chars.size()-1; j++) {
         m_weights.m_mapConsecutiveChars.updateScore(chars[j]+chars[j+1], amount, round);
      }
      m_weights.m_mapLengthByFirstChar.updateScore(std::make_pair(first_char, length), amount, round);
      m_weights.m_mapLengthByLastChar.updateScore(std::make_pair(last_char, length), amount, round);
   }
   if (index>0) {
      m_weights.m_mapSeparateChars.updateScore(two_char, amount, round);
      m_weights.m_mapLastWordFirstChar.updateScore(lastword_firstchar, amount, round);
      m_weights.m_mapCurrentWordLastChar.updateScore(currentword_lastchar, amount, round);
      m_weights.m_mapFirstCharLastWordByWord.updateScore(firstcharlastword_word, amount, round);
      m_weights.m_mapLastWordByLastChar.updateScore(lastword_lastchar, amount, round);
      m_weights.m_mapLengthByLastWord.updateScore(std::make_pair(last_word, length), amount, round);
      m_weights.m_mapLastLengthByWord.updateScore(std::make_pair(word, last_length), amount, round);
   }
}
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) {
#ifdef DEBUG
   clock_t total_start_time = clock();
#endif
   TRACE("Starting segmenting a sentence...");
   // turn the spaces in the input sentence into rules that separate corresponding characters
   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule);
   rules.segment(sentence_input, &sentence);
   const unsigned long length = sentence.size();
   assert(length<MAX_SENTENCE_SIZE);
   assert(vReturn!=NULL);
   vReturn->clear();
   // try to work out the best item with the
   // correct output reference param as NULL
   work(this, sentence, vReturn, out_scores, rules, NULL, nBest, -1);
   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}
void CSegmentor::train(const CStringVector* sentence_input, const CStringVector* correct, int & round) {
#ifdef DEBUG
   clock_t total_start_time = clock();
#endif
   TRACE("Starting training using a sentence...");
   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule);
   rules.segment(sentence_input, &sentence);
   const unsigned long int length = sentence.size();
   assert(length<MAX_SENTENCE_SIZE);
   static std::vector<unsigned> correct_starts;
   static int word_length, word_index, char_length, char_index;   // word_xxx are from correct, char_xxx from sentence
   char_index = 0;
   int count = 0;
   correct_starts.clear();
   correct_starts.push_back(count);
   for (word_index=0; word_index<correct->size(); word_index++) {
      word_length = correct->at(word_index).size();
      char_length = 0;
      while (char_length<word_length) {
         char_length += sentence[char_index++].size();
         count += 1;
      }
      assert(char_length==word_length);
      correct_starts.push_back(count);
   }
   // the main learning process with update
   work(this, sentence, 0, 0, rules, &correct_starts, 1, round);
   TRACE("Done");
   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}
void CTagger::updateLocalFeatureVector( SCORE_UPDATE method , const CTwoStringVector * sentence , unsigned long index , unsigned long round ) {
   // about words
   CWord word = sentence->at(index).first;
   CWord last_word = index > 0 ? sentence->at(index-1).first : g_emptyWord;
   CWord next_word = index < sentence->size()-1 ? sentence->at(index+1).first : g_emptyWord;
   CStringVector chars, last_chars;
   chars.clear();
   getCharactersFromUTF8String( sentence->at(index).first, &chars );
   last_chars.clear();
   if ( index > 0 )
      getCharactersFromUTF8String( sentence->at(index-1).first, &last_chars );
   // about length
   int length = chars.size();
   //if ( length > LENGTH_MAX-1 ) length = LENGTH_MAX-1 ;
   int last_length = last_chars.size();
   //if ( last_length > LENGTH_MAX-1 ) last_length = LENGTH_MAX-1 ;
   // about chars
   CWord first_char = chars[0];
   CWord last_char = chars[chars.size()-1];
   CWord first_char_last_word = index > 0 ? last_chars[0] : g_emptyWord;
   CWord last_char_last_word = index > 0 ? last_chars[last_chars.size()-1] : g_emptyWord;
   CWord first_char_next_word = index+1 < sentence->size() ? getFirstCharFromUTF8String( sentence->at(index+1).first ) : g_emptyWord;
   CWord last_twochar_last_word = last_chars.size() > 1 ? last_chars[last_chars.size()-2] + last_chars[last_chars.size()-1]
                                                        : ( index > 1 ? getLastCharFromUTF8String(sentence->at(index-2).first) + last_chars[0] : g_emptyWord );
   CWord first_twochar = chars.size() > 1 ? chars[0] + chars[1]
                                          : ( index+1 < sentence->size() ? chars[0] + getFirstCharFromUTF8String( sentence->at(index+1).first ) : g_emptyWord );
   CWord currentword_lasttwochar = index > 1 ? last_twochar_last_word.str() + word.str() : g_emptyWord;
   CWord lastword_firsttwochar = index > 0 && index+1 < sentence->size() ? last_word.str() + first_twochar.str() : g_emptyWord;
   CWord two_char = index > 0 ? last_char_last_word.str() + first_char.str() : g_emptyWord;
   CWord lastword_firstchar = index > 0 ? last_word.str() + first_char.str() : g_emptyWord;
   CWord currentword_lastchar = index > 0 ? last_char_last_word.str() + word.str() : g_emptyWord;
   CWord three_char = length == 1 ? last_char_last_word.str() + word.str() + first_char_next_word.str() : g_emptyWord;
   CTwoWords two_word;
   // about tags
   const CTag tag( sentence->at(index).second );
   const CTag last_tag = index > 0 ? CTag( sentence->at(index-1).second ) : CTag::SENTENCE_BEGIN;
   const CTag second_last_tag = index > 1 ? CTag( sentence->at(index-2).second ) : CTag::SENTENCE_BEGIN;
   const CTagSet<CTag, 2> tag_bigram(encodeTags(tag, last_tag));
   const CTagSet<CTag, 3> tag_trigram(encodeTags(tag, last_tag, second_last_tag));
   CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
   CTwoTaggedWords wt12;
   // about the char categories
   long int first_char_cat = m_weights->m_mapCharTagDictionary.lookup(first_char) | (1<<tag.code());
   long int last_char_cat = m_weights->m_mapCharTagDictionary.lookup(last_char) | (1<<tag.code());
   SCORE_TYPE amount = method == eAdd ? 1 : -1;
   m_weights->m_mapCurrentTag[ std::make_pair(word, tag) ].updateCurrent( amount, round );
   m_weights->m_mapLastTagByTag[ tag_bigram ].updateCurrent( amount, round );
   m_weights->m_mapLastTwoTagsByTag[ tag_trigram ].updateCurrent( amount, round );
   if ( index > 0 ) {
      if ( last_length <= 2 ) m_weights->m_mapTagByLastWord[ std::make_pair(last_word, tag) ].updateCurrent( amount, round );
      if ( length <= 2 ) m_weights->m_mapLastTagByWord[ std::make_pair(word, last_tag) ].updateCurrent( amount, round );
      if ( length <= 2 ) m_weights->m_mapTagByWordAndPrevChar[ std::make_pair(currentword_lastchar, tag) ].updateCurrent( amount, round );
      if ( last_length <= 2 ) m_weights->m_mapTagByWordAndNextChar[ std::make_pair(lastword_firstchar, last_tag) ].updateCurrent( amount, round );
   }
   if ( length == 1 ) {
      if ( index > 0 && index < sentence->size()-1 )
         m_weights->m_mapTagOfOneCharWord[ std::make_pair(three_char, tag) ].updateCurrent( amount, round );
   }
   else {
      m_weights->m_mapTagByFirstChar[ std::make_pair(first_char, tag) ].updateCurrent( amount, round );
      m_weights->m_mapTagByLastChar[ std::make_pair(last_char, tag) ].updateCurrent( amount, round );
      // m_weights->m_mapTagByFirstCharCat[ std::make_pair(first_char_cat, tag) ].updateCurrent( amount , round ) ;
      m_weights->m_mapTagByLastCharCat[ std::make_pair(last_char_cat, tag) ].updateCurrent( amount, round );
      for ( int j = 0; j < chars.size(); ++j ) {
         if ( j > 0 && j < chars.size()-1 )
            m_weights->m_mapTagByChar[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount, round );
         if ( j > 0 ) {
            wt1.load(chars[j], tag);
            wt2.load(first_char);
            wt12.allocate(wt1, wt2);
            m_weights->m_mapTaggedCharByFirstChar[ wt12 ].updateCurrent( amount, round );
            if ( chars[j] == chars[j-1] )
               m_weights->m_mapRepeatedCharByTag[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount, round );
            //
         }
         if (j<chars.size()-1) {
            wt1.load(chars[j], tag);
            wt2.load(last_char);
            wt12.allocate(wt1, wt2);
            m_weights->m_mapTaggedCharByLastChar[ wt12 ].updateCurrent(amount, round);
         }
      }
   }
}
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) {
   clock_t total_start_time = clock();
   CStateItem *pGenerator, *pCandidate;
   unsigned index;          // the index of the current char
   unsigned j, k;           // temporary index
   int subtract_score;      // the score to be subtracted (previous item)
   static unsigned doneLastWord[MAX_SENTENCE_SIZE];
   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule);
   rules.segment(sentence_input, &sentence);
   const unsigned length = sentence.size();
   if (length > MAX_SENTENCE_SIZE) {
      std::cerr << "The size of the sentence is " << length << " characters, which is larger than the limit of the system (" << MAX_SENTENCE_SIZE << ")" << std::endl;
      vReturn->clear();
      return;
   }
   assert(vReturn!=NULL);
   //clock_t start_time = clock();
   TRACE("Initialising the segmentation process...");
   vReturn->clear();
   clearWordCache();
   m_Agenda->clear();
   pCandidate = m_Agenda->candidateItem();   // make the first item
   pCandidate->clear();                      // restore state using clean
   m_Agenda->pushCandidate();                // and push it back
   m_Agenda->nextRound();                    // as the generator item
   if (nBest == 1)                           // optimization for one best
      for (j=0; j<MAX_SENTENCE_SIZE; ++j)
         doneLastWord[j] = 0;
   TRACE("Segmenting started");
   //TRACE("initialisation time: " << clock() - start_time);
   for (index=0; index<length; index++) {
      // generate new state items for each character
      pGenerator = m_Agenda->generatorStart();
      for (j=0; j<m_Agenda->generatorSize(); ++j) {
         // 1. generate new items according to each previous item.
         if (pGenerator->m_nLength>0)
            k = pGenerator->getWordStart(pGenerator->m_nLength-1);
         // If we only ask 1-best, then we take only the best among those with the last word
         if ( ( nBest > 1 || pGenerator->m_nLength==0 || doneLastWord[k]<index+1 ) && rules.canSeparate( index ) ) {
            pCandidate = m_Agenda->candidateItem();
            pCandidate->copy(pGenerator);
            pCandidate->append(index);
            pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1);
            m_Agenda->pushCandidate();
            if (nBest == 1 && pGenerator->m_nLength>0)
               doneLastWord[k] = index+1;
         }
         // 2. generate by replacing items
         if ( index > 0 && rules.canAppend(index) ) {
            pCandidate = m_Agenda->candidateItem();
            pCandidate->copy(pGenerator);
            subtract_score = m_Feature->getLocalScore(&sentence, pGenerator, pGenerator->m_nLength-1);
            pCandidate->m_nScore -= subtract_score;
            pCandidate->replace(index);
            pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1);
            m_Agenda->pushCandidate();
         }
         pGenerator = m_Agenda->generatorNext();   // next generator
      }
      m_Agenda->nextRound();   // move round
   }
   // now generate output sentence
   // n-best list will be stored in array
   // from the addr vReturn
   TRACE("Outputting sentence");
   for (k=0; k<nBest; ++k) {
      // clear
      vReturn[k].clear();
      if (out_scores!=NULL)
         out_scores[k] = 0;
      // assign retval
      if (k<m_Agenda->generatorSize()) {
         pGenerator = m_Agenda->generator(k);
         for (j=0; j<pGenerator->m_nLength; j++) {
            std::string temp = "";
            for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) {
               assert(sentence.at(l)!=" ");   // [SPACE]
               temp += sentence.at(l);
            }
            vReturn[k].push_back(temp);
         }
         if (out_scores!=NULL)
            out_scores[k] = pGenerator->m_nScore;
      }
   }
   TRACE("Done, the best score: " << pGenerator->m_nScore);
   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}
void CTagger::tag( const CStringVector * sentence_input , CTwoStringVector * vReturn , SCORE_TYPE * out_scores , unsigned long nBest , const CBitArray * prunes ) {
   clock_t total_start_time = clock();
   int index, start_index, generator_index, temp_index, word_length;
   const CStateItem * generator_item;
   CStateItem *candidate_item, tempState, maxState;
   static CStateItem best_bigram[ 1<<CTag::SIZE ];
   unsigned long long best_bigram_mask = 0;   // and the count
   unsigned long tag, last_tag;
   static CStringVector sentence;
   static CRule rules(m_weights->m_bSegmentationRules);
   rules.segment(sentence_input, &sentence);
   const int length = sentence.size();
   if (length>=m_nMaxSentSize)
      THROW("the length of the sentence is bigger than the maximum sentence size "<<m_nMaxSentSize<<"; try changing the option");
   assert(vReturn!=NULL);
   TRACE("Initialising the tagging process...");
   m_WordCache.clear();
   m_Chart.clear();
   // put an empty sentence to the beginning
   tempState.clear();
   m_Chart[0]->insertItem( &tempState );
   TRACE("Tagging started");
   // enumerating the end index
   // =========================
   // index is the word index starting from 0
   for ( index = 0; index < length; ++index ) {
      // m_Chart index 1 corresponds to the first char
      m_Chart[ index + 1 ];   // this is to make some necessary initialisation for each agenda, when pruning
      // control for the ending character of the candidate
      if ( index < length-1 && rules.canSeparate(index+1)==false )
         continue;
      // enumerating the possible tags
      // =============================
      // the tag 0 is the NONE tag, and tag 1 is the BEGIN tag
      for ( tag = CTag::FIRST; tag <= CTag::LAST; ++tag ) {
         start_index = index-1;   // the end index of last word
         word_length = 1;         // current word length
         // enumerating the start index
         // ===========================
         // the start index of the word is actually start_index + 1
         while ( start_index >= -1 && word_length <= m_weights->m_maxLengthByTag[ tag ] ) {
            // control for the starting character of the candidate
            // ---------------------------------------------------
            while ( start_index >= 0 && rules.canSeparate(start_index+1)==false )
               start_index--;
            // start the search process
            // ------------------------
            // with pruning
            if ( ( prunes==NULL || prunes->isset( ( start_index+1 ) * m_nMaxSentSize + index ) ) &&   // not pruned
                 ( ( m_weights->m_mapWordFrequency.find( m_WordCache.find( start_index+1, index, &sentence ), 0 ) < m_weights->m_nMaxWordFrequency/5000+5 &&
                     PENN_TAG_CLOSED[ tag ] == false ) ||
                   m_weights->m_mapTagDictionary.lookup( m_WordCache.find( start_index+1, index, &sentence ), tag ) )   // wordtag match
               ) {
               if (nBest==1)
                  best_bigram_mask = 0LL;
               for ( generator_index = 0; generator_index < m_Chart[ start_index+1 ]->size(); ++generator_index ) {
                  generator_item = m_Chart[ start_index+1 ]->item( generator_index );
                  tempState.copy( generator_item );
                  tempState.append( index, tag );
                  tempState.score += getOrUpdateLocalScore( &sentence, &tempState, tempState.size()-1 );
                  if (nBest==1) {
                     last_tag = tempState.size()>1 ? tempState.getTag(tempState.size()-2).code() : CTag::SENTENCE_BEGIN;
                     if ( ((best_bigram_mask&(1LL<<last_tag))==0LL) || best_bigram[last_tag].score < tempState.score ) {
                        best_bigram_mask |= (1LL<<last_tag);
                        best_bigram[last_tag].copy(&tempState);
                     }
                  }
                  else {
                     m_Chart[ index+1 ]->insertItem( &tempState );
                  }
               }
               if (nBest==1) {
                  for ( last_tag=0; last_tag<CTag::COUNT; ++last_tag ) {
                     if ( (best_bigram_mask&(1LL<<last_tag)) )
                        m_Chart[ index+1 ]->insertItem( &(best_bigram[last_tag]) );
                  }
               }
            }//if
            // control the first character of the candidate
            if ( rules.canAppend(start_index+1)==false )
               break;
            // update start index and word len
            --start_index;
            ++word_length;
         }//start_index
      }//tag
   }//index
   TRACE("Outputting sentence");
   for ( temp_index = 0; temp_index < nBest; ++temp_index ) {
      vReturn[ temp_index ].clear();
      if (out_scores)
         out_scores[ temp_index ] = 0;
      if ( temp_index < m_Chart[length]->size() ) {
         generate( m_Chart[ length ]->bestItem( temp_index ), &sentence, this, &(vReturn[ temp_index ]) );
         if (out_scores)
            out_scores[ temp_index ] = m_Chart[ length ]->bestItem( temp_index )->score;
      }
   }
   TRACE("Done, the highest score is: " << m_Chart[ length ]->bestItem( 0 )->score);
   TRACE("The total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}
bool work(CSegmentor *segmentor, const CStringVector &sentence, CStringVector *vReturn, double *out_scores, CRule &rules, std::vector<unsigned> *correct_starts, unsigned nBest, int round) {
   static CStateItem lattice[(MAX_SENTENCE_SIZE+2)*BEAM_SIZE];
   static CStateItem *lattice_index[MAX_SENTENCE_SIZE+2];
   static const CStateItem *pGenerator, *pBestGen;
   static const CStateItem *correct, *temp;
   static int index, temp_index;                               // the index of the current char
   static unsigned long int doneWordRnd[MAX_SENTENCE_SIZE];    // mask whether candidate with the last word has been cached
   static unsigned long int doneWordLink[MAX_SENTENCE_SIZE];   // link to the corresponding cache state item from word_length + 1
   static CScoredAct doneWordItems[BEAM_SIZE];
   static int doneItemPointer;
   static unsigned correct_word;
   static bool correct_append;
   static unsigned long word_length;
   static bool bCompatible;
   const int length = sentence.size();
   static CAgendaSimple<CScoredAct> beam(BEAM_SIZE);
   static CScoredAct action;
   static const CStateItem *best[BEAM_SIZE];
   static unsigned nBestGen;
   //clock_t start_time = clock();
   TRACE("Initialising the decoding process...");
   segmentor->clearWordCache();
   lattice[0].clear();
   lattice_index[0] = lattice;
   lattice_index[1] = lattice+1;
   if (correct_starts) {
      correct = lattice;
      correct_word = 0;
      correct_append = false;
   }
   if (nBest == 1)   // optimization for one best
      memset(doneWordRnd, 0, MAX_SENTENCE_SIZE*sizeof(doneWordRnd[0]));
   TRACE("Decoding started");
   // index is character index and lattice index shifts 1 right
   for (index=0; index<length; ++index) {
      lattice_index[index+2] = lattice_index[index+1];
      // generate new state items for each character
      beam.clear();
      doneItemPointer = 0;
      for (pGenerator=lattice_index[index]; pGenerator!=lattice_index[index+1]; ++pGenerator) {   // for each generator
         // 1. generate new items according to each previous item.
         if ( rules.canSeparate( index ) ) {
            action.load(pGenerator, false, getOrUpdateSeparateScore(segmentor, &sentence, pGenerator));
            if ( nBest == 1 ) {
               word_length = pGenerator->getWordLength();
               if ( doneWordRnd[word_length] < index+1 ) {
                  doneWordLink[word_length] = doneItemPointer;   // doneWordLink[i] caches the last word with length i+1
                  doneWordItems[doneItemPointer] = action;       // copy item to cache.
                  ++doneItemPointer;
                  doneWordRnd[word_length] = index+1;
               }
               else {
                  assert(doneWordRnd[word_length] == index+1);
                  if ( action > doneWordItems[doneWordLink[word_length]] )
                     doneWordItems[doneWordLink[word_length]] = action;
               }
            }
            else {
               beam.insertItem(&action);
            }
         }
         // 2. generate by replacing items
         if ( index > 0 && rules.canAppend(index) ) {
            action.load(pGenerator, true, getOrUpdateAppendScore(segmentor, &sentence, pGenerator, index-1));
            beam.insertItem(&action);
         }
      }
      // 3. recollect the items for separate
      if (nBest == 1) {
         for (temp_index = 0; temp_index<doneItemPointer; ++temp_index) {
            beam.insertItem(&doneWordItems[temp_index]);
         }
      }
      // build new items in decode
      if (correct_starts) {
         bCompatible = false;
         if (index==correct_starts->at(correct_word)) {
            correct_append = false;
            ++correct_word;
         }
         else {
            assert(correct_word==correct_starts->size()||index<correct_starts->at(correct_word));
            correct_append = true;
         }
         pBestGen = 0;
      }
      for (temp_index=0; temp_index<beam.size(); ++temp_index) {
         pGenerator = beam.item(temp_index)->item;
         if (beam.item(temp_index)->append)
            pGenerator->append(lattice_index[index+2]);
         else
            pGenerator->separate(lattice_index[index+2]);
         lattice_index[index+2]->score = beam.item(temp_index)->score;
         if (correct_starts) {
            if (pBestGen==0 || lattice_index[index+2]->score > pBestGen->score)
               pBestGen = lattice_index[index+2];
            if (correct == pGenerator && correct_append == beam.item(temp_index)->append) {
               bCompatible = true;
               correct = lattice_index[index+2];
            }
         }
         ++lattice_index[index+2];
      }
      // update scores if none from the agenda is correct state.
      if (correct_starts && !bCompatible) {
         TRACE("Decoding error, updating the weight vector");
         if (correct_append)
            correct->append(lattice_index[index+2]);
         else
            correct->separate(lattice_index[index+2]);
         updateScoreVectorForStates(segmentor, &sentence, pBestGen, lattice_index[index+2], round);
         return false;
      }
   }
   // a final step adding the last separate score for items.
   beam.clear();
   for (pGenerator=lattice_index[length]; pGenerator!=lattice_index[length+1]; ++pGenerator) {
      action.load(pGenerator, false, getOrUpdateSeparateScore(segmentor, &sentence, pGenerator));
      beam.insertItem(&action);
   }
   beam.sortItems();   // sort final items
   nBestGen = beam.size();
   for (temp_index=0; temp_index<nBestGen; ++temp_index) {
      best[temp_index] = beam.item(temp_index)->item;
   }
   if (correct_starts) {
      assert(bCompatible);
      if (correct!=best[0]) {
         TRACE("Decoding error, updating the weight vector");
         updateScoreVectorForStates(segmentor, &sentence, best[0], correct, round);
         return false;
      }
   }
   TRACE("Decoding finished");
   // now generate output sentence
   // n-best list will be stored in array
   if (!correct_starts) {
      TRACE("Outputting sentence");
      for ( index=0; index<std::min(nBest, nBestGen); ++index ) {
         // clear
         vReturn[index].clear();
         if ( out_scores )
            out_scores[index] = 0;
         // assign retval
         static unsigned count;
         static unsigned start;
         count = 0;
         temp = best[index];
         while (!temp->empty()) {
            ++count;
            temp = temp->prev();
         }
         vReturn[index].resize(count);
         --count;
         temp = best[index];
         while (!temp->empty()) {
            for (temp_index=temp->getWordStart(); temp_index<=temp->getWordEnd(); ++temp_index) {
               vReturn[index].at(count) += sentence.at(temp_index);
            }
            --count;
            temp = temp->prev();
         }
         if ( out_scores!=NULL )
            out_scores[index] = best[index]->score;
      }
   }
   return true;
}
void CTagger::tag( const CStringVector * sentence_input , CTwoStringVector * vReturn , SCORE_TYPE * out_scores , unsigned long nBest , const CBitArray * prunes ) {
   clock_t total_start_time = clock();
   int temp_index;
   const CSubStateItem *pGenerator;
   CSubStateItem tempState;
   int j, k;
   unsigned tag;
   unsigned index, last_tag;
   static CSubStateItem uniqueItems[AGENDA_SIZE];
   unsigned long uniqueIndex;
   static bool bUnique;
   // unsigned long long uniqueMarkup;
   // assert(CTag::COUNT<=sizeof(unsigned long long)*8);
   static CStringVector sentence;
   static CRule rules(m_weights->m_bSegmentationRules);
   rules.segment(sentence_input, &sentence);
   const unsigned length = sentence.size();
   static CSubStateItem goldState;
   goldState.clear();
   TRACE("Initialising the tagging process...");
   m_WordCache.clear();
   tempState.clear();
   m_Agenda.clear();
   m_Agenda.pushCandidate(&tempState);
   m_Agenda.nextRound();
   TRACE("Tagging started");
   //TRACE("initialisation time: " << clock() - start_time);
   for (index=0; index<length; index++) {
      // decide correction
      if ( m_bTrain ) {
         static bool bAnyCorrect;
         bAnyCorrect = false;
         pGenerator = m_Agenda.generatorStart();
         for (j=0; j<m_Agenda.generatorSize(); ++j) {
            if ( *pGenerator == goldState )
               bAnyCorrect = true;
            pGenerator = m_Agenda.generatorNext();   // next generator
         }
         if ( !bAnyCorrect ) {
            TRACE("Training error at character " << index);
            pGenerator = m_Agenda.bestGenerator();
            updateScoreForState(&sentence, pGenerator, -1);
            updateScoreForState(&sentence, &goldState, 1);
            m_bTrainingError = true;
            return;
         }
      }
      // 2. generate by replacing items
      if ( index > 0 ) {
         pGenerator = m_Agenda.generatorStart();
         for (j=0; j<m_Agenda.generatorSize(); ++j) {
            assert(pGenerator->size()>0);
            if ( ( rules.canAppend(index) ) &&
                 // ( index > 0 ) &&
                 pGenerator->getWordLength(pGenerator->size()-1) < m_weights->m_maxLengthByTag[pGenerator->getTag(pGenerator->size()-1).code()] ) {
               tempState.copy(pGenerator);
               tempState.replaceIndex(index);
               tempState.score += getOrUpdateAppendScore(&sentence, &tempState, tempState.size()-1, index);
               if (index+1==length)
                  tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size());
               m_Agenda.pushCandidate(&tempState);
            }   // if
            pGenerator = m_Agenda.generatorNext();   // next generator
         }
      }
      //_
      // 1. generate new items according to each previous item.
      // iterate postags
      for (tag=CTag::FIRST; tag<CTag::COUNT; ++tag) {
         pGenerator = m_Agenda.generatorStart();
         // uniqueMarkup=0;
         uniqueIndex = 0;
         for (j=0; j<m_Agenda.generatorSize(); ++j) {
            last_tag = pGenerator->size()==0 ? CTag::SENTENCE_BEGIN : pGenerator->getTag(pGenerator->size()-1).code();
            if ( rules.canSeparate( index ) &&
                 (index == 0 || canAssignTag( m_WordCache.find( pGenerator->getWordStart(pGenerator->size()-1), index-1, &sentence ), last_tag )) &&   // last word
                 canStartWord(sentence, tag, index)   // word
               ) {
               tempState.copy(pGenerator);
               tempState.append(index, tag);
               tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size()-1);
               if (index+1==length)
                  tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size());
               if (nBest==1) {
                  // if ( ((uniqueMarkup&(1LL<<last_tag))==0LL) || uniqueItems[last_tag].score < tempState.score ) {
                  //    uniqueMarkup |= (1LL<<last_tag);
                  //    uniqueItems[last_tag].copy(&tempState);
                  // }
                  bUnique = true;
                  for (temp_index=0; temp_index<uniqueIndex; ++temp_index) {
                     // only one new when index=zero.
                     assert(index>0 && uniqueItems[temp_index].size()>1);
                     if (uniqueItems[temp_index].getTag(uniqueItems[temp_index].size()-2) == tempState.getTag(tempState.size()-2) &&
                         uniqueItems[temp_index].getWordStart(uniqueItems[temp_index].size()-2) == tempState.getWordStart(tempState.size()-2) ) {
                        bUnique = false;
                        if (uniqueItems[temp_index].score < tempState.score)
                           uniqueItems[temp_index].copy(&tempState);
                     }//if
                  }//for
                  if (bUnique) {
                     uniqueItems[uniqueIndex++].copy(&tempState);
                  }//if
               }
               else {
                  m_Agenda.pushCandidate(&tempState);
               }
            }
            pGenerator = m_Agenda.generatorNext();   // next generator
         }
         // push candidates
         if (nBest == 1) {
            // for (last_tag=0; last_tag<CTag::COUNT; ++last_tag) {
            //    if ( (uniqueMarkup&(1LL<<last_tag)) )
            //       m_Agenda.pushCandidate(&(uniqueItems[last_tag]));
            // }
            for (temp_index=0; temp_index<uniqueIndex; ++temp_index) {
               m_Agenda.pushCandidate(&(uniqueItems[temp_index]));
            }//for
         }
      }//tag
      m_Agenda.nextRound();   // move round
      if (m_bTrain)
         goldState.follow(m_goldState);
   }
   if ( m_bTrain ) {
      pGenerator = m_Agenda.bestGenerator();
      if ( *pGenerator != goldState ) {
         TRACE("Training error at the last word");
         updateScoreForState(&sentence, pGenerator, -1);
         updateScoreForState(&sentence, &goldState, 1);
         m_bTrainingError = true;
         return;
      }
      m_bTrainingError = false;
      return;
   }
   TRACE("Outputting sentence");
   vReturn->clear();
   if (nBest == 1) {
      generate( m_Agenda.bestGenerator(), &sentence, this, vReturn );
      if (out_scores)
         out_scores[0] = m_Agenda.bestGenerator()->score;
   }
   else {
      m_Agenda.sortGenerators();
      for ( temp_index = 0; temp_index < nBest; ++temp_index ) {
         vReturn[ temp_index ].clear();
         if (out_scores)
            out_scores[ temp_index ] = 0;
         if ( temp_index < m_Agenda.generatorSize() ) {
            generate( m_Agenda.generator( temp_index ), &sentence, this, &(vReturn[ temp_index ]) );
            if (out_scores)
               out_scores[ temp_index ] = m_Agenda.generator( temp_index )->score;
         }
      }
   }
   TRACE("Done, the highest score is: " << m_Agenda.bestGenerator()->score);
   TRACE("The total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) {
   clock_t total_start_time = clock();
   const CStateItem *pGenerator, *pCandidate;
   CStateItem tempState;
   unsigned index;          // the index of the current char
   unsigned j, k;           // temporary index
   int subtract_score;      // the score to be subtracted (previous item)
   static CStateItem best_bigram;
   int start_index;
   int word_length;
   int generator_index;
   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule);
   rules.segment(sentence_input, &sentence);
   const unsigned length = sentence.size();
   assert(length<MAX_SENTENCE_SIZE);
   assert(vReturn!=NULL);
   //clock_t start_time = clock();
   TRACE("Initialising the segmentation process...");
   vReturn->clear();
   clearWordCache();
   m_Chart.clear();
   tempState.clear();
   m_Chart[0]->insertItem(&tempState);
   TRACE("Segmenting started");
   for (index=0; index<length; index++) {
      // m_Chart index 1 corresponds to the first char
      m_Chart[index+1];
      // control for the ending character of the candidate
      if ( index < length-1 && rules.canSeparate(index+1)==false )
         continue;
      start_index = index-1;   // the end index of last word
      word_length = 1;         // current word length
      // enumerating the start index
      // ===========================
      // the start index of the word is actually start_index + 1
      while ( start_index >= -1 && word_length <= MAX_WORD_SIZE ) {
         // control for the starting character of the candidate
         // ---------------------------------------------------
         while ( start_index >= 0 && rules.canSeparate(start_index+1)==false )
            start_index--;
         // start the search process
         // ------------------------
         for ( generator_index = 0; generator_index < m_Chart[ start_index+1 ]->size(); ++generator_index ) {
            pGenerator = m_Chart[ start_index+1 ]->item( generator_index );
            tempState.copy( pGenerator );
            tempState.append( index );
            tempState.m_nScore += m_Feature->getLocalScore( &sentence, &tempState, tempState.m_nLength-1 );
            if (nBest==1) {
               if ( generator_index == 0 || tempState.m_nScore > best_bigram.m_nScore ) {
                  best_bigram.copy(&tempState);   //@@@
               }
            }
            else {
               m_Chart[ index+1 ]->insertItem( &tempState );
            }
         }
         if (nBest==1) {
            m_Chart[ index+1 ]->insertItem( &best_bigram );   //@@@
         }   //@@@
         // control the first character of the candidate
         if ( rules.canAppend(start_index+1)==false )
            break;
         // update start index and word len
         --start_index;
         ++word_length;
      }//start_index
   }
   // now generate output sentence
   // n-best list will be stored in array
   // from the addr vReturn
   TRACE("Outputting sentence");
   for (k=0; k<nBest; ++k) {
      // clear
      vReturn[k].clear();
      if (out_scores!=NULL)
         out_scores[k] = 0;
      // assign retval
      if (k<m_Chart[length]->size()) {
         pGenerator = m_Chart[length]->bestItem(k);
         for (j=0; j<pGenerator->m_nLength; j++) {
            std::string temp = "";
            for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) {
               assert(sentence.at(l)!=" ");   // [SPACE]
               temp += sentence.at(l);
            }
            vReturn[k].push_back(temp);
         }
         if (out_scores!=NULL)
            out_scores[k] = pGenerator->m_nScore;
      }
   }
   TRACE("Done, the best score: " << pGenerator->m_nScore);
   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}