void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) { clock_t total_start_time = clock();; CStateItem *pGenerator, *pCandidate; unsigned index; // the index of the current char unsigned j, k; // temporary index int subtract_score; // the score to be subtracted (previous item) static unsigned doneLastWord[MAX_SENTENCE_SIZE]; static CStringVector sentence; static CRule rules(m_Feature->m_bRule); rules.segment(sentence_input, &sentence); const unsigned length = sentence.size(); if (length > MAX_SENTENCE_SIZE) { std::cerr << "The size of the sentence is " << length << " characters, which is larger than the limit of the system (" << MAX_SENTENCE_SIZE <<std::endl; vReturn->clear(); return; } assert(vReturn!=NULL); //clock_t start_time = clock(); TRACE("Initialising the segmentation process..."); vReturn->clear(); clearWordCache(); m_Agenda->clear(); pCandidate = m_Agenda->candidateItem(); // make the first item pCandidate->clear(); // restore state using clean m_Agenda->pushCandidate(); // and push it back m_Agenda->nextRound(); // as the generator item if (nBest == 1) // optimization for one best for (j=0; j<MAX_SENTENCE_SIZE; ++j) doneLastWord[j] = 0; TRACE("Segmenting started"); //TRACE("initialisation time: " << clock() - start_time); for (index=0; index<length; index++) { // generate new state itmes for each character pGenerator = m_Agenda->generatorStart(); for (j=0; j<m_Agenda->generatorSize(); ++j) { // 1. generate new items according to each previous item. if (pGenerator->m_nLength>0) k = pGenerator->getWordStart(pGenerator->m_nLength-1); // If we only ask 1-best, then we take only the best among those with the last word if ( ( nBest > 1 || pGenerator->m_nLength==0 || doneLastWord[k]<index+1 ) && rules.canSeparate( index ) ) { pCandidate = m_Agenda->candidateItem(); pCandidate->copy(pGenerator); pCandidate->append(index); pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1); m_Agenda->pushCandidate(); if (nBest == 1 && pGenerator->m_nLength>0) doneLastWord[k] = index+1; } // 2. generate by replacing items if ( index > 0 && rules.canAppend(index) ) { pCandidate = m_Agenda->candidateItem(); pCandidate->copy(pGenerator); subtract_score = m_Feature->getLocalScore(&sentence, pGenerator, pGenerator->m_nLength-1); pCandidate->m_nScore -= subtract_score; pCandidate->replace(index); pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1); m_Agenda->pushCandidate(); } pGenerator = m_Agenda->generatorNext(); // next generator } m_Agenda->nextRound(); // move round } // now generate outout sentence // n-best list will be stored in array // from the addr vReturn TRACE("Outputing sentence"); for (k=0; k<nBest; ++k) { // clear vReturn[k].clear(); if (out_scores!=NULL) out_scores[k] = 0; // assign retval if (k<m_Agenda->generatorSize()) { pGenerator = m_Agenda->generator(k); for (j=0; j<pGenerator->m_nLength; j++) { std::string temp = ""; for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) { assert(sentence.at(l)!=" "); // [SPACE] temp += sentence.at(l); } vReturn[k].push_back(temp); } if (out_scores!=NULL) out_scores[k] = pGenerator->m_nScore; } } TRACE("Done, the best score: " << pGenerator->m_nScore); TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC); }
void CTagger::tag( const CStringVector * sentence_input , CTwoStringVector * vReturn , SCORE_TYPE * out_scores , unsigned long nBest , const CBitArray * prunes ) { clock_t total_start_time = clock();; int index , start_index , generator_index , temp_index, word_length; const CStateItem * generator_item ; CStateItem *candidate_item , tempState , maxState ; static CStateItem best_bigram[ 1<<CTag::SIZE ] ; unsigned long long best_bigram_mask = 0; // and the count unsigned long tag, last_tag ; static CStringVector sentence; static CRule rules(m_weights->m_bSegmentationRules); rules.segment(sentence_input, &sentence); const int length = sentence.size() ; if (length>=m_nMaxSentSize) THROW("the length of the sentence is bigger than the maximum sentence size "<<m_nMaxSentSize<<"; try changing the option"); assert(vReturn!=NULL); TRACE("Initialising the tagging process..."); m_WordCache.clear() ; m_Chart.clear() ; // put an empty sentence to the beginning tempState.clear() ; m_Chart[ 0 ]->insertItem( &tempState ) ; TRACE("Tagging started"); // enumerating the end index // ========================= // index is the word index starting from 0 for ( index = 0 ; index < length ; ++ index ) { // m_Chart index 1 correspond to the first char m_Chart[ index + 1 ] ; // this is to make some necessary initialisation for each agenda, when pruning // control for the ending character of the candidate if ( index < length-1 && rules.canSeparate(index+1)==false ) continue ; // enumerating the possible tags // ============================= // the tag 0 is the NONE tag, and tag 1 is the BEGIN tag for ( tag = CTag::FIRST ; tag <= CTag::LAST ; ++ tag ) { start_index = index-1 ; // the end index of last word word_length = 1 ; // current word length // enumerating the start index // =========================== // the start index of the word is actually start_index + 1 while( start_index >= -1 && word_length <= m_weights->m_maxLengthByTag[ tag ] ) { // control for the starting character of the candidate // --------------------------------------------------- while ( start_index >= 0 && rules.canSeparate(start_index+1)==false ) start_index-- ; // start the search process // ------------------------ // with pruning if ( ( prunes==NULL || prunes->isset( ( start_index+1 ) * m_nMaxSentSize + index ) ) && // not pruned ( ( m_weights->m_mapWordFrequency.find( m_WordCache.find( start_index+1 , index , &sentence ) , 0 ) < m_weights->m_nMaxWordFrequency/5000+5 && PENN_TAG_CLOSED[ tag ] == false ) || m_weights->m_mapTagDictionary.lookup( m_WordCache.find( start_index+1 , index , &sentence ), tag ) ) // wordtag match ) { if (nBest==1) best_bigram_mask=0LL; for ( generator_index = 0 ; generator_index < m_Chart[ start_index+1 ]->size() ; ++ generator_index ) { generator_item = m_Chart[ start_index+1 ]->item( generator_index ) ; tempState.copy( generator_item ) ; tempState.append( index , tag ) ; tempState.score += getOrUpdateLocalScore( &sentence , &tempState , tempState.size()-1 ) ; if (nBest==1) { last_tag = tempState.size()>1 ? tempState.getTag(tempState.size()-2).code() : CTag::SENTENCE_BEGIN; if ( ((best_bigram_mask&(1LL<<last_tag))==0LL) || best_bigram[last_tag].score < tempState.score ) { best_bigram_mask|=(1LL<<last_tag); best_bigram[last_tag].copy(&tempState); } } else { m_Chart[ index+1 ]->insertItem( &tempState ); } } if (nBest==1) { for ( last_tag=0; last_tag<CTag::COUNT; ++last_tag ) { if ( (best_bigram_mask&(1LL<<last_tag)) ) m_Chart[ index+1 ]->insertItem( &(best_bigram[last_tag]) ); } } }//if // control the first character of the candidate if ( rules.canAppend(start_index+1)==false ) break ; // update start index and word len --start_index ; ++word_length ; }//start_index }//tag }//index TRACE("Outputing sentence"); for ( temp_index = 0 ; temp_index < nBest ; ++ temp_index ) { vReturn[ temp_index ].clear() ; if (out_scores) out_scores[ temp_index ] = 0 ; if ( temp_index < m_Chart[length]->size() ) { generate( m_Chart[ length ]->bestItem( temp_index ) , &sentence , this , &(vReturn[ temp_index ]) ) ; if (out_scores) out_scores[ temp_index ] = m_Chart[ length ]->bestItem( temp_index )->score ; } } TRACE("Done, the highest score is: " << m_Chart[ length ]->bestItem( 0 )->score) ; TRACE("The total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC) ; }
void CTagger::tag( const CStringVector * sentence_input , CTwoStringVector * vReturn , SCORE_TYPE * out_scores , unsigned long nBest , const CBitArray * prunes ) { clock_t total_start_time = clock();; int temp_index; const CSubStateItem *pGenerator; CSubStateItem tempState; int j, k; unsigned tag; unsigned index, last_tag; static CSubStateItem uniqueItems[AGENDA_SIZE]; unsigned long uniqueIndex; static bool bUnique; // unsigned long long uniqueMarkup; // assert(CTag::COUNT<=sizeof(unsigned long long)*8); static CStringVector sentence; static CRule rules(m_weights->m_bSegmentationRules); rules.segment(sentence_input, &sentence); const unsigned length=sentence.size(); static CSubStateItem goldState; goldState.clear(); TRACE("Initialising the tagging process..."); m_WordCache.clear(); tempState.clear(); m_Agenda.clear(); m_Agenda.pushCandidate(&tempState); m_Agenda.nextRound(); TRACE("Tagging started"); //TRACE("initialisation time: " << clock() - start_time); for (index=0; index<length; index++) { // decide correction if ( m_bTrain ) { static bool bAnyCorrect; bAnyCorrect = false; pGenerator = m_Agenda.generatorStart(); for (j=0; j<m_Agenda.generatorSize(); ++j) { if ( *pGenerator == goldState ) bAnyCorrect = true; pGenerator = m_Agenda.generatorNext(); // next generator } if ( !bAnyCorrect ) { TRACE("Training error at character " << index); pGenerator = m_Agenda.bestGenerator(); updateScoreForState(&sentence, pGenerator, -1); updateScoreForState(&sentence, &goldState, 1); m_bTrainingError = true; return; } } // 2. generate by replacing items if ( index > 0 ) { pGenerator = m_Agenda.generatorStart(); for (j=0; j<m_Agenda.generatorSize(); ++j) { assert(pGenerator->size()>0); if ( ( rules.canAppend(index) ) && // ( index > 0 ) && pGenerator->getWordLength(pGenerator->size()-1) < m_weights->m_maxLengthByTag[pGenerator->getTag(pGenerator->size()-1).code()] ) { tempState.copy(pGenerator); tempState.replaceIndex(index); tempState.score += getOrUpdateAppendScore(&sentence, &tempState, tempState.size()-1, index); if (index+1==length) tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size()); m_Agenda.pushCandidate(&tempState); } // if pGenerator = m_Agenda.generatorNext(); // next generator } } //_ // 1. generate new items according to each previous item. // iterate postags for (tag=CTag::FIRST; tag<CTag::COUNT; ++tag) { pGenerator = m_Agenda.generatorStart(); // uniqueMarkup=0; uniqueIndex=0; for (j=0; j<m_Agenda.generatorSize(); ++j) { last_tag = pGenerator->size()==0 ? CTag::SENTENCE_BEGIN : pGenerator->getTag(pGenerator->size()-1).code(); if ( rules.canSeparate( index ) && (index == 0 || canAssignTag( m_WordCache.find( pGenerator->getWordStart(pGenerator->size()-1), index-1, &sentence ), last_tag )) && // last word canStartWord(sentence, tag, index) // word ) { tempState.copy(pGenerator); tempState.append(index, tag); tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size()-1); if (index+1==length) tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size()); if (nBest==1) { // if ( ((uniqueMarkup&(1LL<<last_tag))==0LL) || uniqueItems[last_tag].score < tempState.score ) { // uniqueMarkup |= (1LL<<last_tag); // uniqueItems[last_tag].copy(&tempState); // } bUnique = true; for (temp_index=0; temp_index<uniqueIndex; ++temp_index) { // only one new when index=zero. assert(index>0&&uniqueItems[temp_index].size()>1); if (uniqueItems[temp_index].getTag(uniqueItems[temp_index].size()-2) == tempState.getTag(tempState.size()-2) && uniqueItems[temp_index].getWordStart(uniqueItems[temp_index].size()-2) == tempState.getWordStart(tempState.size()-2) ) { bUnique = false; if (uniqueItems[temp_index].score < tempState.score ) uniqueItems[temp_index].copy(&tempState); }//if }//for if (bUnique) { uniqueItems[uniqueIndex++].copy(&tempState); }//if } else { m_Agenda.pushCandidate(&tempState); } } pGenerator = m_Agenda.generatorNext(); // next generator } // push candidates if (nBest == 1) { // for (last_tag=0; last_tag<CTag::COUNT; ++last_tag) { // if ( (uniqueMarkup&(1LL<<last_tag)) ) // m_Agenda.pushCandidate(&(uniqueItems[last_tag])); // } for (temp_index=0; temp_index<uniqueIndex; ++temp_index) { m_Agenda.pushCandidate(&(uniqueItems[temp_index])); }//for } }//tag m_Agenda.nextRound(); // move round if (m_bTrain) goldState.follow(m_goldState); } if ( m_bTrain && 1 ) { pGenerator = m_Agenda.bestGenerator(); if ( *pGenerator != goldState ) { TRACE("Training error at the last word"); updateScoreForState(&sentence, pGenerator, -1); updateScoreForState(&sentence, &goldState, 1); m_bTrainingError = true; } m_bTrainingError = false; return; } TRACE("Outputing sentence"); vReturn->clear(); if (nBest == 1) { generate( m_Agenda.bestGenerator() , &sentence , this , vReturn ) ; if (out_scores) out_scores[ 0 ] = m_Agenda.bestGenerator( )->score ; } else { m_Agenda.sortGenerators(); for ( temp_index = 0 ; temp_index < nBest ; ++ temp_index ) { vReturn[ temp_index ].clear() ; if (out_scores) out_scores[ temp_index ] = 0 ; if ( temp_index < m_Agenda.generatorSize() ) { generate( m_Agenda.generator( temp_index ) , &sentence , this , &(vReturn[ temp_index ]) ) ; if (out_scores) out_scores[ temp_index ] = m_Agenda.bestGenerator( )->score ; } } } TRACE("Done, the highest score is: " << m_Agenda.bestGenerator()->score) ; TRACE("The total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC) ; }
bool work(CSegmentor *segmentor, const CStringVector &sentence, CStringVector *vReturn, double *out_scores, CRule &rules, std::vector<unsigned> *correct_starts, unsigned nBest, int round) { static CStateItem lattice[(MAX_SENTENCE_SIZE+2)*BEAM_SIZE]; static CStateItem *lattice_index[MAX_SENTENCE_SIZE+2]; static const CStateItem *pGenerator, *pBestGen; static const CStateItem *correct, *temp; static int index, temp_index; // the index of the current char static unsigned long int doneWordRnd[MAX_SENTENCE_SIZE]; // mask whether candidate with the last word has been cached static unsigned long int doneWordLink[MAX_SENTENCE_SIZE]; // link to the corresponding cache state item from word_length + 1 static CScoredAct doneWordItems[BEAM_SIZE]; static int doneItemPointer; static unsigned correct_word; static bool correct_append; static unsigned long word_length; static bool bCompatible; const int length = sentence.size(); static CAgendaSimple<CScoredAct> beam(BEAM_SIZE); static CScoredAct action; static const CStateItem *best[BEAM_SIZE]; static unsigned nBestGen; //clock_t start_time = clock(); TRACE("Initialising the decoding process..."); segmentor->clearWordCache(); lattice[0].clear(); lattice_index[0] = lattice; lattice_index[1] = lattice+1; if (correct_starts) { correct = lattice; correct_word=0; correct_append=false; } if (nBest == 1) // optimization for one best memset(doneWordRnd, 0, MAX_SENTENCE_SIZE*sizeof(doneWordRnd[0])); TRACE("Decoding started"); // index is character index and lattice index shifts 1 right for (index=0; index<length; ++index) { lattice_index[index+2] = lattice_index[index+1]; // generate new state itmes for each character beam.clear(); doneItemPointer = 0; for (pGenerator=lattice_index[index]; pGenerator!=lattice_index[index+1]; ++pGenerator) { // for each generator // 1. generate new items according to each previous item. if ( rules.canSeparate( index ) ) { action.load(pGenerator, false, getOrUpdateSeparateScore(segmentor, &sentence, pGenerator)); if ( nBest == 1 ) { word_length = pGenerator->getWordLength(); if ( doneWordRnd[word_length] < index+1 ) { doneWordLink[word_length] = doneItemPointer; // doneWordLink[i] caches the last word with length i+1 doneWordItems[doneItemPointer]=action; // copy item to cache. ++doneItemPointer; doneWordRnd[word_length] = index+1; } else { assert(doneWordRnd[word_length] == index+1); if ( action > doneWordItems[doneWordLink[word_length]] ) doneWordItems[doneWordLink[word_length]]=action; } } else { beam.insertItem(&action); } } // 2. generate by replacing items if ( index > 0 && rules.canAppend(index) ) { action.load(pGenerator, true, getOrUpdateAppendScore(segmentor, &sentence, pGenerator, index-1)); beam.insertItem(&action); } } // 3. recollect the items for separate if (nBest == 1) { for (temp_index = 0; temp_index<doneItemPointer; ++temp_index) { beam.insertItem(&doneWordItems[temp_index]); } } // build new items in decode if (correct_starts) { bCompatible = false; if (index==correct_starts->at(correct_word)) { correct_append = false; ++correct_word; } else { assert(correct_word==correct_starts->size()||index<correct_starts->at(correct_word)); correct_append = true; } pBestGen = 0; } for (temp_index=0; temp_index<beam.size(); ++temp_index) { pGenerator = beam.item(temp_index)->item; if (beam.item(temp_index)->append) pGenerator->append(lattice_index[index+2]); else pGenerator->separate(lattice_index[index+2]); lattice_index[index+2]->score = beam.item(temp_index)->score; if (correct_starts) { if (pBestGen==0 || lattice_index[index+2]->score > pBestGen->score) pBestGen = lattice_index[index+2]; if (correct == pGenerator && correct_append == beam.item(temp_index)->append) { bCompatible = true; correct = lattice_index[index+2]; } } ++lattice_index[index+2]; } // update scores if none from the agenda is correct state. if (correct_starts && !bCompatible) { TRACE("Decoding error, updating the weight std::vector"); if (correct_append) correct->append(lattice_index[index+2]); else correct->separate(lattice_index[index+2]); updateScoreVectorForStates(segmentor, &sentence, pBestGen, lattice_index[index+2], round); return false; } } // a final step adding the last separate score for items. beam.clear(); for (pGenerator=lattice_index[length]; pGenerator!=lattice_index[length+1]; ++pGenerator) { action.load(pGenerator, false, getOrUpdateSeparateScore(segmentor, &sentence, pGenerator)); beam.insertItem(&action); } beam.sortItems(); // sort final items nBestGen = beam.size(); for (temp_index=0; temp_index<nBestGen; ++temp_index) { best[temp_index] = beam.item(temp_index)->item; } if (correct_starts) { assert(bCompatible); if (correct!=best[0]) { TRACE("Decoding error, updating the weight std::vector"); updateScoreVectorForStates(segmentor, &sentence, best[0], correct, round); return false; } } TRACE("Decoding finished"); // now generate outout sentence // n-best list will be stored in array if (!correct_starts){ TRACE("Outputing sentence"); for ( index=0; index<std::min(nBest, nBestGen); ++index ) { // clear vReturn[index].clear(); if ( out_scores ) out_scores[index] = 0; // assign retval static unsigned count; static unsigned start; count = 0; temp = best[index]; while (!temp->empty()) { ++count; temp = temp->prev(); } vReturn[index].resize(count); --count; temp = best[index]; while (!temp->empty()) { for (temp_index=temp->getWordStart(); temp_index<=temp->getWordEnd(); ++temp_index) { vReturn[index].at(count) += sentence.at(temp_index); } --count; temp = temp->prev(); } if ( out_scores!=NULL ) out_scores[index] = best[index]->score; } } return true; }
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) { clock_t total_start_time = clock();; const CStateItem *pGenerator, *pCandidate; CStateItem tempState; unsigned index; // the index of the current char unsigned j, k; // temporary index int subtract_score; // the score to be subtracted (previous item) static CStateItem best_bigram; int start_index; int word_length; int generator_index; static CStringVector sentence; static CRule rules(m_Feature->m_bRule); rules.segment(sentence_input, &sentence); const unsigned length = sentence.size(); assert(length<MAX_SENTENCE_SIZE); assert(vReturn!=NULL); //clock_t start_time = clock(); TRACE("Initialising the segmentation process..."); vReturn->clear(); clearWordCache(); m_Chart.clear(); tempState.clear(); m_Chart[0]->insertItem(&tempState); TRACE("Segmenting started"); for (index=0; index<length; index++) { // m_Chart index 1 correspond to the first char m_Chart[index+1]; // control for the ending character of the candidate if ( index < length-1 && rules.canSeparate(index+1)==false ) continue ; start_index = index-1 ; // the end index of last word word_length = 1 ; // current word length // enumerating the start index // =========================== // the start index of the word is actually start_index + 1 while( start_index >= -1 && word_length <= MAX_WORD_SIZE ) { // control for the starting character of the candidate // --------------------------------------------------- while ( start_index >= 0 && rules.canSeparate(start_index+1)==false ) start_index-- ; // start the search process // ------------------------ for ( generator_index = 0 ; generator_index < m_Chart[ start_index+1 ]->size() ; ++ generator_index ) { pGenerator = m_Chart[ start_index+1 ]->item( generator_index ) ; tempState.copy( pGenerator ) ; tempState.append( index ) ; tempState.m_nScore += m_Feature->getLocalScore( &sentence, &tempState, tempState.m_nLength-1 ) ; if (nBest==1) { if ( generator_index == 0 || tempState.m_nScore > best_bigram.m_nScore ) { best_bigram.copy(&tempState); //@@@ } } else { m_Chart[ index+1 ]->insertItem( &tempState ); } } if (nBest==1) { m_Chart[ index+1 ]->insertItem( &best_bigram ); //@@@ } //@@@ // control the first character of the candidate if ( rules.canAppend(start_index+1)==false ) break ; // update start index and word len --start_index ; ++word_length ; }//start_index } // now generate outout sentence // n-best list will be stored in array // from the addr vReturn TRACE("Outputing sentence"); for (k=0; k<nBest; ++k) { // clear vReturn[k].clear(); if (out_scores!=NULL) out_scores[k] = 0; // assign retval if (k<m_Chart[length]->size()) { pGenerator = m_Chart[length]->bestItem(k); for (j=0; j<pGenerator->m_nLength; j++) { std::string temp = ""; for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) { assert(sentence.at(l)!=" "); // [SPACE] temp += sentence.at(l); } vReturn[k].push_back(temp); } if (out_scores!=NULL) out_scores[k] = pGenerator->m_nScore; } } TRACE("Done, the best score: " << pGenerator->m_nScore); TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC); }