bool work(CSegmentor *segmentor, const CStringVector &sentence, CStringVector *vReturn, double *out_scores, CRule &rules, std::vector<unsigned> *correct_starts, unsigned nBest, int round) { static CStateItem lattice[(MAX_SENTENCE_SIZE+2)*BEAM_SIZE]; static CStateItem *lattice_index[MAX_SENTENCE_SIZE+2]; static const CStateItem *pGenerator, *pBestGen; static const CStateItem *correct, *temp; static int index, temp_index; // the index of the current char static unsigned long int doneWordRnd[MAX_SENTENCE_SIZE]; // mask whether candidate with the last word has been cached static unsigned long int doneWordLink[MAX_SENTENCE_SIZE]; // link to the corresponding cache state item from word_length + 1 static CScoredAct doneWordItems[BEAM_SIZE]; static int doneItemPointer; static unsigned correct_word; static bool correct_append; static unsigned long word_length; static bool bCompatible; const int length = sentence.size(); static CAgendaSimple<CScoredAct> beam(BEAM_SIZE); static CScoredAct action; static const CStateItem *best[BEAM_SIZE]; static unsigned nBestGen; //clock_t start_time = clock(); TRACE("Initialising the decoding process..."); segmentor->clearWordCache(); lattice[0].clear(); lattice_index[0] = lattice; lattice_index[1] = lattice+1; if (correct_starts) { correct = lattice; correct_word=0; correct_append=false; } if (nBest == 1) // optimization for one best memset(doneWordRnd, 0, MAX_SENTENCE_SIZE*sizeof(doneWordRnd[0])); TRACE("Decoding started"); // index is character index and lattice index shifts 1 right for (index=0; index<length; ++index) { lattice_index[index+2] = lattice_index[index+1]; // generate new state itmes for each character beam.clear(); doneItemPointer = 0; for (pGenerator=lattice_index[index]; pGenerator!=lattice_index[index+1]; ++pGenerator) { // for each generator // 1. generate new items according to each previous item. if ( rules.canSeparate( index ) ) { action.load(pGenerator, false, getOrUpdateSeparateScore(segmentor, &sentence, pGenerator)); if ( nBest == 1 ) { word_length = pGenerator->getWordLength(); if ( doneWordRnd[word_length] < index+1 ) { doneWordLink[word_length] = doneItemPointer; // doneWordLink[i] caches the last word with length i+1 doneWordItems[doneItemPointer]=action; // copy item to cache. ++doneItemPointer; doneWordRnd[word_length] = index+1; } else { assert(doneWordRnd[word_length] == index+1); if ( action > doneWordItems[doneWordLink[word_length]] ) doneWordItems[doneWordLink[word_length]]=action; } } else { beam.insertItem(&action); } } // 2. generate by replacing items if ( index > 0 && rules.canAppend(index) ) { action.load(pGenerator, true, getOrUpdateAppendScore(segmentor, &sentence, pGenerator, index-1)); beam.insertItem(&action); } } // 3. recollect the items for separate if (nBest == 1) { for (temp_index = 0; temp_index<doneItemPointer; ++temp_index) { beam.insertItem(&doneWordItems[temp_index]); } } // build new items in decode if (correct_starts) { bCompatible = false; if (index==correct_starts->at(correct_word)) { correct_append = false; ++correct_word; } else { assert(correct_word==correct_starts->size()||index<correct_starts->at(correct_word)); correct_append = true; } pBestGen = 0; } for (temp_index=0; temp_index<beam.size(); ++temp_index) { pGenerator = beam.item(temp_index)->item; if (beam.item(temp_index)->append) pGenerator->append(lattice_index[index+2]); else pGenerator->separate(lattice_index[index+2]); lattice_index[index+2]->score = beam.item(temp_index)->score; if (correct_starts) { if (pBestGen==0 || lattice_index[index+2]->score > pBestGen->score) pBestGen = lattice_index[index+2]; if (correct == pGenerator && correct_append == beam.item(temp_index)->append) { bCompatible = true; correct = lattice_index[index+2]; } } ++lattice_index[index+2]; } // update scores if none from the agenda is correct state. if (correct_starts && !bCompatible) { TRACE("Decoding error, updating the weight std::vector"); if (correct_append) correct->append(lattice_index[index+2]); else correct->separate(lattice_index[index+2]); updateScoreVectorForStates(segmentor, &sentence, pBestGen, lattice_index[index+2], round); return false; } } // a final step adding the last separate score for items. beam.clear(); for (pGenerator=lattice_index[length]; pGenerator!=lattice_index[length+1]; ++pGenerator) { action.load(pGenerator, false, getOrUpdateSeparateScore(segmentor, &sentence, pGenerator)); beam.insertItem(&action); } beam.sortItems(); // sort final items nBestGen = beam.size(); for (temp_index=0; temp_index<nBestGen; ++temp_index) { best[temp_index] = beam.item(temp_index)->item; } if (correct_starts) { assert(bCompatible); if (correct!=best[0]) { TRACE("Decoding error, updating the weight std::vector"); updateScoreVectorForStates(segmentor, &sentence, best[0], correct, round); return false; } } TRACE("Decoding finished"); // now generate outout sentence // n-best list will be stored in array if (!correct_starts){ TRACE("Outputing sentence"); for ( index=0; index<std::min(nBest, nBestGen); ++index ) { // clear vReturn[index].clear(); if ( out_scores ) out_scores[index] = 0; // assign retval static unsigned count; static unsigned start; count = 0; temp = best[index]; while (!temp->empty()) { ++count; temp = temp->prev(); } vReturn[index].resize(count); --count; temp = best[index]; while (!temp->empty()) { for (temp_index=temp->getWordStart(); temp_index<=temp->getWordEnd(); ++temp_index) { vReturn[index].at(count) += sentence.at(temp_index); } --count; temp = temp->prev(); } if ( out_scores!=NULL ) out_scores[index] = best[index]->score; } } return true; }
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) { clock_t total_start_time = clock();; CStateItem *pGenerator, *pCandidate; unsigned index; // the index of the current char unsigned j, k; // temporary index int subtract_score; // the score to be subtracted (previous item) static unsigned doneLastWord[MAX_SENTENCE_SIZE]; static CStringVector sentence; static CRule rules(m_Feature->m_bRule); rules.segment(sentence_input, &sentence); const unsigned length = sentence.size(); if (length > MAX_SENTENCE_SIZE) { std::cerr << "The size of the sentence is " << length << " characters, which is larger than the limit of the system (" << MAX_SENTENCE_SIZE <<std::endl; vReturn->clear(); return; } assert(vReturn!=NULL); //clock_t start_time = clock(); TRACE("Initialising the segmentation process..."); vReturn->clear(); clearWordCache(); m_Agenda->clear(); pCandidate = m_Agenda->candidateItem(); // make the first item pCandidate->clear(); // restore state using clean m_Agenda->pushCandidate(); // and push it back m_Agenda->nextRound(); // as the generator item if (nBest == 1) // optimization for one best for (j=0; j<MAX_SENTENCE_SIZE; ++j) doneLastWord[j] = 0; TRACE("Segmenting started"); //TRACE("initialisation time: " << clock() - start_time); for (index=0; index<length; index++) { // generate new state itmes for each character pGenerator = m_Agenda->generatorStart(); for (j=0; j<m_Agenda->generatorSize(); ++j) { // 1. generate new items according to each previous item. if (pGenerator->m_nLength>0) k = pGenerator->getWordStart(pGenerator->m_nLength-1); // If we only ask 1-best, then we take only the best among those with the last word if ( ( nBest > 1 || pGenerator->m_nLength==0 || doneLastWord[k]<index+1 ) && rules.canSeparate( index ) ) { pCandidate = m_Agenda->candidateItem(); pCandidate->copy(pGenerator); pCandidate->append(index); pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1); m_Agenda->pushCandidate(); if (nBest == 1 && pGenerator->m_nLength>0) doneLastWord[k] = index+1; } // 2. generate by replacing items if ( index > 0 && rules.canAppend(index) ) { pCandidate = m_Agenda->candidateItem(); pCandidate->copy(pGenerator); subtract_score = m_Feature->getLocalScore(&sentence, pGenerator, pGenerator->m_nLength-1); pCandidate->m_nScore -= subtract_score; pCandidate->replace(index); pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1); m_Agenda->pushCandidate(); } pGenerator = m_Agenda->generatorNext(); // next generator } m_Agenda->nextRound(); // move round } // now generate outout sentence // n-best list will be stored in array // from the addr vReturn TRACE("Outputing sentence"); for (k=0; k<nBest; ++k) { // clear vReturn[k].clear(); if (out_scores!=NULL) out_scores[k] = 0; // assign retval if (k<m_Agenda->generatorSize()) { pGenerator = m_Agenda->generator(k); for (j=0; j<pGenerator->m_nLength; j++) { std::string temp = ""; for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) { assert(sentence.at(l)!=" "); // [SPACE] temp += sentence.at(l); } vReturn[k].push_back(temp); } if (out_scores!=NULL) out_scores[k] = pGenerator->m_nScore; } } TRACE("Done, the best score: " << pGenerator->m_nScore); TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC); }
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) { clock_t total_start_time = clock();; const CStateItem *pGenerator, *pCandidate; CStateItem tempState; unsigned index; // the index of the current char unsigned j, k; // temporary index int subtract_score; // the score to be subtracted (previous item) static CStateItem best_bigram; int start_index; int word_length; int generator_index; static CStringVector sentence; static CRule rules(m_Feature->m_bRule); rules.segment(sentence_input, &sentence); const unsigned length = sentence.size(); assert(length<MAX_SENTENCE_SIZE); assert(vReturn!=NULL); //clock_t start_time = clock(); TRACE("Initialising the segmentation process..."); vReturn->clear(); clearWordCache(); m_Chart.clear(); tempState.clear(); m_Chart[0]->insertItem(&tempState); TRACE("Segmenting started"); for (index=0; index<length; index++) { // m_Chart index 1 correspond to the first char m_Chart[index+1]; // control for the ending character of the candidate if ( index < length-1 && rules.canSeparate(index+1)==false ) continue ; start_index = index-1 ; // the end index of last word word_length = 1 ; // current word length // enumerating the start index // =========================== // the start index of the word is actually start_index + 1 while( start_index >= -1 && word_length <= MAX_WORD_SIZE ) { // control for the starting character of the candidate // --------------------------------------------------- while ( start_index >= 0 && rules.canSeparate(start_index+1)==false ) start_index-- ; // start the search process // ------------------------ for ( generator_index = 0 ; generator_index < m_Chart[ start_index+1 ]->size() ; ++ generator_index ) { pGenerator = m_Chart[ start_index+1 ]->item( generator_index ) ; tempState.copy( pGenerator ) ; tempState.append( index ) ; tempState.m_nScore += m_Feature->getLocalScore( &sentence, &tempState, tempState.m_nLength-1 ) ; if (nBest==1) { if ( generator_index == 0 || tempState.m_nScore > best_bigram.m_nScore ) { best_bigram.copy(&tempState); //@@@ } } else { m_Chart[ index+1 ]->insertItem( &tempState ); } } if (nBest==1) { m_Chart[ index+1 ]->insertItem( &best_bigram ); //@@@ } //@@@ // control the first character of the candidate if ( rules.canAppend(start_index+1)==false ) break ; // update start index and word len --start_index ; ++word_length ; }//start_index } // now generate outout sentence // n-best list will be stored in array // from the addr vReturn TRACE("Outputing sentence"); for (k=0; k<nBest; ++k) { // clear vReturn[k].clear(); if (out_scores!=NULL) out_scores[k] = 0; // assign retval if (k<m_Chart[length]->size()) { pGenerator = m_Chart[length]->bestItem(k); for (j=0; j<pGenerator->m_nLength; j++) { std::string temp = ""; for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) { assert(sentence.at(l)!=" "); // [SPACE] temp += sentence.at(l); } vReturn[k].push_back(temp); } if (out_scores!=NULL) out_scores[k] = pGenerator->m_nScore; } } TRACE("Done, the best score: " << pGenerator->m_nScore); TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC); }