void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) { #ifdef DEBUG clock_t total_start_time = clock();; #endif TRACE("Starting segmenting a sentence..."); // turn the spaces in the input sentence into rules that separate corresponding characters static CStringVector sentence; static CRule rules(m_Feature->m_bRule); rules.segment(sentence_input, &sentence); const unsigned long length = sentence.size(); assert(length<MAX_SENTENCE_SIZE); assert(vReturn!=NULL); vReturn->clear(); // try to work std::cout the best item with the // correct outout reference param as NULL work(this, sentence, vReturn, out_scores, rules, NULL, nBest, -1); TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC); }
void CSegmentor::train(const CStringVector* sentence_input, const CStringVector* correct, int & round) { #ifdef DEBUG clock_t total_start_time = clock();; #endif TRACE("Starting training using a sentence..."); static CStringVector sentence; static CRule rules(m_Feature->m_bRule); rules.segment(sentence_input, &sentence); const unsigned long int length = sentence.size(); assert(length<MAX_SENTENCE_SIZE); static std::vector<unsigned> correct_starts; static int word_length, word_index, char_length, char_index; // word_xxx are from correct, char_xxx from sentence char_index = 0; int count = 0; correct_starts.clear(); correct_starts.push_back(count); for (word_index=0; word_index<correct->size(); word_index++) { word_length = correct->at(word_index).size(); char_length = 0; while (char_length<word_length) { char_length += sentence[char_index++].size(); count += 1; } assert(char_length==word_length); correct_starts.push_back(count); } // the main learning process with update work(this, sentence, 0, 0, rules, &correct_starts, 1, round); TRACE("Done"); TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC); }
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) { clock_t total_start_time = clock();; CStateItem *pGenerator, *pCandidate; unsigned index; // the index of the current char unsigned j, k; // temporary index int subtract_score; // the score to be subtracted (previous item) static unsigned doneLastWord[MAX_SENTENCE_SIZE]; static CStringVector sentence; static CRule rules(m_Feature->m_bRule); rules.segment(sentence_input, &sentence); const unsigned length = sentence.size(); if (length > MAX_SENTENCE_SIZE) { std::cerr << "The size of the sentence is " << length << " characters, which is larger than the limit of the system (" << MAX_SENTENCE_SIZE <<std::endl; vReturn->clear(); return; } assert(vReturn!=NULL); //clock_t start_time = clock(); TRACE("Initialising the segmentation process..."); vReturn->clear(); clearWordCache(); m_Agenda->clear(); pCandidate = m_Agenda->candidateItem(); // make the first item pCandidate->clear(); // restore state using clean m_Agenda->pushCandidate(); // and push it back m_Agenda->nextRound(); // as the generator item if (nBest == 1) // optimization for one best for (j=0; j<MAX_SENTENCE_SIZE; ++j) doneLastWord[j] = 0; TRACE("Segmenting started"); //TRACE("initialisation time: " << clock() - start_time); for (index=0; index<length; index++) { // generate new state itmes for each character pGenerator = m_Agenda->generatorStart(); for (j=0; j<m_Agenda->generatorSize(); ++j) { // 1. generate new items according to each previous item. if (pGenerator->m_nLength>0) k = pGenerator->getWordStart(pGenerator->m_nLength-1); // If we only ask 1-best, then we take only the best among those with the last word if ( ( nBest > 1 || pGenerator->m_nLength==0 || doneLastWord[k]<index+1 ) && rules.canSeparate( index ) ) { pCandidate = m_Agenda->candidateItem(); pCandidate->copy(pGenerator); pCandidate->append(index); pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1); m_Agenda->pushCandidate(); if (nBest == 1 && pGenerator->m_nLength>0) doneLastWord[k] = index+1; } // 2. generate by replacing items if ( index > 0 && rules.canAppend(index) ) { pCandidate = m_Agenda->candidateItem(); pCandidate->copy(pGenerator); subtract_score = m_Feature->getLocalScore(&sentence, pGenerator, pGenerator->m_nLength-1); pCandidate->m_nScore -= subtract_score; pCandidate->replace(index); pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1); m_Agenda->pushCandidate(); } pGenerator = m_Agenda->generatorNext(); // next generator } m_Agenda->nextRound(); // move round } // now generate outout sentence // n-best list will be stored in array // from the addr vReturn TRACE("Outputing sentence"); for (k=0; k<nBest; ++k) { // clear vReturn[k].clear(); if (out_scores!=NULL) out_scores[k] = 0; // assign retval if (k<m_Agenda->generatorSize()) { pGenerator = m_Agenda->generator(k); for (j=0; j<pGenerator->m_nLength; j++) { std::string temp = ""; for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) { assert(sentence.at(l)!=" "); // [SPACE] temp += sentence.at(l); } vReturn[k].push_back(temp); } if (out_scores!=NULL) out_scores[k] = pGenerator->m_nScore; } } TRACE("Done, the best score: " << pGenerator->m_nScore); TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC); }
void CTagger::tag( const CStringVector * sentence_input , CTwoStringVector * vReturn , SCORE_TYPE * out_scores , unsigned long nBest , const CBitArray * prunes ) { clock_t total_start_time = clock();; int index , start_index , generator_index , temp_index, word_length; const CStateItem * generator_item ; CStateItem *candidate_item , tempState , maxState ; static CStateItem best_bigram[ 1<<CTag::SIZE ] ; unsigned long long best_bigram_mask = 0; // and the count unsigned long tag, last_tag ; static CStringVector sentence; static CRule rules(m_weights->m_bSegmentationRules); rules.segment(sentence_input, &sentence); const int length = sentence.size() ; if (length>=m_nMaxSentSize) THROW("the length of the sentence is bigger than the maximum sentence size "<<m_nMaxSentSize<<"; try changing the option"); assert(vReturn!=NULL); TRACE("Initialising the tagging process..."); m_WordCache.clear() ; m_Chart.clear() ; // put an empty sentence to the beginning tempState.clear() ; m_Chart[ 0 ]->insertItem( &tempState ) ; TRACE("Tagging started"); // enumerating the end index // ========================= // index is the word index starting from 0 for ( index = 0 ; index < length ; ++ index ) { // m_Chart index 1 correspond to the first char m_Chart[ index + 1 ] ; // this is to make some necessary initialisation for each agenda, when pruning // control for the ending character of the candidate if ( index < length-1 && rules.canSeparate(index+1)==false ) continue ; // enumerating the possible tags // ============================= // the tag 0 is the NONE tag, and tag 1 is the BEGIN tag for ( tag = CTag::FIRST ; tag <= CTag::LAST ; ++ tag ) { start_index = index-1 ; // the end index of last word word_length = 1 ; // current word length // enumerating the start index // =========================== // the start index of the word is actually start_index + 1 while( start_index >= -1 && word_length <= m_weights->m_maxLengthByTag[ tag ] ) { // control for the starting character of the candidate // --------------------------------------------------- while ( start_index >= 0 && rules.canSeparate(start_index+1)==false ) start_index-- ; // start the search process // ------------------------ // with pruning if ( ( prunes==NULL || prunes->isset( ( start_index+1 ) * m_nMaxSentSize + index ) ) && // not pruned ( ( m_weights->m_mapWordFrequency.find( m_WordCache.find( start_index+1 , index , &sentence ) , 0 ) < m_weights->m_nMaxWordFrequency/5000+5 && PENN_TAG_CLOSED[ tag ] == false ) || m_weights->m_mapTagDictionary.lookup( m_WordCache.find( start_index+1 , index , &sentence ), tag ) ) // wordtag match ) { if (nBest==1) best_bigram_mask=0LL; for ( generator_index = 0 ; generator_index < m_Chart[ start_index+1 ]->size() ; ++ generator_index ) { generator_item = m_Chart[ start_index+1 ]->item( generator_index ) ; tempState.copy( generator_item ) ; tempState.append( index , tag ) ; tempState.score += getOrUpdateLocalScore( &sentence , &tempState , tempState.size()-1 ) ; if (nBest==1) { last_tag = tempState.size()>1 ? tempState.getTag(tempState.size()-2).code() : CTag::SENTENCE_BEGIN; if ( ((best_bigram_mask&(1LL<<last_tag))==0LL) || best_bigram[last_tag].score < tempState.score ) { best_bigram_mask|=(1LL<<last_tag); best_bigram[last_tag].copy(&tempState); } } else { m_Chart[ index+1 ]->insertItem( &tempState ); } } if (nBest==1) { for ( last_tag=0; last_tag<CTag::COUNT; ++last_tag ) { if ( (best_bigram_mask&(1LL<<last_tag)) ) m_Chart[ index+1 ]->insertItem( &(best_bigram[last_tag]) ); } } }//if // control the first character of the candidate if ( rules.canAppend(start_index+1)==false ) break ; // update start index and word len --start_index ; ++word_length ; }//start_index }//tag }//index TRACE("Outputing sentence"); for ( temp_index = 0 ; temp_index < nBest ; ++ temp_index ) { vReturn[ temp_index ].clear() ; if (out_scores) out_scores[ temp_index ] = 0 ; if ( temp_index < m_Chart[length]->size() ) { generate( m_Chart[ length ]->bestItem( temp_index ) , &sentence , this , &(vReturn[ temp_index ]) ) ; if (out_scores) out_scores[ temp_index ] = m_Chart[ length ]->bestItem( temp_index )->score ; } } TRACE("Done, the highest score is: " << m_Chart[ length ]->bestItem( 0 )->score) ; TRACE("The total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC) ; }
void CTagger::tag( const CStringVector * sentence_input , CTwoStringVector * vReturn , SCORE_TYPE * out_scores , unsigned long nBest , const CBitArray * prunes ) { clock_t total_start_time = clock();; int temp_index; const CSubStateItem *pGenerator; CSubStateItem tempState; int j, k; unsigned tag; unsigned index, last_tag; static CSubStateItem uniqueItems[AGENDA_SIZE]; unsigned long uniqueIndex; static bool bUnique; // unsigned long long uniqueMarkup; // assert(CTag::COUNT<=sizeof(unsigned long long)*8); static CStringVector sentence; static CRule rules(m_weights->m_bSegmentationRules); rules.segment(sentence_input, &sentence); const unsigned length=sentence.size(); static CSubStateItem goldState; goldState.clear(); TRACE("Initialising the tagging process..."); m_WordCache.clear(); tempState.clear(); m_Agenda.clear(); m_Agenda.pushCandidate(&tempState); m_Agenda.nextRound(); TRACE("Tagging started"); //TRACE("initialisation time: " << clock() - start_time); for (index=0; index<length; index++) { // decide correction if ( m_bTrain ) { static bool bAnyCorrect; bAnyCorrect = false; pGenerator = m_Agenda.generatorStart(); for (j=0; j<m_Agenda.generatorSize(); ++j) { if ( *pGenerator == goldState ) bAnyCorrect = true; pGenerator = m_Agenda.generatorNext(); // next generator } if ( !bAnyCorrect ) { TRACE("Training error at character " << index); pGenerator = m_Agenda.bestGenerator(); updateScoreForState(&sentence, pGenerator, -1); updateScoreForState(&sentence, &goldState, 1); m_bTrainingError = true; return; } } // 2. generate by replacing items if ( index > 0 ) { pGenerator = m_Agenda.generatorStart(); for (j=0; j<m_Agenda.generatorSize(); ++j) { assert(pGenerator->size()>0); if ( ( rules.canAppend(index) ) && // ( index > 0 ) && pGenerator->getWordLength(pGenerator->size()-1) < m_weights->m_maxLengthByTag[pGenerator->getTag(pGenerator->size()-1).code()] ) { tempState.copy(pGenerator); tempState.replaceIndex(index); tempState.score += getOrUpdateAppendScore(&sentence, &tempState, tempState.size()-1, index); if (index+1==length) tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size()); m_Agenda.pushCandidate(&tempState); } // if pGenerator = m_Agenda.generatorNext(); // next generator } } //_ // 1. generate new items according to each previous item. // iterate postags for (tag=CTag::FIRST; tag<CTag::COUNT; ++tag) { pGenerator = m_Agenda.generatorStart(); // uniqueMarkup=0; uniqueIndex=0; for (j=0; j<m_Agenda.generatorSize(); ++j) { last_tag = pGenerator->size()==0 ? CTag::SENTENCE_BEGIN : pGenerator->getTag(pGenerator->size()-1).code(); if ( rules.canSeparate( index ) && (index == 0 || canAssignTag( m_WordCache.find( pGenerator->getWordStart(pGenerator->size()-1), index-1, &sentence ), last_tag )) && // last word canStartWord(sentence, tag, index) // word ) { tempState.copy(pGenerator); tempState.append(index, tag); tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size()-1); if (index+1==length) tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size()); if (nBest==1) { // if ( ((uniqueMarkup&(1LL<<last_tag))==0LL) || uniqueItems[last_tag].score < tempState.score ) { // uniqueMarkup |= (1LL<<last_tag); // uniqueItems[last_tag].copy(&tempState); // } bUnique = true; for (temp_index=0; temp_index<uniqueIndex; ++temp_index) { // only one new when index=zero. assert(index>0&&uniqueItems[temp_index].size()>1); if (uniqueItems[temp_index].getTag(uniqueItems[temp_index].size()-2) == tempState.getTag(tempState.size()-2) && uniqueItems[temp_index].getWordStart(uniqueItems[temp_index].size()-2) == tempState.getWordStart(tempState.size()-2) ) { bUnique = false; if (uniqueItems[temp_index].score < tempState.score ) uniqueItems[temp_index].copy(&tempState); }//if }//for if (bUnique) { uniqueItems[uniqueIndex++].copy(&tempState); }//if } else { m_Agenda.pushCandidate(&tempState); } } pGenerator = m_Agenda.generatorNext(); // next generator } // push candidates if (nBest == 1) { // for (last_tag=0; last_tag<CTag::COUNT; ++last_tag) { // if ( (uniqueMarkup&(1LL<<last_tag)) ) // m_Agenda.pushCandidate(&(uniqueItems[last_tag])); // } for (temp_index=0; temp_index<uniqueIndex; ++temp_index) { m_Agenda.pushCandidate(&(uniqueItems[temp_index])); }//for } }//tag m_Agenda.nextRound(); // move round if (m_bTrain) goldState.follow(m_goldState); } if ( m_bTrain && 1 ) { pGenerator = m_Agenda.bestGenerator(); if ( *pGenerator != goldState ) { TRACE("Training error at the last word"); updateScoreForState(&sentence, pGenerator, -1); updateScoreForState(&sentence, &goldState, 1); m_bTrainingError = true; } m_bTrainingError = false; return; } TRACE("Outputing sentence"); vReturn->clear(); if (nBest == 1) { generate( m_Agenda.bestGenerator() , &sentence , this , vReturn ) ; if (out_scores) out_scores[ 0 ] = m_Agenda.bestGenerator( )->score ; } else { m_Agenda.sortGenerators(); for ( temp_index = 0 ; temp_index < nBest ; ++ temp_index ) { vReturn[ temp_index ].clear() ; if (out_scores) out_scores[ temp_index ] = 0 ; if ( temp_index < m_Agenda.generatorSize() ) { generate( m_Agenda.generator( temp_index ) , &sentence , this , &(vReturn[ temp_index ]) ) ; if (out_scores) out_scores[ temp_index ] = m_Agenda.bestGenerator( )->score ; } } } TRACE("Done, the highest score is: " << m_Agenda.bestGenerator()->score) ; TRACE("The total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC) ; }
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) { clock_t total_start_time = clock();; const CStateItem *pGenerator, *pCandidate; CStateItem tempState; unsigned index; // the index of the current char unsigned j, k; // temporary index int subtract_score; // the score to be subtracted (previous item) static CStateItem best_bigram; int start_index; int word_length; int generator_index; static CStringVector sentence; static CRule rules(m_Feature->m_bRule); rules.segment(sentence_input, &sentence); const unsigned length = sentence.size(); assert(length<MAX_SENTENCE_SIZE); assert(vReturn!=NULL); //clock_t start_time = clock(); TRACE("Initialising the segmentation process..."); vReturn->clear(); clearWordCache(); m_Chart.clear(); tempState.clear(); m_Chart[0]->insertItem(&tempState); TRACE("Segmenting started"); for (index=0; index<length; index++) { // m_Chart index 1 correspond to the first char m_Chart[index+1]; // control for the ending character of the candidate if ( index < length-1 && rules.canSeparate(index+1)==false ) continue ; start_index = index-1 ; // the end index of last word word_length = 1 ; // current word length // enumerating the start index // =========================== // the start index of the word is actually start_index + 1 while( start_index >= -1 && word_length <= MAX_WORD_SIZE ) { // control for the starting character of the candidate // --------------------------------------------------- while ( start_index >= 0 && rules.canSeparate(start_index+1)==false ) start_index-- ; // start the search process // ------------------------ for ( generator_index = 0 ; generator_index < m_Chart[ start_index+1 ]->size() ; ++ generator_index ) { pGenerator = m_Chart[ start_index+1 ]->item( generator_index ) ; tempState.copy( pGenerator ) ; tempState.append( index ) ; tempState.m_nScore += m_Feature->getLocalScore( &sentence, &tempState, tempState.m_nLength-1 ) ; if (nBest==1) { if ( generator_index == 0 || tempState.m_nScore > best_bigram.m_nScore ) { best_bigram.copy(&tempState); //@@@ } } else { m_Chart[ index+1 ]->insertItem( &tempState ); } } if (nBest==1) { m_Chart[ index+1 ]->insertItem( &best_bigram ); //@@@ } //@@@ // control the first character of the candidate if ( rules.canAppend(start_index+1)==false ) break ; // update start index and word len --start_index ; ++word_length ; }//start_index } // now generate outout sentence // n-best list will be stored in array // from the addr vReturn TRACE("Outputing sentence"); for (k=0; k<nBest; ++k) { // clear vReturn[k].clear(); if (out_scores!=NULL) out_scores[k] = 0; // assign retval if (k<m_Chart[length]->size()) { pGenerator = m_Chart[length]->bestItem(k); for (j=0; j<pGenerator->m_nLength; j++) { std::string temp = ""; for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) { assert(sentence.at(l)!=" "); // [SPACE] temp += sentence.at(l); } vReturn[k].push_back(temp); } if (out_scores!=NULL) out_scores[k] = pGenerator->m_nScore; } } TRACE("Done, the best score: " << pGenerator->m_nScore); TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC); }