Esempio n. 1
0
/**
 * Beam-search decoder over a lattice of segmentation states.
 *
 * For each character, every state built for the previous character is
 * expanded with a "separate" action and/or an "append" action (as permitted
 * by `rules`). The scored actions are collected in a beam of capacity
 * BEAM_SIZE and each surviving action is materialised as a new lattice item.
 *
 * Modes:
 *  - Decoding (correct_starts == NULL): up to min(nBest, number of final
 *    items) segmentations are written to vReturn[0..], and their scores to
 *    out_scores[0..] when out_scores is non-null.
 *  - Training (correct_starts != NULL): the gold-standard state is tracked
 *    through the lattice. As soon as it is not among the items built from the
 *    beam, or it is not the top item at the end, updateScoreVectorForStates()
 *    is called and the function returns false. Nothing is written to vReturn.
 *
 * @param segmentor      Segmentor whose word cache is cleared and which is
 *                       forwarded to the scoring/update helpers.
 * @param sentence       Input units; consecutive elements are concatenated
 *                       to form output words.
 * @param vReturn        Array of output sentences (decoding mode only).
 * @param out_scores     Optional array receiving the score of each output.
 * @param rules          Decides whether a separate/append is allowed at a
 *                       given character index.
 * @param correct_starts Training only. Presumably the ascending character
 *                       indices at which gold-standard words start
 *                       (TODO confirm against the caller).
 * @param nBest          Number of outputs requested; nBest == 1 enables a
 *                       pruning shortcut for separate actions.
 * @param round          Forwarded unchanged to updateScoreVectorForStates().
 * @return false if a training update was performed, true otherwise.
 *
 * NOTE: all working storage is function-local static, so concurrent calls
 * would share the same buffers.
 * NOTE(review): the sentence length is not checked against MAX_SENTENCE_SIZE
 * here; presumably the caller guarantees it -- confirm.
 */
bool work(CSegmentor *segmentor, const CStringVector &sentence, CStringVector *vReturn, double *out_scores, CRule &rules, std::vector<unsigned> *correct_starts, unsigned nBest, int round) {
   static CStateItem lattice[(MAX_SENTENCE_SIZE+2)*BEAM_SIZE];
   // [lattice_index[i], lattice_index[i+1]) delimits the generators for character i
   static CStateItem *lattice_index[MAX_SENTENCE_SIZE+2];
   static const CStateItem *pGenerator, *pBestGen;
   static const CStateItem *correct, *temp;
   static int index, temp_index;                       // the index of the current char
   static unsigned long int doneWordRnd[MAX_SENTENCE_SIZE];  // round stamp (index+1) of the last cache entry per last-word length
   static unsigned long int doneWordLink[MAX_SENTENCE_SIZE]; // slot in doneWordItems holding the cached action per last-word length
   static CScoredAct doneWordItems[BEAM_SIZE]; 
   static int doneItemPointer; 
   static unsigned correct_word;
   static bool correct_append;
   static unsigned long word_length;
   static bool bCompatible; 
   const int length = sentence.size();
   static CAgendaSimple<CScoredAct> beam(BEAM_SIZE);
   static CScoredAct action;
   static const CStateItem *best[BEAM_SIZE];
   static unsigned nBestGen;

   //clock_t start_time = clock();
   TRACE("Initialising the decoding process...");
   segmentor->clearWordCache(); 

   // The initial (empty) state is the only generator for the first character.
   lattice[0].clear();
   lattice_index[0] = lattice;
   lattice_index[1] = lattice+1;

   if (correct_starts) {
      correct = lattice;                             
      correct_word=0;
      correct_append=false;
      // The initial state is trivially the gold state. Without this, an empty
      // sentence would leave the static flag stale from a previous call and
      // the assertion after the final step would test an unrelated value.
      bCompatible = true;
   }

   if (nBest == 1) // optimization for one best
      memset(doneWordRnd, 0, MAX_SENTENCE_SIZE*sizeof(doneWordRnd[0]));

   TRACE("Decoding started");
   // index is character index and lattice index shifts 1 right
   for (index=0; index<length; ++index) {

      // New items for this character are written right after the previous round's items.
      lattice_index[index+2] = lattice_index[index+1];
         
      // generate new state items for each character
      beam.clear();

      doneItemPointer = 0;

      for (pGenerator=lattice_index[index]; pGenerator!=lattice_index[index+1]; ++pGenerator) { // for each generator

         // 1. generate new items according to each previous item. 
         if ( rules.canSeparate( index ) ) {  
            action.load(pGenerator, false, getOrUpdateSeparateScore(segmentor, &sentence, pGenerator));
            if ( nBest == 1 ) {
               // 1-best shortcut: among separate actions whose generators share
               // the same last-word length, keep only the highest scoring one.
               word_length = pGenerator->getWordLength();
               if ( doneWordRnd[word_length] < index+1 ) {
                  // first action with this word length in the current round
                  doneWordLink[word_length] = doneItemPointer;
                  doneWordItems[doneItemPointer]=action; // copy item to cache.
                  ++doneItemPointer;
                  doneWordRnd[word_length] = index+1;
               }
               else {
                  assert(doneWordRnd[word_length] == index+1);
                  if ( action > doneWordItems[doneWordLink[word_length]] )
                     doneWordItems[doneWordLink[word_length]]=action;
               }
            }
            else {
               beam.insertItem(&action);
            }
         }

         // 2. generate by replacing items
         if ( index > 0 && rules.canAppend(index) ) {
            action.load(pGenerator, true, getOrUpdateAppendScore(segmentor, &sentence, pGenerator, index-1));
            beam.insertItem(&action);
         }

      }

      // 3. recollect the items for separate
      if (nBest == 1) {
         for (temp_index = 0; temp_index<doneItemPointer; ++temp_index) {
            beam.insertItem(&doneWordItems[temp_index]);
         }
      }

      // build new items in decode
      if (correct_starts) {
         bCompatible = false;
         // Guard the lookup: once every gold start has been consumed
         // (correct_word == size) the remaining characters can only be
         // appended, and an unguarded at() would throw std::out_of_range.
         if (correct_word<correct_starts->size() && index==correct_starts->at(correct_word)) {
            correct_append = false;
            ++correct_word;
         }
         else {
            assert(correct_word==correct_starts->size()||index<correct_starts->at(correct_word));
            correct_append = true;
         }
         pBestGen = 0;
      }
      for (temp_index=0; temp_index<beam.size(); ++temp_index) {
         pGenerator = beam.item(temp_index)->item;
         // materialise the action into the next free lattice slot
         if (beam.item(temp_index)->append)
            pGenerator->append(lattice_index[index+2]);
         else
            pGenerator->separate(lattice_index[index+2]);
         lattice_index[index+2]->score = beam.item(temp_index)->score;
         if (correct_starts) {
            // remember the highest scoring new item for a possible update
            if (pBestGen==0 || lattice_index[index+2]->score > pBestGen->score)
               pBestGen = lattice_index[index+2];
            // the gold state survives if its generator and action are both in the beam
            if (correct == pGenerator && correct_append == beam.item(temp_index)->append) {
               bCompatible = true;
               correct = lattice_index[index+2];
            }
         }
         ++lattice_index[index+2];
      }
         
      // update scores if none from the agenda is correct state.
      if (correct_starts && !bCompatible) {
         TRACE("Decoding error, updating the weight std::vector");
         // build the gold item in the slot just past the last item from the beam
         if (correct_append)
            correct->append(lattice_index[index+2]);
         else
            correct->separate(lattice_index[index+2]);
         updateScoreVectorForStates(segmentor, &sentence, pBestGen, lattice_index[index+2], round);
         return false;
      }

   }

   // a final step adding the last separate score for items. 
   beam.clear();
   for (pGenerator=lattice_index[length]; pGenerator!=lattice_index[length+1]; ++pGenerator) { 
      action.load(pGenerator, false, getOrUpdateSeparateScore(segmentor, &sentence, pGenerator));
      beam.insertItem(&action);
   }
   beam.sortItems(); // sort final items
   nBestGen = beam.size();
   for (temp_index=0; temp_index<nBestGen; ++temp_index) {
      best[temp_index] = beam.item(temp_index)->item;
   }

   if (correct_starts) {
      assert(bCompatible);
      // the gold state survived the search but is not ranked first
      if (correct!=best[0]) {
         TRACE("Decoding error, updating the weight std::vector");
         updateScoreVectorForStates(segmentor, &sentence, best[0], correct, round);
         return false;
      }
   }

   TRACE("Decoding finished");

   // now generate output sentence
   // n-best list will be stored in array
   if (!correct_starts){
      TRACE("Outputing sentence");
      for ( index=0; index<std::min(nBest, nBestGen); ++index ) {
         // clear
         vReturn[index].clear();
         if ( out_scores ) out_scores[index] = 0;
         // assign retval
         static unsigned count;
         static unsigned start;
         // first pass: count the words by walking the back-pointers to the empty state
         count = 0;
         temp = best[index];
         while (!temp->empty()) {
            ++count;
            temp = temp->prev();
         }
         vReturn[index].resize(count);
         // second pass: fill the words from the last one to the first one.
         // (count wraps when there are no words, but it is not used then.)
         --count;
         temp = best[index];
         while (!temp->empty()) {
            for (temp_index=temp->getWordStart(); temp_index<=temp->getWordEnd(); ++temp_index) {
               vReturn[index].at(count) += sentence.at(temp_index);
            }
            --count;
            temp = temp->prev();
         }
         if ( out_scores!=NULL )
            out_scores[index] = best[index]->score;
      }
   }
   return true;
}
Esempio n. 2
0
/**
 * Segments a sentence with an agenda-based beam search.
 *
 * The input is first passed through CRule::segment() to obtain the working
 * sequence. For every character each generator in the agenda is expanded by
 * starting a new word ("append" on the state) and/or extending the last word
 * ("replace" on the state), as permitted by the rules.
 *
 * @param sentence_input Raw input sequence handed to CRule::segment().
 * @param vReturn        Array of at least nBest output sentences; must not
 *                       be NULL. Entries for which no candidate exists are
 *                       left empty.
 * @param out_scores     Optional array of at least nBest scores; entries
 *                       without a candidate are set to 0.
 * @param nBest          Number of outputs requested.
 *
 * If the working sequence is longer than MAX_SENTENCE_SIZE an error is
 * printed to std::cerr, vReturn[0] is cleared and the function returns.
 *
 * NOTE: `sentence`, `rules` and `doneLastWord` are function-local statics,
 * so they are shared by every call; `rules` is constructed from
 * m_Feature->m_bRule on the first call only.
 */
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) {
   clock_t total_start_time = clock();
   CStateItem *pGenerator, *pCandidate;
   unsigned index;                              // the index of the current char
   unsigned j, k = 0;                           // temporary index
   // NOTE(review): declared int; if getLocalScore() returns a wider or
   // floating type this truncates -- confirm against its declaration.
   int subtract_score;                          // the score to be subtracted (previous item)
   static unsigned doneLastWord[MAX_SENTENCE_SIZE];

   // Check the output pointer before anything dereferences it (the
   // over-length path below already writes through vReturn).
   assert(vReturn!=NULL);

   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule);
   rules.segment(sentence_input, &sentence);
   const unsigned length = sentence.size();

   if (length > MAX_SENTENCE_SIZE) {
      std::cerr << "The size of the sentence is " << length << " characters, which is larger than the limit of the system (" << MAX_SENTENCE_SIZE << ")" << std::endl;
      vReturn->clear();
      return;
   }

   //clock_t start_time = clock();
   TRACE("Initialising the segmentation process...");
   vReturn->clear();
   clearWordCache(); 
   m_Agenda->clear();
   pCandidate = m_Agenda->candidateItem();      // make the first item
   pCandidate->clear();                         // restore state using clean
   m_Agenda->pushCandidate();                   // and push it back
   m_Agenda->nextRound();                       // as the generator item
   if (nBest == 1)                              // optimization for one best
      for (j=0; j<MAX_SENTENCE_SIZE; ++j) doneLastWord[j] = 0;

   TRACE("Segmenting started");
   //TRACE("initialisation time: " << clock() - start_time);
   for (index=0; index<length; index++) {
      // generate new state items for each character
      pGenerator = m_Agenda->generatorStart();
      for (j=0; j<m_Agenda->generatorSize(); ++j) {
         // 1. generate new items according to each previous item. 
         // k = start of the generator's last word (only meaningful when it has one)
         if (pGenerator->m_nLength>0) k = pGenerator->getWordStart(pGenerator->m_nLength-1);
         // If we only ask 1-best, then only the first generator seen in this
         // round with a given last-word start spawns a new word; doneLastWord[k]
         // holds the round stamp (index+1). Presumably generators are visited
         // best-first so that this keeps the best one -- TODO confirm.
         if ( ( nBest > 1 || pGenerator->m_nLength==0 || doneLastWord[k]<index+1 ) && 
              rules.canSeparate( index ) 
            ) {  
            pCandidate = m_Agenda->candidateItem();
            pCandidate->copy(pGenerator);
            pCandidate->append(index);
            pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1); 
            m_Agenda->pushCandidate();
            if (nBest == 1 && pGenerator->m_nLength>0) doneLastWord[k] = index+1;
         }
         // 2. generate by replacing items
         if ( index > 0 && rules.canAppend(index) ) {
            pCandidate = m_Agenda->candidateItem();
            pCandidate->copy(pGenerator);
            // swap the local score of the old last word for that of the extended one
            subtract_score = m_Feature->getLocalScore(&sentence, pGenerator, pGenerator->m_nLength-1);
            pCandidate->m_nScore -= subtract_score;
            pCandidate->replace(index);
            pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1);
            m_Agenda->pushCandidate();
         }
         pGenerator = m_Agenda->generatorNext();  // next generator
      }
      m_Agenda->nextRound(); // move round
   }
   // now generate output sentence
   // n-best list will be stored in array
   // from the addr vReturn
   TRACE("Outputing sentence");
   for (k=0; k<nBest; ++k) {
      // clear
      vReturn[k].clear();
      if (out_scores!=NULL) 
         out_scores[k] = 0;
      // assign retval
      if (k<m_Agenda->generatorSize()) {
         pGenerator = m_Agenda->generator(k);
         for (j=0; j<pGenerator->m_nLength; j++) {
            // a word is the concatenation of the units in [wordStart, wordEnd]
            std::string temp = "";
            for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) {
               assert(sentence.at(l)!=" "); // [SPACE]
               temp += sentence.at(l);
            }
            vReturn[k].push_back(temp);
         }
         if (out_scores!=NULL)
            out_scores[k] = pGenerator->m_nScore;
      }
   }
   // Report the first generator rather than whatever pGenerator last pointed
   // to: pGenerator is stale or unset when no candidate was emitted, and
   // otherwise refers to the last emitted entry, not entry 0. This relies on
   // the same assumption as the output loop, namely that generator(0) is the
   // top-ranked item.
   if (m_Agenda->generatorSize() > 0) {
      TRACE("Done, the best score: " << m_Agenda->generator(0)->m_nScore);
   }
   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}
Esempio n. 3
0
/**
 * Segments a sentence with a chart-based search.
 *
 * Chart cell i+1 collects the states whose last word ends at character i
 * (cell 0 holds the initial empty state). For each end position every
 * permitted start position is enumerated, and each state in the cell just
 * before that start is extended with the candidate word.
 *
 * @param sentence_input Raw input sequence handed to CRule::segment().
 * @param vReturn        Array of at least nBest output sentences; must not
 *                       be NULL. Entries for which no candidate exists are
 *                       left empty.
 * @param out_scores     Optional array of at least nBest scores; entries
 *                       without a candidate are set to 0.
 * @param nBest          Number of outputs requested. With nBest == 1 only
 *                       the best extension per (start, end) span is inserted.
 *
 * NOTE: `sentence`, `rules` and `best_bigram` are function-local statics and
 * are therefore shared by every call.
 */
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) {
   clock_t total_start_time = clock();
   const CStateItem *pGenerator;
   CStateItem tempState;
   unsigned index;                              // the index of the current char
   unsigned j, k;                               // temporary index
   static CStateItem best_bigram;               // best extension for the current span (1-best mode)
   int start_index;
   int word_length;
   int generator_index;

   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule);
   rules.segment(sentence_input, &sentence);
   const unsigned length = sentence.size();

   assert(length<MAX_SENTENCE_SIZE);
   assert(vReturn!=NULL);

   //clock_t start_time = clock();
   TRACE("Initialising the segmentation process...");
   vReturn->clear();
   clearWordCache(); 
   m_Chart.clear();

   // seed the chart with the empty state
   tempState.clear();
   m_Chart[0]->insertItem(&tempState);

   TRACE("Segmenting started");
   for (index=0; index<length; index++) {

      // m_Chart index 1 correspond to the first char
      // (bare access kept from the original; presumably operator[] prepares
      // the cell as a side effect -- TODO confirm)
      m_Chart[index+1];

      // control for the ending character of the candidate 
      if ( index < length-1 && rules.canSeparate(index+1)==false ) 
         continue ; 

      start_index = index-1 ; // the end index of last word
      word_length = 1 ; // current word length

      // enumerating the start index
      // ===========================
      // the start index of the word is actually start_index + 1
      while( start_index >= -1 && word_length <= MAX_WORD_SIZE ) {

         // control for the starting character of the candidate
         // ---------------------------------------------------
         while ( start_index >= 0 && rules.canSeparate(start_index+1)==false )
            start_index-- ; 

         // start the search process
         // ------------------------
         // Tracks whether this span produced any extension, so that the
         // (static) best_bigram is never inserted with stale contents when
         // the generator cell is empty.
         bool has_candidate = false ;
         for ( generator_index = 0 ; generator_index < m_Chart[ start_index+1 ]->size() ; ++ generator_index ) {
            pGenerator = m_Chart[ start_index+1 ]->item( generator_index ) ;
            tempState.copy( pGenerator ) ;
            tempState.append( index ) ;
            tempState.m_nScore += m_Feature->getLocalScore( &sentence, &tempState, tempState.m_nLength-1 ) ;
            if (nBest==1) {
               // keep only the highest scoring extension for this span
               if ( !has_candidate || tempState.m_nScore > best_bigram.m_nScore ) {
                  best_bigram.copy(&tempState);
               }
               has_candidate = true ;
            }
            else {
               m_Chart[ index+1 ]->insertItem( &tempState );
            }
         }
         if (nBest==1 && has_candidate) {
            m_Chart[ index+1 ]->insertItem( &best_bigram );
         }

         // control the first character of the candidate
         if ( rules.canAppend(start_index+1)==false ) 
            break ; 

         // update start index and word len
         --start_index ;
         ++word_length ;

      }//start_index
   }
   // now generate output sentence
   // n-best list will be stored in array
   // from the addr vReturn
   TRACE("Outputing sentence");
   for (k=0; k<nBest; ++k) {
      // clear
      vReturn[k].clear();
      if (out_scores!=NULL) 
         out_scores[k] = 0;
      // assign retval
      if (k<m_Chart[length]->size()) {
         pGenerator = m_Chart[length]->bestItem(k);
         for (j=0; j<pGenerator->m_nLength; j++) {
            // a word is the concatenation of the units in [wordStart, wordEnd]
            std::string temp = "";
            for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) {
               assert(sentence.at(l)!=" "); // [SPACE]
               temp += sentence.at(l);
            }
            vReturn[k].push_back(temp);
         }
         if (out_scores!=NULL)
            out_scores[k] = pGenerator->m_nScore;
      }
   }
   // Report the top item of the final cell rather than whatever pGenerator
   // last pointed to: pGenerator is stale or unset when no candidate was
   // emitted, and otherwise refers to the last emitted entry, not entry 0.
   if (m_Chart[length]->size() > 0) {
      TRACE("Done, the best score: " << m_Chart[length]->bestItem(0)->m_nScore);
   }
   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}