Esempio n. 1
0
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) {
#ifdef DEBUG
   clock_t total_start_time = clock();;
#endif
   TRACE("Starting segmenting a sentence...");

   // turn the spaces in the input sentence into rules that separate corresponding characters
   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule); 
   rules.segment(sentence_input, &sentence); 
   const unsigned long length = sentence.size();

   assert(length<MAX_SENTENCE_SIZE);
   assert(vReturn!=NULL);
   vReturn->clear();

   // try to work std::cout the best item with the
   // correct outout reference param as NULL
   work(this, sentence, vReturn, out_scores, rules, NULL, nBest, -1);

   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}
Esempio n. 2
0
// Run one training step on a sentence whose reference segmentation is known.
//
// sentence_input : the raw input; CRule::segment() rewrites it into the
//                  working vector `sentence` and records the separation rules.
// correct        : the reference segmentation, one entry per word.
// round          : passed by reference straight through to work().
//
// The reference words are aligned against `sentence` to build correct_starts:
// entry i is the index in `sentence` of the first element of reference word i,
// and the final entry is the total number of elements consumed. work() then
// receives correct_starts as the gold standard.
//
// The reference is expected to spell out exactly the same characters as the
// input. A mismatch trips the assert below in debug builds.
//
// Note: `sentence`, `rules` and `correct_starts` are function-local statics,
// so they are constructed once and shared by every later call.
void CSegmentor::train(const CStringVector* sentence_input, const CStringVector* correct, int & round) {
#ifdef DEBUG
   clock_t total_start_time = clock();
#endif
   TRACE("Starting training using a sentence...");
   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule);
   rules.segment(sentence_input, &sentence);
   const unsigned long int length = sentence.size();

   assert(length<MAX_SENTENCE_SIZE);

   static std::vector<unsigned> correct_starts;

   // word_xxx refer to `correct`, char_xxx refer to `sentence`
   unsigned long char_index = 0;
   unsigned count = 0;
   correct_starts.clear();
   correct_starts.push_back(count);
   for (unsigned long word_index=0; word_index<correct->size(); ++word_index) {
      const unsigned long word_length = correct->at(word_index).size();
      unsigned long char_length = 0;
      // Consume elements of `sentence` until they add up to the byte length
      // of the reference word. The char_index<length bound keeps a reference
      // that is longer than the input from indexing past the end of `sentence`.
      while (char_length<word_length && char_index<length) {
         char_length += sentence[char_index++].size();
         ++count;
      }
      // The reference word must end exactly on an element boundary.
      assert(char_length==word_length);
      correct_starts.push_back(count);
   }

   // the main learning process with update
   work(this, sentence, 0, 0, rules, &correct_starts, 1, round);

   TRACE("Done");
   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}
Esempio n. 3
0
// Segment one sentence with an agenda-driven search and write up to nBest results.
//
// sentence_input : the raw input; CRule::segment() rewrites it into the
//                  working vector `sentence` and records the separation rules.
// vReturn        : array of nBest output sentences, must not be NULL. Entry k
//                  receives the words of agenda item k, or stays empty when
//                  the agenda holds fewer than k+1 items.
// out_scores     : optional array of nBest scores (may be NULL); slots without
//                  a matching agenda item are set to 0.
// nBest          : number of outputs requested.
//
// A sentence longer than MAX_SENTENCE_SIZE is reported on std::cerr and
// rejected: vReturn[0] is cleared and the function returns without searching.
//
// Note: `sentence`, `rules` and `doneLastWord` are function-local statics, so
// they are constructed once and shared by every later call.
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) {
   clock_t total_start_time = clock();
   CStateItem *pGenerator, *pCandidate;
   unsigned index;                              // the index of the current char
   unsigned j, k;                               // temporary index
   int subtract_score;                          // the score to be subtracted (previous item)
   static unsigned doneLastWord[MAX_SENTENCE_SIZE];

   // Check the output pointer before any path below dereferences it.
   assert(vReturn!=NULL);

   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule);
   rules.segment(sentence_input, &sentence);
   const unsigned length = sentence.size();

   // NOTE(review): this rejects length > MAX_SENTENCE_SIZE only; a sentence of
   // exactly MAX_SENTENCE_SIZE is accepted here - confirm that CStateItem can
   // hold that many characters.
   if (length > MAX_SENTENCE_SIZE) {
      std::cerr << "The size of the sentence is " << length << " characters, which is larger than the limit of the system (" << MAX_SENTENCE_SIZE << ")" << std::endl;
      vReturn->clear();
      return;
   }

   //clock_t start_time = clock();
   TRACE("Initialising the segmentation process...");
   vReturn->clear();
   clearWordCache(); 
   m_Agenda->clear();
   pCandidate = m_Agenda->candidateItem();      // make the first item
   pCandidate->clear();                         // restore state using clean
   m_Agenda->pushCandidate();                   // and push it back
   m_Agenda->nextRound();                       // as the generator item
   if (nBest == 1)                              // optimization for one best
      for (j=0; j<MAX_SENTENCE_SIZE; ++j) doneLastWord[j] = 0;

   TRACE("Segmenting started");
   //TRACE("initialisation time: " << clock() - start_time);
   for (index=0; index<length; index++) {
      // generate new state items for each character
      pGenerator = m_Agenda->generatorStart();
      for (j=0; j<m_Agenda->generatorSize(); ++j) {
         // 1. generate new items according to each previous item.
         // k is the start of the generator's last word; it is only assigned,
         // and only read below, when the generator already has a word.
         if (pGenerator->m_nLength>0) k = pGenerator->getWordStart(pGenerator->m_nLength-1);
         // If we only ask 1-best, then only the first generator met in this
         // round for a given last-word start k may append at this index:
         // doneLastWord[k] is stamped with index+1 once that has happened.
         if ( ( nBest > 1 || pGenerator->m_nLength==0 || doneLastWord[k]<index+1 ) && 
              rules.canSeparate( index ) 
            ) {  
            pCandidate = m_Agenda->candidateItem();
            pCandidate->copy(pGenerator);
            pCandidate->append(index);
            pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1); 
            m_Agenda->pushCandidate();
            if (nBest == 1 && pGenerator->m_nLength>0) doneLastWord[k] = index+1;
         }
         // 2. generate by replacing items: the local score of the generator's
         // last word is taken out, the item is changed with replace(index),
         // and the local score of the changed last word is added back.
         if ( index > 0 && rules.canAppend(index) ) {
            pCandidate = m_Agenda->candidateItem();
            pCandidate->copy(pGenerator);
            subtract_score = m_Feature->getLocalScore(&sentence, pGenerator, pGenerator->m_nLength-1);
            pCandidate->m_nScore -= subtract_score;
            pCandidate->replace(index);
            pCandidate->m_nScore += m_Feature->getLocalScore(&sentence, pCandidate, pCandidate->m_nLength-1);
            m_Agenda->pushCandidate();
         }
         pGenerator = m_Agenda->generatorNext();  // next generator
      }
      m_Agenda->nextRound(); // move round
   }
   // now generate output sentence
   // n-best list will be stored in array
   // from the addr vReturn
   TRACE("Outputing sentence");
   for (k=0; k<nBest; ++k) {
      // clear
      vReturn[k].clear();
      if (out_scores!=NULL) 
         out_scores[k] = 0;
      // assign retval
      if (k<m_Agenda->generatorSize()) {
         pGenerator = m_Agenda->generator(k);
         for (j=0; j<pGenerator->m_nLength; j++) {
            // rebuild word j by concatenating its characters
            std::string temp = "";
            for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) {
               assert(sentence.at(l)!=" "); // [SPACE]
               temp += sentence.at(l);
            }
            vReturn[k].push_back(temp);
         }
         if (out_scores!=NULL)
            out_scores[k] = pGenerator->m_nScore;
      }
   }
   // pGenerator is whatever item was touched last, so for nBest > 1 this
   // traces the score of the last item written, not of item 0.
   TRACE("Done, the best score: " << pGenerator->m_nScore);
   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}
Esempio n. 4
0
// Jointly segment and tag one sentence with a chart, writing up to nBest results.
//
// sentence_input : the raw input; CRule::segment() rewrites it into the
//                  working vector `sentence` and records the separation rules.
// vReturn        : array of nBest outputs, must not be NULL. Entry k is filled
//                  by generate() from the k-th best item of the final chart
//                  cell, or left empty when that cell has fewer items.
// out_scores     : optional array of nBest scores; slots without an item get 0.
// nBest          : number of outputs requested.
// prunes         : optional bit array; when given, a word spanning
//                  [start, end] is only considered if bit
//                  start * m_nMaxSentSize + end is set.
//
// Raises (through THROW) when the sentence length reaches m_nMaxSentSize.
//
// m_Chart[i] receives the items that end just before character i: cell 0 is
// seeded with an empty state and cell index+1 collects the items whose last
// word ends at `index`.
//
// Note: `sentence`, `rules` and `best_bigram` are function-local statics, so
// they are constructed once and shared by every later call.
void CTagger::tag( const CStringVector * sentence_input , CTwoStringVector * vReturn , SCORE_TYPE * out_scores , unsigned long nBest , const CBitArray * prunes ) {
   clock_t total_start_time = clock();;
   int index , start_index , generator_index , temp_index, word_length;
   const CStateItem * generator_item ;
   // candidate_item and maxState are declared but never used in this body
   CStateItem *candidate_item , tempState , maxState ;
   // 1-best bookkeeping: best_bigram[t] is the best item seen so far whose
   // previous tag is t, valid only while bit t of best_bigram_mask is set
   static CStateItem best_bigram[ 1<<CTag::SIZE ] ;
   unsigned long long best_bigram_mask = 0; // and the count
   unsigned long tag, last_tag ;

   static CStringVector sentence;
   static CRule rules(m_weights->m_bSegmentationRules);
   rules.segment(sentence_input, &sentence);
   const int length = sentence.size() ;

   if (length>=m_nMaxSentSize)
      THROW("the length of the sentence is bigger than the maximum sentence size "<<m_nMaxSentSize<<"; try changing the option");

   assert(vReturn!=NULL);

   TRACE("Initialising the tagging process...");
   m_WordCache.clear() ;
   m_Chart.clear() ;
   // put an empty sentence to the beginning
   tempState.clear() ;
   m_Chart[ 0 ]->insertItem( &tempState ) ;

   TRACE("Tagging started");
   // enumerating the end index
   // =========================
   // index is the word index starting from 0
   for ( index = 0 ; index < length ; ++ index ) {

      // m_Chart index 1 correspond to the first char
      m_Chart[ index + 1 ] ; // this is to make some necessary initialisation for each agenda, when pruning

      // control for the ending character of the candidate:
      // a word may not end at `index` when the rules forbid a break before
      // index+1 (the last character can always end a word)
      if ( index < length-1 && rules.canSeparate(index+1)==false )
         continue ;

      // enumerating the possible tags
      // =============================
      // the tag 0 is the NONE tag, and tag 1 is the BEGIN tag
      for ( tag = CTag::FIRST ; tag <= CTag::LAST ; ++ tag ) {

         start_index = index-1 ; // the end index of last word
         word_length = 1 ; // current word length

         // enumerating the start index
         // ===========================
         // the start index of the word is actually start_index + 1
         while( start_index >= -1 && word_length <= m_weights->m_maxLengthByTag[ tag ] ) {

            // control for the starting character of the candidate
            // ---------------------------------------------------
            // NOTE(review): this skips start positions without increasing
            // word_length, so word_length can be smaller than the real span
            // - confirm this is intended.
            while ( start_index >= 0 && rules.canSeparate(start_index+1)==false )
               start_index-- ;

            // start the search process
            // ------------------------
            // with pruning
            // A (word, tag) pair is tried when the span is not pruned and
            // either the word's frequency is below the threshold and the tag
            // is not flagged in PENN_TAG_CLOSED, or the tag dictionary lists
            // this tag for the word.
            if (  ( prunes==NULL || prunes->isset( ( start_index+1 ) * m_nMaxSentSize + index ) ) && // not pruned
                  (  (  m_weights->m_mapWordFrequency.find( m_WordCache.find( start_index+1 , index , &sentence ) , 0 ) <
                        m_weights->m_nMaxWordFrequency/5000+5 &&
                        PENN_TAG_CLOSED[ tag ] == false  ) ||
                     m_weights->m_mapTagDictionary.lookup( m_WordCache.find( start_index+1 , index , &sentence ), tag )
                  ) // wordtag match
               ) {

               // start a fresh "best per previous tag" table for this span and tag
               if (nBest==1) best_bigram_mask=0LL;

               // extend every item that ends right before the candidate word
               for ( generator_index = 0 ; generator_index < m_Chart[ start_index+1 ]->size() ; ++ generator_index ) {
                  generator_item = m_Chart[ start_index+1 ]->item( generator_index ) ;
                  tempState.copy( generator_item ) ;
                  tempState.append( index , tag ) ;
                  tempState.score += getOrUpdateLocalScore( &sentence , &tempState , tempState.size()-1 ) ;
                  if (nBest==1) {
                     // 1-best: keep only the highest-scoring item per previous tag
                     // NOTE(review): the mask has 64 bits, so 1LL<<last_tag
                     // needs every tag code (and SENTENCE_BEGIN) below 64 - confirm.
                     last_tag = tempState.size()>1 ? tempState.getTag(tempState.size()-2).code() : CTag::SENTENCE_BEGIN;
                     if ( ((best_bigram_mask&(1LL<<last_tag))==0LL) || best_bigram[last_tag].score < tempState.score ) {
                        best_bigram_mask|=(1LL<<last_tag);
                        best_bigram[last_tag].copy(&tempState);
                     }
                  }
                  else {
                     m_Chart[ index+1 ]->insertItem( &tempState );
                  }
               }
               // 1-best: now push the surviving item of every previous tag
               if (nBest==1) {
                  for ( last_tag=0; last_tag<CTag::COUNT; ++last_tag ) {
                     if ( (best_bigram_mask&(1LL<<last_tag)) )
                        m_Chart[ index+1 ]->insertItem( &(best_bigram[last_tag]) );
                  }
               }
            }//if

            // control the first character of the candidate:
            // stop growing the word leftwards when the rules forbid joining
            // character start_index+1 to its predecessor
            if ( rules.canAppend(start_index+1)==false )
               break ;

            // update start index and word len
            --start_index ;
            ++word_length ;

         }//start_index
      }//tag
   }//index

   TRACE("Outputing sentence");
   // copy the items of the final chart cell into the n-best output slots
   for ( temp_index = 0 ; temp_index < nBest ; ++ temp_index ) {
      vReturn[ temp_index ].clear() ;
         if (out_scores) out_scores[ temp_index ] = 0 ;
      if ( temp_index < m_Chart[length]->size() ) {
         generate( m_Chart[ length ]->bestItem( temp_index ) , &sentence , this , &(vReturn[ temp_index ]) ) ;
         if (out_scores) out_scores[ temp_index ] = m_Chart[ length ]->bestItem( temp_index )->score ;
      }
   }
   // NOTE(review): bestItem( 0 ) is dereferenced here without checking that
   // the final chart cell is non-empty - confirm it cannot be empty when
   // TRACE is active.
   TRACE("Done, the highest score is: " << m_Chart[ length ]->bestItem( 0 )->score) ;
   TRACE("The total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC) ;
}
Esempio n. 5
0
void CTagger::tag( const CStringVector * sentence_input , CTwoStringVector * vReturn , SCORE_TYPE * out_scores , unsigned long nBest , const CBitArray * prunes ) {
   clock_t total_start_time = clock();;
   int temp_index;
   const CSubStateItem *pGenerator;
   CSubStateItem tempState;
   int j, k;
   unsigned tag;
   unsigned index, last_tag;

   static CSubStateItem uniqueItems[AGENDA_SIZE];
   unsigned long uniqueIndex;
   static bool bUnique;
//   unsigned long long uniqueMarkup;
//   assert(CTag::COUNT<=sizeof(unsigned long long)*8);

   static CStringVector sentence;
   static CRule rules(m_weights->m_bSegmentationRules);
   rules.segment(sentence_input, &sentence);
   const unsigned length=sentence.size();

   static CSubStateItem goldState;
   goldState.clear();

   TRACE("Initialising the tagging process...");
   m_WordCache.clear();
   tempState.clear();
   m_Agenda.clear();
   m_Agenda.pushCandidate(&tempState);
   m_Agenda.nextRound();

   TRACE("Tagging started");
   //TRACE("initialisation time: " << clock() - start_time);
   for (index=0; index<length; index++) {

      // decide correction
      if ( m_bTrain ) {
         static bool bAnyCorrect;
         bAnyCorrect = false;
         pGenerator = m_Agenda.generatorStart();
         for (j=0; j<m_Agenda.generatorSize(); ++j) {
            if ( *pGenerator == goldState ) bAnyCorrect = true;
            pGenerator = m_Agenda.generatorNext();  // next generator
         }
         if ( !bAnyCorrect ) {
            TRACE("Training error at character " << index);
            pGenerator = m_Agenda.bestGenerator();
            updateScoreForState(&sentence, pGenerator, -1);
            updateScoreForState(&sentence, &goldState, 1);
            m_bTrainingError = true;
            return;
         }
      }

      // 2. generate by replacing items
      if ( index > 0 ) {
         pGenerator = m_Agenda.generatorStart();
         for (j=0; j<m_Agenda.generatorSize(); ++j) {
            assert(pGenerator->size()>0);
            if ( ( rules.canAppend(index) ) && // ( index > 0 ) &&
                 pGenerator->getWordLength(pGenerator->size()-1) <
                    m_weights->m_maxLengthByTag[pGenerator->getTag(pGenerator->size()-1).code()]
               ) {
               tempState.copy(pGenerator);
               tempState.replaceIndex(index);
               tempState.score += getOrUpdateAppendScore(&sentence, &tempState, tempState.size()-1, index);
               if (index+1==length) tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size());
               m_Agenda.pushCandidate(&tempState);
            } // if
            pGenerator = m_Agenda.generatorNext();  // next generator
         }
      }

   //_
   // 1. generate new items according to each previous item.
   // iterate postags
      for (tag=CTag::FIRST; tag<CTag::COUNT; ++tag) {

         pGenerator = m_Agenda.generatorStart();
//         uniqueMarkup=0;
         uniqueIndex=0;

         for (j=0; j<m_Agenda.generatorSize(); ++j) {

            last_tag = pGenerator->size()==0 ? CTag::SENTENCE_BEGIN : pGenerator->getTag(pGenerator->size()-1).code();

            if ( rules.canSeparate( index ) &&
                (index == 0 || canAssignTag( m_WordCache.find( pGenerator->getWordStart(pGenerator->size()-1), index-1, &sentence ), last_tag )) && // last word
                 canStartWord(sentence, tag, index) // word
               ) {

               tempState.copy(pGenerator);
               tempState.append(index, tag);
               tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size()-1);
               if (index+1==length) tempState.score += getOrUpdateSeparateScore(&sentence, &tempState, tempState.size());

               if (nBest==1) {
//                  if ( ((uniqueMarkup&(1LL<<last_tag))==0LL) || uniqueItems[last_tag].score < tempState.score ) {
//                     uniqueMarkup |= (1LL<<last_tag);
//                     uniqueItems[last_tag].copy(&tempState);
//                  }
                  bUnique = true;
                  for (temp_index=0; temp_index<uniqueIndex; ++temp_index) {
                     // only one new when index=zero.
                     assert(index>0&&uniqueItems[temp_index].size()>1);
                     if (uniqueItems[temp_index].getTag(uniqueItems[temp_index].size()-2) == tempState.getTag(tempState.size()-2) &&
                         uniqueItems[temp_index].getWordStart(uniqueItems[temp_index].size()-2) == tempState.getWordStart(tempState.size()-2)
                        ) {
                        bUnique = false;
                        if (uniqueItems[temp_index].score < tempState.score )
                           uniqueItems[temp_index].copy(&tempState);
                     }//if
                  }//for
                  if (bUnique) {
                     uniqueItems[uniqueIndex++].copy(&tempState);
                  }//if
               }
               else {
                  m_Agenda.pushCandidate(&tempState);
               }
            }
            pGenerator = m_Agenda.generatorNext();  // next generator
         }
         // push candidates
         if (nBest == 1) {
//            for (last_tag=0; last_tag<CTag::COUNT; ++last_tag) {
//               if ( (uniqueMarkup&(1LL<<last_tag)) )
//                  m_Agenda.pushCandidate(&(uniqueItems[last_tag]));
//            }
           for (temp_index=0; temp_index<uniqueIndex; ++temp_index) {
              m_Agenda.pushCandidate(&(uniqueItems[temp_index]));
           }//for
         }
      }//tag

      m_Agenda.nextRound(); // move round
      if (m_bTrain) goldState.follow(m_goldState);
   }

   if ( m_bTrain && 1 ) {
      pGenerator = m_Agenda.bestGenerator();
      if ( *pGenerator != goldState ) {
         TRACE("Training error at the last word");
         updateScoreForState(&sentence, pGenerator, -1);
         updateScoreForState(&sentence, &goldState, 1);
         m_bTrainingError = true;
      }
      m_bTrainingError = false;
      return;
   }
   TRACE("Outputing sentence");
   vReturn->clear();
   if (nBest == 1) {
      generate( m_Agenda.bestGenerator() , &sentence , this , vReturn ) ;
      if (out_scores) out_scores[ 0 ] = m_Agenda.bestGenerator( )->score ;
   }
   else {
      m_Agenda.sortGenerators();
      for ( temp_index = 0 ; temp_index < nBest ; ++ temp_index ) {
         vReturn[ temp_index ].clear() ;
         if (out_scores) out_scores[ temp_index ] = 0 ;
         if ( temp_index < m_Agenda.generatorSize() ) {
            generate( m_Agenda.generator( temp_index ) , &sentence , this , &(vReturn[ temp_index ]) ) ;
            if (out_scores) out_scores[ temp_index ] = m_Agenda.bestGenerator( )->score ;
         }
      }
   }
   TRACE("Done, the highest score is: " << m_Agenda.bestGenerator()->score) ;
   TRACE("The total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC) ;
}
Esempio n. 6
0
// Segment one sentence with a chart and write up to nBest results.
//
// sentence_input : the raw input; CRule::segment() rewrites it into the
//                  working vector `sentence` and records the separation rules.
// vReturn        : array of nBest output sentences, must not be NULL. Entry k
//                  receives the words of the k-th best item of the final chart
//                  cell, or stays empty when that cell has fewer items.
// out_scores     : optional array of nBest scores (may be NULL); slots without
//                  a matching item are set to 0.
// nBest          : number of outputs requested.
//
// m_Chart[i] receives the items that end just before character i: cell 0 is
// seeded with an empty state and cell index+1 collects the items whose last
// word ends at `index`.
//
// Note: `sentence`, `rules` and `best_bigram` are function-local statics, so
// they are constructed once and shared by every later call.
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) {
   clock_t total_start_time = clock();
   const CStateItem *pGenerator;
   CStateItem tempState;
   unsigned index;                              // the index of the current char
   unsigned j, k;                               // temporary index
   static CStateItem best_bigram;               // 1-best: best item for the current span
   int start_index;
   int word_length;
   int generator_index;

   static CStringVector sentence;
   static CRule rules(m_Feature->m_bRule);
   rules.segment(sentence_input, &sentence);
   const unsigned length = sentence.size();

   assert(length<MAX_SENTENCE_SIZE);
   assert(vReturn!=NULL);

   //clock_t start_time = clock();
   TRACE("Initialising the segmentation process...");
   vReturn->clear();
   clearWordCache(); 
   m_Chart.clear();

   // seed the chart with the empty segmentation
   tempState.clear();
   m_Chart[0]->insertItem(&tempState);

   TRACE("Segmenting started");
   for (index=0; index<length; index++) {

      // m_Chart index 1 correspond to the first char
      m_Chart[index+1];

      // control for the ending character of the candidate:
      // a word may not end at `index` when the rules forbid a break before
      // index+1 (the last character can always end a word)
      if ( index < length-1 && rules.canSeparate(index+1)==false ) 
         continue ; 

      start_index = index-1 ; // the end index of last word
      word_length = 1 ; // current word length

      // enumerating the start index
      // ===========================
      // the start index of the word is actually start_index + 1
      while( start_index >= -1 && word_length <= MAX_WORD_SIZE ) {

         // control for the starting character of the candidate
         // ---------------------------------------------------
         while ( start_index >= 0 && rules.canSeparate(start_index+1)==false )
            start_index-- ; 

         // start the search process
         // ------------------------
         // extend every item that ends right before the candidate word
         bool have_best = false;                  // best_bigram set for this span?
         for ( generator_index = 0 ; generator_index < m_Chart[ start_index+1 ]->size() ; ++ generator_index ) {
            pGenerator = m_Chart[ start_index+1 ]->item( generator_index ) ;
            tempState.copy( pGenerator ) ;
            tempState.append( index ) ;
            tempState.m_nScore += m_Feature->getLocalScore( &sentence, &tempState, tempState.m_nLength-1 ) ;
            if (nBest==1) {
               // 1-best: remember only the highest-scoring extension
               if ( !have_best || tempState.m_nScore > best_bigram.m_nScore ) {
                  best_bigram.copy(&tempState);
                  have_best = true;
               }
            }
            else {
               m_Chart[ index+1 ]->insertItem( &tempState );
            }
         }
         // best_bigram is static: insert it only when this span produced an
         // extension, otherwise a leftover item from an earlier span (or an
         // earlier call) would enter the chart.
         if (nBest==1 && have_best) {
            m_Chart[ index+1 ]->insertItem( &best_bigram );
         }

         // control the first character of the candidate:
         // stop growing the word leftwards when the rules forbid joining
         // character start_index+1 to its predecessor
         if ( rules.canAppend(start_index+1)==false ) 
            break ; 

         // update start index and word len
         --start_index ;
         ++word_length ;

      }//start_index
   }
   // now generate output sentence
   // n-best list will be stored in array
   // from the addr vReturn
   TRACE("Outputing sentence");
   for (k=0; k<nBest; ++k) {
      // clear
      vReturn[k].clear();
      if (out_scores!=NULL) 
         out_scores[k] = 0;
      // assign retval
      if (k<m_Chart[length]->size()) {
         pGenerator = m_Chart[length]->bestItem(k);
         for (j=0; j<pGenerator->m_nLength; j++) {
            // rebuild word j by concatenating its characters
            std::string temp = "";
            for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) {
               assert(sentence.at(l)!=" "); // [SPACE]
               temp += sentence.at(l);
            }
            vReturn[k].push_back(temp);
         }
         if (out_scores!=NULL)
            out_scores[k] = pGenerator->m_nScore;
      }
   }
   // pGenerator is whatever item was touched last, so for nBest > 1 this
   // traces the score of the last item written, not of item 0.
   TRACE("Done, the best score: " << pGenerator->m_nScore);
   TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC);
}