void CDepParser::updateScoresForStates( const CStateItem *outout , const CStateItem *correct , SCORE_TYPE amount_add, SCORE_TYPE amount_subtract ) { // do not update those steps where they are correct static CStateItem item(&m_lCache); static unsigned action, correct_action; item.clear(); while ( item != *outout ) { action = item.FollowMove( outout ); correct_action = item.FollowMove( correct ); if ( action == correct_action ) item.Move( action ); else break; } // for the necessary information for the correct and outout parsetree updateScoreForState( item, correct , amount_add ) ; updateScoreForState( item, outout , amount_subtract ) ; m_nTotalErrors++; }
void CConParser::work(const CTwoStringVector &sentence, const CSentenceParsed &correct, CCoNLLOutput *o_conll){ const int length = sentence.size(); static int tmp_i,tmp_j; static CAction correct_action; m_lCache.clear(); m_lWordLen.clear(); for (tmp_i=0; tmp_i<length; tmp_i++ ) { m_lCache.push_back( CTaggedWord<CTag, TAG_SEPARATOR>(sentence[tmp_i].first , sentence[tmp_i].second) ); m_lWordLen.push_back( getUTF8StringLength(sentence[tmp_i].first) ); } std::vector<CStateItem> p(MAX_SENTENCE_SIZE*(2+UNARY_MOVES)+2); CStateItem *correctState = &p[0]; //getLabeledBrackets(correct, correctState->gold_lb); correctState->clear(); correctState->words = (&m_lCache); tmp_i = 1; while(true){ correctState->StandardMove(correct, correct_action); //std::cerr<<correct_action<<std::endl; correctState->Move(&p[tmp_i],correct_action); correctState = &p[tmp_i]; tmp_i ++; if (correctState == 0 || correctState->IsTerminated()) break; // while } correctState->GenerateStanford(sentence, o_conll); p.clear(); }
void CDepParser::extract_features(const CDependencyParse &input0, const CDependencyParse &input1) { CStateItem item; unsigned action; CPackedScoreType<SCORE_TYPE, action::kMax> empty; // word and pos m_lCache.clear(); #ifdef LABELED m_lCacheLabel0.clear(); m_lCacheLabel1.clear(); #endif for (int i = 0; i < input0.size(); ++ i) { m_lCache.push_back(CTaggedWord<CTag, TAG_SEPARATOR>(input0[i].word, input0[i].tag)); #ifdef LABELED m_lCacheLabel0.push_back(CDependencyLabel(input0[i].label)); m_lCacheLabel1.push_back(CDependencyLabel(input1[i].label)); #endif } // make standard item item.clear(); item.len_ = input0.size(); for (int i = 0; i < input0.size() * 4; ++ i) { unsigned action = item.StandardMove(input0, input1 #ifdef LABELED , m_lCacheLabel0, m_lCacheLabel1 #endif ); GetOrUpdateStackScore(&item, empty, action, 1, 1); item.Move(item.nextactionindex(), action); } }
void CDepParser::work( const bool bTrain , const CTwoStringVector &sentence , CDependencyParse *retval , const CDependencyParse &correct , int nBest , SCORE_TYPE *scores ) { #ifdef DEBUG clock_t total_start_time = clock(); #endif static int index; const int length = sentence.size() ; const CStateItem *pGenerator ; static CStateItem pCandidate(&m_lCache) ; // used only for training static bool bCorrect ; // used in learning for early update static bool bContradictsRules; static CStateItem correctState(&m_lCache) ; static CPackedScoreType<SCORE_TYPE, action::MAX> packed_scores; ASSERT(length<MAX_SENTENCE_SIZE, "The size of the sentence is larger than the system configuration."); TRACE("Initialising the decoding process...") ; // initialise word cache bContradictsRules = false; m_lCache.clear(); for ( index=0; index<length; ++index ) { m_lCache.push_back( CTaggedWord<CTag, TAG_SEPARATOR>(sentence[index].first , sentence[index].second) ); // filter std::cout training examples with rules if (bTrain && m_weights->rules()) { // the root if ( correct[index].head == DEPENDENCY_LINK_NO_HEAD && canBeRoot(m_lCache[index].tag.code())==false) { TRACE("Rule contradiction: " << m_lCache[index].tag.code() << " can be root."); bContradictsRules = true; } // head left if ( correct[index].head < index && hasLeftHead(m_lCache[index].tag.code())==false) { TRACE("Rule contradiction: " << m_lCache[index].tag.code() << " has left head."); bContradictsRules = true; } // head right if ( correct[index].head > index && hasRightHead(m_lCache[index].tag.code())==false) { TRACE("Rule contradiction: " << m_lCache[index].tag.code() << " has right head."); bContradictsRules = true; } } } // initialise agenda m_Agenda->clear(); pCandidate.clear(); // restore state using clean m_Agenda->pushCandidate(&pCandidate); // and push it back m_Agenda->nextRound(); // as the generator item if (bTrain) correctState.clear(); // verifying supertags if (m_supertags) { ASSERT(m_supertags->getSentenceSize()==length, "Sentence size does not match supertags size"); } #ifdef LABELED unsigned long label; m_lCacheLabel.clear(); if (bTrain) { for (index=0; index<length; ++index) { m_lCacheLabel.push_back(CDependencyLabel(correct[index].label)); if (m_weights->rules() && !canAssignLabel(m_lCache, correct[index].head, index, m_lCacheLabel[index])) { TRACE("Rule contradiction: " << correct[index].label << " on link head " << m_lCache[correct[index].head].tag.code() << " dep " << m_lCache[index].tag.code()); bContradictsRules = true; } } } #endif // skip the training example if contradicts if (bTrain && m_weights->rules() && bContradictsRules) { std::cout << "Skipping training example because it contradicts rules..." <<std::endl; return; } TRACE("Decoding started"); // loop with the next word to process in the sentence for (index=0; index<length*2; ++index) { if (bTrain) bCorrect = false ; // none can this find with pruning ??? if (m_Agenda->generatorSize() == 0) { WARNING("parsing failed"); return; } pGenerator = m_Agenda->generatorStart(); // iterate generators for (int j=0; j<m_Agenda->generatorSize(); ++j) { // for the state items that already contain all words m_Beam->clear(); packed_scores.reset(); getOrUpdateStackScore( pGenerator, packed_scores, action::NO_ACTION ); if ( pGenerator->size() == length ) { assert( pGenerator->stacksize() != 0 ); if ( pGenerator->stacksize()>1 ) { #ifdef FRAGMENTED_TREE if (pGenerator->head(pGenerator->stacktop()) == DEPENDENCY_LINK_NO_HEAD) poproot(pGenerator, packed_scores); else #endif reduce(pGenerator, packed_scores) ; } else { poproot(pGenerator, packed_scores); } } // for the state items that still need more words else { if ( !pGenerator->afterreduce() ) { // there are many ways when there are many arcrighted items on the stack and the root need arcleft. force this. if ( #ifndef FRAGMENTED_TREE ( pGenerator->size() < length-1 || pGenerator->stackempty() ) && // keep only one global root #endif ( pGenerator->stackempty() || m_supertags == 0 || m_supertags->canShift( pGenerator->size() ) ) && // supertags ( pGenerator->stackempty() || !m_weights->rules() || canBeRoot( m_lCache[pGenerator->size()].tag.code() ) || hasRightHead(m_lCache[pGenerator->size()].tag.code()) ) // rules ) { shift(pGenerator, packed_scores) ; } } if ( !pGenerator->stackempty() ) { if ( #ifndef FRAGMENTED_TREE ( pGenerator->size() < length-1 || pGenerator->headstacksize() == 1 ) && // one root #endif ( m_supertags == 0 || m_supertags->canArcRight(pGenerator->stacktop(), pGenerator->size()) ) && // supertags conform to this action ( !m_weights->rules() || hasLeftHead(m_lCache[pGenerator->size()].tag.code()) ) // rules ) { arcright(pGenerator, packed_scores) ; } } if ( (!m_bCoNLL && !pGenerator->stackempty()) || (m_bCoNLL && pGenerator->stacksize()>1) // make sure that for conll the first item is not popped ) { if ( pGenerator->head( pGenerator->stacktop() ) != DEPENDENCY_LINK_NO_HEAD ) { reduce(pGenerator, packed_scores) ; } else { if ( (m_supertags == 0 || m_supertags->canArcLeft(pGenerator->size(), pGenerator->stacktop())) && // supertags (!m_weights->rules() || hasRightHead(m_lCache[pGenerator->stacktop()].tag.code())) // rules ) { arcleft(pGenerator, packed_scores) ; } } } } // insert item for (unsigned i=0; i<m_Beam->size(); ++i) { pCandidate = *pGenerator; pCandidate.score = m_Beam->item(i)->score; pCandidate.Move( m_Beam->item(i)->action ); m_Agenda->pushCandidate(&pCandidate); } if (bTrain && *pGenerator == correctState) { bCorrect = true ; } pGenerator = m_Agenda->generatorNext() ; } // when we are doing training, we need to consider the standard move and update if (bTrain) { #ifdef EARLY_UPDATE if (!bCorrect) { TRACE("Error at the "<<correctState.size()<<"th word; total is "<<correct.size()) updateScoresForStates(m_Agenda->bestGenerator(), &correctState, 1, -1) ; #ifndef LOCAL_LEARNING return ; #else m_Agenda->clearCandidates(); m_Agenda->pushCandidate(&correctState); #endif } #endif if (bCorrect) { #ifdef LABELED correctState.StandardMoveStep(correct, m_lCacheLabel); #else correctState.StandardMoveStep(correct); #endif } #ifdef LOCAL_LEARNING ++m_nTrainingRound; // each training round is one transition-action #endif } m_Agenda->nextRound(); // move round } if (bTrain) { correctState.StandardFinish(); // pop the root that is left // then make sure that the correct item is stack top finally if ( *(m_Agenda->bestGenerator()) != correctState ) { TRACE("The best item is not the correct one") updateScoresForStates(m_Agenda->bestGenerator(), &correctState, 1, -1) ; return ; } } TRACE("Outputing sentence"); m_Agenda->sortGenerators(); for (int i=0; i<std::min(m_Agenda->generatorSize(), nBest); ++i) { pGenerator = m_Agenda->generator(i) ; if (pGenerator) { pGenerator->GenerateTree( sentence , retval[i] ) ; if (scores) scores[i] = pGenerator->score; } } TRACE("Done, the highest score is: " << m_Agenda->bestGenerator()->score ) ; TRACE("The total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC) ; }
int CDepParser::work(const bool is_train, const CTwoStringVector & sentence, CDependencyParse * retval0, CDependencyParse * retval1, const CDependencyParse & oracle_tree0, const CDependencyParse & oracle_tree1, int nbest, SCORE_TYPE *scores) { #ifdef DEBUG clock_t total_start_time = clock(); #endif const int length = sentence.size(); const int max_round = length * 4 + 1; const int max_lattice_size = (kAgendaSize + 1) * max_round; ASSERT(length < MAX_SENTENCE_SIZE, "The size of sentence is too long."); CStateItem * lattice = GetLattice(max_lattice_size); CStateItem * lattice_wrapper[max_lattice_size]; CStateItem ** lattice_index[max_round]; CStateItem * correct_state = lattice; for (int i = 0; i < max_lattice_size; ++ i) { lattice_wrapper[i] = lattice + i; lattice[i].len_ = length; } lattice[0].clear(); correct_state = lattice; lattice_index[0] = lattice_wrapper; lattice_index[1] = lattice_index[0] + 1; static CPackedScoreType<SCORE_TYPE, action::kMax> packed_scores; TRACE("Initialising the decoding process ..."); m_lCache.clear(); for (int i = 0; i < length; ++ i) { m_lCache.push_back(CTaggedWord<CTag, TAG_SEPARATOR>(sentence[i].first, sentence[i].second)); #ifdef LABELED if (is_train) { if (i == 0) { m_lCacheLabel0.clear(); m_lCacheLabel1.clear(); } m_lCacheLabel0.push_back(CDependencyLabel(oracle_tree0[i].label)); m_lCacheLabel1.push_back(CDependencyLabel(oracle_tree1[i].label)); } #endif } int num_results = 0; int round = 0; bool is_correct; // used for training to specify correct state in lattice // loop with the next word to process in the sentence, // `round` represent the generators, and the condidates should be inserted // into the `round + 1` for (round = 1; round < max_round; ++ round) { if (lattice_index[round - 1] == lattice_index[round]) { // there is nothing in generators, the proning has cut all legel // generator. actually, in this kind of case, we should raise a // exception. however to achieve a parsing tree, an alternative // solution is go back to the previous round WARNING("Parsing Failed!"); -- round; break; } int current_beam_size = 0; // loop over the generator states // std::cout << "round : " << round << std::endl; for (CStateItem ** q = lattice_index[round - 1]; q != lattice_index[round]; ++ q) { const CStateItem * generator = (*q); m_Beam->clear(); packed_scores.reset(); GetOrUpdateStackScore(generator, packed_scores, action::kNoAction); Transit(generator, packed_scores); for (unsigned i = 0; i < m_Beam->size(); ++ i) { CStateItem candidate; candidate = (*generator); // generate candidate state according to the states in beam int curIndex = candidate.nextactionindex(); candidate.Move(curIndex, m_Beam->item(i)->action); candidate.score = m_Beam->item(i)->score; candidate.previous_ = generator; current_beam_size += InsertIntoBeam(lattice_index[round], &candidate, current_beam_size, kAgendaSize); } } lattice_index[round + 1] = lattice_index[round] + current_beam_size; if (is_train) { CStateItem next_correct_state(*correct_state); unsigned goldaction = next_correct_state.StandardMoveStep(oracle_tree0, oracle_tree1 #ifdef LABELED , m_lCacheLabel0, m_lCacheLabel1 #endif // end for LABELED ); //std::cout << *correct_state << std::endl; //std::cout << goldaction << std::endl; next_correct_state.previous_ = correct_state; is_correct = false; for (CStateItem ** q = lattice_index[round]; q != lattice_index[round + 1]; ++ q) { CStateItem * p = *q; if (next_correct_state.last_action_index == p->last_action_index && next_correct_state.last_action[next_correct_state.last_action_index] == p->last_action[p->last_action_index] && p->previous_ == correct_state) { correct_state = p; is_correct = true; break; } } //std::cout << *correct_state << std::endl; //std::cout << goldaction << std::endl; #ifdef EARLY_UPDATE if (!is_correct || round == max_round-1) { int curIndex = next_correct_state.nextactionindex(); TRACE("ERROR at the " << next_correct_state.size() << "th word for schema " << curIndex); if(curIndex == 0) { TRACE(" Total is " << oracle_tree0.size()); } else { TRACE(" Total is " << oracle_tree1.size()); } CStateItem * best_generator = (*lattice_index[round]); for (CStateItem ** q = lattice_index[round]; q != lattice_index[round + 1]; ++ q) { CStateItem * p = (*q); if (best_generator->score < p->score) { best_generator = p; } } UpdateScoresForStates(best_generator, &next_correct_state, 1, -1); return -1; } #endif // end for EARLY_UPDATE } } // if (is_train) { // CStateItem * best_generator = (*lattice_index[round-1]); // for (CStateItem ** q = lattice_index[round-1]; q != lattice_index[round]; ++ q) { // CStateItem * p = (*q); // if (best_generator->score < p->score) { // best_generator = p; // } // } // if (best_generator != correct_state) { // UpdateScoresForStates(best_generator, correct_state, 1, -1); // } // return -1; // } //delete[] sequence_correct_state; /* if (is_train) { //correct_state->StandardFinish(); // pop the root that is left // then make sure that the correct item is stack top finally CStateItem * best_generator = (*lattice_index[round-1]); for (CStateItem ** q = lattice_index[round-1]; q != lattice_index[round ]; ++ q) { CStateItem * p = (*q); if (best_generator->score < p->score) { best_generator = p; } } { //TRACE("The best item is not the correct one") UpdateScoresForStates(best_generator, correct_state, 1, -1) ; } } */ if (!retval0 || !retval1) { return -1; } TRACE("Output sentence"); std::sort(lattice_index[round - 1], lattice_index[round], StateHeapMore); num_results = lattice_index[round] - lattice_index[round - 1]; for (int i = 0; i < std::min(num_results, nbest); ++ i) { assert( (*(lattice_index[round - 1] + i))->size() == m_lCache.size()); (*(lattice_index[round - 1] + i))->GenerateTree(sentence, retval0[i], retval1[i]); if (scores) { scores[i] = (*(lattice_index[round - 1] + i))->score; } } TRACE("Done, total time spent: " << double(clock() - total_start_time) / CLOCKS_PER_SEC); return num_results; }
/*--------------------------------------------------------------- * * work - the working process shared by training and parsing * * Returns: makes a new instance of CDependencyParse * *--------------------------------------------------------------*/ int CDepParser::work(const bool is_train, const CTwoStringVector & sentence, CDependencyParse * retval, const CDependencyParse & oracle_tree, int nbest, SCORE_TYPE *scores) { #ifdef DEBUG clock_t total_start_time = clock(); #endif const int length = sentence.size(); const int max_round = length * 2 + 1; const int max_lattice_size = (kAgendaSize + 1) * max_round; ASSERT(length < MAX_SENTENCE_SIZE, "The size of sentence is too long."); CStateItem * lattice = GetLattice(max_lattice_size); CStateItem * lattice_index[max_round]; CStateItem * correct_state = lattice; for (int i = 0; i < max_lattice_size; ++ i) { lattice[i].len_ = length; } lattice[0].clear(); correct_state = lattice; lattice_index[0] = lattice; lattice_index[1] = lattice_index[0] + 1; static CPackedScore packed_scores; TRACE("Initialising the decoding process ..."); m_lCache.clear(); for (int i = 0; i < length; ++ i) { m_lCache.push_back(CTaggedWord<CTag, TAG_SEPARATOR>(sentence[i].first, sentence[i].second)); #ifdef LABELED if (is_train) { if (i == 0) { m_lCacheLabel.clear(); } m_lCacheLabel.push_back(CDependencyLabel(oracle_tree[i].label)); } #endif } int num_results = 0; int round = 0; bool is_correct; // used for training to specify correct state in lattice // loop with the next word to process in the sentence, 'round' represent the // generators, and the condidates should be inserted into the 'round + 1' for (round = 1; round < max_round; ++ round) { if (lattice_index[round - 1] == lattice_index[round]) { // There is nothing in generators, the proning has cut all legel // generator. Actually, in this kind of case, we should raise a // exception. However to achieve a parsing tree, an alternative // solution is go back to the previous round WARNING("Parsing Failed!"); -- round; break; } current_beam_size_ = 0; // loop over the generator states // std::cout << "round : " << round << std::endl; for (CStateItem * q = lattice_index[round - 1]; q != lattice_index[round]; ++ q) { const CStateItem * generator = q; packed_scores.reset(); GetOrUpdateStackScore(generator, packed_scores, action::kNoAction); Transit(generator, packed_scores); } for (unsigned i = 0; i < current_beam_size_; ++ i) { const CScoredTransition& transition = m_kBestTransitions[i]; CStateItem* target = lattice_index[round]+ i; (*target) = (*transition.source); // generate candidate state according to the states in beam target->Move(transition.action); target->score = transition.score; target->previous_ = transition.source; } lattice_index[round + 1] = lattice_index[round] + current_beam_size_; if (is_train) { CStateItem next_correct_state(*correct_state); next_correct_state.StandardMoveStep(oracle_tree #ifdef LABELED , m_lCacheLabel #endif // end for LABELED ); next_correct_state.previous_ = correct_state; is_correct = false; for (CStateItem *p = lattice_index[round]; p != lattice_index[round + 1]; ++ p) { if (next_correct_state.last_action == p->last_action && p->previous_ == correct_state) { correct_state = p; is_correct = true; break; } } #ifdef EARLY_UPDATE if (!is_correct) { TRACE("ERROR at the " << next_correct_state.size() << "th word;" << " Total is " << oracle_tree.size()); CStateItem * best_generator = lattice_index[round]; for (CStateItem * p = lattice_index[round]; p != lattice_index[round + 1]; ++ p) { if (best_generator->score < p->score) { best_generator = p; } } UpdateScoresForStates(best_generator, &next_correct_state, 1, -1); return -1; } #endif // end for EARLY_UPDATE } } if (is_train) { CStateItem * best_generator = lattice_index[round-1]; for (CStateItem * p = lattice_index[round-1]; p != lattice_index[round]; ++ p) { if (best_generator->score < p->score) { best_generator = p; } } if (best_generator != correct_state) { UpdateScoresForStates(best_generator, correct_state, 1, -1); } return -1; } if (!retval) { return -1; } TRACE("Output sentence"); std::sort(lattice_index[round - 1], lattice_index[round], StateMore); num_results = lattice_index[round] - lattice_index[round - 1]; for (int i = 0; i < std::min(num_results, nbest); ++ i) { assert( (lattice_index[round - 1] + i)->size() == m_lCache.size()); (lattice_index[round - 1] + i)->GenerateTree(sentence, retval[i]); if (scores) { scores[i] = (lattice_index[round - 1] + i)->score; } } TRACE("Done, total time spent: " << double(clock() - total_start_time) / CLOCKS_PER_SEC); return num_results; }
void CSegmentor::segment(const CStringVector* sentence_input, CStringVector *vReturn, double *out_scores, int nBest) { clock_t total_start_time = clock();; const CStateItem *pGenerator, *pCandidate; CStateItem tempState; unsigned index; // the index of the current char unsigned j, k; // temporary index int subtract_score; // the score to be subtracted (previous item) static CStateItem best_bigram; int start_index; int word_length; int generator_index; static CStringVector sentence; static CRule rules(m_Feature->m_bRule); rules.segment(sentence_input, &sentence); const unsigned length = sentence.size(); assert(length<MAX_SENTENCE_SIZE); assert(vReturn!=NULL); //clock_t start_time = clock(); TRACE("Initialising the segmentation process..."); vReturn->clear(); clearWordCache(); m_Chart.clear(); tempState.clear(); m_Chart[0]->insertItem(&tempState); TRACE("Segmenting started"); for (index=0; index<length; index++) { // m_Chart index 1 correspond to the first char m_Chart[index+1]; // control for the ending character of the candidate if ( index < length-1 && rules.canSeparate(index+1)==false ) continue ; start_index = index-1 ; // the end index of last word word_length = 1 ; // current word length // enumerating the start index // =========================== // the start index of the word is actually start_index + 1 while( start_index >= -1 && word_length <= MAX_WORD_SIZE ) { // control for the starting character of the candidate // --------------------------------------------------- while ( start_index >= 0 && rules.canSeparate(start_index+1)==false ) start_index-- ; // start the search process // ------------------------ for ( generator_index = 0 ; generator_index < m_Chart[ start_index+1 ]->size() ; ++ generator_index ) { pGenerator = m_Chart[ start_index+1 ]->item( generator_index ) ; tempState.copy( pGenerator ) ; tempState.append( index ) ; tempState.m_nScore += m_Feature->getLocalScore( &sentence, &tempState, tempState.m_nLength-1 ) ; if (nBest==1) { if ( generator_index == 0 || tempState.m_nScore > best_bigram.m_nScore ) { best_bigram.copy(&tempState); //@@@ } } else { m_Chart[ index+1 ]->insertItem( &tempState ); } } if (nBest==1) { m_Chart[ index+1 ]->insertItem( &best_bigram ); //@@@ } //@@@ // control the first character of the candidate if ( rules.canAppend(start_index+1)==false ) break ; // update start index and word len --start_index ; ++word_length ; }//start_index } // now generate outout sentence // n-best list will be stored in array // from the addr vReturn TRACE("Outputing sentence"); for (k=0; k<nBest; ++k) { // clear vReturn[k].clear(); if (out_scores!=NULL) out_scores[k] = 0; // assign retval if (k<m_Chart[length]->size()) { pGenerator = m_Chart[length]->bestItem(k); for (j=0; j<pGenerator->m_nLength; j++) { std::string temp = ""; for (unsigned l = pGenerator->getWordStart(j); l <= pGenerator->getWordEnd(j); ++l) { assert(sentence.at(l)!=" "); // [SPACE] temp += sentence.at(l); } vReturn[k].push_back(temp); } if (out_scores!=NULL) out_scores[k] = pGenerator->m_nScore; } } TRACE("Done, the best score: " << pGenerator->m_nScore); TRACE("total time spent: " << double(clock() - total_start_time)/CLOCKS_PER_SEC); }