Ejemplo n.º 1
0
// Rebuilds the training buffers used for one iteration of the pairwise model.
//
// For every modifier `mod` (root excluded) and every candidate head `j` that is
// neither `mod` nor the head stored in x->heads, two consecutive feature rows
// are written into `data`: first the (head,mod) row, then the (j,mod) row.
// The rows are then shuffled in units of two so each pair stays adjacent, and
// `end` is scaled by CONF_NN_resample.
//
// Side effects: frees and reallocates `data` and `gradient`, registers the new
// gradient buffer with `mach`, and resets `current` / `end`.
void Method2_pairs::each_prepare_data_oneiter()
{
	// Buffers from the previous iteration are no longer needed.
	delete []data;
	delete []gradient;

	//for gradient
	gradient = new REAL[mach->GetWidth()*mach->GetOdim()];
	mach->SetGradOut(gradient);

	// Stride between consecutive samples in `data`.
	// NOTE(review): assumes fill_one() writes exactly GetIdim() values — confirm.
	const int idim = mach->GetIdim();
	const int sentences = training_corpus->size();

	//-- first pass: count the rows that the second pass will write.
	// Each of the (length-1) modifiers is paired with (length-2) wrong heads
	// (everything except itself and its real head), two rows per pairing.
	int num_pairs = 0;
	for(int i=0;i<sentences;i++){
		int length = training_corpus->at(i)->length();
		num_pairs += (length-2)*(length-1)*2;
	}

	//-- second pass: generate all rows.
	// The element count is computed in size_t so that rows*idim cannot
	// overflow int on large corpora.
	int real_num_pairs = 0;
	data = new REAL[static_cast<size_t>(num_pairs)*idim];
	REAL* assign_x = data;
	for(int i=0;i<sentences;i++){
		DependencyInstance* x = training_corpus->at(i);
		int length = x->length();
		for(int mod=1;mod<length;mod++){
			int head = x->heads->at(mod);
			for(int j=0;j<length;j++){	//length-2 candidates survive the filter
				if(j==head || j==mod)
					continue;
				//always first right and then wrong
				feat_gen->fill_one(assign_x,x,head,mod);
				assign_x += idim;
				feat_gen->fill_one(assign_x,x,j,mod);
				assign_x += idim;
				real_num_pairs += 2;
			}
		}
	}
	current = 0;
	end = real_num_pairs;

	//shuffle --- the unit is 2*idim so a (right,wrong) pair always moves together
	shuffle_data(data,data,2*idim,2*idim,
			real_num_pairs*idim,real_num_pairs*idim,10);

	//sample: only the first `resampled` rows are used this iteration
	const int resampled = (int)(end*parameters->CONF_NN_resample);
	cout << "--Data for this iter: samples all " << end << " resample: " << resampled << endl;
	end = resampled;
}
Ejemplo n.º 2
0
// Builds all token-level alphabets (form, lower-case form, lemma, prefix,
// suffix, morphological features, POS, coarse POS) from the training corpus.
//
// The corpus returned by pipe_->GetOptions()->GetTrainingFilePath() is read
// once through `reader`. Prefixes and suffixes go straight into the member
// alphabets; every other symbol is first counted in a local alphabet and is
// copied into the member alphabet only if its frequency exceeds the relevant
// cutoff. A cutoff is raised one step at a time until the resulting alphabet
// is smaller than its kMax*AlphabetSize limit. Finally all member alphabets
// are frozen with StopGrowth() and their sizes are checked.
//
// @param reader  Reader used to iterate over the training instances; it is
//                opened and closed by this function, and every instance it
//                returns is deleted here.
void TokenDictionary::InitializeFromDependencyReader(DependencyReader *reader) {
  LOG(INFO) << "Creating token dictionary...";

  int form_cutoff = FLAGS_form_cutoff;
  // The lower-case form cutoff starts from the same flag as the form cutoff;
  // no separate flag is read here.
  int form_lower_cutoff = FLAGS_form_cutoff;
  int lemma_cutoff = FLAGS_lemma_cutoff;
  int feats_cutoff = FLAGS_feats_cutoff;
  int pos_cutoff = FLAGS_pos_cutoff;
  int cpos_cutoff = FLAGS_cpos_cutoff;
  int prefix_length = FLAGS_prefix_length;
  int suffix_length = FLAGS_suffix_length;
  bool form_case_sensitive = FLAGS_form_case_sensitive;

  // Frequency of each symbol, indexed by its id in the matching local alphabet.
  vector<int> form_freqs;
  vector<int> form_lower_freqs;
  vector<int> lemma_freqs;
  vector<int> feats_freqs;
  vector<int> pos_freqs;
  vector<int> cpos_freqs;

  // Unpruned alphabets collected during the corpus pass.
  Alphabet form_alphabet;
  Alphabet form_lower_alphabet;
  Alphabet lemma_alphabet;
  Alphabet feats_alphabet;
  Alphabet pos_alphabet;
  Alphabet cpos_alphabet;

  string special_symbols[NUM_SPECIAL_TOKENS];
  special_symbols[TOKEN_UNKNOWN] = kTokenUnknown;
  special_symbols[TOKEN_START] = kTokenStart;
  special_symbols[TOKEN_STOP] = kTokenStop;

  for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
    prefix_alphabet_.Insert(special_symbols[i]);
    suffix_alphabet_.Insert(special_symbols[i]);
    form_alphabet.Insert(special_symbols[i]);
    form_lower_alphabet.Insert(special_symbols[i]);
    lemma_alphabet.Insert(special_symbols[i]);
    feats_alphabet.Insert(special_symbols[i]);
    pos_alphabet.Insert(special_symbols[i]);
    cpos_alphabet.Insert(special_symbols[i]);

    // Counts of special symbols are set to -1:
    form_freqs.push_back(-1);
    form_lower_freqs.push_back(-1);
    lemma_freqs.push_back(-1);
    feats_freqs.push_back(-1);
    pos_freqs.push_back(-1);
    cpos_freqs.push_back(-1);
  }

  // Go through the corpus and build the dictionaries,
  // counting the frequencies.
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  DependencyInstance *instance =
    static_cast<DependencyInstance*>(reader->GetNext());
  while (instance != NULL) {
    int instance_length = instance->size();
    for (int i = 0; i < instance_length; ++i) {
      int id;

      // Add form to alphabet.
      std::string form = instance->GetForm(i);
      std::string form_lower(form);
      // tolower() has undefined behaviour for negative arguments other than
      // EOF, so every byte is passed as unsigned char (matters for any
      // non-ASCII input such as UTF-8).
      for (std::string::size_type k = 0; k < form_lower.size(); ++k) {
        form_lower[k] = static_cast<char>(
            ::tolower(static_cast<unsigned char>(form_lower[k])));
      }
      if (!form_case_sensitive) form = form_lower;
      id = form_alphabet.Insert(form);
      // The comparison is deliberately done against the unsigned size: a
      // negative id becomes a huge value and is caught by CHECK_EQ.
      if (id >= form_freqs.size()) {
        CHECK_EQ(id, form_freqs.size());
        form_freqs.push_back(0);
      }
      ++form_freqs[id];

      // Add lower-case form to alphabet.
      id = form_lower_alphabet.Insert(form_lower);
      if (id >= form_lower_freqs.size()) {
        CHECK_EQ(id, form_lower_freqs.size());
        form_lower_freqs.push_back(0);
      }
      ++form_lower_freqs[id];

      // Add lemma to alphabet.
      id = lemma_alphabet.Insert(instance->GetLemma(i));
      if (id >= lemma_freqs.size()) {
        CHECK_EQ(id, lemma_freqs.size());
        lemma_freqs.push_back(0);
      }
      ++lemma_freqs[id];

      // Add prefix/suffix to alphabet (no frequency cutoff is applied to
      // these, so they go directly into the member alphabets).
      // TODO: add varying lengths.
      string prefix = form.substr(0, prefix_length);
      prefix_alphabet_.Insert(prefix);
      int start = form.length() - suffix_length;
      if (start < 0) start = 0;  // Form shorter than the suffix length.
      string suffix = form.substr(start, suffix_length);
      suffix_alphabet_.Insert(suffix);

      // Add POS to alphabet.
      id = pos_alphabet.Insert(instance->GetPosTag(i));
      if (id >= pos_freqs.size()) {
        CHECK_EQ(id, pos_freqs.size());
        pos_freqs.push_back(0);
      }
      ++pos_freqs[id];

      // Add CPOS to alphabet.
      id = cpos_alphabet.Insert(instance->GetCoarsePosTag(i));
      if (id >= cpos_freqs.size()) {
        CHECK_EQ(id, cpos_freqs.size());
        cpos_freqs.push_back(0);
      }
      ++cpos_freqs[id];

      // Add FEATS to alphabet.
      for (int j = 0; j < instance->GetNumMorphFeatures(i); ++j) {
        id = feats_alphabet.Insert(instance->GetMorphFeature(i,j));
        if (id >= feats_freqs.size()) {
          CHECK_EQ(id, feats_freqs.size());
          feats_freqs.push_back(0);
        }
        ++feats_freqs[id];
      }
    }
    delete instance;
    instance = static_cast<DependencyInstance*>(reader->GetNext());
  }
  reader->Close();

  // Now adjust the cutoffs if necessary. Each loop below rebuilds one member
  // alphabet from scratch (special symbols first, then every symbol whose
  // count exceeds the cutoff) and raises the cutoff until the alphabet is
  // below its size limit.
  while (true) {
    form_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = form_alphabet.begin();
         iter != form_alphabet.end();
         ++iter) {
      if (form_freqs[iter->second] > form_cutoff) {
        form_alphabet_.Insert(iter->first);
      }
    }
    if (form_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_cutoff;
    LOG(INFO) << "Incrementing form cutoff to " << form_cutoff << "...";
  }

  while (true) {
    form_lower_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_lower_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = form_lower_alphabet.begin();
         iter != form_lower_alphabet.end();
         ++iter) {
      if (form_lower_freqs[iter->second] > form_lower_cutoff) {
        form_lower_alphabet_.Insert(iter->first);
      }
    }
    if (form_lower_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_lower_cutoff;
    LOG(INFO) << "Incrementing lower-case form cutoff to "
              << form_lower_cutoff << "...";
  }

  while (true) {
    lemma_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      lemma_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = lemma_alphabet.begin();
         iter != lemma_alphabet.end();
         ++iter) {
      if (lemma_freqs[iter->second] > lemma_cutoff) {
        lemma_alphabet_.Insert(iter->first);
      }
    }
    if (lemma_alphabet_.size() < kMaxLemmaAlphabetSize) break;
    ++lemma_cutoff;
    LOG(INFO) << "Incrementing lemma cutoff to " << lemma_cutoff << "...";
  }

  while (true) {
    pos_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      pos_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = pos_alphabet.begin();
         iter != pos_alphabet.end();
         ++iter) {
      if (pos_freqs[iter->second] > pos_cutoff) {
        pos_alphabet_.Insert(iter->first);
      }
    }
    if (pos_alphabet_.size() < kMaxPosAlphabetSize) break;
    ++pos_cutoff;
    LOG(INFO) << "Incrementing POS cutoff to " << pos_cutoff << "...";
  }

  while (true) {
    cpos_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      cpos_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = cpos_alphabet.begin();
         iter != cpos_alphabet.end();
         ++iter) {
      if (cpos_freqs[iter->second] > cpos_cutoff) {
        cpos_alphabet_.Insert(iter->first);
      }
    }
    if (cpos_alphabet_.size() < kMaxCoarsePosAlphabetSize) break;
    ++cpos_cutoff;
    LOG(INFO) << "Incrementing CPOS cutoff to " << cpos_cutoff << "...";
  }

  while (true) {
    feats_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      feats_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = feats_alphabet.begin();
         iter != feats_alphabet.end();
         ++iter) {
      if (feats_freqs[iter->second] > feats_cutoff) {
        feats_alphabet_.Insert(iter->first);
      }
    }
    if (feats_alphabet_.size() < kMaxFeatsAlphabetSize) break;
    ++feats_cutoff;
    LOG(INFO) << "Incrementing FEATS cutoff to " << feats_cutoff << "...";
  }

  // Freeze every member alphabet.
  form_alphabet_.StopGrowth();
  form_lower_alphabet_.StopGrowth();
  lemma_alphabet_.StopGrowth();
  prefix_alphabet_.StopGrowth();
  suffix_alphabet_.StopGrowth();
  feats_alphabet_.StopGrowth();
  pos_alphabet_.StopGrowth();
  cpos_alphabet_.StopGrowth();

  LOG(INFO) << "Number of forms: " << form_alphabet_.size() << endl
            << "Number of lower-case forms: " << form_lower_alphabet_.size()
            << endl
            << "Number of lemmas: " << lemma_alphabet_.size() << endl
            << "Number of prefixes: " << prefix_alphabet_.size() << endl
            << "Number of suffixes: " << suffix_alphabet_.size() << endl
            << "Number of feats: " << feats_alphabet_.size() << endl
            << "Number of pos: " << pos_alphabet_.size() << endl
            << "Number of cpos: " << cpos_alphabet_.size();

  // Size limits on the final alphabets.
  // NOTE(review): presumably ids are packed into 16-bit / 8-bit fields
  // elsewhere — confirm against the feature encoder.
  CHECK_LT(form_alphabet_.size(), 0xffff);
  CHECK_LT(form_lower_alphabet_.size(), 0xffff);
  CHECK_LT(lemma_alphabet_.size(), 0xffff);
  CHECK_LT(prefix_alphabet_.size(), 0xffff);
  CHECK_LT(suffix_alphabet_.size(), 0xffff);
  CHECK_LT(feats_alphabet_.size(), 0xffff);
  CHECK_LT(pos_alphabet_.size(), 0xff);
  CHECK_LT(cpos_alphabet_.size(), 0xff);

  // TODO: Remove this (only for debugging purposes).
  BuildNames();
}
Ejemplo n.º 3
0
// Runs one training iteration of the p2o2 model over the training corpus.
//
// On the first call a per-sentence boolean table (STA_noprobs) is obtained,
// either through filter_read() or by calling get_cut_o1() on every sentence
// and then filter_write(). The table is kept in a function-local static and
// reused by later calls.
//
// The corpus is then walked sentence by sentence. A sentence is skipped at
// random (drand48() > CONF_NN_resample) or when its length reaches
// CONF_higho_toolong. The remaining sentences are grouped into mini-batches:
// each sentence is scored with forward_scores_o2sib(), the scores are turned
// into marginals, the score buffer is overwritten with gradients and passed
// to mach->backward(); mach->update() is called once per mini-batch.
//
// CONF_minibatch > 0 limits a batch by sentence count; otherwise a batch ends
// once -CONF_minibatch forward instances have been accumulated.
void M2_p2o2::each_train_one_iter()
{
	static bool** STA_noprobs = 0;	//static one, init only once
	if(STA_noprobs==0 && !filter_read(STA_noprobs)){
		//init only once
		int all_tokens_train=0,all_token_filter_wrong=0;
		time_t now;
		time(&now);
		cout << "-Preparing no_probs at " << ctime(&now) << endl;
		STA_noprobs = new bool*[training_corpus->size()];
		for(unsigned int i=0;i<training_corpus->size();i++){
			DependencyInstance* x = training_corpus->at(i);
			STA_noprobs[i] = get_cut_o1(x,mfo1,dict,hp->CONF_score_o1filter_cut);
			all_tokens_train += x->length()-1;
			// Count the arcs stored in x->heads that the table flags.
			for(int m=1;m<x->length();m++)
				if(STA_noprobs[i][get_index2(x->length(),x->heads->at(m),m)])
					all_token_filter_wrong ++;
		}
		cout << "For o1 filter: all " << all_tokens_train << ";filter wrong " << all_token_filter_wrong << endl;
		filter_write(STA_noprobs);
	}

	//per-sentence approach
	int num_sentences = training_corpus->size();
	//statistics
	int skip_sent_num = 0;
	int all_forward_instance = 0;
	int all_inst_right = 0;
	int all_inst_wrong = 0;
	//some useful info
	int odim = mach->get_odim();
	//training
	time_t now;
	time(&now); //ctime is not reentrant ! use ctime_r() instead if needed
	cout << "##*** // Start the p2o2 training for iter " << cur_iter << " at " << ctime(&now)
			<< "with lrate " << cur_lrate << endl;
	cout << "#Sentences is " << num_sentences << " and resample (about)" << num_sentences*hp->CONF_NN_resample << endl;
	for(int i=0;i<num_sentences;){
		//random skip (instead of shuffling every time)
		if(drand48() > hp->CONF_NN_resample || training_corpus->at(i)->length() >= hp->CONF_higho_toolong){
			skip_sent_num ++;
			i ++;
			continue;
		}

		mach->prepare_batch();
		//if nesterov update before each batch (pre-update)
		if(hp->CONF_NESTEROV_MOMENTUM)
			mach->nesterov_update(hp->CONF_UPDATE_WAY,hp->CONF_MOMENTUM_ALPHA);
		//main batch
		int this_sentence = 0;
		int this_instance = 0;
		for(;;){
			//forward
			DependencyInstance* x = training_corpus->at(i);
			const int length = x->length();
			nn_input* the_inputs;
			REAL *fscores = forward_scores_o2sib(x,mach,&the_inputs,dict->get_helper(),0,STA_noprobs[i],hp);

			this_instance += the_inputs->get_numi();
			all_forward_instance += the_inputs->get_numi();
			all_inst_right += the_inputs->inst_good;
			all_inst_wrong += the_inputs->inst_bad;
			this_sentence ++;
			i++;

			the_scores::Scores<REAL_SCORES>* rscores = get_the_scores(the_inputs,fscores,odim,the_inputs->get_numi());
			REAL_SCORES* tmp_marginals = LencodeMarginals_o2sib(length,*rscores);

			//set gradients: fscores is overwritten in place, odim values per
			//instance, in the same order as the marginals.
			REAL* to_assign = fscores;
			for(int inst=0;inst<the_inputs->num_inst;inst++){
				int tmp_goal = the_inputs->goals->at(inst);
				REAL_SCORES* from_mar = tmp_marginals+odim*inst;
				for(int once=0;once<odim;once++,to_assign++){
					// marginal minus indicator(goal), plus the score itself
					// scaled by CONF_score_p2reg.
					if(tmp_goal == once)
						*to_assign = -1 * (1 - from_mar[once]) + *to_assign * hp->CONF_score_p2reg;
					else
						*to_assign = from_mar[once] + *to_assign * hp->CONF_score_p2reg;	//now object is maximum
				}
			}

			//backward
			mach->backward(fscores);

			delete the_inputs;
			delete rscores;
			delete []tmp_marginals;

			if(i>=num_sentences)
				break;
			//out of the mini-batch
			// Skip over-long sentences inside the batch. The index must be
			// bounded here: when the corpus ends with over-long sentences the
			// skip would otherwise run past the last sentence.
			while(i<num_sentences && training_corpus->at(i)->length() >= hp->CONF_higho_toolong){	//HAVE to compromise, bad choice
				skip_sent_num ++;
				i ++;
			}
			if(i>=num_sentences)
				break;
			if(hp->CONF_minibatch > 0){
				if(this_sentence >= hp->CONF_minibatch)
					break;
			}
			else{
				if(this_instance >= -1*hp->CONF_minibatch)
					break;
			}
		}
		//real update
		mach->update(hp->CONF_UPDATE_WAY,cur_lrate,hp->CONF_NN_WD,hp->CONF_MOMENTUM_ALPHA,hp->CONF_RMS_SMOOTH);
	}
	cout << "Iter done, skip " << skip_sent_num << " sentences and f&b " << all_forward_instance
			<< ";good/bad: " << all_inst_right << "/" << all_inst_wrong << endl;
}
Ejemplo n.º 4
0
// Runs one training iteration of the M4 o2 model over the training corpus.
//
// Sentences are skipped at random (drand48() > CONF_NN_resample) or when
// their length reaches CONF_higho_toolong. The others are collected into a
// mini-batch; each one is parsed with Process::parse_o2sib() and its
// predict_heads are compared with heads to update the accuracy statistics.
// After the batch is collected, every sentence in it contributes two backward
// passes (MM_margin_backward with +1 on the "good" input and -1 on the "bad"
// one, both obtained from M3_pro2::get_nninput_o2sib()), followed by a single
// mach->update().
//
// CONF_minibatch > 0 limits a batch by sentence count; otherwise a batch ends
// once -CONF_minibatch mispredicted tokens have been accumulated.
void M4_o2::each_train_one_iter()
{
	//per-sentence approach
	int num_sentences = training_corpus->size();
	//statistics
	int skip_sent_num = 0;
	//training
	time_t now;
	time(&now); //ctime is not reentrant ! use ctime_r() instead if needed
	cout << "##*** //M4O2// Start the training for iter " << cur_iter << " at " << ctime(&now)
			<< "with lrate " << cur_lrate << endl;
	cout << "#Sentences is " << num_sentences << " and resample (about)" << num_sentences*hp->CONF_NN_resample << endl;

	vector<DependencyInstance*> xs;	// sentences of the current mini-batch
	int all_token=0,all_right=0;
	for(int i=0;i<num_sentences;){
		//random skip (instead of shuffling every time)
		if(drand48() > hp->CONF_NN_resample || training_corpus->at(i)->length() >= hp->CONF_higho_toolong){
			skip_sent_num ++;
			i ++;
			continue;
		}
		//main batch
		int this_instance_toupdate = 0;
		int this_tokens = 0;
		for(;;){
			//forward
			DependencyInstance* x = training_corpus->at(i);
			xs.push_back(x);

			Process::parse_o2sib(x,mfo1,0,true);	//add margin MAYBE
			// -- statistic
			all_token += x->length()-1;
			for(int i2=1;i2<x->length();i2++){	//ignore root
				if((*(x->predict_heads))[i2] == (*(x->heads))[i2])
					all_right ++;
				else
					this_instance_toupdate++;
			}
			//
			this_tokens += x->length() - 1;
			i++;

			if(i>=num_sentences)
				break;
			//out of the mini-batch
			// Skip over-long sentences inside the batch. The index must be
			// bounded here: when the corpus ends with over-long sentences the
			// skip would otherwise run past the last sentence.
			while(i<num_sentences && training_corpus->at(i)->length() >= hp->CONF_higho_toolong){	//HAVE to compromise, bad choice
				skip_sent_num ++;
				i ++;
			}
			if(i>=num_sentences)
				break;
			if (hp->CONF_minibatch > 0) {
				if (int(xs.size()) >= hp->CONF_minibatch)
					break;
			}
			else {
				if (this_instance_toupdate >= -1 * hp->CONF_minibatch)
					break;
			}
		}

		//backward
		for(unsigned int ii=0;ii<xs.size();ii++){
			DependencyInstance* x = xs[ii];
			nn_input* good;
			nn_input* bad;
			M3_pro2::get_nninput_o2sib(x,&good,&bad,dict);
			MM_margin_backward(mach, good, 1, hp->CONF_score_p2reg);
			MM_margin_backward(mach, bad, -1, hp->CONF_score_p2reg);
			delete good;delete bad;
		}
		int this_sentence = xs.size();
		xs.clear();
		//real update
		// CONF_mbatch_way selects what is reported as the batch size:
		// 1 -> squared token count, 2 -> squared sentence count.
		if (hp->CONF_mbatch_way == 1)
			mach->set_this_mbsize(this_tokens*this_tokens);
		else if (hp->CONF_mbatch_way == 2)
			mach->set_this_mbsize(this_sentence*this_sentence);
		mach->update(hp->CONF_UPDATE_WAY,cur_lrate,hp->CONF_NN_WD,hp->CONF_MOMENTUM_ALPHA,hp->CONF_RMS_SMOOTH);
	}
	cout << "Iter done, skip " << skip_sent_num << " sentences." << "AND training UAS:"
			<< all_right << "/" << all_token << "=" << all_right/(0.0+all_token) << endl;
}
Ejemplo n.º 5
0
// Rebuilds `data`, `target` and `gradient` for one training iteration of the
// third-order (grandparent + sibling) model.
//
// The expensive part runs only on the first call and is cached in
// function-local statics (data_right / data_wrong and their row counts):
//   1. every sentence is scored with the first-order machine (get_scores_o1);
//   2. a counting pass enumerates all (h, m, c, g) candidates and classifies
//      each as right, wrong or bad, where bad means rejected by score_noprob()
//      or, for g, lying inside [min(h,m), max(h,m)];
//   3. a second pass with the same enumeration writes the feature rows of the
//      right and wrong candidates (bad ones are dropped).
// The two passes must enumerate in exactly the same order and with the same
// conditions, because pass 2 fills buffers sized by pass 1.
//
// On every call the right rows plus the first floor(wrong * CONF_NN_resample)
// wrong rows (after shuffling the wrong rows when resample < 1) are copied
// into `data`, with `target` set to 1 for right rows and 0 for wrong rows,
// and the result is shuffled.
//
// The function exits the process if the o1 filter is not configured.
void Method9_O3g::each_prepare_data_oneiter()
{
	delete []data;
	delete []target;
	delete []gradient;
	//for gradient
	gradient = new REAL[mach->GetWidth()*mach->GetOdim()];
	mach->SetGradOut(gradient);
	int sentences = training_corpus->size();
	int idim = mach->GetIdim();

	//only one time when o1_filter(decoding o1 is quite expensive)
	static REAL* data_right = 0;
	static REAL* data_wrong = 0;
	static int tmpall_right=0;
	static int tmpall_wrong=0;
	static int tmpall_bad=0;
	int whether_o1_filter = 0;
	if(parameters->CONF_NN_highO_o1mach.length() > 0 && parameters->CONF_NN_highO_o1filter)
		whether_o1_filter = 1;

	//************WE MUST SPECIFY O1_FILTER****************//
	if(!whether_o1_filter){
		cout << "No o1-filter for o2g, too expensive!!" << endl;
		exit(1);
	}
	//************WE MUST SPECIFY O1_FILTER****************//

	if(data_right==0){
	//1.o1-filter (MUST HAVE)
	// NOTE(review): feat_temp_o1 is never deleted; it is allocated once per
	// process because of the static guard above — confirm whether it can be
	// freed after the scoring loop.
	FeatureGenO1* feat_temp_o1 = new FeatureGenO1(dict,parameters->CONF_x_window,
					parameters->CONF_add_distance,parameters->CONF_add_pos,parameters->CONF_add_distance_parent);
	double** all_scores_o1 = new double*[sentences];
	int all_tokens_train=0,all_token_filter_wrong=0;
	for(int i=0;i<sentences;i++){
		DependencyInstance* x = training_corpus->at(i);
		all_scores_o1[i] = get_scores_o1(x,parameters,mach_o1,feat_temp_o1);
		double* scores_o1_filter = all_scores_o1[i];
		all_tokens_train += x->length();
		// Count the arcs stored in x->heads that the filter would reject.
		for(int i2=1;i2<x->length();i2++){	//ignore root
			if(score_noprob(scores_o1_filter[get_index2(x->length(),x->heads->at(i2),i2)]))
				all_token_filter_wrong ++;
		}
	}
	cout << "For o1 filter: all " << all_tokens_train << ";filter wrong " << all_token_filter_wrong << endl;
	time_t now;
	time(&now);cout << "#Finish o1-filter at " << ctime(&now) << flush;

	//2.first pass --- figure out the numbers
	// tmp2_* count candidates without an inside sibling (c == -1),
	// tmp3_* count candidates with a sibling position `mid`.
	int tmp2_right=0,tmp2_wrong=0,tmp2_bad=0;
	int tmp3_right=0,tmp3_wrong=0,tmp3_bad=0;
	for(int i=0;i<sentences;i++){
		DependencyInstance* x = training_corpus->at(i);
		double* scores_o1_filter = all_scores_o1[i];
		int length = x->length();
		for(int m=1;m<length;m++){
			//2.1 special (0,0,c,m)	when h==0
			int noprob_0m = score_noprob(scores_o1_filter[get_index2(length,0,m)]);
			int link_0m = (x->heads->at(m)==0);
			// c: nearest token left of m whose head is 0, or -1 if none
			int c = -1;
			for(int mid=m-1;mid>0;mid--){
				if(x->heads->at(mid)==0){
					c = mid;
					break;
				}
			}
			if(link_0m && c<0)
				tmp2_right++;
			else if(noprob_0m)
				tmp2_bad++;
			else
				tmp2_wrong++;
			for(int mid=1;mid<m;mid++){
				if(link_0m && mid==c)
					tmp3_right++;
				else if(noprob_0m || score_noprob(scores_o1_filter[get_index2(length,0,mid)]))
					tmp3_bad++;
				else
					tmp3_wrong++;
			}
			//2.2. ordinary ones
			for(int h=1;h<length;h++){	//h>=1
				if(h==m)
					continue;
				//get information
				int small = GET_MIN_ONE(m,h);
				int large = GET_MAX_ONE(m,h);
				bool link_hm = (x->heads->at(m)==h);
				int noprob_hm = score_noprob(scores_o1_filter[get_index2(length,h,m)]);
				// c: first token found scanning from m towards h whose head
				// is h (searched only when h is m's stored head)
				int c=-1;	//inside sibling
				if(link_hm){
					if(h>m){
						for(int ii=m+1;ii<h;ii++)
							if(x->heads->at(ii)==h){
								c = ii;
								break;
							}
					}
					else{
						for(int ii=m-1;ii>h;ii--)
							if(x->heads->at(ii)==h){
								c = ii;
								break;
							}
					}
				}
				//for g and c
				for(int g=0;g<length;g++){
					if(g==h || g==m || g==c)
						continue;
					bool link_gh = (x->heads->at(h)==g);
					int noprob_gh = score_noprob(scores_o1_filter[get_index2(length,g,h)]);
					int nonproj_g = (g>=small && g<=large);
					if(link_hm && link_gh && c<0)
						tmp2_right++;
					else if(noprob_hm || noprob_gh || nonproj_g)
						tmp2_bad++;
					else
						tmp2_wrong++;
					for(int mid=small+1;mid<large;mid++){
						if(link_hm && link_gh && mid==c)
							tmp3_right++;
						else if(noprob_hm || noprob_gh || nonproj_g || score_noprob(scores_o1_filter[get_index2(length,h,mid)]))
							tmp3_bad++;
						else
							tmp3_wrong++;
					}
				}
			}
		}
	}
	tmpall_right=tmp2_right+tmp3_right;
	tmpall_wrong=tmp2_wrong+tmp3_wrong;
	tmpall_bad=tmp2_bad+tmp3_bad;
	printf("--Stat<all,2,3>:right(%d,%d,%d),wrong(%d,%d,%d),bad(%d,%d,%d)\n",tmpall_right,tmp2_right,tmp3_right,
			tmpall_wrong,tmp2_wrong,tmp3_wrong,tmpall_bad,tmp2_bad,tmp3_bad);

	//3.sweep second time and adding them
	//-allocate (element counts computed in size_t to avoid int overflow)
	data_right = new REAL[static_cast<size_t>(tmpall_right)*idim];
	data_wrong = new REAL[static_cast<size_t>(tmpall_wrong)*idim];
	REAL* assign_right = data_right;
	REAL* assign_wrong = data_wrong;
	for(int i=0;i<sentences;i++){
		DependencyInstance* x = training_corpus->at(i);
		double* scores_o1_filter = all_scores_o1[i];
		int length = x->length();
		for(int m=1;m<length;m++){
			//3.1 special (0,0,c,m)	when h==0 (mirrors 2.1)
			int noprob_0m = score_noprob(scores_o1_filter[get_index2(length,0,m)]);
			int link_0m = (x->heads->at(m)==0);
			int c = -1;
			for(int mid=m-1;mid>0;mid--){
				if(x->heads->at(mid)==0){
					c = mid;
					break;
				}
			}
			if(link_0m && c<0){
				feat_gen->fill_one(assign_right,x,0,m,-1,0);assign_right += idim;
			}
			else if(noprob_0m){}
			else{
				feat_gen->fill_one(assign_wrong,x,0,m,-1,0);assign_wrong += idim;
			}
			for(int mid=1;mid<m;mid++){
				if(link_0m && mid==c){
					feat_gen->fill_one(assign_right,x,0,m,mid,0);assign_right += idim;
				}
				else if(noprob_0m || score_noprob(scores_o1_filter[get_index2(length,0,mid)])){}
				else{
					feat_gen->fill_one(assign_wrong,x,0,m,mid,0);assign_wrong += idim;
				}
			}
			//3.2. ordinary ones (mirrors 2.2)
			for(int h=1;h<length;h++){	//h>=1
				if(h==m)
					continue;
				//get information
				int small = GET_MIN_ONE(m,h);
				int large = GET_MAX_ONE(m,h);
				bool link_hm = (x->heads->at(m)==h);
				int noprob_hm = score_noprob(scores_o1_filter[get_index2(length,h,m)]);
				int c=-1;	//inside sibling
				if(link_hm){
					if(h>m){
						for(int ii=m+1;ii<h;ii++)
							if(x->heads->at(ii)==h){
								c = ii;
								break;
							}
					}
					else{
						for(int ii=m-1;ii>h;ii--)
							if(x->heads->at(ii)==h){
								c = ii;
								break;
							}
					}
				}
				//for g and c
				for(int g=0;g<length;g++){
					if(g==h || g==m || g==c)
						continue;
					bool link_gh = (x->heads->at(h)==g);
					int noprob_gh = score_noprob(scores_o1_filter[get_index2(length,g,h)]);
					int nonproj_g = (g>=small && g<=large);
					if(link_hm && link_gh && c<0){
						feat_gen->fill_one(assign_right,x,h,m,-1,g);assign_right += idim;
					}
					else if(noprob_hm || noprob_gh || nonproj_g){}
					else{
						feat_gen->fill_one(assign_wrong,x,h,m,-1,g);assign_wrong += idim;
					}
					for(int mid=small+1;mid<large;mid++){
						if(link_hm && link_gh && mid==c){
							feat_gen->fill_one(assign_right,x,h,m,mid,g);assign_right += idim;
						}
						else if(noprob_hm || noprob_gh || nonproj_g || score_noprob(scores_o1_filter[get_index2(length,h,mid)])){}
						else{
							feat_gen->fill_one(assign_wrong,x,h,m,mid,g);assign_wrong += idim;
						}
					}
				}
			}
		}
	}
	for(int i=0;i<sentences;i++){
		delete [](all_scores_o1[i]);
	}
	delete []all_scores_o1;
	time(&now);cout << "#Finish data-gen at " << ctime(&now) << flush;
	}

	//then considering CONF_NN_resample and copy them to finish data
	if(parameters->CONF_NN_resample < 1){
		//get part of the wrong ones --- but first shuffle them
		shuffle_data(data_wrong,data_wrong,idim,idim,tmpall_wrong*idim,tmpall_wrong*idim,10);
	}
	// Number of whole wrong rows kept this iteration. It is computed once and
	// reused for both the allocation and the copy: using the untruncated
	// product as the memcpy size would copy a fraction of a row past the end
	// of `data`. It is clamped so the copy never reads beyond data_wrong.
	int wrong_kept = (int)(tmpall_wrong*parameters->CONF_NN_resample);
	if(wrong_kept > tmpall_wrong)
		wrong_kept = tmpall_wrong;
	if(wrong_kept < 0)
		wrong_kept = 0;
	int tmp_sumup = wrong_kept + tmpall_right;
	data = new REAL[static_cast<size_t>(tmp_sumup)*idim];
	target = new REAL[tmp_sumup];
	memcpy(data,data_right,sizeof(REAL)*tmpall_right*idim);
	memcpy(data+static_cast<size_t>(tmpall_right)*idim,data_wrong,sizeof(REAL)*wrong_kept*idim);
	// Right rows come first in `data`, so the first tmpall_right targets are 1.
	for(int i=0;i<tmp_sumup;i++){
		if(i<tmpall_right)
			target[i] = 1;
		else
			target[i] = 0;
	}
	shuffle_data(data,target,idim,1,tmp_sumup*idim,tmp_sumup,10);	//final shuffle
	cout << "--M9, Data for this iter: samples all " << tmpall_right+tmpall_wrong << " resample: " << tmp_sumup << endl;
	current = 0;
	end = tmp_sumup;
}
Ejemplo n.º 6
0
// Builds the dependency-label alphabet and the per-POS-pair statistics from
// the training corpus, which is read twice through `reader`.
//
// Pass 1 inserts the dependency relation of every non-root token into
// label_alphabet_ and then freezes the alphabet.
// Pass 2 fills, for every (modifier POS, head POS) pair:
//   - existing_labels_: the label ids observed for that pair;
//   - maximum_right_distances_ / maximum_left_distances_: the largest
//     |head - modifier| observed with the head to the left / not to the left
//     of the modifier (arcs whose head is 0 are ignored for distances).
// POS tags unknown to token_dictionary_ are mapped to TOKEN_UNKNOWN.
//
// Every instance returned by the reader is deleted here.
void DependencyDictionary::CreateLabelDictionary(DependencyReader *reader) {
  LOG(INFO) << "Creating label dictionary...";

  // Per-label occurrence counts, indexed by label id.
  vector<int> label_freqs;

  // ---- Pass 1: collect the labels. ----
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  for (DependencyInstance *sentence = reader->GetNext();
       sentence != NULL;
       sentence = reader->GetNext()) {
    const int num_tokens = sentence->size();
    // Token 0 is skipped.
    for (int token = 1; token < num_tokens; ++token) {
      int label_id =
        label_alphabet_.Insert(sentence->GetDependencyRelation(token));
      if (label_id >= label_freqs.size()) {
        // A new id must be exactly the next free slot.
        CHECK_EQ(label_id, label_freqs.size());
        label_freqs.push_back(0);
      }
      ++label_freqs[label_id];
    }
    delete sentence;
  }
  reader->Close();
  label_alphabet_.StopGrowth();

  // ---- Reset the POS-pair tables to (num POS tags) x (num POS tags). ----
  existing_labels_.clear();
  existing_labels_.resize(token_dictionary_->GetNumPosTags(),
                          vector<vector<int> >(
                            token_dictionary_->GetNumPosTags()));

  maximum_left_distances_.clear();
  maximum_left_distances_.resize(token_dictionary_->GetNumPosTags(),
                                 vector<int>(
                                   token_dictionary_->GetNumPosTags(), 0));

  maximum_right_distances_.clear();
  maximum_right_distances_.resize(token_dictionary_->GetNumPosTags(),
                                  vector<int>(
                                    token_dictionary_->GetNumPosTags(), 0));

  // ---- Pass 2: record labels and distances per POS pair. ----
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  for (DependencyInstance *sentence = reader->GetNext();
       sentence != NULL;
       sentence = reader->GetNext()) {
    const int num_tokens = sentence->size();
    for (int token = 1; token < num_tokens; ++token) {
      int head = sentence->GetHead(token);
      const string &modifier_pos = sentence->GetPosTag(token);
      const string &head_pos = sentence->GetPosTag(head);

      int modifier_pos_id = token_dictionary_->GetPosTagId(modifier_pos);
      if (modifier_pos_id < 0) modifier_pos_id = TOKEN_UNKNOWN;
      int head_pos_id = token_dictionary_->GetPosTagId(head_pos);
      if (head_pos_id < 0) head_pos_id = TOKEN_UNKNOWN;

      int label_id =
        label_alphabet_.Lookup(sentence->GetDependencyRelation(token));
      CHECK_GE(label_id, 0);

      // Append the label to this POS pair's list unless it is already there.
      // NOTE: linear scan; a different data structure may be more efficient.
      vector<int> &pair_labels =
        existing_labels_[modifier_pos_id][head_pos_id];
      bool already_listed = false;
      for (vector<int>::size_type k = 0;
           k < pair_labels.size() && !already_listed;
           ++k) {
        already_listed = (pair_labels[k] == label_id);
      }
      if (!already_listed) pair_labels.push_back(label_id);

      // Arcs whose head is 0 do not contribute to the distance tables.
      if (head == 0) continue;

      if (head < token) {
        // Right attachment.
        int &longest = maximum_right_distances_[modifier_pos_id][head_pos_id];
        if (token - head > longest) longest = token - head;
      } else {
        // Left attachment.
        int &longest = maximum_left_distances_[modifier_pos_id][head_pos_id];
        if (head - token > longest) longest = head - token;
      }
    }
    delete sentence;
  }
  reader->Close();

  LOG(INFO) << "Number of labels: " << label_alphabet_.size();
}
Ejemplo n.º 7
0
// Reads the next sentence from the input stream and converts it into a newly
// allocated DependencyInstance.
//
// A sentence is a run of non-empty lines; each line holds tab-separated
// columns. The columns read here are: 1 form, 2 lemma, 3 coarse POS, 4 POS,
// 5 morphological features ('|'-separated, "_" meaning none), 6 head index,
// 7 dependency relation, 10 and 11 the two values stored as brown4 / brown6,
// 12 the value stored as brownall, 13 the "select" flag.
// An artificial root token ("_root_", head -1, select 1) is placed at
// position 0, so token i of the file becomes position i+1.
//
// @return A new instance owned by the caller, or NULL when no lines could be
//         read (stream closed, end of file, or an empty line first).
// @throws std::out_of_range if a line has fewer than 14 columns.
DependencyInstance *DependencyReader::GetNext() {
  // Fill all fields for the entire sentence.
  vector<vector<string> > sentence_fields;
  string line;
  if (is_.is_open()) {
    // Looping on getline() itself stops on any stream failure, not only on
    // EOF, so a failed read can never re-process a stale line.
    while (getline(is_, line)) {
      if (line.empty()) break;  // Blank line: end of sentence.
      vector<string> fields;
      StringSplit(line, "\t", &fields);
      sentence_fields.push_back(fields);
    }
  }

  // Sentence length.
  int length = sentence_fields.size();

  // Convert to array of forms, lemmas, etc.
  // Note: the first token is the root symbol.
  vector<string> forms(length+1);
  vector<string> brownall(length+1);
  vector<string> lemmas(length+1);
  vector<string> cpos(length+1);
  vector<string> brown4(length+1);
  vector<string> brown6(length+1);
  vector<string> pos(length+1);
  vector<vector<string> > feats(length+1);
  vector<string> deprels(length+1);
  vector<int> selects(length+1);
  vector<int> heads(length+1);

  forms[0] = "_root_";
  brownall[0] = "_root_";
  lemmas[0] = "_root_";
  cpos[0] = "_root_";
  brown4[0] = "_root_";
  brown6[0] = "_root_";
  pos[0] = "_root_";
  deprels[0] = "_root_";
  heads[0] = -1;
  feats[0] = vector<string>(1, "_root_");
  // LPK: the root should always be selected, otherwise nothing can be linked
  // to the root
  selects[0] = 1;

  for(int i = 0; i < length; i++) {
    const vector<string> &info = sentence_fields[i];

    // LPK_TODO: If we would like to add a line to the original CoNLL format
    // (an augmented CoNLL format), we would want to read something from here
    // with index longer than 9. The format of the DependencyInstance should
    // be changed at the same time.

    // Columns are fetched with at() so that a short or malformed line fails
    // loudly instead of reading outside the vector.
    forms[i+1] = info.at(1);
    brownall[i+1] = info.at(12);
    lemmas[i+1] = info.at(2);
    cpos[i+1] = info.at(3);
    brown4[i+1] = info.at(10);
    brown6[i+1] = info.at(11);
    pos[i+1] = info.at(4);

    const string &feat_seq = info.at(5);
    if (feat_seq == "_") {
      // LPK: the underscore means nothing in this field
      feats[i+1].clear();
    } else {
      // LPK: every field is actually a vector of strings even if it only has
      // one value
      StringSplit(feat_seq, "|", &feats[i+1]);
    }
    // LPK: save the dependency relation
    deprels[i+1] = info.at(7);

    // LPK: parse the index of the head into the vector heads
    stringstream ss(info.at(6));
    ss >> heads[i+1];

    // Parse the select flag.
    stringstream sss(info.at(13));
    //VLOG(2) << "input into select " << info[13];
    sss >> selects[i+1];

  }

  DependencyInstance *instance = NULL;
  if (length > 0) {
    instance = new DependencyInstance;
    instance->Initialize(forms, brownall, lemmas, cpos, brown4, brown6, pos, feats, deprels, heads, selects);
  }

  return instance;
}
Ejemplo n.º 8
0
double DependencyEvaluator::evaluate(std::string &act_file, std::string &pred_file, std::string &format, bool labeled){

	set<string> punctSet = set<string>();
	punctSet.insert("''");
	punctSet.insert("``");
	punctSet.insert(".");
	punctSet.insert(":");
	punctSet.insert(",");
	punctSet.insert("PU");	//for CTB

	CONLLReader* goldReader = new CONLLReader();
	goldReader->startReading(act_file.c_str());

	CONLLReader* predictedReader = new CONLLReader();
	predictedReader->startReading(pred_file.c_str());

	int total = 0;
	int total_root = 0;
	int total_non_root = 0;
	int corr = 0;
	int corr_root = 0;
	int corr_non_root = 0;
	int corrL = 0;
	int corrL_root = 0;
	int corrL_non_root = 0;
	int numsent = 0;
	int corrsent = 0;
	int corrsentL = 0;
	
	int totalNoPunc = 0;
	int totalNoPunc_root = 0;
	int totalNoPunc_non_root = 0;
	int corrNoPunc = 0;
	int corrNoPunc_root = 0;
	int corrNoPunc_non_root = 0;
	int corrLNoPunc = 0;
	int corrLNoPunc_root = 0;
	int corrLNoPunc_non_root = 0;
	int corrsentNoPunc = 0;
	int corrsentLNoPunc = 0;

	DependencyInstance* goldInstance = goldReader->getNext();
	DependencyInstance* predInstance = predictedReader->getNext();


	while(goldInstance != NULL){
		int instanceLength = goldInstance->length();

		if(instanceLength != predInstance->length()){
			cout<<"Lengths do not match on sentence "<<numsent<<endl;
		}

		vector<int>* goldHeads = goldInstance->heads;
		vector<string*>* goldLabels = goldInstance->deprels;
		vector<int>* predHeads = predInstance->heads;		//because after reading, the predict ones goes there
		vector<string*>* predLabels = predInstance->deprels;

		vector<string*>* pos = goldInstance->postags;

		bool whole = true;
		bool wholeL = true;

		bool wholeNP = true;
		bool wholeLNP = true;

		for(int i = 1; i < instanceLength; i++){
			if((*goldHeads)[i] == 0){
				total_root++;
			}
			else{
				total_non_root++;
			}
			if((*predHeads)[i] == (*goldHeads)[i]){
				corr++;
				if((*goldHeads)[i] == 0){
					corr_root++;
				}
				else{
					corr_non_root++;
				}
				if(labeled){
					if((*(*predLabels)[i]) == (*(*goldLabels)[i])){
						corrL++;
						if((*goldHeads)[i] == 0){
							corrL_root++;
						}
						else{
							corrL_non_root++;
						}
					}
					else{
						wholeL = false;
					}
				}
			}
			else{
				whole = false;
				wholeL = false;
			}

			if(punctSet.count(*((*pos)[i])) <= 0){
				totalNoPunc++;
				if((*goldHeads)[i] == 0){
					totalNoPunc_root++;
				}
				else{
					totalNoPunc_non_root++;
				}
				if((*predHeads)[i] == (*goldHeads)[i]){
					corrNoPunc++;
					if((*goldHeads)[i] == 0){
						corrNoPunc_root++;
					}
					else{
						corrNoPunc_non_root++;
					}
					if(labeled){
						if((*(*predLabels)[i]) == (*(*goldLabels)[i])){
							corrLNoPunc++;
							if((*goldHeads)[i] == 0){
								corrLNoPunc_root++;
							}
							else{
								corrLNoPunc_non_root++;
							}
						}
						else{
							wholeLNP = false;
						}
					}
				}
				else{
					wholeNP = false;
					wholeLNP = false;
				}
			}
		}
		total += instanceLength - 1;
		if(whole){
			corrsent++;
		}
		if(wholeL){
			corrsentL++;
		}
		if(wholeNP){
			corrsentNoPunc++;
		}
		if(wholeLNP){
			corrsentLNoPunc++;
		}
		numsent++;

		delete(goldInstance);
		delete(predInstance);
		goldInstance = goldReader->getNext();
		predInstance = predictedReader->getNext();
	}

	printf("Tokens: %d\n", total);
	printf("Correct: %d\n", corr);
	printf("Unlabeled Accuracy: %.2lf%%\n", ((double)corr) * 100 / total);
	printf("Unlabeled Complete Correct: %.2lf%%\n", ((double)corrsent) *100 / numsent);
	if(labeled){
		printf("Labeled Accuracy: %.2lf%%\n", ((double)corrL) * 100 / total);
		printf("Labeled Complete Correct: %.2lf%%\n", ((double)corrsentL) * 100 / numsent);
	}

	printf("\n");

	printf("Tokens Root: %d\n", total_root);
	printf("Correct Root: %d\n", corr_root);
	printf("Unlabeled Accuracy Root: %.2lf%%\n", ((double)corr_root) * 100 / total_root);
	if(labeled){
		printf("Labeled Accuracy Root: %.2lf%%\n", ((double)corrL_root) * 100 / total_root);
	}

	printf("\n");

	printf("Tokens Non Root: %d\n", total_non_root);
	printf("Correct Non Root: %d\n", corr_non_root);
	printf("Unlabeled Accuracy Non Root: %.2lf%%\n", ((double)corr_non_root) * 100 / total_non_root);
	if(labeled){
		printf("Labeled Accuracy Non Root: %.2lf%%\n", ((double)corrL_non_root) * 100 / total_non_root);
	}

	printf("\n");

	printf("Tokens No Punc: %d\n", totalNoPunc);
	printf("Correct No Punc: %d\n", corrNoPunc);
	printf("Unlabeled Accuracy No Punc: %.2lf%%\n", ((double)corrNoPunc) * 100 / totalNoPunc);
	printf("Unlabeled Complete Correct No Punc: %.2lf%%\n", ((double)corrsentNoPunc) *100 / numsent);
	if(labeled){
		printf("Labeled Accuracy No Punc: %.2lf%%\n", ((double)corrLNoPunc) * 100 / totalNoPunc);
		printf("Labeled Complete Correct No Punc: %.2lf%%\n", ((double)corrsentLNoPunc) * 100 / numsent);
	}

	printf("\n");

	printf("Tokens No Punc Root: %d\n", totalNoPunc_root);
	printf("Correct No Punc Root: %d\n", corrNoPunc_root);
	printf("Unlabeled Accuracy No Punc Root: %.2lf%%\n", ((double)corrNoPunc_root) * 100 / totalNoPunc_root);
	if(labeled){
		printf("Labeled Accuracy No Punc Root: %.2lf%%\n", ((double)corrLNoPunc_root) * 100 / totalNoPunc_root);
	}

	printf("\n");

	printf("Tokens No Punc Non Root: %d\n", totalNoPunc_non_root);
	printf("Correct No Punc Non Root: %d\n", corrNoPunc_non_root);
	printf("Unlabeled Accuracy No Punc Non Root: %.2lf%%\n", ((double)corrNoPunc_non_root) * 100 / totalNoPunc_non_root);
	if(labeled){
		printf("Labeled Accuracy No Punc Non Root: %.2lf%%\n", ((double)corrLNoPunc_non_root) * 100 / totalNoPunc_non_root);
	}

	goldReader->finishReading();
	predictedReader->finishReading();
	delete(goldReader);
	delete(predictedReader);
	return ((double)corr) / total;
}
Ejemplo n.º 9
0
void M2_p2o1::each_train_one_iter()
{
	//per-sentence approach
	int num_sentences = training_corpus->size();
	//statistics
	int skip_sent_num = 0;
	int all_forward_instance = 0;
	int all_inst_right = 0;
	int all_inst_wrong = 0;
	//some useful info
	int odim = mach->get_odim();
	//training
	time_t now;
	time(&now); //ctime is not rentrant ! use ctime_r() instead if needed
	cout << "##*** //p2o1// Start the training for iter " << cur_iter << " at " << ctime(&now)
			<< "with lrate " << cur_lrate << endl;
	cout << "#Sentences is " << num_sentences << " and resample (about)" << num_sentences*hp->CONF_NN_resample << endl;
	for(int i=0;i<num_sentences;){
		//random skip (instead of shuffling every time)
		if(drand48() > hp->CONF_NN_resample){
			skip_sent_num ++;
			i ++;
			continue;
		}

		mach->prepare_batch();
		//if nesterov update before each batch (pre-update)
		if(hp->CONF_NESTEROV_MOMENTUM)
			mach->nesterov_update(hp->CONF_UPDATE_WAY,hp->CONF_MOMENTUM_ALPHA);
		//main batch
		int this_sentence = 0;
		int this_instance = 0;
		int this_tokens = 0;
		for(;;){
			//forward
			DependencyInstance* x = training_corpus->at(i);
			nn_input* the_inputs;
			REAL *fscores = forward_scores_o1(x,mach,&the_inputs,dict->get_helper(),0,hp);
			double* rscores = 0;
			double* tmp_marginals = 0;

			this_instance += the_inputs->get_numi();
			all_forward_instance += the_inputs->get_numi();
			all_inst_right += the_inputs->inst_good;
			all_inst_wrong += the_inputs->inst_bad;
			this_sentence ++;
			this_tokens += x->length()-1;
			i++;

			adjust_scores_before(the_inputs, fscores, odim, hp->CONF_margin);
			//two situations
			int length = x->length();
			if(!hp->CONF_labeled){
				//calculate prob
				rscores = rearrange_scores_o1(x,mach,the_inputs,fscores,0,0,hp);
				tmp_marginals = encodeMarginals(length,rscores);
			}
			else{
				//calculate prob
				rscores = rearrange_scores_o1(x,mach,the_inputs,fscores,0,0,hp);
				tmp_marginals = LencodeMarginals(length,rscores,mach->get_odim());
			}
			adjust_scores_after(the_inputs, fscores, odim, hp->CONF_margin);

			//set gradients
			int HERE_dim = the_inputs->num_width;
			REAL* to_assign = fscores;
			for(int ii=0;ii<the_inputs->num_inst*HERE_dim;ii+=HERE_dim){
				int tmph = the_inputs->inputs->at(ii);
				int tmpm = the_inputs->inputs->at(ii+1);
				int tmp_goal = the_inputs->goals->at(ii/HERE_dim);
				for(int once=0;once<odim;once++,to_assign++){
					if(tmp_goal == once)
						*to_assign = -1 * (1 - tmp_marginals[get_index2(length,tmph,tmpm,once,odim)]) + *to_assign * hp->CONF_score_p2reg;
					else
						*to_assign = tmp_marginals[get_index2(length,tmph,tmpm,once,odim)] + *to_assign * hp->CONF_score_p2reg;	//now object is maximum
				}
			}

			//backward
			mach->backward(fscores);

			//mach->check_gradients(the_inputs);

			delete the_inputs;
			delete []fscores;
			delete []rscores;
			delete []tmp_marginals;

			//out of the mini-batch
			if(i>=num_sentences)
				break;
			if(hp->CONF_minibatch > 0){
				if(this_sentence >= hp->CONF_minibatch)
					break;
			}
			else{
				if(this_instance >= -1*hp->CONF_minibatch)
					break;
			}
		}
		//real update
		if(hp->CONF_mbatch_way == 1)
			mach->set_this_mbsize(this_tokens*this_tokens);
		else if(hp->CONF_mbatch_way == 2)
			mach->set_this_mbsize(this_sentence*this_sentence);
		mach->update(hp->CONF_UPDATE_WAY,cur_lrate,hp->CONF_NN_WD,hp->CONF_MOMENTUM_ALPHA,hp->CONF_RMS_SMOOTH);
	}
	cout << "Iter done, skip " << skip_sent_num << " sentences and f&b " << all_forward_instance
			<< ";good/bad: " << all_inst_right << "/" << all_inst_wrong << endl;
}
Ejemplo n.º 10
0
/*
 * Builds the training data (inputs in `data`, 0/1 labels in `target`) for one
 * iteration of the grandparent second-order (o2g) model.
 *
 * The candidate (g,h,m) instances are enumerated only on the first call and
 * cached in function-local statics, because scoring every sentence with the
 * first-order model is expensive. Candidates fall into three groups:
 *   right - the gold configuration,
 *   bad   - pruned by the o1 filter or by the g-between-h-and-m rule (dropped),
 *   wrong - every other candidate (negative samples).
 * On each call all right instances plus the first CONF_NN_resample share of the
 * (shuffled) wrong instances are copied into `data` and shuffled with `target`.
 *
 * An o1 filter is mandatory: without it the function prints a message and
 * terminates the process.
 */
void Method8_O2g::each_prepare_data_oneiter()
{
	delete []data;
	delete []target;
	delete []gradient;
	//for gradient
	gradient = new REAL[mach->GetWidth()*mach->GetOdim()];
	mach->SetGradOut(gradient);
	int sentences = training_corpus->size();
	int idim = mach->GetIdim();

	//only one time when o1_filter(decoding o1 is quite expensive)
	static REAL* data_right = 0;
	static REAL* data_wrong = 0;
	static int tmpall_right=0;
	static int tmpall_wrong=0;
	static int tmpall_bad=0;

	//************WE MUST SPECIFY O1_FILTER****************//
	const bool use_o1_filter = (parameters->CONF_NN_highO_o1mach.length() > 0 && parameters->CONF_NN_highO_o1filter);
	if(!use_o1_filter){
		cout << "No o1-filter for o2g, too expensive!!" << endl;
		exit(1);
	}
	//************WE MUST SPECIFY O1_FILTER****************//

	// From here on the o1 filter is known to be active, so the cache is built
	// exactly once and kept for later iterations.
	if(data_right==0){
		//sweep all once and count
		// NOTE(review): feat_temp_o1 is never freed in this function; confirm
		// who owns its resources before adding a delete.
		FeatureGenO1* feat_temp_o1 = new FeatureGenO1(dict,parameters->CONF_x_window,
						parameters->CONF_add_distance,parameters->CONF_add_pos,parameters->CONF_add_distance_parent);
		double** all_scores_o1 = new double*[sentences];
		int all_tokens_train=0,all_token_filter_wrong=0;
		for(int i=0;i<sentences;i++){
			DependencyInstance* x = training_corpus->at(i);
			all_scores_o1[i] = get_scores_o1(x,parameters,mach_o1,feat_temp_o1);
			double* scores_o1_filter = all_scores_o1[i];
			all_tokens_train += x->length();
			for(int i2=1;i2<x->length();i2++){	//ignore root
				// gold arcs that the o1 filter would prune
				if(score_noprob(scores_o1_filter[get_index2(x->length(),x->heads->at(i2),i2)]))
					all_token_filter_wrong ++;
			}
		}
		cout << "For o1 filter: all " << all_tokens_train << ";filter wrong " << all_token_filter_wrong << endl;
		time_t now;
		time(&now);cout << "#Finish o1-filter at " << ctime(&now) << flush;

		// first sweep: count the instances of each group
		for(int i=0;i<sentences;i++){
			DependencyInstance* x = training_corpus->at(i);
			double* scores_o1_filter = all_scores_o1[i];
			int length = x->length();
			for(int m=1;m<length;m++){
				//first special (0,0,m)
				if(x->heads->at(m) == 0)
					tmpall_right++;
				else if(score_noprob(scores_o1_filter[get_index2(length,0,m)]))
					tmpall_bad++;
				else
					tmpall_wrong++;
				//then (g,h,m)
				for(int h=1;h<length;h++){
					if(m==h)
						continue;
					int nope_hm = score_noprob(scores_o1_filter[get_index2(length,h,m)]);
					int link_hm = (x->heads->at(m)==h);
					int small = GET_MIN_ONE(m,h);
					int large = GET_MAX_ONE(m,h);
					for(int g=0;g<length;g++){
						if(g==h || g==m)
							continue;
						int nope_gh = score_noprob(scores_o1_filter[get_index2(length,g,h)]);
						// a gold configuration is always kept, even when g lies between h and m
						if(link_hm && x->heads->at(h)==g)
							tmpall_right++;
						else if(nope_hm || nope_gh || (g>=small && g<=large))	//no non-projective
							tmpall_bad++;
						else
							tmpall_wrong++;
					}
				}
			}
		}
		printf("--Stat:%d,%d,%d\n",tmpall_right,tmpall_wrong,tmpall_bad);

		//sweep second time and adding them
		//-allocate (sizes computed in size_t so count*idim cannot overflow int)
		data_right = new REAL[static_cast<size_t>(tmpall_right)*idim];
		data_wrong = new REAL[static_cast<size_t>(tmpall_wrong)*idim];
		REAL* assign_right = data_right;
		REAL* assign_wrong = data_wrong;
		// the conditions below must mirror the counting sweep exactly
		for(int i=0;i<sentences;i++){
			DependencyInstance* x = training_corpus->at(i);
			int length = x->length();
			double* scores_o1_filter = all_scores_o1[i];
			for(int m=1;m<length;m++){
				//first special (0,0,m)
				if(x->heads->at(m) == 0){
					feat_gen->fill_one(assign_right,x,0,m,0);assign_right += idim;
				}
				else if(score_noprob(scores_o1_filter[get_index2(length,0,m)])){}
				else{
					feat_gen->fill_one(assign_wrong,x,0,m,0);assign_wrong += idim;
				}
				//then (g,h,m)
				for(int h=1;h<length;h++){
					if(m==h)
						continue;
					int nope_hm = score_noprob(scores_o1_filter[get_index2(length,h,m)]);
					int link_hm = (x->heads->at(m)==h);
					int small = GET_MIN_ONE(m,h);
					int large = GET_MAX_ONE(m,h);
					for(int g=0;g<length;g++){
						if(g==h || g==m)
							continue;
						int nope_gh = score_noprob(scores_o1_filter[get_index2(length,g,h)]);
						if(link_hm && x->heads->at(h)==g){
							feat_gen->fill_one(assign_right,x,h,m,g);assign_right += idim;
						}
						else if(nope_hm || nope_gh || (g>=small && g<=large))	//no non-projective
						{}
						else{
							feat_gen->fill_one(assign_wrong,x,h,m,g);assign_wrong += idim;
						}
					}
				}
			}
		}

		for(int i=0;i<sentences;i++){
			delete [](all_scores_o1[i]);
		}
		delete []all_scores_o1;
		time(&now);cout << "#Finish data-gen at " << ctime(&now) << flush;
	}

	//then considering CONF_NN_resample and copy them to finish data
	if(parameters->CONF_NN_resample < 1){
		//get part of the wrong ones --- but first shuffle them
		shuffle_data(data_wrong,data_wrong,idim,idim,tmpall_wrong*idim,tmpall_wrong*idim,10);
	}

	// Number of wrong instances used this iteration. It is fixed as an integer
	// once and used for both the allocation and the copy: a byte count computed
	// in floating point could exceed the truncated buffer size and overrun
	// `data`. It is also clamped to the instances that actually exist.
	int wrong_used = static_cast<int>(tmpall_wrong*parameters->CONF_NN_resample);
	if(wrong_used < 0)
		wrong_used = 0;
	if(wrong_used > tmpall_wrong)
		wrong_used = tmpall_wrong;
	const int tmp_sumup = tmpall_right + wrong_used;
	const size_t right_elems = static_cast<size_t>(tmpall_right)*idim;
	const size_t wrong_elems = static_cast<size_t>(wrong_used)*idim;

	data = new REAL[right_elems+wrong_elems];
	target = new REAL[tmp_sumup];
	memcpy(data,data_right,right_elems*sizeof(REAL));
	memcpy(data+right_elems,data_wrong,wrong_elems*sizeof(REAL));
	// right instances come first and are labelled 1, the wrong ones 0
	for(int i=0;i<tmp_sumup;i++){
		if(i<tmpall_right)
			target[i] = 1;
		else
			target[i] = 0;
	}
	shuffle_data(data,target,idim,1,tmp_sumup*idim,tmp_sumup,10);	//final shuffle
	cout << "--Data for this iter(M8:o2g): samples all " << tmpall_right+tmpall_wrong << " resample: " << tmp_sumup << endl;
	current = 0;
	end = tmp_sumup;
}