示例#1
0
File: data.c  Project: Arttii/dtm
/*
 * Read a time-sliced corpus sequence. Loads the flat corpus "<name>" via
 * read_corpus(), then reads "<name>-seq.dat" (number of time slices,
 * followed by one document count per slice) and distributes the flat
 * corpus's documents across the slices in order. Exits on any I/O or
 * parse error. The caller owns the returned corpus_seq_t.
 */
corpus_seq_t* read_corpus_seq(const char* name)
{
    char filename[400];
    corpus_seq_t* corpus_seq = (corpus_seq_t*) malloc(sizeof(corpus_seq_t));

    // Read the flat corpus; its documents are regrouped by time slice below.
    corpus_t* raw_corpus = read_corpus(name);
    corpus_seq->nterms = raw_corpus->nterms;

    // Read sequence information from "<name>-seq.dat".
    sprintf(filename, "%s-seq.dat", name);
    outlog("Reading corpus sequence %s.", filename);
    FILE* fileptr = fopen(filename, "r");
    if (!fileptr) {
        outlog("Error opening dtm sequence file %s.\n", filename);
        exit(1);
    }
    // BUG FIX: check fscanf's result instead of proceeding on an
    // uninitialized length when the file is empty or malformed.
    if (fscanf(fileptr, "%d", &(corpus_seq->len)) != 1 || corpus_seq->len < 0) {
        outlog("Error reading sequence length from %s.\n", filename);
        exit(1);
    }
    corpus_seq->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * corpus_seq->len);

    // Allocate one corpus per time slice and hand it the next run of
    // documents from the flat corpus (doc pointers are shared, not copied).
    int doc_idx = 0;
    int ndocs, i, j;
    corpus_seq->ndocs = 0;
    for (i = 0; i < corpus_seq->len; ++i)
    {
        // BUG FIX: per-slice count was also read unchecked.
        if (fscanf(fileptr, "%d", &ndocs) != 1) {
            outlog("Error reading document count for slice %d from %s.\n", i, filename);
            exit(1);
        }
        corpus_seq->ndocs += ndocs;
        corpus_seq->corpus[i] = (corpus_t*) malloc(sizeof(corpus_t));
        corpus_seq->corpus[i]->ndocs = ndocs;
        corpus_seq->corpus[i]->doc = (doc_t**) malloc(sizeof(doc_t*) * ndocs);
        for (j = 0; j < ndocs; j++)
        {
            if (doc_idx >= raw_corpus->ndocs) {
                outlog("Error: too few documents listed in dtm sequence file %s.\n"
                       "Current  line: %d %d %d.\n",
                       filename,
                       doc_idx,
                       ndocs,
                       j);
                exit(1);
            }
            corpus_seq->corpus[i]->doc[j] = raw_corpus->doc[doc_idx];
            doc_idx++;
        }
    }
    // BUG FIX: the sequence file handle was never closed.
    fclose(fileptr);
    // NOTE(review): raw_corpus's shell (the struct and its doc array) is
    // still leaked here; the doc_t objects themselves live on inside
    // corpus_seq. Free the shell if read_corpus() heap-allocates it.
    corpus_seq->max_nterms = compute_max_nterms(corpus_seq);
    outlog("read corpus of length %d\n", corpus_seq->len);
    return(corpus_seq);
}
示例#2
0
// Training setup: runs right after construction. Loads the training corpus,
// decides whether this run continues a previous one, then builds or reloads
// the dictionary and the NN machine. Uses virtual hooks
// (each_get_featgen / each_get_mach_outdim) so subclasses can specialize.
void Process::nn_train_prepare()
{
	// One dev-set result slot per configured training iteration.
	dev_results = new double[parameters->CONF_NN_ITER];
	//2. get training corpus --- configured for training
	cout << "2.read-corpus:" << endl;
	training_corpus = read_corpus(parameters->CONF_train_file);
	dev_test_corpus = 0;	// no dev corpus yet; loaded at test time
	//2.5 if continue/restart --- also init some values ---- CAN ONLY CONTINUE WITH ONE MACH METHOD
	cout << "2.5:let's see whether to continue" << endl;
	read_restart_conf();	// presumably sets CTL_continue — verify
	//3. get dictionary --- reload it when continuing, rebuild otherwise
	if(CTL_continue){
		cout << "3.get dict from file "<< parameters->CONF_dict_file << endl;
		dict = new Dict(parameters->CONF_dict_file);
	}
	else{
		cout << "3.get dict from scratch:" << endl;
		dict = new Dict(parameters->CONF_dict_remove,parameters->CONF_add_distance,parameters->CONF_oov_backoff,parameters->CONF_dict_tolower);
		//this is not good, but...	/*****************NOT GOOD*******************/
		if(parameters->CONF_add_direction)
			dict->construct_dictionary(training_corpus,(void*)1);
		else
			dict->construct_dictionary(training_corpus);
		dict->write(parameters->CONF_dict_file);
	}
	//3.5 get the feature generator; 0 presumably selects training mode
	// (the test path elsewhere passes 1) — verify
	each_get_featgen(0);			/*************virtual****************/
	//4. get machine: reload the current snapshot when continuing,
	// otherwise create a fresh one (and possibly initialize embeddings)
	string mach_cur_name = parameters->CONF_mach_name+parameters->CONF_mach_cur_suffix;
	if(CTL_continue){
		cout << "4.get mach from file "<< mach_cur_name << endl;
		mach = NNInterface::Read(mach_cur_name);
	}
	else{
		cout << "4.get mach from scratch:" << endl;
		mach = NNInterface::create_one(parameters,feat_gen,each_get_mach_outdim());
	    //if init embed
	    init_embed();
	}
	cout << "-Prepare over..." << endl;
}
示例#3
0
// Parse every sentence of `to_test`, write the predictions to `output`, and
// evaluate them against `gold`. Returns the head-attachment rate
// (correctly attached tokens / total tokens, root excluded).
double Process::nn_dev_test(string to_test,string output,string gold)
{
	time_t now;
	//also assuming test-file itself is gold file(this must be true with dev file)
	dev_test_corpus = read_corpus(to_test);
	each_get_featgen(1);	/*************virtual****************/ // 1 presumably selects test mode — verify
	int token_num = 0;	//token number (root tokens excluded)
	int miss_count = 0;	//wrongly attached tokens
	time(&now);
	cout << "#--Test at " << ctime(&now) << std::flush;
	for(int i=0;i<dev_test_corpus->size();i++){
		DependencyInstance* t = dev_test_corpus->at(i);
		int length = t->forms->size();
		token_num += length - 1;	// position 0 is the artificial root
		vector<int>* ret = each_test_one(t);		/*************virtual****************/
		// Count mismatches against the gold heads, then swap the
		// predicted heads into the instance so write_corpus emits them.
		for(int i2=1;i2<length;i2++){	//ignore root
			if((*ret)[i2] != (*(t->heads))[i2])
				miss_count ++;
		}
		delete t->heads;
		t->heads = ret;	// instance takes ownership of the predictions
	}
	time(&now);
	cout << "#--Finish at " << ctime(&now) << std::flush;
	write_corpus(dev_test_corpus,output);
	string ttt;
	double rate = (double)(token_num-miss_count) / token_num;
	cout << "Evaluate:" << (token_num-miss_count) << "/" << token_num
			<< "(" << rate << ")" << endl;
	DependencyEvaluator::evaluate(gold,output,ttt,false);

	//clear: release the instances and the container for this test run
	//(ownership assumed — read_corpus presumably heap-allocates both; verify)
	for(int i=0;i<dev_test_corpus->size();i++){
		delete dev_test_corpus->at(i);
	}
	delete dev_test_corpus;
	return rate;
}
示例#4
0
/*
 * Open `filename` and read a corpus from it. Files ending in ".bz2" or
 * ".gz" (case-insensitive) are decompressed through a bzcat/zcat pipe;
 * anything else is opened directly. Exits on open failure. The caller
 * owns the returned corpus.
 */
corpus_type *read_corpus_file(corpusflags_type *flags, const char* filename) {
  const char* filesuffix = strrchr(filename, '.');
  char *command = NULL;
  FILE *in;
  corpus_type *corpus;
  /* BUG FIX: strrchr returns NULL when filename has no '.', and the
   * original passed that NULL straight into strcasecmp (undefined
   * behavior). Suffix-less names now fall through to plain fopen. */
  if (filesuffix != NULL && strcasecmp(filesuffix, ".bz2") == 0) {
    const char bzcat[] = "bzcat ";
    /* sizeof(bzcat) already counts the NUL; the +1 is harmless slack. */
    command = malloc(sizeof(bzcat)+strlen(filename)+1);
    if (command == NULL) {
      fprintf(stderr, "## Error: out of memory building command for %s\n", filename);
      exit(EXIT_FAILURE);
    }
    strcpy(command, bzcat);
    strcat(command, filename);
    in = popen(command, "r");
  }
  else if (filesuffix != NULL && strcasecmp(filesuffix, ".gz") == 0) {
    const char zcat[] = "zcat ";
    command = malloc(sizeof(zcat)+strlen(filename)+1);
    if (command == NULL) {
      fprintf(stderr, "## Error: out of memory building command for %s\n", filename);
      exit(EXIT_FAILURE);
    }
    strcpy(command, zcat);
    strcat(command, filename);
    in = popen(command, "r");
  }
  else
    in = fopen(filename, "r");
  if (in == NULL) {
    /* command != NULL distinguishes the popen path for the error text. */
    if (command == NULL) 
      fprintf(stderr, "## Error: couldn't open corpus file %s\n", filename);
    else
      fprintf(stderr, "## Error: couldn't popen command %s\n", command);
    exit(EXIT_FAILURE);
  }
  corpus = read_corpus(flags, in);
  /* Close with pclose for piped input, fclose for a plain file. */
  if (command != NULL) {
    free(command);
    pclose(in);
  }
  else
    fclose(in);
  return corpus;
}  /* read_corpus_file() */
示例#5
0
// Parse every sentence of `to_test`, write the predictions to `output`, and
// evaluate them against `gold` (labeled or not per hp->CONF_labeled).
// Returns the head-attachment rate (correct heads / tokens, root excluded).
// `dev` is forwarded to the virtual each_test_one hook.
double Process::nn_dev_test(string to_test,string output,string gold,int dev)
{
	time_t now;
	//also assuming test-file itself is gold file(this must be true with dev file)
	dev_test_corpus = read_corpus(to_test);
	dict->prepare_corpus(dev_test_corpus,1);	//get those indexes
	int token_num = 0;	//token number (root tokens excluded)
	int miss_count = 0;	//wrongly attached tokens
	time(&now);
	cout << "#--Test at " << ctime(&now) << std::flush;
	for(unsigned int i=0;i<dev_test_corpus->size();i++){
		DependencyInstance* t = dev_test_corpus->at(i);
		int length = t->forms->size();
		token_num += length - 1;	// position 0 is the artificial root
		each_test_one(t,dev);		/*************virtual****************/
		// Predictions land on the instance (predict_heads); compare
		// against the gold heads, skipping the root.
		for(int i2=1;i2<length;i2++){	//ignore root
			if((*(t->predict_heads))[i2] != (*(t->heads))[i2])
				miss_count ++;
		}
	}
	time(&now);
	cout << "#--Finish at " << ctime(&now) << std::flush;
	dict->prepare_deprel_str(dev_test_corpus);	//get deprel's strings
	write_corpus(dev_test_corpus,output);
	string ttt;
	double rate = (double)(token_num-miss_count) / token_num;
	cout << "Evaluate:" << (token_num-miss_count) << "/" << token_num
			<< "(" << rate << ")" << endl;
	DependencyEvaluator::evaluate(gold,output,ttt,hp->CONF_labeled);

	//clear: release the instances and the container for this test run
	//(ownership assumed — read_corpus presumably heap-allocates both; verify)
	for(unsigned int i=0;i<dev_test_corpus->size();i++){
		delete dev_test_corpus->at(i);
	}
	delete dev_test_corpus;
	return rate;
}
示例#6
0
/*
 * Parse the given test sections with `model`, writing the gold trees to
 * "<modelname>.gold.conll" and the predictions to "<modelname>.model.conll",
 * then print the parsing metrics. Exits on allocation or file-open failure.
 */
void parseall(const void *model, const char* path, const char* test_sections_str, int embedding_dimension) {
    DArray *test_sections = parse_range(test_sections_str);

    /* Let SIGINT interrupt long runs (handler defined elsewhere). */
    signal(SIGINT, intHandler);

    log_info("Test sections to be used in %s: %s", path, join_range(test_sections));

    CoNLLCorpus test = create_CoNLLCorpus(path, test_sections, embedding_dimension, NULL);

    log_info("Reading test corpus");
    read_corpus(test, false);
    
    /* +13 covers the longer suffix ".model.conll" (12 chars) plus NUL. */
    char* output_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 13));
    check_mem(output_filename);

    sprintf(output_filename, "%s.gold.conll", modelname);
    FILE *gold_fp = fopen(output_filename, "w");
    /* BUG FIX: fopen results were previously passed on unchecked. */
    if (gold_fp == NULL) {
        log_err("Failed to open gold output file %s", output_filename);
        exit(1);
    }

    sprintf(output_filename, "%s.model.conll", modelname);
    FILE *model_fp = fopen(output_filename, "w");
    if (model_fp == NULL) {
        log_err("Failed to open model output file %s", output_filename);
        exit(1);
    }
    ParserTestMetric test_metric = test_KernelPerceptronModel(model, test, true, gold_fp, model_fp);
    fclose(gold_fp);
    fclose(model_fp);

    printParserTestMetric(test_metric);
    freeParserTestMetric(test_metric);

    free(output_filename);

    return;
error:
    /* Jumped to by check_mem on allocation failure. */
    log_err("Memory allocation error");

    exit(1);


}
示例#7
0
//--------------------------------TRAIN-----------------------//
// Training setup: runs right after construction. Loads the training corpus,
// obtains the dictionary (from a configured file or built from the corpus),
// indexes the corpus, and creates the machine via a virtual hook.
void Process::nn_train_prepare()
{
	//2. get training corpus --- configured for training
	cout << "2.read-corpus:" << endl;
	training_corpus = read_corpus(hp->CONF_train_file);
	dev_test_corpus = 0;	// no dev corpus yet
	//3. get dictionary and write it out for later runs
	if(hp->CONF_traindict_file.length() > 0){
		// A pre-built dictionary file was configured; load it.
		cout << "3.get dict from " << hp->CONF_traindict_file << endl;
		dict = new Dictionary(hp->CONF_traindict_file);
		dict->write(hp->CONF_dict_file);
	}
	else{
		// Otherwise build the dictionary from the training corpus.
		cout << "3.get dict from scratch:" << endl;
		dict = new Dictionary(training_corpus,hp->CONF_dict_remove);
		dict->write(hp->CONF_dict_file);
	}
	dict->prepare_corpus(training_corpus);	//get those indexes
	//4.create machine (subclass-specific)
	cout << "4.get mach from scratch:" << endl;
	each_create_machine();		/*************VIRTUAL************/
	init_embed();	//possible init of embeddings
	cout << "- Prepare over..." << endl;
}
inline void BigramModel::process_corpus(std::string filename)
{
  std::vector< std::vector<std::string> > corpus;
  read_corpus(corpus, filename);
  process_corpus(corpus);
}
示例#9
0
/*
 * Train a perceptron dependency parser for up to max_numit iterations,
 * scoring each iteration on the dev sections and marking the best one.
 * Stops early after MAX_IDLE_ITER iterations without improvement (when
 * STOP_ON_CONVERGE is set) or on SIGINT. Returns the trained model as
 * void*: a PerceptronModel when the global `kernel` is KLINEAR,
 * otherwise a KernelPerceptron — the caller casts accordingly.
 */
void* optimize(int max_numit, int max_rec, const char* path, const char* train_sections_str, const char* dev_sections_str, int embedding_dimension) {
    DArray *train_sections = parse_range(train_sections_str);
    DArray *dev_sections = parse_range(dev_sections_str);

    /* Let SIGINT stop training gracefully via keepRunning. */
    signal(SIGINT, intHandler);

    log_info("Development sections to be used in %s: %s", path, join_range(dev_sections));

    CoNLLCorpus dev = create_CoNLLCorpus(path, dev_sections, embedding_dimension, NULL);

    log_info("Training sections to be used in %s: %s", path, join_range(train_sections));

    CoNLLCorpus train = create_CoNLLCorpus(path, train_sections, embedding_dimension, NULL);

    log_info("Reading training corpus");
    read_corpus(train, false);

    log_info("Reading dev corpus");
    read_corpus(dev, false);

    /* Per-iteration accuracy history for the summary table below
     * (train accuracy is recorded but currently always 0.0). */
    float *numit_dev_avg = (float*) malloc(sizeof (float)* max_numit);
    float *numit_train_avg = (float*) malloc(sizeof (float)*max_numit);

    check(numit_dev_avg != NULL, "Memory allocation failed for numit_dev_avg");
    check(numit_train_avg != NULL, "Memory allocation failed for numit_train_avg");

    PerceptronModel model = NULL;
    KernelPerceptron kmodel = NULL;
    if (kernel == KLINEAR){   
        log_info("Creating a averaged perceptron model");
        model = create_PerceptronModel(train->transformed_embedding_length, NULL);
    }
    else if (kernel == KPOLYNOMIAL)
        kmodel = create_PolynomialKernelPerceptron(polynomial_degree, bias);
    else
        kmodel = create_RBFKernelPerceptron(rbf_lambda);


    int numit;

    int best_iter = -1;
    float best_score = 0.0;

    for (numit = 1; numit <= max_numit && keepRunning; numit++) {
        log_info("BEGIN-TRAIN: Iteration %d", numit);

        if (kernel == KLINEAR)
            train_once_PerceptronModel(model, train, max_rec);
        else
            train_once_KernelPerceptronModel(kmodel, train, max_rec);


        log_info("END-TRAIN: Iteration %d", numit);

        ParserTestMetric dev_metric;
        log_info("BEGIN-TEST: Iteration %d", numit);
        
        /* NOTE(review): the linear branch passes a PerceptronModel into
         * test_KernelPerceptronModel — confirm this entry point is meant
         * to handle both model kinds. */
        if (kernel == KLINEAR)
            dev_metric = test_KernelPerceptronModel(model, dev, true, NULL, NULL);
        else
            dev_metric = test_KernelPerceptronModel(kmodel, dev, true, NULL, NULL);
        log_info("END-TEST: Iteration %d", numit);

        log_info("\nnumit=%d", numit);

        printParserTestMetric(dev_metric);

        /* Dev accuracy excluding punctuation. */
        double dev_acc = (dev_metric->without_punc->true_prediction * 1.) / dev_metric->without_punc->total_prediction;
        numit_dev_avg[numit - 1] = dev_acc;
        numit_train_avg[numit - 1] = 0.0;

        freeParserTestMetric(dev_metric);

        if (best_score < dev_acc) {
            if (best_score + MIN_DELTA > dev_acc)
                log_warn("Improvement is less than %f", MIN_DELTA);

            best_score = dev_acc;
            best_iter = numit;

            if (kernel == KLINEAR)
                mark_best_PerceptronModel(model, numit);
            else
                mark_best_KernelPerceptronModel(kmodel, numit);
        }

        /* Early stopping after MAX_IDLE_ITER stale iterations. */
        if (numit - best_iter > MAX_IDLE_ITER && STOP_ON_CONVERGE) {
            log_info("No improvement in last %d iterations", MAX_IDLE_ITER);
            keepRunning = false;
        }
    }

    log_info("Iteration\tAccuracy(dev)\tAccuracy(train)");
    for (int i = 0; i < numit - 1; i++) {
        log_info("%d\t\t%f\t%f%s", i + 1, numit_dev_avg[i], numit_train_avg[i], (i + 1 == best_iter) ? " (*)" : "");
    }

    /* BUG FIX: the per-iteration history buffers were leaked. */
    free(numit_dev_avg);
    free(numit_train_avg);

    //free_CoNLLCorpus(dev, true);
    //free_CoNLLCorpus(train, true);

    if (kernel == KLINEAR)
        return (void*) model;
    else
        return (void*) kmodel;

error:
    /* Jumped to by check() on allocation failure. */
    log_err("Memory allocation error");

    exit(1);

}
示例#10
0
/*
 * Train a perceptron parser (simple or kernel, per the globals `type` and
 * `kernel`) for up to max_numit iterations, evaluating on the dev sections
 * after each one and snapshotting the best-scoring model. Stops early after
 * MAX_IDLE_ITER stale iterations (when STOP_ON_CONVERGE) or on SIGINT.
 * Returns the trained Perceptron_t; the caller owns it.
 */
Perceptron_t optimize(int max_numit, int max_rec, const char* path, const char* train_sections_str, const char* dev_sections_str) {
    DArray *train_sections = parse_range(train_sections_str);
    DArray *dev_sections = parse_range(dev_sections_str);

    /* Let SIGINT stop training gracefully via keepRunning. */
    signal(SIGINT, intHandler);

    log_info("Development sections to be used in %s: %s", path, join_range(dev_sections));

    CoNLLCorpus dev = create_CoNLLCorpus(path, dev_sections);

    log_info("Training sections to be used in %s: %s", path, join_range(train_sections));

    CoNLLCorpus train = create_CoNLLCorpus(path, train_sections);

    log_info("Reading training corpus");
    read_corpus(train, max_rec, false);

    log_info("Reading dev corpus");
    read_corpus(dev, -1, false);    /* -1 presumably means "read all" — verify */

    /* Per-iteration history: dev/train accuracy and, for kernel models,
     * the support-vector count. */
    float *numit_dev_avg = (float*) malloc(sizeof (float)* max_numit);
    float *numit_train_avg = (float*) malloc(sizeof (float)*max_numit);
    long *numit_num_sv = (long*) malloc(sizeof (long)*max_numit);

    check(numit_dev_avg != NULL, "Memory allocation failed for numit_dev_avg");
    check(numit_train_avg != NULL, "Memory allocation failed for numit_train_avg");
    check(numit_num_sv != NULL, "Memory allocation failed for numit_num_sv");

    Perceptron_t model = NULL;
    
    if (type == SIMPLE_PERCEPTRON) {
        
        #ifdef	OPTIMIZED_TRANSFORMATION
        log_info("Creating a averaged perceptron model with optimized feature transformation.");
        model = newSimplePerceptron(ft);
        #else
        log_info("Creating a averaged perceptron model");
        model = newSimplePerceptron(NULL);
        #endif
        //model = create_PerceptronModel(train->transformed_embedding_length, NULL);
    }
    else {
        if (kernel == POLYNOMIAL_KERNEL)
            model = newPolynomialKernelPerceptron(polynomial_degree, bias);
            //kmodel = newPolynomialKernelPerceptron(polynomial_degree, bias);
        else
            model = NULL;
            //kmodel = create_RBFKernelPerceptron(rbf_lambda);
    }


    int numit;

    int best_iter = -1;
    float best_score = 0.0;

    for (numit = 1; numit <= max_numit && keepRunning; numit++) {
        log_info("BEGIN-TRAIN: Iteration %d", numit);

        trainPerceptronOnce(model, train, max_rec);

        log_info("END-TRAIN: Iteration %d", numit);

        ParserTestMetric dev_metric;
        log_info("BEGIN-TEST: Iteration %d", numit);

        dev_metric = testPerceptron(model, dev, true, NULL, NULL);
        
        log_info("END-TEST: Iteration %d", numit);

        log_info("\nnumit=%d", numit);

        printParserTestMetric(dev_metric);

        /* Dev accuracy excluding punctuation. */
        double dev_acc = (dev_metric->without_punc->true_prediction * 1.) / dev_metric->without_punc->total_prediction;
        numit_dev_avg[numit - 1] = dev_acc;
        numit_train_avg[numit - 1] = 0.0;

        if ( model->type == KERNEL_PERCEPTRON )
            numit_num_sv[ numit - 1 ] = ((KernelPerceptron_t)model->pDeriveObj)->kernel->matrix->ncol;	
        else
            /* BUG FIX: the summary table below printed this slot even for
             * non-kernel models, reading uninitialized memory. */
            numit_num_sv[ numit - 1 ] = 0;

        freeParserTestMetric(dev_metric);

        if (best_score < dev_acc) {
            if (best_score + MIN_DELTA > dev_acc)
                log_warn("Improvement is less than %f", MIN_DELTA);

            best_score = dev_acc;
            best_iter = numit;

            //mark_best_PerceptronModel(model, numit);
            
            EPARSE_CHECK_RETURN(snapshotBest(model));
        }

        /* Early stopping after MAX_IDLE_ITER stale iterations. */
        if (numit - best_iter > MAX_IDLE_ITER && STOP_ON_CONVERGE) {
            log_info("No improvement in last %d iterations", MAX_IDLE_ITER);
            keepRunning = false;
        }
    }

    log_info("Iteration\tAccuracy(dev)\tAccuracy(train)\t# of SV");
    for (int i = 0; i < numit - 1; i++) {
        log_info("%d\t\t%f\t%f\t%ld%s", i + 1, numit_dev_avg[i], numit_train_avg[i], numit_num_sv[i], (i + 1 == best_iter) ? " (*)" : "");
    }

    /* BUG FIX: the per-iteration history buffers were leaked. */
    free(numit_dev_avg);
    free(numit_train_avg);
    free(numit_num_sv);

    //free_CoNLLCorpus(dev, true);
    //free_CoNLLCorpus(train, true);

    return model;

error:
    /* Jumped to by check() on allocation failure. */
    log_err("Memory allocation error");

    exit(1);

}