/*
 * read_corpus_seq -- read a corpus plus its "-seq.dat" sequence file and
 * partition the documents into per-epoch corpora.
 *
 * name: basename of the corpus; the sequence file is "<name>-seq.dat".
 * Returns a newly allocated corpus_seq_t owning the corpus_t slices; the
 * doc_t structures are shared with (re-linked from) the raw corpus.
 * Exits on any I/O or format error.
 */
corpus_seq_t* read_corpus_seq(const char* name)
{
    char filename[400];
    corpus_seq_t* corpus_seq = (corpus_seq_t*) malloc(sizeof(corpus_seq_t));

    // read the flat corpus; its documents are re-linked into the sequence below
    corpus_t* raw_corpus = read_corpus(name);
    corpus_seq->nterms = raw_corpus->nterms;

    // read sequence information (snprintf guards against oversized names)
    snprintf(filename, sizeof(filename), "%s-seq.dat", name);
    outlog("Reading corpus sequence %s.", filename);
    FILE* fileptr = fopen(filename, "r");
    if (!fileptr)
    {
        outlog("Error opening dtm sequence file %s.\n", filename);
        exit(1);
    }
    if (fscanf(fileptr, "%d", &(corpus_seq->len)) != 1)
    {
        outlog("Error reading sequence length from %s.\n", filename);
        exit(1);
    }
    corpus_seq->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * corpus_seq->len);

    // allocate corpora: one corpus_t per time slice, pointing into the
    // raw corpus' documents in order
    int doc_idx = 0;
    int ndocs, i, j;
    corpus_seq->ndocs = 0;
    for (i = 0; i < corpus_seq->len; ++i)
    {
        if (fscanf(fileptr, "%d", &ndocs) != 1)
        {
            outlog("Error reading document count for slice %d from %s.\n",
                   i, filename);
            exit(1);
        }
        corpus_seq->ndocs += ndocs;
        corpus_seq->corpus[i] = (corpus_t*) malloc(sizeof(corpus_t));
        corpus_seq->corpus[i]->ndocs = ndocs;
        corpus_seq->corpus[i]->doc = (doc_t**) malloc(sizeof(doc_t*) * ndocs);
        for (j = 0; j < ndocs; j++)
        {
            if (doc_idx >= raw_corpus->ndocs)
            {
                outlog("Error: too few documents listed in dtm sequence file %s.\n"
                       "Current line: %d %d %d.\n",
                       filename, doc_idx, ndocs, j);
                exit(1);
            }
            // outlog("%d %d %d %d\n", i, j, doc_idx, raw_corpus->ndocs);
            corpus_seq->corpus[i]->doc[j] = raw_corpus->doc[doc_idx];
            doc_idx++;
        }
    }
    fclose(fileptr); // was leaked in the original

    // The documents are now owned by corpus_seq; free the raw corpus'
    // container (pointer array + struct), but not the documents themselves.
    free(raw_corpus->doc);
    free(raw_corpus);

    corpus_seq->max_nterms = compute_max_nterms(corpus_seq);
    outlog("read corpus of length %d\n", corpus_seq->len);
    return(corpus_seq);
}
//init --- right after construction, but here use some virtual functions void Process::nn_train_prepare() { dev_results = new double[parameters->CONF_NN_ITER]; //2. get training corpus --- configured for training cout << "2.read-corpus:" << endl; training_corpus = read_corpus(parameters->CONF_train_file); dev_test_corpus = 0; //2.5 if continue/restart --- also init some values ---- CAN ONLY CONTINUE WITH ONE MACH METHOD cout << "2.5:let's see whether to continue" << endl; read_restart_conf(); //3. get dictionary --- whether continue to train if(CTL_continue){ cout << "3.get dict from file "<< parameters->CONF_dict_file << endl; dict = new Dict(parameters->CONF_dict_file); } else{ cout << "3.get dict from scratch:" << endl; dict = new Dict(parameters->CONF_dict_remove,parameters->CONF_add_distance,parameters->CONF_oov_backoff,parameters->CONF_dict_tolower); //this is not good, but... /*****************NOT GOOD*******************/ if(parameters->CONF_add_direction) dict->construct_dictionary(training_corpus,(void*)1); else dict->construct_dictionary(training_corpus); dict->write(parameters->CONF_dict_file); } //3.5 get the feature generator each_get_featgen(0); /*************virtual****************/ //4. get machine string mach_cur_name = parameters->CONF_mach_name+parameters->CONF_mach_cur_suffix; if(CTL_continue){ cout << "4.get mach from file "<< mach_cur_name << endl; mach = NNInterface::Read(mach_cur_name); } else{ cout << "4.get mach from scratch:" << endl; mach = NNInterface::create_one(parameters,feat_gen,each_get_mach_outdim()); //if init embed init_embed(); } cout << "-Prepare over..." << endl; }
double Process::nn_dev_test(string to_test,string output,string gold) { time_t now; //also assuming test-file itself is gold file(this must be true with dev file) dev_test_corpus = read_corpus(to_test); each_get_featgen(1); /*************virtual****************/ int token_num = 0; //token number int miss_count = 0; time(&now); cout << "#--Test at " << ctime(&now) << std::flush; for(int i=0;i<dev_test_corpus->size();i++){ DependencyInstance* t = dev_test_corpus->at(i); int length = t->forms->size(); token_num += length - 1; vector<int>* ret = each_test_one(t); /*************virtual****************/ for(int i2=1;i2<length;i2++){ //ignore root if((*ret)[i2] != (*(t->heads))[i2]) miss_count ++; } delete t->heads; t->heads = ret; } time(&now); cout << "#--Finish at " << ctime(&now) << std::flush; write_corpus(dev_test_corpus,output); string ttt; double rate = (double)(token_num-miss_count) / token_num; cout << "Evaluate:" << (token_num-miss_count) << "/" << token_num << "(" << rate << ")" << endl; DependencyEvaluator::evaluate(gold,output,ttt,false); //clear for(int i=0;i<dev_test_corpus->size();i++){ delete dev_test_corpus->at(i); } delete dev_test_corpus; return rate; }
/*
 * read_corpus_file -- open a (possibly bz2/gz compressed) corpus file and
 * parse it via read_corpus().
 *
 * flags:    parsing flags forwarded to read_corpus().
 * filename: path; ".bz2"/".gz" suffixes are streamed through bzcat/zcat.
 * Returns the parsed corpus; exits on open/alloc failure.
 */
corpus_type *read_corpus_file(corpusflags_type *flags, const char* filename)
{
  const char* filesuffix = strrchr(filename, '.');
  char *command = NULL;
  FILE *in;
  corpus_type *corpus;
  /* filesuffix is NULL when the name has no '.'; guard before strcasecmp
   * (the original dereferenced NULL for suffix-less filenames) */
  if (filesuffix != NULL && strcasecmp(filesuffix, ".bz2") == 0) {
    const char bzcat[] = "bzcat ";
    command = malloc(sizeof(bzcat)+strlen(filename)+1);
    if (command == NULL) {
      fprintf(stderr, "## Error: couldn't allocate command buffer for %s\n", filename);
      exit(EXIT_FAILURE);
    }
    strcpy(command, bzcat);
    strcat(command, filename);
    in = popen(command, "r");
  }
  else if (filesuffix != NULL && strcasecmp(filesuffix, ".gz") == 0) {
    const char zcat[] = "zcat ";
    command = malloc(sizeof(zcat)+strlen(filename)+1);
    if (command == NULL) {
      fprintf(stderr, "## Error: couldn't allocate command buffer for %s\n", filename);
      exit(EXIT_FAILURE);
    }
    strcpy(command, zcat);
    strcat(command, filename);
    in = popen(command, "r");
  }
  else
    in = fopen(filename, "r");
  if (in == NULL) {
    if (command == NULL)
      fprintf(stderr, "## Error: couldn't open corpus file %s\n", filename);
    else
      fprintf(stderr, "## Error: couldn't popen command %s\n", command);
    exit(EXIT_FAILURE);
  }
  corpus = read_corpus(flags, in);
  /* command != NULL implies the stream came from popen, so close accordingly */
  if (command != NULL) {
    free(command);
    pclose(in);
  }
  else
    fclose(in);
  return corpus;
}  /* read_corpus_file() */
double Process::nn_dev_test(string to_test,string output,string gold,int dev) { time_t now; //also assuming test-file itself is gold file(this must be true with dev file) dev_test_corpus = read_corpus(to_test); dict->prepare_corpus(dev_test_corpus,1); //get those indexes int token_num = 0; //token number int miss_count = 0; time(&now); cout << "#--Test at " << ctime(&now) << std::flush; for(unsigned int i=0;i<dev_test_corpus->size();i++){ DependencyInstance* t = dev_test_corpus->at(i); int length = t->forms->size(); token_num += length - 1; each_test_one(t,dev); /*************virtual****************/ for(int i2=1;i2<length;i2++){ //ignore root if((*(t->predict_heads))[i2] != (*(t->heads))[i2]) miss_count ++; } } time(&now); cout << "#--Finish at " << ctime(&now) << std::flush; dict->prepare_deprel_str(dev_test_corpus); //get deprel's strings write_corpus(dev_test_corpus,output); string ttt; double rate = (double)(token_num-miss_count) / token_num; cout << "Evaluate:" << (token_num-miss_count) << "/" << token_num << "(" << rate << ")" << endl; DependencyEvaluator::evaluate(gold,output,ttt,hp->CONF_labeled); //clear for(unsigned int i=0;i<dev_test_corpus->size();i++){ delete dev_test_corpus->at(i); } delete dev_test_corpus; return rate; }
/*
 * parseall -- run a trained model over the given test sections and write
 * gold/model CoNLL files ("<modelname>.gold.conll" / "<modelname>.model.conll").
 *
 * model:               opaque trained perceptron handle.
 * path:                corpus root directory.
 * test_sections_str:   range spec of sections to test on.
 * embedding_dimension: embedding size for corpus loading.
 */
void parseall(const void *model, const char* path, const char* test_sections_str, int embedding_dimension) {
    DArray *test_sections = parse_range(test_sections_str);
    signal(SIGINT, intHandler);
    log_info("Test sections to be used in %s: %s", path, join_range(test_sections));
    CoNLLCorpus test = create_CoNLLCorpus(path, test_sections, embedding_dimension, NULL);
    log_info("Reading test corpus");
    read_corpus(test, false);
    /* +13 covers the longer ".model.conll" suffix plus NUL */
    char* output_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 13));
    check_mem(output_filename);
    sprintf(output_filename, "%s.gold.conll", modelname);
    FILE *gold_fp = fopen(output_filename, "w");
    if (gold_fp == NULL) {
        log_err("Can not open %s for writing", output_filename);
        exit(1);
    }
    sprintf(output_filename, "%s.model.conll", modelname);
    FILE *model_fp = fopen(output_filename, "w");
    if (model_fp == NULL) {
        log_err("Can not open %s for writing", output_filename);
        fclose(gold_fp);
        exit(1);
    }
    ParserTestMetric test_metric = test_KernelPerceptronModel(model, test, true, gold_fp, model_fp);
    fclose(gold_fp);
    fclose(model_fp);
    printParserTestMetric(test_metric);
    freeParserTestMetric(test_metric);
    free(output_filename);
    return;
error:
    log_err("Memory allocation error");
    exit(1);
}
//--------------------------------TRAIN-----------------------// //init for training --- right after construction, but here use some virtual functions void Process::nn_train_prepare() { //2. get training corpus --- configured for training cout << "2.read-corpus:" << endl; training_corpus = read_corpus(hp->CONF_train_file); dev_test_corpus = 0; //3. get dictionary and write if(hp->CONF_traindict_file.length() > 0){ cout << "3.get dict from " << hp->CONF_traindict_file << endl; dict = new Dictionary(hp->CONF_traindict_file); dict->write(hp->CONF_dict_file); } else{ cout << "3.get dict from scratch:" << endl; dict = new Dictionary(training_corpus,hp->CONF_dict_remove); dict->write(hp->CONF_dict_file); } dict->prepare_corpus(training_corpus); //get those indexes //4.create machine cout << "4.get mach from scratch:" << endl; each_create_machine(); /*************VIRTUAL************/ init_embed(); //possible init cout << "- Prepare over..." << endl; }
inline void BigramModel::process_corpus(std::string filename) { std::vector< std::vector<std::string> > corpus; read_corpus(corpus, filename); process_corpus(corpus); }
/*
 * optimize -- train a (kernel) perceptron parser on the training sections,
 * evaluating on the dev sections after each iteration and keeping the best.
 *
 * max_numit:           maximum number of training iterations.
 * max_rec:             per-iteration record cap forwarded to training.
 * path:                corpus root directory.
 * train_sections_str:  range spec for training sections.
 * dev_sections_str:    range spec for dev sections.
 * embedding_dimension: embedding size for corpus loading.
 * Returns the trained model as void* (PerceptronModel for KLINEAR,
 * KernelPerceptron otherwise). Exits on allocation failure.
 */
void* optimize(int max_numit, int max_rec, const char* path, const char* train_sections_str, const char* dev_sections_str, int embedding_dimension) {
    DArray *train_sections = parse_range(train_sections_str);
    DArray *dev_sections = parse_range(dev_sections_str);
    signal(SIGINT, intHandler);
    log_info("Development sections to be used in %s: %s", path, join_range(dev_sections));
    CoNLLCorpus dev = create_CoNLLCorpus(path, dev_sections, embedding_dimension, NULL);
    log_info("Training sections to be used in %s: %s", path, join_range(train_sections));
    CoNLLCorpus train = create_CoNLLCorpus(path, train_sections, embedding_dimension, NULL);
    log_info("Reading training corpus");
    read_corpus(train, false);
    log_info("Reading dev corpus");
    read_corpus(dev, false);
    /* per-iteration accuracy history, sized for the worst case */
    float *numit_dev_avg = (float*) malloc(sizeof (float)* max_numit);
    float *numit_train_avg = (float*) malloc(sizeof (float)*max_numit);
    check(numit_dev_avg != NULL, "Memory allocation failed for numit_dev_avg");
    check(numit_train_avg != NULL, "Memory allocation failed for numit_train_avg");
    PerceptronModel model = NULL;
    KernelPerceptron kmodel = NULL;
    if (kernel == KLINEAR) {
        log_info("Creating a averaged perceptron model");
        model = create_PerceptronModel(train->transformed_embedding_length, NULL);
    }
    else if (kernel == KPOLYNOMIAL)
        kmodel = create_PolynomialKernelPerceptron(polynomial_degree, bias);
    else
        kmodel = create_RBFKernelPerceptron(rbf_lambda);
    int numit;
    int best_iter = -1;
    float best_score = 0.0;
    for (numit = 1; numit <= max_numit && keepRunning; numit++) {
        log_info("BEGIN-TRAIN: Iteration %d", numit);
        if (kernel == KLINEAR)
            train_once_PerceptronModel(model, train, max_rec);
        else
            train_once_KernelPerceptronModel(kmodel, train, max_rec);
        log_info("END-TRAIN: Iteration %d", numit);
        ParserTestMetric dev_metric;
        log_info("BEGIN-TEST: Iteration %d", numit);
        /* NOTE(review): the KLINEAR branch passes a PerceptronModel to
         * test_KernelPerceptronModel — confirm this is the intended
         * polymorphic entry point and not a copy-paste slip. */
        if (kernel == KLINEAR)
            dev_metric = test_KernelPerceptronModel(model, dev, true, NULL, NULL);
        else
            dev_metric = test_KernelPerceptronModel(kmodel, dev, true, NULL, NULL);
        log_info("END-TEST: Iteration %d", numit);
        log_info("\nnumit=%d", numit);
        printParserTestMetric(dev_metric);
        double dev_acc = (dev_metric->without_punc->true_prediction * 1.) / dev_metric->without_punc->total_prediction;
        numit_dev_avg[numit - 1] = dev_acc;
        numit_train_avg[numit - 1] = 0.0;
        freeParserTestMetric(dev_metric);
        if (best_score < dev_acc) {
            if (best_score + MIN_DELTA > dev_acc)
                log_warn("Improvement is less than %f", MIN_DELTA);
            best_score = dev_acc;
            best_iter = numit;
            if (kernel == KLINEAR)
                mark_best_PerceptronModel(model, numit);
            else
                mark_best_KernelPerceptronModel(kmodel, numit);
        }
        if (numit - best_iter > MAX_IDLE_ITER && STOP_ON_CONVERGE) {
            log_info("No improvement in last %d iterations", MAX_IDLE_ITER);
            keepRunning = false;
        }
    }
    log_info("Iteration\tAccuracy(dev)\tAccuracy(train)");
    for (int i = 0; i < numit - 1; i++) {
        log_info("%d\t\t%f\t%f%s", i + 1, numit_dev_avg[i], numit_train_avg[i], (i + 1 == best_iter) ? " (*)" : "");
    }
    /* history arrays are purely local; release them before returning
     * (they were leaked in the original) */
    free(numit_dev_avg);
    free(numit_train_avg);
    //free_CoNLLCorpus(dev, true);
    //free_CoNLLCorpus(train, true);
    if (kernel == KLINEAR)
        return (void*) model;
    else
        return (void*) kmodel;
error:
    log_err("Memory allocation error");
    exit(1);
}
/*
 * optimize -- train a perceptron parser (simple or kernel variant chosen by
 * the global `type`/`kernel`) on the training sections, evaluating on the dev
 * sections each iteration and snapshotting the best model.
 *
 * max_numit:          maximum number of training iterations.
 * max_rec:            record cap for reading/training the training corpus.
 * path:               corpus root directory.
 * train_sections_str: range spec for training sections.
 * dev_sections_str:   range spec for dev sections.
 * Returns the trained Perceptron_t. Exits on allocation failure.
 */
Perceptron_t optimize(int max_numit, int max_rec, const char* path, const char* train_sections_str, const char* dev_sections_str) {
    DArray *train_sections = parse_range(train_sections_str);
    DArray *dev_sections = parse_range(dev_sections_str);
    signal(SIGINT, intHandler);
    log_info("Development sections to be used in %s: %s", path, join_range(dev_sections));
    CoNLLCorpus dev = create_CoNLLCorpus(path, dev_sections);
    log_info("Training sections to be used in %s: %s", path, join_range(train_sections));
    CoNLLCorpus train = create_CoNLLCorpus(path, train_sections);
    log_info("Reading training corpus");
    read_corpus(train, max_rec, false);
    log_info("Reading dev corpus");
    read_corpus(dev, -1, false);
    /* per-iteration history, sized for the worst case */
    float *numit_dev_avg = (float*) malloc(sizeof (float)* max_numit);
    float *numit_train_avg = (float*) malloc(sizeof (float)*max_numit);
    long *numit_num_sv = (long*) malloc(sizeof (long)*max_numit);
    check(numit_dev_avg != NULL, "Memory allocation failed for numit_dev_avg");
    check(numit_train_avg != NULL, "Memory allocation failed for numit_train_avg");
    check(numit_num_sv != NULL, "Memory allocation failed for numit_num_sv");
    Perceptron_t model = NULL;
    if (type == SIMPLE_PERCEPTRON) {
#ifdef OPTIMIZED_TRANSFORMATION
        log_info("Creating a averaged perceptron model with optimized feature transformation.");
        model = newSimplePerceptron(ft);
#else
        log_info("Creating a averaged perceptron model");
        model = newSimplePerceptron(NULL);
#endif
        //model = create_PerceptronModel(train->transformed_embedding_length, NULL);
    } else {
        if (kernel == POLYNOMIAL_KERNEL)
            model = newPolynomialKernelPerceptron(polynomial_degree, bias);
        //kmodel = newPolynomialKernelPerceptron(polynomial_degree, bias);
        else
            model = NULL; //kmodel = create_RBFKernelPerceptron(rbf_lambda);
    }
    int numit;
    int best_iter = -1;
    float best_score = 0.0;
    for (numit = 1; numit <= max_numit && keepRunning; numit++) {
        log_info("BEGIN-TRAIN: Iteration %d", numit);
        trainPerceptronOnce(model, train, max_rec);
        log_info("END-TRAIN: Iteration %d", numit);
        ParserTestMetric dev_metric;
        log_info("BEGIN-TEST: Iteration %d", numit);
        dev_metric = testPerceptron(model, dev, true, NULL, NULL);
        log_info("END-TEST: Iteration %d", numit);
        log_info("\nnumit=%d", numit);
        printParserTestMetric(dev_metric);
        double dev_acc = (dev_metric->without_punc->true_prediction * 1.) / dev_metric->without_punc->total_prediction;
        numit_dev_avg[numit - 1] = dev_acc;
        numit_train_avg[numit - 1] = 0.0;
        /* support-vector count only exists for the kernel variant */
        if ( model->type == KERNEL_PERCEPTRON )
            numit_num_sv[ numit - 1 ] = ((KernelPerceptron_t)model->pDeriveObj)->kernel->matrix->ncol;
        freeParserTestMetric(dev_metric);
        if (best_score < dev_acc) {
            if (best_score + MIN_DELTA > dev_acc)
                log_warn("Improvement is less than %f", MIN_DELTA);
            best_score = dev_acc;
            best_iter = numit;
            //mark_best_PerceptronModel(model, numit);
            EPARSE_CHECK_RETURN(snapshotBest(model));
        }
        if (numit - best_iter > MAX_IDLE_ITER && STOP_ON_CONVERGE) {
            log_info("No improvement in last %d iterations", MAX_IDLE_ITER);
            keepRunning = false;
        }
    }
    log_info("Iteration\tAccuracy(dev)\tAccuracy(train)\t# of SV");
    for (int i = 0; i < numit - 1; i++) {
        log_info("%d\t\t%f\t%f\t%ld%s", i + 1, numit_dev_avg[i], numit_train_avg[i], numit_num_sv[i], (i + 1 == best_iter) ? " (*)" : "");
    }
    /* history arrays are purely local; release them before returning
     * (they were leaked in the original) */
    free(numit_dev_avg);
    free(numit_train_avg);
    free(numit_num_sv);
    //free_CoNLLCorpus(dev, true);
    //free_CoNLLCorpus(train, true);
    return model;
error:
    log_err("Memory allocation error");
    exit(1);
}