std::vector<SVMData> read_data(std::string file_name){ std::vector<SVMData> output; std::ifstream f_in(file_name, std::ios_base::in); if(!f_in.good()) return output; char line[256]; while(!f_in.eof() && f_in.getline(line, 256)){ SVMData svmData; if (analyze_features(line, svmData)) output.push_back(svmData); } return output; }
void Tagger::Init(int argc, char * argv[]) { setlocale(LC_CTYPE, "iso_8858_1"); #ifdef SpecialMalloc /* Force fast allocation */ set_small_allocation(100); #endif /* Clear data structures */ InitDict(dict); InitDict(skip_dict); InitTrans(trans); InitTrans(c_newtrans); odictfile = otranfile = NULL; /* Verify command line */ if (argc <= 2) error_exit("Usage: label corpus options\n"); /* Form options */ InitOptions; set_up_options(argc, argv, &iterations, &initialise, &dict_size, dictname, tranname, odictname, otranname, outname, mapname, skipname, reducename, fsmname, grammarname, infername, ukwname, ofeaturesname, obadwordname, bdbmname, runkstatname, wunkstatname); any_output = !no_output || Option(report_stats) || OutOpt(prob_dist); /* Open BDBM dictionary */ if (Option(bdbm)){ /* Berkeley DB: first of all need to create the dbp data structure*/ if((ret = db_create(&dbp, NULL, 0)) != 0) { fprintf(stderr, "db_create: %s\n", db_strerror(ret)); exit (1); } /* Berkeley DB: Then you open it, readonly */ if((ret = dbp->open(dbp,bdbmname, NULL, DB_BTREE, DB_RDONLY, 0777)) != 0) { dbp->err(dbp, ret, "%s", bdbmname); exit(1); } } /* Read mappings */ if (Option(verbose)) printf("Read mappings\n"); read_mapping(mapname); /* Read tag reduction mappings */ if (Option(reduced_tags)) { if (Option(verbose)) printf("Read reduced tag set\n"); read_reduce_mapping(reducename); } #ifdef Use_Parser /* Read parse rules */ if (Option(use_parser)) { if (Option(verbose)) printf("Read parse rules\n"); parser_read_named(grammarname); } #endif #ifdef Use_FSM /* Read FSM definitions */ if (Option(use_fsm)) { if (Option(verbose)) printf("Read FSMs\n"); fsm_read_named(fsmname); } #endif /* Read skip list */ if (Option(skip_list)) { if (Option(verbose)) printf("Read skip list\n"); read_named_dict(skipname, &skip_dict, -1); } /* Read unknown word rules */ if (Option(unknown_rules)) { if (Option(verbose)) printf("Read unknown word rules\n"); read_unknown_rules(ukwname); } /* Set up dictionary [note]:it costs a few seconds*/ if (dictname[0] == 0) { create_dict(&dict, dict_size); clear_dict(&dict); } else { if (Option(verbose)) printf("Read dictionary\n"); read_named_dict(dictname, &dict, -1); if (infername[0] != 0) { if (Option(verbose)) printf("Read inference rules\n"); infer_tags((char *)infername, &dict); } } /* Set up transitions [note] it costs a few seconds*/ if (tranname[0] == 0) { create_trans(&trans, tags_all); clear_trans_all(&trans); } else { if (Option(verbose)) printf("Read transitions\n"); read_named_ascii_trans(tranname, &trans); /* Analyze selected features of lexicon to generate tag probabilities for unknown words. */ if ( Option(unknown_morph) || Option(unknown_rules)) { /* Initialize feature values */ Allocate(features->featuretags, sizeof(FeatureTagSt), "features->featuretags: main"); features->featuretags->next_open_slot = 0; features->gamma = trans.gamma; if ( features->maxsuffix == 0 ) features->maxsuffix = MinSuffixLen; if ( features->maxunkwords == 0 ) features->maxunkwords = MAXUNKWORDS; if ( features->maxprefcut == 0 ) features->maxprefcut = MinPrefixLen; if ( features->maxsuffcut == 0 ) features->maxsuffcut = MinSuffixLen; unknown_word_handling_initialization(); gather_unigram_freqs( &dict ); } if ( Option(unknown_morph) ) { analyze_features( &dict, ofeaturesname, obadwordname, &trans, dbp, &dict, runkstatname ); } } set_special_words(&dict, features ); /* Create space for re-estimation or training */ if (Option(reestimate) || Option(training)) { c_newtrans.gamma = trans.gamma; /* Share arrays */ create_trans(&c_newtrans, tags_all); } if (odictname[0] != 0) odictfile = open_file(odictname, "w"); if (otranname[0] != 0) otranfile = open_file(otranname, "w"); /* Set up anchor word */ set_anchor(&dict); adjust_dict(&dict, trans.gamma, FALSE); adjust_trans(&trans, NULL); }