void PartialSegmentationUtils::trim_partial_tag(const std::string& input, std::string& output) {
  // Strip the surrounding annotation markers from a tagged token and store the
  // bare token in `output`; tokens without a leading marker pass through as-is.
  if (startswith(input, __word_start__)) {
    const size_t head = __word_start__.length();
    const size_t tail = __word_end__.length();
    output = input.substr(head, input.length() - head - tail);
  } else if (startswith(input, __partial_start__)) {
    const size_t head = __partial_start__.length();
    const size_t tail = __partial_end__.length();
    output = input.substr(head, input.length() - head - tail);
  } else {
    output = input;
  }
}
NERTransitionConstrain::NERTransitionConstrain(const utility::IndexableSmartMap& alphabet,
    const std::vector<std::string>& includes): T(alphabet.size()) {
  // Parse each "FROM->TO" rule and record the permitted transition as a single
  // flattened index (from * T + to) in `rep`. Malformed rules and unknown
  // labels are reported and skipped.
  for (size_t idx = 0; idx < includes.size(); ++ idx) {
    const std::string& rule = includes[idx];
    std::vector<std::string> tokens = split_by_sep(rule, "->", 1);
    if (2 != tokens.size()) {
      WARNING_LOG("constrain text \"%s\" is in illegal format.", rule.c_str());
      continue;
    }
    const std::string lhs = trim_copy(tokens[0]);
    const std::string rhs = trim_copy(tokens[1]);
    const int from = alphabet.index(lhs);
    const int to = alphabet.index(rhs);
    if (from == -1 || to == -1) {
      WARNING_LOG("label in constrain text \"%s,%s\" is not in alphabet.",
          lhs.c_str(), rhs.c_str());
    } else {
      rep.insert(from * T + to);
    }
  }
}
int PartialSegmentationUtils::split_by_partial_tag( const std::string& line, std::vector<std::string>& words) { size_t offset1 = line.find(__partial_start__); size_t offset2 = line.find(__word_start__); if (offset1 == std::string::npos && offset2 == std::string::npos) { // 0 representing no partial tags. split with the original tags. words = split(line); return 0; } size_t offset = 0; size_t prelude = 0, coda = 0, len_start_tag = 0, len_end_tag = 0; while (offset < line.length()) { prelude = (offset1 < offset2 ? offset1 : offset2); std::string word = line.substr(offset, prelude - offset); if (word.length() > 0) { words.push_back( word ); } if (offset1 < offset2) { coda = line.find(__partial_end__, prelude); len_start_tag = __partial_start__.length(); len_end_tag = __partial_end__.length(); } else { coda = line.find(__word_end__, prelude); len_start_tag = __word_start__.length(); len_end_tag = __word_end__.length(); } word = line.substr(prelude + len_start_tag, coda - prelude - len_start_tag); if ((word.find(__partial_start__) != std::string::npos) || (word.find(__partial_end__) != std::string::npos) || (word.find(__word_start__) != std::string::npos) || (word.find(__word_end__) != std::string::npos)) { return -1; } words.push_back( line.substr(prelude, coda - prelude + len_end_tag) ); offset = coda + len_end_tag; offset1 = line.find(__partial_start__, offset); offset2 = line.find(__word_start__, offset); if (offset1 == std::string::npos && offset2 == std::string::npos) { // 0 representing no partial tags. word = line.substr(offset); if (word.length() > 0) { words.push_back( word ); } break; } } return 1; }
Instance* PostaggerReader::next() {
  // Read one whitespace-separated sentence per line. With `with_tag` each
  // token is split into form<delimiter>tag from the right; otherwise the
  // token is the raw form. Returns 0 on EOF, a blank line, or a token that
  // fails to split into exactly two parts.
  if (is.eof()) { return 0; }
  cursor ++;
  if (trace && cursor % interval == 0) {
    INFO_LOG("reading: read %d0%% instances.", (cursor/ interval));
  }
  Instance* inst = new Instance;
  std::string line;
  std::getline(is, line);
  trim(line);
  if (0 == line.size()) {
    delete inst;
    return 0;
  }
  const std::vector<std::string> tokens = split(line);
  for (size_t t = 0; t < tokens.size(); ++ t) {
    const std::string& token = tokens[t];
    if (!with_tag) {
      inst->raw_forms.push_back(token);
      inst->forms.push_back(sbc2dbc_x(token));
      continue;
    }
    std::vector<std::string> parts = rsplit_by_sep(token, delimiter, 1);
    if (parts.size() != 2) {
      std::cerr << token << std::endl;
      delete inst;
      return 0;
    }
    inst->raw_forms.push_back(parts[0]);
    inst->forms.push_back(sbc2dbc_x(parts[0]));
    inst->tags.push_back(parts[1]);
  }
  return inst;
}
Instance* CoNLLReader::next() {
  // Read one blank-line-terminated CoNLL sentence. Position 0 holds a pseudo
  // ROOT token. Returns NULL on EOF or when no real token was read.
  if (is.eof()) { return NULL;}
  Instance* inst = new Instance;
  std::string line;
  inst->raw_forms.push_back( SpecialOption::ROOT );
  inst->forms.push_back( SpecialOption::ROOT );
  inst->lemmas.push_back( SpecialOption::ROOT );
  inst->postags.push_back( SpecialOption::ROOT );
  inst->heads.push_back( -1 );
  inst->deprels.push_back( SpecialOption::ROOT );
  while (!is.eof()) {
    getline(is, line);
    trim(line);
    if (line.size() == 0) { break; }
    std::vector<std::string> items = split(line);
    if (items.size() < 8) {
      // FIX: the original warned but then fell through and indexed
      // items[1..7] anyway — out-of-bounds (undefined behavior) on a
      // malformed row. Skip the row after warning instead.
      WARNING_LOG("Unknown conll format file");
      continue;
    }
    inst->raw_forms.push_back( items[1] );      // items[1]: form
    inst->forms.push_back( sbc2dbc(items[1]) );
    inst->lemmas.push_back( items[2] );         // items[2]: lemma
    inst->postags.push_back( items[3] );        // items[3]: postag
    inst->heads.push_back( to_int(items[6]) );  // items[6]: head index
    inst->deprels.push_back( items[7] );        // items[7]: dependency relation
  }
  // Only the ROOT placeholder was filled: treat as end of input.
  if (inst->forms.size() == 1) { delete inst; inst = NULL; }
  return inst;
}
void CoNLLWriter::write(const Instance& inst) { size_t len = inst.size(); bool predicted = (inst.predict_heads.size() == len && inst.predict_deprels.size() == len); for (size_t i = 1; i < len; ++ i) { f << i << "\t" // 0 - index << inst.raw_forms[i] << "\t" // 1 - form << inst.lemmas[i] << "\t" // 2 - lemma << inst.postags[i] << "\t" // 3 - postag << "_\t" // 4 - unknown << "_\t" // 5 - unknown << inst.heads[i] << "\t" // 6 - heads << inst.deprels[i] << "\t" // 7 - deprels << (predicted ? to_str(inst.predict_heads[i]) : "_") << "\t" << (predicted ? inst.predict_deprels[i] : "_") << std::endl; } f << std::endl; }
// Train the postagger with averaged-perceptron-style online updates.
// Per iteration: decode each training instance, update the parameters on the
// feature difference between gold and predicted tag sequences, evaluate a
// (possibly truncated) copy of the model, and dump it to
// "<model_name>.<iter>". Logs the best-scoring iteration at the end.
void PostaggerFrontend::train(void) {
  // read in training instance
  INFO_LOG("trace: reading reference dataset ...");
  if (!read_instances(train_opt.train_file.c_str())) {
    // NOTE(review): execution continues after this error — there is no early
    // return, so training proceeds on an empty dataset. Confirm intended.
    ERROR_LOG("Training file doesn't exist.");
  }
  INFO_LOG("trace: %d sentences is loaded.", train_dat.size());
  model = new Model(Extractor::num_templates());
  // build tag dictionary, map string tag to index
  INFO_LOG("report: start building configuration ...");
  build_configuration();
  INFO_LOG("report: build configuration is done.");
  INFO_LOG("report: number of postags: %d", model->labels.size());
  // build feature space from the training instance
  INFO_LOG("report: start building feature space ...");
  build_feature_space();
  INFO_LOG("report: building feature space is done.");
  INFO_LOG("report: number of features: %d", model->space.num_features());
  model->param.realloc(model->space.dim());
  INFO_LOG("report: allocate %d dimensition parameter.", model->space.dim());
  int nr_groups = model->space.num_groups();
  std::vector<int> groupwise_update_counters;
  if (train_opt.rare_feature_threshold > 0) {
    // NOTE(review): groupwise_update_counters is resized here but never used
    // below — the update loop passes `update_counts` (declared later, never
    // resized) to increase_groupwise_update_counts / erase_rare_features.
    // One of the two vectors looks vestigial; verify against those helpers.
    groupwise_update_counters.resize(nr_groups, 0);
    INFO_LOG("report: allocate %d update-time counters", nr_groups);
  } else {
    INFO_LOG("report: model truncation is inactived.");
  }
  int best_iteration = -1;
  double best_p = -1.;
  std::vector<size_t> update_counts;
  for (int iter = 0; iter < train_opt.max_iter; ++ iter) {
    INFO_LOG("Training iteraition #%d", (iter + 1));
    // NOTE(review): interval is 0 when train_dat.size() < 10, which makes the
    // `% interval` below divide by zero — confirm inputs are always >= 10
    // sentences or guard this.
    size_t interval= train_dat.size() / 10;
    for (size_t i = 0; i < train_dat.size(); ++ i) {
      Instance* inst = train_dat[i];
      // Decode the current instance with the current parameters.
      extract_features((*inst), &ctx, false);
      calculate_scores((*inst), ctx, false, &scm);
      decoder.decode(scm, inst->predict_tagsidx);
      // Gather features fired by the gold and the predicted tag sequences.
      collect_features(model, ctx.uni_features, inst->tagsidx,
          ctx.correct_features);
      collect_features(model, ctx.uni_features, inst->predict_tagsidx,
          ctx.predict_features);
      // Perceptron-style update direction: gold features minus predicted.
      SparseVec updated_features;
      updated_features.add(ctx.correct_features, 1.);
      updated_features.add(ctx.predict_features, -1.);
      // NOTE(review): the timestamp `iter*train_dat.size() + 1` is constant
      // within an epoch; an averaged update usually wants the per-instance
      // position (e.g. `+ i + 1`). Confirm against learn()'s contract.
      learn(train_opt.algorithm, updated_features,
          iter*train_dat.size() + 1, inst->num_errors(), model);
      if (train_opt.rare_feature_threshold > 0) {
        increase_groupwise_update_counts(model, updated_features,
            update_counts);
      }
      ctx.clear();
      if ((i+1) % interval == 0) {
        INFO_LOG("training: %d0%% (%d) instances is trained.",
            ((i+1)/interval), i+1);
      }
    }
    INFO_LOG("trace: %d instances is trained.", train_dat.size());
    model->param.flush( train_dat.size() * (iter + 1) );
    // Build a truncated copy for evaluation/saving, while keeping the full
    // model for the next training iteration.
    // NOTE(review): this runs even when rare_feature_threshold <= 0 and
    // update_counts is empty — presumably erase_rare_features then copies
    // everything; verify.
    Model* new_model = new Model(Extractor::num_templates());
    erase_rare_features(model, new_model, train_opt.rare_feature_threshold,
        update_counts);
    // First swap: evaluate() reads `model`, so point it at the trimmed copy.
    std::swap(model, new_model);
    double p;
    evaluate(p);
    if(p > best_p){
      best_p = p;
      best_iteration = iter;
    }
    std::string saved_model_file = (train_opt.model_name+ "."+ to_str(iter));
    std::ofstream ofs(saved_model_file.c_str(), std::ofstream::binary);
    // Second swap: restore the full model into `model`; the trimmed copy
    // (now in new_model) is saved to disk and discarded.
    std::swap(model, new_model);
    new_model->save(model_header, Parameters::kDumpAveraged, ofs);
    delete new_model;
    INFO_LOG("trace: model for iteration #%d is saved to %s",
        iter+1, saved_model_file.c_str());
  }
  INFO_LOG("Best result (iteration = %d) : P = %lf",
      best_iteration, best_p);
}
bool PartialSegmentationUtils::is_partial_tagged_word(const std::string& word) {
  // A token counts as a tagged word when it begins with the word-start marker.
  const bool has_word_marker = startswith(word, __word_start__);
  return has_word_marker;
}
Instance* SegmentReader::next() {
  // Read one sentence per line. In `segmented` mode the line is whitespace-
  // split into words and each word is expanded character-by-character with
  // BIES-style tags (__s__ for single-character words, __b__/__i__/__e__
  // otherwise); in raw mode the whole line is preprocessed as one character
  // sequence without tags. Returns 0 on EOF, a blank line, or a
  // preprocessing failure.
  if (is.eof()) { return 0; }
  cursor ++;
  if (trace && cursor % interval == 0) {
    INFO_LOG("reading: read %d0%% instances.", (cursor/ interval));
  }
  Instance* inst = new Instance;
  std::string line;
  std::getline(is, line);
  trim(line);
  if (line.size() == 0) { delete inst; return 0; }
  if (segmented) {
    std::vector<std::string> words = strutils::split(line);
    inst->words = words;
    for (size_t i = 0; i < words.size(); ++ i) {
      int num_chars = preprocessor.preprocess(words[i], inst->raw_forms,
          inst->forms, inst->chartypes);
      if (num_chars < 0) {
        delete inst;
        return 0;
      }
      // FIX: the original compared the signed `num_chars` against the
      // unsigned loop index (`j < num_chars`, `num_chars - 1 == j`).
      // num_chars is known non-negative here, so cast once.
      const size_t nchars = static_cast<size_t>(num_chars);
      for (size_t j = 0; j < nchars; ++ j) {
        if (1 == nchars) {
          inst->tags.push_back( __s__ );
        } else if (0 == j) {
          inst->tags.push_back( __b__ );
        } else if (nchars - 1 == j) {
          inst->tags.push_back( __e__ );
        } else {
          inst->tags.push_back( __i__ );
        }
      }
    }
  } else {
    int ret = preprocessor.preprocess(line, inst->raw_forms, inst->forms,
        inst->chartypes);
    if (ret < 0) {
      delete inst;
      return 0;
    }
  }
  return inst;
}