/**
 * Read one line from the input stream and convert it into an Instance.
 *
 * The line is trimmed and split into tokens. When `with_tag` is set, each
 * token is split once from the right on `delimiter` into a form and a tag;
 * otherwise the whole token is taken as the form. Each form is stored both
 * as-is (raw_forms) and after passing through sbc2dbc_x (forms).
 *
 * @return  A newly allocated Instance owned by the caller, or 0 when the
 *          stream is already at EOF, the trimmed line is empty, or (in
 *          tagged mode) a token does not split into exactly two parts.
 */
Instance* PostaggerReader::next() {
  if (is.eof()) {
    return 0;
  }

  // Progress reporting: one message every `interval` calls when tracing.
  ++cursor;
  if (trace && cursor % interval == 0) {
    INFO_LOG("reading: read %d0%% instances.", (cursor/ interval));
  }

  Instance* result = new Instance;

  std::string buffer;
  std::getline(is, buffer);
  trim(buffer);
  if (buffer.empty()) {
    delete result;
    return 0;
  }

  std::vector<std::string> tokens = split(buffer);
  for (size_t k = 0; k < tokens.size(); ++k) {
    if (!with_tag) {
      // Untagged input: the token itself is the word form.
      result->raw_forms.push_back(tokens[k]);
      result->forms.push_back(sbc2dbc_x(tokens[k]));
      continue;
    }

    // Tagged input: split once from the right into <form, tag>.
    std::vector<std::string> parts = rsplit_by_sep(tokens[k], delimiter, 1);
    if (parts.size() != 2) {
      // Report the offending token and give up on this line.
      std::cerr << tokens[k] << std::endl;
      delete result;
      return 0;
    }

    result->raw_forms.push_back(parts[0]);
    result->forms.push_back(sbc2dbc_x(parts[0]));
    result->tags.push_back(parts[1]);
  }

  return result;
}
/**
 * Read the next sentence (a run of non-empty lines) from a CoNLL-style
 * stream and convert it into an Instance.
 *
 * Index 0 of every per-token vector holds a pseudo ROOT entry
 * (SpecialOption::ROOT, head -1); real tokens follow in input order.
 * Reading stops at the first line that is empty after trimming, or when the
 * stream can supply no more lines.
 *
 * Each token line is split into columns and must have at least 8 of them.
 * A shorter line is logged and the partially built instance is discarded;
 * indexing into it would read past the end of the column vector. Note that
 * in that case the remaining lines of the malformed sentence are left
 * unread in the stream.
 *
 * @return  A newly allocated Instance owned by the caller, or NULL when the
 *          stream is at EOF, no token line was read, or a malformed token
 *          line was encountered.
 */
Instance* CoNLLReader::next() {
  if (is.eof()) {
    return NULL;
  }

  Instance* inst = new Instance;

  // Pseudo ROOT token occupying position 0.
  inst->raw_forms.push_back(SpecialOption::ROOT);
  inst->forms.push_back(SpecialOption::ROOT);
  inst->lemmas.push_back(SpecialOption::ROOT);
  inst->postags.push_back(SpecialOption::ROOT);
  inst->heads.push_back(-1);
  inst->deprels.push_back(SpecialOption::ROOT);

  std::string line;
  // Drive the loop by the read result rather than eof(): a stream that fails
  // without reaching EOF would otherwise leave `line` untouched and spin
  // forever re-adding the previous token.
  while (std::getline(is, line)) {
    trim(line);
    if (line.size() == 0) {
      break;  // blank line terminates the sentence
    }

    std::vector<std::string> items = split(line);
    if (items.size() < 8) {
      // Columns 1..7 are read below; refuse lines that do not have them.
      WARNING_LOG("Unknown conll format file");
      delete inst;
      return NULL;
    }

    inst->raw_forms.push_back(items[1]);        // items[1]: form
    inst->forms.push_back(sbc2dbc(items[1]));
    inst->lemmas.push_back(items[2]);           // items[2]: lemma
    inst->postags.push_back(items[3]);          // items[3]: postag
    inst->heads.push_back(to_int(items[6]));    // items[6]: head index
    inst->deprels.push_back(items[7]);          // items[7]: dependency relation
  }

  // Only the ROOT placeholder is present: nothing was read.
  if (inst->forms.size() == 1) {
    delete inst;
    inst = NULL;
  }
  return inst;
}
/**
 * Read one line from the input stream and convert it into a segmentation
 * Instance.
 *
 * In unsegmented mode the whole trimmed line is handed to the preprocessor,
 * which fills raw_forms, forms and chartypes.
 *
 * In segmented mode the line is split into words (also stored in
 * inst->words). Each word is preprocessed separately, and one tag is
 * appended per character reported by the preprocessor: __s__ for a
 * single-character word, otherwise __b__ for the first character, __e__ for
 * the last and __i__ for those in between.
 *
 * @return  A newly allocated Instance owned by the caller, or 0 when the
 *          stream is already at EOF, the trimmed line is empty, or the
 *          preprocessor returns a negative value.
 */
Instance* SegmentReader::next() {
  if (is.eof()) {
    return 0;
  }

  // Progress reporting: one message every `interval` calls when tracing.
  ++cursor;
  if (trace && cursor % interval == 0) {
    INFO_LOG("reading: read %d0%% instances.", (cursor/ interval));
  }

  Instance* result = new Instance;

  std::string buffer;
  std::getline(is, buffer);
  trim(buffer);
  if (buffer.empty()) {
    delete result;
    return 0;
  }

  if (!segmented) {
    // Raw text: preprocess the whole line, no tags are produced.
    int status = preprocessor.preprocess(buffer, result->raw_forms,
                                         result->forms, result->chartypes);
    if (status < 0) {
      delete result;
      return 0;
    }
    return result;
  }

  std::vector<std::string> tokens = strutils::split(buffer);
  result->words = tokens;

  for (size_t w = 0; w < tokens.size(); ++w) {
    int len = preprocessor.preprocess(tokens[w], result->raw_forms,
                                      result->forms, result->chartypes);
    if (len < 0) {
      delete result;
      return 0;
    }

    if (len == 1) {
      result->tags.push_back( __s__ );
      continue;
    }

    // Multi-character word: begin, inside..., end. (len == 0 adds nothing.)
    for (int c = 0; c < len; ++c) {
      if (c == 0) {
        result->tags.push_back( __b__ );
      } else if (c == len - 1) {
        result->tags.push_back( __e__ );
      } else {
        result->tags.push_back( __i__ );
      }
    }
  }

  return result;
}