int PartialSegmentationUtils::split_by_partial_tag( const std::string& line, std::vector<std::string>& words) { size_t offset1 = line.find(__partial_start__); size_t offset2 = line.find(__word_start__); if (offset1 == std::string::npos && offset2 == std::string::npos) { // 0 representing no partial tags. split with the original tags. words = split(line); return 0; } size_t offset = 0; size_t prelude = 0, coda = 0, len_start_tag = 0, len_end_tag = 0; while (offset < line.length()) { prelude = (offset1 < offset2 ? offset1 : offset2); std::string word = line.substr(offset, prelude - offset); if (word.length() > 0) { words.push_back( word ); } if (offset1 < offset2) { coda = line.find(__partial_end__, prelude); len_start_tag = __partial_start__.length(); len_end_tag = __partial_end__.length(); } else { coda = line.find(__word_end__, prelude); len_start_tag = __word_start__.length(); len_end_tag = __word_end__.length(); } word = line.substr(prelude + len_start_tag, coda - prelude - len_start_tag); if ((word.find(__partial_start__) != std::string::npos) || (word.find(__partial_end__) != std::string::npos) || (word.find(__word_start__) != std::string::npos) || (word.find(__word_end__) != std::string::npos)) { return -1; } words.push_back( line.substr(prelude, coda - prelude + len_end_tag) ); offset = coda + len_end_tag; offset1 = line.find(__partial_start__, offset); offset2 = line.find(__word_start__, offset); if (offset1 == std::string::npos && offset2 == std::string::npos) { // 0 representing no partial tags. word = line.substr(offset); if (word.length() > 0) { words.push_back( word ); } break; } } return 1; }
/**
 * Read one line from the input stream and turn it into an Instance.
 *
 * The line is trimmed and split into tokens with split(). When `with_tag`
 * is set, each token is passed to rsplit_by_sep(token, delimiter, 1) and
 * must yield exactly two parts (form, tag); otherwise the token is taken as
 * the form itself.
 *
 * @return  A newly allocated Instance (the caller receives the raw pointer),
 *          or 0 when the stream is at EOF, the trimmed line is empty, or a
 *          token does not yield two parts in tagged mode (the offending
 *          token is written to std::cerr).
 */
Instance* PostaggerReader::next() {
  if (is.eof()) {
    return 0;
  }

  // Progress trace, emitted every `interval` calls when tracing is enabled.
  ++cursor;
  if (trace) {
    if (cursor % interval == 0) {
      INFO_LOG("reading: read %d0%% instances.", (cursor/ interval));
    }
  }

  Instance* inst = new Instance;

  std::string buffer;
  std::getline(is, buffer);
  trim(buffer);

  if (buffer.size() == 0) {
    delete inst;
    return 0;
  }

  std::vector<std::string> tokens = split(buffer);
  const size_t n_tokens = tokens.size();

  for (size_t idx = 0; idx < n_tokens; ++idx) {
    std::string& token = tokens[idx];

    if (!with_tag) {
      // Untagged input: the token itself is the form.
      inst->raw_forms.push_back(token);
      inst->forms.push_back(sbc2dbc_x(token));
      continue;
    }

    std::vector<std::string> parts = rsplit_by_sep(token, delimiter, 1);
    if (parts.size() != 2) {
      // Malformed token: report it and give up on this instance.
      std::cerr << token << std::endl;
      delete inst;
      return 0;
    }

    inst->raw_forms.push_back(parts[0]);
    inst->forms.push_back(sbc2dbc_x(parts[0]));
    inst->tags.push_back(parts[1]);
  }

  return inst;
}
/**
 * Read the next block of non-empty lines from the input stream into an
 * Instance.
 *
 * Every instance starts with a pseudo ROOT entry (SpecialOption::ROOT for
 * the string fields, -1 as head). Each following line is split with split()
 * and must provide at least 8 columns; columns 1, 2, 3, 6 and 7 (0-based)
 * are stored as form, lemma, postag, head and deprel. Reading stops at the
 * first line that is empty after trimming, or at EOF.
 *
 * @return  A newly allocated Instance (the caller receives the raw pointer),
 *          or NULL when the stream is at EOF, when no token line was read,
 *          or when a line has fewer than 8 columns (a warning is logged and
 *          the partially built instance is discarded).
 */
Instance* CoNLLReader::next() {
  if (is.eof()) {
    return NULL;
  }

  Instance* inst = new Instance;
  std::string line;

  // Pseudo ROOT entry at index 0.
  inst->raw_forms.push_back( SpecialOption::ROOT );
  inst->forms.push_back( SpecialOption::ROOT );
  inst->lemmas.push_back( SpecialOption::ROOT );
  inst->postags.push_back( SpecialOption::ROOT );
  inst->heads.push_back( -1 );
  inst->deprels.push_back( SpecialOption::ROOT );

  while (!is.eof()) {
    getline(is, line);
    trim(line);

    // An empty line ends the current instance.
    if (line.size() == 0) {
      break;
    }

    std::vector<std::string> items = split(line);
    if (items.size() < 8) {
      // Indexing items[1]..items[7] below would read out of bounds, so the
      // malformed instance is dropped instead of being half-filled.
      WARNING_LOG("Unknown conll format file");
      delete inst;
      return NULL;
    }

    inst->raw_forms.push_back( items[1] );        // items[1]: form
    inst->forms.push_back( sbc2dbc(items[1]) );
    inst->lemmas.push_back( items[2] );           // items[2]: lemma
    inst->postags.push_back( items[3] );          // items[3]: postag
    inst->heads.push_back( to_int(items[6]) );    // items[6]: head
    inst->deprels.push_back( items[7] );          // items[7]: deprel
  }

  // Only the ROOT entry is present: nothing was read.
  if (inst->forms.size() == 1) {
    delete inst;
    inst = NULL;
  }

  return inst;
}