// Builds the feature vectors attached to `path`.
//
// The feature strings of the left and right node are each rewritten into a
// (unigram, left, right) triple.  The unigram feature of the right node feeds
// buildUnigramFeature(); the right feature of the left node together with the
// left feature of the right node feeds buildBigramFeature().
//
// Returns false when either builder fails.  A failed rewrite goes through
// CHECK_DIE (macro defined elsewhere; the condition text is kept verbatim
// because the macro may echo it in its diagnostic).
bool DecoderFeatureIndex::buildFeature(LearnerPath *path) {
  // Reset the path cost and the word cost of the right node.
  path->cost = 0.0;
  path->rnode->wcost = path->cost;

  // Rewrite results for the left node (suffix 1) and the right node (suffix 2).
  std::string ufeature1, lfeature1, rfeature1;
  std::string ufeature2, lfeature2, rfeature2;

  CHECK_DIE(rewrite_.rewrite2(path->lnode->feature,
                              &ufeature1, &lfeature1, &rfeature1))
      << " cannot rewrite pattern: " << path->lnode->feature;

  CHECK_DIE(rewrite_.rewrite2(path->rnode->feature,
                              &ufeature2, &lfeature2, &rfeature2))
      << " cannot rewrite pattern: " << path->rnode->feature;

  // Short-circuit: the bigram builder only runs if the unigram builder succeeded.
  return buildUnigramFeature(path, ufeature2.c_str()) &&
         buildBigramFeature(path, rfeature1.c_str(), lfeature2.c_str());
}
bool LearnerTagger::connect(size_t pos, LearnerNode *_rNode) { for (LearnerNode *rNode = _rNode ; rNode; rNode = rNode->bnext) { for (LearnerNode *lNode = end_node_list_[pos]; lNode; lNode = lNode->enext) { LearnerPath *path = allocator_->newPath(); std::memset(path, 0, sizeof(Path)); path->rnode = rNode; path->lnode = lNode; path->fvector = 0; path->cost = 0.0; path->rnode = rNode; path->lnode = lNode; path->lnext = rNode->lpath; rNode->lpath = path; path->rnext = lNode->rpath; lNode->rpath = path; CHECK_DIE(feature_index_->buildFeature(path)); CHECK_DIE(path->fvector); } const size_t x = rNode->rlength + pos; rNode->enext = end_node_list_[x]; end_node_list_[x] = rNode; } return true; }
bool DictionaryRewriter::open(const char *filename, Iconv *iconv) { std::ifstream ifs(filename); CHECK_DIE(ifs) << "no such file or directory: " << filename; int append_to = 0; std::string line; while (std::getline(ifs, line)) { if (iconv) iconv->convert(&line); if (line.empty() || line[0] == '#') continue; if (line == "[unigram rewrite]") { append_to = 1; } else if (line == "[left rewrite]") { append_to = 2; } else if (line == "[right rewrite]") { append_to = 3; } else { CHECK_DIE(append_to != 0) << "no sections found"; char *str = const_cast<char *>(line.c_str()); switch (append_to) { case 1: append_rewrite_rule(&unigram_rewrite_, str); break; case 2: append_rewrite_rule(&left_rewrite_, str); break; case 3: append_rewrite_rule(&right_rewrite_, str); break; } } } return true; }
bool POSIDGenerator::open(const char *filename, Iconv *iconv) { scoped_ptr<std::istream> p_ist; const jma::DictUnit* dict = jma::JMA_Dictionary::instance()->getDict(filename); if(dict) p_ist.reset(new std::istrstream(dict->text_, dict->length_)); else p_ist.reset(new std::ifstream(filename)); if (!*p_ist) { std::cerr << filename << " is not found. minimum setting is used" << std::endl; rewrite_.resize(1); rewrite_.back().set_pattern("*", "1"); return true; } std::string line; char *col[2]; while (std::getline(*p_ist, line)) { if (iconv) iconv->convert(&line); const size_t n = tokenize2(const_cast<char *>(line.c_str()), " \t", col, 2); CHECK_DIE(n == 2) << "format error: " << line; for (char *p = col[1]; *p; ++p) { CHECK_DIE(*p >= '0' && *p <= '9') << "not a number: " << col[1]; } rewrite_.resize(rewrite_.size() + 1); rewrite_.back().set_pattern(col[0], col[1]); } return true; }
void enum_csv_dictionaries(const char *path, std::vector<std::string> *dics) { dics->clear(); #if defined(_WIN32) && !defined(__CYGWIN__) WIN32_FIND_DATA wfd; HANDLE hFind; const std::string pat = create_filename(path, "*.csv"); hFind = FindFirstFile(pat.c_str(), &wfd); CHECK_DIE(hFind != INVALID_HANDLE_VALUE) << "Invalid File Handle. Get Last Error reports"; do { std::string tmp = create_filename(path, wfd.cFileName); dics->push_back(tmp); } while (FindNextFile(hFind, &wfd)); FindClose(hFind); #else DIR *dir = opendir(path); CHECK_DIE(dir) << "no such directory: " << path; for (struct dirent *dp = readdir(dir); dp; dp = readdir(dir)) { const std::string tmp = dp->d_name; if (tmp.size() >= 5) { std::string ext = tmp.substr(tmp.size() - 4, 4); toLower(&ext); if (ext == ".csv") dics->push_back(create_filename(path, tmp)); } } closedir(dir); #endif }
bool POSIDGenerator::open(const char *filename, Iconv *iconv) { std::ifstream ifs(filename); if (!ifs) { std::cerr << filename << " is not found. minimum setting is used" << std::endl; rewrite_.resize(1); rewrite_.back().set_pattern("*", "1"); return true; } std::string line; char *col[2]; while (std::getline(ifs, line)) { if (iconv) iconv->convert(&line); const size_t n = tokenize2(const_cast<char *>(line.c_str()), " \t", col, 2); CHECK_DIE(n == 2) << "format error: " << line; for (char *p = col[1]; *p; ++p) { CHECK_DIE(*p >= '0' && *p <= '9') << "not a number: " << col[1]; } rewrite_.resize(rewrite_.size() + 1); rewrite_.back().set_pattern(col[0], col[1]); } return true; }
bool DictionaryRewriter::open(const char *filename, Iconv *iconv) { scoped_ptr<std::istream> p_ist; const jma::DictUnit* dict = jma::JMA_Dictionary::instance()->getDict(filename); if(dict) p_ist.reset(new std::istrstream(dict->text_, dict->length_)); else p_ist.reset(new std::ifstream(filename)); CHECK_DIE(*p_ist) << "no such file or directory: " << filename; int append_to = 0; std::string line; while (std::getline(*p_ist, line)) { if (iconv) iconv->convert(&line); if (line.empty() || line[0] == '#') continue; if (line == "[unigram rewrite]") { append_to = 1; } else if (line == "[left rewrite]") { append_to = 2; } else if (line == "[right rewrite]") { append_to = 3; } else { CHECK_DIE(append_to != 0) << "no sections found"; char *str = const_cast<char *>(line.c_str()); switch (append_to) { case 1: append_rewrite_rule(&unigram_rewrite_, str); break; case 2: append_rewrite_rule(&left_rewrite_, str); break; case 3: append_rewrite_rule(&right_rewrite_, str); break; } } } return true; }
static bool read(std::istream *is, std::vector<std::vector<std::string> > *r, const std::vector<int> &level) { if (!*is) return false; char buf[BUF_SIZE]; char *col[2]; char *cvs[BUF_SIZE]; r->clear(); while (is->getline(buf, sizeof(buf))) { if (std::strcmp(buf, "EOS") == 0) break; CHECK_DIE(tokenize(buf, "\t", col, 2) == 2) << "format error"; cvs[0] = col[0]; size_t n = tokenizeCSV(col[1], cvs + 1, sizeof(cvs) - 1); std::vector<std::string> tmp; for (size_t i = 0; i < level.size(); ++i) { size_t m = level[i] < 0 ? n : level[i]; CHECK_DIE(m <= n) << " out of range " << level[i]; std::string output; for (size_t j = 0; j <= m; ++j) { output += cvs[j]; if (j != 0) output += "\t"; } tmp.push_back(output); } r->push_back(tmp); } return true; }
// Copies the file `src` to `dst`.
//
// A progress line is printed to stdout, `src` is memory-mapped, and its
// whole content is written to `dst` in binary mode.  Failure to map `src`
// or to open `dst` triggers CHECK_DIE.
void copy(const char *src, const char *dst) {
  std::cout << "copying " << src << " to " << dst << std::endl;

  Mmap<char> mmap;
  CHECK_DIE(mmap.open(src)) << mmap.what();

  std::ofstream ofs(dst, std::ios::binary | std::ios::out);
  CHECK_DIE(ofs) << "permission denied: " << dst;

  // Write the mapped region in a single call.
  char *const data = reinterpret_cast<char *>(mmap.begin());
  ofs.write(data, mmap.size());
  ofs.close();
}
// Maps a CSV feature string to a POS id.
//
// `feature` is copied into a local buffer, split into CSV columns and run
// through the rewrite rules.  Returns -1 when no rule matches, otherwise the
// rewritten string parsed with atoi().  An over-long feature or too many
// CSV columns trigger CHECK_DIE.
int POSIDGenerator::id(const char *feature) const {
  char buf[BUF_SIZE];
  char *col[BUF_SIZE];
  // Capacity in ELEMENTS.  The previous code used sizeof(col), a byte count,
  // both as the tokenizer limit and in the overflow check, so the tokenizer
  // could write past the array and the check could never fire in time.
  const size_t col_capacity = sizeof(col) / sizeof(col[0]);

  CHECK_DIE(std::strlen(feature) < sizeof(buf) - 1) << "too long feature";
  std::strncpy(buf, feature, sizeof(buf) - 1);
  buf[sizeof(buf) - 1] = '\0';  // explicit terminator; harmless given the check above

  const size_t n = tokenizeCSV(buf, col, col_capacity);
  CHECK_DIE(n < col_capacity) << "too long CSV entities";

  std::string tmp;
  if (!rewrite_.rewrite(n, const_cast<const char **>(col), &tmp)) return -1;
  return std::atoi(tmp.c_str());
}
static int run(int argc, char **argv) { static const MeCab::Option long_options[] = { { "output", 'o', 0, "FILE", "set the output filename" }, { "version", 'v', 0, 0, "show the version and exit" }, { "help", 'h', 0, 0, "show this help and exit." }, { 0, 0, 0, 0 } }; MeCab::Param param; param.open(argc, argv, long_options); if (!param.open(argc, argv, long_options)) { std::cout << param.what() << "\n\n" << COPYRIGHT << "\ntry '--help' for more information." << std::endl; return -1; } if (!param.help_version()) { return 0; } const std::vector<std::string> &tmp = param.rest_args(); std::vector<std::string> files = tmp; if (files.empty()) { files.push_back("-"); } std::string output = param.get<std::string>("output"); if (output.empty()) output = "-"; MeCab::ostream_wrapper ofs(output.c_str()); CHECK_DIE(*ofs) << "permission denied: " << output; scoped_fixed_array<char, BUF_SIZE> buf; char *col[2]; std::string str; for (size_t i = 0; i < files.size(); ++i) { MeCab::istream_wrapper ifs(files[i].c_str()); CHECK_DIE(*ifs) << "no such file or directory: " << files[i]; while (ifs->getline(buf.get(), buf.size())) { const size_t n = tokenize(buf.get(), "\t ", col, 2); CHECK_DIE(n <= 2) << "format error: " << buf.get(); if (std::strcmp(col[0], "EOS") == 0 && !str.empty()) { *ofs << str << std::endl; str.clear(); } else { str += col[0]; } } } return 0; }
bool DecoderLearnerTagger::open(const Param ¶m) { close(); allocator_data_.reset(new Allocator<LearnerNode, LearnerPath>()); tokenizer_data_.reset(new Tokenizer<LearnerNode, LearnerPath>()); feature_index_data_.reset(new DecoderFeatureIndex); allocator_ = allocator_data_.get(); tokenizer_ = tokenizer_data_.get(); feature_index_ = feature_index_data_.get(); CHECK_DIE(tokenizer_->open(param)) << tokenizer_->what(); CHECK_DIE(feature_index_->open(param)); return true; }
// without cache bool DictionaryRewriter::rewrite(const std::string &feature, std::string *ufeature, std::string *lfeature, std::string *rfeature) const { char buf[BUF_SIZE]; char *col[BUF_SIZE]; CHECK_DIE(feature.size() < sizeof(buf) - 1) << "too long feature"; std::strncpy(buf, feature.c_str(), sizeof(buf) - 1); size_t n = tokenizeCSV(buf, col, sizeof(col)); CHECK_DIE(n < sizeof(col)) << "too long CSV entities"; return (unigram_rewrite_.rewrite(n, const_cast<const char **>(col), ufeature) && left_rewrite_.rewrite(n, const_cast<const char **>(col), lfeature) && right_rewrite_.rewrite(n, const_cast<const char **>(col), rfeature)); }
static bool genmatrix(const char *filename, const ContextID &cid, DecoderFeatureIndex *fi, int factor, int default_cost) { std::ofstream ofs(filename); CHECK_DIE(ofs) << "permission denied: " << filename; LearnerPath path; LearnerNode rnode; LearnerNode lnode; rnode.stat = lnode.stat = MECAB_NOR_NODE; rnode.rpath = &path; lnode.lpath = &path; path.lnode = &lnode; path.rnode = &rnode; const std::map<std::string, int> &left = cid.left_ids(); const std::map<std::string, int> &right = cid.right_ids(); CHECK_DIE(left.size()) << "left id size is empty"; CHECK_DIE(right.size()) << "right id size is empty"; ofs << right.size() << ' ' << left.size() << std::endl; size_t l = 0; for (std::map<std::string, int>::const_iterator rit = right.begin(); rit != right.end(); ++rit) { ++l; progress_bar("emitting matrix ", l+1, right.size()); for (std::map<std::string, int>::const_iterator lit = left.begin(); lit != left.end(); ++lit) { path.rnode->wcost = 0; fi->buildBigramFeature(&path, rit->first.c_str(), lit->first.c_str()); fi->calcCost(&path); ofs << rit->second << ' ' << lit->second << ' ' << tocost(path.cost, factor, default_cost) << std::endl; } } return true; }
bool RewritePattern::rewrite(size_t size, const char **input, std::string *output) const { if (spat_.size() > size) return false; for (size_t i = 0; i < spat_.size(); ++i) { if (!match_rewrite_pattern(spat_[i].c_str(), input[i])) return false; } output->clear(); for (size_t i = 0; i < dpat_.size(); ++i) { std::string elm; const char *begin = dpat_[i].c_str(); const char *end = begin + dpat_[i].size(); for (const char *p = begin; p < end; ++p) { if (*p == '$') { ++p; size_t n = 0; for (; p < end; ++p) { switch (*p) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = 10 * n + (*p - '0'); break; default: goto NEXT; } } NEXT: CHECK_DIE(n > 0 && (n - 1) < size) << " out of range: [" << dpat_[i] << "] " << n; elm += input[n - 1]; if (p < end) elm += *p; } else { elm += *p; } } CHECK_DIE(escape_csv_element(&elm)); *output += elm; if (i + 1 != dpat_.size()) *output += ","; } return true; }
// Registers the left/right context features of every entry in the CSV
// dictionary `filename` with `cid`.
//
// Each line must have five CSV columns (the fifth receives the remainder of
// the line as the feature string).  The feature is rewritten through
// `rewrite` and the resulting left and right features are added to `cid`.
// The number of processed lines is printed to stdout.  An unreadable file
// or a short line triggers CHECK_DIE.
static void gencid(const char *filename, DictionaryRewriter *rewrite,
                   ContextID *cid) {
  std::ifstream ifs(filename);
  CHECK_DIE(ifs) << "no such file or directory: " << filename;

  char line[BUF_SIZE];
  std::cout << "reading " << filename << " ... " << std::flush;

  char *col[8];
  std::string feature;
  std::string unigram, left_context, right_context;
  size_t processed = 0;

  for (; ifs.getline(line, sizeof(line)); ++processed) {
    const size_t n = tokenizeCSV(line, col, 5);
    CHECK_DIE(n == 5) << "format error: " << line;

    feature = col[4];
    // The rewrite result is not checked here, matching existing behaviour.
    rewrite->rewrite2(feature, &unigram, &left_context, &right_context);
    cid->add(left_context.c_str(), right_context.c_str());
  }

  std::cout << processed << std::endl;
  ifs.close();
}
bool FeatureIndex::convert(const char* txtfile, const char *binfile) { std::ifstream ifs(txtfile); CHECK_DIE(ifs) << "no such file or directory: " << txtfile; char buf[BUF_SIZE]; char *column[4]; std::map<std::string, double> dic; while (ifs.getline(buf, sizeof(buf))) { CHECK_DIE(tokenize2(buf, "\t", column, 2) == 2) << "format error: " << buf; dic.insert(std::make_pair<std::string, double> (std::string(column[1]), atof(column[0]) )); } std::ofstream ofs(binfile, std::ios::out | std::ios::binary); CHECK_DIE(ofs) << "permission denied: " << binfile; std::vector<char *> key; unsigned int size = static_cast<unsigned int>(dic.size()); ofs.write(reinterpret_cast<const char*>(&size), sizeof(unsigned int)); for (std::map<std::string, double>::const_iterator it = dic.begin(); it != dic.end(); ++it) { key.push_back(const_cast<char*>(it->first.c_str())); ofs.write(reinterpret_cast<const char*>(&it->second), sizeof(double)); } Darts::DoubleArray da; CHECK_DIE(da.build(key.size(), &key[0], 0, 0, 0) == 0) << "unkown error in building double array: " << binfile; ofs.write(reinterpret_cast<const char*>(da.array()), da.unit_size() * da.size()); return true; }
bool NE::open(const Param ¶m) { close(); if (action_mode() == PARSING_MODE) { const std::string filename = param.get<std::string>("ne-model"); std::vector<const char*> argv; argv.push_back(param.program_name()); argv.push_back("-m"); argv.push_back(filename.c_str()); model_ = crfpp_model_new(argv.size(), const_cast<char **>(&argv[0])); // CHECK_FALSE(tagger_) << crfpp_strerror(tagger_); // CHECK_FALSE(crfpp_ysize(tagger_) >= 2); // CHECK_FALSE(crfpp_xsize(tagger_) == 3); // for (size_t i = 0; i < crfpp_ysize(tagger_); ++i) { // const char *p = crfpp_yname(tagger_, i); // CHECK_FALSE(p && (p[0] == 'B' || p[0] == 'I' || p[0] == 'O')); // } } // "名詞,数," ne_composite_ipa_ ="\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0,"; // "名詞,数詞" ne_composite_juman_ = "\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0\xE8\xA9\x9E,"; // "名詞,数詞" ne_composite_unidic_ = "\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0\xE8\xA9\x9E,"; Iconv iconv; iconv.open(UTF8, charset()); CHECK_DIE(iconv.convert(&ne_composite_ipa_)); CHECK_DIE(iconv.convert(&ne_composite_juman_)); CHECK_DIE(iconv.convert(&ne_composite_unidic_)); CHECK_FALSE(!ne_composite_ipa_.empty()); CHECK_FALSE(!ne_composite_juman_.empty()); CHECK_FALSE(!ne_composite_unidic_.empty()); return true; }
static bool read(std::istream *is, std::vector<std::vector<std::string> > *r, const std::vector<int> &level) { if (!*is) { return false; } char *col[2]; scoped_fixed_array<char, BUF_SIZE> buf; scoped_fixed_array<char *, BUF_SIZE> csv; r->clear(); while (is->getline(buf.get(), buf.size())) { if (std::strcmp(buf.get(), "EOS") == 0) { break; } CHECK_DIE(tokenize(buf.get(), "\t", col, 2) == 2) << "format error"; csv[0] = col[0]; size_t n = tokenizeCSV(col[1], csv.get() + 1, csv.size() - 1); std::vector<std::string> tmp; for (size_t i = 0; i < level.size(); ++i) { size_t m = level[i] < 0 ? n : level[i]; CHECK_DIE(m <= n) << " out of range " << level[i]; std::string output; for (size_t j = 0; j <= m; ++j) { output += csv[j]; if (j != 0) { output += "\t"; } } tmp.push_back(output); } r->push_back(tmp); } return true; }
/* dwarf_CU_dieoffset_given_die returns the global debug_info section offset of the CU die that is the CU containing the given (passed-in) die. This information makes it possible for a consumer to find and print context information for any die. Use dwarf_offdie() passing in the offset this returns to get a die pointer to the CU die. */ int dwarf_CU_dieoffset_given_die(Dwarf_Die die, Dwarf_Off* return_offset, Dwarf_Error* error) { Dwarf_Off dieoff = 0; Dwarf_CU_Context cucontext = 0; CHECK_DIE(die, DW_DLV_ERROR); cucontext = die->di_cu_context; dieoff = cucontext->cc_debug_offset; /* The following call cannot fail, so no error check. */ dwarf_get_cu_die_offset_given_cu_header_offset_b( cucontext->cc_dbg, dieoff, die->di_is_info, return_offset,error); return DW_DLV_OK; }
/*  Validate the sibling DIE.

    Only meaningful after the sibling's DIEs have been traversed
    with dwarf_child() called on each, so that the last DIE seen by
    dwarf_child() was the final one.  After such a traversal we
    should be exactly where a sibling attribute would point.

    Returns DW_DLV_OK when `sibling` sits at the position recorded
    by the last dwarf_child() call.  Otherwise returns DW_DLV_ERROR
    and stores in *offset the global offset of the last DIE
    traversed by dwarf_child(), for error reporting.

    de_last_die is almost certainly a stale pointer to a
    deallocated DIE by the time we get here.  It must never be
    dereferenced; it is only compared against NULL as a sanity
    check.

    This relies (subtly) on the last dwarf_child() call having been
    made for this sibling, which holds in a depth-first traversal
    even though no stack of de_last_die values is kept.

    If .debug_frame and .eh_frame are traversed in tandem within a
    single Dwarf_Debug this validator makes no sense.  Processing
    one entirely and then the other works.  Use caution. */
int
dwarf_validate_die_sibling(Dwarf_Die sibling,Dwarf_Off *offset)
{
    Dwarf_Debug dbg = 0;
    Dwarf_Error *error = 0;
    Dwarf_Debug_InfoTypes dis = 0;

    CHECK_DIE(sibling, DW_DLV_ERROR);
    dbg = sibling->di_cu_context->cc_dbg;

    /*  Pick the reading state that matches the section the
        sibling lives in. */
    if (sibling->di_is_info) {
        dis = &dbg->de_info_reading;
    } else {
        dis = &dbg->de_types_reading;
    }

    *offset = 0;
    if (dis->de_last_die &&
        dis->de_last_di_ptr &&
        sibling->di_debug_ptr == dis->de_last_di_ptr) {
        return DW_DLV_OK;
    }

    /*  Mismatch: compute the global offset used for error
        reporting. */
    dwarf_ptr_CU_offset(sibling->di_cu_context,
        dis->de_last_di_ptr, sibling->di_is_info, offset);
    return DW_DLV_ERROR;
}
int ContextID::lid(const char *l) const { std::map<std::string, int>::const_iterator it = left_.find(l); CHECK_DIE(it != left_.end()) << "cannot find LEFT-ID for " << l; return it->second; }
int ContextID::rid(const char *r) const { std::map<std::string, int>::const_iterator it = right_.find(r); CHECK_DIE(it != right_.end()) << "cannot find RIGHT-ID for " << r; return it->second; }
/* return DW_DLV_OK if ok. else DW_DLV_NO_ENTRY or DW_DLV_ERROR If err_count_out is non-NULL, this is a special 'check' call. */ int _dwarf_internal_printlines(Dwarf_Die die, Dwarf_Error * error, int * err_count_out, int only_line_header) { /* This pointer is used to scan the portion of the .debug_line section for the current cu. */ Dwarf_Small *line_ptr = 0; Dwarf_Small *orig_line_ptr = 0; /* This points to the last byte of the .debug_line portion for the current cu. */ Dwarf_Small *line_ptr_end = 0; /* Pointer to a DW_AT_stmt_list attribute in case it exists in the die. */ Dwarf_Attribute stmt_list_attr = 0; /* Pointer to DW_AT_comp_dir attribute in die. */ Dwarf_Attribute comp_dir_attr = 0; /* Pointer to name of compilation directory. */ Dwarf_Small *comp_dir = NULL; /* Offset into .debug_line specified by a DW_AT_stmt_list attribute. */ Dwarf_Unsigned line_offset = 0; struct Line_Table_Prefix_s prefix; /* These are the state machine state variables. */ Dwarf_Addr address = 0; Dwarf_Word file = 1; Dwarf_Word line = 1; Dwarf_Word column = 0; Dwarf_Bool is_stmt = false; Dwarf_Bool basic_block = false; Dwarf_Bool end_sequence = false; Dwarf_Bool prologue_end = false; Dwarf_Bool epilogue_begin = false; Dwarf_Small isa = 0; Dwarf_Sword i=0; /* This is the current opcode read from the statement program. */ Dwarf_Small opcode=0; /* These variables are used to decode leb128 numbers. Leb128_num holds the decoded number, and leb128_length is its length in bytes. */ Dwarf_Word leb128_num=0; Dwarf_Word leb128_length=0; Dwarf_Sword advance_line=0; Dwarf_Half attrform = 0; /* This is the operand of the latest fixed_advance_pc extended opcode. */ Dwarf_Half fixed_advance_pc=0; /* In case there are wierd bytes 'after' the line table * prologue this lets us print something. This is a gcc * compiler bug and we expect the bytes count to be 12. */ Dwarf_Small* bogus_bytes_ptr = 0; Dwarf_Unsigned bogus_bytes_count = 0; /* The Dwarf_Debug this die belongs to. 
*/ Dwarf_Debug dbg=0; int resattr = DW_DLV_ERROR; int lres = DW_DLV_ERROR; int res = DW_DLV_ERROR; /* ***** BEGIN CODE ***** */ if (error != NULL) { *error = NULL; } CHECK_DIE(die, DW_DLV_ERROR); dbg = die->di_cu_context->cc_dbg; res = _dwarf_load_section(dbg, &dbg->de_debug_line,error); if (res != DW_DLV_OK) { return res; } resattr = dwarf_attr(die, DW_AT_stmt_list, &stmt_list_attr, error); if (resattr != DW_DLV_OK) { return resattr; } /* The list of relevant FORMs is small. DW_FORM_data4, DW_FORM_data8, DW_FORM_sec_offset */ lres = dwarf_whatform(stmt_list_attr,&attrform,error); if (lres != DW_DLV_OK) { return lres; } if (attrform != DW_FORM_data4 && attrform != DW_FORM_data8 && attrform != DW_FORM_sec_offset ) { _dwarf_error(dbg, error, DW_DLE_LINE_OFFSET_BAD); return (DW_DLV_ERROR); } lres = dwarf_global_formref(stmt_list_attr, &line_offset, error); if (lres != DW_DLV_OK) { return lres; } if (line_offset >= dbg->de_debug_line.dss_size) { _dwarf_error(dbg, error, DW_DLE_LINE_OFFSET_BAD); return (DW_DLV_ERROR); } orig_line_ptr = dbg->de_debug_line.dss_data; line_ptr = dbg->de_debug_line.dss_data + line_offset; dwarf_dealloc(dbg, stmt_list_attr, DW_DLA_ATTR); /* If die has DW_AT_comp_dir attribute, get the string that names the compilation directory. 
*/ resattr = dwarf_attr(die, DW_AT_comp_dir, &comp_dir_attr, error); if (resattr == DW_DLV_ERROR) { return resattr; } if (resattr == DW_DLV_OK) { int cres = DW_DLV_ERROR; char *cdir = 0; cres = dwarf_formstring(comp_dir_attr, &cdir, error); if (cres == DW_DLV_ERROR) { return cres; } else if (cres == DW_DLV_OK) { comp_dir = (Dwarf_Small *) cdir; } } if (resattr == DW_DLV_OK) { dwarf_dealloc(dbg, comp_dir_attr, DW_DLA_ATTR); } dwarf_init_line_table_prefix(&prefix); { Dwarf_Small *line_ptr_out = 0; int dres = dwarf_read_line_table_prefix(dbg, line_ptr,dbg->de_debug_line.dss_size - line_offset, &line_ptr_out, &prefix, &bogus_bytes_ptr, &bogus_bytes_count, error, err_count_out); if (dres == DW_DLV_ERROR) { dwarf_free_line_table_prefix(&prefix); return dres; } if (dres == DW_DLV_NO_ENTRY) { dwarf_free_line_table_prefix(&prefix); return dres; } line_ptr_end = prefix.pf_line_ptr_end; line_ptr = line_ptr_out; } if(only_line_header) { /* Just checking for header errors, nothing more here.*/ dwarf_free_line_table_prefix(&prefix); return DW_DLV_OK; } printf("total line info length %ld bytes, " "line offset 0x%" DW_PR_DUx " %" DW_PR_DSd "\n", (long) prefix.pf_total_length, (Dwarf_Unsigned) line_offset, (Dwarf_Signed) line_offset); printf("line table version %d\n",(int) prefix.pf_version); printf("line table length field length %d prologue length %d\n", (int)prefix.pf_length_field_length, (int)prefix.pf_prologue_length); printf("compilation_directory %s\n", comp_dir ? 
((char *) comp_dir) : ""); printf(" min instruction length %d\n", (int) prefix.pf_minimum_instruction_length); printf(" default is stmt %d\n", (int) prefix.pf_default_is_stmt); printf(" line base %d\n", (int) prefix.pf_line_base); printf(" line_range %d\n", (int) prefix.pf_line_range); printf(" opcode base %d\n", (int) prefix.pf_opcode_base); printf(" standard opcode count %d\n", (int) prefix.pf_std_op_count); for (i = 1; i < prefix.pf_opcode_base; i++) { printf(" opcode[%2d] length %d\n", (int) i, (int) prefix.pf_opcode_length_table[i - 1]); } printf(" include directories count %d\n", (int) prefix.pf_include_directories_count); for (i = 0; i < prefix.pf_include_directories_count; ++i) { printf(" include dir[%d] %s\n", (int) i, prefix.pf_include_directories[i]); } printf(" files count %d\n", (int) prefix.pf_files_count); for (i = 0; i < prefix.pf_files_count; ++i) { struct Line_Table_File_Entry_s *lfile = prefix.pf_line_table_file_entries + i; Dwarf_Unsigned tlm2 = lfile->lte_last_modification_time; Dwarf_Unsigned di = lfile->lte_directory_index; Dwarf_Unsigned fl = lfile->lte_length_of_file; printf(" file[%d] %s (file-number: %d) \n", (int) i, (char *) lfile->lte_filename, (int)(i+1)); printf(" dir index %d\n", (int) di); { time_t tt = (time_t) tlm2; printf(" last time 0x%x %s", /* ctime supplies newline */ (unsigned) tlm2, ctime(&tt)); } printf(" file length %ld 0x%lx\n", (long) fl, (unsigned long) fl); } { Dwarf_Unsigned offset = 0; if(bogus_bytes_count > 0) { Dwarf_Unsigned wcount = bogus_bytes_count; Dwarf_Unsigned boffset = bogus_bytes_ptr - orig_line_ptr; printf("*** DWARF CHECK: the line table prologue header_length " " is %" DW_PR_DUu " too high, we pretend it is smaller." 
"Section offset: %" DW_PR_DUu " (0x%" DW_PR_DUx ") ***\n", wcount, boffset,boffset); *err_count_out += 1; } offset = line_ptr - orig_line_ptr; printf(" statement prog offset in section: %" DW_PR_DUu " 0x%" DW_PR_DUx "\n", offset, offset); } /* Initialize the part of the state machine dependent on the prefix. */ is_stmt = prefix.pf_default_is_stmt; print_line_header(); /* Start of statement program. */ while (line_ptr < line_ptr_end) { int type = 0; printf(" [0x%06" DW_PR_DSx "] ", (Dwarf_Signed) (line_ptr - orig_line_ptr)); opcode = *(Dwarf_Small *) line_ptr; line_ptr++; /* 'type' is the output */ WHAT_IS_OPCODE(type, opcode, prefix.pf_opcode_base, prefix.pf_opcode_length_table, line_ptr, prefix.pf_std_op_count); if (type == LOP_DISCARD) { int oc; int opcnt = prefix.pf_opcode_length_table[opcode]; printf("*** DWARF CHECK: DISCARD standard opcode %d " "with %d operands: " "not understood.", opcode, opcnt); *err_count_out += 1; for (oc = 0; oc < opcnt; oc++) { /* * Read and discard operands we don't * understand. * Arbitrary choice of unsigned read. * Signed read would work as well. */ Dwarf_Unsigned utmp2; DECODE_LEB128_UWORD(line_ptr, utmp2); printf(" %" DW_PR_DUu " (0x%" DW_PR_DUx ")", (Dwarf_Unsigned) utmp2, (Dwarf_Unsigned) utmp2); } printf("***\n"); /* do nothing, necessary ops done */ } else if (type == LOP_SPECIAL) { /* This op code is a special op in the object, no matter that it might fall into the standard op range in this compile Thatis, these are special opcodes between special_opcode_base and MAX_LINE_OP_CODE. 
(including special_opcode_base and MAX_LINE_OP_CODE) */ char special[50]; unsigned origop = opcode; opcode = opcode - prefix.pf_opcode_base; address = address + prefix.pf_minimum_instruction_length * (opcode / prefix.pf_line_range); line = line + prefix.pf_line_base + opcode % prefix.pf_line_range; sprintf(special, "Specialop %3u", origop); print_line_detail(special, opcode, address, (int) file, line, column, is_stmt, basic_block, end_sequence, prologue_end, epilogue_begin, isa); basic_block = false; } else if (type == LOP_STANDARD) { switch (opcode) { case DW_LNS_copy:{ print_line_detail("DW_LNS_copy", opcode, address, file, line, column, is_stmt, basic_block, end_sequence, prologue_end, epilogue_begin, isa); basic_block = false; break; } case DW_LNS_advance_pc:{ Dwarf_Unsigned utmp2; DECODE_LEB128_UWORD(line_ptr, utmp2); printf("DW_LNS_advance_pc val %" DW_PR_DSd " 0x%" DW_PR_DUx "\n", (Dwarf_Signed) (Dwarf_Word) utmp2, (Dwarf_Unsigned) (Dwarf_Word) utmp2); leb128_num = (Dwarf_Word) utmp2; address = address + prefix.pf_minimum_instruction_length * leb128_num; break; } case DW_LNS_advance_line:{ Dwarf_Signed stmp; DECODE_LEB128_SWORD(line_ptr, stmp); advance_line = (Dwarf_Sword) stmp; printf("DW_LNS_advance_line val %" DW_PR_DSd " 0x%" DW_PR_DSx "\n", (Dwarf_Signed) advance_line, (Dwarf_Signed) advance_line); line = line + advance_line; break; } case DW_LNS_set_file:{ Dwarf_Unsigned utmp2; DECODE_LEB128_UWORD(line_ptr, utmp2); file = (Dwarf_Word) utmp2; printf("DW_LNS_set_file %ld\n", (long) file); break; } case DW_LNS_set_column:{ Dwarf_Unsigned utmp2; DECODE_LEB128_UWORD(line_ptr, utmp2); column = (Dwarf_Word) utmp2; printf("DW_LNS_set_column val %" DW_PR_DSd " 0x%" DW_PR_DSx "\n", (Dwarf_Signed) column, (Dwarf_Signed) column); break; } case DW_LNS_negate_stmt:{ is_stmt = !is_stmt; printf("DW_LNS_negate_stmt\n"); break; } case DW_LNS_set_basic_block:{ printf("DW_LNS_set_basic_block\n"); basic_block = true; break; } case DW_LNS_const_add_pc:{ opcode = 
MAX_LINE_OP_CODE - prefix.pf_opcode_base; address = address + prefix.pf_minimum_instruction_length * (opcode / prefix. pf_line_range); printf("DW_LNS_const_add_pc new address 0x%" DW_PR_DSx "\n", (Dwarf_Signed) address); break; } case DW_LNS_fixed_advance_pc:{ READ_UNALIGNED(dbg, fixed_advance_pc, Dwarf_Half, line_ptr, sizeof(Dwarf_Half)); line_ptr += sizeof(Dwarf_Half); address = address + fixed_advance_pc; printf("DW_LNS_fixed_advance_pc val %" DW_PR_DSd " 0x%" DW_PR_DSx " new address 0x%" DW_PR_DSx "\n", (Dwarf_Signed) fixed_advance_pc, (Dwarf_Signed) fixed_advance_pc, (Dwarf_Signed) address); break; } case DW_LNS_set_prologue_end:{ prologue_end = true; printf("DW_LNS_set_prologue_end set true.\n"); break; } /* New in DWARF3 */ case DW_LNS_set_epilogue_begin:{ epilogue_begin = true; printf("DW_LNS_set_epilogue_begin set true.\n"); break; } /* New in DWARF3 */ case DW_LNS_set_isa:{ Dwarf_Unsigned utmp2; DECODE_LEB128_UWORD(line_ptr, utmp2); isa = utmp2; printf("DW_LNS_set_isa new value 0x%" DW_PR_DUx ".\n", (Dwarf_Unsigned) utmp2); if (isa != utmp2) { /* The value of the isa did not fit in our local so we record it wrong. declare an error. 
*/ dwarf_free_line_table_prefix(&prefix); _dwarf_error(dbg, error, DW_DLE_LINE_NUM_OPERANDS_BAD); return (DW_DLV_ERROR); } break; } } } else if (type == LOP_EXTENDED) { Dwarf_Unsigned utmp3 = 0; Dwarf_Word instr_length = 0; Dwarf_Small ext_opcode = 0; DECODE_LEB128_UWORD(line_ptr, utmp3); instr_length = (Dwarf_Word) utmp3; ext_opcode = *(Dwarf_Small *) line_ptr; line_ptr++; switch (ext_opcode) { case DW_LNE_end_sequence:{ end_sequence = true; print_line_detail("DW_LNE_end_sequence extended", opcode, address, file, line, column, is_stmt, basic_block, end_sequence, prologue_end, epilogue_begin, isa); address = 0; file = 1; line = 1; column = 0; is_stmt = prefix.pf_default_is_stmt; basic_block = false; end_sequence = false; prologue_end = false; epilogue_begin = false; break; } case DW_LNE_set_address:{ { READ_UNALIGNED(dbg, address, Dwarf_Addr, line_ptr, die->di_cu_context->cc_address_size); line_ptr += die->di_cu_context->cc_address_size; printf("DW_LNE_set_address address 0x%" DW_PR_DUx "\n", (Dwarf_Unsigned) address); } break; } case DW_LNE_define_file:{ Dwarf_Unsigned di = 0; Dwarf_Unsigned tlm = 0; Dwarf_Unsigned fl = 0; Dwarf_Small *fn = (Dwarf_Small *) line_ptr; line_ptr = line_ptr + strlen((char *) line_ptr) + 1; di = _dwarf_decode_u_leb128(line_ptr, &leb128_length); line_ptr = line_ptr + leb128_length; tlm = _dwarf_decode_u_leb128(line_ptr, &leb128_length); line_ptr = line_ptr + leb128_length; fl = _dwarf_decode_u_leb128(line_ptr, &leb128_length); line_ptr = line_ptr + leb128_length; printf("DW_LNE_define_file %s \n", fn); printf(" dir index %d\n", (int) di); { time_t tt3 = (time_t) tlm; /* ctime supplies newline */ printf(" last time 0x%x %s", (unsigned) tlm, ctime(&tt3)); } printf(" file length %ld 0x%lx\n", (long) fl, (unsigned long) fl); break; } default:{ /* This is an extended op code we do not know about, other than we know now many bytes it is (and the op code and the bytes of operand). 
*/ Dwarf_Unsigned remaining_bytes = instr_length -1; if(instr_length < 1 || remaining_bytes > DW_LNE_LEN_MAX) { dwarf_free_line_table_prefix(&prefix); _dwarf_error(dbg, error, DW_DLE_LINE_EXT_OPCODE_BAD); return (DW_DLV_ERROR); } printf("DW_LNE extended op 0x%x ",ext_opcode); printf("Bytecount: " DW_PR_DUu , instr_length); if(remaining_bytes > 0) { printf(" linedata: 0x"); while (remaining_bytes > 0) { printf("%02x",(unsigned char)(*(line_ptr))); line_ptr++; remaining_bytes--; } } printf("\n"); } break; } } } dwarf_free_line_table_prefix(&prefix); return (DW_DLV_OK); }
// Tokenizer<N, P>::lookup
//
// Collects the candidate nodes for the text that starts at `begin` (and is
// bounded by `end`) and returns them as a list linked through `bnext`.
//
// Steps, in the order the code performs them:
//  1. The search window is clamped to at most 65535 bytes.  When `isPartial`
//     is set, it is further cut at the first position after `begin` whose
//     boundary constraint in `lattice` is MECAB_TOKEN_BOUNDARY.
//  2. property_.seekToOtherType(begin, end, space_, ...) yields `begin2`,
//     the point where dictionary lookup starts, and fills `cinfo`, `mblen`
//     and `clen`.  (Presumably this skips leading space characters --
//     confirm against the character-property implementation.)
//  3. Every dictionary in dic_ is queried with commonPrefixSearch().  Each
//     token of each hit becomes a new node with stat MECAB_NOR_NODE whose
//     `rlength` also covers the bytes between `begin` and `begin2`.  In
//     partial mode, nodes rejected by is_valid_node() are not linked in.
//  4. If at least one node was found and cinfo.invoke is false, the list is
//     returned immediately.
//  5. Otherwise ADDUNKNWON is applied at several points (grouped run,
//     then lengths 1..cinfo.length while the character kind stays the
//     same, then once more if nothing was produced).  ADDUNKNWON is a macro
//     defined outside this block; NOTE(review): it presumably appends
//     unknown-word nodes based on begin2/begin3/clen -- confirm at its
//     definition.
//  6. In partial mode, if there is still no node, the span is extended up to
//     the first position that is past `end` or not marked
//     MECAB_INSIDE_TOKEN, ADDUNKNWON is tried again, and as a last resort a
//     single MECAB_UNK_NODE is created whose feature comes from
//     lattice->feature_constraint(); a null feature triggers CHECK_DIE.
//
// NOTE(review): the template header and the definition of `isPartial` are
// not visible in this excerpt.
N *Tokenizer<N, P>::lookup(const char *begin, const char *end, Allocator<N, P> *allocator, Lattice *lattice) const { CharInfo cinfo; N *result_node = 0; size_t mblen = 0; size_t clen = 0; end = static_cast<size_t>(end - begin) >= 65535 ? begin + 65535 : end; if (isPartial) { const size_t begin_pos = begin - lattice->sentence(); for (size_t n = begin_pos + 1; n < lattice->size(); ++n) { if (lattice->boundary_constraint(n) == MECAB_TOKEN_BOUNDARY) { end = lattice->sentence() + n; break; } } } const char *begin2 = property_.seekToOtherType(begin, end, space_, &cinfo, &mblen, &clen); Dictionary::result_type *daresults = allocator->mutable_results(); const size_t results_size = allocator->results_size(); for (std::vector<Dictionary *>::const_iterator it = dic_.begin(); it != dic_.end(); ++it) { const size_t n = (*it)->commonPrefixSearch( begin2, static_cast<size_t>(end - begin2), daresults, results_size); for (size_t i = 0; i < n; ++i) { size_t size = (*it)->token_size(daresults[i]); const Token *token = (*it)->token(daresults[i]); for (size_t j = 0; j < size; ++j) { N *new_node = allocator->newNode(); read_node_info(**it, *(token + j), &new_node); new_node->length = daresults[i].length; new_node->rlength = begin2 - begin + new_node->length; new_node->surface = begin2; new_node->stat = MECAB_NOR_NODE; new_node->char_type = cinfo.default_type; if (isPartial && !is_valid_node(lattice, new_node)) { continue; } new_node->bnext = result_node; result_node = new_node; } } } if (result_node && !cinfo.invoke) { return result_node; } const char *begin3 = begin2 + mblen; const char *group_begin3 = 0; if (begin3 > end) { ADDUNKNWON; if (result_node) { return result_node; } } if (cinfo.group) { const char *tmp = begin3; CharInfo fail; begin3 = property_.seekToOtherType(begin3, end, cinfo, &fail, &mblen, &clen); if (clen <= max_grouping_size_) { ADDUNKNWON; } group_begin3 = begin3; begin3 = tmp; } for (size_t i = 1; i <= cinfo.length; ++i) { if (begin3 > end) { break; } if (begin3 == 
group_begin3) { continue; } clen = i; ADDUNKNWON; if (!cinfo.isKindOf(property_.getCharInfo(begin3, end, &mblen))) { break; } begin3 += mblen; } if (!result_node) { ADDUNKNWON; } if (isPartial && !result_node) { begin3 = begin2; while (true) { cinfo = property_.getCharInfo(begin3, end, &mblen); begin3 += mblen; if (begin3 > end || lattice->boundary_constraint(begin3 - lattice->sentence()) != MECAB_INSIDE_TOKEN) { break; } } ADDUNKNWON; if (!result_node) { N *new_node = allocator->newNode(); new_node->char_type = cinfo.default_type; new_node->surface = begin2; new_node->length = begin3 - begin2; new_node->rlength = begin3 - begin; new_node->stat = MECAB_UNK_NODE; new_node->bnext = result_node; new_node->feature = lattice->feature_constraint(begin - lattice->sentence()); CHECK_DIE(new_node->feature); result_node = new_node; } } return result_node; }
// Fills in the feature vectors needed to score |path|:
//   * path->rnode->fvector - unigram features of the right node
//   * path->fvector        - bigram features of the (left, right) pair
//
// Both vectors are memoised in feature_cache_, keyed by the rewritten feature
// strings; every cache hit bumps the entry's use counter.
//
// Returns false when a feature vector could not be built. Dies (CHECK_DIE)
// when a node feature cannot be rewritten or a vector is still NULL at the
// end.
bool EncoderFeatureIndex::buildFeature(LearnerPath *path) {
  typedef std::pair<const int *, size_t> CacheValue;
  typedef std::map<std::string, CacheValue>::iterator CacheIterator;

  path->rnode->wcost = path->cost = 0.0;

  std::string ufeature1;
  std::string lfeature1;
  std::string rfeature1;
  std::string ufeature2;
  std::string lfeature2;
  std::string rfeature2;

  CHECK_DIE(rewrite_.rewrite2(path->lnode->feature,
                              &ufeature1,
                              &lfeature1,
                              &rfeature1))
      << " cannot rewrite pattern: " << path->lnode->feature;

  CHECK_DIE(rewrite_.rewrite2(path->rnode->feature,
                              &ufeature2,
                              &lfeature2,
                              &rfeature2))
      << " cannot rewrite pattern: " << path->rnode->feature;

  // Unigram features of the right node; the key also carries its char type.
  {
    os_.clear();
    os_ << ufeature2 << ' ' << path->rnode->char_type << '\0';
    const std::string key(os_.str());
    CacheIterator it = feature_cache_.find(key);
    if (it != feature_cache_.end()) {
      path->rnode->fvector = it->second.first;
      it->second.second++;
    } else {
      if (!buildUnigramFeature(path, ufeature2.c_str())) {
        return false;
      }
      // Construct the pair directly: std::make_pair with explicit template
      // arguments does not accept lvalues from C++11 on.
      feature_cache_.insert(
          std::pair<std::string, CacheValue>(
              key, CacheValue(path->rnode->fvector, 1)));
    }
  }

  // Bigram features: right context of the left node + left context of the
  // right node.
  {
    os_.clear();
    os_ << rfeature1 << ' ' << lfeature2 << '\0';
    const std::string key(os_.str());
    CacheIterator it = feature_cache_.find(key);
    if (it != feature_cache_.end()) {
      path->fvector = it->second.first;
      it->second.second++;
    } else {
      if (!buildBigramFeature(path, rfeature1.c_str(), lfeature2.c_str())) {
        return false;
      }
      feature_cache_.insert(
          std::pair<std::string, CacheValue>(
              key, CacheValue(path->fvector, 1)));
    }
  }

  CHECK_DIE(path->fvector) << " fvector is NULL";
  CHECK_DIE(path->rnode->fvector) << "fevector is NULL";

  return true;
}
/*  Decode the .debug_line contribution of the compilation unit that
    `die` belongs to and print it with printf: first the statement
    program prologue (header fields, include directories, file
    entries), then every opcode of the statement program together
    with the resulting state-machine values.

    die   - a die carrying DW_AT_stmt_list (and optionally
            DW_AT_comp_dir).
    error - if non-NULL, cleared on entry and filled in through
            _dwarf_error()/the called functions on failure.

    return DW_DLV_OK if ok. else DW_DLV_NO_ENTRY or DW_DLV_ERROR

    NOTE(review): stmt_list_attr is not released on the two error
    returns that precede its dwarf_dealloc() call, and comp_dir_attr
    is not released when dwarf_formstring() fails. */
int
_dwarf_internal_printlines(Dwarf_Die die, Dwarf_Error * error)
{
    /*  This pointer is used to scan the portion of the .debug_line
        section for the current cu. */
    Dwarf_Small *line_ptr;
    /*  Start of the .debug_line section; printed offsets are relative
        to it. */
    Dwarf_Small *orig_line_ptr;

    /*  This points to the last byte of the .debug_line portion for
        the current cu. */
    Dwarf_Small *line_ptr_end;

    /*  This points to the end of the statement program prologue for
        the current cu, and serves to check that the prologue was
        correctly decoded. */
    Dwarf_Small *check_line_ptr;

    /*  Pointer to a DW_AT_stmt_list attribute in case it exists in
        the die. */
    Dwarf_Attribute stmt_list_attr;

    /*  Pointer to DW_AT_comp_dir attribute in die. */
    Dwarf_Attribute comp_dir_attr;

    /*  Pointer to name of compilation directory. */
    Dwarf_Small *comp_dir = NULL;

    /*  Offset into .debug_line specified by a DW_AT_stmt_list
        attribute. */
    Dwarf_Unsigned line_offset;

    /*  These are the fields of the statement program header. */
    Dwarf_Unsigned total_length;
    Dwarf_Half version;
    Dwarf_Unsigned prologue_length;
    Dwarf_Small minimum_instruction_length;
    Dwarf_Small default_is_stmt;
    Dwarf_Sbyte line_base;
    Dwarf_Small line_range;
    Dwarf_Small opcode_base;

    /*  Operand counts of the standard opcodes, indexed by opcode
        number (slot 0 is never written). */
    Dwarf_Small *opcode_length;

    /*  These are the state machine state variables. */
    Dwarf_Addr address;
    Dwarf_Word file;
    Dwarf_Word line;
    Dwarf_Word column;
    Dwarf_Bool is_stmt;
    Dwarf_Bool basic_block;
    Dwarf_Bool end_sequence;

    Dwarf_Sword i, file_entry_count, include_directories_count;

    /*  This is the current opcode read from the statement program. */
    Dwarf_Small opcode;

    /*  Pointer to a Dwarf_Line_Context_s structure that contains the
        context such as file names and include directories for the set
        of lines being generated.
        NOTE(review): in this function the context is only allocated
        and NULL-checked; it is neither filled in nor released here. */
    Dwarf_Line_Context line_context;

    /*  These variables are used to decode leb128 numbers. Leb128_num
        holds the decoded number, and leb128_length is its length in
        bytes. */
    Dwarf_Word leb128_num;
    Dwarf_Word leb128_length;
    Dwarf_Sword advance_line;

    /*  This is the operand of the latest fixed_advance_pc extended
        opcode. */
    Dwarf_Half fixed_advance_pc;

    /*  This is the length of an extended opcode instr. */
    Dwarf_Word instr_length;
    Dwarf_Small ext_opcode;
    int local_length_size;

    /*REFERENCED*/ /* Not used in this instance of the macro */
    int local_extension_size;

    /*  The Dwarf_Debug this die belongs to. */
    Dwarf_Debug dbg;
    int resattr;
    int lres;
    int res;

    /* ***** BEGIN CODE ***** */

    if (error != NULL)
        *error = NULL;

    /*  The macro invocation carries no trailing semicolon; it is
        expected to expand to complete statements. */
    CHECK_DIE(die, DW_DLV_ERROR)
    dbg = die->di_cu_context->cc_dbg;

    res = _dwarf_load_section(dbg, dbg->de_debug_line_index,
        &dbg->de_debug_line, error);
    if (res != DW_DLV_OK) {
        return res;
    }

    /*  Locate this cu's line table through DW_AT_stmt_list. */
    resattr = dwarf_attr(die, DW_AT_stmt_list, &stmt_list_attr, error);
    if (resattr != DW_DLV_OK) {
        return resattr;
    }

    lres = dwarf_formudata(stmt_list_attr, &line_offset, error);
    if (lres != DW_DLV_OK) {
        return lres;
    }

    if (line_offset >= dbg->de_debug_line_size) {
        _dwarf_error(dbg, error, DW_DLE_LINE_OFFSET_BAD);
        return (DW_DLV_ERROR);
    }
    orig_line_ptr = dbg->de_debug_line;
    line_ptr = dbg->de_debug_line + line_offset;
    dwarf_dealloc(dbg, stmt_list_attr, DW_DLA_ATTR);

    /*  If die has DW_AT_comp_dir attribute, get the string that names
        the compilation directory. */
    resattr = dwarf_attr(die, DW_AT_comp_dir, &comp_dir_attr, error);
    if (resattr == DW_DLV_ERROR) {
        return resattr;
    }
    if (resattr == DW_DLV_OK) {
        int cres;
        char *cdir;

        cres = dwarf_formstring(comp_dir_attr, &cdir, error);
        if (cres == DW_DLV_ERROR) {
            return cres;
        } else if (cres == DW_DLV_OK) {
            comp_dir = (Dwarf_Small *) cdir;
        }
    }
    /*  comp_dir is still printed after the attribute is released, so
        the string is assumed to outlive the attribute -- TODO confirm
        against dwarf_formstring(). */
    if (resattr == DW_DLV_OK) {
        dwarf_dealloc(dbg, comp_dir_attr, DW_DLA_ATTR);
    }

    /*  Following is a straightforward decoding of the statement
        program prologue information. */

    /*  READ_AREA_LENGTH updates line_ptr for consumed bytes */
    READ_AREA_LENGTH(dbg, total_length, Dwarf_Unsigned,
        line_ptr, local_length_size, local_extension_size);

    line_ptr_end = line_ptr + total_length;
    if (line_ptr_end > dbg->de_debug_line + dbg->de_debug_line_size) {
        _dwarf_error(dbg, error, DW_DLE_DEBUG_LINE_LENGTH_BAD);
        return (DW_DLV_ERROR);
    }

    printf("total line info length %ld bytes, "
        "line offset 0x%llx %lld\n",
        (long) total_length,
        (long long) line_offset, (long long) line_offset);
    printf("compilation_directory %s\n",
        comp_dir ? ((char *) comp_dir) : "");

    READ_UNALIGNED(dbg, version, Dwarf_Half,
        line_ptr, sizeof(Dwarf_Half));
    line_ptr += sizeof(Dwarf_Half);
    if (version != CURRENT_VERSION_STAMP) {
        _dwarf_error(dbg, error, DW_DLE_VERSION_STAMP_ERROR);
        return (DW_DLV_ERROR);
    }

    READ_UNALIGNED(dbg, prologue_length, Dwarf_Unsigned,
        line_ptr, local_length_size);
    line_ptr += local_length_size;
    /*  prologue_length is counted from this position; it is verified
        once the file table has been read. */
    check_line_ptr = line_ptr;

    /*  The remaining fixed header fields are single bytes read in
        order. */
    minimum_instruction_length = *(Dwarf_Small *) line_ptr;
    line_ptr = line_ptr + sizeof(Dwarf_Small);

    default_is_stmt = *(Dwarf_Small *) line_ptr;
    line_ptr = line_ptr + sizeof(Dwarf_Small);

    line_base = *(Dwarf_Sbyte *) line_ptr;
    line_ptr = line_ptr + sizeof(Dwarf_Sbyte);

    line_range = *(Dwarf_Small *) line_ptr;
    line_ptr = line_ptr + sizeof(Dwarf_Small);

    opcode_base = *(Dwarf_Small *) line_ptr;
    line_ptr = line_ptr + sizeof(Dwarf_Small);

    printf(" min instruction length %d\n",
        (int) minimum_instruction_length);
    printf(" default is stmt %d\n", (int) default_is_stmt);
    printf(" line base %d\n", (int) line_base);
    printf(" line_range %d\n", (int) line_range);

    /*  One operand-count byte per standard opcode 1..opcode_base-1.
        The table lives on the stack (alloca), so it needs no free. */
    opcode_length = (Dwarf_Small *)
        alloca(sizeof(Dwarf_Small) * opcode_base);
    for (i = 1; i < opcode_base; i++) {
        opcode_length[i] = *(Dwarf_Small *) line_ptr;
        printf(" opcode[%d] length %d\n", (int) i,
            (int) opcode_length[i]);
        line_ptr = line_ptr + sizeof(Dwarf_Small);
    }

    /*  Include directories: NUL-terminated strings, ended by an empty
        string. */
    include_directories_count = 0;
    while ((*(char *) line_ptr) != '\0') {
        printf(" include dir[%d] %s\n",
            (int) include_directories_count, line_ptr);
        line_ptr = line_ptr + strlen((char *) line_ptr) + 1;
        include_directories_count++;
    }
    /*  Step over the terminating empty string. */
    line_ptr++;

    /*  File entries: name string followed by three uleb128 values
        (directory index, last-modification time, file length); the
        list is ended by an empty name. */
    file_entry_count = 0;
    while (*(char *) line_ptr != '\0') {
        Dwarf_Unsigned tlm2;
        Dwarf_Unsigned di;
        Dwarf_Unsigned fl;

        printf(" file[%d] %s\n",
            (int) file_entry_count, (char *) line_ptr);

        line_ptr = line_ptr + strlen((char *) line_ptr) + 1;

        di = _dwarf_decode_u_leb128(line_ptr, &leb128_length);
        line_ptr = line_ptr + leb128_length;

        tlm2 = _dwarf_decode_u_leb128(line_ptr, &leb128_length);
        line_ptr = line_ptr + leb128_length;

        fl = _dwarf_decode_u_leb128(line_ptr, &leb128_length);
        line_ptr = line_ptr + leb128_length;

        printf(" dir index %d\n", (int) di);
        {
            time_t tt = (time_t) tlm2;

            printf(" last time 0x%x %s",
                /* ctime supplies newline */
                (unsigned) tlm2, ctime(&tt));
        }
        printf(" file length %ld 0x%lx\n",
            (long) fl, (unsigned long) fl);

        file_entry_count++;
    }
    /*  Step over the terminating empty name. */
    line_ptr++;

    /*  The bytes consumed since check_line_ptr must equal the
        prologue length announced in the header. */
    if (line_ptr != check_line_ptr + prologue_length) {
        _dwarf_error(dbg, error, DW_DLE_LINE_PROLOG_LENGTH_BAD);
        return (DW_DLV_ERROR);
    }

    /*  Set up context structure for this set of lines. */
    line_context = (Dwarf_Line_Context)
        _dwarf_get_alloc(dbg, DW_DLA_LINE_CONTEXT, 1);
    if (line_context == NULL) {
        _dwarf_error(dbg, error, DW_DLE_ALLOC_FAIL);
        return (DW_DLV_ERROR);
    }

    printf(" statement prog offset in section: %lld 0x%llx\n",
        (long long) (line_ptr - orig_line_ptr),
        (long long) (line_ptr - orig_line_ptr));

    /*  Initialize the state machine. */
    address = 0;
    file = 1;
    line = 1;
    column = 0;
    is_stmt = default_is_stmt;
    basic_block = false;
    end_sequence = false;

    print_line_header();

    /*  Start of statement program. */
    while (line_ptr < line_ptr_end) {
        int type;

        /*  Section-relative offset of the opcode about to be
            decoded. */
        printf(" [0x%06llx] ", (long long) (line_ptr - orig_line_ptr));
        opcode = *(Dwarf_Small *) line_ptr;
        line_ptr++;

        /*  'type' is the output */
        WHAT_IS_OPCODE(type, opcode, opcode_base,
            opcode_length, line_ptr);

        if (type == LOP_DISCARD) {
            /* do nothing, necessary ops done */
        } else if (type == LOP_SPECIAL) {
            /*  This op code is a special op in the object, no matter
                that it might fall into the standard op range in this
                compile. That is, these are special opcodes between
                special_opcode_base and MAX_LINE_OP_CODE (including
                special_opcode_base and MAX_LINE_OP_CODE). */
            char special[50];
            unsigned origop = opcode;

            /*  The adjusted opcode encodes both an address advance
                (quotient) and a line advance (remainder). */
            opcode = opcode - opcode_base;
            address = address + minimum_instruction_length *
                (opcode / line_range);
            line = line + line_base + opcode % line_range;

            sprintf(special, "Specialop %3u", origop);
            print_line_detail(special,
                opcode, address, (int) file, line, column,
                is_stmt, basic_block, end_sequence);

            basic_block = false;
        } else if (type == LOP_STANDARD) {
            /*  Each standard opcode first checks that the operand
                count declared in the header matches what is decoded
                here. */
            switch (opcode) {

            case DW_LNS_copy:{
                    if (opcode_length[DW_LNS_copy] != 0) {
                        _dwarf_error(dbg, error,
                            DW_DLE_LINE_NUM_OPERANDS_BAD);
                        return (DW_DLV_ERROR);
                    }

                    print_line_detail("DW_LNS_copy",
                        opcode, address, file, line, column,
                        is_stmt, basic_block, end_sequence);

                    basic_block = false;
                    break;
                }

            case DW_LNS_advance_pc:{
                    Dwarf_Unsigned utmp2;

                    if (opcode_length[DW_LNS_advance_pc] != 1) {
                        _dwarf_error(dbg, error,
                            DW_DLE_LINE_NUM_OPERANDS_BAD);
                        return (DW_DLV_ERROR);
                    }

                    DECODE_LEB128_UWORD(line_ptr, utmp2)
                    printf("DW_LNS_advance_pc val %lld 0x%llx\n",
                        (long long) (Dwarf_Word) utmp2,
                        (long long) (Dwarf_Word) utmp2);
                    leb128_num = (Dwarf_Word) utmp2;
                    address = address +
                        minimum_instruction_length * leb128_num;
                    break;
                }

            case DW_LNS_advance_line:{
                    Dwarf_Signed stmp;

                    if (opcode_length[DW_LNS_advance_line] != 1) {
                        _dwarf_error(dbg, error,
                            DW_DLE_LINE_NUM_OPERANDS_BAD);
                        return (DW_DLV_ERROR);
                    }

                    DECODE_LEB128_SWORD(line_ptr, stmp)
                    advance_line = (Dwarf_Sword) stmp;
                    printf("DW_LNS_advance_line val %lld 0x%llx\n",
                        (long long) advance_line,
                        (long long) advance_line);
                    line = line + advance_line;
                    break;
                }

            case DW_LNS_set_file:{
                    Dwarf_Unsigned utmp2;

                    if (opcode_length[DW_LNS_set_file] != 1) {
                        _dwarf_error(dbg, error,
                            DW_DLE_LINE_NUM_OPERANDS_BAD);
                        return (DW_DLV_ERROR);
                    }

                    DECODE_LEB128_UWORD(line_ptr, utmp2)
                    file = (Dwarf_Word) utmp2;
                    printf("DW_LNS_set_file %ld\n", (long) file);
                    break;
                }

            case DW_LNS_set_column:{
                    Dwarf_Unsigned utmp2;

                    if (opcode_length[DW_LNS_set_column] != 1) {
                        _dwarf_error(dbg, error,
                            DW_DLE_LINE_NUM_OPERANDS_BAD);
                        return (DW_DLV_ERROR);
                    }

                    DECODE_LEB128_UWORD(line_ptr, utmp2)
                    column = (Dwarf_Word) utmp2;
                    printf("DW_LNS_set_column val %lld 0x%llx\n",
                        (long long) column, (long long) column);
                    break;
                }

            case DW_LNS_negate_stmt:{
                    if (opcode_length[DW_LNS_negate_stmt] != 0) {
                        _dwarf_error(dbg, error,
                            DW_DLE_LINE_NUM_OPERANDS_BAD);
                        return (DW_DLV_ERROR);
                    }

                    is_stmt = !is_stmt;
                    printf("DW_LNS_negate_stmt\n");
                    break;
                }

            case DW_LNS_set_basic_block:{
                    if (opcode_length[DW_LNS_set_basic_block] != 0) {
                        _dwarf_error(dbg, error,
                            DW_DLE_LINE_NUM_OPERANDS_BAD);
                        return (DW_DLV_ERROR);
                    }

                    printf("DW_LNS_set_basic_block\n");
                    basic_block = true;
                    break;
                }

            case DW_LNS_const_add_pc:{
                    /*  Advance the address by the amount the special
                        opcode MAX_LINE_OP_CODE would; the line is
                        left alone. */
                    opcode = MAX_LINE_OP_CODE - opcode_base;
                    address = address + minimum_instruction_length *
                        (opcode / line_range);

                    printf("DW_LNS_const_add_pc new address 0x%llx\n",
                        (long long) address);
                    break;
                }

            case DW_LNS_fixed_advance_pc:{
                    if (opcode_length[DW_LNS_fixed_advance_pc] != 1) {
                        _dwarf_error(dbg, error,
                            DW_DLE_LINE_NUM_OPERANDS_BAD);
                        return (DW_DLV_ERROR);
                    }

                    /*  The operand is a fixed-size Dwarf_Half, not a
                        leb128, and is added without scaling by
                        minimum_instruction_length. */
                    READ_UNALIGNED(dbg, fixed_advance_pc, Dwarf_Half,
                        line_ptr, sizeof(Dwarf_Half));
                    line_ptr += sizeof(Dwarf_Half);
                    address = address + fixed_advance_pc;
                    printf("DW_LNS_fixed_advance_pc val %lld 0x%llx"
                        " new address 0x%llx\n",
                        (long long) fixed_advance_pc,
                        (long long) fixed_advance_pc,
                        (long long) address);
                    break;
                }
            }
        } else if (type == LOP_EXTENDED) {
            Dwarf_Unsigned utmp3;

            /*  Extended opcode: uleb128 instruction length, then the
                sub-opcode byte, then its operands. */
            DECODE_LEB128_UWORD(line_ptr, utmp3)
            instr_length = (Dwarf_Word) utmp3;
            ext_opcode = *(Dwarf_Small *) line_ptr;
            line_ptr++;
            switch (ext_opcode) {

            case DW_LNE_end_sequence:{
                    end_sequence = true;

                    print_line_detail("DW_LNE_end_sequence extended",
                        opcode, address, file, line, column,
                        is_stmt, basic_block, end_sequence);

                    /*  Reset the state machine for the next
                        sequence. */
                    address = 0;
                    file = 1;
                    line = 1;
                    column = 0;
                    is_stmt = default_is_stmt;
                    basic_block = false;
                    end_sequence = false;

                    break;
                }

            case DW_LNE_set_address:{
                    /*  The operand must be exactly one target
                        pointer. */
                    if (instr_length - 1 == dbg->de_pointer_size) {
                        READ_UNALIGNED(dbg, address, Dwarf_Addr,
                            line_ptr, dbg->de_pointer_size);

                        line_ptr += dbg->de_pointer_size;
                        printf("DW_LNE_set_address address 0x%llx\n",
                            (long long) address);
                    } else {
                        _dwarf_error(dbg, error,
                            DW_DLE_LINE_SET_ADDR_ERROR);
                        return (DW_DLV_ERROR);
                    }

                    break;
                }

            case DW_LNE_define_file:{
                    /*  Same layout as a prologue file entry. */
                    Dwarf_Small *fn;
                    Dwarf_Signed di;
                    Dwarf_Signed tlm;
                    Dwarf_Unsigned fl;

                    fn = (Dwarf_Small *) line_ptr;
                    line_ptr = line_ptr + strlen((char *) line_ptr) + 1;

                    di = _dwarf_decode_u_leb128(line_ptr,
                        &leb128_length);
                    line_ptr = line_ptr + leb128_length;

                    tlm = _dwarf_decode_u_leb128(line_ptr,
                        &leb128_length);
                    line_ptr = line_ptr + leb128_length;

                    fl = _dwarf_decode_u_leb128(line_ptr,
                        &leb128_length);
                    line_ptr = line_ptr + leb128_length;

                    printf("DW_LNE_define_file %s \n", fn);
                    printf(" dir index %d\n", (int) di);
                    {
                        time_t tt3 = (time_t) tlm;

                        /* ctime supplies newline */
                        printf(" last time 0x%x %s",
                            (unsigned) tlm, ctime(&tt3));
                    }
                    printf(" file length %ld 0x%lx\n",
                        (long) fl, (unsigned long) fl);

                    break;
                }

            default:{
                    /*  Unknown extended opcodes are reported as an
                        error rather than skipped. */
                    _dwarf_error(dbg, error,
                        DW_DLE_LINE_EXT_OPCODE_BAD);
                    return (DW_DLV_ERROR);
                }
            }
        }
    }

    return (DW_DLV_OK);
}
/* return DW_DLV_OK if ok. else DW_DLV_NO_ENTRY or DW_DLV_ERROR If err_count_out is non-NULL, this is a special 'check' call. */ static int _dwarf_internal_printlines(Dwarf_Die die, Dwarf_Error * error, int * err_count_out, int only_line_header) { /* This pointer is used to scan the portion of the .debug_line section for the current cu. */ Dwarf_Small *line_ptr = 0; Dwarf_Small *orig_line_ptr = 0; /* Pointer to a DW_AT_stmt_list attribute in case it exists in the die. */ Dwarf_Attribute stmt_list_attr = 0; /* Pointer to DW_AT_comp_dir attribute in die. */ Dwarf_Attribute comp_dir_attr = 0; /* Pointer to name of compilation directory. */ Dwarf_Small *comp_dir = NULL; /* Offset into .debug_line specified by a DW_AT_stmt_list attribute. */ Dwarf_Unsigned line_offset = 0; Dwarf_Sword i=0; Dwarf_Word u=0; /* These variables are used to decode leb128 numbers. Leb128_num holds the decoded number, and leb128_length is its length in bytes. */ Dwarf_Half attrform = 0; /* In case there are wierd bytes 'after' the line table prologue this lets us print something. This is a gcc compiler bug and we expect the bytes count to be 12. */ Dwarf_Small* bogus_bytes_ptr = 0; Dwarf_Unsigned bogus_bytes_count = 0; Dwarf_Half address_size = 0; Dwarf_Unsigned fission_offset = 0; /* The Dwarf_Debug this die belongs to. 
*/ Dwarf_Debug dbg=0; Dwarf_CU_Context cu_context = 0; Dwarf_Line_Context line_context = 0; int resattr = DW_DLV_ERROR; int lres = DW_DLV_ERROR; int res = DW_DLV_ERROR; Dwarf_Small *line_ptr_actuals = 0; Dwarf_Small *line_ptr_end = 0; Dwarf_Small *section_start = 0; /* ***** BEGIN CODE ***** */ if (error != NULL) { *error = NULL; } CHECK_DIE(die, DW_DLV_ERROR); cu_context = die->di_cu_context; dbg = cu_context->cc_dbg; res = _dwarf_load_section(dbg, &dbg->de_debug_line,error); if (res != DW_DLV_OK) { return res; } if (!dbg->de_debug_line.dss_size) { return (DW_DLV_NO_ENTRY); } address_size = _dwarf_get_address_size(dbg, die); resattr = dwarf_attr(die, DW_AT_stmt_list, &stmt_list_attr, error); if (resattr != DW_DLV_OK) { return resattr; } /* The list of relevant FORMs is small. DW_FORM_data4, DW_FORM_data8, DW_FORM_sec_offset */ lres = dwarf_whatform(stmt_list_attr,&attrform,error); if (lres != DW_DLV_OK) { return lres; } if (attrform != DW_FORM_data4 && attrform != DW_FORM_data8 && attrform != DW_FORM_sec_offset ) { _dwarf_error(dbg, error, DW_DLE_LINE_OFFSET_BAD); return (DW_DLV_ERROR); } lres = dwarf_global_formref(stmt_list_attr, &line_offset, error); if (lres != DW_DLV_OK) { return lres; } if (line_offset >= dbg->de_debug_line.dss_size) { _dwarf_error(dbg, error, DW_DLE_LINE_OFFSET_BAD); return (DW_DLV_ERROR); } section_start = dbg->de_debug_line.dss_data; { Dwarf_Unsigned fission_size = 0; int resfis = _dwarf_get_fission_addition_die(die, DW_SECT_LINE, &fission_offset,&fission_size,error); if(resfis != DW_DLV_OK) { return resfis; } } orig_line_ptr = section_start + line_offset + fission_offset; line_ptr = orig_line_ptr; dwarf_dealloc(dbg, stmt_list_attr, DW_DLA_ATTR); /* If die has DW_AT_comp_dir attribute, get the string that names the compilation directory. 
*/ resattr = dwarf_attr(die, DW_AT_comp_dir, &comp_dir_attr, error); if (resattr == DW_DLV_ERROR) { return resattr; } if (resattr == DW_DLV_OK) { int cres = DW_DLV_ERROR; char *cdir = 0; cres = dwarf_formstring(comp_dir_attr, &cdir, error); if (cres == DW_DLV_ERROR) { return cres; } else if (cres == DW_DLV_OK) { comp_dir = (Dwarf_Small *) cdir; } } if (resattr == DW_DLV_OK) { dwarf_dealloc(dbg, comp_dir_attr, DW_DLA_ATTR); } line_context = (Dwarf_Line_Context) _dwarf_get_alloc(dbg, DW_DLA_LINE_CONTEXT, 1); if (line_context == NULL) { _dwarf_error(dbg, error, DW_DLE_ALLOC_FAIL); return (DW_DLV_ERROR); } { Dwarf_Small *newlinep = 0; int dres = _dwarf_read_line_table_header(dbg, cu_context, section_start, line_ptr, dbg->de_debug_line.dss_size, &newlinep, line_context, &bogus_bytes_ptr, &bogus_bytes_count, error, err_count_out); if (dres == DW_DLV_ERROR) { dwarf_srclines_dealloc_b(line_context); return dres; } if (dres == DW_DLV_NO_ENTRY) { dwarf_srclines_dealloc_b(line_context); return dres; } line_ptr_end = line_context->lc_line_ptr_end; line_ptr = newlinep; if (line_context->lc_actuals_table_offset > 0) { line_ptr_actuals = line_context->lc_line_prologue_start + line_context->lc_actuals_table_offset; } } line_context->lc_compilation_directory = comp_dir; if (only_line_header) { /* Just checking for header errors, nothing more here.*/ dwarf_srclines_dealloc_b(line_context); return DW_DLV_OK; } dwarf_printf(dbg, "total line info length %ld bytes," " line offset 0x%" DW_PR_XZEROS DW_PR_DUx " %" DW_PR_DUu "\n", (long) line_context->lc_total_length, line_context->lc_section_offset, line_context->lc_section_offset); if (line_context->lc_version_number <= DW_LINE_VERSION5) { dwarf_printf(dbg, "line table version %d\n",(int) line_context->lc_version_number); } else { dwarf_printf(dbg, "line table version 0x%x\n",(int) line_context->lc_version_number); } dwarf_printf(dbg, "line table length field length %d prologue length %d\n", (int)line_context->lc_length_field_length, 
(int)line_context->lc_prologue_length); dwarf_printf(dbg, "compilation_directory %s\n", comp_dir ? ((char *) comp_dir) : ""); dwarf_printf(dbg, " min instruction length %d\n", (int) line_context->lc_minimum_instruction_length); if (line_context->lc_version_number == EXPERIMENTAL_LINE_TABLES_VERSION) { dwarf_printf(dbg, " actuals table offset " "0x%" DW_PR_XZEROS DW_PR_DUx " logicals table offset " "0x%" DW_PR_XZEROS DW_PR_DUx "\n", line_context->lc_actuals_table_offset, line_context->lc_logicals_table_offset); } if (line_context->lc_version_number == DW_LINE_VERSION5) { dwarf_printf(dbg, " segment selector size %d\n", (int) line_context->lc_segment_selector_size); dwarf_printf(dbg, " address size %d\n", (int) line_context->lc_address_size); } dwarf_printf(dbg, " default is stmt %d\n",(int)line_context->lc_default_is_stmt); dwarf_printf(dbg, " line base %d\n",(int)line_context->lc_line_base); dwarf_printf(dbg, " line_range %d\n",(int)line_context->lc_line_range); dwarf_printf(dbg, " opcode base %d\n",(int)line_context->lc_opcode_base); dwarf_printf(dbg, " standard opcode count %d\n",(int)line_context->lc_std_op_count); for (i = 1; i < line_context->lc_opcode_base; i++) { dwarf_printf(dbg, " opcode[%2d] length %d\n", (int) i, (int) line_context->lc_opcode_length_table[i - 1]); } dwarf_printf(dbg, " include directories count %d\n", (int) line_context->lc_include_directories_count); for (u = 0; u < line_context->lc_include_directories_count; ++u) { dwarf_printf(dbg, " include dir[%u] %s\n", (int) u, line_context->lc_include_directories[u]); } dwarf_printf(dbg, " files count %d\n", (int) line_context->lc_file_entry_count); if (line_context->lc_file_entry_count) { Dwarf_File_Entry fe = line_context->lc_file_entries; Dwarf_File_Entry fe2 = fe; unsigned fiu = 0; for (fiu = 0 ; fe2 ; fe2 = fe->fi_next,++fiu ) { Dwarf_Unsigned tlm2 = 0; Dwarf_Unsigned di = 0; Dwarf_Unsigned fl = 0; fe = fe2; tlm2 = fe->fi_time_last_mod; di = fe->fi_dir_index; fl = fe->fi_file_length; 
dwarf_printf(dbg, " file[%u] %s (file-number: %u) \n", (unsigned) fiu, (char *) fe->fi_file_name, (unsigned)(fiu+1)); dwarf_printf(dbg, " dir index %d\n", (int) di); { time_t tt = (time_t) tlm2; /* ctime supplies newline */ dwarf_printf(dbg, " last time 0x%x %s", (unsigned) tlm2, ctime(&tt)); } dwarf_printf(dbg, " file length %ld 0x%lx\n", (long) fl, (unsigned long) fl); } } if (line_context->lc_version_number == EXPERIMENTAL_LINE_TABLES_VERSION) { /* Print the subprograms list. */ Dwarf_Unsigned count = line_context->lc_subprogs_count; Dwarf_Unsigned exu = 0; Dwarf_Subprog_Entry sub = line_context->lc_subprogs; dwarf_printf(dbg," subprograms count" " %" DW_PR_DUu "\n",count); if (count > 0) { dwarf_printf(dbg," indx file line name\n"); } for (exu = 0 ; exu < count ; exu++,sub++) { dwarf_printf(dbg," [%2" DW_PR_DUu "] %4" DW_PR_DUu " %4" DW_PR_DUu " %s\n", exu+1, sub->ds_decl_file, sub->ds_decl_line, sub->ds_subprog_name); } } { Dwarf_Unsigned offset = 0; if (bogus_bytes_count > 0) { Dwarf_Unsigned wcount = bogus_bytes_count; Dwarf_Unsigned boffset = bogus_bytes_ptr - section_start; dwarf_printf(dbg, "*** DWARF CHECK: the line table prologue header_length " " is %" DW_PR_DUu " too high, we pretend it is smaller." "Section offset: 0x%" DW_PR_XZEROS DW_PR_DUx " (%" DW_PR_DUu ") ***\n", wcount, boffset,boffset); *err_count_out += 1; } offset = line_ptr - section_start; dwarf_printf(dbg, " statement prog offset in section: 0x%" DW_PR_XZEROS DW_PR_DUx " (%" DW_PR_DUu ")\n", offset, offset); } { Dwarf_Bool doaddrs = false; Dwarf_Bool dolines = true; _dwarf_print_line_context_record(dbg,line_context); if (!line_ptr_actuals) { /* Normal single level line table. 
*/ Dwarf_Bool is_single_table = true; Dwarf_Bool is_actuals_table = false; print_line_header(dbg, is_single_table, is_actuals_table); res = read_line_table_program(dbg, line_ptr, line_ptr_end, orig_line_ptr, section_start, line_context, address_size, doaddrs, dolines, is_single_table, is_actuals_table, error, err_count_out); if (res != DW_DLV_OK) { dwarf_srclines_dealloc_b(line_context); return res; } } else { Dwarf_Bool is_single_table = false; Dwarf_Bool is_actuals_table = false; if (line_context->lc_version_number != EXPERIMENTAL_LINE_TABLES_VERSION) { dwarf_srclines_dealloc_b(line_context); _dwarf_error(dbg, error, DW_DLE_VERSION_STAMP_ERROR); return (DW_DLV_ERROR); } /* Read Logicals */ print_line_header(dbg, is_single_table, is_actuals_table); res = read_line_table_program(dbg, line_ptr, line_ptr_actuals, orig_line_ptr, section_start, line_context, address_size, doaddrs, dolines, is_single_table, is_actuals_table, error,err_count_out); if (res != DW_DLV_OK) { dwarf_srclines_dealloc_b(line_context); return res; } if (line_context->lc_actuals_table_offset > 0) { is_actuals_table = true; /* Read Actuals */ print_line_header(dbg, is_single_table, is_actuals_table); res = read_line_table_program(dbg, line_ptr_actuals, line_ptr_end, orig_line_ptr, section_start, line_context, address_size, doaddrs, dolines, is_single_table, is_actuals_table, error, err_count_out); if (res != DW_DLV_OK) { dwarf_srclines_dealloc_b(line_context); return res; } } } } dwarf_srclines_dealloc_b(line_context); return DW_DLV_OK; }
bool EncoderLearnerTagger::read(std::istream *is, std::vector<double> *observed) { scoped_fixed_array<char, BUF_SIZE> line; char *column[8]; std::string sentence; std::vector<LearnerNode *> corpus; ans_path_list_.clear(); bool eos = false; for (;;) { if (!is->getline(line.get(), line.size())) { is->clear(std::ios::eofbit|std::ios::badbit); return true; } eos = (std::strcmp(line.get(), "EOS") == 0 || line[0] == '\0'); LearnerNode *m = new LearnerNode; std::memset(m, 0, sizeof(LearnerNode)); if (eos) { m->stat = MECAB_EOS_NODE; } else { const size_t size = tokenize(line.get(), "\t", column, 2); CHECK_DIE(size == 2) << "format error: " << line.get(); m->stat = MECAB_NOR_NODE; m->surface = mystrdup(column[0]); m->feature = mystrdup(column[1]); m->length = m->rlength = std::strlen(column[0]); } corpus.push_back(m); if (eos) { break; } sentence.append(column[0]); } CHECK_DIE(!sentence.empty()) << "empty sentence"; CHECK_DIE(eos) << "\"EOS\" is not found"; begin_data_.reset_string(sentence); begin_ = begin_data_.get(); initList(); size_t pos = 0; for (size_t i = 0; corpus[i]->stat != MECAB_EOS_NODE; ++i) { LearnerNode *found = 0; for (LearnerNode *node = lookup(pos); node; node = node->bnext) { if (node_cmp_eq(*(corpus[i]), *node, eval_size_, unk_eval_size_)) { found = node; break; } } // cannot find node even using UNKNOWN WORD PROSESSING if (!found) { LearnerNode *node = allocator_->newNode(); node->surface = begin_ + pos; node->length = node->rlength = std::strlen(corpus[i]->surface); node->feature = feature_index_->strdup(corpus[i]->feature); node->stat = MECAB_NOR_NODE; node->fvector = 0; node->wcost = 0.0; node->bnext = begin_node_list_[pos]; begin_node_list_[pos] = node; std::cout << "adding virtual node: " << node->feature << std::endl; } pos += corpus[i]->length; } buildLattice(); LearnerNode* prev = end_node_list_[0]; // BOS prev->anext = 0; pos = 0; for (size_t i = 0; i < corpus.size(); ++i) { LearnerNode *rNode = 0; for (LearnerNode *node = 
begin_node_list_[pos]; node; node = node->bnext) { if (corpus[i]->stat == MECAB_EOS_NODE || node_cmp_eq(*(corpus[i]), *node, eval_size_, unk_eval_size_)) { rNode = node; // take last node } } LearnerPath *lpath = 0; for (LearnerPath *path = rNode->lpath; path; path = path->lnext) { if (prev == path->lnode) { lpath = path; break; } } CHECK_DIE(lpath->fvector) << "lpath is NULL"; for (const int *f = lpath->fvector; *f != -1; ++f) { if (*f >= static_cast<long>(observed->size())) { observed->resize(*f + 1); } ++(*observed)[*f]; } if (lpath->rnode->stat != MECAB_EOS_NODE) { for (const int *f = lpath->rnode->fvector; *f != -1; ++f) { if (*f >= static_cast<long>(observed->size())) { observed->resize(*f + 1); } ++(*observed)[*f]; } } ans_path_list_.push_back(lpath); prev->anext = rNode; prev = rNode; if (corpus[i]->stat == MECAB_EOS_NODE) { break; } pos += std::strlen(corpus[i]->surface); } prev->anext = begin_node_list_[len_]; // connect to EOS begin_node_list_[len_]->anext = 0; for (size_t i = 0 ; i < corpus.size(); ++i) { delete [] corpus[i]->surface; delete [] corpus[i]->feature; delete corpus[i]; } return true; }
bool CharProperty::compile(const char *cfile, const char *ufile, const char *ofile) { scoped_fixed_array<char, BUF_SIZE> line; scoped_fixed_array<char *, 512> col; size_t id = 0; std::vector<Range> range; std::map<std::string, CharInfo> category; std::vector<std::string> category_ary; std::ifstream ifs(WPATH(cfile)); std::istringstream iss(CHAR_PROPERTY_DEF_DEFAULT); std::istream *is = &ifs; if (!ifs) { std::cerr << cfile << " is not found. minimum setting is used" << std::endl; is = &iss; } while (is->getline(line.get(), line.size())) { if (std::strlen(line.get()) == 0 || line[0] == '#') { continue; } const size_t size = tokenize2(line.get(), "\t ", col.get(), col.size()); CHECK_DIE(size >= 2) << "format error: " << line.get(); // 0xFFFF..0xFFFF hoge hoge hgoe # if (std::strncmp(col[0], "0x", 2) == 0) { std::string low = col[0]; std::string high; size_t pos = low.find(".."); if (pos != std::string::npos) { high = low.substr(pos + 2, low.size() - pos - 2); low = low.substr(0, pos); } else { high = low; } Range r; r.low = atohex(low.c_str()); r.high = atohex(high.c_str()); CHECK_DIE(r.low >= 0 && r.low < 0xffff && r.high >= 0 && r.high < 0xffff && r.low <= r.high) << "range error: low=" << r.low << " high=" << r.high; for (size_t i = 1; i < size; ++i) { if (col[i][0] == '#') { break; // skip comments } CHECK_DIE(category.find(std::string(col[i])) != category.end()) << "category [" << col[i] << "] is undefined"; r.c.push_back(col[i]); } range.push_back(r); } else { CHECK_DIE(size >= 4) << "format error: " << line.get(); std::string key = col[0]; CHECK_DIE(category.find(key) == category.end()) << "category " << key << " is already defined"; CharInfo c; std::memset(&c, 0, sizeof(c)); c.invoke = std::atoi(col[1]); c.group = std::atoi(col[2]); c.length = std::atoi(col[3]); c.default_type = id++; category.insert(std::pair<std::string, CharInfo>(key, c)); category_ary.push_back(key); } } CHECK_DIE(category.size() < 18) << "too many categories(>= 18)"; 
CHECK_DIE(category.find("DEFAULT") != category.end()) << "category [DEFAULT] is undefined"; CHECK_DIE(category.find("SPACE") != category.end()) << "category [SPACE] is undefined"; std::istringstream iss2(UNK_DEF_DEFAULT); std::ifstream ifs2(WPATH(ufile)); std::istream *is2 = &ifs2; if (!ifs2) { std::cerr << ufile << " is not found. minimum setting is used." << std::endl; is2 = &iss2; } std::set<std::string> unk; while (is2->getline(line.get(), line.size())) { const size_t n = tokenizeCSV(line.get(), col.get(), 2); CHECK_DIE(n >= 1) << "format error: " << line.get(); const std::string key = col[0]; CHECK_DIE(category.find(key) != category.end()) << "category [" << key << "] is undefined in " << cfile; unk.insert(key); } for (std::map<std::string, CharInfo>::const_iterator it = category.begin(); it != category.end(); ++it) { CHECK_DIE(unk.find(it->first) != unk.end()) << "category [" << it->first << "] is undefined in " << ufile; } std::vector<CharInfo> table(0xffff); { std::vector<std::string> tmp; tmp.push_back("DEFAULT"); const CharInfo c = encode(tmp, &category); std::fill(table.begin(), table.end(), c); } for (std::vector<Range>::const_iterator it = range.begin(); it != range.end(); ++it) { const CharInfo c = encode(it->c, &category); for (int i = it->low; i <= it->high; ++i) { table[i] = c; } } // output binary table { std::ofstream ofs(WPATH(ofile), std::ios::binary|std::ios::out); CHECK_DIE(ofs) << "permission denied: " << ofile; unsigned int size = static_cast<unsigned int>(category.size()); ofs.write(reinterpret_cast<const char*>(&size), sizeof(size)); for (std::vector<std::string>::const_iterator it = category_ary.begin(); it != category_ary.end(); ++it) { char buf[32]; std::fill(buf, buf + sizeof(buf), '\0'); std::strncpy(buf, it->c_str(), sizeof(buf) - 1); ofs.write(reinterpret_cast<const char*>(buf), sizeof(buf)); } ofs.write(reinterpret_cast<const char*>(&table[0]), sizeof(CharInfo) * table.size()); ofs.close(); } return true; }