// Expands every template in bigram_templs_ into a feature string, registers
// each expanded string through ADDB, and finally hands the collected features
// to COPY_FEATURE(path->fvector).
//
// Template syntax handled here:
//   \x      -> getEscapedChar(x)
//   %L...   -> field selected by getIndex() from the tokenized copy of rfeature
//   %R...   -> field selected by getIndex() from the tokenized copy of lfeature
//   %l      -> lfeature, unmodified
//   %r      -> rfeature, unmodified
//   other   -> copied verbatim
//
// A template is skipped entirely (nothing is registered for it) when getIndex()
// returns a null pointer for one of its %L / %R references.
//
// Returns true on completion. An unknown character after '%' goes through
// CHECK_FALSE, which is defined elsewhere (presumably it records the message
// and returns false - confirm against the macro).
//
// NOTE(review): lbuf is filled from rfeature and rbuf from lfeature. That
// crossing is kept exactly as it was; confirm it is intentional.
// NOTE(review): a template ending in a lone backslash moves p onto the
// terminator in the '\\' case and the loop increment then steps past it.
bool FeatureIndex::buildBigramFeature(LearnerPath *path,
                                      const char *rfeature,
                                      const char *lfeature) {
  char rbuf[BUFSIZE];
  char lbuf[BUFSIZE];
  char *R[POSSIZE];
  char *L[POSSIZE];

  feature_.clear();

  // std::strncpy does not write a terminator when the source holds BUFSIZE or
  // more characters, so copy one character less and terminate explicitly.
  // Over-long feature strings are truncated instead of overrunning the buffer
  // during tokenization.
  std::strncpy(lbuf, rfeature, BUFSIZE - 1);
  lbuf[BUFSIZE - 1] = '\0';
  std::strncpy(rbuf, lfeature, BUFSIZE - 1);
  rbuf[BUFSIZE - 1] = '\0';

  // tokenizeCSV fills L / R with pointers to the individual fields.
  const size_t lsize = tokenizeCSV(lbuf, L, POSSIZE);
  const size_t rsize = tokenizeCSV(rbuf, R, POSSIZE);

  for (std::vector<const char*>::const_iterator it = bigram_templs_.begin();
       it != bigram_templs_.end(); ++it) {
    const char *p = *it;
    os_.clear();

    for (; *p; p++) {
      switch (*p) {
        default:
          os_ << *p;
          break;
        case '\\':
          os_ << getEscapedChar(*++p);
          break;
        case '%': {
          switch (*++p) {
            case 'L': {
              // getIndex receives &p, so it can advance p itself.
              const char *r = getIndex(const_cast<char **>(&p), L, lsize);
              if (!r) goto NEXT;  // field unavailable: drop this template
              os_ << r;
            } break;
            case 'R': {
              const char *r = getIndex(const_cast<char **>(&p), R, rsize);
              if (!r) goto NEXT;  // field unavailable: drop this template
              os_ << r;
            } break;
            case 'l':
              os_ << lfeature;  // use lfeature as it is
              break;
            case 'r':
              os_ << rfeature;  // use rfeature as it is
              break;
            default:
              CHECK_FALSE(false) << "unkonwn meta char: " << *p;
          }
        }
      }
    }

    os_ << '\0';
    ADDB(os_.str());

 NEXT:
    continue;
  }

  COPY_FEATURE(path->fvector);

  return true;
}
// Returns a copy of str in which every character that occurs in specialChars
// is replaced by a backslash followed by getEscapedChar(<that character>).
// All other characters are copied unchanged.
String addEscapes(const String& str, const String& specialChars) {
  String escaped;
  for (unsigned pos = 0; pos < str.length(); ++pos) {
    const bool isPlain =
        (specialChars.find_first_of(str[pos]) == String::npos);
    if (isPlain) {
      // Not a special character: pass it through untouched.
      escaped += str[pos];
      continue;
    }
    // Special character: emit the escape introducer and its mapped form.
    escaped += '\\';
    escaped += getEscapedChar(str[pos]);
  }
  return escaped;
}
// Expands every template in unigram_templs_ into a feature string, registers
// each expanded string through ADDB, and finally hands the collected features
// to COPY_FEATURE(path->rnode->fvector).
//
// Template syntax handled here:
//   \x      -> getEscapedChar(x)
//   %F...   -> field selected by getIndex() from the tokenized copy of ufeature
//   %t      -> path->rnode->char_type, written as a number
//   %u      -> ufeature, unmodified
//   other   -> copied verbatim
//
// A template is skipped entirely (nothing is registered for it) when getIndex()
// returns a null pointer for one of its %F references.
//
// Returns true on completion. An unknown character after '%' goes through
// CHECK_FALSE, which is defined elsewhere (presumably it records the message
// and returns false - confirm against the macro).
//
// NOTE(review): a template ending in a lone backslash moves p onto the
// terminator in the '\\' case and the loop increment then steps past it.
bool FeatureIndex::buildUnigramFeature(LearnerPath *path,
                                       const char *ufeature) {
  char ubuf[BUFSIZE];
  char *F[POSSIZE];

  feature_.clear();

  // std::strncpy does not write a terminator when the source holds BUFSIZE or
  // more characters, so copy one character less and terminate explicitly.
  // Over-long feature strings are truncated instead of overrunning the buffer
  // during tokenization.
  std::strncpy(ubuf, ufeature, BUFSIZE - 1);
  ubuf[BUFSIZE - 1] = '\0';

  // tokenizeCSV fills F with pointers to the individual fields.
  const size_t usize = tokenizeCSV(ubuf, F, POSSIZE);

  for (std::vector<const char*>::const_iterator it = unigram_templs_.begin();
       it != unigram_templs_.end(); ++it) {
    const char *p = *it;
    os_.clear();

    for (; *p; p++) {
      switch (*p) {
        default:
          os_ << *p;
          break;
        case '\\':
          os_ << getEscapedChar(*++p);
          break;
        case '%': {
          switch (*++p) {
            case 'F': {
              // getIndex receives &p, so it can advance p itself.
              const char *r = getIndex(const_cast<char **>(&p), F, usize);
              if (!r) goto NEXT;  // field unavailable: drop this template
              os_ << r;
            } break;
            case 't':
              os_ << static_cast<size_t>(path->rnode->char_type);
              break;
            case 'u':
              os_ << ufeature;  // use ufeature as it is
              break;
            default:
              CHECK_FALSE(false) << "unkonwn meta char: " << *p;
          }
        }
      }
    }

    os_ << '\0';
    ADDB(os_.str());

 NEXT:
    continue;
  }

  COPY_FEATURE(path->rnode->fvector);

  return true;
}
bool Writer::writeNode(Lattice *lattice, const char *p, const Node *node, StringBuffer *os) const { scoped_fixed_array<char, BUF_SIZE> buf; scoped_fixed_array<char *, 64> ptr; size_t psize = 0; for (; *p; p++) { switch (*p) { default: *os << *p; break; case '\\': *os << getEscapedChar(*++p); break; case '%': { // macros switch (*++p) { default: { const std::string error = "unknown meta char: " + *p; lattice->set_what(error.c_str()); return false; } // input sentence case 'S': os->write(lattice->sentence(), lattice->size()); break; // sentence length case 'L': *os << lattice->size(); break; // morph case 'm': os->write(node->surface, node->length); break; case 'M': os->write(reinterpret_cast<const char *> (node->surface - node->rlength + node->length), node->rlength); break; case 'h': *os << node->posid; break; // Part-Of-Speech ID case '%': *os << '%'; break; // % case 'c': *os << static_cast<int>(node->wcost); break; // word cost case 'H': *os << node->feature; break; case 't': *os << static_cast<unsigned int>(node->char_type); break; case 's': *os << static_cast<unsigned int>(node->stat); break; case 'P': *os << node->prob; break; case 'p': { switch (*++p) { default: lattice->set_what("[iseSCwcnblLh] is required after %p"); return false; case 'i': *os << node->id; break; // node id case 'S': os->write(reinterpret_cast<const char*> (node->surface - node->rlength + node->length), node->rlength - node->length); break; // space // start position case 's': *os << static_cast<int>( node->surface - lattice->sentence()); break; // end position case 'e': *os << static_cast<int> (node->surface - lattice->sentence() + node->length); break; // connection cost case 'C': *os << node->cost - node->prev->cost - node->wcost; break; case 'w': *os << node->wcost; break; // word cost case 'c': *os << node->cost; break; // best cost case 'n': *os << (node->cost - node->prev->cost); break; // node cost // * if best path, otherwise ' ' case 'b': *os << (node->isbest ? 
'*' : ' '); break; case 'P': *os << node->prob; break; case 'A': *os << node->alpha; break; case 'B': *os << node->beta; break; case 'l': *os << node->length; break; // length of morph // length of morph including the spaces case 'L': *os << node->rlength; break; case 'h': { // Hidden Layer ID switch (*++p) { default: lattice->set_what("lr is required after %ph"); return false; case 'l': *os << node->lcAttr; break; // current case 'r': *os << node->rcAttr; break; // prev } } break; case 'p': { char mode = *++p; char sep = *++p; if (sep == '\\') { sep = getEscapedChar(*++p); } if (!node->lpath) { lattice->set_what("no path information is available"); return false; } for (Path *path = node->lpath; path; path = path->lnext) { if (path != node->lpath) *os << sep; switch (mode) { case 'i': *os << path->lnode->id; break; case 'c': *os << path->cost; break; case 'P': *os << path->prob; break; default: lattice->set_what("[icP] is required after %pp"); return false; } } } break; } } break; case 'F': case 'f': { if (node->feature[0] == '\0') { lattice->set_what("no feature information available"); return false; } if (!psize) { strncpy_s(buf.get(), sizeof(buf.get()), node->feature, buf.size()); psize = tokenizeCSV(buf.get(), ptr.get(), ptr.size()); } // separator char separator = '\t'; // default separator if (*p == 'F') { // change separator if (*++p == '\\') { separator = getEscapedChar(*++p); } else { separator = *p; } } if (*++p !='[') { lattice->set_what("cannot find '['"); return false; } size_t n = 0; bool sep = false; bool isfil = false; p++; for (;; ++p) { switch (*p) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = 10 * n +(*p - '0'); break; case ',': case ']': if (n >= psize) { lattice->set_what("given index is out of range"); return false; } isfil = (ptr[n][0] != '*'); if (isfil) { if (sep) { *os << separator; } *os << ptr[n]; } if (*p == ']') { goto last; } sep = isfil; n = 0; break; default: 
lattice->set_what("cannot find ']'"); return false; } } } last: break; } // end switch } break; // end case '%' } // end switch } return true; }