bool FeatureIndex::buildBigramFeature(LearnerPath *path,
                                      const char *rfeature,
                                      const char *lfeature) {
  char rbuf[BUFSIZE];
  char lbuf[BUFSIZE];
  char *R[POSSIZE];
  char *L[POSSIZE];

  feature_.clear();
  std::strncpy(lbuf,  rfeature, BUFSIZE);
  std::strncpy(rbuf,  lfeature, BUFSIZE);

  size_t lsize = tokenizeCSV(lbuf, L, POSSIZE);
  size_t rsize = tokenizeCSV(rbuf, R, POSSIZE);

  for (std::vector<const char*>::const_iterator it = bigram_templs_.begin();
       it != bigram_templs_.end(); ++it) {
    const char *p = *it;
    os_.clear();

    for (; *p; p++) {
      switch (*p) {
        default: os_ << *p; break;
        case '\\': os_ << getEscapedChar(*++p); break;
        case '%': {
          switch (*++p) {
            case 'L': {
              const char *r = getIndex(const_cast<char **>(&p), L, lsize);
              if (!r) goto NEXT;
              os_ << r;
            } break;
            case 'R': {
              const char *r = getIndex(const_cast<char **>(&p), R, rsize);
              if (!r) goto NEXT;
              os_ << r;
            } break;
            case 'l':  os_ << lfeature; break;  // use lfeature as it is
            case 'r':  os_ << rfeature; break;
            default:
              CHECK_FALSE(false) << "unkonwn meta char: " <<  *p;
          }
        }
      }
    }

    os_ << '\0';

    ADDB(os_.str());

 NEXT: continue;
  }

  COPY_FEATURE(path->fvector);

  return true;
}
Ejemplo n.º 2
0
String addEscapes(const String& str, const String& specialChars)
{
  String ret;
  for (unsigned i = 0; i < str.length(); i++) {
    if (String::npos != specialChars.find_first_of(str[i])) {
      ret += '\\';
      ret += getEscapedChar(str[i]);
    } else {
      ret += str[i];
    }
  }

  return ret;
}
bool FeatureIndex::buildUnigramFeature(LearnerPath *path,
                                       const char *ufeature) {
  char ubuf[BUFSIZE];
  char *F[POSSIZE];

  feature_.clear();
  std::strncpy(ubuf, ufeature, BUFSIZE);
  size_t usize = tokenizeCSV(ubuf, F, POSSIZE);

  for (std::vector<const char*>::const_iterator it = unigram_templs_.begin();
       it != unigram_templs_.end(); ++it) {
    const char *p = *it;
    os_.clear();

    for (; *p; p++) {
      switch (*p) {
        default: os_ << *p; break;
        case '\\': os_ << getEscapedChar(*++p); break;
        case '%': {
          switch (*++p) {
            case 'F':  {
              const char *r = getIndex(const_cast<char **>(&p), F, usize);
              if (!r) goto NEXT;
              os_ << r;
            } break;
            case 't':  os_ << (size_t)path->rnode->char_type;     break;
            case 'u':  os_ << ufeature; break;
            default:
              CHECK_FALSE(false) << "unkonwn meta char: " <<  *p;
          }
        }
      }
    }

    os_ << '\0';
    ADDB(os_.str());

 NEXT: continue;
  }

  COPY_FEATURE(path->rnode->fvector);

  return true;
}
Ejemplo n.º 4
0
bool Writer::writeNode(Lattice *lattice,
                       const char *p,
                       const Node *node,
                       StringBuffer *os) const {
  scoped_fixed_array<char, BUF_SIZE> buf;
  scoped_fixed_array<char *, 64> ptr;
  size_t psize = 0;

  for (; *p; p++) {
    switch (*p) {
      default: *os << *p; break;

      case '\\': *os << getEscapedChar(*++p); break;

      case '%': {  // macros
        switch (*++p) {
          default: {
            const std::string error = "unknown meta char: " + *p;
            lattice->set_what(error.c_str());
            return false;
          }
            // input sentence
          case 'S': os->write(lattice->sentence(), lattice->size()); break;
            // sentence length
          case 'L': *os << lattice->size(); break;
            // morph
          case 'm': os->write(node->surface, node->length); break;
          case 'M': os->write(reinterpret_cast<const char *>
                              (node->surface - node->rlength + node->length),
                              node->rlength);
            break;
          case 'h': *os << node->posid; break;  // Part-Of-Speech ID
          case '%': *os << '%'; break;         // %
          case 'c': *os << static_cast<int>(node->wcost); break;  // word cost
          case 'H': *os << node->feature; break;
          case 't': *os << static_cast<unsigned int>(node->char_type); break;
          case 's': *os << static_cast<unsigned int>(node->stat); break;
          case 'P': *os << node->prob; break;
          case 'p': {
            switch (*++p) {
              default:
                lattice->set_what("[iseSCwcnblLh] is required after %p");
                return false;
              case 'i': *os << node->id; break;  // node id
              case 'S': os->write(reinterpret_cast<const char*>
                                  (node->surface -
                                   node->rlength + node->length),
                                  node->rlength - node->length);
                break;  // space
                // start position
              case 's': *os << static_cast<int>(
                  node->surface - lattice->sentence());
                break;
                // end position
              case 'e': *os << static_cast<int>
                    (node->surface - lattice->sentence() + node->length);
                break;
                // connection cost
              case 'C': *os << node->cost -
                    node->prev->cost - node->wcost;
                break;
              case 'w': *os << node->wcost; break;  // word cost
              case 'c': *os << node->cost; break;  // best cost
              case 'n': *os << (node->cost - node->prev->cost); break;
                // node cost
                // * if best path, otherwise ' '
              case 'b': *os << (node->isbest ? '*' : ' '); break;
              case 'P': *os << node->prob; break;
              case 'A': *os << node->alpha; break;
              case 'B': *os << node->beta; break;
              case 'l': *os << node->length; break;  // length of morph
                // length of morph including the spaces
              case 'L': *os << node->rlength;    break;
              case 'h': {  // Hidden Layer ID
                switch (*++p) {
                  default:
                    lattice->set_what("lr is required after %ph");
                    return false;
                  case 'l': *os << node->lcAttr; break;   // current
                  case 'r': *os << node->rcAttr; break;   // prev
                }
              } break;

              case 'p': {
                char mode = *++p;
                char sep = *++p;
                if (sep == '\\') {
                  sep = getEscapedChar(*++p);
                }
                if (!node->lpath) {
                  lattice->set_what("no path information is available");
                  return false;
                }
                for (Path *path = node->lpath; path; path = path->lnext) {
                  if (path != node->lpath) *os << sep;
                  switch (mode) {
                    case 'i': *os << path->lnode->id; break;
                    case 'c': *os << path->cost; break;
                    case 'P': *os << path->prob; break;
                    default:
                      lattice->set_what("[icP] is required after %pp");
                      return false;
                  }
                }
              } break;

            }
          } break;

          case 'F':
          case 'f': {
            if (node->feature[0] == '\0') {
              lattice->set_what("no feature information available");
              return false;
            }
            if (!psize) {
				strncpy_s(buf.get(), sizeof(buf.get()), node->feature, buf.size());
              psize = tokenizeCSV(buf.get(), ptr.get(), ptr.size());
            }

            // separator
            char separator = '\t';  // default separator
            if (*p == 'F') {  // change separator
              if (*++p == '\\') {
                separator = getEscapedChar(*++p);
              } else {
                separator = *p;
              }
            }

            if (*++p !='[') {
              lattice->set_what("cannot find '['");
              return false;
            }
            size_t n = 0;
            bool sep = false;
            bool isfil = false;
            p++;

            for (;; ++p) {
              switch (*p) {
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                  n = 10 * n +(*p - '0');
                  break;
                case ',': case ']':
                  if (n >= psize) {
                    lattice->set_what("given index is out of range");
                    return false;
                  }
                  isfil = (ptr[n][0] != '*');
                  if (isfil) {
                    if (sep) {
                      *os << separator;
                    }
                    *os << ptr[n];
                  }
                  if (*p == ']') {
                    goto last;
                  }
                  sep = isfil;
                  n = 0;
                  break;
                default:
                  lattice->set_what("cannot find ']'");
                  return false;
              }
            }
          } last: break;
        }  // end switch
      } break;  // end case '%'
    }  // end switch
  }

  return true;
}