Beispiel #1
0
N *Tokenizer<N, P>::lookup(const char *begin, const char *end,
                           Allocator<N, P> *allocator, Lattice *lattice) const {
  CharInfo cinfo;
  N *result_node = 0;
  size_t mblen = 0;
  size_t clen = 0;

  end = static_cast<size_t>(end - begin) >= 65535 ? begin + 65535 : end;

  if (isPartial) {
    const size_t begin_pos = begin - lattice->sentence();
    for (size_t n = begin_pos + 1; n < lattice->size(); ++n) {
      if (lattice->boundary_constraint(n) == MECAB_TOKEN_BOUNDARY) {
        end = lattice->sentence() + n;
        break;
      }
    }
  }

  const char *begin2 = property_.seekToOtherType(begin, end, space_,
                                                 &cinfo, &mblen, &clen);

  Dictionary::result_type *daresults = allocator->mutable_results();
  const size_t results_size = allocator->results_size();

  for (std::vector<Dictionary *>::const_iterator it = dic_.begin();
       it != dic_.end(); ++it) {
    const size_t n = (*it)->commonPrefixSearch(
        begin2,
        static_cast<size_t>(end - begin2),
        daresults, results_size);

    for (size_t i = 0; i < n; ++i) {
      size_t size = (*it)->token_size(daresults[i]);
      const Token *token = (*it)->token(daresults[i]);
      for (size_t j = 0; j < size; ++j) {
        N *new_node = allocator->newNode();
        read_node_info(**it, *(token + j), &new_node);
        new_node->length = daresults[i].length;
        new_node->rlength = begin2 - begin + new_node->length;
        new_node->surface = begin2;
        new_node->stat = MECAB_NOR_NODE;
        new_node->char_type = cinfo.default_type;
        if (isPartial && !is_valid_node(lattice, new_node)) {
          continue;
        }
        new_node->bnext = result_node;
        result_node = new_node;
      }
    }
  }

  if (result_node && !cinfo.invoke) {
    return result_node;
  }

  const char *begin3 = begin2 + mblen;
  const char *group_begin3 = 0;

  if (begin3 > end) {
    ADDUNKNWON;
    if (result_node) {
    return result_node;
  }
  }

  if (cinfo.group) {
    const char *tmp = begin3;
    CharInfo fail;
    begin3 = property_.seekToOtherType(begin3, end, cinfo,
                                       &fail, &mblen, &clen);
    if (clen <= max_grouping_size_) {
      ADDUNKNWON;
    }
    group_begin3 = begin3;
    begin3 = tmp;
  }

  for (size_t i = 1; i <= cinfo.length; ++i) {
    if (begin3 > end) {
      break;
    }
    if (begin3 == group_begin3) {
      continue;
    }
    clen = i;
    ADDUNKNWON;
    if (!cinfo.isKindOf(property_.getCharInfo(begin3, end, &mblen))) {
      break;
    }
    begin3 += mblen;
  }

  if (!result_node) {
    ADDUNKNWON;
  }

  if (isPartial && !result_node) {
    begin3 = begin2;
    while (true) {
      cinfo = property_.getCharInfo(begin3, end, &mblen);
      begin3 += mblen;
      if (begin3 > end ||
          lattice->boundary_constraint(begin3 - lattice->sentence())
          != MECAB_INSIDE_TOKEN) {
        break;
      }
    }
    ADDUNKNWON;

    if (!result_node) {
      N *new_node = allocator->newNode();
      new_node->char_type = cinfo.default_type;
      new_node->surface = begin2;
      new_node->length = begin3 - begin2;
      new_node->rlength = begin3 - begin;
      new_node->stat = MECAB_UNK_NODE;
      new_node->bnext = result_node;
      new_node->feature =
          lattice->feature_constraint(begin - lattice->sentence());
      CHECK_DIE(new_node->feature);
      result_node = new_node;
    }
  }

  return result_node;
}
N *Tokenizer<N, P>::lookup(const char *begin, const char *end,
                           Allocator<N, P> *allocator) const {
  CharInfo cinfo;
  N *result_node = 0;
  size_t mblen = 0;
  size_t clen = 0;

  end = static_cast<size_t>(end - begin) >= 65535 ? begin + 65535 : end;
  const char *begin2 = property_.seekToOtherType(begin, end, space_,
                                                 &cinfo, &mblen, &clen);

  Dictionary::result_type *daresults = allocator->mutable_results();
  const size_t results_size = allocator->results_size();

  for (std::vector<Dictionary *>::const_iterator it = dic_.begin();
       it != dic_.end(); ++it) {
    const size_t n = (*it)->commonPrefixSearch(
        begin2,
        static_cast<size_t>(end - begin2),
        daresults, results_size);

    for (size_t i = 0; i < n; ++i) {
      size_t size = (*it)->token_size(daresults[i]);
      const Token *token = (*it)->token(daresults[i]);
      for (size_t j = 0; j < size; ++j) {
        N *new_node = allocator->newNode();
        read_node_info(**it, *(token + j), &new_node);
        new_node->length = daresults[i].length;
        new_node->rlength = begin2 - begin + new_node->length;
        new_node->surface = begin2;
        new_node->stat = MECAB_NOR_NODE;
        new_node->char_type = cinfo.default_type;
        new_node->bnext = result_node;
        result_node = new_node;
      }
    }
  }

  if (result_node && !cinfo.invoke) {
    return result_node;
  }

  const char *begin3 = begin2 + mblen;
  const char *group_begin3 = 0;

  if (begin3 > end) {
    ADDUNKNWON;
    return result_node;
  }

  if (cinfo.group) {
    const char *tmp = begin3;
    CharInfo fail;
    begin3 = property_.seekToOtherType(begin3, end, cinfo,
                                       &fail, &mblen, &clen);
    if (clen <= max_grouping_size_) {
      ADDUNKNWON;
    }
    group_begin3 = begin3;
    begin3 = tmp;
  }

  for (size_t i = 1; i <= cinfo.length; ++i) {
    if (begin3 > end) {
      break;
    }
    if (begin3 == group_begin3) {
      continue;
    }
    clen = i;
    ADDUNKNWON;
    if (!cinfo.isKindOf(property_.getCharInfo(begin3, end, &mblen))) {
      break;
    }
    begin3 += mblen;
  }

  if (!result_node) {
    ADDUNKNWON;
  }

  return result_node;
}