Beispiel #1
0
  /**
   * Command-line entry point: builds one line of text per sentence by
   * concatenating the first column of every input line, and writes each
   * sentence to the output.
   *
   * Positional arguments are the input files ("-" is used when none are
   * given); "--output" names the output file ("-" when unset).  A line whose
   * first column is "EOS" terminates the current sentence.
   *
   * @param argc  argument count, as received by main().
   * @param argv  argument vector, as received by main().
   * @return 0 when processing finishes or when param.help_version() returns
   *         false; -1 when the command line cannot be parsed.  Files that
   *         cannot be opened are reported through CHECK_DIE.
   */
  static int run(int argc, char **argv) {
    static const MeCab::Option long_options[] = {
      { "output",   'o',  0,   "FILE", "set the output filename" },
      { "version",  'v',  0,   0,    "show the version and exit"   },
      { "help",  'h',  0,   0,    "show this help and exit."   },
      { 0, 0, 0, 0 }
    };

    // Parse the command line exactly once and check the result.  Opening the
    // same Param a second time is redundant at best.
    // NOTE(review): confirm whether Param::open() accumulates rest_args();
    // if it does, a repeated call would list every input file twice.
    MeCab::Param param;
    if (!param.open(argc, argv, long_options)) {
      std::cout << param.what() << "\n\n" <<  COPYRIGHT
                << "\ntry '--help' for more information." << std::endl;
      return -1;
    }

    if (!param.help_version()) {
      return 0;
    }

    // Work on a private copy so that a default input can be appended.
    std::vector<std::string> files = param.rest_args();
    if (files.empty()) {
      files.push_back("-");
    }

    std::string output = param.get<std::string>("output");
    if (output.empty()) output = "-";
    MeCab::ostream_wrapper ofs(output.c_str());
    CHECK_DIE(*ofs) << "permission denied: " << output;

    scoped_fixed_array<char, BUF_SIZE> buf;
    char *col[2];
    std::string str;  // surface text of the sentence being assembled
    for (size_t i = 0; i < files.size(); ++i) {
      MeCab::istream_wrapper ifs(files[i].c_str());
      CHECK_DIE(*ifs) << "no such file or directory: " << files[i];
      while (ifs->getline(buf.get(), buf.size())) {
        const size_t n = tokenize(buf.get(), "\t ", col, 2);
        CHECK_DIE(n <= 2) << "format error: " << buf.get();
        if (std::strcmp(col[0], "EOS") == 0) {
          // "EOS" is a sentence terminator and never part of the text.
          // An empty sentence produces no output line, and the marker must
          // not leak into the next sentence either.
          if (!str.empty()) {
            *ofs << str << std::endl;
            str.clear();
          }
        } else {
          str += col[0];
        }
      }
    }

    return 0;
  }
Beispiel #2
0
/**
 * Command-line driver: parses the options, opens the model, and runs the
 * tagger over every input file, writing the results to the output file.
 *
 * Positional arguments are the input files ("-" is used when none are given);
 * "--output" names the output file ("-" when unset).  "--help", "--version",
 * "--dump-config" and "--dictionary-info" print their information and exit
 * without tagging.
 *
 * @param argc  argument count, as received by main().
 * @param argv  argument vector, as received by main().
 * @return EXIT_SUCCESS when all inputs were processed or an informational
 *         option was served; EXIT_FAILURE on any error (bad options, missing
 *         dictionary resource, unopenable model/file, invalid N, parse error).
 */
int mecab_do(int argc, char **argv) {

    //debug
    // NOTE(review): temporary trace; it is written to std::cout, the same
    // stream this function uses for its own messages.
    std::cout << "[" << __LINE__ << "]: "
              << "mecab_do(int argc, char **argv)" << __FILE__ << std::endl;
    ///

// Prints msg and leaves mecab_do() with EXIT_FAILURE.  There is deliberately
// no trailing semicolon, so that `WHAT_ERROR(x);` expands to exactly one
// statement and stays safe inside an unbraced if/else.
#define WHAT_ERROR(msg) do {                    \
    std::cout << msg << std::endl;              \
    return EXIT_FAILURE; }                      \
  while (0)

//	//debug
//	std::cout << "[" << __LINE__ << "]: " << __FILE__ << std::endl;
//	WHAT_ERROR("ABCDE");
//	///

    MeCab::Param param;
    if (!param.open(argc, argv, MeCab::long_options)) {
        std::cout << param.what() << std::endl;
        return EXIT_FAILURE;
    }

    if (param.get<bool>("help")) {
        std::cout << param.help() << std::endl;
        return EXIT_SUCCESS;
    }

    if (param.get<bool>("version")) {
        std::cout << param.version() << std::endl;
        return EXIT_SUCCESS;
    }

    // Failing to load the dictionary resource is an error like any other
    // start-up failure, so report it through the exit status.
    if (!load_dictionary_resource(&param)) {
        std::cout << param.what() << std::endl;
        return EXIT_FAILURE;
    }

    if (param.get<int>("lattice-level") >= 1) {
        std::cerr << "lattice-level is DEPERCATED. "
                  << "use --marginal or --nbest." << std::endl;
    }

    //debug
    // NOTE(review): temporary trace, see the note at the top of the function.
    std::cout << "[" << __LINE__ << "]: " << __FILE__ << std::endl;
    int dbg = param.get<int>("lattice-level");

    std::cout << "	param.get<int>(\"lattice-level\") => " << dbg << std::endl;

    ///

    MeCab::scoped_ptr<MeCab::ModelImpl> model(new MeCab::ModelImpl);
    if (!model->open(param)) {
        std::cout << MeCab::getLastError() << std::endl;
        return EXIT_FAILURE;
    }

    std::string ofilename = param.get<std::string>("output");
    if (ofilename.empty()) {
        ofilename = "-";
    }

    const int nbest = param.get<int>("nbest");
    if (nbest <= 0 || nbest > NBEST_MAX) {
        WHAT_ERROR("invalid N value");
    }

    MeCab::ostream_wrapper ofs(ofilename.c_str());
    if (!*ofs) {
        WHAT_ERROR("no such file or directory: " << ofilename);
    }

    // Informational options: the request was served, so exit successfully,
    // in line with --help and --version above.
    if (param.get<bool>("dump-config")) {
        param.dump_config(&*ofs);
        return EXIT_SUCCESS;
    }

    if (param.get<bool>("dictionary-info")) {
        for (const MeCab::DictionaryInfo *d = model->dictionary_info();
                d; d = d->next) {
            *ofs << "filename:\t" << d->filename << std::endl;
            *ofs << "version:\t" << d->version << std::endl;
            *ofs << "charset:\t" << d->charset << std::endl;
            *ofs << "type:\t" << d->type   << std::endl;
            *ofs << "size:\t" << d->size << std::endl;
            *ofs << "left size:\t" << d->lsize << std::endl;
            *ofs << "right size:\t" << d->rsize << std::endl;
            *ofs << std::endl;
        }
        return EXIT_SUCCESS;
    }

    // Private copy of the positional arguments so a default can be appended.
    std::vector<std::string> rest = param.rest_args();

    if (rest.empty()) {
        rest.push_back("-");
    }

    // Clamp the requested buffer size into [MIN, MAX]_INPUT_BUFFER_SIZE.
    size_t ibufsize = std::min(MAX_INPUT_BUFFER_SIZE,
                               std::max(param.get<int>
                                        ("input-buffer-size"),
                                        MIN_INPUT_BUFFER_SIZE));

    // In partial mode one buffer holds a whole multi-line sentence.
    const bool partial = param.get<bool>("partial");
    if (partial) {
        ibufsize *= 8;
    }

    MeCab::scoped_array<char> ibuf_data(new char[ibufsize]);
    char *ibuf = ibuf_data.get();

    MeCab::scoped_ptr<MeCab::Tagger> tagger(model->createTagger());

    if (!tagger.get()) {
        WHAT_ERROR("cannot create tagger");
    }

    for (size_t i = 0; i < rest.size(); ++i) {
        MeCab::istream_wrapper ifs(rest[i].c_str());
        if (!*ifs) {
            WHAT_ERROR("no such file or directory: " << rest[i]);
        }

        while (true) {
            if (!partial) {
                ifs->getline(ibuf, ibufsize);
            } else {
                // Collect lines up to and including an "EOS" line or an
                // empty line; together they form one input sentence.
                std::string sentence;
                MeCab::scoped_fixed_array<char, BUF_SIZE> line;
                for (;;) {
                    if (!ifs->getline(line.get(), line.size())) {
                        ifs->clear(std::ios::eofbit|std::ios::badbit);
                        break;
                    }
                    sentence += line.get();
                    sentence += '\n';
                    if (std::strcmp(line.get(), "EOS") == 0 || line[0] == '\0') {
                        break;
                    }
                }
                std::strncpy(ibuf, sentence.c_str(), ibufsize);
                // strncpy() does not terminate the destination when the
                // source fills it completely; terminate explicitly so the
                // tagger never reads past the buffer.  (ibufsize is clamped
                // to at least MIN_INPUT_BUFFER_SIZE above.)
                ibuf[ibufsize - 1] = '\0';
            }
            if (ifs->eof() && !ibuf[0]) {
                // This input is exhausted: move on to the next file instead
                // of leaving the function, so every listed file is processed.
                break;
            }
            if (ifs->fail()) {
                std::cerr << "input-buffer overflow. "
                          << "The line is split. use -b #SIZE option." << std::endl;
                ifs->clear();
            }
            const char *r = (nbest >= 2) ? tagger->parseNBest(nbest, ibuf) :
                            tagger->parse(ibuf);
            if (!r)  {
                WHAT_ERROR(tagger->what());
            }
            *ofs << r << std::flush;
        }
    }

    return EXIT_SUCCESS;

#undef WHAT_ERROR
}
  /**
   * Command-line entry point of the evaluation tool: compares two files
   * sentence by sentence and prints one "precision / recall / F" row per
   * requested level.
   *
   * Usage: <program> [options] output answer.  files[0] is the output to be
   * judged, files[1] the answer to judge it against.  "--level" selects the
   * columns to compare (default "0 -1"; -1 is printed as "LEVEL ALL"),
   * "--output" names the report file ("-" when unset).
   *
   * @param argc  argument count, as received by main().
   * @param argv  argument vector, as received by main().
   * @return true when the report was written; false when the command line is
   *         invalid, fewer than two files are given, or param.help_version()
   *         returns false.  Unopenable files abort through CHECK_DIE.
   */
  static bool eval(int argc, char **argv) {
    static const MeCab::Option long_options[] = {
      { "level",  'l',  "0 -1",    "STR",    "set level of evaluations" },
      { "output", 'o',  0,         "FILE",   "set the output file name" },
      { "version",  'v',  0,   0,    "show the version and exit"   },
      { "help",  'h',  0,   0,    "show this help and exit."   },
      { 0, 0, 0, 0 }
    };

    // Parse the command line exactly once and check the result.  Opening the
    // same Param a second time is redundant at best.
    // NOTE(review): confirm whether Param::open() accumulates rest_args();
    // if it does, a repeated call would defeat the "two files" check below.
    MeCab::Param param;
    if (!param.open(argc, argv, long_options)) {
      std::cout << param.what() << "\n\n" <<  COPYRIGHT
                << "\ntry '--help' for more information." << std::endl;
      // The function returns bool: "-1" would convert to true and be
      // indistinguishable from success.
      return false;
    }

    // NOTE(review): this path has always yielded false ("return 0"); kept
    // as is -- confirm what callers expect after --help / --version.
    if (!param.help_version()) return false;

    const std::vector<std::string> &files = param.rest_args();
    if (files.size() < 2) {
      std::cout << "Usage: " <<
          param.program_name() << " output answer" << std::endl;
      return false;
    }

    std::string output = param.get<std::string>("output");
    if (output.empty()) output = "-";
    MeCab::ostream_wrapper ofs(output.c_str());
    CHECK_DIE(*ofs) << "no such file or directory: " << output;

    const std::string level_str = param.get<std::string>("level");

    std::ifstream ifs1(files[0].c_str());
    std::ifstream ifs2(files[1].c_str());

    CHECK_DIE(ifs1) << "no such file or directory: " << files[0].c_str();
    CHECK_DIE(ifs2) << "no such file or directory: " << files[1].c_str();
    CHECK_DIE(!level_str.empty()) << "level_str is NULL";

    std::vector<int> level;
    parseLevel(level_str.c_str(), &level);
    CHECK_DIE(level.size()) << "level_str is empty: " << level_str;

    // result_tbl[i]: number of aligned entries whose column i is identical
    // in both files.
    std::vector<size_t> result_tbl(level.size(), 0);

    size_t prec = 0;    // entries seen in files[0]
    size_t recall = 0;  // entries seen in files[1]

    std::vector<std::vector<std::string> > r1;
    std::vector<std::vector<std::string> > r2;

    while (true) {
      if (!read(&ifs1, &r1, level) || !read(&ifs2, &r2, level))
        break;

      size_t i1 = 0;  // index of the next entry in r1
      size_t i2 = 0;  // index of the next entry in r2
      size_t p1 = 0;  // summed size of column 0 over the consumed r1 entries
      size_t p2 = 0;  // summed size of column 0 over the consumed r2 entries

      while (i1 < r1.size() && i2 < r2.size()) {
        if (p1 == p2) {
          // Both entries start at the same offset: compare them column by
          // column and advance both sides.
          for (size_t i = 0; i < result_tbl.size(); ++i) {
            if (r1[i1][i] == r2[i2][i]) {
              result_tbl[i]++;
            }
          }
          p1 += r1[i1][0].size();
          p2 += r2[i2][0].size();
          ++i1;
          ++i2;
          ++prec;
          ++recall;
        } else if (p1 < p2) {
          // r1 is behind: skip one of its entries to catch up.
          p1 += r1[i1][0].size();
          ++i1;
          ++prec;
        } else {
          // r2 is behind: skip one of its entries to catch up.
          p2 += r2[i2][0].size();
          ++i2;
          ++recall;
        }
      }

      // Entries left over on either side still count towards the totals.
      prec += r1.size() - i1;
      recall += r2.size() - i2;
    }

    *ofs <<  "              precision          recall         F"
         << std::endl;
    for (size_t i = 0; i < result_tbl.size(); ++i) {
      if (level[i] == -1) {
        *ofs << "LEVEL ALL: ";
      } else {
        *ofs << "LEVEL " << level[i] << ":    ";
      }
      printeval(&*ofs, result_tbl[i], prec, recall);
    }

    return true;
  }