// Registers a string rule named "1gram" directly on the converter (match_all
// key matcher, character_ngram(1) splitter, FREQ_BINARY/TERM_BINARY weights)
// and checks the features produced for the string value "/id" = "a b".
TEST(datum_to_fv_converter, register_string_rule) {
  typedef std::vector<std::pair<std::string, float> > feature_list;

  datum_to_fv_converter conv;
  init_weight_manager(conv);
  initialize_converter(converter_config(), conv);

  // Build the rule components and register them under the name "1gram".
  std::vector<splitter_weight_type> weights(
      1, splitter_weight_type(FREQ_BINARY, TERM_BINARY));
  shared_ptr<word_splitter> splitter(new character_ngram(1));
  shared_ptr<key_matcher> matcher(new match_all());
  conv.register_string_rule("1gram", matcher, splitter, weights);

  // Convert a datum holding a single string value.
  datum input;
  input.string_values_.push_back(std::make_pair("/id", "a b"));

  feature_list actual;
  conv.convert(input, actual);

  // Expected: one binary-weighted feature for each of 'a', ' ' and 'b'.
  feature_list expected;
  expected.push_back(std::make_pair("/id$a@1gram#bin/bin", 1.));
  expected.push_back(std::make_pair("/id$ @1gram#bin/bin", 1.));
  expected.push_back(std::make_pair("/id$b@1gram#bin/bin", 1.));

  // Compare as sorted sequences so the emission order is irrelevant.
  std::sort(actual.begin(), actual.end());
  std::sort(expected.begin(), expected.end());

  ASSERT_EQ(expected, actual);
}
int main(int ac, char **av) { char *cmd = *av; char *normalizer[MAX_NORMALIZER]; int nnormalizer = 0; const char *in_code = NULL; const char *out_code = NULL; char *resconf_file = NULL; int no_resconf = 0; char zld[256 + 1]; int zld_specified = 0; int auto_zld = 0; char *encoding_alias = NULL; int selective = 1; FILE *fp; mdn_resconf_t resconf; #ifdef HAVE_SETLOCALE (void)setlocale(LC_ALL, ""); #endif zld[0] = '\0'; ac--; av++; while (ac > 0 && **av == '-') { #define MUST_HAVE_ARG if (ac < 2) usage(cmd) if (strcmp(*av, "-in") == 0) { MUST_HAVE_ARG; in_code = av[1]; ac--; av++; } else if (strcmp(*av, "-out") == 0) { MUST_HAVE_ARG; out_code = av[1]; ac--; av++; } else if (strcmp(*av, "-conf") == 0) { MUST_HAVE_ARG; resconf_file = av[1]; ac--; av++; } else if (strcmp(*av, "-noconf") == 0) { no_resconf = 1; } else if (strcmp(*av, "-zld") == 0) { MUST_HAVE_ARG; canonical_zld(zld, av[1]); zld_specified = 1; ac--; av++; } else if (strcmp(*av, "-auto") == 0) { auto_zld = 1; } else if (strcmp(*av, "-normalize") == 0) { MUST_HAVE_ARG; if (nnormalizer >= MAX_NORMALIZER) { errormsg("too many normalizers\n"); exit(1); } normalizer[nnormalizer++] = av[1]; ac--; av++; } else if (strcmp(*av, "-alias") == 0) { MUST_HAVE_ARG; encoding_alias = *av; } else if (strcmp(*av, "-flush") == 0) { flush_every_line = 1; } else if (strcmp(*av, "-whole") == 0) { selective = 0; } else { usage(cmd); } #undef MUST_HAVE_ARG ac--; av++; } if (ac > 1) usage(cmd); /* * Load configuration file. */ resconf = NULL; if (!no_resconf) { mdn_result_t r; r = mdn_resconf_initialize(); if (r == mdn_success) r = mdn_resconf_create(&resconf); if (r == mdn_success) r = mdn_resconf_loadfile(resconf, resconf_file); if (r != mdn_success) { errormsg("error reading configuration file: %s\n", mdn_result_tostring(r)); return (1); } } /* * Get default input/output code. 
*/ if (in_code == NULL) in_code = mdn_localencoding_name(); if (out_code == NULL) { mdn_converter_t c; if (resconf != NULL && (c = mdn_resconf_serverconverter(resconf)) != NULL) out_code = mdn_converter_localencoding(c); } if (in_code == NULL) { errormsg("input codeset must be specified\n"); return (1); } if (out_code == NULL) { errormsg("output codeset must be specified\n"); return (1); } /* * Initialize codeset converter. */ if (!initialize_converter(in_code, out_code, encoding_alias)) return (1); /* * Initialize normalizer. */ if (nnormalizer == 0 && resconf != NULL) norm_ctx = mdn_resconf_normalizer(resconf); if (norm_ctx == NULL && !initialize_normalizer(normalizer, nnormalizer)) return (1); /* * Default ZLD. */ if (!zld_specified && resconf != NULL) { const char *conf_zld = mdn_resconf_zld(resconf); if (conf_zld != NULL) canonical_zld(zld, conf_zld); } /* * Open input file. */ if (ac > 0) { if ((fp = fopen(av[0], "r")) == NULL) { errormsg("cannot open file %s: %s\n", av[0], strerror(errno)); return (1); } } else { fp = stdin; } /* * Do the conversion. */ return convert_file(fp, zld, auto_zld, selective); }