// Registers a string rule named "1gram" (character_ngram(1) splitter,
// match_all key matcher, FREQ_BINARY/TERM_BINARY weighting) and checks that
// converting a datum holding the string value ("/id", "a b") produces exactly
// three features, one each for 'a', ' ' and 'b', all with weight 1.
TEST(datum_to_fv_converter, register_string_rule) {
  typedef std::vector<std::pair<std::string, float> > feature_vector;

  // Converter under test, set up with an empty configuration.
  datum_to_fv_converter fv_conv;
  init_weight_manager(fv_conv);
  initialize_converter(converter_config(), fv_conv);

  // Assemble the pieces of the rule and register it.
  shared_ptr<key_matcher> matcher(new match_all());
  shared_ptr<word_splitter> splitter(new character_ngram(1));
  std::vector<splitter_weight_type> weights(
      1, splitter_weight_type(FREQ_BINARY, TERM_BINARY));
  fv_conv.register_string_rule("1gram", matcher, splitter, weights);

  // Input: a single string value keyed by "/id".
  datum input;
  input.string_values_.push_back(std::make_pair("/id", "a b"));

  feature_vector feature;
  fv_conv.convert(input, feature);

  // Expected feature names; each one carries a weight of 1.
  static const char* const expected_keys[] = {
    "/id$a@1gram#bin/bin",
    "/id$ @1gram#bin/bin",
    "/id$b@1gram#bin/bin",
  };
  feature_vector exp;
  for (unsigned i = 0; i < sizeof(expected_keys) / sizeof(expected_keys[0]);
       ++i) {
    exp.push_back(std::make_pair(std::string(expected_keys[i]), 1.0f));
  }

  // The comparison is order-insensitive: sort both sides first.
  std::sort(exp.begin(), exp.end());
  std::sort(feature.begin(), feature.end());
  ASSERT_EQ(exp, feature);
}
Beispiel #2
0
/*
 * Program entry point.
 *
 * Parses the leading '-' options, loads the resolver configuration (unless
 * -noconf is given), determines the input/output codesets, initializes the
 * codeset converter, the normalizer and the ZLD, opens the input (the one
 * optional non-option argument, or stdin when absent) and hands it to
 * convert_file().
 *
 * Options recognized here:
 *   -in CODE         input codeset
 *   -out CODE        output codeset
 *   -conf FILE       configuration file passed to mdn_resconf_loadfile()
 *   -noconf          do not load any configuration
 *   -zld ZLD         ZLD, canonicalized through canonical_zld()
 *   -auto            sets auto_zld, forwarded to convert_file()
 *   -normalize NAME  add a normalizer (at most MAX_NORMALIZER of them)
 *   -alias ARG       encoding alias argument for initialize_converter()
 *   -flush           sets flush_every_line
 *   -whole           clears the 'selective' flag given to convert_file()
 *
 * Returns 1 when any set-up step fails; otherwise the value returned by
 * convert_file().
 */
int
main(int ac, char **av) {
	char *cmd = *av;
	char *normalizer[MAX_NORMALIZER];
	int nnormalizer = 0;
	const char *in_code = NULL;
	const char *out_code = NULL;
	char *resconf_file = NULL;
	int no_resconf = 0;
	char zld[256 + 1];
	int zld_specified = 0;
	int auto_zld = 0;
	char *encoding_alias = NULL;
	int selective = 1;
	FILE *fp;
	mdn_resconf_t resconf;

#ifdef HAVE_SETLOCALE
	(void)setlocale(LC_ALL, "");
#endif

	/* Empty ZLD until -zld or the configuration supplies one. */
	zld[0] = '\0';

	/* Skip the program name. */
	ac--;
	av++;
	while (ac > 0 && **av == '-') {

/*
 * Guard for options that take a value: requires at least one more argument
 * after the option itself.  NOTE(review): this relies on usage() not
 * returning -- confirm against its definition.
 */
#define MUST_HAVE_ARG if (ac < 2) usage(cmd)
		if (strcmp(*av, "-in") == 0) {
			MUST_HAVE_ARG;
			in_code = av[1];
			ac--;
			av++;
		} else if (strcmp(*av, "-out") == 0) {
			MUST_HAVE_ARG;
			out_code = av[1];
			ac--;
			av++;
		} else if (strcmp(*av, "-conf") == 0) {
			MUST_HAVE_ARG;
			resconf_file = av[1];
			ac--;
			av++;
		} else if (strcmp(*av, "-noconf") == 0) {
			no_resconf = 1;
		} else if (strcmp(*av, "-zld") == 0) {
			MUST_HAVE_ARG;
			canonical_zld(zld, av[1]);
			zld_specified = 1;
			ac--;
			av++;
		} else if (strcmp(*av, "-auto") == 0) {
			auto_zld = 1;
		} else if (strcmp(*av, "-normalize") == 0) {
			MUST_HAVE_ARG;
			if (nnormalizer >= MAX_NORMALIZER) {
				errormsg("too many normalizers\n");
				exit(1);
			}
			normalizer[nnormalizer++] = av[1];
			ac--;
			av++;
		} else if (strcmp(*av, "-alias") == 0) {
			MUST_HAVE_ARG;
			/*
			 * Take the option's value (av[1]), not the option
			 * name itself, and consume it like the other valued
			 * options so it is not re-parsed as the next option
			 * or mistaken for the input file name.
			 */
			encoding_alias = av[1];
			ac--;
			av++;
		} else if (strcmp(*av, "-flush") == 0) {
			flush_every_line = 1;
		} else if (strcmp(*av, "-whole") == 0) {
			selective = 0;
		} else {
			usage(cmd);
		}
#undef MUST_HAVE_ARG

		/* Consume the option itself. */
		ac--;
		av++;
	}

	/* At most one non-option argument (the input file) is accepted. */
	if (ac > 1)
		usage(cmd);

	/*
	 * Load configuration file.
	 */
	resconf = NULL;
	if (!no_resconf) {
		mdn_result_t r;

		/* Each step runs only if the previous one succeeded. */
		r = mdn_resconf_initialize();
		if (r == mdn_success)
			r = mdn_resconf_create(&resconf);
		if (r == mdn_success)
			r = mdn_resconf_loadfile(resconf, resconf_file);
		if (r != mdn_success) {
			errormsg("error reading configuration file: %s\n",
				 mdn_result_tostring(r));
			return (1);
		}
	}

	/*
	 * Get default input/output code.
	 */
	if (in_code == NULL)
		in_code = mdn_localencoding_name();

	if (out_code == NULL) {
		/* Fall back to the server converter from the configuration. */
		mdn_converter_t c;
		if (resconf != NULL &&
		    (c = mdn_resconf_serverconverter(resconf)) != NULL)
			out_code = mdn_converter_localencoding(c);
	}

	if (in_code == NULL) {
		errormsg("input codeset must be specified\n");
		return (1);
	}
	if (out_code == NULL) {
		errormsg("output codeset must be specified\n");
		return (1);
	}

	/*
	 * Initialize codeset converter.
	 */
	if (!initialize_converter(in_code, out_code, encoding_alias))
		return (1);

	/*
	 * Initialize normalizer.  When no -normalize option was given, the
	 * normalizer from the configuration is used if there is one;
	 * otherwise one is built from the command-line list.
	 */
	if (nnormalizer == 0 && resconf != NULL)
		norm_ctx = mdn_resconf_normalizer(resconf);
	if (norm_ctx == NULL &&
	    !initialize_normalizer(normalizer, nnormalizer))
		return (1);

	/*
	 * Default ZLD: taken from the configuration unless -zld was given.
	 */
	if (!zld_specified && resconf != NULL) {
		const char *conf_zld = mdn_resconf_zld(resconf);
		if (conf_zld != NULL)
			canonical_zld(zld, conf_zld);
	}

	/*
	 * Open input file.
	 */
	if (ac > 0) {
		if ((fp = fopen(av[0], "r")) == NULL) {
			errormsg("cannot open file %s: %s\n",
				 av[0], strerror(errno));
			return (1);
		}
	} else {
		fp = stdin;
	}

	/*
	 * Do the conversion.
	 */
	return convert_file(fp, zld, auto_zld, selective);
}