/**
 * Constructs the applicator together with a minimal grammar of its own.
 *
 * The grammar receives a dummy set and a delimiter set containing the single
 * tag built from CG3::stringbits[0]; it is reindexed and then installed via
 * setGrammar().
 *
 * @param ux_err Stream forwarded to the GrammarApplicator base class.
 */
MweSplitApplicator::MweSplitApplicator(std::ostream& ux_err)
  : GrammarApplicator(ux_err)
{
	CG3::Grammar* const stub = new CG3::Grammar;

	// Let the grammar report through the same error stream as the applicator.
	stub->ux_stderr = ux_stderr;

	// Populate the only sets this applicator sets up: the dummy set and the delimiters.
	stub->allocateDummySet();
	stub->delimiters = stub->allocateSet();
	stub->addTagToSet(stub->allocateTag(CG3::stringbits[0]), stub->delimiters);
	stub->reindex();

	setGrammar(stub);

	// NOTE(review): owns_grammar presumably makes the base class free the
	// heap-allocated grammar on destruction - confirm in GrammarApplicator.
	is_conv = true;
	owns_grammar = true;
}
Beispiel #2
0
/**
 * Entry point of cg-conv: reads a token stream from stdin in one of several
 * formats (CG, Niceline, Apertium, FST, plain text) and writes it to stdout
 * in the requested output format (CG unless overridden).
 *
 * When no input format is given on the command line, or auto-detection is
 * requested, the first 1000 UChars of input are probed with a series of
 * regular expressions to guess the format.
 *
 * @param argc Number of command line arguments.
 * @param argv Command line arguments, parsed by u_parseArgs() against `options`.
 * @return U_ILLEGAL_ARGUMENT_ERROR if argument parsing failed, U_ZERO_ERROR
 *         after printing help, otherwise the implicit 0 of main().
 */
int main(int argc, char *argv[]) {
	UErrorCode status = U_ZERO_ERROR;
	UFILE *ux_stdin = 0;
	UFILE *ux_stdout = 0;
	UFILE *ux_stderr = 0;

	/* Initialize ICU */
	u_init(&status);
	if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) {
		std::cerr << "Error: Cannot initialize ICU. Status = " << u_errorName(status) << std::endl;
		CG3Quit(1);
	}
	// U_FILE_ACCESS_ERROR is tolerated above, but ICU functions return
	// immediately when handed a failing status. Without this reset every
	// later call taking &status (locale, regex, converter) would silently
	// do nothing.
	status = U_ZERO_ERROR;

	U_MAIN_INIT_ARGS(argc, argv);
	argc = u_parseArgs(argc, argv, NUM_OPTIONS, options);

	if (argc < 0 || options[HELP1].doesOccur || options[HELP2].doesOccur) {
		// Usage goes to stderr when it is shown because of a parse error.
		FILE *out = (argc < 0) ? stderr : stdout;
		fprintf(out, "Usage: cg-conv [OPTIONS]\n");
		fprintf(out, "\n");
		fprintf(out, "Options:\n");

		// Width of the longest long option name, used to align descriptions.
		size_t longest = 0;
		for (uint32_t i = 0; i < NUM_OPTIONS; i++) {
			if (options[i].description) {
				size_t len = strlen(options[i].longName);
				longest = std::max(longest, len);
			}
		}
		for (uint32_t i = 0; i < NUM_OPTIONS; i++) {
			// Options whose description starts with '!' are not listed.
			if (options[i].description && options[i].description[0] != '!') {
				fprintf(out, " ");
				if (options[i].shortName) {
					fprintf(out, "-%c,", options[i].shortName);
				}
				else {
					fprintf(out, "   ");
				}
				fprintf(out, " --%s", options[i].longName);
				size_t ldiff = longest - strlen(options[i].longName);
				while (ldiff--) {
					fprintf(out, " ");
				}
				fprintf(out, "  %s", options[i].description);
				fprintf(out, "\n");
			}
		}

		return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	}

	// The CG2 switches are treated exactly like their CG counterparts.
	if (options[IN_CG2].doesOccur) {
		options[IN_CG].doesOccur = true;
	}
	if (options[OUT_CG2].doesOccur) {
		options[OUT_CG].doesOccur = true;
	}

	ucnv_setDefaultName("UTF-8");
	const char *codepage_default = ucnv_getDefaultName();
	uloc_setDefault("en_US_POSIX", &status);
	const char *locale_default = uloc_getDefault();

	ux_stdin = u_finit(stdin, locale_default, codepage_default);
	ux_stdout = u_finit(stdout, locale_default, codepage_default);
	ux_stderr = u_finit(stderr, locale_default, codepage_default);

	// Minimal grammar: a dummy set plus a delimiter set with a single tag.
	CG3::Grammar grammar;

	grammar.ux_stderr = ux_stderr;
	grammar.allocateDummySet();
	grammar.delimiters = grammar.allocateSet();
	grammar.addTagToSet(grammar.allocateTag(CG3::stringbits[0].getTerminatedBuffer()), grammar.delimiters);
	grammar.reindex();

	CG3::FormatConverter applicator(ux_stderr);
	applicator.setGrammar(&grammar);

	boost::scoped_ptr<CG3::istream> instream;

	// Explicit input format from the command line; first match wins.
	CG3::CG_FORMATS fmt = CG3::FMT_INVALID;

	if (options[IN_CG].doesOccur) {
		fmt = CG3::FMT_CG;
	}
	else if (options[IN_NICELINE].doesOccur) {
		fmt = CG3::FMT_NICELINE;
	}
	else if (options[IN_APERTIUM].doesOccur) {
		fmt = CG3::FMT_APERTIUM;
	}
	else if (options[IN_FST].doesOccur) {
		fmt = CG3::FMT_FST;
	}
	else if (options[IN_PLAIN].doesOccur) {
		fmt = CG3::FMT_PLAIN;
	}

	if (options[IN_AUTO].doesOccur || fmt == CG3::FMT_INVALID) {
		// Sample the start of the input and guess the format from it.
		CG3::UString buffer;
		buffer.resize(1000);
		int32_t nr = u_file_read(&buffer[0], buffer.size(), ux_stdin);
		buffer.resize(nr);

		// Probes are tried in order; the first pattern found in the sample
		// decides the format. Two different patterns map to Niceline.
		static const struct {
			const char *pattern;
			CG3::CG_FORMATS format;
		} probes[] = {
			{ "^\"<[^>]+>\".*?^\\s+\"[^\"]+\"", CG3::FMT_CG },
			{ "^\\S+ *\t *\\[\\S+\\]", CG3::FMT_NICELINE },
			{ "^\\S+ *\t *\"\\S+\"", CG3::FMT_NICELINE },
			{ "\\^[^/]+(/[^<]+(<[^>]+>)+)+\\$", CG3::FMT_APERTIUM },
			{ "^\\S+\t\\S+(\\+\\S+)+$", CG3::FMT_FST },
		};

		// Nothing matched means plain text.
		fmt = CG3::FMT_PLAIN;
		for (size_t i = 0; i < sizeof(probes) / sizeof(probes[0]); ++i) {
			URegularExpression *rx = uregex_openC(probes[i].pattern, UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status);
			uregex_setText(rx, buffer.c_str(), buffer.size(), &status);
			const bool found = uregex_find(rx, -1, &status) != 0;
			uregex_close(rx);
			if (found) {
				fmt = probes[i].format;
				break;
			}
		}

		// The sampled text is handed over together with the stream.
		// NOTE(review): presumably istream_buffer serves `buffer` first and
		// then continues reading from ux_stdin - confirm in CG3::istream_buffer.
		instream.reset(new CG3::istream_buffer(ux_stdin, buffer));
	}
	else {
		instream.reset(new CG3::istream(ux_stdin));
	}

	applicator.setInputFormat(fmt);

	if (options[SUB_LTR].doesOccur) {
		grammar.sub_readings_ltr = true;
	}
	if (options[MAPPING_PREFIX].doesOccur) {
		// Convert the option value to UChars; only the first one is used.
		size_t sn = strlen(options[MAPPING_PREFIX].value);
		CG3::UString buf(sn * 3, 0);
		UConverter *conv = ucnv_open(codepage_default, &status);
		ucnv_toUChars(conv, &buf[0], buf.size(), options[MAPPING_PREFIX].value, sn, &status);
		ucnv_close(conv);
		grammar.mapping_prefix = buf[0];
	}
	if (options[SUB_DELIMITER].doesOccur) {
		// Convert the option value, trim to the converted length, then append '+'.
		size_t sn = strlen(options[SUB_DELIMITER].value);
		applicator.sub_delims.resize(sn * 2);
		UConverter *conv = ucnv_open(codepage_default, &status);
		sn = ucnv_toUChars(conv, &applicator.sub_delims[0], applicator.sub_delims.size(), options[SUB_DELIMITER].value, sn, &status);
		applicator.sub_delims.resize(sn);
		applicator.sub_delims += '+';
		ucnv_close(conv);
	}
	if (options[FST_WTAG].doesOccur) {
		// Convert the option value and trim to the converted length.
		size_t sn = strlen(options[FST_WTAG].value);
		applicator.wtag.resize(sn * 2);
		UConverter *conv = ucnv_open(codepage_default, &status);
		sn = ucnv_toUChars(conv, &applicator.wtag[0], applicator.wtag.size(), options[FST_WTAG].value, sn, &status);
		applicator.wtag.resize(sn);
		ucnv_close(conv);
	}
	if (options[FST_WFACTOR].doesOccur) {
		applicator.wfactor = strtof(options[FST_WFACTOR].value, 0);
	}

	// CG is the default output format unless another one is requested.
	applicator.setOutputFormat(CG3::FMT_CG);

	if (options[OUT_APERTIUM].doesOccur) {
		applicator.setOutputFormat(CG3::FMT_APERTIUM);
	}
	else if (options[OUT_NICELINE].doesOccur) {
		applicator.setOutputFormat(CG3::FMT_NICELINE);
	}
	else if (options[OUT_PLAIN].doesOccur) {
		applicator.setOutputFormat(CG3::FMT_PLAIN);
	}

	applicator.is_conv = true;
	applicator.verbosity_level = 0;
	applicator.runGrammarOnText(*instream.get(), ux_stdout);

	u_fclose(ux_stdout);
	u_fclose(ux_stderr);

	u_cleanup();
}
Beispiel #3
0
/**
 * Entry point of cg-mwesplit: reads a token stream from stdin, runs it
 * through a CG3::MweSplitApplicator backed by a minimal grammar, and writes
 * the result to stdout.
 *
 * @param argc Number of command line arguments.
 * @param argv Command line arguments, parsed by u_parseArgs() against `options`.
 * @return U_ILLEGAL_ARGUMENT_ERROR if argument parsing failed, U_ZERO_ERROR
 *         after printing help, otherwise the implicit 0 of main().
 */
int main(int argc, char **argv) {
	UErrorCode status = U_ZERO_ERROR;
	UFILE *ux_stdin = 0;
	UFILE *ux_stdout = 0;
	UFILE *ux_stderr = 0;

	/* Initialize ICU */
	u_init(&status);
	if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) {
		std::cerr << "Error: Cannot initialize ICU. Status = " << u_errorName(status) << std::endl;
		CG3Quit(1);
	}
	// U_FILE_ACCESS_ERROR is tolerated above, but ICU functions return
	// immediately when handed a failing status. Without this reset the
	// uloc_setDefault() call below would silently do nothing.
	status = U_ZERO_ERROR;

	U_MAIN_INIT_ARGS(argc, argv);
	argc = u_parseArgs(argc, argv, NUM_OPTIONS, options);

	if (argc < 0 || options[HELP1].doesOccur || options[HELP2].doesOccur) {
		// Usage goes to stderr when it is shown because of a parse error.
		FILE *out = (argc < 0) ? stderr : stdout;
		fprintf(out, "Usage: cg-mwesplit [OPTIONS]\n");
		fprintf(out, "\n");
		fprintf(out, "Options:\n");

		// Width of the longest long option name, used to align descriptions.
		size_t longest = 0;
		for (uint32_t i = 0; i < NUM_OPTIONS; i++) {
			if (options[i].description) {
				size_t len = strlen(options[i].longName);
				longest = std::max(longest, len);
			}
		}
		for (uint32_t i = 0; i < NUM_OPTIONS; i++) {
			// Options whose description starts with '!' are not listed.
			if (options[i].description && options[i].description[0] != '!') {
				fprintf(out, " ");
				if (options[i].shortName) {
					fprintf(out, "-%c,", options[i].shortName);
				}
				else {
					fprintf(out, "   ");
				}
				fprintf(out, " --%s", options[i].longName);
				size_t ldiff = longest - strlen(options[i].longName);
				while (ldiff--) {
					fprintf(out, " ");
				}
				fprintf(out, "  %s", options[i].description);
				fprintf(out, "\n");
			}
		}

		return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	}

	ucnv_setDefaultName("UTF-8");
	const char *codepage_default = ucnv_getDefaultName();
	uloc_setDefault("en_US_POSIX", &status);
	const char *locale_default = uloc_getDefault();

	ux_stdin = u_finit(stdin, locale_default, codepage_default);
	ux_stdout = u_finit(stdout, locale_default, codepage_default);
	ux_stderr = u_finit(stderr, locale_default, codepage_default);

	// Minimal grammar: a dummy set plus a delimiter set with a single tag.
	CG3::Grammar grammar;

	grammar.ux_stderr = ux_stderr;
	grammar.allocateDummySet();
	grammar.delimiters = grammar.allocateSet();
	grammar.addTagToSet(grammar.allocateTag(CG3::stringbits[0].getTerminatedBuffer()), grammar.delimiters);
	grammar.reindex();

	CG3::MweSplitApplicator applicator(ux_stderr);
	applicator.setGrammar(&grammar);

	boost::scoped_ptr<CG3::istream> instream;

	instream.reset(new CG3::istream(ux_stdin));

	applicator.is_conv = true;
	applicator.verbosity_level = 0;
	applicator.runGrammarOnText(*instream.get(), ux_stdout);

	u_fclose(ux_stdout);
	u_fclose(ux_stderr);

	u_cleanup();
}