Esempio n. 1
0
/**
 * Entry point of the grammar relabelling tool.
 *
 * Expects exactly three arguments:
 *   argv[1]  grammar to relabel (loaded with require_binary = true)
 *   argv[2]  grammar holding the relabelling rules
 *   argv[3]  path the relabelled binary grammar is written to
 *
 * @return the ICU status value on success, 1 if either grammar could not be
 *         loaded or the output file could not be opened.
 */
int main(int argc, char *argv[]) {
	UFILE *ux_stdout = 0;
	UFILE *ux_stderr = 0;
	UErrorCode status = U_ZERO_ERROR;

	if (argc != 4) {
		// NOTE(review): presumably prints usage and terminates -- confirm.
		endProgram(argv[0]);
	}

	/* Initialize ICU */
	u_init(&status);
	// U_FILE_ACCESS_ERROR is deliberately tolerated; any other failure is fatal.
	if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) {
		std::cerr << "Error: Cannot initialize ICU. Status = " << u_errorName(status) << std::endl;
		CG3Quit(1);
	}
	// Clear a tolerated failure so that later ICU calls are not short-circuited by it.
	status = U_ZERO_ERROR;

	ucnv_setDefaultName("UTF-8");
	const char *codepage_default = ucnv_getDefaultName();
	uloc_setDefault("en_US_POSIX", &status);
	const char *locale_default = uloc_getDefault();

	ux_stdout = u_finit(stdout, locale_default, codepage_default);
	ux_stderr = u_finit(stderr, locale_default, codepage_default);

	CG3::Grammar *grammar = cg3_grammar_load(argv[1], ux_stdout, ux_stderr, true);
	CG3::Grammar *relabel_grammar = cg3_grammar_load(argv[2], ux_stdout, ux_stderr);

	bool ok = false;

	// cg3_grammar_load() returns 0 on failure (and has already reported why),
	// so neither pointer may be dereferenced without this check.
	if (grammar && relabel_grammar) {
		CG3::Relabeller relabeller(*grammar, *relabel_grammar, ux_stderr);
		relabeller.relabel();

		FILE *gout = fopen(argv[3], "wb");
		if (gout) {
			CG3::BinaryGrammar writer(*grammar, ux_stderr);
			writer.writeBinaryGrammar(gout);
			// NOTE(review): assumes writeBinaryGrammar() does not close the
			// stream itself -- confirm against its implementation.
			fclose(gout);
			ok = true;
		}
		else {
			std::cerr << "Could not write grammar to " << argv[3] << std::endl;
		}
	}

	// delete on a null pointer is a no-op, so this is safe after a failed load.
	delete relabel_grammar;
	relabel_grammar = 0;
	delete grammar;
	grammar = 0;

	u_fclose(ux_stderr);
	u_fclose(ux_stdout);

	u_cleanup();

	return ok ? static_cast<int>(status) : 1;
}
Esempio n. 2
0
/**
 * Loads a grammar from disk; like libcg3's loader, but with a non-void grammar.
 *
 * The first four bytes of the file are inspected: the magic "CG3B" selects the
 * binary parser, anything else selects the textual parser.
 *
 * @param filename        path of the grammar file
 * @param ux_stdout       ICU stream stored on the grammar as its stdout
 * @param ux_stderr       ICU stream used for error messages and stored on the grammar
 * @param require_binary  when true, a non-binary grammar is reported as an
 *                        error and CG3Quit(1) is called
 * @return a newly allocated, reindexed grammar owned by the caller, or 0 if
 *         the file could not be opened, read, or parsed.
 */
CG3::Grammar *cg3_grammar_load(const char *filename, UFILE *ux_stdout, UFILE *ux_stderr, bool require_binary = false) {
	using namespace CG3;
	std::ifstream input(filename, std::ios::binary);
	if (!input) {
		u_fprintf(ux_stderr, "Error: Error opening %s for reading!\n", filename);
		return 0;
	}
	// Only the 4-byte header is needed here; the parser reopens the file itself.
	if (!input.read(&cbuffers[0][0], 4)) {
		u_fprintf(ux_stderr, "Error: Error reading first 4 bytes from grammar!\n");
		return 0;
	}
	input.close();

	Grammar *grammar = new Grammar;
	grammar->ux_stderr = ux_stderr;
	grammar->ux_stdout = ux_stdout;

	boost::scoped_ptr<IGrammarParser> parser;

	if (cbuffers[0][0] == 'C' && cbuffers[0][1] == 'G' && cbuffers[0][2] == '3' && cbuffers[0][3] == 'B') {
		parser.reset(new BinaryGrammar(*grammar, ux_stderr));
	}
	else {
		if (require_binary) {
			u_fprintf(ux_stderr, "Error: Text grammar detected -- to compile this grammar, use `cg-comp'\n");
			CG3Quit(1);
		}
		parser.reset(new TextualParser(*grammar, ux_stderr));
	}
	// A non-zero result from the parser is treated as failure.
	if (parser->parse_grammar_from_file(filename, uloc_getDefault(), ucnv_getDefaultName())) {
		u_fprintf(ux_stderr, "Error: Grammar could not be parsed!\n");
		// Destroy the parser first (it was constructed with a reference to the
		// grammar), then release the grammar so a failed load does not leak it.
		parser.reset();
		delete grammar;
		return 0;
	}

	grammar->reindex();

	return grammar;
}
Esempio n. 3
0
/**
 * Entry point of cg-conv: reads a stream from stdin in one of several formats
 * and writes it to stdout in the requested output format.
 *
 * The input format is taken from the command-line options or, when none is
 * given (or auto-detection is requested), guessed from the first 1000 code
 * units of input. The output format defaults to CG.
 *
 * @return U_ILLEGAL_ARGUMENT_ERROR when option parsing fails, U_ZERO_ERROR
 *         after printing help, otherwise falls off the end of main (0).
 */
int main(int argc, char *argv[]) {
	UErrorCode status = U_ZERO_ERROR;
	UFILE *ux_stdin = 0;
	UFILE *ux_stdout = 0;
	UFILE *ux_stderr = 0;

	/* Initialize ICU */
	u_init(&status);
	// U_FILE_ACCESS_ERROR is deliberately tolerated; any other failure is fatal.
	if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) {
		std::cerr << "Error: Cannot initialize ICU. Status = " << u_errorName(status) << std::endl;
		CG3Quit(1);
	}
	// Clear a tolerated failure: ICU functions return without doing anything
	// when handed a failing status, which would otherwise disable the locale
	// setup, every regex in the format detection and the converters below.
	status = U_ZERO_ERROR;

	U_MAIN_INIT_ARGS(argc, argv);
	argc = u_parseArgs(argc, argv, NUM_OPTIONS, options);

	if (argc < 0 || options[HELP1].doesOccur || options[HELP2].doesOccur) {
		// Usage goes to stderr on a parse error, to stdout when help was requested.
		FILE *out = (argc < 0) ? stderr : stdout;
		fprintf(out, "Usage: cg-conv [OPTIONS]\n");
		fprintf(out, "\n");
		fprintf(out, "Options:\n");

		// Width of the longest long option name, used to align the descriptions.
		size_t longest = 0;
		for (uint32_t i = 0; i < NUM_OPTIONS; i++) {
			if (options[i].description) {
				size_t len = strlen(options[i].longName);
				longest = std::max(longest, len);
			}
		}
		for (uint32_t i = 0; i < NUM_OPTIONS; i++) {
			// Options whose description starts with '!' are not listed.
			if (options[i].description && options[i].description[0] != '!') {
				fprintf(out, " ");
				if (options[i].shortName) {
					fprintf(out, "-%c,", options[i].shortName);
				}
				else {
					fprintf(out, "   ");
				}
				fprintf(out, " --%s", options[i].longName);
				size_t ldiff = longest - strlen(options[i].longName);
				while (ldiff--) {
					fprintf(out, " ");
				}
				fprintf(out, "  %s", options[i].description);
				fprintf(out, "\n");
			}
		}

		return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	}

	// The *_CG2 options are handled as aliases of the corresponding *_CG options.
	if (options[IN_CG2].doesOccur) {
		options[IN_CG].doesOccur = true;
	}
	if (options[OUT_CG2].doesOccur) {
		options[OUT_CG].doesOccur = true;
	}

	ucnv_setDefaultName("UTF-8");
	const char *codepage_default = ucnv_getDefaultName();
	uloc_setDefault("en_US_POSIX", &status);
	const char *locale_default = uloc_getDefault();

	ux_stdin = u_finit(stdin, locale_default, codepage_default);
	ux_stdout = u_finit(stdout, locale_default, codepage_default);
	ux_stderr = u_finit(stderr, locale_default, codepage_default);

	// Minimal in-memory grammar: a dummy set plus a delimiter set holding one tag.
	CG3::Grammar grammar;

	grammar.ux_stderr = ux_stderr;
	grammar.allocateDummySet();
	grammar.delimiters = grammar.allocateSet();
	grammar.addTagToSet(grammar.allocateTag(CG3::stringbits[0].getTerminatedBuffer()), grammar.delimiters);
	grammar.reindex();

	CG3::FormatConverter applicator(ux_stderr);
	applicator.setGrammar(&grammar);

	boost::scoped_ptr<CG3::istream> instream;

	CG3::CG_FORMATS fmt = CG3::FMT_INVALID;

	// Explicit input format; the first matching option wins.
	if (options[IN_CG].doesOccur) {
		fmt = CG3::FMT_CG;
	}
	else if (options[IN_NICELINE].doesOccur) {
		fmt = CG3::FMT_NICELINE;
	}
	else if (options[IN_APERTIUM].doesOccur) {
		fmt = CG3::FMT_APERTIUM;
	}
	else if (options[IN_FST].doesOccur) {
		fmt = CG3::FMT_FST;
	}
	else if (options[IN_PLAIN].doesOccur) {
		fmt = CG3::FMT_PLAIN;
	}

	if (options[IN_AUTO].doesOccur || fmt == CG3::FMT_INVALID) {
		// Sniff the format from the first chunk of input. IN_AUTO overrides
		// any explicitly requested input format.
		CG3::UString buffer;
		buffer.resize(1000);
		int32_t nr = u_file_read(&buffer[0], buffer.size(), ux_stdin);
		buffer.resize(nr);
		URegularExpression *rx = 0;

		// Single-pass loop used only so that `break` can leave the cascade;
		// whichever regex is still open on exit is closed after the loop.
		for (;;) {
			// CG
			rx = uregex_openC("^\"<[^>]+>\".*?^\\s+\"[^\"]+\"", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status);
			uregex_setText(rx, buffer.c_str(), buffer.size(), &status);
			if (uregex_find(rx, -1, &status)) {
				fmt = CG3::FMT_CG;
				break;
			}
			uregex_close(rx);

			// Niceline, first pattern
			rx = uregex_openC("^\\S+ *\t *\\[\\S+\\]", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status);
			uregex_setText(rx, buffer.c_str(), buffer.size(), &status);
			if (uregex_find(rx, -1, &status)) {
				fmt = CG3::FMT_NICELINE;
				break;
			}
			uregex_close(rx);

			// Niceline, second pattern
			rx = uregex_openC("^\\S+ *\t *\"\\S+\"", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status);
			uregex_setText(rx, buffer.c_str(), buffer.size(), &status);
			if (uregex_find(rx, -1, &status)) {
				fmt = CG3::FMT_NICELINE;
				break;
			}
			uregex_close(rx);

			// Apertium
			rx = uregex_openC("\\^[^/]+(/[^<]+(<[^>]+>)+)+\\$", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status);
			uregex_setText(rx, buffer.c_str(), buffer.size(), &status);
			if (uregex_find(rx, -1, &status)) {
				fmt = CG3::FMT_APERTIUM;
				break;
			}
			uregex_close(rx);

			// FST
			rx = uregex_openC("^\\S+\t\\S+(\\+\\S+)+$", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status);
			uregex_setText(rx, buffer.c_str(), buffer.size(), &status);
			if (uregex_find(rx, -1, &status)) {
				fmt = CG3::FMT_FST;
				break;
			}

			// Nothing matched: fall back to plain text.
			fmt = CG3::FMT_PLAIN;
			break;
		}
		uregex_close(rx);

		// The sniffed chunk is handed to the stream along with ux_stdin.
		instream.reset(new CG3::istream_buffer(ux_stdin, buffer));
	}
	else {
		instream.reset(new CG3::istream(ux_stdin));
	}

	applicator.setInputFormat(fmt);

	if (options[SUB_LTR].doesOccur) {
		grammar.sub_readings_ltr = true;
	}
	if (options[MAPPING_PREFIX].doesOccur) {
		// Convert the option value to UTF-16 and use its first code unit.
		size_t sn = strlen(options[MAPPING_PREFIX].value);
		CG3::UString buf(sn * 3, 0);
		UConverter *conv = ucnv_open(codepage_default, &status);
		ucnv_toUChars(conv, &buf[0], buf.size(), options[MAPPING_PREFIX].value, sn, &status);
		ucnv_close(conv);
		grammar.mapping_prefix = buf[0];
	}
	if (options[SUB_DELIMITER].doesOccur) {
		// Convert into an oversized buffer, trim to the converted length,
		// then append '+'.
		size_t sn = strlen(options[SUB_DELIMITER].value);
		applicator.sub_delims.resize(sn * 2);
		UConverter *conv = ucnv_open(codepage_default, &status);
		sn = ucnv_toUChars(conv, &applicator.sub_delims[0], applicator.sub_delims.size(), options[SUB_DELIMITER].value, sn, &status);
		applicator.sub_delims.resize(sn);
		applicator.sub_delims += '+';
		ucnv_close(conv);
	}
	if (options[FST_WTAG].doesOccur) {
		size_t sn = strlen(options[FST_WTAG].value);
		applicator.wtag.resize(sn * 2);
		UConverter *conv = ucnv_open(codepage_default, &status);
		sn = ucnv_toUChars(conv, &applicator.wtag[0], applicator.wtag.size(), options[FST_WTAG].value, sn, &status);
		applicator.wtag.resize(sn);
		ucnv_close(conv);
	}
	if (options[FST_WFACTOR].doesOccur) {
		applicator.wfactor = strtof(options[FST_WFACTOR].value, 0);
	}

	// CG is the default output format unless one of the options below is given.
	applicator.setOutputFormat(CG3::FMT_CG);

	if (options[OUT_APERTIUM].doesOccur) {
		applicator.setOutputFormat(CG3::FMT_APERTIUM);
	}
	else if (options[OUT_NICELINE].doesOccur) {
		applicator.setOutputFormat(CG3::FMT_NICELINE);
	}
	else if (options[OUT_PLAIN].doesOccur) {
		applicator.setOutputFormat(CG3::FMT_PLAIN);
	}

	applicator.is_conv = true;
	applicator.verbosity_level = 0;
	applicator.runGrammarOnText(*instream.get(), ux_stdout);

	u_fclose(ux_stdout);
	u_fclose(ux_stderr);

	u_cleanup();
}
Esempio n. 4
0
/**
 * Entry point of cg-mwesplit: runs a MweSplitApplicator over stdin and writes
 * the result to stdout, using a minimal in-memory grammar.
 *
 * @return U_ILLEGAL_ARGUMENT_ERROR when option parsing fails, U_ZERO_ERROR
 *         after printing help, otherwise falls off the end of main (0).
 */
int main(int argc, char **argv) {
	UErrorCode status = U_ZERO_ERROR;
	UFILE *ux_stdin = 0;
	UFILE *ux_stdout = 0;
	UFILE *ux_stderr = 0;

	/* Initialize ICU */
	u_init(&status);
	// U_FILE_ACCESS_ERROR is deliberately tolerated; any other failure is fatal.
	if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) {
		std::cerr << "Error: Cannot initialize ICU. Status = " << u_errorName(status) << std::endl;
		CG3Quit(1);
	}
	// Clear a tolerated failure: ICU functions return without doing anything
	// when handed a failing status, so uloc_setDefault() below would be skipped.
	status = U_ZERO_ERROR;

	U_MAIN_INIT_ARGS(argc, argv);
	argc = u_parseArgs(argc, argv, NUM_OPTIONS, options);

	if (argc < 0 || options[HELP1].doesOccur || options[HELP2].doesOccur) {
		// Usage goes to stderr on a parse error, to stdout when help was requested.
		FILE *out = (argc < 0) ? stderr : stdout;
		fprintf(out, "Usage: cg-mwesplit [OPTIONS]\n");
		fprintf(out, "\n");
		fprintf(out, "Options:\n");

		// Width of the longest long option name, used to align the descriptions.
		size_t longest = 0;
		for (uint32_t i = 0; i < NUM_OPTIONS; i++) {
			if (options[i].description) {
				size_t len = strlen(options[i].longName);
				longest = std::max(longest, len);
			}
		}
		for (uint32_t i = 0; i < NUM_OPTIONS; i++) {
			// Options whose description starts with '!' are not listed.
			if (options[i].description && options[i].description[0] != '!') {
				fprintf(out, " ");
				if (options[i].shortName) {
					fprintf(out, "-%c,", options[i].shortName);
				}
				else {
					fprintf(out, "   ");
				}
				fprintf(out, " --%s", options[i].longName);
				size_t ldiff = longest - strlen(options[i].longName);
				while (ldiff--) {
					fprintf(out, " ");
				}
				fprintf(out, "  %s", options[i].description);
				fprintf(out, "\n");
			}
		}

		return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	}

	ucnv_setDefaultName("UTF-8");
	const char *codepage_default = ucnv_getDefaultName();
	uloc_setDefault("en_US_POSIX", &status);
	const char *locale_default = uloc_getDefault();

	ux_stdin = u_finit(stdin, locale_default, codepage_default);
	ux_stdout = u_finit(stdout, locale_default, codepage_default);
	ux_stderr = u_finit(stderr, locale_default, codepage_default);

	// Minimal in-memory grammar: a dummy set plus a delimiter set holding one tag.
	CG3::Grammar grammar;

	grammar.ux_stderr = ux_stderr;
	grammar.allocateDummySet();
	grammar.delimiters = grammar.allocateSet();
	grammar.addTagToSet(grammar.allocateTag(CG3::stringbits[0].getTerminatedBuffer()), grammar.delimiters);
	grammar.reindex();

	CG3::MweSplitApplicator applicator(ux_stderr);
	applicator.setGrammar(&grammar);

	boost::scoped_ptr<CG3::istream> instream;

	instream.reset(new CG3::istream(ux_stdin));

	applicator.is_conv = true;
	applicator.verbosity_level = 0;
	applicator.runGrammarOnText(*instream.get(), ux_stdout);

	u_fclose(ux_stdout);
	u_fclose(ux_stderr);

	u_cleanup();
}