int main(int argc, char *argv[]) { UFILE *ux_stdout = 0; UFILE *ux_stderr = 0; UErrorCode status = U_ZERO_ERROR; if (argc != 4) { endProgram(argv[0]); } /* Initialize ICU */ u_init(&status); if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) { std::cerr << "Error: Cannot initialize ICU. Status = " << u_errorName(status) << std::endl; CG3Quit(1); } status = U_ZERO_ERROR; ucnv_setDefaultName("UTF-8"); const char *codepage_default = ucnv_getDefaultName(); uloc_setDefault("en_US_POSIX", &status); const char *locale_default = uloc_getDefault(); ux_stdout = u_finit(stdout, locale_default, codepage_default); ux_stderr = u_finit(stderr, locale_default, codepage_default); CG3::Grammar *grammar = cg3_grammar_load(argv[1], ux_stdout, ux_stderr, true); CG3::Grammar *relabel_grammar = cg3_grammar_load(argv[2], ux_stdout, ux_stderr); CG3::Relabeller relabeller(*grammar, *relabel_grammar, ux_stderr); relabeller.relabel(); FILE *gout = fopen(argv[3], "wb"); if (gout) { CG3::BinaryGrammar writer(*grammar, ux_stderr); writer.writeBinaryGrammar(gout); } else { std::cerr << "Could not write grammar to " << argv[3] << std::endl; } delete relabel_grammar; relabel_grammar = 0; delete grammar; grammar = 0; u_fclose(ux_stderr); u_fclose(ux_stdout); u_cleanup(); return status; }
// like libcg3's, but with a non-void grammar … CG3::Grammar *cg3_grammar_load(const char *filename, UFILE *ux_stdout, UFILE *ux_stderr, bool require_binary = false) { using namespace CG3; std::ifstream input(filename, std::ios::binary); if (!input) { u_fprintf(ux_stderr, "Error: Error opening %s for reading!\n", filename); return 0; } if (!input.read(&cbuffers[0][0], 4)) { u_fprintf(ux_stderr, "Error: Error reading first 4 bytes from grammar!\n"); return 0; } input.close(); Grammar *grammar = new Grammar; grammar->ux_stderr = ux_stderr; grammar->ux_stdout = ux_stdout; boost::scoped_ptr<IGrammarParser> parser; if (cbuffers[0][0] == 'C' && cbuffers[0][1] == 'G' && cbuffers[0][2] == '3' && cbuffers[0][3] == 'B') { parser.reset(new BinaryGrammar(*grammar, ux_stderr)); } else { if (require_binary) { u_fprintf(ux_stderr, "Error: Text grammar detected -- to compile this grammar, use `cg-comp'\n"); CG3Quit(1); } parser.reset(new TextualParser(*grammar, ux_stderr)); } if (parser->parse_grammar_from_file(filename, uloc_getDefault(), ucnv_getDefaultName())) { u_fprintf(ux_stderr, "Error: Grammar could not be parsed!\n"); return 0; } grammar->reindex(); return grammar; }
int main(int argc, char *argv[]) { UErrorCode status = U_ZERO_ERROR; UFILE *ux_stdin = 0; UFILE *ux_stdout = 0; UFILE *ux_stderr = 0; /* Initialize ICU */ u_init(&status); if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) { std::cerr << "Error: Cannot initialize ICU. Status = " << u_errorName(status) << std::endl; CG3Quit(1); } U_MAIN_INIT_ARGS(argc, argv); argc = u_parseArgs(argc, argv, NUM_OPTIONS, options); if (argc < 0 || options[HELP1].doesOccur || options[HELP2].doesOccur) { FILE *out = (argc < 0) ? stderr : stdout; fprintf(out, "Usage: cg-conv [OPTIONS]\n"); fprintf(out, "\n"); fprintf(out, "Options:\n"); size_t longest = 0; for (uint32_t i = 0; i < NUM_OPTIONS; i++) { if (options[i].description) { size_t len = strlen(options[i].longName); longest = std::max(longest, len); } } for (uint32_t i = 0; i < NUM_OPTIONS; i++) { if (options[i].description && options[i].description[0] != '!') { fprintf(out, " "); if (options[i].shortName) { fprintf(out, "-%c,", options[i].shortName); } else { fprintf(out, " "); } fprintf(out, " --%s", options[i].longName); size_t ldiff = longest - strlen(options[i].longName); while (ldiff--) { fprintf(out, " "); } fprintf(out, " %s", options[i].description); fprintf(out, "\n"); } } return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } if (options[IN_CG2].doesOccur) { options[IN_CG].doesOccur = true; } if (options[OUT_CG2].doesOccur) { options[OUT_CG].doesOccur = true; } ucnv_setDefaultName("UTF-8"); const char *codepage_default = ucnv_getDefaultName(); uloc_setDefault("en_US_POSIX", &status); const char *locale_default = uloc_getDefault(); ux_stdin = u_finit(stdin, locale_default, codepage_default); ux_stdout = u_finit(stdout, locale_default, codepage_default); ux_stderr = u_finit(stderr, locale_default, codepage_default); CG3::Grammar grammar; grammar.ux_stderr = ux_stderr; grammar.allocateDummySet(); grammar.delimiters = grammar.allocateSet(); grammar.addTagToSet(grammar.allocateTag(CG3::stringbits[0].getTerminatedBuffer()), grammar.delimiters); grammar.reindex(); CG3::FormatConverter applicator(ux_stderr); applicator.setGrammar(&grammar); boost::scoped_ptr<CG3::istream> instream; CG3::CG_FORMATS fmt = CG3::FMT_INVALID; if (options[IN_CG].doesOccur) { fmt = CG3::FMT_CG; } else if (options[IN_NICELINE].doesOccur) { fmt = CG3::FMT_NICELINE; } else if (options[IN_APERTIUM].doesOccur) { fmt = CG3::FMT_APERTIUM; } else if (options[IN_FST].doesOccur) { fmt = CG3::FMT_FST; } else if (options[IN_PLAIN].doesOccur) { fmt = CG3::FMT_PLAIN; } if (options[IN_AUTO].doesOccur || fmt == CG3::FMT_INVALID) { CG3::UString buffer; buffer.resize(1000); int32_t nr = u_file_read(&buffer[0], buffer.size(), ux_stdin); buffer.resize(nr); URegularExpression *rx = 0; for (;;) { rx = uregex_openC("^\"<[^>]+>\".*?^\\s+\"[^\"]+\"", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status); uregex_setText(rx, buffer.c_str(), buffer.size(), &status); if (uregex_find(rx, -1, &status)) { fmt = CG3::FMT_CG; break; } uregex_close(rx); rx = uregex_openC("^\\S+ *\t *\\[\\S+\\]", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status); uregex_setText(rx, buffer.c_str(), buffer.size(), &status); if (uregex_find(rx, -1, &status)) { fmt = CG3::FMT_NICELINE; break; } uregex_close(rx); rx = uregex_openC("^\\S+ *\t *\"\\S+\"", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status); uregex_setText(rx, buffer.c_str(), buffer.size(), &status); if (uregex_find(rx, -1, &status)) { fmt = CG3::FMT_NICELINE; break; } uregex_close(rx); rx = uregex_openC("\\^[^/]+(/[^<]+(<[^>]+>)+)+\\$", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status); uregex_setText(rx, buffer.c_str(), buffer.size(), &status); if (uregex_find(rx, -1, &status)) { fmt = CG3::FMT_APERTIUM; break; } uregex_close(rx); rx = uregex_openC("^\\S+\t\\S+(\\+\\S+)+$", UREGEX_DOTALL | UREGEX_MULTILINE, 0, &status); uregex_setText(rx, buffer.c_str(), buffer.size(), &status); if (uregex_find(rx, -1, &status)) { fmt = CG3::FMT_FST; break; } fmt = CG3::FMT_PLAIN; break; } uregex_close(rx); instream.reset(new CG3::istream_buffer(ux_stdin, buffer)); } else { instream.reset(new CG3::istream(ux_stdin)); } applicator.setInputFormat(fmt); if (options[SUB_LTR].doesOccur) { grammar.sub_readings_ltr = true; } if (options[MAPPING_PREFIX].doesOccur) { size_t sn = strlen(options[MAPPING_PREFIX].value); CG3::UString buf(sn * 3, 0); UConverter *conv = ucnv_open(codepage_default, &status); ucnv_toUChars(conv, &buf[0], buf.size(), options[MAPPING_PREFIX].value, sn, &status); ucnv_close(conv); grammar.mapping_prefix = buf[0]; } if (options[SUB_DELIMITER].doesOccur) { size_t sn = strlen(options[SUB_DELIMITER].value); applicator.sub_delims.resize(sn * 2); UConverter *conv = ucnv_open(codepage_default, &status); sn = ucnv_toUChars(conv, &applicator.sub_delims[0], applicator.sub_delims.size(), options[SUB_DELIMITER].value, sn, &status); applicator.sub_delims.resize(sn); applicator.sub_delims += '+'; ucnv_close(conv); } if (options[FST_WTAG].doesOccur) { size_t sn = strlen(options[FST_WTAG].value); applicator.wtag.resize(sn * 2); UConverter *conv = ucnv_open(codepage_default, &status); sn = ucnv_toUChars(conv, &applicator.wtag[0], applicator.wtag.size(), options[FST_WTAG].value, sn, &status); applicator.wtag.resize(sn); ucnv_close(conv); } if (options[FST_WFACTOR].doesOccur) { applicator.wfactor = strtof(options[FST_WFACTOR].value, 0); } applicator.setOutputFormat(CG3::FMT_CG); if (options[OUT_APERTIUM].doesOccur) { applicator.setOutputFormat(CG3::FMT_APERTIUM); } else if (options[OUT_NICELINE].doesOccur) { applicator.setOutputFormat(CG3::FMT_NICELINE); } else if (options[OUT_PLAIN].doesOccur) { applicator.setOutputFormat(CG3::FMT_PLAIN); } applicator.is_conv = true; applicator.verbosity_level = 0; applicator.runGrammarOnText(*instream.get(), ux_stdout); u_fclose(ux_stdout); u_fclose(ux_stderr); u_cleanup(); }
int main(int argc, char **argv) { UErrorCode status = U_ZERO_ERROR; UFILE *ux_stdin = 0; UFILE *ux_stdout = 0; UFILE *ux_stderr = 0; /* Initialize ICU */ u_init(&status); if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) { std::cerr << "Error: Cannot initialize ICU. Status = " << u_errorName(status) << std::endl; CG3Quit(1); } U_MAIN_INIT_ARGS(argc, argv); argc = u_parseArgs(argc, argv, NUM_OPTIONS, options); if (argc < 0 || options[HELP1].doesOccur || options[HELP2].doesOccur) { FILE *out = (argc < 0) ? stderr : stdout; fprintf(out, "Usage: cg-mwesplit [OPTIONS]\n"); fprintf(out, "\n"); fprintf(out, "Options:\n"); size_t longest = 0; for (uint32_t i = 0; i < NUM_OPTIONS; i++) { if (options[i].description) { size_t len = strlen(options[i].longName); longest = std::max(longest, len); } } for (uint32_t i = 0; i < NUM_OPTIONS; i++) { if (options[i].description && options[i].description[0] != '!') { fprintf(out, " "); if (options[i].shortName) { fprintf(out, "-%c,", options[i].shortName); } else { fprintf(out, " "); } fprintf(out, " --%s", options[i].longName); size_t ldiff = longest - strlen(options[i].longName); while (ldiff--) { fprintf(out, " "); } fprintf(out, " %s", options[i].description); fprintf(out, "\n"); } } return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } ucnv_setDefaultName("UTF-8"); const char *codepage_default = ucnv_getDefaultName(); uloc_setDefault("en_US_POSIX", &status); const char *locale_default = uloc_getDefault(); ux_stdin = u_finit(stdin, locale_default, codepage_default); ux_stdout = u_finit(stdout, locale_default, codepage_default); ux_stderr = u_finit(stderr, locale_default, codepage_default); CG3::Grammar grammar; grammar.ux_stderr = ux_stderr; grammar.allocateDummySet(); grammar.delimiters = grammar.allocateSet(); grammar.addTagToSet(grammar.allocateTag(CG3::stringbits[0].getTerminatedBuffer()), grammar.delimiters); grammar.reindex(); CG3::MweSplitApplicator applicator(ux_stderr); applicator.setGrammar(&grammar); boost::scoped_ptr<CG3::istream> instream; instream.reset(new CG3::istream(ux_stdin)); applicator.is_conv = true; applicator.verbosity_level = 0; applicator.runGrammarOnText(*instream.get(), ux_stdout); u_fclose(ux_stdout); u_fclose(ux_stderr); u_cleanup(); }