/** * Main function. */ int main(int argc, char* argv[]) { const char* sysdict = TEST_JMA_DEFAULT_SYSTEM_DICT; const char* stopdict = TEST_JMA_DEFAULT_STOPWORD_DICT; for(int optIndex=1; optIndex+1<argc; optIndex+=2) { if(! strcmp(argv[optIndex], "--stop")) stopdict = argv[optIndex+1]; else if(! strcmp(argv[optIndex], "--dict")) sysdict = argv[optIndex+1]; else { cerr << "unknown option: " << argv[optIndex] << endl; printUsage(); exit(1); } } cout << "system dictionary: " << sysdict << endl; cout << "stop word dictionary: " << stopdict << endl; // create factory JMA_Factory* factory = JMA_Factory::instance(); // create analyzer and knowledge Analyzer* analyzer = factory->createAnalyzer(); Knowledge* knowledge = factory->createKnowledge(); // load dictioanry files knowledge->setSystemDict(sysdict); if(knowledge->loadDict() == 0) { cerr << "error: fail to load dictionary files" << endl; exit(1); } cout << "encoding type of system dictionary: " << Knowledge::encodeStr(knowledge->getEncodeType()) << endl; // load stop word dictionary if(knowledge->loadStopWordDict(stopdict) == 0) { cerr << "error: fail to load stop word dictionary" << endl; exit(1); } // set knowledge if(analyzer->setKnowledge(knowledge) == 0) { cerr << "fail to set knowledge" << endl; exit(1); } Sentence s; string line; while(getline(cin, line)) { s.setString(line.c_str()); if(analyzer->runWithSentence(s) != 1) { cerr << "error: fail in Analyzer::runWithSentence()" << endl; exit(1); } // get one-best result int i= s.getOneBestIndex(); if(i == -1) cout << "no one-best result exists." << endl; else { for(int j=0; j<s.getCount(i); ++j) cout << s.getLexicon(i, j) << "/" << s.getStrPOS(i, j) << " "; cout << endl; } } delete knowledge; delete analyzer; return 0; }
/** * Main function. */ int main(int argc, char* argv[]) { string inputFile, typeStr; bool isAnalyzeFile = false; bool isTestLine = false; try { po::options_description config("Allowed options"); config.add_options() ("help,h", "print help message") ("file,f", po::value<string>(&inputFile), "use an input file instead of standard input") ("type,t", po::value<string>(&typeStr)->default_value("language"), "type of function: [encoding, language, list, segment, sentence]") ("line,l", "test each line in input file") ; po::options_description cmdline_options; cmdline_options.add(config); po::variables_map vm; store(po::command_line_parser(argc, argv).options(cmdline_options).run(), vm); po::notify(vm); if(vm.count("help")) { cout << cmdline_options << endl; exit(1); } if(vm.count("file")) { isAnalyzeFile = true; //cout << "input file: " << inputFile << endl; } if(typeStr == "encoding" || typeStr == "language" || typeStr == "list" || typeStr == "segment" || typeStr == "sentence") { //cout << "function type: " << typeStr << endl; } else { cerr << "unknown function type: " << typeStr << endl; cout << cmdline_options << endl; exit(1); } if(vm.count("line")) { isTestLine = true; //cout << "testing each line" << endl; if(! isAnalyzeFile) { cerr << "error: no input file is given by -f." << endl; exit(1); } if(typeStr != "encoding" && typeStr != "language") { cerr << "error: line test mode is only valid for \"encoding\" or \"language\" type." << endl; exit(1); } } //cout << endl; } catch(std::exception& e) { cerr << "error: " << e.what() << "\n"; exit(1); } // create instances Factory* factory = Factory::instance(); Analyzer* analyzer = factory->createAnalyzer(); Knowledge* knowledge = factory->createKnowledge(); // model files const char* encodingModel = "../../db/langid/model/encoding.bin"; const char* languageModel = "../../db/langid/model/language.bin"; // load encoding model for encoding identification if(! knowledge->loadEncodingModel(encodingModel)) { cerr << "error: fail to load file " << encodingModel << endl; exit(1); } // load language model for language identification or sentence tokenization if(! knowledge->loadLanguageModel(languageModel)) { cerr << "error: fail to load file " << languageModel << endl; exit(1); } // not to limit analyze size //analyzer->setOption(Analyzer::OPTION_TYPE_LIMIT_ANALYZE_SIZE, 0); // set minimum block size //analyzer->setOption(Analyzer::OPTION_TYPE_BLOCK_SIZE_THRESHOLD, 100); // identify Chinese Traditional text as Chinese Simplified language //analyzer->setOption(Analyzer::OPTION_TYPE_NO_CHINESE_TRADITIONAL, 1); // set knowledge analyzer->setKnowledge(knowledge); // identify character encoding if(typeStr == "encoding") { if(isAnalyzeFile) { if(isTestLine) testEncodingFromFileLine(*analyzer, inputFile.c_str()); else testEncodingFromFile(*analyzer, inputFile.c_str()); } else testEncodingFromString(*analyzer); } // identify the single primary language in UTF-8 encoding else if(typeStr == "language") { if(isAnalyzeFile) { if(isTestLine) testLanguageFromFileLine(*analyzer, inputFile.c_str()); else testLanguageFromFile(*analyzer, inputFile.c_str()); } else testLanguageFromString(*analyzer); } // identify the list of multiple languages in UTF-8 encoding else if(typeStr == "list") { if(isAnalyzeFile) testLanguageListFromFile(*analyzer, inputFile.c_str()); else testLanguageListFromString(*analyzer); } // segment the UTF-8 multi-lingual text into single-language regions else if(typeStr == "segment") { if(isAnalyzeFile) testSegmentFile(*analyzer, inputFile.c_str()); else testSegmentString(*analyzer); } // tokenize the UTF-8 text into sentences else if(typeStr == "sentence") { if(isAnalyzeFile) { ifstream ifs(inputFile.c_str()); if(! ifs) { cerr << "error in opening file " << inputFile << endl; exit(1); } testSentenceLength(*analyzer, ifs); } else testSentenceLength(*analyzer, cin); } delete knowledge; delete analyzer; return 0; }
/** * Main function. */ int main(int argc, char* argv[]) { if(argc < 3) { printUsage(); exit(1); } // set default dictionary file const char* sysdict = TEST_JMA_DEFAULT_SYSTEM_DICT; if(argc > 3) { if(argc == 5 && ! strcmp(argv[3], "--dict")) sysdict = argv[4]; else { cerr << "unknown command option " << argv[3] << endl; printUsage(); exit(1); } } // create factory JMA_Factory* factory = JMA_Factory::instance(); // create analyzer and knowledge Analyzer* analyzer = factory->createAnalyzer(); Knowledge* knowledge = factory->createKnowledge(); // load dictioanry files knowledge->setSystemDict(sysdict); cout << "system dictionary: " << sysdict << endl; if(knowledge->loadDict() == 0) { cerr << "fail to load dictionary files" << endl; exit(1); } cout << "encoding type of system dictionary: " << Knowledge::encodeStr(knowledge->getEncodeType()) << endl; // set knowledge if(analyzer->setKnowledge(knowledge) == 0) { cerr << "fail to set knowledge" << endl; exit(1); } // open files const char* source = argv[1]; const char* dest = argv[2]; assert(source && dest); ifstream from(source); if(! from) { cerr << "error in opening file: " << source << endl; exit(1); } ofstream to(dest); if(! to) { cerr << "error in opening file: " << dest << endl; exit(1); } // split sentences string line; vector<Sentence> sentVec; while(getline(from, line)) { sentVec.clear(); analyzer->splitSentence(line.c_str(), sentVec); for(size_t i=0; i<sentVec.size(); ++i) { string str(sentVec[i].getString()); trimString(str); if(!str.empty()) to << str << endl; } } // destroy instances delete knowledge; delete analyzer; return 0; }