Esempio n. 1
0
/**
 * Main function.
 *
 * Reads sentences line by line from standard input, analyzes each one and
 * prints its one-best result to standard output as "lexicon/POS" pairs.
 *
 * Usage: <program> [--stop STOPWORD_DICT] [--dict SYSTEM_DICT]
 *
 * Options are "name value" pairs. When an option is absent, the defaults
 * TEST_JMA_DEFAULT_STOPWORD_DICT and TEST_JMA_DEFAULT_SYSTEM_DICT are used.
 * An unknown option, or an option given without a value, prints the usage
 * and terminates the process with status 1.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return 0 on success; on any error the process terminates through exit(1)
 */
int main(int argc, char* argv[])
{
    const char* sysdict = TEST_JMA_DEFAULT_SYSTEM_DICT;
    const char* stopdict = TEST_JMA_DEFAULT_STOPWORD_DICT;

    // Visit every option name, including a trailing one that has no value,
    // so that a dangling or misspelled last argument is not silently ignored.
    for(int optIndex=1; optIndex<argc; optIndex+=2)
    {
        const char* optName = argv[optIndex];
        const bool isStopOption = ! strcmp(optName, "--stop");
        const bool isDictOption = ! strcmp(optName, "--dict");

        if(! isStopOption && ! isDictOption)
        {
            cerr << "unknown option: " << optName << endl;
            printUsage();
            exit(1);
        }

        // each option must be followed by its value
        if(optIndex+1 >= argc)
        {
            cerr << "missing value for option: " << optName << endl;
            printUsage();
            exit(1);
        }

        if(isStopOption)
            stopdict = argv[optIndex+1];
        else
            sysdict = argv[optIndex+1];
    }

    cout << "system dictionary: " << sysdict << endl;
    cout << "stop word dictionary: " << stopdict << endl;

    // create factory
    JMA_Factory* factory = JMA_Factory::instance();

    // create analyzer and knowledge
    Analyzer* analyzer = factory->createAnalyzer();
    Knowledge* knowledge = factory->createKnowledge();

    // load dictionary files (a return value of 0 is treated as failure)
    knowledge->setSystemDict(sysdict);
    if(knowledge->loadDict() == 0)
    {
        cerr << "error: fail to load dictionary files" << endl;
        exit(1);
    }
    cout << "encoding type of system dictionary: " << Knowledge::encodeStr(knowledge->getEncodeType()) << endl;

    // load stop word dictionary (a return value of 0 is treated as failure)
    if(knowledge->loadStopWordDict(stopdict) == 0)
    {
        cerr << "error: fail to load stop word dictionary" << endl;
        exit(1);
    }

    // set knowledge (a return value of 0 is treated as failure)
    if(analyzer->setKnowledge(knowledge) == 0)
    {
        cerr << "fail to set knowledge" << endl;
        exit(1);
    }

    // analyze each input line as one sentence
    Sentence s;
    string line;
    while(getline(cin, line))
    {
        s.setString(line.c_str());

        // any return value other than 1 is treated as failure
        if(analyzer->runWithSentence(s) != 1)
        {
            cerr << "error: fail in Analyzer::runWithSentence()" << endl;
            exit(1);
        }

        // get one-best result; an index of -1 means that none is available
        int i= s.getOneBestIndex();
        if(i == -1)
            cout << "no one-best result exists." << endl;
        else
        {
            for(int j=0; j<s.getCount(i); ++j)
                cout << s.getLexicon(i, j) << "/" << s.getStrPOS(i, j) << "  ";

            cout << endl;
        }
    }

    // destroy instances
    delete knowledge;
    delete analyzer;

    return 0;
}
Esempio n. 2
0
/**
 * Main function.
 */
int main(int argc, char* argv[])
{
    string inputFile, typeStr;
    bool isAnalyzeFile = false;
    bool isTestLine = false;

    try
    {
        po::options_description config("Allowed options");
        config.add_options()
            ("help,h", "print help message")
            ("file,f", po::value<string>(&inputFile), "use an input file instead of standard input")
            ("type,t", po::value<string>(&typeStr)->default_value("language"), "type of function: [encoding, language, list, segment, sentence]")
            ("line,l", "test each line in input file")
            ;

        po::options_description cmdline_options;
        cmdline_options.add(config);

        po::variables_map vm;
        store(po::command_line_parser(argc, argv).options(cmdline_options).run(), vm);
        po::notify(vm);    

        if(vm.count("help"))
        {
            cout << cmdline_options << endl;
            exit(1);
        }

        if(vm.count("file"))
        {
            isAnalyzeFile = true;
            //cout << "input file: " << inputFile << endl;
        }

        if(typeStr == "encoding" || typeStr == "language" || typeStr == "list" || typeStr == "segment" || typeStr == "sentence")
        {
            //cout << "function type: " << typeStr << endl;
        }
        else
        {
            cerr << "unknown function type: " << typeStr << endl;
            cout << cmdline_options << endl;
            exit(1);
        }

        if(vm.count("line"))
        {
            isTestLine = true;
            //cout << "testing each line" << endl;

            if(! isAnalyzeFile)
            {
                cerr << "error: no input file is given by -f." << endl;
                exit(1);
            }

            if(typeStr != "encoding" && typeStr != "language")
            {
                cerr << "error: line test mode is only valid for \"encoding\" or \"language\" type." << endl;
                exit(1);
            }
        }

        //cout << endl;
    }
    catch(std::exception& e)
    {
        cerr << "error: " << e.what() << "\n";
        exit(1);
    }

    // create instances
    Factory* factory = Factory::instance();
    Analyzer* analyzer = factory->createAnalyzer();
    Knowledge* knowledge = factory->createKnowledge();

    // model files
    const char* encodingModel = "../../db/langid/model/encoding.bin";
    const char* languageModel = "../../db/langid/model/language.bin";

    // load encoding model for encoding identification
    if(! knowledge->loadEncodingModel(encodingModel))
    {
        cerr << "error: fail to load file " << encodingModel << endl;
        exit(1);
    }

    // load language model for language identification or sentence tokenization
    if(! knowledge->loadLanguageModel(languageModel))
    {
        cerr << "error: fail to load file " << languageModel << endl;
        exit(1);
    }

    // not to limit analyze size
    //analyzer->setOption(Analyzer::OPTION_TYPE_LIMIT_ANALYZE_SIZE, 0);

    // set minimum block size
    //analyzer->setOption(Analyzer::OPTION_TYPE_BLOCK_SIZE_THRESHOLD, 100);

    // identify Chinese Traditional text as Chinese Simplified language
    //analyzer->setOption(Analyzer::OPTION_TYPE_NO_CHINESE_TRADITIONAL, 1);
    
    // set knowledge
    analyzer->setKnowledge(knowledge);
    
    // identify character encoding
    if(typeStr == "encoding")
    {
        if(isAnalyzeFile)
        {
            if(isTestLine)
                testEncodingFromFileLine(*analyzer, inputFile.c_str());
            else
                testEncodingFromFile(*analyzer, inputFile.c_str());
        }
        else
            testEncodingFromString(*analyzer);
    }
    // identify the single primary language in UTF-8 encoding
    else if(typeStr == "language")
    {
        if(isAnalyzeFile)
        {
            if(isTestLine)
                testLanguageFromFileLine(*analyzer, inputFile.c_str());
            else
                testLanguageFromFile(*analyzer, inputFile.c_str());
        }
        else
            testLanguageFromString(*analyzer);
    }
    // identify the list of multiple languages in UTF-8 encoding
    else if(typeStr == "list")
    {
        if(isAnalyzeFile)
            testLanguageListFromFile(*analyzer, inputFile.c_str());
        else
            testLanguageListFromString(*analyzer);
    }
    // segment the UTF-8 multi-lingual text into single-language regions
    else if(typeStr == "segment")
    {
        if(isAnalyzeFile)
            testSegmentFile(*analyzer, inputFile.c_str());
        else
            testSegmentString(*analyzer);
    }
    // tokenize the UTF-8 text into sentences
    else if(typeStr == "sentence")
    {
        if(isAnalyzeFile)
        {
            ifstream ifs(inputFile.c_str());
            if(! ifs)
            {
                cerr << "error in opening file " << inputFile << endl;
                exit(1);
            }
            testSentenceLength(*analyzer, ifs);
        }
        else
            testSentenceLength(*analyzer, cin);
    }
    
    delete knowledge;
    delete analyzer;

    return 0;
}
/**
 * Main function.
 *
 * Splits every line of a source file into sentences and writes each
 * non-empty, trimmed sentence on its own line of a destination file.
 *
 * Usage: <program> SOURCE DEST [--dict SYSTEM_DICT]
 *
 * When "--dict" is absent, TEST_JMA_DEFAULT_SYSTEM_DICT is used as the
 * system dictionary.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments; argv[1] is the source file and
 *             argv[2] is the destination file
 * @return 0 on success, 1 when writing the destination file failed; on any
 *         other error the process terminates through exit(1)
 */
int main(int argc, char* argv[])
{
    if(argc < 3)
    {
        printUsage();
        exit(1);
    }

    // set default dictionary file
    const char* sysdict = TEST_JMA_DEFAULT_SYSTEM_DICT;
    if(argc > 3)
    {
        if(strcmp(argv[3], "--dict") != 0)
        {
            cerr << "unknown command option " << argv[3] << endl;
            printUsage();
            exit(1);
        }

        // "--dict" must be followed by exactly one value and nothing else;
        // report this separately instead of calling "--dict" an unknown option
        if(argc != 5)
        {
            cerr << "option --dict requires exactly one dictionary path" << endl;
            printUsage();
            exit(1);
        }

        sysdict = argv[4];
    }

    // create factory
    JMA_Factory* factory = JMA_Factory::instance();

    // create analyzer and knowledge
    Analyzer* analyzer = factory->createAnalyzer();
    Knowledge* knowledge = factory->createKnowledge();

    // load dictionary files (a return value of 0 is treated as failure)
    knowledge->setSystemDict(sysdict);
    cout << "system dictionary: " << sysdict << endl;
    if(knowledge->loadDict() == 0)
    {
        cerr << "fail to load dictionary files" << endl;
        exit(1);
    }
    cout << "encoding type of system dictionary: " << Knowledge::encodeStr(knowledge->getEncodeType()) << endl;

    // set knowledge (a return value of 0 is treated as failure)
    if(analyzer->setKnowledge(knowledge) == 0)
    {
        cerr << "fail to set knowledge" << endl;
        exit(1);
    }

    // open files; argc >= 3 was checked above, so both paths are present
    const char* source = argv[1];
    const char* dest = argv[2];

    ifstream from(source);
    if(! from)
    {
        cerr << "error in opening file: " << source << endl;
        exit(1);
    }

    ofstream to(dest);
    if(! to)
    {
        cerr << "error in opening file: " << dest << endl;
        exit(1);
    }

    // split sentences
    string line;
    vector<Sentence> sentVec;
    while(getline(from, line))
    {
        sentVec.clear();
        analyzer->splitSentence(line.c_str(), sentVec);
        for(size_t i=0; i<sentVec.size(); ++i)
        {
            string str(sentVec[i].getString());
            trimString(str);

            // '\n' instead of endl: avoid flushing the file after every sentence
            if(!str.empty())
                to << str << '\n';
        }
    }

    // flush explicitly so that a failed write is detected and reported
    // instead of returning success with an incomplete destination file
    int status = 0;
    to.flush();
    if(! to)
    {
        cerr << "error in writing file: " << dest << endl;
        status = 1;
    }

    // destroy instances
    delete knowledge;
    delete analyzer;

    return status;
}