int main(int argc, char** argv) { // Create an instance of the Kytea program Kytea kytea; // Load a KyTea model from a model file // this can be a binary or text model in any character encoding, // it will be detected automatically kytea.readModel("../../data/model.bin"); // Get the string utility class. This allows you to convert from // the appropriate string encoding to Kytea's internal format StringUtil* util = kytea.getStringUtil(); // Get the configuration class, this allows you to read or set the // configuration for the analysis KyteaConfig* config = kytea.getConfig(); // Map a plain text string to a KyteaString, and create a sentence object KyteaString surface_string = util->mapString("これはテストです。"); KyteaSentence sentence(surface_string, util->normalize(surface_string)); // Find the word boundaries kytea.calculateWS(sentence); // Find the pronunciations for each tag level for(int i = 0; i < config->getNumTags(); i++) kytea.calculateTags(sentence,i); // For each word in the sentence const KyteaSentence::Words & words = sentence.words; for(int i = 0; i < (int)words.size(); i++) { // Print the word cout << util->showString(words[i].surface); // For each tag level for(int j = 0; j < (int)words[i].tags.size(); j++) { cout << "\t"; // Print each of its tags for(int k = 0; k < (int)words[i].tags[j].size(); k++) { cout << " " << util->showString(words[i].tags[j][k].first) << "/" << words[i].tags[j][k].second; } } cout << endl; } cout << endl; }
// Reads the configuration section of a text model from the stream behind str_.
//
// Layout consumed, in order:
//   1. one header line (skipped without inspection)
//   2. option lines of the form "<name> [<value>]", ended by an empty line
//      (or the end of the stream); each is passed to config.parseTrainArg
//   3. a line that must be exactly "characters"
//   4. one line holding the serialized string utility, which is handed to
//      config.getStringUtil()->unserialize
//   5. one trailing line, consumed but not inspected
//
// Side effects: updates config, its string utility, and numTags_.
// Reports a badly formatted file through THROW_ERROR when the "characters"
// marker is missing.
void TextModelIO::readConfig(KyteaConfig & config) {
    string line;
    getline(*str_, line); // ignore the header

    while(getline(*str_, line) && line.length() != 0) {
        // Both tokens are scoped to this line on purpose: a failed stream
        // extraction leaves its target string untouched, so strings shared
        // across iterations would hand the previous line's value to an
        // option that has none (e.g. "-nobias" right after "-numtags 2").
        string name, value;
        istringstream iss(line);
        iss >> name;
        iss >> value;
        // Options without a value are passed a null pointer
        config.parseTrainArg(name.c_str(), (value.length()==0 ? 0 : value.c_str()));
    }
    numTags_ = config.getNumTags();

    getline(*str_, line); // check the header of the character section
    if(line != "characters")
        THROW_ERROR("Badly formatted file, expected 'characters', got '" << line << "'");

    getline(*str_, line); // get the serialized string util
    config.getStringUtil()->unserialize(line);

    getline(*str_, line); // consume the last line of the section
}
// trains a KyTea model int main(int argc, const char **argv) { #ifndef KYTEA_SAFE try { #endif KyteaConfig * config = new KyteaConfig; config->setDebug(1); config->setOnTraining(true); config->parseTrainCommandLine(argc, argv); Kytea kytea(config); kytea.trainAll(); return 0; #ifndef KYTEA_SAFE } catch (exception &e) { cerr << endl; cerr << " KyTea Error: " << e.what() << endl; return 1; } #endif }
void TextModelIO::writeConfig(const KyteaConfig & config) { *str_ << "KyTea " << MODEL_IO_VERSION << " T " << config.getEncodingString() << endl; numTags_ = (int)config.getNumTags(); if(!config.getDoWS()) *str_ << "-nows" << endl; if(!config.getDoTags()) *str_ << "-notags" << endl; *str_ << "-numtags " << numTags_ << endl; if(config.getBias()<0) *str_ << "-nobias" << endl; *str_ << "-charw " << (int)config.getCharWindow() << endl << "-charn " << (int)config.getCharN() << endl << "-typew " << (int)config.getTypeWindow() << endl << "-typen " << (int)config.getTypeN() << endl << "-dicn " << (int)config.getDictionaryN() << endl << "-eps " << config.getEpsilon() << endl << "-solver " << config.getSolverType() << endl << endl; // write the character map *str_ << "characters" << endl << config.getStringUtil()->serialize() << endl; *str_ << endl; }