void create_corpus_index(const char* corpus_file){ // Create the index to be used for analysis std::cout << "Loading corpus using files listed in " << corpus_file << std::endl; std::fstream corpus (corpus_file, std::fstream::in); std::istream_iterator<std::string> corpus_iterator(corpus); std::for_each(corpus_iterator, std::istream_iterator<std::string>(), [](const std::string& doc){ read_document(doc, corpus_index); }); std::cout << "Loaded corpus of " << corpus_index.size() << " words from " << num_docs << " file(s)." << std::endl; }
// Load newline-separated words from "test.txt" into a fresh trie (Node),
// recording each word's rank (the running global line_number) in the global
// wordRank map. Returns the trie root by value.
Node load_corpus() {
    std::ifstream corpus("test.txt");
    std::string line;
    Node root;
    if (corpus.is_open()) {
        while (std::getline(corpus, line)) {
            wordRank[line] = line_number;
            root.add(line);
            line_number++;
        }
    } else {
        // Robustness: the original silently returned an empty trie on a
        // missing/unreadable file; at least say so.
        std::cerr << "load_corpus: unable to open test.txt" << std::endl;
    }
    // Debug print: number of children under the path t -> h -> a.
    // BUG FIX: the original chained children['t']->children['h']->children['a'],
    // which (a) default-inserts null entries for missing keys via operator[]
    // and (b) dereferences a null pointer (UB) whenever the path is absent.
    // Walk the path with find() instead. (Assumes `children` is a
    // std::map<char, Node*>-like container — matches the original usage;
    // TODO confirm against Node's declaration.)
    Node* cursor = &root;
    for (char step : std::string("tha")) {
        auto hit = cursor->children.find(step);
        if (hit == cursor->children.end() || hit->second == nullptr) {
            cursor = nullptr;
            break;
        }
        cursor = hit->second;
    }
    if (cursor != nullptr)
        std::cout << cursor->children.size() << std::endl;
    return root;
}
// Build the vocabulary by scanning every tagged document in `train_file`.
//
// Two modes, selected by the m_doctag member:
//  - doctag mode: one vocab entry per document tag (doc->m_tag);
//  - word mode:   one vocab entry per distinct word, with occurrence counts
//                 (cn) accumulated across the whole file.
//
// Side effects: resets m_vocab_hash / m_vocab_size, grows m_vocab via
// addWordToVocab(), updates m_train_words, and (word mode only) sorts the
// vocabulary and prints summary statistics at the end.
void Vocabulary::loadFromTrainFile(const char * train_file) {
  char * word;
  TaggedBrownCorpus corpus(train_file);
  long long a, i, k;
  // Reset the open-addressing hash table before (re)building the vocab.
  for (a = 0; a < vocab_hash_size; a++) m_vocab_hash[a] = -1;
  m_vocab_size = 0;
  // Word mode reserves the first slot for the delimiter token "</s>",
  // mirroring the word2vec convention.
  if(!m_doctag) addWordToVocab((char *)"</s>");
  TaggedDocument * doc = NULL;
  while ((doc = corpus.next()) != NULL) {
    if(m_doctag) { //for doc tag
      word = doc->m_tag;
      m_train_words++;
      i = searchVocab(word);
      if (i == -1) {
        // Unseen tag: add with an initial count of 1. Note a repeated tag does
        // NOT get its count bumped here (unlike words below) — presumably tags
        // are expected to be unique per document; verify against the corpus.
        a = addWordToVocab(word);
        m_vocab[a].cn = 1;
      }
    } else { // for doc words
      for(k = 0; k < doc->m_word_num; k++){
        word = doc->m_words[k];
        m_train_words++;
        // Progress indicator every 100k tokens; 13 is '\r', so the counter
        // overwrites itself on the same console line.
        if (!m_doctag && m_train_words % 100000 == 0) {
          printf("%lldK%c", m_train_words / 1000, 13);
          fflush(stdout);
        }
        i = searchVocab(word);
        if (i == -1) {
          a = addWordToVocab(word);
          m_vocab[a].cn = 1;
        } else m_vocab[i].cn++;
        // Keep the hash table's load factor below 0.7 by pruning rare words.
        if (m_vocab_size > vocab_hash_size * 0.7) reduceVocab();
      }
      // NOTE(review): one token subtracted per document — looks like a
      // compensation for a per-document delimiter; confirm against callers.
      m_train_words--;
    }
  }
  if(!m_doctag) {
    sortVocab();
    printf("Vocab size: %lld\n", m_vocab_size);
    printf("Words in train file: %lld\n", m_train_words);
  }
}
int main(int argc, char* argv[]) { std::string searchFile(argv[2]); uint topN(std::stoi(argv[3])); std::ifstream corpus(argv[1]); std::istream_iterator<std::string> corpus_it(corpus), eof; std::vector<std::string> fileList(corpus_it, eof); strIntMap corpusMap, documentMap; std::cout << "Loading corpus using files listed in " << argv[1] << std::endl; loadCorpusAndSearchFiles(corpusMap, documentMap, searchFile, fileList); std::cout << "Loaded corpus of " << corpusMap.size() << " words from " << fileList.size() << " file(s)" << std::endl << "------[ Starting analysis ]------" << std::endl << "Top " << topN << " significant words..." << std::endl; std::set<tfidfPair> result; getTopN(topN, fileList.size(), result, documentMap, corpusMap); printTopN(result); std::cout << "Lines with 1 or more significant words:" << std::endl; countSigWords(searchFile, result); return 0; }
// Train a C5.0 classifier over `corpus`, predicting the column `atributo`.
// Writes the temporary c50tempT.names / c50tempT.data files C5.0 expects,
// invokes c50train(), then loads the resulting decision tree back from disk.
// Returns a newly allocated C50Classificador (caller takes ownership).
Classificador *C50Treinador::executarTreinamento( Corpus &corpus, int atributo )
{
    string linha, val;
    ifstream tree;
    ofstream names, data;
    vector<int> indexes;
    vector<string> linhasArquivo, valores;
    vector< vector<string> > valoresPossiveis;
    int c, a, numeroClasses, numeroAtributos, e, v, numeroValores;

    // Generate the .names file: class list, then one declaration per attribute.
    names.open("c50tempT.names");
    numeroClasses = classes.size();
    for (c = 0; c < numeroClasses; c++){
        names << classes[c];
        if (c != numeroClasses-1)
            names << ", ";
        else
            names << "." << endl << endl;
    }
    numeroAtributos = atributos.size();
    for (a = 0; a < numeroAtributos; a++){
        names << atributos[a] << ": ";
        if (corpus.discreto(atributos[a], valores)){
            // Discrete attribute: enumerate its possible values ("?" = missing).
            numeroValores = valores.size();
            for (v = 0; v < numeroValores; v++){
                if (valores[v] == "")
                    names << "?";
                else
                    names << removeVirgula(valores[v]);
                if (v != numeroValores-1)
                    names << ", ";
                else
                    names << "." << endl;
            }
        }
        else{
            names << "continuous." << endl;
            valores.clear();
            valores.push_back("continuous");
        }
        valoresPossiveis.push_back(valores);
        indexes.push_back(corpus.pegarPosAtributo(atributos[a]));
    }
    names.close();

    // Generate the .data file: one example per line, "?" for missing values,
    // with the target attribute last.
    data.open("c50tempT.data");
    for (c = 0; c < corpus.pegarQtdConjExemplos(); c++)
        for (e = 0; e < corpus.pegarQtdExemplos(c); e++){
            for (a = 0; a < numeroAtributos; a++){
                val = corpus(c, e, indexes[a]);
                if (val == "")
                    data << "?, ";
                else
                    data << removeVirgula(val) << ", ";
            }
            data << corpus(c, e, atributo) << endl;
        }
    data.close();

    // Invoke the external C5.0 training routine.
    c50train("c50tempT", cf);

    // Load the generated .tree file.
    // BUG FIX: the original looped on `!tree.eof()`, which appends a spurious
    // (usually empty) trailing line after the final getline fails, and can
    // spin forever when the file fails to open (failbit set, eofbit never
    // set). Testing the stream returned by getline reads exactly the lines
    // that exist.
    tree.open("c50tempT.tree");
    while (getline(tree, linha))
        linhasArquivo.push_back(linha);
    tree.close();

    C50Classificador *cl = new C50Classificador();
    cl->linhasArquivo = linhasArquivo;
    cl->atributos = atributos;
    cl->valoresPossiveis = valoresPossiveis;
    cl->classes = classes;
    return cl;
}
//Convert all the mila xml's to vrt files and store them in the vrt folder bool CMilatoCWBConverter::ConvertFromMilaToVrt(){ cout << "-------------------------" << endl; cout << " Start Converting " << endl; cout << "-------------------------" << endl << endl; //Open a linux folders tree char *dot[] = {const_cast<char *>(m_MilaFolderPath.data()), 0}; FTS *tree = fts_open(dot,FTS_NOCHDIR, 0); if (!tree) { perror("fts_open"); return 1; } //Start working on every xml file in the input directory. FTSENT *node; //A place holder for the currnet folder //Every vrt file will have is current folder string as the value of the id attribute of the text node string sVrtTextID; string sContainerVrtTextID; string sCurrentCorpusName; //Initialize the mila converter helper class CCorpus::milaConverter.Initialize(); while ((node = fts_read(tree))) { if (node->fts_level > 0 && node->fts_name[0] == '.') fts_set(tree, node, FTS_SKIP); //If directory node - We will create this directory in the vrtfolder else if (node->fts_info & FTS_D) { //A corpus name folder if (node->fts_level == 1) { //Save the corpus name for the id attribute of the text sCurrentCorpusName = node->fts_name; sContainerVrtTextID = "t_" + sCurrentCorpusName; //Replace all occurences of "-" with "_" string from = "-"; string to = "_"; while(sContainerVrtTextID.find(from) != std::string::npos) { sContainerVrtTextID.replace(sContainerVrtTextID.find(from), from.length(), to); } } //Saves the current folder sVrtTextID = node->fts_name; //Check if the name is valid (a valid c variable name) //Add a prefix of t(text) to the value sVrtTextID = "t_" + sCurrentCorpusName + "_" + sVrtTextID; //Replace all occurences of "-" with "_" string from = "-"; string to = "_"; while(sVrtTextID.find(from) != std::string::npos) { sVrtTextID.replace(sVrtTextID.find(from), from.length(), to); } //Create the output folders //First for the main vrt path string vrtPath = node->fts_accpath; //Update the Error Logger 
errorLogger->StartANewText(node->fts_accpath); vrtPath.replace(vrtPath.find(m_MilaFolderPath),m_MilaFolderPath.length(),m_VrtFolderPath.data()); cout << endl << "-----------------------------------------------------------------------------------------" << endl; cout << " --Start converting the output folder :" << vrtPath << endl << endl; mkdir(vrtPath.data() , S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); //Second for the Container vrt path vrtPath = node->fts_accpath; vrtPath.replace(vrtPath.find(m_MilaFolderPath),m_MilaFolderPath.length(),m_ContainerVRTFolderPath.data()); cout << endl << "-----------------------------------------------------------------------------------------" << endl; cout << " --Start converting the output folder :" << vrtPath << endl << endl; mkdir(vrtPath.data() , S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); } //If corrent node is a file else if (node->fts_info & FTS_F) { //The absoulte path to the mila xml file including the name string xmlName = node->fts_accpath; //Updating the ErrorLoger errorLogger->SetCurrentFile(xmlName); //The name of the output vrt file string outName = node->fts_name; outName.replace(outName.find(".xml"),4,".vrt"); //The path to the output file string vrtPath = xmlName.data(); vrtPath.replace(vrtPath.find(m_MilaFolderPath),m_MilaFolderPath.length(),m_VrtFolderPath.data()); vrtPath.replace(vrtPath.find(node->fts_name),outName.length(),""); //The path to the container output file string containerVrtPath = xmlName.data(); containerVrtPath.replace(containerVrtPath.find(m_MilaFolderPath),m_MilaFolderPath.length(),m_ContainerVRTFolderPath.data()); containerVrtPath.replace(containerVrtPath.find(node->fts_name),outName.length(),""); ////////////////////////////////////////////////// //Loading document //Creating the document xml_document doc; //Load the file xml_parse_result result = doc.load_file(xmlName.data()); //Check if the xml file is valid if (result) { //If valid //Create writing file streams //main vrt ofstream 
outputFile((vrtPath + outName).data()); //Container vrt ofstream containerOutputFile((containerVrtPath + outName).data()); xml_node corpusNode= doc.child("corpus"); if (corpusNode) { //If enable to open output file if (outputFile.is_open() && containerOutputFile.is_open()) { //Start parsing the xml file into the outputfile //Create a corpus object CCorpus corpus(&corpusNode,&outputFile, &containerOutputFile, sVrtTextID ,sContainerVrtTextID); //Start parsing if (!corpus.Parse()) cout << endl << "Error Parsing " << corpusNode.attribute("name").value() << endl; outputFile.close(); containerOutputFile.close(); } else //if Failed to open file { cout << endl << "Unable to open output file : " << outName << endl; if (fts_close(tree)) { perror("fts_close"); return false; } return false; } } else { cout << endl << "Error with file: " << xmlName << endl; } } else { //if Not valid print the description cout << endl << "Load result " << result.description() << endl; cout << endl << "Error offset: " << result.offset << "(error at [..." << xmlName << result.offset << "]" << endl << endl; } } } cout << endl; //Close the errorLogger and the error file delete errorLogger; if (fts_close(tree)) { perror("fts_close"); return false; } return true; }
// Driver: generates a synthetic bag-of-words corpus and fits an LDA topic
// model to it with variational EM, printing topics/usage on the configured
// cadence. NOTE(review): the (de)serialization options are parsed but the
// code that consumed them is commented out inside the epoch loop below.
int main(int n_args, char** args) {
  init_logging();
  isage::util::print_pstats();
  int num_topics;
  int num_epochs_;
  isage::wtm::LDAVStrategy strategy;
  std::string output_usage_name;
  std::string heldout_output_usage_name;
  po::variables_map vm;
  {
    // Command-line options: corpus-generation knobs first, then the training
    // schedule, then output/serialization destinations.
    po::options_description desc("Allowed options");
    desc.add_options()
      ("help", "produce help message")
      ("vocab-size", po::value< int >()->default_value(10), "number of vocab words (default: 10)")
      ("words-per-doc", po::value<int>()->default_value(10), "number of words per document (default: 10)")
      ("bias", po::value<double>()->default_value(.8), "Bernoulli parameter p for how to partition the vocab words. (default: 0.8)")
      ("num-docs", po::value<int>()->default_value(10), "number of documents to generate (default: 10)")
      //////////////////////////
      ("topics", po::value<int>(&num_topics)->default_value(10), "number of topics to use")
      ("train-epochs", po::value<int>(&num_epochs_)->default_value(5), "Number of epochs to run")
      ("em-iterations", po::value<int>(&(strategy.num_learn_iters))->default_value(100), "number of EM iterations to run")
      ("e-steps", po::value<int>(&(strategy.num_e_iters))->default_value(25), "number of iterations to perform, per E-step")
      ("m-steps", po::value<int>(&(strategy.num_m_iters))->default_value(1), "number of iterations to perform, per M-step")
      ("update-hypers", po::value<int>(&(strategy.hyper_update_iter))->default_value(-1), "how often to update the hyperparameters (default: -1 == never update)")
      ("update-model-interval", po::value<int>(&(strategy.update_model_every))->default_value(5), "update the model every [some] number of EM steps (default: 5)")
      ("print-topics-every", po::value<int>(&(strategy.print_topics_every))->default_value(5), "print topics every [some] number of EM steps (default: 5)")
      ("print-usage-every", po::value<int>(&(strategy.print_usage_every))->default_value(5), "print topic usage every [some] number of EM steps (default: 5)")
      ("top-k", po::value<int>(&(strategy.print_topics_k))->default_value(10), "number of words per topic to print (default: 10)")
      ("em-verbosity", po::value<int>(&(strategy.em_verbosity))->default_value(1), "how verbose should EM output be (default: 1; higher == more verbose)")
      ("eta-density-threshold", po::value<double>(&(strategy.eta_density_threshold))->default_value(1E-4), "the threshold t for counting the number of eta parameters are above t (default: 1E-4)")
      ////////////////////////////////
      ("topic-usage-file", po::value<std::string>(&output_usage_name)->default_value("-"), "filename to write topic usage to (default: - (to console)")
      ("heldout-topic-usage-file", po::value<std::string>(&heldout_output_usage_name)->default_value("-"), "filename to write heldout topic usage to (default: - (to console)")
      ("inferencer-serialization", po::value<std::string>(), "filename to serialize inference state to")
      ("serialized-inferencer", po::value<std::string>(), "filename to READ serialized inference state from")
      ////////////////////////////////
      ;
    po::store(po::parse_command_line(n_args, args, desc), vm);
    if (vm.count("help")) {
      ERROR << desc << "\n";
      return 1;
    }
    po::notify(vm);
  }

  // Shorthand aliases for the model/inference machinery.
  typedef std::string string;
  typedef string VocabType;
  typedef isage::wtm::Vocabulary< VocabType > SVocab;
  typedef double CountType;
  typedef isage::wtm::Document< VocabType, CountType > Doc;
  typedef isage::wtm::Corpus< Doc > Corpus;
  typedef std::vector<double> TopicType;
  typedef isage::wtm::DiscreteLDA< VocabType, std::vector<double> > Model;
  typedef isage::wtm::DiscreteVariational< Doc, VocabType, TopicType > Variational;

  isage::util::SmartWriter usage_outer(output_usage_name);
  isage::util::SmartWriter assign_outer("assignments");

  Variational* var_inf = NULL;
  // Synthetic vocabulary word_1 .. word_{vocab-size}, with an OOV sentinel.
  SVocab word_vocab("__OOV__");
  for(int wi = 1; wi <= vm["vocab-size"].as<int>(); ++wi) {
    word_vocab.make_word("word_" + std::to_string(wi));
  }

  // Generate the synthetic training corpus from the CLI parameters.
  Corpus corpus("train_corpus");
  corpus.generate(vm["num-docs"].as<int>(), vm["words-per-doc"].as<int>(), vm["bias"].as<double>(), word_vocab );
  int num_words_total = get_num_tokens(corpus);
  INFO << "Number of documents: " << corpus.num_docs();
  INFO << "Number of word tokens total: " << num_words_total;
  INFO << "Number of vocab types: " << word_vocab.num_words();

  // Symmetric priors: theta gets 1/K per topic, word hyperparameter 0.1.
  isage::wtm::SymmetricHyperparams shp;
  shp.h_theta = 1.0/(double)num_topics;
  shp.h_word = 0.1;
  INFO << "Creating model with " << num_topics << " topics";
  Model dm(num_topics, &shp, &word_vocab);
  INFO << "Done creating model.";

  var_inf = new Variational(&dm, &corpus, &word_vocab);
  // Seed weighted by average document length.
  isage::wtm::UniformHyperSeedWeightedInitializer initer(num_topics, corpus.num_docs(), (double)num_words_total/(double)corpus.num_docs());
  var_inf->init(initer);

  // Variational EM training loop: one learn() pass per epoch.
  for(int epoch = 0; epoch < num_epochs_; ++epoch) {
    INFO << "Starting learning epoch " << epoch;
    var_inf->learn(strategy, epoch, usage_outer, assign_outer);
    INFO << "Done with inference in epoch " << epoch;
    // // create and open a character archive for output
    // if(vm.count("inferencer-serialization")) {
    //   std::string sfname = vm["inferencer-serialization"].as<std::string>() +
    //     ".iteration" + std::to_string((1+epoch));
    //   std::ofstream ofs(sfname, std::ios::out|std::ios::binary);
    //   boost::iostreams::filtering_streambuf<boost::iostreams::output> out;
    //   out.push(boost::iostreams::gzip_compressor());
    //   out.push(ofs);
    //   boost::archive::binary_oarchive oa(out);
    //   oa << (*var_inf);
    //   INFO << "see " << sfname << " for serialized inferencer";
    // }
    dm.print_topics(strategy.print_topics_k, word_vocab);
  }
  if(var_inf != NULL) {
    delete var_inf;
  }
  return 0;
}