Example #1
void create_corpus_index(const char* corpus_file){
  // Build the word index used for analysis; corpus_file lists one
  // document path per whitespace-separated token.
  // corpus_index and num_docs are globals defined elsewhere;
  // read_document() is expected to update both.
  std::cout << "Loading corpus using files listed in " << corpus_file << std::endl;
  std::ifstream corpus(corpus_file);
  std::for_each(std::istream_iterator<std::string>(corpus),
                std::istream_iterator<std::string>(),
                [](const std::string& doc){
                  read_document(doc, corpus_index);
                });
  std::cout << "Loaded corpus of " << corpus_index.size()
            << " words from " << num_docs << " file(s)."
            << std::endl;
}
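The globals and helper this snippet relies on are not shown. A minimal sketch of what they might look like (corpus_index, num_docs, and read_document here are hypothetical reconstructions, not the original definitions):

// Hypothetical supporting definitions for Example #1 (assumed, not original):
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <string>

std::map<std::string, int> corpus_index; // word -> occurrence count
int num_docs = 0;

// Reads one document and folds its words into the index.
void read_document(const std::string& path, std::map<std::string, int>& index) {
  std::ifstream in(path);
  std::string word;
  while (in >> word) ++index[word];
  ++num_docs;
}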
Example #2
Node load_corpus() {
	// wordRank and line_number are globals defined elsewhere:
	// wordRank maps each word to the line it first appeared on.
	std::ifstream corpus("test.txt");
	std::string line;
	Node root;
	if (corpus.is_open()) {
		while (std::getline(corpus, line)) {
			wordRank[line] = line_number;
			root.add(line);
			line_number++;
		}
	}
	// Debug probe: counts the children under the "tha" prefix.
	// Note this dereferences null (undefined behavior) unless some
	// word starting with "tha" was actually inserted above.
	std::cout << root.children['t']->children['h']->children['a']->children.size() << std::endl;
	return root;
}
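The Node type is not shown. A minimal trie sketch consistent with the children['t']->... access pattern and the add() call above might look like this (an assumption, not the original class):

#include <map>
#include <string>

// Hypothetical trie node consistent with Example #2 (assumed, not original).
struct Node {
	std::map<char, Node*> children; // edge character -> child node

	// Inserts word one character at a time, allocating nodes as needed.
	void add(const std::string& word) {
		Node* cur = this;
		for (char ch : word) {
			Node*& next = cur->children[ch];
			if (!next) next = new Node();
			cur = next;
		}
	}
};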
Example #3
void Vocabulary::loadFromTrainFile(const char * train_file)
{
  char * word;
  TaggedBrownCorpus corpus(train_file);
  long long a, i, k;
  // Reset the vocabulary hash table (-1 marks an empty slot).
  for (a = 0; a < vocab_hash_size; a++) m_vocab_hash[a] = -1;
  m_vocab_size = 0;
  if (!m_doctag) addWordToVocab((char *)"</s>");
  TaggedDocument * doc = NULL;
  while ((doc = corpus.next()) != NULL) {
    if (m_doctag) {  // building the doc-tag vocabulary
      word = doc->m_tag;
      m_train_words++;
      i = searchVocab(word);
      if (i == -1) {
        a = addWordToVocab(word);
        m_vocab[a].cn = 1;
      }
    } else { // building the word vocabulary
      for (k = 0; k < doc->m_word_num; k++) {
        word = doc->m_words[k];
        m_train_words++;
        // Progress report every 100K words; '\r' (ASCII 13) rewrites the line in place.
        if (m_train_words % 100000 == 0)
        {
          printf("%lldK%c", m_train_words / 1000, 13);
          fflush(stdout);
        }
        i = searchVocab(word);
        if (i == -1) {
          a = addWordToVocab(word);
          m_vocab[a].cn = 1;
        } else m_vocab[i].cn++;
        // Keep the hash table below a 0.7 load factor by pruning rare words.
        if (m_vocab_size > vocab_hash_size * 0.7) reduceVocab();
      }
      m_train_words--; // one decrement per document, presumably offsetting a per-doc end token
    }
  }
  if (!m_doctag)
  {
    sortVocab();
    printf("Vocab size: %lld\n", m_vocab_size);
    printf("Words in train file: %lld\n", m_train_words);
  }
}
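The hash helpers are not shown. A minimal sketch of the open-addressing scheme the code above implies, in the style of the word2vec family (getWordHash, the hashing constant, and the m_vocab[...].word member are assumptions, not the original definitions):

// Hypothetical reconstruction of the vocab hash helpers used in Example #3.
#include <cstring>

const long long vocab_hash_size = 30000000;

// Simple polynomial rolling hash over the word's bytes.
long long getWordHash(const char* word) {
  unsigned long long hash = 0;
  for (size_t c = 0; word[c] != '\0'; c++) hash = hash * 257 + word[c];
  return (long long)(hash % vocab_hash_size);
}

// Linear probing: returns the vocab index of word, or -1 if absent.
long long Vocabulary::searchVocab(const char* word) {
  long long hash = getWordHash(word);
  while (m_vocab_hash[hash] != -1) {
    if (strcmp(word, m_vocab[m_vocab_hash[hash]].word) == 0)
      return m_vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}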
Example #4
int main(int argc, char* argv[]) {
    // Usage: prog <corpus-file-list> <search-file> <topN>
    if (argc < 4) {
        std::cerr << "usage: " << argv[0]
                  << " <corpus-file-list> <search-file> <topN>" << std::endl;
        return 1;
    }
    std::string searchFile(argv[2]);
    unsigned int topN(std::stoi(argv[3]));
    std::ifstream corpus(argv[1]);
    // Each whitespace-separated token in the corpus file is a document path.
    std::istream_iterator<std::string> corpus_it(corpus), eof;
    std::vector<std::string> fileList(corpus_it, eof);
    strIntMap corpusMap, documentMap;
    std::cout << "Loading corpus using files listed in " << argv[1]
              << std::endl;
    loadCorpusAndSearchFiles(corpusMap, documentMap, searchFile, fileList);
    std::cout << "Loaded corpus of " << corpusMap.size() << " words from "
              << fileList.size() << " file(s)" << std::endl
              << "------[ Starting analysis ]------" << std::endl << "Top "
              << topN << " significant words..." << std::endl;
    std::set<tfidfPair> result;
    getTopN(topN, fileList.size(), result, documentMap, corpusMap);
    printTopN(result);
    std::cout << "Lines with 1 or more significant words:" << std::endl;
    countSigWords(searchFile, result);
    return 0;
}
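tfidfPair and strIntMap are not shown. Storing results in a std::set implies a strict ordering, so a plausible reconstruction looks like this (hypothetical types, not the originals):

// Hypothetical reconstructions of the types Example #4 relies on.
#include <map>
#include <string>
#include <tuple>

typedef std::map<std::string, int> strIntMap; // word -> occurrence count

// A (score, word) pair; std::set orders by tf-idf score, then word.
struct tfidfPair {
    double score;
    std::string word;
    bool operator<(const tfidfPair& rhs) const {
        return std::tie(score, word) < std::tie(rhs.score, rhs.word);
    }
};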
Example #5
Classificador *C50Treinador::executarTreinamento( Corpus &corpus, int atributo )
{
    string linha, val;
    ifstream tree;
    ofstream names, data;
    vector<int> indexes;
    vector<string> linhasArquivo, valores;
    vector< vector<string> > valoresPossiveis;
    int c, a, numeroClasses, numeroAtributos, e, v, numeroValores;

    // Generate the .names file (C5.0 class and attribute declarations)
    names.open("c50tempT.names");

    numeroClasses = classes.size();
    for (c = 0; c < numeroClasses; c++){
        names << classes[c];
        if (c != numeroClasses-1)
            names << ", ";
        else
            names << "." << endl << endl;
    }

    numeroAtributos = atributos.size();
    for (a = 0; a < numeroAtributos; a++){
        names << atributos[a] << ": ";
        if (corpus.discreto(atributos[a], valores)){
            numeroValores = valores.size();
            for (v = 0; v < numeroValores; v++){
                if (valores[v] == "")
                    names << "?";
                else
                    names << removeVirgula(valores[v]);
                if (v != numeroValores-1)
                    names << ", ";
                else
                    names << "." << endl;
            }
        }
        else{
            names << "continuous." << endl;
            valores.clear();
            valores.push_back("continuous");
        }
        valoresPossiveis.push_back(valores);
        indexes.push_back(corpus.pegarPosAtributo(atributos[a]));
    }
    names.close();

    // Generate the .data file (one example per line, class value last)
    data.open("c50tempT.data");
    for (c = 0; c < corpus.pegarQtdConjExemplos(); c++)
        for (e = 0; e < corpus.pegarQtdExemplos(c); e++){
            for (a = 0; a < numeroAtributos; a++){
                val = corpus(c, e, indexes[a]);
                if (val == "")
                    data << "?, ";
                else
                    data << removeVirgula(val) << ", ";
            }
            data << corpus(c, e, atributo) << endl;
        }
    data.close();
    // Invoke the external C5.0 trainer
    c50train("c50tempT", cf);

    // Load the resulting .tree file; testing getline directly avoids the
    // classic eof() off-by-one that pushed a stale final line.
    tree.open("c50tempT.tree");
    while (getline(tree, linha)){
        linhasArquivo.push_back(linha);
    }
    tree.close();

    C50Classificador *cl = new C50Classificador();
    cl->linhasArquivo = linhasArquivo;
    cl->atributos = atributos;
    cl->valoresPossiveis = valoresPossiveis;
    cl->classes = classes;

    return cl;
}
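For reference, the two temp files written above follow the standard C5.0 input layout: the .names file lists the class values first, then one declaration per attribute; the .data file lists attribute values with the class value last. A toy rendering with made-up attribute names (illustrative only, not from the source):

c50tempT.names:
    yes, no.

    outlook: sunny, overcast, rain.
    temperature: continuous.

c50tempT.data:
    sunny, 85, no
    overcast, 72, yes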
Example #6
//Convert all the MILA XMLs to vrt files and store them in the vrt folder
bool CMilatoCWBConverter::ConvertFromMilaToVrt(){

	cout << "-------------------------" << endl;
	cout << "     Start Converting    " << endl;
	cout << "-------------------------" << endl << endl;

	//Open a linux folders tree
	char *dot[] = {const_cast<char *>(m_MilaFolderPath.data()), 0};
	FTS *tree = fts_open(dot,FTS_NOCHDIR, 0);
	if (!tree) {
		perror("fts_open");
		return false;
	}

	//Start working on every xml file in the input directory.
	FTSENT *node;

	//A placeholder for the current folder.
	//Every vrt file stores its current folder string as the value of the id attribute of the text node.
	string sVrtTextID;
	string sContainerVrtTextID;
	string sCurrentCorpusName;

	//Initialize the mila converter helper class
	CCorpus::milaConverter.Initialize();


	while ((node = fts_read(tree)))
	{


		if (node->fts_level > 0 && node->fts_name[0] == '.')
			fts_set(tree, node, FTS_SKIP);

		//If it is a directory node, we will create this directory in the vrt folder
		else if (node->fts_info & FTS_D)
		{
			//A corpus name folder
			if (node->fts_level == 1)
			{
				//Save the corpus name for the id attribute of the text
				sCurrentCorpusName = node->fts_name;
				sContainerVrtTextID = "t_" + sCurrentCorpusName;

				//Replace all occurences of "-" with "_"
				string from = "-";
				string to = "_";
				while(sContainerVrtTextID.find(from) != std::string::npos) {
					sContainerVrtTextID.replace(sContainerVrtTextID.find(from), from.length(), to);
				}
			}

			//Saves the current folder
			sVrtTextID = node->fts_name;

			//Make the id a valid C-style identifier:
			//prefix with t_ (for "text") and the corpus name
			sVrtTextID = "t_" + sCurrentCorpusName + "_" + sVrtTextID;


			//Replace all occurences of "-" with "_"
			string from = "-";
			string to = "_";
			while(sVrtTextID.find(from) != std::string::npos) {
				sVrtTextID.replace(sVrtTextID.find(from), from.length(), to);
			}

			//Create the output folders

			//First for the main vrt path

			 string vrtPath = node->fts_accpath;

			 //Update the Error Logger
			 errorLogger->StartANewText(node->fts_accpath);

			 vrtPath.replace(vrtPath.find(m_MilaFolderPath),m_MilaFolderPath.length(),m_VrtFolderPath.data());

			 cout << endl << "-----------------------------------------------------------------------------------------" << endl;

			 cout << " --Start converting the output folder :" << vrtPath << endl << endl;

			 mkdir(vrtPath.data() , S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);

			 //Second for the Container vrt path

			 vrtPath = node->fts_accpath;

			 vrtPath.replace(vrtPath.find(m_MilaFolderPath),m_MilaFolderPath.length(),m_ContainerVRTFolderPath.data());

			 cout << endl << "-----------------------------------------------------------------------------------------" << endl;

			 cout << " --Start converting the output folder :" << vrtPath << endl << endl;

			 mkdir(vrtPath.data() , S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);

		}
		//If the current node is a file
		else if (node->fts_info & FTS_F)
		{

			//The absolute path to the MILA xml file, including the file name
			string xmlName = node->fts_accpath;

			//Update the ErrorLogger
			errorLogger->SetCurrentFile(xmlName);

			//The name of the output vrt file
			string outName = node->fts_name;
			size_t extPos = outName.find(".xml");
			if (extPos == string::npos)
				continue; //skip non-xml files; an unguarded replace would throw
			outName.replace(extPos, 4, ".vrt");

			//The path to the output file
			//(fts_name and outName have equal length: ".xml" vs ".vrt")
			string vrtPath = xmlName;
			vrtPath.replace(vrtPath.find(m_MilaFolderPath), m_MilaFolderPath.length(), m_VrtFolderPath.data());
			vrtPath.replace(vrtPath.find(node->fts_name), outName.length(), "");

			//The path to the container output file
			string containerVrtPath = xmlName;
			containerVrtPath.replace(containerVrtPath.find(m_MilaFolderPath), m_MilaFolderPath.length(), m_ContainerVRTFolderPath.data());
			containerVrtPath.replace(containerVrtPath.find(node->fts_name), outName.length(), "");

			//////////////////////////////////////////////////
			//Loading document

			//Creating the document
			xml_document doc;

			//Load the file
			xml_parse_result result = doc.load_file(xmlName.data());

			//Check if the xml file is valid
			if (result)
			{
				//If valid

				//Create writing file streams

				//main vrt
				ofstream outputFile((vrtPath + outName).data());

				//Container vrt
				ofstream containerOutputFile((containerVrtPath + outName).data());

				xml_node corpusNode= doc.child("corpus");
				if (corpusNode)
				{
					//If both output files opened successfully
					if (outputFile.is_open() && containerOutputFile.is_open())
					{

						//Start parsing the xml file into the outputfile

						//Create a corpus object
						CCorpus corpus(&corpusNode,&outputFile, &containerOutputFile, sVrtTextID ,sContainerVrtTextID);

						//Start parsing
						if (!corpus.Parse())
							cout << endl << "Error Parsing " << corpusNode.attribute("name").value() << endl;


						outputFile.close();
						containerOutputFile.close();
					}
					else //Failed to open one of the output files
					{
						cout << endl << "Unable to open output file : " << outName << endl;
						if (fts_close(tree))
							perror("fts_close");
						return false;
					}
				}
				else
				{
					cout << endl << "Error with file: " << xmlName << endl;
				}
			}
			else
			{
				//If parsing failed, print the error description and location
				cout << endl << "Load result: " << result.description() << endl;
				cout << endl << "Error in " << xmlName << " at offset " << result.offset << endl << endl;
			}
		}
	}

	cout << endl;

	//Close the errorLogger and the error file
	delete errorLogger;

	if (fts_close(tree)) {
		perror("fts_close");
		return false;
	}
	return true;
}
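The dash-to-underscore replacement loop appears twice in the function above (once for sContainerVrtTextID, once for sVrtTextID). A small helper would remove the duplication (hypothetical, not in the original source):

#include <string>

// Hypothetical helper: replace every occurrence of `from` with `to` in s.
// Factors out the two identical while loops in ConvertFromMilaToVrt.
static void ReplaceAll(std::string& s, const std::string& from, const std::string& to) {
	for (size_t pos = s.find(from); pos != std::string::npos; pos = s.find(from, pos + to.length())) {
		s.replace(pos, from.length(), to);
	}
}

// Usage: ReplaceAll(sVrtTextID, "-", "_");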
Example #7
int main(int n_args, char** args) {
  init_logging();
  isage::util::print_pstats();

  int num_topics;
  int num_epochs_;

  isage::wtm::LDAVStrategy strategy;
  
  std::string output_usage_name;
  std::string heldout_output_usage_name;

  po::variables_map vm;
  {
    po::options_description desc("Allowed options");
    desc.add_options()
      ("help", "produce help message")
      ("vocab-size", po::value< int >()->default_value(10),
       "number of vocab words (default: 10)")
      ("words-per-doc", po::value<int>()->default_value(10),
       "number of words per document (default: 10)")
      ("bias", po::value<double>()->default_value(.8),
       "Bernoulli parameter p for how to partition the vocab words. (default: 0.8)")
      ("num-docs", po::value<int>()->default_value(10),
       "number of documents to generate (default: 10)")
      //////////////////////////
      ("topics", po::value<int>(&num_topics)->default_value(10), 
       "number of topics to use")
      ("train-epochs", po::value<int>(&num_epochs_)->default_value(5),
       "Number of epochs to run")
      ("em-iterations", po::value<int>(&(strategy.num_learn_iters))->default_value(100), 
       "number of EM iterations to run")
      ("e-steps", po::value<int>(&(strategy.num_e_iters))->default_value(25), 
       "number of iterations to perform, per E-step")
      ("m-steps", po::value<int>(&(strategy.num_m_iters))->default_value(1), 
       "number of iterations to perform, per M-step")
      ("update-hypers", po::value<int>(&(strategy.hyper_update_iter))->default_value(-1),
       "how often to update the hyperparameters (default: -1 == never update)")
      ("update-model-interval", po::value<int>(&(strategy.update_model_every))->default_value(5), "update the model every [some] number of EM steps (default: 5)")
      ("print-topics-every", po::value<int>(&(strategy.print_topics_every))->default_value(5), "print topics every [some] number of EM steps (default: 5)")
      ("print-usage-every", po::value<int>(&(strategy.print_usage_every))->default_value(5), "print topic usage every [some] number of EM steps (default: 5)")
      ("top-k", po::value<int>(&(strategy.print_topics_k))->default_value(10), "number of words per topic to print (default: 10)")
      ("em-verbosity", po::value<int>(&(strategy.em_verbosity))->default_value(1),
       "how verbose should EM output be (default: 1; higher == more verbose)")
      ("eta-density-threshold", po::value<double>(&(strategy.eta_density_threshold))->default_value(1E-4),
       "the threshold t for counting the number of eta parameters are above t (default: 1E-4)")
      ////////////////////////////////
      ("topic-usage-file", po::value<std::string>(&output_usage_name)->default_value("-"), 
       "filename to write topic usage to (default: - (to console)")
      ("heldout-topic-usage-file", po::value<std::string>(&heldout_output_usage_name)->default_value("-"), 
       "filename to write heldout topic usage to (default: - (to console)")
      ("inferencer-serialization", po::value<std::string>(), "filename to serialize inference state to")
      ("serialized-inferencer", po::value<std::string>(), "filename to READ serialized inference state from")
      ////////////////////////////////
      ;

    po::store(po::parse_command_line(n_args, args, desc), vm);
    if (vm.count("help")) {
      ERROR << desc << "\n";
      return 1;
    }
    po::notify(vm);
  }

  typedef std::string string;
  typedef string VocabType;
  typedef isage::wtm::Vocabulary< VocabType > SVocab;
  typedef double CountType;
  typedef isage::wtm::Document< VocabType, CountType > Doc;
  typedef isage::wtm::Corpus< Doc > Corpus;
  typedef std::vector<double> TopicType;
  typedef isage::wtm::DiscreteLDA< VocabType, std::vector<double> > Model;
  typedef isage::wtm::DiscreteVariational< Doc, VocabType, TopicType > Variational;

  isage::util::SmartWriter usage_outer(output_usage_name);
  isage::util::SmartWriter assign_outer("assignments");
  
  Variational* var_inf = NULL;
  SVocab word_vocab("__OOV__");
  for(int wi = 1; wi <= vm["vocab-size"].as<int>(); ++wi) {
    word_vocab.make_word("word_" + std::to_string(wi));
  }

  Corpus corpus("train_corpus");
  corpus.generate(vm["num-docs"].as<int>(),
		  vm["words-per-doc"].as<int>(),
		  vm["bias"].as<double>(),
		  word_vocab
		  );
  int num_words_total = get_num_tokens(corpus);
  INFO << "Number of documents: " << corpus.num_docs();
  INFO << "Number of word tokens total: " << num_words_total;
  INFO << "Number of vocab types: " << word_vocab.num_words();

  isage::wtm::SymmetricHyperparams shp;
  shp.h_theta = 1.0/(double)num_topics;
  shp.h_word =  0.1; 
  INFO << "Creating model with " << num_topics << " topics";
  Model dm(num_topics, &shp, &word_vocab);
  INFO << "Done creating model.";
  var_inf = new Variational(&dm, &corpus, &word_vocab);
  isage::wtm::UniformHyperSeedWeightedInitializer initer(num_topics, corpus.num_docs(), (double)num_words_total/(double)corpus.num_docs());
  var_inf->init(initer);

  for(int epoch = 0; epoch < num_epochs_; ++epoch) {
    INFO << "Starting learning epoch " << epoch;
    var_inf->learn(strategy, epoch, usage_outer, assign_outer);
    INFO << "Done with inference in epoch " << epoch;
    // // create and open a character archive for output
    // if(vm.count("inferencer-serialization")) {
    //   std::string sfname = vm["inferencer-serialization"].as<std::string>() + 
    // 	".iteration" + std::to_string((1+epoch));	
    //   std::ofstream ofs(sfname, std::ios::out|std::ios::binary);
    //   boost::iostreams::filtering_streambuf<boost::iostreams::output> out;
    //   out.push(boost::iostreams::gzip_compressor());
    //   out.push(ofs);
    //   boost::archive::binary_oarchive oa(out);
    //   oa << (*var_inf);
    //   INFO << "see " << sfname << " for serialized inferencer";
    // }
    dm.print_topics(strategy.print_topics_k, word_vocab);
  }

  if(var_inf != NULL) {
    delete var_inf;
  }
  return 0;
}
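A typical invocation, assuming the binary is named lda_synthetic (the binary name is a guess; the flags are the ones declared in the options block above):

./lda_synthetic --vocab-size 50 --num-docs 100 --words-per-doc 20 \
                --topics 10 --train-epochs 5 --topic-usage-file usage.txt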