Index::~Index() {
    delete hash;
    delete lexicon;
    fclose(indexFile);

    // Report the accumulated timings before shutting down
    iReportedTime["timeTotal"] = tTotal.reportTime();
    for (map<string, double>::iterator it = iReportedTime.begin();
            it != iReportedTime.end(); ++it) {
        printf("%s\t%g\n", it->first.c_str(), it->second);
    }
}
Index::Index(IndexConfig &config) {
    tTotal.start();

    char** dict = NULL;
    DictFileManager dictFileManager;

    // Read the dictionary file and get the terms in a vector ordered by their code
    int numberOfTerms = dictFileManager.read_dictionary(&dict, config.getDictFilePath());

    // Create the minimal perfect hash (cmph) over the dictionary terms
    buildHash.start();
    hash = new Hash(dict, numberOfTerms, config.getDirectory() + "temp.mph");
    iReportedTime["hashTime"] = buildHash.reportTime();

    // Create the lexicon
    lexicon = new Lexicon(hash->getSize());

    // Open the index file
    indexFile = fopen(config.getIndexFilePath().c_str(), "wb+");
    if (indexFile == NULL) {
        fprintf(stderr, "Error when opening index file.\n");
        exit(EXIT_FAILURE);
    }

    number_of_documents = config.getDocumentsMaxNumber();

    // Parse the triples and build the index file
    buildIndex.start();
    parseTriplesFile(dict, config.getTmpFilePath());
    iReportedTime["indexTime"] = buildIndex.reportTime();

    // The dictionary is no longer needed
    free(dict);
}
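/*
 * The Hash member used above is only visible through its call sites
 * (a constructor taking the term array, the number of terms and an output
 * path, plus getSize() and search()). The standalone sketch below shows one
 * plausible way such a wrapper could build and persist a minimal perfect
 * hash with libcmph; the algorithm choice (CMPH_CHD) and the dump-to-file
 * step are assumptions for illustration, not the project's actual
 * implementation, and the sketch would live in its own translation unit.
 */
#include <cmph.h>
#include <cstring>
#include <cstdio>

// Build a minimal perfect hash over `keys` and persist it to `path` (sketch).
static cmph_t *build_mph_sketch(char **keys, unsigned int nkeys, const char *path) {
    cmph_io_adapter_t *source = cmph_io_vector_adapter(keys, nkeys);
    cmph_config_t *config = cmph_config_new(source);
    cmph_config_set_algo(config, CMPH_CHD);   // assumption: any cmph algorithm would do
    cmph_t *mphf = cmph_new(config);
    cmph_config_destroy(config);

    FILE *out = fopen(path, "wb");            // persisting mirrors the "temp.mph" file above
    if (out != NULL) {
        cmph_dump(mphf, out);
        fclose(out);
    }
    cmph_io_vector_adapter_destroy(source);
    return mphf;
}

// Maps a term to a unique slot in [0, nkeys), the way hash->search() is used above.
static unsigned int mph_lookup_sketch(cmph_t *mphf, const char *term) {
    return cmph_search(mphf, term, (cmph_uint32) strlen(term));
}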
// Builds the compressed index from the (term, doc, freq) triples file
void Index::parseTriplesFile(char** dict, string filePath) {
    // Open the triples file
    FILE *triplesFile = fopen(filePath.c_str(), "rb");
    if (triplesFile == NULL) {
        fprintf(stderr, "Error when opening triples file.\n");
        exit(EXIT_FAILURE);
    }

    EliasGamma gamma;
    vector<unsigned int> serial_buffer;
    size_t pos = 0;
    unsigned int term, doc, freq, last_doc = 0, cterm = 0, docCounter = 0;
    list<pair<unsigned int, unsigned int> > docList;
    avg_doclen = 0;

    bool more = true;
    while (more) {
        // Read one (term, doc, freq) triple; check the fread results instead of
        // relying on feof() so a failed read cannot leave stale values in use
        more = fread(&term, sizeof(unsigned int), 1, triplesFile) == 1
                && fread(&doc, sizeof(unsigned int), 1, triplesFile) == 1
                && fread(&freq, sizeof(unsigned int), 1, triplesFile) == 1;

        if (more && cterm == 0) {
            cterm = term;
        }

        // A new term (or the end of the file) closes the current term's posting list
        if (cterm != 0 && (cterm != term || !more)) {
            // Get the term's position in the minimal perfect hash
            size_t p = hash->search(dict[cterm - 1]);

            // Gamma-compress the (doc gap, frequency) buffer
            vector<unsigned char> compressed_data;
            gammaTime.start();
            gamma.encode(serial_buffer, compressed_data);
            iReportedTime["compressionTime"] += gammaTime.reportTime();
            serial_buffer.clear();
            last_doc = 0;

            // Record the current file position for the term being flushed
            pos = ftell(indexFile);

            // Calculate the idf component of tf-idf
            double idf = log2((double) number_of_documents / (double) docCounter);
            // printf("%u %u %g\n", cterm, docCounter, idf);

            // Add the entry to the lexicon
            lexicon->add(p, dict[cterm - 1], docList.size(), pos, idf);

            // Write the posting list and accumulate per-document statistics
            while (!docList.empty()) {
                flushTime.start();
                fwrite(&docList.front().first, sizeof(unsigned int), 1, indexFile);
                fwrite(&docList.front().second, sizeof(unsigned int), 1, indexFile);
                iReportedTime["flushTime"] += flushTime.reportTime();

                documentNorm[docList.front().first] +=
                        pow((1 + log2(docList.front().second)) * idf, 2);
                documentLen[docList.front().first] += docList.front().second;
                avg_doclen += docList.front().second;

                docList.pop_front();
            }
            docCounter = 0;
            docList.clear();

            // Flush the compressed buffer to the index file
            flushTime.start();
            for (size_t i = 0; i < compressed_data.size(); i++) {
                fputc(compressed_data[i], indexFile);
            }
            iReportedTime["flushTime"] += flushTime.reportTime();

            // Update cterm to the term that has just been read
            cterm = term;
        }

        if (!more) {
            break;
        }

        // Accumulate the triple for the current term; doc ids are stored as gaps
        // (doc - last_doc) so the gamma encoder sees small values
        docList.push_back(make_pair(doc, freq));
        docCounter++;
        serial_buffer.push_back(doc - last_doc);
        serial_buffer.push_back(freq);
        last_doc = doc;
    }

    avg_doclen /= number_of_documents;
    fclose(triplesFile);
}
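/*
 * Standalone illustration of the gap + Elias gamma scheme that serial_buffer
 * feeds into above. This is a self-contained sketch meant to be compiled on
 * its own; it is not the project's EliasGamma class, whose encode() signature
 * (vector of ints in, vector of bytes out) is only inferred from the call
 * site. A value v >= 1 is written as floor(log2(v)) zero bits followed by v
 * in binary, so small gaps and frequencies take very few bits.
 */
#include <cstdio>
#include <vector>

// Append the Elias gamma code of v (v must be >= 1) to a bit vector.
static void gamma_encode(unsigned int v, std::vector<bool> &bits) {
    unsigned int len = 0;
    for (unsigned int t = v; t > 1; t >>= 1) {
        ++len;                                      // len = floor(log2(v))
    }
    for (unsigned int i = 0; i < len; ++i) {
        bits.push_back(false);                      // unary length prefix
    }
    for (int i = (int) len; i >= 0; --i) {
        bits.push_back(((v >> i) & 1u) != 0);       // v itself, most significant bit first
    }
}

// Decode one gamma-coded value starting at bit `posBit`; advances `posBit`.
static unsigned int gamma_decode(const std::vector<bool> &bits, size_t &posBit) {
    unsigned int len = 0;
    while (!bits[posBit]) {                         // count the leading zeros
        ++len;
        ++posBit;
    }
    unsigned int v = 0;
    for (unsigned int i = 0; i <= len; ++i) {
        v = (v << 1) | (bits[posBit++] ? 1u : 0u);
    }
    return v;
}

int main() {
    // Document ids are stored as gaps (doc - last_doc), as in serial_buffer above.
    unsigned int docs[] = { 3, 7, 8, 15 };
    std::vector<bool> bits;
    unsigned int last = 0;
    for (size_t i = 0; i < 4; ++i) {
        gamma_encode(docs[i] - last, bits);
        last = docs[i];
    }

    // Decoding recovers the original ids by re-accumulating the gaps.
    size_t posBit = 0;
    last = 0;
    for (size_t i = 0; i < 4; ++i) {
        last += gamma_decode(bits, posBit);
        printf("%u ", last);                        // prints: 3 7 8 15
    }
    printf("\n");
    return 0;
}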