Example #1
Index::~Index() {
	delete hash;
	delete lexicon;
	fclose(indexFile);

	iReportedTime["timeTotal"] = tTotal.reportTime();

	// Print every timing collected while the index was being built
	for (map<string, double>::iterator it = iReportedTime.begin();
			it != iReportedTime.end(); ++it) {
		printf("%s\t%g\n", it->first.c_str(), it->second);
	}
}
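The timers used here (tTotal, and buildHash, buildIndex, gammaTime, flushTime below) are only seen through their start()/reportTime() calls; the class itself is not part of these examples. As orientation, a minimal stopwatch matching that interface might look like the following sketch (the Timer name and the clock()-based implementation are assumptions):

#include <ctime>

// Hypothetical stopwatch matching the start()/reportTime() calls above;
// the real timer class is not shown in these examples.
class Timer {
	clock_t begin;
public:
	void start() { begin = clock(); }
	// CPU seconds elapsed since the last start()
	double reportTime() { return (double) (clock() - begin) / CLOCKS_PER_SEC; }
};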
Example #2
Index::Index(IndexConfig &config) {

	tTotal.start();

	char** dict = NULL;
	DictFileManager dictFileManager;

	// Read the dictionary file and get the terms in an array ordered by their codes
	int numberOfTerms = dictFileManager.read_dictionary(&dict,
			config.getDictFilePath());

	// Build the minimal perfect hash (cmph) over the dictionary terms
	buildHash.start();
	hash = new Hash(dict, numberOfTerms, config.getDirectory() + "temp.mph");
	iReportedTime["hashTime"] = buildHash.reportTime();

	// Create the lexicon, sized to the number of hashed terms
	lexicon = new Lexicon(hash->getSize());

	// Open the index file
	indexFile = fopen(config.getIndexFilePath().c_str(), "wb+");
	if (indexFile == NULL) {
		fprintf(stderr, "Error opening index file.\n");
		exit(1);
	}

	number_of_documents = config.getDocumentsMaxNumber();

	// Parse triples and build the index file
	buildIndex.start();
	parseTriplesFile(dict, config.getTmpFilePath());
	iReportedTime["indexTime"] = buildIndex.reportTime();

	// The dictionary is no longer needed
	free(dict);
}
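Examples #1 and #2 together imply a simple lifecycle: constructing the Index builds everything, and destroying it closes the file and prints the collected timings. A hypothetical driver could look like this (how IndexConfig is constructed and populated is an assumption; the examples only show its getters):

int main() {
	IndexConfig config; // assumed default-constructible and already configured
	Index index(config); // Example #2: builds the hash, lexicon and index file
	return 0; // the destructor (Example #1) closes the file and prints timings
}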
Example #3
// Builds the compressed index: postings are gap-encoded and Elias gamma compressed
void Index::parseTriplesFile(char** dict, string filePath) {

	// Open the triples file
	FILE *triplesFile = fopen(filePath.c_str(), "rb");
	if (triplesFile == NULL) {
		fprintf(stderr, "Error opening triples file.\n");
		exit(1);
	}

	EliasGamma gamma;
	vector<unsigned int> serial_buffer;
	size_t pos = 0;
	unsigned int term, doc, freq, last_doc = 0, cterm = 0, docCounter = 0;

	list<pair<unsigned int, unsigned int> > docList;

	avg_doclen = 0;

	bool endOfFile = false;

	while (!endOfFile) {

		// feof() only turns true after a read fails, so test the fread()
		// return values directly; a short read means all triples are consumed
		if (fread(&term, sizeof(unsigned int), 1, triplesFile) != 1
				|| fread(&doc, sizeof(unsigned int), 1, triplesFile) != 1
				|| fread(&freq, sizeof(unsigned int), 1, triplesFile) != 1) {
			endOfFile = true;
		}

		// Term codes start at 1 (see dict[cterm - 1]), so 0 means "no term yet"
		if (cterm == 0) {
			cterm = term;
		}

		// Flush the finished term's postings on a term change or at end of file
		if ((cterm != term || endOfFile) && !docList.empty()) {

			// Get its hash position
			size_t p = hash->search(dict[cterm - 1]);

			// Compress buffer
			vector<unsigned char> compressed_data;
			gammaTime.start();
			gamma.encode(serial_buffer, compressed_data);
			iReportedTime["compressionTime"] += gammaTime.reportTime();
			serial_buffer.clear();
			last_doc = 0; // restart the gap encoding for the next term

			// Record the file position where this term's entry starts
			pos = ftell(indexFile);

			// Inverse document frequency: log2(N / df)
			double idf = log2(
					(double) number_of_documents / (double) docCounter);

			// Add the entry to the lexicon
			lexicon->add(p, dict[cterm - 1], docList.size(), pos, idf);

			while (!docList.empty()) {

				flushTime.start();
				fwrite(&docList.front().first, sizeof(unsigned int), 1,
						indexFile);
				fwrite(&docList.front().second, sizeof(unsigned int), 1,
						indexFile);
				iReportedTime["flushTime"] += flushTime.reportTime();

				// Accumulate the squared tf-idf weight into the document norm
				documentNorm[docList.front().first] += pow(
						(1 + log2(docList.front().second)) * idf, 2);

				// Track per-document length and the total used for avg_doclen
				documentLen[docList.front().first] += docList.front().second;

				avg_doclen += docList.front().second;

				docList.pop_front();
			}

			docCounter = 0; // docList was already drained by the loop above

			// Append the gamma-compressed gaps/frequencies to the index file
			flushTime.start();
			for (size_t i = 0; i < compressed_data.size(); i++) {
				fputc(compressed_data[i], indexFile);
			}
			iReportedTime["flushTime"] += flushTime.reportTime();

			// Switch to the term that triggered the flush
			cterm = term;
		}

		if (endOfFile) {
			break; // the failed read left stale values in term/doc/freq
		}

		// Queue the posting and its gap-encoded form for the current term
		docList.push_back(make_pair(doc, freq));
		docCounter++;

		serial_buffer.push_back(doc - last_doc);
		serial_buffer.push_back(freq);

		last_doc = doc;
	}

	avg_doclen /= number_of_documents;

	fclose(triplesFile);
}
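serial_buffer stores each posting as a document-id gap (doc - last_doc) followed by a frequency, and EliasGamma::encode bit-packs that buffer. The library's internals are not shown; purely as a sketch of the technique, an encoder with the same shape could be written as below (gammaEncode and putBit are hypothetical names, and every input value is assumed to be >= 1, which holds for strictly increasing doc ids and positive frequencies):

#include <vector>

using namespace std;

// Append one bit to the output, flushing whole bytes as they fill up.
static void putBit(vector<unsigned char> &out, unsigned char &cur, int &nbits,
		int bit) {
	cur = (unsigned char) ((cur << 1) | bit);
	if (++nbits == 8) {
		out.push_back(cur);
		cur = 0;
		nbits = 0;
	}
}

// Elias gamma: a value n >= 1 is written as floor(log2 n) zero bits
// followed by the binary representation of n (which starts with a 1 bit).
static void gammaEncode(const vector<unsigned int> &in,
		vector<unsigned char> &out) {
	unsigned char cur = 0;
	int nbits = 0;
	for (size_t i = 0; i < in.size(); i++) {
		unsigned int n = in[i];
		int len = 0;
		while ((n >> len) > 1)
			len++; // len = floor(log2 n)
		for (int b = 0; b < len; b++)
			putBit(out, cur, nbits, 0); // unary length prefix
		for (int b = len; b >= 0; b--)
			putBit(out, cur, nbits, (n >> b) & 1); // n in binary, MSB first
	}
	if (nbits > 0)
		out.push_back((unsigned char) (cur << (8 - nbits))); // pad final byte
}

Gamma favors small values (1 costs one bit, 2 and 3 cost three), which is exactly why the indexer stores gaps between consecutive doc ids rather than the ids themselves.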