static void test_file_sort(unsigned buffer_size, unsigned temp_files, file_merger_feed_record_t callback, int skip_writeback) { file_sorter_error_t ret; int i = 0; create_file(); ret = sort_file(UNSORTED_FILE_PATH, SORT_TMP_DIR, temp_files, buffer_size, read_record, write_record, callback, compare_records, free_record, skip_writeback, &i); cb_assert(ret == FILE_SORTER_SUCCESS); if (!skip_writeback) { cb_assert(check_file_sorted(UNSORTED_FILE_PATH)); } else { cb_assert(check_file_sorted(UNSORTED_FILE_PATH) == 0); } remove(UNSORTED_FILE_PATH); }
static void test_file_sort(unsigned buffer_size, unsigned temp_files, file_merger_feed_record_t callback, int skip_writeback) { #ifdef _MSC_VER fprintf(stderr, "This test is broken on WIN32. See MB-10292\n"); #else file_sorter_error_t ret; int i = 0; create_file(); ret = sort_file(UNSORTED_FILE_PATH, SORT_TMP_DIR, temp_files, buffer_size, read_record, write_record, callback, compare_records, free_record, skip_writeback, &i); assert(ret == FILE_SORTER_SUCCESS); if (!skip_writeback) { assert(check_file_sorted(UNSORTED_FILE_PATH)); } else { assert(check_file_sorted(UNSORTED_FILE_PATH) == 0); } remove(UNSORTED_FILE_PATH); #endif }
/*
 * Generates a file via make_random_file() and then sorts it via sort_file().
 *
 * filename - path forwarded to make_random_file() and sort_file().
 * length   - forwarded to make_random_file().
 *
 * Returns EXIT_FAILURE when make_random_file() returns EXIT_FAILURE (the
 * sort is then not attempted); otherwise EXIT_SUCCESS. Whatever sort_file()
 * returns is not inspected.
 */
int sort_random_file(const char* filename, int length)
{
    if (make_random_file(filename, length) != EXIT_FAILURE) {
        sort_file(filename);
        return EXIT_SUCCESS;
    }

    return EXIT_FAILURE;
}
/*
 * Entry point for the sort command: calls sort_file() with stdin as the
 * first argument, stdout as the second and 0 as the third.
 *
 * ac, av - argument count and vector; neither is used.
 *
 * Always returns 0.
 */
int sort_main(int ac, const char *av[])
{
    /* The command takes no options; silence unused-parameter warnings. */
    (void)ac;
    (void)av;

    sort_file(stdin, stdout, 0);

    return 0;
}
int main( int argc, char** argv ) { try { indri::api::Parameters& param = indri::api::Parameters::instance(); param.loadCommandLine( argc, argv ); if( !param.exists("index") || !param.exists("input") || !param.exists("name") ) { std::cerr << "makeprior usage: " << std::endl << " makeprior -index=myindex -input=myinputfile -name=priorname" << std::endl << " myindex: a valid Indri index " << std::endl << " myinputfile: a two column text file, where the first column contains docno values" << std::endl << " and the second column contains log probabilities (should be between -infinity and zero)" << std::endl << " name: the name of this prior (as you will reference it in queries, using the #prior(name) syntax)" << std::endl; exit(-1); } std::string index = param["index"]; // get the total document count, including deleted documents. indri::collection::Repository* _repository = new indri::collection::Repository(); _repository->openRead(index); indri::collection::Repository::index_state indexes = _repository->indexes(); int documentCount = 0; for( size_t i=0; i<indexes->size(); i++ ) { indri::thread::ScopedLock lock( (*indexes)[i]->statisticsLock() ); documentCount += (int)(*indexes)[i]->documentCount(); } delete _repository; indri::api::QueryEnvironment env; std::cout << "opening index: " << index << std::endl; env.addIndex( index ); std::string input = param["input"]; std::string priorName = param["name"]; size_t memory = param.get( "memory", 50*1024*1024 ); // step one - convert file from docno/score format to binary format indri::file::File unsortedBinary; std::string unsortedName; unsortedBinary.openTemporary( unsortedName ); std::cout << "converting to binary..."; std::cout.flush(); convert_docnoscore_to_binary( unsortedBinary, input, env ); std::cout << "finished" << std::endl; // step two -- sort the binary version indri::file::File uncompressedPrior; std::string uncompressedPriorName; uncompressedPrior.openTemporary( uncompressedPriorName ); std::cout << 
"sorting..."; std::cout.flush(); sort_file( uncompressedPrior, unsortedBinary, memory, documentCount ); std::cout << "finished"; unsortedBinary.close(); lemur_compat::remove( unsortedName.c_str() ); // step three -- check to see if it's compressable, if so, compress it std::map<double, int> table; indri::file::File compressedPrior; std::string compressedPriorName; compressedPrior.openTemporary( compressedPriorName ); indri::file::File& finalPrior = uncompressedPrior; std::cout << "checking for compressability..."; std::cout.flush(); bool result = extract_compression_table( table, uncompressedPrior ); if( result ) { std::cout << "yep" << std::endl; // compress the file by using a lookup table std::cout << "compressing..."; std::cout.flush(); compress_file( compressedPrior, uncompressedPrior, table ); std::cout << std::endl; finalPrior = compressedPrior; } else { std::cout << "nope" << std::endl; } // step four -- install the prior in the index std::cout << "installing..."; std::cout.flush(); install_prior( index, priorName, finalPrior ); std::cout << "finished" << std::endl; // clean up uncompressedPrior.close(); compressedPrior.close(); lemur_compat::remove( uncompressedPriorName.c_str() ); lemur_compat::remove( compressedPriorName.c_str() ); } catch( lemur::api::Exception& e ) { LEMUR_ABORT(e); } return 0; }