예제 #1
0
static void test_file_sort(unsigned buffer_size,
                           unsigned temp_files,
                           file_merger_feed_record_t callback,
                           int skip_writeback)
{
    file_sorter_error_t ret;
    int i = 0;
    create_file();

    ret = sort_file(UNSORTED_FILE_PATH,
                    SORT_TMP_DIR,
                    temp_files,
                    buffer_size,
                    read_record,
                    write_record,
                    callback,
                    compare_records,
                    free_record,
                    skip_writeback,
                    &i);

    cb_assert(ret == FILE_SORTER_SUCCESS);

    if (!skip_writeback) {
        cb_assert(check_file_sorted(UNSORTED_FILE_PATH));
    } else {
        cb_assert(check_file_sorted(UNSORTED_FILE_PATH) == 0);
    }

    remove(UNSORTED_FILE_PATH);
}
예제 #2
0
static void test_file_sort(unsigned buffer_size,
                           unsigned temp_files,
                           file_merger_feed_record_t callback,
                           int skip_writeback)
{
#ifdef _MSC_VER
    fprintf(stderr, "This test is broken on WIN32. See MB-10292\n");
#else
    file_sorter_error_t ret;
    int i = 0;
    create_file();

    ret = sort_file(UNSORTED_FILE_PATH,
                    SORT_TMP_DIR,
                    temp_files,
                    buffer_size,
                    read_record,
                    write_record,
                    callback,
                    compare_records,
                    free_record,
                    skip_writeback,
                    &i);

    assert(ret == FILE_SORTER_SUCCESS);

    if (!skip_writeback) {
        assert(check_file_sorted(UNSORTED_FILE_PATH));
    } else {
        assert(check_file_sorted(UNSORTED_FILE_PATH) == 0);
    }

    remove(UNSORTED_FILE_PATH);
#endif
}
예제 #3
0
/*
 * Generates a random file at `filename` via make_random_file(filename,
 * length) and, if that call did not return EXIT_FAILURE, sorts the file
 * with sort_file().
 *
 * Returns EXIT_FAILURE when file generation failed, EXIT_SUCCESS otherwise.
 * Any result of sort_file() is not inspected.
 */
int sort_random_file(const char* filename, int length) {
    if (make_random_file(filename, length) != EXIT_FAILURE) {
        sort_file(filename);
        return EXIT_SUCCESS;
    }

    return EXIT_FAILURE;
}
예제 #4
0
/*
 * Entry point for the sort command: calls sort_file() with standard input,
 * standard output and 0 as its third argument.
 *
 * The argument count and vector are accepted but not consulted.
 * Always returns 0.
 */
int sort_main(int ac, const char *av[])
{
	/* Command-line arguments are intentionally ignored. */
	(void)ac;
	(void)av;

	sort_file(stdin, stdout, 0);

	return 0;
}
예제 #5
0
int main( int argc, char** argv ) {
  try {
    indri::api::Parameters& param = indri::api::Parameters::instance();
    param.loadCommandLine( argc, argv );

    if( !param.exists("index") || !param.exists("input") || !param.exists("name") ) {
      std::cerr << "makeprior usage: " << std::endl
                << "    makeprior -index=myindex -input=myinputfile -name=priorname" << std::endl
                << "      myindex: a valid Indri index " << std::endl
                << "      myinputfile: a two column text file, where the first column contains docno values" << std::endl
                << "         and the second column contains log probabilities (should be between -infinity and zero)" << std::endl
                << "      name: the name of this prior (as you will reference it in queries, using the #prior(name) syntax)" << std::endl;
      exit(-1);
    }
  
    std::string index = param["index"];

    // get the total document count, including deleted documents.
    indri::collection::Repository* _repository = new indri::collection::Repository();
    _repository->openRead(index);    
    indri::collection::Repository::index_state indexes = _repository->indexes();
    int documentCount = 0;
  
    for( size_t i=0; i<indexes->size(); i++ ) {
      indri::thread::ScopedLock lock( (*indexes)[i]->statisticsLock() );
      documentCount += (int)(*indexes)[i]->documentCount();
    }
    delete _repository;
    
    indri::api::QueryEnvironment env;
    std::cout << "opening index: " << index << std::endl;
    env.addIndex( index );
    
    std::string input = param["input"];
    std::string priorName = param["name"];
    size_t memory = param.get( "memory", 50*1024*1024 );
    
    // step one - convert file from docno/score format to binary format
    indri::file::File unsortedBinary;
    std::string unsortedName;
    
    unsortedBinary.openTemporary( unsortedName );
    std::cout << "converting to binary...";
    std::cout.flush();
    convert_docnoscore_to_binary( unsortedBinary, input, env );
    std::cout << "finished" << std::endl;
    
    // step two -- sort the binary version
    indri::file::File uncompressedPrior;
    std::string uncompressedPriorName;
    uncompressedPrior.openTemporary( uncompressedPriorName );
    
    std::cout << "sorting...";
    std::cout.flush();
    sort_file( uncompressedPrior, unsortedBinary, memory, documentCount );
    std::cout << "finished";
    
    unsortedBinary.close();
    lemur_compat::remove( unsortedName.c_str() );
    
    // step three -- check to see if it's compressable, if so, compress it
    std::map<double, int> table;
    indri::file::File compressedPrior;
    
    std::string compressedPriorName;
    compressedPrior.openTemporary( compressedPriorName );
    
    indri::file::File& finalPrior = uncompressedPrior;
    std::cout << "checking for compressability...";
    std::cout.flush();
    bool result = extract_compression_table( table, uncompressedPrior );
    
    if( result ) {
      std::cout << "yep" << std::endl;
      // compress the file by using a lookup table
      std::cout << "compressing...";
      std::cout.flush();
      compress_file( compressedPrior, uncompressedPrior, table );
      std::cout << std::endl;
      finalPrior = compressedPrior;
    } else {
      std::cout << "nope" << std::endl;
    }
    
    // step four -- install the prior in the index
    std::cout << "installing...";
    std::cout.flush();
    install_prior( index, priorName, finalPrior );
    std::cout << "finished" << std::endl;
    
    // clean up
    uncompressedPrior.close();
    compressedPrior.close();
    lemur_compat::remove( uncompressedPriorName.c_str() );
    lemur_compat::remove( compressedPriorName.c_str() );
  } catch( lemur::api::Exception& e ) {
    LEMUR_ABORT(e);
  }
  
  return 0;  
}