// Pull paths from the file-level file_find_state until one should be
// handed back to the caller.
//
// When dedup is off, the very next path is returned as-is. When dedup
// is on, each path is looked up in the dups hashmap:
//   - not found: a diagnostic is printed and the path is skipped;
//   - found with a NULL value: the path is skipped;
//   - found with a non-NULL value: this is the canonical copy, and the
//     value is handed back through dups_out.
// Skipped paths are freed here.
//
// path_out receives the selected path, or NULL when the walk ended or
//          failed before a path was selected.
// dups_out receives the hashmap value for the canonical path, or NULL.
// Returns the status of the last file_find_get_next call.
//
// NOTE(review): the returned path is not freed here; presumably the
// caller takes ownership -- confirm against callers.
static
error_t get_next_non_dup_path(char** path_out, MYHASH_value** dups_out)
{
  error_t status = ERR_NOERR;
  char* candidate = NULL;
  MYHASH_value* canonical = NULL;
  hm_entry_t lookup;

  for( ;; ) {
    // Start every round clean so a break leaves well-defined outputs.
    candidate = NULL;
    canonical = NULL;

    status = file_find_get_next(&file_find_state, &candidate);
    // Stop on error, on end of walk, or when no filtering is wanted.
    if( status || !candidate || !dedup ) break;

    lookup.key = candidate;
    lookup.value = NULL;

    if( !hashmap_retrieve(&dups, &lookup) ) {
      printf("Missing from dups hashmap? Data added? %s\n", candidate);
    } else if( lookup.value ) {
      // Canonical version: hand it back together with its value.
      canonical = (MYHASH_value*) lookup.value;
      break;
    }

    // Not selected; release it and try the next path.
    free(candidate);
  }

  *path_out = candidate;
  *dups_out = canonical;
  return status;
}
// Walk every path produced by the file finder `s` and pass each one to
// do_dedup_file (per the original notes: compute each file's MD5 sum
// and populate the MYHASH table).
//
// Returns ERR_NOERR once the walk is exhausted, or the first error
// reported by file_find_get_next or do_dedup_file.
//
// NOTE(review): when dedup is disabled this returns the literal 1
// rather than a named error code -- confirm callers expect that.
static
error_t do_dedup(file_find_state_t* s)
{
  char* file_path;
  error_t status;

  if( ! dedup ) return 1;

  do {
    file_path = NULL;
    status = file_find_get_next(s, &file_path);
    if( status ) return status;

    if( file_path ) {
      // Now dedup this file!
      status = do_dedup_file(file_path);
      if( status ) return status;

      // file_path is deliberately NOT freed: hashtable entries are
      // left pointing at it.
    }
  } while( file_path );

  return ERR_NOERR;
}
int its_get_doc(int64_t doc_num,
                int64_t* doc_len_out,
                int64_t* doc_info_len_out,
                unsigned char** doc_info_out,
                int64_t* num_doc_headers_out,
                int64_t** doc_header_lens_out,
                unsigned char*** doc_headers_out,
                unsigned char** doc_contents_out)
{
  char* got_path;
  error_t err;
  int rc;
  size_t info_len, doc_len;
  unsigned char* info_data;
  unsigned char* doc_data;
  size_t next_offset;

  // First, move on to the next file if we need to.
  while( !data || cur_offset >= data_len ) {
    got_path = NULL;
    err = file_find_get_next(&file_find_state, &got_path);
    if( err ) {
      warn_if_err(err);
      return -1;
    }

    // No more documents!
    if( !got_path ) return 0;

    rc = do_mmap(got_path);
    if( rc ) return rc;
  }

  // Now, get some data from the current file.
  rc = parse_data(data, data_len, cur_offset,
                  &info_len, &info_data,
                  &doc_len, &doc_data,
                  &next_offset);
  if( rc ) return rc;

  cur_offset = next_offset;

  *doc_len_out = doc_len;
  *doc_info_len_out = info_len;
  *doc_info_out = info_data;
  *num_doc_headers_out = 0;
  *doc_header_lens_out = NULL;

  *doc_headers_out = NULL;
  *doc_contents_out = doc_data;
  return 1;
}
// Example no. 4
// 0
// Invoke `fun(path, state)` once for every path produced by the file
// finder `s`, freeing each path after the callback returns.
//
// Returns ERR_NOERR when the walk is exhausted, or the first error
// reported by file_find_get_next. Any value returned by `fun` is not
// inspected.
error_t file_find(file_find_state_t* s, file_find_func_t fun, void* state)
{
  for( ;; ) {
    char* found = NULL;
    error_t status = file_find_get_next(s, &found);

    if( status ) return status;

    // A NULL path marks the end of the walk.
    if( !found ) break;

    fun(found, state);
    free(found);
  }

  return ERR_NOERR;
}