Example #1
0
/*
 * Merge chunk c into the list that starts at head and return the head of
 * the resulting list.
 *
 * The list is walked front to back.  Whenever should_combine() accepts the
 * current node together with c, that node is unlinked and c is replaced by
 * whatever combine() returns; the walk then continues with the node that
 * followed the unlinked one.  Nodes that are not combined keep their place
 * and order.  When the end of the list is reached, the (possibly combined)
 * chunk c is linked in after the last remaining node.  For an empty list
 * (head == NULL) c itself is returned.
 */
static CondorChunk * absorb( CondorChunk *head, CondorChunk *c )
{
	// Address of the pointer that refers to the node under inspection;
	// starts at the local head so that the first node needs no special case.
	CondorChunk **link = &head;

	while( *link ) {
		CondorChunk *node = *link;

		if( should_combine( node, c ) ) {
			// Fetch the successor before handing the node to combine().
			CondorChunk *rest = node->next;
			c = combine( node, c );
			*link = rest;
		} else {
			link = &node->next;
		}
	}

	*link = c;
	return head;
}
Example #2
0
/*
 * Repeatedly merge neighbouring hash rows into larger clusters, then print
 * a description of every scored cluster to stdout.
 *
 * Each pass:
 *   1. builds two candidates per input row in tmp: the row joined with its
 *      direct successor (isgap == 0) and the row joined with the row after
 *      that, skipping one (isgap == 1, which costs one extra error);
 *   2. sorts the candidates with cmp_candidate_by_hashes and walks them in
 *      groups of equal hash, subdivided into runs of equal next_hash;
 *      should_combine() is asked once per run whether it should be merged;
 *   3. writes merged rows (score low bit 1) or the untouched original row
 *      (score low bit 0, only for isgap == 0 candidates) into out;
 *   4. sorts out with cmp_hash_by_position and drops rows that lose against
 *      a row at the same position or end no later than the previously kept
 *      row.
 * Passes stop after MAX_PASSES or as soon as a pass merges nothing.
 *
 * Parameters:
 *   nrows       number of rows in initial_in.
 *   initial_in  input rows; read by the first pass and by the final report.
 *               NOTE(review): the neighbour logic and the final report look
 *               like they expect rows ordered by position - confirm with
 *               the caller.
 *   tmp         scratch space; two candidates are written per row, so it
 *               needs room for 2*nrows entries.
 *   out         result rows; up to 2*nrows entries are written during a
 *               pass, so it needs room for 2*nrows entries.  Later passes
 *               read their input from out.
 *   data_len    not consulted by this function.
 *               NOTE(review): offsets into data are therefore not checked
 *               against it - confirm callers guarantee they are in range.
 *   data        if non-NULL, the bytes covered by each scored cluster are
 *               printed (non-printable bytes escaped).
 *
 * Returns the number of rows left in out.
 */
uint64_t cluster(uint64_t nrows, struct hash* initial_in, struct candidate* tmp, struct hash* out, uint64_t data_len, unsigned char* data)
{
  int pass = 0;
  uint64_t i,j,k,n, start_hash, end_hash, start_next_hash, end_next_hash;
  uint64_t use_i, next_i;
  uint64_t hash, next_hash;
  uint16_t score;
  struct hash* in = initial_in;
  int combined_any;

  (void) data_len; // see header comment: currently unused

  n = nrows;
  for( pass = 0; pass < MAX_PASSES; pass++ ) {
    printf("cluster pass  %i %" PRIu64 "/%" PRIu64 " rows\n", pass, n, nrows );
    assert( n <= nrows );
    // Create array of candidates.
    j = 0;
    for( i = 0; i < n; i++ ) {
      int hasnext = (i+1 < n);
      int hasnextnext = (i+2 < n);
      int nhashes, next_nhashes;

      // first, output this and next
      tmp[j].position = in[i].position;
      tmp[j].hash = in[i].hash;
      tmp[j].next_hash = hasnext ? in[i+1].hash : 0;
      tmp[j].old_length = in[i].length;
      if( hasnext ) {
        // merged length runs from the start of this row to the end of the next
        tmp[j].new_length = in[i+1].length + in[i+1].position - in[i].position;
      } else {
        tmp[j].new_length = in[i].length;
      }
      tmp[j].old_nerrors = in[i].nerrors;
      tmp[j].new_nerrors = in[i].nerrors + (hasnext ? in[i+1].nerrors : 0);
      nhashes = in[i].nhashes;
      next_nhashes = (hasnext ? in[i+1].nhashes : 0);
      // a count of zero is treated as a single hash
      if( nhashes == 0 ) nhashes = 1;
      if( next_nhashes == 0 ) next_nhashes = 1;
      tmp[j].old_nhashes = nhashes;
      tmp[j].new_nhashes = nhashes + next_nhashes;
      tmp[j].isgap = 0;
      j++;

      // then, output this and next next (skipping one)
      tmp[j].position = in[i].position;
      tmp[j].hash = in[i].hash;
      tmp[j].next_hash = hasnextnext ? in[i+2].hash : 0;
      tmp[j].old_length = in[i].length;
      if( hasnextnext ) {
        tmp[j].new_length = in[i+2].length + in[i+2].position - in[i].position;
      } else {
        tmp[j].new_length = in[i].length;
      }
      tmp[j].old_nerrors = in[i].nerrors;
      // the skipped row counts as one additional error
      tmp[j].new_nerrors = 1 + in[i].nerrors + (hasnextnext ? in[i+2].nerrors : 0);
      nhashes = in[i].nhashes;
      next_nhashes = (hasnextnext ? in[i+2].nhashes : 0);
      if( nhashes == 0 ) nhashes = 1;
      if( next_nhashes == 0 ) next_nhashes = 1;
      tmp[j].old_nhashes = nhashes;
      tmp[j].new_nhashes = nhashes + next_nhashes;
      tmp[j].isgap = 1;
      j++;
    }
    n = j;
    assert( n <= 2*nrows );
    // Sort candidates by hash, next hash.
    qsort(tmp, n, sizeof(struct candidate), cmp_candidate_by_hashes);

    // Create new hashes by merging candidates with appropriate counts,
    // taking into account error information.
    combined_any = 0;
    k = 0;
    // Each iteration handles one group [start_hash,end_hash) of candidates
    // sharing the same initial hash.  The group start is always < n when
    // tmp[] is indexed, so nothing past the last candidate is read.
    for( start_hash = 0; start_hash < n; start_hash = end_hash ) {
      hash = tmp[start_hash].hash;
      // find the first entry with a different hash
      end_hash = start_hash;
      while( end_hash < n && tmp[end_hash].hash == hash ) end_hash++;

      // Within the group, handle each run [start_next_hash,end_next_hash)
      // of candidates that also share the same next hash.
      for( start_next_hash = start_hash;
           start_next_hash < end_hash;
           start_next_hash = end_next_hash ) {
        next_hash = tmp[start_next_hash].next_hash;
        // find the first entry with a different next hash.
        end_next_hash = start_next_hash;
        while( end_next_hash < end_hash &&
               tmp[end_next_hash].next_hash == next_hash ) end_next_hash++;

        // Is it worth combining?
        score = should_combine( end_hash - start_hash, end_next_hash - start_next_hash );

        for( j = start_next_hash; j < end_next_hash; j++ ) {
          assert(k < 2*nrows);

          if( score > 0 && tmp[j].new_nerrors <= MAX_SKIPS && tmp[j].new_length <= MAX_LENGTH ) {
            // Output the merged rows.
            combined_any = 1;
            out[k].position = tmp[j].position;
            out[k].hash = ROL_hash(tmp[j].hash, tmp[j].new_nhashes - tmp[j].old_nhashes) ^ tmp[j].next_hash;
            out[k].nerrors = tmp[j].new_nerrors;
            out[k].nhashes = tmp[j].new_nhashes;
            // low bit 1 marks a merged row
            out[k].score = (score << 1) | 1;
            out[k].length = tmp[j].new_length;
            k++;
          } else if( ! tmp[j].isgap ) {
            // Output the non-merged original row only.
            // (Each row has exactly one isgap == 0 candidate, so it is
            // emitted at most once here.)
            out[k].position = tmp[j].position;
            out[k].hash = tmp[j].hash;
            out[k].nerrors = tmp[j].old_nerrors;
            out[k].nhashes = tmp[j].old_nhashes;
            // low bit 0 marks an unmerged row
            if( end_hash - start_hash >= MINIMUM_GROUP_SIZE )
              out[k].score = (log2lli( end_hash - start_hash ) << 1) | 0;
            else
              out[k].score = 0;
            out[k].length = tmp[j].old_length;
            k++;
          }
        }
      }
    }

    n = k;
    // Sort the new hashes by position.
    qsort(out, n, sizeof(struct hash), cmp_hash_by_position);

    // Remove hashes that are totally subsumed by others
    // if we have 2 hashes at the same position, keep the one with a better score
    // or fewer errors.
    k = 0;
    for( i = 0; i < n; i = next_i) {
      use_i = i;
      next_i = i + 1;
      // handle choosing the best of two entries at the same position.
      if( i + 1 < n && out[i].position == out[i+1].position ) {
        int i_score, i_1_score;
        i_score = out[i].score;
        i_score -= out[i].nerrors;
        i_1_score = out[i+1].score;
        i_1_score -= out[i+1].nerrors;
        // handle them both now.
        if( ((i_1_score & 1) && !(i_score & 1) ) ||
            i_1_score > i_score ) use_i = i+1;
        next_i = i + 2;
      }
      // Now, look at the last entry we output.
      // Is this entry totally subsumed by it?
      // If so, we must remove it.
      if( k > 0 &&
          out[k-1].position + out[k-1].length >=
          out[use_i].position + out[use_i].length ) {
        // We are totally inside the previously output
        // element, so not output here.
      } else {
        // compact in place; k never runs ahead of use_i
        if( k != use_i ) {
          out[k] = out[use_i];
        }
        k++;
      }
    }

    n = k;

    assert( n <= nrows );

    // For the next loop, use the output we just made.
    in = out;

    // Stop looping if we didn't find any to combine.
    if( combined_any == 0 ) break;
  }

  printf("Completed clustering after %i passes and returning  %" PRIu64 "/%" PRIu64 " rows\n",
         pass, n, nrows );

  in = initial_in;
  // Explain the cluster patterns in terms of hashes and in terms of data offsets.
  i = 0;
  j = 0;
  while( i < nrows && j < n ) {
    // only clusters with a non-zero score are printed
    int p = 0;

    // advance past any input elements less than out[j].position,
    // without running off the end of the input.
    while( i < nrows && in[i].position < out[j].position ) i++;

    if( out[j].score > 0 ) p = 1;

    if( p ) printf("score %u at %" PRIi64 " %" PRIi32 " bytes { ", (unsigned int) out[j].score, out[j].position, out[j].length);

    // Scroll through any hashes in the input that end inside this cluster.
    for( ; i < nrows; i++ ) {
      if( in[i].position + in[i].length > out[j].position + out[j].length ) break;
      if( p ) printf("%016" PRIX64 " ", in[i].hash);
    }
    if( p ) printf("}");
    if( data && p ) {
      printf(" : ");
      // dump the covered bytes, escaping newline, backslash and
      // anything non-printable
      for( k = 0; k < out[j].length; k++ ) {
        uint64_t off = out[j].position + k;
        unsigned char byte = data[off];
        if( byte == '\n' ) printf("\\n");
        else if( byte == '\\' ) printf("\\\\");
        else if( isprint(byte) ) printf("%c", byte);
        else printf("\\x%02x", (int) byte);
      }
    }
    if( p ) printf("\n");
    j++;
  }

  return n;
}