/*
 * Fold chunk `c` into the singly linked list that starts at `head`.
 *
 * The list is walked front to back.  Whenever should_combine() accepts
 * the current node together with `c`, that node is replaced by the
 * result of combine() and the combined chunk is carried forward and
 * offered to the remainder of the list.  Nodes that are not combined
 * stay in place.  When the end of the list is reached, whatever chunk
 * is being carried becomes the tail.
 *
 * Returns the head of the resulting list (which is `c` itself when
 * `head` is NULL).
 */
static CondorChunk *
absorb( CondorChunk *head, CondorChunk *c )
{
	CondorChunk *rest;

	/* End of list: the carried chunk becomes the tail. */
	if( !head ) {
		return c;
	}

	/* This node stays; try the carried chunk against what follows. */
	if( !should_combine( head, c ) ) {
		head->next = absorb( head->next, c );
		return head;
	}

	/* Remember the successor before combine() gets hold of `head`. */
	rest = head->next;
	return absorb( rest, combine( head, c ) );
}
uint64_t cluster(uint64_t nrows, struct hash* initial_in, struct candidate* tmp, struct hash* out, uint64_t data_len, unsigned char* data) { int pass = 0; uint64_t i,j,k,n, start_hash, end_hash, start_next_hash, end_next_hash; uint64_t use_i, next_i; uint64_t hash, next_hash; uint16_t score; struct hash* in = initial_in; int combined_any; n = nrows; for( pass = 0; pass < MAX_PASSES; pass++ ) { printf("cluster pass %i %" PRIi64 "/%" PRIi64 " rows\n", pass, n, nrows ); assert( n <= nrows ); // Create array of candidates. j = 0; for( i = 0; i < n; i++ ) { int hasnext = (i+1 < n); int hasnextnext = (i+2 < n); int nhashes, next_nhashes; //printf("in %016" PRIX64 " pos %" PRIi64 " len %i score %i errors %i hashes %i\n", in[i].hash, in[i].position, (int) in[i].length, (int) in[i].score, (int) in[i].nerrors, (int) in[i].nhashes); // first, output this and next tmp[j].position = in[i].position; tmp[j].hash = in[i].hash; tmp[j].next_hash = hasnext ? in[i+1].hash : 0; tmp[j].old_length = in[i].length; if( hasnext ) { tmp[j].new_length = in[i+1].length + in[i+1].position - in[i].position; } else { tmp[j].new_length = in[i].length; } tmp[j].old_nerrors = in[i].nerrors; tmp[j].new_nerrors = in[i].nerrors + (hasnext ? in[i+1].nerrors : 0); nhashes = in[i].nhashes; next_nhashes = (hasnext ? in[i+1].nhashes : 0); if( nhashes == 0 ) nhashes = 1; if( next_nhashes == 0 ) next_nhashes = 1; tmp[j].old_nhashes = nhashes; tmp[j].new_nhashes = nhashes + next_nhashes; tmp[j].isgap = 0; //printf("c %016" PRIX64 " %016" PRIX64 " pos %" PRIi64 " oldlen %i newlen %i oldn %i newn %i\n", tmp[j].hash, tmp[j].next_hash, tmp[j].position, (int) tmp[j].old_length, (int) tmp[j].new_length, (int) tmp[j].old_nhashes, (int) tmp[j].new_nhashes); j++; // then, output this and next next (skipping one) tmp[j].position = in[i].position; tmp[j].hash = in[i].hash; tmp[j].next_hash = hasnextnext ? 
in[i+2].hash : 0; tmp[j].old_length = in[i].length; if( hasnextnext ) { tmp[j].new_length = in[i+2].length + in[i+2].position - in[i].position; } else { tmp[j].new_length = in[i].length; } tmp[j].old_nerrors = in[i].nerrors; tmp[j].new_nerrors = 1 + in[i].nerrors + (hasnextnext ? in[i+2].nerrors : 0); nhashes = in[i].nhashes; next_nhashes = (hasnextnext ? in[i+2].nhashes : 0); if( nhashes == 0 ) nhashes = 1; if( next_nhashes == 0 ) next_nhashes = 1; tmp[j].old_nhashes = nhashes; tmp[j].new_nhashes = nhashes + next_nhashes; tmp[j].isgap = 1; //printf("c %016" PRIX64 " %016" PRIX64 " pos %" PRIi64 " oldlen %i newlen %i oldn %i newn %i\n", tmp[j].hash, tmp[j].next_hash, tmp[j].position, (int) tmp[j].old_length, (int) tmp[j].new_length, (int) tmp[j].old_nhashes, (int) tmp[j].new_nhashes); j++; } n = j; assert( n <= 2*nrows ); // Sort candidates by hash, next hash. qsort(tmp, n, sizeof(struct candidate), cmp_candidate_by_hashes); // Create new hashes by merging candidates with appropriate counts, // taking into account error information. /* for( j = 0; j < n; j++ ) { printf("% 4li sc %016" PRIX64 " %016" PRIX64 " pos %" PRIi64 " oldlen %i newlen %i oldn %i newn %i\n", (long) j, tmp[j].hash, tmp[j].next_hash, tmp[j].position, (int) tmp[j].old_length, (int) tmp[j].new_length, (int) tmp[j].old_nhashes, (int) tmp[j].new_nhashes); }*/ combined_any = 0; k = 0; start_hash = end_hash = 0; while( start_hash < n ) { start_hash = end_hash; hash = tmp[start_hash].hash; // find the first entry with a different hash while( end_hash < n && tmp[end_hash].hash == hash ) end_hash++; // OK, now we are working with a group of hashes with the same // initial hash, and a variety of next hashes, from [start_hash,end_hash) //printf(" found hash group from %li to %li with hash %016" PRIX64 "\n", (long) start_hash, (long) end_hash, hash); // for each group of next hashes... 
start_next_hash = end_next_hash = start_hash; while ( start_next_hash < end_hash ) { start_next_hash = end_next_hash; next_hash = tmp[start_next_hash].next_hash; // find the first entry with a different next hash. while( end_next_hash < end_hash && tmp[end_next_hash].next_hash == next_hash ) end_next_hash++; // OK, now we have a group of next hashes from [start_next_hash,end_next_hash). //printf(" found hash sub group from %li to %li with next_hash %016" PRIX64 "\n", (long) start_next_hash, (long) end_next_hash, next_hash); // Is it worth combining? score = should_combine( end_hash - start_hash, end_next_hash - start_next_hash ); for( j = start_next_hash; j < end_next_hash; j++ ) { assert(k < 2*nrows); //printf("% 4li cc %016" PRIX64 " %016" PRIX64 " pos %" PRIi64 " oldlen %i newlen %i oldn %i newn %i olderr %i newerr %i\n", (long) j, tmp[j].hash, tmp[j].next_hash, tmp[j].position, (int) tmp[j].old_length, (int) tmp[j].new_length, (int) tmp[j].old_nhashes, (int) tmp[j].new_nhashes, (int) tmp[j].old_nerrors, (int) tmp[j].new_nerrors); if( score > 0 && tmp[j].new_nerrors <= MAX_SKIPS && tmp[j].new_length <= MAX_LENGTH ) { // Output the merged rows. combined_any = 1; out[k].position = tmp[j].position; out[k].hash = ROL_hash(tmp[j].hash, tmp[j].new_nhashes - tmp[j].old_nhashes) ^ tmp[j].next_hash; out[k].nerrors = tmp[j].new_nerrors; out[k].nhashes = tmp[j].new_nhashes; out[k].score = (score << 1) | 1; out[k].length = tmp[j].new_length; //printf("comb out %016" PRIX64 " pos %" PRIi64 " len %i score %i errors %i\n", out[k].hash, out[k].position, (int) out[k].length, (int) out[k].score, (int) out[k].nerrors); k++; } else if( ! tmp[j].isgap ) { // Output the non-merged original row only. 
out[k].position = tmp[j].position; out[k].hash = tmp[j].hash; out[k].nerrors = tmp[j].old_nerrors; out[k].nhashes = tmp[j].old_nhashes; if( end_hash - start_hash >= MINIMUM_GROUP_SIZE ) out[k].score = (log2lli( end_hash - start_hash ) << 1) | 0; else out[k].score = 0; out[k].length = tmp[j].old_length; //printf("out %016" PRIX64 " pos %" PRIi64 " len %i score %i errors %i\n", out[k].hash, out[k].position, (int) out[k].length, (int) out[k].score, (int) out[k].nerrors); k++; } } } } //printf(" DONE MERGE\n"); n = k; // Sort the new hashes by position. qsort(out, n, sizeof(struct hash), cmp_hash_by_position); /* for( i = 0; i < n; i++ ) { printf("s out %016" PRIX64 " pos %" PRIi64 " len %i score %i errors %i hashes %i\n", out[i].hash, out[i].position, (int) out[i].length, (int) out[i].score, (int) out[i].nerrors, (int) out[i].nhashes); }*/ // Remove hashes that are totally subsumed by others // if we have 2 hashes at the same position, keep the one with a better score // or fewer errors. k = 0; for( i = 0; i < n; i = next_i) { use_i = i; next_i = i + 1; // handle choosing the best of two entries at the same position. if( i + 1 < n && out[i].position == out[i+1].position ) { int i_score, i_1_score; i_score = out[i].score; i_score -= out[i].nerrors; i_1_score = out[i+1].score; i_1_score -= out[i+1].nerrors; // handle them both now. if( ((i_1_score & 1) && !(i_score & 1) ) || i_1_score > i_score ) use_i = i+1; next_i = i + 2; } // Now, look at the last entry we output. // Is this entry totally subsumed by it? // If so, we must remove it. if( k > 0 && out[k-1].position + out[k-1].length >= out[use_i].position + out[use_i].length ) { // We are totally inside the previously output // element, so not output here. 
} else { //printf("o out %016" PRIX64 " pos %" PRIi64 " len %i score %i errors %i hashes %i\n", out[use_i].hash, out[use_i].position, (int) out[use_i].length, (int) out[use_i].score, (int) out[use_i].nerrors, (int) out[use_i].nhashes); if( k != use_i ) { out[k] = out[use_i]; } k++; } } n = k; assert( n <= nrows ); // For the next loop, use the output we just made. in = out; // Stop looping if we didn't find any to combine. if( combined_any == 0 ) break; } printf("Completed clustering after %i passes and returning %" PRIi64 "/%" PRIi64 " rows\n", pass, n, nrows ); in = initial_in; // Explain the cluster patterns in terms of hashes and in terms of data offsets. i = 0; j = 0; while( i < nrows && j < n ) { int p = 0; // advance past any input elements less than out[j].position. while( in[i].position < out[j].position ) i++; if( out[j].score > 0 ) p = 1; if( p ) printf("score %u at %" PRIi64 " %" PRIi32 " bytes { ", (unsigned int) out[j].score, out[j].position, out[j].length); // Scroll through any hashes in the input. for( ; i < nrows; i++ ) { if( in[i].position + in[i].length > out[j].position + out[j].length ) break; if( p ) printf("%016" PRIX64 " ", in[i].hash); } if( p ) printf("}"); if( data && p ) { printf(" : "); for( k = 0; k < out[j].length; k++ ) { uint64_t off = out[j].position + k; unsigned char byte = data[off]; if( byte == '\n' ) printf("\\n"); else if( byte == '\\' ) printf("\\\\"); else if( isprint(byte) ) printf("%c", byte); else printf("\\x%02x", (int) byte); } } if( p ) printf("\n"); j++; } return n; }