static void rm_directory_add_subdir(RmDirectory *parent, RmDirectory *subdir) {
    /* Merge an already-hashed subdirectory into its parent: accumulate
     * counters, fold the child's files and checksum into the parent's
     * digest, and mark the child so it is never merged a second time. */
    if(subdir->was_merged) {
        return;
    }

    parent->mergeups += subdir->mergeups + 1;
    parent->dupe_count += subdir->dupe_count;
    parent->prefd_files += subdir->prefd_files;
    g_queue_push_head(&parent->children, subdir);

#ifdef _RM_TREEMERGE_DEBUG
    g_printerr("%55s (%3ld/%3ld) <- %s (%3ld/%3ld)\n", parent->dirname,
               parent->dupe_count, parent->file_count, subdir->dirname,
               subdir->dupe_count, subdir->file_count);
#endif

    /* Each file below was already counted once via subdir->dupe_count above,
     * and rm_directory_add() counts it again — so its return value is
     * subtracted right back per iteration.
     * (The original author noted that restructuring this bookkeeping
     * behaved differently on clang vs. gcc; keep this exact shape.) */
    for(GList *child = subdir->known_files.head; child != NULL; child = child->next) {
        RmFile *child_file = (RmFile *)child->data;
        parent->dupe_count -= rm_directory_add(parent, child_file);
    }

    /* Inherit the child's checksum into the parent digest. */
    unsigned char *child_sum = rm_digest_steal_buffer(subdir->digest);
    rm_digest_update(parent->digest, child_sum, subdir->digest->bytes);
    g_slice_free1(subdir->digest->bytes, child_sum);

    subdir->was_merged = true;
}
static int rm_directory_add(RmDirectory *directory, RmFile *file) {
    /* Fold one file's hash into the directory's digest.  Files may arrive
     * in any order, so the directory hash has to be built cumulatively.
     * Returns how many duplicates this file contributes (1 + hardlinks). */
    g_assert(file);
    g_assert(file->digest);
    g_assert(directory);

    /* For paranoid digests the raw bytes are not a usable hash;
     * fall back to the cheap shadow hash instead. */
    RmDigest *source = (file->digest->type == RM_DIGEST_PARANOID)
                           ? file->digest->shadow_hash
                           : file->digest;
    RmOff digest_bytes = source->bytes;
    guint8 *file_digest = rm_digest_steal_buffer(source);

    /* + and not XOR, since ^ would always yield 0 for identical hashes,
     * no matter which hashes.  That would confuse both people and debuggers. */
    rm_digest_update(directory->digest, file_digest, digest_bytes);

    /* Only membership matters here; the stored value just has to be non-null. */
    g_hash_table_add(directory->hash_set, file->digest);
    g_slice_free1(digest_bytes, file_digest);

    int new_dupes = 1;
    if(file->hardlinks.is_head && file->hardlinks.files) {
        new_dupes += g_queue_get_length(file->hardlinks.files);
    }

    directory->dupe_count += new_dupes;
    directory->prefd_files += file->is_prefd;

    return new_dupes;
}
/**
 * Stream one buffer worth of data into `buffer->digest`.
 *
 * Non-paranoid digests hash the data immediately and release the buffer.
 * RM_DIGEST_PARANOID (byte-by-byte comparison) instead retains the buffer
 * in paranoid->buffers and maintains a running comparison against an
 * optional "twin candidate" digest, so equal streams can be detected early.
 *
 * Ownership: the buffer is released here for non-paranoid digests, but
 * kept alive in paranoid->buffers otherwise.
 */
void rm_digest_buffered_update(RmBuffer *buffer) {
    RmDigest *digest = buffer->digest;
    if(digest->type != RM_DIGEST_PARANOID) {
        rm_digest_update(digest, buffer->data, buffer->len);
        rm_buffer_release(buffer);
    } else {
        RmParanoid *paranoid = digest->paranoid;

        /* Efficiently append buffer to the buffers GSList: a cached tail
         * pointer avoids g_slist_append()'s O(n) walk to the list end. */
        if(!paranoid->buffers) {
            /* first buffer */
            paranoid->buffers = g_slist_prepend(NULL, buffer);
            paranoid->buffer_tail = paranoid->buffers;
        } else {
            /* g_slist_append() on the tail node returns the tail itself;
             * its ->next is the freshly appended node, i.e. the new tail. */
            paranoid->buffer_tail = g_slist_append(paranoid->buffer_tail, buffer)->next;
        }

        digest->bytes += buffer->len;

        /* Keep the cheap shadow hash in sync so this digest can still be
         * hashed/compared without walking all retained buffers. */
        if(paranoid->shadow_hash) {
            rm_digest_update(paranoid->shadow_hash, buffer->data, buffer->len);
        }

        if(paranoid->twin_candidate) {
            /* do a running check that digest remains the same as its candidate twin */
            if(rm_buffer_equal(buffer, paranoid->twin_candidate_buffer->data)) {
                /* buffers match; move ptr to next one ready for next buffer */
                paranoid->twin_candidate_buffer = paranoid->twin_candidate_buffer->next;
            } else {
                /* buffers don't match - delete candidate (new candidate might be
                 * added on next call to rm_digest_buffered_update) */
                paranoid->twin_candidate = NULL;
                paranoid->twin_candidate_buffer = NULL;
#if _RM_CHECKSUM_DEBUG
                rm_log_debug_line("Ejected candidate match at buffer #%u",
                                  g_slist_length(paranoid->buffers));
#endif
            }
        }

        /* No (surviving) candidate: try to adopt one from the incoming queue.
         * A new candidate is only kept if it matches every buffer seen so far. */
        while(!paranoid->twin_candidate && paranoid->incoming_twin_candidates &&
              (paranoid->twin_candidate =
                   g_async_queue_try_pop(paranoid->incoming_twin_candidates))) {
            /* validate the new candidate by comparing the previous buffers (not
             * including current) */
            paranoid->twin_candidate_buffer = paranoid->twin_candidate->paranoid->buffers;
            GSList *iter_self = paranoid->buffers;
            gboolean match = TRUE;
            while(match && iter_self) {
                match = (rm_buffer_equal(paranoid->twin_candidate_buffer->data,
                                         iter_self->data));
                iter_self = iter_self->next;
                paranoid->twin_candidate_buffer = paranoid->twin_candidate_buffer->next;
            }
            if(paranoid->twin_candidate && !match) {
                /* reject the twin candidate, also add to rejects list to speed up
                 * rm_digest_equal() */
#if _RM_CHECKSUM_DEBUG
                rm_log_debug_line("Rejected twin candidate %p for %p",
                                  paranoid->twin_candidate, paranoid);
#endif
                if(!paranoid->shadow_hash) {
                    /* we use the rejects list to speed up rm_digest_equal */
                    paranoid->rejects =
                        g_slist_prepend(paranoid->rejects, paranoid->twin_candidate);
                }
                paranoid->twin_candidate = NULL;
                paranoid->twin_candidate_buffer = NULL;
                /* NOTE: when _RM_CHECKSUM_DEBUG is off, the `else` branch below
                 * is compiled out entirely and this brace simply closes the if. */
#if _RM_CHECKSUM_DEBUG
            } else {
                rm_log_debug_line("Added twin candidate %p for %p",
                                  paranoid->twin_candidate, paranoid);
#endif
            }
        }
    }
}
/**
 * Feed `size` bytes of `data` into `digest`, dispatching on digest->type.
 *
 * glib-backed types (MD5/SHA*) update the GChecksum; the non-cryptographic
 * families (spooky/murmur/city) fold the data into digest->checksum blocks,
 * using the previous block value as seed; RM_DIGEST_PARANOID stores the raw
 * bytes verbatim and recursively updates its shadow hash.
 */
void rm_digest_update(RmDigest *digest, const unsigned char *data, RmOff size) {
    switch(digest->type) {
    case RM_DIGEST_MD5:
    case RM_DIGEST_SHA512:
    case RM_DIGEST_SHA256:
    case RM_DIGEST_SHA1:
        g_checksum_update(digest->glib_checksum, (const guchar *)data, size);
        break;
    case RM_DIGEST_SPOOKY32:
        digest->checksum[0].first = spooky_hash32(data, size, digest->checksum[0].first);
        break;
    case RM_DIGEST_SPOOKY64:
        digest->checksum[0].first = spooky_hash64(data, size, digest->checksum[0].first);
        break;
    case RM_DIGEST_SPOOKY:
        spooky_hash128(data, size, &digest->checksum[0].first,
                       &digest->checksum[0].second);
        break;
    case RM_DIGEST_MURMUR512:
    case RM_DIGEST_MURMUR256:
    case RM_DIGEST_MURMUR:
        /* Wider murmur variants are several independent 128-bit blocks;
         * each block's previous .first doubles as the seed for this round. */
        for(guint8 block = 0; block < (digest->bytes / 16); block++) {
#if RM_PLATFORM_32
            MurmurHash3_x86_128(data, size, (uint32_t)digest->checksum[block].first,
                                &digest->checksum[block]);
#elif RM_PLATFORM_64
            MurmurHash3_x64_128(data, size, (uint32_t)digest->checksum[block].first,
                                &digest->checksum[block]);
#else
#error "Probably not a good idea to compile rmlint on 16bit."
#endif
        }
        break;
    case RM_DIGEST_CITY:
    case RM_DIGEST_CITY256:
    case RM_DIGEST_CITY512:
        for(guint8 block = 0; block < (digest->bytes / 16); block++) {
            /* Opt for the more optimized CRC variant where available.
             * This needs the crc instruction of sse4.2
             * (available on Intel Nehalem and up; my amd box doesn't have
             * this though). */
#if RM_PLATFORM_64 && HAVE_SSE42
            digest->checksum[block] =
                CityHashCrc128WithSeed((const char *)data, size, digest->checksum[block]);
#else
            digest->checksum[block] =
                CityHash128WithSeed((const char *)data, size, digest->checksum[block]);
#endif
        }
        break;
    case RM_DIGEST_BASTARD:
        /* "Bastard" digest = murmur in block 0 plus city in block 1. */
        MurmurHash3_x86_128(data, size, (uint32_t)digest->checksum[0].first,
                            &digest->checksum[0]);
#if RM_PLATFORM_64 && HAVE_SSE42
        digest->checksum[1] =
            CityHashCrc128WithSeed((const char *)data, size, digest->checksum[1]);
#else
        digest->checksum[1] =
            CityHash128WithSeed((const char *)data, size, digest->checksum[1]);
#endif
        break;
    case RM_DIGEST_CUMULATIVE: {
        /* This is basically FNV1a; it is just important that the order of
         * adding data to the hash has no effect on the result, so it can
         * be used as a lookup key:
         *
         * http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
         *
         * NOTE(review): `i % size` is a division by zero when size == 0 —
         * presumably callers never issue an empty update here; confirm. */
        RmOff hash = 0xcbf29ce484222325;
        for(gsize i = 0; i < digest->bytes; ++i) {
            hash ^= ((guint8 *)data)[i % size];
            hash *= 0x100000001b3;
            ((guint8 *)digest->checksum)[i] += hash;
        }
    } break;
    case RM_DIGEST_PARANOID:
        /* Paranoid mode keeps the raw bytes verbatim for exact comparison;
         * the recursive call keeps the cheap shadow hash in sync. */
        g_assert(size + digest->paranoid_offset <= digest->bytes);
        memcpy((char *)digest->checksum + digest->paranoid_offset, data, size);
        digest->paranoid_offset += size;
        rm_digest_update(digest->shadow_hash, data, size);
        break;
    default:
        g_assert_not_reached();
    }
}