예제 #1
0
/**
 * Merge the directory `subdir` into `parent`.
 *
 * A subdir is merged at most once: when subdir->was_merged is already set the
 * call returns without touching anything. Otherwise the parent absorbs the
 * child's mergeups / dupe_count / prefd_files counters, the child is pushed to
 * the front of parent->children, every entry of subdir->known_files is passed
 * through rm_directory_add() on the parent, the child's digest buffer is fed
 * into the parent's digest, and the child is flagged as merged.
 */
static void rm_directory_add_subdir(RmDirectory *parent, RmDirectory *subdir) {
    if(subdir->was_merged) {
        return;
    }

    g_queue_push_head(&parent->children, subdir);

    /* One more merge-up for the parent, plus everything the child collected. */
    parent->mergeups += subdir->mergeups + 1;
    parent->prefd_files += subdir->prefd_files;
    parent->dupe_count += subdir->dupe_count;

#ifdef _RM_TREEMERGE_DEBUG
    g_printerr("%55s (%3ld/%3ld) <- %s (%3ld/%3ld)\n", parent->dirname,
               parent->dupe_count, parent->file_count, subdir->dirname,
               subdir->dupe_count, subdir->file_count);
#endif

    /*
     * The child's dupes were already added to parent->dupe_count above, but
     * rm_directory_add() increments that counter again for every file, so the
     * amount it reports has to be taken back off.
     *
     * The return values are collected in a local and subtracted afterwards.
     * Do NOT shorten this to `parent->dupe_count -= rm_directory_add(...)`:
     * the callee writes parent->dupe_count, and C does not specify whether
     * the left operand is read before or after the call, so the result would
     * depend on the compiler.
     *
     * NOTE(review): rm_directory_add() also bumps parent->prefd_files per
     * file, on top of the subdir->prefd_files added above, and nothing
     * compensates for that here - confirm this is intended.
     */
    int readded = 0;
    GList *node = subdir->known_files.head;
    while(node != NULL) {
        readded += rm_directory_add(parent, (RmFile *)node->data);
        node = node->next;
    }
    parent->dupe_count -= readded;

    /* Mix the child's checksum into the parent's; the buffer handed out by
     * rm_digest_steal_buffer() is released here once it has been consumed. */
    unsigned char *child_sum = rm_digest_steal_buffer(subdir->digest);
    rm_digest_update(parent->digest, child_sum, subdir->digest->bytes);
    g_slice_free1(subdir->digest->bytes, child_sum);

    subdir->was_merged = true;
}
예제 #2
0
/**
 * Account for one file inside `directory`.
 *
 * The file's digest buffer is fed into directory->digest, the file's digest
 * is inserted into directory->hash_set, and the directory's dupe_count and
 * prefd_files counters are advanced.
 *
 * Files may arrive in any order, so the directory digest is expected to
 * combine its inputs cumulatively (by addition rather than XOR, since XOR of
 * two equal hashes would always cancel out to 0).
 *
 * @param directory  directory to update; asserted to be non-NULL.
 * @param file       file to add; asserted to be non-NULL and to have a digest.
 * @return the number of dupes added to directory->dupe_count: 1, plus the
 *         length of file->hardlinks.files when the file is a hardlink head
 *         with a non-NULL hardlink queue.
 */
static int rm_directory_add(RmDirectory *directory, RmFile *file) {
    g_assert(file);
    g_assert(file->digest);
    g_assert(directory);

    guint8 *raw = NULL;
    RmOff raw_len = 0;

    if(file->digest->type != RM_DIGEST_PARANOID) {
        raw = rm_digest_steal_buffer(file->digest);
        raw_len = file->digest->bytes;
    } else {
        /* Paranoid digests contribute their shadow hash instead. */
        raw = rm_digest_steal_buffer(file->digest->shadow_hash);
        raw_len = file->digest->shadow_hash->bytes;
    }

    rm_digest_update(directory->digest, raw, raw_len);

    /* hash_set is used as a set: only the key (the file's digest) matters. */
    g_hash_table_add(directory->hash_set, file->digest);

    g_slice_free1(raw_len, raw);

    /* The file itself always counts; a hardlink head also brings its links. */
    int added = 1;
    if(file->hardlinks.is_head && file->hardlinks.files) {
        added += g_queue_get_length(file->hardlinks.files);
    }

    directory->prefd_files += file->is_prefd;
    directory->dupe_count += added;

    return added;
}
예제 #3
0
파일: checksum.c 프로젝트: FihlaTV/rmlint
/**
 * Feed one filled buffer into the digest it belongs to (buffer->digest).
 *
 * Non-paranoid digests: the buffer's data is hashed via rm_digest_update()
 * and the buffer is handed back with rm_buffer_release().
 *
 * Paranoid digests: the buffer is NOT released here; it is appended to
 * paranoid->buffers, digest->bytes grows by buffer->len and, if present, the
 * shadow hash is updated. On top of that a "twin candidate" (another paranoid
 * digest) is tracked: the new buffer is compared against the candidate's
 * corresponding buffer, and when there is no (longer a) candidate, new ones
 * are popped from paranoid->incoming_twin_candidates and validated until one
 * matches or the queue is empty.
 *
 * NOTE(review): paranoid->twin_candidate_buffer is dereferenced without a
 * NULL check in both comparisons below; this presumably relies on a candidate
 * always holding at least as many buffers as this digest - confirm.
 */
void rm_digest_buffered_update(RmBuffer *buffer) {
    RmDigest *digest = buffer->digest;
    if(digest->type != RM_DIGEST_PARANOID) {
        rm_digest_update(digest, buffer->data, buffer->len);
        rm_buffer_release(buffer);
    } else {
        RmParanoid *paranoid = digest->paranoid;

        /* Append the buffer to the buffers GSList. buffer_tail caches the
         * last node so the append does not have to walk the whole list. */
        if(!paranoid->buffers) {
            /* first buffer: head and tail are the same node */
            paranoid->buffers = g_slist_prepend(NULL, buffer);
            paranoid->buffer_tail = paranoid->buffers;
        } else {
            /* g_slist_append() returns the node it was given (the old tail);
             * its ->next is the freshly appended node, i.e. the new tail. */
            paranoid->buffer_tail = g_slist_append(paranoid->buffer_tail, buffer)->next;
        }

        digest->bytes += buffer->len;

        if(paranoid->shadow_hash) {
            rm_digest_update(paranoid->shadow_hash, buffer->data, buffer->len);
        }

        if(paranoid->twin_candidate) {
            /* do a running check that digest remains the same as its candidate twin */
            if(rm_buffer_equal(buffer, paranoid->twin_candidate_buffer->data)) {
                /* buffers match; move ptr to next one ready for next buffer */
                paranoid->twin_candidate_buffer = paranoid->twin_candidate_buffer->next;
            } else {
                /* buffers don't match - drop the candidate. With twin_candidate
                 * cleared, the loop below may pick up a replacement from the
                 * incoming queue during this same call. */
                paranoid->twin_candidate = NULL;
                paranoid->twin_candidate_buffer = NULL;
#if _RM_CHECKSUM_DEBUG
                rm_log_debug_line("Ejected candidate match at buffer #%u",
                                  g_slist_length(paranoid->buffers));
#endif
            }
        }

        /* No candidate at the moment: pop candidates from the incoming queue
         * (if there is one) until one survives validation or the queue is
         * empty. The assignment inside the condition is intentional. */
        while(!paranoid->twin_candidate && paranoid->incoming_twin_candidates &&
              (paranoid->twin_candidate =
                   g_async_queue_try_pop(paranoid->incoming_twin_candidates))) {
            /* Validate the new candidate against every buffer collected so
             * far. The walk starts at the head of our list and runs to its
             * end, so it covers the buffer appended above as well. */
            paranoid->twin_candidate_buffer = paranoid->twin_candidate->paranoid->buffers;
            GSList *iter_self = paranoid->buffers;
            gboolean match = TRUE;
            while(match && iter_self) {
                match = (rm_buffer_equal(paranoid->twin_candidate_buffer->data,
                                         iter_self->data));
                iter_self = iter_self->next;
                paranoid->twin_candidate_buffer = paranoid->twin_candidate_buffer->next;
            }
            /* On a full match twin_candidate_buffer now points at the
             * candidate's buffer that follows the ones just compared. */
            if(paranoid->twin_candidate && !match) {
                /* reject the twin candidate, also add to rejects list to speed up
                 * rm_digest_equal() */
#if _RM_CHECKSUM_DEBUG
                rm_log_debug_line("Rejected twin candidate %p for %p",
                                  paranoid->twin_candidate, paranoid);
#endif
                if(!paranoid->shadow_hash) {
                    /* the rejects list is only maintained when there is no
                     * shadow hash; it is used to speed up rm_digest_equal */
                    paranoid->rejects =
                        g_slist_prepend(paranoid->rejects, paranoid->twin_candidate);
                }
                paranoid->twin_candidate = NULL;
                paranoid->twin_candidate_buffer = NULL;
                /* The else branch below exists only in debug builds; it just
                 * logs that the candidate was accepted. */
#if _RM_CHECKSUM_DEBUG
            } else {
                rm_log_debug_line("Added twin candidate %p for %p",
                                  paranoid->twin_candidate, paranoid);
#endif
            }
        }
    }
}
예제 #4
0
파일: checksum.c 프로젝트: mentat-fr/rmlint
/**
 * Feed `size` bytes starting at `data` into `digest`.
 *
 * The work done depends on digest->type:
 *  - MD5 / SHA*:      forwarded to g_checksum_update().
 *  - SPOOKY*:         the running value in checksum[0] is used as seed and
 *                     replaced by the new hash.
 *  - MURMUR* / CITY*: each 16-byte block of the checksum (digest->bytes / 16
 *                     blocks) is re-hashed, seeded with its previous value.
 *  - BASTARD:         block 0 is updated with Murmur, block 1 with City.
 *  - CUMULATIVE:      an FNV-1a style byte stream derived from `data` is
 *                     added byte-wise onto the checksum, so the order of
 *                     updates does not matter.
 *  - PARANOID:        the raw data is copied into the checksum buffer at
 *                     paranoid_offset and the shadow hash is updated too.
 *
 * Any other type triggers g_assert_not_reached().
 *
 * @param digest  digest to update.
 * @param data    input bytes.
 * @param size    number of bytes at `data`. For RM_DIGEST_CUMULATIVE a size
 *                of 0 leaves the digest unchanged.
 */
void rm_digest_update(RmDigest *digest, const unsigned char *data, RmOff size) {
    switch(digest->type) {
    case RM_DIGEST_MD5:
    case RM_DIGEST_SHA512:
    case RM_DIGEST_SHA256:
    case RM_DIGEST_SHA1:
        g_checksum_update(digest->glib_checksum, (const guchar *)data, size);
        break;
    case RM_DIGEST_SPOOKY32:
        digest->checksum[0].first = spooky_hash32(data, size, digest->checksum[0].first);
        break;
    case RM_DIGEST_SPOOKY64:
        digest->checksum[0].first = spooky_hash64(data, size, digest->checksum[0].first);
        break;
    case RM_DIGEST_SPOOKY:
        spooky_hash128(data, size, &digest->checksum[0].first, &digest->checksum[0].second);
        break;
    case RM_DIGEST_MURMUR512:
    case RM_DIGEST_MURMUR256:
    case RM_DIGEST_MURMUR:
        for (guint8 block = 0; block < ( digest->bytes / 16 ); block++) {
            /* Seed each block with (the low 32 bits of) its previous value. */
#if RM_PLATFORM_32
            MurmurHash3_x86_128(data, size,
                                (uint32_t)digest->checksum[block].first,
                                &digest->checksum[block]);
#elif RM_PLATFORM_64
            MurmurHash3_x64_128(data, size,
                                (uint32_t)digest->checksum[block].first,
                                &digest->checksum[block]);
#else
#error "Probably not a good idea to compile rmlint on 16bit."
#endif
        }
        break;
    case RM_DIGEST_CITY:
    case RM_DIGEST_CITY256:
    case RM_DIGEST_CITY512:
        for (guint8 block = 0; block < (digest->bytes / 16); block++) {
            /* Opt out for the more optimized version.
             * This needs the crc command of sse4.2
             * (available on Intel Nehalem and up; my amd box doesn't have this though)
             */
#if RM_PLATFORM_64 && HAVE_SSE42
            digest->checksum[block] = CityHashCrc128WithSeed((const char *)data, size, digest->checksum[block]);
#else
            digest->checksum[block] = CityHash128WithSeed((const char *) data, size, digest->checksum[block]);
#endif
        }
        break;
    case RM_DIGEST_BASTARD:
        /* Block 0: Murmur (always the x86 variant here), block 1: City. */
        MurmurHash3_x86_128(data, size, (uint32_t)digest->checksum[0].first, &digest->checksum[0]);
#if RM_PLATFORM_64 && HAVE_SSE42
        digest->checksum[1] = CityHashCrc128WithSeed((const char *)data, size, digest->checksum[1]);
#else
        digest->checksum[1] = CityHash128WithSeed((const char *) data, size, digest->checksum[1]);
#endif
        break;
    case RM_DIGEST_CUMULATIVE: {
        /* This is basically FNV1a, it is just important that the order of
         * adding data to the hash has no effect on the result, so it can
         * be used as a lookup key:
         *
         * http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
         *
         * The FNV state restarts from the offset basis on every call and the
         * resulting bytes are ADDED onto the checksum, which is what makes
         * the combination commutative.
         */
        if(size == 0) {
            /* Nothing to mix in - and `i % size` below would divide by
             * zero, which is undefined behaviour. */
            break;
        }

        const guint8 *input = (const guint8 *)data;
        guint8 *sum = (guint8 *)digest->checksum;
        RmOff hash = 0xcbf29ce484222325;

        /* Exactly digest->bytes steps: shorter inputs are cycled through,
         * longer inputs only contribute their first digest->bytes bytes. */
        for(gsize i = 0; i < digest->bytes; ++i) {
            hash ^= input[i % size];
            hash *= 0x100000001b3;
            sum[i] += hash;
        }
        break;
    }
    case RM_DIGEST_PARANOID:
        /* The checksum buffer stores the raw data itself; make sure the new
         * chunk still fits before appending it at the current offset. */
        g_assert(size + digest->paranoid_offset <= digest->bytes);
        memcpy((char *)digest->checksum + digest->paranoid_offset, data, size);
        digest->paranoid_offset += size;
        rm_digest_update(digest->shadow_hash, data, size);
        break;
    default:
        g_assert_not_reached();
    }
}