Ejemplo n.º 1
0
/**
 * Jaccard distance between the frames of two threads.
 *
 * Each thread is treated as a set of frames: a frame is counted only when
 * distance_jaccard_frames_contain() finds no equal frame after it in the
 * same thread, so repeated frames contribute once.
 *
 * @param thread1  First thread; its type must equal that of thread2.
 * @param thread2  Second thread.
 * @returns 1 - |intersection| / |union|, never below 0.0. When the union
 *          is empty (neither thread contributes a frame) 0.0 is returned.
 */
float
distance_jaccard(struct sr_thread *thread1,
                 struct sr_thread *thread2)
{
    assert(thread1->type == thread2->type);

    int unique1 = 0;
    int unique2 = 0;
    int common = 0;

    /* First pass: size of set 1 and of the intersection with thread2. */
    struct sr_frame *frame = sr_thread_frames(thread1);
    while (frame)
    {
        struct sr_frame *rest = sr_frame_next(frame);

        /* Only the last occurrence of a frame is counted. */
        if (!distance_jaccard_frames_contain(rest, frame))
        {
            ++unique1;

            if (distance_jaccard_frames_contain(sr_thread_frames(thread2),
                                                frame))
            {
                ++common;
            }
        }

        frame = rest;
    }

    /* Second pass: size of set 2, again counting last occurrences only. */
    frame = sr_thread_frames(thread2);
    while (frame)
    {
        struct sr_frame *rest = sr_frame_next(frame);

        if (!distance_jaccard_frames_contain(rest, frame))
            ++unique2;

        frame = rest;
    }

    int total = unique1 + unique2 - common;
    if (total == 0)
        return 0.0;

    float result = 1.0 - common / (float)total;
    if (result < 0.0)
        result = 0.0;

    return result;
}
Ejemplo n.º 2
0
/**
 * Build a new Python list with one wrapper object per frame of a thread.
 *
 * Frames are visited with sr_thread_frames() / sr_frame_next(). Each wrapper
 * is allocated with PyObject_New() from frame_type and stores a pointer to
 * the frame itself (the frame is not copied).
 *
 * @param thread      Thread whose frames are wrapped.
 * @param frame_type  Python type object used to allocate each wrapper.
 * @returns A new list on success. NULL on failure, in which case the
 *          partially built list is released before returning.
 */
PyObject *
frames_to_python_list(struct sr_thread *thread, PyTypeObject *frame_type)
{
    PyObject *result = PyList_New(0);
    if (!result)
        return NULL;

    for (struct sr_frame *frame = sr_thread_frames(thread);
         frame;
         frame = sr_frame_next(frame))
    {
        struct sr_py_base_frame *item =
            PyObject_New(struct sr_py_base_frame, frame_type);
        if (!item)
        {
            /* Do not leak the list built so far. */
            Py_DECREF(result);
            return PyErr_NoMemory();
        }

        /* XXX may need to initialize item further */
        /* It would be a good idea to have a common code that is executed each
         * time new object (e.g. via __new__ or _dup) is created so that we
         * don't miss setting some attribute. As opposed to using PyObject_New
         * directly. */
        item->frame = frame;

        /* PyList_Append() takes its own reference to item.
         * NOTE(review): the reference obtained from PyObject_New() is not
         * released here, neither on success nor on failure. Dropping it may
         * eventually run frame_type's deallocator, whose effect on
         * item->frame is not visible from this function -- confirm the
         * intended ownership before adding Py_DECREF(item). */
        if (PyList_Append(result, (PyObject *)item) < 0)
        {
            /* Do not leak the list built so far. */
            Py_DECREF(result);
            return NULL;
        }
    }

    return result;
}
Ejemplo n.º 3
0
/**
 * Tell whether a frame list contains a frame equal to the given one.
 *
 * @param haystack  First frame of the list to search; may be NULL, in which
 *                  case nothing is found.
 * @param needle    Frame to look for.
 * @returns true as soon as sr_frame_cmp_distance() reports 0 for some frame
 *          reachable from haystack via sr_frame_next(), false otherwise.
 */
static bool
distance_jaccard_frames_contain(struct sr_frame *haystack,
                                struct sr_frame *needle)
{
    for (struct sr_frame *candidate = haystack;
         candidate;
         candidate = sr_frame_next(candidate))
    {
        /* A zero result means the functions are the same but not both "??". */
        if (sr_frame_cmp_distance(candidate, needle) == 0)
            return true;
    }

    return false;
}
Ejemplo n.º 4
0
/**
 * Jaro-Winkler style score for the frame sequences of two threads.
 *
 * Frames are compared with sr_frame_cmp_distance(); a result of 0 counts as
 * equal. Two equal frames match only when their 1-based positions differ by
 * at most max(frame counts) / 2 - 1.
 *
 * @param thread1  First thread; its type must equal that of thread2.
 * @param thread2  Second thread.
 * @returns 1.0 when both threads have no frames, 0 when no frame of thread1
 *          finds a match in thread2, otherwise the Jaro value raised by a
 *          bonus proportional to the length of the common prefix (at most 4
 *          frames are taken into account).
 */
float
distance_jaro_winkler(struct sr_thread *thread1,
                      struct sr_thread *thread2)
{
    assert(thread1->type == thread2->type);

    int len1 = sr_thread_frame_count(thread1);
    int len2 = sr_thread_frame_count(thread2);

    if (len1 == 0 && len2 == 0)
        return 1.0;

    int longer = (len1 > len2) ? len1 : len2;

    /* Largest position difference at which two equal frames still match. */
    int window = longer / 2 - 1;

    int common_prefix = 0;
    bool prefix_intact = true;
    float transpositions = 0, matches = 0;

    int pos1 = 1;
    struct sr_frame *frame1 = sr_thread_frames(thread1);
    while (frame1)
    {
        bool found = false;

        /* Scan thread2 from its start until the first match for frame1. */
        int pos2 = 1;
        struct sr_frame *frame2 = sr_thread_frames(thread2);
        while (!found && frame2)
        {
            /* A differing frame at the same position ends the shared
             * prefix for good.
             */
            if (pos1 == pos2 && 0 != sr_frame_cmp_distance(frame1, frame2))
                prefix_intact = false;

            /* Equal frames match only inside the window; a match at a
             * different position is counted as a transposition.
             */
            if (abs(pos1 - pos2) <= window &&
                0 == sr_frame_cmp_distance(frame1, frame2))
            {
                found = true;
                if (pos1 != pos2)
                    ++transpositions;
            }

            frame2 = sr_frame_next(frame2);
            ++pos2;
        }

        if (prefix_intact)
            ++common_prefix;

        if (found)
            ++matches;

        frame1 = sr_frame_next(frame1);
        ++pos1;
    }

    /* Nothing matched: bail out before dividing by the match count. */
    if (0 == matches)
        return 0;

    transpositions /= 2;

    /* Only the first four frames of the prefix earn a bonus. */
    if (common_prefix > 4)
        common_prefix = 4;

    float jaro = (matches / (float)len1 +
                  matches / (float)len2 +
                  (matches - transpositions) / matches) / 3;

    /* Weight of the common prefix; must stay below 0.25 so that, with the
     * prefix capped at 4, the bonus factor cannot exceed 1.
     */
    float prefix_weight = 0.2;

    float score = jaro + (float)common_prefix * prefix_weight * (1 - jaro);
    return score;
}
Ejemplo n.º 5
0
/**
 * Normalized Levenshtein (edit) distance between the frame sequences of two
 * threads.
 *
 * Frames are compared with sr_frame_cmp_distance(); a result of 0 means the
 * two frames are treated as equal.
 *
 * @param thread1        First thread; its type must equal that of thread2.
 * @param thread2        Second thread.
 * @param transposition  When true, an extra candidate is considered for a
 *                       cell if the current frame of each thread equals the
 *                       previous frame of the other one (two adjacent frames
 *                       swapped).
 * @returns The edit distance divided by the larger of the two frame counts,
 *          or 0.0 when both threads have no frames.
 */
float
distance_levenshtein(struct sr_thread *thread1,
                     struct sr_thread *thread2,
                     bool transposition)
{
    assert(thread1->type == thread2->type);

    int frame_count1 = sr_thread_frame_count(thread1);
    int frame_count2 = sr_thread_frame_count(thread2);

    int max_frame_count = frame_count2;
    if (max_frame_count < frame_count1)
        max_frame_count = frame_count1;

    /* Avoid division by zero in case we get two empty threads */
    if (max_frame_count == 0)
        return 0.0;

    /* Dimensions of the (conceptual) distance matrix, including the
     * zero-length prefix row and column. */
    int m = frame_count1 + 1;
    int n = frame_count2 + 1;

    // store only two last rows and columns instead of whole 2D array
    /* Sanity checks on the sizes before allocating m + n + 1 elements. */
    SR_ASSERT(n <= SIZE_MAX - 1);
    SR_ASSERT(m <= SIZE_MAX - (n + 1));
    /* Both buffers are indexed by diagonal: the cell for position i in
     * thread1 and position j in thread2 lives at index m + j - i.
     * dist holds the most recent value of every diagonal; dist1 holds the
     * value dist had before the latest update of that diagonal. */
    int *dist = sr_malloc_array(sizeof(int), m + n + 1);
    int *dist1 = sr_malloc_array(sizeof(int), m + n + 1);

    // first row and column having distance equal to their position
    /* dist[0 .. m-1] = m .. 1 */
    for (int i = m; i > 0; --i)
        dist[m - i] = i;

    /* dist[m .. m+n] = 0 .. n */
    for (int i = 0; i <= n; ++i)
        dist[m + i] = i;

    struct sr_frame *curr_frame2 = sr_thread_frames(thread2);
    struct sr_frame *prev_frame = NULL;
    struct sr_frame *prev_frame2 = NULL;

    /* Outer loop walks thread2 (j), inner loop walks thread1 (i);
     * both positions are 1-based. */
    for (int j = 1; curr_frame2; ++j)
    {
        struct sr_frame *curr_frame = sr_thread_frames(thread1);
        for (int i = 1; curr_frame; ++i)
        {
            /* Diagonal slot of cell (i, j). */
            int l = m + j - i;

            /* Shift the history of this diagonal: dist2 becomes the value
             * from two updates ago, dist1[l] the value from the last one.
             * dist2 is only consulted below when i >= 2 && j >= 2. */
            int dist2 = dist1[l];
            dist1[l] = dist[l];

            int cost;

            /* Equal frames keep the distance of the previous cell on the
               same diagonal; "??" functions aren't taken as
               similar */
            if (0 == sr_frame_cmp_distance(curr_frame, curr_frame2))
                cost = 0;
            else
            {
                // different frames take the lowest of the diagonal value and
                // the two neighbouring slots (l - 1 and l + 1), plus one
                cost = 1;
                dist[l] += 1;
                if (dist[l] > dist[l - 1] + 1)
                    dist[l] = dist[l - 1] + 1;

                if (dist[l] > dist[l + 1] + 1)
                    dist[l] = dist[l + 1] + 1;
            }

            /* Check for a transposition of two adjacent frames in both
               directions, taking into account that "??" functions are not
               similar. If it applies and is cheaper, use dist2 + cost. */
            if (transposition &&
                (i >= 2 && j >= 2 && dist[l] > dist2 + cost &&
                 0 == sr_frame_cmp_distance(curr_frame, prev_frame2) &&
                 0 == sr_frame_cmp_distance(prev_frame, curr_frame2)))
            {
                dist[l] = dist2 + cost;
            }

            prev_frame = curr_frame;
            curr_frame = sr_frame_next(curr_frame);
        }

        prev_frame2 = curr_frame2;
        curr_frame2 = sr_frame_next(curr_frame2);
    }

    /* Slot n is the diagonal of the final cell (i = m - 1, j = n - 1). */
    int result = dist[n];
    free(dist);
    free(dist1);

    /* Normalize by the longer thread. */
    return (float)result / max_frame_count;
}