/**
 * Jaccard distance between the frame sets of two threads.
 *
 * Each thread is treated as a set: a frame that occurs again later in the
 * same thread is skipped, so only its last occurrence is counted. The
 * intersection is the number of counted frames of thread1 that are found
 * anywhere in thread2.
 *
 * @param thread1 First thread; must have the same type as thread2.
 * @param thread2 Second thread.
 * @return 1 - |intersection| / |union|, never below 0.0. Returns 0.0 when
 *         the union is empty.
 */
float
distance_jaccard(struct sr_thread *thread1, struct sr_thread *thread2)
{
    assert(thread1->type == thread2->type);

    int unique1 = 0;
    int unique2 = 0;
    int shared = 0;

    /* Count the distinct frames of thread1 and how many of them also
     * appear in thread2. */
    struct sr_frame *frame = sr_thread_frames(thread1);
    while (frame)
    {
        /* A frame is counted only when no equal frame follows it. */
        if (!distance_jaccard_frames_contain(sr_frame_next(frame), frame))
        {
            ++unique1;

            if (distance_jaccard_frames_contain(sr_thread_frames(thread2),
                                                frame))
            {
                ++shared;
            }
        }

        frame = sr_frame_next(frame);
    }

    /* Count the distinct frames of thread2 the same way. */
    frame = sr_thread_frames(thread2);
    while (frame)
    {
        if (!distance_jaccard_frames_contain(sr_frame_next(frame), frame))
        {
            ++unique2;
        }

        frame = sr_frame_next(frame);
    }

    int total = unique1 + unique2 - shared;
    if (total == 0)
    {
        return 0.0;
    }

    float distance = 1.0 - shared / (float)total;

    return (distance < 0.0) ? 0.0 : distance;
}
/**
 * Build a Python list of frame wrapper objects for every frame of a thread.
 *
 * One object of type frame_type is created per frame, in thread order, and
 * its `frame` member is pointed at the corresponding C frame (the frame is
 * not copied).
 *
 * @param thread     Thread whose frames are wrapped.
 * @param frame_type Python type used to allocate each wrapper object.
 * @return New list on success. NULL on failure; the partially built list is
 *         released before returning.
 */
PyObject *
frames_to_python_list(struct sr_thread *thread, PyTypeObject *frame_type)
{
    PyObject *result = PyList_New(0);
    if (!result)
    {
        return NULL;
    }

    for (struct sr_frame *frame = sr_thread_frames(thread);
         frame;
         frame = sr_frame_next(frame))
    {
        struct sr_py_base_frame *item =
            PyObject_New(struct sr_py_base_frame, frame_type);
        if (!item)
        {
            /* Do not leak the list built so far. */
            Py_DECREF(result);
            return PyErr_NoMemory();
        }

        /* XXX may need to initialize item further */
        /* It would be a good idea to have a common code that is executed each
         * time new object (e.g. via __new__ or _dup) is created so that we
         * don't miss setting some attribute. As opposed to using PyObject_New
         * directly.
         */
        item->frame = frame;

        if (PyList_Append(result, (PyObject *)item) < 0)
        {
            /* Do not leak the list built so far. */
            Py_DECREF(result);
            return NULL;
        }

        /* NOTE(review): the reference obtained from PyObject_New is not
         * released here (nor on the append-failure path above), although
         * PyList_Append takes its own reference. This is left unchanged on
         * purpose: releasing it could run the frame type's deallocator,
         * which is not visible from this function and may free a frame still
         * owned by the thread. Confirm against the deallocator before
         * changing. */
    }

    return result;
}
/**
 * Check whether a frame list contains a frame equal to `needle`.
 *
 * Equality is decided by sr_frame_cmp_distance() returning 0. Per the
 * original note in this file, that comparison treats functions as the same
 * only when they are not both unknown ("??").
 *
 * @param haystack First frame of the list to search; may be NULL.
 * @param needle   Frame to look for.
 * @return true if some frame from haystack onwards compares equal to needle,
 *         false otherwise (including an empty list).
 */
static bool
distance_jaccard_frames_contain(struct sr_frame *haystack,
                                struct sr_frame *needle)
{
    for (struct sr_frame *candidate = haystack;
         candidate;
         candidate = sr_frame_next(candidate))
    {
        if (sr_frame_cmp_distance(candidate, needle) == 0)
        {
            return true;
        }
    }

    return false;
}
/**
 * Jaro-Winkler score of two threads, computed over their frames.
 *
 * Frames are compared with sr_frame_cmp_distance(); a result of 0 means the
 * frames match.
 *
 * @param thread1 First thread; must have the same type as thread2.
 * @param thread2 Second thread.
 * @return 1.0 when both threads have no frames, 0 when no frame of thread1
 *         finds a match in thread2, otherwise the Jaro value raised by a
 *         bonus for a common prefix of up to four frames.
 */
float
distance_jaro_winkler(struct sr_thread *thread1, struct sr_thread *thread2)
{
    assert(thread1->type == thread2->type);

    int frame1_count = sr_thread_frame_count(thread1);
    int frame2_count = sr_thread_frame_count(thread2);

    if (frame1_count == 0 && frame2_count == 0)
    {
        return 1.0;
    }

    int longer = (frame2_count < frame1_count) ? frame1_count : frame2_count;

    /* Two frames may match only if their positions differ by at most this
     * many places. */
    int window = longer / 2 - 1;

    int prefix_len = 0;
    bool prefix_intact = true;
    float transpositions = 0;
    float matches = 0;

    int pos1 = 1;
    struct sr_frame *frame1 = sr_thread_frames(thread1);
    while (frame1)
    {
        bool matched = false;
        int pos2 = 1;
        struct sr_frame *frame2 = sr_thread_frames(thread2);

        /* Scan thread2 until the first acceptable match for frame1. */
        while (!matched && frame2)
        {
            /* Whether the prefix continues to be the same for both
             * threads or not. */
            if (pos1 == pos2 && sr_frame_cmp_distance(frame1, frame2) != 0)
            {
                prefix_intact = false;
            }

            /* A match requires the frames to be close enough to each other
             * and to compare equal (unknown functions are not paired, per
             * the comparison helper's contract noted in this file). */
            if (abs(pos1 - pos2) <= window &&
                sr_frame_cmp_distance(frame1, frame2) == 0)
            {
                matched = true;

                /* A match at a different position counts as a
                 * transposition. */
                if (pos1 != pos2)
                {
                    ++transpositions;
                }
            }

            frame2 = sr_frame_next(frame2);
            ++pos2;
        }

        if (prefix_intact)
        {
            ++prefix_len;
        }

        if (matched)
        {
            ++matches;
        }

        frame1 = sr_frame_next(frame1);
        ++pos1;
    }

    transpositions /= 2;

    /* Only the first four prefix frames earn a bonus. */
    if (prefix_len > 4)
    {
        prefix_len = 4;
    }

    /* Nothing matched: bail out so as not to divide by zero below. */
    if (matches == 0)
    {
        return 0;
    }

    float jaro = (matches / (float)frame1_count +
                  matches / (float)frame2_count +
                  (matches - transpositions) / matches) / 3;

    /* Weight given to a common prefix. It must stay below 0.25 so that,
     * with at most four prefix frames, the bonus factor stays below 1. */
    float prefix_weight = 0.2;

    float score = jaro + (float)prefix_len * prefix_weight * (1 - jaro);

    return score;
}
/**
 * Normalized Levenshtein distance between the frame sequences of two
 * threads, optionally allowing transposition of two adjacent frames.
 *
 * Frames are compared with sr_frame_cmp_distance(); a result of 0 means the
 * frames are considered the same.
 *
 * @param thread1       First thread; must have the same type as thread2.
 * @param thread2       Second thread.
 * @param transposition When true, swapping two adjacent frames is also
 *                      considered as an edit.
 * @return The edit distance divided by the frame count of the longer thread,
 *         or 0.0 when both threads have no frames.
 */
float distance_levenshtein(struct sr_thread *thread1, struct sr_thread *thread2, bool transposition)
{
    assert(thread1->type == thread2->type);

    int frame_count1 = sr_thread_frame_count(thread1);
    int frame_count2 = sr_thread_frame_count(thread2);

    /* The longer of the two frame counts is the normalization divisor. */
    int max_frame_count = frame_count2;
    if (max_frame_count < frame_count1) max_frame_count = frame_count1;

    /* Avoid division by zero in case we get two empty threads */
    if (max_frame_count == 0) return 0.0;

    /* Dimensions of the conceptual distance table D(i, j), where i indexes
     * thread1 (0 .. frame_count1) and j indexes thread2 (0 .. frame_count2). */
    int m = frame_count1 + 1;
    int n = frame_count2 + 1;

    /* The full 2D table is not stored. Instead, dist[] keeps one value per
     * diagonal, indexed by l = m + j - i: the most recently computed cell of
     * that diagonal. dist1[] keeps the value each diagonal held one step
     * earlier, which the transposition check needs. */
    /* Sanity checks on the element count (m + n + 1) used for both buffers. */
    SR_ASSERT(n <= SIZE_MAX - 1);
    SR_ASSERT(m <= SIZE_MAX - (n + 1));
    int *dist = sr_malloc_array(sizeof(int), m + n + 1);
    /* dist1 is not initialized in this function; its entries are written
     * (dist1[l] = dist[l]) before their old value is actually used, because
     * dist2 is only consulted when i >= 2 && j >= 2. */
    int *dist1 = sr_malloc_array(sizeof(int), m + n + 1);

    /* Border cells: D(i, 0) = i lives at dist[m - i] and D(0, j) = j lives
     * at dist[m + j]. */
    for (int i = m; i > 0; --i) dist[m - i] = i;
    for (int i = 0; i <= n; ++i) dist[m + i] = i;

    struct sr_frame *curr_frame2 = sr_thread_frames(thread2);
    /* Frames visited in the previous inner/outer iteration; they are only
     * dereferenced by the transposition check once i >= 2 && j >= 2. */
    struct sr_frame *prev_frame = NULL;
    struct sr_frame *prev_frame2 = NULL;

    /* j walks the frames of thread2, i walks the frames of thread1. */
    for (int j = 1; curr_frame2; ++j)
    {
        struct sr_frame *curr_frame = sr_thread_frames(thread1);
        for (int i = 1; curr_frame; ++i)
        {
            /* Diagonal of cell (i, j). Before the update below:
             *   dist[l]     holds D(i - 1, j - 1),
             *   dist[l - 1] holds D(i, j - 1),
             *   dist[l + 1] holds D(i - 1, j). */
            int l = m + j - i;

            /* Rotate the diagonal history: dist2 receives the value this
             * diagonal held two steps back, D(i - 2, j - 2), and dist1[l]
             * now remembers D(i - 1, j - 1). */
            int dist2 = dist1[l];
            dist1[l] = dist[l];

            int cost;

            /* Equal frames keep the diagonal value unchanged; "??" functions
             * aren't taken as similar. */
            if (0 == sr_frame_cmp_distance(curr_frame, curr_frame2)) cost = 0;
            else
            {
                /* Different frames: one more than the smallest of the three
                 * neighbouring cells (diagonal, left, above). */
                cost = 1;
                dist[l] += 1;
                if (dist[l] > dist[l - 1] + 1) dist[l] = dist[l - 1] + 1;
                if (dist[l] > dist[l + 1] + 1) dist[l] = dist[l + 1] + 1;
            }

            /* Transposition: if the current and previous frames of the two
             * threads match crosswise, the cell may instead be reached from
             * D(i - 2, j - 2) at the price of `cost`. "??" functions are not
             * treated as similar here either. */
            if (transposition && (i >= 2 && j >= 2 && dist[l] > dist2 + cost && 0 == sr_frame_cmp_distance(curr_frame, prev_frame2) && 0 == sr_frame_cmp_distance(prev_frame, curr_frame2)))
            {
                dist[l] = dist2 + cost;
            }

            prev_frame = curr_frame;
            curr_frame = sr_frame_next(curr_frame);
        }

        prev_frame2 = curr_frame2;
        curr_frame2 = sr_frame_next(curr_frame2);
    }

    /* The final cell D(frame_count1, frame_count2) lies on diagonal
     * m + (n - 1) - (m - 1) = n. */
    int result = dist[n];

    free(dist);
    free(dist1);

    return (float)result / max_frame_count;
}