// Match.new needle, string, options = {} VALUE CommandTMatch_initialize(int argc, VALUE *argv, VALUE self) { // process arguments: 2 mandatory, 1 optional VALUE str, needle, options; if (rb_scan_args(argc, argv, "21", &str, &needle, &options) == 2) options = Qnil; str = StringValue(str); needle = StringValue(needle); // already downcased by caller // check optional options hash for overrides VALUE always_show_dot_files = CommandT_option_from_hash("always_show_dot_files", options); VALUE never_show_dot_files = CommandT_option_from_hash("never_show_dot_files", options); matchinfo_t m; m.haystack_p = RSTRING_PTR(str); m.haystack_len = RSTRING_LEN(str); m.needle_p = RSTRING_PTR(needle); m.needle_len = RSTRING_LEN(needle); m.max_score_per_char = (1.0 / m.haystack_len + 1.0 / m.needle_len) / 2; m.dot_file = 0; m.always_show_dot_files = always_show_dot_files == Qtrue; m.never_show_dot_files = never_show_dot_files == Qtrue; // calculate score double score = 1.0; // special case for zero-length search string if (m.needle_len == 0) { // filter out dot files if (!m.always_show_dot_files) { for (long i = 0; i < m.haystack_len; i++) { char c = m.haystack_p[i]; if (c == '.' && (i == 0 || m.haystack_p[i - 1] == '/')) { score = 0.0; break; } } } } else if (m.haystack_len > 0) { // normal case // prepare for memoization double memo[m.haystack_len * m.needle_len]; for (long i = 0, max = m.haystack_len * m.needle_len; i < max; i++) memo[i] = DBL_MAX; m.memo = memo; score = recursive_match(&m, 0, 0, 0, 0.0); } // clean-up and final book-keeping rb_iv_set(self, "@score", rb_float_new(score)); rb_iv_set(self, "@str", str); return Qnil; }
void calculate_match(VALUE str, VALUE needle, VALUE case_sensitive, VALUE always_show_dot_files, VALUE never_show_dot_files, VALUE recurse, match_t *out) { long i, max; double score; matchinfo_t m; m.haystack_p = RSTRING_PTR(str); m.haystack_len = RSTRING_LEN(str); m.needle_p = RSTRING_PTR(needle); m.needle_len = RSTRING_LEN(needle); m.max_score_per_char = (1.0 / m.haystack_len + 1.0 / m.needle_len) / 2; m.always_show_dot_files = always_show_dot_files == Qtrue; m.never_show_dot_files = never_show_dot_files == Qtrue; m.case_sensitive = case_sensitive; m.recurse = recurse == Qtrue; // calculate score score = 1.0; // special case for zero-length search string if (m.needle_len == 0) { // filter out dot files if (!m.always_show_dot_files) { for (i = 0; i < m.haystack_len; i++) { char c = m.haystack_p[i]; if (c == '.' && (i == 0 || m.haystack_p[i - 1] == '/')) { score = 0.0; break; } } } } else if (m.haystack_len > 0) { // normal case // prepare for memoization double memo[m.haystack_len * m.needle_len]; for (i = 0, max = m.haystack_len * m.needle_len; i < max; i++) memo[i] = DBL_MAX; m.memo = memo; score = recursive_match(&m, 0, 0, 0, 0.0); } // final book-keeping out->path = str; out->score = score; }
// Scores the needle (from needle_idx onwards) against the haystack (from
// haystack_idx onwards) and returns the resulting cumulative score; 0.0
// means "no match" (or a dot-file that must be hidden).
//
// Each needle character is matched at the first haystack position where it
// occurs; whenever a match is found and haystack remains, the function also
// recurses to try matching the same needle character further right, and the
// better of the two scores wins.
//
// Results are cached in m->memo, which must hold
// m->haystack_len * m->needle_len entries pre-filled with DBL_MAX.
double recursive_match(matchinfo_t *m,    // sharable meta-data
                       long haystack_idx, // where in the path string to start
                       long needle_idx,   // where in the needle string to start
                       long last_idx,     // location of last matched character
                       double score)      // cumulative score so far
{
    double score_for_char;
    double seen_score = 0;  // remember best score seen via recursion
    int found;
    long i, j, distance;

    // One memo slot per (needle_idx, haystack_idx) pair. The row stride has
    // to be the haystack length: with a stride of needle_len, distinct pairs
    // such as (0, needle_len) and (1, 0) would share a slot and return each
    // other's cached score. Computed up front because haystack_idx is
    // advanced by the loops below.
    long memo_idx = needle_idx * m->haystack_len + haystack_idx;

    // do we have a memoized result we can return?
    double memoized = m->memo[memo_idx];
    if (memoized != DBL_MAX)
        return memoized;

    // bail early if not enough room (left) in haystack for (rest of) needle
    if (m->haystack_len - haystack_idx < m->needle_len - needle_idx) {
        score = 0.0;
        goto memoize;
    }

    for (i = needle_idx; i < m->needle_len; i++) {
        char c = m->needle_p[i];
        found = 0;

        // similar to above, we'll stop iterating when we know we're too close
        // to the end of the string to possibly match
        for (j = haystack_idx;
             j <= m->haystack_len - (m->needle_len - i);
             j++, haystack_idx++) {
            char d = m->haystack_p[j];
            if (d == '.') {
                if (j == 0 || m->haystack_p[j - 1] == '/') { // this is a dot-file
                    int dot_search = (i == 0 && c == '.'); // searching for a dot
                    if (m->never_show_dot_files ||
                        (!dot_search && !m->always_show_dot_files)) {
                        score = 0.0;
                        goto memoize;
                    }
                }
            } else if (d >= 'A' && d <= 'Z' && !m->case_sensitive) {
                d += 'a' - 'A'; // add 32 to downcase
            }

            if (c == d) {
                found = 1;

                // calculate score
                score_for_char = m->max_score_per_char;
                distance = j - last_idx;
                if (distance > 1) {
                    double factor = 1.0;
                    char last = m->haystack_p[j - 1];
                    char curr = m->haystack_p[j]; // case matters, so get again
                    if (last == '/') {
                        factor = 0.9;
                    } else if (last == '-' ||
                               last == '_' ||
                               last == ' ' ||
                               (last >= '0' && last <= '9')) {
                        factor = 0.8;
                    } else if (last >= 'a' && last <= 'z' &&
                               curr >= 'A' && curr <= 'Z') {
                        factor = 0.8;
                    } else if (last == '.') {
                        factor = 0.7;
                    } else {
                        // if no "special" chars behind char, factor diminishes
                        // as distance from last matched char increases
                        factor = (1.0 / distance) * 0.75;
                    }
                    score_for_char *= factor;
                }

                if (++j < m->haystack_len) {
                    // bump cursor one char to the right and
                    // use recursion to try and find a better match
                    double sub_score = recursive_match(m, j, i, last_idx, score);
                    if (sub_score > seen_score)
                        seen_score = sub_score;
                }

                score += score_for_char;
                last_idx = haystack_idx++;
                break;
            }
        }

        if (!found) {
            score = 0.0;
            goto memoize;
        }
    }

    score = score > seen_score ? score : seen_score;

memoize:
    m->memo[memo_idx] = score;
    return score;
}
void match_stdin(char *abbrev) { char *line = 0; size_t read = 0; size_t max_line_len = 0; size_t current_line_len = 0; size_t results_buf_len = 10000; matchresult_t *results_buf = malloc(results_buf_len * sizeof(matchresult_t)); size_t results_count = 0; while ((read = getline(&line, ¤t_line_len, stdin)) != -1) { matchinfo_t matchinfo = { line, strlen(line), abbrev, strlen(abbrev), 1.0, 0, 1, 0 }; double score = 0; char *line_copy = 0; // chomp newline at the end of filename line[read - 1] = 0; // if getline call expanded line buffer, remember new buffer size if (current_line_len > max_line_len) max_line_len = current_line_len; // next iteration of while will reuse line buffer current_line_len = max_line_len; // do match score = recursive_match(&matchinfo, 0, 0, 0, 0.0); // record result only if score is higher than 0 if (score > 0.0) { // realloc results_buf if needed if (results_buf_len == results_count) { results_buf = realloc(results_buf, (results_buf_len *= 2) * sizeof(matchresult_t)); } // make copy of line line_copy = malloc(read + 1); memcpy(line_copy, line, read + 1); // add result to results_buf results_buf[results_count].line = line_copy; results_buf[results_count].score = score; results_count++; } } // Sorting results qsort(results_buf, results_count, sizeof(matchresult_t), matchresult_comp_func); // Print sorted results for (size_t i = 0; i < results_count; i++) printf("%f: %s\n", results_buf[i].score, results_buf[i].line); // Cleanup free(line); for (size_t i = 0; i < results_count; i++) free(results_buf[i].line); free(results_buf); }
// Scores the needle (from needle_idx onwards) against the haystack (from
// haystack_idx onwards) and returns the cumulative score.
//
// For each needle character, the haystack is scanned up to
// m->rightmost_match_p[i]. When m->recurse is set, each match also triggers
// a recursive attempt to match the same needle character further right, and
// the better score is kept. Results are cached in m->memo, indexed by
// (haystack position * needle_len + needle position); UNSET_SCORE marks an
// empty slot.
float recursive_match(
    matchinfo_t *m,    // Sharable meta-data.
    long haystack_idx, // Where in the path string to start.
    long needle_idx,   // Where in the needle string to start.
    long last_idx,     // Location of last matched character.
    float score        // Cumulative score so far.
) {
    long distance, i, j;
    float *memoized = NULL;
    float score_for_char;
    float seen_score = 0;

    // Iterate over needle.
    for (i = needle_idx; i < m->needle_len; i++) {
        // Iterate over (valid range of) haystack.
        for (j = haystack_idx; j <= m->rightmost_match_p[i]; j++) {
            char c, d;

            // Do we have a memoized result we can return?
            memoized = &m->memo[j * m->needle_len + i];
            if (*memoized != UNSET_SCORE) {
                return *memoized > seen_score ? *memoized : seen_score;
            }
            c = m->needle_p[i];
            d = m->haystack_p[j];
            if (d == '.') {
                if (j == 0 || m->haystack_p[j - 1] == '/') { // This is a dot-file.
                    int dot_search = c == '.'; // Searching for a dot.
                    if (
                        m->never_show_dot_files ||
                        (!dot_search && !m->always_show_dot_files)
                    ) {
                        return *memoized = 0.0;
                    }
                }
            } else if (d >= 'A' && d <= 'Z' && !m->case_sensitive) {
                d += 'a' - 'A'; // Add 32 to downcase.
            }

            if (c == d) {
                // Calculate score.
                float sub_score = 0;
                score_for_char = m->max_score_per_char;
                distance = j - last_idx;

                if (distance > 1) {
                    float factor = 1.0;
                    char last = m->haystack_p[j - 1];
                    char curr = m->haystack_p[j]; // Case matters, so get again.
                    if (last == '/') {
                        factor = 0.9;
                    } else if (
                        last == '-' ||
                        last == '_' ||
                        last == ' ' ||
                        (last >= '0' && last <= '9')
                    ) {
                        factor = 0.8;
                    } else if (
                        last >= 'a' && last <= 'z' &&
                        curr >= 'A' && curr <= 'Z'
                    ) {
                        factor = 0.8;
                    } else if (last == '.') {
                        factor = 0.7;
                    } else {
                        // If no "special" chars behind char, factor diminishes
                        // as distance from last matched char increases.
                        factor = (1.0 / distance) * 0.75;
                    }
                    score_for_char *= factor;
                }

                if (j < m->rightmost_match_p[i] && m->recurse) {
                    sub_score = recursive_match(m, j + 1, i, last_idx, score);
                    if (sub_score > seen_score) {
                        seen_score = sub_score;
                    }
                }
                last_idx = j;
                haystack_idx = last_idx + 1;
                score += score_for_char;
                *memoized = seen_score > score ? seen_score : score;
                if (i == m->needle_len - 1) {
                    // Whole string matched.
                    return *memoized;
                }
                if (!m->recurse) {
                    // Without recursion nothing has filled the memo slots to
                    // the right, so carrying on would match this same needle
                    // character again at every later occurrence and add its
                    // score each time. Move on to the next needle character.
                    break;
                }
            }
        }
    }
    return *memoized = score;
}
// Scores `haystack` against `needle` and returns the score (0.0 = no match).
//
// - Empty needle: returns 1.0, or 0.0 when the haystack has a path component
//   starting with '.' and dot-files are not to be shown.
// - Otherwise the haystack is pre-scanned from the right to (a) reject it
//   outright if the needle cannot match, (b) record the rightmost position at
//   which each needle character can match, and (c) compute a letter bitmask
//   for the haystack; then recursive_match() produces the score.
//
// `haystack_bitmask` is in/out: if it holds UNSET_BITMASK on entry it is
// filled in with the computed mask; otherwise it is used, together with
// `needle_bitmask`, to reject haystacks lacking a letter the needle has.
float calculate_match(
    VALUE haystack,
    VALUE needle,
    VALUE case_sensitive,
    VALUE always_show_dot_files,
    VALUE never_show_dot_files,
    VALUE recurse,
    long needle_bitmask,
    long *haystack_bitmask
) {
    matchinfo_t m;
    long i;
    float score             = 1.0;
    int compute_bitmasks    = *haystack_bitmask == UNSET_BITMASK;
    m.haystack_p            = RSTRING_PTR(haystack);
    m.haystack_len          = RSTRING_LEN(haystack);
    m.needle_p              = RSTRING_PTR(needle);
    m.needle_len            = RSTRING_LEN(needle);
    m.rightmost_match_p     = NULL;
    m.max_score_per_char    = (1.0 / m.haystack_len + 1.0 / m.needle_len) / 2;
    m.always_show_dot_files = always_show_dot_files == Qtrue;
    m.never_show_dot_files  = never_show_dot_files == Qtrue;
    m.case_sensitive        = (int)case_sensitive;
    m.recurse               = recurse == Qtrue;

    // Special case for zero-length search string.
    if (m.needle_len == 0) {
        // Filter out dot files.
        if (m.never_show_dot_files || !m.always_show_dot_files) {
            for (i = 0; i < m.haystack_len; i++) {
                char c = m.haystack_p[i];
                if (c == '.' && (i == 0 || m.haystack_p[i - 1] == '/')) {
                    return 0.0;
                }
            }
        }
    } else {
        long haystack_limit;
        long memo_size;
        long needle_idx;
        long mask;
        long rightmost_match_p[m.needle_len];

        if (*haystack_bitmask != UNSET_BITMASK) {
            if ((needle_bitmask & *haystack_bitmask) != needle_bitmask) {
                return 0.0;
            }
        }

        // Pre-scan string:
        // - Bail if it can't match at all.
        // - Record rightmost match for each character (prune search space).
        // - Record bitmask for haystack to speed up future searches.
        m.rightmost_match_p = rightmost_match_p;
        needle_idx = m.needle_len - 1;
        mask = 0;
        for (i = m.haystack_len - 1; i >= 0; i--) {
            char c = m.haystack_p[i];
            char lower = c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c;
            if (!m.case_sensitive) {
                c = lower;
            }
            // Only letters get a bit (bit 0 = 'a' ... bit 25 = 'z'). Any
            // other byte ('/', '.', digits, ...) would make the shift count
            // negative or too large, which is undefined behavior.
            // NOTE(review): assumes needle_bitmask is likewise built from
            // letters only -- confirm against the code that computes it.
            if (compute_bitmasks && lower >= 'a' && lower <= 'z') {
                mask |= (1L << (lower - 'a'));
            }

            if (needle_idx >= 0) {
                char d = m.needle_p[needle_idx];
                if (c == d) {
                    rightmost_match_p[needle_idx] = i;
                    needle_idx--;
                }
            }
        }
        if (compute_bitmasks) {
            *haystack_bitmask = mask;
        }
        if (needle_idx != -1) {
            // Some needle character was never found: no match possible.
            return 0.0;
        }

        // Prepare for memoization. Nothing to the right of the last needle
        // character's rightmost match is ever visited, so the table only
        // needs to cover haystack positions up to there.
        haystack_limit = rightmost_match_p[m.needle_len - 1] + 1;
        memo_size = m.needle_len * haystack_limit;
        {
            float memo[memo_size];
            for (i = 0; i < memo_size; i++) {
                memo[i] = UNSET_SCORE;
            }
            m.memo = memo;
            score = recursive_match(&m, 0, 0, 0, 0.0);
        }
    }
    return score;
}