Beispiel #1
0
// Match.new string, needle, options = {}
//
// Scores `string` against `needle` and records the outcome on the receiver:
// @score is set to the score (as a Float) and @str to the string. The
// optional options hash is queried through CommandT_option_from_hash for
// "always_show_dot_files" and "never_show_dot_files". Always returns nil.
VALUE CommandTMatch_initialize(int argc, VALUE *argv, VALUE self)
{
    // process arguments: 2 mandatory (string first, then needle), 1 optional
    VALUE str, needle, options;
    if (rb_scan_args(argc, argv, "21", &str, &needle, &options) == 2)
        options = Qnil;
    str    = StringValue(str);
    needle = StringValue(needle); // already downcased by caller

    // check optional options hash for overrides
    VALUE always_show_dot_files = CommandT_option_from_hash("always_show_dot_files", options);
    VALUE never_show_dot_files = CommandT_option_from_hash("never_show_dot_files", options);

    matchinfo_t m;
    m.haystack_p            = RSTRING_PTR(str);
    m.haystack_len          = RSTRING_LEN(str);
    m.needle_p              = RSTRING_PTR(needle);
    m.needle_len            = RSTRING_LEN(needle);
    m.max_score_per_char    = 0.0; // real value is computed below, once both lengths are non-zero
    m.dot_file              = 0;
    m.always_show_dot_files = always_show_dot_files == Qtrue;
    m.never_show_dot_files  = never_show_dot_files == Qtrue;

    // calculate score; an empty needle matches everything with score 1.0
    double score = 1.0;

    // special case for zero-length search string
    if (m.needle_len == 0) {

        // filter out dot files; never_show_dot_files wins over
        // always_show_dot_files so the empty-needle case cannot reveal a
        // dot file that "never show" asked to hide
        if (m.never_show_dot_files || !m.always_show_dot_files) {
            for (long i = 0; i < m.haystack_len; i++) {
                char c = m.haystack_p[i];

                // a '.' at the start of the string or right after a '/'
                // marks a dot-file path component
                if (c == '.' && (i == 0 || m.haystack_p[i - 1] == '/')) {
                    score = 0.0;
                    break;
                }
            }
        }
    } else if (m.haystack_len > 0) { // normal case

        // both lengths are non-zero here, so neither division is by zero
        m.max_score_per_char = (1.0 / m.haystack_len + 1.0 / m.needle_len) / 2;

        // prepare for memoization: one slot per (needle, haystack) position
        // pair, with DBL_MAX meaning "not computed yet".
        // NOTE(review): this is a stack-allocated VLA whose size grows with
        // haystack_len * needle_len; very long inputs could exhaust the stack.
        double memo[m.haystack_len * m.needle_len];
        for (long i = 0, max = m.haystack_len * m.needle_len; i < max; i++)
            memo[i] = DBL_MAX;
        m.memo = memo;

        score = recursive_match(&m, 0, 0, 0, 0.0);
    }
    // an empty haystack with a non-empty needle falls through with score 1.0

    // clean-up and final book-keeping
    rb_iv_set(self, "@score", rb_float_new(score));
    rb_iv_set(self, "@str", str);
    return Qnil;
}
Beispiel #2
0
// Scores `str` against `needle` and stores the result in `out`:
// out->path receives `str` and out->score the computed score.
//
// str, needle:            Ruby strings; their bytes are read via RSTRING_PTR/RSTRING_LEN.
// case_sensitive:         copied as-is into the match info.
// always_show_dot_files,
// never_show_dot_files,
// recurse:                treated as true only when equal to Qtrue.
//
// An empty needle scores 1.0 unless the path contains a dot-file component
// that has to be hidden, in which case the score is 0.0.
void calculate_match(VALUE str,
                     VALUE needle,
                     VALUE case_sensitive,
                     VALUE always_show_dot_files,
                     VALUE never_show_dot_files,
                     VALUE recurse,
                     match_t *out)
{
    long i, max;
    double score;
    matchinfo_t m;
    m.haystack_p            = RSTRING_PTR(str);
    m.haystack_len          = RSTRING_LEN(str);
    m.needle_p              = RSTRING_PTR(needle);
    m.needle_len            = RSTRING_LEN(needle);
    m.max_score_per_char    = 0.0; // real value is computed below, once both lengths are non-zero
    m.always_show_dot_files = always_show_dot_files == Qtrue;
    m.never_show_dot_files  = never_show_dot_files == Qtrue;
    m.case_sensitive        = case_sensitive;
    m.recurse               = recurse == Qtrue;

    // calculate score
    score = 1.0;

    // special case for zero-length search string
    if (m.needle_len == 0) {

        // filter out dot files; never_show_dot_files takes precedence over
        // always_show_dot_files, matching the rule recursive_match applies
        // for non-empty needles
        if (m.never_show_dot_files || !m.always_show_dot_files) {
            for (i = 0; i < m.haystack_len; i++) {
                char c = m.haystack_p[i];

                // a '.' at the start of the string or right after a '/'
                // marks a dot-file path component
                if (c == '.' && (i == 0 || m.haystack_p[i - 1] == '/')) {
                    score = 0.0;
                    break;
                }
            }
        }
    } else if (m.haystack_len > 0) { // normal case

        // both lengths are non-zero here, so neither division is by zero
        m.max_score_per_char = (1.0 / m.haystack_len + 1.0 / m.needle_len) / 2;

        // prepare for memoization: one slot per (needle, haystack) position
        // pair, with DBL_MAX meaning "not computed yet".
        // NOTE(review): this is a stack-allocated VLA whose size grows with
        // haystack_len * needle_len; very long inputs could exhaust the stack.
        double memo[m.haystack_len * m.needle_len];
        for (i = 0, max = m.haystack_len * m.needle_len; i < max; i++)
            memo[i] = DBL_MAX;
        m.memo = memo;

        score = recursive_match(&m, 0, 0, 0, 0.0);
    }
    // an empty haystack with a non-empty needle falls through with score 1.0

    // final book-keeping
    out->path  = str;
    out->score = score;
}
Beispiel #3
0
// Scores how well the needle, from needle_idx onwards, matches the haystack,
// from haystack_idx onwards, and returns the resulting cumulative score.
//
// Each needle character is matched against the first suitable haystack
// character; after every match the function also recurses one position to
// the right to look for a better-scoring alternative, and the larger of the
// two scores is returned. A return value of 0.0 means the needle could not
// be matched, or that the path contains a dot-file component which must be
// hidden. Results are cached in m->memo, where DBL_MAX marks an empty slot.
double recursive_match(matchinfo_t *m,    // sharable meta-data
                       long haystack_idx, // where in the path string to start
                       long needle_idx,   // where in the needle string to start
                       long last_idx,     // location of last matched character
                       double score)      // cumulative score so far
{
    double score_for_char;
    double seen_score = 0;  // remember best score seen via recursion
    int found;
    long i, j, distance;

    // Memo slot for this (needle_idx, haystack_idx) pair, computed up front
    // because haystack_idx is advanced by the loops below. Rows are needle
    // positions and each row is haystack_len wide, so distinct pairs never
    // share a slot and the largest index is haystack_len * needle_len - 1.
    long memo_idx = needle_idx * m->haystack_len + haystack_idx;

    // do we have a memoized result we can return?
    double memoized = m->memo[memo_idx];
    if (memoized != DBL_MAX)
        return memoized;

    // bail early if not enough room (left) in haystack for (rest of) needle
    if (m->haystack_len - haystack_idx < m->needle_len - needle_idx) {
        score = 0.0;
        goto memoize;
    }

    for (i = needle_idx; i < m->needle_len; i++) {
        char c = m->needle_p[i];
        found = 0;

        // similar to above, we'll stop iterating when we know we're too close
        // to the end of the string to possibly match
        for (j = haystack_idx;
             j <= m->haystack_len - (m->needle_len - i);
             j++, haystack_idx++) {
            char d = m->haystack_p[j];
            if (d == '.') {
                if (j == 0 || m->haystack_p[j - 1] == '/') { // this is a dot-file
                    int dot_search = (i == 0 && c == '.'); // searching for a dot
                    if (m->never_show_dot_files || (!dot_search && !m->always_show_dot_files)) {
                        score = 0.0;
                        goto memoize;
                    }
                }
            } else if (d >= 'A' && d <= 'Z' && !m->case_sensitive) {
                d += 'a' - 'A'; // add 32 to downcase
            }

            if (c == d) {
                found = 1;

                // calculate score: a match directly after the previous one
                // earns the full per-character score, anything further away
                // is scaled down by a factor chosen from the preceding char
                score_for_char = m->max_score_per_char;
                distance = j - last_idx;

                if (distance > 1) {
                    double factor = 1.0;
                    char last = m->haystack_p[j - 1];
                    char curr = m->haystack_p[j]; // case matters, so get again
                    if (last == '/')
                        factor = 0.9;
                    else if (last == '-' ||
                            last == '_' ||
                            last == ' ' ||
                            (last >= '0' && last <= '9'))
                        factor = 0.8;
                    else if (last >= 'a' && last <= 'z' &&
                            curr >= 'A' && curr <= 'Z')
                        factor = 0.8;
                    else if (last == '.')
                        factor = 0.7;
                    else
                        // if no "special" chars behind char, factor diminishes
                        // as distance from last matched char increases
                        factor = (1.0 / distance) * 0.75;
                    score_for_char *= factor;
                }

                if (++j < m->haystack_len) {
                    // bump cursor one char to the right and
                    // use recursion to try and find a better match
                    double sub_score = recursive_match(m, j, i, last_idx, score);
                    if (sub_score > seen_score)
                        seen_score = sub_score;
                }

                score += score_for_char;
                last_idx = haystack_idx++;
                break;
            }
        }
        if (!found) {
            score = 0.0;
            goto memoize;
        }
    }

    // prefer whichever is better: the greedy match or the best alternative
    score = score > seen_score ? score : seen_score;

memoize:
    m->memo[memo_idx] = score;
    return score;
}
Beispiel #4
0
/*
 * Reads candidate lines from stdin, scores each one against `abbrev` with
 * recursive_match(), and prints every line whose score is greater than zero
 * as "<score>: <line>", in the order produced by sorting with
 * matchresult_comp_func.
 *
 * abbrev: NUL-terminated search string.
 *
 * If memory runs out, a message is written to stderr and no results are
 * printed; everything allocated here is released before returning.
 */
void match_stdin(char *abbrev) {
  char   *line          = NULL; // buffer allocated and grown by getline()
  size_t  line_capacity = 0;    // size of that buffer, maintained by getline()
  size_t  nread         = 0;
  size_t  abbrev_len    = strlen(abbrev); // invariant across all lines

  size_t         results_buf_len = 10000;
  size_t         results_count   = 0;
  matchresult_t *results_buf     = malloc(results_buf_len * sizeof *results_buf);
  int            out_of_memory   = 0;

  if (!results_buf) {
    fputs("match_stdin: out of memory\n", stderr);
    return;
  }

  // getline() reports end-of-input or an error as -1
  while ((nread = getline(&line, &line_capacity, stdin)) != (size_t)-1) {
    double  score     = 0;
    char   *line_copy = NULL;

    // chomp the newline at the end of the filename, but only if there is
    // one: the last line of input may end without a newline
    if (nread > 0 && line[nread - 1] == '\n')
      line[--nread] = 0;

    // built after chomping so the haystack length excludes the newline
    matchinfo_t matchinfo = {
      line,
      strlen(line),
      abbrev,
      abbrev_len,
      1.0, 0, 1, 0
    };

    // do match
    score = recursive_match(&matchinfo, 0, 0, 0, 0.0);

    // record result only if score is higher than 0
    if (score > 0.0) {
      // grow results_buf if needed, keeping the old pointer valid on failure
      if (results_buf_len == results_count) {
        size_t         grown_len = results_buf_len * 2;
        matchresult_t *grown     = realloc(results_buf, grown_len * sizeof *grown);
        if (!grown) {
          out_of_memory = 1;
          break;
        }
        results_buf     = grown;
        results_buf_len = grown_len;
      }

      // make copy of line (including its terminating NUL), because the
      // next getline() call reuses the line buffer
      line_copy = malloc(nread + 1);
      if (!line_copy) {
        out_of_memory = 1;
        break;
      }
      memcpy(line_copy, line, nread + 1);

      // add result to results_buf
      results_buf[results_count].line = line_copy;
      results_buf[results_count].score = score;
      results_count++;
    }
  }

  if (out_of_memory) {
    fputs("match_stdin: out of memory\n", stderr);
  } else {
    // Sorting results
    qsort(results_buf, results_count, sizeof(matchresult_t), matchresult_comp_func);

    // Print sorted results
    for (size_t i = 0; i < results_count; i++)
      printf("%f: %s\n", results_buf[i].score, results_buf[i].line);
  }

  // Cleanup
  free(line);

  for (size_t i = 0; i < results_count; i++)
    free(results_buf[i].line);

  free(results_buf);
}
Beispiel #5
0
// Scores the needle (from needle_idx onwards) against the haystack (from
// haystack_idx onwards) and returns a cumulative score.
//
// For needle character i, only haystack positions up to
// m->rightmost_match_p[i] are examined. Results are cached in m->memo at
// index `j * m->needle_len + i` (j = haystack position, i = needle
// position); a slot that differs from UNSET_SCORE is treated as a cached
// result. Returns 0.0 when a dot-file path component is encountered that
// must be hidden.
//
// NOTE(review): `memoized` only becomes non-NULL once the inner loop body
// has run; the final `return *memoized = score;` would dereference NULL if
// neither loop body executes. The function presumably relies on callers
// passing indices for which at least one iteration happens; confirm.
float recursive_match(
    matchinfo_t *m,    // Sharable meta-data.
    long haystack_idx, // Where in the path string to start.
    long needle_idx,   // Where in the needle string to start.
    long last_idx,     // Location of last matched character.
    float score        // Cumulative score so far.
) {
    long distance, i, j;
    float *memoized = NULL; // Memo slot of the most recently visited (j, i) pair.
    float score_for_char;
    float seen_score = 0;   // Best score returned by a recursive call so far.

    // Iterate over needle.
    for (i = needle_idx; i < m->needle_len; i++) {
        // Iterate over (valid range of) haystack.
        for (j = haystack_idx; j <= m->rightmost_match_p[i]; j++) {
            char c, d;

            // Do we have a memoized result we can return?
            // On a hit, the larger of the cached value and the best score
            // seen via recursion is returned.
            memoized = &m->memo[j * m->needle_len + i];
            if (*memoized != UNSET_SCORE) {
                return *memoized > seen_score ? *memoized : seen_score;
            }
            c = m->needle_p[i];
            d = m->haystack_p[j];
            if (d == '.') {
                if (j == 0 || m->haystack_p[j - 1] == '/') { // This is a dot-file.
                    int dot_search = c == '.'; // Searching for a dot.
                    // never_show_dot_files always hides; otherwise the path
                    // is hidden unless the needle character is itself a dot
                    // or always_show_dot_files is set.
                    if (
                        m->never_show_dot_files ||
                        (!dot_search && !m->always_show_dot_files)
                    ) {
                        return *memoized = 0.0;
                    }
                }
            } else if (d >= 'A' && d <= 'Z' && !m->case_sensitive) {
                d += 'a' - 'A'; // Add 32 to downcase.
            }

            if (c == d) {
                // Calculate score.
                // A match directly after the previous match keeps the full
                // per-character score; one further away is scaled by a
                // factor chosen from the character preceding it.
                float sub_score = 0;
                score_for_char = m->max_score_per_char;
                distance = j - last_idx;

                if (distance > 1) {
                    float factor = 1.0;
                    char last = m->haystack_p[j - 1];
                    char curr = m->haystack_p[j]; // Case matters, so get again.
                    if (last == '/') {
                        factor = 0.9;
                    } else if (
                        last == '-' ||
                        last == '_' ||
                        last == ' ' ||
                        (last >= '0' && last <= '9')
                    ) {
                        factor = 0.8;
                    } else if (
                        last >= 'a' && last <= 'z' &&
                        curr >= 'A' && curr <= 'Z'
                    ) {
                        factor = 0.8;
                    } else if (last == '.') {
                        factor = 0.7;
                    } else {
                        // If no "special" chars behind char, factor diminishes
                        // as distance from last matched char increases.
                        factor = (1.0 / distance) * 0.75;
                    }
                    score_for_char *= factor;
                }

                // When recursion is enabled and there is still room before
                // the rightmost allowed position, try matching this same
                // needle character one position further right and remember
                // the best score found that way.
                if (j < m->rightmost_match_p[i] && m->recurse) {
                    sub_score = recursive_match(m, j + 1, i, last_idx, score);
                    if (sub_score > seen_score) {
                        seen_score = sub_score;
                    }
                }
                last_idx = j;
                haystack_idx = last_idx + 1;
                score += score_for_char;
                // Cache the better of the direct match and the best alternative.
                *memoized = seen_score > score ? seen_score : score;
                if (i == m->needle_len - 1) {
                    // Whole string matched.
                    return *memoized;
                }
                // NOTE(review): there is no `break` here, so after a match
                // that is not the last needle character the inner loop keeps
                // scanning later haystack positions with the same needle
                // index; confirm this is intended.
            }
        }
    }
    // Store the cumulative score in the last visited memo slot and return it.
    return *memoized = score;
}
Beispiel #6
0
// Scores `haystack` against `needle` and returns the score.
//
// haystack, needle:       Ruby strings; their bytes are read via RSTRING_PTR/RSTRING_LEN.
// case_sensitive:         converted to int and used as a truth value.
// always_show_dot_files,
// never_show_dot_files,
// recurse:                treated as true only when equal to Qtrue.
// needle_bitmask:         bitmask describing the needle, supplied by the caller.
// haystack_bitmask:       in/out; when it holds UNSET_BITMASK on entry and the
//                         needle is non-empty, it is filled in with one bit per
//                         letter a-z (case-folded) present in the haystack.
//                         Otherwise it is used to reject haystacks that lack
//                         bits required by needle_bitmask.
//
// Returns 1.0 for an empty needle (0.0 if a dot-file component must be
// hidden), 0.0 when the haystack cannot match, and otherwise the result of
// recursive_match.
float calculate_match(
    VALUE haystack,
    VALUE needle,
    VALUE case_sensitive,
    VALUE always_show_dot_files,
    VALUE never_show_dot_files,
    VALUE recurse,
    long needle_bitmask,
    long *haystack_bitmask
) {
    matchinfo_t m;
    long i;
    float score             = 1.0;
    int compute_bitmasks    = *haystack_bitmask == UNSET_BITMASK;
    m.haystack_p            = RSTRING_PTR(haystack);
    m.haystack_len          = RSTRING_LEN(haystack);
    m.needle_p              = RSTRING_PTR(needle);
    m.needle_len            = RSTRING_LEN(needle);
    m.rightmost_match_p     = NULL;
    m.max_score_per_char    = 0.0; // Real value is computed once a match is known to be possible.
    m.always_show_dot_files = always_show_dot_files == Qtrue;
    m.never_show_dot_files  = never_show_dot_files == Qtrue;
    m.case_sensitive        = (int)case_sensitive;
    m.recurse               = recurse == Qtrue;

    // Special case for zero-length search string.
    if (m.needle_len == 0) {
        // Filter out dot files.
        if (m.never_show_dot_files || !m.always_show_dot_files) {
            for (i = 0; i < m.haystack_len; i++) {
                char c = m.haystack_p[i];
                // A '.' at the start of the string or right after a '/'
                // marks a dot-file path component.
                if (c == '.' && (i == 0 || m.haystack_p[i - 1] == '/')) {
                    return 0.0;
                }
            }
        }
    } else {
        long haystack_limit;
        long memo_size;
        long needle_idx;
        long mask;
        long rightmost_match_p[m.needle_len];

        // A previously computed haystack bitmask lets us reject early when
        // the haystack lacks a bit that the needle requires.
        if (!compute_bitmasks) {
            if ((needle_bitmask & *haystack_bitmask) != needle_bitmask) {
                return 0.0;
            }
        }

        // Pre-scan string:
        // - Bail if it can't match at all.
        // - Record rightmost match for each character (prune search space).
        // - Record bitmask for haystack to speed up future searches.
        m.rightmost_match_p = rightmost_match_p;
        needle_idx = m.needle_len - 1;
        mask = 0;
        for (i = m.haystack_len - 1; i >= 0; i--) {
            char c = m.haystack_p[i];
            char lower = c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c;
            if (!m.case_sensitive) {
                c = lower;
            }
            // Only letters get a bit. Shifting by `lower - 'a'` for any
            // other byte (e.g. '/', '.', digits) would use a negative or
            // oversized shift count, which is undefined behaviour.
            if (compute_bitmasks && lower >= 'a' && lower <= 'z') {
                mask |= (1L << (lower - 'a'));
            }

            if (needle_idx >= 0) {
                char d = m.needle_p[needle_idx];
                if (c == d) {
                    rightmost_match_p[needle_idx] = i;
                    needle_idx--;
                }
            }
        }
        if (compute_bitmasks) {
            *haystack_bitmask = mask;
        }
        if (needle_idx != -1) {
            // Not every needle character was found, so no match is possible.
            return 0.0;
        }

        // Every needle character was found, so both lengths are non-zero
        // and neither division below is by zero.
        m.max_score_per_char = (1.0 / m.haystack_len + 1.0 / m.needle_len) / 2;

        // Prepare for memoization. Nothing beyond the rightmost position of
        // the last needle character is ever visited, so the memo only needs
        // to cover haystack positions up to that point.
        // NOTE(review): both VLAs in this branch live on the stack and grow
        // with the input lengths; very long inputs could exhaust the stack.
        haystack_limit = rightmost_match_p[m.needle_len - 1] + 1;
        memo_size = m.needle_len * haystack_limit;
        {
            float memo[memo_size];
            for (i = 0; i < memo_size; i++) {
                memo[i] = UNSET_SCORE;
            }
            m.memo = memo;
            score = recursive_match(&m, 0, 0, 0, 0.0);
        }
    }
    return score;
}