示例#1
0
文件: QueryLexer.c 项目: rectang/lucy
static ParserElem*
S_consume_text(StringIterator *iter) {
    StringIterator *temp = StrIter_Clone(iter);

    while (1) {
        int32_t code_point = StrIter_Next(temp);
        if (code_point == '\\') {
            code_point = StrIter_Next(temp);
            if (code_point == STR_OOB) {
                break;
            }
        }
        else if (code_point == STR_OOB) {
            break;
        }
        else if (StrHelp_is_whitespace(code_point)
            || code_point == '"'
            || code_point == '('
            || code_point == ')'
           ) {
            StrIter_Recede(temp, 1);
            break;
        }
    }

    String *text = StrIter_crop(iter, temp);
    StrIter_Assign(iter, temp);
    DECREF(temp);
    return ParserElem_new(TOKEN_STRING, (Obj*)text);
}
示例#2
0
文件: QueryLexer.c 项目: rectang/lucy
static ParserElem*
S_consume_keyword(StringIterator *iter, const char *keyword,
                  size_t keyword_len, int type) {
    if (!StrIter_Starts_With_Utf8(iter, keyword, keyword_len)) {
        return NULL;
    }
    StringIterator *temp = StrIter_Clone(iter);
    StrIter_Advance(temp, keyword_len);
    int32_t lookahead = StrIter_Next(temp);
    if (lookahead == STR_OOB) {
        DECREF(temp);
        return NULL;
    }
    if (StrHelp_is_whitespace(lookahead)
        || lookahead == '"'
        || lookahead == '('
        || lookahead == ')'
        || lookahead == '+'
        || lookahead == '-'
       ) {
        StrIter_Recede(temp, 1);
        StrIter_Assign(iter, temp);
        DECREF(temp);
        return ParserElem_new(type, NULL);
    }
    DECREF(temp);
    return NULL;
}
示例#3
0
String*
IxFileNames_local_part(String *path) {
    StringIterator *top = Str_Tail(path);
    int32_t code_point = StrIter_Prev(top);

    // Trim trailing slash.
    while (code_point == '/') {
        code_point = StrIter_Prev(top);
    }

    StringIterator *tail = StrIter_Clone(top);
    StrIter_Advance(tail, 1);

    // Substring should start after last slash.
    while (code_point != STR_OOB) {
        if (code_point == '/') {
            StrIter_Advance(top, 1);
            break;
        }
        code_point = StrIter_Prev(top);
    }

    String *retval = StrIter_crop(top, tail);

    DECREF(tail);
    DECREF(top);
    return retval;
}
示例#4
0
文件: QueryLexer.c 项目: rectang/lucy
static ParserElem*
S_consume_field(StringIterator *iter) {
    StringIterator *temp = StrIter_Clone(iter);

    // Field names constructs must start with a letter or underscore.
    int32_t code_point = StrIter_Next(temp);
    if (code_point == STR_OOB) {
        DECREF(temp);
        return NULL;
    }
    if (!(isalpha(code_point) || code_point == '_')) {
        DECREF(temp);
        return NULL;
    }

    // Only alphanumerics and underscores are allowed in field names.
    while (':' != (code_point = StrIter_Next(temp))) {
        if (code_point == STR_OOB) {
            DECREF(temp);
            return NULL;
        }
        if (!(isalnum(code_point) || code_point == '_')) {
            DECREF(temp);
            return NULL;
        }
    }

    // Field name constructs must be followed by something sensible.
    int32_t lookahead = StrIter_Next(temp);
    if (lookahead == STR_OOB) {
        DECREF(temp);
        return NULL;
    }
    if (!(isalnum(lookahead)
          || lookahead == '_'
          || lookahead > 127
          || lookahead == '"'
          || lookahead == '('
         )
       ) {
        DECREF(temp);
        return NULL;
    }

    // Consume string data.
    StrIter_Recede(temp, 2); // Back up over lookahead and colon.
    String *field = StrIter_crop(iter, temp);
    StrIter_Advance(temp, 1); // Skip colon.
    StrIter_Assign(iter, temp);
    DECREF(temp);
    return ParserElem_new(TOKEN_FIELD, (Obj*)field);
}
示例#5
0
文件: QueryLexer.c 项目: rectang/lucy
static ParserElem*
S_consume_quoted_string(StringIterator *iter) {
    StringIterator *temp = StrIter_Clone(iter);

    if (StrIter_Next(temp) != '"') {
        THROW(ERR, "Internal error: expected a quote");
    }

    while (1) {
        int32_t code_point = StrIter_Next(temp);
        if (code_point == STR_OOB || code_point == '"') {
            break;
        }
        else if (code_point == '\\') {
            StrIter_Next(temp);
        }
    }

    String *text = StrIter_crop(iter, temp);
    StrIter_Assign(iter, temp);
    DECREF(temp);
    return ParserElem_new(TOKEN_STRING, (Obj*)text);
}
示例#6
0
static void
test_iterator(TestBatchRunner *runner) {
    static const int32_t code_points[] = {
        0x41,
        0x7F,
        0x80,
        0x7FF,
        0x800,
        0xFFFF,
        0x10000,
        0x10FFFF
    };
    static size_t num_code_points
        = sizeof(code_points) / sizeof(code_points[0]);

    CharBuf *buf = CB_new(0);
    for (size_t i = 0; i < num_code_points; ++i) {
        CB_Cat_Char(buf, code_points[i]);
    }
    String *string = CB_To_String(buf);

    {
        StringIterator *iter = Str_Top(string);

        TEST_TRUE(runner, StrIter_Equals(iter, (Obj*)iter),
                  "StringIterator equal to self");
        TEST_FALSE(runner, StrIter_Equals(iter, (Obj*)CFISH_TRUE),
                   "StringIterator not equal non-iterators");

        DECREF(iter);
    }

    {
        StringIterator *top  = Str_Top(string);
        StringIterator *tail = Str_Tail(string);

        TEST_INT_EQ(runner, StrIter_Compare_To(top, (Obj*)tail), -1,
                    "Compare_To top < tail");
        TEST_INT_EQ(runner, StrIter_Compare_To(tail, (Obj*)top), 1,
                    "Compare_To tail > top");
        TEST_INT_EQ(runner, StrIter_Compare_To(top, (Obj*)top), 0,
                    "Compare_To top == top");

        StringIterator *clone = StrIter_Clone(top);
        TEST_TRUE(runner, StrIter_Equals(clone, (Obj*)top), "Clone");

        StrIter_Assign(clone, tail);
        TEST_TRUE(runner, StrIter_Equals(clone, (Obj*)tail), "Assign");

        String *other = Str_newf("Other string");
        StringIterator *other_iter = Str_Top(other);
        TEST_FALSE(runner, StrIter_Equals(other_iter, (Obj*)tail),
                   "Equals returns false for different strings");
        StrIter_Assign(clone, other_iter);
        TEST_TRUE(runner, StrIter_Equals(clone, (Obj*)other_iter),
                  "Assign iterator with different string");

        DECREF(other);
        DECREF(other_iter);
        DECREF(clone);
        DECREF(top);
        DECREF(tail);
    }

    {
        StringIterator *iter = Str_Top(string);

        for (size_t i = 0; i < num_code_points; ++i) {
            TEST_TRUE(runner, StrIter_Has_Next(iter), "Has_Next %d", i);
            int32_t code_point = StrIter_Next(iter);
            TEST_INT_EQ(runner, code_point, code_points[i], "Next %d", i);
        }

        TEST_TRUE(runner, !StrIter_Has_Next(iter),
                  "Has_Next at end of string");
        TEST_INT_EQ(runner, StrIter_Next(iter), STR_OOB,
                    "Next at end of string");

        StringIterator *tail = Str_Tail(string);
        TEST_TRUE(runner, StrIter_Equals(iter, (Obj*)tail), "Equals tail");

        DECREF(tail);
        DECREF(iter);
    }

    {
        StringIterator *iter = Str_Tail(string);

        for (size_t i = num_code_points; i--;) {
            TEST_TRUE(runner, StrIter_Has_Prev(iter), "Has_Prev %d", i);
            int32_t code_point = StrIter_Prev(iter);
            TEST_INT_EQ(runner, code_point, code_points[i], "Prev %d", i);
        }

        TEST_TRUE(runner, !StrIter_Has_Prev(iter),
                  "Has_Prev at end of string");
        TEST_INT_EQ(runner, StrIter_Prev(iter), STR_OOB,
                    "Prev at start of string");

        StringIterator *top = Str_Top(string);
        TEST_TRUE(runner, StrIter_Equals(iter, (Obj*)top), "Equals top");

        DECREF(top);
        DECREF(iter);
    }

    {
        StringIterator *iter = Str_Top(string);

        StrIter_Next(iter);
        TEST_INT_EQ(runner, StrIter_Advance(iter, 2), 2,
                    "Advance returns number of code points");
        TEST_INT_EQ(runner, StrIter_Next(iter), code_points[3],
                    "Advance works");
        TEST_INT_EQ(runner,
                    StrIter_Advance(iter, 1000000), num_code_points - 4,
                    "Advance past end of string");

        StrIter_Prev(iter);
        TEST_INT_EQ(runner, StrIter_Recede(iter, 2), 2,
                    "Recede returns number of code points");
        TEST_INT_EQ(runner, StrIter_Prev(iter), code_points[num_code_points-4],
                    "Recede works");
        TEST_INT_EQ(runner, StrIter_Recede(iter, 1000000), num_code_points - 4,
                    "Recede past start of string");

        DECREF(iter);
    }

    DECREF(string);
    DECREF(buf);
}
示例#7
0
String*
Highlighter_Raw_Excerpt_IMP(Highlighter *self, String *field_val,
                            int32_t *start_ptr, HeatMap *heat_map) {
    HighlighterIVARS *const ivars = Highlighter_IVARS(self);

    // Find start of excerpt.

    StringIterator *top = Str_Top(field_val);

    int32_t  best_location = S_hottest(heat_map);
    int32_t  start;
    uint32_t max_skip;

    if ((uint32_t)best_location <= ivars->slop) {
        // If the beginning of the string falls within the window centered
        // around the hottest point in the field, start the fragment at the
        // beginning.
        start    = 0;
        max_skip = best_location;
    }
    else {
        start    = best_location - ivars->slop;
        max_skip = ivars->slop;
        StrIter_Advance(top, start);
    }

    uint32_t num_skipped;
    bool found_starting_edge
        = S_find_starting_boundary(top, max_skip, &num_skipped);
    start += num_skipped;

    // Find end of excerpt.

    StringIterator *tail = StrIter_Clone(top);

    uint32_t max_len = ivars->excerpt_length;
    if (!found_starting_edge) {
        // Leave space for starting ellipsis and space character.
        max_len -= 2;
    }

    bool found_ending_edge = true;
    uint32_t excerpt_len = StrIter_Advance(tail, max_len);

    // Skip up to slop code points but keep at least max_len - slop.
    if (excerpt_len > max_len - ivars->slop) {
        max_skip = excerpt_len - (max_len - ivars->slop);
        found_ending_edge
            = S_find_ending_boundary(tail, max_skip, &num_skipped);
        if (num_skipped >= excerpt_len) {
            excerpt_len = 0;
        }
        else {
            excerpt_len -= num_skipped;
        }
    }

    // Extract excerpt.

    String *raw_excerpt;

    if (!excerpt_len) {
        raw_excerpt = Str_new_from_trusted_utf8("", 0);
    }
    else {
        String  *substring = StrIter_substring(top, tail);
        CharBuf *buf       = CB_new(Str_Get_Size(substring) + 8);

        // If not starting on a sentence boundary, prepend an ellipsis.
        if (!found_starting_edge) {
            CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
            CB_Cat_Char(buf, ' ');
            start -= 2;
        }

        CB_Cat(buf, substring);

        // If not ending on a sentence boundary, append an ellipsis.
        if (!found_ending_edge) {
            CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
        }

        raw_excerpt = CB_Yield_String(buf);

        DECREF(buf);
        DECREF(substring);
    }

    *start_ptr = start;

    DECREF(top);
    DECREF(tail);
    return raw_excerpt;
}
示例#8
0
// Find an ending boundary before the current position given by the iterator.
// Skip up to max_skip code points plus potential whitespace. Update the
// iterator and return number of code points skipped. Return true if a
// ending edge (sentence) was found.
bool
S_find_ending_boundary(StringIterator *tail, uint32_t max_skip,
                       uint32_t *num_skipped_ptr) {
    int32_t code_point;

    // Check if we're at an ending boundary already. Don't check for a word
    // boundary because we need space for a trailing ellipsis.

    StringIterator *iter = StrIter_Clone(tail);

    do {
        code_point = StrIter_Next(iter);

        if (code_point == STRITER_DONE) {
            // Skip remaining whitespace.
            *num_skipped_ptr = StrIter_Skip_Prev_Whitespace(tail);
            DECREF(iter);
            return true;
        }
    } while (StrHelp_is_whitespace(code_point));

    // Keep track of the first word boundary.
    StringIterator *word = NULL;
    uint32_t word_offset = 0;

    StrIter_Assign(iter, tail);

    for (uint32_t i = 0;
            STRITER_DONE != (code_point = StrIter_Prev(iter));
            ++i)
    {
        if (code_point == '.') {
            StrIter_Assign(tail, iter);
            StrIter_Advance(tail, 1); // Include period.
            *num_skipped_ptr = i;
            DECREF(word);
            DECREF(iter);
            return true;
        }

        if (StrHelp_is_whitespace(code_point)) {
            if (word == NULL) {
                word = StrIter_Clone(iter);
                word_offset = i + 1;
            }
        }
        else if (i >= max_skip) {
            // Break only at non-whitespace to allow another sentence
            // boundary to be found.
            break;
        }
    }

    if (word == NULL) {
        // Make space for ellipsis.
        *num_skipped_ptr = StrIter_Recede(tail, 1);
    }
    else {
        // Use word boundary if no sentence boundary was found.
        StrIter_Assign(tail, word);

        // Strip whitespace and punctuation that collides with an ellipsis.
        while (STRITER_DONE != (code_point = StrIter_Prev(tail))) {
            if (!StrHelp_is_whitespace(code_point)
                    && code_point != '.'
                    && code_point != ','
                    && code_point != ';'
                    && code_point != ':'
                    && code_point != ':'
                    && code_point != '?'
                    && code_point != '!'
               ) {
                StrIter_Advance(tail, 1); // Back up.
                break;
            }
            ++word_offset;
        }

        *num_skipped_ptr = word_offset;
    }

    DECREF(word);
    DECREF(iter);
    return false;
}
示例#9
0
// Find a starting boundary after the current position given by the iterator.
// Skip up to max_skip code points plus potential whitespace. Update the
// iterator and return number of code points skipped. Return true if a
// starting edge (sentence) was found.
bool
S_find_starting_boundary(StringIterator *top, uint32_t max_skip,
                         uint32_t *num_skipped_ptr) {
    // Keep track of the first word boundary.
    StringIterator *word = NULL;
    uint32_t word_offset = 0;

    // Check if we're at a starting boundary already.

    StringIterator *iter = StrIter_Clone(top);

    while (true) {
        int32_t code_point = StrIter_Prev(iter);

        if (code_point == STRITER_DONE || code_point == '.') {
            // Skip remaining whitespace.
            *num_skipped_ptr = StrIter_Skip_Next_Whitespace(top);
            DECREF(iter);
            return true;
        }

        if (StrHelp_is_whitespace(code_point)) {
            if (word == NULL) {
                word = StrIter_Clone(top);
            }
        }
        else {
            break;
        }
    }

    // Try to start on a boundary.

    uint32_t num_skipped = 0;
    bool     found_edge  = false;

    StrIter_Assign(iter, top);

    for (uint32_t i = 0; i < max_skip; ++i) {
        int32_t code_point = StrIter_Next(iter);

        if (code_point == STRITER_DONE || code_point == '.') {
            found_edge = true;
            StrIter_Assign(top, iter);
            num_skipped = i + 1;
            break;
        }

        if (word == NULL && StrHelp_is_whitespace(code_point)) {
            word = StrIter_Clone(iter);
            word_offset = i + 1;
        }
    }

    // Try to use word boundary if no sentence boundary was found.
    if (!found_edge && word != NULL) {
        StrIter_Assign(top, word);
        num_skipped = word_offset;
    }

    // Skip remaining whitespace.
    num_skipped += StrIter_Skip_Next_Whitespace(top);
    *num_skipped_ptr = num_skipped;

    DECREF(word);
    DECREF(iter);
    return found_edge;
}