示例#1
0
VArray*
QueryLexer_Tokenize_IMP(QueryLexer *self, String *query_string) {
    QueryLexerIVARS *const ivars = QueryLexer_IVARS(self);

    VArray *elems = VA_new(0);
    if (!query_string) { return elems; }

    StringIterator *iter = Str_Top(query_string);

    while (StrIter_Has_Next(iter)) {
        ParserElem *elem = NULL;

        if (StrIter_Skip_Next_Whitespace(iter)) {
            // Fast-forward past whitespace.
            continue;
        }

        if (ivars->heed_colons) {
            ParserElem *elem = S_consume_field(iter);
            if (elem) {
                VA_Push(elems, (Obj*)elem);
            }
        }

        int32_t code_point = StrIter_Next(iter);
        switch (code_point) {
            case '(':
                elem = ParserElem_new(TOKEN_OPEN_PAREN, NULL);
                break;
            case ')':
                elem = ParserElem_new(TOKEN_CLOSE_PAREN, NULL);
                break;
            case '+':
                if (StrIter_Has_Next(iter)
                    && !StrIter_Skip_Next_Whitespace(iter)
                   ) {
                    elem = ParserElem_new(TOKEN_PLUS, NULL);
                }
                else {
                    elem = ParserElem_new(TOKEN_STRING, (Obj*)Str_newf("+"));
                }
                break;
            case '-':
                if (StrIter_Has_Next(iter)
                    && !StrIter_Skip_Next_Whitespace(iter)
                   ) {
                    elem = ParserElem_new(TOKEN_MINUS, NULL);
                }
                else {
                    elem = ParserElem_new(TOKEN_STRING, (Obj*)Str_newf("-"));
                }
                break;
            case '"':
                StrIter_Recede(iter, 1);
                elem = S_consume_quoted_string(iter);
                break;
            case 'O':
                StrIter_Recede(iter, 1);
                elem = S_consume_keyword(iter, "OR", 2, TOKEN_OR);
                if (!elem) {
                    elem = S_consume_text(iter);
                }
                break;
            case 'A':
                StrIter_Recede(iter, 1);
                elem = S_consume_keyword(iter, "AND", 3, TOKEN_AND);
                if (!elem) {
                    elem = S_consume_text(iter);
                }
                break;
            case 'N':
                StrIter_Recede(iter, 1);
                elem = S_consume_keyword(iter, "NOT", 3, TOKEN_NOT);
                if (!elem) {
                    elem = S_consume_text(iter);
                }
                break;
            default:
                StrIter_Recede(iter, 1);
                elem = S_consume_text(iter);
                break;
        }
        VA_Push(elems, (Obj*)elem);
    }

    DECREF(iter);
    return elems;
}
示例#2
0
文件: QueryParser.c 项目: kidaa/lucy
Query*
QParser_Expand_Leaf_IMP(QueryParser *self, Query *query) {
    QueryParserIVARS *const ivars = QParser_IVARS(self);
    LeafQuery *leaf_query = (LeafQuery*)query;
    Schema    *schema     = ivars->schema;
    bool       is_phrase  = false;
    bool       ambiguous  = false;

    // Determine whether we can actually process the input.
    if (!Query_is_a(query, LEAFQUERY)) { return NULL; }
    String *full_text = LeafQuery_Get_Text(leaf_query);
    if (!Str_Get_Size(full_text)) { return NULL; }

    // If quoted, always generate PhraseQuery.
    StringIterator *top  = Str_Top(full_text);
    StringIterator *tail = Str_Tail(full_text);
    StrIter_Skip_Next_Whitespace(top);
    StrIter_Skip_Prev_Whitespace(tail);
    if (StrIter_Starts_With_Utf8(top, "\"", 1)) {
        is_phrase = true;
        StrIter_Advance(top, 1);
        if (StrIter_Ends_With_Utf8(tail, "\"", 1)
            && !StrIter_Ends_With_Utf8(tail, "\\\"", 2)
        ) {
            StrIter_Recede(tail, 1);
        }
    }
    String *source_text = StrIter_substring(top, tail);

    // Either use LeafQuery's field or default to Parser's list.
    Vector *fields;
    if (LeafQuery_Get_Field(leaf_query)) {
        fields = Vec_new(1);
        Vec_Push(fields, INCREF(LeafQuery_Get_Field(leaf_query)));
    }
    else {
        fields = (Vector*)INCREF(ivars->fields);
    }

    CharBuf *unescape_buf = CB_new(Str_Get_Size(source_text));
    Vector  *queries      = Vec_new(Vec_Get_Size(fields));
    for (uint32_t i = 0, max = Vec_Get_Size(fields); i < max; i++) {
        String   *field    = (String*)Vec_Fetch(fields, i);
        Analyzer *analyzer = ivars->analyzer
                             ? ivars->analyzer
                             : Schema_Fetch_Analyzer(schema, field);

        if (!analyzer) {
            Vec_Push(queries,
                    (Obj*)QParser_Make_Term_Query(self, field,
                                                  (Obj*)source_text));
        }
        else {
            // Extract token texts.
            String *split_source = S_unescape(self, source_text, unescape_buf);
            Vector *maybe_texts = Analyzer_Split(analyzer, split_source);
            uint32_t num_maybe_texts = Vec_Get_Size(maybe_texts);
            Vector *token_texts = Vec_new(num_maybe_texts);

            // Filter out zero-length token texts.
            for (uint32_t j = 0; j < num_maybe_texts; j++) {
                String *token_text = (String*)Vec_Fetch(maybe_texts, j);
                if (Str_Get_Size(token_text)) {
                    Vec_Push(token_texts, INCREF(token_text));
                }
            }

            if (Vec_Get_Size(token_texts) == 0) {
                /* Query might include stop words.  Who knows? */
                ambiguous = true;
            }

            // Add either a TermQuery or a PhraseQuery.
            if (is_phrase || Vec_Get_Size(token_texts) > 1) {
                Vec_Push(queries, (Obj*)
                        QParser_Make_Phrase_Query(self, field, token_texts));
            }
            else if (Vec_Get_Size(token_texts) == 1) {
                Vec_Push(queries,
                        (Obj*)QParser_Make_Term_Query(self, field, Vec_Fetch(token_texts, 0)));
            }

            DECREF(token_texts);
            DECREF(maybe_texts);
            DECREF(split_source);
        }
    }

    Query *retval;
    if (Vec_Get_Size(queries) == 0) {
        retval = (Query*)NoMatchQuery_new();
        if (ambiguous) {
            NoMatchQuery_Set_Fails_To_Match((NoMatchQuery*)retval, false);
        }
    }
    else if (Vec_Get_Size(queries) == 1) {
        retval = (Query*)INCREF(Vec_Fetch(queries, 0));
    }
    else {
        retval = QParser_Make_OR_Query(self, queries);
    }

    // Clean up.
    DECREF(unescape_buf);
    DECREF(queries);
    DECREF(fields);
    DECREF(source_text);
    DECREF(tail);
    DECREF(top);

    return retval;
}
示例#3
0
// Find a starting boundary after the current position given by the iterator.
// Skip up to max_skip code points plus potential whitespace. Update the
// iterator and return number of code points skipped. Return true if a
// starting edge (sentence) was found.
bool
S_find_starting_boundary(StringIterator *top, uint32_t max_skip,
                         uint32_t *num_skipped_ptr) {
    // Keep track of the first word boundary.
    StringIterator *word = NULL;
    uint32_t word_offset = 0;

    // Check if we're at a starting boundary already.

    StringIterator *iter = StrIter_Clone(top);

    while (true) {
        int32_t code_point = StrIter_Prev(iter);

        if (code_point == STRITER_DONE || code_point == '.') {
            // Skip remaining whitespace.
            *num_skipped_ptr = StrIter_Skip_Next_Whitespace(top);
            DECREF(iter);
            return true;
        }

        if (StrHelp_is_whitespace(code_point)) {
            if (word == NULL) {
                word = StrIter_Clone(top);
            }
        }
        else {
            break;
        }
    }

    // Try to start on a boundary.

    uint32_t num_skipped = 0;
    bool     found_edge  = false;

    StrIter_Assign(iter, top);

    for (uint32_t i = 0; i < max_skip; ++i) {
        int32_t code_point = StrIter_Next(iter);

        if (code_point == STRITER_DONE || code_point == '.') {
            found_edge = true;
            StrIter_Assign(top, iter);
            num_skipped = i + 1;
            break;
        }

        if (word == NULL && StrHelp_is_whitespace(code_point)) {
            word = StrIter_Clone(iter);
            word_offset = i + 1;
        }
    }

    // Try to use word boundary if no sentence boundary was found.
    if (!found_edge && word != NULL) {
        StrIter_Assign(top, word);
        num_skipped = word_offset;
    }

    // Skip remaining whitespace.
    num_skipped += StrIter_Skip_Next_Whitespace(top);
    *num_skipped_ptr = num_skipped;

    DECREF(word);
    DECREF(iter);
    return found_edge;
}