/* Break a query string into a flat array of ParserElem tokens.
 *
 * Whitespace between tokens is discarded.  When the lexer's `heed_colons`
 * setting is on, S_consume_field() is tried before every token and any
 * element it returns is emitted ahead of the token that follows it.
 *
 * A NULL query_string yields an empty array.  The returned array is the one
 * freshly created by VA_new() at the top of the function.
 */
VArray*
QueryLexer_Tokenize_IMP(QueryLexer *self, String *query_string) {
    QueryLexerIVARS *const ivars = QueryLexer_IVARS(self);
    VArray *elems = VA_new(0);

    if (query_string == NULL) {
        return elems;
    }

    StringIterator *iter = Str_Top(query_string);

    while (StrIter_Has_Next(iter)) {
        // Whitespace never starts a token: skip it and re-test for input.
        if (StrIter_Skip_Next_Whitespace(iter)) {
            continue;
        }

        // Give S_consume_field() a chance to pull an element off the front.
        if (ivars->heed_colons) {
            ParserElem *field_elem = S_consume_field(iter);
            if (field_elem != NULL) {
                VA_Push(elems, (Obj*)field_elem);
            }
        }

        ParserElem *token = NULL;
        const int32_t code_point = StrIter_Next(iter);

        if (code_point == '(') {
            token = ParserElem_new(TOKEN_OPEN_PAREN, NULL);
        }
        else if (code_point == ')') {
            token = ParserElem_new(TOKEN_CLOSE_PAREN, NULL);
        }
        else if (code_point == '+' || code_point == '-') {
            // A sign is an operator only when more input follows it
            // directly; at end of input or before whitespace it becomes a
            // one-character string token.
            if (StrIter_Has_Next(iter)
                && !StrIter_Skip_Next_Whitespace(iter)
               ) {
                token = ParserElem_new(code_point == '+'
                                       ? TOKEN_PLUS
                                       : TOKEN_MINUS,
                                       NULL);
            }
            else if (code_point == '+') {
                token = ParserElem_new(TOKEN_STRING, (Obj*)Str_newf("+"));
            }
            else {
                token = ParserElem_new(TOKEN_STRING, (Obj*)Str_newf("-"));
            }
        }
        else {
            // Every remaining consumer wants to see the code point we just
            // read, so step back onto it first.
            StrIter_Recede(iter, 1);

            if (code_point == '"') {
                token = S_consume_quoted_string(iter);
            }
            else {
                // 'O', 'A' and 'N' may begin the keywords OR / AND / NOT.
                if (code_point == 'O') {
                    token = S_consume_keyword(iter, "OR", 2, TOKEN_OR);
                }
                else if (code_point == 'A') {
                    token = S_consume_keyword(iter, "AND", 3, TOKEN_AND);
                }
                else if (code_point == 'N') {
                    token = S_consume_keyword(iter, "NOT", 3, TOKEN_NOT);
                }

                // No keyword attempted, or the attempt returned NULL:
                // treat the input as plain text.
                if (token == NULL) {
                    token = S_consume_text(iter);
                }
            }
        }

        VA_Push(elems, (Obj*)token);
    }

    DECREF(iter);
    return elems;
}
/* Expand a LeafQuery into a concrete query, one sub-query per field.
 *
 * Returns NULL when `query` is not a LeafQuery or when its text is empty.
 *
 * Text that starts with a double quote (after leading whitespace) is always
 * turned into a phrase query.  The opening quote is dropped, and so is the
 * closing quote when one is present and not preceded by a backslash.
 *
 * The fields consulted are the LeafQuery's own field if it has one,
 * otherwise the parser's field list.  For each field:
 *   - with no analyzer available, a term query is made from the raw text;
 *   - otherwise the text goes through S_unescape() and Analyzer_Split(),
 *     empty token texts are dropped, and a phrase query (quoted text or
 *     more than one token) or a term query (exactly one token) is made.
 *
 * Zero resulting queries produce a NoMatchQuery, one is returned as is, and
 * several are combined with QParser_Make_OR_Query().
 */
Query*
QParser_Expand_Leaf_IMP(QueryParser *self, Query *query) {
    QueryParserIVARS *const ivars = QParser_IVARS(self);
    LeafQuery *leaf_query = (LeafQuery*)query;
    Schema *schema = ivars->schema;
    bool quoted = false;
    bool saw_empty_split = false;

    // Guard clauses: only a LeafQuery carrying some text can be expanded.
    if (!Query_is_a(query, LEAFQUERY)) {
        return NULL;
    }
    String *full_text = LeafQuery_Get_Text(leaf_query);
    if (Str_Get_Size(full_text) == 0) {
        return NULL;
    }

    // Trim surrounding whitespace, then look for enclosing quotes.
    StringIterator *top  = Str_Top(full_text);
    StringIterator *tail = Str_Tail(full_text);
    StrIter_Skip_Next_Whitespace(top);
    StrIter_Skip_Prev_Whitespace(tail);
    if (StrIter_Starts_With_Utf8(top, "\"", 1)) {
        quoted = true;
        StrIter_Advance(top, 1);
        // Drop the closing quote, but leave an escaped one (\") in place.
        if (StrIter_Ends_With_Utf8(tail, "\"", 1)) {
            if (!StrIter_Ends_With_Utf8(tail, "\\\"", 2)) {
                StrIter_Recede(tail, 1);
            }
        }
    }
    String *source_text = StrIter_substring(top, tail);

    // The LeafQuery's explicit field wins over the parser's default list.
    Vector *fields;
    if (!LeafQuery_Get_Field(leaf_query)) {
        fields = (Vector*)INCREF(ivars->fields);
    }
    else {
        fields = Vec_new(1);
        Vec_Push(fields, INCREF(LeafQuery_Get_Field(leaf_query)));
    }

    CharBuf *unescape_buf = CB_new(Str_Get_Size(source_text));
    Vector *queries = Vec_new(Vec_Get_Size(fields));
    const uint32_t num_fields = Vec_Get_Size(fields);

    for (uint32_t field_tick = 0; field_tick < num_fields; field_tick++) {
        String *field = (String*)Vec_Fetch(fields, field_tick);

        // A parser-level analyzer overrides whatever the schema holds.
        Analyzer *analyzer = ivars->analyzer;
        if (analyzer == NULL) {
            analyzer = Schema_Fetch_Analyzer(schema, field);
        }

        // Without an analyzer the text is used verbatim as a single term.
        if (analyzer == NULL) {
            Vec_Push(queries,
                     (Obj*)QParser_Make_Term_Query(self, field,
                                                   (Obj*)source_text));
            continue;
        }

        // Split the unescaped text into candidate token texts.
        String *split_source = S_unescape(self, source_text, unescape_buf);
        Vector *maybe_texts = Analyzer_Split(analyzer, split_source);
        uint32_t num_maybe_texts = Vec_Get_Size(maybe_texts);
        Vector *token_texts = Vec_new(num_maybe_texts);

        // Keep only the token texts that are not empty.
        for (uint32_t text_tick = 0; text_tick < num_maybe_texts; text_tick++) {
            String *candidate = (String*)Vec_Fetch(maybe_texts, text_tick);
            if (Str_Get_Size(candidate)) {
                Vec_Push(token_texts, INCREF(candidate));
            }
        }

        // Nothing survived; the analyzer may have removed stop words, so we
        // cannot tell whether the query should really fail to match.
        if (Vec_Get_Size(token_texts) == 0) {
            saw_empty_split = true;
        }

        // Quoted text or several tokens => phrase; a single token => term.
        if (quoted || Vec_Get_Size(token_texts) > 1) {
            Vec_Push(queries,
                     (Obj*)QParser_Make_Phrase_Query(self, field,
                                                     token_texts));
        }
        else if (Vec_Get_Size(token_texts) == 1) {
            Vec_Push(queries,
                     (Obj*)QParser_Make_Term_Query(self, field,
                                                   Vec_Fetch(token_texts, 0)));
        }

        DECREF(token_texts);
        DECREF(maybe_texts);
        DECREF(split_source);
    }

    // Collapse the per-field queries into the single query to hand back.
    Query *expanded;
    if (Vec_Get_Size(queries) == 0) {
        expanded = (Query*)NoMatchQuery_new();
        if (saw_empty_split) {
            NoMatchQuery_Set_Fails_To_Match((NoMatchQuery*)expanded, false);
        }
    }
    else if (Vec_Get_Size(queries) == 1) {
        // Take our own reference before `queries` is released below.
        expanded = (Query*)INCREF(Vec_Fetch(queries, 0));
    }
    else {
        expanded = QParser_Make_OR_Query(self, queries);
    }

    // Release everything acquired above.
    DECREF(unescape_buf);
    DECREF(queries);
    DECREF(fields);
    DECREF(source_text);
    DECREF(tail);
    DECREF(top);

    return expanded;
}
// Find a starting boundary after the current position given by the iterator. // Skip up to max_skip code points plus potential whitespace. Update the // iterator and return number of code points skipped. Return true if a // starting edge (sentence) was found. bool S_find_starting_boundary(StringIterator *top, uint32_t max_skip, uint32_t *num_skipped_ptr) { // Keep track of the first word boundary. StringIterator *word = NULL; uint32_t word_offset = 0; // Check if we're at a starting boundary already. StringIterator *iter = StrIter_Clone(top); while (true) { int32_t code_point = StrIter_Prev(iter); if (code_point == STRITER_DONE || code_point == '.') { // Skip remaining whitespace. *num_skipped_ptr = StrIter_Skip_Next_Whitespace(top); DECREF(iter); return true; } if (StrHelp_is_whitespace(code_point)) { if (word == NULL) { word = StrIter_Clone(top); } } else { break; } } // Try to start on a boundary. uint32_t num_skipped = 0; bool found_edge = false; StrIter_Assign(iter, top); for (uint32_t i = 0; i < max_skip; ++i) { int32_t code_point = StrIter_Next(iter); if (code_point == STRITER_DONE || code_point == '.') { found_edge = true; StrIter_Assign(top, iter); num_skipped = i + 1; break; } if (word == NULL && StrHelp_is_whitespace(code_point)) { word = StrIter_Clone(iter); word_offset = i + 1; } } // Try to use word boundary if no sentence boundary was found. if (!found_edge && word != NULL) { StrIter_Assign(top, word); num_skipped = word_offset; } // Skip remaining whitespace. num_skipped += StrIter_Skip_Next_Whitespace(top); *num_skipped_ptr = num_skipped; DECREF(word); DECREF(iter); return found_edge; }