// Replace every TOKEN_STRING element in `elems` with a TOKEN_QUERY element
// wrapping a LeafQuery built from the string's text.
//
// The LeafQuery's field is the value of the TOKEN_FIELD element immediately
// preceding the string, if there is one; otherwise `default_field` is used.
// When the parser's default_occur is MUST, ParserElem_Require() is applied
// to each newly created element.  Elements of any other type are left
// untouched (TOKEN_FIELD elements are not removed here).
static void
S_compose_inner_queries(QueryParser *self, Vector *elems,
                        String *default_field) {
    const int32_t default_occur = QParser_IVARS(self)->default_occur;

    // Walk from the last element to the first.  The index is a size_t, the
    // same type the sibling routines use with Vec_Get_Size(), so the element
    // count is never narrowed.
    for (size_t i = Vec_Get_Size(elems); i--;) {
        ParserElem *elem = (ParserElem*)Vec_Fetch(elems, i);
        if (ParserElem_Get_Type(elem) != TOKEN_STRING) {
            // Only bare strings become leaf queries.
            continue;
        }

        // A field specifier only applies if it immediately precedes the
        // string.
        String *field = default_field;
        if (i > 0) {
            ParserElem *maybe_field_elem
                = (ParserElem*)Vec_Fetch(elems, i - 1);
            if (ParserElem_Get_Type(maybe_field_elem) == TOKEN_FIELD) {
                field = (String*)ParserElem_As(maybe_field_elem, STRING);
            }
        }

        String *text = (String*)ParserElem_As(elem, STRING);
        LeafQuery *query = LeafQuery_new(field, text);
        ParserElem *new_elem = ParserElem_new(TOKEN_QUERY, (Obj*)query);
        if (default_occur == MUST) {
            ParserElem_Require(new_elem);
        }
        Vec_Store(elems, i, (Obj*)new_elem);
    }
}
// Consume a run of unquoted text starting at `iter` and return it as a
// TOKEN_STRING element.
//
// The run ends at the end of the string or just before the first unescaped
// whitespace character, double quote, or parenthesis.  A backslash causes
// the code point following it to be included unconditionally.  On return,
// `iter` is positioned at the end of the consumed text.
static ParserElem*
S_consume_text(StringIterator *iter) {
    StringIterator *cursor = StrIter_Clone(iter);

    for (;;) {
        const int32_t cp = StrIter_Next(cursor);
        if (cp == STR_OOB) {
            break;
        }
        if (cp == '\\') {
            // Swallow the escaped code point, whatever it is.
            if (StrIter_Next(cursor) == STR_OOB) {
                break;
            }
            continue;
        }
        if (StrHelp_is_whitespace(cp)
            || cp == '"'
            || cp == '('
            || cp == ')'
           ) {
            // The delimiter is not part of the text; put it back.
            StrIter_Recede(cursor, 1);
            break;
        }
    }

    String *consumed = StrIter_crop(iter, cursor);
    StrIter_Assign(iter, cursor);
    DECREF(cursor);
    return ParserElem_new(TOKEN_STRING, (Obj*)consumed);
}
// Make the parentheses in `elems` balance, editing the vector in place.
//
// A TOKEN_CLOSE_PAREN with no matching open paren before it is removed.
// For every TOKEN_OPEN_PAREN still unmatched at the end, a new
// TOKEN_CLOSE_PAREN element is appended.  `self` is unused.
static void
S_balance_parens(QueryParser *self, Vector *elems) {
    UNUSED_VAR(self);

    int64_t open_count = 0;
    size_t  tick       = 0;

    // The size is re-read on every pass because excising shrinks the vector.
    while (tick < Vec_Get_Size(elems)) {
        ParserElem *elem = (ParserElem*)Vec_Fetch(elems, tick);
        const uint32_t type = ParserElem_Get_Type(elem);

        if (type == TOKEN_CLOSE_PAREN && open_count <= 0) {
            // Stray right paren: drop it and look at the same slot again.
            Vec_Excise(elems, tick, 1);
            continue;
        }

        if (type == TOKEN_OPEN_PAREN) {
            open_count++;
        }
        else if (type == TOKEN_CLOSE_PAREN) {
            open_count--;
        }
        tick++;
    }

    // Close whatever groups are still open.
    for (; open_count > 0; open_count--) {
        ParserElem *closer = ParserElem_new(TOKEN_CLOSE_PAREN, NULL);
        Vec_Push(elems, (Obj*)closer);
    }
}
// Try to consume the operator `keyword` (of length `keyword_len`) at `iter`.
//
// The keyword only counts if it is followed by whitespace, a double quote,
// a parenthesis, '+' or '-'.  A keyword at the very end of the string does
// not match.  On a match, `iter` is advanced past the keyword (but not past
// the delimiter) and a new element of type `type` with no value is
// returned.  Otherwise `iter` is left alone and NULL is returned.
static ParserElem*
S_consume_keyword(StringIterator *iter, const char *keyword,
                  size_t keyword_len, int type) {
    ParserElem *result = NULL;

    if (StrIter_Starts_With_Utf8(iter, keyword, keyword_len)) {
        StringIterator *probe = StrIter_Clone(iter);
        StrIter_Advance(probe, keyword_len);

        const int32_t next = StrIter_Next(probe);
        if (next != STR_OOB
            && (StrHelp_is_whitespace(next)
                || next == '"'
                || next == '('
                || next == ')'
                || next == '+'
                || next == '-')
           ) {
            // Leave the delimiter for the caller to tokenize.
            StrIter_Recede(probe, 1);
            StrIter_Assign(iter, probe);
            result = ParserElem_new(type, NULL);
        }

        DECREF(probe);
    }

    return result;
}
// Collapse every parenthesized group in `elems` into a single TOKEN_QUERY
// element, working from the inside out.
//
// Each pass locates the first TOKEN_CLOSE_PAREN and the nearest
// TOKEN_OPEN_PAREN before it, parses the elements between them with
// S_parse_subquery(), and replaces the group (plus a TOKEN_FIELD element
// directly in front of it, if any) with the resulting query element.  When
// the parser's default_occur is MUST, ParserElem_Require() is applied to the
// new element.  The function returns once no close paren remains.
//
// NOTE(review): this appears to rely on the parens already being balanced,
// so that an open paren always precedes the close paren found -- confirm
// against the caller.
static void
S_parse_subqueries(QueryParser *self, Vector *elems) {
    const int32_t default_occur = QParser_IVARS(self)->default_occur;

    for (;;) {
        size_t  open_tick   = SIZE_MAX;
        size_t  close_tick  = SIZE_MAX;
        String *group_field = NULL;
        const size_t num_elems = Vec_Get_Size(elems);

        // Scan up to the first close paren, remembering the last open paren
        // seen before it: that pair is the leftmost innermost group.
        for (size_t tick = 0; tick < num_elems; tick++) {
            ParserElem *elem = (ParserElem*)Vec_Fetch(elems, tick);
            const uint32_t type = ParserElem_Get_Type(elem);

            if (type == TOKEN_CLOSE_PAREN) {
                close_tick = tick;
                break;
            }
            if (type == TOKEN_OPEN_PAREN) {
                open_tick = tick;
                continue;
            }
            if (type != TOKEN_FIELD || tick + 1 >= num_elems) {
                continue;
            }

            // A field directly in front of an open paren is remembered so
            // that it can be handed down to the group being parsed.
            ParserElem *following = (ParserElem*)Vec_Fetch(elems, tick + 1);
            if (ParserElem_Get_Type(following) == TOKEN_OPEN_PAREN) {
                group_field = (String*)ParserElem_As(elem, STRING);
            }
        }

        // No close paren means no groups are left.
        if (close_tick == SIZE_MAX) {
            return;
        }

        // Parse the contents of the group, excluding the parens themselves.
        Vector *inner = Vec_Slice(elems, open_tick + 1,
                                  close_tick - open_tick - 1);
        Query *subquery = S_parse_subquery(self, inner, group_field, true);
        ParserElem *query_elem = ParserElem_new(TOKEN_QUERY, (Obj*)subquery);
        if (default_occur == MUST) {
            ParserElem_Require(query_elem);
        }
        DECREF(inner);

        // Widen the span to be replaced so that it also covers a field
        // specifier sitting immediately before the open paren.
        size_t first = open_tick;
        if (first > 0) {
            ParserElem *before = (ParserElem*)Vec_Fetch(elems, first - 1);
            if (ParserElem_Get_Type(before) == TOKEN_FIELD) {
                first--;
            }
        }

        // Drop everything after the first slot of the span, then overwrite
        // that slot with the subquery element.
        Vec_Excise(elems, first + 1, close_tick - first);
        Vec_Store(elems, first, (Obj*)query_elem);
    }
}
// Return nonzero if `code_point` is an ASCII letter.
//
// The <ctype.h> classifiers have undefined behavior for arguments that are
// neither EOF nor representable as unsigned char, so arbitrary Unicode code
// points must be range-checked before being handed to them.
static int
S_is_ascii_alpha(int32_t code_point) {
    return code_point >= 0 && code_point < 128 && isalpha(code_point);
}

// Return nonzero if `code_point` is an ASCII letter or digit.  See
// S_is_ascii_alpha() for why the range check is required.
static int
S_is_ascii_alnum(int32_t code_point) {
    return code_point >= 0 && code_point < 128 && isalnum(code_point);
}

// Try to consume a field specifier of the form `name:` at `iter`.
//
// `name` must start with an ASCII letter or underscore and continue with
// ASCII alphanumerics or underscores.  The colon must be followed by an
// ASCII alphanumeric, an underscore, any code point above 127, a double
// quote, or an open paren.
//
// On a match, `iter` is advanced past the colon and a TOKEN_FIELD element
// holding the field name (without the colon) is returned.  Otherwise `iter`
// is left alone and NULL is returned.
static ParserElem*
S_consume_field(StringIterator *iter) {
    StringIterator *temp = StrIter_Clone(iter);

    // Field names constructs must start with a letter or underscore.
    int32_t code_point = StrIter_Next(temp);
    if (code_point == STR_OOB
        || !(S_is_ascii_alpha(code_point) || code_point == '_')
       ) {
        DECREF(temp);
        return NULL;
    }

    // Only alphanumerics and underscores are allowed in field names.
    while (':' != (code_point = StrIter_Next(temp))) {
        if (code_point == STR_OOB
            || !(S_is_ascii_alnum(code_point) || code_point == '_')
           ) {
            DECREF(temp);
            return NULL;
        }
    }

    // Field name constructs must be followed by something sensible.
    int32_t lookahead = StrIter_Next(temp);
    if (lookahead == STR_OOB
        || !(lookahead > 127
             || S_is_ascii_alnum(lookahead)
             || lookahead == '_'
             || lookahead == '"'
             || lookahead == '('
            )
       ) {
        DECREF(temp);
        return NULL;
    }

    // Consume string data.
    StrIter_Recede(temp, 2); // Back up over lookahead and colon.
    String *field = StrIter_crop(iter, temp);
    StrIter_Advance(temp, 1); // Skip colon.
    StrIter_Assign(iter, temp);
    DECREF(temp);
    return ParserElem_new(TOKEN_FIELD, (Obj*)field);
}
// Consume a double-quoted string starting at `iter` and return it as a
// TOKEN_STRING element.
//
// `iter` must be positioned on the opening quote; an error is thrown
// otherwise.  The token text includes the opening quote and, when present,
// the closing quote -- an unterminated string runs to the end of the input.
// A backslash causes the code point after it to be skipped, so an escaped
// quote does not end the string.  On return, `iter` is positioned at the
// end of the consumed text.
static ParserElem*
S_consume_quoted_string(StringIterator *iter) {
    StringIterator *cursor = StrIter_Clone(iter);

    if (StrIter_Next(cursor) != '"') {
        THROW(ERR, "Internal error: expected a quote");
    }

    int32_t cp;
    while ((cp = StrIter_Next(cursor)) != STR_OOB && cp != '"') {
        if (cp == '\\') {
            // Step over the escaped code point.
            StrIter_Next(cursor);
        }
    }

    String *quoted = StrIter_crop(iter, cursor);
    StrIter_Assign(iter, cursor);
    DECREF(cursor);
    return ParserElem_new(TOKEN_STRING, (Obj*)quoted);
}
// Break `query_string` into a Vector of ParserElem tokens.
//
// Returns a new Vector, which is empty when `query_string` is NULL.
// Whitespace between tokens is skipped.  When the lexer's heed_colons
// setting is on, a `field:` prefix is emitted as its own token ahead of the
// token that follows it.  Parentheses become paren tokens; "AND", "OR" and
// "NOT" become operator tokens when S_consume_keyword() accepts them and are
// lexed as plain text otherwise; a double quote starts a quoted string;
// anything else is lexed as plain text.
//
// '+' and '-' are operator tokens only when more input follows and the next
// character is not whitespace; otherwise they become literal "+" / "-"
// string tokens.
Vector*
QueryLexer_Tokenize_IMP(QueryLexer *self, String *query_string) {
    QueryLexerIVARS *const ivars = QueryLexer_IVARS(self);
    Vector *tokens = Vec_new(0);

    if (query_string == NULL) {
        return tokens;
    }

    StringIterator *iter = Str_Top(query_string);

    while (StrIter_Has_Next(iter)) {
        // Fast-forward past whitespace, then re-check for end of input.
        if (StrIter_Skip_Whitespace(iter)) {
            continue;
        }

        if (ivars->heed_colons) {
            ParserElem *field_token = S_consume_field(iter);
            if (field_token != NULL) {
                Vec_Push(tokens, (Obj*)field_token);
            }
        }

        ParserElem *token = NULL;
        const int32_t code_point = StrIter_Next(iter);

        switch (code_point) {
            case '(':
                token = ParserElem_new(TOKEN_OPEN_PAREN, NULL);
                break;

            case ')':
                token = ParserElem_new(TOKEN_CLOSE_PAREN, NULL);
                break;

            case '+':
                // Note that the whitespace test also consumes any
                // whitespace it finds.
                if (StrIter_Has_Next(iter)
                    && !StrIter_Skip_Whitespace(iter)
                   ) {
                    token = ParserElem_new(TOKEN_PLUS, NULL);
                }
                else {
                    token = ParserElem_new(TOKEN_STRING,
                                           (Obj*)Str_newf("+"));
                }
                break;

            case '-':
                if (StrIter_Has_Next(iter)
                    && !StrIter_Skip_Whitespace(iter)
                   ) {
                    token = ParserElem_new(TOKEN_MINUS, NULL);
                }
                else {
                    token = ParserElem_new(TOKEN_STRING,
                                           (Obj*)Str_newf("-"));
                }
                break;

            case '"':
                StrIter_Recede(iter, 1);
                token = S_consume_quoted_string(iter);
                break;

            case 'A':
            case 'N':
            case 'O':
                // Possible boolean operator; fall back to plain text when
                // the keyword does not match.
                StrIter_Recede(iter, 1);
                if (code_point == 'A') {
                    token = S_consume_keyword(iter, "AND", 3, TOKEN_AND);
                }
                else if (code_point == 'N') {
                    token = S_consume_keyword(iter, "NOT", 3, TOKEN_NOT);
                }
                else {
                    token = S_consume_keyword(iter, "OR", 2, TOKEN_OR);
                }
                if (token == NULL) {
                    token = S_consume_text(iter);
                }
                break;

            default:
                StrIter_Recede(iter, 1);
                token = S_consume_text(iter);
                break;
        }

        Vec_Push(tokens, (Obj*)token);
    }

    DECREF(iter);
    return tokens;
}