/* Consume a run of bare (unquoted) text starting at `iter` and return it as
 * a TOKEN_STRING element.
 *
 * Scanning stops at the end of input, or immediately before the first
 * unescaped whitespace character, double quote, or parenthesis.  A backslash
 * causes the code point that follows it to be consumed without being
 * inspected.  On return `iter` has been assigned the stopping position.
 */
static ParserElem*
S_consume_text(StringIterator *iter) {
    StringIterator *cursor = StrIter_Clone(iter);

    for (;;) {
        int32_t cp = StrIter_Next(cursor);

        if (cp == STRITER_DONE) {
            break;
        }

        if (cp == '\\') {
            // Swallow the escaped code point, whatever it is.
            if (StrIter_Next(cursor) == STRITER_DONE) {
                break;
            }
            continue;
        }

        if (StrHelp_is_whitespace(cp)
            || cp == '"'
            || cp == '('
            || cp == ')'
           ) {
            // The delimiter is not part of the token; step back over it.
            StrIter_Recede(cursor, 1);
            break;
        }
    }

    String *token_text = StrIter_substring(iter, cursor);
    StrIter_Assign(iter, cursor);
    DECREF(cursor);

    return ParserElem_new(TOKEN_STRING, (Obj*)token_text);
}
/* Try to consume a `field:` construct starting at `iter`.
 *
 * A field name must start with a letter or underscore, continue with
 * letters, digits or underscores only, and be terminated by a colon.  The
 * colon must in turn be followed by an alphanumeric, an underscore, a
 * non-ASCII code point, a double quote, or an opening parenthesis.
 *
 * Returns a TOKEN_FIELD element holding the field name (without the colon)
 * and assigns `iter` the position just past the colon.  Returns NULL, without
 * assigning to `iter`, when the input does not match.
 */
static ParserElem*
S_consume_field(StringIterator *iter) {
    StringIterator *temp = StrIter_Clone(iter);

    // Field names constructs must start with a letter or underscore.
    // The <ctype.h> classifiers are only defined for values representable as
    // unsigned char (or EOF), so code points above 0xFF are never handed to
    // them and are simply treated as "not a letter".
    int32_t code_point = StrIter_Next(temp);
    if (code_point == STRITER_DONE) {
        DECREF(temp);
        return NULL;
    }
    if (!((code_point <= 0xFF && isalpha(code_point))
          || code_point == '_')
       ) {
        DECREF(temp);
        return NULL;
    }

    // Only alphanumerics and underscores are allowed in field names.
    while (':' != (code_point = StrIter_Next(temp))) {
        if (code_point == STRITER_DONE) {
            DECREF(temp);
            return NULL;
        }
        // Reject anything that is neither alphanumeric nor an underscore.
        // (The underscore test must be `==`: with `!=` the condition rejects
        // underscores and lets every other character through.)
        if (!((code_point <= 0xFF && isalnum(code_point))
              || code_point == '_')
           ) {
            DECREF(temp);
            return NULL;
        }
    }

    // Field name constructs must be followed by something sensible.
    int32_t lookahead = StrIter_Next(temp);
    if (lookahead == STRITER_DONE) {
        DECREF(temp);
        return NULL;
    }
    // Test for non-ASCII first so that isalnum() only ever sees 0..127.
    if (!(lookahead > 127
          || isalnum(lookahead)
          || lookahead == '_'
          || lookahead == '"'
          || lookahead == '('
         )
       ) {
        DECREF(temp);
        return NULL;
    }

    // Consume string data.
    StrIter_Recede(temp, 2); // Back up over lookahead and colon.
    String *field = StrIter_substring(iter, temp);
    StrIter_Advance(temp, 1); // Skip colon.
    StrIter_Assign(iter, temp);
    DECREF(temp);
    return ParserElem_new(TOKEN_FIELD, (Obj*)field);
}
/* Consume a double-quoted string starting at `iter` and return it as a
 * TOKEN_STRING element.
 *
 * The first code point must be a double quote; anything else is reported via
 * THROW as an internal error.  Scanning runs through the matching closing
 * quote, or to the end of input if the string is unterminated.  A backslash
 * causes the following code point to be skipped, so an escaped quote does
 * not terminate the string.  The returned text spans from the original
 * position of `iter` to the stopping point, and `iter` is assigned that
 * stopping point.
 */
static ParserElem*
S_consume_quoted_string(StringIterator *iter) {
    StringIterator *cursor = StrIter_Clone(iter);

    int32_t opening = StrIter_Next(cursor);
    if (opening != '"') {
        THROW(ERR, "Internal error: expected a quote");
    }

    for (int32_t cp = StrIter_Next(cursor);
         cp != STRITER_DONE && cp != '"';
         cp = StrIter_Next(cursor)
        ) {
        if (cp == '\\') {
            // Skip whatever was escaped.
            StrIter_Next(cursor);
        }
    }

    String *quoted = StrIter_substring(iter, cursor);
    StrIter_Assign(iter, cursor);
    DECREF(cursor);

    return ParserElem_new(TOKEN_STRING, (Obj*)quoted);
}
/* Expand a LeafQuery into concrete per-field queries.
 *
 * Returns NULL if `query` is not a LeafQuery or if its text is empty.
 * Otherwise the leaf text is trimmed of surrounding whitespace (and of its
 * enclosing double quotes, if it starts with one), and one query is built per
 * target field:
 *   - no analyzer available for the field: a term query on the raw text;
 *   - quoted text, or more than one non-empty token: a phrase query;
 *   - exactly one non-empty token: a term query.
 * The target fields are the leaf's own field if it has one, else the
 * parser's field list.
 *
 * The result is a NoMatchQuery when nothing was produced, the single query
 * when exactly one was produced, or an OR query over all of them.
 */
Query*
QParser_Expand_Leaf_IMP(QueryParser *self, Query *query) {
    QueryParserIVARS *const ivars = QParser_IVARS(self);

    // Bail out early on input we cannot process.
    if (!Query_is_a(query, LEAFQUERY)) {
        return NULL;
    }
    LeafQuery *leaf = (LeafQuery*)query;
    String *full_text = LeafQuery_Get_Text(leaf);
    if (Str_Get_Size(full_text) == 0) {
        return NULL;
    }

    Schema *schema = ivars->schema;

    // Trim whitespace.  Text that opens with a quote is always treated as a
    // phrase; the closing quote is dropped unless it is backslash-escaped.
    bool is_phrase = false;
    StringIterator *top  = Str_Top(full_text);
    StringIterator *tail = Str_Tail(full_text);
    StrIter_Skip_Next_Whitespace(top);
    StrIter_Skip_Prev_Whitespace(tail);
    if (StrIter_Starts_With_Utf8(top, "\"", 1)) {
        is_phrase = true;
        StrIter_Advance(top, 1);
        if (StrIter_Ends_With_Utf8(tail, "\"", 1)) {
            if (!StrIter_Ends_With_Utf8(tail, "\\\"", 2)) {
                StrIter_Recede(tail, 1);
            }
        }
    }
    String *source_text = StrIter_substring(top, tail);

    // Target fields: the leaf's own field, or else the parser's defaults.
    Vector *fields;
    String *leaf_field = LeafQuery_Get_Field(leaf);
    if (leaf_field != NULL) {
        fields = Vec_new(1);
        Vec_Push(fields, INCREF(leaf_field));
    }
    else {
        fields = (Vector*)INCREF(ivars->fields);
    }

    CharBuf *unescape_buf = CB_new(Str_Get_Size(source_text));
    Vector  *queries      = Vec_new(Vec_Get_Size(fields));
    bool     ambiguous    = false;
    uint32_t num_fields   = Vec_Get_Size(fields);

    for (uint32_t i = 0; i < num_fields; i++) {
        String *field = (String*)Vec_Fetch(fields, i);

        // A parser-level analyzer overrides whatever the schema specifies.
        Analyzer *analyzer = ivars->analyzer;
        if (analyzer == NULL) {
            analyzer = Schema_Fetch_Analyzer(schema, field);
        }

        if (analyzer == NULL) {
            // Without an analyzer the source text is used verbatim.
            Vec_Push(queries,
                     (Obj*)QParser_Make_Term_Query(self, field,
                                                   (Obj*)source_text));
            continue;
        }

        // Tokenize the unescaped text, keeping only non-empty tokens.
        String *split_source = S_unescape(self, source_text, unescape_buf);
        Vector *raw_tokens   = Analyzer_Split(analyzer, split_source);
        uint32_t num_raw     = Vec_Get_Size(raw_tokens);
        Vector *tokens       = Vec_new(num_raw);
        uint32_t num_tokens  = 0;

        for (uint32_t j = 0; j < num_raw; j++) {
            String *candidate = (String*)Vec_Fetch(raw_tokens, j);
            if (Str_Get_Size(candidate)) {
                Vec_Push(tokens, INCREF(candidate));
                num_tokens++;
            }
        }

        if (num_tokens == 0) {
            /* Query might include stop words.  Who knows? */
            ambiguous = true;
        }

        // Phrase query for quoted or multi-token input, term query for a
        // single token, nothing at all for unquoted input with no tokens.
        if (is_phrase || num_tokens > 1) {
            Vec_Push(queries,
                     (Obj*)QParser_Make_Phrase_Query(self, field, tokens));
        }
        else if (num_tokens == 1) {
            Vec_Push(queries,
                     (Obj*)QParser_Make_Term_Query(self, field,
                                                   Vec_Fetch(tokens, 0)));
        }

        DECREF(tokens);
        DECREF(raw_tokens);
        DECREF(split_source);
    }

    // Collapse the per-field queries into a single result.
    Query *retval;
    uint32_t num_queries = Vec_Get_Size(queries);
    if (num_queries > 1) {
        retval = QParser_Make_OR_Query(self, queries);
    }
    else if (num_queries == 1) {
        retval = (Query*)INCREF(Vec_Fetch(queries, 0));
    }
    else {
        retval = (Query*)NoMatchQuery_new();
        if (ambiguous) {
            NoMatchQuery_Set_Fails_To_Match((NoMatchQuery*)retval, false);
        }
    }

    // Clean up.
    DECREF(unescape_buf);
    DECREF(queries);
    DECREF(fields);
    DECREF(source_text);
    DECREF(tail);
    DECREF(top);

    return retval;
}
String* Highlighter_Highlight_Excerpt_IMP(Highlighter *self, VArray *spans, String *raw_excerpt, int32_t top) { int32_t hl_start = 0; int32_t hl_end = 0; StringIterator *iter = Str_Top(raw_excerpt); StringIterator *temp = Str_Top(raw_excerpt); CharBuf *buf = CB_new(Str_Get_Size(raw_excerpt) + 32); CharBuf *encode_buf = NULL; int32_t raw_excerpt_end = top + Str_Length(raw_excerpt); for (uint32_t i = 0, max = VA_Get_Size(spans); i < max; i++) { Span *span = (Span*)VA_Fetch(spans, i); int32_t offset = Span_Get_Offset(span); if (offset < top) { continue; } else if (offset >= raw_excerpt_end) { break; } else { int32_t relative_start = offset - top; int32_t relative_end = relative_start + Span_Get_Length(span); if (relative_start <= hl_end) { if (relative_end > hl_end) { hl_end = relative_end; } } else { if (hl_start < hl_end) { // Highlight previous section int32_t highlighted_len = hl_end - hl_start; StrIter_Assign(temp, iter); StrIter_Advance(iter, highlighted_len); String *to_cat = StrIter_substring(temp, iter); String *encoded = S_do_encode(self, to_cat, &encode_buf); String *hl_frag = Highlighter_Highlight(self, encoded); CB_Cat(buf, hl_frag); DECREF(hl_frag); DECREF(encoded); DECREF(to_cat); } int32_t non_highlighted_len = relative_start - hl_end; StrIter_Assign(temp, iter); StrIter_Advance(iter, non_highlighted_len); String *to_cat = StrIter_substring(temp, iter); String *encoded = S_do_encode(self, to_cat, &encode_buf); CB_Cat(buf, (String*)encoded); DECREF(encoded); DECREF(to_cat); hl_start = relative_start; hl_end = relative_end; } } } if (hl_start < hl_end) { // Highlight final section int32_t highlighted_len = hl_end - hl_start; StrIter_Assign(temp, iter); StrIter_Advance(iter, highlighted_len); String *to_cat = StrIter_substring(temp, iter); String *encoded = S_do_encode(self, to_cat, &encode_buf); String *hl_frag = Highlighter_Highlight(self, encoded); CB_Cat(buf, hl_frag); DECREF(hl_frag); DECREF(encoded); DECREF(to_cat); } // Last text, beyond last 
highlight span. if (StrIter_Has_Next(iter)) { String *to_cat = StrIter_substring(iter, NULL); String *encoded = S_do_encode(self, to_cat, &encode_buf); CB_Cat(buf, encoded); DECREF(encoded); DECREF(to_cat); } String *highlighted = CB_Yield_String(buf); DECREF(encode_buf); DECREF(buf); DECREF(temp); DECREF(iter); return highlighted; }
/* Extract an un-highlighted excerpt from `field_val`, positioned around the
 * location reported by S_hottest() for `heat_map`.
 *
 * The excerpt start is placed up to `slop` code points before that location
 * and then adjusted by S_find_starting_boundary(); the end is placed up to
 * `excerpt_length` code points later and adjusted by
 * S_find_ending_boundary().  When no starting boundary was found, an ellipsis
 * and a space are prepended; when no ending boundary was found, an ellipsis
 * is appended.
 *
 * The offset of the excerpt's first code point is stored in `*start_ptr`;
 * when the leading ellipsis and space are prepended it is reduced by two to
 * account for them.  An empty string is returned if the excerpt length comes
 * out as zero.
 */
String*
Highlighter_Raw_Excerpt_IMP(Highlighter *self, String *field_val,
                            int32_t *start_ptr, HeatMap *heat_map) {
    HighlighterIVARS *const ivars = Highlighter_IVARS(self);

    // Find start of excerpt.

    StringIterator *top = Str_Top(field_val);
    int32_t  hottest    = S_hottest(heat_map);
    int32_t  start      = 0;
    uint32_t max_skip;

    if ((uint32_t)hottest > ivars->slop) {
        // Open the window `slop` code points ahead of the hottest point.
        start    = hottest - ivars->slop;
        max_skip = ivars->slop;
        StrIter_Advance(top, start);
    }
    else {
        // The beginning of the string already falls within the window
        // around the hottest point, so begin the fragment there.
        max_skip = hottest;
    }

    uint32_t num_skipped;
    bool starts_on_boundary
        = S_find_starting_boundary(top, max_skip, &num_skipped);
    start += num_skipped;

    // Find end of excerpt.

    StringIterator *tail = StrIter_Clone(top);

    uint32_t max_len = ivars->excerpt_length;
    if (!starts_on_boundary) {
        // Reserve room for the leading ellipsis and space character.
        max_len -= 2;
    }

    uint32_t excerpt_len     = StrIter_Advance(tail, max_len);
    bool     ends_on_boundary = true;

    // Back up by at most slop code points, keeping at least max_len - slop.
    if (excerpt_len > max_len - ivars->slop) {
        max_skip = excerpt_len - (max_len - ivars->slop);
        ends_on_boundary
            = S_find_ending_boundary(tail, max_skip, &num_skipped);
        excerpt_len = (num_skipped >= excerpt_len)
                      ? 0
                      : excerpt_len - num_skipped;
    }

    // Extract excerpt.

    String *raw_excerpt;
    if (excerpt_len == 0) {
        raw_excerpt = Str_new_from_trusted_utf8("", 0);
    }
    else {
        String  *body = StrIter_substring(top, tail);
        CharBuf *buf  = CB_new(Str_Get_Size(body) + 8);

        if (!starts_on_boundary) {
            // Not starting on a sentence boundary: prepend an ellipsis and
            // shift the reported start to cover the two added code points.
            CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
            CB_Cat_Char(buf, ' ');
            start -= 2;
        }

        CB_Cat(buf, body);

        if (!ends_on_boundary) {
            // Not ending on a sentence boundary: append an ellipsis.
            CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
        }

        raw_excerpt = CB_Yield_String(buf);

        DECREF(buf);
        DECREF(body);
    }

    *start_ptr = start;

    DECREF(top);
    DECREF(tail);

    return raw_excerpt;
}