/* Check the relative ordering of the proximity boosts that
 * HeatMap_Calc_Proximity_Boost reports for one anchor span paired with
 * partners at increasing distances.
 *
 * The HeatMap is built over an empty span array with 133 as its second
 * argument (presumably the proximity window -- confirm against HeatMap_new).
 */
static void
test_calc_proximity_boost(TestBatchRunner *runner) {
    VArray  *no_spans = VA_new(0);
    HeatMap *map      = HeatMap_new(no_spans, 133);

    // The anchor, followed by partners that abut it, sit inside it, sit some
    // distance away, and sit farther away still.
    Span *anchor   = Span_new(  0, 10, 1.0f);
    Span *abutting = Span_new( 10, 10, 1.0f);
    Span *inside   = Span_new(  5,  4, 1.0f);
    Span *nearby   = Span_new(100, 10, 1.0f);
    Span *distant  = Span_new(150, 10, 1.0f);

    float boost_abutting = HeatMap_Calc_Proximity_Boost(map, anchor, abutting);
    float boost_inside   = HeatMap_Calc_Proximity_Boost(map, anchor, inside);
    float boost_nearby   = HeatMap_Calc_Proximity_Boost(map, anchor, nearby);
    float boost_distant  = HeatMap_Calc_Proximity_Boost(map, anchor, distant);

    TEST_TRUE(runner, boost_abutting == boost_inside,
              "overlapping and abutting produce the same proximity boost");
    TEST_TRUE(runner, boost_abutting > boost_nearby,
              "closer is better");
    TEST_TRUE(runner, boost_distant == 0.0,
              "distance outside of window yields no prox boost");

    DECREF(anchor);
    DECREF(abutting);
    DECREF(inside);
    DECREF(nearby);
    DECREF(distant);
    DECREF(map);
    DECREF(no_spans);
}
/* Exercise Highlighter_Highlight_Excerpt with four span layouts: a single
 * span, a span running off the end of the excerpt, a span that only lines up
 * once the excerpt offset is applied, and a set of overlapping spans.
 *
 * The expected strings spell PHI as the numeric character reference
 * "&#934;" (six bytes).  The byte counts handed to Str_Equals_Utf8 (23, 37
 * and 46) only add up with that spelling; with a raw two-byte PHI in the
 * literal the comparison would read past the end of the literal.
 * NOTE(review): this presumes the Highlighter encodes non-ASCII code points
 * as numeric character references -- confirm against the Highlighter
 * encoding routine.
 */
static void
test_Highlight_Excerpt(TestBatchRunner *runner, Searcher *searcher,
                       Obj *query) {
    String *content = (String*)SSTR_WRAP_UTF8("content", 7);
    Highlighter *highlighter = Highlighter_new(searcher, query, content, 3);
    String *highlighted;
    String *raw_excerpt;
    Vector *spans;

    // One span covering the "b" in "a b c".
    spans = Vec_new(1);
    Vec_Push(spans, (Obj*)Span_new(2, 1, 0.0f));
    raw_excerpt = (String *)SSTR_WRAP_UTF8("a b c", 5);
    highlighted = Highlighter_Highlight_Excerpt(highlighter, spans,
                                                raw_excerpt, 0);
    TEST_TRUE(runner,
              Str_Equals_Utf8(highlighted, "a <strong>b</strong> c", 22),
              "basic Highlight_Excerpt");
    DECREF(highlighted);
    DECREF(spans);

    // The second span starts beyond the end of the one-character excerpt.
    spans = Vec_new(2);
    Vec_Push(spans, (Obj*)Span_new(0, 1, 1.0f));
    Vec_Push(spans, (Obj*)Span_new(10, 10, 1.0f));
    raw_excerpt = (String *)SSTR_WRAP_UTF8(PHI, 2);
    highlighted = Highlighter_Highlight_Excerpt(highlighter, spans,
                                                raw_excerpt, 0);
    TEST_TRUE(runner,
              Str_Equals_Utf8(highlighted, "<strong>&#934;</strong>", 23),
              "don't surround spans off end of raw excerpt.");
    DECREF(highlighted);
    DECREF(spans);

    // Span offset 3 combined with an excerpt offset of 1.
    spans = Vec_new(1);
    Vec_Push(spans, (Obj*)Span_new(3, 1, 1.0f));
    raw_excerpt = (String *)SSTR_WRAP_UTF8(PHI " " PHI " " PHI, 8);
    highlighted = Highlighter_Highlight_Excerpt(highlighter, spans,
                                                raw_excerpt, 1);
    TEST_TRUE(runner,
              Str_Equals_Utf8(highlighted,
                              "&#934; <strong>&#934;</strong> &#934;", 37),
              "Highlight_Excerpt pays attention to offset");
    DECREF(highlighted);
    DECREF(spans);

    // Four overlapping spans which together cover "Oook. Urk. Ick.".
    spans = Vec_new(4);
    Vec_Push(spans, (Obj*)Span_new(2, 10, 1.0f));
    Vec_Push(spans, (Obj*)Span_new(2,  4, 1.0f));
    Vec_Push(spans, (Obj*)Span_new(8,  9, 1.0f));
    Vec_Push(spans, (Obj*)Span_new(8,  4, 1.0f));
    raw_excerpt = (String *)SSTR_WRAP_UTF8(PHI " Oook. Urk. Ick. " PHI, 21);
    highlighted = Highlighter_Highlight_Excerpt(highlighter, spans,
                                                raw_excerpt, 0);
    TEST_TRUE(runner,
              Str_Equals_Utf8(highlighted,
                              "&#934; <strong>Oook. Urk. Ick.</strong> &#934;",
                              46),
              "Highlight_Excerpt works with overlapping spans");
    DECREF(highlighted);
    DECREF(spans);

    DECREF(highlighter);
}
// Create all the spans needed by HeatMap_Flatten_Spans, based on the source // offsets and lengths... but leave the scores at 0. static Vector* S_flattened_but_empty_spans(Vector *spans) { const size_t num_spans = Vec_Get_Size(spans); int32_t *bounds = (int32_t*)MALLOCATE((num_spans * 2) * sizeof(int32_t)); // Assemble a list of all unique start/end boundaries. for (size_t i = 0; i < num_spans; i++) { Span *span = (Span*)Vec_Fetch(spans, i); bounds[i] = Span_Get_Offset(span); bounds[i + num_spans] = Span_Get_Offset(span) + Span_Get_Length(span); } qsort(bounds, num_spans * 2, sizeof(int32_t), S_compare_i32); size_t num_bounds = 0; int32_t last = INT32_MAX; for (size_t i = 0; i < num_spans * 2; i++) { if (bounds[i] != last) { bounds[num_bounds++] = bounds[i]; last = bounds[i]; } } // Create one Span for each zone between two bounds. Vector *flattened = Vec_new(num_bounds - 1); for (size_t i = 0; i < num_bounds - 1; i++) { int32_t start = bounds[i]; int32_t length = bounds[i + 1] - start; Vec_Push(flattened, (Obj*)Span_new(start, length, 0.0f)); } FREEMEM(bounds); return flattened; }
/* Build proximity-boost spans for pairs of spans.
 *
 * Each span is paired with the spans that follow it in `spans`.  For every
 * pair whose proximity boost is non-zero, a Span is emitted which starts at
 * the earlier span's offset, reaches to the end of the later span, and
 * carries the boost as its weight.  Pairing for a given span stops at the
 * first partner that yields a zero boost (presumably relying on `spans`
 * being ordered by offset -- confirm with callers).
 *
 * Returns a new Vector, which is empty when fewer than two spans are given.
 */
Vector*
HeatMap_Generate_Proximity_Boosts_IMP(HeatMap *self, Vector *spans) {
    Vector *boosts = Vec_new(0);
    const size_t num_spans = Vec_Get_Size(spans);

    // A lone span (or none at all) has nothing to be close to.
    if (num_spans <= 1) {
        return boosts;
    }

    const size_t last = num_spans - 1;
    for (size_t lead = 0; lead < last; lead++) {
        Span *earlier = (Span*)Vec_Fetch(spans, lead);

        for (size_t trail = lead + 1; trail <= last; trail++) {
            Span *later = (Span*)Vec_Fetch(spans, trail);
            float boost = HeatMap_Calc_Proximity_Boost(self, earlier, later);

            if (boost == 0) {
                break;
            }

            // Stretch from the start of the earlier span to the end of the
            // later one.
            int32_t earlier_offset = Span_Get_Offset(earlier);
            int32_t length = Span_Get_Offset(later) - earlier_offset
                             + Span_Get_Length(later);
            Vec_Push(boosts,
                     (Obj*)Span_new(earlier_offset, length, boost));
        }
    }

    return boosts;
}
// Create all the spans needed by HeatMap_Flatten_Spans, based on the source // offsets and lengths... but leave the scores at 0. static VArray* S_flattened_but_empty_spans(VArray *spans) { const uint32_t num_spans = VA_Get_Size(spans); int32_t *bounds = (int32_t*)MALLOCATE((num_spans * 2) * sizeof(int32_t)); // Assemble a list of all unique start/end boundaries. for (uint32_t i = 0; i < num_spans; i++) { Span *span = (Span*)VA_Fetch(spans, i); bounds[i] = span->offset; bounds[i + num_spans] = span->offset + span->length; } Sort_quicksort(bounds, num_spans * 2, sizeof(uint32_t), S_compare_i32, NULL); uint32_t num_bounds = 0; int32_t last = I32_MAX; for (uint32_t i = 0; i < num_spans * 2; i++) { if (bounds[i] != last) { bounds[num_bounds++] = bounds[i]; last = bounds[i]; } } // Create one Span for each zone between two bounds. VArray *flattened = VA_new(num_bounds - 1); for (uint32_t i = 0; i < num_bounds - 1; i++) { int32_t start = bounds[i]; int32_t length = bounds[i + 1] - start; VA_Push(flattened, (Obj*)Span_new(start, length, 0.0f)); } FREEMEM(bounds); return flattened; }
/* Build proximity-boost spans for pairs of spans.
 *
 * Each span is paired with the spans that follow it in `spans`.  For every
 * pair whose proximity boost is non-zero, a Span is emitted which starts at
 * the earlier span's offset, reaches to the end of the later span, and
 * carries the boost as its weight.  Pairing for a given span stops at the
 * first partner that yields a zero boost (presumably relying on `spans`
 * being ordered by offset -- confirm with callers).
 *
 * Returns a new VArray, which is empty when fewer than two spans are given.
 */
VArray*
HeatMap_generate_proximity_boosts(HeatMap *self, VArray *spans) {
    VArray *boosts = VA_new(0);
    const uint32_t num_spans = VA_Get_Size(spans);

    // A lone span (or none at all) has nothing to be close to.
    if (num_spans <= 1) {
        return boosts;
    }

    const uint32_t last = num_spans - 1;
    for (uint32_t lead = 0; lead < last; lead++) {
        Span *earlier = (Span*)VA_Fetch(spans, lead);

        for (uint32_t trail = lead + 1; trail <= last; trail++) {
            Span *later = (Span*)VA_Fetch(spans, trail);
            float boost = HeatMap_Calc_Proximity_Boost(self, earlier, later);

            if (boost == 0) {
                break;
            }

            // Stretch from the start of the earlier span to the end of the
            // later one.
            int32_t length = (later->offset - earlier->offset)
                             + later->length;
            VA_Push(boosts,
                    (Obj*)Span_new(earlier->offset, length, boost));
        }
    }

    return boosts;
}
/* Produce one highlight Span per occurrence of the parent query's term.
 *
 * Spans are only produced when `field` equals the parent TermQuery's field
 * and `doc_vec` supplies a TermVector for the term.  Each Span runs from an
 * occurrence's start offset to its end offset and is weighted with the
 * value of TermCompiler_Get_Weight.  `searcher` is not used.
 *
 * Returns a new Vector, which may be empty.
 */
Vector*
TermCompiler_Highlight_Spans_IMP(TermCompiler *self, Searcher *searcher,
                                 DocVector *doc_vec, String *field) {
    TermCompilerIVARS *const ivars = TermCompiler_IVARS(self);
    TermQueryIVARS *const parent_ivars
        = TermQuery_IVARS((TermQuery*)ivars->parent);
    Vector *spans = Vec_new(0);
    UNUSED_VAR(searcher);

    if (Str_Equals(parent_ivars->field, (Obj*)field)) {
        TermVector *term_vector
            = DocVec_Term_Vector(doc_vec, field,
                                 (String*)parent_ivars->term);

        if (term_vector) {
            I32Array *starts = TV_Get_Start_Offsets(term_vector);
            I32Array *ends   = TV_Get_End_Offsets(term_vector);
            const size_t num_occurrences = I32Arr_Get_Size(starts);

            // Add all starts and ends.
            for (size_t tick = 0; tick < num_occurrences; tick++) {
                const int32_t start  = I32Arr_Get(starts, tick);
                const int32_t length = I32Arr_Get(ends, tick) - start;
                Vec_Push(spans,
                         (Obj*)Span_new(start, length,
                                        TermCompiler_Get_Weight(self)));
            }

            DECREF(term_vector);
        }
    }

    return spans;
}
/* Produce one highlight Span per complete occurrence of the phrase.
 *
 * A bit vector of candidate phrase start positions is seeded from the first
 * term's positions.  For each later term, its positions are shifted back by
 * the term's index within the phrase and AND-ed in.  If every term supplied
 * a TermVector, each surviving start position is turned into a Span running
 * from the first term's start offset to the last term's end offset, weighted
 * with the value of PhraseCompiler_Get_Weight.  `searcher` is not used.
 *
 * Returns a new VArray.  It is empty when the phrase has no terms, when
 * `field` differs from the parent query's field, or when any term has no
 * TermVector.
 */
VArray*
PhraseCompiler_highlight_spans(PhraseCompiler *self, Searcher *searcher,
                               DocVector *doc_vec, const CharBuf *field) {
    PhraseQuery *const parent = (PhraseQuery*)self->parent;
    VArray *const terms = parent->terms;
    VArray *const spans = VA_new(0);
    const uint32_t num_terms = VA_Get_Size(terms);
    UNUSED_VAR(searcher);

    // Bail if no terms or field doesn't match.
    if (num_terms == 0 || !CB_Equals(field, (Obj*)parent->field)) {
        return spans;
    }

    VArray    *term_vectors = VA_new(num_terms);
    BitVector *candidates   = BitVec_new(0);
    BitVector *shifted      = BitVec_new(0);

    for (uint32_t term_tick = 0; term_tick < num_terms; term_tick++) {
        Obj *term = VA_Fetch(terms, term_tick);
        TermVector *term_vector
            = DocVec_Term_Vector(doc_vec, field, (CharBuf*)term);

        // Bail if any term is missing.
        if (!term_vector) {
            break;
        }
        VA_Push(term_vectors, (Obj*)term_vector);

        I32Array *positions = TV_Get_Positions(term_vector);
        uint32_t  remaining = I32Arr_Get_Size(positions);

        if (term_tick == 0) {
            // Set initial positions from first term, walking from the last
            // position back to the first.
            while (remaining > 0) {
                BitVec_Set(candidates, I32Arr_Get(positions, remaining - 1));
                remaining--;
            }
        }
        else {
            // Filter positions using logical "and": shift this term's
            // positions back by its index in the phrase so they line up
            // with the phrase start.
            BitVec_Clear_All(shifted);
            while (remaining > 0) {
                int32_t pos
                    = I32Arr_Get(positions, remaining - 1) - term_tick;
                if (pos >= 0) {
                    BitVec_Set(shifted, pos);
                }
                remaining--;
            }
            BitVec_And(candidates, shifted);
        }
    }

    // Proceed only if all terms are present.
    const uint32_t num_tvs = VA_Get_Size(term_vectors);
    if (num_tvs == num_terms) {
        TermVector *first_tv = (TermVector*)VA_Fetch(term_vectors, 0);
        TermVector *last_tv
            = (TermVector*)VA_Fetch(term_vectors, num_tvs - 1);
        I32Array *first_positions = TV_Get_Positions(first_tv);
        I32Array *last_positions  = TV_Get_Positions(last_tv);
        I32Array *first_starts    = TV_Get_Start_Offsets(first_tv);
        I32Array *last_ends       = TV_Get_End_Offsets(last_tv);
        uint32_t  terms_max       = num_terms - 1;
        I32Array *valid_posits    = BitVec_To_Array(candidates);
        uint32_t  num_valid       = I32Arr_Get_Size(valid_posits);
        float     weight          = PhraseCompiler_Get_Weight(self);

        // Cursors into the first and last term's position arrays.  They
        // carry over from one valid position to the next, so each array is
        // walked at most once overall.
        uint32_t first_cursor = 0;
        uint32_t last_cursor  = 0;

        // Add only those starts/ends that belong to a valid position.
        for (uint32_t tick = 0; tick < num_valid; tick++) {
            int32_t phrase_start_posit = I32Arr_Get(valid_posits, tick);
            int32_t phrase_end_posit   = phrase_start_posit + terms_max;
            int32_t start_offset = 0;
            int32_t end_offset   = 0;

            uint32_t limit = I32Arr_Get_Size(first_positions);
            while (first_cursor < limit) {
                if (I32Arr_Get(first_positions, first_cursor)
                    == phrase_start_posit) {
                    start_offset = I32Arr_Get(first_starts, first_cursor);
                    break;
                }
                first_cursor++;
            }

            limit = I32Arr_Get_Size(last_positions);
            while (last_cursor < limit) {
                if (I32Arr_Get(last_positions, last_cursor)
                    == phrase_end_posit) {
                    end_offset = I32Arr_Get(last_ends, last_cursor);
                    break;
                }
                last_cursor++;
            }

            VA_Push(spans,
                    (Obj*)Span_new(start_offset, end_offset - start_offset,
                                   weight));

            // Step past the entries just consumed.
            first_cursor++;
            last_cursor++;
        }

        DECREF(valid_posits);
    }

    DECREF(shifted);
    DECREF(candidates);
    DECREF(term_vectors);
    return spans;
}
/* Run HeatMap_Flatten_Spans and HeatMap_Generate_Proximity_Boosts over five
 * span layouts.  Flattened output is compared span-for-span against a
 * hand-built expectation; proximity boosts are compared exactly in the first
 * scenario and by count in the others.
 *
 * The `input` and `expected` arrays are reused across scenarios and cleared
 * in between.
 */
static void
test_flatten_spans(TestBatchRunner *runner) {
    VArray  *input    = VA_new(8);
    VArray  *expected = VA_new(8);
    HeatMap *map      = HeatMap_new(input, 133);
    VArray  *flat;
    VArray  *prox;

    /* Scenario 1: two overlapping spans. */
    VA_Push(input, (Obj*)Span_new(10, 10, 1.0f));
    VA_Push(input, (Obj*)Span_new(16, 14, 2.0f));
    flat = HeatMap_Flatten_Spans(map, input);
    VA_Push(expected, (Obj*)Span_new(10,  6, 1.0f));
    VA_Push(expected, (Obj*)Span_new(16,  4, 3.0f));
    VA_Push(expected, (Obj*)Span_new(20, 10, 2.0f));
    TEST_TRUE(runner, VA_Equals(flat, (Obj*)expected),
              "flatten two overlapping spans");
    VA_Clear(expected);
    prox = HeatMap_Generate_Proximity_Boosts(map, input);
    VA_Push(expected, (Obj*)Span_new(10, 20, 3.0f));
    TEST_TRUE(runner, VA_Equals(prox, (Obj*)expected),
              "prox boosts for overlap");
    VA_Clear(expected);
    VA_Clear(input);
    DECREF(prox);
    DECREF(flat);

    /* Scenario 2: two overlapping spans, a hole, then a third span. */
    VA_Push(input, (Obj*)Span_new(10, 10, 1.0f));
    VA_Push(input, (Obj*)Span_new(16, 14, 2.0f));
    VA_Push(input, (Obj*)Span_new(50,  1, 1.0f));
    flat = HeatMap_Flatten_Spans(map, input);
    VA_Push(expected, (Obj*)Span_new(10,  6, 1.0f));
    VA_Push(expected, (Obj*)Span_new(16,  4, 3.0f));
    VA_Push(expected, (Obj*)Span_new(20, 10, 2.0f));
    VA_Push(expected, (Obj*)Span_new(50,  1, 1.0f));
    TEST_TRUE(runner, VA_Equals(flat, (Obj*)expected),
              "flatten two overlapping spans, leave hole, then third span");
    VA_Clear(expected);
    prox = HeatMap_Generate_Proximity_Boosts(map, input);
    TEST_TRUE(runner, VA_Get_Size(prox) == 2 + 1,
              "boosts generated for each unique pair, since all were in range");
    VA_Clear(input);
    DECREF(prox);
    DECREF(flat);

    /* Scenario 3: three mutually overlapping spans. */
    VA_Push(input, (Obj*)Span_new(10, 10, 1.0f));
    VA_Push(input, (Obj*)Span_new(14,  4, 4.0f));
    VA_Push(input, (Obj*)Span_new(16, 14, 2.0f));
    flat = HeatMap_Flatten_Spans(map, input);
    VA_Push(expected, (Obj*)Span_new(10,  4, 1.0f));
    VA_Push(expected, (Obj*)Span_new(14,  2, 5.0f));
    VA_Push(expected, (Obj*)Span_new(16,  2, 7.0f));
    VA_Push(expected, (Obj*)Span_new(18,  2, 3.0f));
    VA_Push(expected, (Obj*)Span_new(20, 10, 2.0f));
    TEST_TRUE(runner, VA_Equals(flat, (Obj*)expected),
              "flatten three overlapping spans");
    VA_Clear(expected);
    prox = HeatMap_Generate_Proximity_Boosts(map, input);
    TEST_TRUE(runner, VA_Get_Size(prox) == 2 + 1,
              "boosts generated for each unique pair, since all were in range");
    VA_Clear(input);
    DECREF(prox);
    DECREF(flat);

    /* Scenario 4: four spans, the middle two covering the same range. */
    VA_Push(input, (Obj*)Span_new(10, 10,  1.0f));
    VA_Push(input, (Obj*)Span_new(16, 14,  4.0f));
    VA_Push(input, (Obj*)Span_new(16, 14,  2.0f));
    VA_Push(input, (Obj*)Span_new(30, 10, 10.0f));
    flat = HeatMap_Flatten_Spans(map, input);
    VA_Push(expected, (Obj*)Span_new(10,  6,  1.0f));
    VA_Push(expected, (Obj*)Span_new(16,  4,  7.0f));
    VA_Push(expected, (Obj*)Span_new(20, 10,  6.0f));
    VA_Push(expected, (Obj*)Span_new(30, 10, 10.0f));
    TEST_TRUE(runner, VA_Equals(flat, (Obj*)expected),
              "flatten 4 spans, middle two have identical range");
    VA_Clear(expected);
    prox = HeatMap_Generate_Proximity_Boosts(map, input);
    TEST_TRUE(runner, VA_Get_Size(prox) == 3 + 2 + 1,
              "boosts generated for each unique pair, since all were in range");
    VA_Clear(input);
    DECREF(prox);
    DECREF(flat);

    /* Scenario 5: middle two share a start but not an end, and the final
     * span sits far away from the rest. */
    VA_Push(input, (Obj*)Span_new( 10, 10,  1.0f));
    VA_Push(input, (Obj*)Span_new( 16,  4,  4.0f));
    VA_Push(input, (Obj*)Span_new( 16, 14,  2.0f));
    VA_Push(input, (Obj*)Span_new(230, 10, 10.0f));
    flat = HeatMap_Flatten_Spans(map, input);
    VA_Push(expected, (Obj*)Span_new( 10,  6,  1.0f));
    VA_Push(expected, (Obj*)Span_new( 16,  4,  7.0f));
    VA_Push(expected, (Obj*)Span_new( 20, 10,  2.0f));
    VA_Push(expected, (Obj*)Span_new(230, 10, 10.0f));
    TEST_TRUE(runner, VA_Equals(flat, (Obj*)expected),
              "flatten 4 spans, middle two have identical starts but different ends");
    VA_Clear(expected);
    prox = HeatMap_Generate_Proximity_Boosts(map, input);
    TEST_TRUE(runner, VA_Get_Size(prox) == 2 + 1,
              "boosts not generated for out of range span");
    VA_Clear(input);
    DECREF(prox);
    DECREF(flat);

    DECREF(map);
    DECREF(expected);
    DECREF(input);
}
/* Exercise Highlighter_Raw_Excerpt: excerpt taken from the top of the field,
 * from the middle, with a leading ellipsis, with a trailing ellipsis, and
 * with "words" longer than the excerpt.
 *
 * The Highlighter is created with 6 as its final argument (presumably the
 * excerpt length -- confirm against Highlighter_new).
 *
 * The multi-sentence field values separate sentences with TWO spaces.  The
 * byte counts given to SSTR_WRAP_UTF8 (18 and 17), the span offsets, and the
 * expected `top` values of 6 all depend on that spacing, and each literal
 * must stay on a single source line.
 */
static void
test_Raw_Excerpt(TestBatchRunner *runner, Searcher *searcher, Obj *query) {
    String *content = (String*)SSTR_WRAP_UTF8("content", 7);
    Highlighter *highlighter = Highlighter_new(searcher, query, content, 6);
    int32_t top;
    String *raw_excerpt;

    // 4 + 2 + 4 + 2 + 4 + 2 = 18 bytes; "Urk." begins at offset 6.
    String *field_val = (String *)SSTR_WRAP_UTF8("Ook.  Urk.  Ick.  ", 18);
    Vector *spans = Vec_new(1);
    Vec_Push(spans, (Obj*)Span_new(0, 18, 1.0f));
    HeatMap *heat_map = HeatMap_new(spans, 133);
    DECREF(spans);
    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
                                          heat_map);
    // NOTE(review): "%s" requires NUL-terminated data; confirm that
    // Str_Get_Ptr8 guarantees termination for this string.
    TEST_TRUE(runner,
              Str_Equals_Utf8(raw_excerpt, "Ook.", 4),
              "Raw_Excerpt at top %s", Str_Get_Ptr8(raw_excerpt));
    TEST_TRUE(runner,
              top == 0,
              "top is 0");
    DECREF(raw_excerpt);
    DECREF(heat_map);

    spans = Vec_new(1);
    Vec_Push(spans, (Obj*)Span_new(6, 12, 1.0f));
    heat_map = HeatMap_new(spans, 133);
    DECREF(spans);
    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
                                          heat_map);
    TEST_TRUE(runner,
              Str_Equals_Utf8(raw_excerpt, "Urk.", 4),
              "Raw_Excerpt in middle, with 2 bounds");
    TEST_TRUE(runner,
              top == 6,
              "top in the middle modified by Raw_Excerpt");
    DECREF(raw_excerpt);
    DECREF(heat_map);

    field_val = (String *)SSTR_WRAP_UTF8("Ook urk ick i.", 14);
    spans = Vec_new(1);
    Vec_Push(spans, (Obj*)Span_new(12, 1, 1.0f));
    heat_map = HeatMap_new(spans, 133);
    DECREF(spans);
    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
                                          heat_map);
    TEST_TRUE(runner,
              Str_Equals_Utf8(raw_excerpt, ELLIPSIS " i.", 6),
              "Ellipsis at top");
    TEST_TRUE(runner,
              top == 10,
              "top correct when leading ellipsis inserted");
    DECREF(heat_map);
    DECREF(raw_excerpt);

    // 4 + 2 + 11 = 17 bytes; "Iz" begins at offset 6.
    field_val = (String *)SSTR_WRAP_UTF8("Urk.  Iz no good.", 17);
    spans = Vec_new(1);
    Vec_Push(spans, (Obj*)Span_new(6, 2, 1.0f));
    heat_map = HeatMap_new(spans, 133);
    DECREF(spans);
    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
                                          heat_map);
    TEST_TRUE(runner,
              Str_Equals_Utf8(raw_excerpt, "Iz no" ELLIPSIS, 8),
              "Ellipsis at end");
    TEST_TRUE(runner,
              top == 6,
              "top trimmed");
    DECREF(heat_map);
    DECREF(raw_excerpt);

    // Words longer than excerpt len

    field_val = (String *)SSTR_WRAP_UTF8("abc/def/ghi/jkl/mno", 19);

    spans = Vec_new(1);
    Vec_Push(spans, (Obj*)Span_new(0, 3, 1.0f));
    heat_map = HeatMap_new(spans, 133);
    DECREF(spans);
    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
                                          heat_map);
    TEST_TRUE(runner,
              Str_Equals_Utf8(raw_excerpt, "abc/d" ELLIPSIS, 8),
              "Long word at top");
    DECREF(heat_map);
    DECREF(raw_excerpt);

    spans = Vec_new(1);
    Vec_Push(spans, (Obj*)Span_new(8, 3, 1.0f));
    heat_map = HeatMap_new(spans, 133);
    DECREF(spans);
    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
                                          heat_map);
    TEST_TRUE(runner,
              Str_Equals_Utf8(raw_excerpt, ELLIPSIS " f/g" ELLIPSIS, 10),
              "Long word in middle");
    DECREF(heat_map);
    DECREF(raw_excerpt);

    DECREF(highlighter);
}