static void test_c32(TestBatch *batch) { uint64_t mins[] = { 0, 0x4000 - 100, (uint32_t)I32_MAX - 100, U32_MAX - 10 }; uint64_t limits[] = { 500, 0x4000 + 100, (uint32_t)I32_MAX + 100, U32_MAX }; uint32_t set_num; uint32_t num_sets = sizeof(mins) / sizeof(uint64_t); size_t count = 64; uint64_t *ints = NULL; size_t amount = count * C32_MAX_BYTES; char *encoded = (char*)CALLOCATE(amount, sizeof(char)); char *target = encoded; char *limit = target + amount; for (set_num = 0; set_num < num_sets; set_num++) { char *skip; ints = TestUtils_random_u64s(ints, count, mins[set_num], limits[set_num]); target = encoded; for (size_t i = 0; i < count; i++) { NumUtil_encode_c32((uint32_t)ints[i], &target); } target = encoded; skip = encoded; for (size_t i = 0; i < count; i++) { TEST_INT_EQ(batch, NumUtil_decode_c32(&target), (long)ints[i], "c32 %lu", (long)ints[i]); NumUtil_skip_cint(&skip); if (target > limit) { THROW(ERR, "overrun"); } } TEST_TRUE(batch, skip == target, "skip %lu == %lu", (unsigned long)skip, (unsigned long)target); target = encoded; for (size_t i = 0; i < count; i++) { NumUtil_encode_padded_c32((uint32_t)ints[i], &target); } TEST_TRUE(batch, target == limit, "padded c32 uses 5 bytes (%lu == %lu)", (unsigned long)target, (unsigned long)limit); target = encoded; skip = encoded; for (size_t i = 0; i < count; i++) { TEST_INT_EQ(batch, NumUtil_decode_c32(&target), (long)ints[i], "padded c32 %lu", (long)ints[i]); NumUtil_skip_cint(&skip); if (target > limit) { THROW(ERR, "overrun"); } } TEST_TRUE(batch, skip == target, "skip padded %lu == %lu", (unsigned long)skip, (unsigned long)target); } target = encoded; NumUtil_encode_c32(U32_MAX, &target); target = encoded; TEST_INT_EQ(batch, NumUtil_decode_c32(&target), U32_MAX, "c32 U32_MAX"); FREEMEM(encoded); FREEMEM(ints); }
void ScorePost_Add_Inversion_To_Pool_IMP(ScorePosting *self, PostingPool *post_pool, Inversion *inversion, FieldType *type, int32_t doc_id, float doc_boost, float length_norm) { ScorePostingIVARS *const ivars = ScorePost_IVARS(self); MemoryPool *mem_pool = PostPool_Get_Mem_Pool(post_pool); Similarity *sim = ivars->sim; float field_boost = doc_boost * FType_Get_Boost(type) * length_norm; const uint8_t field_boost_byte = Sim_Encode_Norm(sim, field_boost); const size_t base_size = Class_Get_Obj_Alloc_Size(RAWPOSTING); Token **tokens; uint32_t freq; Inversion_Reset(inversion); while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) { TokenIVARS *const token_ivars = Token_IVARS(*tokens); uint32_t raw_post_bytes = MAX_RAW_POSTING_LEN(base_size, token_ivars->len, freq); RawPosting *raw_posting = RawPost_new(MemPool_Grab(mem_pool, raw_post_bytes), doc_id, freq, token_ivars->text, token_ivars->len); RawPostingIVARS *const raw_post_ivars = RawPost_IVARS(raw_posting); char *const start = raw_post_ivars->blob + token_ivars->len; char *dest = start; uint32_t last_prox = 0; // Field_boost. *((uint8_t*)dest) = field_boost_byte; dest++; // Positions. for (uint32_t i = 0; i < freq; i++) { TokenIVARS *const t_ivars = Token_IVARS(tokens[i]); const uint32_t prox_delta = t_ivars->pos - last_prox; NumUtil_encode_c32(prox_delta, &dest); last_prox = t_ivars->pos; } // Resize raw posting memory allocation. raw_post_ivars->aux_len = dest - start; raw_post_bytes = dest - (char*)raw_posting; MemPool_Resize(mem_pool, raw_posting, raw_post_bytes); PostPool_Feed(post_pool, (Obj*)raw_posting); } }
void RichPost_add_inversion_to_pool(RichPosting *self, PostingPool *post_pool, Inversion *inversion, FieldType *type, int32_t doc_id, float doc_boost, float length_norm) { MemoryPool *mem_pool = PostPool_Get_Mem_Pool(post_pool); Similarity *sim = self->sim; float field_boost = doc_boost * FType_Get_Boost(type) * length_norm; Token **tokens; uint32_t freq; Inversion_Reset(inversion); while ( (tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL ) { Token *token = *tokens; uint32_t raw_post_bytes = MAX_RAW_POSTING_LEN(token->len, freq); RawPosting *raw_posting = RawPost_new( MemPool_Grab(mem_pool, raw_post_bytes), doc_id, freq, token->text, token->len ); char *const start = raw_posting->blob + token->len; char *dest = start; uint32_t last_prox = 0; uint32_t i; // Positions and boosts. for (i = 0; i < freq; i++) { Token *const t = tokens[i]; const uint32_t prox_delta = t->pos - last_prox; const float boost = field_boost * t->boost; NumUtil_encode_c32(prox_delta, &dest); last_prox = t->pos; *((uint8_t*)dest) = Sim_Encode_Norm(sim, boost); dest++; } // Resize raw posting memory allocation. raw_posting->aux_len = dest - start; raw_post_bytes = dest - (char*)raw_posting; MemPool_Resize(mem_pool, raw_posting, raw_post_bytes); PostPool_Feed(post_pool, &raw_posting); } }
ByteBuf* HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) { const char *last_text = ""; size_t last_len = 0; ByteBuf *tv_buf = BB_new(20 + Inversion_Get_Size(inversion) * 8); uint32_t num_postings = 0; Token **tokens; uint32_t freq; UNUSED_VAR(self); // Leave space for a c32 indicating the number of postings. BB_Set_Size(tv_buf, C32_MAX_BYTES); Inversion_Reset(inversion); while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) { Token *token = *tokens; char *const token_text = Token_Get_Text(token); const int32_t token_len = Token_Get_Len(token); int32_t overlap = StrHelp_overlap(last_text, token_text, last_len, token_len); char *ptr; char *orig; size_t old_size = BB_Get_Size(tv_buf); size_t new_size = old_size + C32_MAX_BYTES // overlap + C32_MAX_BYTES // length of string diff + (token_len - overlap) // diff char data + C32_MAX_BYTES // num prox + (C32_MAX_BYTES * freq * 3); // pos data // Allocate for worst-case scenario. ptr = BB_Grow(tv_buf, new_size); orig = ptr; ptr += old_size; // Track number of postings. num_postings += 1; // Append the string diff to the tv_buf. NumUtil_encode_c32(overlap, &ptr); NumUtil_encode_c32((token_len - overlap), &ptr); memcpy(ptr, (token_text + overlap), (token_len - overlap)); ptr += token_len - overlap; // Save text and text_len for comparison next loop. last_text = token_text; last_len = token_len; // Append the number of positions for this term. NumUtil_encode_c32(freq, &ptr); do { // Add position, start_offset, and end_offset to tv_buf. NumUtil_encode_c32(Token_Get_Pos(token), &ptr); NumUtil_encode_c32(Token_Get_Start_Offset(token), &ptr); NumUtil_encode_c32(Token_Get_End_Offset(token), &ptr); } while (--freq && (token = *++tokens)); // Set new byte length. BB_Set_Size(tv_buf, ptr - orig); } // Go back and start the term vector string with the posting count. char *dest = BB_Get_Buf(tv_buf); NumUtil_encode_padded_c32(num_postings, &dest); return tv_buf; }