/* Exercise the compressed 32-bit integer ("c32") routines in NumUtil.
 *
 * For each of four value ranges, a batch of random integers is written with
 * NumUtil_encode_c32() and then again with NumUtil_encode_padded_c32().
 * Each encoding is read back with NumUtil_decode_c32() and compared to the
 * source values, while NumUtil_skip_cint() is stepped over the same bytes
 * and must finish at the same address as the decoder.  A final case
 * round-trips U32_MAX on its own.
 *
 * @param batch Test batch that receives the result of every assertion.
 */
static void
test_c32(TestBatch *batch) {
    // mins[i] and limits[i] are handed to TestUtils_random_u64s() together
    // (presumably as the lower/upper bounds of the generated values).
    uint64_t  mins[]   = { 0,   0x4000 - 100, (uint32_t)I32_MAX - 100, U32_MAX - 10 };
    uint64_t  limits[] = { 500, 0x4000 + 100, (uint32_t)I32_MAX + 100, U32_MAX      };
    uint32_t  set_num;
    uint32_t  num_sets = sizeof(mins) / sizeof(mins[0]);
    size_t    count    = 64;
    uint64_t *ints     = NULL;
    // Room for `count` values at the maximum encoded width.
    size_t    amount   = count * C32_MAX_BYTES;
    char     *encoded  = (char*)CALLOCATE(amount, sizeof(char));
    char     *target   = encoded;
    char     *limit    = target + amount;

    for (set_num = 0; set_num < num_sets; set_num++) {
        char *skip;

        // The previous array is passed back in on every iteration; it is
        // released once, after the loop.
        ints = TestUtils_random_u64s(ints, count,
                                     mins[set_num], limits[set_num]);

        // Variable-width encoding.
        target = encoded;
        for (size_t i = 0; i < count; i++) {
            NumUtil_encode_c32((uint32_t)ints[i], &target);
        }

        // Decode and skip in lockstep over the same bytes.
        target = encoded;
        skip   = encoded;
        for (size_t i = 0; i < count; i++) {
            // %lu requires an unsigned long argument.
            TEST_INT_EQ(batch, NumUtil_decode_c32(&target), (long)ints[i],
                        "c32 %lu", (unsigned long)ints[i]);
            NumUtil_skip_cint(&skip);
            if (target > limit) { THROW(ERR, "overrun"); }
        }
        TEST_TRUE(batch, skip == target, "skip %lu == %lu",
                  (unsigned long)skip, (unsigned long)target);

        // Padded encoding: the write cursor must land exactly on `limit`,
        // i.e. every value occupies C32_MAX_BYTES bytes.
        target = encoded;
        for (size_t i = 0; i < count; i++) {
            NumUtil_encode_padded_c32((uint32_t)ints[i], &target);
        }
        TEST_TRUE(batch, target == limit,
                  "padded c32 uses 5 bytes (%lu == %lu)",
                  (unsigned long)target, (unsigned long)limit);

        // Padded values are read back with the ordinary decoder and skipper.
        target = encoded;
        skip   = encoded;
        for (size_t i = 0; i < count; i++) {
            TEST_INT_EQ(batch, NumUtil_decode_c32(&target), (long)ints[i],
                        "padded c32 %lu", (unsigned long)ints[i]);
            NumUtil_skip_cint(&skip);
            if (target > limit) { THROW(ERR, "overrun"); }
        }
        TEST_TRUE(batch, skip == target, "skip padded %lu == %lu",
                  (unsigned long)skip, (unsigned long)target);
    }

    // Largest representable value, encoded and decoded on its own.
    target = encoded;
    NumUtil_encode_c32(U32_MAX, &target);
    target = encoded;
    TEST_INT_EQ(batch, NumUtil_decode_c32(&target), U32_MAX, "c32 U32_MAX");

    FREEMEM(encoded);
    FREEMEM(ints);
}
/* Serialize an Inversion into a term-vector ByteBuf.
 *
 * Layout produced by this function:
 *   - C32_MAX_BYTES bytes holding the number of postings as a padded c32
 *     (reserved first, filled in at the very end);
 *   - then, for each token cluster returned by Inversion_Next_Cluster():
 *       c32  count of leading bytes shared with the previous cluster's text
 *       c32  length of the remaining text
 *       raw  the remaining text bytes
 *       c32  number of tokens in the cluster
 *       c32 x 3 per token: position, start offset, end offset
 *
 * @param self      Unused.
 * @param inversion Source of token clusters; it is reset before iteration.
 * @return the ByteBuf created here via BB_new().
 */
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *prev_text    = "";
    size_t      prev_len     = 0;
    ByteBuf    *tv_buf       = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    num_postings = 0;
    Token     **cluster;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Reserve the header slot for the posting count.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ((cluster = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token        *tok        = *cluster;
        char *const   token_text = Token_Get_Text(tok);
        const int32_t token_len  = Token_Get_Len(tok);
        int32_t       overlap    = StrHelp_overlap(prev_text, token_text,
                                                   prev_len, token_len);
        const int32_t diff_len   = token_len - overlap;
        size_t        used       = BB_Get_Size(tv_buf);

        // Worst-case space needed once this cluster has been appended.
        size_t needed = used
                        + C32_MAX_BYTES                 // overlap
                        + C32_MAX_BYTES                 // length of string diff
                        + diff_len                      // diff char data
                        + C32_MAX_BYTES                 // num prox
                        + (C32_MAX_BYTES * freq * 3);   // pos data

        // Grow first, then derive the write cursor from the pointer that
        // BB_Grow() hands back.
        char *base   = BB_Grow(tv_buf, needed);
        char *cursor = base + used;

        num_postings += 1;

        // String diff against the previous cluster's text.
        NumUtil_encode_c32(overlap, &cursor);
        NumUtil_encode_c32(diff_len, &cursor);
        memcpy(cursor, (token_text + overlap), diff_len);
        cursor += diff_len;

        // Remember this text for the next cluster's diff.
        prev_text = token_text;
        prev_len  = token_len;

        // Token count, followed by one (pos, start, end) triple per token.
        // `freq` doubles as the countdown; the walk also stops early if a
        // NULL entry is found in the cluster array.
        NumUtil_encode_c32(freq, &cursor);
        for (;;) {
            NumUtil_encode_c32(Token_Get_Pos(tok), &cursor);
            NumUtil_encode_c32(Token_Get_Start_Offset(tok), &cursor);
            NumUtil_encode_c32(Token_Get_End_Offset(tok), &cursor);

            if (--freq == 0) {
                break;
            }
            tok = *++cluster;
            if (tok == NULL) {
                break;
            }
        }

        // Trim the logical size down to the bytes actually written.
        BB_Set_Size(tv_buf, cursor - base);
    }

    // Fill in the reserved header with the final posting count.
    char *header = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(num_postings, &header);

    return tv_buf;
}