SortCache* SortCache_init(SortCache *self, Schema *schema, Folder *folder, Segment *segment, i32_t field_num) { CharBuf *field = Seg_Field_Name(segment, field_num); CharBuf *seg_name = Seg_Get_Name(segment); CharBuf *ord_file = CB_newf("%o/sort-%i32.ord", seg_name, field_num); CharBuf *ix_file = CB_newf("%o/sort-%i32.ix", seg_name, field_num); CharBuf *dat_file = CB_newf("%o/sort-%i32.dat", seg_name, field_num); i64_t ord_len, ix_len, dat_len; /* Derive. */ self->doc_max = Seg_Get_Count(segment); self->type = Schema_Fetch_Type(schema, field); if (!self->type || !FType_Sortable(self->type)) { THROW("'%o' isn't a sortable field", field); } /* Open instreams. */ self->ord_in = Folder_Open_In(folder, ord_file); self->ix_in = Folder_Open_In(folder, ix_file); self->dat_in = Folder_Open_In(folder, dat_file); if (!self->ix_in || !self->dat_in || !self->ord_in) { CharBuf *mess = MAKE_MESS("Can't open either %o, %o or %o", ord_file, ix_file, dat_file); DECREF(ord_file); DECREF(ix_file); DECREF(dat_file); Err_throw_mess(mess); } ord_len = InStream_Length(self->ord_in); ix_len = InStream_Length(self->ix_in); dat_len = InStream_Length(self->dat_in); /* Calculate the number of unique values and derive the ord bit width. */ self->num_uniq = (i32_t)(ix_len / 8) - 1; self->width = S_calc_width(self->num_uniq); /* Validate file lengths. */ { double bytes_per_doc = self->width / 8.0; double max_ords = ord_len / bytes_per_doc; if (max_ords < self->doc_max + 1) { THROW("Conflict between ord count max %f64 and doc_max %i32", max_ords, self->doc_max); } } /* Mmap ords, offsets and character data. */ self->ords = InStream_Buf(self->ord_in, (size_t)ord_len); self->offsets = (i64_t*)InStream_Buf(self->ix_in, (size_t)ix_len); self->char_data = InStream_Buf(self->dat_in, dat_len); { char *offs = (char*)self->offsets; self->offsets_limit = (i64_t*)(offs + ix_len); self->char_data_limit = self->char_data + dat_len; } DECREF(ord_file); DECREF(ix_file); DECREF(dat_file); return self; }
static int32_t S_write_files(SortFieldWriter *self, OutStream *ord_out, OutStream *ix_out, OutStream *dat_out) { SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self); int8_t prim_id = ivars->prim_id; int32_t doc_max = (int32_t)Seg_Get_Count(ivars->segment); bool has_nulls = ivars->count == doc_max ? false : true; size_t size = (doc_max + 1) * sizeof(int32_t); int32_t *ords = (int32_t*)MALLOCATE(size); int32_t ord = 0; int64_t dat_start = OutStream_Tell(dat_out); // Assign -1 as a stand-in for the NULL ord. for (int32_t i = 0; i <= doc_max; i++) { ords[i] = -1; } // Grab the first item and record its ord. Add a dummy ord for invalid // doc id 0. SFWriterElem *elem = (SFWriterElem*)SortFieldWriter_Fetch(self); SFWriterElemIVARS *elem_ivars = SFWriterElem_IVARS(elem); if (elem_ivars->doc_id > doc_max) { THROW(ERR, "doc_id %i32 greater than doc_max %i32", elem_ivars->doc_id, doc_max); } ords[elem_ivars->doc_id] = ord; ords[0] = 0; // Build array of ords, write non-NULL sorted values. Obj *last_val = INCREF(elem_ivars->value); S_write_val(elem_ivars->value, prim_id, ix_out, dat_out, dat_start); DECREF(elem); while (NULL != (elem = (SFWriterElem*)SortFieldWriter_Fetch(self))) { elem_ivars = SFWriterElem_IVARS(elem); if (elem_ivars->value != last_val) { int32_t comparison = FType_Compare_Values(ivars->type, elem_ivars->value, last_val); if (comparison != 0) { ord++; S_write_val(elem_ivars->value, prim_id, ix_out, dat_out, dat_start); } DECREF(last_val); last_val = INCREF(elem_ivars->value); } if (elem_ivars->doc_id > doc_max) { THROW(ERR, "doc_id %i32 greater than doc_max %i32", elem_ivars->doc_id, doc_max); } ords[elem_ivars->doc_id] = ord; DECREF(elem); } DECREF(last_val); // If there are NULL values, write one now and record the NULL ord. if (has_nulls) { S_write_val(NULL, prim_id, ix_out, dat_out, dat_start); ord++; ivars->null_ord = ord; } int32_t null_ord = ivars->null_ord; // Write one extra file pointer so that we can always derive length. if (ivars->var_width) { OutStream_Write_I64(ix_out, OutStream_Tell(dat_out) - dat_start); } // Calculate cardinality and ord width. int32_t cardinality = ord + 1; ivars->ord_width = S_calc_width(cardinality); int32_t ord_width = ivars->ord_width; // Write ords. const double BITS_PER_BYTE = 8.0; double bytes_per_doc = ord_width / BITS_PER_BYTE; double byte_count = ceil((doc_max + 1) * bytes_per_doc); char *compressed_ords = (char*)CALLOCATE((size_t)byte_count, sizeof(char)); for (int32_t i = 0; i <= doc_max; i++) { int32_t real_ord = ords[i] == -1 ? null_ord : ords[i]; S_write_ord(compressed_ords, ord_width, i, real_ord); } OutStream_Write_Bytes(ord_out, compressed_ords, (size_t)byte_count); FREEMEM(compressed_ords); FREEMEM(ords); return cardinality; }