/**
 * Feed `text` through the analyzers held in self->analyzers, in order, and
 * return the Inversion produced by the last one.
 *
 * When the analyzer list is empty, the whole text is wrapped in one Token
 * and an Inversion is built from that single Token.
 */
Inversion*
PolyAnalyzer_transform_text(PolyAnalyzer *self, CharBuf *text) {
    VArray *const chain     = self->analyzers;
    const uint32_t chain_len = VA_Get_Size(chain);

    // No analyzers: the entire text becomes a single token.
    if (chain_len == 0) {
        size_t text_len = CB_Get_Size(text);
        char  *text_ptr = (char*)CB_Get_Ptr8(text);
        Token *whole    = Token_new(text_ptr, text_len, 0, text_len, 1.0f, 1);
        Inversion *single = Inversion_new(whole);
        DECREF(whole);
        return single;
    }

    // The first analyzer consumes raw text; every later one consumes the
    // Inversion produced by its predecessor.
    Analyzer  *head    = (Analyzer*)VA_Fetch(chain, 0);
    Inversion *current = Analyzer_Transform_Text(head, text);
    uint32_t   tick    = 1;
    while (tick < chain_len) {
        Analyzer  *stage = (Analyzer*)VA_Fetch(chain, tick);
        Inversion *next  = Analyzer_Transform(stage, current);
        // Drop our reference to the intermediate result.
        DECREF(current);
        current = next;
        tick++;
    }
    return current;
}
/**
 * Consolidate the folder found at `path` via a CompoundFileWriter.
 *
 * Throws when no folder exists at `path`, or when the folder found is
 * already a COMPOUNDFILEREADER.  For a non-empty path, a CompoundFileReader
 * opened on the folder is then stored in the enclosing folder's entries
 * under the local part of `path`.
 */
void
Folder_consolidate(Folder *self, const CharBuf *path) {
    Folder *target = Folder_Find_Folder(self, path);
    Folder *parent = Folder_Enclosing_Folder(self, path);

    if (!target) {
        THROW(ERR, "Can't consolidate %o", path);
    }
    else if (Folder_Is_A(target, COMPOUNDFILEREADER)) {
        THROW(ERR, "Can't consolidate %o twice", path);
    }
    else {
        CompoundFileWriter *writer = CFWriter_new(target);
        CFWriter_Consolidate(writer);
        DECREF(writer);

        // With an empty path there is no entry to replace.
        if (CB_Get_Size(path) == 0) {
            return;
        }

        ZombieCharBuf *local_name = IxFileNames_local_part(path, ZCB_BLANK());
        CompoundFileReader *reader = CFReader_open(target);
        if (reader == NULL) {
            RETHROW(INCREF(Err_get_error()));
        }
        Hash_Store(parent->entries, (Obj*)local_name, (Obj*)reader);
    }
}
/**
 * Open an InStream for `name`.
 *
 * If self->records has no entry for `name`, the request is delegated to
 * self->real_folder.  Otherwise the entry's "offset" and "length" values
 * are used to reopen self->instream.
 *
 * Returns NULL on failure; a malformed record sets the global error.
 */
InStream*
CFReader_local_open_in(CompoundFileReader *self, const CharBuf *name) {
    Hash *record = (Hash*)Hash_Fetch(self->records, (Obj*)name);

    // Not listed in our records: defer to the real folder.
    if (record == NULL) {
        InStream *direct = Folder_Local_Open_In(self->real_folder, name);
        if (direct == NULL) {
            ERR_ADD_FRAME(Err_get_error());
        }
        return direct;
    }

    Obj *len    = Hash_Fetch_Str(record, "length", 6);
    Obj *offset = Hash_Fetch_Str(record, "offset", 6);
    if (len == NULL || offset == NULL) {
        Err_set_error(Err_new(CB_newf("Malformed entry for '%o' in '%o'",
                                      name,
                                      Folder_Get_Path(self->real_folder))));
        return NULL;
    }

    // Without a path of our own, the bare name is handed to Reopen.
    if (CB_Get_Size(self->path) == 0) {
        return InStream_Reopen(self->instream, name, Obj_To_I64(offset),
                               Obj_To_I64(len));
    }

    // Otherwise prefix the name with our own path.
    CharBuf  *fullpath = CB_newf("%o/%o", self->path, name);
    InStream *reopened = InStream_Reopen(self->instream, fullpath,
                                         Obj_To_I64(offset), Obj_To_I64(len));
    DECREF(fullpath);
    return reopened;
}
/**
 * Tokenize the bytes of `text` into a freshly created Inversion and
 * return that Inversion.
 */
Inversion*
StandardTokenizer_transform_text(StandardTokenizer *self, CharBuf *text) {
    Inversion *tokens = Inversion_new(NULL);
    StandardTokenizer_Tokenize_Str(self,
                                   (char*)CB_Get_Ptr8(text),
                                   CB_Get_Size(text),
                                   tokens);
    return tokens;
}
/**
 * Report whether the current directory entry is a directory.
 *
 * Returns false when there is no current entry or when stat() fails.
 */
bool
FSDH_entry_is_dir(FSDirHandle *self) {
    FSDirHandleIVARS *const ivars = FSDH_IVARS(self);
    struct dirent *sys_dir_entry = (struct dirent*)ivars->sys_dir_entry;
    if (!sys_dir_entry) { return false; }

    // If d_type is available, try to avoid a stat() call.  If it's not, or if
    // the type comes back as unknown, fall back to stat().
#ifdef CHY_HAS_DIRENT_D_TYPE
    if (sys_dir_entry->d_type == DT_DIR) {
        return true;
    }
    else if (sys_dir_entry->d_type != DT_UNKNOWN) {
        return false;
    }
#endif

    struct stat stat_buf;
    if (!ivars->fullpath) {
        ivars->fullpath = CB_new(CB_Get_Size(ivars->dir) + 20);
    }
    CB_setf(ivars->fullpath, "%o%s%o", ivars->dir, CHY_DIR_SEP,
            ivars->entry);
    if (stat((char*)CB_Get_Ptr8(ivars->fullpath), &stat_buf) == -1) {
        return false;
    }

    // The file type is a multi-bit field inside st_mode, so it has to be
    // compared as a whole.  A plain `st_mode & S_IFDIR` is also non-zero for
    // block devices and sockets, whose type codes share that bit.
    return S_ISDIR(stat_buf.st_mode) ? true : false;
}
/**
 * Read one posting from `instream` into a RawPosting carved out of
 * `mem_pool`.
 *
 * The doc code's low bit set means a frequency of 1; otherwise the
 * frequency follows as another C32.  The remaining bits are added to
 * `last_doc_id` to form the doc id.  For each of `freq` positions, a raw
 * C64 and one byte are copied into the blob after the term text.  The
 * allocation is then resized to the bytes actually used.
 */
RawPosting*
RichPost_read_raw(RichPosting *self, InStream *instream, int32_t last_doc_id,
                  CharBuf *term_text, MemoryPool *mem_pool) {
    char *const    text_buf  = (char*)CB_Get_Ptr8(term_text);
    const size_t   text_size = CB_Get_Size(term_text);
    const uint32_t doc_code  = InStream_Read_C32(instream);
    const uint32_t delta_doc = doc_code >> 1;
    const int32_t  doc_id    = last_doc_id + delta_doc;
    const uint32_t freq
        = (doc_code & 1) ? 1 : InStream_Read_C32(instream);

    // Grab the maximum space this posting could need.
    size_t alloc_bytes = MAX_RAW_POSTING_LEN(text_size, freq);
    void *const allocation = MemPool_Grab(mem_pool, alloc_bytes);
    RawPosting *const raw_posting
        = RawPost_new(allocation, doc_id, freq, text_buf, text_size);
    char *const aux_start = raw_posting->blob + text_size;
    char       *cursor    = aux_start;
    UNUSED_VAR(self);

    // Read positions and per-position boosts.
    for (uint32_t remaining = freq; remaining > 0; remaining--) {
        cursor += InStream_Read_Raw_C64(instream, cursor);
        *((uint8_t*)cursor) = InStream_Read_U8(instream);
        cursor++;
    }

    // Resize raw posting memory allocation.
    raw_posting->aux_len = cursor - aux_start;
    alloc_bytes = cursor - (char*)raw_posting;
    MemPool_Resize(mem_pool, raw_posting, alloc_bytes);

    return raw_posting;
}
/**
 * Parse the bytes of `json` with S_parse_json and return the result.
 * A NULL result gets a frame added to the current error before it is
 * passed back to the caller.
 */
Obj*
Json_from_json(CharBuf *json) {
    Obj *parsed = S_parse_json((char*)CB_Get_Ptr8(json), CB_Get_Size(json));
    if (parsed == NULL) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return parsed;
}
/**
 * Initialize a RAMFolder: run the Folder initializer, create the `elems`
 * hash, and, when the folder's path is non-empty, call S_read_fsfolder.
 */
RAMFolder*
RAMFolder_init(RAMFolder *self, const CharBuf *path) {
    Folder_init((Folder*)self, path);
    self->elems = Hash_new(16);

    // An empty path means there is nothing to read in.
    if (CB_Get_Size(self->path) == 0) {
        return self;
    }

    S_read_fsfolder(self);
    return self;
}
/**
 * Build a new CharBuf for `path` relative to this folder: "<self->path>/<path>"
 * when the folder has a non-empty path, otherwise a clone of `path`.
 */
static CharBuf*
S_fullpath(RAMFolder *self, const CharBuf *path) {
    if (CB_Get_Size(self->path) == 0) {
        return CB_Clone(path);
    }
    return CB_newf("%o/%o", self->path, path);
}
/**
 * Delete the lock file at `path` if it belongs to this lock and its owner
 * qualifies.
 *
 * The path must start with "locks" and, after that prefix plus one more
 * character, with this lock's name.  The file's "host" and "name" must
 * match ours and a "pid" must be present.  The file is deleted when
 * `delete_mine` is set and the pid is this process, or when `delete_other`
 * is set and the pid is not active.
 *
 * Returns true only if a file was deleted; throws if deletion was
 * attempted and failed.
 */
bool_t
LFLock_maybe_delete_file(LockFileLock *self, const CharBuf *path,
                         bool_t delete_mine, bool_t delete_other) {
    Folder *folder = self->folder;
    bool_t deleted = false;
    ZombieCharBuf *remainder = ZCB_WRAP(path);

    // Only delete locks that start with our lock name.
    CharBuf *lock_dir_name = (CharBuf*)ZCB_WRAP_STR("locks", 5);
    if (!ZCB_Starts_With(remainder, lock_dir_name)) {
        return false;
    }
    ZCB_Nip(remainder, CB_Get_Size(lock_dir_name) + 1);
    if (!ZCB_Starts_With(remainder, self->name)) {
        return false;
    }

    // Nothing to delete if the file is already gone.
    if (!Folder_Exists(folder, path)) {
        return deleted;
    }

    // Attempt to delete dead lock file.
    Hash *contents = (Hash*)Json_slurp_json(folder, path);
    if (contents != NULL && Obj_Is_A((Obj*)contents, HASH)) {
        CharBuf *pid_buf = (CharBuf*)Hash_Fetch_Str(contents, "pid", 3);
        CharBuf *host    = (CharBuf*)Hash_Fetch_Str(contents, "host", 4);
        CharBuf *name    = (CharBuf*)Hash_Fetch_Str(contents, "name", 4);

        // Match hostname and lock name.
        if (host != NULL
            && CB_Equals(host, (Obj*)self->host)
            && name != NULL
            && CB_Equals(name, (Obj*)self->name)
            && pid_buf != NULL
           ) {
            // Verify that pid is either mine or dead.
            int pid = (int)CB_To_I64(pid_buf);
            if ((delete_mine && pid == PID_getpid())  // This process.
                || (delete_other && !PID_active(pid)) // Dead pid.
               ) {
                if (!Folder_Delete(folder, path)) {
                    CharBuf *mess = MAKE_MESS("Can't delete '%o'", path);
                    DECREF(contents);
                    Err_throw_mess(ERR, mess);
                }
                else {
                    deleted = true;
                }
            }
        }
    }
    DECREF(contents);

    return deleted;
}
/**
 * Write `value` to `outstream` as a delta against the previously held
 * term text: a C32 count of shared leading bytes, followed by the bytes
 * that differ.  The held text is then replaced with the new text and the
 * cached string is dropped.
 *
 * Throws if `value` is neither a String nor a CharBuf.
 */
void
TextTermStepper_Write_Delta_IMP(TextTermStepper *self, OutStream *outstream,
                                Obj *value) {
    TextTermStepperIVARS *const ivars = TextTermStepper_IVARS(self);
    CharBuf    *held     = (CharBuf*)ivars->value;
    const char *prev_ptr = CB_Get_Ptr8(held);
    size_t      prev_len = CB_Get_Size(held);
    const char *next_ptr = NULL;
    size_t      next_len = 0;

    // Extract raw bytes from whichever supported type we were handed.
    if (Obj_is_a(value, STRING)) {
        String *as_string = (String*)value;
        next_ptr = Str_Get_Ptr8(as_string);
        next_len = Str_Get_Size(as_string);
    }
    else if (Obj_is_a(value, CHARBUF)) {
        CharBuf *as_charbuf = (CharBuf*)value;
        next_ptr = CB_Get_Ptr8(as_charbuf);
        next_len = CB_Get_Size(as_charbuf);
    }
    else {
        THROW(ERR, "'value' must be a String or CharBuf");
    }

    // Count how many bytes the strings share at the top.
    const int32_t shared
        = StrHelp_overlap(prev_ptr, next_ptr, prev_len, next_len);
    const char *const tail     = next_ptr + shared;
    const size_t      tail_len = next_len - shared;

    // Write number of common bytes, then the bytes that differ.
    OutStream_Write_C32(outstream, shared);
    OutStream_Write_String(outstream, tail, tail_len);

    // Update value.
    CB_Mimic_Utf8(held, next_ptr, next_len);

    // Invalidate string.
    DECREF(ivars->string);
    ivars->string = NULL;
}
/**
 * Delete the entry at `path`; if it is a directory, delete its contents
 * first.
 *
 * Subdirectories that are not symlinks are removed by recursive calls,
 * then every listed entry is removed with a local delete, and finally the
 * entry itself is deleted from its enclosing folder.
 *
 * Returns false for a NULL or empty path, when no enclosing folder can be
 * found, or when the final local delete fails.
 */
bool_t
Folder_delete_tree(Folder *self, const CharBuf *path) {
    // Don't allow Folder to delete itself.  This check runs before `path`
    // is handed to any other routine, so a NULL path is never passed on.
    if (!path || !CB_Get_Size(path)) { return false; }

    Folder *enclosing_folder = Folder_Enclosing_Folder(self, path);
    if (!enclosing_folder) {
        // Return failure if the entry wasn't there in the first place.
        return false;
    }

    ZombieCharBuf *local = IxFileNames_local_part(path, ZCB_BLANK());
    if (Folder_Local_Is_Directory(enclosing_folder, (CharBuf*)local)) {
        Folder *inner_folder
            = Folder_Local_Find_Folder(enclosing_folder, (CharBuf*)local);
        DirHandle *dh = Folder_Local_Open_Dir(inner_folder);
        if (dh) {
            VArray *files = VA_new(20);
            VArray *dirs  = VA_new(20);
            CharBuf *entry = DH_Get_Entry(dh);

            // Snapshot the listing before deleting anything.  `files`
            // receives every entry; `dirs` additionally receives the
            // entries that are directories but not symlinks.
            while (DH_Next(dh)) {
                VA_Push(files, (Obj*)CB_Clone(entry));
                if (DH_Entry_Is_Dir(dh) && !DH_Entry_Is_Symlink(dh)) {
                    VA_Push(dirs, (Obj*)CB_Clone(entry));
                }
            }

            // Recurse into subdirectories.  The names must come from
            // `dirs`: indexing `files` with a `dirs`-bounded counter would
            // recurse into whichever entries were listed first.
            for (uint32_t i = 0, max = VA_Get_Size(dirs); i < max; i++) {
                CharBuf *name = (CharBuf*)VA_Fetch(dirs, i);
                bool_t success = Folder_Delete_Tree(inner_folder, name);
                if (!success && Folder_Local_Exists(inner_folder, name)) {
                    break;
                }
            }

            // Remove every listed entry from the inner folder.
            for (uint32_t i = 0, max = VA_Get_Size(files); i < max; i++) {
                CharBuf *name = (CharBuf*)VA_Fetch(files, i);
                bool_t success = Folder_Local_Delete(inner_folder, name);
                if (!success && Folder_Local_Exists(inner_folder, name)) {
                    break;
                }
            }

            DECREF(dirs);
            DECREF(files);
            DECREF(dh);
        }
    }

    return Folder_Local_Delete(enclosing_folder, (CharBuf*)local);
}
/**
 * Copy `value` into the held term text, then write it to `outstream` as a
 * C32 byte count followed by the bytes themselves.  The cached string is
 * dropped afterwards.
 */
void
TextTermStepper_Write_Key_Frame_IMP(TextTermStepper *self,
                                    OutStream *outstream, Obj *value) {
    TextTermStepperIVARS *const ivars = TextTermStepper_IVARS(self);
    CharBuf *held = (CharBuf*)ivars->value;

    // Take on the new value first; the bytes written come from our copy.
    CB_Mimic(held, value);
    const char *bytes     = CB_Get_Ptr8(held);
    size_t      num_bytes = CB_Get_Size(held);

    OutStream_Write_C32(outstream, num_bytes);
    OutStream_Write_Bytes(outstream, bytes, num_bytes);

    // Invalidate string.
    DECREF(ivars->string);
    ivars->string = NULL;
}
/*
 * Write one term record, delta-encoded against the previous term, and
 * remember this term and its TermInfo for the next call.
 *
 * When not in temp mode, every `index_interval`-th call first passes the
 * previous term to S_add_last_term_to_ix.
 */
void
LexWriter_add_term(LexiconWriter* self, CharBuf* term_text, TermInfo* tinfo) {
    char   *term_ptr = (char*)CB_Get_Ptr8(term_text);
    char   *prev_ptr = (char*)CB_Get_Ptr8(self->last_text);
    size_t  term_len = CB_Get_Size(term_text);
    size_t  prev_len = CB_Get_Size(self->last_text);
    const int at_interval = (self->count % self->index_interval == 0);

    /* Write a subset of entries to lexicon.ix. */
    if (at_interval && !self->temp_mode) {
        S_add_last_term_to_ix(self, prev_ptr, prev_len);
    }

    /* Write the record; track number of terms. */
    LexStepper_Write_Record(self->stepper, self->dat_out,
                            term_ptr, term_len, prev_ptr, prev_len,
                            tinfo, self->last_tinfo);
    self->count++;

    /* Remember for delta encoding. */
    CB_Copy(self->last_text, term_text);
    TInfo_copy(self->last_tinfo, tinfo);
}
/**
 * Append "<prefix><entry>" to `list` for every entry of directory `dir`
 * that S_is_updir does not reject, recursing into entries that are
 * directories but not symlinks.
 *
 * `prefix` is extended in place for each recursive call and cut back to
 * its original size afterwards.  Rethrows the current error if the
 * directory cannot be opened or closed.
 */
static void
S_add_to_file_list(Folder *self, VArray *list, CharBuf *dir,
                   CharBuf *prefix) {
    const size_t base_prefix_len = CB_Get_Size(prefix);
    DirHandle *handle = Folder_Open_Dir(self, dir);
    if (!handle) {
        RETHROW(INCREF(Err_get_error()));
    }

    // The handle's entry object is fetched once; DH_Next updates it.
    CharBuf *entry = DH_Get_Entry(handle);
    while (DH_Next(handle)) {
        if (S_is_updir(entry)) {
            continue;
        }

        CharBuf *relpath = CB_newf("%o%o", prefix, entry);
        if (VA_Get_Size(list) == VA_Get_Capacity(list)) {
            VA_Grow(list, VA_Get_Size(list) * 2);
        }
        VA_Push(list, (Obj*)relpath);

        if (DH_Entry_Is_Dir(handle) && !DH_Entry_Is_Symlink(handle)) {
            CharBuf *subdir = CB_Get_Size(dir)
                              ? CB_newf("%o/%o", dir, entry)
                              : CB_Clone(entry);
            CB_catf(prefix, "%o/", entry);
            S_add_to_file_list(self, list, subdir, prefix); // recurse
            // Undo the extension made for the recursive call.
            CB_Set_Size(prefix, base_prefix_len);
            DECREF(subdir);
        }
    }

    if (!DH_Close(handle)) {
        RETHROW(INCREF(Err_get_error()));
    }
    DECREF(handle);
}
/**
 * Write `value`, which must certify as a CHARBUF, to `outstream` as a
 * delta against the stepper's current value: a C32 count of shared leading
 * bytes followed by the differing bytes.  The stepper's value is then
 * replaced with `value`.
 */
void
TextTermStepper_write_delta(TextTermStepper *self, OutStream *outstream,
                            Obj *value) {
    CharBuf *incoming = (CharBuf*)CERTIFY(value, CHARBUF);
    CharBuf *previous = (CharBuf*)self->value;
    char    *in_ptr   = (char*)CB_Get_Ptr8(incoming);
    size_t   in_len   = CB_Get_Size(incoming);
    char    *prev_ptr = (char*)CB_Get_Ptr8(previous);
    size_t   prev_len = CB_Get_Size(previous);

    // Count how many bytes the strings share at the top.
    const int32_t shared
        = StrHelp_overlap(prev_ptr, in_ptr, prev_len, in_len);
    const char *const tail     = in_ptr + shared;
    const size_t      tail_len = in_len - shared;

    // Write number of common bytes, then the bytes that differ.
    OutStream_Write_C32(outstream, shared);
    OutStream_Write_String(outstream, tail, tail_len);

    // Update value.
    CB_Mimic((CharBuf*)self->value, value);
}
/**
 * Return a new VArray listing everything beneath `path`, gathered
 * recursively by S_add_to_file_list.  Listed names are prefixed with
 * "<path>/" when `path` is non-empty.  The array is empty when no folder
 * is found at `path`.
 */
VArray*
Folder_list_r(Folder *self, const CharBuf *path) {
    Folder *start   = Folder_Find_Folder(self, path);
    VArray *listing = VA_new(0);
    if (!start) {
        return listing;
    }

    CharBuf *dir    = CB_new(20);
    CharBuf *prefix = CB_new(20);
    if (path && CB_Get_Size(path)) {
        CB_setf(prefix, "%o/", path);
    }
    S_add_to_file_list(start, listing, dir, prefix);
    DECREF(prefix);
    DECREF(dir);

    return listing;
}
/*
 * Read one posting from `instream` into a RawPosting carved out of
 * `mem_pool`.
 *
 * The doc code's low bit set means a frequency of 1; otherwise the
 * frequency follows as another C32.  The remaining bits are added to
 * `last_doc_id` to form the doc id.
 */
RawPosting*
MatchPost_read_raw(MatchPosting *self, InStream *instream,
                   i32_t last_doc_id, CharBuf *term_text,
                   MemoryPool *mem_pool) {
    const size_t text_size = CB_Get_Size(term_text);
    const u32_t  doc_code  = InStream_Read_C32(instream);
    const u32_t  delta_doc = doc_code >> 1;
    const i32_t  doc_id    = last_doc_id + delta_doc;
    u32_t        freq      = 1;
    size_t       raw_post_bytes;
    void        *allocation;
    UNUSED_VAR(self);

    /* A clear low bit means an explicit frequency follows. */
    if (!(doc_code & 1)) {
        freq = InStream_Read_C32(instream);
    }

    raw_post_bytes = MAX_RAW_POSTING_LEN(text_size);
    allocation     = MemPool_Grab(mem_pool, raw_post_bytes);

    return RawPost_new(allocation, doc_id, freq, term_text->ptr, text_size);
}
/**
 * Look up the folder at `path`.
 *
 * A NULL or empty path yields `self`.  Otherwise the enclosing folder is
 * located first and then asked for the folder itself.  Returns NULL when
 * no enclosing folder is found.
 */
Folder*
Folder_find_folder(Folder *self, const CharBuf *path) {
    if (!path || !CB_Get_Size(path)) {
        return self;
    }

    // NOTE(review): the same wrapper is passed on as the local name, so
    // S_enclosing_folder presumably trims it down to the last path
    // component -- confirm against that helper.
    ZombieCharBuf *remainder = ZCB_WRAP(path);
    Folder *parent = S_enclosing_folder(self, remainder);
    if (parent) {
        return Folder_Local_Find_Folder(parent, (CharBuf*)remainder);
    }
    return NULL;
}
/**
 * Open a Windows directory listing for `dir`.
 *
 * On failure (path too long, or FindFirstFile fails) the global error is
 * set, `self` is released and NULL is returned.
 */
FSDirHandle*
FSDH_do_open(FSDirHandle *self, const CharBuf *dir) {
    size_t dir_len  = CB_Get_Size(dir);
    char  *dir_text = (char*)CB_Get_Ptr8(dir);
    char   search_string[MAX_PATH + 1];

    DH_init((DirHandle*)self, dir);
    FSDirHandleIVARS *const ivars = FSDH_IVARS(self);
    ivars->sys_dir_entry = MALLOCATE(sizeof(WIN32_FIND_DATA));
    ivars->sys_dirhandle = INVALID_HANDLE_VALUE;
    ivars->saved_error   = NULL;

    // Deal with Windows ceiling on file path lengths.
    if (dir_len >= MAX_PATH - 2) {
        Err_set_error(Err_new(CB_newf("Directory path is too long: %o",
                                      dir)));
        CFISH_DECREF(self);
        return NULL;
    }

    // Append trailing wildcard so Windows lists dir contents rather than just
    // the dir name itself.
    memcpy(search_string, dir_text, dir_len);
    memcpy(search_string + dir_len, "\\*\0", 3);

    ivars->sys_dirhandle
        = FindFirstFile(search_string,
                        (WIN32_FIND_DATA*)ivars->sys_dir_entry);
    if (ivars->sys_dirhandle == INVALID_HANDLE_VALUE) {
        // Directory inaccessible or doesn't exist.
        Err_set_error(Err_new(CB_newf("Failed to open dir '%o'", dir)));
        CFISH_DECREF(self);
        return NULL;
    }

    // Compensate for the fact that FindFirstFile has already returned the
    // first entry but DirHandle's API requires that you call Next() to
    // start the iterator.
    ivars->delayed_iter = true;

    return self;
}
/**
 * Encode `dump` as JSON and write it to `path` inside `folder`.
 *
 * Returns false, with a frame added to the current error, if encoding
 * fails or the output stream cannot be opened; true otherwise.
 */
bool_t
Json_spew_json(Obj *dump, Folder *folder, const CharBuf *path) {
    CharBuf *encoded = Json_to_json(dump);
    if (encoded == NULL) {
        ERR_ADD_FRAME(Err_get_error());
        return false;
    }

    OutStream *sink = Folder_Open_Out(folder, path);
    if (sink == NULL) {
        ERR_ADD_FRAME(Err_get_error());
        DECREF(encoded);
        return false;
    }

    size_t num_bytes = CB_Get_Size(encoded);
    OutStream_Write_Bytes(sink, CB_Get_Ptr8(encoded), num_bytes);
    OutStream_Close(sink);

    DECREF(sink);
    DECREF(encoded);
    return true;
}
/**
 * Create the directory named by the last component of `path` inside its
 * enclosing folder.
 *
 * Returns false and sets the global error when `path` is empty, when no
 * enclosing folder is found, or when the local mkdir fails.
 */
bool
Folder_mkdir(Folder *self, const CharBuf *path) {
    Folder *parent = Folder_Enclosing_Folder(self, path);

    if (!CB_Get_Size(path)) {
        Err_set_error(Err_new(CB_newf("Invalid path: '%o'", path)));
        return false;
    }
    if (!parent) {
        Err_set_error(Err_new(CB_newf("Can't recursively create dir %o",
                                      path)));
        return false;
    }

    ZombieCharBuf *local_name = IxFileNames_local_part(path, ZCB_BLANK());
    bool created = Folder_Local_MkDir(parent, (CharBuf*)local_name);
    if (!created) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return created;
}
/**
 * Report whether the current directory entry is a symbolic link.
 *
 * Returns false when there is no current entry or when lstat() fails.
 */
bool
FSDH_entry_is_symlink(FSDirHandle *self) {
    FSDirHandleIVARS *const ivars = FSDH_IVARS(self);
    struct dirent *sys_dir_entry = (struct dirent*)ivars->sys_dir_entry;
    if (!sys_dir_entry) { return false; }

    // If d_type is available, try to avoid a system call.  If it's not, or
    // if the type comes back as unknown, fall back to lstat().
#ifdef CHY_HAS_DIRENT_D_TYPE
    if (sys_dir_entry->d_type == DT_LNK) {
        return true;
    }
    else if (sys_dir_entry->d_type != DT_UNKNOWN) {
        return false;
    }
#endif // CHY_HAS_DIRENT_D_TYPE

    struct stat stat_buf;
    if (!ivars->fullpath) {
        ivars->fullpath = CB_new(CB_Get_Size(ivars->dir) + 20);
    }
    CB_setf(ivars->fullpath, "%o%s%o", ivars->dir, CHY_DIR_SEP,
            ivars->entry);

    // lstat() is required here: stat() follows the link and describes its
    // target, so it can never report a symlink.
    if (lstat((char*)CB_Get_Ptr8(ivars->fullpath), &stat_buf) == -1) {
        return false;
    }

    // The file type is a multi-bit field inside st_mode, so it has to be
    // compared as a whole.  A plain `st_mode & S_IFLNK` is also non-zero for
    // regular files, whose type code shares a bit with the symlink code.
    return S_ISLNK(stat_buf.st_mode) ? true : false;
}
/*
 * Indent the contents of `charbuf` in place: `amount` spaces are inserted
 * at the very start and after every newline.
 *
 * The buffer is grown to its final size and the text is then copied
 * backwards from the end, so no byte is overwritten before it is moved.
 * Positions are tracked as unsigned counts of bytes still to process,
 * which avoids forming a pointer before the start of the buffer and avoids
 * narrowing `amount` to int.
 */
void
StrHelp_add_indent(CharBuf *charbuf, size_t amount) {
    size_t      num_margins = 1;
    size_t      new_size;
    size_t      src_left;   /* source bytes not yet copied */
    size_t      dst_left;   /* destination slots not yet filled */
    size_t      pad;
    char       *base;
    const char *scan  = charbuf->ptr;
    const char *limit = CBEND(charbuf);

    /* Add a margin for every newline. */
    for ( ; scan < limit; scan++) {
        if (*scan == '\n') { num_margins++; }
    }

    /* Make space for margins. */
    new_size = CB_Get_Size(charbuf) + (num_margins * amount);
    CB_Grow(charbuf, new_size);

    /* Measure the old and new extents after growing, using the buffer's
     * current pointer. */
    src_left = (size_t)(CBEND(charbuf) - charbuf->ptr);
    CB_Set_Size(charbuf, new_size);
    base     = charbuf->ptr;
    dst_left = (size_t)(CBEND(charbuf) - base);

    /* Terminate, then copy from the back, adding a margin after each
     * newline. */
    base[dst_left] = '\0';
    while (src_left > 0) {
        const char c = base[src_left - 1];
        if (c == '\n') {
            for (pad = amount; pad > 0; pad--) {
                base[--dst_left] = ' ';
            }
        }
        base[--dst_left] = c;
        src_left--;
    }

    /* Whatever remains at the front is the leading margin. */
    while (dst_left > 0) {
        base[--dst_left] = ' ';
    }
}
/**
 * Write the stored fields of the document held by `inverter` to the doc
 * data stream, then write the record's starting offset to the index
 * stream.
 *
 * The record is a C32 count of stored fields followed, per field, by the
 * serialized field name and a value encoded according to the field's
 * primitive type.
 *
 * Throws if `doc_id` differs from the index stream position divided by 8,
 * or if a stored field has an unrecognized primitive type.
 */
void
DocWriter_add_inverted_doc(DocWriter *self, Inverter *inverter,
                           int32_t doc_id) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    OutStream *dat_out      = S_lazy_init(self);
    OutStream *ix_out       = ivars->ix_out;
    uint32_t   stored_count = 0;
    int64_t    record_start = OutStream_Tell(dat_out);
    int64_t    expected     = OutStream_Tell(ix_out) / 8;

    // Verify doc id.
    if (doc_id != expected) {
        THROW(ERR, "Expected doc id %i64 but got %i32", expected, doc_id);
    }

    // First pass: count the stored fields and write the count.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (FType_Stored(type)) {
            stored_count++;
        }
    }
    OutStream_Write_C32(dat_out, stored_count);

    // Second pass: write name and value for each stored field.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);

        // Only store fields marked as "stored".
        if (!FType_Stored(type)) {
            continue;
        }

        CharBuf *field = Inverter_Get_Field_Name(inverter);
        Obj     *value = Inverter_Get_Value(inverter);
        Freezer_serialize_charbuf(field, dat_out);

        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                    uint8_t *text = CB_Get_Ptr8((CharBuf*)value);
                    size_t   len  = CB_Get_Size((CharBuf*)value);
                    OutStream_Write_C32(dat_out, len);
                    OutStream_Write_Bytes(dat_out, text, len);
                    break;
                }
            case FType_BLOB: {
                    char   *blob = BB_Get_Buf((ByteBuf*)value);
                    size_t  len  = BB_Get_Size((ByteBuf*)value);
                    OutStream_Write_C32(dat_out, len);
                    OutStream_Write_Bytes(dat_out, blob, len);
                    break;
                }
            case FType_INT32: {
                    int32_t number = Int32_Get_Value((Integer32*)value);
                    OutStream_Write_C32(dat_out, number);
                    break;
                }
            case FType_INT64: {
                    int64_t number = Int64_Get_Value((Integer64*)value);
                    OutStream_Write_C64(dat_out, number);
                    break;
                }
            case FType_FLOAT32: {
                    float number = Float32_Get_Value((Float32*)value);
                    OutStream_Write_F32(dat_out, number);
                    break;
                }
            case FType_FLOAT64: {
                    double number = Float64_Get_Value((Float64*)value);
                    OutStream_Write_F64(dat_out, number);
                    break;
                }
            default:
                THROW(ERR, "Unrecognized type: %o", type);
        }
    }

    // Write file pointer.
    OutStream_Write_I64(ix_out, record_start);
}
/**
 * Open the file at `path` for writing or reading, as selected by `flags`.
 *
 * Write mode opens a descriptor and, unless FH_EXCLUSIVE is set, derives
 * the current length by seeking to the end and back.  Read mode calls
 * SI_init_read_only and, on 64-bit systems with a non-empty file, maps the
 * whole file.
 *
 * On any failure the global error is set (by this function or by the
 * helper that failed), `self` is released and NULL is returned.
 */
FSFileHandle*
FSFH_do_open(FSFileHandle *self, const CharBuf *path, uint32_t flags) {
    FH_do_open((FileHandle*)self, path, flags);
    if (!path || !CB_Get_Size(path)) {
        Err_set_error(Err_new(CB_newf("Missing required param 'path'")));
        CFISH_DECREF(self);
        return NULL;
    }

    // Attempt to open file.
    if (flags & FH_WRITE_ONLY) {
        self->fd = open((char*)CB_Get_Ptr8(path), SI_posix_flags(flags), 0666);
        if (self->fd == -1) {
            self->fd = 0;
            Err_set_error(Err_new(CB_newf("Attempt to open '%o' failed: %s",
                                          path, strerror(errno))));
            CFISH_DECREF(self);
            return NULL;
        }

        if (flags & FH_EXCLUSIVE) {
            self->len = 0;
            return self;
        }

        // Derive length: seek to the end, then back to the start.
        int seek_ok = 1;
        self->len = lseek64(self->fd, I64_C(0), SEEK_END);
        if (self->len == -1) {
            seek_ok = 0;
        }
        else {
            int64_t rewound = lseek64(self->fd, I64_C(0), SEEK_SET);
            if (rewound == -1) {
                seek_ok = 0;
            }
        }
        if (!seek_ok) {
            Err_set_error(Err_new(CB_newf("lseek64 on %o failed: %s",
                                          self->path, strerror(errno))));
            CFISH_DECREF(self);
            return NULL;
        }
        return self;
    }

    if (flags & FH_READ_ONLY) {
        if (!SI_init_read_only(self)) {
            CFISH_DECREF(self);
            return NULL;
        }
        // On 64-bit systems, map the whole file up-front.
        if (IS_64_BIT && self->len) {
            self->buf = (char*)SI_map(self, 0, self->len);
            if (!self->buf) {
                // An error occurred during SI_map, which has set
                // Err_error for us already.
                CFISH_DECREF(self);
                return NULL;
            }
        }
        return self;
    }

    Err_set_error(Err_new(CB_newf("Must specify FH_READ_ONLY or FH_WRITE_ONLY to open '%o'",
                                  path)));
    CFISH_DECREF(self);
    return NULL;
}
/**
 * Drain the posting pool, handing each posting to `post_writer` and each
 * completed term to the lexicon writer, and emitting skip records to
 * `skip_stream` (when non-NULL) every `skip_interval` postings within a
 * term.
 *
 * A term is handed off when the text of the fetched posting differs from
 * the remembered text.  A sentinel posting with empty text stands in once
 * the pool is exhausted, so that the final term is handed off too.
 */
static void
S_write_terms_and_postings(PostingPool *self, PostingWriter *post_writer,
                           OutStream *skip_stream) {
    PostingPoolIVARS *const ivars            = PostPool_IVARS(self);
    TermInfo *const         tinfo            = TInfo_new(0);
    TermInfo *const         skip_tinfo       = TInfo_new(0);
    TermInfoIVARS *const    tinfo_ivars      = TInfo_IVARS(tinfo);
    TermInfoIVARS *const    skip_tinfo_ivars = TInfo_IVARS(skip_tinfo);
    LexiconWriter *const    lex_writer       = ivars->lex_writer;
    SkipStepper *const      skip_stepper     = ivars->skip_stepper;
    SkipStepperIVARS *const skip_stepper_ivars
        = SkipStepper_IVARS(skip_stepper);
    int32_t prev_skip_doc     = 0;
    int64_t prev_skip_filepos = 0;
    const int32_t skip_interval
        = Arch_Skip_Interval(Schema_Get_Architecture(ivars->schema));

    // Prime heldover variables.
    RawPosting *posting
        = (RawPosting*)CERTIFY(PostPool_Fetch(self), RAWPOSTING);
    RawPostingIVARS *post_ivars = RawPost_IVARS(posting);
    CharBuf *last_term_text
        = CB_new_from_trusted_utf8(post_ivars->blob,
                                   post_ivars->content_len);
    const char *last_text_buf  = CB_Get_Ptr8(last_term_text);
    uint32_t    last_text_size = CB_Get_Size(last_term_text);
    SkipStepper_Set_ID_And_Filepos(skip_stepper, 0, 0);

    // Initialize sentinel to be used on the last iter, using an empty string
    // in order to make LexiconWriter Do The Right Thing.
    size_t sentinel_size = Class_Get_Obj_Alloc_Size(RAWPOSTING)
                           + 20;  // blob length + cushion
    char empty_string[] = "";
    RawPosting *sentinel = RawPost_new(alloca(sentinel_size), 0, 1,
                                       empty_string, 0);

    for (;;) {
        bool term_unchanged;

        if (posting == NULL) {
            // On the last iter, use an empty string to make LexiconWriter
            // DTRT.
            posting        = sentinel;
            post_ivars     = RawPost_IVARS(posting);
            term_unchanged = false;
        }
        else {
            // Compare once: same length first, then same bytes.
            term_unchanged
                = post_ivars->content_len == last_text_size
                  && memcmp(&post_ivars->blob, last_text_buf,
                            last_text_size) == 0;
        }

        // If the term text changes, process the last term.
        if (!term_unchanged) {
            // Hand off to LexiconWriter.
            LexWriter_Add_Term(lex_writer, (Obj*)last_term_text, tinfo);

            // Start each term afresh.
            TInfo_Reset(tinfo);
            PostWriter_Start_Term(post_writer, tinfo);

            // Init skip data in preparation for the next term.
            skip_stepper_ivars->doc_id  = 0;
            skip_stepper_ivars->filepos = tinfo_ivars->post_filepos;
            prev_skip_doc     = 0;
            prev_skip_filepos = tinfo_ivars->post_filepos;

            // Remember the term_text so we can write string diffs.
            CB_Mimic_Utf8(last_term_text, post_ivars->blob,
                          post_ivars->content_len);
            last_text_buf  = CB_Get_Ptr8(last_term_text);
            last_text_size = CB_Get_Size(last_term_text);
        }

        // Bail on last iter before writing invalid posting data.
        if (posting == sentinel) {
            break;
        }

        // Write posting data.
        PostWriter_Write_Posting(post_writer, posting);

        // Doc freq lags by one iter.
        tinfo_ivars->doc_freq++;

        // Write skip data.
        if (skip_stream != NULL
            && term_unchanged
            && tinfo_ivars->doc_freq % skip_interval == 0
            && tinfo_ivars->doc_freq != 0
           ) {
            // If first skip group, save skip stream pos for term info.
            if (tinfo_ivars->doc_freq == skip_interval) {
                tinfo_ivars->skip_filepos = OutStream_Tell(skip_stream);
            }
            // Write deltas: keep the stepper's previous state, then load
            // the current doc id and posting file position into it.
            prev_skip_doc     = skip_stepper_ivars->doc_id;
            prev_skip_filepos = skip_stepper_ivars->filepos;
            skip_stepper_ivars->doc_id = post_ivars->doc_id;
            PostWriter_Update_Skip_Info(post_writer, skip_tinfo);
            skip_stepper_ivars->filepos = skip_tinfo_ivars->post_filepos;
            SkipStepper_Write_Record(skip_stepper, skip_stream,
                                     prev_skip_doc, prev_skip_filepos);
        }

        // Retrieve the next posting from the sort pool.  The posting just
        // written is deliberately not DECREF'd here.
        posting    = (RawPosting*)PostPool_Fetch(self);
        post_ivars = RawPost_IVARS(posting);
    }

    // Clean up.
    DECREF(last_term_text);
    DECREF(skip_tinfo);
    DECREF(tinfo);
}