static String* S_encode_entities(String *text, CharBuf *buf) { StringIterator *iter = Str_Top(text); size_t space = 0; const int MAX_ENTITY_BYTES = 9; // &#dddddd; // Scan first so that we only allocate once. int32_t code_point; while (STRITER_DONE != (code_point = StrIter_Next(iter))) { if (code_point > 127 || (!isgraph(code_point) && !isspace(code_point)) || code_point == '<' || code_point == '>' || code_point == '&' || code_point == '"' ) { space += MAX_ENTITY_BYTES; } else { space += 1; } } CB_Grow(buf, space); CB_Set_Size(buf, 0); DECREF(iter); iter = Str_Top(text); while (STRITER_DONE != (code_point = StrIter_Next(iter))) { if (code_point > 127 || (!isgraph(code_point) && !isspace(code_point)) ) { CB_catf(buf, "&#%u32;", code_point); } else if (code_point == '<') { CB_Cat_Trusted_Utf8(buf, "<", 4); } else if (code_point == '>') { CB_Cat_Trusted_Utf8(buf, ">", 4); } else if (code_point == '&') { CB_Cat_Trusted_Utf8(buf, "&", 5); } else if (code_point == '"') { CB_Cat_Trusted_Utf8(buf, """, 6); } else { CB_Cat_Char(buf, code_point); } } DECREF(iter); return CB_To_String(buf); }
static String* S_unescape(QueryParser *self, String *orig, CharBuf *buf) { StringIterator *iter = Str_Top(orig); int32_t code_point; UNUSED_VAR(self); CB_Set_Size(buf, 0); CB_Grow(buf, Str_Get_Size(orig) + 4); while (STRITER_DONE != (code_point = StrIter_Next(iter))) { if (code_point == '\\') { int32_t next_code_point = StrIter_Next(iter); if (next_code_point == ':' || next_code_point == '"' || next_code_point == '\\' ) { CB_Cat_Char(buf, next_code_point); } else { CB_Cat_Char(buf, code_point); if (next_code_point != STRITER_DONE) { CB_Cat_Char(buf, next_code_point); } } } else { CB_Cat_Char(buf, code_point); } } DECREF(iter); return CB_To_String(buf); }
Lock* Lock_init(Lock *self, Folder *folder, String *name, int32_t timeout, int32_t interval) { LockIVARS *const ivars = Lock_IVARS(self); // Validate. if (interval <= 0) { DECREF(self); THROW(ERR, "Invalid value for 'interval': %i32", interval); } StringIterator *iter = Str_Top(name); int32_t code_point; while (STR_OOB != (code_point = StrIter_Next(iter))) { if (isalnum(code_point) || code_point == '.' || code_point == '-' || code_point == '_' ) { continue; } DECREF(self); THROW(ERR, "Lock name contains disallowed characters: '%o'", name); } DECREF(iter); // Assign. ivars->folder = (Folder*)INCREF(folder); ivars->timeout = timeout; ivars->name = Str_Clone(name); ivars->interval = interval; return self; }
static void test_iterator_whitespace(TestBatchRunner *runner) { int num_spaces; String *ws_smiley = S_smiley_with_whitespace(&num_spaces); { StringIterator *iter = Str_Top(ws_smiley); TEST_INT_EQ(runner, StrIter_Skip_Whitespace(iter), num_spaces, "Skip_Whitespace"); TEST_INT_EQ(runner, StrIter_Skip_Whitespace(iter), 0, "Skip_Whitespace without whitespace"); DECREF(iter); } { StringIterator *iter = Str_Tail(ws_smiley); TEST_INT_EQ(runner, StrIter_Skip_Whitespace_Back(iter), num_spaces, "Skip_Whitespace_Back"); TEST_INT_EQ(runner, StrIter_Skip_Whitespace_Back(iter), 0, "Skip_Whitespace_Back without whitespace"); DECREF(iter); } DECREF(ws_smiley); }
bool LFLock_Maybe_Delete_File_IMP(LockFileLock *self, String *path, bool delete_mine, bool delete_other) { LockFileLockIVARS *const ivars = LFLock_IVARS(self); Folder *folder = ivars->folder; bool success = false; // Only delete locks that start with our lock name. if (!Str_Starts_With_Utf8(path, "locks", 5)) { return false; } StringIterator *iter = Str_Top(path); StrIter_Advance(iter, 5 + 1); if (!StrIter_Starts_With(iter, ivars->name)) { DECREF(iter); return false; } DECREF(iter); // Attempt to delete dead lock file. if (Folder_Exists(folder, path)) { Hash *hash = (Hash*)Json_slurp_json(folder, path); if (hash != NULL && Obj_Is_A((Obj*)hash, HASH)) { String *pid_buf = (String*)Hash_Fetch_Utf8(hash, "pid", 3); String *host = (String*)Hash_Fetch_Utf8(hash, "host", 4); String *name = (String*)Hash_Fetch_Utf8(hash, "name", 4); // Match hostname and lock name. if (host != NULL && Str_Is_A(host, STRING) && Str_Equals(host, (Obj*)ivars->host) && name != NULL && Str_Is_A(name, STRING) && Str_Equals(name, (Obj*)ivars->name) && pid_buf != NULL && Str_Is_A(pid_buf, STRING) ) { // Verify that pid is either mine or dead. int pid = (int)Str_To_I64(pid_buf); if ((delete_mine && pid == PID_getpid()) // This process. || (delete_other && !PID_active(pid)) // Dead pid. ) { if (Folder_Delete(folder, path)) { success = true; } else { String *mess = MAKE_MESS("Can't delete '%o'", path); DECREF(hash); Err_throw_mess(ERR, mess); } } } } DECREF(hash); } return success; }
bool Seg_valid_seg_name(String *name) { if (Str_Starts_With_Utf8(name, "seg_", 4)) { StringIterator *iter = Str_Top(name); StrIter_Advance(iter, 4); int32_t code_point; while (STR_OOB != (code_point = StrIter_Next(iter))) { if (!isalnum(code_point)) { DECREF(iter); return false; } } DECREF(iter); return true; // Success! } return false; }
uint64_t IxFileNames_extract_gen(String *name) { StringIterator *iter = Str_Top(name); // Advance past first underscore. Bail if we run out of string or if we // encounter a NULL. while (1) { int32_t code_point = StrIter_Next(iter); if (code_point == STR_OOB) { return 0; } else if (code_point == '_') { break; } } String *num_string = StrIter_crop(iter, NULL); uint64_t retval = (uint64_t)Str_BaseX_To_I64(num_string, 36); DECREF(num_string); DECREF(iter); return retval; }
Query* QParser_Expand_Leaf_IMP(QueryParser *self, Query *query) { QueryParserIVARS *const ivars = QParser_IVARS(self); LeafQuery *leaf_query = (LeafQuery*)query; Schema *schema = ivars->schema; bool is_phrase = false; bool ambiguous = false; // Determine whether we can actually process the input. if (!Query_is_a(query, LEAFQUERY)) { return NULL; } String *full_text = LeafQuery_Get_Text(leaf_query); if (!Str_Get_Size(full_text)) { return NULL; } // If quoted, always generate PhraseQuery. StringIterator *top = Str_Top(full_text); StringIterator *tail = Str_Tail(full_text); StrIter_Skip_Next_Whitespace(top); StrIter_Skip_Prev_Whitespace(tail); if (StrIter_Starts_With_Utf8(top, "\"", 1)) { is_phrase = true; StrIter_Advance(top, 1); if (StrIter_Ends_With_Utf8(tail, "\"", 1) && !StrIter_Ends_With_Utf8(tail, "\\\"", 2) ) { StrIter_Recede(tail, 1); } } String *source_text = StrIter_substring(top, tail); // Either use LeafQuery's field or default to Parser's list. Vector *fields; if (LeafQuery_Get_Field(leaf_query)) { fields = Vec_new(1); Vec_Push(fields, INCREF(LeafQuery_Get_Field(leaf_query))); } else { fields = (Vector*)INCREF(ivars->fields); } CharBuf *unescape_buf = CB_new(Str_Get_Size(source_text)); Vector *queries = Vec_new(Vec_Get_Size(fields)); for (uint32_t i = 0, max = Vec_Get_Size(fields); i < max; i++) { String *field = (String*)Vec_Fetch(fields, i); Analyzer *analyzer = ivars->analyzer ? ivars->analyzer : Schema_Fetch_Analyzer(schema, field); if (!analyzer) { Vec_Push(queries, (Obj*)QParser_Make_Term_Query(self, field, (Obj*)source_text)); } else { // Extract token texts. String *split_source = S_unescape(self, source_text, unescape_buf); Vector *maybe_texts = Analyzer_Split(analyzer, split_source); uint32_t num_maybe_texts = Vec_Get_Size(maybe_texts); Vector *token_texts = Vec_new(num_maybe_texts); // Filter out zero-length token texts. for (uint32_t j = 0; j < num_maybe_texts; j++) { String *token_text = (String*)Vec_Fetch(maybe_texts, j); if (Str_Get_Size(token_text)) { Vec_Push(token_texts, INCREF(token_text)); } } if (Vec_Get_Size(token_texts) == 0) { /* Query might include stop words. Who knows? */ ambiguous = true; } // Add either a TermQuery or a PhraseQuery. if (is_phrase || Vec_Get_Size(token_texts) > 1) { Vec_Push(queries, (Obj*) QParser_Make_Phrase_Query(self, field, token_texts)); } else if (Vec_Get_Size(token_texts) == 1) { Vec_Push(queries, (Obj*)QParser_Make_Term_Query(self, field, Vec_Fetch(token_texts, 0))); } DECREF(token_texts); DECREF(maybe_texts); DECREF(split_source); } } Query *retval; if (Vec_Get_Size(queries) == 0) { retval = (Query*)NoMatchQuery_new(); if (ambiguous) { NoMatchQuery_Set_Fails_To_Match((NoMatchQuery*)retval, false); } } else if (Vec_Get_Size(queries) == 1) { retval = (Query*)INCREF(Vec_Fetch(queries, 0)); } else { retval = QParser_Make_OR_Query(self, queries); } // Clean up. DECREF(unescape_buf); DECREF(queries); DECREF(fields); DECREF(source_text); DECREF(tail); DECREF(top); return retval; }
Vector* QueryLexer_Tokenize_IMP(QueryLexer *self, String *query_string) { QueryLexerIVARS *const ivars = QueryLexer_IVARS(self); Vector *elems = Vec_new(0); if (!query_string) { return elems; } StringIterator *iter = Str_Top(query_string); while (StrIter_Has_Next(iter)) { ParserElem *elem = NULL; if (StrIter_Skip_Whitespace(iter)) { // Fast-forward past whitespace. continue; } if (ivars->heed_colons) { ParserElem *elem = S_consume_field(iter); if (elem) { Vec_Push(elems, (Obj*)elem); } } int32_t code_point = StrIter_Next(iter); switch (code_point) { case '(': elem = ParserElem_new(TOKEN_OPEN_PAREN, NULL); break; case ')': elem = ParserElem_new(TOKEN_CLOSE_PAREN, NULL); break; case '+': if (StrIter_Has_Next(iter) && !StrIter_Skip_Whitespace(iter) ) { elem = ParserElem_new(TOKEN_PLUS, NULL); } else { elem = ParserElem_new(TOKEN_STRING, (Obj*)Str_newf("+")); } break; case '-': if (StrIter_Has_Next(iter) && !StrIter_Skip_Whitespace(iter) ) { elem = ParserElem_new(TOKEN_MINUS, NULL); } else { elem = ParserElem_new(TOKEN_STRING, (Obj*)Str_newf("-")); } break; case '"': StrIter_Recede(iter, 1); elem = S_consume_quoted_string(iter); break; case 'O': StrIter_Recede(iter, 1); elem = S_consume_keyword(iter, "OR", 2, TOKEN_OR); if (!elem) { elem = S_consume_text(iter); } break; case 'A': StrIter_Recede(iter, 1); elem = S_consume_keyword(iter, "AND", 3, TOKEN_AND); if (!elem) { elem = S_consume_text(iter); } break; case 'N': StrIter_Recede(iter, 1); elem = S_consume_keyword(iter, "NOT", 3, TOKEN_NOT); if (!elem) { elem = S_consume_text(iter); } break; default: StrIter_Recede(iter, 1); elem = S_consume_text(iter); break; } Vec_Push(elems, (Obj*)elem); } DECREF(iter); return elems; }
static void test_iterator_substring(TestBatchRunner *runner) { String *string = Str_newf("a%sb%sc%sd", smiley, smiley, smiley); StringIterator *start = Str_Top(string); StringIterator *end = Str_Tail(string); { String *substring = StrIter_crop(start, end); TEST_TRUE(runner, Str_Equals(substring, (Obj*)string), "StrIter_crop whole string"); DECREF(substring); } StrIter_Advance(start, 2); StrIter_Recede(end, 2); { String *substring = StrIter_crop(start, end); String *wanted = Str_newf("b%sc", smiley); TEST_TRUE(runner, Str_Equals(substring, (Obj*)wanted), "StrIter_crop"); TEST_TRUE(runner, StrIter_Starts_With(start, wanted), "Starts_With returns true"); TEST_TRUE(runner, StrIter_Ends_With(end, wanted), "Ends_With returns true"); DECREF(wanted); DECREF(substring); } { String *short_str = Str_newf("b%sx", smiley); TEST_FALSE(runner, StrIter_Starts_With(start, short_str), "Starts_With returns false"); TEST_FALSE(runner, StrIter_Ends_With(start, short_str), "Ends_With returns false"); String *long_str = Str_newf("b%sxxxxxxxxxxxx%sc", smiley, smiley); TEST_FALSE(runner, StrIter_Starts_With(start, long_str), "Starts_With long string returns false"); TEST_FALSE(runner, StrIter_Ends_With(end, long_str), "Ends_With long string returns false"); DECREF(short_str); DECREF(long_str); } { String *substring = StrIter_crop(end, NULL); String *wanted = Str_newf("%sd", smiley); TEST_TRUE(runner, Str_Equals(substring, (Obj*)wanted), "StrIter_crop with NULL tail"); DECREF(wanted); DECREF(substring); } { String *substring = StrIter_crop(NULL, start); String *wanted = Str_newf("a%s", smiley); TEST_TRUE(runner, Str_Equals(substring, (Obj*)wanted), "StrIter_crop with NULL top"); DECREF(wanted); DECREF(substring); } DECREF(start); DECREF(end); DECREF(string); }
static void test_iterator(TestBatchRunner *runner) { static const int32_t code_points[] = { 0x41, 0x7F, 0x80, 0x7FF, 0x800, 0xFFFF, 0x10000, 0x10FFFF }; static size_t num_code_points = sizeof(code_points) / sizeof(code_points[0]); CharBuf *buf = CB_new(0); for (size_t i = 0; i < num_code_points; ++i) { CB_Cat_Char(buf, code_points[i]); } String *string = CB_To_String(buf); { StringIterator *iter = Str_Top(string); TEST_TRUE(runner, StrIter_Equals(iter, (Obj*)iter), "StringIterator equal to self"); TEST_FALSE(runner, StrIter_Equals(iter, (Obj*)CFISH_TRUE), "StringIterator not equal non-iterators"); DECREF(iter); } { StringIterator *top = Str_Top(string); StringIterator *tail = Str_Tail(string); TEST_INT_EQ(runner, StrIter_Compare_To(top, (Obj*)tail), -1, "Compare_To top < tail"); TEST_INT_EQ(runner, StrIter_Compare_To(tail, (Obj*)top), 1, "Compare_To tail > top"); TEST_INT_EQ(runner, StrIter_Compare_To(top, (Obj*)top), 0, "Compare_To top == top"); StringIterator *clone = StrIter_Clone(top); TEST_TRUE(runner, StrIter_Equals(clone, (Obj*)top), "Clone"); StrIter_Assign(clone, tail); TEST_TRUE(runner, StrIter_Equals(clone, (Obj*)tail), "Assign"); String *other = Str_newf("Other string"); StringIterator *other_iter = Str_Top(other); TEST_FALSE(runner, StrIter_Equals(other_iter, (Obj*)tail), "Equals returns false for different strings"); StrIter_Assign(clone, other_iter); TEST_TRUE(runner, StrIter_Equals(clone, (Obj*)other_iter), "Assign iterator with different string"); DECREF(other); DECREF(other_iter); DECREF(clone); DECREF(top); DECREF(tail); } { StringIterator *iter = Str_Top(string); for (size_t i = 0; i < num_code_points; ++i) { TEST_TRUE(runner, StrIter_Has_Next(iter), "Has_Next %d", i); int32_t code_point = StrIter_Next(iter); TEST_INT_EQ(runner, code_point, code_points[i], "Next %d", i); } TEST_TRUE(runner, !StrIter_Has_Next(iter), "Has_Next at end of string"); TEST_INT_EQ(runner, StrIter_Next(iter), STR_OOB, "Next at end of string"); StringIterator *tail = Str_Tail(string); TEST_TRUE(runner, StrIter_Equals(iter, (Obj*)tail), "Equals tail"); DECREF(tail); DECREF(iter); } { StringIterator *iter = Str_Tail(string); for (size_t i = num_code_points; i--;) { TEST_TRUE(runner, StrIter_Has_Prev(iter), "Has_Prev %d", i); int32_t code_point = StrIter_Prev(iter); TEST_INT_EQ(runner, code_point, code_points[i], "Prev %d", i); } TEST_TRUE(runner, !StrIter_Has_Prev(iter), "Has_Prev at end of string"); TEST_INT_EQ(runner, StrIter_Prev(iter), STR_OOB, "Prev at start of string"); StringIterator *top = Str_Top(string); TEST_TRUE(runner, StrIter_Equals(iter, (Obj*)top), "Equals top"); DECREF(top); DECREF(iter); } { StringIterator *iter = Str_Top(string); StrIter_Next(iter); TEST_INT_EQ(runner, StrIter_Advance(iter, 2), 2, "Advance returns number of code points"); TEST_INT_EQ(runner, StrIter_Next(iter), code_points[3], "Advance works"); TEST_INT_EQ(runner, StrIter_Advance(iter, 1000000), num_code_points - 4, "Advance past end of string"); StrIter_Prev(iter); TEST_INT_EQ(runner, StrIter_Recede(iter, 2), 2, "Recede returns number of code points"); TEST_INT_EQ(runner, StrIter_Prev(iter), code_points[num_code_points-4], "Recede works"); TEST_INT_EQ(runner, StrIter_Recede(iter, 1000000), num_code_points - 4, "Recede past start of string"); DECREF(iter); } DECREF(string); DECREF(buf); }
String* Highlighter_Highlight_Excerpt_IMP(Highlighter *self, VArray *spans, String *raw_excerpt, int32_t top) { int32_t hl_start = 0; int32_t hl_end = 0; StringIterator *iter = Str_Top(raw_excerpt); StringIterator *temp = Str_Top(raw_excerpt); CharBuf *buf = CB_new(Str_Get_Size(raw_excerpt) + 32); CharBuf *encode_buf = NULL; int32_t raw_excerpt_end = top + Str_Length(raw_excerpt); for (uint32_t i = 0, max = VA_Get_Size(spans); i < max; i++) { Span *span = (Span*)VA_Fetch(spans, i); int32_t offset = Span_Get_Offset(span); if (offset < top) { continue; } else if (offset >= raw_excerpt_end) { break; } else { int32_t relative_start = offset - top; int32_t relative_end = relative_start + Span_Get_Length(span); if (relative_start <= hl_end) { if (relative_end > hl_end) { hl_end = relative_end; } } else { if (hl_start < hl_end) { // Highlight previous section int32_t highlighted_len = hl_end - hl_start; StrIter_Assign(temp, iter); StrIter_Advance(iter, highlighted_len); String *to_cat = StrIter_substring(temp, iter); String *encoded = S_do_encode(self, to_cat, &encode_buf); String *hl_frag = Highlighter_Highlight(self, encoded); CB_Cat(buf, hl_frag); DECREF(hl_frag); DECREF(encoded); DECREF(to_cat); } int32_t non_highlighted_len = relative_start - hl_end; StrIter_Assign(temp, iter); StrIter_Advance(iter, non_highlighted_len); String *to_cat = StrIter_substring(temp, iter); String *encoded = S_do_encode(self, to_cat, &encode_buf); CB_Cat(buf, (String*)encoded); DECREF(encoded); DECREF(to_cat); hl_start = relative_start; hl_end = relative_end; } } } if (hl_start < hl_end) { // Highlight final section int32_t highlighted_len = hl_end - hl_start; StrIter_Assign(temp, iter); StrIter_Advance(iter, highlighted_len); String *to_cat = StrIter_substring(temp, iter); String *encoded = S_do_encode(self, to_cat, &encode_buf); String *hl_frag = Highlighter_Highlight(self, encoded); CB_Cat(buf, hl_frag); DECREF(hl_frag); DECREF(encoded); DECREF(to_cat); } // Last text, beyond last highlight span. if (StrIter_Has_Next(iter)) { String *to_cat = StrIter_substring(iter, NULL); String *encoded = S_do_encode(self, to_cat, &encode_buf); CB_Cat(buf, encoded); DECREF(encoded); DECREF(to_cat); } String *highlighted = CB_Yield_String(buf); DECREF(encode_buf); DECREF(buf); DECREF(temp); DECREF(iter); return highlighted; }
String* Highlighter_Raw_Excerpt_IMP(Highlighter *self, String *field_val, int32_t *start_ptr, HeatMap *heat_map) { HighlighterIVARS *const ivars = Highlighter_IVARS(self); // Find start of excerpt. StringIterator *top = Str_Top(field_val); int32_t best_location = S_hottest(heat_map); int32_t start; uint32_t max_skip; if ((uint32_t)best_location <= ivars->slop) { // If the beginning of the string falls within the window centered // around the hottest point in the field, start the fragment at the // beginning. start = 0; max_skip = best_location; } else { start = best_location - ivars->slop; max_skip = ivars->slop; StrIter_Advance(top, start); } uint32_t num_skipped; bool found_starting_edge = S_find_starting_boundary(top, max_skip, &num_skipped); start += num_skipped; // Find end of excerpt. StringIterator *tail = StrIter_Clone(top); uint32_t max_len = ivars->excerpt_length; if (!found_starting_edge) { // Leave space for starting ellipsis and space character. max_len -= 2; } bool found_ending_edge = true; uint32_t excerpt_len = StrIter_Advance(tail, max_len); // Skip up to slop code points but keep at least max_len - slop. if (excerpt_len > max_len - ivars->slop) { max_skip = excerpt_len - (max_len - ivars->slop); found_ending_edge = S_find_ending_boundary(tail, max_skip, &num_skipped); if (num_skipped >= excerpt_len) { excerpt_len = 0; } else { excerpt_len -= num_skipped; } } // Extract excerpt. String *raw_excerpt; if (!excerpt_len) { raw_excerpt = Str_new_from_trusted_utf8("", 0); } else { String *substring = StrIter_substring(top, tail); CharBuf *buf = CB_new(Str_Get_Size(substring) + 8); // If not starting on a sentence boundary, prepend an ellipsis. if (!found_starting_edge) { CB_Cat_Char(buf, ELLIPSIS_CODE_POINT); CB_Cat_Char(buf, ' '); start -= 2; } CB_Cat(buf, substring); // If not ending on a sentence boundary, append an ellipsis. if (!found_ending_edge) { CB_Cat_Char(buf, ELLIPSIS_CODE_POINT); } raw_excerpt = CB_Yield_String(buf); DECREF(buf); DECREF(substring); } *start_ptr = start; DECREF(top); DECREF(tail); return raw_excerpt; }