// Consume a run of bare (unquoted) text beginning at `iter`.
//
// Scanning stops at the end of input, or just before the first unescaped
// whitespace character, double quote, or parenthesis.  A backslash causes
// the code point that follows it to be skipped without being inspected.
//
// On return, `iter` has been moved to where scanning stopped and the span
// between the old and new positions is wrapped in a TOKEN_STRING element.
static ParserElem*
S_consume_text(StringIterator *iter) {
    StringIterator *scan = StrIter_Clone(iter);

    for (;;) {
        int32_t cp = StrIter_Next(scan);

        if (cp == '\\') {
            // Escape sequence: swallow whatever follows the backslash.
            if (StrIter_Next(scan) == STR_OOB) {
                break;
            }
            continue;
        }

        if (cp == STR_OOB) {
            break;
        }

        if (StrHelp_is_whitespace(cp)
            || cp == '"'
            || cp == '('
            || cp == ')'
           ) {
            // The delimiter belongs to the next token; step back over it.
            StrIter_Recede(scan, 1);
            break;
        }
    }

    String *span = StrIter_crop(iter, scan);
    StrIter_Assign(iter, scan);
    DECREF(scan);
    return ParserElem_new(TOKEN_STRING, (Obj*)span);
}
// Try to consume `keyword` (of `keyword_len` bytes) at the current position.
//
// The keyword only matches when it is followed by at least one more code
// point, and that code point is whitespace or one of `"`, `(`, `)`, `+`,
// `-`.  A keyword sitting at the very end of the input is not matched.
//
// On a match, `iter` is moved just past the keyword (the following code
// point is left unconsumed) and a new element of the given `type` is
// returned.  Otherwise `iter` is left untouched and NULL is returned.
static ParserElem*
S_consume_keyword(StringIterator *iter, const char *keyword,
                  size_t keyword_len, int type) {
    if (!StrIter_Starts_With_Utf8(iter, keyword, keyword_len)) {
        return NULL;
    }

    ParserElem *elem = NULL;
    StringIterator *probe = StrIter_Clone(iter);
    StrIter_Advance(probe, keyword_len);

    int32_t following = StrIter_Next(probe);
    if (following != STR_OOB
        && (StrHelp_is_whitespace(following)
            || following == '"'
            || following == '('
            || following == ')'
            || following == '+'
            || following == '-')
       ) {
        // Un-read the code point that terminated the keyword.
        StrIter_Recede(probe, 1);
        StrIter_Assign(iter, probe);
        elem = ParserElem_new(type, NULL);
    }

    DECREF(probe);
    return elem;
}
// Return the last component of a slash-separated `path`.
//
// Trailing slashes are ignored, and the result runs from just after the
// preceding slash (or from the start of the string when there is none) to
// the last non-slash code point.  The result is whatever StrIter_crop
// produces for that span; presumably the caller owns it -- confirm against
// the StrIter_crop contract.
String*
IxFileNames_local_part(String *path) {
    StringIterator *begin = Str_Tail(path);
    int32_t cp;

    // Step backwards over any trailing slashes.
    do {
        cp = StrIter_Prev(begin);
    } while (cp == '/');

    // `begin` now sits just before the final non-slash code point, so the
    // end of the component is one code point further on.
    StringIterator *end = StrIter_Clone(begin);
    StrIter_Advance(end, 1);

    // Keep walking backwards until a slash or the start of the string.
    for (; cp != STR_OOB; cp = StrIter_Prev(begin)) {
        if (cp == '/') {
            // The component starts after the slash we just stepped over.
            StrIter_Advance(begin, 1);
            break;
        }
    }

    String *component = StrIter_crop(begin, end);

    DECREF(end);
    DECREF(begin);
    return component;
}
// The <ctype.h> classifiers are only defined for EOF and for values
// representable as unsigned char.  Code points coming out of a
// StringIterator can be far larger, so they are range-checked here before
// being classified.  Anything outside the range is reported as "not alpha".
static int
S_is_field_alpha(int32_t code_point) {
    return code_point >= 0 && code_point <= 0xFF && isalpha(code_point);
}

// Range-checked counterpart of isalnum(); see S_is_field_alpha.
static int
S_is_field_alnum(int32_t code_point) {
    return code_point >= 0 && code_point <= 0xFF && isalnum(code_point);
}

// Try to consume a `field:` prefix at the current position.
//
// A field name must start with a letter or underscore, continue with
// letters, digits or underscores only, and be terminated by a colon.  The
// colon must in turn be followed by a letter, digit, underscore, a code
// point above 127, a double quote, or an opening parenthesis.
//
// On success, `iter` is moved just past the colon and a TOKEN_FIELD element
// holding the field name (without the colon) is returned.  On any failure
// `iter` is left untouched and NULL is returned.
static ParserElem*
S_consume_field(StringIterator *iter) {
    StringIterator *temp = StrIter_Clone(iter);

    // Field names constructs must start with a letter or underscore.
    int32_t code_point = StrIter_Next(temp);
    if (code_point == STR_OOB) {
        DECREF(temp);
        return NULL;
    }
    if (!(S_is_field_alpha(code_point) || code_point == '_')) {
        DECREF(temp);
        return NULL;
    }

    // Only alphanumerics and underscores are allowed in field names.
    while (':' != (code_point = StrIter_Next(temp))) {
        if (code_point == STR_OOB) {
            DECREF(temp);
            return NULL;
        }
        if (!(S_is_field_alnum(code_point) || code_point == '_')) {
            DECREF(temp);
            return NULL;
        }
    }

    // Field name constructs must be followed by something sensible.
    int32_t lookahead = StrIter_Next(temp);
    if (lookahead == STR_OOB) {
        DECREF(temp);
        return NULL;
    }
    if (!(S_is_field_alnum(lookahead)
          || lookahead == '_'
          || lookahead > 127
          || lookahead == '"'
          || lookahead == '('
         )
       ) {
        DECREF(temp);
        return NULL;
    }

    // Consume string data.
    StrIter_Recede(temp, 2);  // Back up over lookahead and colon.
    String *field = StrIter_crop(iter, temp);
    StrIter_Advance(temp, 1); // Skip colon.
    StrIter_Assign(iter, temp);
    DECREF(temp);
    return ParserElem_new(TOKEN_FIELD, (Obj*)field);
}
// Consume a double-quoted string beginning at `iter`.
//
// The current code point must be a double quote; anything else raises an
// internal error.  Scanning continues up to and including the closing
// quote, or to the end of input when the string is unterminated.  A
// backslash causes the following code point to be skipped, so an escaped
// quote does not end the string.
//
// On return, `iter` has been moved to where scanning stopped and the span
// from the opening quote to that position is wrapped in a TOKEN_STRING
// element.
static ParserElem*
S_consume_quoted_string(StringIterator *iter) {
    StringIterator *scan = StrIter_Clone(iter);

    int32_t opener = StrIter_Next(scan);
    if (opener != '"') {
        THROW(ERR, "Internal error: expected a quote");
    }

    int32_t cp;
    do {
        cp = StrIter_Next(scan);
        if (cp == '\\') {
            // Skip the escaped code point, whatever it is.
            StrIter_Next(scan);
        }
    } while (cp != STR_OOB && cp != '"');

    String *span = StrIter_crop(iter, scan);
    StrIter_Assign(iter, scan);
    DECREF(scan);
    return ParserElem_new(TOKEN_STRING, (Obj*)span);
}
// Exercise StringIterator: equality, ordering, cloning and assignment,
// forward and backward traversal, and bulk Advance/Recede.
//
// The test string is built from code points whose values sit on the
// boundaries between UTF-8 encoded lengths (0x7F/0x80, 0x7FF/0x800,
// 0xFFFF/0x10000) plus the largest code point, 0x10FFFF.
static void
test_iterator(TestBatchRunner *runner) {
    static const int32_t code_points[] = {
        0x41,
        0x7F,
        0x80,
        0x7FF,
        0x800,
        0xFFFF,
        0x10000,
        0x10FFFF
    };
    static const size_t num_code_points
        = sizeof(code_points) / sizeof(code_points[0]);

    CharBuf *buf = CB_new(0);
    for (size_t i = 0; i < num_code_points; ++i) {
        CB_Cat_Char(buf, code_points[i]);
    }
    String *string = CB_To_String(buf);

    // Equality.
    {
        StringIterator *iter = Str_Top(string);

        TEST_TRUE(runner, StrIter_Equals(iter, (Obj*)iter),
                  "StringIterator equal to self");
        TEST_FALSE(runner, StrIter_Equals(iter, (Obj*)CFISH_TRUE),
                   "StringIterator not equal non-iterators");

        DECREF(iter);
    }

    // Ordering, Clone and Assign.
    {
        StringIterator *top  = Str_Top(string);
        StringIterator *tail = Str_Tail(string);

        TEST_INT_EQ(runner, StrIter_Compare_To(top, (Obj*)tail), -1,
                    "Compare_To top < tail");
        TEST_INT_EQ(runner, StrIter_Compare_To(tail, (Obj*)top), 1,
                    "Compare_To tail > top");
        TEST_INT_EQ(runner, StrIter_Compare_To(top, (Obj*)top), 0,
                    "Compare_To top == top");

        StringIterator *clone = StrIter_Clone(top);
        TEST_TRUE(runner, StrIter_Equals(clone, (Obj*)top), "Clone");

        StrIter_Assign(clone, tail);
        TEST_TRUE(runner, StrIter_Equals(clone, (Obj*)tail), "Assign");

        String *other = Str_newf("Other string");
        StringIterator *other_iter = Str_Top(other);
        TEST_FALSE(runner, StrIter_Equals(other_iter, (Obj*)tail),
                   "Equals returns false for different strings");
        StrIter_Assign(clone, other_iter);
        TEST_TRUE(runner, StrIter_Equals(clone, (Obj*)other_iter),
                  "Assign iterator with different string");

        DECREF(other);
        DECREF(other_iter);
        DECREF(clone);
        DECREF(top);
        DECREF(tail);
    }

    // Forward traversal.
    {
        StringIterator *iter = Str_Top(string);

        for (size_t i = 0; i < num_code_points; ++i) {
            // The index is cast because the label uses %d, which expects an
            // int rather than a size_t.
            TEST_TRUE(runner, StrIter_Has_Next(iter), "Has_Next %d", (int)i);
            int32_t code_point = StrIter_Next(iter);
            TEST_INT_EQ(runner, code_point, code_points[i], "Next %d",
                        (int)i);
        }

        TEST_TRUE(runner, !StrIter_Has_Next(iter),
                  "Has_Next at end of string");
        TEST_INT_EQ(runner, StrIter_Next(iter), STR_OOB,
                    "Next at end of string");

        StringIterator *tail = Str_Tail(string);
        TEST_TRUE(runner, StrIter_Equals(iter, (Obj*)tail), "Equals tail");

        DECREF(tail);
        DECREF(iter);
    }

    // Backward traversal.
    {
        StringIterator *iter = Str_Tail(string);

        for (size_t i = num_code_points; i--;) {
            TEST_TRUE(runner, StrIter_Has_Prev(iter), "Has_Prev %d", (int)i);
            int32_t code_point = StrIter_Prev(iter);
            TEST_INT_EQ(runner, code_point, code_points[i], "Prev %d",
                        (int)i);
        }

        TEST_TRUE(runner, !StrIter_Has_Prev(iter),
                  "Has_Prev at start of string");
        TEST_INT_EQ(runner, StrIter_Prev(iter), STR_OOB,
                    "Prev at start of string");

        StringIterator *top = Str_Top(string);
        TEST_TRUE(runner, StrIter_Equals(iter, (Obj*)top), "Equals top");

        DECREF(top);
        DECREF(iter);
    }

    // Advance and Recede.
    {
        StringIterator *iter = Str_Top(string);

        StrIter_Next(iter);
        TEST_INT_EQ(runner, StrIter_Advance(iter, 2), 2,
                    "Advance returns number of code points");
        TEST_INT_EQ(runner, StrIter_Next(iter), code_points[3],
                    "Advance works");
        TEST_INT_EQ(runner,
                    StrIter_Advance(iter, 1000000), num_code_points - 4,
                    "Advance past end of string");

        StrIter_Prev(iter);
        TEST_INT_EQ(runner, StrIter_Recede(iter, 2), 2,
                    "Recede returns number of code points");
        TEST_INT_EQ(runner, StrIter_Prev(iter),
                    code_points[num_code_points-4],
                    "Recede works");
        TEST_INT_EQ(runner, StrIter_Recede(iter, 1000000),
                    num_code_points - 4, "Recede past start of string");

        DECREF(iter);
    }

    DECREF(string);
    DECREF(buf);
}
// Build the raw (unhighlighted) excerpt for `field_val`.
//
// The excerpt is positioned around the hottest location reported for
// `heat_map`: it starts up to `slop` code points before that location and
// is at most `excerpt_length` code points long.  Both ends are then nudged
// onto a boundary by S_find_starting_boundary / S_find_ending_boundary.
// When an end does not fall on a boundary those helpers report as an edge,
// an ellipsis is added on that side.
//
// The offset of the excerpt within `field_val` is stored in `*start_ptr`.
// When a leading ellipsis and space are prepended the offset is reduced by
// two to account for them.
String*
Highlighter_Raw_Excerpt_IMP(Highlighter *self, String *field_val,
                            int32_t *start_ptr, HeatMap *heat_map) {
    HighlighterIVARS *const ivars = Highlighter_IVARS(self);

    // ---- Locate the start of the excerpt. ----
    StringIterator *top = Str_Top(field_val);
    int32_t  best_location = S_hottest(heat_map);
    int32_t  start;
    uint32_t lead_skip;

    if ((uint32_t)best_location > ivars->slop) {
        // Open the window `slop` code points ahead of the hottest point.
        start     = best_location - ivars->slop;
        lead_skip = ivars->slop;
        StrIter_Advance(top, start);
    }
    else {
        // The beginning of the string already lies inside the window
        // centered on the hottest point, so start at the very beginning.
        start     = 0;
        lead_skip = best_location;
    }

    uint32_t num_skipped;
    bool found_starting_edge
        = S_find_starting_boundary(top, lead_skip, &num_skipped);
    start += num_skipped;

    // ---- Locate the end of the excerpt. ----
    StringIterator *tail = StrIter_Clone(top);
    uint32_t max_len = ivars->excerpt_length;
    if (!found_starting_edge) {
        // Reserve room for the leading ellipsis and space character.
        max_len -= 2;
    }

    uint32_t excerpt_len = StrIter_Advance(tail, max_len);
    bool found_ending_edge = true;

    // Give back up to slop code points, but keep at least max_len - slop.
    if (excerpt_len > max_len - ivars->slop) {
        uint32_t trail_skip = excerpt_len - (max_len - ivars->slop);
        found_ending_edge
            = S_find_ending_boundary(tail, trail_skip, &num_skipped);
        excerpt_len = num_skipped >= excerpt_len
                      ? 0
                      : excerpt_len - num_skipped;
    }

    // ---- Assemble the excerpt. ----
    String *raw_excerpt;
    if (excerpt_len != 0) {
        String  *body    = StrIter_substring(top, tail);
        CharBuf *builder = CB_new(Str_Get_Size(body) + 8);

        if (!found_starting_edge) {
            // Not starting on a sentence boundary: lead with an ellipsis.
            CB_Cat_Char(builder, ELLIPSIS_CODE_POINT);
            CB_Cat_Char(builder, ' ');
            start -= 2;
        }

        CB_Cat(builder, body);

        if (!found_ending_edge) {
            // Not ending on a sentence boundary: trail with an ellipsis.
            CB_Cat_Char(builder, ELLIPSIS_CODE_POINT);
        }

        raw_excerpt = CB_Yield_String(builder);

        DECREF(builder);
        DECREF(body);
    }
    else {
        raw_excerpt = Str_new_from_trusted_utf8("", 0);
    }

    *start_ptr = start;

    DECREF(top);
    DECREF(tail);

    return raw_excerpt;
}
// Find an ending boundary before the current position given by the iterator. // Skip up to max_skip code points plus potential whitespace. Update the // iterator and return number of code points skipped. Return true if a // ending edge (sentence) was found. bool S_find_ending_boundary(StringIterator *tail, uint32_t max_skip, uint32_t *num_skipped_ptr) { int32_t code_point; // Check if we're at an ending boundary already. Don't check for a word // boundary because we need space for a trailing ellipsis. StringIterator *iter = StrIter_Clone(tail); do { code_point = StrIter_Next(iter); if (code_point == STRITER_DONE) { // Skip remaining whitespace. *num_skipped_ptr = StrIter_Skip_Prev_Whitespace(tail); DECREF(iter); return true; } } while (StrHelp_is_whitespace(code_point)); // Keep track of the first word boundary. StringIterator *word = NULL; uint32_t word_offset = 0; StrIter_Assign(iter, tail); for (uint32_t i = 0; STRITER_DONE != (code_point = StrIter_Prev(iter)); ++i) { if (code_point == '.') { StrIter_Assign(tail, iter); StrIter_Advance(tail, 1); // Include period. *num_skipped_ptr = i; DECREF(word); DECREF(iter); return true; } if (StrHelp_is_whitespace(code_point)) { if (word == NULL) { word = StrIter_Clone(iter); word_offset = i + 1; } } else if (i >= max_skip) { // Break only at non-whitespace to allow another sentence // boundary to be found. break; } } if (word == NULL) { // Make space for ellipsis. *num_skipped_ptr = StrIter_Recede(tail, 1); } else { // Use word boundary if no sentence boundary was found. StrIter_Assign(tail, word); // Strip whitespace and punctuation that collides with an ellipsis. while (STRITER_DONE != (code_point = StrIter_Prev(tail))) { if (!StrHelp_is_whitespace(code_point) && code_point != '.' && code_point != ',' && code_point != ';' && code_point != ':' && code_point != ':' && code_point != '?' && code_point != '!' ) { StrIter_Advance(tail, 1); // Back up. 
break; } ++word_offset; } *num_skipped_ptr = word_offset; } DECREF(word); DECREF(iter); return false; }
// Find a starting boundary after the current position given by the iterator. // Skip up to max_skip code points plus potential whitespace. Update the // iterator and return number of code points skipped. Return true if a // starting edge (sentence) was found. bool S_find_starting_boundary(StringIterator *top, uint32_t max_skip, uint32_t *num_skipped_ptr) { // Keep track of the first word boundary. StringIterator *word = NULL; uint32_t word_offset = 0; // Check if we're at a starting boundary already. StringIterator *iter = StrIter_Clone(top); while (true) { int32_t code_point = StrIter_Prev(iter); if (code_point == STRITER_DONE || code_point == '.') { // Skip remaining whitespace. *num_skipped_ptr = StrIter_Skip_Next_Whitespace(top); DECREF(iter); return true; } if (StrHelp_is_whitespace(code_point)) { if (word == NULL) { word = StrIter_Clone(top); } } else { break; } } // Try to start on a boundary. uint32_t num_skipped = 0; bool found_edge = false; StrIter_Assign(iter, top); for (uint32_t i = 0; i < max_skip; ++i) { int32_t code_point = StrIter_Next(iter); if (code_point == STRITER_DONE || code_point == '.') { found_edge = true; StrIter_Assign(top, iter); num_skipped = i + 1; break; } if (word == NULL && StrHelp_is_whitespace(code_point)) { word = StrIter_Clone(iter); word_offset = i + 1; } } // Try to use word boundary if no sentence boundary was found. if (!found_edge && word != NULL) { StrIter_Assign(top, word); num_skipped = word_offset; } // Skip remaining whitespace. num_skipped += StrIter_Skip_Next_Whitespace(top); *num_skipped_ptr = num_skipped; DECREF(word); DECREF(iter); return found_edge; }