// Consume a run of plain text starting at `iter` and wrap it in a
// TOKEN_STRING element.  The run ends at end of input or just before the
// first unescaped whitespace, double quote, or parenthesis.  A backslash
// causes the following code point to be taken without inspection.  On
// return, `iter` has been moved to the end of the consumed run.
static ParserElem*
S_consume_text(StringIterator *iter) {
    StringIterator *end = StrIter_Clone(iter);

    for (;;) {
        int32_t cp = StrIter_Next(end);

        if (cp == STR_OOB) {
            // Ran off the end of the input.
            break;
        }

        if (cp == '\\') {
            // Swallow the escaped code point, whatever it is.
            if (StrIter_Next(end) == STR_OOB) {
                break;
            }
            continue;
        }

        if (StrHelp_is_whitespace(cp)
            || cp == '"'
            || cp == '('
            || cp == ')'
           ) {
            // The delimiter is not part of the text; step back over it.
            StrIter_Recede(end, 1);
            break;
        }
    }

    String *text = StrIter_crop(iter, end);
    StrIter_Assign(iter, end);
    DECREF(end);

    return ParserElem_new(TOKEN_STRING, (Obj*)text);
}
// Try to consume `keyword` (of `keyword_len` bytes) at the position of
// `iter`.  The keyword only matches when it is followed by a terminating
// code point: whitespace, a double quote, a parenthesis, '+' or '-'.  A
// keyword that runs up to the very end of the input does not match.
//
// On a match, `iter` is moved to just after the keyword (the terminator is
// left unconsumed) and a new element of the given `type` is returned.
// Otherwise `iter` is untouched and NULL is returned.
static ParserElem*
S_consume_keyword(StringIterator *iter, const char *keyword,
                  size_t keyword_len, int type) {
    if (!StrIter_Starts_With_Utf8(iter, keyword, keyword_len)) {
        return NULL;
    }

    // Peek at the code point following the keyword using a scratch iterator.
    StringIterator *probe = StrIter_Clone(iter);
    StrIter_Advance(probe, keyword_len);
    int32_t following = StrIter_Next(probe);

    int terminated = following != STR_OOB
                     && (StrHelp_is_whitespace(following)
                         || following == '"'
                         || following == '('
                         || following == ')'
                         || following == '+'
                         || following == '-');

    if (terminated) {
        // Un-read the terminator and commit the new position.
        StrIter_Recede(probe, 1);
        StrIter_Assign(iter, probe);
    }
    DECREF(probe);

    return terminated ? ParserElem_new(type, NULL) : NULL;
}
// Check StrHelp_is_whitespace against a handful of code points: five that
// must be reported as whitespace, followed by three that must not.
static void
test_is_whitespace(TestBatchRunner *runner) {
    // Non-ASCII / non-printable code points used below.
    const int mongolian_vowel_separator = 0x180E;
    const int nul_code_point            = 0;
    const int smiley                    = 0x263A;

    // Expected whitespace.
    TEST_TRUE(runner, StrHelp_is_whitespace(' '),
              "space is whitespace");
    TEST_TRUE(runner, StrHelp_is_whitespace('\n'),
              "newline is whitespace");
    TEST_TRUE(runner, StrHelp_is_whitespace('\t'),
              "tab is whitespace");
    TEST_TRUE(runner, StrHelp_is_whitespace('\v'),
              "vertical tab is whitespace");
    TEST_TRUE(runner, StrHelp_is_whitespace(mongolian_vowel_separator),
              "Mongolian vowel separator is whitespace");

    // Expected non-whitespace.
    TEST_FALSE(runner, StrHelp_is_whitespace('a'),
               "'a' isn't whitespace");
    TEST_FALSE(runner, StrHelp_is_whitespace(nul_code_point),
               "NULL isn't whitespace");
    TEST_FALSE(runner, StrHelp_is_whitespace(smiley),
               "Smiley isn't whitespace");
}
// Step the iterator backwards over any whitespace code points immediately
// preceding its current position.  Returns how many whitespace code points
// were skipped.
//
// The byte offset is recorded after every accepted whitespace code point
// and written back at the end, so the iterator comes to rest just after
// the last non-whitespace code point (or at its original position if
// nothing was skipped) even though the loop reads one code point too far.
size_t
StrIter_Skip_Whitespace_Back_IMP(StringIterator *self) {
    size_t count          = 0;
    size_t resting_offset = self->byte_offset;

    for (;;) {
        int32_t cp = StrIter_Prev(self);

        if (cp == STR_OOB || !StrHelp_is_whitespace(cp)) {
            break;
        }

        // Accept this code point: remember where we are now.
        resting_offset = self->byte_offset;
        count++;
    }

    self->byte_offset = resting_offset;
    return count;
}
// Find an ending boundary before the current position given by the iterator. // Skip up to max_skip code points plus potential whitespace. Update the // iterator and return number of code points skipped. Return true if a // ending edge (sentence) was found. bool S_find_ending_boundary(StringIterator *tail, uint32_t max_skip, uint32_t *num_skipped_ptr) { int32_t code_point; // Check if we're at an ending boundary already. Don't check for a word // boundary because we need space for a trailing ellipsis. StringIterator *iter = StrIter_Clone(tail); do { code_point = StrIter_Next(iter); if (code_point == STRITER_DONE) { // Skip remaining whitespace. *num_skipped_ptr = StrIter_Skip_Prev_Whitespace(tail); DECREF(iter); return true; } } while (StrHelp_is_whitespace(code_point)); // Keep track of the first word boundary. StringIterator *word = NULL; uint32_t word_offset = 0; StrIter_Assign(iter, tail); for (uint32_t i = 0; STRITER_DONE != (code_point = StrIter_Prev(iter)); ++i) { if (code_point == '.') { StrIter_Assign(tail, iter); StrIter_Advance(tail, 1); // Include period. *num_skipped_ptr = i; DECREF(word); DECREF(iter); return true; } if (StrHelp_is_whitespace(code_point)) { if (word == NULL) { word = StrIter_Clone(iter); word_offset = i + 1; } } else if (i >= max_skip) { // Break only at non-whitespace to allow another sentence // boundary to be found. break; } } if (word == NULL) { // Make space for ellipsis. *num_skipped_ptr = StrIter_Recede(tail, 1); } else { // Use word boundary if no sentence boundary was found. StrIter_Assign(tail, word); // Strip whitespace and punctuation that collides with an ellipsis. while (STRITER_DONE != (code_point = StrIter_Prev(tail))) { if (!StrHelp_is_whitespace(code_point) && code_point != '.' && code_point != ',' && code_point != ';' && code_point != ':' && code_point != ':' && code_point != '?' && code_point != '!' ) { StrIter_Advance(tail, 1); // Back up. 
break; } ++word_offset; } *num_skipped_ptr = word_offset; } DECREF(word); DECREF(iter); return false; }
// Find a starting boundary after the current position given by the iterator. // Skip up to max_skip code points plus potential whitespace. Update the // iterator and return number of code points skipped. Return true if a // starting edge (sentence) was found. bool S_find_starting_boundary(StringIterator *top, uint32_t max_skip, uint32_t *num_skipped_ptr) { // Keep track of the first word boundary. StringIterator *word = NULL; uint32_t word_offset = 0; // Check if we're at a starting boundary already. StringIterator *iter = StrIter_Clone(top); while (true) { int32_t code_point = StrIter_Prev(iter); if (code_point == STRITER_DONE || code_point == '.') { // Skip remaining whitespace. *num_skipped_ptr = StrIter_Skip_Next_Whitespace(top); DECREF(iter); return true; } if (StrHelp_is_whitespace(code_point)) { if (word == NULL) { word = StrIter_Clone(top); } } else { break; } } // Try to start on a boundary. uint32_t num_skipped = 0; bool found_edge = false; StrIter_Assign(iter, top); for (uint32_t i = 0; i < max_skip; ++i) { int32_t code_point = StrIter_Next(iter); if (code_point == STRITER_DONE || code_point == '.') { found_edge = true; StrIter_Assign(top, iter); num_skipped = i + 1; break; } if (word == NULL && StrHelp_is_whitespace(code_point)) { word = StrIter_Clone(iter); word_offset = i + 1; } } // Try to use word boundary if no sentence boundary was found. if (!found_edge && word != NULL) { StrIter_Assign(top, word); num_skipped = word_offset; } // Skip remaining whitespace. num_skipped += StrIter_Skip_Next_Whitespace(top); *num_skipped_ptr = num_skipped; DECREF(word); DECREF(iter); return found_edge; }