/* Splits a line into a set of (word,word_position) tuples.
 *
 * The returned list is flat: its elements alternate between a newly
 * allocated GString holding the word and the word's start position packed
 * with GINT_TO_POINTER().  Positions are counted in UTF-8 characters from the
 * start of the line, not in bytes.  NULL is returned when the line contains
 * no words.
 *
 * NOTE(review): releasing the list and the GString elements is presumably
 * the caller's job -- confirm against the call sites.
 */
static GSList *
tokenize_line (GString * line)
{
  GSList * tokens = NULL;
  const char *utf = line->str;
  GString * word;
  gunichar uc;
  /* Character offset of `utf` within the line.  It is advanced by one for
   * every g_utf8_next_char() step instead of being recomputed from the start
   * of the line, which made the scan quadratic in the line length. */
  size_t cur_pos = 0;
  size_t start_pos = 0;

  word = g_string_new (NULL);

  /* cur_pos is a character count while line->len is a byte count, so the
   * NUL check is what actually ends the scan at the end of the string. */
  while (cur_pos < line->len && *utf)
    {
      int i;

      /* Skip non-word characters. */
      uc = g_utf8_get_char (utf);
      while (cur_pos < line->len && *utf && !is_word_char (uc, 0))
        {
          utf = g_utf8_next_char (utf);
          uc = g_utf8_get_char (utf);
          cur_pos++;
        }
      start_pos = cur_pos;

      /* Skip over word. */
      while (cur_pos < line->len && *utf && is_word_char (uc, 1))
        {
          g_string_append_unichar (word, uc);
          utf = g_utf8_next_char (utf);
          uc = g_utf8_get_char (utf);
          cur_pos++;
        }

      /* Do not accept one or more ' at the end of the word. */
      i = word->len - 1;
      while ((i >= 0) && (word->str[i] == '\''))
        {
          g_string_truncate (word, i);
          i--;
        }

      /* Save (word, position) tuple.  Elements are prepended (O(1) each) and
       * the list is reversed once at the end, so the final order is still
       * word, position, word, position, ... */
      if (word->len)
        {
          tokens = g_slist_prepend (tokens,
                                    g_string_new_len (word->str, word->len));
          tokens = g_slist_prepend (tokens, GINT_TO_POINTER (start_pos));
          g_string_truncate (word, 0);
        }
    }

  g_string_free (word, TRUE);

  return g_slist_reverse (tokens);
}
// Returns true when every character of `str` satisfies is_word_char().
// An empty string has no offending character and therefore yields true.
bool is_valid_cql_id(const std::string& str) {
  const std::string::size_type length = str.size();
  for (std::string::size_type pos = 0; pos < length; ++pos) {
    if (is_word_char(str[pos])) {
      continue;
    }
    // First non-word character decides the result.
    return false;
  }
  return true;
}
/*
 * Walks `line`, lower-casing it in place, and invokes `callback` once for
 * every maximal run of characters accepted by is_word_char().
 *
 * line     - NUL-terminated, writable buffer; it is modified (lower-cased)
 *            as the scan proceeds.
 * callback - receives `opaque`, a pointer to the first character of the token
 *            inside `line`, and the token length. The token is NOT
 *            NUL-terminated; only `length` characters belong to it.
 * opaque   - passed through to `callback` unchanged.
 *
 * When DEBUG is non-zero the input and each token are echoed with printf().
 */
void for_each_token(char* line, void (*callback)(void* opaque, const char* word, int length), void* opaque)
{
#if DEBUG
    printf("tokenizing [%s]\n", line);
    const char* i;
#endif
    char* start = line, *end = line;

    while (*end) {
        /* tolower() is only defined for EOF and values representable as
         * unsigned char; a plain (possibly negative) char must be converted
         * first or bytes >= 0x80 invoke undefined behaviour. */
        *end = (char)tolower((unsigned char)*end);

        if (is_word_char(*end)) {
            end++;
        } else {
            /* A non-word character closes the current token, if any. */
            if (start != end) {
#if DEBUG
                printf("[");
                for (i = start; i < end; i++) {
                    putchar(*i);
                }
                printf("] ");
#endif
                callback(opaque, start, (int)(end - start));
            }

            /* Skip the separator run so `start` lands on the next token
             * (or on the terminating NUL). */
            start = end;
            while (!is_word_char(*start) && *start) {
                start++;
            }
            end = start;
        }
    }

    /* Flush a token that runs up to the end of the line. */
    if (start != end) {
#if DEBUG
        printf("[");
        for (i = start; i < end; i++) {
            putchar(*i);
        }
        printf("] ");
#endif
        callback(opaque, start, (int)(end - start));
    }

#if DEBUG
    printf("\n");
#endif
}
/*
 * Removes all non-word characters from a given line, separating each group of
 * word characters (words) with a space.
 * Input with no word characters is output as the empty string ("\0").
 *
 * Word characters are all alpha-numerals (0-9, a-z, A-Z).
 * Non-word characters are any other ASCII characters.
 *
 * At most len characters of the input are examined. The result is stored in
 * a buffer of len + 1 bytes obtained from ALLOC and is always NUL-terminated.
 * NOTE(review): the caller presumably releases that buffer -- confirm against
 * the call sites and the definition of ALLOC.
 *
 * Input:   char *input   - the original line to be normalized
 *          size_t len    - the length of the original line
 *          char **output - pointer to where the normalized line will be stored
 */
void normalize(char *input, size_t len, char **output)
{
    /* The normalized text is never longer than the original, but it still
     * needs one extra byte for the terminator: an input made only of word
     * characters yields len characters plus '\0'. */
    char *normal = ALLOC(len + 1);
    size_t pos = 0, itr = 0;

    while (itr < len) {
        /* Skip the run of non-word characters in front of the next word. */
        while (itr < len && input[itr] != '\0' && !is_word_char(input[itr]))
            itr++;

        /* Another word follows an earlier one: separate them with a space. */
        if (itr < len && input[itr] != '\0' && pos > 0)
            normal[pos++] = ' ';

        /* Copy the word itself; the bound keeps the scan inside len. */
        while (itr < len && input[itr] != '\0' && is_word_char(input[itr]))
            normal[pos++] = input[itr++];

        /* Step over the character that ended the word (or an embedded NUL). */
        itr++;
    }

    normal[pos] = '\0';
    *output = normal;
}
/*
 * Returns the text of the word that ends at `iter`, or NULL when `iter` is at
 * the start of the buffer or no word character precedes it on its line.
 *
 * As a side effect the text mark named MARK_NAME is moved to (or created at)
 * the start of the word. `devhelp` is not used by this function.
 */
static gchar *
get_word_at_iter (GscProviderDevhelp *devhelp,
                  GtkTextIter        *iter)
{
	GtkTextIter word_start = *iter;
	const gint origin_line = gtk_text_iter_get_line (iter);
	gboolean moved;
	GtkTextBuffer *buffer;
	GtkTextMark *word_mark;

	/* Nothing precedes the iterator: there can be no word. */
	if (!gtk_text_iter_backward_char (&word_start))
		return NULL;

	/* Walk backwards while still on the original line and on word characters. */
	for (moved = TRUE;
	     moved &&
	     gtk_text_iter_get_line (&word_start) == origin_line &&
	     is_word_char (gtk_text_iter_get_char (&word_start));
	     moved = gtk_text_iter_backward_char (&word_start))
	{
		/* all the work happens in the loop header */
	}

	/* The scan stopped on a character that is not part of the word (rather
	 * than at the start of the buffer), so step back onto the word. */
	if (moved)
		gtk_text_iter_forward_char (&word_start);

	/* Empty range: no word characters were found before the iterator. */
	if (gtk_text_iter_equal (iter, &word_start))
		return NULL;

	buffer = gtk_text_iter_get_buffer (iter);
	word_mark = gtk_text_buffer_get_mark (buffer, MARK_NAME);

	if (word_mark == NULL)
		word_mark = gtk_text_buffer_create_mark (buffer, MARK_NAME, &word_start, TRUE);
	else
		gtk_text_buffer_move_mark (buffer, word_mark, &word_start);

	return gtk_text_iter_get_text (&word_start, iter);
}
/*
 * Reads a word from the parser input into a local buffer (at most 31
 * characters; any further word characters are consumed and discarded) and
 * then classifies it: either as an uninterpreted string when P_GETSTR is set,
 * or via the appropriate look_up_* routine depending on its first character.
 */
static void getword (parser *p)
{
    char word[32];
    const int maxlen = (int) sizeof word - 1; /* leave room for the NUL */
    int n = 0;
    int dollar, at_sign;

    /* the first char is known to be acceptable (and might be '$' or '@') */
    word[n++] = p->ch;
    parser_getc(p);

#ifdef USE_RLIB
    /* allow for R.foo function namespace */
    if (word[0] == 'R' && p->ch == '.' && *p->point != '$') {
        word[n++] = p->ch;
        parser_getc(p);
    }
#endif

    /* accumulate word characters until the buffer is full */
    while (p->ch != 0 && is_word_char(p) && n < maxlen) {
        word[n++] = p->ch;
        parser_getc(p);
    }
    word[n] = '\0';

#if LDEBUG
    fprintf(stderr, "getword: word = '%s'\n", word);
#endif

    /* flush excess word characters */
    while (p->ch != 0 && strchr(wordchars, p->ch) != NULL) {
        parser_getc(p);
    }

    if (p->flags & P_GETSTR) {
        /* uninterpreted string wanted */
        p->sym = STR;
        p->idstr = gretl_strdup(word);
        p->flags ^= P_GETSTR;
        return;
    }

    dollar = (word[0] == '$');
    at_sign = (word[0] == '@');

    if ((dollar && word[1] != '\0') || strcmp(word, "obs") == 0) {
        look_up_dollar_word(word, p);
    } else if (at_sign) {
        /* do we actually want to do this? */
        look_up_string_variable(word, p);
    } else if (dollar && word[1] == '\0' && p->ch == '[') {
        /* a bare '$' followed by '[' */
        p->sym = BUNDLE;
        p->idstr = gretl_strdup("$");
    } else {
        look_up_word(word, p);
    }

    if (!at_sign && !p->err) {
        word_check_next_char(p);
    }

#if LDEBUG
    fprintf(stderr, "getword: p->err = %d\n", p->err);
#endif
}