示例#1
0
/*
 * Splits a line into a flat list of (word, word_position) pairs.
 *
 * The returned list alternates between a GString created with
 * g_string_new_len() that holds one word, and that word's starting position
 * packed with GINT_TO_POINTER().  Positions are counted in UTF-8 characters
 * from the start of the line, not in bytes.
 *
 * A character starts a word when is_word_char (uc, 0) accepts it and
 * continues a word when is_word_char (uc, 1) accepts it.  Trailing ASCII
 * apostrophes are removed from every word; a word that becomes empty is not
 * reported.
 *
 * Nothing in this function frees the list or the GStrings stored in it.
 */
static GSList *
tokenize_line (GString * line)
{
	GSList *tokens = NULL;
	const char *utf = line->str;
	/* One past the last byte of the line: the scan is bounded in bytes. */
	const char *end = line->str + line->len;
	GString *word = g_string_new (NULL);
	/*
	 * Character offset of utf from the start of the line.  It is advanced
	 * together with utf instead of being recomputed from the start of the
	 * line for every character, which made the scan quadratic.
	 */
	size_t cur_pos = 0;

	while (utf < end && *utf) {
		gunichar uc = g_utf8_get_char (utf);
		size_t start_pos;

		/* Skip characters that cannot start a word. */
		while (utf < end && *utf && !is_word_char (uc, 0)) {
			utf = g_utf8_next_char (utf);
			cur_pos++;
			/* Never decode past the end of the line's bytes. */
			uc = (utf < end) ? g_utf8_get_char (utf) : 0;
		}
		start_pos = cur_pos;

		/* Collect the characters of the word. */
		while (utf < end && *utf && is_word_char (uc, 1)) {
			g_string_append_unichar (word, uc);
			utf = g_utf8_next_char (utf);
			cur_pos++;
			uc = (utf < end) ? g_utf8_get_char (utf) : 0;
		}

		/* Do not accept one or more ' at the end of the word. */
		while (word->len > 0 && word->str[word->len - 1] == '\'') {
			g_string_truncate (word, word->len - 1);
		}

		/* Save the (word, position) pair and reuse the scratch buffer. */
		if (word->len) {
			tokens = g_slist_append (tokens, g_string_new_len (word->str, word->len));
			tokens = g_slist_append (tokens, GINT_TO_POINTER(start_pos));
			g_string_truncate (word, 0);
		}
	}
	g_string_free (word, TRUE);

	return tokens;
}
示例#2
0
// Reports whether every character of `str` satisfies is_word_char().
// An empty string has no offending character and therefore yields true.
bool is_valid_cql_id(const std::string& str) {
  const std::string::size_type length = str.size();
  std::string::size_type pos = 0;
  while (pos != length) {
    // The first character rejected by is_word_char() settles the answer.
    if (!is_word_char(str[pos])) {
      return false;
    }
    ++pos;
  }
  return true;
}
示例#3
0
/*
 * Scans `line` and invokes `callback` once for every maximal run of
 * characters accepted by is_word_char().
 *
 * line     - NUL-terminated buffer; it is modified in place: every character
 *            examined as a token candidate is replaced by its tolower() value.
 * callback - receives `opaque`, a pointer to the first character of the token
 *            inside `line`, and the token length.  The token is NOT
 *            NUL-terminated; only `length` characters belong to it.
 * opaque   - passed through to `callback` unchanged.
 *
 * When DEBUG is enabled the input line and each token are echoed to stdout.
 */
void for_each_token(char* line, void (*callback)(void* opaque, const char* word, int length), void* opaque) {
#if DEBUG
    printf("tokenizing [%s]\n", line);
    const char* i;
#endif
    char* start = line, *end = line;
    while (*end) {
        /* The <ctype.h> functions take an int that must be representable as
         * unsigned char (or EOF).  Passing a plain char with a negative value
         * (a non-ASCII byte where char is signed) is undefined behaviour, so
         * convert through unsigned char first. */
        *end = (char)tolower((unsigned char)*end);
        if (is_word_char(*end)) {
            end++;
        } else {
            /* A separator closes the token [start, end), if there is one. */
            if (start != end) {
#if DEBUG
                printf("[");
                for (i = start; i < end; i++) {
                    putchar(*i);
                }
                printf("] ");
#endif

                callback(opaque, start, end - start);
            }
            /* Skip the whole run of separators up to the next token or the
             * terminating NUL. */
            start = end;
            while (!is_word_char(*start) && *start) {
                start++;
            }
            end = start;
        }
    }

    /* The line may end in the middle of a token; report that last one too. */
    if (start != end) {
#if DEBUG
        printf("[");
        for (i = start; i < end; i++) {
            putchar(*i);
        }
        printf("] ");
#endif
        callback(opaque, start, end - start);
    }
#if DEBUG
    printf("\n");
#endif
}
/*
 * Removes all non-word characters from a given line, separating each group of
 * word characters (words) with a single space.  No leading or trailing space
 * is produced.
 * Input with no word characters is output as the empty string ("").
 *
 * Whether a character is a word character is decided by is_word_char().
 * A NUL byte found before `len` is treated like any other separator.
 * At most `len` characters of `input` are read.
 *
 * Input: char *input - the original line to be normalized
 *        size_t len - the length of the original line
 *        char **output - pointer to where the normalized line will be stored;
 *                        the buffer is obtained from ALLOC() and is not
 *                        released here (presumably the caller owns it --
 *                        confirm against ALLOC's contract)
 */
void normalize(char *input, size_t len, char **output)
{
        /*
         * The normalized text is never longer than the original, but it can
         * be exactly as long (e.g. "ab cd"), so one extra byte is needed for
         * the terminating NUL.
         */
        char *normal = ALLOC(len + 1);
        size_t pos = 0, itr = 0;

        while (itr < len) {
                /* Skip the separators in front of the next word. */
                while (itr < len && input[itr] != '\0' && !is_word_char(input[itr]))
                        itr++;

                /* A further word follows an earlier one: separate them. */
                if (itr < len && input[itr] != '\0' && pos > 0)
                        normal[pos++] = ' ';

                /* Copy the word itself. */
                while (itr < len && input[itr] != '\0' && is_word_char(input[itr]))
                        normal[pos++] = input[itr++];

                /* Step over the separator (or NUL) that ended the word. */
                itr++;
        }

        normal[pos] = '\0';
        *output = normal;
}
示例#5
0
/*
 * Finds the run of word characters (as judged by is_word_char()) that ends
 * immediately before `iter` on iter's line, and returns the text between the
 * start of that run and `iter` as produced by gtk_text_iter_get_text().
 *
 * Returns NULL when `iter` is at the very start of the buffer or when no word
 * character precedes it.
 *
 * Side effect: the buffer mark named MARK_NAME is moved to the start of the
 * word, or created there if it does not exist yet.
 *
 * `devhelp` is not used by this function.
 */
static gchar *
get_word_at_iter (GscProviderDevhelp *devhelp,
                  GtkTextIter        *iter)
{
	GtkTextIter word_start = *iter;
	gint origin_line = gtk_text_iter_get_line (iter);
	gboolean hit_buffer_start = FALSE;
	GtkTextMark *mark;

	/* Nothing lies before the iterator: there cannot be a word. */
	if (!gtk_text_iter_backward_char (&word_start))
	{
		return NULL;
	}

	/* Walk backwards while still on the same line and on word characters. */
	for (;;)
	{
		if (gtk_text_iter_get_line (&word_start) != origin_line)
		{
			break;
		}

		if (!is_word_char (gtk_text_iter_get_char (&word_start)))
		{
			break;
		}

		if (!gtk_text_iter_backward_char (&word_start))
		{
			/* The word begins at the first character of the buffer. */
			hit_buffer_start = TRUE;
			break;
		}
	}

	/*
	 * Unless the scan ran into the start of the buffer, it stopped one
	 * character too early (on a non-word character or on the previous
	 * line), so step forward onto the first character of the word.
	 */
	if (!hit_buffer_start)
	{
		gtk_text_iter_forward_char (&word_start);
	}

	/* An empty range means no word character precedes the iterator. */
	if (gtk_text_iter_equal (iter, &word_start))
	{
		return NULL;
	}

	mark = gtk_text_buffer_get_mark (gtk_text_iter_get_buffer (iter),
					 MARK_NAME);

	if (mark == NULL)
	{
		mark = gtk_text_buffer_create_mark (gtk_text_iter_get_buffer (iter),
		                                    MARK_NAME,
		                                    &word_start,
		                                    TRUE);
	}
	else
	{
		gtk_text_buffer_move_mark (gtk_text_iter_get_buffer (iter),
		                           mark,
		                           &word_start);
	}

	return gtk_text_iter_get_text (&word_start, iter);
}
示例#6
0
文件: genlex.c 项目: agaurav/QT-GRETL
/*
 * Reads one word from the parser's input, starting with the current
 * character p->ch, and classifies it.
 *
 * At most 31 characters are kept; further characters found in `wordchars`
 * are consumed and discarded.  If P_GETSTR is set in p->flags the word is
 * stored uninterpreted (p->sym = STR, p->idstr = copy of the word) and the
 * flag is cleared.  Otherwise the word is handed to one of the lookup
 * routines depending on its first character, and, unless an error was
 * flagged or the word starts with '@', word_check_next_char() is called.
 */
static void getword (parser *p)
{
    char word[32];
    int n = 0;

    /* the caller guarantees the first char is acceptable
       (it might be '$' or '@') */
    word[n++] = p->ch;
    parser_getc(p);

#ifdef USE_RLIB
    /* accept the '.' of an R.foo function namespace */
    if (word[0] == 'R' && p->ch == '.' && *p->point != '$') {
	word[n++] = p->ch;
	parser_getc(p);
    }
#endif

    /* collect word characters while there is room, keeping one byte
       for the terminator */
    for ( ; p->ch != 0 && is_word_char(p) && n < 31; n++) {
	word[n] = p->ch;
	parser_getc(p);
    }

    word[n] = '\0';

#if LDEBUG
    fprintf(stderr, "getword: word = '%s'\n", word);
#endif

    /* discard any word characters that did not fit in the buffer */
    while (p->ch != 0 && strchr(wordchars, p->ch)) {
	parser_getc(p);
    }

    if (p->flags & P_GETSTR) {
	/* an uninterpreted string was requested: store it and
	   clear the request flag */
	p->sym = STR;
	p->idstr = gretl_strdup(word);
	p->flags ^= P_GETSTR;
	return;
    }

    if (word[0] == '@') {
	/* do we actually want to do this? */
	look_up_string_variable(word, p);
    } else if (word[0] == '$') {
	if (word[1] != '\0') {
	    look_up_dollar_word(word, p);
	} else if (p->ch == '[') {
	    /* a lone '$' followed by '[' */
	    p->sym = BUNDLE;
	    p->idstr = gretl_strdup("$");
	} else {
	    look_up_word(word, p);
	}
    } else if (strcmp(word, "obs") == 0) {
	/* "obs" is looked up the same way as a '$' word */
	look_up_dollar_word(word, p);
    } else {
	look_up_word(word, p);
    }

    if (word[0] != '@' && !p->err) {
	word_check_next_char(p);
    }

#if LDEBUG
    fprintf(stderr, "getword: p->err = %d\n", p->err);
#endif
}