static t_result lex_token(const char **string_p) { t_result result; result = lex_token_impl(string_p); if (result.token || result.error) return (result); result = lex_word(string_p); if (result.token || result.error) return (result); return (RESULT_ERROR(hs("Unexpected character"), *string_p)); }
/*
 * ft_lex_main
 *
 * Tokenizes a NUL-terminated query string (TCHAR, so wide or narrow
 * characters depending on the build) into a linked list of tokens.
 *
 * sentence - text to tokenize; must not be NULL (asserted).
 * status   - in/out. Scanning continues only while *status == TMAN_OK;
 *            it is set to TMAN_ERROR whenever a lexical error is logged.
 *
 * Returns the head of the list. The list always begins with a TOK_SYSSTART
 * node and always ends with a TOK_SYSEND node, including when scanning
 * stopped early because of an error. Every node appended here has both
 * its next and prev links set.
 */
static token *ft_lex_main (TCHAR *sentence, int *status)
{
    token *first;
    token *last;
    token *node;
    /* Starts TRUE; cleared before lexing a '+'-prefixed word and handed to
     * lex_word by address. It is never set back to TRUE in this function
     * (NOTE(review): lex_word may modify it -- confirm). */
    int check_for_operator_keyword = TRUE;

    ASSERT (sentence != NULL);

    /* Sentinel node that marks the start of the list. */
    first = tman_malloc (sizeof (token));
    memset (first, '\0', sizeof (token));
    first->lexeme = tm_empty_string_new();
    first->token_id = TOK_SYSSTART;
    last = first;

    /* Produce one token per iteration and append it to the list. */
    for (;;)
    {
        /* Stop at the end of the input or after the first error. */
        if (*sentence == _TEXT('\0') || *status != TMAN_OK)
        {
            break;
        }

        /* Fresh, zeroed node with an empty lexeme for the current token. */
        node = tman_malloc (sizeof (token));
        memset (node, '\0', sizeof (token));
        node->lexeme = tm_empty_string_new();

        switch (*sentence)
        {
        case _TEXT('-'):    /* NOT operator */
            node->token_id = TOK_FT_NOT;
            tm_string_append_char (node->lexeme, *sentence);
            sentence++;
            break;

        case _TEXT('('):
            node->token_id = TOK_LPAREN;
            tm_string_append_char (node->lexeme, *sentence);
            sentence++;
            break;

        case _TEXT(')'):
            node->token_id = TOK_RPAREN;
            tm_string_append_char (node->lexeme, *sentence);
            sentence++;
            break;

        case _TEXT('"'):    /* Quoted phrase */
            node->token_id = TOK_FT_PHRASE;

            /* Copy everything between the quotes; the quotes themselves
             * are not part of the lexeme. */
            for (sentence++;
                 *sentence != _TEXT('\0') && *sentence != _TEXT('"');
                 sentence++)
            {
                tm_string_append_char (node->lexeme, *sentence);
            }

            if (*sentence != _TEXT('"'))
            {
                /* Ran into the end of the input before the closing quote. */
                logwrite ("Error: The delimiter '\"' is missing at the end of phrase in the query string.");
                *status = TMAN_ERROR;
            }
            else
            {
                sentence++;     /* Step over the closing quote. */
            }
            break;

        case _TEXT('+'):
            /* A leading '+' forces the following word to be searched even
             * if it is a stoplist word, so that word is not to be treated
             * as an operator keyword. */
            check_for_operator_keyword = FALSE;
            tm_string_append_char (node->lexeme, *sentence);
            sentence++;
            if (lex_word (&sentence, node, status, &check_for_operator_keyword) == FALSE)
            {
                *status = TMAN_ERROR;
                node->token_id = TOK_BAD;
                logwrite ("Error: Illegal use of '+'. It can only precede a word that you definitely want to consider"
                          " in a condition string (use it to force inclusion of common words).");
            }
            break;

        default:
            if (lex_word (&sentence, node, status, &check_for_operator_keyword))
            {
                /* lex_word already filled in the token; nothing to add. */
            }
            else if (*sentence == _TEXT('*'))
            {
                /* A wildcard that is not attached to a word is an error. */
                *status = TMAN_ERROR;
                logwrite ("Error: Illegal use of Wildcard character '*'. It can only appear at the end of a word.");
                node->token_id = TOK_BAD;
            }
            else
            {
                /* Everything else counts as white space. Consume the
                 * current character and all following characters that are
                 * neither alphanumeric nor in the significant set.
                 * Presumably _tcschr follows strchr semantics, so looking
                 * up the terminating '\0' yields non-NULL and the loop
                 * stops at the end of the string.
                 * NOTE(review): in a narrow-character build _istalnum
                 * presumably maps to isalnum, which expects a value in the
                 * unsigned char range -- confirm characters above 0x7F are
                 * handled as intended. */
                node->token_id = TOK_FT_WS;
                do
                {
                    sentence++;
                }
                while (_tcschr(_TEXT("-*()_\"'+"), *sentence) == NULL && !(_istalnum (*sentence)));
            }
            break;
        }

        /* Link the new node behind the current end of the list. */
        node->prev = last;
        node->next = NULL;
        last->next = node;
        last = node;
    }

    /* Sentinel node that marks the end of the list. */
    node = tman_malloc (sizeof (token));
    memset (node, '\0', sizeof (token));
    node->lexeme = tm_empty_string_new();
    node->token_id = TOK_SYSEND;

    node->prev = last;
    node->next = NULL;
    last->next = node;
    last = node;

    return (first);
}
/*
 * ft_phrase_lex
 *
 * Tokenizes the text found inside a quoted phrase into a linked list of
 * word tokens.
 *
 * sentence - NUL-terminated phrase text; must not be NULL (asserted).
 *
 * Returns the head of the list. The list always begins with a TOK_SYSSTART
 * node and ends with a TOK_SYSEND node. Scanning stops at the first lexical
 * error; the error is logged and the offending token is marked TOK_BAD, but
 * the status itself is local and is not reported to the caller.
 */
token *ft_phrase_lex (TCHAR *sentence)
{
    token *first;
    token *last;
    token *node;
    int status = TMAN_OK;

    ASSERT (sentence != NULL);

    /* Sentinel node that marks the start of the list. */
    first = tman_malloc (sizeof (token));
    memset (first, '\0', sizeof (token));
    first->lexeme = tm_empty_string_new();
    first->token_id = TOK_SYSSTART;
    last = first;

    /* Produce one token per iteration and append it to the list. */
    for (;;)
    {
        /* Operator keywords are not recognised inside a phrase, so words
         * such as and/or are lexed as ordinary words. The flag is
         * re-initialised for every token. */
        int check_for_operator_keyword = FALSE;

        /* Stop at the end of the input or after the first error. */
        if (*sentence == _TEXT('\0') || status != TMAN_OK)
        {
            break;
        }

        /* Fresh, zeroed node with an empty lexeme for the current token. */
        node = tman_malloc (sizeof (token));
        memset (node, '\0', sizeof (token));
        node->lexeme = tm_empty_string_new();

        /* Try a plain word first; only inspect the character when that
         * fails. */
        if (!lex_word (&sentence, node, &status, &check_for_operator_keyword))
        {
            switch (*sentence)
            {
            case _TEXT('+'):
                /* A leading '+' forces the following word to be searched
                 * even if it is a stoplist word. It must be followed by a
                 * word. */
                tm_string_append_char (node->lexeme, *sentence);
                sentence++;
                if (lex_word (&sentence, node, &status, &check_for_operator_keyword) == FALSE)
                {
                    status = TMAN_ERROR;
                    node->token_id = TOK_BAD;
                    logwrite ("Error: Illegal use of '+'. It can only precede a word that you definitely want to consider"
                              " in a condition string (use it to force inclusion of common words).");
                }
                break;

            case _TEXT('*'):
                /* A wildcard that is not attached to a word is an error. */
                status = TMAN_ERROR;
                logwrite ("Error: Illegal use of Wildcard character '*'. It can only appear at the end of a word.");
                node->token_id = TOK_BAD;
                break;

            default:
                /* Everything else counts as white space. Consume the
                 * current character and all following characters that are
                 * neither alphanumeric nor in the significant set.
                 * Presumably _tcschr follows strchr semantics, so looking
                 * up the terminating '\0' yields non-NULL and the loop
                 * stops at the end of the string.
                 * NOTE(review): in a narrow-character build _istalnum
                 * presumably maps to isalnum, which expects a value in the
                 * unsigned char range -- confirm characters above 0x7F are
                 * handled as intended. */
                node->token_id = TOK_FT_WS;
                do
                {
                    sentence++;
                }
                while (_tcschr(_TEXT("*_'+"), *sentence) == NULL && !(_istalnum (*sentence)));
                break;
            }
        }

        /* Link the new node behind the current end of the list. */
        node->prev = last;
        node->next = NULL;
        last->next = node;
        last = node;
    }

    /* Sentinel node that marks the end of the list. */
    node = tman_malloc (sizeof (token));
    memset (node, '\0', sizeof (token));
    node->lexeme = tm_empty_string_new();
    node->token_id = TOK_SYSEND;

    node->prev = last;
    node->next = NULL;
    last->next = node;
    last = node;

    return (first);
}