Example #1
0
static t_result         lex_token(const char **string_p)
{
  t_result              result;

  result = lex_token_impl(string_p);
  if (result.token || result.error)
    return (result);
  result = lex_word(string_p);
  if (result.token || result.error)
    return (result);
  return (RESULT_ERROR(hs("Unexpected character"), *string_p));
}
Example #2
0
/* ft_lex_main: tokenize a free-text query string into a list of tokens.
 *
 * sentence - NUL-terminated query text (TCHAR based); must not be NULL.
 * status   - in/out status flag. Scanning only proceeds while *status == TMAN_OK.
 *            It is set to TMAN_ERROR here when a phrase has no closing '"', when a
 *            '+' is not followed by a word, or when a '*' is found where no word
 *            could be lexed. lex_word also receives this pointer.
 *
 * Returns the head of the token list. Nodes are linked through both next and prev.
 * The list always begins with a TOK_SYSSTART node and ends with a TOK_SYSEND node;
 * the token that was being built when an error was detected is still appended
 * before the TOK_SYSEND node.
 */
static token *ft_lex_main (TCHAR *sentence, int *status)
{
	token *first;
	token *last;
	token *node;
	int check_for_operator_keyword = TRUE;

	ASSERT (sentence != NULL);

	/* Leading sentinel node; the list grows from here. */
	first = tman_malloc (sizeof (token));
	memset (first, '\0', sizeof (token));
	first->lexeme = tm_empty_string_new();
	first->token_id = TOK_SYSSTART;

	last = first;

	/* One token is produced per iteration and appended behind 'last'. */
	while (*sentence != _TEXT('\0') && *status == TMAN_OK)
	{
		TCHAR ch = *sentence;

		/* Fresh, zeroed node with an empty lexeme for the current token. */
		node = tman_malloc (sizeof (token));
		memset (node, '\0', sizeof (token));
		node->lexeme = tm_empty_string_new();

		if (ch == _TEXT('-') || ch == _TEXT('(') || ch == _TEXT(')'))
		{
			/* Single-character tokens: the NOT operator and the two parentheses. */
			if (ch == _TEXT('-'))
			{
				node->token_id = TOK_FT_NOT;
			}
			else if (ch == _TEXT('('))
			{
				node->token_id = TOK_LPAREN;
			}
			else
			{
				node->token_id = TOK_RPAREN;
			}
			tm_string_append_char (node->lexeme, ch);
			sentence++;
		}
		else if (ch == _TEXT('"'))
		{
			/* A phrase: everything up to the closing quote becomes the lexeme. */
			node->token_id = TOK_FT_PHRASE;
			for (sentence++; *sentence != _TEXT('\0') && *sentence != _TEXT('"'); sentence++)
			{
				tm_string_append_char (node->lexeme, *sentence);
			}

			if (*sentence != _TEXT('"'))
			{
				/* Ran into the end of the string before the closing quote. */
				logwrite ("Error: The delimiter '\"' is missing at the end of phrase in the query string.");
				*status = TMAN_ERROR;
			}
			else
			{
				sentence++;
			}
		}
		else if (ch == _TEXT('+'))
		{
			/* A '+' prefix forces the following word to be searched. The keyword flag is
			 * cleared before lex_word runs so the word is not taken for an operator.
			 */
			check_for_operator_keyword = FALSE;
			tm_string_append_char (node->lexeme, ch);
			sentence++;
			if (lex_word (&sentence, node, status, &check_for_operator_keyword) == FALSE)
			{
				*status = TMAN_ERROR;
				node->token_id = TOK_BAD;
				logwrite ("Error: Illegal use of '+'. It can only precede a word that you definitely want to consider"
							" in a condition string (use it to force inclusion of common words).");
			}
		}
		else if (!lex_word (&sentence, node, status, &check_for_operator_keyword))
		{
			/* lex_word did not produce a word; sentence is re-read because lex_word was
			 * handed its address.
			 */
			if (*sentence == _TEXT('*'))
			{
				/* A wildcard may not stand on its own. */
				*status = TMAN_ERROR;
				logwrite ("Error: Illegal use of Wildcard character '*'. It can only appear at the end of a word.");
				node->token_id = TOK_BAD;
			}
			else
			{
				/* Everything else counts as white space: consume this character and any
				 * following ones that are neither alphanumeric nor in the stop set.
				 */
				node->token_id = TOK_FT_WS;
				do
				{
					sentence++;
				}
				while (_tcschr(_TEXT("-*()_\"'+"), *sentence) == NULL && !(_istalnum (*sentence)));
			}
		}
		/* Otherwise lex_word succeeded and has already lexed the word into node. */

		/* Append node at the end of the list. */
		node->prev = last;
		node->next = NULL;
		last->next = node;
		last = node;
	}

	/* Trailing sentinel node. */
	node = tman_malloc (sizeof (token));
	memset (node, '\0', sizeof (token));
	node->lexeme = tm_empty_string_new();
	node->token_id = TOK_SYSEND;

	node->prev = last;
	node->next = NULL;
	last->next = node;

	return (first);
}
Example #3
0
/* ft_phrase_lex: tokenize the text found inside a phrase into a list of tokens.
 *
 * sentence - NUL-terminated phrase text (TCHAR based); must not be NULL.
 *
 * Returns the head of the token list. Nodes are linked through both next and prev;
 * the list always begins with a TOK_SYSSTART node and ends with a TOK_SYSEND node.
 *
 * Operator keyword checking is switched off for every word, so and/or inside a
 * phrase are lexed as ordinary words. The status flag is local to this function:
 * a failure stops the scan and is written to the log, but is not returned to the
 * caller.
 */
token *ft_phrase_lex (TCHAR *sentence)
{
	token *first;
	token *last;
	token *node;
	int status = TMAN_OK;

	ASSERT (sentence != NULL);

	/* Leading sentinel node; the list grows from here. */
	first = tman_malloc (sizeof (token));
	memset (first, '\0', sizeof (token));
	first->lexeme = tm_empty_string_new();
	first->token_id = TOK_SYSSTART;

	last = first;

	/* One token is produced per iteration and appended behind 'last'. */
	while (*sentence != _TEXT('\0') && status == TMAN_OK)
	{
		/* Re-armed to FALSE on every iteration; lex_word receives its address. */
		int check_for_operator_keyword = FALSE;

		/* Fresh, zeroed node with an empty lexeme for the current token. */
		node = tman_malloc (sizeof (token));
		memset (node, '\0', sizeof (token));
		node->lexeme = tm_empty_string_new();

		if (!lex_word (&sentence, node, &status, &check_for_operator_keyword))
		{
			/* No word at this position: look at the character itself. */
			if (*sentence == _TEXT('+'))
			{
				/* A '+' prefix forces the following word to be searched. */
				tm_string_append_char (node->lexeme, *sentence);
				sentence++;
				if (lex_word (&sentence, node, &status, &check_for_operator_keyword) == FALSE)
				{
					status = TMAN_ERROR;
					node->token_id = TOK_BAD;
					logwrite ("Error: Illegal use of '+'. It can only precede a word that you definitely want to consider"
								" in a condition string (use it to force inclusion of common words).");
				}
			}
			else if (*sentence == _TEXT('*'))
			{
				/* A wildcard may not stand on its own. */
				status = TMAN_ERROR;
				logwrite ("Error: Illegal use of Wildcard character '*'. It can only appear at the end of a word.");
				node->token_id = TOK_BAD;
			}
			else
			{
				/* Everything else counts as white space: consume this character and any
				 * following ones that are neither alphanumeric nor in the stop set.
				 */
				node->token_id = TOK_FT_WS;
				do
				{
					sentence++;
				}
				while (_tcschr(_TEXT("*_'+"), *sentence) == NULL && !(_istalnum (*sentence)));
			}
		}
		/* Otherwise lex_word succeeded and has already lexed the word into node. */

		/* Append node at the end of the list. */
		node->prev = last;
		node->next = NULL;
		last->next = node;
		last = node;
	}

	/* Trailing sentinel node. */
	node = tman_malloc (sizeof (token));
	memset (node, '\0', sizeof (token));
	node->lexeme = tm_empty_string_new();
	node->token_id = TOK_SYSEND;

	node->prev = last;
	node->next = NULL;
	last->next = node;

	return (first);
}