Exemplo n.º 1
0
// Remove all the stop words not preceded by the inclusion operator.
//
// Entry point for stop-word removal on a free-text syntax tree:
//   - root is a lone WORD that is a stop word: a note is logged, the word
//     node is deleted and *free_text_syntree is set to NULL;
//   - root is a lone WORD that is not a stop word, or a lone PHRASE: the
//     tree is left untouched;
//   - anything else: the work is delegated to remove_stop_words_helper().
void remove_stop_words(syntree **free_text_syntree)
{
	word_syntree *root;

	ASSERT(free_text_syntree);
	ASSERT(*free_text_syntree);

	// The root is inspected through the word_syntree view of the node.
	root = (word_syntree *)(*free_text_syntree);

	if (root->type == FT_WORD_TYPE)
	{
		// A single ordinary word needs no processing.
		if (!is_stop_word(get_chars(root->word)))
		{
			return;
		}

		// A single stop word: report it and leave the caller with an empty tree.
		logwrite("Note: The following word is very common and was not included in your search: %s.",
			      get_chars(root->word));
		ft_word_delete(root);
		*free_text_syntree = NULL;
		return;
	}

	// A phrase on its own is kept as is.
	if (root->type == FT_PHRASE_TYPE)
	{
		return;
	}

	// Operator at the root: walk the tree.
	remove_stop_words_helper(free_text_syntree);
}
Exemplo n.º 2
0
// Decide whether a word should be kept.
//
// Returns false when the word is shorter than MIN_WORD_LENGTH or longer
// than MAX_WORD_LENGTH, is a number, consists of one repeated character,
// is a stop word, or starts with "0x"; returns true otherwise.
bool
is_word_valid( string word )
{
	const string::size_type len = word.length();

	// skip too small and too large words
	if ( len < MIN_WORD_LENGTH || len > MAX_WORD_LENGTH )
		return false;

	// skip numbers
	if ( is_number(word) )
		return false;

	// skip if the entire word consists of a single char:
	// advance while characters match the first one; reaching the end
	// means no differing character exists (this also rejects words of
	// fewer than two characters)
	string::size_type pos = 1;
	while ( pos < len && word[pos] == word[0] )
		++pos;
	if ( pos >= len )
		return false;

	// skip stop words
	if ( is_stop_word(word) )
		return false;

	// skip 0x... (the word has at least two characters at this point)
	return !( word[0] == '0' && word[1] == 'x' );
}
Exemplo n.º 3
0
// Checks is_stop_word() against a handful of fixed inputs.
void test_stop_words()
{
    // Inputs expected to be reported as non-stop-words (result 0):
    // an unknown token and a string made of two stop words joined by a space.
    assert(0 == is_stop_word("non_stop"));
    assert(0 == is_stop_word("the der"));

    // Inputs expected to be reported as stop words (result 1).
    assert(1 == is_stop_word("the"));
    assert(1 == is_stop_word("der"));
    assert(1 == is_stop_word("на"));

    // The empty string is expected to be reported as a stop word as well.
    assert(1 == is_stop_word(""));
}
Exemplo n.º 4
0
// Helper function to remove stop words not preceded by the inclusion operator.
//
// Processes the tree bottom-up: both subtrees are handled first (WORD,
// PHRASE and NOT children are not descended into), then, if the current
// node is an AND/OR operator, it is simplified:
//   - an operator with a missing operand is replaced by its remaining
//     operand, or by NULL when both operands are missing;
//   - a WORD operand that is a stop word is logged and deleted, and the
//     operator is replaced by the other operand, or by NULL when both
//     operands were stop words.
//
// free_text_tree: address of the pointer to the subtree root; the pointer
//                 is rewritten in place and may become NULL.
//
// NOTE(review): syntree_delete_root_node() is presumably expected to release
// only the operator node itself, because its children stay in use after the
// call -- confirm against its definition.
void remove_stop_words_helper(syntree **free_text_tree)
{
	syntree *operator_node = NULL;
	syntree *left_operand = NULL;
	syntree *right_operand = NULL;

	int left_is_stop_word = 0;
	int right_is_stop_word = 0;

	// Nothing to do for an empty tree.
	if (free_text_tree == NULL || *free_text_tree == NULL)
	{
		return;
	}

	// We don't want to look at/below the WORD nodes or PHRASE nodes.
	// Because the tree below them is inconsequential.
	if ((*free_text_tree)->left_tree && ((*free_text_tree)->left_tree->type != FT_WORD_TYPE &&
		                                 (*free_text_tree)->left_tree->type != FT_PHRASE_TYPE &&
										 (*free_text_tree)->left_tree->type != FT_NOT_TYPE))
	{
		remove_stop_words_helper(&((*free_text_tree)->left_tree));
	}
	if ((*free_text_tree)->right_tree && ((*free_text_tree)->right_tree->type != FT_WORD_TYPE &&
		                                  (*free_text_tree)->right_tree->type != FT_PHRASE_TYPE &&
										  (*free_text_tree)->right_tree->type != FT_NOT_TYPE))
	{
		remove_stop_words_helper(&((*free_text_tree)->right_tree));
	}

	// Only AND/OR operator nodes are simplified.
	operator_node = *free_text_tree;
	if (operator_node->type != FT_AND_TYPE && operator_node->type != FT_OR_TYPE)
	{
		return;
	}

	// Read the children only now: the recursive calls above may have
	// replaced them or set them to NULL.
	left_operand = operator_node->left_tree;
	right_operand = operator_node->right_tree;

	// An operator with a missing operand is replaced by the operand that is
	// left. When BOTH operands are missing (both subtrees were made up of
	// stop words only) the operator collapses to NULL; without this case the
	// type checks below would dereference a NULL child.
	if (left_operand == NULL || right_operand == NULL)
	{
		*free_text_tree = (left_operand != NULL) ? left_operand : right_operand;

		syntree_delete_root_node(operator_node);
		return;
	}

	// Look at the left tree for a stop word
	if (left_operand->type == FT_WORD_TYPE &&
		is_stop_word(get_chars(((word_syntree *)left_operand)->word)))
	{
		left_is_stop_word = 1;
	}

	// Look at the right tree for a stop word
	if (right_operand->type == FT_WORD_TYPE &&
		is_stop_word(get_chars(((word_syntree *)right_operand)->word)))
	{
		right_is_stop_word = 1;
	}

	if (left_is_stop_word && right_is_stop_word)
	{
		// Case 1 : Both right and left trees have stop words
		// Delete the left and right stop word nodes along with the current
		// operator node and make it null.
		*free_text_tree = NULL;

		logwrite ("Note: The following words are very common and were not included in your search: %s %s.",
		     	   get_chars (((word_syntree *)left_operand)->word),
				   get_chars (((word_syntree *)right_operand)->word));

		ft_word_delete((word_syntree *)left_operand);
		ft_word_delete((word_syntree *)right_operand);
		syntree_delete_root_node(operator_node);
	}
	else if (left_is_stop_word)
	{
		// Case 2 : Left tree is a stop word
		// Delete the left node, the operator node and make the current node
		// point to the right tree.
		*free_text_tree = right_operand;

		logwrite("Note: The following word is very common and was not included in your search: %s.",
			      get_chars (((word_syntree *)left_operand)->word));

		ft_word_delete((word_syntree *)left_operand);
		syntree_delete_root_node(operator_node);
	}
	else if (right_is_stop_word)
	{
		// Case 3 : Right tree is a stop word
		// Delete the right node, the operator node and make the current node
		// point to the left tree.
		*free_text_tree = left_operand;

		logwrite("Note: The following word is very common and was not included in your search: %s.",
			      get_chars (((word_syntree *)right_operand)->word));

		ft_word_delete((word_syntree *)right_operand);
		syntree_delete_root_node(operator_node);
	}
}