C++ (Cpp) NormalizeWord 예제들

예제 #1

0

파일 보기

파일: sortlist.c 프로젝트: patxu/cs50-Software-Design-and-Implementation

// Walks the tokens of a search query and records a score for every document
// that contains each search term, then sorts and prints the collected results.
//
// `searchterms` is the first token; the following tokens are fetched with
// strtok(NULL, " "), so a strtok() scan of the query is presumably already in
// progress when this function is called (verify against the caller).
//
// For each token accepted by isSearchTerm():
//    normalize it in place
//    for every document getDoc() reports for the term
//        if isNotOR(previous token) is TRUE, add score*WEIGHT with the TRUE flag
//        otherwise add the plain score with the FALSE flag
// Scores accumulate in the file-level `querylist`; when it is non-empty a new
// SORTLIST is stored in the file-level `slist`, filled by sortList() and printed.
void processSearchTerms(INVERTED_INDEX* index, char* searchterms) {
    char* previous = NULL;   // token handled on the prior iteration, NULL at first
    char* term;
    DOCNODE* doc;
    int cursor;              // iteration state handed to getDoc()

    for (term = searchterms; term != NULL; term = strtok(NULL, " ")) {
        if (isSearchTerm(term) == TRUE) {
            NormalizeWord(term);
            cursor = 0;
            while ((doc = getDoc(index, term, &cursor)) != NULL) {
                int id = doc->doc_id;
                int freq = doc->page_word_freq;

                if (isNotOR(previous) != TRUE) {
                    // previous token was OR: plain score
                    addScoreToList(querylist, FALSE, id, freq);
                }
                else {
                    // anything else counts as AND: weighted score
                    addScoreToList(querylist, TRUE, id, (freq*WEIGHT));
                }
            }
        }
        previous = term;
    }

    // nothing matched, nothing to sort or print
    if (querylist->start == NULL)
        return;

    slist = NEW(SORTLIST);
    MALLOC_CHECK(slist);
    BZERO(slist, sizeof(SORTLIST));
    sortList(slist, querylist);
    printList(slist);
}

예제 #2

0

파일 보기

파일: normalizer.c 프로젝트: eaudeweb/naaya

/*
 * Python-callable method: normalize(data).
 *
 * data may be
 *   - a list: every element is passed through NormalizeWord() and a new list
 *     holding the results is returned;
 *   - a unicode or str object: the result of NormalizeWord() is returned.
 * Any other type raises TypeError.
 *
 * Returns a new reference, or NULL with a Python exception set.
 *
 * Reference handling: NormalizeWord()'s result is handed straight back to
 * Python in the scalar branch, so it is treated as a new (owned) reference.
 * PyList_Append() takes its own reference, hence the Py_DECREF after a
 * successful append.
 */
static PyObject *normalize(Normalizer *self, PyObject *args)
{
    PyObject *data = NULL;

    if (!PyArg_ParseTuple(args, "O", &data))
        return NULL;

    if (PyList_Check(data)) {
        PyObject *seq;
        PyObject *list;
        Py_ssize_t j;

        /* New reference; must be released on every exit path. */
        seq = PySequence_Fast(data, "object must be sequence");
        if (seq == NULL)
            return NULL;

        list = PyList_New(0);
        if (list == NULL) {
            Py_DECREF(seq);
            return NULL;
        }

        /* The size is re-read each pass in case the list changes meanwhile. */
        for (j = 0; j < PySequence_Fast_GET_SIZE(seq); j++) {
            PyObject *item = PySequence_Fast_GET_ITEM(seq, j); /* borrowed */
            PyObject *word = NormalizeWord(self, item);

            if (word == NULL || PyList_Append(list, word) < 0) {
                Py_XDECREF(word);
                Py_DECREF(list);
                Py_DECREF(seq);
                return NULL;
            }
            Py_DECREF(word);
        }

        Py_DECREF(seq);
        return list;
    }

    if (PyUnicode_Check(data) || PyString_Check(data)) {
        PyObject *word = NormalizeWord(self, data);

        /* NULL propagates the error set by NormalizeWord(). */
        return (PyObject *) word;
    }

    PyErr_SetString(PyExc_TypeError,"argument must be unicode or string");
    return NULL;
}

예제 #3

0

파일 보기

파일: indexer.c 프로젝트: somebodyschelsea/search-engine

// updateIndex records one occurrence of `word` in document `document_id`.
//
// The word is normalized in place (the caller's buffer is modified), then
// offered to the index together with a freshly allocated DocumentNode whose
// frequency is 1. When addData() reports that the word already exists, the
// word's document list is searched:
//   - document already listed -> its frequency is incremented and the new
//     node is freed;
//   - document not listed     -> the new node is appended to the list.
//
// Returns 0 on success, 1 if the word is reported as existing but cannot be
// looked up again (the unused node is freed in that case).
int updateIndex(char* word, int document_id, INVERTED_INDEX* in_index)
{
	DocumentNode* docnode;
	WordNode* wordnode;
	DocumentNode* current_doc_node;

// creates a DocumentNode from the doc_id
	docnode = malloc(sizeof(DocumentNode));
	MALLOC_CHECK(docnode);
	docnode->document_id = document_id;
	docnode->page_word_frequency = 1;
	docnode->next = NULL;

// makes it lower case (necessary for the query system)
	NormalizeWord(word);

// a zero result means the word was not in the index yet; addData() was given
// docnode, so it is presumably stored there now (verify against addData)
	if(!addData(in_index, docnode, word))
		return 0;

	wordnode = getData(in_index, word);
	if(wordnode == NULL)
	{
		// inconsistent index: do not leak the node we could not attach
		free(docnode);
		return 1;
	}

	current_doc_node = wordnode->data;
	if(current_doc_node == NULL)
	{
		// word exists but has no documents yet: the new node becomes the list
		wordnode->data = docnode;
		return 0;
	}

// walk to either the matching document or the tail of the list
	for(;;)
	{
		if(current_doc_node->document_id == document_id)
		{
			current_doc_node->page_word_frequency = (current_doc_node->page_word_frequency)+1;
			free(docnode);
			return 0;
		}

		if(current_doc_node->next == NULL)
			break;

		current_doc_node = current_doc_node->next;
	}

	current_doc_node->next = docnode;
	return 0;
}

예제 #4

0

파일 보기

파일: dict_ispell.c 프로젝트: sunyangkobe/cscd43

/*
 * spell_lexize(dict, text, length)
 *
 * Argument 0 is the DictISpell state, argument 1 the input characters and
 * argument 2 their length. Returns NULL for zero-length input or when
 * NormalizeWord() yields nothing. Otherwise returns the NULL-terminated
 * array produced by NormalizeWord(), compacted in place so that entries
 * found in the dictionary's stop list are freed and removed.
 */
Datum
spell_lexize(PG_FUNCTION_ARGS)
{
	DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0);
	char	   *in = (char *) PG_GETARG_POINTER(1);
	int			len = PG_GETARG_INT32(2);
	char	   *txt;
	char	  **res;
	char	  **ptr,
			  **cptr;

	if (!len)
		PG_RETURN_POINTER(NULL);

	/*
	 * NormalizeWord() supplies the result array itself, so nothing is
	 * pre-allocated here (an earlier palloc was overwritten and lost).
	 */
	txt = pnstrdup(in, len);
	res = NormalizeWord(&(d->obj), txt);
	pfree(txt);

	if (res == NULL)
		PG_RETURN_POINTER(NULL);

	/* cptr trails ptr and marks where the next kept entry goes */
	ptr = cptr = res;
	while (*ptr)
	{
		if (searchstoplist(&(d->stoplist), *ptr))
		{
			pfree(*ptr);
			*ptr = NULL;
			ptr++;
		}
		else
		{
			*cptr = *ptr;
			cptr++;
			ptr++;
		}
	}
	*cptr = NULL;

	PG_RETURN_POINTER(res);
}

예제 #5

0

파일 보기

파일: query.c 프로젝트: MisaZhu/tinysearchengine-1

// Returns 1 when `word` is the operator `op` itself, i.e. `op` followed by the
// end of the string or by whitespace; returns 0 otherwise. A plain prefix
// test is not enough: "ORANGE" starts with "OR" but is an ordinary word.
static int isReservedWord(const char *word, const char *op) {
  size_t n = strlen(op);

  if (strncmp(word, op, n) != 0)
    return 0;

  char next = word[n];
  return next == '\0' || next == ' ' || next == '\t' ||
         next == '\n' || next == '\r';
}

// For a single word, returns the start of the result DocNode list.
// The list is a copy built with initDocNode()/updateDocNode() from the
// documents stored under the word in `index`.
//
// `word` is normalized in place. Returns NULL when the word is the reserved
// operator AND or OR (a message is printed) or when the word is not found.
DocNode *getResultsForWord(char *word, INVERTED_INDEX *index) {
  // check for reserved words first
  if (isReservedWord(word, "AND") || isReservedWord(word, "OR")) {
    printf("AND and OR are reserved words. Please enter a different query.\n");
    return NULL;
  }

  NormalizeWord(word);

  int h = makeHash(word);
  int word_not_found = 1;  // 1 for true, 0 for false; updated by collisionAction
  WordNode *cluster_end = NULL;

  // collisionAction is like getDatawith key for Dictionary..
  cluster_end = collisionAction(index, h, word, &word_not_found);

  if (word_not_found)
    return NULL;

  DocNode *d = cluster_end->data; // first document stored for the word
  DocNode *dcopy = initDocNode(d->doc_id, d->page_word_frequency);

  // add the remaining documents, if any, to the copy
  for (d = d->next; d != NULL; d = d->next)
    updateDocNode(dcopy, d->doc_id, d->page_word_frequency);

  return dcopy;
}

예제 #6

0

파일 보기

파일: index_func.c 프로젝트: chickonice/Search_Engine

/*  -----------------------------------------------------------------------
    Function Name: initial_index()
    Functionality: For doc_id = 1 .. num_files-1 (num_files as reported by
    GetFilenamesInDir for argv[1]) the file "./<argv[1]>/<doc_id>" is parsed
    with file_parsing(), and every word returned by GetNextWord() is
    normalized and inserted into the Hashtable Index under that doc_id.
    Paths that IsFile() rejects are skipped.
    Input <--- argv (argv[1] is the directory holding the crawler data) and
        the Hashtable Index
    Outputs ---> void (updated index)
    Note: the path is sized with snprintf(NULL, 0, ...) so no floating point
    digit counting is needed, and the allocation is checked before use.
    ----------------------------------------------------------------------- */
void initial_index(char *argv[], HashTable *Index)
{
    char **results_filenames = NULL;

    int num_files = GetFilenamesInDir(argv[1], &results_filenames);

    for (int doc_id = 1; doc_id < num_files; doc_id++) {

        /* length of "./<dir>/<doc_id>" without the terminator */
        int path_len = snprintf(NULL, 0, "./%s/%d", argv[1], doc_id);
        if (path_len < 0)
            continue;

        char *file_path = calloc(1, (size_t)path_len + 1);
        if (file_path == NULL) {
            fprintf(stderr, "initial_index: out of memory building path for document %d\n", doc_id);
            break;
        }
        snprintf(file_path, (size_t)path_len + 1, "./%s/%d", argv[1], doc_id);

        if (IsFile(file_path)) {

            /* file_parsing drops the first two lines so only the html is left */
            char *content = file_parsing(file_path);

            /* normalize each word and update the index hashtable with it */
            if (content != NULL) {
                int pos = 0;
                char *word;
                while ((pos = GetNextWord(content, pos, &word)) > 0) {
                    NormalizeWord(word);
                    insert_to_index(doc_id, word, Index);
                }
            }
            free(content);
        }
        free(file_path);
    }
    free(results_filenames);
}

예제 #7

0

파일 보기

파일: query.c 프로젝트: pratapl/COSC50

int main(int argc, char* argv[])
{
	
    // Program parameter processing
	if(argc != 3){
		printf("Error: Incorrect usage\n");
		printf("Query Usage: ./query [indexed data(eg. indexer.dat)] [html data(eg. data)]\n");
		return 1;
	}

	//Get the supplied directory name.
    	int dirSize = strlen(argv[2]);
    	char htmlDirectory[dirSize + 1];
    	htmlDirectory[0] = '\0';
    	strcat(htmlDirectory, argv[2]);

	//Get the fileName.
	int fileSize = strlen(argv[1]);
	char indexedFile[fileSize + 1];
	indexedFile[0] = '\0';
	strcat(indexedFile, argv[1]);
	
    if(IsFile(indexedFile) == 0){
        printf("Incorrect path for indexed file\n");
        return 1;
    }

   	//Check if the path provided is a valid directory.
	if(IsDir(htmlDirectory) == 0){
		printf("Incorrect path for html directory\n");
        return 1;
	}
        
DocumentNode *final = NULL;
DocumentNode* orList[MAX_INPUT]; //OR's
HashTable *tempHashTable = initHashTable(); 

// recreate the inverted index
tempHashTable = ReadFile(indexedFile); 
int revert = SaveIndexToFile(tempHashTable,indexedFile);

if (revert == 0)
    printf("0 Returned from inverting\n");

printf("Satrting query..\n");

//Queries
char inp[MAX_INPUT];
char buff[MAX_INPUT];
int orFlag;
int orIndex;

//loop until user exits
printf("Query:>");

LABEL:while ((fgets(inp,MAX_INPUT,stdin)))
{ 
  
  printf("Query:>");
 
  for (int index = 0; index < MAX_INPUT; index++){
    orList[index] = NULL;//init list elements to null
  }
  
  orFlag = 999;
  orIndex = 0;
  
  // if its a blank enter
  if (strcmp(inp, "\n") == 0){
    fprintf(stderr, "You entered a blank line. Please enter query words!\n");
    continue;
  }  
 
  // remove trailing newline
  char *pos;
  if ((pos=strchr(inp, '\n')) != NULL){
    *pos = '\0';
  }
  
  // check for the last word
  strcpy(buff, inp);
  char *isLast;
  char *lastWord;
  isLast = strtok(buff, " ");
  
  // find the last word
  while (isLast != NULL){
    lastWord = isLast;
    isLast = strtok(NULL, " ");
    
    // check for AND OR and OR AND consecutively
    if (isLast != NULL)
    {
      if ((strcmp(lastWord, "AND") == 0) || (strcmp(lastWord, "OR") == 0))
      {
        if ((strcmp(isLast, "OR") == 0) || (strcmp(isLast, "AND") == 0))
        {
          fprintf(stderr, "Two consecutive query words is invalid. Please try again.\n");
          goto LABEL;
        }
      }
    }
  }
  
  
  if ((strcmp(lastWord, "AND") == 0) || (strcmp(lastWord, "OR") == 0))
  {
    fprintf(stderr, "Last word in query is invalid: %s\n", lastWord);
    continue;
  }

  
  
  char *words;
  words = strtok(inp, " "); //break input on spaces

  //first word validity
  if ((strcmp(words, "AND") == 0) || (strcmp(words, "OR") == 0))
  {
    fprintf(stderr, "First word in query is invalid: %s\n", words);
    continue;
  }
  NormalizeWord(words);//normalize the first valid word
  final = getDocumentList(words, tempHashTable, final);//init doc list
  
  
  words = strtok(NULL, " ");
  // return the list for a one word query(next = null)
  if (words == NULL)
  { 
    final = querySort(final);//recursive sort
    printResult(final, htmlDirectory);//display
           
    freeDocumentList(final);
    final = NULL;

예제 #8

0

파일 보기

파일: query.c 프로젝트: cjunmokim/Search-Engine

// Evaluates one query line against the inverted index.
//
// Words are read from `line` one at a time. Consecutive words form a block
// that is intersected (And()); the operator2 word ("OR" operator) closes the
// block, flushing the file-level temp_list into final_list through Or().
// The operator1 word ("AND" operator) is skipped. Non-operator words are
// normalized before lookup.
//
// Returns 1 on success and 0 when the query starts a block with an operator,
// ends with an operator, or memory cannot be allocated.
// temp_list and final_list are reset to NULL on entry.
int GetLinks(char *line, HashTable *Index) {
	
	// Declare variables.
	char *buf;
	char word[MAX];
	char fmt[32]; // bounded "%<width>s" conversion for reading into word.
	int flag; // flag to do union or intersection operations.
	int count; // variable to count the position of a word in the line.
	
	// Initialize variables.
	buf = line;
	flag = 1;
	count = 0;
	temp_list = NULL;
	final_list = NULL;
	word[0] = '\0'; // the check after the loop reads word even if nothing was scanned.
	
	// Never let sscanf write past the end of word.
	snprintf(fmt, sizeof fmt, "%%%zus", sizeof word - 1);
	
	// Loop through the line and do the appropriate operations.
	while (sscanf(buf, fmt, word) == 1) {
		
		count++;
		
		// If word is AND, then ignore and read in new word.
		if (strcmp(word, operator1) == 0) {
			if (count == 1) { // If there is no previous word, then throw an error.
				return 0;
			}
			
			// Increment position in line.
			buf = strstr(buf, word) + strlen(word);
			continue;
		}
		
		// If word is OR, then tell the program to do OR operation.
		if (strcmp(word, operator2) == 0) {
			flag = 2; // Set flag to union operation.
			if (count == 1) { // If there is no previous word, then throw an error.
				return 0;
			}
		}
		
		// Hold onto original copy of word in case NormalizeWord() changes its content.
		char *word_old = (char *)calloc(1, strlen(word) + 1);
		if (word_old == NULL) {
			return 0;
		}
		strcpy(word_old, word);
		
		// Change word to lowercase.
		if (strcmp(word, operator1) != 0 && strcmp(word, operator2) != 0) {
			NormalizeWord(word); // Normalize if word is not an operator.
		}
		
		// Add list of docs to temp_list.
		// Case when it is the first word of the block.
		if (count == 1) {
			
			// Declare variables.
			WordNode *current; // variable for traversal.
			DocumentNode *ptr, *ptr2; // variables for traversal.
			int num;
			
			// Case when the word is in the InvertedIndex.
			if ((num = InHashTable(word, Index))) {
				unsigned long index = JenkinsHash(word, MAX_HASH_SLOT); // Get the hash code.
				current = Index->table[index]->data;
				
				// Loop until we get the matching WordNode.
				for (int i=1; i < num; i++) {
					current = current->next;
				}
				
				// Start appending at the tail of temp_list (NULL when it is empty).
				ptr2 = temp_list;
				while (ptr2 != NULL && ptr2->next != NULL) {
					ptr2 = ptr2->next;
				}
				
				// Loop through each DocumentNode and add to temp_list.
				for (ptr = current->page; ptr != NULL; ptr = ptr->next) {
					
					// Declare and initialize a DocumentNode with the same values as ptr.
					DocumentNode *dn;
					dn = (DocumentNode *)calloc(1, sizeof(DocumentNode));
					if (dn == NULL) {
						free(word_old);
						return 0;
					}
					dn->doc_id = ptr->doc_id;
					dn->freq = ptr->freq;
					
					// Add the new DocumentNode to temp_list.
					if (ptr2 == NULL) { // Case when temp_list is empty.
						temp_list = dn;
					}
					else { // Case when temp_list is nonempty.
						ptr2->next = dn;
					}
					ptr2 = dn;
				}
			}
		}
		else { // If not first word of the block, then do the operation.
		
			// Check if the current operation is "AND".
			if (flag == 1) {
				And(word, Index);
			}
			
			// Check if the current operation is "OR".
			if (flag == 2) {
				if (temp_list != NULL) {
					Or();
				}
				flag = 1; // Set flag back to "AND" operation.
				count = 0; // Set word count to 0 to signal the start of a new block of words.
			}
		}
		
		// Increment position in the query line to read in next word.
		buf = strstr(buf, word_old) + strlen(word_old);
		
		free(word_old); // Cleanup.
	}
	
	// If the last word of the query line is an operator, throw an error.
	if (strcmp(word, operator1) == 0 || strcmp(word, operator2) == 0) {
		return 0;
	
	}
	
	// If nonempty, flush out temp_list to final_list.
	if (temp_list != NULL) { 
		Or();
	}
	
	return 1; // Return 1 if successful.
}

예제 #9

0

파일 보기

파일: parse.c 프로젝트: joonhyongcho/TinySearchEngine

List *ParseWordsToList(char *parse_string, HashTable *word_index) {
 	// make sure beginning of string is a word
 	while (!isalpha(parse_string[0])) {
		int delete_index = 0; 
		memmove(&parse_string[delete_index], &parse_string[delete_index + 1], strlen(parse_string) - delete_index);
 	}


 	// Get the first word and keep a pointer to the original string 
 	char *copy_parse_string = strdup(parse_string);
 	char *parse_string_pointer = copy_parse_string;
 	char *word = strsep(&copy_parse_string, " ");

 	// Check that the first word is not OR or AND

 	// Make list node that holds the entire group of nodes connected by AND
 	ListNode *AND_group = calloc(1, sizeof(List));
 	AND_group->object = NULL;
 	AND_group->prev = NULL;
 	AND_group->next = NULL;

 	// Allocate list of all groups and set variables to AND_group	
 	List *all_groups = calloc(1, sizeof(List));
 	all_groups->current = AND_group;
 	all_groups->head = AND_group;
 	all_groups->tail = AND_group;

 	// Until end of query
 	while (word != NULL) {
 		// Disregard word if it is AND and move on to the next one
 		if (strcmp(word, "AND") == 0) {
 			word = strsep(&copy_parse_string, " ");
 			continue;
 		}
 		// if (strcmp(word, "\0") == 0) {
 				
 		// }

 		// If the word is OR, then we make a new list, with a new list node and word node for the next word
 		if (strcmp(word, "OR") == 0) {

 			// Check that there are words in the previous list
 			if (all_groups->current->object == NULL) {
 				word = strsep(&copy_parse_string, " ");
 				continue;
 			}

 			// allocate new list
 			ListNode *new_AND_group = calloc(1, sizeof(ListNode));
 			new_AND_group->object = NULL;
 			new_AND_group->prev = all_groups->tail;
 			new_AND_group->next = NULL;

 			// connect previous list to new list and update List tail and current
 			all_groups->current->next = new_AND_group;
 			if (all_groups->tail != all_groups->current) {
 				fprintf(stderr, "Something was wrong with updating tail_pointer\n");
 				exit(-1);
 			}
 			all_groups->tail = new_AND_group;
 			all_groups->current = new_AND_group;

 			AND_group = new_AND_group;

 			// get next word
 			word = strsep(&copy_parse_string, " "); 	

 			continue;
 		}

 		/* Since word is neither OR or AND, we can look it up and made a new node */ 
 		// normalize word		
 		NormalizeWord(word);

 		/* Find the word in the hashtable */
 		HashTableNode *current_hash_node;
 		// if the word does not exist in the hashtable discard entire group until new "OR" 
 		if ((current_hash_node = LookupKey(word_index, word)) == NULL) {

 			// Make new list of list nodes
 			ListNode *new_AND_group = calloc(1, sizeof(List));
 			new_AND_group->object = NULL;
 			new_AND_group->prev = NULL;
 			new_AND_group->next = NULL;

 			// Remove current list 
 			ListNode *list_to_remove = all_groups->current;
 			if (list_to_remove->prev != NULL) {
 				(list_to_remove->prev)->next = new_AND_group;
				new_AND_group->prev = list_to_remove->prev;
 			} else if (list_to_remove->prev == NULL) {
 				all_groups->head = new_AND_group;

 			}
 			// set tail and current for all_groups list
 			all_groups->tail = new_AND_group;
 			all_groups->current = new_AND_group;
 			AND_group = new_AND_group;

 			// Free entire group
 			ListNode *current_list_node = (ListNode *)list_to_remove->object;
 			ListNode *next_list_node;

 			while (current_list_node != NULL) {
 				ListNode *current_list_node_pointer = current_list_node;
 				next_list_node = current_list_node->next;
 				free(current_list_node_pointer);
 				current_list_node = next_list_node;
 			}
 			free(list_to_remove);

 			// Look for next OR
 			while (word != NULL && strcmp(word, "OR") != 0) {
 				word = strsep(&copy_parse_string, " ");
 			}
 			if (word == NULL) {
 				//remove the new lit

 			}
 		} 
 		// Otherwise, if there is a word, there, get the WordNode, create a new one to put in the list,
 		else {

 			// Get the word node from the hashtable
			WordNode *word_node_to_add = (WordNode *)current_hash_node->object;

			// Make a list node to hold new word node
			ListNode *new_AND_node = calloc(1, sizeof(ListNode));
			new_AND_node->next = NULL;
			new_AND_node->object = word_node_to_add;

 			// If the group's first word node has not been set, then make it the first
			if (AND_group->object == NULL) {
				AND_group->object = new_AND_node;
				new_AND_node->prev = NULL;
			} 
			// else we add the list node to the end of the group
			else {
				// get the first list node from the group				
	 			ListNode *current_list_node = AND_group->object;
	 			ListNode *next_list_node;
	 			// Get to the last list node in the group
	 			while (current_list_node->next != NULL) {
	 				next_list_node = current_list_node->next;
	 				current_list_node = next_list_node;
	 			} 	
	 			// Connect the last list node to the new list node	
	 			current_list_node->next = new_AND_node;
	 			new_AND_node->prev = current_list_node;
			}		
 		}

 		word = strsep(&copy_parse_string, " ");
 	}

 	free(parse_string_pointer);
 	
 	// Check the case in which there is nothing that works.
 	ListNode *irregular_list_node = all_groups->head;
 	WordNode *irregular_word_node = (WordNode *)irregular_list_node->object;

 	if (irregular_word_node == NULL) {
 		free(copy_parse_string);
 		free(irregular_list_node);
 		free(all_groups);
 		return NULL;
 	}
 	// Check that the end of all groups is not NULL
 	ListNode *current_group_to_check = all_groups->tail;
 	if (current_group_to_check->object == NULL) {
 		all_groups->tail = current_group_to_check->prev;
 		all_groups->tail->next = NULL;
 		free(current_group_to_check);
 		all_groups->current = all_groups->tail;
 	}

 	return all_groups;
}

예제 #10

0

파일 보기

파일: query.c 프로젝트: ArminMahban/C_code_samples

int main(int argc, char* argv[]){

//call the check args function to check the input arguments
checkArgs(argc, argv); 

//init the HashTable
HashTable* Table = ReadFile(argv[1]);
//init the array to hold all of the input words
char wordArray[MAX_ROWS][MAX_ROWS][MAX_WORD_LENGTH + 1];

//init keyboard input string
char line[MAX_WORD_LENGTH+1];



while (1){	//main loop

printf("\nEnter your string (enter \"QUIT\" to exit the function) \n");	
//accept user input. Deal with user input longer than the max line
if (fgets(line, MAX_LINE, stdin)){
	if (NULL == strchr(line, '\n')){
		printf("Query only accepts 1000 characters\n");
		eat_extra(); //"eats" characters after 1000 characters are input then exits
		exit(1);
	}
}

//handle when the user quits the program
if (strcmp(line, "QUIT\n") == 0){
	printf("Exit command reached, Cleaning memory and quitting\n");
	CleanHashMemory(Table);
	exit(0);
}

// size_t length = strlen(line);
// printf("length of input is %zu\n", length );

//check if the inputted line ends with AND or OR
EndsWithAND(line);
EndsWithOR(line);

char* argv2 = argv[2];

//make sure the wordArray is cleared out between queries
memset(wordArray, 0, sizeof(wordArray[0][0][0]) * 500 * MAX_ROWS * MAX_WORD_LENGTH + 1);

int FinalDocMatchArray[1705] = {0}; //keep the documents ids that have matched all the criteria
int FinalArrayIndex = 0;
int scoreArray[1705] = {0}; //keep the scores of the FinalDocMatchArray in parallel positions
int index = 0; 

//init variables for GetNextWord
int pos = 0;
int counter = 0;
int andPos = 0;
int andFlag = 0;
int orFlag = 0; 
int orPos = 0; 
char* word;
while((pos = GetNextWord(line, pos, &word)) > 0){ //go through the words in the query   	
	//if the word exists, add it to the hash table
	if (word != NULL && strlen(word) < MAX_WORD_LENGTH) { 
		//check if it starts with AND or OR
		if (counter == 0 && (strcmp(word, "AND") == 0 || strcmp(word, "OR") == 0)){
			printf("Input cannot start or end with AND or OR\n");
			exit(1);
		}
		else if (strcmp(word, "AND") == 0){
			// printf("AND detected\n");
			if (andFlag == 1)
			{
				printf("Two ANDs in a row. Invalid input.\n");
				exit(1);
			}
			andFlag = 1;
		}
		//detect ORs and increment position in wordArray
		else if (strcmp(word, "OR") == 0){
			// printf("OR detected\n");
			if (orFlag == 1)
			{
				printf("Two ORs in a row. Invalid input.\n");
				exit(1);
			}
			orPos++;
			andPos = 0;
			orFlag = 1;
		}
		else{

		NormalizeWord(word);
		// printf("Word is %s %i\n", word, counter);
		andFlag = 0;
		orFlag = 0;
		//put the word in the wordArray at the appropriate place
		int len = strlen(word+1);
		char wordCpy[len+1];
		strcpy(wordCpy,word);
		strcpy(wordArray[andPos][orPos], wordCpy);
		// printf("Adding %s to array at %i %i \n",word, andPos, orPos );
		andPos++;

		}

		counter++;
	}
	free(word);
	word = NULL;
}


//k is incremented every time an OR is processed
int k = 0;
while (strcmp(wordArray[0][k], "") != 0){ 

int docMatchArray[1705] = {0}; //temporary array of matching documents
int docMatchArrayIndex = 0;

char* firstWord = wordArray[0][k];
// printf("Word is: %s\n", firstWord);

//compute jenkins hash
int hashResult = JenkinsHash(firstWord, MAX_HASH_SLOT);

if (Table->table[hashResult] == NULL){
	printf("%s does not exist in hashTable database\n", firstWord );
	exit(1);
}

//go through the hashtable until you find the appropriate word and documents
//put it into a temporary array to be matched against
else{
	WordNode* node2 = Table->table[hashResult];
	WordNode* dummyWord = node2;
	while (dummyWord != NULL){ //go through all the linked words
		DocumentNode *dummy_doc = dummyWord->page;
		if (strcmp(dummyWord->word, firstWord) == 0){//if they are the same word, go through the document nodes
		    //go through the document nodes
		    while (dummy_doc != NULL) {
			    //put all of the first words docs into the temp list
			    docMatchArray[docMatchArrayIndex] = dummy_doc->doc_id;
			    docMatchArrayIndex++;
	            //advance
	            dummy_doc = dummy_doc->next;	
        }
        break; //you've found the word, no need to continue to other words
		}
		else{
			// printf("Did not find %s\n", firstWord );
		}

		dummyWord = dummyWord->next;
		// printf("Advancing\n");
	}
}

//if there's only 1 word to examine, no need to compare other words
if (strcmp(wordArray[1][k], "") == 0){ 
	//add everything in the doc match array to the FinalDocMatchArray
	for (int i = 0; i < docMatchArrayIndex; i ++ ){
		if (docMatchArray[i] != '\0'){
			int dupIndex = 0;
			int dupFlag = 0;
			while (FinalDocMatchArray[dupIndex] != '\0'){
				//check if they're the same
				if(docMatchArray[i] == FinalDocMatchArray[dupIndex]){
					// printf("FOUND A DUPLICATE for %i\n", docMatchArray[i] );
					dupFlag = 1; 
					//a duplicate was found, compute the final score and increment that element
					int finalScore = 0;
					int index3=0;
					// printf("docNum is %i\n",FinalDocMatchArray[index]);
					while(strcmp(wordArray[index3][k],"") != 0){ //for every word 
						//go through all the words and compute the final score
						finalScore += ComputeScore(FinalDocMatchArray[dupIndex], Table, wordArray[index3][k]);
						index3++;
					}
					//put it in the score array
					scoreArray[dupIndex] += finalScore;
					finalScore = 0;										
					break;
				}
				dupIndex++;
			}
			//if the duplicate was not found and there's only 1 word, then put everything into the final array
			if (dupFlag != 1) { //if a duplicate was not found in the list
				FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i];
				FinalArrayIndex++; 
			}
		}
	}
}

//if there's more than one word between the OR statements, compute the final scores for them all
else{        
	for (int i = 0; i < 1705; i ++){
		if (docMatchArray[i] != 0){
		int result = 1;
		int m = 0;//make sure to adjust based on current position in masterList

		//for every doc in the docMatchArray, test if all other words contain that doc
		while (strcmp(wordArray[m][k], "") != 0) {  //increment word 

			//check if this word's documents and see if there's a match
			result = findDocMatch(docMatchArray[i], Table, wordArray[m][k]);

			if (result != 0){
				break; //the document had no matches, skip the rest
			}
			m++;
		}
		if (result == 0){
			//before you add it to the final array, check if you've already added it
			int dupIndex2 = 0;
			int dupFlag2 = 0;
			while (FinalDocMatchArray[dupIndex2] != '\0'){
				//if it's already in the list, then only increment the score
				if(docMatchArray[i] == FinalDocMatchArray[dupIndex2]){
					dupFlag2 = 1;
					int finalScore2 = 0;
					int index4 = 0;
					while(strcmp(wordArray[index4][k],"") != 0){//for every word 
						// printf("Word is %s\n",wordArray[index4][k]);
						finalScore2 += ComputeScore(FinalDocMatchArray[dupIndex2], Table, wordArray[index4][k]);
						index4++;
					}					
					scoreArray[dupIndex2] += finalScore2; //increment the appropriate score
					finalScore2 = 0;
					break; 
				}
				dupIndex2++;
			}
			//otherwise, add it to end of the Final Array
			if (dupFlag2 != 1){
				FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i];
				FinalArrayIndex++;
			}
			}
		}
	}
}

//compute the scores for all the non-duplicates toward the end of the array
int finalScore = 0;
while (FinalDocMatchArray[index] != '\0'){ //for every doc that matches all AND words
	int index2=0;
	// printf("docNum is %i\n",FinalDocMatchArray[index]);
	while(strcmp(wordArray[index2][k],"") != 0){//for every word 
		// printf("Word is %s\n",wordArray[index2][k]);
		finalScore += ComputeScore(FinalDocMatchArray[index], Table, wordArray[index2][k]);
		index2++;
	}

	// printf("Score for %i is %i\n",FinalDocMatchArray[index], finalScore);
	//put it in the score array
	scoreArray[index] = finalScore;
	finalScore = 0;
	index++;
}
k++; //increment OR position
}

//sort the Final Array
BubbleSort(FinalDocMatchArray, scoreArray, argv2);

}//loop back to string entry

} //end main

예제 #11

0

파일 보기

파일: queryfuncs.c 프로젝트: somebodyschelsea/search-engine

// frees every QUERY stored in queries[0 .. *num_queries) together with its
// NULL-terminated search_words, then resets *num_queries to 0 so the caller
// is not left counting freed entries
static void discardQueries(QUERY** queries, int* num_queries)
{
	for(int i = 0; i < *num_queries; i++)
	{
		QUERY* query = queries[i];

		for(int j = 0; (query->search_words)[j] != NULL; j++)
			free((query->search_words)[j]);

		free(query);
		queries[i] = NULL;
	}

	*num_queries = 0;
}

// builds a QUERY from the first `count` entries of keywords and appends it to
// queries (incrementing *num_queries); the keyword buffers are handed over to
// the QUERY, so the caller must not free them afterwards
// always returns 0
static int commitQuery(char** keywords, int count, QUERY** queries, int* num_queries)
{
	QUERY* query = malloc(sizeof(QUERY));
	MALLOC_CHECK(query);

	for(int i = 0; i < count; i++)
		query->search_words[i] = keywords[i];

// null-terminate the keyword list
	query->search_words[count] = NULL;

	queries[(*num_queries)++] = query;

	return 0;
}

// takes a char* input_line, a QUERY** queries, and a pointer to an int num_queries
// parses input_line for QUERYs, placing them into queries, and incrementing 
// num_queries as it does so; keywords are lower-cased with NormalizeWord and
// "OR" separates one QUERY from the next
// returns -1 if input_line is bad (empty, ends in "OR", or has more than
// MAX_NUM_KEYWORDS keywords in one QUERY); in that case every QUERY built so
// far is freed and *num_queries is 0
// returns 1 if input_line == "q" (quit command)
// returns 0 if successful
int pullQueries(char* input_line, QUERY** queries, int* num_queries)
{
	char *current_keywords[MAX_NUM_KEYWORDS];
	char *word;	
	int current_index;
	int position;

	word = malloc(MAX_KEYWORD_LENGTH*sizeof(char)); 
	MALLOC_CHECK(word);
	BZERO(word, MAX_KEYWORD_LENGTH*sizeof(char));
	
	*num_queries = 0;
	current_index = 0;		// corresponds to index of current_keywords
	position = 0;			// matches index in input_line

// getNextWord parses the input_line for a word, storing it into word
	while((position = getNextWord(input_line, word, position)) != -1)
	{	
// if quit command
		if(current_index == 0 && strcmp(word, "q") == 0)
		{
			free(word);			
			return 1;
		}

// if OR (a new QUERY is about to begin)
		if(strcmp(word, "OR") == 0)
		{
			commitQuery(current_keywords, current_index, queries, num_queries);

// empty current_keywords (the whole array, not just its first bytes)
// and reset its index current_index
			BZERO(current_keywords, sizeof(current_keywords));
			current_index = 0;
		}

// if it's a regular keyword
		else
		{
// current_keywords is full: treat the line as bad input rather than write
// past the end of the array
// NOTE(review): the capacity of QUERY.search_words is not visible here; confirm
// it holds MAX_NUM_KEYWORDS entries plus the NULL terminator
			if(current_index >= MAX_NUM_KEYWORDS)
			{
				for(int i = 0; i < current_index; i++)
					free(current_keywords[i]);

				free(word);
				discardQueries(queries, num_queries);
				return -1;
			}

// make it lower case
			NormalizeWord(word);		

// add it to current_keywords while incrementing its index current_index;
// the buffer is zeroed and at most MAX_KEYWORD_LENGTH-1 characters are copied,
// so the keyword is always terminated
			current_keywords[current_index] = malloc(MAX_KEYWORD_LENGTH*sizeof(char));			
			MALLOC_CHECK(current_keywords[current_index]);
			BZERO(current_keywords[current_index], MAX_KEYWORD_LENGTH*sizeof(char));
			strncpy(current_keywords[current_index++], word, (MAX_KEYWORD_LENGTH - 1)*sizeof(char));		
		}

// empty the word out
		BZERO(word, MAX_KEYWORD_LENGTH*sizeof(char));
	}

	free(word); 

// if current_index = 0, that means the line was empty or its last word was
// "OR", and therefore the input is bad
	if(current_index == 0)
	{
		discardQueries(queries, num_queries);
		return -1;
	}

// otherwise, the last QUERY hasn't been created yet, and we need to make it
// in the same way
	commitQuery(current_keywords, current_index, queries, num_queries);
			
	return 0;	
}

예제 #12

0

파일 보기

파일: common_query.c 프로젝트: irenelfeng/query_engine

/*
 * Evaluate one line of query text against the index.
 *
 * The line is split on spaces/newlines with strtok (so `query` is modified
 * in place). "AND" and "OR" tokens select how the next plain word is
 * combined; every other token is normalized and looked up via get_value().
 * A plain word that follows another plain word is treated as AND.
 *
 * Returns:
 *   - NULL when the line holds no '\n' or starts with an operator,
 *   - init_list() when the line has no tokens,
 *   - otherwise unionize(<groups accumulated so far>, <current group>).
 *
 * NOTE(review): ownership of the lists passed to unionize()/intersection()
 * is not visible from here; confirm against those helpers.
 */
WordNode * processQuery(char * query, HashTable *index){
	/* Operator that will be applied to the next plain word. */
	enum { OP_FIRST = -1, OP_AND = 0, OP_OR = 1 };

	//TODO: eat leftover from the buffer
	// No newline in the line: reported to the user as "too many characters".
	if (strchr(query, '\n') == NULL) {
		printf("Whoa there. You entered too many characters. Query must exit.");
		return NULL;
	}
	putchar('\n');

	// First token decides whether the query is usable at all.
	char *token = strtok(query, " \n");
	if (token == NULL) {
		fputs("No input specified. \n", stdout);
		printf("QUERY>: ");
		return init_list();
	}

	if (strcmp("OR", token) == 0 || strcmp("AND", token) == 0) {
		fprintf(stderr, "Invalid input. AND/OR cannot start your search query. please use a non-operator word to search \n");
		printf("QUERY>: ");
		return NULL;
	}

	int pending_op = OP_FIRST;
	WordNode *accumulated = NULL;	// result of the groups closed by an OR so far
	WordNode *current = NULL;	// group currently being ANDed together

	// `continue` still advances to the next token through the loop step.
	for (; token != NULL; token = strtok(NULL, " \n")) {
		if (strcmp("OR", token) == 0) {
			pending_op = OP_OR;
			continue;
		}
		if (strcmp("AND", token) == 0) {
			pending_op = OP_AND;
			continue;
		}

		// switch to lowercase
		NormalizeWord(token);

		switch (pending_op) {
		case OP_OR:
			// Close the current group: merge it into the accumulated
			// result (or adopt it if this is the first OR), then start
			// a new group with this word.
			accumulated = accumulated ? unionize(accumulated, current) : current;
			current = make_copy((WordNode *) get_value(token, index));
			break;

		case OP_AND:
			current = intersection(current, make_copy(get_value(token, index)));
			break;

		default:
			// very first word of the query
			current = make_copy((WordNode *) get_value(token, index));
			break;
		}

		// Two plain words in a row are combined with AND.
		pending_op = OP_AND;
	}

	// Fold the last group into the accumulated result.
	return unionize(accumulated, current);
}

예제 #13

0

파일 보기

파일: indexer.c 프로젝트: joonhyongcho/TinySearchEngine

/*
 * Indexer entry point.
 *
 * argv[1] is the page directory and argv[2] the index file to write. When
 * argc is not 3, argv[3] is read back as an index file and re-saved to
 * argv[4]. Argument validation is delegated to CheckArguments().
 *
 * For every file returned by GetFilenamesInDir() whose name consists only of
 * digits, the document is loaded, split into words with GetNextWord(), and
 * each normalized word is recorded with UpdateIndex().
 *
 * Exits with -1 on any fatal error; returns 0 on success.
 */
int main (int argc, char **argv) {

	/* Check Arguments */
	if (!CheckArguments(argc, argv)) {
		exit(-1);
	}

	char *page_directory = argv[1];
	char *index_filename = argv[2];
	size_t dir_len = strlen(page_directory);

	// Initialize hashtable
	HashTable *index_hashtable = calloc(1, sizeof(HashTable));
	if (index_hashtable == NULL) {
		fprintf(stderr, "Could not allocate index hashtable. Exiting Now.\n");
		exit(-1);
	}

	/* Make array to hold filenames (just document numbers) and use GetFilenamesInDir to grab all names */
	char **filename_array;
	int number_of_files;
	if ((number_of_files = GetFilenamesInDir(page_directory, &filename_array)) < 0) {
		fprintf(stderr, "Could not get filenames in page directory. Exiting Now.\n");
		exit(-1);
	}

	/* Add page_directory to the front of the filenames */
	for (int i = 0; i < number_of_files; i++) {
		char *previous_string = filename_array[i];
		// Room for directory + name + terminating NUL
		size_t len = dir_len + strlen(previous_string) + 1;
		char *new_string = calloc(len, sizeof(char));
		if (new_string == NULL) {
			fprintf(stderr, "Could not allocate memory for a file path. Exiting Now.\n");
			exit(-1);
		}
		strcpy(new_string, page_directory);
		strcat(new_string, previous_string);
		free(previous_string);

		filename_array[i] = new_string;
	}

	/* Populate the index data structure from the words on each doc
	 * Then Save to an index file
	 */
	for (int i = 0; i < number_of_files; i++) {
		char *file_name = filename_array[i];

		/* Check that the filename (the part after the directory) is all digits.
		 * The cast keeps isdigit() defined for negative char values. */
		int all_digits = 1;
		for (const char *p = file_name + dir_len; *p != '\0'; p++) {
			if (!isdigit((unsigned char)*p)) {
				all_digits = 0;
				break;
			}
		}
		if (!all_digits) {
			// Reported once per file rather than once per offending character
			fprintf(stderr, "This file %s contains something other than a digit \n", file_name);
			continue;
		}

		// Check that each file in the filename array is a good file
		if (!IsFile(file_name)) {
			fprintf(stderr, "not file\n");
			continue;
		}

		// Get contents of file into a string
		char *document = LoadDocument(file_name);
		if (document == NULL) {
			continue;
		}

		// Get DocumentID of file (check if bad)
		int document_id = GetDocumentId(file_name, page_directory);
		if (document_id < 0) {
			fprintf(stderr, "Error when converting document id char to integer\n");
			free(document);		// the loaded document was leaked on this path
			continue;
		}

		// Walk the document word by word; each word buffer returned through
		// word_buffer is freed here after it has been indexed.
		int pos = 0;
		char *word_buffer;
		while ((pos = GetNextWord(document, pos, &word_buffer)) > 0) {
			NormalizeWord(word_buffer);
			UpdateIndex(word_buffer, document_id, index_hashtable);
			free(word_buffer);
		}
		free(document);
	}

	/* Save to index file, and check that it actually went well */
	if (!SaveIndexToFile(index_hashtable, index_filename)) {
		fprintf(stderr, "Could not save index hashtable to file\n");
		exit(-1);
	}

	for (int i = 0; i < number_of_files; i++) {
		free(filename_array[i]);
	}
	free(filename_array);
	FreeHashTable(index_hashtable);

	/* Testing mode: read index file into data structures and save to new index file */
	if (argc != 3) {
		char *read_index_filename = argv[3];
		char *new_index_filename = argv[4];

		HashTable *read_index = ReadFile(read_index_filename);
		if (read_index == NULL) {
			fprintf(stderr, "Error when reading index file into data structures.\n");
			exit(-1);
		}
		if (!SaveIndexToFile(read_index, new_index_filename)) {
			fprintf(stderr, "Could not save read index file into new index file\n");
			exit(-1);
		}

		FreeHashTable(read_index);
	}

	return 0;
}

예제 #14

0

파일 보기

파일: indexer.c 프로젝트: DanielleJude/TinySearchEngine

/*
 * Index one crawler output file.
 *
 * path      - path used to open the file
 * name      - bare file name; parsed with strtol() to obtain the document id
 * hashtable - index that receives every normalized word
 *
 * The first two lines of the file are skipped (the original comments call
 * them the url and the depth); the rest is scanned with GetNextWord().
 *
 * Returns 1 when the file was indexed or deliberately skipped (cannot be
 * opened, name does not parse to a non-zero id, size unavailable), and 0
 * only when memory could not be allocated.
 */
static int indexCrawlerFile(const char *path, const char *name, HashTable *hashtable)
{
	FILE *file = fopen(path, "rb");
	if (!file)
		return 1;

	// strtol yields 0 for a non-numeric name; 0 is treated as invalid.
	int doc_id = (int)strtol(name, NULL, 10);
	if (doc_id == 0) {
		fprintf(stderr, "Error: %s is an invalid crawler filename.\n", name);
		fclose(file);
		return 1;
	}

	fseek(file, 0, SEEK_END);
	long html_len = ftell(file);
	fseek(file, 0, SEEK_SET);
	if (html_len < 0) {
		fclose(file);
		return 1;
	}

	// Skip the two header lines. fgetc returns int so EOF stays
	// distinguishable from a 0xFF byte.
	for (int line = 0; line < 2; line++) {
		int c;
		do {
			c = fgetc(file);
		} while (c != '\n' && c != EOF);
	}

	// +1 so the buffer can be NUL-terminated before it is scanned as a string.
	char *html = malloc((size_t)html_len + 1);
	if (!html) {
		fclose(file);
		return 0;
	}
	size_t got = fread(html, sizeof(char), (size_t)html_len, file);
	html[got] = '\0';
	fclose(file);

	int pos = 0;
	char *word = NULL;
	while ((pos = GetNextWord(html, pos, &word)) > 0) {
		NormalizeWord(word);
		addToHashTable(hashtable, word, doc_id);	//add word to indexer
		free(word);
		word = NULL;
	}
	free(html);
	return 1;
}

/*
 * Build the index from every file in `dir`.
 *
 * dir       - directory holding the crawler output, with or without a
 *             trailing '/'
 * hashtable - index to populate
 *
 * All entries returned by GetFilenamesInDir() are freed before returning.
 *
 * return 1 if successful, 0 otherwise (directory listing failed or memory
 * ran out)
 */
int buildIndexFromDirectory(char *dir, HashTable *hashtable)
{
	char **filenames = NULL;

	//get the file names
	int num_files = GetFilenamesInDir(dir, &filenames);
	if (num_files < 0) {
		fprintf(stderr, "Error: Unable to obtain files in directory\n");
		free(filenames);
		return 0;
	}

	size_t dir_len = strlen(dir);
	int has_slash = (dir_len > 0 && dir[dir_len - 1] == '/');
	int ok = 1;

	//for each of the files in the directory, read and add to the indexer
	for (int i = 0; i < num_files; i++) {
		// After an allocation failure stop indexing but keep freeing names.
		if (ok) {
			// directory + optional '/' + name + NUL
			size_t path_size = dir_len + strlen(filenames[i]) + 2;
			char *file_name = malloc(path_size);
			if (!file_name) {
				ok = 0;
			} else {
				if (has_slash)
					snprintf(file_name, path_size, "%s%s", dir, filenames[i]);
				else
					snprintf(file_name, path_size, "%s/%s", dir, filenames[i]);

				ok = indexCrawlerFile(file_name, filenames[i], hashtable);
				free(file_name);
			}
		}
		free(filenames[i]);
	}
	free(filenames);

	if (!ok)
		fprintf(stderr, "Error: Out of memory while building index\n");
	return ok;
}

예제 #15

0

파일 보기

파일: getList.c 프로젝트: ChrisHoder/Projects

/*
  getList: turn a query string into a list of DocumentNodes.

  Outline
  1) Read the first word; exit if there is none or it is 'AND' / 'OR'.
  2) Normalize it and load its DocNodes into the pending list.
  3) For every following word:
       - 'AND' / 'OR': exit if the previous word was also an operator,
         otherwise remember the operator and read the next word.
       - plain word after 'OR': normalize it and OR it into the pending list.
       - plain word otherwise (after 'AND' or after another plain word):
           * first time: the pending list becomes the result list,
             later: result = AND(result, pending);
           * normalize the word, free the pending list and reload it with
             the DocNodes of this word.
  4) After the last word, fold the pending list into the result the same way.
  5) Free the word buffer and the pending list; return the result list.
*/
DocumentNode *getList(char *input){
  /* what the previous token was */
  enum { PREV_WORD = 0, PREV_AND = 1, PREV_OR = 2 };

  DocumentNode *pending, *result;
  int cursor = 0;
  int prev = PREV_WORD;
  int have_result = 0;

  //scratch buffer that receives one word at a time
  char *word = (char *)malloc(MAX_WORD_LENGTH);
  MALLOC_CHECK(word);
  BZERO(word,MAX_WORD_LENGTH);

  //first word: must exist and must not be an operator
  cursor = getWord(input,word,cursor);
  if( cursor < 0 || strcmp(word,"AND") == 0 || strcmp(word,"OR") == 0 ){
    LOG("BAD INPUT!");
    exit(-1);
  }
  NormalizeWord(word);
  pending = getDocList(word);
  BZERO(word,MAX_WORD_LENGTH);

  //remaining words
  while( (cursor = getWord(input,word,cursor)) > 0 ){
    int is_or  = (strcmp(word,"OR") == 0);
    int is_and = !is_or && (strcmp(word,"AND") == 0);

    if( is_or || is_and ){
      //two operators back to back is bad input -- exit
      if( prev > PREV_WORD ){
        LOG("Cannot input two operators in a row");
        exit(-1);
      }
      prev = is_or ? PREV_OR : PREV_AND;
      BZERO(word,MAX_WORD_LENGTH);
      continue;
    }

    //plain word
    if( prev == PREV_OR ){
      //extend the pending list with this word's documents
      NormalizeWord(word);
      pending = OR(pending,word);
    }
    else{
      //explicit AND or a bare space: fold pending into the result,
      //then restart pending with this word
      if( !have_result ){
        result = pending;
        have_result = 1;
        pending = NULL;
      }
      else{
        result = AND(result,pending);
      }
      NormalizeWord(word);
      freeDocNodeList(pending);
      pending = getDocList(word);
    }

    prev = PREV_WORD;
    BZERO(word,MAX_WORD_LENGTH);
  }

  //end of input: fold the last pending list into the result
  if( !have_result ){
    result = pending;
    pending = NULL;
  }
  else{
    result = AND(result,pending);
  }

  //free allocated memory
  free(word);
  freeDocNodeList(pending);

  return result;
}

예제 #16

0

파일 보기

파일: indexer.c 프로젝트: jlee9595/TinySearchEngine

/*
 * Indexer entry point.
 *
 * argv[1] - directory of crawled pages; it is concatenated directly with
 *           each file name, so it is expected to end with a path separator
 * argv[2] - file the index is saved to
 * argv[3] - optional; when given, the index is reloaded from argv[2] and
 *           saved again to argv[3]
 *
 * Only files whose names consist solely of digits are indexed.
 *
 * NOTE(review): the argument/directory failures exit with status 0, as the
 * original did; callers cannot distinguish them from success by exit code.
 */
int main(int argc, char* argv[]) {
	//check argument number
	if (argc < 3 || argc > 4) {
		printf("too many or too little arguments, please try again");
		exit(0);
	}

	//check directory validity
	if (!IsDir(argv[1])) {
		printf("invalid directory, please try again");
		exit(0);
	}

	//Initialize variables and index
	char **filenames = NULL;
	HashTable *WordsFound = calloc(1, sizeof(HashTable));
	if (WordsFound == NULL) {
		fprintf(stderr, "failed to allocate the index\n");
		exit(EXIT_FAILURE);
	}
	int num_files = GetFilenamesInDir(argv[1], &filenames);

	//check whether the folder has files
	if (num_files < 0) {
		printf("failed to get any filenames");
		exit(0);
	}

	size_t dir_len = strlen(argv[1]);

	//iterate through each file in the directory
	for (int i = 0; i < num_files; i++) {
		char *name = filenames[i];

		//check that the file is in the correct format (title is a number);
		//the cast keeps isdigit() defined for negative char values
		int is_numeric = 1;
		for (const char *p = name; *p != '\0'; p++) {
			if (!isdigit((unsigned char)*p)) {
				is_numeric = 0;
				break;
			}
		}
		if (!is_numeric) {
			free(name);		//skipped names were never released before
			continue;
		}

		//Build "<dir><name>" in a buffer sized for the actual lengths
		//instead of a fixed 100-byte array
		size_t path_size = dir_len + strlen(name) + 1;
		char *path = malloc(path_size);
		if (path == NULL) {
			fprintf(stderr, "failed to allocate path for %s\n", name);
			free(name);
			continue;
		}
		snprintf(path, path_size, "%s%s", argv[1], name);

		//Load the document
		char *doc = LoadDocument(path);
		int docId = GetDocumentId(name);
		free(path);
		free(name);
		if (doc == NULL) {
			continue;	//nothing to scan
		}

		//Iterate through each word in the html file (doc)
		int pos = 0;
		char *word;
		while ((pos = GetNextWord(doc, pos, &word)) > 0) {
			NormalizeWord(word);
			if (InHashTable(word, WordsFound) == 0) {
				//word is not freed on this path: presumably the table
				//keeps the pointer -- confirm against AddToHashTable
				AddToHashTable(word, WordsFound);
				UpdateHashTable(word, docId, WordsFound);
			}
			else {
				UpdateHashTable(word, docId, WordsFound);
				free(word);
			}
		}
		free(doc);
	}
	free(filenames);
	SaveIndexToFile(argv[2], WordsFound);				//Save the index to the file specified
	FreeHashTable(WordsFound);

	//only proceed if there was a third argument specified. If so, reload the index from the file just created
	if (argc == 4) {
		HashTable *ReloadedIndex = ReadFile(argv[2]);
		SaveIndexToFile(argv[3], ReloadedIndex);
		FreeHashTable(ReloadedIndex);
	}
	return 0;
}

예제 #17

0

파일 보기

파일: query.c 프로젝트: nahokitade/Tiny-Search-Engine

/*
 * Query engine entry point.
 *
 * argv[1] - index file produced by the indexer
 * argv[2] - directory of crawler output, passed to PrintQueryResult()
 *
 * Rebuilds the inverted index with readFile(), then loops reading queries
 * from stdin until EOF. Each query is split on spaces; "AND" tokens are
 * dropped, an "OR" token starts a new word chain, and every other word is
 * normalized and appended to the current chain. The words of each chain are
 * ANDed together, the chains are ORed, and the result is sorted and printed.
 *
 * Returns 1 after a normal session, 0 on bad arguments or when the index
 * could not be built.
 */
int main(int argc, char* argv[]){
	int success;			// contains 1 if removing from SinLL was successful
	int funcSuccess;
	int orNext;			// contains > 0 if the next word in query should be ORed
	int firstAdd;			// contains > 0 if the addition to SinLL is the first addition
	int tempChar;			// used to flush the stdin for too long inputs
	char query[MAX_QUERY_LEN];	// contains string of query
	char *getsSuccess;		// NULL once fgets hits EOF
	int status = 1;
	SinLL *wordList;

	if(argc != 3){		// invalid number of arguments
		fprintf(stderr, ANSI_COLOR_RED "Usage: query [INDEXER OUTPUT FILE] [CRAWLER OUTPUT FILE DIRECTORY]"  ANSI_COLOR_RESET "\n");
		return 0;
	}

	if(!(access(argv[1], F_OK) != -1)){	// invalid file
		fprintf(stderr, ANSI_COLOR_RED "First argument is not a valid file."  ANSI_COLOR_RESET "\n");
		return 0;
	}

	if(!IsDir(argv[2])){	// invalid "directory"
		fprintf(stderr, ANSI_COLOR_RED "Second argument is not a directory."  ANSI_COLOR_RESET "\n");
		return 0;
	}

	HashTable *invertedIndex;
	invertedIndex = calloc(1, sizeof(HashTable));

	if(!invertedIndex){
		status = 0;
		goto cleanup;
	}

	funcSuccess = readFile(argv[1], invertedIndex);	// recreate inverted index
	if(!funcSuccess){
		status = 0;
		goto cleanup;
	}

	while(1){
		// get the query from user
		fputs("QUERY> ", stdout);
		fflush(stdout);

		getsSuccess = fgets(query, sizeof(char)*MAX_QUERY_LEN, stdin);
		if(!getsSuccess) break;	// EOF means exiting program

		// no trailing newline: treated as input longer than MAX_QUERY_LEN,
		// so discard the rest of the line
		if(getsSuccess[strlen(getsSuccess)-1] != '\n'){
			fprintf(stderr, ANSI_COLOR_RED "Query length is over the maximum 1000 characters!"     ANSI_COLOR_RESET "\n");
			while((tempChar = getchar()) != '\n' && tempChar != EOF){
				/*do nothing*/
			}
			continue;
		}

		// at this stage, the next add is the first add, and we have not seen a
		// OR yet.
		orNext = 0;
		firstAdd = 1;

		wordList = CreateSinLL();
		if(!wordList) break;

		char *wordP;
		wordP = strtok(query," ");

		// get all the words from the query
		while(wordP){
			// last word in query will have a \n attached to it, so if
			// there is a \n at the end of a word, take that out
			size_t wordLen = strlen(wordP);
			if(wordLen > 0 && wordP[wordLen-1] == '\n'){
				wordP[wordLen-1] = 0;
				wordLen--;
			}

			// a query ending in a space leaves a token that was only "\n";
			// skip it instead of adding an empty word to the query
			if(wordLen == 0){
				wordP = strtok (NULL, " ");
				continue;
			}

			// ignore ANDs.
			if(strcmp(AND, wordP) == 0){
				wordP = strtok (NULL, " ");
				continue;
			}

			// ignore ORs but make sure you OR the next coming word.
			if(strcmp(OR, wordP) == 0){
				orNext = 1;
				wordP = strtok (NULL, " ");
				continue;
			}

			// make word lowercase. If this word is the first one, or
			// the previous word was OR, make a new node in the SinLL
			// of WordChainList
			NormalizeWord(wordP);
			if(firstAdd){
				funcSuccess = appendNewWordChain(wordP, wordList);
				if(!funcSuccess) break;
				firstAdd = 0;
			}
			else if(orNext){
				funcSuccess = appendNewWordChain(wordP, wordList);
				if(!funcSuccess) break;
				orNext = 0;
			}
			// if not the previous two cases, just append the word to
			// current node.
			else{
				appendWord(wordP, wordList);
			}
			wordP = strtok (NULL, " ");
		}

		// first pass will AND all the words contained in each WordChainNode
		// of the list.
		WordChainNode *curWordChain = wordList->head;
		while(curWordChain){		// while there are more nodes
			firstAdd = 1;
			DocNode *tempProcessDocNode;	// contains original DocNodes to AND from index
			DocNode *processDocNode;	// contains copied version of above.

			WordsLL *wordsProc = curWordChain->words;	// getting first set of words.
			while(wordsProc){	// while there are more words

				// get DocNodes associated with that word from the inverted index and
				// copy it so as not to mess up the inverted index.
				tempProcessDocNode = DocsFromWordNode(wordsProc->word, invertedIndex);
				processDocNode = CopyDocs(tempProcessDocNode);

				// merge the above DocNodes with the DocNodes saved at the current
				// WordChainNode.
				DocMergedID(&processDocNode, &(curWordChain->docs));

				// the AND step is skipped for the first word of a chain,
				// which has nothing to be ANDed with yet.
				if(!firstAdd){
					ProcessAND(&processDocNode);
				}

				// Add the processed (ANDed) DocNode chain at the current
				// WordChainNode.
				AddDocNodeChain(curWordChain, processDocNode);

				// iterate through to the next word at the current node.
				wordsProc = wordsProc->nextWord;
				firstAdd = 0;
			}
			// move on to the next node.
			curWordChain = curWordChain->nextWords;
		}

		// now we OR each individual WordChainNode's DocNode list.
		curWordChain = wordList->head;
		DocNode *curDocs;
		DocNode *nextDocs;
		success = removeTopDoc(wordList, &curDocs);	// gets the DocNode list from the first node
		// if this fails, the list is empty.
		if(success){
			success = removeTopDoc(wordList, &nextDocs);	// gets the next DocNode list from
									// the next WordChainNode
			while(success){		// failing here means there are no more WordChainNodes

				// process the DocNodes together by ORing them
				DocMergedID(&curDocs, &nextDocs);
				ProcessOR(&curDocs);
				// move on to the next DocNodes from the next WordChainNode.
				success = removeTopDoc(wordList, &nextDocs);
			}
		}
		// the list was empty, so found nothing.
		else{
			printf("Found 0 pages\n");
			free(wordList);		// released here too; this path used to leak it
			continue;
		}

		// sort by the rank and print the results.
		SortByRank(&curDocs);
		PrintQueryResult(curDocs, argv[2]);
		free(wordList);		// clean up for next query
	}
	cleanup:
		if(invertedIndex) DeleteHashTable(invertedIndex); 	// final clean up
		if(!status){
			fprintf(stderr, ANSI_COLOR_RED "Failed inverted index building."  ANSI_COLOR_RESET "\n");
			return 0;
		}
	return 1;
}

예제 #18

0

파일 보기

파일: queryengine_test.c 프로젝트: empotix/searchengine-1

int main(int argc, char* argv[]){
	// Declare variables-------------------------------------------------------------
	HashTable Index;				// Inverted index
	InitialiseHashTable(&Index);
	char text[MAXLEN];
	int test = 0;
	
	// 1. Check input parameters--------------------------------------------------------
	if (argc != 3 ){ 				// check number of arguments
		fprintf(stderr,"Error: Incorrect number of input argument\n");
		return -1;
	}else if(!IsFile(argv[1])){		// check if file is valid
		fprintf(stderr,"Error: File %s is invalid\n", argv[1]);
		return -1;
	}else if(!IsDir(argv[2])){		// check if directory is valid
		fprintf(stderr,"Error: Directory %s cannot be found\n", argv[2]);
		return -1;
	}
	
	// 2. Reconstruct Inverted Index-----------------------------------------------------
	printf("Please wait while the query engine is loading. It might take a few minutes... \n");
	if(!ReadFile(&Index, argv[1])){
		CleanUpHash(&Index);
		return -1;
	}
	
	
	// 3. Command Line interface and query -----------------------------------------------
	
	for(int j=0; j<9; j++){
		
		// Create text array for automated testing
		
		switch (j){
			case 0:
				printf("\n3.%d Test invalid input syntax\n",j+1);
				printf("QUERY :> AND dog\n");
				strcpy(text,"AND dog\n");
				break;
			case 1:
				printf("\n3.%d Test invalid input syntax\n", j+1);
				printf("QUERY :> cat OR AND dog\n");
				strcpy(text,"cat OR AND dog\n");
				break;
			case 2:
				printf("\n3.%d Test no result\n", j+1);
				printf("QUERY :> thisisrandom\n");
				strcpy(text,"thisisrandom\n");
				break;
			case 3:
				printf("\n3.%d Test single entry\n", j+1);
				printf("QUERY :> incredible\n");
				strcpy(text,"incredible\n");
				break;
			case 4:
				printf("\n3.%d Test uppercase\n", j+1);
				printf("QUERY :> Incredible\n");
				strcpy(text,"Incredible\n");
				break;
			case 5:
				printf("\n3.%d Test AND\n", j+1);
				printf("QUERY :> Dartmouth AND College AND Computer AND Science\n");
				strcpy(text,"Dartmouth AND College AND Computer AND Science\n");
				break;
			case 6:
				printf("\n3.%d Test space as AND\n", j+1);
				printf("QUERY :> Dartmouth College Computer Science\n");
				strcpy(text,"Dartmouth College Computer Science\n");
				break;
			case 7:
				printf("\n3.%d Test OR\n", j+1);
				printf("QUERY :> Dartmouth OR Computer\n");
				strcpy(text,"Dartmouth OR Computer\n");
				break;
			case 8:
				printf("\n3.%d Test combined\n", j+1);
				printf("QUERY :> Dartmouth College AND Hanlon OR Mathematics AND Computer Science AND Philosophy OR incredibles Pixar\n");
				strcpy(text,"Dartmouth College AND Hanlon OR Mathematics AND Computer Science AND Philosophy OR incredibles Pixar\n");
				break;
		
		}
		// a) declare variables
		int unionflag, flag, size_temp, size_intersect, size_final, count;
		char wordarray[MAXLEN][MAXLEN];
		int temparray[MAXSIZE][2], intersect[MAXSIZE][2], final[MAXSIZE][2];
		
		// b) instantiate variables
		size_temp = size_intersect = size_final = unionflag = flag = 0;
		count = StringToWord(wordarray,text);
		
		// c) query
		for(int i=0; i<count; i++){
			if(i==0 && strcmp(wordarray[i],"AND") && strcmp(wordarray[i],"OR")){ 	// if it's the first word and is not invalid
				NormalizeWord(wordarray[i]);
				size_intersect = FindHash(wordarray[i], intersect, Index);
				continue;
			}else if(i==0){ 	// if it is first word and invalid
				flag = 1; break;
			}else if(unionflag){
				if(strcmp(wordarray[i],"AND") && strcmp(wordarray[i],"OR")){
					NormalizeWord(wordarray[i]);
					size_intersect = FindHash(wordarray[i], intersect, Index);
					unionflag = 0;
					continue;
				}else{
					flag = 1; break;
				}
			}
			
			if (!strcmp(wordarray[i],"AND")){	// if it's AND
				if(CheckOperator(wordarray,i,count)){
					NormalizeWord(wordarray[i+1]);
					size_temp = FindHash(wordarray[i+1], temparray, Index);
					size_intersect = FindIntersection(intersect, size_intersect, temparray, size_temp);
					i++;
					continue;
				}else{
					flag = 1; break;
				}
			}else if(!strcmp(wordarray[i],"OR")){ // if it's OR
				if(CheckOperator(wordarray,i,count)){
					size_final = FindUnion(final, size_final, intersect, size_intersect);
					size_intersect = 0;
					unionflag = 1;
					continue;
				}else{
					flag = 1; break;
				}
			}else{
				NormalizeWord(wordarray[i]);
				size_temp = FindHash(wordarray[i], temparray, Index);
				size_intersect = FindIntersection(intersect, size_intersect, temparray, size_temp);
				continue;
			}
		}