//takes in an array of search terms, gets the docID, score, and which list it should be added in //and passes the information to the function addScoreToList to be added to the list // //PSEUDO CODE //for all searchterms // get the docIDs associated with the word and add the score to the list but factor in a WEIGHT // if the prev searchterm is not OR // add to list with weight // else // add to the list void processSearchTerms(INVERTED_INDEX* index, char* searchterms) { int docID; int score; char* prevterm = NULL; char* currentterm; int pos; DOCNODE* d; while (searchterms != NULL) { currentterm = searchterms; pos = 0; if(isSearchTerm(currentterm) == TRUE) { //if it's a search term, normalize it and search for it NormalizeWord(currentterm); while((d = getDoc(index, currentterm, &pos)) != NULL) { docID = d->doc_id; score = d->page_word_freq; if(isNotOR(prevterm) == TRUE) { //add with weighteded score because it must be ADD addScoreToList(querylist, TRUE, docID, (score*WEIGHT)); } else//add with regular score addScoreToList(querylist, FALSE, docID, score); } } prevterm = currentterm; searchterms = strtok(NULL, " "); //get next searchterm } if (querylist->start != NULL) { slist = NEW(SORTLIST); MALLOC_CHECK(slist); BZERO(slist, sizeof(SORTLIST)); sortList(slist, querylist); printList(slist); } }
/*
 * normalize(data) -- Python-callable method of Normalizer.
 *
 * Accepts either a single unicode/str object or a list of such objects.
 *   - list   -> returns a new list holding NormalizeWord() of every item
 *   - string -> returns NormalizeWord() of the argument
 *   - other  -> raises TypeError
 *
 * Returns a new reference, or NULL with a Python exception set.
 *
 * NormalizeWord() is treated as returning a new (owned) reference: the
 * single-string path hands its result straight back to the interpreter, which
 * is only valid for an owned reference.
 */
static PyObject *normalize(Normalizer *self, PyObject *args)
{
    PyObject *data = NULL;

    if (!PyArg_ParseTuple(args, "O", &data))
        return NULL;

    if (PyList_Check(data)) {
        PyObject *seq;
        PyObject *list;
        Py_ssize_t j;

        /* PySequence_Fast returns a NEW reference that must be released. */
        seq = PySequence_Fast(data, "object must be sequence");
        if (seq == NULL)
            return NULL;

        list = PyList_New(0);
        if (list == NULL) {
            Py_DECREF(seq);
            return NULL;
        }

        /* The size is re-read on every pass in case the list changes while
         * NormalizeWord runs. */
        for (j = 0; j < PySequence_Fast_GET_SIZE(seq); j++) {
            PyObject *item = PySequence_Fast_GET_ITEM(seq, j); /* borrowed */
            PyObject *word = NormalizeWord(self, item);
            int rc;

            if (word == NULL) {
                Py_DECREF(list);
                Py_DECREF(seq);
                return NULL;
            }

            /* PyList_Append takes its own reference; drop ours afterwards. */
            rc = PyList_Append(list, word);
            Py_DECREF(word);
            if (rc < 0) {
                Py_DECREF(list);
                Py_DECREF(seq);
                return NULL;
            }
        }

        Py_DECREF(seq);
        return list;
    }

    if (PyUnicode_Check(data) || PyString_Check(data))
        return NormalizeWord(self, data); /* NULL propagates the error */

    PyErr_SetString(PyExc_TypeError, "argument must be unicode or string");
    return NULL;
}
// updateIndex takes a word, a document_id, and an index. It adds the document to the index, // and the word itself if it's not already contained in the index. Returns 0 if success, 1 if failure. int updateIndex(char* word, int document_id, INVERTED_INDEX* in_index) { DocumentNode* docnode; WordNode* wordnode; DocumentNode* current_doc_node; int page_node_exists; page_node_exists = 0; // creates a DocumentNode from the doc_id docnode = malloc(sizeof(DocumentNode)); MALLOC_CHECK(docnode); docnode->document_id = document_id; docnode->page_word_frequency = 1; docnode->next = NULL; // makes it lower case (necessary for the query system) NormalizeWord(word); if(addData(in_index, docnode, word)) // if the wordnode already exists { wordnode = getData(in_index, word); if(wordnode != NULL) { current_doc_node = wordnode->data; while(current_doc_node != NULL) { if((current_doc_node->document_id) == document_id) { page_node_exists = 1; current_doc_node->page_word_frequency = (current_doc_node->page_word_frequency)+1; free(docnode); break; } else if(current_doc_node->next == NULL) break; else current_doc_node = current_doc_node->next; } if(!page_node_exists) { current_doc_node->next = docnode; } } } return 0; }
Datum spell_lexize(PG_FUNCTION_ARGS) { DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); char *txt; char **res; char **ptr, **cptr; if (!PG_GETARG_INT32(2)) PG_RETURN_POINTER(NULL); res = palloc(sizeof(char *) * 2); txt = pnstrdup(in, PG_GETARG_INT32(2)); res = NormalizeWord(&(d->obj), txt); pfree(txt); if (res == NULL) PG_RETURN_POINTER(NULL); ptr = cptr = res; while (*ptr) { if (searchstoplist(&(d->stoplist), *ptr)) { pfree(*ptr); *ptr = NULL; ptr++; } else { *cptr = *ptr; cptr++; ptr++; } } *cptr = NULL; PG_RETURN_POINTER(res); }
// For a single word, returns the start of the result Docnode list. // Makes an exact copy of the docnode from index DocNode *getResultsForWord(char *word, INVERTED_INDEX *index) { // check for reserved words first if (!strncmp(word, "AND", 3) || !strncmp(word, "OR", 2)) { printf("AND and OR are reserved words. Please enter a different query.\n"); return NULL; } NormalizeWord(word); int h = makeHash(word); int word_not_found = 1; // 1 for true, 0 for false WordNode *cluster_end = NULL; // collisionAction is like getDatawith key for Dictionary.. cluster_end = collisionAction(index, h, word, &word_not_found); if (word_not_found) return NULL; else { DocNode *d = cluster_end->data; // page from the wordnode DocNode *dcopy = initDocNode(d->doc_id, d->page_word_frequency); // see if more documents exist for (d=d->next; d!=NULL; d=d->next) updateDocNode(dcopy, d->doc_id, d->page_word_frequency); // This also works but commented because ORhelper has to free // the incoming doc list... // since we are making exact copy of the docnode from index, // we need to add the docs that are not already in our copy. // So this is like an OR operation. // if (d->next != NULL) // ORHelper(&dcopy, d->next); return dcopy; } }
/* -----------------------------------------------------------------------
   Function Name: initial_index()
   Functionality: Scans the crawler output directory and inserts every word
                  of every numbered file into the Hashtable Index.
   Input  <--- argv   : argv[1] is the directory holding the crawler data
                        (files named 1, 2, 3, ...)
               Index  : the Hashtable Index to update
   Outputs ---> void (Index is updated in place)

   Notes:
     - Files are opened as "./<argv[1]>/<doc_id>".
     - Each word returned by GetNextWord() is normalised and handed to
       insert_to_index(). NOTE(review): the word buffer is not freed here;
       presumably insert_to_index() keeps or releases it -- confirm.
     - NOTE(review): the loop visits doc ids 1 .. num_files-1. If the
       directory holds only files 1..N, file N is never indexed -- confirm
       what GetFilenamesInDir() counts before changing the bound.
   ----------------------------------------------------------------------- */
void initial_index(char *argv[], HashTable *Index)
{
    char **results_filenames = NULL;
    int num_files = GetFilenamesInDir(argv[1], &results_filenames);

    // going through each file, parsing html and updating the index with word
    for (int doc_id = 1; doc_id < num_files; doc_id++) {
        // decimal file name; 32 bytes is ample for any int
        char file_name[32];
        snprintf(file_name, sizeof file_name, "%d", doc_id);

        // "./" + dir + "/" + name + NUL  ->  +4
        size_t path_size = strlen(argv[1]) + strlen(file_name) + 4;
        char *file_path = calloc(1, path_size);
        if (file_path == NULL) {
            fprintf(stderr, "initial_index: out of memory building path for document %d\n", doc_id);
            break;
        }
        snprintf(file_path, path_size, "./%s/%s", argv[1], file_name);

        if (IsFile(file_path)) {
            // file_parsing strips the first two header lines, leaving only html
            char *content = file_parsing(file_path);

            if (content != NULL) {
                int pos = 0;
                char *word;

                while ((pos = GetNextWord(content, pos, &word)) > 0) {
                    NormalizeWord(word);
                    insert_to_index(doc_id, word, Index);
                }
            }
            free(content);
        }

        free(file_path);
    }

    // The array owns one separately allocated string per file; release those
    // before the array itself.
    if (results_filenames != NULL) {
        for (int i = 0; i < num_files; i++) {
            free(results_filenames[i]);
        }
        free(results_filenames);
    }
}
int main(int argc, char* argv[]) { // Program parameter processing if(argc != 3){ printf("Error: Incorrect usage\n"); printf("Query Usage: ./query [indexed data(eg. indexer.dat)] [html data(eg. data)]\n"); return 1; } //Get the supplied directory name. int dirSize = strlen(argv[2]); char htmlDirectory[dirSize + 1]; htmlDirectory[0] = '\0'; strcat(htmlDirectory, argv[2]); //Get the fileName. int fileSize = strlen(argv[1]); char indexedFile[fileSize + 1]; indexedFile[0] = '\0'; strcat(indexedFile, argv[1]); if(IsFile(indexedFile) == 0){ printf("Incorrect path for indexed file\n"); return 1; } //Check if the path provided is a valid directory. if(IsDir(htmlDirectory) == 0){ printf("Incorrect path for html directory\n"); return 1; } DocumentNode *final = NULL; DocumentNode* orList[MAX_INPUT]; //OR's HashTable *tempHashTable = initHashTable(); // recreate the inverted index tempHashTable = ReadFile(indexedFile); int revert = SaveIndexToFile(tempHashTable,indexedFile); if (revert == 0) printf("0 Returned from inverting\n"); printf("Satrting query..\n"); //Queries char inp[MAX_INPUT]; char buff[MAX_INPUT]; int orFlag; int orIndex; //loop until user exits printf("Query:>"); LABEL:while ((fgets(inp,MAX_INPUT,stdin))) { printf("Query:>"); for (int index = 0; index < MAX_INPUT; index++){ orList[index] = NULL;//init list elements to null } orFlag = 999; orIndex = 0; // if its a blank enter if (strcmp(inp, "\n") == 0){ fprintf(stderr, "You entered a blank line. 
Please enter query words!\n"); continue; } // remove trailing newline char *pos; if ((pos=strchr(inp, '\n')) != NULL){ *pos = '\0'; } // check for the last word strcpy(buff, inp); char *isLast; char *lastWord; isLast = strtok(buff, " "); // find the last word while (isLast != NULL){ lastWord = isLast; isLast = strtok(NULL, " "); // check for AND OR and OR AND consecutively if (isLast != NULL) { if ((strcmp(lastWord, "AND") == 0) || (strcmp(lastWord, "OR") == 0)) { if ((strcmp(isLast, "OR") == 0) || (strcmp(isLast, "AND") == 0)) { fprintf(stderr, "Two consecutive query words is invalid. Please try again.\n"); goto LABEL; } } } } if ((strcmp(lastWord, "AND") == 0) || (strcmp(lastWord, "OR") == 0)) { fprintf(stderr, "Last word in query is invalid: %s\n", lastWord); continue; } char *words; words = strtok(inp, " "); //break input on spaces //first word validity if ((strcmp(words, "AND") == 0) || (strcmp(words, "OR") == 0)) { fprintf(stderr, "First word in query is invalid: %s\n", words); continue; } NormalizeWord(words);//normalize the first valid word final = getDocumentList(words, tempHashTable, final);//init doc list words = strtok(NULL, " "); // return the list for a one word query(next = null) if (words == NULL) { final = querySort(final);//recursive sort printResult(final, htmlDirectory);//display freeDocumentList(final); final = NULL;
/*
 * GetLinks -- evaluate one query line against the inverted index.
 *
 * line  - the query: words separated by whitespace, optionally joined by the
 *         operators operator1 (AND) and operator2 (OR)
 * Index - the inverted index
 *
 * The line is processed as blocks of words separated by OR. The first word of
 * a block seeds the file-level temp_list with a copy of its document list;
 * every further word is combined through And(). An OR (or the end of the line)
 * flushes temp_list into the file-level final_list through Or().
 *
 * Returns 1 on success; 0 when the query starts a block with an operator,
 * ends with an operator, or memory runs out.
 */
int GetLinks(char *line, HashTable *Index)
{
  char *buf = line;
  char word[MAX];
  char scan_fmt[32];
  int flag = 1;   // 1 = intersection (AND), 2 = union (OR)
  int count = 0;  // position of the current word inside its block

  // Start with an empty token so the trailing-operator check below is well
  // defined even when the line holds no words.
  word[0] = '\0';

  // Build "%<MAX-1>s" so sscanf can never write past the end of word.
  snprintf(scan_fmt, sizeof scan_fmt, "%%%ds", (int)sizeof word - 1);

  temp_list = NULL;
  final_list = NULL;

  while (sscanf(buf, scan_fmt, word) == 1) {
    count++;

    // Where the next scan resumes. Computed now, from the token as it appears
    // in the line, because NormalizeWord() may rewrite word below.
    char *next = strstr(buf, word) + strlen(word);

    // AND is implicit: skip it, but reject it at the start of a block.
    if (strcmp(word, operator1) == 0) {
      if (count == 1) {
        return 0;
      }
      buf = next;
      continue;
    }

    if (strcmp(word, operator2) == 0) {
      flag = 2;  // request a union
      if (count == 1) {
        return 0;  // OR with no word before it
      }
    } else {
      NormalizeWord(word);  // only real words are normalized
    }

    if (count == 1) {
      // First word of a block: copy its documents into temp_list.
      int num = InHashTable(word, Index);

      if (num) {
        unsigned long index = JenkinsHash(word, MAX_HASH_SLOT);
        WordNode *current = Index->table[index]->data;

        // Walk the bucket to the matching WordNode (num is 1-based).
        for (int i = 1; i < num; i++) {
          current = current->next;
        }

        // Locate the tail so the append never uses an unset pointer, even if
        // temp_list was left non-empty.
        DocumentNode *tail = temp_list;
        while (tail != NULL && tail->next != NULL) {
          tail = tail->next;
        }

        for (DocumentNode *ptr = current->page; ptr != NULL; ptr = ptr->next) {
          DocumentNode *dn = calloc(1, sizeof(DocumentNode));
          if (dn == NULL) {
            return 0;
          }
          dn->doc_id = ptr->doc_id;
          dn->freq = ptr->freq;

          if (tail == NULL) {
            temp_list = dn;
          } else {
            tail->next = dn;
          }
          tail = dn;
        }
      }
    } else {
      // Later word of the block: apply the pending operation.
      if (flag == 1) {
        And(word, Index);
      }
      if (flag == 2) {
        if (temp_list != NULL) {
          Or();
        }
        flag = 1;   // back to intersection
        count = 0;  // the next word starts a new block
      }
    }

    buf = next;
  }

  // A query may not end with an operator.
  if (strcmp(word, operator1) == 0 || strcmp(word, operator2) == 0) {
    return 0;
  }

  // Flush the last block.
  if (temp_list != NULL) {
    Or();
  }

  return 1;
}
List *ParseWordsToList(char *parse_string, HashTable *word_index) { // make sure beginning of string is a word while (!isalpha(parse_string[0])) { int delete_index = 0; memmove(&parse_string[delete_index], &parse_string[delete_index + 1], strlen(parse_string) - delete_index); } // Get the first word and keep a pointer to the original string char *copy_parse_string = strdup(parse_string); char *parse_string_pointer = copy_parse_string; char *word = strsep(©_parse_string, " "); // Check that the first word is not OR or AND // Make list node that holds the entire group of nodes connected by AND ListNode *AND_group = calloc(1, sizeof(List)); AND_group->object = NULL; AND_group->prev = NULL; AND_group->next = NULL; // Allocate list of all groups and set variables to AND_group List *all_groups = calloc(1, sizeof(List)); all_groups->current = AND_group; all_groups->head = AND_group; all_groups->tail = AND_group; // Until end of query while (word != NULL) { // Disregard word if it is AND and move on to the next one if (strcmp(word, "AND") == 0) { word = strsep(©_parse_string, " "); continue; } // if (strcmp(word, "\0") == 0) { // } // If the word is OR, then we make a new list, with a new list node and word node for the next word if (strcmp(word, "OR") == 0) { // Check that there are words in the previous list if (all_groups->current->object == NULL) { word = strsep(©_parse_string, " "); continue; } // allocate new list ListNode *new_AND_group = calloc(1, sizeof(ListNode)); new_AND_group->object = NULL; new_AND_group->prev = all_groups->tail; new_AND_group->next = NULL; // connect previous list to new list and update List tail and current all_groups->current->next = new_AND_group; if (all_groups->tail != all_groups->current) { fprintf(stderr, "Something was wrong with updating tail_pointer\n"); exit(-1); } all_groups->tail = new_AND_group; all_groups->current = new_AND_group; AND_group = new_AND_group; // get next word word = strsep(©_parse_string, " "); continue; } /* 
Since word is neither OR or AND, we can look it up and made a new node */ // normalize word NormalizeWord(word); /* Find the word in the hashtable */ HashTableNode *current_hash_node; // if the word does not exist in the hashtable discard entire group until new "OR" if ((current_hash_node = LookupKey(word_index, word)) == NULL) { // Make new list of list nodes ListNode *new_AND_group = calloc(1, sizeof(List)); new_AND_group->object = NULL; new_AND_group->prev = NULL; new_AND_group->next = NULL; // Remove current list ListNode *list_to_remove = all_groups->current; if (list_to_remove->prev != NULL) { (list_to_remove->prev)->next = new_AND_group; new_AND_group->prev = list_to_remove->prev; } else if (list_to_remove->prev == NULL) { all_groups->head = new_AND_group; } // set tail and current for all_groups list all_groups->tail = new_AND_group; all_groups->current = new_AND_group; AND_group = new_AND_group; // Free entire group ListNode *current_list_node = (ListNode *)list_to_remove->object; ListNode *next_list_node; while (current_list_node != NULL) { ListNode *current_list_node_pointer = current_list_node; next_list_node = current_list_node->next; free(current_list_node_pointer); current_list_node = next_list_node; } free(list_to_remove); // Look for next OR while (word != NULL && strcmp(word, "OR") != 0) { word = strsep(©_parse_string, " "); } if (word == NULL) { //remove the new lit } } // Otherwise, if there is a word, there, get the WordNode, create a new one to put in the list, else { // Get the word node from the hashtable WordNode *word_node_to_add = (WordNode *)current_hash_node->object; // Make a list node to hold new word node ListNode *new_AND_node = calloc(1, sizeof(ListNode)); new_AND_node->next = NULL; new_AND_node->object = word_node_to_add; // If the group's first word node has not been set, then make it the first if (AND_group->object == NULL) { AND_group->object = new_AND_node; new_AND_node->prev = NULL; } // else we add the list node to the end 
of the group else { // get the first list node from the group ListNode *current_list_node = AND_group->object; ListNode *next_list_node; // Get to the last list node in the group while (current_list_node->next != NULL) { next_list_node = current_list_node->next; current_list_node = next_list_node; } // Connect the last list node to the new list node current_list_node->next = new_AND_node; new_AND_node->prev = current_list_node; } } word = strsep(©_parse_string, " "); } free(parse_string_pointer); // Check the case in which there is nothing that works. ListNode *irregular_list_node = all_groups->head; WordNode *irregular_word_node = (WordNode *)irregular_list_node->object; if (irregular_word_node == NULL) { free(copy_parse_string); free(irregular_list_node); free(all_groups); return NULL; } // Check that the end of all groups is not NULL ListNode *current_group_to_check = all_groups->tail; if (current_group_to_check->object == NULL) { all_groups->tail = current_group_to_check->prev; all_groups->tail->next = NULL; free(current_group_to_check); all_groups->current = all_groups->tail; } return all_groups; }
int main(int argc, char* argv[]){ //call the check args function to check the input arguments checkArgs(argc, argv); //init the HashTable HashTable* Table = ReadFile(argv[1]); //init the array to hold all of the input words char wordArray[MAX_ROWS][MAX_ROWS][MAX_WORD_LENGTH + 1]; //init keyboard input string char line[MAX_WORD_LENGTH+1]; while (1){ //main loop printf("\nEnter your string (enter \"QUIT\" to exit the function) \n"); //accept user input. Deal with user input longer than the max line if (fgets(line, MAX_LINE, stdin)){ if (NULL == strchr(line, '\n')){ printf("Query only accepts 1000 characters\n"); eat_extra(); //"eats" characters after 1000 characters are input then exits exit(1); } } //handle when the user quits the program if (strcmp(line, "QUIT\n") == 0){ printf("Exit command reached, Cleaning memory and quitting\n"); CleanHashMemory(Table); exit(0); } // size_t length = strlen(line); // printf("length of input is %zu\n", length ); //check if the inputted line ends with AND or OR EndsWithAND(line); EndsWithOR(line); char* argv2 = argv[2]; //make sure the wordArray is cleared out between queries memset(wordArray, 0, sizeof(wordArray[0][0][0]) * 500 * MAX_ROWS * MAX_WORD_LENGTH + 1); int FinalDocMatchArray[1705] = {0}; //keep the documents ids that have matched all the criteria int FinalArrayIndex = 0; int scoreArray[1705] = {0}; //keep the scores of the FinalDocMatchArray in parallel positions int index = 0; //init variables for GetNextWord int pos = 0; int counter = 0; int andPos = 0; int andFlag = 0; int orFlag = 0; int orPos = 0; char* word; while((pos = GetNextWord(line, pos, &word)) > 0){ //go through the words in the query //if the word exists, add it to the hash table if (word != NULL && strlen(word) < MAX_WORD_LENGTH) { //check if it starts with AND or OR if (counter == 0 && (strcmp(word, "AND") == 0 || strcmp(word, "OR") == 0)){ printf("Input cannot start or end with AND or OR\n"); exit(1); } else if (strcmp(word, "AND") == 0){ // 
printf("AND detected\n"); if (andFlag == 1) { printf("Two ANDs in a row. Invalid input.\n"); exit(1); } andFlag = 1; } //detect ORs and increment position in wordArray else if (strcmp(word, "OR") == 0){ // printf("OR detected\n"); if (orFlag == 1) { printf("Two ORs in a row. Invalid input.\n"); exit(1); } orPos++; andPos = 0; orFlag = 1; } else{ NormalizeWord(word); // printf("Word is %s %i\n", word, counter); andFlag = 0; orFlag = 0; //put the word in the wordArray at the appropriate place int len = strlen(word+1); char wordCpy[len+1]; strcpy(wordCpy,word); strcpy(wordArray[andPos][orPos], wordCpy); // printf("Adding %s to array at %i %i \n",word, andPos, orPos ); andPos++; } counter++; } free(word); word = NULL; } //k is incremented every time an OR is processed int k = 0; while (strcmp(wordArray[0][k], "") != 0){ int docMatchArray[1705] = {0}; //temporary array of matching documents int docMatchArrayIndex = 0; char* firstWord = wordArray[0][k]; // printf("Word is: %s\n", firstWord); //compute jenkins hash int hashResult = JenkinsHash(firstWord, MAX_HASH_SLOT); if (Table->table[hashResult] == NULL){ printf("%s does not exist in hashTable database\n", firstWord ); exit(1); } //go through the hashtable until you find the appropriate word and documents //put it into a temporary array to be matched against else{ WordNode* node2 = Table->table[hashResult]; WordNode* dummyWord = node2; while (dummyWord != NULL){ //go through all the linked words DocumentNode *dummy_doc = dummyWord->page; if (strcmp(dummyWord->word, firstWord) == 0){//if they are the same word, go through the document nodes //go through the document nodes while (dummy_doc != NULL) { //put all of the first words docs into the temp list docMatchArray[docMatchArrayIndex] = dummy_doc->doc_id; docMatchArrayIndex++; //advance dummy_doc = dummy_doc->next; } break; //you've found the word, no need to continue to other words } else{ // printf("Did not find %s\n", firstWord ); } dummyWord = dummyWord->next; // 
printf("Advancing\n"); } } //if there's only 1 word to examine, no need to compare other words if (strcmp(wordArray[1][k], "") == 0){ //add everything in the doc match array to the FinalDocMatchArray for (int i = 0; i < docMatchArrayIndex; i ++ ){ if (docMatchArray[i] != '\0'){ int dupIndex = 0; int dupFlag = 0; while (FinalDocMatchArray[dupIndex] != '\0'){ //check if they're the same if(docMatchArray[i] == FinalDocMatchArray[dupIndex]){ // printf("FOUND A DUPLICATE for %i\n", docMatchArray[i] ); dupFlag = 1; //a duplicate was found, compute the final score and increment that element int finalScore = 0; int index3=0; // printf("docNum is %i\n",FinalDocMatchArray[index]); while(strcmp(wordArray[index3][k],"") != 0){ //for every word //go through all the words and compute the final score finalScore += ComputeScore(FinalDocMatchArray[dupIndex], Table, wordArray[index3][k]); index3++; } //put it in the score array scoreArray[dupIndex] += finalScore; finalScore = 0; break; } dupIndex++; } //if the duplicate was not found and there's only 1 word, then put everything into the final array if (dupFlag != 1) { //if a duplicate was not found in the list FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i]; FinalArrayIndex++; } } } } //if there's more than one word between the OR statements, compute the final scores for them all else{ for (int i = 0; i < 1705; i ++){ if (docMatchArray[i] != 0){ int result = 1; int m = 0;//make sure to adjust based on current position in masterList //for every doc in the docMatchArray, test if all other words contain that doc while (strcmp(wordArray[m][k], "") != 0) { //increment word //check if this word's documents and see if there's a match result = findDocMatch(docMatchArray[i], Table, wordArray[m][k]); if (result != 0){ break; //the document had no matches, skip the rest } m++; } if (result == 0){ //before you add it to the final array, check if you've already added it int dupIndex2 = 0; int dupFlag2 = 0; while 
(FinalDocMatchArray[dupIndex2] != '\0'){ //if it's already in the list, then only increment the score if(docMatchArray[i] == FinalDocMatchArray[dupIndex2]){ dupFlag2 = 1; int finalScore2 = 0; int index4 = 0; while(strcmp(wordArray[index4][k],"") != 0){//for every word // printf("Word is %s\n",wordArray[index4][k]); finalScore2 += ComputeScore(FinalDocMatchArray[dupIndex2], Table, wordArray[index4][k]); index4++; } scoreArray[dupIndex2] += finalScore2; //increment the appropriate score finalScore2 = 0; break; } dupIndex2++; } //otherwise, add it to end of the Final Array if (dupFlag2 != 1){ FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i]; FinalArrayIndex++; } } } } } //compute the scores for all the non-duplicates toward the end of the array int finalScore = 0; while (FinalDocMatchArray[index] != '\0'){ //for every doc that matches all AND words int index2=0; // printf("docNum is %i\n",FinalDocMatchArray[index]); while(strcmp(wordArray[index2][k],"") != 0){//for every word // printf("Word is %s\n",wordArray[index2][k]); finalScore += ComputeScore(FinalDocMatchArray[index], Table, wordArray[index2][k]); index2++; } // printf("Score for %i is %i\n",FinalDocMatchArray[index], finalScore); //put it in the score array scoreArray[index] = finalScore; finalScore = 0; index++; } k++; //increment OR position } //sort the Final Array BubbleSort(FinalDocMatchArray, scoreArray, argv2); }//loop back to string entry } //end main
// takes a char* input_line, a QUERY** queries, and a pointer to an int num_queries // parses input_line for QUERYs, placing them into queries, and incrementing // num_queries as it does so // returns -1 if input_line is bad (empty, ends in "OR") // returns 1 if input_line == "q" (quit command) // returns 0 if successful int pullQueries(char* input_line, QUERY** queries, int* num_queries) { char *current_keywords[MAX_NUM_KEYWORDS]; char *word; int current_index; QUERY* query; int position; word = malloc(MAX_KEYWORD_LENGTH*sizeof(char)); BZERO(word, MAX_KEYWORD_LENGTH*sizeof(char)); *num_queries = 0; current_index = 0; // corresponds to index of current_keywords position = 0; // matches index in input_line // getNextWord parses the input_line for a word, storing it into word // works just like getNextURL while((position = getNextWord(input_line, word, position)) != -1) { word[strlen(word)] = '\0'; // if quit command if(current_index == 0 && strcmp(word, "q") == 0) { free(word); return 1; } // if OR (a new QUERY is about to begin) if(strcmp(word, "OR") == 0) { query = malloc(sizeof(QUERY)); MALLOC_CHECK(query); // for each keyword in current_keywords, put it into the search_words // parameter of query for(int i = 0; i < current_index; i++) { query->search_words[i] = malloc(MAX_KEYWORD_LENGTH*sizeof(char)); BZERO((query->search_words)[i], MAX_KEYWORD_LENGTH*sizeof(char)); strncpy((query->search_words)[i], current_keywords[i], MAX_KEYWORD_LENGTH*sizeof(char)); free(current_keywords[i]); } // include a null-terminator just in case query->search_words[current_index] = NULL; // place the new query into queries (while incrementing num_queries) queries[(*num_queries)++] = query; // empty current_keywords and reset its index current_index BZERO(current_keywords, MAX_NUM_KEYWORDS); current_index = 0; } // if it's a regular keyword else { // make it lower case NormalizeWord(word); // add it to current_keywords while incrementing its index current_index 
current_keywords[current_index] = malloc(MAX_KEYWORD_LENGTH*sizeof(char)); BZERO(current_keywords[current_index], MAX_KEYWORD_LENGTH*sizeof(char)); strncpy(current_keywords[current_index++], word, MAX_KEYWORD_LENGTH*sizeof(char)); } // empty the word out BZERO(word, MAX_KEYWORD_LENGTH*sizeof(char)); } free(word); // if current_index = 0, that means the last word in input_line was "OR" // and therefore the input is bad if(current_index == 0) { for(int i = 0; i < *num_queries; i++) { query = queries[i]; position = 0; while((word = (query->search_words)[position++]) != NULL) free(word); free(query); } return -1; } // otherwise, the last QUERY hasn't been created yet, and we need to make it // in the same way query = malloc(sizeof(QUERY)); MALLOC_CHECK(query); for(int i = 0; i < current_index; i++) { query->search_words[i] = malloc(MAX_KEYWORD_LENGTH*sizeof(char)); BZERO((query->search_words)[i], MAX_KEYWORD_LENGTH); strncpy((query->search_words)[i], current_keywords[i], MAX_KEYWORD_LENGTH); free(current_keywords[i]); } query->search_words[current_index] = NULL; queries[(*num_queries)++] = query; return 0; }
/*
 * processQuery -- evaluate one query line against the index.
 *
 * query - the input line; it must still contain its trailing newline and is
 *         tokenised in place with strtok on spaces and newlines
 * index - the hashtable searched through get_value()
 *
 * Words next to each other, or joined by AND, are intersected into a pending
 * list. OR merges the pending list into the accumulated result with
 * unionize() and starts a new pending list.
 *
 * Returns:
 *   - NULL when the line has no newline (too long) or starts with AND/OR
 *   - init_list() when the line holds no tokens
 *   - otherwise unionize() of the accumulated result and the pending list
 */
WordNode * processQuery(char * query, HashTable *index){
    //TODO: eat leftover from the buffer
    if (strchr(query, '\n') == NULL) {
        printf("Whoa there. You entered too many characters. Query must exit.");
        return NULL;
    }

    printf("\n");

    char * token = strtok(query, " \n");

    if (token == NULL) {
        printf("No input specified. \n");
        printf("QUERY>: ");
        return init_list();
    }

    if (strcmp("OR", token) == 0 || strcmp("AND", token) == 0) {
        fprintf(stderr, "Invalid input. AND/OR cannot start your search query. please use a non-operator word to search \n");
        printf("QUERY>: ");
        return NULL;
    }

    /* pending operator: -1 = first word, 0 = AND, 1 = OR */
    int pending_op = -1;
    WordNode * results = NULL;  /* union of all finished OR groups */
    WordNode * group = NULL;    /* intersection built for the current group */

    for (; token != NULL; token = strtok(NULL, " \n")) {
        if (strcmp("OR", token) == 0) {
            pending_op = 1;
            continue;
        }
        if (strcmp("AND", token) == 0) {
            pending_op = 0;
            continue;
        }

        /* switch to lowercase */
        NormalizeWord(token);

        switch (pending_op) {
        case 1:
            /* close the current group, then start a new one with this word */
            if (results) {
                results = unionize(results, group);
            } else {
                results = group;
            }
            group = make_copy((WordNode *) get_value(token, index));
            break;

        case 0:
            /* narrow the current group */
            group = intersection(group, make_copy(get_value(token, index)));
            break;

        default:
            /* very first word */
            group = make_copy((WordNode *) get_value(token, index));
            break;
        }

        /* a word followed directly by another word means AND */
        pending_op = 0;
    }

    /* merge the last group into the result */
    results = unionize(results, group);
    return results;
}
int main (int argc, char **argv) { /* Check Arguments */ if (!CheckArguments(argc, argv)) { exit(-1); } /* Make variables for all things needed for indexer and indexer testing */ char *page_directory; char *index_filename; char *read_index_filename; char *new_index_filename; // If argument count is 3 initialize only 2 variables else initialize all page_directory = argv[1]; index_filename = argv[2]; // Initialize hashtable, word node, and document node HashTable *index_hashtable = calloc(1, sizeof(HashTable)); /*Make array to hold filenames (just document numbers) and use GetFilenamesInDir to grab all names */ char **filename_array; int number_of_files; if ((number_of_files = GetFilenamesInDir(page_directory, &filename_array)) < 0) { fprintf(stderr, "Could not get filenames in page directory. Exiting Now.\n"); exit(-1); } /* Add page_directory to the front of the filenames */ for (int i = 0; i < number_of_files; i++) { // Make pointe to current string in filename_array char *previous_string = filename_array[i]; // Get length of full string and initialize element of filename_array to that size int len = strlen(page_directory) + strlen(previous_string) + 1; char *new_string = calloc(len, sizeof(char)); // Make new string and free previous string strcpy(new_string, page_directory); strcat(new_string, previous_string); if (previous_string) free(previous_string); filename_array[i] = new_string; } /* Populate the index data structure from the words on each doc * Then Save to an index file */ for (int i = 0; i < number_of_files; i++) { /* Check that the filenames are digits */ int continue_flag = 0; char *digit_string = filename_array[i] + strlen(page_directory); // Check that every character in the filename is a digit for (int j = 0; j < strlen(digit_string); j++) { if (!isdigit(digit_string[j])) { fprintf(stderr, "This file %s contains something other than a digit \n", filename_array[i]); continue_flag = 1; } } if (continue_flag ==1) continue; // Check that each file in 
the filename array is a good file char *file_name = filename_array[i]; if (!IsFile(file_name)) { fprintf(stderr, "not file\n"); continue; } // Get contents of file into a string char *document = LoadDocument(file_name); if (document == NULL) { continue; } // Get DocumentID of file (check if bad) int document_id = GetDocumentId(file_name, page_directory); if (document_id < 0) { fprintf(stderr, "Error when converting document id char to integer\n"); continue; } // Use GetNext word, with pos variable and buffer, to get every word and add the word to the data structure int pos = 0; char *word_buffer; while ((pos = GetNextWord(document, pos, &word_buffer)) > 0) { // Update the index for each word // Normalize word then update index with that word NormalizeWord(word_buffer); UpdateIndex(word_buffer, document_id, index_hashtable); free(word_buffer); } // free the string containing the html and the word in filenamearray free(document); } /* Save to index file, and check that it actually went well */ if (!SaveIndexToFile(index_hashtable, index_filename)) { fprintf(stderr, "Could not save index hashtable to file\n"); exit(-1); } for (int i = 0; i < number_of_files; i++) { free(filename_array[i]); } free(filename_array); FreeHashTable(index_hashtable); if (argc == 3) { ; } /* Read index file into data strucutres and save to new index file */ else { // Assign 2 filenames read_index_filename = argv[3]; new_index_filename = argv[4]; // Read index file into data structures HashTable *read_index = ReadFile(read_index_filename); if (read_index == NULL) { fprintf(stderr, "Error when reading index file into data structures.\n"); exit(-1); } // Save index data structures into new file if (!SaveIndexToFile(read_index, new_index_filename)) { fprintf(stderr, "Could not save read index file into new index file\n"); exit(-1); } FreeHashTable(read_index); } return 0; }
//return 1 if successful, 0 otherwise int buildIndexFromDirectory(char *dir, HashTable *hashtable) { char **filenames = NULL; int num_files = 0; //get the file names num_files = GetFilenamesInDir(dir, &filenames); if (num_files < 0) { fprintf(stderr, "Error: Unable to obtain files in directory\n"); free(filenames); return 0; } //for each of the files in the directory, read and add to the indexer for(int i = 0; i < num_files; i++){ //allocate a char array, directoryname/filename char *file_name = malloc((strlen(dir) + strlen(filenames[i])) * (sizeof(char))); //makes filename based on whether the directoryname has a slash at the end or not if (dir[strlen(dir)-1] == '/') sprintf(file_name, "%s%s", dir, filenames[i]); else sprintf(file_name, "%s/%s", dir, filenames[i]); FILE *file = fopen(file_name, "rb"); if (file) { fseek(file, 0, SEEK_END); long html_len = ftell(file); fseek(file, 0, SEEK_SET); //Get rid of the first 2 lines //get rid of url int offset = 0; char character; do{ character = fgetc(file); offset++; } while(character != '\n' && character != EOF); fseek(file, offset, SEEK_SET); //get rid of depth do{ character = fgetc(file); offset++; } while(character != '\n' && character != EOF); fseek(file, offset, SEEK_SET); //read in the file char *html = malloc(html_len * (sizeof(char))); fread(html, sizeof(char), html_len, file); fclose(file); int pos = 0; char *word = NULL; //errstring will hold error message from function, char **errstring = &errstring, it is the mailbox int doc_id = strtol(filenames[i], NULL, 10); if (doc_id == 0) { fprintf(stderr, "Error: %s is an invalid crawler filename.\n", filenames[i]); continue; } //get the words while((pos = GetNextWord(html, pos, &word)) > 0){ NormalizeWord(word); addToHashTable(hashtable, word, doc_id); //add word to indexer free(word); word = NULL; } free(html); } free(file_name); if (filenames[i]) free(filenames[i]); } free(filenames); return 1; }
/* pseudocode 1) Get first word 2) check to make sure its not 'AND' or 'OR' or no word 3) Normalize word 4) store all DocNodes in Templist 5) WHILE(not end of input) DO - get next word, w - IF (word is AND) THEN - FLAG is 1 - continue (return to top of loop) - ELSE IF (word is OR) THEN - FLAG is 2 - continue (return to top of loop) - ELSE (i.e. it is a word) - IF (FLAG > 2) THEN - IF ( First add to final list) THEN - FinalList = TempList ELSE - AND FinalList and TempList FI - Free TempList - Normalize w - store all DocNodes for w in Templist FI - IF (FLAG == 2 ) THEN - normalize w - OR ( Templist and w DocNodes) FI - FLAG = 0 DONE 6) Free allocated memory 7) return Final List */ DocumentNode *getList(char *input){ char *w; int pos,FLAG,count; DocumentNode *TempList,*FL; //malloc space for 2 words w = (char *)malloc(MAX_WORD_LENGTH); MALLOC_CHECK(w); BZERO(w,MAX_WORD_LENGTH); pos = 0; count = 0; FLAG = 0; //while not end //first call, can't start with 'AND' or 'OR' if( ((pos = getWord(input,w,pos)) < 0) || (strcmp(w,"AND") == 0) || (strcmp(w,"OR") == 0) ){ LOG("BAD INPUT!"); exit(-1); } //normalize/get all DocNodes with first word NormalizeWord(w); TempList = getDocList(w); BZERO(w,MAX_WORD_LENGTH); //while not end of string while( (pos = getWord(input,w,pos)) > 0){ // Input checking and operation determination //not OR if (strcmp(w,"OR") !=0){ //word is AND if( strcmp(w,"AND") == 0){ //Bad input -- exit if( FLAG > 0 ){ LOG("Cannot input two operators in a row"); exit(-1); } FLAG = 1; BZERO(w,MAX_WORD_LENGTH); continue; } } // word is OR else { //bad input -- exits file if ( FLAG > 0 ){ LOG("Cannot input two operators in a row"); exit(-1); } //we are good FLAG = 2; BZERO(w,MAX_WORD_LENGTH); continue; } //Word is actually a word and not 'AND' or 'OR' //now will update the finalList or Templist // either a space or AND previously if( FLAG < 2 ){ //if first call to Final List if( count == 0){ FL = TempList; count = 1; TempList = NULL; } //Already initilized the final 
list else{ FL = AND(FL,TempList); } //empy the templist //store new word in the templist NormalizeWord(w); freeDocNodeList(TempList); TempList = getDocList(w); } //previously was an OR //need to add word to old word list if( FLAG == 2 ){ NormalizeWord(w); TempList = OR(TempList,w); } //set flag to zero (i.e. last getWord was a word and not a switch FLAG = 0; BZERO(w,MAX_WORD_LENGTH); } //all words parsed out of string //if no calls to the final list yet if( count == 0 ){ FL = TempList; count = 1; TempList = NULL; } //end of string is always AND else{ FL=AND(FL,TempList); } //free allocated memory free(w); freeDocNodeList(TempList); //return DocNode List return FL; }
int main(int argc, char* argv[]) { //check argument number if (argc < 3 || argc > 4) { printf("too many or too little arguments, please try again"); exit(0); } //check directory validity if (!IsDir(argv[1])) { printf("invalid directory, please try again"); exit(0); } //Initialize variables and index int docId; int pos; char *doc; char **filenames = NULL; int num_files = 0; HashTable *WordsFound = calloc(1, sizeof(HashTable)); num_files = GetFilenamesInDir(argv[1], &filenames); //check whether the folder has files if (num_files < 0) { printf("failed to get any filenames"); exit(0); } //iterate through each file in the directory for (int i = 0; i < num_files; i++) { //check that the file is in the correct format (title is a number) int filechecker = 0; for (int c = 0; c < strlen(filenames[i]); c++) { if (!isdigit(filenames[i][c])) { filechecker = 1; } } if (filechecker == 1) { continue; } //Load the document char *word; char file[100]; strcpy(file, argv[1]); strcat(file, filenames[i]); doc = LoadDocument(file); docId = GetDocumentId(filenames[i]); free(filenames[i]); pos = 0; //Iterate through each word in the html file (doc) while ((pos = GetNextWord(doc, pos, &word)) > 0) { NormalizeWord(word); if (InHashTable(word, WordsFound) == 0) { AddToHashTable(word, WordsFound); UpdateHashTable(word, docId, WordsFound); } else { UpdateHashTable(word, docId, WordsFound); free(word); } } free(doc); } free(filenames); SaveIndexToFile(argv[2], WordsFound); //Save the index to the file specified FreeHashTable(WordsFound); //only proceed if there was a third argument specified. If so, reload the index form the file you just created if (argc == 4) { HashTable *ReloadedIndex = ReadFile(argv[2]); SaveIndexToFile(argv[3], ReloadedIndex); FreeHashTable(ReloadedIndex); } return 0; }
int main(int argc, char* argv[]){ int success; // contains 1 if removing from SinLL was successful int funcSuccess; int orNext; // contains > 0 if the next word in query should be ORed int firstAdd; // contains > 0 if the addition to SinLL is the first addition int tempChar; // used to flush the stdin for too long inputs char query[MAX_QUERY_LEN]; // contains string of query char *getsSuccess; // determines if EOF is met. int status = 1; SinLL *wordList; if(argc != 3){ // invalid number of arguments fprintf(stderr, ANSI_COLOR_RED "Usage: query [INDEXER OUTPUT FILE] [CRAWLER OUTPUT FILE DIRECTORY]" ANSI_COLOR_RESET "\n"); return 0; } if(!(access(argv[1], F_OK) != -1)){ // invalid file fprintf(stderr, ANSI_COLOR_RED "First argument is not a valid file." ANSI_COLOR_RESET "\n"); return 0; } if(!IsDir(argv[2])){ // invalid "directory" fprintf(stderr, ANSI_COLOR_RED "Second argument is not a directory." ANSI_COLOR_RESET "\n"); return 0; } HashTable *invertedIndex; invertedIndex = calloc(1, sizeof(HashTable)); if(!invertedIndex){ status = 0; goto cleanup; } funcSuccess = readFile(argv[1], invertedIndex); // recreate inverted index if(!funcSuccess){ status = 0; goto cleanup; } while(1){ // get the query from user fputs("QUERY> ", stdout); fflush(stdout); getsSuccess = fgets(query, sizeof(char)*MAX_QUERY_LEN, stdin); if(!getsSuccess) break; // EOF means exiting program // this means the user input more than MAX_QUERY_LEN characters to query if(getsSuccess[strlen(getsSuccess)-1] != '\n'){ fprintf(stderr, ANSI_COLOR_RED "Query length is over the maximum 1000 characters!" ANSI_COLOR_RESET "\n"); while((tempChar = getchar()) != '\n' && tempChar != EOF){ /*do nothing*/ } continue; } // at this stage, the next add is the first add, and we have not seen a // OR yet. 
orNext = 0; firstAdd = 1; wordList = CreateSinLL(); if(!wordList) break; char *wordP; wordP = strtok(query," "); // get all the words from the query while(wordP){ // last word in query will have a \n attached to it, so if // there is a \n at the end of a word, take that out if(wordP[strlen(wordP)-1] == '\n'){ wordP[strlen(wordP)-1] = 0; } // ignore ANDs. if(strcmp(AND, wordP) == 0){ wordP = strtok (NULL, " "); continue; } // ignore ORs but make sure you OR the next coming word. if(strcmp(OR, wordP) == 0){ orNext = 1; wordP = strtok (NULL, " "); continue; } // make word lowercase. If this word is the first one, or // the previous word was OR, make a new node in the SinLL // of WordChainList NormalizeWord(wordP); if(firstAdd){ funcSuccess = appendNewWordChain(wordP, wordList); if(!funcSuccess) break; firstAdd = 0; } else if(orNext){ funcSuccess = appendNewWordChain(wordP, wordList); if(!funcSuccess) break; orNext = 0; } // if not the previous two cases, just append the word to // current node. else{ appendWord(wordP, wordList); } wordP = strtok (NULL, " "); } // first process will AND all the words contained in each WordChainNodes // of the list. WordChainNode *curWordChain = wordList->head; while(curWordChain){ // while there are more nodes firstAdd = 1; DocNode *tempProcessDocNode; // contains original DocNodes to AND from index DocNode *processDocNode; // contains copied version of above. WordsLL *wordsProc = curWordChain->words; // gettng first set of words. while(wordsProc){ // while there are more words // get DocNodes associated with that word from the inverted index and // copy it as to not mess up the inverted index. tempProcessDocNode = DocsFromWordNode(wordsProc->word, invertedIndex); processDocNode = CopyDocs(tempProcessDocNode); // merge the above DocNodes with the DocNodes saved at the current // WordChainNode. DocMergedID(&processDocNode, &(curWordChain->docs)); // if it is the first add, we want to skip this step. 
If it isnt the // first add, and the above DocNodes with the ocNodes saved at the current // WordChainNode. if(!firstAdd){ ProcessAND(&processDocNode); } // Add the processed (ANDed) DocNode chain at the current // WordChainNode. AddDocNodeChain(curWordChain, processDocNode); // iterate through to the next word at the current node. wordsProc = wordsProc->nextWord; firstAdd = 0; } // move on to the next node. curWordChain = curWordChain->nextWords; } // now we OR each individual WordChainNodes' DocNode lists. curWordChain = wordList->head; DocNode *curDocs; DocNode *nextDocs; success = removeTopDoc(wordList, &curDocs); // gets the DocNode list from the first node // if there you fail here, it means that the list is empty/ if(success){ success = removeTopDoc(wordList, &nextDocs); // gets the next DocNode list from // the next WordChainNode while(success){ // if you fail here, there was only one WordChainNode in the list // process the DocNodes together by ORing them DocMergedID(&curDocs, &nextDocs); ProcessOR(&curDocs); // move on to the next DocNodes from the next WordChainNode. success = removeTopDoc(wordList, &nextDocs); } } // the list was empty, so found nothing. else{ printf("Found 0 pages\n"); continue; } // sort by the rank and print the results. SortByRank(&curDocs); PrintQueryResult(curDocs, argv[2]); free(wordList); // clean up for next query } cleanup: if(invertedIndex) DeleteHashTable(invertedIndex); // final clean up if(!status){ fprintf(stderr, ANSI_COLOR_RED "Failed inverted index building." ANSI_COLOR_RESET "\n"); return 0; } return 1; }
int main(int argc, char* argv[]){ // Declare variables------------------------------------------------------------- HashTable Index; // Inverted index InitialiseHashTable(&Index); char text[MAXLEN]; int test = 0; // 1. Check input parameters-------------------------------------------------------- if (argc != 3 ){ // check number of arguments fprintf(stderr,"Error: Incorrect number of input argument\n"); return -1; }else if(!IsFile(argv[1])){ // check if file is valid fprintf(stderr,"Error: File %s is invalid\n", argv[1]); return -1; }else if(!IsDir(argv[2])){ // check if directory is valid fprintf(stderr,"Error: Directory %s cannot be found\n", argv[2]); return -1; } // 2. Reconstruct Inverted Index----------------------------------------------------- printf("Please wait while the query engine is loading. It might take a few minutes... \n"); if(!ReadFile(&Index, argv[1])){ CleanUpHash(&Index); return -1; } // 3. Command Line interface and query ----------------------------------------------- for(int j=0; j<9; j++){ // Create text array for automated testing switch (j){ case 0: printf("\n3.%d Test invalid input syntax\n",j+1); printf("QUERY :> AND dog\n"); strcpy(text,"AND dog\n"); break; case 1: printf("\n3.%d Test invalid input syntax\n", j+1); printf("QUERY :> cat OR AND dog\n"); strcpy(text,"cat OR AND dog\n"); break; case 2: printf("\n3.%d Test no result\n", j+1); printf("QUERY :> thisisrandom\n"); strcpy(text,"thisisrandom\n"); break; case 3: printf("\n3.%d Test single entry\n", j+1); printf("QUERY :> incredible\n"); strcpy(text,"incredible\n"); break; case 4: printf("\n3.%d Test uppercase\n", j+1); printf("QUERY :> Incredible\n"); strcpy(text,"Incredible\n"); break; case 5: printf("\n3.%d Test AND\n", j+1); printf("QUERY :> Dartmouth AND College AND Computer AND Science\n"); strcpy(text,"Dartmouth AND College AND Computer AND Science\n"); break; case 6: printf("\n3.%d Test space as AND\n", j+1); printf("QUERY :> Dartmouth College Computer Science\n"); 
strcpy(text,"Dartmouth College Computer Science\n"); break; case 7: printf("\n3.%d Test OR\n", j+1); printf("QUERY :> Dartmouth OR Computer\n"); strcpy(text,"Dartmouth OR Computer\n"); break; case 8: printf("\n3.%d Test combined\n", j+1); printf("QUERY :> Dartmouth College AND Hanlon OR Mathematics AND Computer Science AND Philosophy OR incredibles Pixar\n"); strcpy(text,"Dartmouth College AND Hanlon OR Mathematics AND Computer Science AND Philosophy OR incredibles Pixar\n"); break; } // a) declare variables int unionflag, flag, size_temp, size_intersect, size_final, count; char wordarray[MAXLEN][MAXLEN]; int temparray[MAXSIZE][2], intersect[MAXSIZE][2], final[MAXSIZE][2]; // b) instantiate variables size_temp = size_intersect = size_final = unionflag = flag = 0; count = StringToWord(wordarray,text); // c) query for(int i=0; i<count; i++){ if(i==0 && strcmp(wordarray[i],"AND") && strcmp(wordarray[i],"OR")){ // if it's the first word and is not invalid NormalizeWord(wordarray[i]); size_intersect = FindHash(wordarray[i], intersect, Index); continue; }else if(i==0){ // if it is first word and invalid flag = 1; break; }else if(unionflag){ if(strcmp(wordarray[i],"AND") && strcmp(wordarray[i],"OR")){ NormalizeWord(wordarray[i]); size_intersect = FindHash(wordarray[i], intersect, Index); unionflag = 0; continue; }else{ flag = 1; break; } } if (!strcmp(wordarray[i],"AND")){ // if it's AND if(CheckOperator(wordarray,i,count)){ NormalizeWord(wordarray[i+1]); size_temp = FindHash(wordarray[i+1], temparray, Index); size_intersect = FindIntersection(intersect, size_intersect, temparray, size_temp); i++; continue; }else{ flag = 1; break; } }else if(!strcmp(wordarray[i],"OR")){ // if it's OR if(CheckOperator(wordarray,i,count)){ size_final = FindUnion(final, size_final, intersect, size_intersect); size_intersect = 0; unionflag = 1; continue; }else{ flag = 1; break; } }else{ NormalizeWord(wordarray[i]); size_temp = FindHash(wordarray[i], temparray, Index); size_intersect = 
FindIntersection(intersect, size_intersect, temparray, size_temp); continue; } }