/*
 * ProcessWellFormedWord
 * ---------------------
 * Records one occurrence of `word` in article `a` in the `wordHash` index.
 *
 *  - If `word` is found in `stopWords`, nothing is recorded.
 *  - If `word` already has an entry in `wordHash`, that entry's article list
 *    is updated through UpdateOccurences().
 *  - Otherwise a new entry is created. It owns a private copy of the word and
 *    a fresh article vector holding a copy of `*a` with numOccurrences set
 *    to 1. The entry is copied into the hashset by HashSetEnter(), which
 *    thereby takes over the string and the vector.
 *
 * Both lookups are done with the caller's `word` pointer, so the string copy
 * and the article vector are only allocated when a new entry is really
 * created (previously they were allocated up front and thrown away for stop
 * words and for words that were already indexed).
 *
 * `articlesSeen` is not used here; it is kept so existing callers compile.
 */
static void ProcessWellFormedWord(char *word, article *a, hashset *stopWords,
                                  hashset *wordHash, hashset *articlesSeen)
{
    currWord key;
    currWord *existing;

    (void)articlesSeen;

    /* Stop words are never indexed. The stop-word set is probed with the
       address of a char *, exactly as before. */
    if (HashSetLookup(stopWords, &word) != NULL)
        return;

    /* Probe the index with a temporary key that merely borrows `word`. */
    memset(&key, 0, sizeof key);
    key.thisWord = word;
    existing = (currWord *)HashSetLookup(wordHash, &key);
    if (existing != NULL) {
        /* Word already indexed: only its article list needs updating. */
        UpdateOccurences(&existing->articles, a);
        return;
    }

    /* First sighting: build an entry that owns its own storage. */
    key.thisWord = strdup(word);
    if (key.thisWord == NULL)
        return; /* out of memory: skip the word rather than index a NULL key */

    VectorNew(&key.articles, sizeof(article), NULL, 100);
    a->numOccurrences = 1;
    VectorAppend(&key.articles, a);
    HashSetEnter(wordHash, &key);
}
/** * Function: TestHashTable * ----------------------- * Runs a test of the hashset using a frequency structure as the element * type. It will open a file, read each char, and count the number of * times each char occurs. Tests enter, lookup, and mapping for the hashset. * Prints contents of table to stdout. Then it dumps all the table elements * into a vector and sorts them by frequency of occurrences and * prints the array out. Note that this particular stress test passes * 0 as the initialAllocation, which the vector is required to handle * gracefully - be careful! */ static void TestHashTable(void) { hashset counts; vector sortedCounts; HashSetNew(&counts, sizeof(struct frequency), kNumBuckets, HashFrequency, CompareLetter, NULL); fprintf(stdout, "\n\n ------------------------- Starting the HashTable test\n"); BuildTableOfLetterCounts(&counts); fprintf(stdout, "Here is the unordered contents of the table:\n"); HashSetMap(&counts, PrintFrequency, stdout); // print contents of table VectorNew(&sortedCounts, sizeof(struct frequency), NULL, 0); HashSetMap(&counts, AddFrequency, &sortedCounts); // add all freq to array VectorSort(&sortedCounts, CompareLetter); // sort by char fprintf(stdout, "\nHere are the trials sorted by char: \n"); VectorMap(&sortedCounts, PrintFrequency, stdout); VectorSort(&sortedCounts, CompareOccurrences); //sort by occurrences fprintf(stdout, "\nHere are the trials sorted by occurrence & char: \n"); VectorMap(&sortedCounts, PrintFrequency, stdout); // print out array VectorDispose(&sortedCounts); // free all storage HashSetDispose(&counts); }
/*
 * HashSetDispose
 * --------------
 * Releases all storage owned by the hashset `h`: every bucket vector is
 * disposed, then the bucket array itself is freed.
 *
 * The buckets are disposed unconditionally.  Skipping VectorDispose when
 * h->freefn is NULL (as was done before) meant no bucket was ever disposed
 * for sets without a free function, leaking whatever each bucket vector
 * had allocated.  Whether per-element cleanup runs is a decision for
 * VectorDispose and the free function the bucket was created with, not for
 * this routine.
 */
void HashSetDispose(hashset *h)
{
    for (int i = 0; i < h->numBuckets; i++) {
        VectorDispose(h->buckets + i);
    }
    free(h->buckets);
}
/*
 * HashSetDispose
 * --------------
 * Tears down the hashset `h`: each of its numBuckets bucket vectors is
 * handed to VectorDispose in index order, after which the bucket array
 * itself is freed.
 */
void HashSetDispose(hashset *h)
{
    vector *bucket = h->buckets;
    int remaining;

    for (remaining = h->numBuckets; remaining > 0; remaining--) {
        VectorDispose(bucket);
        bucket++;
    }

    free(h->buckets);
}
/*
 * ChallengingTest
 * ---------------
 * Heavier vector workout on a vector of longs that starts with an initial
 * allocation of 4: fill it via InsertPermutationOfNumbers (driven by the two
 * prime constants), sort it, empty it again with DeleteEverythingVerySlowly,
 * and finally dispose of it.
 */
static void ChallengingTest()
{
    vector numbers;

    fputs("\n\n------------------------- Starting the more advanced tests...\n", stdout);

    VectorNew(&numbers, sizeof(long), NULL, 4);

    InsertPermutationOfNumbers(&numbers, kLargePrime, kEvenLargerPrime);
    SortPermutation(&numbers);
    DeleteEverythingVerySlowly(&numbers);

    VectorDispose(&numbers);
}
/*
 * cleanThreadData
 * ---------------
 * Releases the threading resources held by `db`: the thread vector, the
 * per-server connection-limit hashset, the four mutexes and the connections
 * semaphore.
 */
static void cleanThreadData(rssDatabase *db)
{
    /* Mutexes to destroy, listed in the order they are torn down. */
    pthread_mutex_t *const mutexes[] = {
        &db->locks.serverDataLock,
        &db->locks.articlesVectorLock,
        &db->locks.indicesHashSetLock,
        &db->locks.stopWordsHashSetLock
    };
    const int numMutexes = (int)(sizeof(mutexes) / sizeof(mutexes[0]));
    int i;

    /* Must stay first: per the original author's note, disposing the thread
       vector is what executes pthread_join() on the worker threads. */
    VectorDispose(&db->threads);

    HashSetDispose(&db->locks.limitConnToServerLock);

    for (i = 0; i < numMutexes; i++) {
        pthread_mutex_destroy(mutexes[i]);
    }

    sem_destroy(&db->locks.connectionsLock);
}
/*
 * SimpleTest
 * ----------
 * Basic vector checks on a vector of chars that starts with an initial
 * allocation of 4.  Runs the append, sort/search, at, insert/delete and
 * replace tests in that order on the same vector, then disposes of it.
 */
static void SimpleTest()
{
    vector letters;

    fputs(" ------------------------- Starting the basic test...\n", stdout);

    VectorNew(&letters, sizeof(char), NULL, 4);

    TestAppend(&letters);
    TestSortSearch(&letters);
    TestAt(&letters);
    TestInsertDelete(&letters);
    TestReplace(&letters);

    VectorDispose(&letters);
}
/*
 * QueryIndices
 * ------------
 * Interactive query loop.  Repeatedly prompts on stdout, reads one line
 * from stdin and passes it (without its trailing newline) to
 * ProcessResponse.  The loop ends on an empty line, on end-of-file, or on a
 * read error.  Afterwards the indices hashset, the vector of previously
 * seen articles and the stop-word hashset of `db` are disposed, so `db`
 * must not be queried again after this returns.
 *
 * Input handling notes:
 *  - fgets returning NULL (EOF / error) now ends the loop instead of
 *    operating on a stale or uninitialised buffer.
 *  - Only a real trailing '\n' is stripped; a line without one (EOF without
 *    newline, or input longer than the buffer) keeps its last character,
 *    and an empty string no longer causes a write before the buffer.
 */
static void QueryIndices(rssDatabase *db)
{
    char response[1024];

    while (true) {
        printf("Please enter a single query term that might be in our set of indices [enter to quit]: ");
        if (fgets(response, sizeof(response), stdin) == NULL)
            break;

        response[strcspn(response, "\n")] = '\0';
        if (response[0] == '\0')
            break;

        ProcessResponse(db, response);
    }

    HashSetDispose(&db->indices);
    VectorDispose(&db->previouslySeenArticles);
    HashSetDispose(&db->stopWords);
}
/*
 * QueryIndices
 * ------------
 * Interactive query loop.  Repeatedly prompts on stdout, reads one line
 * from stdin and passes it (without its trailing newline) to
 * ProcessResponse together with `allData`.  The loop ends on an empty
 * line, on end-of-file, or on a read error.  Afterwards the indices
 * hashset, the stop-word hashset and the explored vector of `allData` are
 * disposed, so `allData` must not be queried again after this returns.
 *
 * Input handling notes:
 *  - fgets returning NULL (EOF / error) now ends the loop instead of
 *    operating on a stale or uninitialised buffer.
 *  - Only a real trailing '\n' is stripped; a line without one (EOF without
 *    newline, or input longer than the buffer) keeps its last character,
 *    and an empty string no longer causes a write before the buffer.
 */
static void QueryIndices(rssData *allData)
{
    char response[1024];

    while (true) {
        printf("Please enter a single query term that might be in our set of indices [enter to quit]: ");
        if (fgets(response, sizeof(response), stdin) == NULL)
            break;

        response[strcspn(response, "\n")] = '\0';
        if (response[0] == '\0')
            break;

        ProcessResponse(response, allData);
    }

    /* free the memory when we're finished */
    HashSetDispose(&allData->indices);
    HashSetDispose(&allData->stopwords);
    VectorDispose(&allData->explored);
}
/* convert text to tokens, remove tags and convert back to string: tokens and keep all positions in array "positions" array: text, tokens, positions must be released by the caller program. */ void convertToken2Text( char*text, char* tokens, int** positions) { cvector tokenVector; int* p; char* t = tokens; Token *token; int i,j,len; assert(text); assert(tokens); assert(positions); VectorNew (&tokenVector, sizeof (Token),free_token, DEF_VECTOR_SIZE); getTokensFromText(text, &tokenVector); assert (text); for (i=0;i<tokenVector.ItemsCount;i++) { token = (Token*)VectorNth(&tokenVector,i); sprintf( t, "%s", token->term); t+=strlen(token->term); //append ' ' to the end of each token if (i!=tokenVector.ItemsCount) *t++=' '; } *t='\0'; //record the position of token in the text *positions = (int*)malloc( sizeof(int) * strlen(text) ); if (!*positions) { printf("Not enough memory!\n"); exit(0); } p=*positions; for (i=0;i<tokenVector.ItemsCount;i++) { token = (Token*)VectorNth(&tokenVector,i); len = strlen(token->term); //printf("\ntoken: %s\n", token->term); //record position of the token (each char will has a position, and value is the positon of first character of the token) for (j=0;j<len;j++) { *p++ = token->position+j; //printf("%c->%d ",*(text+token->position+j), token->position+j); } if (i!=tokenVector.ItemsCount-1) *p++= token->position+j-1; //position as next token } VectorDispose(&tokenVector); }
static void MemoryTest() { int i; const char * const kQuestionWords[] = {"who", "what", "where", "how", "why"}; const int kNumQuestionWords = sizeof(kQuestionWords) / sizeof(kQuestionWords[0]); vector questionWords; char *questionWord; fprintf(stdout, "\n\n------------------------- Starting the memory tests...\n"); fprintf(stdout, "Creating a vector designed to store dynamically allocated C-strings.\n"); VectorNew(&questionWords, sizeof(char *), FreeString, kNumQuestionWords); fprintf(stdout, "Populating the char * vector with the question words.\n"); for (i = 0; i < kNumQuestionWords; i++) { questionWord = malloc(strlen(kQuestionWords[i]) + 1); strcpy(questionWord, kQuestionWords[i]); VectorInsert(&questionWords, &questionWord, 0); // why the ampersand? isn't questionWord already a pointer? } fprintf(stdout, "Mapping over the char * vector (ask yourself: why are char **'s passed to PrintString?!!)\n"); VectorMap(&questionWords, PrintString, stdout); fprintf(stdout, "Finally, destroying the char * vector.\n"); VectorDispose(&questionWords); }
static void IndexEntryFree(void *elem) { rssIndexEntry *entry = elem; StringFree(&entry->meaningfulWord); VectorDispose(&entry->relevantArticles); }
static void IndexFree(void *elem) { indexEntry *entry = elem; free( entry->word ); VectorDispose(&entry->articles); }
/*
 * wordSetFreeFn
 * -------------
 * Element free function for wordSet values.  `elemAddr` is the address of a
 * wordSet; its word string is freed and its `occ` vector is disposed.  The
 * wordSet's own storage is not freed here.
 */
static void wordSetFreeFn(void *elemAddr)
{
    wordSet *entry = elemAddr;

    free(entry->word);
    VectorDispose(&entry->occ);
}
int getAddress (char* url) { char *stream, *text, *textHighlight, *lenstr; char *tokens = (char*)malloc(DEF_BUFF_SIZE); int *positions;// array to record position of each token in the text cvector addressVector; Address *adr; long len; int MAXLEN = 1805; int EXTRA = 11; /* 4 for field name "data", 1 for "=" */ int MAXINPUT = MAXLEN+EXTRA+2; char input[MAXINPUT]; char* data = input, *p; int rightOrWrong = -1, numRight, numTotal; //char* domain_url; int i; http_setTimeout(8);//seconds //fetch web page int ret = httpFetch (url, &stream); if (ret == -1) { printf("%s\n",http_strerror()); exit(0); } //printf("ret: %d, strlen: %d\n",ret, strlen(stream)); assert(stream); text= (char*)malloc(ret+2); if (!text) { printf("out of memory when convert text to tokens!\n"); exit(0); } strncpy(text, stream, ret); //append a '\0' to the end of string to make sure it is end with two '\0' for flex to scan *(text+ret) = '\0'; *(text+ret+1) = '\0'; free(stream); /* convert text to tokens, remove tags and convert back to string: tokens and keep all positions in array "positions" */ convertToken2Text(text, tokens, &positions); // get base domain of given url //e.g. 
given http://www.google.com/address, return http://www.google.com to domain_url /* domain_url = (char*)malloc(strlen(url)+1); strcpy(domain_url, url); for (i=strlen(url); i>0; i--) { if (url[i] == '/') { if (url[i-1] == '/' ) // is "//" break; else // not "//" domain_url[i] = '\0'; } } printf ("<base href=\"%s%s\">\n", GEO_URL, domain_url); free(domain_url); */ VectorNew (&addressVector, sizeof (Address),free_address, DEF_ADDRESS_PER_PAGE); //extract address, //get position from positions vector //and save extracted address, position, country to addressVector extractAddress(tokens, positions, &addressVector); //display the parsed text //printf("tokens: %s\n",tokens); //printf("url: %s\n", url); //printf("domain_url: %s\n", domain_url); //output header printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\">\n"); printf("<table border=1 width=100%%><tr><td><table border=1 bgcolor=#ffffff cellpadding=10 cellspacing=0 width=100%% color=#ffffff><tr><td>\n"); printf("<font face=arial,sans-serif color=black size=-1>\n"); printf("<b><a href='%s'>US, UK & Canadian Addresses</a> extracted by <a href='%s'>Geo Extractor</a> from web page</b> <a href='%s'>%s</a></font><br><br>\n",LIST_FILES_URL, HOME_PAGE,url,url); //printf("%s,",textHighlight); //display extracted address //table header printf("<table width=100%% border=0 cellpadding=0 cellspacing=0><tr><td bgcolor=#3366cc><img width=1 height=1 alt=''></td></tr></table>\n"); printf("<table width=100%% border=0 cellpadding=0 cellspacing=0 bgcolor=#e5ecf9><tr><td width=10></td><td bgcolor=#e5ecf9 nowrap><br>\n"); printf("<font face=arial,sans-serif color=black size=-1><b>\n"); for (i=0; i<addressVector.ItemsCount; i++) { adr = (Address*)VectorNth(&addressVector,i); printf("%s<br>\n", adr->address); /*printf("%s, start: %d, end: %d<br>\n",adr->address, adr->start, adr->end); for (j=adr->start; j<=adr->end; j++) printf("%c",*(text+j)); printf("\n"); */ } printf("</b></font>\n"); 
printf("<br></td></tr></table>\n"); printf("<table width=100%% border=0 cellpadding=0 cellspacing=0><tr><td bgcolor=#3366cc><img width=1 height=1 alt=''></td></tr></table>\n"); textHighlight = (char*)malloc(DEF_BUFF_SIZE); numRight=numTotal =addressVector.ItemsCount; /* if there is a user post, we save the user input to get tagged data*/ lenstr = getenv("CONTENT_LENGTH"); if ( !(lenstr == NULL || sscanf(lenstr,"%ld",&len)!=1 || len > MAXLEN) ) { tagAddress(text, textHighlight, &addressVector); fgets(input, len+1, stdin); URLdecode(input); data = input+EXTRA; //printf("posted: %s\n",data); len = strlen("right"); if ( strncmp(data, "right", len)==0 ) { rightOrWrong = 0; //set flag for right or wrong extraction } len = strlen("wrong"); if ( strncmp(data, "wrong", len)==0 ) { rightOrWrong = 1; //user input "Wrong Extraction" } // get user input: numRight, which is number of correct extracted address data += strlen("right") + strlen("&numRight="); p = data; while (*data++ !='&'); *data= '\0'; numRight = atoi(p); //printf("numRight: %d\n", numRight); // get user input numTotal, which is number of total address in the page p = data+strlen("numTotal="); numTotal = atoi(p); //printf("numTotal: %d\n", numTotal); if (rightOrWrong == 0) { //printf("webpage saved to RIGHT folder\n"); saveTaggedText(url, text, textHighlight, rightOrWrong, numRight, addressVector.ItemsCount, numTotal); } if ( rightOrWrong == 1 ) { //printf("webpage saved to WRONG folder\n"); saveTaggedText(url, text, textHighlight, rightOrWrong, numRight, addressVector.ItemsCount, numTotal); } //printf("tagged text: %s\n", textHighlight); } // give source text, and addressVector //highlight all extracted address in the webpage getHighlight(text, textHighlight, &addressVector); /* if there is at least one address extracted, show user input to let user judge where extraction is correct*/ if ( SHOW_COLLECT_DATA_INTERFACE) { printf("<FORM ACTION=\"%s%s\" METHOD=\"POST\">\n", GEO_URL, url); printf("<font 
face=arial,sans-serif color=black size=-1>\n"); printf("<P><input name=\"extraction\" type=\"radio\" value=\"right\" "); if ((rightOrWrong == 0)||(rightOrWrong == -1)) //if no user input or user input: extracted address all correct printf("checked"); printf("> All address extracted correctly<br>\n"); printf("<input name=\"extraction\" type=\"radio\" value=\"wrong\" "); if (rightOrWrong == 1) //user input: extracted address all correct printf("checked"); printf("> Not all addresses extracted correctly. \n"); printf("<input type=\"text\" name=\"numRight\" size=\"4\" value=\"%d\"> addresses extracted correctly from total <input type=\"text\" name=\"numTotal\" size=\"4\" value=\"%d\"> addresses<BR>\n", numRight, numTotal); printf("<INPUT TYPE=\"SUBMIT\" VALUE=\"Save Webpage\"></font></FORM>\n"); //show google search printf("<SCRIPT language=\"JavaScript\">function OnSubmitForm(){ document.g.action =\"%shttp://www.google.com/search?num=100&q=\"+document.g.q.value.replace(\" \",\"%%2B\");}</SCRIPT>\n", GEO_URL); printf("<table border=0 align=right><tr><td>\n"); printf("<form action=\"\" method=\"post\" name=\"g\" onSubmit=\"return OnSubmitForm();\">\n"); printf("<input size=\"32\" name=\"q\">\n"); printf("<INPUT TYPE=\"SUBMIT\" name=\"Submit\" VALUE=\"Google\"></form>\n"); printf("</td></tr></table>\n"); } printf("</td></tr></table></td></tr></table>\n"); //extract address from original html text // extract_address(text); // printf("Original <hr>%s",text); printf("<hr>\n"); //printf("%s",textHighlight); displayHtmlAbsoluteURL(textHighlight, url); VectorDispose(&addressVector); free (positions); free (text); free (tokens); return 0; }
static void IndexFree(void*elemAddr){ indexData *data = elemAddr; free(data->word); VectorDispose(&data->data); }
static void ThesEntryFree(void *elem) { thesaurusEntry *entry = elem; free(entry->word); VectorDispose(&entry->synonyms); }
/*
 * WordFree
 * --------
 * Element free function for currWord values.  `elem` is the address of a
 * currWord; its `thisWord` string is freed and its articles vector is
 * disposed.  The currWord's own storage is not freed here.
 */
static void WordFree(void *elem)
{
    currWord *entry = elem;

    free(entry->thisWord);
    VectorDispose(&entry->articles);
}