/*
 * ProcessWellFormedWord
 * ---------------------
 * Records one occurrence of `word` in article `a` inside the word index.
 *
 *   word         - NUL-terminated token; it is only read, never stored.
 *   a            - article the token came from; its numOccurrences field is
 *                  set to 1 when the word is seen for the first time.
 *   stopWords    - set of words that must not be indexed (looked up by char *).
 *   wordHash     - index of currWord entries; updated in place.
 *   articlesSeen - unused here; kept so existing callers stay valid.
 *
 * The heap copy of the word is made only when a new entry is actually
 * inserted, so stop words and already-indexed words cost no string
 * allocation.  If that copy cannot be allocated the word is skipped rather
 * than inserting an entry with a NULL key.
 */
static void ProcessWellFormedWord(char *word, article *a, hashset *stopWords,
				  hashset *wordHash, hashset *articlesSeen)
{
  currWord w;

  // Stop words are never indexed; the caller's pointer is enough for lookup.
  if(HashSetLookup(stopWords, &word) != NULL) return;

  // Build a lookup key that borrows the caller's string.
  w.thisWord = word;
  VectorNew(&w.articles, sizeof(article), NULL, 100);
  currWord* elemAddr = (currWord*)HashSetLookup(wordHash,&w);

  if(elemAddr != NULL){
    // Already indexed: update the stored entry and discard the temporary key.
    UpdateOccurences(&elemAddr->articles,a);
    VectorDispose(&w.articles);
    return;
  }

  // First sighting: the entry stored in the set needs its own copy of the word.
  w.thisWord = strdup(word);
  if(w.thisWord == NULL){ // out of memory: skip this word
    VectorDispose(&w.articles);
    return;
  }

  a->numOccurrences = 1;
  VectorAppend(&w.articles, a);
  HashSetEnter(wordHash, &w);
}
示例#2
0
文件: main.c 项目: shtef12/CS107
/**
 * Function: TestHashTable
 * -----------------------
 * Exercises the hashset with struct frequency as the element type.
 * BuildTableOfLetterCounts populates the table (per the original notes, by
 * counting how often each char occurs in a file).  The table is printed
 * through HashSetMap, copied element by element into a vector, and that
 * vector is printed twice: once sorted with CompareLetter and once sorted
 * with CompareOccurrences.  The vector is deliberately created with an
 * initial allocation of 0, which the vector implementation must tolerate.
 */
static void TestHashTable(void)
{
  hashset letterCounts;
  vector ordered;

  HashSetNew(&letterCounts, sizeof(struct frequency), kNumBuckets,
             HashFrequency, CompareLetter, NULL);

  fputs("\n\n ------------------------- Starting the HashTable test\n", stdout);
  BuildTableOfLetterCounts(&letterCounts);

  /* Dump the table in whatever order the buckets yield. */
  fputs("Here is the unordered contents of the table:\n", stdout);
  HashSetMap(&letterCounts, PrintFrequency, stdout);

  /* Copy every element into a vector so it can be sorted. */
  VectorNew(&ordered, sizeof(struct frequency), NULL, 0);
  HashSetMap(&letterCounts, AddFrequency, &ordered);

  /* First ordering: by character. */
  VectorSort(&ordered, CompareLetter);
  fputs("\nHere are the trials sorted by char: \n", stdout);
  VectorMap(&ordered, PrintFrequency, stdout);

  /* Second ordering: by occurrence count. */
  VectorSort(&ordered, CompareOccurrences);
  fputs("\nHere are the trials sorted by occurrence & char: \n", stdout);
  VectorMap(&ordered, PrintFrequency, stdout);

  /* Release the vector first, then the table. */
  VectorDispose(&ordered);
  HashSetDispose(&letterCounts);
}
示例#3
0
文件: hashset.c 项目: JuDa-hku/ACM
/*
 * HashSetDispose
 * --------------
 * Releases everything owned by the hashset: every bucket vector is disposed
 * and then the bucket array itself is freed.  The hashset struct is not
 * freed; it belongs to the caller.
 *
 * Buckets are disposed unconditionally.  Skipping them when the set has no
 * element free function would leave each bucket's own storage allocated.
 */
void HashSetDispose(hashset *h)
{
  for(int i=0; i<h->numBuckets; i++)
    VectorDispose(h->buckets+i);
  free(h->buckets);
}
示例#4
0
/*
 * Tears down a hashset: disposes each bucket vector in ascending order and
 * then frees the bucket array.  The hashset struct itself is left to the
 * caller.
 */
void HashSetDispose(hashset *h){
	int remaining = h->numBuckets;
	vector *bucket = h->buckets;

	while(remaining-- > 0){
		VectorDispose(bucket);
		bucket++;
	}
	free(h->buckets);
}
示例#5
0
文件: vectortest.c 项目: JuDa-hku/ACM
/*
 * Larger vector exercise: creates a vector of longs with an initial
 * allocation of 4, hands it to the insert, sort and delete helpers in turn,
 * and finally disposes of it.
 */
static void ChallengingTest()
{
  vector numbers;

  fputs("\n\n------------------------- Starting the more advanced tests...\n", stdout);

  VectorNew(&numbers, sizeof(long), NULL, 4);
  InsertPermutationOfNumbers(&numbers, kLargePrime, kEvenLargerPrime);
  SortPermutation(&numbers);
  DeleteEverythingVerySlowly(&numbers);
  VectorDispose(&numbers);
}
示例#6
0
/*
 * Releases the threading-related resources held by the database: the
 * threads vector, the per-server connection-limit hashset, four mutexes and
 * the connections semaphore.
 */
static void cleanThreadData(rssDatabase *db) 
{
  // Must run first: per the original author's note, disposing the threads
  // vector is what joins the worker threads (pthread_join), so the locks
  // destroyed below are not torn down while workers may still use them.
  VectorDispose(&db->threads);
  HashSetDispose(&(db->locks.limitConnToServerLock));

  // Destroy the synchronization primitives.
  pthread_mutex_destroy(&(db->locks.serverDataLock));  
  pthread_mutex_destroy(&(db->locks.articlesVectorLock));
  pthread_mutex_destroy(&(db->locks.indicesHashSetLock));
  pthread_mutex_destroy(&(db->locks.stopWordsHashSetLock));
  sem_destroy(&(db->locks.connectionsLock));
  
}
示例#7
0
文件: vectortest.c 项目: JuDa-hku/ACM
/*
 * Basic vector exercise: creates a vector of chars with an initial
 * allocation of 4, passes it through each of the Test* helpers in order,
 * and disposes of it.
 */
static void SimpleTest()
{
  vector letters;

  fputs(" ------------------------- Starting the basic test...\n", stdout);

  VectorNew(&letters, sizeof(char), NULL, 4);
  TestAppend(&letters);
  TestSortSearch(&letters);
  TestAt(&letters);
  TestInsertDelete(&letters);
  TestReplace(&letters);
  VectorDispose(&letters);
}
示例#8
0
/*
 * QueryIndices
 * ------------
 * Interactive query loop.  Prompts for a search term on stdout, reads one
 * line from stdin and passes it to ProcessResponse.  The loop ends on an
 * empty line, on end-of-file, or on a read error.  Afterwards the indices,
 * the previously-seen-articles vector and the stop-word set held in `db`
 * are disposed.
 */
static void QueryIndices(rssDatabase *db)
{
  char response[1024];
  while (true) {
    printf("Please enter a single query term that might be in our set of indices [enter to quit]: ");
    // On EOF or error fgets leaves the buffer unspecified, so stop here.
    if (fgets(response, sizeof(response), stdin) == NULL) break;

    // Strip the trailing newline only if one is present; a final line
    // without '\n' (or an over-long line) must keep its last character.
    size_t length = strlen(response);
    if (length > 0 && response[length - 1] == '\n') response[length - 1] = '\0';

    if (strcasecmp(response, "") == 0) break;
    ProcessResponse(db, response);
  }
  
  HashSetDispose(&db->indices);
  VectorDispose(&db->previouslySeenArticles); 
  HashSetDispose(&db->stopWords);
}
示例#9
0
/*
 * QueryIndices
 * ------------
 * Interactive query loop.  Prompts for a search term on stdout, reads one
 * line from stdin and passes it to ProcessResponse.  The loop ends on an
 * empty line, on end-of-file, or on a read error.  Afterwards the indices,
 * the stop-word set and the explored vector held in `allData` are disposed.
 */
static void QueryIndices(rssData *allData)
{
  char response[1024];
  while (true) {
    printf("Please enter a single query term that might be in our set of indices [enter to quit]: ");
    // On EOF or error fgets leaves the buffer unspecified, so stop here.
    if (fgets(response, sizeof(response), stdin) == NULL) break;

    // Strip the trailing newline only if one is present; a final line
    // without '\n' (or an over-long line) must keep its last character.
    size_t length = strlen(response);
    if (length > 0 && response[length - 1] == '\n') response[length - 1] = '\0';

    if (strcasecmp(response, "") == 0) break;
    ProcessResponse(response, allData);
  }

  // free the memory when we're finished 
  HashSetDispose(&allData->indices); 
  HashSetDispose(&allData->stopwords); 
  VectorDispose(&allData->explored); 

}
示例#10
0
/*
 * Tokenizes `text` with getTokensFromText, writes the tokens back out into
 * `tokens` as one string in which every token is followed by a single
 * space, and builds a parallel int array in `*positions`.
 *
 *   text      - input text; read only.
 *   tokens    - caller-supplied output buffer for the token string.  Its
 *               capacity is not known here, so the caller must make it
 *               large enough (NOTE(review): this is not checked).
 *   positions - receives a malloc'ed array with one entry per character of
 *               the token string: for a token character, its offset
 *               token->position + index; for the separating space, the
 *               offset of the token's last character.
 *
 * `text`, `tokens` and `*positions` must be released by the caller.
 * The process exits if the positions array cannot be allocated.
 */
void convertToken2Text( char*text, char* tokens, int** positions) {
	cvector tokenVector;
	int* p;
	char* t = tokens;
	Token *token;
	int i,j,len;
	size_t needed = 0;	/* number of ints written to *positions */
	size_t textLen;

	assert(text);
	assert(tokens);
	assert(positions);
	VectorNew (&tokenVector, sizeof (Token),free_token, DEF_VECTOR_SIZE);
	getTokensFromText(text, &tokenVector);

	/* Emit "<term> " for every token and count the characters produced. */
	for (i=0;i<tokenVector.ItemsCount;i++) {
		token = (Token*)VectorNth(&tokenVector,i);
		len = strlen(token->term);
		sprintf( t, "%s", token->term);
		t+=len;
		*t++=' ';
		needed += (size_t)len + 1;
	}
	*t='\0';

	/*
	 * Size the array from what is actually written below.  The previous
	 * strlen(text) estimate can be too small when tokens are adjacent in
	 * the source text, and is zero for empty input.  It is kept as a lower
	 * bound so the buffer never shrinks compared with earlier behavior.
	 */
	textLen = strlen(text);
	if (needed < textLen)
		needed = textLen;
	if (needed == 0)
		needed = 1;
	*positions = (int*)malloc( sizeof(int) * needed );

	if (!*positions) {
		printf("Not enough memory!\n");
		exit(0);
	}

	/* One entry per emitted character, including each trailing space. */
	p=*positions;
	for (i=0;i<tokenVector.ItemsCount;i++) {
		token = (Token*)VectorNth(&tokenVector,i);
		len = strlen(token->term);
		for (j=0;j<len;j++) {
			*p++ = token->position+j;
		}
		/* the separator maps to the token's last character */
		*p++= token->position+len-1;
	}

	VectorDispose(&tokenVector);

}
示例#11
0
文件: vectortest.c 项目: JuDa-hku/ACM
/*
 * MemoryTest
 * ----------
 * Exercises a vector whose elements are heap-allocated C strings.  Each
 * question word is copied onto the heap and its char * is inserted at the
 * front of the vector; the vector is created with FreeString as its element
 * free function.  The contents are printed with VectorMap and the vector is
 * then disposed.  If a copy cannot be allocated, the remaining words are
 * skipped and the test continues with what was inserted.
 */
static void MemoryTest()
{
  int i;
  const char * const kQuestionWords[] = {"who", "what", "where", "how", "why"};
  const int kNumQuestionWords = sizeof(kQuestionWords) / sizeof(kQuestionWords[0]);
  vector questionWords;
  char *questionWord;
  
  fprintf(stdout, "\n\n------------------------- Starting the memory tests...\n");
  fprintf(stdout, "Creating a vector designed to store dynamically allocated C-strings.\n");
  VectorNew(&questionWords, sizeof(char *), FreeString, kNumQuestionWords);
  fprintf(stdout, "Populating the char * vector with the question words.\n");
  for (i = 0; i < kNumQuestionWords; i++) {
    questionWord = malloc(strlen(kQuestionWords[i]) + 1);
    if (questionWord == NULL) {  // never strcpy into a failed allocation
      fprintf(stderr, "Out of memory while copying \"%s\"; skipping the remaining words.\n", kQuestionWords[i]);
      break;
    }
    strcpy(questionWord, kQuestionWords[i]);
    // The vector stores char * elements, so it is given the address of the
    // pointer and copies sizeof(char *) bytes from there.
    VectorInsert(&questionWords, &questionWord, 0);
  }
  
  fprintf(stdout, "Mapping over the char * vector (ask yourself: why are char **'s passed to PrintString?!!)\n");
  VectorMap(&questionWords, PrintString, stdout);
  fprintf(stdout, "Finally, destroying the char * vector.\n");
  VectorDispose(&questionWords);
}
示例#12
0
static void IndexEntryFree(void *elem)
{
  rssIndexEntry *entry = elem;
  StringFree(&entry->meaningfulWord);
  VectorDispose(&entry->relevantArticles);
}
示例#13
0
static void IndexFree(void *elem)
{ 
  indexEntry *entry = elem; 
  free( entry->word ); 
  VectorDispose(&entry->articles); 
}
示例#14
0
/*
 * Element destructor for wordSet values: frees the stored word and
 * disposes of the occ vector.
 */
static void wordSetFreeFn(void *elemAddr)
{
    wordSet *victim = elemAddr;

    free(victim->word);
    VectorDispose(&victim->occ);
}
示例#15
0
/*
 * Parses the URL-decoded POST body produced by the feedback form that
 * getAddress prints:
 *     extraction=<right|wrong>&numRight=<n>&numTotal=<m>
 * Outputs are only overwritten for the parts that are present, so callers
 * pre-load them with defaults.  Every offset skip is checked against the
 * string length, and the scan for '&' stops at the terminator.
 * `input` is modified: the '&' after numRight is replaced by '\0'.
 */
static void parsePostedFeedback(char *input, int *rightOrWrong, int *numRight, int *numTotal)
{
	const size_t prefixLen = strlen("extraction=");
	const size_t skipLen = strlen("right") + strlen("&numRight=");
	const size_t totalLen = strlen("numTotal=");
	char *data, *p;
	int hasNext;

	if (strlen(input) < prefixLen)
		return;
	data = input + prefixLen;

	if ( strncmp(data, "right", strlen("right"))==0 )
		*rightOrWrong = 0; /* user confirmed the extraction */
	if ( strncmp(data, "wrong", strlen("wrong"))==0 )
		*rightOrWrong = 1; /* user input "Wrong Extraction" */

	/* "right" and "wrong" have the same length, so one skip serves both */
	if (strlen(data) < skipLen)
		return;
	data += skipLen;

	/* numRight runs up to the next '&' (or the end of the string) */
	p = data;
	while (*data != '\0' && *data != '&')
		data++;
	hasNext = (*data == '&');
	if (hasNext)
		*data++ = '\0';
	*numRight = atoi(p);

	if (hasNext && strlen(data) >= totalLen)
		*numTotal = atoi(data + totalLen);
}

/*
 * Fetches the web page at `url`, extracts postal addresses from it and
 * writes an HTML report to stdout: a header, the list of extracted
 * addresses, an optional feedback form, and the page itself with the
 * addresses highlighted.
 *
 * When the request carries a POST body (CONTENT_LENGTH set to a value in
 * 1..MAXLEN), it is read from stdin and treated as the user's judgement of
 * the extraction; if it says "right" or "wrong" the tagged page is stored
 * with saveTaggedText.
 *
 * Returns 0.  The process exits if the page cannot be fetched or a buffer
 * cannot be allocated.
 *
 * NOTE(review): `url` is echoed into the HTML output without escaping;
 * if it can contain attacker-controlled characters it should be escaped.
 */
int getAddress (char* url) {
	/* MAXLEN: largest accepted POST body.
	   EXTRA: strlen("extraction="), the leading form field name plus '='. */
	enum { MAXLEN = 1805, EXTRA = 11, MAXINPUT = MAXLEN+EXTRA+2 };

	char *stream, *text, *textHighlight, *lenstr;
	char *tokens;
	int *positions;// array to record position of each token in the text
	cvector addressVector;
	Address *adr;
	long len;
	char input[MAXINPUT];
	int rightOrWrong = -1, numRight, numTotal;
	int i;

	tokens = (char*)malloc(DEF_BUFF_SIZE);
	if (!tokens) {
		printf("out of memory when convert text to tokens!\n");
		exit(0);
	}

	http_setTimeout(8);//seconds 
	//fetch web page
	int ret = httpFetch (url, &stream);
	if (ret == -1) {
		printf("%s\n",http_strerror());
		exit(0);
	}
	assert(stream);
	text= (char*)malloc(ret+2);
	if (!text) {
		printf("out of memory when convert text to tokens!\n");
		exit(0);
	}

	strncpy(text, stream, ret);
	//the copy must end with two '\0' for flex to scan it
	*(text+ret) = '\0';
	*(text+ret+1) = '\0';
	free(stream);

	/* convert text to tokens, remove tags
	and convert back to string: tokens
	and keep all positions in array "positions"
	*/
	convertToken2Text(text, tokens, &positions);

	VectorNew (&addressVector, sizeof (Address),free_address, DEF_ADDRESS_PER_PAGE);
	//extract addresses from the token string; positions maps them back
	//to offsets in the original text
	extractAddress(tokens, positions, &addressVector);

	//output header
	printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\">\n");
	printf("<table border=1 width=100%%><tr><td><table border=1 bgcolor=#ffffff cellpadding=10 cellspacing=0 width=100%% color=#ffffff><tr><td>\n");
	printf("<font face=arial,sans-serif color=black size=-1>\n");
	printf("<b><a href='%s'>US, UK & Canadian Addresses</a> extracted by <a href='%s'>Geo Extractor</a> from web page</b> <a href='%s'>%s</a></font><br><br>\n",LIST_FILES_URL, HOME_PAGE,url,url);

	//display extracted addresses
	//table header
	printf("<table width=100%% border=0 cellpadding=0 cellspacing=0><tr><td bgcolor=#3366cc><img width=1 height=1 alt=''></td></tr></table>\n");
	printf("<table width=100%% border=0 cellpadding=0 cellspacing=0 bgcolor=#e5ecf9><tr><td width=10></td><td bgcolor=#e5ecf9 nowrap><br>\n");
	printf("<font face=arial,sans-serif color=black size=-1><b>\n");

	for (i=0; i<addressVector.ItemsCount; i++) {
		adr = (Address*)VectorNth(&addressVector,i);
		printf("%s<br>\n", adr->address);
	}
	printf("</b></font>\n");
	printf("<br></td></tr></table>\n");
	printf("<table width=100%% border=0 cellpadding=0 cellspacing=0><tr><td bgcolor=#3366cc><img width=1 height=1 alt=''></td></tr></table>\n");

	textHighlight = (char*)malloc(DEF_BUFF_SIZE);
	if (!textHighlight) {
		printf("out of memory when highlighting addresses!\n");
		exit(0);
	}
	/* defaults shown in the form when the user has not posted anything */
	numRight=numTotal =addressVector.ItemsCount;

	/* if there is a user post, we save the user input to get tagged data.
	   A zero or negative length carries no body and is ignored. */
	lenstr = getenv("CONTENT_LENGTH");
	if ( lenstr != NULL && sscanf(lenstr,"%ld",&len)==1 && len > 0 && len <= MAXLEN ) {
		tagAddress(text, textHighlight, &addressVector);
		/* only parse when a body was actually read */
		if (fgets(input, (int)(len+1), stdin) != NULL) {
			URLdecode(input);
			parsePostedFeedback(input, &rightOrWrong, &numRight, &numTotal);
		}

		/* 0: page goes to the RIGHT folder, 1: to the WRONG folder */
		if (rightOrWrong == 0 || rightOrWrong == 1) {
			saveTaggedText(url, text, textHighlight, rightOrWrong, numRight, addressVector.ItemsCount, numTotal);
		}
	}

	// given the source text and addressVector,
	// highlight all extracted addresses in the webpage
	getHighlight(text, textHighlight, &addressVector);

	/* show the feedback form so the user can judge whether the
	extraction is correct */
	if ( SHOW_COLLECT_DATA_INTERFACE) {

		printf("<FORM ACTION=\"%s%s\" METHOD=\"POST\">\n", GEO_URL, url);
		printf("<font face=arial,sans-serif color=black size=-1>\n");
		
		printf("<P><input name=\"extraction\" type=\"radio\" value=\"right\" ");
		if ((rightOrWrong == 0)||(rightOrWrong == -1)) //no user input, or user said all addresses are correct
			printf("checked");
		printf("> All address extracted correctly<br>\n");
		printf("<input name=\"extraction\" type=\"radio\" value=\"wrong\" ");
		if (rightOrWrong == 1) //user said not all addresses are correct
			printf("checked");
		printf("> Not all addresses extracted correctly. \n");
		printf("<input type=\"text\" name=\"numRight\" size=\"4\" value=\"%d\"> addresses extracted correctly from total <input type=\"text\" name=\"numTotal\" size=\"4\" value=\"%d\"> addresses<BR>\n", numRight, numTotal);
		printf("<INPUT TYPE=\"SUBMIT\" VALUE=\"Save Webpage\"></font></FORM>\n");
		//show google search
		printf("<SCRIPT language=\"JavaScript\">function OnSubmitForm(){ document.g.action =\"%shttp://www.google.com/search?num=100&q=\"+document.g.q.value.replace(\" \",\"%%2B\");}</SCRIPT>\n", GEO_URL);
		printf("<table border=0 align=right><tr><td>\n");
		printf("<form action=\"\" method=\"post\" name=\"g\" onSubmit=\"return OnSubmitForm();\">\n");
		printf("<input size=\"32\" name=\"q\">\n");
		printf("<INPUT TYPE=\"SUBMIT\" name=\"Submit\" VALUE=\"Google\"></form>\n");
		printf("</td></tr></table>\n");

	}
	printf("</td></tr></table></td></tr></table>\n");

	printf("<hr>\n");
	displayHtmlAbsoluteURL(textHighlight, url);

	VectorDispose(&addressVector);
	free (textHighlight);	/* allocated above; previously never released */
	free (positions);
	free (text);
	free (tokens);

	return 0;

}
示例#16
0
static void IndexFree(void*elemAddr){
  indexData *data = elemAddr;
  free(data->word);
  VectorDispose(&data->data);
}
static void ThesEntryFree(void *elem)
{
  thesaurusEntry *entry = elem;
  free(entry->word);
  VectorDispose(&entry->synonyms);
} 
/*
 * Element destructor for currWord values: frees the heap-allocated word
 * string and disposes of the articles vector.
 */
static void WordFree(void *elem)
{
  currWord *victim = elem;

  free(victim->thisWord);
  VectorDispose(&victim->articles);
}