Code Example #1
static void TokenizeAndBuildThesaurus(hashset *thesaurus, streamtokenizer *st)
{
  printf("Loading thesaurus. Be patient! ");
  fflush(stdout);

  char buffer[2048];
  while (STNextToken(st, buffer, sizeof(buffer))) {
    thesaurusEntry entry;
    entry.word = strdup(buffer);
    VectorNew(&entry.synonyms, sizeof(char *), StringFree, 4);
    while (STNextToken(st, buffer, sizeof(buffer)) && (buffer[0] == ',')) {
      STNextToken(st, buffer, sizeof(buffer));
      char *synonym = strdup(buffer);
      VectorAppend(&entry.synonyms, &synonym);
    }
    HashSetEnter(thesaurus, &entry);
    if (HashSetCount(thesaurus) % 1000 == 0) {
      printf(".");
      fflush(stdout);
    }
  }

  printf(" [All done!]\n");
  fflush(stdout);
}
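
The thesaurusEntry type never appears in this listing; the sketch below is a hypothetical definition inferred purely from how the code above uses it (field names assumed, not taken from the actual project source). Because HashSetEnter copies the element's bytes into the set, the entry can safely be built in a stack variable, as done above.

// Hypothetical layout, inferred from the usage in Code Example #1.
typedef struct {
  char *word;       // strdup'd head word; owned by the entry
  vector synonyms;  // vector of char * (each synonym strdup'd)
} thesaurusEntry;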
Code Example #2
static void Welcome(const char *welcomeTextURL)
{
  url u;
  urlconnection urlconn;
  
  URLNewAbsolute(&u, welcomeTextURL);  
  URLConnectionNew(&urlconn, &u);
  
  if (urlconn.responseCode / 100 == 3) {
    Welcome(urlconn.newUrl);
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      printf("%s\n", buffer);
    }
    printf("\n");
    fflush(stdout);
    STDispose(&st); // remember that STDispose doesn't close the file, since STNew doesn't open one.
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Code Example #3
File: rss-news-search.c  Project: ej2xu/cs107
static void LoadStopWords(hashset *stopWords, const char *stopWordsURL)
{
  url u;
  urlconnection urlconn;
  
  URLNewAbsolute(&u, stopWordsURL);
  URLConnectionNew(&urlconn, &u);
  
  if (urlconn.responseCode / 100 == 3) {
    LoadStopWords(stopWords, urlconn.newUrl);
  } else {
    streamtokenizer st;
    char buffer[4096];
    HashSetNew(stopWords, sizeof(char *), kNumStopWordsBuckets, StringHash, StringCompare, StringFree);
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char *stopWord = strdup(buffer);
      HashSetEnter(stopWords, &stopWord);
    }
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
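
StringHash, StringCompare, and StringFree are helper callbacks defined elsewhere in these projects; the versions below are illustrative stand-ins with the signatures the CS107 hashset expects (each callback receives the address of a char * element), not the course's actual implementations.

#include <ctype.h>
#include <stdlib.h>
#include <strings.h>

static int StringHash(const void *elem, int numBuckets)
{
  unsigned long hash = 0;
  for (const char *s = *(const char **)elem; *s != '\0'; s++)
    hash = hash * 31 + tolower((unsigned char)*s); // simple multiplicative hash
  return (int)(hash % numBuckets);
}

static int StringCompare(const void *a, const void *b)
{
  return strcasecmp(*(const char **)a, *(const char **)b); // case-insensitive match
}

static void StringFree(void *elem)
{
  free(*(char **)elem); // frees the strdup'd string; the set owns the slot
}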
Code Example #4
static void BuildIndices(const char *feedsFileURL, rssFeedData *data)
{
  url u;
  urlconnection urlconn;
  
  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);
 
  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    BuildIndices(urlconn.newUrl, data);
  } else {
    streamtokenizer st;
    char remoteDocumentURL[2048];
    
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
      STNextToken(&st, remoteDocumentURL, sizeof(remoteDocumentURL));
      ProcessFeed(remoteDocumentURL, data);
    }
    
    printf("\n");
    STDispose(&st);
  }
  
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
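
The colon-skipping loop implies a feeds file in which every line pairs a human-readable label with a feed URL, separated by the line's first colon; STSkipUntil stops at that colon, and STSkipOver then consumes it plus any following spaces, leaving the URL as the next newline-delimited token. A hypothetical input file (labels and URLs are illustrative only, not from the original data):

World News : http://example.com/rss/world.xml
Technology : http://example.com/rss/tech.xml

The URL's own colons are harmless here: STNextToken consumes the rest of the line in one token before the loop looks for the next line's separator.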
Code Example #5
File: rss-news-search.c  Project: ej2xu/cs107
static void BuildIndices(rssDatabase *db, const char *feedsFileURL)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);
  
  if (urlconn.responseCode / 100 == 3) {
    BuildIndices(db, urlconn.newUrl);
  } else {
    streamtokenizer st;
    char remoteFileName[2048];
    HashSetNew(&db->indices, sizeof(rssIndexEntry), kNumIndexEntryBuckets, IndexEntryHash, IndexEntryCompare, IndexEntryFree);
    VectorNew(&db->previouslySeenArticles, sizeof(rssNewsArticle), NewsArticleFree, 0);
  
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
      STNextToken(&st, remoteFileName, sizeof(remoteFileName));
      ProcessFeed(db, remoteFileName);
    }
  
    printf("\n");
    STDispose(&st);
  }
  
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Code Example #6
static void ScanArticle(streamtokenizer *st, void* userData)
{
  rssFeedData *data = userData;
  articleData *article = AddArticle(&data->articles, &data->item);
  
  int numWords = 0;
  char word[1024];
  char longestWord[1024] = {'\0'};

  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
	char *dummy = word; // need a char * lvalue: &word is a char (*)[1024], not the char ** the hashset expects
	if (HashSetLookup(&data->stopWords, &dummy) == NULL) { // skip stop words

	  indexData *entry = addWordRecord(&data->indices, word);
	  indexWord(&entry->data,article);
	  
	  numWords++;
	  if (strlen(word) > strlen(longestWord))
	    strcpy(longestWord, word);
	}
      }
    }
  }

  printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
  printf("\tThe longest word scanned was \"%s\".", longestWord);
  if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL)) 
    printf(" [Ooooo... long word!]");
  printf("\n");
}
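
The dummy variable in this and several later examples looks redundant but is not: the hashset stores char * elements, so lookups need a char ** argument, and &word has the wrong type. A schematic fragment (set is a placeholder name, not from the original source):

  char word[1024] = "example";
  char *dummy = word;               // the array decays to a char *
  /* HashSetLookup(set, &word);  -- wrong: &word is a char (*)[1024] */
  /* HashSetLookup(set, &dummy); -- right: &dummy is a char **      */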
Code Example #7
static void LoadStopWords(const char *StopWordsTextURL, hashset* stopWords)
{  
  url u;
  urlconnection urlconn;
  
  URLNewAbsolute(&u, StopWordsTextURL);  
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) {
    LoadStopWords(urlconn.newUrl, stopWords); // follow the redirect with the same loader
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char *copy = strdup(buffer);
      HashSetEnter(stopWords, &copy);
    }
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Code Example #8
File: rss-news-search.c  Project: mariolew/cs107
static void ScanArticle(streamtokenizer *st, const char *articleTitle, const char *unused, const char *articleURL,
                        hashset *stopWords, hashset *wordCounts)
{
  int numWords = 0;
  char word[1024];
  char longestWord[1024] = {'\0'};

  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      char *dummy = word;
      void *pos = HashSetLookup(stopWords, &dummy);
      if (WordIsWellFormed(word) && pos == NULL) {
          //numWords++;
          //if (strlen(word) > strlen(longestWord))
          //  strcpy(longestWord, word);
          wordCountEnter(wordCounts, word, articleURL, articleTitle);
      }
    }
  }

  /*printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
  printf("\tThe longest word scanned was \"%s\".", longestWord);
  if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL)) 
    printf(" [Ooooo... long word!]");
    printf("\n");*/
}
Code Example #9
File: rss-news-search.c  Project: azavadil/CS107_PA
static void TokenizeAndBuildStopwords(hashset *stopwords, streamtokenizer *tokenMaker)
{ 
  printf("loading Stopwords...\n"); 
  
  char buffer[2048]; 
  while(STNextToken(tokenMaker, buffer, sizeof(buffer))){ 
    const char *currWordPtr = strdup(buffer);
    HashSetEnter(stopwords, &currWordPtr); 
  }
  printf("loaded %d words\n", HashSetCount(stopwords));
}  
Code Example #10
File: rss-news-search.c  Project: azavadil/CS107_PA
static void ScanArticle(streamtokenizer *st, article *a, int articleIndex, rssData *allData )
{
  int numWords = 0;
  char word[1024];
  char longestWord[1024] = {'\0'};

  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
	numWords++;
	char *dummy = word;

	if (WordNotInStopwords(&allData->stopwords, word)) {
	  /**  Try looking up the word. If the word is not in the indices, create a new indexEntry 
	   *   initialized with the word and an empty vector and enter it into the hashset
	   */
	  indexEntry entry = {word}; 
	  indexEntry *found = HashSetLookup(&allData->indices, &entry);

	  if (found == NULL) {  
	    entry.word = strdup(dummy); 
	    VectorNew(&entry.articles, sizeof(wordcountEntry), NULL, 10); 
	    HashSetEnter(&allData->indices, &entry);
	  }

	  // now we act as if the entry had been in the index all along
	  found = (indexEntry *)HashSetLookup(&allData->indices, &entry);
	  UpdateIndices(&found->articles, articleIndex);

	}
	if (strlen(word) > strlen(longestWord))
	  strcpy(longestWord, word);
      }
    }
  }

  printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
  printf("\tThe longest word scanned was \"%s\".", longestWord);
  if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL)) 
    printf(" [Ooooo... long word!]");
  printf("\n");
}
Code Example #11
static void ScanArticle(streamtokenizer *st, article* a,
			hashset* stopWords, hashset* wordHash,
			hashset *articlesSeen)
{
  char word[1024];

  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
	ProcessWellFormedWord(word,a,stopWords,wordHash,articlesSeen);
      }
    }
  }
}
Code Example #12
File: rss-news-search.c  Project: azavadil/CS107_PA
static void Welcome(const char *welcomeTextFileName)
{
  FILE *infile;
  streamtokenizer st;
  char buffer[1024];
  
  infile = fopen(welcomeTextFileName, "r");
  assert(infile != NULL);    
  
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    printf("%s\n", buffer);
  }
  
  printf("\n");
  STDispose(&st); // remember that STDispose doesn't close the file, since STNew doesn't open one.
  fclose(infile);
}
Code Example #13
File: rss-news-search.c  Project: mariolew/cs107
static void loadStopWords(hashset *s)
{
    HashSetNew(s, sizeof(char *), 1009, StringHash, StringCmp, StringFree); // 1009 buckets: the first prime > 1000
    
    FILE *infile;
    streamtokenizer st;
    char buffer[1024];
    infile = fopen(kStopListFile, "r");
    assert(infile != NULL);

    STNew(&st, infile, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
        char *newWord = strdup(buffer);
        HashSetEnter(s, &newWord);
    }
    STDispose(&st);
    fclose(infile);
}
Code Example #14
File: rss-news-search.c  Project: azavadil/CS107_PA
static void BuildIndices(const char *feedsFileName, rssData *allData )
{
  FILE *infile;
  streamtokenizer st;
  char remoteFileName[1024];
  
  infile = fopen(feedsFileName, "r");
  assert(infile != NULL);
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
    STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
    STNextToken(&st, remoteFileName, sizeof(remoteFileName));   
    ProcessFeed(remoteFileName, allData );
  }
  
  STDispose(&st);
  fclose(infile);
  printf("\n");
}
Code Example #15
File: rss-news-search.c  Project: siumai/107
static void PullAllNewsItems(urlconnection *urlconn, rssFeedData *dataPtr)
{
  streamtokenizer st;
  char buffer[2048];

  XML_Parser rssFeedParser = XML_ParserCreate(NULL);
  XML_SetUserData(rssFeedParser, dataPtr);
  XML_SetElementHandler(rssFeedParser, ProcessStartTag, ProcessEndTag);
  XML_SetCharacterDataHandler(rssFeedParser, ProcessTextData);

  STNew(&st, urlconn->dataStream, "\n", false);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    XML_Parse(rssFeedParser, buffer, strlen(buffer), false);
  }
  STDispose(&st);
  
  XML_Parse(rssFeedParser, "", 0, true);
  XML_ParserFree(rssFeedParser);  
}
Code Example #16
File: rss-news-search.c  Project: ej2xu/cs107
static void PullAllNewsItems(rssDatabase *db, urlconnection *urlconn)
{
  rssFeedState state = {db}; // passed through the parser by address as auxiliary data.
  streamtokenizer st;
  char buffer[2048];

  XML_Parser rssFeedParser = XML_ParserCreate(NULL);
  XML_SetUserData(rssFeedParser, &state);
  XML_SetElementHandler(rssFeedParser, ProcessStartTag, ProcessEndTag);
  XML_SetCharacterDataHandler(rssFeedParser, ProcessTextData);

  STNew(&st, urlconn->dataStream, "\n", false);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    XML_Parse(rssFeedParser, buffer, strlen(buffer), false);
  }
  STDispose(&st);
  
  XML_Parse(rssFeedParser, "", 0, true); // instructs the xml parser that we're done parsing..
  XML_ParserFree(rssFeedParser);  
}
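
Both PullAllNewsItems variants use expat's streaming interface: each newline-delimited chunk is fed with isFinal set to false, and the final empty XML_Parse call flushes anything still buffered. The three registered callbacks must match libexpat's handler signatures; the skeletons below show those signatures with the handler names from the examples above (bodies elided, since the real ones live elsewhere in each project):

#include <expat.h>

static void XMLCALL ProcessStartTag(void *userData, const XML_Char *name,
                                    const XML_Char **atts)
{
  /* e.g. note that we've entered <item>, <title>, <link>, ... */
}

static void XMLCALL ProcessEndTag(void *userData, const XML_Char *name)
{
  /* e.g. on </item>, hand the completed item off for indexing */
}

static void XMLCALL ProcessTextData(void *userData, const XML_Char *text, int len)
{
  /* text is not NUL-terminated; always respect len */
}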
Code Example #17
// kApproximateWordCount is sized for roughly 1000
// stop words, so we choose
// the first prime > 1000.
static void BuildStopWordsHashset(hashset *stopWords, const char *stopWordsFileName)
{
  FILE *infile;
  streamtokenizer st;
  char buffer[1024];
  
  infile = fopen(stopWordsFileName, "r");
  assert(infile != NULL);    
  
  HashSetNew(stopWords, sizeof(char*), kApproximateWordCount, StringHash, StringCompare, StringFree); 

  STNew(&st, infile, kNewLineDelimiters, true);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    char *elem = strdup(buffer);
    HashSetEnter(stopWords, &elem);
  }

  STDispose(&st); 
  fclose(infile);
}
Code Example #18
File: rss-news-search.c  Project: ej2xu/cs107
static void ScanArticle(streamtokenizer *st, int articleID, hashset *indices, hashset *stopWords, pthread_mutex_t* indicesLock, pthread_mutex_t* stopWordsLock)
{
  char word[1024];

  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st);
    } else {
      RemoveEscapeCharacters(word);
      pthread_mutex_lock(stopWordsLock);
      bool startIndexNow = WordIsWorthIndexing(word, stopWords);
      pthread_mutex_unlock(stopWordsLock);
      if (startIndexNow) {
	pthread_mutex_lock(indicesLock);
	AddWordToIndices(indices, word, articleID);
	pthread_mutex_unlock(indicesLock);
      }
    }
  }
}
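
This threaded variant leaves lock creation to the caller. One plausible setup, assuming both locks are shared by every scanner thread (illustrative; the original source may initialize them differently):

#include <pthread.h>

// Illustrative: statically initialized locks shared by all scanner
// threads; pass their addresses into ScanArticle.
static pthread_mutex_t indicesLock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t stopWordsLock = PTHREAD_MUTEX_INITIALIZER;

Holding stopWordsLock only around the lookup and indicesLock only around the insertion keeps each critical section short, at the cost of two acquisitions per word.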
Code Example #19
File: rss-news-search.c  Project: siumai/107
static void LoadStopWords(const char *stopWordsURL, rssFeedData *dataPtr) {
	url u;
	urlconnection urlconn;
	
	URLNewAbsolute(&u, stopWordsURL);
	URLConnectionNew(&urlconn, &u);

	if(urlconn.responseCode / 100 == 3) {
		LoadStopWords(urlconn.newUrl, dataPtr);
	} else {
		streamtokenizer st;
		char buffer[4096];
		STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
		while (STNextToken(&st, buffer, sizeof(buffer))) {
			char *s = strdup(buffer);
			HashSetEnter(&(dataPtr->stopWords), &s);
		}
		printf("\n");
		STDispose(&st);
	}
	URLConnectionDispose(&urlconn);
	URLDispose(&u);
}
Code Example #20
File: rss-news-search.c  Project: siumai/107
static void ScanArticle(streamtokenizer *st, rssFeedData *data)
{
  articleData *article = addArticle(&data->articles, &data->rssItem);
  /*rssFeedItem *item = &(data->rssItem);
  char *articleTitle = item->title;
  char *articleURL = item->url;*/

  int numWords = 0;
  char word[1024];
  char longestWord[1024] = {'\0'};

  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
	char *dummy = word;
	if (HashSetLookup(&(data->stopWords), &dummy)==NULL) {
		//not in stop list, index the word
		indexData *entry = addWordRecord(&data->indices, word);
		indexWord(&entry->counters, article);
		numWords++;
		if (strlen(word) > strlen(longestWord))
	  		strcpy(longestWord, word);
	}
      }
    }
  }

  printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
  printf("\tThe longest word scanned was \"%s\".", longestWord);
  if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL)) 
    printf(" [Ooooo... long word!]");
  printf("\n");
}
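
addWordRecord and indexWord are project-specific helpers that never appear in this listing; the declarations below state the contract their call sites imply (shapes assumed from Code Examples #6 and #20, not copied from the original source):

// Assumed prototypes, inferred from the call sites above.
indexData *addWordRecord(hashset *indices, const char *word); // find-or-create the index record for word
void indexWord(vector *counters, articleData *article);       // credit this article with one more occurrence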