Example #1
0
/**
 * BuildIndices -- downloads the feeds file at feedsFileURL and processes
 * every feed listed in it.  Each line is expected to look like
 * "<description>: <feed-url>"; everything up to and including the first
 * colon is skipped and the remaining token is handed to ProcessFeed.
 * An HTTP 3xx response is followed by recursing on the redirect URL.
 */
static void BuildIndices(const char *feedsFileURL ,rssFeedData * data)
{
  url u;
  urlconnection urlconn;
  
  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);
 
  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    BuildIndices(urlconn.newUrl, data);
  } else {
    streamtokenizer st;
    char remoteDocumentURL[2048];
    
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");		   // now ignore the colon and any whitespace directly after it
      STNextToken(&st, remoteDocumentURL, sizeof(remoteDocumentURL));   
      ProcessFeed(remoteDocumentURL,data);
    }
    
    printf("\n");
    STDispose(&st);
  }
  
  // Dispose this level's connection/url; a redirected recursive call disposed its own.
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Example #2
0
/**
 * ParseArticle -- downloads the article at articleURL and, unless it has
 * already been indexed, records it in db->previouslySeenArticles and scans
 * its text into db->indices.
 *
 * Concurrency notes (review):
 *  - previouslySeenArticles is guarded by articlesVectorLock for both the
 *    duplicate search and the later append, but the lock is RELEASED between
 *    the VectorSearch below and the VectorAppend in the 200 case, so two
 *    threads fetching the same article concurrently could both pass the
 *    search and append duplicates -- TODO confirm callers prevent this.
 *  - lockConnection/unlockConnection presumably throttle per-server
 *    connections; their semantics live elsewhere in the file.
 */
static void ParseArticle(rssDatabase *db, const char *articleTitle, const char *articleURL)
{      
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  int articleID;

  URLNewAbsolute(&u, articleURL);
  // Shallow record used only for the duplicate search; NewsArticleClone below
  // makes the deep copy that the vector actually keeps.
  rssNewsArticle newsArticle = { articleTitle, u.serverName, u.fullName };
  
  pthread_mutex_t *articlesLock = &(db->locks.articlesVectorLock);
  pthread_mutex_lock(articlesLock);
  if (VectorSearch(&db->previouslySeenArticles, &newsArticle, NewsArticleCompare, 0, false) >= 0) { 
    pthread_mutex_unlock(articlesLock);
    printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
    URLDispose(&u);     
    return;
  }
  pthread_mutex_unlock(articlesLock);  
  lockConnection(db,u.serverName);
  URLConnectionNew(&urlconn, &u);
  switch (urlconn.responseCode) {
      case 0: printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
	      break;
      case 200: //printf("[%s] Ready to Index \"%s\"\n", u.serverName, articleTitle);
	      pthread_mutex_lock(articlesLock);
	      printf("[%s] Indexing \"%s\"\n", u.serverName, articleTitle);
	      NewsArticleClone(&newsArticle, articleTitle, u.serverName, u.fullName);
	      
	      VectorAppend(&db->previouslySeenArticles, &newsArticle);
	      articleID = VectorLength(&db->previouslySeenArticles) - 1;
	      pthread_mutex_unlock(articlesLock);

	      STNew(&st, urlconn.dataStream, kTextDelimiters, false);	
	      ScanArticle(&st, articleID, &db->indices, &db->stopWords,
			  &(db->locks.indicesHashSetLock),&(db->locks.stopWordsHashSetLock) );    	      	    
	      STDispose(&st);
	      
	      break;
      case 301: 
      case 302:{ // just pretend we have the redirected URL all along, though index using the new URL and not the old one... 
	        
	        int newURLLength = strlen(urlconn.newUrl)+1; 
		// Copy the redirect target BEFORE disposing the connection that owns it.
		char newURLBuffer[newURLLength];
		strcpy(newURLBuffer, urlconn.newUrl);
	        URLConnectionDispose(&urlconn);
		unlockConnection(db,u.serverName);
		URLDispose(&u);
		
		ParseArticle(db, articleTitle, newURLBuffer);
                return; // everything was already cleaned up above; skip the shared epilogue
		
      } default: printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
		break;
  }
  
  URLConnectionDispose(&urlconn);
  unlockConnection(db,u.serverName);
  URLDispose(&u);
}
Example #3
0
/**
 * LoadStopWords -- pulls the stop-words list from StopWordsTextURL and
 * enters a heap-allocated copy of every line into the stopWords hashset
 * (the hashset's free function is presumed to release the copies -- see
 * the sibling loaders in this file).
 *
 * Bug fix: an HTTP 3xx response previously invoked Welcome(urlconn.newUrl),
 * which merely PRINTED the redirected file instead of loading the stop
 * words; we now recurse into LoadStopWords with the redirect target, the
 * same pattern every other loader in this file uses.
 */
static void LoadStopWords(const char *StopWordsTextURL, hashset* stopWords)
{  
  url u;
  urlconnection urlconn;
  
  URLNewAbsolute(&u, StopWordsTextURL);  
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) {
    LoadStopWords(urlconn.newUrl, stopWords); // follow the redirect with the SAME loader
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char * copy = strdup(buffer); // hashset takes ownership of the copy
      HashSetEnter(stopWords,&copy);
    }  
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Example #4
0
/**
 * LoadStopWords -- fetches the stop-words file at stopWordsURL, creates the
 * stopWords hashset, and enters a heap-allocated copy of each line into it.
 * HTTP 3xx responses are chased by recursing on the redirect URL (the
 * recursive call performs the HashSetNew).
 */
static void LoadStopWords(hashset *stopWords, const char *stopWordsURL)
{
  url u;
  urlconnection conn;

  URLNewAbsolute(&u, stopWordsURL);
  URLConnectionNew(&conn, &u);

  if (conn.responseCode / 100 != 3) {
    streamtokenizer tokenizer;
    char line[4096];

    HashSetNew(stopWords, sizeof(char *), kNumStopWordsBuckets, StringHash, StringCompare, StringFree);
    STNew(&tokenizer, conn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&tokenizer, line, sizeof(line))) {
      char *word = strdup(line); // set owns the copy; StringFree releases it
      HashSetEnter(stopWords, &word);
    }
    STDispose(&tokenizer);
  } else {
    // Redirected: the recursive call does all the real work.
    LoadStopWords(stopWords, conn.newUrl);
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
Example #5
0
/**
 * BuildIndices -- downloads the feeds file at feedsFileURL, initializes the
 * database's indices hashset and seen-articles vector, and processes each
 * feed listed in the file (one "<description>: <feed-url>" entry per line).
 * HTTP 3xx responses are followed by recursing on the redirect URL; the
 * recursive call performs the HashSetNew/VectorNew instead of this frame.
 */
static void BuildIndices(rssDatabase *db, const char *feedsFileURL)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);
  
  if (urlconn.responseCode / 100 == 3) {
    BuildIndices(db, urlconn.newUrl);
  } else {
    streamtokenizer st;
    char remoteFileName[2048];
    HashSetNew(&db->indices, sizeof(rssIndexEntry), kNumIndexEntryBuckets, IndexEntryHash, IndexEntryCompare, IndexEntryFree);
    VectorNew(&db->previouslySeenArticles, sizeof(rssNewsArticle), NewsArticleFree, 0);
  
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");		   // now ignore the colon and any whitespace directly after it
      STNextToken(&st, remoteFileName, sizeof(remoteFileName));
      ProcessFeed(db, remoteFileName);
    }
  
    printf("\n");
    STDispose(&st);
  }
  
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Example #6
0
/**
 * Welcome -- fetches the welcome text at welcomeTextURL and echoes it,
 * line by line, to stdout.  HTTP 3xx responses are chased by recursing
 * on the redirect URL.
 */
static void Welcome(const char *welcomeTextURL)
{
  url u;
  urlconnection conn;

  URLNewAbsolute(&u, welcomeTextURL);
  URLConnectionNew(&conn, &u);

  if (conn.responseCode / 100 != 3) {
    streamtokenizer lines;
    char line[4096];

    STNew(&lines, conn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&lines, line, sizeof(line))) {
      printf("%s\n", line);
    }
    printf("\n");
    fflush(stdout);
    STDispose(&lines); // STDispose doesn't close the stream -- STNew never opened one
  } else {
    Welcome(conn.newUrl); // redirected: greet using the new location
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
Example #7
0
/**
 * ParseArticle -- downloads the article at articleURL and scans its words
 * into wordCounts, skipping stopWords.  Response handling:
 *   0       -> connection failure, report and give up
 *   200     -> tokenize the body and hand it to ScanArticle
 *   301/302 -> recurse on the redirect URL (this frame's connection is
 *              still disposed by the shared epilogue below)
 *   other   -> report and punt
 */
static void ParseArticle(const char *articleTitle, const char *articleDescription, const char *articleURL,
                         hashset *stopWords, hashset *wordCounts)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;

  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&urlconn, &u);
  
  switch (urlconn.responseCode) {
      case 0: printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
	      break;
      case 200: printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
          STNew(&st, urlconn.dataStream, kTextDelimiters, false);
          ScanArticle(&st, articleTitle, articleDescription, articleURL, stopWords, wordCounts);
          STDispose(&st);
		break;
      case 301:
      case 302: // just pretend we have the redirected URL all along, though index using the new URL and not the old one...
          ParseArticle(articleTitle, articleDescription, urlconn.newUrl, stopWords, wordCounts);
		break;
      default: printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
          break;
  }
  
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Example #8
0
/**
 * ParseArticle -- void*-taking variant (thread/callback-friendly):
 * userData is an rssFeedData whose embedded item carries the title and
 * URL of the article to index.  On 3xx responses the redirect target is
 * copied into item->url and the function recurses on the same data.
 *
 * NOTE(review): strcpy into item->url assumes the destination buffer is
 * large enough to hold urlconn.newUrl -- TODO confirm the declared size
 * of rssFeedItem.url; an over-long redirect URL would overflow it.
 */
static void ParseArticle(void *userData)
{

  rssFeedData *data = userData;
  rssFeedItem *item = &data->item; 

  url u;
  urlconnection urlconn;
  streamtokenizer st;
  URLNewAbsolute(&u, item->url);
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
  case 0: printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", item->url);
    break;
  case 200: printf("[%s] Indexing \"%s\"\n", u.serverName, item->title);
    STNew(&st, urlconn.dataStream, kTextDelimiters, false);
    ScanArticle(&st, data);
    STDispose(&st);
    break;
  case 301: 
  case 302:
  case 303: // just pretend we have the redirected URL, though index using the new URL and not the old one...
    strcpy(item->url,urlconn.newUrl);
    ParseArticle(data);
    break;
  default: printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", item->title, u.fullName, urlconn.responseCode);
    break;
  }
  
  // Each recursion level disposes its own connection/url here.
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Example #9
0
/**
 * PullAllNewsItems -- tokenizes the RSS stream behind urlconn and hands
 * every <item ...> entry it finds to ProcessSingleNewsItem.
 */
static void PullAllNewsItems(urlconnection *urlconn, rssData *allData ) 
{
  streamtokenizer itemTokenizer;

  STNew(&itemTokenizer, urlconn->dataStream, kTextDelimiters, false);
  // GetNextItemTag returning true means an <item ...> tag was just consumed.
  while (GetNextItemTag(&itemTokenizer)) {
    ProcessSingleNewsItem(&itemTokenizer, allData);
  }
  STDispose(&itemTokenizer);
}
Example #10
0
/**
 * PullAllNewsItems -- walks the RSS stream behind urlconn and dispatches
 * each <item ...> entry to ProcessSingleNewsItem with the shared hashsets.
 */
static void PullAllNewsItems(urlconnection *urlconn,  hashset *stopWords, hashset *prevSeenArticles, hashset *wordCounts)
{
  streamtokenizer itemTokenizer;

  STNew(&itemTokenizer, urlconn->dataStream, kTextDelimiters, false);
  // Each true return means an <item ...> tag was just pulled off the stream.
  while (GetNextItemTag(&itemTokenizer)) {
    ProcessSingleNewsItem(&itemTokenizer, stopWords, prevSeenArticles, wordCounts);
  }
  STDispose(&itemTokenizer);
}
/**
 * ReadThesaurus -- opens the comma/newline-delimited thesaurus file named
 * by filename and populates the thesaurus hashset via
 * TokenizeAndBuildThesaurus.  Exits with status 1 if the file can't be
 * opened.
 */
static void ReadThesaurus(hashset *thesaurus, const char *filename)
{
  FILE *fp = fopen(filename, "r");
  if (fp == NULL) {
    fprintf(stderr, "Could not open thesaurus file named \"%s\"\n", filename);
    exit(1);
  }

  streamtokenizer tokenizer;
  STNew(&tokenizer, fp, ",\n", false);
  TokenizeAndBuildThesaurus(thesaurus, &tokenizer);
  STDispose(&tokenizer);
  fclose(fp);
}
Example #12
0
/**
 * ReadStopwords -- opens the newline-delimited stop-word file named by
 * filename and populates the stopwords hashset via
 * TokenizeAndBuildStopwords.  Exits with status 1 if the file can't be
 * opened.
 */
static void ReadStopwords(hashset *stopwords, const char *filename)
{
  FILE *fp = fopen(filename, "r");
  if (fp == NULL) {
    fprintf(stderr, "Could not open Stopword file name \"%s\"\n", filename);
    exit(1);
  }

  streamtokenizer tokenizer;
  STNew(&tokenizer, fp, "\n", true);
  TokenizeAndBuildStopwords(stopwords, &tokenizer);
  STDispose(&tokenizer);
  fclose(fp);
}
/**
 * ParseArticle -- fetches articleURL and, if this article has not been seen
 * before, indexes its words into wordHash.  Duplicate detection goes through
 * the articlesSeen hashset, keyed by the article record.
 *
 * Ownership note (review): currArt holds strdup'd copies of the server,
 * title, and URL.  On the 200/not-seen path the record is entered into
 * articlesSeen, which presumably takes ownership of those strings; on every
 * other path ArticleFree releases them here.  TODO confirm HashSetEnter
 * copies the struct and the set's free function releases the strings.
 */
static void ParseArticle(const char *articleTitle,const char *articleDescription,
			 const char *articleURL,hashset* stopWords,
			 hashset* wordHash,hashset *articlesSeen)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
    
  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&urlconn, &u);

  article currArt;
  currArt.server = strdup(u.serverName);
  currArt.title = strdup(articleTitle);
  currArt.url = strdup(articleURL);
  currArt.numOccurrences = 0;

  switch (urlconn.responseCode) {
      case 0:   printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
	        ArticleFree(&currArt);
	        break;
      case 200:
	        if(HashSetLookup(articlesSeen,&currArt)== NULL){ //if we haven't seen this article before
		  printf("[%s] Indexing \"%s\"\n", u.serverName,articleTitle);
		  HashSetEnter(articlesSeen, &currArt);
		  STNew(&st, urlconn.dataStream, kTextDelimiters, false);
		  ScanArticle(&st, &currArt, stopWords, wordHash, articlesSeen);
		  STDispose(&st);
		  break;
		}
		else { //if we have seen it before
		  printf("[Ignoring  \"%s\": we've seen it before.]\n",
			 articleTitle);
		  ArticleFree(&currArt);
		  break;
		}
      case 301:
      case 302: // just pretend we have the redirected URL all along,though index using the new URL and not the old one...
	        ParseArticle(articleTitle, articleDescription,
			     urlconn.newUrl, stopWords, wordHash,articlesSeen);
		ArticleFree(&currArt);
		break;
      default:  printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
		ArticleFree(&currArt);
	        break;
  }
  
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Example #14
0
/**
 * ParseArticle -- fetches articleURL (unless it was already scanned) and
 * indexes its words through ScanArticle.  The explored vector doubles as
 * both the duplicate filter and the article registry; the article's index
 * in that vector identifies it to the scanner.
 */
static void ParseArticle(const char *articleURL, const char *articleTitle, rssData *allData)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  int articleIndex; 

  URLNewAbsolute(&u, articleURL);
  
  
  /* check to see if we've previously scanned the article. If the article we're processing 
   * has already been scanned release the url and return 
   */ 

  // Stack record with borrowed pointers -- used only for the search;
  // PersistArticle below presumably deep-copies before the vector keeps it.
  article a = {articleURL, articleTitle, u.serverName}; 


  if(VectorSearch(&allData->explored, &a, ArticleCmp, 0, false) >= 0) { 
    printf("[Pass. article already indexed: \"%s\"]\n", articleTitle); 
    URLDispose(&u); 
    return; 
  }

  URLConnectionNew(&urlconn, &u);
  switch (urlconn.responseCode) {
      case 0: printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
	      break;
      case 200: printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
	        STNew(&st, urlconn.dataStream, kTextDelimiters, false);
		PersistArticle(&a, articleURL, articleTitle, u.serverName); 
		VectorAppend(&allData->explored, &a);
		articleIndex = VectorLength(&allData->explored)-1; 
		ScanArticle(&st, &a, articleIndex, allData);
		STDispose(&st);
		break;
      case 301:
      case 302: // just pretend we have the redirected URL all along, though index using the new URL and not the old one...
	        ParseArticle(urlconn.newUrl, articleTitle, allData );
		break;
      default: printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
	       break;
  }
  
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Example #15
0
/**
 * loadStopWords -- creates hashset s (1009 buckets: first prime above the
 * roughly one thousand expected stop words) and fills it with a strdup'd
 * copy of every line in kStopListFile.  Asserts if the file can't be opened.
 */
static void loadStopWords(hashset *s)
{
    HashSetNew(s, sizeof(char *), 1009, StringHash, StringCmp, StringFree);

    FILE *stopFile = fopen(kStopListFile, "r");
    assert(stopFile != NULL);

    streamtokenizer tokenizer;
    char word[1024];
    STNew(&tokenizer, stopFile, kNewLineDelimiters, true);
    while (STNextToken(&tokenizer, word, sizeof(word))) {
        char *copy = strdup(word); // set owns the copy; StringFree releases it
        HashSetEnter(s, &copy);
    }
    STDispose(&tokenizer);
    fclose(stopFile);
}
Example #16
0
/**
 * Welcome -- prints the contents of the named welcome-text file to stdout,
 * one line at a time, followed by a blank line.  Asserts if the file can't
 * be opened.
 */
static void Welcome(const char *welcomeTextFileName)
{
  FILE *welcomeFile = fopen(welcomeTextFileName, "r");
  assert(welcomeFile != NULL);

  streamtokenizer tokenizer;
  char line[1024];
  STNew(&tokenizer, welcomeFile, kNewLineDelimiters, true);
  while (STNextToken(&tokenizer, line, sizeof(line))) {
    printf("%s\n", line);
  }
  printf("\n");

  STDispose(&tokenizer); // STDispose never closes the file -- STNew never opened it
  fclose(welcomeFile);
}
Example #17
0
/**
 * PullAllNewsItems -- streams the RSS XML behind urlconn through an expat
 * parser, one newline-delimited chunk at a time, dispatching tags and text
 * to ProcessStartTag/ProcessEndTag/ProcessTextData with dataPtr as the
 * parser's user data.
 *
 * NOTE(review): the return value of XML_Parse is ignored, so malformed XML
 * is silently dropped rather than reported -- TODO confirm that's intended.
 */
static void PullAllNewsItems(urlconnection *urlconn, rssFeedData *dataPtr)
{
  streamtokenizer st;
  char buffer[2048];

  XML_Parser rssFeedParser = XML_ParserCreate(NULL);
  XML_SetUserData(rssFeedParser, dataPtr);
  XML_SetElementHandler(rssFeedParser, ProcessStartTag, ProcessEndTag);
  XML_SetCharacterDataHandler(rssFeedParser, ProcessTextData);

  STNew(&st, urlconn->dataStream, "\n", false);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    XML_Parse(rssFeedParser, buffer, strlen(buffer), false);
  }
  STDispose(&st);
  
  XML_Parse(rssFeedParser, "", 0, true); // final call flags end-of-document to expat
  XML_ParserFree(rssFeedParser);  
}
Example #18
0
/**
 * BuildIndices -- opens the local feeds file named by feedsFileName and
 * processes every feed listed in it.  Each line is expected to look like
 * "<description>: <feed-url>"; everything up to and including the first
 * colon is skipped and the remaining token goes to ProcessFeed.  Asserts
 * if the file can't be opened.
 */
static void BuildIndices(const char *feedsFileName, rssData *allData )
{
  FILE *infile;
  streamtokenizer st;
  char remoteFileName[1024];
  
  infile = fopen(feedsFileName, "r");
  assert(infile != NULL);
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
    STSkipOver(&st, ": ");		 // now ignore the colon and any whitespace directly after it
    STNextToken(&st, remoteFileName, sizeof(remoteFileName));   
    ProcessFeed(remoteFileName, allData );
  }
  
  STDispose(&st);
  fclose(infile);
  printf("\n");
}
Example #19
0
/**
 * PullAllNewsItems -- streams the RSS XML behind urlconn through an expat
 * parser in newline-delimited chunks.  The local rssFeedState wraps the
 * database and travels through the parser as user data to the
 * ProcessStartTag/ProcessEndTag/ProcessTextData handlers.
 *
 * NOTE(review): the return value of XML_Parse is ignored, so malformed XML
 * is silently dropped rather than reported -- TODO confirm that's intended.
 */
static void PullAllNewsItems(rssDatabase *db, urlconnection *urlconn)
{
  rssFeedState state = {db}; // passed through the parser by address as auxiliary data.
  streamtokenizer st;
  char buffer[2048];

  XML_Parser rssFeedParser = XML_ParserCreate(NULL);
  XML_SetUserData(rssFeedParser, &state);
  XML_SetElementHandler(rssFeedParser, ProcessStartTag, ProcessEndTag);
  XML_SetCharacterDataHandler(rssFeedParser, ProcessTextData);

  STNew(&st, urlconn->dataStream, "\n", false);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    XML_Parse(rssFeedParser, buffer, strlen(buffer), false);
  }
  STDispose(&st);
  
  XML_Parse(rssFeedParser, "", 0, true); // instructs the xml parser that we're done parsing..
  XML_ParserFree(rssFeedParser);  
}
// There are roughly a thousand stop words, so kApproximateWordCount is
// chosen as the first prime greater than 1000.
/**
 * BuildStopWordsHashset -- creates the stopWords hashset (sized by
 * kApproximateWordCount) and fills it with a strdup'd copy of every line
 * of the named stop-words file.  Asserts if the file can't be opened.
 */
static void BuildStopWordsHashset(hashset *stopWords, const char *stopWordsFileName)
{
  FILE *stopFile = fopen(stopWordsFileName, "r");
  assert(stopFile != NULL);

  HashSetNew(stopWords, sizeof(char*), kApproximateWordCount, StringHash, StringCompare, StringFree);

  streamtokenizer tokenizer;
  char word[1024];
  STNew(&tokenizer, stopFile, kNewLineDelimiters, true);
  while (STNextToken(&tokenizer, word, sizeof(word))) {
    char *copy = strdup(word); // set owns the copy; StringFree releases it
    HashSetEnter(stopWords, &copy);
  }

  STDispose(&tokenizer);
  fclose(stopFile);
}
Example #21
0
/**
 * LoadStopWords -- fetches the stop-words file at stopWordsURL and enters a
 * strdup'd copy of every line into dataPtr->stopWords.  HTTP 3xx responses
 * are chased by recursing on the redirect URL.
 */
static void LoadStopWords(const char *stopWordsURL, rssFeedData *dataPtr) {
	url u;
	urlconnection conn;

	URLNewAbsolute(&u, stopWordsURL);
	URLConnectionNew(&conn, &u);

	if (conn.responseCode / 100 != 3) {
		streamtokenizer tokenizer;
		char line[4096];

		STNew(&tokenizer, conn.dataStream, kNewLineDelimiters, true);
		while (STNextToken(&tokenizer, line, sizeof(line))) {
			char *word = strdup(line); // set owns the copy
			HashSetEnter(&(dataPtr->stopWords), &word);
		}
		printf("\n");
		STDispose(&tokenizer);
	} else {
		// Redirected: the recursive call does all the real work.
		LoadStopWords(conn.newUrl, dataPtr);
	}

	URLConnectionDispose(&conn);
	URLDispose(&u);
}