Example #1
0
/*
 * BuildIndices
 * ------------
 * Opens the document at feedsFileURL and hands every feed URL it lists to
 * ProcessFeed.  For each entry, everything up to the first ':' is skipped,
 * then any run of ':' and ' ' characters, and the next token (split using
 * kNewLineDelimiters) is taken as the feed URL.
 *
 * A 3xx response code is handled by recursing on the redirect target
 * reported by the connection (urlconn.newUrl).
 *
 * feedsFileURL: URL of the document that lists the feeds.
 * dataPtr:      forwarded unchanged to ProcessFeed (and to the recursive call).
 */
static void BuildIndices(const char *feedsFileURL, rssFeedData *dataPtr)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    BuildIndices(urlconn.newUrl, dataPtr);
  } else {
    streamtokenizer st;
    char remoteDocumentURL[2048];

    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the entry
      STSkipOver(&st, ": ");               // now ignore the colon and any spaces directly after it
      // Only hand the buffer to ProcessFeed when a token was actually read.
      // Without this check, a trailing "name:" with nothing after it would
      // pass an uninitialised (or stale) buffer on as a URL.
      if (STNextToken(&st, remoteDocumentURL, sizeof(remoteDocumentURL))) {
        ProcessFeed(remoteDocumentURL, dataPtr);
      }
    }

    printf("\n");
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/*
 * ParseArticle
 * ------------
 * Connects to articleURL and, depending on the response code:
 *   0        - reports that no connection could be made;
 *   200      - indexes the article via ScanArticle unless an equal record is
 *              already present in articlesSeen, in which case it is skipped;
 *   301/302  - recurses on the redirect target (urlconn.newUrl);
 *   other    - reports the response code and gives up.
 *
 * A temporary article record (duplicated server name, title and URL) is
 * built up front.  It is released with ArticleFree on every path except the
 * one where it is entered into articlesSeen.
 *
 * articleDescription is not used here other than being forwarded on a
 * redirect.
 */
static void ParseArticle(const char *articleTitle,const char *articleDescription,
			 const char *articleURL,hashset* stopWords,
			 hashset* wordHash,hashset *articlesSeen)
{
  url articleLocation;
  urlconnection connection;
  streamtokenizer tokenizer;
  bool recordKept = false; // set once the record has been entered into articlesSeen

  URLNewAbsolute(&articleLocation, articleURL);
  URLConnectionNew(&connection, &articleLocation);

  article record;
  record.server = strdup(articleLocation.serverName);
  record.title = strdup(articleTitle);
  record.url = strdup(articleURL);
  record.numOccurrences = 0;

  if (connection.responseCode == 0) {
    printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
  } else if (connection.responseCode == 200) {
    if (HashSetLookup(articlesSeen, &record) != NULL) {
      // an equal record is already in the set: nothing to index
      printf("[Ignoring  \"%s\": we've seen it before.]\n",
             articleTitle);
    } else {
      // first sighting: remember the article, then scan its contents
      printf("[%s] Indexing \"%s\"\n", articleLocation.serverName, articleTitle);
      HashSetEnter(articlesSeen, &record);
      recordKept = true;
      STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
      ScanArticle(&tokenizer, &record, stopWords, wordHash, articlesSeen);
      STDispose(&tokenizer);
    }
  } else if (connection.responseCode == 301 || connection.responseCode == 302) {
    // behave as if the redirected URL had been supplied from the start,
    // so the article is indexed under the new URL rather than the old one
    ParseArticle(articleTitle, articleDescription,
                 connection.newUrl, stopWords, wordHash, articlesSeen);
  } else {
    printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, articleLocation.serverName, connection.responseCode);
  }

  if (!recordKept) {
    ArticleFree(&record);
  }

  URLConnectionDispose(&connection);
  URLDispose(&articleLocation);
}
Example #3
0
/*
 * ParseArticle
 * ------------
 * Scans the article at articleURL unless an equal record is already present
 * in allData->explored.
 *
 * The duplicate check happens before any connection is opened; when it hits,
 * only the url is disposed and the function returns.  Otherwise the response
 * code decides what happens:
 *   0        - report that no connection could be made;
 *   200      - persist the record, append it to allData->explored and scan
 *              the article, passing along the index of the appended record;
 *   301/302  - recurse on the redirect target (urlconn.newUrl);
 *   other    - report the response code and give up.
 */
static void ParseArticle(const char *articleURL, const char *articleTitle, rssData *allData)
{
  url articleLocation;
  urlconnection connection;

  URLNewAbsolute(&articleLocation, articleURL);

  // Probe record used for the "already explored?" lookup.
  article probe = {articleURL, articleTitle, articleLocation.serverName};

  if (VectorSearch(&allData->explored, &probe, ArticleCmp, 0, false) >= 0) {
    printf("[Pass. article already indexed: \"%s\"]\n", articleTitle);
    URLDispose(&articleLocation);
    return;
  }

  URLConnectionNew(&connection, &articleLocation);

  if (connection.responseCode == 0) {
    printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
  } else if (connection.responseCode == 200) {
    streamtokenizer tokenizer;
    int newestIndex;

    printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, articleLocation.serverName);
    STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
    PersistArticle(&probe, articleURL, articleTitle, articleLocation.serverName);
    VectorAppend(&allData->explored, &probe);
    // the record was just appended, so it sits at the last position
    newestIndex = VectorLength(&allData->explored) - 1;
    ScanArticle(&tokenizer, &probe, newestIndex, allData);
    STDispose(&tokenizer);
  } else if (connection.responseCode == 301 || connection.responseCode == 302) {
    // behave as if the redirected URL had been supplied from the start,
    // so the article is indexed under the new URL rather than the old one
    ParseArticle(connection.newUrl, articleTitle, allData );
  } else {
    printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, articleLocation.serverName, connection.responseCode);
  }

  URLConnectionDispose(&connection);
  URLDispose(&articleLocation);
}
Example #4
0
/*
 * ProcessFeed
 * -----------
 * Connects to the feed named by remoteDocumentName and dispatches on the
 * response code:
 *   0        - report that no connection could be made;
 *   200      - hand the open connection to PullAllNewsItems;
 *   301/302  - recurse on the redirect target (urlconn.newUrl);
 *   other    - report the server, file, response code and response message.
 *
 * The connection and url are disposed before returning in every case.
 */
static void ProcessFeed(rssDatabase *db, const char *remoteDocumentName)
{
  url feedLocation;
  urlconnection connection;

  URLNewAbsolute(&feedLocation, remoteDocumentName);
  URLConnectionNew(&connection, &feedLocation);

  if (connection.responseCode == 0) {
    printf("Unable to connect to \"%s\".  Ignoring...", feedLocation.serverName);
  } else if (connection.responseCode == 200) {
    PullAllNewsItems(db, &connection);
  } else if (connection.responseCode == 301 || connection.responseCode == 302) {
    ProcessFeed(db, connection.newUrl);
  } else {
    printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n",
           feedLocation.serverName, feedLocation.fileName,
           connection.responseCode, connection.responseMessage);
  }

  URLConnectionDispose(&connection);
  URLDispose(&feedLocation);
}
Example #5
0
/*
 * ProcessFeed
 * -----------
 * Opens the feed at remoteDocumentName and acts on the response code:
 * a 200 passes the connection (plus the three hashsets) to PullAllNewsItems,
 * a 301/302 recurses on the redirect target, a 0 reports that the server
 * could not be reached, and anything else reports the response code and
 * message.
 *
 * stopWords, prevSeenArticles and wordCounts are only forwarded; this
 * function does not touch them itself.
 */
static void ProcessFeed(const char *remoteDocumentName, hashset *stopWords, hashset *prevSeenArticles, hashset *wordCounts)
{
  url feedURL;
  urlconnection feedConn;

  URLNewAbsolute(&feedURL, remoteDocumentName);
  URLConnectionNew(&feedConn, &feedURL);

  switch (feedConn.responseCode) {
    case 200:
      PullAllNewsItems(&feedConn, stopWords, prevSeenArticles, wordCounts);
      break;

    case 301:
    case 302:
      // follow the redirect with the same set of tables
      ProcessFeed(feedConn.newUrl, stopWords, prevSeenArticles, wordCounts);
      break;

    case 0:
      printf("Unable to connect to \"%s\".  Ignoring...", feedURL.serverName);
      break;

    default:
      printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n",
             feedURL.serverName, feedURL.fileName,
             feedConn.responseCode, feedConn.responseMessage);
      break;
  }

  URLConnectionDispose(&feedConn);
  URLDispose(&feedURL);
}
Example #6
0
/*
 * LoadStopWords
 * -------------
 * Reads the document at stopWordsURL token by token (split using
 * kNewLineDelimiters) and enters a heap-allocated copy of each token into
 * dataPtr->stopWords.  What gets stored is the char * itself: the address
 * of the pointer is passed to HashSetEnter.
 *
 * A 3xx response is handled by recursing on the redirect target.
 */
static void LoadStopWords(const char *stopWordsURL, rssFeedData *dataPtr) {
	url listLocation;
	urlconnection connection;

	URLNewAbsolute(&listLocation, stopWordsURL);
	URLConnectionNew(&connection, &listLocation);

	const bool redirected = (connection.responseCode / 100 == 3);

	if (!redirected) {
		streamtokenizer tokenizer;
		char line[4096];

		STNew(&tokenizer, connection.dataStream, kNewLineDelimiters, true);
		while (STNextToken(&tokenizer, line, sizeof(line))) {
			// line is reused on every iteration, so store a private copy
			char *word = strdup(line);
			HashSetEnter(&dataPtr->stopWords, &word);
		}
		printf("\n");
		STDispose(&tokenizer);
	} else {
		LoadStopWords(connection.newUrl, dataPtr);
	}

	URLConnectionDispose(&connection);
	URLDispose(&listLocation);
}
Example #7
0
/*
 * Welcome
 * -------
 * Fetches the document at welcomeTextURL and prints it to stdout, one token
 * (split using kNewLineDelimiters) per output line, followed by a blank
 * line.  A 3xx response is handled by recursing on the redirect target.
 */
static void Welcome(const char *welcomeTextURL)
{
  url textLocation;
  urlconnection connection;

  URLNewAbsolute(&textLocation, welcomeTextURL);
  URLConnectionNew(&connection, &textLocation);

  if (connection.responseCode / 100 != 3) {
    streamtokenizer tokenizer;
    char line[4096];

    STNew(&tokenizer, connection.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&tokenizer, line, sizeof(line))) {
      printf("%s\n", line);
    }
    printf("\n");
    // The tokenizer did not open the stream, so disposing it here is kept
    // separate from disposing the connection below.
    STDispose(&tokenizer);
  } else {
    // redirected: show the text at the new location instead
    Welcome(connection.newUrl);
  }

  URLConnectionDispose(&connection);
  URLDispose(&textLocation);
}