Esempio n. 1
0
/**
 * Downloads one news article and indexes its words, unless an equal article
 * (per NewsArticleCompare) is already present in db->previouslySeenArticles.
 *
 * @param db            shared database; db->previouslySeenArticles is only
 *                      touched while db->locks.articlesVectorLock is held
 * @param articleTitle  title used in log output and in the stored record
 * @param articleURL    absolute URL of the article to fetch
 *
 * A 301/302 response is followed by recursing on the redirect target.
 * Every other response code is reported on stdout and the article is skipped.
 */
static void ParseArticle(rssDatabase *db, const char *articleTitle, const char *articleURL)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  int articleID;

  URLNewAbsolute(&u, articleURL);
  /* Search key only: it borrows the caller's title and the strings inside u. */
  rssNewsArticle newsArticle = { articleTitle, u.serverName, u.fullName };

  pthread_mutex_t *articlesLock = &(db->locks.articlesVectorLock);

  /* Early-out so that an already-known article is never downloaded. */
  pthread_mutex_lock(articlesLock);
  if (VectorSearch(&db->previouslySeenArticles, &newsArticle, NewsArticleCompare, 0, false) >= 0) {
    pthread_mutex_unlock(articlesLock);
    printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
    URLDispose(&u);
    return;
  }
  pthread_mutex_unlock(articlesLock);

  /* NOTE(review): lockConnection/unlockConnection are defined elsewhere;
   * presumably they limit concurrent connections per server - confirm. */
  lockConnection(db, u.serverName);
  URLConnectionNew(&urlconn, &u);
  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
      break;

    case 200:
      pthread_mutex_lock(articlesLock);
      /* The lock was released after the first check, so another thread may
       * have recorded this same article while we were connecting. Check
       * again inside the critical section that performs the append, so the
       * check and the insert are atomic with respect to other threads. */
      if (VectorSearch(&db->previouslySeenArticles, &newsArticle, NewsArticleCompare, 0, false) >= 0) {
        pthread_mutex_unlock(articlesLock);
        printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
        break;
      }

      printf("[%s] Indexing \"%s\"\n", u.serverName, articleTitle);
      /* NOTE(review): presumably replaces the borrowed pointers with copies
       * owned by the record before it is stored - confirm in NewsArticleClone. */
      NewsArticleClone(&newsArticle, articleTitle, u.serverName, u.fullName);

      VectorAppend(&db->previouslySeenArticles, &newsArticle);
      /* The article's ID is its position in the vector (just appended => last). */
      articleID = VectorLength(&db->previouslySeenArticles) - 1;
      pthread_mutex_unlock(articlesLock);

      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      ScanArticle(&st, articleID, &db->indices, &db->stopWords,
                  &(db->locks.indicesHashSetLock), &(db->locks.stopWordsHashSetLock));
      STDispose(&st);
      break;

    case 301:
    case 302: {
      /* Copy the redirect target to the stack first: the connection that owns
       * urlconn.newUrl is disposed before the recursive call. */
      int newURLLength = strlen(urlconn.newUrl) + 1;
      char newURLBuffer[newURLLength];
      strcpy(newURLBuffer, urlconn.newUrl);
      URLConnectionDispose(&urlconn);
      unlockConnection(db, u.serverName);
      URLDispose(&u);

      /* Index under the new URL, not the old one. */
      ParseArticle(db, articleTitle, newURLBuffer);
      return;
    }

    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
      break;
  }

  URLConnectionDispose(&urlconn);
  unlockConnection(db, u.serverName);
  URLDispose(&u);
}
Esempio n. 2
0
/**
 * Initializes `stopWords` and fills it with one heap-allocated string per
 * token read from the document at `stopWordsURL`.
 *
 * @param stopWords     hashset to create; its elements are char* values
 * @param stopWordsURL  absolute URL of the stop-word list
 *
 * Any 3xx response is followed by recursing on the redirect target; in that
 * case the set is created by the recursive call.
 */
static void LoadStopWords(hashset *stopWords, const char *stopWordsURL)
{
  url location;
  urlconnection connection;

  URLNewAbsolute(&location, stopWordsURL);
  URLConnectionNew(&connection, &location);

  if (connection.responseCode / 100 != 3) {
    char line[4096];
    streamtokenizer tokenizer;

    HashSetNew(stopWords, sizeof(char *), kNumStopWordsBuckets, StringHash, StringCompare, StringFree);
    STNew(&tokenizer, connection.dataStream, kNewLineDelimiters, true);
    for (;;) {
      if (!STNextToken(&tokenizer, line, sizeof(line))) {
        break;
      }
      /* The set stores the pointer itself, so each word needs its own copy. */
      char *ownedWord = strdup(line);
      HashSetEnter(stopWords, &ownedWord);
    }
    STDispose(&tokenizer);
  } else {
    /* Redirected: load from the new location instead. */
    LoadStopWords(stopWords, connection.newUrl);
  }

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
Esempio n. 3
0
/**
 * Reads the feeds file at `feedsFileURL` and calls ProcessFeed on every feed
 * URL it lists.
 *
 * @param db            database whose `indices` hashset and
 *                      `previouslySeenArticles` vector are created here
 * @param feedsFileURL  absolute URL of the feeds file
 *
 * For each line, everything up to and including the first colon (plus any
 * spaces right after it) is skipped and the next token is taken as the feed
 * URL. Any 3xx response is followed by recursing on the redirect target; in
 * that case the containers are created by the recursive call.
 */
static void BuildIndices(rssDatabase *db, const char *feedsFileURL)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) {
    BuildIndices(db, urlconn.newUrl);
  } else {
    streamtokenizer st;
    char remoteFileName[2048];
    HashSetNew(&db->indices, sizeof(rssIndexEntry), kNumIndexEntryBuckets, IndexEntryHash, IndexEntryCompare, IndexEntryFree);
    VectorNew(&db->previouslySeenArticles, sizeof(rssNewsArticle), NewsArticleFree, 0);

    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");               // now ignore the colon and any spaces directly after it
      if (!STNextToken(&st, remoteFileName, sizeof(remoteFileName))) {
        // No token was produced, so the buffer holds no fresh URL (it may
        // even be uninitialized on the first pass). Stop instead of handing
        // garbage to ProcessFeed.
        break;
      }
      ProcessFeed(db, remoteFileName);
    }

    printf("\n");
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Esempio n. 4
0
/**
 * Fetches the article described by the feed item inside `userData` and, on a
 * 200 response, tokenizes it and passes it to ScanArticle.
 *
 * @param userData  pointer to an rssFeedData; its `item` supplies the URL and
 *                  title, and item->url is overwritten when a redirect is
 *                  followed
 *
 * 301/302/303 responses are followed by storing the redirect target in
 * item->url and recursing. Other response codes are reported on stdout.
 */
static void ParseArticle(void *userData)
{
  rssFeedData *data = userData;
  rssFeedItem *item = &data->item;

  url location;
  urlconnection connection;
  streamtokenizer tokenizer;

  URLNewAbsolute(&location, item->url);
  URLConnectionNew(&connection, &location);

  const int status = connection.responseCode;
  if (status == 200) {
    printf("[%s] Indexing \"%s\"\n", location.serverName, item->title);
    STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
    ScanArticle(&tokenizer, data);
    STDispose(&tokenizer);
  } else if (status == 301 || status == 302 || status == 303) {
    /* Pretend we had the redirected URL all along: index under the new URL.
     * NOTE(review): strcpy does not bound the copy and the capacity of
     * item->url is not visible here - confirm it can always hold newUrl. */
    strcpy(item->url, connection.newUrl);
    ParseArticle(data);
  } else if (status == 0) {
    printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", item->url);
  } else {
    printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", item->title, location.fullName, connection.responseCode);
  }

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
Esempio n. 5
0
/**
 * Adds one heap-allocated string to `stopWords` for every token read from
 * the document at `StopWordsTextURL`.
 *
 * @param StopWordsTextURL  absolute URL of the stop-word list
 * @param stopWords         hashset receiving the words; it is not created
 *                          here, so the caller must have initialized it
 *
 * Any 3xx response is followed by recursing on the redirect target.
 */
static void LoadStopWords(const char *StopWordsTextURL, hashset* stopWords)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, StopWordsTextURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) {
    /* Follow the redirect with this same routine so the redirected list is
     * actually loaded into stopWords rather than merely fetched. */
    LoadStopWords(urlconn.newUrl, stopWords);
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      /* buffer is reused for every token, so each word gets its own copy. */
      char *copy = strdup(buffer);
      HashSetEnter(stopWords, &copy);
    }
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Esempio n. 6
0
/**
 * Reads the feeds file at `feedsFileURL` and calls ProcessFeed on every feed
 * URL it lists.
 *
 * @param feedsFileURL  absolute URL of the feeds file
 * @param data          passed through unchanged to ProcessFeed
 *
 * For each line, everything up to and including the first colon (plus any
 * spaces right after it) is skipped and the next token is taken as the feed
 * URL. Any 3xx response is followed by recursing on the redirect target.
 */
static void BuildIndices(const char *feedsFileURL ,rssFeedData * data)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    BuildIndices(urlconn.newUrl, data);
  } else {
    streamtokenizer st;
    char remoteDocumentURL[2048];

    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");               // now ignore the colon and any spaces directly after it
      if (!STNextToken(&st, remoteDocumentURL, sizeof(remoteDocumentURL))) {
        // No token was produced, so the buffer holds no fresh URL (it may
        // even be uninitialized on the first pass). Stop instead of handing
        // garbage to ProcessFeed.
        break;
      }
      ProcessFeed(remoteDocumentURL, data);
    }

    printf("\n");
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
Esempio n. 7
0
/**
 * Prints the document at `welcomeTextURL` to stdout, one token per line,
 * followed by a blank line.
 *
 * @param welcomeTextURL  absolute URL of the welcome text
 *
 * Any 3xx response is followed by recursing on the redirect target.
 */
static void Welcome(const char *welcomeTextURL)
{
  url location;
  urlconnection connection;

  URLNewAbsolute(&location, welcomeTextURL);
  URLConnectionNew(&connection, &location);

  if (connection.responseCode / 100 != 3) {
    char line[4096];
    streamtokenizer tokenizer;

    STNew(&tokenizer, connection.dataStream, kNewLineDelimiters, true);
    for (;;) {
      if (!STNextToken(&tokenizer, line, sizeof(line))) {
        break;
      }
      printf("%s\n", line);
    }
    printf("\n");
    fflush(stdout);
    /* The tokenizer did not open the stream, so disposing it does not close
     * it; the connection is released separately below. */
    STDispose(&tokenizer);
  } else {
    Welcome(connection.newUrl);
  }

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
Esempio n. 8
0
/**
 * Fetches the article at `articleURL` and, on a 200 response, tokenizes it
 * and passes it to ScanArticle.
 *
 * @param articleTitle        title, used in log output and passed to ScanArticle
 * @param articleDescription  description, passed through to ScanArticle
 * @param articleURL          absolute URL of the article
 * @param stopWords           passed through to ScanArticle
 * @param wordCounts          passed through to ScanArticle
 *
 * 301/302 responses are followed by recursing on the redirect target. Other
 * response codes are reported on stdout.
 */
static void ParseArticle(const char *articleTitle, const char *articleDescription, const char *articleURL,
                         hashset *stopWords, hashset *wordCounts)
{
  url location;
  urlconnection connection;
  streamtokenizer tokenizer;

  URLNewAbsolute(&location, articleURL);
  URLConnectionNew(&connection, &location);

  switch (connection.responseCode) {
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, location.serverName);
      STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
      ScanArticle(&tokenizer, articleTitle, articleDescription, articleURL, stopWords, wordCounts);
      STDispose(&tokenizer);
      break;

    case 301:
    case 302:
      /* Pretend we had the redirected URL all along: index under the new URL. */
      ParseArticle(articleTitle, articleDescription, connection.newUrl, stopWords, wordCounts);
      break;

    case 0:
      printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
      break;

    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, location.serverName, connection.responseCode);
      break;
  }

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
/**
 * Connects to the feed at `remoteDocumentName` and, on a 200 response, hands
 * the open connection to PullAllNewsItems.
 *
 * @param remoteDocumentName  absolute URL of the feed
 * @param stopWords           passed through to PullAllNewsItems
 * @param wordHash            passed through to PullAllNewsItems
 * @param articlesSeen        passed through to PullAllNewsItems
 *
 * 301/302 responses are followed by recursing on the redirect target. Other
 * response codes are reported on stdout.
 */
static void ProcessFeed(const char *remoteDocumentName, hashset*stopWords,
			hashset* wordHash, hashset *articlesSeen)
{
  url feed;
  urlconnection connection;

  URLNewAbsolute(&feed, remoteDocumentName);
  URLConnectionNew(&connection, &feed);

  const int status = connection.responseCode;
  if (status == 200) {
    PullAllNewsItems(&connection, stopWords, wordHash, articlesSeen);
  } else if (status == 301 || status == 302) {
    ProcessFeed(connection.newUrl, stopWords, wordHash, articlesSeen);
  } else if (status == 0) {
    printf("Unable to connect to \"%s\".  Ignoring...", feed.serverName);
  } else {
    printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n", feed.serverName, feed.fileName, connection.responseCode, connection.responseMessage);
  }

  URLConnectionDispose(&connection);
  URLDispose(&feed);
}
/**
 * Fetches the article at `articleURL` and, on a 200 response for an article
 * not yet in `articlesSeen`, records it there and passes it to ScanArticle.
 *
 * @param articleTitle        title, used in log output and copied into the record
 * @param articleDescription  only forwarded when a redirect is followed
 * @param articleURL          absolute URL of the article, copied into the record
 * @param stopWords           passed through to ScanArticle
 * @param wordHash            passed through to ScanArticle
 * @param articlesSeen        set of article records already indexed
 *
 * 301/302 responses are followed by recursing on the redirect target. The
 * locally built record is freed on every path except the one where it was
 * entered into `articlesSeen`.
 */
static void ParseArticle(const char *articleTitle,const char *articleDescription,
			 const char *articleURL,hashset* stopWords,
			 hashset* wordHash,hashset *articlesSeen)
{
  url location;
  urlconnection connection;
  streamtokenizer tokenizer;

  URLNewAbsolute(&location, articleURL);
  URLConnectionNew(&connection, &location);

  /* Build a record holding its own copies of the strings. */
  article record;
  record.server = strdup(location.serverName);
  record.title = strdup(articleTitle);
  record.url = strdup(articleURL);
  record.numOccurrences = 0;

  /* Set once the record has been entered into articlesSeen; from then on its
   * strings must not be freed here. */
  int recordStored = 0;

  switch (connection.responseCode) {
    case 200:
      if (HashSetLookup(articlesSeen, &record) != NULL) {
        /* Already indexed earlier. */
        printf("[Ignoring  \"%s\": we've seen it before.]\n",
               articleTitle);
      } else {
        printf("[%s] Indexing \"%s\"\n", location.serverName, articleTitle);
        HashSetEnter(articlesSeen, &record);
        recordStored = 1;
        STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
        ScanArticle(&tokenizer, &record, stopWords, wordHash, articlesSeen);
        STDispose(&tokenizer);
      }
      break;

    case 301:
    case 302:
      /* Pretend we had the redirected URL all along: index under the new URL. */
      ParseArticle(articleTitle, articleDescription,
                   connection.newUrl, stopWords, wordHash, articlesSeen);
      break;

    case 0:
      printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
      break;

    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, location.serverName, connection.responseCode);
      break;
  }

  if (!recordStored) {
    ArticleFree(&record);
  }

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
Esempio n. 11
0
/**
 * Fetches the article at `articleURL` and, on a 200 response, appends a
 * record for it to allData->explored and passes it to ScanArticle.
 *
 * @param articleURL    absolute URL of the article
 * @param articleTitle  title, used in log output and in the record
 * @param allData       holds the `explored` vector of already indexed articles
 *
 * Articles already found in allData->explored (per ArticleCmp) are skipped
 * before any connection is opened. 301/302 responses are followed by
 * recursing on the redirect target.
 */
static void ParseArticle(const char *articleURL, const char *articleTitle, rssData *allData)
{
  url location;
  urlconnection connection;
  streamtokenizer tokenizer;
  int position;

  URLNewAbsolute(&location, articleURL);

  /* Search key that borrows the caller's strings and the parsed server name. */
  article candidate = {articleURL, articleTitle, location.serverName};

  /* Already scanned: release the url and stop before opening a connection. */
  const int alreadyIndexed =
      VectorSearch(&allData->explored, &candidate, ArticleCmp, 0, false) >= 0;
  if (alreadyIndexed) {
    printf("[Pass. article already indexed: \"%s\"]\n", articleTitle);
    URLDispose(&location);
    return;
  }

  URLConnectionNew(&connection, &location);
  switch (connection.responseCode) {
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, location.serverName);
      STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
      /* NOTE(review): presumably gives the record its own copies of the
       * strings before it is stored - confirm in PersistArticle. */
      PersistArticle(&candidate, articleURL, articleTitle, location.serverName);
      VectorAppend(&allData->explored, &candidate);
      /* The record was just appended, so its index is the last position. */
      position = VectorLength(&allData->explored) - 1;
      ScanArticle(&tokenizer, &candidate, position, allData);
      STDispose(&tokenizer);
      break;

    case 301:
    case 302:
      /* Pretend we had the redirected URL all along: index under the new URL. */
      ParseArticle(connection.newUrl, articleTitle, allData);
      break;

    case 0:
      printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
      break;

    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, location.serverName, connection.responseCode);
      break;
  }

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
Esempio n. 12
0
/**
 * Adds one heap-allocated string to dataPtr->stopWords for every token read
 * from the document at `stopWordsURL`, then prints a newline.
 *
 * @param stopWordsURL  absolute URL of the stop-word list
 * @param dataPtr       owner of the `stopWords` hashset; the set is not
 *                      created here
 *
 * Any 3xx response is followed by recursing on the redirect target.
 */
static void LoadStopWords(const char *stopWordsURL, rssFeedData *dataPtr) {
	url location;
	urlconnection connection;

	URLNewAbsolute(&location, stopWordsURL);
	URLConnectionNew(&connection, &location);

	if (connection.responseCode / 100 == 3) {
		/* Redirected: load from the new location, then release this connection. */
		LoadStopWords(connection.newUrl, dataPtr);
		URLConnectionDispose(&connection);
		URLDispose(&location);
		return;
	}

	streamtokenizer tokenizer;
	char line[4096];

	STNew(&tokenizer, connection.dataStream, kNewLineDelimiters, true);
	while (STNextToken(&tokenizer, line, sizeof(line))) {
		/* line is reused for every token, so each word gets its own copy. */
		char *ownedWord = strdup(line);
		HashSetEnter(&(dataPtr->stopWords), &ownedWord);
	}
	printf("\n");
	STDispose(&tokenizer);

	URLConnectionDispose(&connection);
	URLDispose(&location);
}