예제 #1
0
/**
 * Downloads the article at articleURL and indexes it into db, unless an
 * equivalent entry is already present in db->previouslySeenArticles.
 *
 * @param db            database holding the seen-articles vector, the word
 *                      indices, the stop words and the locks guarding them
 * @param articleTitle  title used for the seen-check and for log output
 * @param articleURL    address of the article to fetch
 *
 * A 301/302 response is followed by releasing everything held for the old
 * URL and recursing on the redirect target.
 *
 * NOTE(review): the seen-check and the later append run in two separate
 * critical sections, so two threads could both pass the check for the same
 * article - confirm whether callers can race on one article.
 */
static void ParseArticle(rssDatabase *db, const char *articleTitle, const char *articleURL)
{
  url location;
  urlconnection connection;
  streamtokenizer tokenizer;
  int articleID;

  URLNewAbsolute(&location, articleURL);
  rssNewsArticle candidate = { articleTitle, location.serverName, location.fullName };

  /* Look the article up while holding the vector lock, then release it
   * before doing anything slow. */
  pthread_mutex_t *seenLock = &(db->locks.articlesVectorLock);
  pthread_mutex_lock(seenLock);
  int alreadySeen =
    VectorSearch(&db->previouslySeenArticles, &candidate, NewsArticleCompare, 0, false) >= 0;
  pthread_mutex_unlock(seenLock);

  if (alreadySeen) {
    printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
    URLDispose(&location);
    return;
  }

  /* presumably limits concurrent connections per server - verify against lockConnection */
  lockConnection(db, location.serverName);
  URLConnectionNew(&connection, &location);

  if (connection.responseCode == 301 || connection.responseCode == 302) {
    /* Pretend we had the redirected URL all along, but index using the new
     * URL rather than the old one.  The target is copied to the stack
     * first because the connection is disposed before recursing. */
    int targetSize = strlen(connection.newUrl) + 1;
    char target[targetSize];
    strcpy(target, connection.newUrl);

    URLConnectionDispose(&connection);
    unlockConnection(db, location.serverName);
    URLDispose(&location);

    ParseArticle(db, articleTitle, target);
    return;
  }

  switch (connection.responseCode) {
    case 200:
      /* Record the article and obtain its index under the vector lock. */
      pthread_mutex_lock(seenLock);
      printf("[%s] Indexing \"%s\"\n", location.serverName, articleTitle);
      NewsArticleClone(&candidate, articleTitle, location.serverName, location.fullName);
      VectorAppend(&db->previouslySeenArticles, &candidate);
      articleID = VectorLength(&db->previouslySeenArticles) - 1;
      pthread_mutex_unlock(seenLock);

      STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
      ScanArticle(&tokenizer, articleID, &db->indices, &db->stopWords,
                  &(db->locks.indicesHashSetLock), &(db->locks.stopWordsHashSetLock));
      STDispose(&tokenizer);
      break;

    case 0:
      printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
      break;

    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, location.serverName, connection.responseCode);
      break;
  }

  URLConnectionDispose(&connection);
  unlockConnection(db, location.serverName);
  URLDispose(&location);
}
예제 #2
0
/**
 * Fetches the article described by the feed item embedded in userData and,
 * on a 200 response, tokenizes the response stream and hands it to
 * ScanArticle.
 *
 * @param userData  pointer to an rssFeedData whose item supplies the url
 *                  and title
 *
 * On a 301/302/303 response the item's url is overwritten with the redirect
 * target and the function recurses.  The outer connection stays open until
 * the recursive call returns.
 */
static void ParseArticle(void *userData)
{
  rssFeedData *feed = userData;
  rssFeedItem *entry = &feed->item;

  url location;
  urlconnection connection;
  streamtokenizer tokenizer;

  URLNewAbsolute(&location, entry->url);
  URLConnectionNew(&connection, &location);

  switch (connection.responseCode) {
  case 200:
    printf("[%s] Indexing \"%s\"\n", location.serverName, entry->title);
    STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
    ScanArticle(&tokenizer, feed);
    STDispose(&tokenizer);
    break;

  case 301:
  case 302:
  case 303:
    /* Pretend we have the redirected URL, though index using the new URL
     * and not the old one.
     * NOTE(review): assumes entry->url is large enough for newUrl - confirm. */
    strcpy(entry->url, connection.newUrl);
    ParseArticle(feed);
    break;

  case 0:
    printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", entry->url);
    break;

  default:
    printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
           entry->title, location.fullName, connection.responseCode);
    break;
  }

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
예제 #3
0
/**
 * Thread entry point: unpacks the threadArguments passed as threadData,
 * runs ParseArticle on its db/title/URL fields, then ends the calling
 * thread with a NULL exit value.
 *
 * @param threadData  pointer to a threadArguments structure
 */
static void *PthreadParseArticle(void *threadData)
{
  threadArguments *job = threadData;

  ParseArticle(job->db, job->title, job->URL);

  /* Does not return; the thread's exit value is NULL. */
  pthread_exit(NULL);
}
예제 #4
0
/**
 * Connects to articleURL and, on a 200 response, tokenizes the response
 * stream and passes it to ScanArticle together with the article metadata
 * and the two hashsets.
 *
 * @param articleTitle        title, forwarded to ScanArticle and used in logs
 * @param articleDescription  description, forwarded to ScanArticle
 * @param articleURL          address to fetch
 * @param stopWords           forwarded to ScanArticle
 * @param wordCounts          forwarded to ScanArticle
 *
 * A 301/302 response recurses on the redirect target while the original
 * connection is still open; it is disposed once the recursion returns.
 */
static void ParseArticle(const char *articleTitle, const char *articleDescription, const char *articleURL,
                         hashset *stopWords, hashset *wordCounts)
{
  url location;
  urlconnection connection;
  streamtokenizer tokenizer;

  URLNewAbsolute(&location, articleURL);
  URLConnectionNew(&connection, &location);

  switch (connection.responseCode) {
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, location.serverName);
      STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
      ScanArticle(&tokenizer, articleTitle, articleDescription, articleURL, stopWords, wordCounts);
      STDispose(&tokenizer);
      break;

    case 301:
    case 302:
      /* Pretend we had the redirected URL all along, though index using the
       * new URL and not the old one. */
      ParseArticle(articleTitle, articleDescription, connection.newUrl, stopWords, wordCounts);
      break;

    case 0:
      printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
      break;

    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, location.serverName, connection.responseCode);
      break;
  }

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
예제 #5
0
/**
 * End-of-element handler: clears the item's active field and, when the
 * element that just closed is named "item" (compared case-insensitively),
 * runs ParseArticle on the accumulated feed data.
 *
 * @param userData  pointer to the rssFeedData being populated
 * @param name      name of the element that just ended
 */
static void ProcessEndTag(void *userData, const char *name)
{
  rssFeedData *feed = userData;

  /* No field is being collected once any element closes. */
  feed->item.activeField = NULL;

  if (strcasecmp(name, "item") != 0)
    return;

  ParseArticle(userData);
}
예제 #6
0
/**
 * Connects to articleURL and, on a 200 response for an article that is not
 * yet in articlesSeen, records the article in that set and scans its text.
 *
 * @param articleTitle        title used for the article record and logs
 * @param articleDescription  only forwarded when following a redirect
 * @param articleURL          address to fetch
 * @param stopWords           forwarded to ScanArticle
 * @param wordHash            forwarded to ScanArticle
 * @param articlesSeen        set consulted and extended with article records
 *
 * A local article record with strdup'd server/title/url strings is built up
 * front.  It is released with ArticleFree on every path except the one that
 * enters it into articlesSeen.
 */
static void ParseArticle(const char *articleTitle,const char *articleDescription,
			 const char *articleURL,hashset* stopWords,
			 hashset* wordHash,hashset *articlesSeen)
{
  url location;
  urlconnection connection;
  streamtokenizer tokenizer;

  URLNewAbsolute(&location, articleURL);
  URLConnectionNew(&connection, &location);

  article candidate;
  candidate.server = strdup(location.serverName);
  candidate.title = strdup(articleTitle);
  candidate.url = strdup(articleURL);
  candidate.numOccurrences = 0;

  /* Set once the record has been entered into articlesSeen; presumably the
   * set's copy then shares the strdup'd strings - confirm against HashSetEnter. */
  int enteredIntoSet = 0;

  switch (connection.responseCode) {
    case 200:
      if (HashSetLookup(articlesSeen, &candidate) != NULL) {
        /* We have seen this article before. */
        printf("[Ignoring  \"%s\": we've seen it before.]\n",
               articleTitle);
        break;
      }
      printf("[%s] Indexing \"%s\"\n", location.serverName, articleTitle);
      HashSetEnter(articlesSeen, &candidate);
      enteredIntoSet = 1;
      STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
      ScanArticle(&tokenizer, &candidate, stopWords, wordHash, articlesSeen);
      STDispose(&tokenizer);
      break;

    case 301:
    case 302:
      /* Pretend we had the redirected URL all along, though index using the
       * new URL and not the old one. */
      ParseArticle(articleTitle, articleDescription,
                   connection.newUrl, stopWords, wordHash, articlesSeen);
      break;

    case 0:
      printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
      break;

    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, location.serverName, connection.responseCode);
      break;
  }

  if (!enteredIntoSet)
    ArticleFree(&candidate);

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
예제 #7
0
/**
 * Fetches and scans the article at articleURL unless a matching entry
 * (per ArticleCmp) already exists in allData->explored.
 *
 * @param articleURL    address to fetch
 * @param articleTitle  title used for the lookup key and log output
 * @param allData       aggregate state; its explored vector is searched and,
 *                      on a 200 response, extended with this article
 *
 * A 301/302 response recurses on the redirect target while the original
 * connection is still open.
 */
static void ParseArticle(const char *articleURL, const char *articleTitle, rssData *allData)
{
  url location;
  urlconnection connection;
  streamtokenizer tokenizer;

  URLNewAbsolute(&location, articleURL);

  /* Lookup key that borrows the caller's strings and the parsed server name. */
  article record = {articleURL, articleTitle, location.serverName};

  /* Already scanned: release the url and stop before opening a connection. */
  if (VectorSearch(&allData->explored, &record, ArticleCmp, 0, false) >= 0) {
    printf("[Pass. article already indexed: \"%s\"]\n", articleTitle);
    URLDispose(&location);
    return;
  }

  URLConnectionNew(&connection, &location);

  switch (connection.responseCode) {
    case 200: {
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, location.serverName);
      STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
      /* presumably replaces the borrowed strings with owned copies before
       * the record is stored - verify against PersistArticle */
      PersistArticle(&record, articleURL, articleTitle, location.serverName);
      VectorAppend(&allData->explored, &record);
      int position = VectorLength(&allData->explored) - 1;
      ScanArticle(&tokenizer, &record, position, allData);
      STDispose(&tokenizer);
      break;
    }

    case 301:
    case 302:
      /* Pretend we had the redirected URL all along, though index using the
       * new URL and not the old one. */
      ParseArticle(connection.newUrl, articleTitle, allData);
      break;

    case 0:
      printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
      break;

    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, location.serverName, connection.responseCode);
      break;
  }

  URLConnectionDispose(&connection);
  URLDispose(&location);
}
예제 #8
0
/**
 * Reads the tags of one feed item from st, up to the closing item tag or
 * the end of the tag stream, collecting its title, description and link.
 * The article is then parsed if isNewArticle accepts it; otherwise a
 * "skipping" message is printed.
 *
 * @param st                tokenizer positioned inside an item
 * @param stopWords         forwarded to ParseArticle
 * @param prevSeenArticles  passed to isNewArticle
 * @param wordCounts        forwarded to ParseArticle
 *
 * Items without a link are dropped silently.
 */
static void ProcessSingleNewsItem(streamtokenizer *st,  hashset *stopWords, hashset *prevSeenArticles, hashset *wordCounts)
{
  char tag[1024];
  char title[1024];
  char description[1024];
  char link[1024];

  title[0] = '\0';
  description[0] = '\0';
  link[0] = '\0';

  while (GetNextTag(st, tag, sizeof(tag))) {
    if (strcasecmp(tag, kItemEndTag) == 0)
      break;

    if (strncasecmp(tag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0)
      ExtractElement(st, tag, title, sizeof(title));
    if (strncasecmp(tag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0)
      ExtractElement(st, tag, description, sizeof(description));
    if (strncasecmp(tag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0)
      ExtractElement(st, tag, link, sizeof(link));
  }

  /* Punt, since an empty link is not going to take us anywhere. */
  if (link[0] == '\0')
    return;

  if (!isNewArticle(prevSeenArticles, title, link)) {
    printf("Skipping previously seen article: \"%s\"\n\tfrom \"%s\"\n", title, link);
    return;
  }

  ParseArticle(title, description, link, stopWords, wordCounts);
}
예제 #9
0
/**
 * Reads the tags of one feed item from st, up to the closing item tag or
 * the end of the tag stream, collecting its title, description and link,
 * then hands them to ParseArticle.
 *
 * @param st            tokenizer positioned inside an item
 * @param stopWords     forwarded to ParseArticle
 * @param wordHash      forwarded to ParseArticle
 * @param articlesSeen  forwarded to ParseArticle
 *
 * Items without a link are dropped silently.
 */
static void ProcessSingleNewsItem(streamtokenizer *st, hashset *stopWords,
				  hashset * wordHash, hashset *articlesSeen)
{
  char tag[1024];
  char title[1024];
  char description[1024];
  char link[1024];

  title[0] = '\0';
  description[0] = '\0';
  link[0] = '\0';

  while (GetNextTag(st, tag, sizeof(tag))) {
    if (strcasecmp(tag, kItemEndTag) == 0)
      break;

    if (strncasecmp(tag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0)
      ExtractElement(st, tag, title, sizeof(title));
    if (strncasecmp(tag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0)
      ExtractElement(st, tag, description, sizeof(description));
    if (strncasecmp(tag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0)
      ExtractElement(st, tag, link, sizeof(link));
  }

  /* Punt, since an empty link is not going to take us anywhere. */
  if (link[0] == '\0')
    return;

  ParseArticle(title, description, link, stopWords, wordHash, articlesSeen);
}
예제 #10
0
/**
 * Reads the tags of one feed item from st, collecting its title,
 * description and link, then passes the link and title to ParseArticle.
 *
 * @param st       tokenizer positioned inside an item
 * @param allData  forwarded to ParseArticle
 *
 * Reading stops at the closing item tag, at the end of the tag stream, or
 * once five tags have been examined, whichever comes first.  Items without
 * a link are dropped silently.
 */
static void ProcessSingleNewsItem(streamtokenizer *st, rssData *allData)
{
  /* NOTE(review): the five-tag cap looks like a debugging limit - confirm it is intended. */
  enum { kMaxTagsExamined = 5 };

  char tag[1024];
  char title[1024];
  char description[1024];
  char link[1024];

  title[0] = '\0';
  description[0] = '\0';
  link[0] = '\0';

  for (int examined = 0; examined < kMaxTagsExamined; examined++) {
    if (!GetNextTag(st, tag, sizeof(tag)))
      break;
    if (strcasecmp(tag, kItemEndTag) == 0)
      break;

    if (strncasecmp(tag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0)
      ExtractElement(st, tag, title, sizeof(title));
    if (strncasecmp(tag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0)
      ExtractElement(st, tag, description, sizeof(description));
    if (strncasecmp(tag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0)
      ExtractElement(st, tag, link, sizeof(link));
  }

  /* Punt, since an empty link is not going to take us anywhere. */
  if (link[0] == '\0')
    return;

  ParseArticle(link, title, allData);
}