/**
 * Pulls down a single article over HTTP and indexes its words (threaded variant).
 *
 * @param db            shared database: previously-seen-articles vector, indices,
 *                      stop words, and the mutexes guarding them.
 * @param articleTitle  display title of the article.
 * @param articleURL    absolute URL to fetch.
 *
 * Fixes over the original:
 *  - The default-case printf format string was split by a raw newline in the
 *    source (invalid string literal); it is now a single literal ending in \n.
 *  - The duplicate check released articlesVectorLock before the case-200 append,
 *    so two threads racing on the same URL could both index it (TOCTOU).  The
 *    check is now repeated under the lock immediately before appending.
 */
static void ParseArticle(rssDatabase *db, const char *articleTitle, const char *articleURL)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  int articleID;

  URLNewAbsolute(&u, articleURL);
  rssNewsArticle newsArticle = { articleTitle, u.serverName, u.fullName };
  pthread_mutex_t *articlesLock = &(db->locks.articlesVectorLock);

  /* Fast-path duplicate check: skip the network fetch entirely if we have
     already indexed this title/server/url triple. */
  pthread_mutex_lock(articlesLock);
  if (VectorSearch(&db->previouslySeenArticles, &newsArticle, NewsArticleCompare, 0, false) >= 0) {
    pthread_mutex_unlock(articlesLock);
    printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
    URLDispose(&u);
    return;
  }
  pthread_mutex_unlock(articlesLock);

  /* Serialize connections per server (politeness / connection limit). */
  lockConnection(db, u.serverName);
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      pthread_mutex_lock(articlesLock);
      /* Re-check under the lock: another thread may have indexed this article
         between the unlocked fast-path check above and now. */
      if (VectorSearch(&db->previouslySeenArticles, &newsArticle, NewsArticleCompare, 0, false) >= 0) {
        pthread_mutex_unlock(articlesLock);
        printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
        break;
      }
      printf("[%s] Indexing \"%s\"\n", u.serverName, articleTitle);
      /* Clone makes deep copies; the vector owns the copies from here on. */
      NewsArticleClone(&newsArticle, articleTitle, u.serverName, u.fullName);
      VectorAppend(&db->previouslySeenArticles, &newsArticle);
      articleID = VectorLength(&db->previouslySeenArticles) - 1;
      pthread_mutex_unlock(articlesLock);
      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      ScanArticle(&st, articleID, &db->indices, &db->stopWords,
                  &(db->locks.indicesHashSetLock), &(db->locks.stopWordsHashSetLock));
      STDispose(&st);
      break;
    case 301:
    case 302: {
      /* Redirect: copy the new URL out of urlconn before disposing it, release
         everything owned by this frame, then recurse on the new location. */
      int newURLLength = strlen(urlconn.newUrl) + 1;
      char newURLBuffer[newURLLength];
      strcpy(newURLBuffer, urlconn.newUrl);
      URLConnectionDispose(&urlconn);
      unlockConnection(db, u.serverName);
      URLDispose(&u);
      ParseArticle(db, articleTitle, newURLBuffer);
      return;
    }
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, urlconn.responseCode);
      break;
  }

  URLConnectionDispose(&urlconn);
  unlockConnection(db, u.serverName);
  URLDispose(&u);
}
static void ParseArticle(void *userData) { rssFeedData *data = userData; rssFeedItem *item = &data->item; url u; urlconnection urlconn; streamtokenizer st; URLNewAbsolute(&u, item->url); URLConnectionNew(&urlconn, &u); switch (urlconn.responseCode) { case 0: printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", item->url); break; case 200: printf("[%s] Indexing \"%s\"\n", u.serverName, item->title); STNew(&st, urlconn.dataStream, kTextDelimiters, false); ScanArticle(&st, data); STDispose(&st); break; case 301: case 302: case 303: // just pretend we have the redirected URL, though index using the new URL and not the old one... strcpy(item->url,urlconn.newUrl); ParseArticle(data); break; default: printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", item->title, u.fullName, urlconn.responseCode); break; } URLConnectionDispose(&urlconn); URLDispose(&u); }
/**
 * pthread entry point: unpacks the thread's argument bundle, runs ParseArticle
 * on it, and terminates the calling thread.  Never returns.
 */
static void *PthreadParseArticle(void *threadData)
{
  threadArguments *args = threadData;
  ParseArticle(args->db, args->title, args->URL);
  pthread_exit(NULL);
}
/**
 * Downloads one article and scans its words into wordCounts, skipping any
 * word found in stopWords.  Sequential (non-threaded) variant.
 */
static void ParseArticle(const char *articleTitle, const char *articleDescription,
                         const char *articleURL, hashset *stopWords, hashset *wordCounts)
{
  url u;
  urlconnection conn;
  streamtokenizer tokenizer;

  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&conn, &u);

  switch (conn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
      STNew(&tokenizer, conn.dataStream, kTextDelimiters, false);
      ScanArticle(&tokenizer, articleTitle, articleDescription, articleURL, stopWords, wordCounts);
      STDispose(&tokenizer);
      break;
    case 301: /* fallthrough */
    case 302:
      /* Redirected: recurse on the new URL so indexing records the final location. */
      ParseArticle(articleTitle, articleDescription, conn.newUrl, stopWords, wordCounts);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, conn.responseCode);
      break;
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
/**
 * XML end-tag callback: any closing tag ends the currently active field; a
 * closing </item> additionally means a complete article record has been
 * accumulated, so it is parsed immediately.
 */
static void ProcessEndTag(void *userData, const char *name)
{
  rssFeedData *data = userData;
  rssFeedItem *item = &data->item;

  item->activeField = NULL;
  if (strcasecmp(name, "item") == 0) {
    ParseArticle(userData);
  }
}
/**
 * Downloads one article, deduplicating against articlesSeen, and indexes its
 * words into wordHash.
 *
 * NOTE(review): the strdup results are not checked for NULL, and HashSetEnter
 * appears to take a shallow copy of currArt (the strdup'd strings are then
 * owned by the hashset) — confirm against the article/hashset contracts.
 */
static void ParseArticle(const char *articleTitle, const char *articleDescription,
                         const char *articleURL, hashset *stopWords,
                         hashset *wordHash, hashset *articlesSeen)
{
  url u;
  urlconnection conn;
  streamtokenizer tokenizer;

  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&conn, &u);

  /* Candidate record used both for the seen-before lookup and, on success,
     as the entry stored in articlesSeen. */
  article currArt;
  currArt.server = strdup(u.serverName);
  currArt.title = strdup(articleTitle);
  currArt.url = strdup(articleURL);
  currArt.numOccurrences = 0;

  switch (conn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      ArticleFree(&currArt);
      break;
    case 200:
      if (HashSetLookup(articlesSeen, &currArt) == NULL) {
        /* First sighting: record it and index the body text. */
        printf("[%s] Indexing \"%s\"\n", u.serverName, articleTitle);
        HashSetEnter(articlesSeen, &currArt);
        STNew(&tokenizer, conn.dataStream, kTextDelimiters, false);
        ScanArticle(&tokenizer, &currArt, stopWords, wordHash, articlesSeen);
        STDispose(&tokenizer);
      } else {
        printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
        ArticleFree(&currArt);
      }
      break;
    case 301:
    case 302:
      /* Redirected: retry with the new URL; the candidate built for the old
         URL is no longer needed. */
      ParseArticle(articleTitle, articleDescription, conn.newUrl, stopWords, wordHash, articlesSeen);
      ArticleFree(&currArt);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, conn.responseCode);
      ArticleFree(&currArt);
      break;
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
/**
 * Downloads one article and indexes it into allData, skipping URLs already
 * present in allData->explored.
 */
static void ParseArticle(const char *articleURL, const char *articleTitle, rssData *allData)
{
  url u;
  urlconnection conn;
  streamtokenizer tokenizer;
  int articleIndex;

  URLNewAbsolute(&u, articleURL);

  /* If this article was already scanned, release the url and bail out before
     opening a connection. */
  article a = { articleURL, articleTitle, u.serverName };
  if (VectorSearch(&allData->explored, &a, ArticleCmp, 0, false) >= 0) {
    printf("[Pass. article already indexed: \"%s\"]\n", articleTitle);
    URLDispose(&u);
    return;
  }

  URLConnectionNew(&conn, &u);
  switch (conn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
      STNew(&tokenizer, conn.dataStream, kTextDelimiters, false);
      /* Persist deep copies of the strings, then record the article and scan
         its words under the vector index it was appended at. */
      PersistArticle(&a, articleURL, articleTitle, u.serverName);
      VectorAppend(&allData->explored, &a);
      articleIndex = VectorLength(&allData->explored) - 1;
      ScanArticle(&tokenizer, &a, articleIndex, allData);
      STDispose(&tokenizer);
      break;
    case 301:
    case 302:
      /* Redirected: retry with the new URL and index under it. */
      ParseArticle(conn.newUrl, articleTitle, allData);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, conn.responseCode);
      break;
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
/**
 * Reads one RSS <item> element from the tokenizer, extracting its title,
 * description, and link, then parses the linked article unless it has been
 * seen before or has no URL.
 */
static void ProcessSingleNewsItem(streamtokenizer *st, hashset *stopWords,
                                  hashset *prevSeenArticles, hashset *wordCounts)
{
  char htmlTag[1024];
  char articleTitle[1024];
  char articleDescription[1024];
  char articleURL[1024];
  articleTitle[0] = articleDescription[0] = articleURL[0] = '\0';

  /* Walk tags until the closing </item>, capturing the fields we care about.
     (Deliberately independent ifs: each tag is matched by prefix.) */
  while (GetNextTag(st, htmlTag, sizeof(htmlTag)) && strcasecmp(htmlTag, kItemEndTag) != 0) {
    if (strncasecmp(htmlTag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0)
      ExtractElement(st, htmlTag, articleTitle, sizeof(articleTitle));
    if (strncasecmp(htmlTag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0)
      ExtractElement(st, htmlTag, articleDescription, sizeof(articleDescription));
    if (strncasecmp(htmlTag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0)
      ExtractElement(st, htmlTag, articleURL, sizeof(articleURL));
  }

  if (articleURL[0] == '\0') return; // punt, since it's not going to take us anywhere

  if (isNewArticle(prevSeenArticles, articleTitle, articleURL)) {
    ParseArticle(articleTitle, articleDescription, articleURL, stopWords, wordCounts);
  } else {
    printf("Skipping previously seen article: \"%s\"\n\tfrom \"%s\"\n", articleTitle, articleURL);
  }
}
/**
 * Reads one RSS <item> element from the tokenizer, extracting its title,
 * description, and link, then hands the linked article to ParseArticle
 * (which performs its own seen-before deduplication).
 */
static void ProcessSingleNewsItem(streamtokenizer *st, hashset *stopWords,
                                  hashset *wordHash, hashset *articlesSeen)
{
  char htmlTag[1024];
  char articleTitle[1024];
  char articleDescription[1024];
  char articleURL[1024];
  articleTitle[0] = articleDescription[0] = articleURL[0] = '\0';

  /* Capture title/description/link until the closing </item> tag.
     (Deliberately independent ifs: each tag is matched by prefix.) */
  while (GetNextTag(st, htmlTag, sizeof(htmlTag)) && strcasecmp(htmlTag, kItemEndTag) != 0) {
    if (strncasecmp(htmlTag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0)
      ExtractElement(st, htmlTag, articleTitle, sizeof(articleTitle));
    if (strncasecmp(htmlTag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0)
      ExtractElement(st, htmlTag, articleDescription, sizeof(articleDescription));
    if (strncasecmp(htmlTag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0)
      ExtractElement(st, htmlTag, articleURL, sizeof(articleURL));
  }

  if (articleURL[0] == '\0') return; // punt, since it's not going to take us anywhere

  ParseArticle(articleTitle, articleDescription, articleURL, stopWords, wordHash, articlesSeen);
}
/**
 * Reads one RSS <item> element from the tokenizer, extracting its title,
 * description, and link, then parses the linked article.
 */
static void ProcessSingleNewsItem(streamtokenizer *st, rssData *allData)
{
  char htmlTag[1024];
  char articleTitle[1024];
  char articleDescription[1024];
  char articleURL[1024];
  articleTitle[0] = articleDescription[0] = articleURL[0] = '\0';

  /* Capture fields until </item>, but give up after 5 tags.
     NOTE(review): the 5-tag cap looks like a workaround for malformed feeds;
     it can drop the link if the fields arrive late — confirm intent. */
  int tagsRead = 0;
  while (GetNextTag(st, htmlTag, sizeof(htmlTag)) && strcasecmp(htmlTag, kItemEndTag) != 0) {
    if (strncasecmp(htmlTag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0)
      ExtractElement(st, htmlTag, articleTitle, sizeof(articleTitle));
    if (strncasecmp(htmlTag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0)
      ExtractElement(st, htmlTag, articleDescription, sizeof(articleDescription));
    if (strncasecmp(htmlTag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0)
      ExtractElement(st, htmlTag, articleURL, sizeof(articleURL));
    if (++tagsRead == 5) break;
  }

  if (articleURL[0] == '\0') return; // punt, since it's not going to take us anywhere

  ParseArticle(articleURL, articleTitle, allData);
}