/*
 * ParseArticle -- downloads one news article and indexes its words (thread-safe).
 *
 * If the article (title/server/url triple) has already been indexed, the fetch is
 * skipped.  On a 200 the article is recorded in db->previouslySeenArticles and its
 * text is scanned into db->indices; on a 301/302 the redirect target is followed
 * recursively.  lockConnection/unlockConnection serialize connections per server.
 *
 * Fixes vs. previous revision:
 *  - the default-case printf format string was split across a raw newline
 *    (invalid C); rejoined into the one-line message used elsewhere in the file.
 *  - the duplicate check dropped articlesLock before the case-200 insert, so two
 *    threads could both miss the article and double-index it; the search is now
 *    repeated under the lock immediately before appending.
 */
static void ParseArticle(rssDatabase *db, const char *articleTitle, const char *articleURL)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  int articleID;

  URLNewAbsolute(&u, articleURL);
  rssNewsArticle newsArticle = { articleTitle, u.serverName, u.fullName };
  pthread_mutex_t *articlesLock = &(db->locks.articlesVectorLock);

  /* Cheap early-out: skip the network fetch entirely if we've seen this article. */
  pthread_mutex_lock(articlesLock);
  if (VectorSearch(&db->previouslySeenArticles, &newsArticle, NewsArticleCompare, 0, false) >= 0) {
    pthread_mutex_unlock(articlesLock);
    printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
    URLDispose(&u);
    return;
  }
  pthread_mutex_unlock(articlesLock);

  lockConnection(db, u.serverName);
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      pthread_mutex_lock(articlesLock);
      /* Re-check under the lock: another thread may have indexed this article
         between the early-out search above and now. */
      if (VectorSearch(&db->previouslySeenArticles, &newsArticle, NewsArticleCompare, 0, false) >= 0) {
        pthread_mutex_unlock(articlesLock);
        printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
        break;
      }
      printf("[%s] Indexing \"%s\"\n", u.serverName, articleTitle);
      /* Clone makes heap copies of the strings; the vector owns them from here on. */
      NewsArticleClone(&newsArticle, articleTitle, u.serverName, u.fullName);
      VectorAppend(&db->previouslySeenArticles, &newsArticle);
      articleID = VectorLength(&db->previouslySeenArticles) - 1;
      pthread_mutex_unlock(articlesLock);
      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      ScanArticle(&st, articleID, &db->indices, &db->stopWords,
                  &(db->locks.indicesHashSetLock), &(db->locks.stopWordsHashSetLock));
      STDispose(&st);
      break;
    case 301:
    case 302: {
      /* Redirect: copy the new URL before tearing down the connection that owns it,
         release everything held for the old URL, then retry with the new one. */
      int newURLLength = strlen(urlconn.newUrl) + 1;
      char newURLBuffer[newURLLength];
      strcpy(newURLBuffer, urlconn.newUrl);
      URLConnectionDispose(&urlconn);
      unlockConnection(db, u.serverName);
      URLDispose(&u);
      ParseArticle(db, articleTitle, newURLBuffer);
      return;
    }
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, urlconn.responseCode);
      break;
  }

  URLConnectionDispose(&urlconn);
  unlockConnection(db, u.serverName);
  URLDispose(&u);
}
/*
 * LoadStopWords -- fetches the stop-word list at stopWordsURL, creates the
 * stopWords hashset, and enters one heap-allocated copy of each line.
 * 3xx responses are followed by recursing on the redirect URL; set creation
 * happens only once, on the non-redirect leg.
 */
static void LoadStopWords(hashset *stopWords, const char *stopWordsURL)
{
  url u;
  urlconnection conn;

  URLNewAbsolute(&u, stopWordsURL);
  URLConnectionNew(&conn, &u);

  if (conn.responseCode / 100 == 3) {
    LoadStopWords(stopWords, conn.newUrl);   /* chase the redirect */
  } else {
    streamtokenizer tokenizer;
    char word[4096];

    HashSetNew(stopWords, sizeof(char *), kNumStopWordsBuckets, StringHash, StringCompare, StringFree);
    STNew(&tokenizer, conn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&tokenizer, word, sizeof(word))) {
      char *copy = strdup(word);             /* hashset owns the copy (StringFree) */
      HashSetEnter(stopWords, &copy);
    }
    STDispose(&tokenizer);
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
/*
 * BuildIndices -- pulls the feeds file at feedsFileURL and processes every feed
 * listed in it.  Each line is expected to be "label: url"; everything through the
 * first colon is skipped and the remainder of the line is handed to ProcessFeed.
 * 3xx responses recurse on the redirect URL; note the indices hashset and the
 * previously-seen-articles vector are created only on the non-redirect leg, so
 * they are initialized exactly once.
 */
static void BuildIndices(rssDatabase *db, const char *feedsFileURL)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) {
    BuildIndices(db, urlconn.newUrl);
  } else {
    streamtokenizer st;
    char remoteFileName[2048];
    HashSetNew(&db->indices, sizeof(rssIndexEntry), kNumIndexEntryBuckets, IndexEntryHash, IndexEntryCompare, IndexEntryFree);
    VectorNew(&db->previouslySeenArticles, sizeof(rssNewsArticle), NewsArticleFree, 0);
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
      STNextToken(&st, remoteFileName, sizeof(remoteFileName));
      ProcessFeed(db, remoteFileName);
    }
    printf("\n");
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/*
 * ParseArticle -- thread-pool style entry point: userData is an rssFeedData whose
 * embedded item carries the article's title and URL.  Connects to the article,
 * and on a 200 scans its words into the shared data structures via ScanArticle.
 * 301/302/303 responses overwrite item->url with the redirect target and retry.
 */
static void ParseArticle(void *userData)
{
  rssFeedData *data = userData;
  rssFeedItem *item = &data->item;
  url u;
  urlconnection urlconn;
  streamtokenizer st;

  URLNewAbsolute(&u, item->url);
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", item->url);
      break;
    case 200:
      printf("[%s] Indexing \"%s\"\n", u.serverName, item->title);
      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      ScanArticle(&st, data);
      STDispose(&st);
      break;
    case 301:
    case 302:
    case 303:
      // just pretend we have the redirected URL, though index using the new URL and not the old one...
      // NOTE(review): unbounded strcpy — assumes item->url is large enough to hold
      // urlconn.newUrl; potential buffer overflow if the redirect URL is longer.
      // Verify item->url's declared capacity and bound this copy (e.g. snprintf).
      strcpy(item->url, urlconn.newUrl);
      ParseArticle(data);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", item->title, u.fullName, urlconn.responseCode);
      break;
  }

  /* Disposes the original connection/url even after a redirect retry returns. */
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/*
 * LoadStopWords -- fetches the stop-word list at StopWordsTextURL and enters a
 * heap-allocated copy of each line into stopWords.  The caller is expected to
 * have created the hashset (presumably with a free function that releases the
 * strdup'd strings — TODO confirm against the caller).
 *
 * Fixes vs. previous revision:
 *  - the redirect branch called Welcome(urlconn.newUrl) — a copy-paste from the
 *    Welcome routine — so stop words were printed, never loaded, on a 3xx;
 *    it now recurses into LoadStopWords.
 *  - "HashSetEnter(stopWords,©)" contained the HTML entity "&copy;" where
 *    the C expression &copy belongs; restored the address-of expression.
 */
static void LoadStopWords(const char *StopWordsTextURL, hashset *stopWords)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, StopWordsTextURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) {
    LoadStopWords(urlconn.newUrl, stopWords);   /* follow the redirect */
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char *copy = strdup(buffer);              /* ownership passes to the hashset */
      HashSetEnter(stopWords, &copy);
    }
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/*
 * BuildIndices -- reads the feeds file at feedsFileURL and processes every feed
 * it names.  Each line looks like "label: url"; the text through the first colon
 * is discarded and the rest of the line is passed to ProcessFeed.  A 3xx response
 * simply recurses on the redirect URL.
 */
static void BuildIndices(const char *feedsFileURL, rssFeedData *data)
{
  url feedsFile;
  urlconnection conn;

  URLNewAbsolute(&feedsFile, feedsFileURL);
  URLConnectionNew(&conn, &feedsFile);

  if (conn.responseCode / 100 == 3) {
    BuildIndices(conn.newUrl, data);   /* redirection, so recurse */
  } else {
    streamtokenizer lines;
    char feedURL[2048];

    STNew(&lines, conn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&lines, ":") != EOF) {   /* discard everything up to the first colon */
      STSkipOver(&lines, ": ");                 /* then the colon and trailing whitespace */
      STNextToken(&lines, feedURL, sizeof(feedURL));
      ProcessFeed(feedURL, data);
    }
    printf("\n");
    STDispose(&lines);
  }

  URLConnectionDispose(&conn);
  URLDispose(&feedsFile);
}
static void Welcome(const char *welcomeTextURL) { url u; urlconnection urlconn; URLNewAbsolute(&u, welcomeTextURL); URLConnectionNew(&urlconn, &u); if (urlconn.responseCode / 100 == 3) { Welcome(urlconn.newUrl); } else { streamtokenizer st; char buffer[4096]; STNew(&st, urlconn.dataStream, kNewLineDelimiters, true); while (STNextToken(&st, buffer, sizeof(buffer))) { printf("%s\n", buffer); } printf("\n"); fflush(stdout); STDispose(&st); // remember that STDispose doesn't close the file, since STNew doesn't open one.. } URLConnectionDispose(&urlconn); URLDispose(&u); }
/*
 * ParseArticle -- connects to articleURL and, on success, scans the article's
 * words (filtered against stopWords) into wordCounts via ScanArticle.
 * Redirects (301/302) retry with the new URL; connection failures and other
 * response codes are reported and skipped.
 */
static void ParseArticle(const char *articleTitle, const char *articleDescription,
                         const char *articleURL, hashset *stopWords, hashset *wordCounts)
{
  url u;
  urlconnection conn;
  streamtokenizer tokenizer;

  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&conn, &u);

  switch (conn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
      STNew(&tokenizer, conn.dataStream, kTextDelimiters, false);
      ScanArticle(&tokenizer, articleTitle, articleDescription, articleURL, stopWords, wordCounts);
      STDispose(&tokenizer);
      break;
    case 301:
    case 302:
      /* Pretend we had the redirected URL all along; index under the new URL. */
      ParseArticle(articleTitle, articleDescription, conn.newUrl, stopWords, wordCounts);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, conn.responseCode);
      break;
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
/*
 * ProcessFeed -- connects to one RSS feed document and, on success, hands the
 * open connection to PullAllNewsItems so every article it lists gets indexed.
 * Redirects (301/302) retry with the new URL; failures are reported and skipped.
 */
static void ProcessFeed(const char *remoteDocumentName, hashset *stopWords,
                        hashset *wordHash, hashset *articlesSeen)
{
  url feedURL;
  urlconnection conn;

  URLNewAbsolute(&feedURL, remoteDocumentName);
  URLConnectionNew(&conn, &feedURL);

  switch (conn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Ignoring...", feedURL.serverName);
      break;
    case 200:
      PullAllNewsItems(&conn, stopWords, wordHash, articlesSeen);
      break;
    case 301:
    case 302:
      ProcessFeed(conn.newUrl, stopWords, wordHash, articlesSeen);
      break;
    default:
      printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n",
             feedURL.serverName, feedURL.fileName, conn.responseCode, conn.responseMessage);
      break;
  }

  URLConnectionDispose(&conn);
  URLDispose(&feedURL);
}
/*
 * ParseArticle -- connects to articleURL, and if the article hasn't been indexed
 * yet, records it in articlesSeen and scans its words into wordHash.
 *
 * Ownership: currArt's server/title/url strings are strdup'd up front.  On the
 * not-seen 200 path the struct (and its strings) is copied into articlesSeen,
 * which takes ownership; on every other path ArticleFree releases them to avoid
 * a leak.  (The redirect path allocates and immediately frees — wasteful but
 * correct.)
 */
static void ParseArticle(const char *articleTitle, const char *articleDescription,
                         const char *articleURL, hashset *stopWords,
                         hashset *wordHash, hashset *articlesSeen)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;

  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&urlconn, &u);

  /* Candidate record used both for the duplicate lookup and, if new, as the
     element entered into articlesSeen. */
  article currArt;
  currArt.server = strdup(u.serverName);
  currArt.title = strdup(articleTitle);
  currArt.url = strdup(articleURL);
  currArt.numOccurrences = 0;

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      ArticleFree(&currArt);
      break;
    case 200:
      if (HashSetLookup(articlesSeen, &currArt) == NULL) { // if we haven't seen this article before
        printf("[%s] Indexing \"%s\"\n", u.serverName, articleTitle);
        HashSetEnter(articlesSeen, &currArt);  // set now owns the strdup'd strings
        STNew(&st, urlconn.dataStream, kTextDelimiters, false);
        ScanArticle(&st, &currArt, stopWords, wordHash, articlesSeen);
        STDispose(&st);
        break;
      } else { // if we have seen it before
        printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
        ArticleFree(&currArt);
        break;
      }
    case 301:
    case 302:
      // just pretend we have the redirected URL all along, though index using the new URL and not the old one...
      ParseArticle(articleTitle, articleDescription, urlconn.newUrl, stopWords, wordHash, articlesSeen);
      ArticleFree(&currArt);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
      ArticleFree(&currArt);
      break;
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/*
 * ParseArticle -- fetches one article and scans its words into allData's index.
 * The duplicate check runs before any connection is opened, so already-indexed
 * articles cost no network round trip.  Redirects (301/302) retry with the new
 * URL; the original connection is still disposed after the retry returns.
 */
static void ParseArticle(const char *articleURL, const char *articleTitle, rssData *allData)
{
  url u;
  urlconnection conn;
  streamtokenizer tokenizer;
  int newIndex;

  URLNewAbsolute(&u, articleURL);

  /* Bail out early — before connecting — if this article was already scanned. */
  article a = {articleURL, articleTitle, u.serverName};
  if (VectorSearch(&allData->explored, &a, ArticleCmp, 0, false) >= 0) {
    printf("[Pass. article already indexed: \"%s\"]\n", articleTitle);
    URLDispose(&u);
    return;
  }

  URLConnectionNew(&conn, &u);

  switch (conn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
      STNew(&tokenizer, conn.dataStream, kTextDelimiters, false);
      /* Persist a durable copy, append it, and scan under its vector index. */
      PersistArticle(&a, articleURL, articleTitle, u.serverName);
      VectorAppend(&allData->explored, &a);
      newIndex = VectorLength(&allData->explored) - 1;
      ScanArticle(&tokenizer, &a, newIndex, allData);
      STDispose(&tokenizer);
      break;
    case 301:
    case 302:
      /* Pretend we had the redirected URL all along; index under the new URL. */
      ParseArticle(conn.newUrl, articleTitle, allData);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, conn.responseCode);
      break;
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
/*
 * LoadStopWords -- fetches the stop-word list at stopWordsURL and enters a
 * heap-allocated copy of each line into dataPtr->stopWords.  3xx responses
 * recurse on the redirect URL.
 */
static void LoadStopWords(const char *stopWordsURL, rssFeedData *dataPtr)
{
  url u;
  urlconnection conn;

  URLNewAbsolute(&u, stopWordsURL);
  URLConnectionNew(&conn, &u);

  if (conn.responseCode / 100 == 3) {
    LoadStopWords(conn.newUrl, dataPtr);   /* follow the redirect */
  } else {
    streamtokenizer lines;
    char word[4096];

    STNew(&lines, conn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&lines, word, sizeof(word))) {
      char *copy = strdup(word);           /* ownership passes to the hashset */
      HashSetEnter(&(dataPtr->stopWords), &copy);
    }
    printf("\n");
    STDispose(&lines);
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}