/*
 * BuildIndices: fetches the feeds file at feedsFileURL and processes every
 * feed listed in it.  Each line has the form "<label>: <url>": everything up
 * to and including the first colon (plus surrounding whitespace) is skipped,
 * and the remainder of the line is taken to be the feed's URL.  A 3xx
 * response is handled by recursing on the redirect location.
 */
static void BuildIndices(const char *feedsFileURL, rssFeedData *dataPtr)
{
  url u;
  urlconnection conn;

  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&conn, &u);

  if (conn.responseCode / 100 == 3) {
    /* redirection: chase the new location instead */
    BuildIndices(conn.newUrl, dataPtr);
  } else {
    streamtokenizer tokenizer;
    char feedURL[2048];

    STNew(&tokenizer, conn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&tokenizer, ":") != EOF) { /* ignore the label before the first colon */
      STSkipOver(&tokenizer, ": ");               /* then the colon and trailing whitespace */
      STNextToken(&tokenizer, feedURL, sizeof(feedURL));
      ProcessFeed(feedURL, dataPtr);
    }
    printf("\n");
    STDispose(&tokenizer);
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
/*
 * ParseArticle: connects to the given article URL and, provided the article
 * hasn't been indexed before, scans its full text so its words are counted
 * in wordHash.  Redirects (301/302) are chased recursively, indexing under
 * the new URL.  articleDescription is accepted but only forwarded on the
 * redirect path; it is otherwise unused here.
 *
 * Ownership note: the three strdup'd strings inside currArt are presumably
 * taken over by articlesSeen when HashSetEnter succeeds (TODO confirm against
 * the hashset's free function); every path that does NOT enter the article
 * releases them via ArticleFree instead.
 */
static void ParseArticle(const char *articleTitle, const char *articleDescription,
                         const char *articleURL, hashset *stopWords,
                         hashset *wordHash, hashset *articlesSeen)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&urlconn, &u);

  /* build the record up front so it can double as the lookup key below */
  article currArt;
  currArt.server = strdup(u.serverName);
  currArt.title = strdup(articleTitle);
  currArt.url = strdup(articleURL);
  currArt.numOccurrences = 0;

  switch (urlconn.responseCode) {
    case 0:  /* connection could not be established at all */
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      ArticleFree(&currArt);
      break;
    case 200:
      if (HashSetLookup(articlesSeen, &currArt) == NULL) {
        /* first sighting: record it, then scan the article body */
        printf("[%s] Indexing \"%s\"\n", u.serverName, articleTitle);
        HashSetEnter(articlesSeen, &currArt);
        STNew(&st, urlconn.dataStream, kTextDelimiters, false);
        ScanArticle(&st, &currArt, stopWords, wordHash, articlesSeen);
        STDispose(&st);
        break;
      } else {
        /* duplicate: skip it and discard our copy of the record */
        printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
        ArticleFree(&currArt);
        break;
      }
    case 301:
    case 302:
      /* just pretend we had the redirected URL all along, though index using
         the new URL and not the old one */
      ParseArticle(articleTitle, articleDescription, urlconn.newUrl,
                   stopWords, wordHash, articlesSeen);
      ArticleFree(&currArt);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, urlconn.responseCode);
      ArticleFree(&currArt);
      break;
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/*
 * ParseArticle: connects to the given article URL and, provided it hasn't
 * been explored before, scans its full text into the shared index inside
 * allData.  Already-seen articles are skipped before any connection is made.
 * Redirects (301/302) are chased recursively under the new URL.
 */
static void ParseArticle(const char *articleURL, const char *articleTitle, rssData *allData)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  int articleIndex;
  URLNewAbsolute(&u, articleURL);

  /* check to see if we've previously scanned the article. If the article we're processing
   * has already been scanned release the url and return */
  /* NOTE(review): positional init assumes article's first three fields are
   * {url, title, server} in that order — confirm against the struct decl. */
  article a = {articleURL, articleTitle, u.serverName};
  if (VectorSearch(&allData->explored, &a, ArticleCmp, 0, false) >= 0) {
    printf("[Pass. article already indexed: \"%s\"]\n", articleTitle);
    URLDispose(&u);  /* early-out path: only u needs releasing so far */
    return;
  }

  URLConnectionNew(&urlconn, &u);
  switch (urlconn.responseCode) {
    case 0:  /* connection could not be established at all */
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      /* PersistArticle presumably deep-copies the strings into a before the
       * append, since u.serverName is freed by URLDispose below — verify. */
      PersistArticle(&a, articleURL, articleTitle, u.serverName);
      VectorAppend(&allData->explored, &a);
      articleIndex = VectorLength(&allData->explored) - 1;  /* index of the copy just appended */
      ScanArticle(&st, &a, articleIndex, allData);
      STDispose(&st);
      break;
    case 301:
    case 302:
      /* just pretend we have the redirected URL all along, though index using
         the new URL and not the old one... */
      ParseArticle(urlconn.newUrl, articleTitle, allData);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, urlconn.responseCode);
      break;
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/*
 * ProcessFeed: locates the specified RSS document and pulls every news item
 * out of it into db.  Redirects (301/302) are chased recursively; connection
 * failures and other HTTP errors are reported and otherwise ignored.
 *
 * Fixes: the "Ignoring..." diagnostic previously lacked a trailing newline
 * (unlike every other message in this file), so later output ran onto the
 * same line; also removed a stray ';' after the switch block.
 */
static void ProcessFeed(rssDatabase *db, const char *remoteDocumentName)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, remoteDocumentName);
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:  /* connection could not be established at all */
      printf("Unable to connect to \"%s\". Ignoring...\n", u.serverName);
      break;
    case 200:
      PullAllNewsItems(db, &urlconn);
      break;
    case 301:
    case 302:
      /* redirect: pretend we had the new URL all along */
      ProcessFeed(db, urlconn.newUrl);
      break;
    default:
      printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n",
             u.serverName, u.fileName, urlconn.responseCode, urlconn.responseMessage);
      break;
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/*
 * ProcessFeed: locates the specified RSS document and pulls every news item
 * out of it, updating the supplied hashsets.  Redirects (301/302) are chased
 * recursively; connection failures and other HTTP errors are reported and
 * otherwise ignored.
 *
 * Fixes: the "Ignoring..." diagnostic previously lacked a trailing newline
 * (unlike every other message in this file), so later output ran onto the
 * same line; also removed a stray ';' after the switch block.
 */
static void ProcessFeed(const char *remoteDocumentName, hashset *stopWords,
                        hashset *prevSeenArticles, hashset *wordCounts)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, remoteDocumentName);
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:  /* connection could not be established at all */
      printf("Unable to connect to \"%s\". Ignoring...\n", u.serverName);
      break;
    case 200:
      PullAllNewsItems(&urlconn, stopWords, prevSeenArticles, wordCounts);
      break;
    case 301:
    case 302:
      /* redirect: pretend we had the new URL all along */
      ProcessFeed(urlconn.newUrl, stopWords, prevSeenArticles, wordCounts);
      break;
    default:
      printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n",
             u.serverName, u.fileName, urlconn.responseCode, urlconn.responseMessage);
      break;
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/*
 * LoadStopWords: fetches the stop-word list at stopWordsURL and enters a
 * heap-allocated copy of every word into dataPtr->stopWords.  A 3xx response
 * is handled by recursing on the redirect location.
 */
static void LoadStopWords(const char *stopWordsURL, rssFeedData *dataPtr)
{
  url u;
  urlconnection conn;

  URLNewAbsolute(&u, stopWordsURL);
  URLConnectionNew(&conn, &u);

  if (conn.responseCode / 100 == 3) {
    LoadStopWords(conn.newUrl, dataPtr);  /* follow the redirect */
  } else {
    streamtokenizer tokenizer;
    char word[4096];

    STNew(&tokenizer, conn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&tokenizer, word, sizeof(word))) {
      char *copy = strdup(word);  /* the set stores char* entries it now owns */
      HashSetEnter(&(dataPtr->stopWords), &copy);
    }
    printf("\n");
    STDispose(&tokenizer);
  }

  URLConnectionDispose(&conn);
  URLDispose(&u);
}
static void Welcome(const char *welcomeTextURL) { url u; urlconnection urlconn; URLNewAbsolute(&u, welcomeTextURL); URLConnectionNew(&urlconn, &u); if (urlconn.responseCode / 100 == 3) { Welcome(urlconn.newUrl); } else { streamtokenizer st; char buffer[4096]; STNew(&st, urlconn.dataStream, kNewLineDelimiters, true); while (STNextToken(&st, buffer, sizeof(buffer))) { printf("%s\n", buffer); } printf("\n"); STDispose(&st); // remember that STDispose doesn't close the file, since STNew doesn't open one.. } URLConnectionDispose(&urlconn); URLDispose(&u); }