static void BuildIndices(const char *feedsFileURL, rssFeedData *data)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    BuildIndices(urlconn.newUrl, data);
  } else {
    streamtokenizer st;
    char remoteDocumentURL[2048];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
      STNextToken(&st, remoteDocumentURL, sizeof(remoteDocumentURL));
      ProcessFeed(remoteDocumentURL, data);
    }
    printf("\n");
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void ParseArticle(rssDatabase *db, const char *articleTitle, const char *articleURL)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  int articleID;

  URLNewAbsolute(&u, articleURL);
  rssNewsArticle newsArticle = { articleTitle, u.serverName, u.fullName };

  pthread_mutex_t *articlesLock = &db->locks.articlesVectorLock;
  pthread_mutex_lock(articlesLock);
  if (VectorSearch(&db->previouslySeenArticles, &newsArticle, NewsArticleCompare, 0, false) >= 0) {
    pthread_mutex_unlock(articlesLock);
    printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
    URLDispose(&u);
    return;
  }
  pthread_mutex_unlock(articlesLock);

  lockConnection(db, u.serverName); // limit concurrent connections to any one server
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      pthread_mutex_lock(articlesLock);
      printf("[%s] Indexing \"%s\"\n", u.serverName, articleTitle);
      NewsArticleClone(&newsArticle, articleTitle, u.serverName, u.fullName);
      VectorAppend(&db->previouslySeenArticles, &newsArticle);
      articleID = VectorLength(&db->previouslySeenArticles) - 1;
      pthread_mutex_unlock(articlesLock);
      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      ScanArticle(&st, articleID, &db->indices, &db->stopWords,
                  &db->locks.indicesHashSetLock, &db->locks.stopWordsHashSetLock);
      STDispose(&st);
      break;
    case 301:
    case 302: { // redirect: copy the new URL out before tearing the connection down, then index under the new URL, not the old one
      int newURLLength = strlen(urlconn.newUrl) + 1;
      char newURLBuffer[newURLLength];
      strcpy(newURLBuffer, urlconn.newUrl);
      URLConnectionDispose(&urlconn);
      unlockConnection(db, u.serverName);
      URLDispose(&u);
      ParseArticle(db, articleTitle, newURLBuffer);
      return;
    }
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, urlconn.responseCode);
      break;
  }

  URLConnectionDispose(&urlconn);
  unlockConnection(db, u.serverName);
  URLDispose(&u);
}
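/*
 * ParseArticle above leans on two helpers, lockConnection and unlockConnection,
 * whose definitions don't appear in this section. The sketch below is one
 * plausible way to serialize connections to any single server: a master lock
 * guards a hashset mapping server names to heap-allocated mutexes. The
 * serverLock type and the locks.serverLocksLock / locks.serverLocks fields are
 * assumptions, not confirmed parts of rssDatabase; the set itself would be
 * created elsewhere with hash/compare callbacks keyed on serverName.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
  char *serverName;        // owned copy of the server's name (the hash key)
  pthread_mutex_t *mutex;  // heap-allocated so its address survives rehashing
} serverLock;

static void lockConnection(rssDatabase *db, const char *serverName)
{
  serverLock probe = { (char *)serverName, NULL };
  pthread_mutex_lock(&db->locks.serverLocksLock);            // assumed master lock
  serverLock *found = HashSetLookup(&db->locks.serverLocks, &probe);
  if (found == NULL) {                                       // first contact with this server
    probe.serverName = strdup(serverName);
    probe.mutex = malloc(sizeof(pthread_mutex_t));
    pthread_mutex_init(probe.mutex, NULL);
    HashSetEnter(&db->locks.serverLocks, &probe);
    found = HashSetLookup(&db->locks.serverLocks, &probe);
  }
  pthread_mutex_t *serverMutex = found->mutex;               // grab before releasing the master lock
  pthread_mutex_unlock(&db->locks.serverLocksLock);
  pthread_mutex_lock(serverMutex);
}

static void unlockConnection(rssDatabase *db, const char *serverName)
{
  serverLock probe = { (char *)serverName, NULL };
  pthread_mutex_lock(&db->locks.serverLocksLock);
  serverLock *found = HashSetLookup(&db->locks.serverLocks, &probe);
  pthread_mutex_unlock(&db->locks.serverLocksLock);
  if (found != NULL) pthread_mutex_unlock(found->mutex);
}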
static void LoadStopWords(const char *stopWordsTextURL, hashset *stopWords)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, stopWordsTextURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    LoadStopWords(urlconn.newUrl, stopWords);
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char *copy = strdup(buffer);
      HashSetEnter(stopWords, &copy);
    }
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void LoadStopWords(hashset *stopWords, const char *stopWordsURL)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, stopWordsURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    LoadStopWords(stopWords, urlconn.newUrl);
  } else {
    streamtokenizer st;
    char buffer[4096];
    HashSetNew(stopWords, sizeof(char *), kNumStopWordsBuckets,
               StringHash, StringCompare, StringFree);
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char *stopWord = strdup(buffer);
      HashSetEnter(stopWords, &stopWord);
    }
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
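/*
 * The stop-word sets above are created with StringHash, StringCompare, and
 * StringFree, which aren't shown in this section. A minimal sketch of the
 * three callbacks follows, assuming case-insensitive matching; the multiplier
 * constant is one conventional choice, not necessarily the one used here.
 * Each element is a char*, so the callbacks receive a char** and dereference it.
 */
#include <ctype.h>
#include <stdlib.h>
#include <strings.h>

static const signed long kHashMultiplier = -1664117991L;

static int StringHash(const void *elemAddr, int numBuckets)
{
  const char *s = *(const char **)elemAddr;
  unsigned long hashcode = 0;
  for (int i = 0; s[i] != '\0'; i++)
    hashcode = hashcode * kHashMultiplier + tolower((unsigned char)s[i]);
  return hashcode % numBuckets;
}

static int StringCompare(const void *one, const void *two)
{
  return strcasecmp(*(const char **)one, *(const char **)two);
}

static void StringFree(void *elemAddr)
{
  free(*(char **)elemAddr); // release the strdup'ed copy the set owns
}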
static void BuildIndices(rssDatabase *db, const char *feedsFileURL)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    BuildIndices(db, urlconn.newUrl);
  } else {
    streamtokenizer st;
    char remoteFileName[2048];
    HashSetNew(&db->indices, sizeof(rssIndexEntry), kNumIndexEntryBuckets,
               IndexEntryHash, IndexEntryCompare, IndexEntryFree);
    VectorNew(&db->previouslySeenArticles, sizeof(rssNewsArticle), NewsArticleFree, 0);
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
      STNextToken(&st, remoteFileName, sizeof(remoteFileName));
      ProcessFeed(db, remoteFileName);
    }
    printf("\n");
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
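/*
 * BuildIndices hands each feed URL to ProcessFeed, which isn't shown in this
 * section. Judging from the other functions here, it presumably opens the feed
 * URL, chases redirects, and hands a live connection to PullAllNewsItems; the
 * sketch below follows that pattern and is an assumption, not the confirmed
 * implementation.
 */
static void ProcessFeed(rssDatabase *db, const char *feedURL)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, feedURL);
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", feedURL);
      break;
    case 200:
      PullAllNewsItems(db, &urlconn);
      break;
    case 301:
    case 302:
      ProcessFeed(db, urlconn.newUrl); // chase the redirect
      break;
    default:
      printf("Feed \"%s\" unavailable. [Response code: %d] Punting...\n",
             feedURL, urlconn.responseCode);
      break;
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}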
static void Welcome(const char *welcomeTextURL)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, welcomeTextURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    Welcome(urlconn.newUrl);
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      printf("%s\n", buffer);
    }
    printf("\n");
    fflush(stdout);
    STDispose(&st); // remember that STDispose doesn't close the stream, since STNew didn't open one
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void ParseArticle(const char *articleTitle, const char *articleDescription,
                         const char *articleURL, hashset *stopWords, hashset *wordCounts)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;

  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      ScanArticle(&st, articleTitle, articleDescription, articleURL, stopWords, wordCounts);
      STDispose(&st);
      break;
    case 301:
    case 302: // just pretend we had the redirected URL all along, though index using the new URL and not the old one
      ParseArticle(articleTitle, articleDescription, urlconn.newUrl, stopWords, wordCounts);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, urlconn.responseCode);
      break;
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void ParseArticle(void *userData)
{
  rssFeedData *data = userData;
  rssFeedItem *item = &data->item;
  url u;
  urlconnection urlconn;
  streamtokenizer st;

  URLNewAbsolute(&u, item->url);
  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", item->url);
      break;
    case 200:
      printf("[%s] Indexing \"%s\"\n", u.serverName, item->title);
      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      ScanArticle(&st, data);
      STDispose(&st);
      break;
    case 301:
    case 302:
    case 303: // just pretend we had the redirected URL all along, though index using the new URL and not the old one
      strcpy(item->url, urlconn.newUrl);
      ParseArticle(data);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             item->title, u.fullName, urlconn.responseCode);
      break;
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void PullAllNewsItems(urlconnection *urlconn, rssData *allData)
{
  streamtokenizer st;
  STNew(&st, urlconn->dataStream, kTextDelimiters, false);
  while (GetNextItemTag(&st)) { // if true is returned, assume <item ...> has just been read and pulled from the data stream
    ProcessSingleNewsItem(&st, allData);
  }
  STDispose(&st);
}
static void PullAllNewsItems(urlconnection *urlconn, hashset *stopWords,
                             hashset *prevSeenArticles, hashset *wordCounts)
{
  streamtokenizer st;
  STNew(&st, urlconn->dataStream, kTextDelimiters, false);
  while (GetNextItemTag(&st)) { // if true is returned, assume <item ...> has just been read and pulled from the data stream
    ProcessSingleNewsItem(&st, stopWords, prevSeenArticles, wordCounts);
  }
  STDispose(&st);
}
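/*
 * Both versions of PullAllNewsItems above rely on GetNextItemTag, which isn't
 * shown here. A minimal sketch follows, under the assumption that the
 * kTextDelimiters set handed to STNew includes the whitespace and '>'
 * characters needed to end the tag-name token: scan forward to each '<',
 * read the tag name, and stop once it reads "item".
 */
#include <stdbool.h>
#include <strings.h>

static bool GetNextItemTag(streamtokenizer *st)
{
  char tagName[64];
  while (STSkipUntil(st, "<") != EOF) { // advance to the next tag
    STSkipOver(st, "<");
    if (STNextToken(st, tagName, sizeof(tagName)) &&
        strcasecmp(tagName, "item") == 0)
      return true;                      // caller now sits just past "<item"
  }
  return false;                         // stream exhausted; no more items
}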
static void ReadThesaurus(hashset *thesaurus, const char *filename)
{
  FILE *infile = fopen(filename, "r");
  if (infile == NULL) {
    fprintf(stderr, "Could not open thesaurus file named \"%s\"\n", filename);
    exit(1);
  }

  streamtokenizer st;
  STNew(&st, infile, ",\n", false);
  TokenizeAndBuildThesaurus(thesaurus, &st);
  STDispose(&st);
  fclose(infile);
}
static void ReadStopwords(hashset *stopwords, const char *filename)
{
  FILE *infile = fopen(filename, "r");
  if (infile == NULL) {
    fprintf(stderr, "Could not open stopwords file named \"%s\"\n", filename);
    exit(1);
  }

  streamtokenizer tokenMaker;
  STNew(&tokenMaker, infile, "\n", true);
  TokenizeAndBuildStopwords(stopwords, &tokenMaker);
  STDispose(&tokenMaker);
  fclose(infile);
}
static void ParseArticle(const char *articleTitle, const char *articleDescription,
                         const char *articleURL, hashset *stopWords,
                         hashset *wordHash, hashset *articlesSeen)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;

  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&urlconn, &u);

  article currArt;
  currArt.server = strdup(u.serverName);
  currArt.title = strdup(articleTitle);
  currArt.url = strdup(articleURL);
  currArt.numOccurrences = 0;

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      ArticleFree(&currArt);
      break;
    case 200:
      if (HashSetLookup(articlesSeen, &currArt) == NULL) { // we haven't seen this article before
        printf("[%s] Indexing \"%s\"\n", u.serverName, articleTitle);
        HashSetEnter(articlesSeen, &currArt);
        STNew(&st, urlconn.dataStream, kTextDelimiters, false);
        ScanArticle(&st, &currArt, stopWords, wordHash, articlesSeen);
        STDispose(&st);
      } else { // we have seen it before
        printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
        ArticleFree(&currArt);
      }
      break;
    case 301:
    case 302: // just pretend we had the redirected URL all along, though index using the new URL and not the old one
      ParseArticle(articleTitle, articleDescription, urlconn.newUrl, stopWords, wordHash, articlesSeen);
      ArticleFree(&currArt);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, urlconn.responseCode);
      ArticleFree(&currArt);
      break;
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
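/*
 * The article type and ArticleFree used above aren't defined in this section.
 * The layout below is inferred from the field accesses in ParseArticle; only
 * those four fields are visible here, so treat the rest as a sketch.
 */
#include <stdlib.h>

typedef struct {
  char *server;       // strdup'ed in ParseArticle, released here
  char *title;
  char *url;
  int numOccurrences; // how often the indexed word appears in this article
} article;

static void ArticleFree(void *elemAddr) // matches the hashset free-function signature
{
  article *a = elemAddr;
  free(a->server);
  free(a->title);
  free(a->url);
}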
static void ParseArticle(const char *articleURL, const char *articleTitle, rssData *allData)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  int articleIndex;

  URLNewAbsolute(&u, articleURL);

  /* Check whether we've previously scanned this article; if so,
   * release the url and return without reindexing it. */
  article a = {articleURL, articleTitle, u.serverName};
  if (VectorSearch(&allData->explored, &a, ArticleCmp, 0, false) >= 0) {
    printf("[Pass. Article already indexed: \"%s\"]\n", articleTitle);
    URLDispose(&u);
    return;
  }

  URLConnectionNew(&urlconn, &u);

  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      PersistArticle(&a, articleURL, articleTitle, u.serverName);
      VectorAppend(&allData->explored, &a);
      articleIndex = VectorLength(&allData->explored) - 1;
      ScanArticle(&st, &a, articleIndex, allData);
      STDispose(&st);
      break;
    case 301:
    case 302: // just pretend we had the redirected URL all along, though index using the new URL and not the old one
      ParseArticle(urlconn.newUrl, articleTitle, allData);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
             articleTitle, u.serverName, urlconn.responseCode);
      break;
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void loadStopWords(hashset *s)
{
  HashSetNew(s, sizeof(char *), 1009, StringHash, StringCmp, StringFree);

  FILE *infile = fopen(kStopListFile, "r");
  assert(infile != NULL);

  streamtokenizer st;
  char buffer[1024];
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    char *newWord = strdup(buffer);
    HashSetEnter(s, &newWord);
  }
  STDispose(&st);
  fclose(infile);
}
static void Welcome(const char *welcomeTextFileName)
{
  FILE *infile = fopen(welcomeTextFileName, "r");
  assert(infile != NULL);

  streamtokenizer st;
  char buffer[1024];
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    printf("%s\n", buffer);
  }
  printf("\n");
  STDispose(&st); // remember that STDispose doesn't close the file, since STNew didn't open one
  fclose(infile);
}
static void PullAllNewsItems(urlconnection *urlconn, rssFeedData *dataPtr)
{
  streamtokenizer st;
  char buffer[2048];

  XML_Parser rssFeedParser = XML_ParserCreate(NULL);
  XML_SetUserData(rssFeedParser, dataPtr);
  XML_SetElementHandler(rssFeedParser, ProcessStartTag, ProcessEndTag);
  XML_SetCharacterDataHandler(rssFeedParser, ProcessTextData);

  STNew(&st, urlconn->dataStream, "\n", false);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    XML_Parse(rssFeedParser, buffer, strlen(buffer), false);
  }
  STDispose(&st);

  XML_Parse(rssFeedParser, "", 0, true); // instructs the XML parser that we're done parsing
  XML_ParserFree(rssFeedParser);
}
static void BuildIndices(const char *feedsFileName, rssData *allData)
{
  FILE *infile = fopen(feedsFileName, "r");
  assert(infile != NULL);

  streamtokenizer st;
  char remoteFileName[1024];
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
    STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
    STNextToken(&st, remoteFileName, sizeof(remoteFileName));
    ProcessFeed(remoteFileName, allData);
  }
  STDispose(&st);
  fclose(infile);
  printf("\n");
}
static void PullAllNewsItems(rssDatabase *db, urlconnection *urlconn)
{
  rssFeedState state = {db}; // passed through the parser by address as auxiliary data
  streamtokenizer st;
  char buffer[2048];

  XML_Parser rssFeedParser = XML_ParserCreate(NULL);
  XML_SetUserData(rssFeedParser, &state);
  XML_SetElementHandler(rssFeedParser, ProcessStartTag, ProcessEndTag);
  XML_SetCharacterDataHandler(rssFeedParser, ProcessTextData);

  STNew(&st, urlconn->dataStream, "\n", false);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    XML_Parse(rssFeedParser, buffer, strlen(buffer), false);
  }
  STDispose(&st);

  XML_Parse(rssFeedParser, "", 0, true); // instructs the XML parser that we're done parsing
  XML_ParserFree(rssFeedParser);
}
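/*
 * Both expat-based versions of PullAllNewsItems above register ProcessStartTag,
 * ProcessEndTag, and ProcessTextData, whose bodies aren't shown in this
 * section. expat fixes their signatures, so the skeleton below has the right
 * shape; the bookkeeping hinted at in the comments (and whatever rssFeedState
 * fields it would touch) is assumed, not taken from this section.
 */
#include <expat.h>

static void ProcessStartTag(void *userData, const char *name, const char **atts)
{
  // e.g. on <item>, reset the in-progress article; on <title> or <link>
  // inside an item, note which buffer the upcoming character data feeds
}

static void ProcessEndTag(void *userData, const char *name)
{
  // e.g. on </item>, hand the completed title/URL pair off for indexing
}

static void ProcessTextData(void *userData, const char *text, int len)
{
  // expat delivers text in unterminated chunks of len bytes, possibly several
  // per element, so append to the active buffer instead of overwriting it
}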
/* We expect on the order of a thousand stop words, so we choose
 * the first prime greater than 1000 for the bucket count. */
static void BuildStopWordsHashset(hashset *stopWords, const char *stopWordsFileName)
{
  FILE *infile = fopen(stopWordsFileName, "r");
  assert(infile != NULL);

  streamtokenizer st;
  char buffer[1024];
  HashSetNew(stopWords, sizeof(char *), kApproximateWordCount,
             StringHash, StringCompare, StringFree);
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    char *elem = strdup(buffer);
    HashSetEnter(stopWords, &elem);
  }
  STDispose(&st);
  fclose(infile);
}
static void LoadStopWords(const char *stopWordsURL, rssFeedData *dataPtr)
{
  url u;
  urlconnection urlconn;

  URLNewAbsolute(&u, stopWordsURL);
  URLConnectionNew(&urlconn, &u);

  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    LoadStopWords(urlconn.newUrl, dataPtr);
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char *s = strdup(buffer);
      HashSetEnter(&dataPtr->stopWords, &s);
    }
    printf("\n");
    STDispose(&st);
  }

  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}