static void TokenizeAndBuildThesaurus(hashset *thesaurus, streamtokenizer *st)
{
  printf("Loading thesaurus. Be patient! ");
  fflush(stdout);
  char buffer[2048];
  while (STNextToken(st, buffer, sizeof(buffer))) {
    thesaurusEntry entry;
    entry.word = strdup(buffer);
    VectorNew(&entry.synonyms, sizeof(char *), StringFree, 4);
    while (STNextToken(st, buffer, sizeof(buffer)) && (buffer[0] == ',')) {
      STNextToken(st, buffer, sizeof(buffer));
      char *synonym = strdup(buffer);
      VectorAppend(&entry.synonyms, &synonym);
    }
    HashSetEnter(thesaurus, &entry);
    if (HashSetCount(thesaurus) % 1000 == 0) {
      printf(".");
      fflush(stdout);
    }
  }
  printf(" [All done!]\n");
  fflush(stdout);
}
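/*
 * A minimal sketch (not from the original source) of the thesaurusEntry record and
 * the hashset callbacks that HashSetNew would need for the thesaurus above. The
 * word/synonyms fields are implied by the code; the callback names, bodies, and
 * hash scheme are assumptions.
 */
#include <stdlib.h>
#include <strings.h>
#include <ctype.h>
#include "vector.h"
#include "hashset.h"

typedef struct {
  char *word;       // strdup'd headword
  vector synonyms;  // vector of char *, one strdup'd string per synonym
} thesaurusEntry;

static int ThesaurusEntryHash(const void *elem, int numBuckets)
{
  const char *s = ((const thesaurusEntry *)elem)->word;
  unsigned long hash = 0;
  while (*s != '\0')
    hash = hash * 31 + tolower((unsigned char)*s++); // simple polynomial hash over the headword
  return (int)(hash % numBuckets);
}

static int ThesaurusEntryCompare(const void *a, const void *b)
{
  return strcasecmp(((const thesaurusEntry *)a)->word,
                    ((const thesaurusEntry *)b)->word);
}

static void ThesaurusEntryFree(void *elem)
{
  thesaurusEntry *entry = elem;
  free(entry->word);
  VectorDispose(&entry->synonyms); // frees each synonym via the vector's StringFree
}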
static void Welcome(const char *welcomeTextURL)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, welcomeTextURL);
  URLConnectionNew(&urlconn, &u);
  if (urlconn.responseCode / 100 == 3) {
    Welcome(urlconn.newUrl);
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      printf("%s\n", buffer);
    }
    printf("\n");
    fflush(stdout);
    STDispose(&st); // remember that STDispose doesn't close the stream, since STNew didn't open one
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void LoadStopWords(hashset *stopWords, const char *stopWordsURL)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, stopWordsURL);
  URLConnectionNew(&urlconn, &u);
  if (urlconn.responseCode / 100 == 3) {
    LoadStopWords(stopWords, urlconn.newUrl);
  } else {
    streamtokenizer st;
    char buffer[4096];
    HashSetNew(stopWords, sizeof(char *), kNumStopWordsBuckets, StringHash, StringCompare, StringFree);
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char *stopWord = strdup(buffer);
      HashSetEnter(stopWords, &stopWord);
    }
    STDispose(&st);
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
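/*
 * The stop-word sets above rely on StringHash, StringCompare, and StringFree
 * callbacks. A sketch of typical implementations follows; the exact bodies,
 * and in particular the hash multiplier, are assumptions rather than the
 * project's own definitions.
 */
#include <stdlib.h>
#include <strings.h>
#include <ctype.h>

static const signed long kHashMultiplier = -1664117991L;

static int StringHash(const void *elem, int numBuckets)
{
  const char *s = *(const char **)elem; // elements are char *, so elem is a char **
  unsigned long hashcode = 0;
  for (int i = 0; s[i] != '\0'; i++)
    hashcode = hashcode * kHashMultiplier + tolower((unsigned char)s[i]);
  return (int)(hashcode % numBuckets);
}

static int StringCompare(const void *a, const void *b)
{
  return strcasecmp(*(const char **)a, *(const char **)b); // case-insensitive match
}

static void StringFree(void *elem)
{
  free(*(char **)elem); // each entry owns its strdup'd string
}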
static void BuildIndices(const char *feedsFileURL, rssFeedData *data)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);
  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    BuildIndices(urlconn.newUrl, data);
  } else {
    streamtokenizer st;
    char remoteDocumentURL[2048];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
      STNextToken(&st, remoteDocumentURL, sizeof(remoteDocumentURL));
      ProcessFeed(remoteDocumentURL, data);
    }
    printf("\n");
    STDispose(&st);
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void BuildIndices(rssDatabase *db, const char *feedsFileURL)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, feedsFileURL);
  URLConnectionNew(&urlconn, &u);
  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    BuildIndices(db, urlconn.newUrl);
  } else {
    streamtokenizer st;
    char remoteFileName[2048];
    HashSetNew(&db->indices, sizeof(rssIndexEntry), kNumIndexEntryBuckets, IndexEntryHash, IndexEntryCompare, IndexEntryFree);
    VectorNew(&db->previouslySeenArticles, sizeof(rssNewsArticle), NewsArticleFree, 0);
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
      STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
      STNextToken(&st, remoteFileName, sizeof(remoteFileName));
      ProcessFeed(db, remoteFileName);
    }
    printf("\n");
    STDispose(&st);
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
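/*
 * The STSkipUntil(&st, ":") / STSkipOver(&st, ": ") pair in both BuildIndices
 * variants implies that each line of the feeds file looks like "label : url",
 * with everything up to the first colon ignored and the next token taken as
 * the feed URL. A hypothetical example (these URLs are made up):
 *
 *   World News : http://feeds.example.com/world/rss.xml
 *   Technology : http://feeds.example.com/technology/rss.xml
 */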
static void ScanArticle(streamtokenizer *st, void *userData)
{
  rssFeedData *data = userData;
  articleData *article = AddArticle(&data->articles, &data->item);
  int numWords = 0;
  char word[1024];
  char longestWord[1024] = {'\0'};
  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
        char *dummy = word; // word is an array, so we need a char * lvalue whose address is a char **
        if (HashSetLookup(&data->stopWords, &dummy) == NULL) { // skip stop words
          indexData *entry = addWordRecord(&data->indices, word);
          indexWord(&entry->data, article);
          numWords++;
          if (strlen(word) > strlen(longestWord))
            strcpy(longestWord, word);
        }
      }
    }
  }
  printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
  printf("\tThe longest word scanned was \"%s\".", longestWord);
  if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL))
    printf(" [Ooooo... long word!]");
  printf("\n");
}
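/*
 * Why the "char *dummy = word" idiom above is necessary: the stop-word set
 * stores char * elements, so HashSetLookup wants the address of a char *.
 * Taking &word directly would produce a char (*)[1024], the wrong type.
 * A minimal sketch of the idiom wrapped in a helper (our name, not the
 * original author's):
 */
#include <stdbool.h>
#include "hashset.h"

static bool IsStopWord(hashset *stopWords, const char *word)
{
  const char *key = word; // the array decays to a pointer we can take the address of
  return HashSetLookup(stopWords, &key) != NULL;
}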
static void LoadStopWords(const char *StopWordsTextURL, hashset *stopWords)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, StopWordsTextURL);
  URLConnectionNew(&urlconn, &u);
  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse with the new URL
    LoadStopWords(urlconn.newUrl, stopWords);
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char *copy = strdup(buffer);
      HashSetEnter(stopWords, &copy);
    }
    STDispose(&st);
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void ScanArticle(streamtokenizer *st, const char *articleTitle, const char *unused,
                        const char *articleURL, hashset *stopWords, hashset *wordCounts)
{
  char word[1024];
  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      char *dummy = word;
      void *pos = HashSetLookup(stopWords, &dummy);
      if (WordIsWellFormed(word) && pos == NULL) {
        wordCountEnter(wordCounts, word, articleURL, articleTitle);
      }
    }
  }
}
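/*
 * wordCountEnter is called above but not shown. A sketch of one possible
 * implementation follows; the wordCount/articleCount records and every field
 * name here are assumptions about the author's data model, not the original
 * code. It presumes the wordCounts hashset compares entries by their word field.
 */
#include <string.h>
#include "vector.h"
#include "hashset.h"

typedef struct {
  const char *url;   // not owned here; lifetime managed by the caller
  const char *title;
  int count;         // occurrences of the word within this article
} articleCount;

typedef struct {
  char *word;        // strdup'd key
  vector articles;   // vector of articleCount, one per article containing the word
} wordCount;

static void wordCountEnter(hashset *wordCounts, const char *word,
                           const char *articleURL, const char *articleTitle)
{
  wordCount probe = { (char *)word };
  wordCount *found = HashSetLookup(wordCounts, &probe);
  if (found == NULL) {                        // first sighting of this word
    probe.word = strdup(word);
    VectorNew(&probe.articles, sizeof(articleCount), NULL, 4);
    HashSetEnter(wordCounts, &probe);
    found = HashSetLookup(wordCounts, &probe);
  }
  for (int i = 0; i < VectorLength(&found->articles); i++) {
    articleCount *ac = VectorNth(&found->articles, i);
    if (strcmp(ac->url, articleURL) == 0) {   // same article: bump its count
      ac->count++;
      return;
    }
  }
  articleCount fresh = { articleURL, articleTitle, 1 };
  VectorAppend(&found->articles, &fresh);
}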
static void TokenizeAndBuildStopwords(hashset *stopwords, streamtokenizer *tokenMaker)
{
  printf("loading Stopwords...\n");
  char buffer[2048];
  while (STNextToken(tokenMaker, buffer, sizeof(buffer))) {
    const char *currWordPtr = strdup(buffer);
    HashSetEnter(stopwords, &currWordPtr);
  }
  printf("loaded %d words\n", HashSetCount(stopwords));
}
static void ScanArticle(streamtokenizer *st, article *a, int articleIndex, rssData *allData)
{
  int numWords = 0;
  char word[1024];
  char longestWord[1024] = {'\0'};
  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
        numWords++;
        char *dummy = word;
        if (WordNotInStopwords(&allData->stopwords, word)) {
          /* Try looking up the word. If the word is not in the indices, create a new
             indexEntry initialized with the word and an empty vector and enter it
             into the hashset. */
          indexEntry entry = {word};
          indexEntry *found = HashSetLookup(&allData->indices, &entry);
          if (found == NULL) {
            entry.word = strdup(dummy);
            VectorNew(&entry.articles, sizeof(wordcountEntry), NULL, 10);
            HashSetEnter(&allData->indices, &entry);
          }
          // now we act as if the entry had been in the index all along
          found = (indexEntry *)HashSetLookup(&allData->indices, &entry);
          UpdateIndices(&found->articles, articleIndex);
        }
        if (strlen(word) > strlen(longestWord))
          strcpy(longestWord, word);
      }
    }
  }
  printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
  printf("\tThe longest word scanned was \"%s\".", longestWord);
  if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL))
    printf(" [Ooooo... long word!]");
  printf("\n");
}
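/*
 * UpdateIndices is referenced above but not shown. A sketch under the
 * assumption, suggested by the sizeof(wordcountEntry) in the VectorNew call,
 * that wordcountEntry pairs an article index with an occurrence count; the
 * field names are ours.
 */
#include "vector.h"

typedef struct {
  int articleIndex; // position of the article in the previously-seen list
  int count;        // occurrences of the word within that article
} wordcountEntry;

static void UpdateIndices(vector *articles, int articleIndex)
{
  for (int i = 0; i < VectorLength(articles); i++) {
    wordcountEntry *e = VectorNth(articles, i);
    if (e->articleIndex == articleIndex) { // word already seen in this article
      e->count++;
      return;
    }
  }
  wordcountEntry fresh = { articleIndex, 1 };
  VectorAppend(articles, &fresh);
}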
static void ScanArticle(streamtokenizer *st, article *a, hashset *stopWords,
                        hashset *wordHash, hashset *articlesSeen)
{
  char word[1024];
  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
        ProcessWellFormedWord(word, a, stopWords, wordHash, articlesSeen);
      }
    }
  }
}
static void Welcome(const char *welcomeTextFileName)
{
  FILE *infile;
  streamtokenizer st;
  char buffer[1024];

  infile = fopen(welcomeTextFileName, "r");
  assert(infile != NULL);

  STNew(&st, infile, kNewLineDelimiters, true);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    printf("%s\n", buffer);
  }
  printf("\n");
  STDispose(&st); // remember that STDispose doesn't close the file, since STNew didn't open one
  fclose(infile);
}
static void loadStopWords(hashset *s)
{
  // 1009 buckets: the first prime greater than 1000, roughly the number of stop words
  HashSetNew(s, sizeof(char *), 1009, StringHash, StringCmp, StringFree);

  FILE *infile;
  streamtokenizer st;
  char buffer[1024];
  infile = fopen(kStopListFile, "r");
  assert(infile != NULL);

  STNew(&st, infile, kNewLineDelimiters, true);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    char *newWord = strdup(buffer);
    HashSetEnter(s, &newWord);
  }
  STDispose(&st);
  fclose(infile);
}
static void BuildIndices(const char *feedsFileName, rssData *allData)
{
  FILE *infile;
  streamtokenizer st;
  char remoteFileName[1024];

  infile = fopen(feedsFileName, "r");
  assert(infile != NULL);

  STNew(&st, infile, kNewLineDelimiters, true);
  while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
    STSkipOver(&st, ": ");               // now ignore the colon and any whitespace directly after it
    STNextToken(&st, remoteFileName, sizeof(remoteFileName));
    ProcessFeed(remoteFileName, allData);
  }
  STDispose(&st);
  fclose(infile);
  printf("\n");
}
static void PullAllNewsItems(urlconnection *urlconn, rssFeedData *dataPtr)
{
  streamtokenizer st;
  char buffer[2048];

  XML_Parser rssFeedParser = XML_ParserCreate(NULL);
  XML_SetUserData(rssFeedParser, dataPtr);
  XML_SetElementHandler(rssFeedParser, ProcessStartTag, ProcessEndTag);
  XML_SetCharacterDataHandler(rssFeedParser, ProcessTextData);

  STNew(&st, urlconn->dataStream, "\n", false);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    XML_Parse(rssFeedParser, buffer, strlen(buffer), false);
  }
  STDispose(&st);

  XML_Parse(rssFeedParser, "", 0, true); // final call instructs the parser that we're done
  XML_ParserFree(rssFeedParser);
}
static void PullAllNewsItems(rssDatabase *db, urlconnection *urlconn)
{
  rssFeedState state = {db}; // passed through the parser by address as auxiliary data
  streamtokenizer st;
  char buffer[2048];

  XML_Parser rssFeedParser = XML_ParserCreate(NULL);
  XML_SetUserData(rssFeedParser, &state);
  XML_SetElementHandler(rssFeedParser, ProcessStartTag, ProcessEndTag);
  XML_SetCharacterDataHandler(rssFeedParser, ProcessTextData);

  STNew(&st, urlconn->dataStream, "\n", false);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    XML_Parse(rssFeedParser, buffer, strlen(buffer), false);
  }
  STDispose(&st);

  XML_Parse(rssFeedParser, "", 0, true); // instructs the XML parser that we're done parsing
  XML_ParserFree(rssFeedParser);
}
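/*
 * Both PullAllNewsItems variants register ProcessStartTag, ProcessEndTag, and
 * ProcessTextData as expat callbacks. A minimal self-contained skeleton of that
 * callback trio follows; the feedState struct and its fields are hypothetical
 * stand-ins for the rssFeedData/rssFeedState used in the real code, and bounds
 * checks are omitted for brevity.
 */
#include <expat.h>
#include <stdbool.h>
#include <string.h>
#include <strings.h>

typedef struct {
  char title[2048];
  char url[2048];
  bool inTitle, inLink;
} feedState;

static void ProcessStartTag(void *userData, const XML_Char *name, const XML_Char **atts)
{
  (void)atts; // attributes unused here
  feedState *state = userData;
  if (strcasecmp(name, "title") == 0)      { state->inTitle = true; state->title[0] = '\0'; }
  else if (strcasecmp(name, "link") == 0)  { state->inLink = true;  state->url[0] = '\0'; }
}

static void ProcessEndTag(void *userData, const XML_Char *name)
{
  feedState *state = userData;
  if (strcasecmp(name, "title") == 0)      state->inTitle = false;
  else if (strcasecmp(name, "link") == 0)  state->inLink = false;
  else if (strcasecmp(name, "item") == 0) {
    /* a full <item> has been seen: hand state->title / state->url off to
       whatever records or scans the article */
  }
}

static void ProcessTextData(void *userData, const XML_Char *s, int len)
{
  feedState *state = userData;
  if (state->inTitle)      strncat(state->title, s, len); // expat delivers text in chunks
  else if (state->inLink)  strncat(state->url, s, len);
}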
// The stop-word file holds roughly 1000 words, so we choose
// the first prime > 1000 as the bucket count.
static void BuildStopWordsHashset(hashset *stopWords, const char *stopWordsFileName)
{
  FILE *infile;
  streamtokenizer st;
  char buffer[1024];

  infile = fopen(stopWordsFileName, "r");
  assert(infile != NULL);

  HashSetNew(stopWords, sizeof(char *), kApproximateWordCount, StringHash, StringCompare, StringFree);
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    char *elem = strdup(buffer);
    HashSetEnter(stopWords, &elem);
  }
  STDispose(&st);
  fclose(infile);
}
static void ScanArticle(streamtokenizer *st, int articleID, hashset *indices, hashset *stopWords,
                        pthread_mutex_t *indicesLock, pthread_mutex_t *stopWordsLock)
{
  char word[1024];
  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st);
    } else {
      RemoveEscapeCharacters(word);
      pthread_mutex_lock(stopWordsLock);
      bool shouldIndex = WordIsWorthIndexing(word, stopWords);
      pthread_mutex_unlock(stopWordsLock);
      if (shouldIndex) {
        pthread_mutex_lock(indicesLock);
        AddWordToIndices(indices, word, articleID);
        pthread_mutex_unlock(indicesLock);
      }
    }
  }
}
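/*
 * The threaded ScanArticle above assumes its two mutexes are created and shared
 * by the caller: one serializes stop-word lookups, the other serializes index
 * updates, so multiple feed threads can scan articles concurrently. A minimal
 * setup sketch (the variable names are ours):
 */
#include <pthread.h>

static pthread_mutex_t indicesLock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t stopWordsLock = PTHREAD_MUTEX_INITIALIZER;

/* each worker thread would then call:
   ScanArticle(&st, articleID, &indices, &stopWords, &indicesLock, &stopWordsLock); */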
static void LoadStopWords(const char *stopWordsURL, rssFeedData *dataPtr)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, stopWordsURL);
  URLConnectionNew(&urlconn, &u);
  if (urlconn.responseCode / 100 == 3) { // redirection, so recurse
    LoadStopWords(urlconn.newUrl, dataPtr);
  } else {
    streamtokenizer st;
    char buffer[4096];
    STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
    while (STNextToken(&st, buffer, sizeof(buffer))) {
      char *s = strdup(buffer);
      HashSetEnter(&dataPtr->stopWords, &s);
    }
    printf("\n");
    STDispose(&st);
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
static void ScanArticle(streamtokenizer *st, rssFeedData *data)
{
  articleData *article = addArticle(&data->articles, &data->rssItem);
  int numWords = 0;
  char word[1024];
  char longestWord[1024] = {'\0'};
  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
        char *dummy = word;
        if (HashSetLookup(&data->stopWords, &dummy) == NULL) { // not in the stop list, so index the word
          indexData *entry = addWordRecord(&data->indices, word);
          indexWord(&entry->counters, article);
          numWords++;
          if (strlen(word) > strlen(longestWord))
            strcpy(longestWord, word);
        }
      }
    }
  }
  printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
  printf("\tThe longest word scanned was \"%s\".", longestWord);
  if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL))
    printf(" [Ooooo... long word!]");
  printf("\n");
}