/**
 * Tokenizes one article and records every indexable word it contains.
 *
 * Tokens are pulled from `st` until STNextToken reports that none remain.
 * A "<" token hands control to SkipIrrelevantContent; every other token is
 * passed through RemoveEscapeCharacters and, when it is well formed and not
 * found in `stopWords`, forwarded to wordCountEnter together with the
 * article's URL and title.
 *
 * @param st            tokenizer supplying the article's tokens
 * @param articleTitle  title forwarded to wordCountEnter
 * @param unused        not referenced by this function
 * @param articleURL    URL forwarded to wordCountEnter
 * @param stopWords     hashset consulted to reject words
 * @param wordCounts    hashset handed to wordCountEnter
 */
static void ScanArticle(streamtokenizer *st, const char *articleTitle, const char *unused, const char *articleURL, hashset *stopWords, hashset *wordCounts)
{
    char word[1024];

    (void)unused; // kept only for signature compatibility

    while (STNextToken(st, word, sizeof(word))) {
        if (strcasecmp(word, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }

        RemoveEscapeCharacters(word);

        // The lookup is handed the address of a char *; the array `word`
        // cannot supply one directly, hence the intermediate pointer.
        char *dummy = word;
        void *pos = HashSetLookup(stopWords, &dummy);

        if (WordIsWellFormed(word) && pos == NULL) {
            wordCountEnter(wordCounts, word, articleURL, articleTitle);
        }
    }
}
/**
 * Scans one article's token stream, indexes every well-formed non-stop word,
 * and prints a short summary.
 *
 * An article record is obtained from AddArticle before any token is read.
 * Each accepted word is registered through addWordRecord and then tied to
 * that article through indexWord.  The summary reports how many words were
 * accepted and the longest one seen.
 *
 * @param st        tokenizer supplying the article's tokens
 * @param userData  used as an rssFeedData * (articles, item, stopWords and
 *                  indices are read from it)
 */
static void ScanArticle(streamtokenizer *st, void* userData)
{
    rssFeedData *feed = userData;
    articleData *currentArticle = AddArticle(&feed->articles, &feed->item);

    int accepted = 0;
    char token[1024];
    char longest[1024] = {'\0'};
    size_t longestLen = 0; // always equals strlen(longest)

    while (STNextToken(st, token, sizeof(token))) {
        if (strcasecmp(token, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }

        RemoveEscapeCharacters(token);
        if (!WordIsWellFormed(token)) {
            continue;
        }

        // The lookup wants the address of a char *, which the array itself
        // cannot provide.
        char *key = token;
        if (HashSetLookup(&feed->stopWords, &key) != NULL) {
            continue; // skip stop words
        }

        indexData *record = addWordRecord(&feed->indices, token);
        indexWord(&record->data, currentArticle);
        accepted++;

        size_t tokenLen = strlen(token);
        if (tokenLen > longestLen) {
            strcpy(longest, token);
            longestLen = tokenLen;
        }
    }

    printf("\tWe counted %d well-formed words [including duplicates].\n", accepted);
    printf("\tThe longest word scanned was \"%s\".", longest);
    if (longestLen >= 15 && strchr(longest, '-') == NULL) {
        printf(" [Ooooo... long word!]");
    }
    printf("\n");
}
/**
 * Answers one user query against the index.
 *
 * Malformed words and stop words are rejected with a message.  Any other
 * word is looked up in the indices and the lookup result is handed to
 * printResult along with the word.
 *
 * @param askedWord  the word typed by the user
 * @param userData   used as an rssFeedData * (stopWords and indices are read)
 */
static void ProcessResponse(const char *askedWord,void* userData)
{
    if (!WordIsWellFormed(askedWord)) {
        printf("\tWe won't be allowing words like \"%s\" into our set of indices.\n", askedWord);
        return;
    }

    rssFeedData *data = userData;

    if (HashSetLookup(&data->stopWords, &askedWord) != NULL) {
        // The original message ended in "specificn %s \n": a lost "\n" escape
        // that left a stray 'n' and a dangling copy of the word.
        printf("\t\"%s\" is too common a word to be taken seriously. Try something more specific.\n", askedWord);
        return;
    }

    // NOTE(review): the lookup yields NULL when the word is not indexed;
    // printResult is presumably expected to cope with that - confirm.
    indexData *resultData = HashSetLookup(&data->indices, &askedWord);
    printResult(resultData, askedWord);
}
/**
 * Screens a query word and, if acceptable, passes it on for processing.
 *
 * A word that is not well formed, or that is present in `stopWords`, is
 * rejected with a message.  Anything else goes to ProcessValidResponse with
 * all three hashsets.
 *
 * @param word          the query word
 * @param stopWords     hashset of words that are refused
 * @param wordHash      forwarded to ProcessValidResponse
 * @param articlesSeen  forwarded to ProcessValidResponse
 */
static void ProcessResponse(const char *word, hashset *stopWords, hashset *wordHash, hashset *articlesSeen)
{
    if (!WordIsWellFormed(word)) {
        printf("We won't be allowing words like \"%s\" into our set of indices.\n", word);
        return;
    }

    if (HashSetLookup(stopWords, &word) == NULL) {
        ProcessValidResponse(word, stopWords, wordHash, articlesSeen);
        return;
    }

    printf("This is too common a word. Please be more specific.\n");
}
/**
 * Scans one article's token stream, updates the word index for every
 * well-formed word that passes the stop-word filter, and prints a summary.
 *
 * The word count and the longest-word tracking cover every well-formed
 * token, including those rejected by WordNotInStopwords.
 *
 * @param st            tokenizer supplying the article's tokens
 * @param a             not referenced by this function
 * @param articleIndex  passed to UpdateIndices for each indexed word
 * @param allData       provides the `stopwords` and `indices` hashsets
 */
static void ScanArticle(streamtokenizer *st, article *a, int articleIndex, rssData *allData )
{
    int wellFormed = 0;
    char token[1024];
    char longest[1024] = {'\0'};
    size_t longestLen = 0; // always equals strlen(longest)

    while (STNextToken(st, token, sizeof(token))) {
        if (strcasecmp(token, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }

        RemoveEscapeCharacters(token);
        if (!WordIsWellFormed(token)) {
            continue;
        }
        wellFormed++;

        if (WordNotInStopwords(&allData->stopwords, token)) {
            // Probe with a stack entry that merely borrows the token buffer.
            indexEntry probe = {token};
            indexEntry *slot = HashSetLookup(&allData->indices, &probe);

            if (slot == NULL) {
                // First sighting: give the entry its own copy of the word and
                // an empty article vector before storing it.
                probe.word = strdup(token);
                VectorNew(&probe.articles, sizeof(wordcountEntry), NULL, 10);
                HashSetEnter(&allData->indices, &probe);
            }

            // Look the entry up again so both cases continue from the stored
            // copy.
            slot = (indexEntry *)HashSetLookup(&allData->indices, &probe);
            UpdateIndices(&slot->articles, articleIndex);
        }

        size_t tokenLen = strlen(token);
        if (tokenLen > longestLen) {
            strcpy(longest, token);
            longestLen = tokenLen;
        }
    }

    printf("\tWe counted %d well-formed words [including duplicates].\n", wellFormed);
    printf("\tThe longest word scanned was \"%s\".", longest);
    if (longestLen >= 15 && strchr(longest, '-') == NULL) {
        printf(" [Ooooo... long word!]");
    }
    printf("\n");
}
/**
 * Walks an article's token stream and hands each well-formed word to
 * ProcessWellFormedWord.
 *
 * A "<" token triggers SkipIrrelevantContent; all other tokens are passed
 * through RemoveEscapeCharacters before the well-formedness check.
 *
 * @param st            tokenizer supplying the article's tokens
 * @param a             forwarded to ProcessWellFormedWord
 * @param stopWords     forwarded to ProcessWellFormedWord
 * @param wordHash      forwarded to ProcessWellFormedWord
 * @param articlesSeen  forwarded to ProcessWellFormedWord
 */
static void ScanArticle(streamtokenizer *st, article* a, hashset* stopWords, hashset* wordHash, hashset *articlesSeen)
{
    char token[1024];

    while (STNextToken(st, token, sizeof(token))) {
        if (strcasecmp(token, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }

        RemoveEscapeCharacters(token);
        if (!WordIsWellFormed(token)) {
            continue;
        }

        ProcessWellFormedWord(token, a, stopWords, wordHash, articlesSeen);
    }
}
/**
 * Looks a query word up in the index and prints its articles.
 *
 * When the word is indexed, its article vector is sorted with
 * ReverseWordcountCmp and each element is passed to PrintArticle, with
 * `allData->explored` as the auxiliary argument.
 *
 * @param word     the query word
 * @param allData  provides the `indices` hashset and the `explored` data
 */
static void ProcessResponse(const char *word, rssData *allData)
{
    if (!WordIsWellFormed(word)) {
        printf("\tWe won't be allowing words like \"%s\" into our set of indices.\n", word);
        return;
    }

    // The lookup is keyed by the address of the word pointer.
    indexEntry *match = (indexEntry *)HashSetLookup(&allData->indices, &word);
    if (match == NULL) {
        printf("\tWord not found in our indices\n");
        return;
    }

    VectorSort(&match->articles, ReverseWordcountCmp);
    VectorMap(&match->articles, PrintArticle, &allData->explored);
}
/**
 * Answers one query: rejects malformed words and stop words, otherwise
 * prints the articles recorded for the word or reports that there are none.
 *
 * @param word        the query word
 * @param stopWords   hashset of words that are refused
 * @param wordCounts  hashset searched through wordCountLookup
 */
static void ProcessResponse(const char *word, hashset *stopWords, hashset *wordCounts)
{
    if (!WordIsWellFormed(word)) {
        printf("\tWe won't be allowing words like \"%s\" into our set of indices.\n", word);
        return;
    }

    if (HashSetLookup(stopWords, &word) != NULL) {
        printf("\tToo common a word to be taken seriously. Try something more specific.\n");
        return;
    }

    vector *articles = wordCountLookup(wordCounts, word);
    if (articles == NULL) {
        printf("\tNone of today's news contain the word \"%s\". \n", word);
        return;
    }

    printArticles(articles, word);
}
/**
 * Answers one query against the database.
 *
 * Malformed words and stop words each get their own rejection message.  For
 * any other word, the index is probed; a hit is handed to ListTopArticles
 * together with the set of previously seen articles, and a miss is reported.
 *
 * @param db    provides `stopWords`, `indices` and `previouslySeenArticles`
 * @param word  the query word
 */
static void ProcessResponse(rssDatabase *db, const char *word)
{
    if (!WordIsWellFormed(word)) {
        printf("That search term couldn't possibly be in our set of indices.\n\n");
    } else if (HashSetLookup(&db->stopWords, &word) != NULL) {
        printf("\"%s\" is too common a word to be taken seriously.  Please be more specific.\n\n", word);
    } else {
        // Probe entry: only its leading member is initialized, from the word.
        rssIndexEntry probe = { word };
        rssIndexEntry *hit = HashSetLookup(&db->indices, &probe);

        if (hit != NULL) {
            ListTopArticles(hit, &db->previouslySeenArticles);
        } else {
            printf("None of today's news articles contain the word \"%s\".\n\n", word);
        }
    }
}
static void ProcessResponse(const char *word, void *userData) { if (WordIsWellFormed(word)) { rssFeedData *data = userData; if(HashSetLookup(&data->stopWords, &word)==NULL) { indexData *resultData = HashSetLookup(&data->indices, &word); if(resultData!=NULL) { vector resultVector = resultData->counters; printf("there are %d records of this word", VectorLength(&resultVector)); VectorSort(&resultVector, SortVectorCmpFn); int i=1; VectorMap(&resultVector, PrintResultMapFn, &i); printf("\n"); } else { printf("\tWe don't have records about %s into our set of indices.\n", word); } } } else { printf("\tWe won't be allowing words like \"%s\" into our set of indices.\n", word); } }
/**
 * Scans one article's token stream, indexes every well-formed non-stop word,
 * and prints a short summary.
 *
 * An article record is obtained from addArticle before any token is read.
 * Each accepted word is registered through addWordRecord and then tied to
 * that article through indexWord.  The summary reports how many words were
 * accepted and the longest one seen.
 *
 * @param st    tokenizer supplying the article's tokens
 * @param data  provides `articles`, `rssItem`, `stopWords` and `indices`
 */
static void ScanArticle(streamtokenizer *st, rssFeedData *data)
{
    articleData *currentArticle = addArticle(&data->articles, &data->rssItem);

    int indexedCount = 0;
    char token[1024];
    char longest[1024] = {'\0'};
    size_t longestLen = 0; // always equals strlen(longest)

    while (STNextToken(st, token, sizeof(token))) {
        if (strcasecmp(token, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }

        RemoveEscapeCharacters(token);
        if (!WordIsWellFormed(token)) {
            continue;
        }

        // The lookup wants the address of a char *, which the array itself
        // cannot provide.
        char *key = token;
        if (HashSetLookup(&(data->stopWords), &key) != NULL) {
            continue; // on the stop list, do not index
        }

        indexData *record = addWordRecord(&data->indices, token);
        indexWord(&record->counters, currentArticle);
        indexedCount++;

        size_t tokenLen = strlen(token);
        if (tokenLen > longestLen) {
            strcpy(longest, token);
            longestLen = tokenLen;
        }
    }

    printf("\tWe counted %d well-formed words [including duplicates].\n", indexedCount);
    printf("\tThe longest word scanned was \"%s\".", longest);
    if (longestLen >= 15 && strchr(longest, '-') == NULL) {
        printf(" [Ooooo... long word!]");
    }
    printf("\n");
}
/**
 * Reports whether a word should be indexed.
 *
 * @param word       candidate word
 * @param stopWords  hashset of words to exclude
 * @return true when the word is well formed and the stop-word lookup finds
 *         nothing; false otherwise.  The lookup is not performed for a
 *         malformed word.
 */
static bool WordIsWorthIndexing(const char *word, hashset *stopWords)
{
    if (!WordIsWellFormed(word)) {
        return false;
    }

    return HashSetLookup(stopWords, &word) == NULL;
}