/**
 * ScanArticle — tokenizes the article body on the stream, indexes every
 * well-formed, non-stopword token against the current article, and reports
 * the word count and longest word seen.
 *
 * @param st       tokenizer positioned at the start of the article's HTML.
 * @param userData really an rssFeedData*; carries the articles list,
 *                 stopword set, and word index.
 */
static void ScanArticle(streamtokenizer *st, void* userData) {
    rssFeedData *data = userData;
    articleData *article = AddArticle(&data->articles, &data->item);
    char word[1024];
    char longestWord[1024] = {'\0'};
    int numWords = 0;

    while (STNextToken(st, word, sizeof(word))) {
        if (strcasecmp(word, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }
        RemoveEscapeCharacters(word);
        if (!WordIsWellFormed(word)) continue;
        // the stopword set stores char* keys, so the lookup needs the
        // address of a char* — alias the buffer through a pointer variable
        char *key = word;
        if (HashSetLookup(&data->stopWords, &key) != NULL) continue; // skip stopwords
        indexData *entry = addWordRecord(&data->indices, word);
        indexWord(&entry->data, article);
        numWords++;
        if (strlen(word) > strlen(longestWord))
            strcpy(longestWord, word);
    }

    printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
    printf("\tThe longest word scanned was \"%s\".", longestWord);
    if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL))
        printf(" [Ooooo... long word!]");
    printf("\n");
}
/**
 * ScanArticle — tokenizes the article body on the stream and records every
 * well-formed, non-stopword token in the wordCounts set via wordCountEnter,
 * tagged with the article's URL and title.
 *
 * @param st           tokenizer positioned at the start of the article's HTML.
 * @param articleTitle title recorded alongside each word occurrence.
 * @param unused       retained for signature compatibility; ignored.
 * @param articleURL   URL recorded alongside each word occurrence.
 * @param stopWords    hashset of char* stopwords to skip.
 * @param wordCounts   hashset receiving the per-word occurrence records.
 */
static void ScanArticle(streamtokenizer *st, const char *articleTitle, const char *unused, const char *articleURL, hashset *stopWords, hashset *wordCounts) {
    (void)unused; // explicitly ignored; silences -Wunused-parameter
    char word[1024];
    while (STNextToken(st, word, sizeof(word))) {
        if (strcasecmp(word, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
        } else {
            RemoveEscapeCharacters(word);
            // lookup key must be a char**, so alias the buffer first
            char *dummy = word;
            void *pos = HashSetLookup(stopWords, &dummy);
            if (WordIsWellFormed(word) && pos == NULL) {
                wordCountEnter(wordCounts, word, articleURL, articleTitle);
            }
        }
    }
}
/**
 * ScanArticle — tokenizes the article body on the stream, counts well-formed
 * words, and for each non-stopword updates the global word index with this
 * article's index. Also tracks and reports the longest word scanned.
 *
 * @param st           tokenizer positioned at the start of the article's HTML.
 * @param a            the article being scanned (currently unused here).
 * @param articleIndex position of this article in the global article list;
 *                     stored in the per-word indices.
 * @param allData      aggregate holding the stopwords and indices hashsets.
 */
static void ScanArticle(streamtokenizer *st, article *a, int articleIndex, rssData *allData) {
    int numWords = 0;
    char word[1024];
    char longestWord[1024] = {'\0'};

    while (STNextToken(st, word, sizeof(word))) {
        if (strcasecmp(word, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
        } else {
            RemoveEscapeCharacters(word);
            if (WordIsWellFormed(word)) {
                numWords++;
                if (WordNotInStopwords(&allData->stopwords, word)) {
                    /* Look the word up in the indices. If absent, create a new
                     * indexEntry (owning a strdup'd copy of the word and an
                     * empty article vector) and enter it into the hashset,
                     * then re-fetch the stored copy.
                     * Fix: the second lookup only happens after an insert —
                     * the original re-looked-up unconditionally. */
                    indexEntry entry = {word};
                    indexEntry *found = HashSetLookup(&allData->indices, &entry);
                    if (found == NULL) {
                        entry.word = strdup(word); // owned by the hashset from here on
                        VectorNew(&entry.articles, sizeof(wordcountEntry), NULL, 10);
                        HashSetEnter(&allData->indices, &entry);
                        found = HashSetLookup(&allData->indices, &entry);
                    }
                    UpdateIndices(&found->articles, articleIndex);
                }
                if (strlen(word) > strlen(longestWord))
                    strcpy(longestWord, word);
            }
        }
    }

    printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
    printf("\tThe longest word scanned was \"%s\".", longestWord);
    if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL))
        printf(" [Ooooo... long word!]");
    printf("\n");
}
/**
 * ScanArticle — walks the article's token stream, skipping embedded HTML
 * tags, and hands each well-formed word to ProcessWellFormedWord for
 * stopword filtering and indexing.
 *
 * @param st           tokenizer positioned at the start of the article's HTML.
 * @param a            the article the words belong to.
 * @param stopWords    hashset of stopwords to be filtered downstream.
 * @param wordHash     hashset receiving the indexed words.
 * @param articlesSeen hashset of articles already processed.
 */
static void ScanArticle(streamtokenizer *st, article* a, hashset* stopWords, hashset* wordHash, hashset *articlesSeen) {
    char token[1024];
    while (STNextToken(st, token, sizeof(token))) {
        if (strcasecmp(token, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }
        RemoveEscapeCharacters(token);
        if (WordIsWellFormed(token))
            ProcessWellFormedWord(token, a, stopWords, wordHash, articlesSeen);
    }
}
/**
 * ScanArticle — thread-safe article scan: tokenizes the stream and indexes
 * every word that passes the stopword filter. The stopWords set and the
 * indices set are each guarded by their own mutex, held only for the
 * duration of the single call that touches them.
 *
 * @param st            tokenizer positioned at the start of the article's HTML.
 * @param articleID     identifier stored in the indices for each word hit.
 * @param indices       shared word index; guarded by indicesLock.
 * @param stopWords     shared stopword set; guarded by stopWordsLock.
 * @param indicesLock   mutex protecting indices.
 * @param stopWordsLock mutex protecting stopWords.
 */
static void ScanArticle(streamtokenizer *st, int articleID, hashset *indices, hashset *stopWords, pthread_mutex_t* indicesLock, pthread_mutex_t* stopWordsLock) {
    char token[1024];
    while (STNextToken(st, token, sizeof(token))) {
        if (strcasecmp(token, "<") == 0) {
            SkipIrrelevantContent(st);
            continue;
        }
        RemoveEscapeCharacters(token);

        // consult the shared stopword set under its lock
        pthread_mutex_lock(stopWordsLock);
        bool shouldIndex = WordIsWorthIndexing(token, stopWords);
        pthread_mutex_unlock(stopWordsLock);
        if (!shouldIndex) continue;

        // record the hit in the shared index under its lock
        pthread_mutex_lock(indicesLock);
        AddWordToIndices(indices, token, articleID);
        pthread_mutex_unlock(indicesLock);
    }
}
/**
 * ScanArticle — tokenizes the current RSS item's article body, indexes every
 * well-formed, non-stopword token against the article, and reports the word
 * count and longest word seen.
 *
 * @param st   tokenizer positioned at the start of the article's HTML.
 * @param data aggregate holding the articles list, the current rssItem,
 *             the stopWords set, and the word indices.
 */
static void ScanArticle(streamtokenizer *st, rssFeedData *data) {
    articleData *article = addArticle(&data->articles, &data->rssItem);
    int numWords = 0;
    char word[1024];
    char longestWord[1024] = {'\0'};
    size_t longestLen = 0; // cached strlen(longestWord); avoids rescanning every hit

    while (STNextToken(st, word, sizeof(word))) {
        if (strcasecmp(word, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
        } else {
            RemoveEscapeCharacters(word);
            if (WordIsWellFormed(word)) {
                // the stopword set stores char* keys, so the lookup needs
                // the address of a char* — alias the buffer first
                char *dummy = word;
                if (HashSetLookup(&(data->stopWords), &dummy) == NULL) {
                    // not in stop list, index the word
                    indexData *entry = addWordRecord(&data->indices, word);
                    indexWord(&entry->counters, article);
                    numWords++;
                    size_t len = strlen(word);
                    if (len > longestLen) {
                        longestLen = len;
                        strcpy(longestWord, word);
                    }
                }
            }
        }
    }

    printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
    printf("\tThe longest word scanned was \"%s\".", longestWord);
    if (longestLen >= 15 && (strchr(longestWord, '-') == NULL))
        printf(" [Ooooo... long word!]");
    printf("\n");
}