/**
 * Function: ProcessWellFormedWord
 * -------------------------------
 * Records one occurrence of `word` for article `a` in `wordHash`.
 * Words found in `stopWords` are ignored.
 *
 * If `wordHash` has no entry for the word, a new currWord is entered that
 * owns a heap copy of the word and a fresh article vector holding a copy
 * of `*a` (after `a->numOccurrences` has been set to 1).  Otherwise
 * UpdateOccurences is called on the existing entry's article vector.
 *
 * Both lookups borrow `word` directly; the string is duplicated and the
 * vector allocated only when a new entry is really created, so already
 * indexed words and stop words cost no allocation.
 *
 * `articlesSeen` is not used by this function.
 */
static void ProcessWellFormedWord(char *word, article *a, hashset *stopWords,
                                  hashset *wordHash, hashset *articlesSeen)
{
    if (HashSetLookup(stopWords, &word) != NULL)
        return; /* stop word: nothing to index */

    /* Lookup key: only thisWord is set.  Assumes the hash/compare functions
     * of wordHash inspect nothing but thisWord (ProcessValidResponse looks
     * words up the same way). */
    currWord key;
    key.thisWord = word;

    currWord *existing = (currWord *)HashSetLookup(wordHash, &key);
    if (existing != NULL) {
        /* we just need to update, not add */
        UpdateOccurences(&existing->articles, a);
        return;
    }

    /* Hasn't been seen: the new entry takes ownership of its own copy. */
    key.thisWord = strdup(word);
    VectorNew(&key.articles, sizeof(article), NULL, 100);
    a->numOccurrences = 1;
    VectorAppend(&key.articles, a);
    HashSetEnter(wordHash, &key);
}
static void ProcessResponse(const char *askedWord,void* userData) { if (WordIsWellFormed(askedWord)) { rssFeedData *data = userData; if(HashSetLookup(&data->stopWords,&askedWord)==NULL){ indexData* resultData = HashSetLookup(&data->indices,&askedWord); printResult(resultData, askedWord); }else printf("\tToo common a word to be taken seriously. Try something more specificn %s \n", askedWord); }else printf("\tWe won't be allowing words like \"%s\" into our set of indices.\n", askedWord); }
/**
 * Function: addWordRecord
 * -----------------------
 * Returns the indexData stored in `indices` for `someWord`, creating it
 * first when it is missing.  A newly created record owns a strdup'ed copy
 * of the word and an empty vector of wordCounter elements.
 *
 * The returned pointer is whatever HashSetLookup hands back, i.e. it
 * refers to the element stored inside the hashset.
 */
static indexData *addWordRecord(hashset *indices, char *someWord)
{
    indexData probe;
    probe.word = someWord; /* borrowed for the lookup only */

    indexData *existing = HashSetLookup(indices, &probe);
    if (existing != NULL)
        return existing;

    probe.word = strdup(someWord);
    VectorNew(&probe.data, sizeof(wordCounter), NULL, 1);
    HashSetEnter(indices, &probe);

    /* Look it up again to get the address of the stored copy. */
    return HashSetLookup(indices, &probe);
}
/**
 * Function: addWordRecord
 * -----------------------
 * Finds the indexData for `word` in `indices`; if there is none, enters a
 * new record (heap copy of the word plus an empty wordCounter vector) and
 * returns the address of the stored record as reported by HashSetLookup.
 */
static indexData *addWordRecord(hashset *indices, char *word)
{
    indexData record;
    indexData *stored;

    record.word = word; /* key for the initial lookup; not copied yet */
    stored = (indexData *)HashSetLookup(indices, &record);

    if (stored == NULL) {
        record.word = strdup(word);
        VectorNew(&(record.counters), sizeof(wordCounter), NULL, 1);
        HashSetEnter(indices, &record);
        stored = (indexData *)HashSetLookup(indices, &record);
    }

    return stored;
}
/**
 * Function: ScanArticle
 * ---------------------
 * Tokenizes the article behind `st`, skipping HTML tags, and indexes every
 * well-formed word that is not a stop word: the word's indexEntry in
 * allData->indices is created on first sight (strdup'ed word, empty
 * wordcountEntry vector) and UpdateIndices is then called on its article
 * vector with `articleIndex`.
 *
 * All well-formed words, stop words included, are counted and considered
 * for "longest word"; both statistics are printed at the end.
 * The `a` parameter is not used here.
 */
static void ScanArticle(streamtokenizer *st, article *a, int articleIndex, rssData *allData)
{
    char token[1024];
    char longest[1024] = {'\0'};
    int wellFormedCount = 0;

    while (STNextToken(st, token, sizeof(token))) {
        if (strcasecmp(token, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }

        RemoveEscapeCharacters(token);
        if (!WordIsWellFormed(token))
            continue;

        wellFormedCount++;

        if (WordNotInStopwords(&allData->stopwords, token)) {
            indexEntry probe = {token};

            /* First sighting: enter a record that owns its own copy. */
            if (HashSetLookup(&allData->indices, &probe) == NULL) {
                probe.word = strdup(token);
                VectorNew(&probe.articles, sizeof(wordcountEntry), NULL, 10);
                HashSetEnter(&allData->indices, &probe);
            }

            /* From here on the entry is guaranteed to be in the set. */
            indexEntry *stored = (indexEntry *)HashSetLookup(&allData->indices, &probe);
            UpdateIndices(&stored->articles, articleIndex);
        }

        if (strlen(token) > strlen(longest))
            strcpy(longest, token);
    }

    printf("\tWe counted %d well-formed words [including duplicates].\n", wellFormedCount);
    printf("\tThe longest word scanned was \"%s\".", longest);
    if (strlen(longest) >= 15 && (strchr(longest, '-') == NULL))
        printf(" [Ooooo... long word!]");
    printf("\n");
}
/**
 * Function: ScanArticle
 * ---------------------
 * Tokenizes the article behind `st`, skipping HTML tags, and passes every
 * well-formed word that is not in `stopWords` to wordCountEnter together
 * with the article's URL and title.
 *
 * The `unused` parameter is ignored.  Nothing is printed.
 */
static void ScanArticle(streamtokenizer *st, const char *articleTitle, const char *unused,
                        const char *articleURL, hashset *stopWords, hashset *wordCounts)
{
    char word[1024];

    while (STNextToken(st, word, sizeof(word))) {
        if (strcasecmp(word, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }

        RemoveEscapeCharacters(word);

        /* Test well-formedness first so malformed tokens never cost a
         * hashset lookup. */
        if (!WordIsWellFormed(word))
            continue;

        /* The stop-word set is keyed by char *, so the lookup needs the
         * address of a pointer; &word would be a char (*)[1024]. */
        char *key = word;
        if (HashSetLookup(stopWords, &key) != NULL)
            continue;

        /* NOTE(review): URL is passed before title here — confirm this
         * matches wordCountEnter's parameter order. */
        wordCountEnter(wordCounts, word, articleURL, articleTitle);
    }
}
/**
 * Function: ProcessValidResponse
 * ------------------------------
 * Looks `word` up in `wordHash` and reports the articles containing it.
 * When the word is present, its article vector is sorted in place with
 * OccurrenceCompare and at most ten entries are passed to
 * MapFoundArticles; otherwise a "none found" message is printed.
 *
 * `stopWords` and `articlesSeen` are not used by this function.
 */
static void ProcessValidResponse(const char *word, hashset *stopWords,
                                 hashset *wordHash, hashset *articlesSeen)
{
    enum { kMaxListed = 10 };

    /* The key only borrows the caller's string for the lookup, so no heap
     * copy is needed; the cast merely drops const. */
    currWord key;
    key.thisWord = (char *)word;

    currWord *entry = (currWord *)HashSetLookup(wordHash, &key);
    if (entry == NULL) {
        printf("None of today's news articles contain the word \"%s\".\n", word);
        return;
    }

    int numArts = VectorLength(&entry->articles);
    if (numArts > kMaxListed) {
        printf("\nNice! We found %i articles that include the word \"%s\". "
               "[We'll just list %i of them, though.]\n\n",
               numArts, word, (int)kMaxListed);
        numArts = kMaxListed;
    } else {
        printf("\nNice! We found %i articles that include the word \"%s\".\n\n", numArts, word);
    }

    VectorSort(&entry->articles, OccurrenceCompare);
    MapFoundArticles(&entry->articles, numArts);
}
/**
 * Function: ScanArticle
 * ---------------------
 * Registers the current feed item as an article (AddArticle) and then
 * tokenizes the stream behind `st`, skipping HTML tags.  Each well-formed
 * word that is not a stop word is recorded via addWordRecord/indexWord
 * against that article.
 *
 * Only the indexed (non-stop) words are counted and considered for the
 * "longest word" statistic printed at the end.  `userData` must point to
 * the rssFeedData being built.
 */
static void ScanArticle(streamtokenizer *st, void *userData)
{
    rssFeedData *feed = userData;
    articleData *current = AddArticle(&feed->articles, &feed->item);

    char token[1024];
    char longest[1024] = {'\0'};
    int indexedCount = 0;

    while (STNextToken(st, token, sizeof(token))) {
        if (strcasecmp(token, "<") == 0) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }

        RemoveEscapeCharacters(token);
        if (!WordIsWellFormed(token))
            continue;

        /* The stop-word set wants a char **; &token would have the wrong
         * type, hence the extra pointer. */
        char *tokenPtr = token;
        if (HashSetLookup(&feed->stopWords, &tokenPtr) != NULL)
            continue; /* skip stop words */

        indexData *entry = addWordRecord(&feed->indices, token);
        indexWord(&entry->data, current);
        indexedCount++;

        if (strlen(token) > strlen(longest))
            strcpy(longest, token);
    }

    printf("\tWe counted %d well-formed words [including duplicates].\n", indexedCount);
    printf("\tThe longest word scanned was \"%s\".", longest);
    if (strlen(longest) >= 15 && (strchr(longest, '-') == NULL))
        printf(" [Ooooo... long word!]");
    printf("\n");
}
/**
 * Function: WordCountLookup
 * -------------------------
 * Returns the address of the occurrence vector stored for `word` in
 * `wordCount`, or NULL when the word has no entry.  The hashset is not
 * modified; const is cast away only because HashSetLookup takes a
 * non-const hashset pointer.
 */
vector *WordCountLookup(const hashset *wordCount, const char *word)
{
    wordSet *entry = (wordSet *)HashSetLookup((hashset *)wordCount, &word);
    return (entry != NULL) ? &entry->occ : NULL;
}
/**
 * Function: WordNotInStopwords
 * ----------------------------
 * Returns true when `word` has no entry in `stopwords`, false when it
 * does.  The set is probed with the address of the char * itself.
 */
static bool WordNotInStopwords(hashset *stopwords, char *word)
{
    return HashSetLookup(stopwords, &word) == NULL;
}
/**
 * Function: WordCountEnter
 * ------------------------
 * Records one occurrence of `word` in the article identified by
 * `articleTitle`/`articleURL`.
 *
 *  1. If the word has no entry, a wordSet owning a copy of the word and an
 *     empty articleCount vector (freed with articleCountFreeFn) is entered.
 *  2. If the word/article pairing is new, an articleCount owning copies of
 *     the title and URL is appended with a count of 1.
 *  3. Otherwise the existing pairing's count is incremented.
 */
void WordCountEnter(hashset *wordCount, const char *word, const char *articleTitle, const char *articleURL)
{
    /* Step 1: make sure the word itself is present. */
    if (HashSetLookup(wordCount, &word) == NULL) {
        wordSet fresh;
        fresh.word = strdup(word);
        VectorNew(&fresh.occ, sizeof(articleCount), articleCountFreeFn, 25);
        HashSetEnter(wordCount, &fresh);
    }

    wordSet *entry = (wordSet *)HashSetLookup(wordCount, &word);
    assert(entry != NULL);

    /* Search key borrowing the caller's strings; the casts only drop const. */
    articleCount probe = { { (char *)articleTitle, (char *)articleURL }, 1 };
    int pos = VectorSearch(&entry->occ, &probe, articleCountCompareFn, 0, false);

    if (pos != -1) {
        /* Step 3: pairing already known, bump its count. */
        articleCount *hit = (articleCount *)VectorNth(&entry->occ, pos);
        hit->count++;
        return;
    }

    /* Step 2: first time this word shows up in this article. */
    articleCount added;
    (added.source).title = strdup(articleTitle);
    (added.source).url = strdup(articleURL);
    added.count = 1;
    VectorAppend(&entry->occ, &added);
}
/**
 * Function: ProcessResponse
 * -------------------------
 * Answers one user query against `db`.  Malformed terms and stop words
 * each get their own message; a term with no index entry gets a "none
 * found" message; otherwise ListTopArticles is called with the entry and
 * the database's set of previously seen articles.
 */
static void ProcessResponse(rssDatabase *db, const char *word)
{
    if (!WordIsWellFormed(word)) {
        printf("That search term couldn't possibly be in our set of indices.\n\n");
    } else if (HashSetLookup(&db->stopWords, &word) != NULL) {
        printf("\"%s\" is too common a word to be taken seriously. Please be more specific.\n\n", word);
    } else {
        rssIndexEntry probe = { word }; /* only the first member is needed for the lookup */
        rssIndexEntry *hit = HashSetLookup(&db->indices, &probe);

        if (hit != NULL)
            ListTopArticles(hit, &db->previouslySeenArticles);
        else
            printf("None of today's news articles contain the word \"%s\".\n\n", word);
    }
}
/**
 * Function: findServerLock
 * ------------------------
 * Returns the semaphore associated with `serverURL` in `serverLocks`,
 * creating it on first use.  A new entry owns a strdup'ed copy of the URL
 * and a heap-allocated semaphore initialised to kSimultaneousServerConn.
 *
 * `dataLock` is held for the whole lookup-or-create sequence and released
 * before returning.
 */
static sem_t *findServerLock(hashset *serverLocks, pthread_mutex_t *dataLock, const char *serverURL)
{
    serverLockData probe = {serverURL}; /* borrowed URL, used as lookup key */

    pthread_mutex_lock(dataLock);

    serverLockData *entry = HashSetLookup(serverLocks, &probe);
    if (entry == NULL) {
        /* First request for this server: create its semaphore. */
        probe.url = strdup(serverURL);
        probe.serverLock = malloc(sizeof(sem_t));
        sem_init(probe.serverLock, 0, kSimultaneousServerConn);
        HashSetEnter(serverLocks, &probe);
        entry = HashSetLookup(serverLocks, &probe);
    }

    sem_t *result = entry->serverLock;

    pthread_mutex_unlock(dataLock);
    return result;
}
/**
 * Function: ProcessResponse
 * -------------------------
 * Screens a user query: malformed terms and stop words are rejected with a
 * message, everything else is forwarded to ProcessValidResponse together
 * with the three hashsets.
 */
static void ProcessResponse(const char *word, hashset *stopWords, hashset *wordHash, hashset *articlesSeen)
{
    if (!WordIsWellFormed(word)) {
        printf("We won't be allowing words like \"%s\" into our set of indices.\n", word);
        return;
    }

    if (HashSetLookup(stopWords, &word) != NULL) {
        printf("This is too common a word. Please be more specific.\n");
        return;
    }

    ProcessValidResponse(word, stopWords, wordHash, articlesSeen);
}
static void ProcessResponse(const char *word, void *userData) { if (WordIsWellFormed(word)) { rssFeedData *data = userData; if(HashSetLookup(&data->stopWords, &word)==NULL) { indexData *resultData = HashSetLookup(&data->indices, &word); if(resultData!=NULL) { vector resultVector = resultData->counters; printf("there are %d records of this word", VectorLength(&resultVector)); VectorSort(&resultVector, SortVectorCmpFn); int i=1; VectorMap(&resultVector, PrintResultMapFn, &i); printf("\n"); } else { printf("\tWe don't have records about %s into our set of indices.\n", word); } } } else { printf("\tWe won't be allowing words like \"%s\" into our set of indices.\n", word); } }
/**
 * Function: ParseArticle
 * ----------------------
 * Connects to `articleURL` and, depending on the response code:
 *
 *    0        reports that the connection failed;
 *    200      indexes the article via ScanArticle unless an equal article
 *             is already in `articlesSeen`, in which case it is ignored;
 *    301/302  recurses on the redirect target (urlconn.newUrl);
 *    other    reports the failure with the response code.
 *
 * A temporary article record (copies of server name, title and URL) is
 * built up front.  It is released with ArticleFree on every path except
 * the one that enters it into `articlesSeen`.  `articleDescription` is
 * only forwarded on redirects.
 */
static void ParseArticle(const char *articleTitle, const char *articleDescription,
                         const char *articleURL, hashset *stopWords,
                         hashset *wordHash, hashset *articlesSeen)
{
    url location;
    urlconnection connection;
    streamtokenizer tokenizer;
    article candidate;

    URLNewAbsolute(&location, articleURL);
    URLConnectionNew(&connection, &location);

    candidate.server = strdup(location.serverName);
    candidate.title = strdup(articleTitle);
    candidate.url = strdup(articleURL);
    candidate.numOccurrences = 0;

    switch (connection.responseCode) {
    case 0:
        printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
        ArticleFree(&candidate);
        break;

    case 200:
        if (HashSetLookup(articlesSeen, &candidate) != NULL) {
            /* seen before: drop our temporary copy */
            printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
            ArticleFree(&candidate);
        } else {
            /* new article: it is entered into articlesSeen and not freed here */
            printf("[%s] Indexing \"%s\"\n", location.serverName, articleTitle);
            HashSetEnter(articlesSeen, &candidate);
            STNew(&tokenizer, connection.dataStream, kTextDelimiters, false);
            ScanArticle(&tokenizer, &candidate, stopWords, wordHash, articlesSeen);
            STDispose(&tokenizer);
        }
        break;

    case 301:
    case 302:
        /* Follow the redirect; the article is indexed under the new URL. */
        ParseArticle(articleTitle, articleDescription, connection.newUrl,
                     stopWords, wordHash, articlesSeen);
        ArticleFree(&candidate);
        break;

    default:
        printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n",
               articleTitle, location.serverName, connection.responseCode);
        ArticleFree(&candidate);
        break;
    }

    URLConnectionDispose(&connection);
    URLDispose(&location);
}
/**
 * Function: HashSetEnter
 * ----------------------
 * Stores a copy of the element at `elemAddr` in `h`.
 *
 * If HashSetLookup already finds a matching element, its bytes are
 * overwritten in place with the new element (h->elemSize bytes) and the
 * count is unchanged; no cleanup is performed on the replaced element
 * here.  Otherwise the element is appended to the vector chosen by
 * HashSetElemVector and h->count is incremented.
 *
 * Asserts that `elemAddr` is not NULL.
 */
void HashSetEnter(hashset *h, const void *elemAddr)
{
    assert(elemAddr != NULL);

    void *existing = HashSetLookup(h, elemAddr);
    if (existing != NULL) {
        memcpy(existing, elemAddr, h->elemSize);
        return;
    }

    vector *bucket = HashSetElemVector(h, elemAddr);
    VectorAppend(bucket, elemAddr);
    h->count++;
}
/**
 * Function: AddWordToIndices
 * --------------------------
 * Counts one occurrence of `word` in the article numbered `articleIndex`.
 *
 * The word's rssIndexEntry is created on first sight (owning a copy of the
 * word and an empty vector of rssRelevantArticleEntry).  Within that
 * vector, the entry for `articleIndex` is located with ArticleIndexCompare
 * or appended with a frequency of 0, and its freq is then incremented.
 */
static void AddWordToIndices(hashset *indices, const char *word, int articleIndex)
{
    rssIndexEntry probe = { word }; /* only the word is set; enough for lookup */
    rssIndexEntry *entry = HashSetLookup(indices, &probe);

    if (entry == NULL) {
        probe.meaningfulWord = strdup(word);
        VectorNew(&probe.relevantArticles, sizeof(rssRelevantArticleEntry), NULL, 0);
        HashSetEnter(indices, &probe);

        /* Fetch the stored copy so later updates land inside the set. */
        entry = HashSetLookup(indices, &probe);
        assert(entry != NULL);
    }

    vector *articles = &entry->relevantArticles;
    rssRelevantArticleEntry fresh = { articleIndex, 0 };

    int pos = VectorSearch(articles, &fresh, ArticleIndexCompare, 0, false);
    if (pos == -1) {
        VectorAppend(articles, &fresh);
        pos = VectorLength(articles) - 1; /* the element just appended */
    }

    rssRelevantArticleEntry *slot = VectorNth(articles, pos);
    slot->freq++;
}
/**
 * Function: addArticle
 * --------------------
 * Returns the articleData registered in `articles` for the given feed
 * item, creating and registering it first if necessary.
 *
 * The hashset stores articleData pointers, so it is probed with the
 * address of a pointer to a stack-allocated key that borrows the item's
 * title and URL.  A newly created article is heap-allocated and owns
 * strdup'ed copies of both strings.
 */
static articleData *addArticle(hashset *articles, rssFeedItem *item)
{
    articleData probe;
    probe.title = item->title;
    probe.url = item->url;
    articleData *probeAddr = &probe;

    articleData **slot = (articleData **)HashSetLookup(articles, &probeAddr);
    if (slot != NULL)
        return *slot;

    articleData *created = malloc(sizeof(articleData));
    created->title = strdup(item->title);
    created->url = strdup(item->url);
    HashSetEnter(articles, &created);
    return created;
}
/**
 * Function: ProcessResponse
 * -------------------------
 * Answers one user query against allData->indices.  For an indexed word
 * the entry's article vector is sorted in place with ReverseWordcountCmp
 * and PrintArticle is mapped over it, receiving &allData->explored as
 * auxiliary data.  Malformed and unknown words each get a message.
 */
static void ProcessResponse(const char *word, rssData *allData)
{
    if (!WordIsWellFormed(word)) {
        printf("\tWe won't be allowing words like \"%s\" into our set of indices.\n", word);
        return;
    }

    indexEntry *entry = (indexEntry *)HashSetLookup(&allData->indices, &word);
    if (entry == NULL) {
        printf("\tWord not found in our indices\n");
        return;
    }

    VectorSort(&entry->articles, ReverseWordcountCmp);
    VectorMap(&entry->articles, PrintArticle, &allData->explored);
}
/**
 * Function: ProcessResponse
 * -------------------------
 * Answers one user query.  Malformed terms and stop words are rejected
 * with a message.  Otherwise the word is looked up with wordCountLookup
 * and either printArticles is called on the result or a "none found"
 * message is printed.
 */
static void ProcessResponse(const char *word, hashset *stopWords, hashset *wordCounts)
{
    if (!WordIsWellFormed(word)) {
        printf("\tWe won't be allowing words like \"%s\" into our set of indices.\n", word);
        return;
    }

    if (HashSetLookup(stopWords, &word)) {
        printf("\tToo common a word to be taken seriously. Try something more specific.\n");
        return;
    }

    vector *occurrences = wordCountLookup(wordCounts, word);
    if (occurrences == NULL) {
        printf("\tNone of today's news contain the word \"%s\". \n", word);
        return;
    }

    printArticles(occurrences, word);
}
/**
 * Function: QueryThesaurus
 * ------------------------
 * Interactive loop: prompts for a word on stdout, reads one line from
 * stdin and, if the word is in `thesaurus`, prints one of its synonyms
 * chosen with RandomInteger; otherwise prints an apology.
 *
 * The loop ends when the user enters an empty line, or when stdin reaches
 * end-of-file or a read error occurs.
 */
static void QueryThesaurus(hashset *thesaurus)
{
    char response[1024];
    char *responsep = response; /* the set is keyed by char *, so it needs a char ** */

    while (true) {
        printf("Go ahead and enter a word: ");

        /* On EOF/error fgets leaves the buffer unusable, so stop rather
         * than reprocessing stale or indeterminate contents. */
        if (fgets(response, sizeof(response), stdin) == NULL)
            return;

        /* Strip the trailing newline only if there is one; this is safe
         * for empty strings and for lines that were cut short. */
        response[strcspn(response, "\n")] = '\0';

        if (response[0] == '\0')
            return;

        thesaurusEntry *found = HashSetLookup(thesaurus, &responsep);
        if (found != NULL) {
            int numSynonyms = VectorLength(&found->synonyms);
            char *synonym = *(char **) VectorNth(&found->synonyms, RandomInteger(0, numSynonyms - 1));
            printf("We found \"%s\" in the thesaurus! Its related word of the day is \"%s\".\n", response, synonym);
        } else {
            printf("My apologies, but I know of no such word spelled \"%s\".\n", response);
        }
    }
}
/**
 * Function: ran
 * -------------
 * Chooses the chunk that follows `prev`.
 *
 * `prev` is looked up in `hash`.  If its entry has a non-zero `amount`, a
 * number in [1, amount] is drawn and the entry's successor list is walked,
 * subtracting each successor's `frecuency`, until the draw is used up; the
 * last successor visited is returned.
 *
 * If `prev` has no entry, its `amount` is 0, or its successor list is
 * empty, a random element of `keys` is returned instead.
 *
 * The returned pointer refers to storage owned by `hash`/`keys`; the
 * caller must not free it.
 */
char *ran(hashset *hash, vector *keys, char *prev)
{
    int i, remaining, total;
    char *chosen = NULL;
    keysv search, *entry;

    /* The key only borrows `prev` for the lookup; duplicating it would
     * leak one string per call. */
    search.string = prev;
    entry = (keysv *)HashSetLookup(hash, &search);

    total = (entry != NULL) ? entry->amount : 0;

    if (total != 0) {
        remaining = rand() % total + 1;
        for (i = 0; remaining > 0 && i < entry->vectorv.currentPosition; i++) {
            vectorv *successor = (vectorv *)VectorNth(&entry->vectorv, i);
            remaining = remaining - successor->frecuency;
            chosen = successor->string;
        }
    }

    /* No usable successor: fall back to any key at random. */
    if (chosen == NULL)
        chosen = VectorNth(keys, rand() % keys->currentPosition);

    return chosen;
}
/**
 * Function: BuildTableOfLetterCounts
 * ----------------------------------
 * Reads the file "main.c" character by character and tallies, in `counts`,
 * how often each letter occurs (case-folded with tolower).  Non-letters
 * are ignored.  Asserts that the file can be opened.
 *
 * Each tally is updated by looking up the current count and re-entering a
 * struct frequency carrying the incremented value.
 */
static void BuildTableOfLetterCounts(hashset *counts)
{
    struct frequency entry;
    struct frequency *previous;
    int c;

    FILE *self = fopen("main.c", "r"); // open self as file
    assert(self != NULL);

    while ((c = getc(self)) != EOF) {
        if (!isalpha(c))
            continue; // only count letters

        entry.ch = tolower(c);
        entry.occurrences = 1;

        // Carry the existing count forward if this letter was seen before.
        previous = (struct frequency *) HashSetLookup(counts, &entry);
        if (previous != NULL)
            entry.occurrences = previous->occurrences + 1;

        HashSetEnter(counts, &entry); // enter should overwrite if needed
    }

    fclose(self);
}
/**
 * Function: ScanArticle
 * ---------------------
 * Registers data->rssItem as an article (addArticle) and tokenizes the
 * stream behind `st`, skipping HTML tags.  Every well-formed word that is
 * not a stop word is recorded against that article through
 * addWordRecord/indexWord.
 *
 * Only the indexed words contribute to the word count and "longest word"
 * statistics printed once scanning finishes.
 */
static void ScanArticle(streamtokenizer *st, rssFeedData *data)
{
    articleData *current = addArticle(&data->articles, &data->rssItem);

    int indexedCount = 0;
    char token[1024];
    char longest[1024] = {'\0'};

    while (STNextToken(st, token, sizeof(token))) {
        bool isTagStart = (strcasecmp(token, "<") == 0);
        if (isTagStart) {
            SkipIrrelevantContent(st); // in html-utls.h
            continue;
        }

        RemoveEscapeCharacters(token);
        if (!WordIsWellFormed(token))
            continue;

        /* Stop-word set is keyed by char *: probe with a char **. */
        char *tokenPtr = token;
        bool isStopWord = (HashSetLookup(&(data->stopWords), &tokenPtr) != NULL);
        if (isStopWord)
            continue;

        /* not in stop list, index the word */
        indexData *record = addWordRecord(&data->indices, token);
        indexWord(&record->counters, current);
        indexedCount++;

        if (strlen(token) > strlen(longest))
            strcpy(longest, token);
    }

    printf("\tWe counted %d well-formed words [including duplicates].\n", indexedCount);
    printf("\tThe longest word scanned was \"%s\".", longest);
    if (strlen(longest) >= 15 && (strchr(longest, '-') == NULL))
        printf(" [Ooooo... long word!]");
    printf("\n");
}
/**
 * Function: calculateNext
 * -----------------------
 * Builds a chunk-successor model of the text file `f` and prints generated
 * text to stdout.
 *
 *  1. The file is read with fgets in chunks of at most `k` characters (k
 *     as passed in; it is incremented once to leave room for the
 *     terminating '\0').  Every chunk is appended to `keys`, which is
 *     initialised here with fixed-size slots of sizeof(char *) * k bytes.
 *  2. `hash` is initialised here and filled so that each chunk maps to a
 *     keysv holding its successors (vectorv) with their frequencies.
 *  3. A starting chunk is printed, then further chunks are chosen (via
 *     ran() when k > 0) and printed until `size` chunks containing a space
 *     or newline have been emitted.
 *
 * Asserts that the file can be opened and that the read buffer can be
 * allocated.
 *
 * NOTE(review): with k == 0 on entry fgets is asked for a 1-byte buffer and
 * cannot consume input — confirm callers always pass k >= 1.
 */
void calculateNext(hashset *hash, vector *keys, int k, char *f, int size)
{
    int currK, i, elems = 0, seedNumber;
    char *nxt, *cnt, *key2, *storage;
    FILE *fileR;
    vectorv keyNext;
    keysv *rs, key;

    k += 1;

    /* The buffer must be as large as one vector slot: fgets writes up to k
     * bytes into it and VectorAppend copies a whole slot out of it.  It is
     * zero-filled so the bytes past the string are defined. */
    storage = calloc(1, sizeof(char *) * k);
    assert(storage != NULL && "Out of memory");

    fileR = fopen(f, "r");
    assert(fileR != NULL && "Cannot open the file");

    VectorNew(keys, sizeof(char *) * k, NULL, 10);
    while (fgets(storage, k, fileR) != NULL) {
        currK = strlen(storage);
        /* A newline ended the chunk early: top it up from the next line. */
        if (currK < k && storage[currK - 1] == '\n') {
            fgets(&storage[currK], k - currK, fileR);
        }
        VectorAppend(keys, storage);
    }
    fclose(fileR);
    free(storage); /* every chunk has been copied into keys */

    HashSetNew(hash, sizeof(keysv), keys->currentPosition * 3, hashVector, cmpVector, NULL);

    for (i = 0; i < (keys->currentPosition - 1); i++) {
        vector nexts;

        cnt = VectorNth(keys, i);
        nxt = VectorNth(keys, i + 1);
        keyNext.string = nxt;
        key.string = cnt;

        /* Probe with the stack key; only its string is relevant here, so
         * no temporary heap key is needed. */
        rs = (keysv *)HashSetLookup(hash, &key);

        if (rs == NULL) {
            /* First time this chunk is seen: one successor, seen once. */
            keyNext.frecuency = 1;
            VectorNew(&nexts, sizeof(vectorv), NULL, 1);
            VectorAppend(&nexts, &keyNext);
            key.frecuency = 1;
            key.vectorv = nexts;
            key.amount = 1;
            HashSetEnter(hash, &key);
        } else {
            rs->frecuency++;
            rs->amount++;
            vectorv *rSucessor;
            int idx = VectorSearch(&rs->vectorv, &keyNext, cmpvct, 0, false);
            if (idx >= 0) {
                rSucessor = VectorNth(&rs->vectorv, idx);
                rSucessor->frecuency++;
            } else {
                keyNext.frecuency = 1;
                VectorAppend(&rs->vectorv, &keyNext);
            }
        }
    }

    /* The final chunk has no successor.
     * NOTE(review): `key.vectorv` still holds whatever it was last set to,
     * and if this chunk already has an entry the outcome depends on how
     * HashSetEnter treats duplicates — confirm this is intended. */
    key.string = VectorNth(keys, keys->currentPosition - 1);
    key.frecuency = 1;
    key.amount = 0;
    HashSetEnter(hash, &key);

    if (k == 0) {
        elems = keys->currentPosition;
    } else {
        HashSetMap(hash, mapFn, &elems);
    }

    seedNumber = rand() % elems;
    key2 = (char *)VectorNth(keys, seedNumber);
    printf("Generated text:\n");
    printf("%s", key2);

    if (k > 0) {
        /* i only advances on chunks that contain a space or newline. */
        for (i = 0; i < size;) {
            key2 = ran(hash, keys, key2);
            printf("%s", key2);
            if (strstr(key2, " ") != NULL || strstr(key2, "\n") != NULL) {
                i++;
            }
        }
    } else {
        for (i = 0; i < size;) {
            seedNumber = rand() % elems;
            key2 = (char *)VectorNth(keys, seedNumber);
            printf("%s", key2);
            if (strstr(key2, " ") != NULL || strstr(key2, "\n") != NULL) {
                i++;
            }
        }
    }
    printf("\n");
}
/**
 * Function: WordIsWorthIndexing
 * -----------------------------
 * Returns true only for words that pass WordIsWellFormed and have no
 * entry in `stopWords`.  The stop-word set is consulted only for
 * well-formed words.
 */
static bool WordIsWorthIndexing(const char *word, hashset *stopWords)
{
    if (!WordIsWellFormed(word))
        return false;

    return HashSetLookup(stopWords, &word) == NULL;
}