/**
 * ProcessWellFormedWord
 * ---------------------
 * Records one well-formed word against article a, unless the word is a stop
 * word.  The word is duplicated up front; the duplicate is either handed off
 * to the wordHash entry (first sighting) or freed here (stop word, or word
 * already indexed).
 * NOTE(review): articlesSeen is accepted but never used in this body —
 * confirm whether it can be dropped from the signature.
 */
static void ProcessWellFormedWord(char *word, article *a, hashset *stopWords, hashset *wordHash, hashset *articlesSeen) {
    currWord w;
    char* word2 = strdup(word);
    if(HashSetLookup(stopWords, &word2) == NULL) { //not a stopword
        w.thisWord = word2;
        VectorNew(&w.articles, sizeof(article),NULL, 100);
        currWord* elemAddr = (currWord*)HashSetLookup(wordHash,&w);
        if(elemAddr == NULL){ // Hasn't been seen
            // First sighting anywhere: seed the article vector with this
            // article at a count of 1 and hand the shallow-copied entry
            // (including ownership of word2) to the hashset.
            a->numOccurrences = 1;
            VectorAppend(&w.articles, a);
            HashSetEnter(wordHash, &w);
        } else {
            UpdateOccurences(&elemAddr->articles,a); // we just need to update, not add
            // clean up: word already indexed, so the duplicate string and the
            // scratch vector built above are no longer needed
            free(word2);
            VectorDispose(&w.articles);
        }
    } else {
        free(word2); // free stop word
    }
}
/**
 * LoadStopWords
 * -------------
 * Pulls the stop-word list served at StopWordsTextURL and enters each
 * newline-delimited word (heap-duplicated) into the stopWords hashset.
 * HTTP 3xx responses are chased by recursing on the redirect URL.
 *
 * Fixes vs. the previous version:
 *  - the HashSetEnter argument was the mis-encoded token "©" (an HTML-entity
 *    mangling of "&copy"), which does not compile;
 *  - the redirect branch called Welcome(), which never populates stopWords;
 *    it now retries the stop-word download itself, matching the other
 *    LoadStopWords variants in this file.
 */
static void LoadStopWords(const char *StopWordsTextURL, hashset* stopWords) {
    url u;
    urlconnection urlconn;
    URLNewAbsolute(&u, StopWordsTextURL);
    URLConnectionNew(&urlconn, &u);
    if (urlconn.responseCode / 100 == 3) {
        LoadStopWords(urlconn.newUrl, stopWords);   // follow the redirect
    } else {
        streamtokenizer st;
        char buffer[4096];
        STNew(&st, urlconn.dataStream, kNewLineDelimiters, true);
        while (STNextToken(&st, buffer, sizeof(buffer))) {
            char *copy = strdup(buffer);            // hashset owns the copy
            HashSetEnter(stopWords, &copy);
        }
        STDispose(&st);
    }
    URLConnectionDispose(&urlconn);
    URLDispose(&u);
}
/**
 * LoadStopWords
 * -------------
 * Initializes stopWords and fills it with the newline-delimited words served
 * at stopWordsURL.  Each word is heap-duplicated; the hashset's StringFree
 * reclaims the copies.  HTTP 3xx responses are chased by recursing on the
 * server-supplied URL.
 */
static void LoadStopWords(hashset *stopWords, const char *stopWordsURL) {
    url remote;
    urlconnection conn;
    URLNewAbsolute(&remote, stopWordsURL);
    URLConnectionNew(&conn, &remote);
    if (conn.responseCode / 100 == 3) {
        LoadStopWords(stopWords, conn.newUrl); // redirected: retry at the new location
    } else {
        HashSetNew(stopWords, sizeof(char *), kNumStopWordsBuckets, StringHash, StringCompare, StringFree);
        streamtokenizer tokenizer;
        char token[4096];
        STNew(&tokenizer, conn.dataStream, kNewLineDelimiters, true);
        while (STNextToken(&tokenizer, token, sizeof(token))) {
            char *owned = strdup(token);
            HashSetEnter(stopWords, &owned);
        }
        STDispose(&tokenizer);
    }
    URLConnectionDispose(&conn);
    URLDispose(&remote);
}
/**
 * TokenizeAndBuildThesaurus
 * -------------------------
 * Reads the thesaurus stream token by token.  The first token of each entry
 * becomes the heap-duplicated key; while the following token is a comma, the
 * token after the comma is appended (heap-duplicated) to the synonyms vector.
 * A progress dot is printed for every 1000 entries.
 * NOTE(review): the token that terminates the inner loop (the first
 * non-comma token) is not pushed back — this only works if the tokenizer's
 * delimiter setup guarantees that token is a line separator rather than the
 * next entry's word; confirm against the delimiters the caller passes to
 * STNew.
 */
static void TokenizeAndBuildThesaurus(hashset *thesaurus, streamtokenizer *st) {
    printf("Loading thesaurus. Be patient! ");
    fflush(stdout);
    char buffer[2048];
    while (STNextToken(st, buffer, sizeof(buffer))) {
        thesaurusEntry entry;
        entry.word = strdup(buffer);
        VectorNew(&entry.synonyms, sizeof(char *), StringFree, 4);
        // Consume ",synonym" pairs until something other than a comma shows up.
        while (STNextToken(st, buffer, sizeof(buffer)) && (buffer[0] == ',')) {
            STNextToken(st, buffer, sizeof(buffer));
            char *synonym = strdup(buffer);
            VectorAppend(&entry.synonyms, &synonym);
        }
        HashSetEnter(thesaurus, &entry);
        if (HashSetCount(thesaurus) % 1000 == 0) {
            printf(".");
            fflush(stdout);
        }
    }
    printf(" [All done!]\n");
    fflush(stdout);
}
/**
 * TokenizeAndBuildStopwords
 * -------------------------
 * Drains every token from tokenMaker, heap-duplicates it, and enters the
 * copy into the stopwords set; reports how many words the set holds.
 */
static void TokenizeAndBuildStopwords(hashset *stopwords, streamtokenizer *tokenMaker) {
    printf("loading Stopwords...\n");
    char token[2048];
    while (STNextToken(tokenMaker, token, sizeof(token))) {
        const char *copy = strdup(token);
        HashSetEnter(stopwords, &copy);
    }
    printf("loaded %d words\n", HashSetCount(stopwords));
}
/**
 * addWordRecord
 * -------------
 * Returns the indexData record for someWord, creating one on first sight
 * (with a duplicated key string and an empty data vector) and entering it
 * into the indices hashset.
 */
static indexData* addWordRecord(hashset *indices, char*someWord){
    indexData probe;
    probe.word = someWord;
    indexData *existing = HashSetLookup(indices, &probe);
    if (existing != NULL) return existing;
    // Not indexed yet: build a fresh record and hand it to the hashset.
    probe.word = strdup(someWord);
    VectorNew(&probe.data, sizeof(wordCounter), NULL, 1);
    HashSetEnter(indices, &probe);
    return HashSetLookup(indices, &probe);
}
/**
 * ScanArticle
 * -----------
 * Tokenizes the article's text, skipping HTML tags, and for every
 * well-formed, non-stop word updates that word's entry in allData->indices
 * with this articleIndex.  Reports the number of well-formed words and the
 * longest word seen.
 *
 * Fixes vs. the previous version: the unused alias `dummy` is removed, and
 * the hashset is re-queried only after an insertion — when the first lookup
 * already found the entry, that pointer is used directly instead of paying
 * for a redundant second lookup.
 * NOTE(review): parameter `a` is unused in this body — confirm whether
 * callers still need it in the signature.
 */
static void ScanArticle(streamtokenizer *st, article *a, int articleIndex, rssData *allData )
{
  int numWords = 0;
  char word[1024];
  char longestWord[1024] = {'\0'};

  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
      SkipIrrelevantContent(st); // in html-utls.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
        numWords++;
        if (WordNotInStopwords(&allData->stopwords, word)) {
          /* Look the word up; if it isn't indexed yet, enter a fresh entry
             holding a duplicated key and an empty article vector, then fetch
             the stored copy so updates hit the hashset's element. */
          indexEntry entry = {word};
          indexEntry *found = (indexEntry *)HashSetLookup(&allData->indices, &entry);
          if (found == NULL) {
            entry.word = strdup(word);
            VectorNew(&entry.articles, sizeof(wordcountEntry), NULL, 10);
            HashSetEnter(&allData->indices, &entry);
            found = (indexEntry *)HashSetLookup(&allData->indices, &entry);
          }
          UpdateIndices(&found->articles, articleIndex);
        }
        if (strlen(word) > strlen(longestWord)) strcpy(longestWord, word);
      }
    }
  }

  printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
  printf("\tThe longest word scanned was \"%s\".", longestWord);
  if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL))
    printf(" [Ooooo... long word!]");
  printf("\n");
}
/**
 * ParseArticle
 * ------------
 * Downloads the article at articleURL and, if it hasn't been indexed
 * before, scans its text into wordHash.  Handles the interesting HTTP
 * response codes: 0 (no connection), 200 (OK), 301/302 (redirect — reparse
 * at the new URL), and punts on everything else.  currArt's duplicated
 * strings are either handed to articlesSeen (the first-sight 200 path) or
 * released with ArticleFree on every other path.
 * NOTE(review): articleDescription is accepted but unused here — confirm
 * whether it is needed for the signature shared with callers.
 */
static void ParseArticle(const char *articleTitle,const char *articleDescription, const char *articleURL,hashset* stopWords, hashset* wordHash,hashset *articlesSeen)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&urlconn, &u);
  // Build the article record up front; ownership of the three strdup'd
  // strings transfers to articlesSeen only on the first-sight 200 path.
  article currArt;
  currArt.server = strdup(u.serverName);
  currArt.title = strdup(articleTitle);
  currArt.url = strdup(articleURL);
  currArt.numOccurrences = 0;
  switch (urlconn.responseCode) {
      case 0:
        printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
        ArticleFree(&currArt);
        break;
      case 200:
        if(HashSetLookup(articlesSeen,&currArt)== NULL){ //if we haven't seen this article before
            printf("[%s] Indexing \"%s\"\n", u.serverName,articleTitle);
            HashSetEnter(articlesSeen, &currArt);  // hashset now owns the copies
            STNew(&st, urlconn.dataStream, kTextDelimiters, false);
            ScanArticle(&st, &currArt, stopWords, wordHash, articlesSeen);
            STDispose(&st);
            break;
        } else { //if we have seen it before
            printf("[Ignoring \"%s\": we've seen it before.]\n", articleTitle);
            ArticleFree(&currArt);
            break;
        }
      case 301:
      case 302:
        // just pretend we have the redirected URL all along,though index using the new URL and not the old one...
        ParseArticle(articleTitle, articleDescription, urlconn.newUrl, stopWords, wordHash,articlesSeen);
        ArticleFree(&currArt);
        break;
      default:
        printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
        ArticleFree(&currArt);
        break;
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/**
 * addWordRecord
 * -------------
 * Fetches the indexData entry for word, entering a brand-new record (with a
 * duplicated key and an empty counters vector) when the word is unseen.
 */
static indexData* addWordRecord(hashset *indices, char *word) {
    indexData key = { word };
    indexData *match = (indexData *)HashSetLookup(indices, &key);
    if (match == NULL) {
        // Unseen word: the hashset takes ownership of a fresh record.
        key.word = strdup(word);
        VectorNew(&key.counters, sizeof(wordCounter), NULL, 1);
        HashSetEnter(indices, &key);
        match = (indexData *)HashSetLookup(indices, &key);
    }
    return match;
}
/**
 * Function: addArticle
 * --------------------
 * Looks up the feed item's article (keyed on its title and url) in the
 * articles set.  If absent, heap-allocates a new articleData holding
 * duplicated title/url strings, enters the pointer into the set, and
 * returns it; otherwise returns the previously stored article.
 * (The comment previously attached here described ScanArticle — it was
 * copied from elsewhere and did not match this function.)
 */
static articleData* addArticle(hashset *articles, rssFeedItem *item) {
    // Stack probe borrowing the item's strings — used only for the lookup.
    articleData article;
    article.title = item->title;
    article.url = item->url;
    articleData *articleP = &article;
    void *found = HashSetLookup(articles, &articleP);
    if(found == NULL) {
        // First sighting: the set stores a pointer to a heap record that
        // owns its own string copies.
        articleData *newArticle = malloc(sizeof(articleData));
        newArticle->title = strdup(item->title);
        newArticle->url = strdup(item->url);
        HashSetEnter(articles, &newArticle);
        return newArticle;
    } else {
        return *(articleData**)found;  // set stores articleData*, so deref once
    }
}
/**
 * loadStopWords
 * -------------
 * Creates the stop-word set (string keys, 1009 buckets) and populates it
 * from the newline-delimited file named by kStopListFile; each word is
 * heap-duplicated and owned by the set via StringFree.
 */
static void loadStopWords(hashset *s) {
    HashSetNew(s, sizeof(char *), 1009, StringHash, StringCmp, StringFree);
    FILE *stopFile = fopen(kStopListFile, "r");
    assert(stopFile != NULL);
    streamtokenizer tokenizer;
    char line[1024];
    STNew(&tokenizer, stopFile, kNewLineDelimiters, true);
    while (STNextToken(&tokenizer, line, sizeof(line))) {
        char *dup = strdup(line);
        HashSetEnter(s, &dup);
    }
    STDispose(&tokenizer);
    fclose(stopFile);
}
//stop words, so we choose //the first prime > 1000. static void BuildStopWordsHashset(hashset *stopWords, const char *stopWordsFileName) { FILE *infile; streamtokenizer st; char buffer[1024]; infile = fopen(stopWordsFileName, "r"); assert(infile != NULL); HashSetNew(stopWords, sizeof(char*), kApproximateWordCount, StringHash, StringCompare, StringFree); STNew(&st, infile, kNewLineDelimiters, true); while (STNextToken(&st, buffer, sizeof(buffer))) { char *elem = strdup(buffer); HashSetEnter(stopWords, &elem); } STDispose(&st); fclose(infile); }
/**
 * WordCountEnter
 * --------------
 * Records one occurrence of word within the given article.
 * Three possible cases:
 *  1. Word has not been entered
 *  2. Word has been entered, but word/article combination has not
 *  3. Word/article combination has been entered
 * NOTE(review): the lookups pass &word (a char **) where the set stores
 * wordSet elements — this works only because `word` is wordSet's first
 * member, so the compare/hash functions read the right bytes.  It is
 * layout-dependent and technically not a wordSet*; confirm and consider
 * using a wordSet probe instead.
 */
void WordCountEnter(hashset *wordCount, const char *word, const char *articleTitle, const char *articleURL) {
    wordSet *existingWord = (wordSet *) HashSetLookup(wordCount, &word);
    // ensure that the word exists in the hashset
    if (existingWord == NULL) {
        wordSet ws;
        ws.word = strdup(word);  // the set takes ownership of the copy
        VectorNew(&ws.occ, sizeof(articleCount), articleCountFreeFn, 25);
        HashSetEnter(wordCount, &ws);
    }
    // an entry for the word should always exist now
    existingWord = (wordSet *) HashSetLookup(wordCount, &word);
    assert(existingWord != NULL);
    // now either add the article to the word count vector or increment its current count
    // (the probe borrows the caller's strings; const is cast away only for the search key)
    articleCount articleKey = { { (char *) articleTitle, (char *) articleURL }, 1 };
    int existingArticleIndex = VectorSearch(&existingWord->occ, &articleKey, articleCountCompareFn, 0, false);
    if (existingArticleIndex == -1) {
        // word/article pairing is new, append it to the vector with a count of 1
        articleCount newArticle;
        (newArticle.source).title = strdup(articleTitle);
        (newArticle.source).url = strdup(articleURL);
        newArticle.count = 1;
        VectorAppend(&existingWord->occ, &newArticle);
    } else {
        // word/article pairing exists, increment its count
        articleCount *existingArticle = (articleCount *) VectorNth(&existingWord->occ, existingArticleIndex);
        existingArticle->count++;
    }
}
/**
 * BuildTableOfLetterCounts
 * ------------------------
 * Opens "main.c" (this program's own source), tallies every alphabetic
 * character case-insensitively, and keeps the running counts in the counts
 * hashset keyed by lowercase character.
 */
static void BuildTableOfLetterCounts(hashset *counts) {
    FILE *self = fopen("main.c", "r"); // open self as file
    assert(self != NULL);
    int ch;
    while ((ch = getc(self)) != EOF) {
        if (!isalpha(ch)) continue;    // only count letters
        struct frequency entry;
        entry.ch = tolower(ch);
        entry.occurrences = 1;
        // Carry forward any existing tally for this character.
        struct frequency *prior = (struct frequency *) HashSetLookup(counts, &entry);
        if (prior != NULL) entry.occurrences = prior->occurrences + 1;
        HashSetEnter(counts, &entry);  // enter overwrites the old tally if present
    }
    fclose(self);
}
/**
 * findServerLock
 * --------------
 * Returns the per-server semaphore that throttles simultaneous connections
 * to serverURL, creating and registering one on first sight.  All access to
 * the serverLocks hashset happens while holding dataLock.
 */
static sem_t* findServerLock(hashset *serverLocks, pthread_mutex_t *dataLock, const char* serverURL){
    pthread_mutex_lock(dataLock);
    serverLockData probe = {serverURL};
    serverLockData *record = HashSetLookup(serverLocks, &probe);
    if (record == NULL) {
        // First connection to this server: create its semaphore and register it.
        probe.url = strdup(serverURL);
        probe.serverLock = malloc(sizeof(sem_t));
        sem_init(probe.serverLock, 0, kSimultaneousServerConn);
        HashSetEnter(serverLocks, &probe);
        record = HashSetLookup(serverLocks, &probe);
    }
    sem_t *lock = record->serverLock;
    pthread_mutex_unlock(dataLock);
    return lock;
}
/**
 * AddWordToIndices
 * ----------------
 * Ensures `word` has an entry in indices (duplicating the key and creating
 * an empty article vector on first sight), ensures that entry's article
 * vector holds a record for articleIndex, then bumps that record's
 * frequency by one.
 */
static void AddWordToIndices(hashset *indices, const char *word, int articleIndex)
{
  rssIndexEntry indexEntry = { word }; // partial intialization
  rssIndexEntry *existingIndexEntry = HashSetLookup(indices, &indexEntry);
  if (existingIndexEntry == NULL) {
      // First sighting of the word: hand a fresh entry (owning a duplicated
      // key and an empty article vector) to the hashset.
      indexEntry.meaningfulWord = strdup(word);
      VectorNew(&indexEntry.relevantArticles, sizeof(rssRelevantArticleEntry), NULL, 0);
      HashSetEnter(indices, &indexEntry);
      existingIndexEntry = HashSetLookup(indices, &indexEntry); // pretend like it's been there all along
      assert(existingIndexEntry != NULL);
  }
  // Find (or append with a zero count) the per-article tally, then bump it.
  rssRelevantArticleEntry articleEntry = { articleIndex, 0 };
  int existingArticleIndex = VectorSearch(&existingIndexEntry->relevantArticles, &articleEntry, ArticleIndexCompare, 0, false);
  if (existingArticleIndex == -1) {
      VectorAppend(&existingIndexEntry->relevantArticles, &articleEntry);
      existingArticleIndex = VectorLength(&existingIndexEntry->relevantArticles) - 1;
  }
  rssRelevantArticleEntry *existingArticleEntry = VectorNth(&existingIndexEntry->relevantArticles, existingArticleIndex);
  existingArticleEntry->freq++;
}
/**
 * LoadStopWords
 * -------------
 * Downloads the newline-delimited stop-word list at stopWordsURL and enters
 * each word (heap-duplicated) into dataPtr->stopWords, following HTTP 3xx
 * redirects by recursing on the new URL.
 */
static void LoadStopWords(const char *stopWordsURL, rssFeedData *dataPtr) {
    url u;
    urlconnection conn;
    URLNewAbsolute(&u, stopWordsURL);
    URLConnectionNew(&conn, &u);
    if (conn.responseCode / 100 == 3) {
        // Redirected: retry against the server-supplied location.
        LoadStopWords(conn.newUrl, dataPtr);
    } else {
        streamtokenizer st;
        char token[4096];
        STNew(&st, conn.dataStream, kNewLineDelimiters, true);
        while (STNextToken(&st, token, sizeof(token))) {
            char *s = strdup(token);
            HashSetEnter(&(dataPtr->stopWords), &s);
        }
        printf("\n");
        STDispose(&st);
    }
    URLConnectionDispose(&conn);
    URLDispose(&u);
}
/**
 * calculateNext
 * -------------
 * Builds a k-th order Markov model from file f: every (k)-character seed
 * read from the file maps (in `hash`) to the frequency-weighted set of its
 * successors; then `size` word-like units of random text are generated and
 * printed, seeded from a random element of `keys`.
 *
 * Fixes vs. the previous version:
 *  - the read buffer was malloc'd with the pre-increment k, so fgets could
 *    write one byte past it, and VectorAppend copied sizeof(char*)*k bytes
 *    from it (a large out-of-bounds read); the buffer is now sized to the
 *    vector's element size;
 *  - a heap keysv plus a strdup'd string were allocated every loop pass
 *    purely to perform a lookup and then leaked; a stack probe is used now;
 *  - the read buffer itself leaked when its pointer was repointed into the
 *    vector; it is freed first.
 */
void calculateNext(hashset *hash, vector* keys, int k, char* f, int size) {
    int currK, i, elems = 0, seedNumber;
    char *nxt, *cnt, *key2, *storage;
    FILE *fileR;
    vectorv keyNext;
    keysv key;

    k += 1;
    // Size the buffer to the vector's element size so both fgets (k bytes)
    // and VectorAppend's element-sized copy stay in bounds.
    storage = malloc(sizeof(char*) * k);
    assert(storage != NULL);

    fileR = fopen(f, "r");
    assert(fileR != NULL && "Cannot open the file");
    VectorNew(keys, sizeof(char*) * k, NULL, 10);
    while (fgets(storage, k, fileR) != NULL) {
        currK = strlen(storage);
        // A short read ending in '\n' straddled a line break; pull in the
        // remainder so every stored seed is k-1 characters long.
        if (currK < k && storage[currK - 1] == '\n') {
            fgets(&storage[currK], k - currK, fileR);
        }
        VectorAppend(keys, storage);
    }
    fclose(fileR);
    free(storage);                       // was leaked before the repoint below
    storage = (char*)VectorNth(keys, keys->currentPosition - 1);

    HashSetNew(hash, sizeof(keysv), keys->currentPosition * 3, hashVector, cmpVector, NULL);
    for (i = 0; i < (keys->currentPosition - 1); i++) {
        cnt = VectorNth(keys, i);
        nxt = VectorNth(keys, i + 1);
        keysv probe;                     // stack probe: no per-pass malloc/strdup
        probe.string = cnt;
        keysv *rs = (keysv*)HashSetLookup(hash, &probe);
        keyNext.string = nxt;
        key.string = cnt;
        if (rs == NULL) {
            // First sighting: record cnt -> nxt with a one-element successor vector.
            vector nexts;
            keyNext.frecuency = 1;
            VectorNew(&nexts, sizeof(vectorv), NULL, 1);
            VectorAppend(&nexts, &keyNext);
            key.frecuency = 1;
            key.vectorv = nexts;
            key.amount = 1;
            HashSetEnter(hash, &key);
        } else {
            // Seen before: bump the seed's totals, then bump (or add) nxt.
            rs->frecuency++;
            rs->amount++;
            int idx = VectorSearch(&rs->vectorv, &keyNext, cmpvct, 0, false);
            if (idx >= 0) {
                vectorv *rSucessor = VectorNth(&rs->vectorv, idx);
                rSucessor->frecuency++;
            } else {
                keyNext.frecuency = 1;
                VectorAppend(&rs->vectorv, &keyNext);
            }
        }
    }

    // The final seed has no successor; enter it with an empty tally.
    key.string = VectorNth(keys, keys->currentPosition - 1);
    key.frecuency = 1;
    key.amount = 0;
    HashSetEnter(hash, &key);

    if (k == 0) {   // NOTE(review): unreachable after k += 1 unless k was -1 — confirm intent
        elems = keys->currentPosition;
    } else {
        HashSetMap(hash, mapFn, &elems);
    }

    seedNumber = rand() % elems;
    key2 = (char*)VectorNth(keys, seedNumber);
    printf("Generated text:\n");
    printf("%s", key2);
    if (k > 0) {
        // Walk the chain: each step asks the model for a weighted successor.
        for (i = 0; i < size;) {
            key2 = ran(hash, keys, key2);
            printf("%s", key2);
            if (strstr(key2, " ") != NULL || strstr(key2, "\n") != NULL) {
                i++;   // count a "word" each time output contains a separator
            }
        }
    } else {
        // Order-0 fallback: emit uniformly random seeds.
        for (i = 0; i < size;) {
            seedNumber = rand() % elems;
            key2 = (char*)VectorNth(keys, seedNumber);
            printf("%s", key2);
            if (strstr(key2, " ") != NULL || strstr(key2, "\n") != NULL) {
                i++;
            }
        }
    }
    printf("\n");
}