int main(int argc, char* argv[]) { chash* table = chash_new(); char* text = read_entire_file(fopen(argv[1], "r")); char* token = strtok(text, " \t\n"); while (token) { char* cleaned_word = clean_word(token); if (cleaned_word != NULL) { see_word(table, cleaned_word); free(cleaned_word); } token = strtok(NULL, " \t\n"); } chash_iterator iter; chash_iterator_init(&iter, table); char *key; void *value; pair* pairs = malloc(chash_size(table) * sizeof(*pairs)); int i = 0; while (chash_iterator_next(&iter, &key, &value)) { pairs[i].key = key; pairs[i].value = value; i++; } qsort(pairs, chash_size(table), sizeof(pair), compare_by_frequency); printf("Words sorted by frequency:\n"); for (i = 0; i < chash_size(table); ++i) printf("%s: %d\n", pairs[i].key, *((int*)pairs[i].value)); chash_free(table); return 0; }
int test_file(FILE *fp, int argc, char **argv) { char buf[65535 + 1]; char *pos; unsigned int strategy = 0; /* what bucketing strategy we're using */ void *ptr = NULL; unsigned int bucketsize = 0; struct params params = {0}; struct chash *hash = NULL; char name[256]; if (!parse_params(argc, argv, ¶ms)) { fprintf(stderr, "failed to parse params\n"); return 0; } while (fgets((char *) buf, 65535, fp)) { str_rtrim(buf); pos = (char *) str_ltrim(buf); if (!str_casecmp(pos, "new")) { /* creating a new bucket */ unsigned int size = -1; if (ptr) { chash_delete(hash); free(ptr); } /* read parameters */ if ((fscanf(fp, "%255s %u %u", name, &strategy, &size) == 3) && (size <= 65535) && (bucketsize = size) && (ptr = malloc(size)) && (hash = chash_ptr_new(1, 2.0, /* some fn pointer casting dodginess */ (unsigned int (*)(const void *)) str_len, (int (*)(const void *, const void *)) str_cmp)) && (bucket_new(ptr, bucketsize, strategy))) { /* succeeded, do nothing */ if (params.verbose) { printf("%s: new bucket with size %u strategy %u\n", name, size, strategy); } } else { fprintf(stderr, "%s: failed to create bucket\n", name); return 0; } } else if (!str_casecmp(pos, "add")) { /* adding a term to the bucket */ void *ret; unsigned int veclen, succeed, len; int toobig; if (!ptr) { return 0; } /* read parameters */ if ((fscanf(fp, "%65535s %u %u", buf, &veclen, &succeed) == 3) && (veclen <= 65535)) { len = str_len(buf); if ((((ret = bucket_alloc(ptr, bucketsize, strategy, buf, len, veclen, &toobig, NULL)) && succeed) || (!ret && !succeed))) { /* do nothing */ if (params.verbose) { printf("%s: added term '%s'\n", name, buf); } } else if (succeed) { fprintf(stderr, "%s: failed to add '%s' to bucket\n", name, buf); return 0; } else if (!succeed) { fprintf(stderr, "%s: add '%s' succeeded but shouldn't " "have\n", name, buf); return 0; } } else { fprintf(stderr, "%s: failed to add\n", name); return 0; } } else if (!str_casecmp(pos, "ls")) { /* matching stuff in the bucket */ unsigned int 
numterms, i, len, veclen, veclen2, state; void *addr; struct chash *tmphash; const char *term; void **tmpptr, *tmp; if (!ptr) { return 0; } if (!(tmphash = chash_ptr_new(1, 2.0, /* some fn pointer casting dodginess */ (unsigned int (*)(const void *)) str_len, (int (*)(const void *, const void *)) str_cmp))) { fprintf(stderr, "%s: failed to init hashtable\n", name); return 0; } /* first, fill hashtable with all terms from bucket */ state = 0; while ((term = bucket_next_term(ptr, bucketsize, strategy, &state, &len, &addr, &veclen))) { if (!((term = str_ndup(term, len)) && (chash_ptr_ptr_insert(tmphash, term, (void*) term) == CHASH_OK))) { fprintf(stderr, "%s: failed to init hashtable\n", name); return 0; } } /* now, take terms from file, comparing them with hashtable * entries */ if (fscanf(fp, "%u", &numterms)) { for (i = 0; i < numterms; i++) { if (fscanf(fp, "%65535s %u ", buf, &veclen)) { if (params.verbose) { printf("%s: ls checking %s\n", name, buf); } if ((addr = bucket_find(ptr, bucketsize, strategy, buf, str_len(buf), &veclen2, NULL)) /* remove it from hashtable */ && chash_ptr_ptr_find(tmphash, buf, &tmpptr) == CHASH_OK && chash_ptr_ptr_remove(tmphash, *tmpptr, &tmp) == CHASH_OK && (free(tmp), 1) && (veclen <= 65535) && (veclen2 == veclen) && fread(buf, veclen, 1, fp) && ((buf[veclen] = '\0'), 1) && (!params.verbose || printf("%s: ls check read '%s'\n", name, buf)) && !memcmp(buf, addr, veclen)) { /* do nothing */ } else { unsigned int j; fprintf(stderr, "%s: ls failed cmp '%s' with '", name, buf); for (j = 0; j < veclen; j++) { putc(((char *) addr)[j], stderr); } fprintf(stderr, "'\n"); return 0; } } else { fprintf(stderr, "%s: ls failed\n", name); return 0; } } if (chash_size(tmphash)) { fprintf(stderr, "%s: ls failed\n", name); return 0; } } else { fprintf(stderr, "%s: ls failed\n", name); return 0; } chash_delete(tmphash); if (params.verbose) { printf("%s: matched all (%u) entries\n", name, numterms); } } else if (!str_casecmp(pos, "set")) { /* setting 
the vector for a term in the bucket */ unsigned int veclen, reallen; void *addr; if (!ptr) { return 0; } /* read parameters */ if ((fscanf(fp, "%65535s %u ", buf, &veclen) == 2) && (veclen <= 65535)) { addr = bucket_find(ptr, bucketsize, strategy, buf, str_len(buf), &reallen, NULL); if (addr && (reallen == veclen) && fread(addr, 1, veclen, fp)) { /* do nothing */ if (params.verbose) { unsigned int j; printf("%s: set term '%s' to '", name, buf); for (j = 0; j < reallen; j++) { putc(((char *) addr)[j], stdout); } printf("'\n"); } } else { fprintf(stderr, "%s: failed to set!\n", name); return 0; } } else { fprintf(stderr, "%s: failed to set\n", name); return 0; } } else if (!str_casecmp(pos, "realloc")) { /* reallocating a term in the bucket */ unsigned int veclen, succeed; int toobig; if (!ptr) { return 0; } /* read parameters */ if ((fscanf(fp, "%65535s %u %u", buf, &veclen, &succeed) == 3) && (veclen <= 65535)) { if (!bucket_realloc(ptr, bucketsize, strategy, buf, str_len(buf), veclen, &toobig)) { fprintf(stderr, "%s: failed to realloc!\n", name); return 0; } } else { fprintf(stderr, "%s: failed to realloc\n", name); return 0; } if (params.verbose) { printf("%s: realloc'd term '%s'\n", name, buf); } } else if (!str_casecmp(pos, "rm")) { /* removing something from the bucket */ unsigned int succeed; if (!ptr) { return 0; } if (fscanf(fp, "%65535s %u", buf, &succeed) == 2) { if (succeed) { if (!(bucket_remove(ptr, bucketsize, strategy, buf, str_len(buf)))) { fprintf(stderr, "%s: failed to rm '%s'\n", name, buf); return 0; } else if (params.verbose) { printf("%s: rm term '%s'\n", name, buf); } } else if (succeed) { fprintf(stderr, "%s: failed to rm\n", name); return 0; } } else { fprintf(stderr, "%s: failed to rm\n", name); return 0; } } else if (!str_casecmp(pos, "print")) { /* printing out the bucket contents */ unsigned int state = 0, len, veclen; const char *term; char format[100]; void *addr; if (!ptr) { printf("can't print, no bucket\n"); } else { do { term = 
bucket_next_term(ptr, bucketsize, strategy, &state, &len, &addr, &veclen); } while (term && memcpy(buf, term, len) && ((buf[len] = '\0') || 1) && snprintf(format, 100, "%%.%us (%%u): '%%.%us' (%%u) " "(off %%u)\n", len, veclen) && printf(format, term, len, (char*) addr, veclen, ((char *) addr) - (char *) ptr)); if (!state) { printf("(empty)\n"); } printf("%u entries, %u data, %u string, %u overhead, %u free\n", bucket_entries(ptr, bucketsize, strategy), bucket_utilised(ptr, bucketsize, strategy), bucket_string(ptr, bucketsize, strategy), bucket_overhead(ptr, bucketsize, strategy), bucket_unused(ptr, bucketsize, strategy)); } } else if (!str_casecmp(pos, "match")) { unsigned int veclen, veclen2; void *addr; if (fscanf(fp, "%65535s %u ", buf, &veclen)) { if ((addr = bucket_find(ptr, bucketsize, strategy, buf, str_len(buf), &veclen2, NULL)) && (veclen <= 65535) && (veclen2 >= veclen) && (!params.verbose || printf("%s: match on '%s' ", name, buf)) && fread(buf, veclen, 1, fp) && !memcmp(buf, addr, veclen)) { if (params.verbose) { printf("content succeeded\n"); } } else { fprintf(stderr, "%s: match failed (%s vs %s)\n", name, buf, (char *) addr); return 0; } } else { fprintf(stderr, "%s: match failed\n", name); return 0; } } else if ((*pos != '#') && str_len(pos)) { fprintf(stderr, "%s: unknown command '%s'\n", name, pos); return 0; } } if (ptr) { chash_delete(hash); free(ptr); } return 1; }
/*
 * Evaluate a query against impact-ordered lists, adding results to
 * accumulators.
 *
 * The query terms are sorted with f_t_cmp, each is given a quantised query
 * weight (w_qt), and a source is opened for every term that is kept.  The
 * kept terms are then placed on a heap ordered by term_data_cmp, and the
 * term at the top is repeatedly popped and has one block decoded into the
 * accumulators, until the heap is empty or the popped term's impact is no
 * longer greater than blockfine.
 *
 * idx          - index; impact_stats supplies the normalisation/quantisation
 *                parameters
 * query        - query to evaluate; query->term is re-sorted in place
 * accumulators - hash table that decoded blocks are added to
 * acc_limit    - while chash_size(accumulators) is below this, blocks are
 *                decoded with impact_decode_block() after reserving space;
 *                afterwards impact_decode_block_and() is used
 * alloc        - passed through to search_term_src()
 * mem          - memory budget divided between the term sources
 *
 * Returns SEARCH_OK on success (including a query with no terms),
 * SEARCH_ENOMEM if the term array cannot be allocated, SEARCH_EINVAL if a
 * source cannot be opened, accumulator space cannot be reserved or a list
 * ends unexpectedly, or the error returned by a source's readlist().
 *
 * NOTE(review): postings, bytes, postings_read and bytes_read are
 * accumulated but never read within this function.
 */
enum search_ret impact_ord_eval(struct index *idx, struct query *query,
  struct chash *accumulators, unsigned int acc_limit, struct alloc *alloc,
  unsigned int mem) {
    double norm_B;
    unsigned int i,
                 terms = 0,          /* number of terms kept / on the heap */
                 blockfine,          /* threshold an impact must exceed */
                 blocks_read,
                 postings_read = 0,
                 postings = 0,
                 bytes = 0,
                 bytes_read = 0;
    struct term_data *term,
                     *largest;
    struct disksrc *dsrc;

    if (query->terms == 0) {
        /* no terms to process */
        return SEARCH_OK;
    /* allocate space for array */
    } else if (!(term = malloc(sizeof(*term) * query->terms))) {
        return SEARCH_ENOMEM;
    }

    /* sort by selectivity (by inverse t_f) */
    qsort(query->term, query->terms, sizeof(*query->term), f_t_cmp);

    norm_B = pow(idx->impact_stats.w_qt_max / idx->impact_stats.w_qt_min,
        idx->impact_stats.w_qt_min
          / (idx->impact_stats.w_qt_max - idx->impact_stats.w_qt_min));

    /* initialise data for each query term */
    for (i = 0; i < query->terms; i++) {
        unsigned int termfine;
        double w_qt;

        /* initialise src/vec for term */
        term[i].v.pos = term[i].v.end = NULL;
        term[i].src = NULL;

        /* raw weight from f_qt and f_t, then normalised and quantised */
        w_qt = (1 + log(query->term[i].f_qt)) *
          log(1 + (idx->impact_stats.avg_f_t / query->term[i].f_t));
        w_qt = impact_normalise(w_qt, norm_B,
            idx->impact_stats.slope, idx->impact_stats.w_qt_max,
            idx->impact_stats.w_qt_min);
        term[i].w_qt = impact_quantise(w_qt,
            idx->impact_stats.quant_bits, idx->impact_stats.w_qt_max,
            idx->impact_stats.w_qt_min);

        /* apply term fine to term impact: the first two terms (in sorted
         * order) are not fined, each later one is fined one more */
        termfine = (i < 2) ? 0 : i - 2;
        if (termfine < term[i].w_qt) {
            term[i].w_qt -= termfine;
            /* initialise to highest impact, so we'll select and initialise this
             * term before real processing */
            term[i].impact = INT_MAX;
            terms++;
        } else {
            /* we won't use this term */
            term[i].w_qt = 0;
            term[i].impact = 0;
        }
        term[i].blocksize = 0;

        /* XXX */
        postings += query->term[i].f_t;
        bytes += query->term[i].term.vocab.size;
    }

    /* get sources for each term (do this in a separate loop so we've already
     * excluded lists that we won't use).
     * NOTE(review): this indexes entries 0..terms-1; the loop above does not
     * obviously guarantee that the kept terms are exactly those entries --
     * confirm. */
    for (i = 0; i < terms; i++) {
        /* share the remaining budget evenly over the remaining terms, but
         * never give a term more than its vocab.size */
        unsigned int memsize = mem / (terms - i);

        if (memsize > query->term[i].term.vocab.size) {
            memsize = query->term[i].term.vocab.size;
        }

        if (!(term[i].src
          = search_term_src(idx, &query->term[i].term, alloc, memsize))) {
            source_delete(term, terms);
            free(term);
            return SEARCH_EINVAL;
        }

        mem -= memsize;
    }

    blockfine = blocks_read = 0;
    heap_heapify(term, terms, sizeof(*term), term_data_cmp);

    do {
        largest = heap_pop(term, &terms, sizeof(*term), term_data_cmp);

        if (largest && (largest->impact > blockfine)) {
            postings_read += largest->blocksize;
            if (chash_size(accumulators) < acc_limit) {
                /* reserve enough memory for accumulators and decode */
                if (chash_reserve(accumulators, largest->blocksize)
                  >= largest->blocksize) {
                    impact_decode_block(accumulators, largest, blockfine);
                } else {
                    assert(!CRASH); ERROR("impact_ord_eval()");
                    source_delete(term, terms);
                    free(term);
                    return SEARCH_EINVAL;
                }
            } else {
                impact_decode_block_and(accumulators, largest, blockfine);
            }

            if (VEC_LEN(&largest->v) < 2 * VEC_VBYTE_MAX) {
                /* need to read more data (this inner 'bytes' shadows the
                 * function-level counter of the same name) */
                unsigned int bytes;
                enum search_ret sret;

                if ((sret = largest->src->readlist(largest->src,
                    VEC_LEN(&largest->v), (void **) &largest->v.pos, &bytes))
                  == SEARCH_OK) {
                    /* read succeeded */
                    largest->v.end = largest->v.pos + bytes;
                } else if (sret == SEARCH_FINISH) {
                    if (VEC_LEN(&largest->v) || largest->blocksize) {
                        /* didn't finish properly */
                        assert(!CRASH); ERROR("impact_ord_eval()");
                        source_delete(term, terms);
                        free(term);
                        return SEARCH_EINVAL;
                    }
                    /* otherwise it will be finished below */
                } else {
                    assert(!CRASH); ERROR("impact_ord_eval()");
                    source_delete(term, terms);
                    free(term);
                    return sret;
                }
            }

            if (!largest->blocksize) {
                /* need to read the start of the next block */
                unsigned long int tmp_bsize, tmp_impact;

                if (vec_vbyte_read(&largest->v, &tmp_bsize)
                  && (vec_vbyte_read(&largest->v, &tmp_impact)
                    /* second read failed, rewind past first vbyte */
                    || ((largest->v.pos -= vec_vbyte_len(tmp_bsize)), 0))) {

                    /* once more block headers have been read than there are
                     * terms on the heap, every further header raises the
                     * threshold by one */
                    blocks_read++;
                    if (blocks_read > terms) {
                        blockfine++;
                    }

                    largest->blocksize = tmp_bsize;
                    largest->impact = (tmp_impact + 1) * largest->w_qt;
                    largest->docno = -1;
                    heap_insert(term, &terms, sizeof(*term), term_data_cmp,
                      largest);
                } else if (!VEC_LEN(&largest->v)) {
                    /* finished, don't put back on the heap */
                    dsrc = (void *) largest->src;
                    bytes_read += dsrc->pos;
                    largest->src->delet(largest->src);
                    largest->src = NULL;
                } else if (largest->impact != INT_MAX) {
                    /* ensure that this vector is chosen next, as we need the
                     * next impact score */
                    largest->impact = INT_MAX;
                    assert(largest->blocksize == 0);
                    heap_insert(term, &terms, sizeof(*term), term_data_cmp,
                      largest);
                } else {
                    /* huh? (already marked INT_MAX, data left in the vector,
                     * but a block header still could not be read) */
                    assert(!CRASH); ERROR("impact_ord_eval()");
                    source_delete(term, terms);
                    free(term);
                    return SEARCH_EINVAL;
                }
            } else {
                /* block not exhausted yet: put the term back on the heap */
                heap_insert(term, &terms, sizeof(*term), term_data_cmp,
                  largest);
            }
        }
    } while (largest && (largest->impact > blockfine));

    /* tally read positions of the sources still on the heap */
    for (i = 0; i < terms; i++) {
        dsrc = (void *) term[i].src;
        bytes_read += dsrc->pos;
    }

    /* the last popped term (the one that ended the loop) was not re-inserted,
     * so its source is released here */
    if (largest) {
        largest->src->delet(largest->src);
        largest->src = NULL;
    }

    /* end of ranking */

    source_delete(term, terms);
    free(term);
    return SEARCH_OK;
}