Example #1
0
/*
 * Counts how often each word occurs in the file named by argv[1] and prints
 * every word with its count, in the order defined by compare_by_frequency.
 *
 * Returns 0 on success and 1 when the file argument is missing, the file
 * cannot be opened or read, or an allocation fails.
 */
int main(int argc, char* argv[])
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "wordcount");
        return 1;
    }

    FILE* input = fopen(argv[1], "r");
    if (input == NULL)
    {
        perror(argv[1]);
        return 1;
    }

    /* NOTE(review): read_entire_file is not visible here, so it is unknown
     * whether it closes `input` or whether the returned buffer must be
     * free()d by the caller; neither is released in this function - confirm
     * against its contract. */
    char* text = read_entire_file(input);
    if (text == NULL)
    {
        fprintf(stderr, "%s: could not read file\n", argv[1]);
        return 1;
    }

    chash* table = chash_new();
    if (table == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return 1;
    }

    /* split on whitespace and count every token that survives cleaning */
    for (char* token = strtok(text, " \t\n"); token != NULL;
         token = strtok(NULL, " \t\n"))
    {
        char* cleaned_word = clean_word(token);
        if (cleaned_word != NULL)
        {
            see_word(table, cleaned_word);
            free(cleaned_word);
        }
    }

    /* snapshot the table into a flat array so it can be sorted */
    size_t count = chash_size(table);
    pair* pairs = NULL;
    if (count > 0)
    {
        pairs = malloc(count * sizeof(*pairs));
        if (pairs == NULL)
        {
            fprintf(stderr, "out of memory\n");
            chash_free(table);
            return 1;
        }
    }

    chash_iterator iter;
    chash_iterator_init(&iter, table);
    char *key; void *value;
    size_t filled = 0;
    /* never write past the allocation, whatever the iterator yields */
    while (filled < count && chash_iterator_next(&iter, &key, &value)) {
      pairs[filled].key = key;
      pairs[filled].value = value;
      filled++;
    }

    /* only the entries actually filled are sorted and printed */
    if (filled > 0)
        qsort(pairs, filled, sizeof(*pairs), compare_by_frequency);

    printf("Words sorted by frequency:\n");
    for (size_t i = 0; i < filled; ++i)
        printf("%s: %d\n", pairs[i].key, *((int*)pairs[i].value));

    /* the keys and values printed above are the ones stored from the table
     * iterator, so the table is released only after printing */
    free(pairs);
    chash_free(table);
    return 0;
}
Example #2
0
/*
 * Runs a bucket test script read from fp.
 *
 * Each command sits on its own line (matched case-insensitively after
 * trimming); its parameters are then read from the stream with fscanf.
 * Recognised commands:
 *   new      name strategy size      - create a fresh bucket of `size` bytes
 *   add      term veclen succeed     - allocate a vector, expect given outcome
 *   ls       n, then n x (term veclen bytes) - bucket must hold exactly these
 *   set      term veclen bytes       - overwrite the vector stored for term
 *   realloc  term veclen succeed     - resize the vector stored for term
 *   rm       term succeed            - remove term
 *   print                            - dump bucket contents and statistics
 *   match    term veclen bytes       - stored vector must start with bytes
 * Lines starting with '#' and empty lines are ignored.
 *
 * argc/argv are handed to parse_params (which fills params.verbose, used
 * here to enable progress output).
 *
 * Returns 1 when the whole script was processed without a failed
 * expectation, 0 otherwise.
 *
 * NOTE(review): failure paths return without releasing the bucket or any
 * hashtable that is live at that point.
 */
int test_file(FILE *fp, int argc, char **argv) {
    /* zero-filled: every write below touches at most the first 65535 bytes
     * or stores a '\0', so buf[65535] stays '\0' and buf is always a
     * terminated string, even after a raw fread */
    char buf[65535 + 1] = {0};
    char *pos;
    unsigned int strategy = 0;  /* what bucketing strategy we're 
                                          using */
    void *ptr = NULL;
    unsigned int bucketsize = 0;
    struct params params = {0};
    struct chash *hash = NULL;
    /* empty until the first successful "new", so diagnostics issued before 
     * that never read uninitialised memory */
    char name[256] = "";

    if (!parse_params(argc, argv, &params)) {
        fprintf(stderr, "failed to parse params\n");
        return 0;
    }

    while (fgets((char *) buf, 65535, fp)) {
        str_rtrim(buf);
        pos = (char *) str_ltrim(buf);

        if (!str_casecmp(pos, "new")) {

            /* creating a new bucket */
            unsigned int size = (unsigned int) -1;

            if (ptr) {
                chash_delete(hash);
                free(ptr);
                /* forget the old bucket so nothing dangles if the creation
                 * below fails */
                hash = NULL;
                ptr = NULL;
            }

            /* read parameters */
            if ((fscanf(fp, "%255s %u %u", name, &strategy, &size) == 3)
              && (size <= 65535) 
              && (bucketsize = size)
              && (ptr = malloc(size))
              && (hash = chash_ptr_new(1, 2.0, 
                /* some fn pointer casting dodginess */
                (unsigned int (*)(const void *)) str_len, 
                (int (*)(const void *, const void *)) str_cmp))
              && (bucket_new(ptr, bucketsize, strategy))) {
                /* succeeded, do nothing */
                if (params.verbose) {
                    printf("%s: new bucket with size %u strategy %u\n", name, 
                      size, strategy);
                }
            } else {
                fprintf(stderr, "%s: failed to create bucket\n", name);
                return 0;
            }
        } else if (!str_casecmp(pos, "add")) {
            /* adding a term to the bucket */
            void *ret;
            unsigned int veclen,
                         succeed,
                         len;
            int toobig;

            if (!ptr) { return 0; }

            /* read parameters */
            if ((fscanf(fp, "%65535s %u %u", buf, &veclen, &succeed) == 3) 
              && (veclen <= 65535)) {

                len = str_len(buf);
                /* pass when the outcome matches what the script expects */
                if ((((ret = bucket_alloc(ptr, bucketsize, strategy, buf, len, 
                        veclen, &toobig, NULL))
                      && succeed)
                    || (!ret && !succeed))) {
                    /* do nothing */
                    if (params.verbose) {
                        printf("%s: added term '%s'\n", name, buf);
                    }
                } else if (succeed) {
                    fprintf(stderr, "%s: failed to add '%s' to bucket\n", 
                      name, buf);
                    return 0;
                } else {
                    fprintf(stderr, "%s: add '%s' succeeded but shouldn't "
                      "have\n", name, buf);
                    return 0;
                }
            } else {
                fprintf(stderr, "%s: failed to add\n", name);
                return 0;
            }
        } else if (!str_casecmp(pos, "ls")) {
            /* matching stuff in the bucket */
            unsigned int numterms,
                         i,
                         len,
                         veclen,
                         veclen2 = 0,
                         state;
            void *addr;
            struct chash *tmphash;
            const char *term;
            void **tmpptr,
                  *tmp;

            if (!ptr) { return 0; }

            if (!(tmphash = chash_ptr_new(1, 2.0, 
              /* some fn pointer casting dodginess */
              (unsigned int (*)(const void *)) str_len, 
              (int (*)(const void *, const void *)) str_cmp))) {
                fprintf(stderr, "%s: failed to init hashtable\n", name);
                return 0;
            }

            /* first, fill hashtable with all terms from bucket */
            state = 0;
            while ((term 
              = bucket_next_term(ptr, bucketsize, strategy,
                  &state, &len, &addr, &veclen))) {

                if (!((term = str_ndup(term, len)) 
                  && (chash_ptr_ptr_insert(tmphash, term, (void*) term) 
                      == CHASH_OK))) {

                    fprintf(stderr, "%s: failed to init hashtable\n", name);
                    return 0;
                }
            }

            /* now, take terms from file, comparing them with hashtable 
             * entries.  fscanf returns EOF (non-zero) at end of input, so 
             * the conversion count has to be compared explicitly */
            if (fscanf(fp, "%u", &numterms) == 1) {
                for (i = 0; i < numterms; i++) {
                    if (fscanf(fp, "%65535s %u ", buf, &veclen) == 2) {
                        if (params.verbose) {
                            printf("%s: ls checking %s\n", name, buf);
                        }
                        
                        if ((addr = bucket_find(ptr, bucketsize, strategy,
                            buf, str_len(buf), &veclen2, NULL))
                          /* remove it from hashtable */
                          && chash_ptr_ptr_find(tmphash, buf, &tmpptr) 
                            == CHASH_OK
                          && chash_ptr_ptr_remove(tmphash, *tmpptr, &tmp) 
                            == CHASH_OK
                          && (free(tmp), 1)
                          && (veclen <= 65535)
                          && (veclen2 == veclen)
                          && fread(buf, veclen, 1, fp)
                          && ((buf[veclen] = '\0'), 1)
                          && (!params.verbose 
                            || printf("%s: ls check read '%s'\n", name, buf))
                          && !memcmp(buf, addr, veclen)) {
                            /* do nothing */
                        } else {
                            unsigned int j;

                            fprintf(stderr, "%s: ls failed cmp '%s' with '", 
                              name, buf);
                            /* addr is NULL when the term was not found; 
                             * otherwise show exactly the stored vector 
                             * (veclen2 bytes), not the length claimed by 
                             * the script */
                            if (addr) {
                                for (j = 0; j < veclen2; j++) {
                                    putc(((char *) addr)[j], stderr);
                                }
                            }
                            fprintf(stderr, "'\n");
                            return 0;
                        }
                    } else {
                        fprintf(stderr, "%s: ls failed\n", name);
                        return 0;
                    }
                }

                /* anything left over was in the bucket but not in the 
                 * script's listing */
                if (chash_size(tmphash)) {
                    fprintf(stderr, "%s: ls failed\n", name);
                    return 0;
                }
            } else {
                fprintf(stderr, "%s: ls failed\n", name);
                return 0;
            }

            chash_delete(tmphash);

            if (params.verbose) {
                printf("%s: matched all (%u) entries\n", name, numterms);
            }
        } else if (!str_casecmp(pos, "set")) {
            /* setting the vector for a term in the bucket */
            unsigned int veclen,
                        reallen;
            void *addr;

            if (!ptr) { return 0; }

            /* read parameters */
            if ((fscanf(fp, "%65535s %u ", buf, &veclen) == 2) 
              && (veclen <= 65535)) {

                addr = bucket_find(ptr, bucketsize, strategy, buf, 
                  str_len(buf), &reallen, NULL);

                if (addr && (reallen == veclen) 
                  && fread(addr, 1, veclen, fp)) {
                    /* do nothing */
                    if (params.verbose) {
                        unsigned int j;

                        printf("%s: set term '%s' to '", name, buf);
                        for (j = 0; j < reallen; j++) {
                            putc(((char *) addr)[j], stdout);
                        }
                        printf("'\n");
                    }
                } else {
                    fprintf(stderr, "%s: failed to set!\n", name);
                    return 0;
                }
            } else {
                fprintf(stderr, "%s: failed to set\n", name);
                return 0;
            }
        } else if (!str_casecmp(pos, "realloc")) {
            /* reallocating a term in the bucket */
            unsigned int veclen,
                         succeed;
            int toobig;

            if (!ptr) { return 0; }

            /* read parameters.
             * NOTE(review): `succeed` is parsed but never consulted, so a 
             * failing realloc is always reported as an error - confirm 
             * whether it was meant to be checked the way "add" does */
            if ((fscanf(fp, "%65535s %u %u", buf, &veclen, &succeed) == 3) 
              && (veclen <= 65535)) {

                if (!bucket_realloc(ptr, bucketsize, strategy, buf, 
                  str_len(buf), veclen, &toobig)) {
                    fprintf(stderr, "%s: failed to realloc!\n", name);
                    return 0;
                }
            } else {
                fprintf(stderr, "%s: failed to realloc\n", name);
                return 0;
            }

            if (params.verbose) {
                printf("%s: realloc'd term '%s'\n", name, buf);
            }
        } else if (!str_casecmp(pos, "rm")) {
            /* removing something from the bucket */
            unsigned int succeed;

            if (!ptr) { return 0; }

            if (fscanf(fp, "%65535s %u", buf, &succeed) == 2) {
                /* NOTE(review): when succeed is 0 no removal is attempted 
                 * at all (the former `else if (succeed)` branch could never 
                 * run) - confirm whether a failing removal was meant to be 
                 * verified here */
                if (succeed) {

                    if (!(bucket_remove(ptr, bucketsize, strategy, buf, 
                      str_len(buf)))) {
                        fprintf(stderr, "%s: failed to rm '%s'\n", name, 
                          buf);
                        return 0;
                    } else if (params.verbose) {
                        printf("%s: rm term '%s'\n", name, buf);
                    }
                }
            } else {
                fprintf(stderr, "%s: failed to rm\n", name);
                return 0;
            }
        } else if (!str_casecmp(pos, "print")) {
            /* printing out the bucket contents */
            unsigned int state = 0,
                         len,
                         veclen;
            const char *term;
            void *addr;

            if (!ptr) { 
                printf("can't print, no bucket\n");
            } else {
                /* terms and vectors are length-delimited, so print them 
                 * with an explicit precision; the offset is narrowed to 
                 * unsigned int to match its %u conversion */
                while ((term 
                  = bucket_next_term(ptr, bucketsize, strategy, &state, 
                    &len, &addr, &veclen))) {
                    printf("%.*s (%u): '%.*s' (%u) (off %u)\n", 
                      (int) len, term, len, 
                      (int) veclen, (char *) addr, veclen, 
                      (unsigned int) (((char *) addr) - (char *) ptr));
                }

                if (!state) {
                    printf("(empty)\n");
                }

                printf("%u entries, %u data, %u string, %u overhead, %u free\n", 
                  bucket_entries(ptr, bucketsize, strategy), 
                  bucket_utilised(ptr, bucketsize, strategy), 
                  bucket_string(ptr, bucketsize, strategy), 
                  bucket_overhead(ptr, bucketsize, strategy),
                  bucket_unused(ptr, bucketsize, strategy));
            }
        } else if (!str_casecmp(pos, "match")) {
            /* checking that a stored vector starts with the given bytes */
            unsigned int veclen,
                         veclen2 = 0;
            void *addr = NULL;

            if (!ptr) { return 0; }

            if (fscanf(fp, "%65535s %u ", buf, &veclen) == 2) {
                if ((addr = bucket_find(ptr, bucketsize, strategy,
                    buf, str_len(buf), &veclen2, NULL))
                  && (veclen <= 65535)
                  && (veclen2 >= veclen)
                  && (!params.verbose 
                    || printf("%s: match on '%s' ", name, buf))
                  && fread(buf, veclen, 1, fp)
                  && !memcmp(buf, addr, veclen)) {
                    if (params.verbose) {
                        printf("content succeeded\n");
                    }
                } else {
                    /* the stored vector is not NUL-terminated and addr is 
                     * NULL when the term is missing, so bound the output 
                     * explicitly */
                    fprintf(stderr, "%s: match failed (%s vs %.*s)\n", name, 
                      buf, (int) (addr ? veclen2 : 0), 
                      addr ? (const char *) addr : "");
                    return 0;
                }
            } else {
                fprintf(stderr, "%s: match failed\n", name);
                return 0;
            }
        } else if ((*pos != '#') && str_len(pos)) {
            fprintf(stderr, "%s: unknown command '%s'\n", name, pos);
            return 0;
        }
    }

    if (ptr) {
        chash_delete(hash);
        free(ptr);
    }

    return 1;
}
Example #3
0
/*
 * Evaluates `query` against `idx` by decoding the query terms' lists block
 * by block, always taking the block with the largest impact first (the term
 * array is kept as a heap ordered by term_data_cmp).
 *
 * idx          - index; its impact_stats supply the normalisation and
 *                quantisation parameters
 * query        - query to evaluate; query->term is re-sorted in place with
 *                f_t_cmp
 * accumulators - hashtable the decoded blocks are accumulated into
 * acc_limit    - while chash_size(accumulators) is below this, blocks are
 *                decoded with impact_decode_block; afterwards with
 *                impact_decode_block_and
 * alloc        - passed through to search_term_src
 * mem          - memory budget shared out between the term sources
 *
 * Returns SEARCH_OK when the query has no terms or evaluation completes,
 * SEARCH_ENOMEM if the term array cannot be allocated, SEARCH_EINVAL if a
 * source cannot be obtained, accumulator space cannot be reserved or a list
 * ends inconsistently, and otherwise the error returned by a source's
 * readlist.
 *
 * NOTE(review): postings, bytes, postings_read and bytes_read are
 * accumulated but never read within this function.
 */
enum search_ret impact_ord_eval(struct index *idx, struct query *query, 
  struct chash *accumulators, unsigned int acc_limit, struct alloc *alloc, 
  unsigned int mem) {
    double norm_B;
    unsigned int i,
                 terms = 0,
                 blockfine,
                 blocks_read,
                 postings_read = 0,
                 postings = 0,
                 bytes = 0,
                 bytes_read = 0;
    struct term_data *term,
                     *largest;
    struct disksrc *dsrc;

    if (query->terms == 0) {
        /* no terms to process */
        return SEARCH_OK;
    /* allocate space for array */
    } else if (!(term = malloc(sizeof(*term) * query->terms))) {
        return SEARCH_ENOMEM;
    }

    /* sort by selectivity (by inverse t_f) */
    qsort(query->term, query->terms, sizeof(*query->term), f_t_cmp);

    /* NOTE(review): divides by (w_qt_max - w_qt_min); presumably the index
     * guarantees the two differ - confirm */
    norm_B = pow(idx->impact_stats.w_qt_max / idx->impact_stats.w_qt_min,
        idx->impact_stats.w_qt_min 
          / (idx->impact_stats.w_qt_max - idx->impact_stats.w_qt_min));

    /* initialise data for each query term */
    for (i = 0; i < query->terms; i++) {
        unsigned int termfine;
        double w_qt;

        /* initialise src/vec for term */
        term[i].v.pos = term[i].v.end = NULL;
        term[i].src = NULL;

        /* query-term weight from f_qt and f_t, then normalised and
         * quantised with the index-wide statistics */
        w_qt = (1 + log(query->term[i].f_qt)) *
          log(1 + (idx->impact_stats.avg_f_t / query->term[i].f_t));
        w_qt = impact_normalise(w_qt, norm_B, 
            idx->impact_stats.slope, idx->impact_stats.w_qt_max, 
            idx->impact_stats.w_qt_min);
        term[i].w_qt = impact_quantise(w_qt, 
            idx->impact_stats.quant_bits, idx->impact_stats.w_qt_max, 
            idx->impact_stats.w_qt_min);

        /* apply term fine to term impact: the first three terms are not
         * fined, each later one is fined one more than its predecessor */
        termfine = (i < 2) ? 0 : i - 2;
        if (termfine < term[i].w_qt) {
            term[i].w_qt -= termfine;
            /* initialise to highest impact, so we'll select and initialise this
             * term before real processing */
            term[i].impact = INT_MAX;
            terms++;
        } else {
            /* we won't use this term */
            term[i].w_qt = 0;
            term[i].impact = 0;
        }
        term[i].blocksize = 0;

        /* XXX */
        postings += query->term[i].f_t;
        bytes += query->term[i].term.vocab.size;
    }

    /* get sources for each term (do this in a seperate loop so we've already
     * excluded lists that we won't use) */
    /* NOTE(review): this walks the first `terms` array slots, which assumes
     * every excluded term sits after all the used ones - confirm */
    for (i = 0; i < terms; i++) {
        /* even share of what is left, capped at the size of the list */
        unsigned int memsize = mem / (terms - i);

        if (memsize > query->term[i].term.vocab.size) {
            memsize = query->term[i].term.vocab.size;
        }

        if (!(term[i].src 
          = search_term_src(idx, &query->term[i].term, alloc, memsize))) {
            source_delete(term, terms);
            free(term);
            return SEARCH_EINVAL;
        }

        mem -= memsize;
    }

    blockfine = blocks_read = 0;
    heap_heapify(term, terms, sizeof(*term), term_data_cmp);

    /* main loop: take the top entry off the heap, decode its pending block,
     * refill its buffer if it is running low, read the next block header and
     * put it back; stops once the heap is empty or the top impact no longer
     * exceeds blockfine */
    do {
        largest = heap_pop(term, &terms, sizeof(*term), term_data_cmp);

        if (largest && (largest->impact > blockfine)) {
            postings_read += largest->blocksize;
            if (chash_size(accumulators) < acc_limit) {
                /* reserve enough memory for accumulators and decode */
                if (chash_reserve(accumulators, largest->blocksize) 
                  >= largest->blocksize) {
                    impact_decode_block(accumulators, largest, blockfine);
                } else {
                    assert(!CRASH); ERROR("impact_ord_eval()");
                    source_delete(term, terms);
                    free(term);
                    return SEARCH_EINVAL;
                }
            } else {
                impact_decode_block_and(accumulators, largest, blockfine);
            }

            if (VEC_LEN(&largest->v) < 2 * VEC_VBYTE_MAX) {
                /* need to read more data */
                /* (this `bytes` shadows the function-level counter of the
                 * same name) */
                unsigned int bytes;
                enum search_ret sret;

                if ((sret 
                  = largest->src->readlist(largest->src, VEC_LEN(&largest->v), 
                    (void **) &largest->v.pos, &bytes)) == SEARCH_OK) {

                    /* read succeeded */
                    largest->v.end = largest->v.pos + bytes;
                } else if (sret == SEARCH_FINISH) {
                    if (VEC_LEN(&largest->v) || largest->blocksize) {
                        /* didn't finish properly */
                        assert(!CRASH); ERROR("impact_ord_eval()");
                        source_delete(term, terms);
                        free(term);
                        return SEARCH_EINVAL;
                    }
                    /* otherwise it will be finished below */
                } else {
                    assert(!CRASH); ERROR("impact_ord_eval()");
                    source_delete(term, terms);
                    free(term);
                    return sret;
                }
            }

            if (!largest->blocksize) {
                /* need to read the start of the next block */
                unsigned long int tmp_bsize,
                                  tmp_impact;

                if (vec_vbyte_read(&largest->v, &tmp_bsize)
                  && (vec_vbyte_read(&largest->v, &tmp_impact) 
                    /* second read failed, rewind past first vbyte */
                    || ((largest->v.pos -= vec_vbyte_len(tmp_bsize)), 0))) {

                    /* `terms` was passed by address to heap_pop above, so
                     * this presumably compares against the current heap
                     * size rather than the initial term count - confirm */
                    blocks_read++;
                    if (blocks_read > terms) {
                        blockfine++;
                    }

                    largest->blocksize = tmp_bsize;
                    largest->impact = (tmp_impact + 1) * largest->w_qt;
                    largest->docno = -1;
                    heap_insert(term, &terms, sizeof(*term), term_data_cmp, 
                      largest);
                } else if (!VEC_LEN(&largest->v)) {
                    /* finished, don't put back on the heap */
                    dsrc = (void *) largest->src; bytes_read += dsrc->pos;
                    largest->src->delet(largest->src);
                    largest->src = NULL;
                } else if (largest->impact != INT_MAX) {
                    /* ensure that this vector is chosen next, as we need the
                     * next impact score */
                    largest->impact = INT_MAX;
                    assert(largest->blocksize == 0);
                    heap_insert(term, &terms, sizeof(*term), term_data_cmp, 
                      largest);
                } else {
                    /* huh? */
                    assert(!CRASH); ERROR("impact_ord_eval()");
                    source_delete(term, terms);
                    free(term);
                    return SEARCH_EINVAL;
                }
            } else {
                heap_insert(term, &terms, sizeof(*term), term_data_cmp, 
                  largest);
            }
        }
    } while (largest && (largest->impact > blockfine));

    /* NOTE(review): term[i].src is dereferenced without a NULL check; this
     * relies on every entry still counted by `terms` having a live source */
    for (i = 0; i < terms; i++) {
        dsrc = (void *) term[i].src; bytes_read += dsrc->pos;
    }

    /* the entry popped last was not re-inserted (its impact did not exceed
     * blockfine), so its source is released here */
    if (largest) {
        largest->src->delet(largest->src);
        largest->src = NULL;
    }

    /* end of ranking */
    source_delete(term, terms);
    free(term);
    return SEARCH_OK;
}