Esempio n. 1
0
File: sally.c Progetto: yangke/sally
/**
 * Main processing routine of Sally. The input is consumed in chunks of
 * "input.chunk_size" strings: each chunk is read, preprocessed, turned
 * into one feature vector per string, written to the output and freed
 * again, until `entries` strings have been handled. Every failure is
 * reported through fatal().
 *
 * The extraction loop is parallelised when ENABLE_OPENMP is defined.
 */
static void sally_process()
{
    long read, i, j;
    /*
     * Both values are out-parameters of libconfig lookups. They are
     * initialised here so that a failed lookup can never leave an
     * indeterminate pointer or size behind.
     */
    int chunk = 0;
    const char *hash_file = "";

    /* Check if a hash file is set; a missing key counts as "not set" */
    config_lookup_string(&cfg, "features.hash_file", &hash_file);

    /* Get chunk size and refuse values that cannot size the buffers */
    if (!config_lookup_int(&cfg, "input.chunk_size", &chunk) || chunk <= 0)
        fatal("Invalid chunk size '%d' in configuration", chunk);

    /* Allocate space for one chunk of feature vectors and strings */
    fvec_t **fvec = malloc(sizeof *fvec * (size_t) chunk);
    string_t *strs = malloc(sizeof *strs * (size_t) chunk);

    if (!fvec || !strs)
        fatal("Could not allocate memory for embedding");

    /* NOTE(review): "%d" assumes `entries` is an int -- confirm its type */
    info_msg(1, "Processing %d strings in chunks of %d.", entries, chunk);

    for (i = 0, read = 0; i < entries; i += read) {
        /* A non-positive count would stall the loop, so treat it as fatal */
        read = input_read(strs, chunk);
        if (read <= 0)
            fatal("Failed to read strings from input '%s'", input);

        /* Generic preprocessing of input */
        input_preproc(strs, read);

        /* Each iteration only touches its own slot j of fvec and strs */
#ifdef ENABLE_OPENMP
#pragma omp parallel for
#endif
        for (j = 0; j < read; j++) {
            fvec[j] = fvec_extract(strs[j].str, strs[j].len);
            fvec_set_label(fvec[j], strs[j].label);
            fvec_set_source(fvec[j], strs[j].src);
        }

        if (!output_write(fvec, read))
            fatal("Failed to write vectors to output '%s'", output);

        /* Free memory held by this chunk; the arrays themselves are reused */
        input_free(strs, read);
        output_free(fvec, read);

        /* Reset hash if enabled but no hash file is set (empty string) */
        if (fhash_enabled() && hash_file[0] == '\0')
            fhash_reset();

        prog_bar(0, entries, i + read);
    }

    free(fvec);
    free(strs);
}
Esempio n. 2
0
/**
 * Simple linkage clustering algorithm by Murtagh. The algorithm has a
 * worst-case run-time of O(n^3) but usually runs in O(n^2). Note that in
 * the generic case linkage clustering has a worst-case time complexity
 * of O(n^2 log n).
 *
 * In every round the closest pair of active points (im, jm) is merged:
 * jm is retired, all members of its cluster are relabelled to the cluster
 * of im, and the distances from im to the remaining points are rewritten
 * according to the linkage mode. The function stops after c->len - 1
 * rounds or as soon as the closest pair is farther apart than dm.
 *
 * @param c Clustering structure; c->cluster and c->num are updated
 * @param d Distance values, updated in place at tria_pos(i, im, c->len).
 *          NOTE(review): presumably a triangular matrix read through the
 *          D() macro defined outside this function -- confirm.
 * @param dm Minimum distance; merging stops once the smallest remaining
 *           distance exceeds this value
 * @param m Clustering mode: 's' single, 'a' average, anything else
 *          (including 'c') complete linkage
 */
static void cluster_murtagh(cluster_t *c, double *d, double dm, char m)
{
    assert(c && d);
    double dmin, dnew;
    long k, j, i, jj, ii;
    /* Indices of the pair merged in the previous round */
    long jm = 0, im = 0;

    /*
     * Allocate stuff:
     *   done[i] - non-zero once point i has been merged away (zeroed here)
     *   nn[i]   - index of the current nearest neighbor of point i
     *   dnn[i]  - distance from point i to nn[i]
     */
    char *done = calloc(1, sizeof(char) * c->len);
    long *nn = malloc(sizeof(long) * c->len);
    double *dnn = malloc(sizeof(double) * c->len);

    /* Check for memory problems */
    if (!done || !nn || !dnn) {
        error("Could not allocate memory for clustering algorithm.");
        goto err;
    }

    /* Main loop: at most one merge per iteration */
    for (k = 0; k < c->len - 1; k++) {
        /*
         * Update nearest neighbors for each point. In the first round
         * every point is processed (nn is still unset, hence the k > 0
         * guard); afterwards only points whose nearest neighbor was part
         * of the previous merge need a fresh search.
         */
#pragma omp parallel for default(shared) private(dmin, jj, j)
        for (i = 0; i < c->len; i++) {
            if (done[i] || (k > 0 && (nn[i] != im && nn[i] != jm)))
                continue;
            /* Search only among points with a larger index than i */
            dmin = DBL_MAX, jj = 0;
            for (j = i + 1; j < c->len; j++) {
                if (done[j] || D(i, j) >= dmin)
                    continue;
                dmin = D(i, j), jj = j;
            }
            dnn[i] = dmin, nn[i] = jj;
        }

        /* Determine smallest distance among all active points */
        dmin = DBL_MAX, im = 0;
        for (i = 0; i < c->len; i++) {
            if (done[i] || dnn[i] >= dmin)
                continue;
            dmin = dnn[i], im = i;
        }
        jm = nn[im];

        /* Check for minimum distance: stop if the closest pair is too far */
        if (dmin > dm)
            break;

        /* Update: retire jm and count one cluster less */
        done[jm] = TRUE;
        c->num--;

        /* Update clusters and distance matrix */
        /* Cluster label of jm before relabelling */
        int cm = c->cluster[jm];

#pragma omp parallel for default(shared) private(dnew)
        for (i = 0; i < c->len; i++) {
            /* Update cluster assignments: move members of cm to im's cluster */
            if (c->cluster[i] == cm)
                c->cluster[i] = c->cluster[im];
            /* Retired points and im itself need no new distance */
            if (done[i] || i == im)
                continue;

            switch (m) {
                /* Single linkage */
            case 's':
                dnew = fmin(D(im, i), D(jm, i));
                break;
                /* Average linkage */
            case 'a':
                dnew = (D(im, i) + D(jm, i)) / 2;
                break;
                /* Complete linkage (also used for unknown modes) */
            default:
            case 'c':
                dnew = fmax(D(im, i), D(jm, i));
                break;
            }
            d[tria_pos(i, im, c->len)] = dnew;
        }

        /*
         * Update nearest neighbors: recompute the nearest neighbor of the
         * merged point im over all remaining points, using the distances
         * that were just rewritten.
         */
        dmin = DBL_MAX, ii = 0;
        for (i = 0; i < c->len; i++) {
            if (done[i] || i == im || D(im, i) >= dmin)
                continue;
            dmin = D(im, i), ii = i;
        }
        dnn[im] = dmin;
        nn[im] = ii;

        /* Report progress per round when verbose is set */
        if (verbose)
            prog_bar(0, c->len - 1, k);
    }
    /* Final progress bar update */
    if (verbose > 0)
        prog_bar(0, 1, 1);
  err:
    /* Free remaining arrays; also reached after a failed allocation */
    free(done);
    free(nn);
    free(dnn);
}