/** * Main processing routine of Sally. This function processes chunks of * strings. It might be suitable for OpenMP support in a later version. */ static void sally_process() { long read, i, j; int chunk; const char *hash_file; /* Check if a hash file is set */ config_lookup_string(&cfg, "features.hash_file", &hash_file); /* Get chunk size */ config_lookup_int(&cfg, "input.chunk_size", &chunk); /* Allocate space */ fvec_t **fvec = malloc(sizeof(fvec_t *) * chunk); string_t *strs = malloc(sizeof(string_t) * chunk); if (!fvec || !strs) fatal("Could not allocate memory for embedding"); info_msg(1, "Processing %d strings in chunks of %d.", entries, chunk); for (i = 0, read = 0; i < entries; i += read) { read = input_read(strs, chunk); if (read <= 0) fatal("Failed to read strings from input '%s'", input); /* Generic preprocessing of input */ input_preproc(strs, read); #ifdef ENABLE_OPENMP #pragma omp parallel for #endif for (j = 0; j < read; j++) { fvec[j] = fvec_extract(strs[j].str, strs[j].len); fvec_set_label(fvec[j], strs[j].label); fvec_set_source(fvec[j], strs[j].src); } if (!output_write(fvec, read)) fatal("Failed to write vectors to output '%s'", output); /* Free memory */ input_free(strs, read); output_free(fvec, read); /* Reset hash if enabled but no hash file is set */ if (fhash_enabled() && !strlen(hash_file) > 0) fhash_reset(); prog_bar(0, entries, i + read); } free(fvec); free(strs); }
/**
 * Simple linkage clustering algorithm by Murtagh. The algorithm has a
 * worst-case run-time of O(n^3) but usually runs in O(n^2). Note that in
 * the generic case linkage clustering has a worst-case time complexity
 * of O(n^2 log n).
 *
 * The function repeatedly merges the closest pair of remaining points.
 * It rewrites c->cluster, decrements c->num once per merge and
 * overwrites entries of the distance matrix d in place. Nothing is
 * returned; if the working arrays cannot be allocated an error is
 * reported and the clustering is left untouched.
 *
 * @param c Clustering structure (c->len, c->num and c->cluster are used)
 * @param d Distance matrix, addressed through tria_pos() and the D() macro
 * @param dm Distance threshold: merging stops once the smallest
 *           remaining distance exceeds this value
 * @param m Clustering mode: 's' single linkage, 'a' average linkage,
 *          any other value (including 'c') complete linkage
 */
static void cluster_murtagh(cluster_t *c, double *d, double dm, char m)
{
    assert(c && d);
    double dmin, dnew;
    long k, j, i, jj, ii;
    /* Pair merged in the current/previous round: jm is merged into im */
    long jm = 0, im = 0;

    /*
     * Allocate working arrays:
     *   done[i] - set once point i has been merged into another one
     *   nn[i]   - index of the cached nearest neighbor of point i
     *   dnn[i]  - distance to that neighbor
     */
    char *done = calloc(1, sizeof(char) * c->len);
    long *nn = malloc(sizeof(long) * c->len);
    double *dnn = malloc(sizeof(double) * c->len);

    /* Check for memory problems */
    if (!done || !nn || !dnn) {
        error("Could not allocate memory for clustering algorithm.");
        goto err;
    }

    /*
     * Main loop: at most c->len - 1 merges.
     * NOTE(review): if c->len is an unsigned type and zero, c->len - 1
     * would wrap around - confirm callers never pass an empty clustering.
     */
    for (k = 0; k < c->len - 1; k++) {
        /*
         * Update nearest neighbors for each point. After the first round
         * only points whose cached neighbor was part of the last merge
         * (im or jm) are recomputed. The scan only looks at j > i.
         */
#pragma omp parallel for default(shared) private(dmin, jj, j)
        for (i = 0; i < c->len; i++) {
            if (done[i] || (k > 0 && (nn[i] != im && nn[i] != jm)))
                continue;

            dmin = DBL_MAX, jj = 0;
            for (j = i + 1; j < c->len; j++) {
                if (done[j] || D(i, j) >= dmin)
                    continue;
                dmin = D(i, j), jj = j;
            }
            dnn[i] = dmin, nn[i] = jj;
        }

        /* Determine smallest distance among the cached neighbors */
        dmin = DBL_MAX, im = 0;
        for (i = 0; i < c->len; i++) {
            if (done[i] || dnn[i] >= dmin)
                continue;
            dmin = dnn[i], im = i;
        }
        jm = nn[im];

        /* Check for minimum distance: stop once the closest pair is too far apart */
        if (dmin > dm)
            break;

        /* Update: retire jm and account for the merge */
        done[jm] = TRUE;
        c->num--;

        /* Update clusters and distance matrix */
        int cm = c->cluster[jm];
#pragma omp parallel for default(shared) private(dnew)
        for (i = 0; i < c->len; i++) {
            /* Update cluster assignments: relabel members of jm's cluster */
            if (c->cluster[i] == cm)
                c->cluster[i] = c->cluster[im];

            if (done[i] || i == im)
                continue;

            /* New distance between the merged point im and point i */
            switch (m) {
                /* Single linkage */
            case 's':
                dnew = fmin(D(im, i), D(jm, i));
                break;
                /* Average linkage */
            case 'a':
                dnew = (D(im, i) + D(jm, i)) / 2;
                break;
                /* Complete linkage */
            default:
            case 'c':
                dnew = fmax(D(im, i), D(jm, i));
                break;
            }
            d[tria_pos(i, im, c->len)] = dnew;
        }

        /*
         * Update nearest neighbors of the merged point im. Unlike the
         * scan at the top of the loop this one considers every remaining
         * point, not only those with a larger index.
         */
        dmin = DBL_MAX, ii = 0;
        for (i = 0; i < c->len; i++) {
            if (done[i] || i == im || D(im, i) >= dmin)
                continue;
            dmin = D(im, i), ii = i;
        }
        dnn[im] = dmin;
        nn[im] = ii;

        /* NOTE(review): tested as "verbose" here but "verbose > 0" below */
        if (verbose)
            prog_bar(0, c->len - 1, k);
    }

    /* Draw the completed progress bar, also after an early break */
    if (verbose > 0)
        prog_bar(0, 1, 1);

  err:
    /* Free remaining arrays */
    free(done);
    free(nn);
    free(dnn);
}