Beispiel #1
0
/*
 * Filter the FASTQ records read from `fin` against the compiled pattern `re`.
 *
 * The pattern is applied to the record's id1 field when the file-level
 * id_flag is set, and to the sequence otherwise. A record is selected when
 * pcre_exec reports a match (rc >= 0) or, with invert_flag set, when it
 * reports PCRE_ERROR_NOMATCH.
 *
 * Selected records are counted when count_flag is set, otherwise they are
 * written to `fout` through fastq_print_maybe_trim. Records that are not
 * selected are written to `mismatch_file` when it is non-NULL. With
 * count_flag set, the total is printed to `fout` after all input is read.
 */
void fastq_grep(FILE* fin, FILE* fout, FILE* mismatch_file, pcre* re)
{
    /* Room for a single (start, end) pair plus pcre's scratch slot. */
    int ovector[3];
    size_t nselected = 0;

    fastq_t* parser = fastq_create(fin);
    seq_t* rec = seq_create();

    while (fastq_read(parser, rec)) {
        const char* subject = id_flag ? rec->id1.s : rec->seq.s;
        int subject_len = id_flag ? rec->id1.n : rec->seq.n;

        int rc = pcre_exec(re, NULL, subject, subject_len,
                           0 /* subject offset */, 0 /* options */,
                           ovector, 3 /* output vector length */);

        int selected = invert_flag ? (rc == PCRE_ERROR_NOMATCH) : (rc >= 0);

        if (!selected) {
            if (mismatch_file) {
                fastq_print(mismatch_file, rec);
            }
            continue;
        }

        if (count_flag) {
            nselected++;
        }
        else {
            /* NOTE(review): on an inverted selection ovector does not hold a
             * successful match; confirm fastq_print_maybe_trim tolerates
             * that. */
            fastq_print_maybe_trim(fout, rec, ovector);
        }
    }

    seq_free(rec);
    fastq_free(parser);

    if (count_flag) {
        fprintf(fout, "%zu\n", nselected);
    }
}
Beispiel #2
0
/*
 * n-way merge sort to stdout.
 *
 * Opens each of the d->n temporary files named in d->fns, reads one record
 * from each, and then repeatedly prints the smallest pending record
 * (according to `cmp`) and refills from the file it came from until every
 * file is exhausted.
 *
 * Exits with EXIT_FAILURE if any temporary file cannot be opened.
 */
void merge_sort(const seq_dumps_t* d, int (*cmp)(const void*, const void*))
{
    FILE** files = malloc_or_die(d->n * sizeof(FILE*));
    size_t i;
    for (i = 0; i < d->n; ++i) {
        files[i] = fopen(d->fns[i], "rb");
        if (files[i] == NULL) {
            fprintf(stderr, "Cannot open temporary file %s for reading.\n",
                    d->fns[i]);
            exit(EXIT_FAILURE);
        }
    }

    /* One parser and one record buffer per input file. */
    fastq_t** fs = malloc_or_die(d->n * sizeof(fastq_t*));
    seq_t** seqs = malloc_or_die(d->n * sizeof(seq_t*));
    for (i = 0; i < d->n; ++i) {
        fs[i] = fastq_create(files[i]);
        seqs[i] = seq_create();
    }

    /* A binary heap of indexes to fs. We use this to repeatedly pop the
     * smallest fastq entry. */
    size_t* heap = malloc_or_die(d->n * sizeof(size_t));

    /* heap size */
    size_t m = 0;

    /* Prime the heap with the first record of every non-empty file. */
    for (i = 0; i < d->n; ++i) {
        if (fastq_read(fs[i], seqs[i])) {
            heap_push(heap, d->n, &m, seqs, cmp, i);
        }
    }

    /* Emit the smallest record, then refill from the same source; a source
     * drops out of the heap once it has no more records. */
    while (m > 0) {
        i = heap_pop(heap, &m, seqs, cmp);
        fastq_print(stdout, seqs[i]);
        if (fastq_read(fs[i], seqs[i])) {
            heap_push(heap, d->n, &m, seqs, cmp, i);
        }
    }

    for (i = 0; i < d->n; ++i) {
        seq_free(seqs[i]);
        fastq_free(fs[i]);
        fclose(files[i]);
    }

    /* Release every array allocated above, including the record-pointer
     * array and the heap. */
    free(heap);
    free(seqs);
    free(fs);
    free(files);
}
Beispiel #3
0
/*
 * Tally k-mer occurrences over every record read from `fin`.
 *
 * For each start position of a length-k window in a record's sequence, the
 * window is handed to packkmer; when packkmer reports success, the counter
 * in `cs` indexed by the packed value is incremented. `k` is the file-level
 * k-mer length.
 */
void count_fastq_kmers(FILE* fin, uint32_t* cs)
{
    seq_t* read = seq_create();
    fastq_t* parser = fastq_create(fin);
    uint32_t packed;

    while (fastq_read(parser, read)) {
        /* Number of length-k windows; not positive for reads shorter than k. */
        int nwindows = (int)read->seq.n - k + 1;
        int pos = 0;

        while (pos < nwindows) {
            if (packkmer(read->seq.s + pos, &packed, k)) {
                cs[packed]++;
            }
            pos++;
        }
    }

    seq_free(read);
    fastq_free(parser);
}
Beispiel #4
0
/*
 * Open the file named "<prefix><suffix>" for writing and return its stream.
 *
 * When `no_clobber` is non-zero the file is opened through
 * open_without_clobber, otherwise with fopen(..., "wb"). Prints a message
 * and exits with status 1 if the file cannot be opened.
 */
static FILE* open_sample_output(const char* prefix, const char* suffix,
                                int no_clobber)
{
    size_t name_size = strlen(prefix) + strlen(suffix) + 1;
    char* name = malloc_or_die(name_size * sizeof(char));
    snprintf(name, name_size, "%s%s", prefix, suffix);

    FILE* f;
    if (no_clobber) f = open_without_clobber(name);
    else            f = fopen(name, "wb");

    if (f == NULL) {
        fprintf(stderr, "Cannot open file %s for writing.\n", name);
        exit(1);
    }

    free(name);
    return f;
}


/*
 * Randomly sample entries from one FASTQ file, or from a pair of files in
 * lock step.
 *
 * rng_seed  seed passed to the random number generator
 * prefix    output prefix: "<prefix>.fastq" for a single input, or
 *           "<prefix>.1.fastq" / "<prefix>.2.fastq" for paired input
 * cprefix   if non-NULL, prefix for the complement output (entries that were
 *           not sampled), named the same way
 * file1     first input
 * file2     second input of a pair, or NULL
 * k         number of entries to sample (replaced when p > 0)
 * p         if > 0, the sampled fraction; k becomes round(p * n)
 *
 * Exits with status 1 when the paired inputs differ in length or an output
 * file cannot be opened.
 */
void fastq_sample(unsigned long rng_seed,
                  const char* prefix, const char* cprefix,
                  FILE* file1, FILE* file2, unsigned long k, double p)
{
    /*
     * The basic idea is this:
     *
     * 1. Count the number of entries in the file, n.
     *
     * 2a. If sampling with replacement, generate k random integers in [0, n-1].
     *
     * 2b. If sampling without replacement, generate a list of integers 0..(n-1),
     *     shuffle with fisher-yates, then consider the first k.
     *
     * 3. Sort the integer list.
     *
     * 4. Read through the file again, when the number at the front of the integer
     *    list matches the index of the fastq entry, print the entry, and pop the
     *    number.
     */

    fastq_t* f1 = fastq_create(file1);
    fastq_t* f2 = file2 == NULL ? NULL : fastq_create(file2);

    unsigned long n = count_entries(f1);
    if (f2 != NULL) {
        unsigned long n2 = count_entries(f2);
        if (n != n2) {
            fprintf(stderr, "Input files have differing numbers of entries (%lu != %lu).\n", n, n2);
            exit(1);
        }
    }

    fastq_rewind(f1);
    if (f2 != NULL) fastq_rewind(f2);

    if (p > 0.0) {
        k = (unsigned long) round(p * (double) n);
    }

    /* Without replacement the index list holds only n entries, so k must
     * never exceed n, whether it came from p or was given directly. */
    if (!replacement_flag && k > n) k = n;

    rng_t* rng = fastq_rng_alloc();
    fastq_rng_seed(rng, rng_seed);

    unsigned long* xs;
    if (replacement_flag) xs = index_with_replacement(rng, n, k);
    else                  xs = index_without_replacement(rng, n);

    if (k > 0) qsort(xs, k, sizeof(unsigned long), cmpul);

    /* open output */
    FILE* fout1;
    FILE* fout2 = NULL;

    if (file2 == NULL) {
        fout1 = open_sample_output(prefix, ".fastq", 1);
    }
    else {
        fout1 = open_sample_output(prefix, ".1.fastq", 1);
        fout2 = open_sample_output(prefix, ".2.fastq", 1);
    }

    /* open complement output */
    FILE* cfout1 = NULL;
    FILE* cfout2 = NULL;

    if (cprefix != NULL) {
        if (file2 == NULL) {
            cfout1 = open_sample_output(cprefix, ".fastq", 0);
        }
        else {
            cfout1 = open_sample_output(cprefix, ".1.fastq", 0);
            cfout2 = open_sample_output(cprefix, ".2.fastq", 0);
        }
    }

    unsigned long i = 0; // read number
    unsigned long j = 0; // index into xs

    seq_t* seq1 = seq_create();
    seq_t* seq2 = seq_create();

    /* Keep reading past the last sampled entry when a complement file is
     * open, so that the complement receives every unsampled entry. */
    while ((j < k || cfout1 != NULL) && fastq_read(f1, seq1)) {
        if (f2 != NULL && !fastq_read(f2, seq2)) {
            fputs("Input files have differing numbers of entries.\n", stderr);
            exit(1);
        }

        if (j < k && xs[j] == i) {
            /* With replacement the same index may occur several times;
             * print the entry once per occurrence. */
            while (j < k && xs[j] == i) {
                fastq_print(fout1, seq1);
                if (f2 != NULL) fastq_print(fout2, seq2);
                ++j;
            }
        }
        else if (cfout1 != NULL) {
            fastq_print(cfout1, seq1);
            if (f2 != NULL) fastq_print(cfout2, seq2);
        }

        ++i;
    }

    seq_free(seq1);
    seq_free(seq2);
    fastq_free(f1);
    if (f2 != NULL) fastq_free(f2);

    fclose(fout1);
    if (fout2 != NULL) fclose(fout2);

    if (cfout1 != NULL) fclose(cfout1);
    if (cfout2 != NULL) fclose(cfout2);

    fastq_rng_free(rng);
    free(xs);
}
Beispiel #5
0
/*
 * Start `num_threads` pique_thread workers sharing `ctx` and wait for all of
 * them to finish. Exits with EXIT_FAILURE if a worker cannot be created.
 */
static void run_pique_threads(pthread_t* threads, size_t num_threads,
                              pique_ctx_t* ctx)
{
    size_t i;

    for (i = 0; i < num_threads; ++i) {
        if (pthread_create(&threads[i], NULL, pique_thread, ctx) != 0) {
            fprintf(stderr, "Cannot create worker thread.\n");
            exit(EXIT_FAILURE);
        }
    }

    for (i = 0; i < num_threads; ++i) {
        pthread_join(threads[i], NULL);
    }
}


/*
 * Entry point: parse options, feed every input (the files named on the
 * command line, or stdin when none are given) through the worker threads to
 * build the graph, then dump the graph to stdout.
 *
 * Options:
 *   -n N            size of the graph structure (default 100000000)
 *   -k K            k-mer size (default 25)
 *   -t, --threads   number of worker threads (default 1, must be >= 1)
 *   --fasta/--fastq input format
 *   --mm/--hb       output format
 *   -v, --verbose   set pique_verbose
 *   -h, --help      print help and exit
 */
int main(int argc, char* argv[])
{
    int opt, opt_idx;

    int in_fmt = INPUT_FMT_FASTA;
    int out_fmt = ADJ_GRAPH_FMT_MM;

    /* Size of the graph structure. */
    size_t n = 100000000;

    /* K-mer size. */
    size_t k = 25;

    /* Number of threads. */
    size_t num_threads = 1;

    struct option long_options[] =
    {
        {"fasta",   no_argument,       &in_fmt, INPUT_FMT_FASTA},
        {"fastq",   no_argument,       &in_fmt, INPUT_FMT_FASTQ},
        {"mm",      no_argument,       &out_fmt, ADJ_GRAPH_FMT_MM},
        {"hb",      no_argument,       &out_fmt, ADJ_GRAPH_FMT_HB},
        {"threads", required_argument, NULL, 't'},
        {"verbose", no_argument,       NULL, 'v'},
        {"help",    no_argument,       NULL, 'h'},
        {0, 0, 0, 0}
    };

    while (true) {
        opt = getopt_long(argc, argv, "n:k:t:vh", long_options, &opt_idx);
        if (opt == -1) break;

        switch (opt) {
            case 'n':
                n = strtoul(optarg, NULL, 10);
                break;

            case 'k':
                k = strtoul(optarg, NULL, 10);
                break;

            case 't':
                num_threads = strtoul(optarg, NULL, 10);
                break;

            case 'v':
                pique_verbose = true;
                break;

            case 'h':
                print_help(stdout);
                return EXIT_SUCCESS;

            case '?':
                return 1;

            /* Long option that only stored a value through its flag pointer. */
            case 0:
                break;

            default:
                abort();
        }
    }

    /* With zero workers no input would ever be consumed. */
    if (num_threads == 0) {
        fprintf(stderr, "Number of threads must be at least 1.\n");
        return EXIT_FAILURE;
    }

    kmer_init();
    dbg_t* G = dbg_alloc(n, k);

    pthread_mutex_t f_mutex;
    pthread_mutex_init_or_die(&f_mutex, NULL);

    pthread_t* threads = malloc_or_die(num_threads * sizeof(pthread_t));

    /* Context shared by all workers; only ctx.f changes between inputs. */
    pique_ctx_t ctx;
    ctx.fmt = in_fmt;
    ctx.G = G;
    ctx.f_mutex = &f_mutex;

    if (optind >= argc) {
        ctx.f = fastq_create(stdin);
        run_pique_threads(threads, num_threads, &ctx);
        fastq_free(ctx.f);
    } else {
        FILE* file;
        for (; optind < argc; ++optind) {
            file = fopen(argv[optind], "r");
            if (file == NULL) {
                fprintf(stderr, "Cannot open %s for reading.\n", argv[optind]);
                return EXIT_FAILURE;
            }
            ctx.f = fastq_create(file);

            run_pique_threads(threads, num_threads, &ctx);

            fastq_free(ctx.f);
            /* NOTE(review): assumes fastq_free does not close the underlying
             * stream, so it is closed here; confirm against the parser. */
            fclose(file);
        }
    }

    dbg_dump(G, stdout, num_threads, out_fmt);

    free(threads);
    pthread_mutex_destroy(&f_mutex);
    dbg_free(G);
    kmer_free();

    return EXIT_SUCCESS;
}