示例#1
0
/*
 * n-way merge sort to stdout.
 *
 * Opens each of the d->n files named in d->fns, reads fastq entries from
 * them, and prints the entries to stdout in the order produced by
 * repeatedly popping from a binary heap that is ordered through cmp.
 *
 * d:   the set of dump files to merge (d->n file names in d->fns).
 * cmp: comparison callback, forwarded unchanged to heap_push/heap_pop.
 *
 * Prints a message and exits with EXIT_FAILURE if any file cannot be
 * opened for reading.
 */
void merge_sort(const seq_dumps_t* d, int (*cmp)(const void*, const void*))
{
    FILE** files = malloc_or_die(d->n * sizeof(FILE*));
    size_t i;
    for (i = 0; i < d->n; ++i) {
        files[i] = fopen(d->fns[i], "rb");
        if (files[i] == NULL) {
            fprintf(stderr, "Cannot open temporary file %s for reading.\n",
                    d->fns[i]);
            exit(EXIT_FAILURE);
        }
    }

    /* One parser and one "current entry" buffer per input file. */
    fastq_t** fs = malloc_or_die(d->n * sizeof(fastq_t*));
    seq_t** seqs = malloc_or_die(d->n * sizeof(seq_t*));
    for (i = 0; i < d->n; ++i) {
        fs[i] = fastq_create(files[i]);
        seqs[i] = seq_create();
    }

    /* A binary heap of indexes to fs. We use this to repeatedly pop the
     * smallest fastq entry. */
    size_t* heap = malloc_or_die(d->n * sizeof(size_t));

    /* heap size */
    size_t m = 0;

    /* Prime the heap with the first entry of every non-empty file. */
    for (i = 0; i < d->n; ++i) {
        if (fastq_read(fs[i], seqs[i])) {
            heap_push(heap, d->n, &m, seqs, cmp, i);
        }
    }

    /* Emit the popped entry, then refill from the file it came from; a
     * file that is exhausted simply drops out of the heap. */
    while (m > 0) {
        i = heap_pop(heap, &m, seqs, cmp);
        fastq_print(stdout, seqs[i]);
        if (fastq_read(fs[i], seqs[i])) {
            heap_push(heap, d->n, &m, seqs, cmp, i);
        }
    }

    for (i = 0; i < d->n; ++i) {
        seq_free(seqs[i]);
        fastq_free(fs[i]);
        fclose(files[i]);
    }

    /* Release every array allocated above (seqs and heap were previously
     * leaked). */
    free(heap);
    free(seqs);
    free(fs);
    free(files);
}
示例#2
0
/*
 * Filter fastq entries read from fin with the compiled pattern re.
 *
 * The pattern is run against the entry's id (when id_flag is set) or its
 * sequence. Selected entries are either counted (count_flag) or written
 * to fout through fastq_print_maybe_trim(); entries that are not selected
 * are written to mismatch_file when it is non-NULL. With invert_flag set,
 * an entry is selected when the pattern does NOT match.
 *
 * When count_flag is set, the number of selected entries is printed to
 * fout after all input has been consumed.
 */
void fastq_grep(FILE* fin, FILE* fout, FILE* mismatch_file, pcre* re)
{
    int match_vec[3];
    size_t selected_total = 0;

    fastq_t* reader = fastq_create(fin);
    seq_t* entry = seq_create();

    while (fastq_read(reader, entry)) {
        int match_rc = pcre_exec(re,
                                 NULL,   /* no study data */
                                 id_flag ? entry->id1.s : entry->seq.s,
                                 id_flag ? entry->id1.n : entry->seq.n,
                                 0,      /* start at the beginning */
                                 0,      /* default options */
                                 match_vec,
                                 3);     /* number of ints in match_vec */

        /* Inverted mode selects explicit non-matches only; normal mode
         * selects any successful match. */
        int selected;
        if (invert_flag) {
            selected = (match_rc == PCRE_ERROR_NOMATCH);
        }
        else {
            selected = (match_rc >= 0);
        }

        if (!selected) {
            if (mismatch_file) {
                fastq_print(mismatch_file, entry);
            }
            continue;
        }

        if (count_flag) {
            selected_total++;
        }
        else {
            fastq_print_maybe_trim(fout, entry, match_vec);
        }
    }

    seq_free(entry);
    fastq_free(reader);

    if (count_flag) {
        fprintf(fout, "%zu\n", selected_total);
    }
}
	// Builds a single fastq_read from a read pair: the name comes from
	// MergeNames(index, left name, right name) and the remaining two
	// fields from the pair returned by MergeQualifiedSeq (presumably
	// sequence first, quality second -- confirm against that helper).
	fastq_read Merge(size_t index, paired_fastq_read paired_read) {
		string name = MergeNames(
				index,
				paired_read.left_read.name,
				paired_read.right_read.name);

		pair<string, string> seq_and_qual = MergeQualifiedSeq(paired_read);

		return fastq_read(name, seq_and_qual.first, seq_and_qual.second);
	}
示例#4
0
文件: pique.c 项目: dcjones/pique
/*
 * Worker thread entry point.
 *
 * arg is a pique_ctx_t*. The thread repeatedly reads one sequence from
 * ctx->f while holding ctx->f_mutex, converts it with
 * twobit_copy_str_n(), and adds it to ctx->G with dbg_add_twobit_seq().
 * It stops when the reader reports no more input, or when ctx->fmt is
 * not one of the recognised input formats.
 *
 * Always returns NULL.
 */
void* pique_thread(void* arg)
{
    pique_ctx_t* ctx = arg;
    seq_t* seq = seq_create();
    twobit_t* tb = twobit_alloc();
    rng_t* rng = rng_alloc(1234);
    bool r;

    while (true) {
        pthread_mutex_lock(ctx->f_mutex);
        /* Reset on every pass: with an unrecognised format neither branch
         * assigns r, which previously left it uninitialised (or stale). */
        r = false;
        if (ctx->fmt == INPUT_FMT_FASTA)      r = fasta_read(ctx->f, seq);
        else if (ctx->fmt == INPUT_FMT_FASTQ) r = fastq_read(ctx->f, seq);
        pthread_mutex_unlock(ctx->f_mutex);
        if (!r) break;

        /* TODO: remove sequences with Ns? */

        twobit_copy_str_n(tb, seq->seq.s, seq->seq.n);
        dbg_add_twobit_seq(ctx->G, rng, tb);
    }

    /* NOTE(review): tb from twobit_alloc() is never released in this
     * function; the matching deallocator is not visible here -- confirm
     * and add it. */
    rng_free(rng);
    seq_free(seq);
    return NULL;
}
示例#5
0
/*
 * Count the fastq entries that can still be read from fqf.
 *
 * Reads until fastq_read() reports no further entry and returns how many
 * reads succeeded. The stream is not rewound here.
 */
unsigned long count_entries(fastq_t* fqf)
{
    unsigned long total = 0;
    seq_t* scratch = seq_create();

    for (;;) {
        if (!fastq_read(fqf, scratch)) {
            break;
        }
        ++total;
    }

    seq_free(scratch);
    return total;
}
示例#6
0
/*
 * Tally k-mers over every fastq entry read from fin.
 *
 * For each entry, every window of length k in the sequence is handed to
 * packkmer(); when packkmer() returns nonzero, the counter cs[kmer] for
 * the packed value is incremented. k is taken from file scope.
 *
 * cs must be large enough to be indexed by any value packkmer() produces.
 */
void count_fastq_kmers(FILE* fin, uint32_t* cs)
{
    seq_t* entry = seq_create();
    fastq_t* reader = fastq_create(fin);
    uint32_t packed;

    while (fastq_read(reader, entry)) {
        /* Number of length-k windows; not positive for short sequences,
         * in which case the inner loop does not run. */
        int windows = (int)entry->seq.n - k + 1;
        int pos = 0;

        while (pos < windows) {
            if (packkmer(entry->seq.s + pos, &packed, k)) {
                cs[packed]++;
            }
            pos++;
        }
    }

    seq_free(entry);
    fastq_free(reader);
}
示例#7
0
/*
 * Open the file named "<prefix><suffix>" for writing and return it.
 *
 * With clobber nonzero the file is opened with fopen(name, "wb");
 * otherwise it is opened through open_without_clobber(). Prints an error
 * and exits with status 1 if the file cannot be opened.
 */
static FILE* open_sample_output(const char* prefix, const char* suffix,
                                int clobber)
{
    /* +1 for the terminating NUL. */
    size_t name_size = strlen(prefix) + strlen(suffix) + 1;
    char* name = malloc_or_die(name_size * sizeof(char));
    FILE* out;

    snprintf(name, name_size, "%s%s", prefix, suffix);

    if (clobber) out = fopen(name, "wb");
    else         out = open_without_clobber(name);

    if (out == NULL) {
        fprintf(stderr, "Cannot open file %s for writing.\n", name);
        exit(1);
    }

    free(name);
    return out;
}


/*
 * Randomly sample entries from one fastq file, or from a pair of files.
 *
 * rng_seed: seed passed to fastq_rng_seed().
 * prefix:   output name prefix; writes "<prefix>.fastq" for single input
 *           or "<prefix>.1.fastq" / "<prefix>.2.fastq" for paired input.
 * cprefix:  if non-NULL, entries that were NOT sampled are written to
 *           files named the same way from this prefix.
 * file1:    first input.
 * file2:    second (mate) input, or NULL for single-file sampling.
 * k:        number of entries to sample (ignored when p > 0).
 * p:        if > 0, sample round(p * n) entries, n being the entry count.
 *
 * Exits with status 1 if the two inputs hold different numbers of
 * entries or an output file cannot be opened.
 */
void fastq_sample(unsigned long rng_seed,
                  const char* prefix, const char* cprefix,
                  FILE* file1, FILE* file2, unsigned long k, double p)
{
    /*
     * The basic idea is this:
     *
     * 1. Count the number of entries in the file, n.
     *
     * 2a. If sampling with replacement, generate k random integers in [0, n-1].
     *
     * 2b. If sampling without replacement, generate a list of integers 0..(n-1),
     *     shuffle with fisher-yates, then consider the first k.
     *
     * 3. Sort the integer list.
     *
     * 4. Read through the file again, when the number at the front of the integer
     *    list matches the index of the fastq entry, print the entry, and pop the
     *    number.
     */

    unsigned long n;

    fastq_t* f1 = fastq_create(file1);
    fastq_t* f2 = file2 == NULL ? NULL : fastq_create(file2);

    n = count_entries(f1);
    if (f2 != NULL) {
        unsigned long n2 = count_entries(f2);
        if (n != n2) {
            fprintf(stderr, "Input files have differing numbers of entries (%lu != %lu).\n", n, n2);
            exit(1);
        }
    }

    fastq_rewind(f1);
    if (f2 != NULL) fastq_rewind(f2);

    if (p > 0.0) {
        k = (unsigned long) round(p * (double) n);
    }

    /* Without replacement only n distinct indexes exist, so a larger k
     * (from p or from the caller) must be clamped before xs is sorted and
     * scanned. */
    if (!replacement_flag && k > n) k = n;

    rng_t* rng = fastq_rng_alloc();
    fastq_rng_seed(rng, rng_seed);

    unsigned long* xs;
    if (replacement_flag) xs = index_with_replacement(rng, n, k);
    else                  xs = index_without_replacement(rng, n);

    qsort(xs, k, sizeof(unsigned long), cmpul);

    /* open output (refuses to overwrite existing files) */
    FILE* fout1;
    FILE* fout2 = NULL;

    if (file2 == NULL) {
        fout1 = open_sample_output(prefix, ".fastq", 0);
    }
    else {
        fout1 = open_sample_output(prefix, ".1.fastq", 0);
        fout2 = open_sample_output(prefix, ".2.fastq", 0);
    }

    /* open complement output (overwrites existing files) */
    FILE* cfout1 = NULL;
    FILE* cfout2 = NULL;

    if (cprefix != NULL) {
        if (file2 == NULL) {
            cfout1 = open_sample_output(cprefix, ".fastq", 1);
        }
        else {
            cfout1 = open_sample_output(cprefix, ".1.fastq", 1);
            cfout2 = open_sample_output(cprefix, ".2.fastq", 1);
        }
    }

    unsigned long i = 0; // read number
    unsigned long j = 0; // index into xs

    seq_t* seq1 = seq_create();
    seq_t* seq2 = seq_create();

    /* Keep reading after the last sampled index when a complement file is
     * requested, so the trailing unsampled entries are not dropped. */
    while ((j < k || cfout1 != NULL) && fastq_read(f1, seq1)) {
        if (f2 != NULL && !fastq_read(f2, seq2)) {
            fputs("Input files have differing numbers of entries.\n", stderr);
            exit(1);
        }

        if (j < k && xs[j] == i) {
            /* With replacement the same index may occur several times;
             * print the entry once per occurrence. */
            while (j < k && xs[j] == i) {
                fastq_print(fout1, seq1);
                if (f2 != NULL) fastq_print(fout2, seq2);
                ++j;
            }
        }
        else if (cfout1 != NULL) {
            fastq_print(cfout1, seq1);
            if (f2 != NULL) fastq_print(cfout2, seq2);
        }

        ++i;
    }

    seq_free(seq1);
    seq_free(seq2);
    fastq_free(f1);
    if (f2 != NULL) fastq_free(f2);

    fclose(fout1);
    if (fout2 != NULL) fclose(fout2);

    if (cfout1 != NULL) fclose(cfout1);
    if (cfout2 != NULL) fclose(cfout2);

    fastq_rng_free(rng);
    free(xs);
}