/*
 * Filter fastq entries read from `fin` with the compiled regular expression
 * `re`.
 *
 * The match subject is the first id line when the global `id_flag` is set and
 * the sequence otherwise. An entry is selected when the pattern matches, or,
 * when the global `invert_flag` is set, when it does not match.
 *
 * Selected entries are written to `fout` (or only counted when the global
 * `count_flag` is set, in which case the total is printed to `fout` at the
 * end). Entries that are not selected are written to `mismatch_file` when it
 * is non-NULL.
 */
void fastq_grep(FILE* fin, FILE* fout, FILE* mismatch_file, pcre* re)
{
    int rc;
    /* Zero-initialised so the offsets are never indeterminate. */
    int ovector[3] = {0, 0, 0};
    size_t count = 0;

    fastq_t* fqf = fastq_create(fin);
    seq_t* seq = seq_create();

    while (fastq_read(fqf, seq)) {
        rc = pcre_exec(re,          /* pattern */
                       NULL,        /* extra data */
                       id_flag ? seq->id1.s : seq->seq.s,
                       id_flag ? seq->id1.n : seq->seq.n,
                       0,           /* subject offset */
                       0,           /* options */
                       ovector,     /* output vector */
                       3);          /* output vector length */

        if (!invert_flag && rc >= 0) {
            /* Real match: ovector[0..1] hold the match offsets. */
            if (count_flag) count++;
            else            fastq_print_maybe_trim(fout, seq, ovector);
        }
        else if (invert_flag && rc == PCRE_ERROR_NOMATCH) {
            /* Selected because the pattern did NOT match: there are no match
             * offsets to trim around, so the entry is printed unmodified
             * rather than trimmed with meaningless offsets. */
            if (count_flag) count++;
            else            fastq_print(fout, seq);
        }
        else if (mismatch_file) {
            fastq_print(mismatch_file, seq);
        }
    }

    seq_free(seq);
    fastq_free(fqf);

    if (count_flag) fprintf(fout, "%zu\n", count);
}
/* n-way merge sort to stdout */ void merge_sort(const seq_dumps_t* d, int (*cmp)(const void*, const void*)) { FILE** files = malloc_or_die(d->n * sizeof(FILE*)); size_t i; for (i = 0; i < d->n; ++i) { files[i] = fopen(d->fns[i], "rb"); if (files[i] == NULL) { fprintf(stderr, "Cannot open temporary file %s for reading.\n", d->fns[i]); exit(EXIT_FAILURE); } } fastq_t** fs = malloc_or_die(d->n * sizeof(fastq_t*)); seq_t** seqs = malloc_or_die(d->n * sizeof(seq_t*)); for (i = 0; i < d->n; ++i) { fs[i] = fastq_create(files[i]); seqs[i] = seq_create(); } /* A binary heap of indexes to fs. We use this to repeatedly pop the * smallest fastq entry. */ size_t* heap = malloc_or_die(d->n * sizeof(size_t)); /* heap size */ size_t m = 0; for (i = 0; i < d->n; ++i) { if (fastq_read(fs[i], seqs[i])) { heap_push(heap, d->n, &m, seqs, cmp, i); } } while (m > 0) { i = heap_pop(heap, &m, seqs, cmp); fastq_print(stdout, seqs[i]); if (fastq_read(fs[i], seqs[i])) { heap_push(heap, d->n, &m, seqs, cmp, i); } } for (i = 0; i < d->n; ++i) { seq_free(seqs[i]); fastq_free(fs[i]); fclose(files[i]); } free(files); free(fs); }
/*
 * Print `seq` to `fout`, optionally trimmed relative to a regex match.
 *
 * `ovector[0]` and `ovector[1]` are taken as the start and end offsets of the
 * match. Behaviour is driven by three global flags:
 *
 *   - neither trim_before_flag nor trim_after_flag: print `seq` unchanged.
 *   - trim_before_flag: the range passed to seq_trim() runs from the match
 *     start (or the match end when trim_match_flag is set) to the sequence
 *     length.
 *   - otherwise (trim_after_flag): the range runs from 0 to the match end
 *     (or the match start when trim_match_flag is set).
 */
void fastq_print_maybe_trim(FILE* fout, seq_t* seq, int* ovector)
{
    if (!(trim_before_flag || trim_after_flag)) {
        fastq_print(fout, seq);
        return;
    }

    const int m_begin = ovector[0];
    const int m_end   = ovector[1];

    int from;
    int to;

    if (trim_before_flag) {
        from = trim_match_flag ? m_end : m_begin;
        to   = seq->seq.n;
    }
    else {
        /* Only reachable with trim_after_flag set, given the guard above. */
        from = 0;
        to   = trim_match_flag ? m_begin : m_end;
    }

    seq_t* out = seq_create();
    seq_trim(seq, out, from, to);
    fastq_print(fout, out);
    seq_free(out);
}
void fastq_sample(unsigned long rng_seed, const char* prefix, const char* cprefix, FILE* file1, FILE* file2, unsigned long k, double p) { /* * The basic idea is this: * * 1. Count the number of lines in the file, n. * * 2a. If sampling with replacement, generate k random integers in [0, n-1]. * * 2b. If sampling without replacement, generate a list of integers 0..(n-1), * shuffle with fisher-yates, then consider the first k. * * 3. Sort the integer list. * * 3. Read through the file again, when the number at the front of the integer * list matches the index of the fastq etry, print the entry, and pop the * number. */ unsigned long n, n2; fastq_t* f1 = fastq_create(file1); fastq_t* f2 = file2 == NULL ? NULL : fastq_create(file2); n = count_entries(f1); if (f2 != NULL) { n2 = count_entries(f2); if (n != n2) { fprintf(stderr, "Input files have differing numbers of entries (%lu != %lu).\n", n, n2); exit(1); } } fastq_rewind(f1); if (f2 != NULL) fastq_rewind(f2); if (p > 0.0) { k = (unsigned long) round(p * (double) n); if (!replacement_flag && k > n) k = n; } rng_t* rng = fastq_rng_alloc(); fastq_rng_seed(rng, rng_seed); unsigned long* xs; if (replacement_flag) xs = index_with_replacement(rng, n, k); else xs = index_without_replacement(rng, n); qsort(xs, k, sizeof(unsigned long), cmpul); /* open output */ FILE* fout1; FILE* fout2; char* output_name; size_t output_len; if (file2 == NULL) { output_len = strlen(prefix) + 7; output_name = malloc_or_die((output_len + 1) * sizeof(char)); snprintf(output_name, output_len, "%s.fastq", prefix); fout1 = open_without_clobber(output_name); if (fout1 == NULL) { fprintf(stderr, "Cannot open file %s for writing.\n", output_name); exit(1); } fout2 = NULL; free(output_name); } else { output_len = strlen(prefix) + 9; output_name = malloc_or_die((output_len + 1) * sizeof(char)); snprintf(output_name, output_len, "%s.1.fastq", prefix); fout1 = open_without_clobber(output_name); if (fout1 == NULL) { fprintf(stderr, "Cannot open file %s for 
writing.\n", output_name); exit(1); } snprintf(output_name, output_len, "%s.2.fastq", prefix); fout1 = open_without_clobber(output_name); if (fout1 == NULL) { fprintf(stderr, "Cannot open file %s for writing.\n", output_name); exit(1); } free(output_name); } /* open complement output */ FILE* cfout1 = NULL; FILE* cfout2 = NULL; if (cprefix != NULL && file2 == NULL) { output_len = strlen(cprefix) + 7; output_name = malloc_or_die((output_len + 1) * sizeof(char)); snprintf(output_name, output_len, "%s.fastq", cprefix); cfout1 = fopen(output_name, "wb"); if (cfout1 == NULL) { fprintf(stderr, "Cannot open file %s for writing.\n", output_name); exit(1); } cfout2 = NULL; free(output_name); } else if (cprefix != NULL) { output_len = strlen(cprefix) + 9; output_name = malloc_or_die((output_len + 1) * sizeof(char)); snprintf(output_name, output_len, "%s.1.fastq", cprefix); cfout1 = fopen(output_name, "wb"); if (cfout1 == NULL) { fprintf(stderr, "Cannot open file %s for writing.\n", output_name); exit(1); } snprintf(output_name, output_len, "%s.2.fastq", cprefix); cfout2 = fopen(output_name, "wb"); if (cfout1 == NULL) { fprintf(stderr, "Cannot open file %s for writing.\n", output_name); exit(1); } free(output_name); } unsigned long i = 0; // read number unsigned long j = 0; // index into xs int ret; seq_t* seq1 = seq_create(); seq_t* seq2 = seq_create(); while (j < k && fastq_read(f1, seq1)) { if (f2 != NULL) { ret = fastq_read(f2, seq2); if (ret == 0) { fputs("Input files have differing numbers of entries.\n", stderr); exit(1); } } if (xs[j] == i) { while (j < k && xs[j] == i) { fastq_print(fout1, seq1); if (f2 != NULL) fastq_print(fout2, seq2); ++j; } } else if (cfout1 != NULL) { fastq_print(cfout1, seq1); if (f2 != NULL) fastq_print(cfout2, seq2); } ++i; } seq_free(seq1); seq_free(seq2); fastq_free(f1); if (f2 != NULL) fastq_free(f2); fclose(fout1); if (fout2 != NULL) fclose(fout2); if (cfout1 != NULL) fclose(cfout1); if (cfout2 != NULL) fclose(cfout2); 
fastq_rng_free(rng); free(xs); }