int main_interleave(int argc, char *argv[]) { gzFile fp1, fp2; kseq_t *seq[2]; kstring_t str; if (argc < 3) { fprintf(stderr, "Usage: fermi interleave <in1.fq> <in2.fq>\n"); return 1; } str.l = str.m = 0; str.s = 0; fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); seq[0] = kseq_init(fp1); seq[1] = kseq_init(fp2); while (kseq_read(seq[0]) >= 0) { if (kseq_read(seq[1]) < 0) break; // one file ends str.l = 0; if (seq[0]->name.l > 2 && seq[0]->name.s[seq[0]->name.l-2] == '/' && isdigit(seq[0]->name.s[seq[0]->name.l-1])) seq[0]->name.s[(seq[0]->name.l -= 2)] = 0; // trim tailing "/[0-9]$" seq[1]->name.l = 0; kputsn(seq[0]->name.s, seq[0]->name.l, &seq[1]->name); // make sure two ends having the same name write_seq(seq[0], &str); write_seq(seq[1], &str); fputs(str.s, stdout); } kseq_destroy(seq[0]); gzclose(fp1); kseq_destroy(seq[1]); gzclose(fp2); free(str.s); return 0; }
/*
 * For every sequence in `filename`, copy the upper-cased base found at each
 * SNP position into bases_for_snps[snp_index][sequence_index].
 *
 * filename          path of the input file
 * snp_locations     offsets into each sequence, number_of_snps entries
 * bases_for_snps    output matrix indexed [snp][sequence]; must already be
 *                   allocated by the caller
 * length_of_genome  not used by this function; kept for interface stability
 * number_of_snps    number of entries in snp_locations
 *
 * 'N' is stored as '-'. If the file cannot be opened the function returns
 * without touching bases_for_snps.
 */
void get_bases_for_each_snp(char filename[], int snp_locations[], char ** bases_for_snps, int length_of_genome, int number_of_snps)
{
    int i;
    int sequence_number = 0;
    gzFile fp;
    kseq_t *seq;

    (void) length_of_genome;

    fp = gzopen(filename, "r");
    if (!fp) {
        return;
    }
    seq = kseq_init(fp);

    while (kseq_read(seq) >= 0) {
        for (i = 0; i < number_of_snps; i++) {
            /* cast keeps toupper() well defined for bytes >= 0x80 */
            char base = (char) toupper((unsigned char) seq->seq.s[snp_locations[i]]);

            /* Present gaps and unknowns in the same way to Gubbins */
            if (base == 'N') {
                base = '-';
            }
            bases_for_snps[i][sequence_number] = base;
        }
        sequence_number++;
    }

    kseq_destroy(seq);
    gzclose(fp);
}
/*
 * Copy the first sequence of `filename` into reference_sequence, upper-cased
 * and with 'N' replaced by '-', then NUL-terminate it.
 *
 * reference_sequence must have room for the sequence plus the terminator.
 *
 * Returns 1 when a sequence was read. Returns 0, leaving an empty string in
 * reference_sequence, when the file cannot be opened or holds no record.
 */
int build_reference_sequence(char reference_sequence[], char filename[])
{
    size_t i;
    size_t length = 0;
    int ok = 0;
    gzFile fp;
    kseq_t *seq;

    fp = gzopen(filename, "r");
    if (!fp) {
        reference_sequence[0] = '\0';
        return 0;
    }
    seq = kseq_init(fp);

    if (kseq_read(seq) >= 0) {
        length = seq->seq.l;
        ok = 1;
    }

    for (i = 0; i < length; i++) {
        /* cast keeps toupper() well defined for bytes >= 0x80 */
        reference_sequence[i] = (char) toupper((unsigned char) seq->seq.s[i]);
        if (reference_sequence[i] == 'N') {
            reference_sequence[i] = '-';
        }
    }
    /* always terminate; no need to inspect the (possibly uninitialised) byte first */
    reference_sequence[length] = '\0';

    kseq_destroy(seq);
    gzclose(fp);
    return ok;
}
/*
 * Read every sequence from manager->inputFileName and hand each one to
 * streamOneStringUnchanged().
 *
 * Returns 0 on success, 1 if the file cannot be opened or if
 * streamOneStringUnchanged() reports a non-zero status. The reader and the
 * file handle are released on every path.
 */
int streamAndCountOneFile(KWTCounterManager *manager)
{
    gzFile inputFP;
    kseq_t* seq;
    int status = 0;

    // open file to read lines
    if (!(inputFP = gzopen(manager->inputFileName, "r"))) {
        printf("Could not open input file \"%s\" for reading\n", manager->inputFileName);
        return 1;
    }

    // initialize reader
    seq = kseq_init(inputFP);

    // read sequences; stop at the first failure
    while (kseq_read(seq) >= 0) {
        if (streamOneStringUnchanged(manager, seq->seq.s, seq->seq.l) != 0) {
            status = 1;
            break;
        }
    }

    // single exit point so the reader is not leaked on the error path
    kseq_destroy(seq);
    gzclose(inputFP);
    return status;
}
/*
 * Close a sequence reader: close the gz stream held by the kseq parser,
 * destroy the parser, then free the handle. A null handle is ignored.
 */
void bwa_seq_close(bwa_seqio_t *bs)
{
    if (bs != 0) {
        gzclose(bs->ks->f->f);
        kseq_destroy(bs->ks);
        free(bs);
    }
}
int main_read_stat(int argc, char **argv) { if (argc > 1) { fprintf(stderr, "Usage: cat *.fq | %s\n", argv[0]); exit(1); } gzFile fp = gzdopen(fileno(stdin), "r"); kseq_t *seq = kseq_init(fp); // kseq to read files int max_len = 0; int min_len = 999999999; long long total_len = 0; long long num_reads = 0; while (kseq_read(seq) >= 0) { ++num_reads; total_len += seq->seq.l; max_len = std::max(seq->seq.l, (size_t)max_len); min_len = std::min(seq->seq.l, (size_t)min_len); } double avg_len = total_len * 1.0 / num_reads; printf("number reads: %lld\ntotal size:%lld\nlongest: %d\nshortest: %d\navg: %lf\n", num_reads, total_len, max_len, min_len, avg_len); kseq_destroy(seq); gzclose(fp); return 0; }
bseq1_t *bseq_read(const char *fn, int *n_) { FILE *fp; bseq1_t *seqs; kseq_t *ks; int m, n; uint64_t size = 0; *n_ = 0; fp = fopen(fn, "rb"); if (fp == 0) return 0; ks = kseq_init(fp); m = n = 0; seqs = 0; while (kseq_read(ks) >= 0) { bseq1_t *s; if (n >= m) { m = m? m<<1 : 256; seqs = realloc(seqs, m * sizeof(bseq1_t)); } s = &seqs[n]; s->seq = strdup(ks->seq.s); s->qual = ks->qual.l? strdup(ks->qual.s) : 0; s->l_seq = ks->seq.l; size += seqs[n++].l_seq; } *n_ = n; kseq_destroy(ks); fclose(fp); return seqs; }
static seqs_t *load_seqs(const char *fn) { seqs_t *s; seq1_t *p; gzFile fp; int l; kseq_t *seq; fp = xzopen(fn, "r"); seq = kseq_init(fp); s = (seqs_t*)calloc(1, sizeof(seqs_t)); s->m_seqs = 256; s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t)); while ((l = kseq_read(seq)) >= 0) { if (s->n_seqs == s->m_seqs) { s->m_seqs <<= 1; s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t)); } p = s->seqs + (s->n_seqs++); p->l = seq->seq.l; p->s = (unsigned char*)malloc(p->l + 1); memcpy(p->s, seq->seq.s, p->l); p->s[p->l] = 0; p->n = strdup((const char*)seq->name.s); } kseq_destroy(seq); gzclose(fp); fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); return s; }
static khash_t(s) *load_mask(const char *fn) { kseq_t *seq; gzFile fp; khash_t(s) *h; h = kh_init(s); fp = gzopen(fn, "r"); seq = kseq_init(fp); while (kseq_read(seq) >= 0) { khint_t k; int ret, i; mask32_t *p; k = kh_put(s, h, strdup(seq->name.s), &ret); assert(ret); // duplicated name p = &kh_val(h, k); p->ori_len = seq->seq.l; p->mask = (uint32_t*)calloc((seq->seq.l+31)/32, 4); for (i = 0; i < seq->seq.l; ++i) if (seq->seq.s[i] == '3') p->mask[i/32] |= 1u<<i%32; } kseq_destroy(seq); gzclose(fp); return h; }
// Close one read file: destroy its kseq parser, close the gz stream and
// mark the slot as closed by setting fp to NULL.
//
// A slot whose fp is already NULL is left untouched, so closing the same
// file twice (or after close()) does not destroy the parser a second time.
void reads_parsing::close_file(int file_indx)
{
    if (read_files[file_indx]->fp == NULL)
        return;

    kseq_destroy(read_files[file_indx]->seq);
    gzclose(read_files[file_indx]->fp);
    read_files[file_indx]->fp = NULL;
}
/*
 * Write a sequence dictionary (@SQ header lines) for the file `fn`.
 *
 * fn    input path, or "-" for stdin
 * args  options read here: output_fname (NULL means stdout), header,
 *       uri, assembly, species
 *
 * For every record, characters outside '!'..'~' are dropped and the rest
 * upper-cased in place; the MD5 of the result and its length are emitted as
 * M5 and LN. The UR tag is args->uri when set, otherwise a file:// URI
 * built from the resolved path of `fn` (omitted when reading stdin).
 *
 * Exits with status 1 if the input or output cannot be opened or the MD5
 * context cannot be created.
 */
static void write_dict(const char *fn, args_t *args)
{
    hts_md5_context *md5;
    int i, k;
    gzFile fp;
    kseq_t *seq;
    unsigned char digest[16];
    char hex[33];

    fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
    if (fp == 0) {
        fprintf(stderr, "dict: %s: No such file or directory\n", fn);
        exit(1);
    }
    FILE *out = stdout;
    if (args->output_fname) {
        out = fopen(args->output_fname, "w");
        if (out == NULL) {
            fprintf(stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
            exit(1);
        }
    }

    if (!(md5 = hts_md5_init()))
        exit(1);

    seq = kseq_init(fp);
    if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
    while (kseq_read(seq) >= 0) {
        /* compact the sequence in place, keeping printable characters only */
        for (i = k = 0; i < seq->seq.l; ++i) {
            if (seq->seq.s[i] >= '!' && seq->seq.s[i] <= '~')
                seq->seq.s[k++] = toupper(seq->seq.s[i]);
        }
        hts_md5_reset(md5);
        hts_md5_update(md5, (unsigned char*)seq->seq.s, k);
        hts_md5_final(digest, md5);
        hts_md5_hex(hex, digest);
        fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
        if (args->uri)
            fprintf(out, "\tUR:%s", args->uri);
        else if (strcmp(fn, "-") != 0) {
#ifdef _WIN32
            char *real_path = _fullpath(NULL, fn, PATH_MAX);
#else
            char *real_path = realpath(fn, NULL);
#endif
            /* path resolution can fail; fall back to the name as given
               rather than passing NULL to %s */
            fprintf(out, "\tUR:file://%s", real_path ? real_path : fn);
            free(real_path);
        }
        if (args->assembly) fprintf(out, "\tAS:%s", args->assembly);
        if (args->species) fprintf(out, "\tSP:%s", args->species);
        fprintf(out, "\n");
    }
    kseq_destroy(seq);
    gzclose(fp); /* the input handle was previously never released */
    hts_md5_destroy(md5);

    if (args->output_fname) fclose(out);
}
int stk_hety(int argc, char *argv[]) { gzFile fp; kseq_t *seq; int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; char *buf; uint32_t cnt[3]; if (argc == 1) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk hety [options] <in.fa>\n\n"); fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); fprintf(stderr, " -m treat lowercases as masked\n"); fprintf(stderr, "\n"); return 1; } while ((c = getopt(argc, argv, "w:t:m")) >= 0) { switch (c) { case 'w': win_size = atoi(optarg); break; case 't': n_start = atoi(optarg); break; case 'm': is_lower_mask = 1; break; } } fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); seq = kseq_init(fp); win_step = win_size / n_start; buf = calloc(win_size, 1); while ((l = kseq_read(seq)) >= 0) { int x, i, y, z, next = 0; cnt[0] = cnt[1] = cnt[2] = 0; for (i = 0; i <= l; ++i) { if ((i >= win_size && i % win_step == 0) || i == l) { if (i == l && l >= win_size) { for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; } if (cnt[1] + cnt[2] > 0) printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); next = i; } if (i < l) { y = i % win_size; c = seq->seq.s[i]; if (is_lower_mask && islower(c)) c = 'N'; c = seq_nt16_table[c]; x = bitcnt_table[c]; if (i >= win_size) --cnt[(int)buf[y]]; buf[y] = z = x > 2? 0 : x == 2? 2 : 1; ++cnt[z]; } } } free(buf); kseq_destroy(seq); gzclose(fp); return 0; }
int detect_snps(char reference_sequence[], char filename[], int length_of_genome, int exclude_gaps) { int i; int number_of_snps = 0; int l; gzFile fp; kseq_t *seq; fp = gzopen(filename, "r"); seq = kseq_init(fp); // First sequence is the reference sequence so skip it kseq_read(seq); while ((l = kseq_read(seq)) >= 0) { for(i = 0; i < length_of_genome; i++) { if(exclude_gaps) { // If there is an indel in the reference sequence, replace with the first proper base you find if((reference_sequence[i] == '-' && seq->seq.s[i] != '-' ) || (toupper(reference_sequence[i]) == 'N' && seq->seq.s[i] != 'N' )) { reference_sequence[i] = toupper(seq->seq.s[i]); } if(reference_sequence[i] != '*' && seq->seq.s[i] != '-' && toupper(seq->seq.s[i]) != 'N' && reference_sequence[i] != toupper(seq->seq.s[i])) { reference_sequence[i] = '*'; number_of_snps++; } } else { char input_base = toupper(seq->seq.s[i]); if(input_base == 'N') { input_base = '-'; } if(reference_sequence[i] != '*' && reference_sequence[i] != input_base) { reference_sequence[i] = '*'; number_of_snps++; } } } } kseq_destroy(seq); gzclose(fp); return number_of_snps; }
/*
 * Close a sequence reader and free its handle.
 *
 * BAM-backed readers are closed with bam_close(); otherwise the gz stream
 * held by the kseq parser is closed and the parser destroyed. A null handle
 * is ignored.
 */
void bwa_seq_close(bwa_seqio_t *bs)
{
    if (!bs)
        return;

    if (!bs->is_bam) {
        gzclose(bs->ks->f->f);
        kseq_destroy(bs->ks);
    } else {
        bam_close(bs->fp);
    }
    free(bs);
}
/*
 * Release the resources held inside `opt`: every adapter sequence, the kseq
 * parser, the gz stream behind it and the adapter array. The ta_opt_t
 * structure itself is not freed here.
 */
void ta_opt_free(ta_opt_t *opt)
{
    /* grab the stream first: it is reached through the parser destroyed below */
    gzFile stream = opt->ks->f->f;
    int idx = 0;

    while (idx < opt->n_adaps) {
        free(opt->adaps[idx].seq);
        ++idx;
    }

    kseq_destroy(opt->ks);
    gzclose(stream);
    free(opt->adaps);
}
static inline void parse_sequences( const char *filename, char ***strings_, unsigned long **sizes_, unsigned long *count_) { FILE* fp; kseq_t *seq = NULL; int l = 0; char **strings = NULL; unsigned long *sizes = NULL; unsigned long count = 0; unsigned long memory = 1000; fp = fopen(filename, "r"); if(fp == NULL) { perror("fopen"); exit(1); } strings = malloc(sizeof(char*) * memory); sizes = malloc(sizeof(unsigned long) * memory); seq = kseq_init(fileno(fp)); while ((l = kseq_read(seq)) >= 0) { strings[count] = strdup(seq->seq.s); if (NULL == strings[count]) { perror("strdup"); exit(1); } sizes[count] = seq->seq.l; ++count; if (count >= memory) { char **new_strings = NULL; unsigned long *new_sizes = NULL; memory *= 2; new_strings = realloc(strings, sizeof(char*) * memory); if (NULL == new_strings) { perror("realloc"); exit(1); } strings = new_strings; new_sizes = realloc(sizes, sizeof(unsigned long) * memory); if (NULL == new_sizes) { perror("realloc"); exit(1); } sizes = new_sizes; } } kseq_destroy(seq); fclose(fp); *strings_ = strings; *sizes_ = sizes; *count_ = count; }
/*
 * Select records from `input_file` whose name or comment matches a regular
 * expression and pass them to size_filter().
 *
 * The first record is read only to detect the format (no quality string
 * means FASTA), then both the gz stream and the parser are rewound. With
 * `exclude` set the selection is inverted: matching records are counted as
 * excluded and the others go to size_filter().
 *
 * Returns the sum of size_filter() results. With `just_count` set the totals
 * are also printed to stdout. Exits if the file cannot be opened.
 */
int pull_by_re(char *input_file, pcre *re, pcre_extra *re_extra, int min, int max, int length, int exclude, int convert, int just_count)
{
    gzFile fp;
    kseq_t *seq;
    int kept = 0;
    int skipped = 0;
    int is_fasta = 0; /* assume fastq */

    /* open the input */
    fp = gzopen(input_file,"r");
    if (!fp) {
        fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
        exit(EXIT_FAILURE);
    }
    seq = kseq_init(fp);

    /* peek at the first record to determine the file type, then rewind */
    kseq_read(seq);
    is_fasta = seq->qual.s == NULL ? 1 : 0;
    gzrewind(fp);
    kseq_rewind(seq);

    if (verbose_flag) {
        if (is_fasta)
            fprintf(stderr, "Input is FASTA format\n");
        else
            fprintf(stderr, "Input is FASTQ format\n");
    }

    /* walk all records and test each header against the pattern */
    while (kseq_read(seq) >= 0) {
        int matched = search_header(re, re_extra, seq->name.s)
                   || search_header(re, re_extra, seq->comment.s);
        /* in exclude mode a record is kept when it does NOT match */
        int keep = exclude ? !matched : matched;

        if (keep)
            kept += size_filter(seq, is_fasta, min, max, length, convert, just_count);
        else
            skipped++;
    }

    kseq_destroy(seq);
    gzclose(fp); /* done reading the file */

    if (just_count) {
        fprintf(stdout, "Total output: %i\n", kept);
        fprintf(stdout, "Total excluded: %i\n", skipped);
    }
    return kept;
}
/*
 * Close a sequence reader and free its handle.
 *
 * A BAM-backed reader is closed with bam_close(); a failure there is
 * reported through err_fatal_simple(). Otherwise the gz stream held by the
 * kseq parser is closed with err_gzclose() and the parser destroyed. A null
 * handle is ignored.
 */
void bwa_seq_close(bwa_seqio_t *bs)
{
    if (!bs)
        return;

    if (!bs->is_bam) {
        err_gzclose(bs->ks->f->f);
        kseq_destroy(bs->ks);
    } else if (bam_close(bs->fp) != 0) {
        err_fatal_simple("Error closing bam file");
    }
    free(bs);
}
// Close the reader if one is open: destroy the kseq parser, close the gz
// stream, move the file-name iterator to the end and clear the parser
// pointer. Does nothing when no parser is active.
void FastqFile::close()
{
    if (kseq == NULL)
        return;

    kseq_destroy(kseq);
    gzclose(fp);
    fnit = fnames.end();
    kseq = NULL;
}
/*
 * Pack the sequences read from fp_fa into "<prefix>.pac" and dump the
 * accompanying bntseq data through bns_dump().
 *
 * fp_fa     open gz stream; it is read but not closed here
 * prefix    output file prefix, of any length
 * for_only  zero: the reverse complement is appended after the forward
 *           strand; non-zero: forward strand only
 *
 * Returns the packed length (bns->l_pac), or -1 if the output file name
 * cannot be allocated.
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
    extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
    kseq_t *seq;
    char *name;
    size_t name_size;
    bntseq_t *bns;
    uint8_t *pac = 0;
    int32_t m_seqs, m_holes;
    int64_t ret = -1, m_pac, l;
    bntamb1_t *q;
    FILE *fp;

    // size the file name from the prefix instead of using a fixed
    // 1024-byte buffer that a long prefix would overflow
    name_size = strlen(prefix) + sizeof(".pac");
    name = (char*)malloc(name_size);
    if (name == 0) return ret;
    snprintf(name, name_size, "%s.pac", prefix);

    // initialization
    seq = kseq_init(fp_fa);
    bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
    bns->seed = 11; // fixed seed for random generator
    srand48(bns->seed);
    m_seqs = m_holes = 8; m_pac = 0x10000;
    bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
    bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
    pac = calloc(m_pac/4, 1);
    q = bns->ambs;
    fp = xopen(name, "wb");
    free(name);

    // read sequences
    while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
    if (!for_only) { // add the reverse complemented sequence
        m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
        pac = realloc(pac, m_pac/4);
        memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
        for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
            _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
    }
    ret = bns->l_pac;
    { // finalize .pac file
        ubyte_t ct;
        err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
        // the following codes make the pac file size always (l_pac/4+1+1)
        if (bns->l_pac % 4 == 0) {
            ct = 0;
            err_fwrite(&ct, 1, 1, fp);
        }
        ct = bns->l_pac % 4;
        err_fwrite(&ct, 1, 1, fp);
        // close .pac file
        err_fflush(fp);
        err_fclose(fp);
    }
    bns_dump(bns, prefix);
    bns_destroy(bns);
    kseq_destroy(seq);
    free(pac);
    return ret;
}
/*
 * Release everything owned by a fdb_config_t: input file names, input
 * parsers (closing the stream behind each), option strings, per-barcode
 * names, sequences, output file names and handles, and the leftover output
 * handles. The cfg structure itself is not freed.
 *
 * Returns 0. A NULL cfg is accepted and treated as nothing to do.
 */
int fdb_config_destroy (fdb_config_t *cfg)
{
    if (cfg == NULL) {
        return 0;
    }

    if (cfg->infns != NULL) {
        for (int iii = 0; iii < cfg->n_infs; iii++) {
            free(cfg->infns[iii]);
        }
        free(cfg->infns);
    }

    if (cfg->in_kseqs != NULL) {
        for (int iii = 0; iii < cfg->n_infs; iii++) {
            if (cfg->in_kseqs[iii] != NULL) {
                /* close the stream before the parser that holds it goes away */
                FDB_FP_CLOSE(cfg->in_kseqs[iii]->f->f);
                kseq_destroy(cfg->in_kseqs[iii]);
            }
        }
        free(cfg->in_kseqs);
    }

    /* free(NULL) is a no-op, so these need no guards */
    free(cfg->barcode_file);
    free(cfg->buffer_seq);
    free(cfg->out_dir);
    free(cfg->out_suffix);
    free(cfg->leftover_suffix);

    if (cfg->barcodes != NULL) {
        for (int iii = 0; iii < cfg->n_barcodes; iii++) {
            if (cfg->barcodes[iii] != NULL) {
                free(cfg->barcodes[iii]->name.s);
                free(cfg->barcodes[iii]->seq.s);
                for (int jjj = 0; jjj < cfg->n_infs; jjj++) {
                    free(cfg->barcodes[iii]->fns[jjj]);
                    if (cfg->barcodes[iii]->fps[jjj] != NULL) {
                        FDB_FP_CLOSE(cfg->barcodes[iii]->fps[jjj]);
                    }
                }
                free(cfg->barcodes[iii]);
            }
        }
        free(cfg->barcodes);
    }

    if (cfg->leftover_outfps != NULL) {
        for (int iii = 0; iii < cfg->n_infs; iii++) {
            if (cfg->leftover_outfps[iii] != NULL) {
                FDB_FP_CLOSE(cfg->leftover_outfps[iii]);
            }
        }
        free(cfg->leftover_outfps);
    }

    /* the function is declared int; falling off the end left the result undefined */
    return 0;
}
int main(int argc, char *argv[]) { int l, i, c; long long cnt[5], tot; kseq_t *seq; gzFile fp; cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0; while ((c = getopt(argc, argv, "l:r:")) >= 0) { switch (c) { case 'l': g_len = atoi(optarg); break; case 'r': g_ratio = atof(optarg); break; } } if (argc == optind) { fprintf(stderr, "Usage: gen_mask [-l %d] [-r %.2lf] <in.rawMask.fa>\n", g_len, g_ratio); return 1; } fp = gzopen(argv[optind], "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { int n_good = 0, n_all = 0, n_mid = 0; printf(">%s %d %.3lf", seq->name.s, g_len, g_ratio); for (i = 0; i < l + g_len - 1; ++i) { int c1, c2; unsigned x = i < l? get_cnt(seq->seq.s[i]) : 0; c1 = x>>16; c2 = x&0xffff; if (c1 == 1) ++cnt[4]; if (c1) { ++n_all; if (is_good(c1, c2)) ++n_good; if (c1 == 1) ++n_mid; } x = i >= g_len? get_cnt(seq->seq.s[i - g_len]) : 0; c1 = x>>16; c2 = x&0xffff; if (c1) { --n_all; if (is_good(c1, c2)) --n_good; if (c1 == 1) --n_mid; } assert(n_all <= g_len && n_good <= n_all); if (i % 60 == 0) putchar('\n'); x = n_all == 0? 0 : (double)n_good/n_all >= g_ratio? 3 : (double)n_mid/n_all >= g_ratio? 2 : 1; putchar(x + '0'); cnt[x]++; } putchar('\n'); } tot = cnt[1] + cnt[2] + cnt[3]; fprintf(stderr, "%lld, %lld, %lld, %lld, %lld\n", cnt[0], cnt[1], cnt[2], cnt[3], cnt[4]); fprintf(stderr, "%lf, %lf, %lf\n", (double)cnt[3] / tot, (double)(cnt[2] + cnt[3]) / tot, (double)cnt[4] / tot); kseq_destroy(seq); gzclose(fp); return 0; }
void Fast5Map::load_from_fasta(std::string fasta_filename) { gzFile gz_fp; FILE* fp = fopen(fasta_filename.c_str(), "r"); if(fp == NULL) { fprintf(stderr, "error: could not open %s for read\n", fasta_filename.c_str()); exit(EXIT_FAILURE); } gz_fp = gzdopen(fileno(fp), "r"); if(gz_fp == NULL) { fprintf(stderr, "error: could not open %s using gzdopen\n", fasta_filename.c_str()); exit(EXIT_FAILURE); } kseq_t* seq = kseq_init(gz_fp); while(kseq_read(seq) >= 0) { if(seq->comment.l == 0) { fprintf(stderr, "error: no path associated with read %s\n", seq->name.s); exit(EXIT_FAILURE); } // This splitting code implicitly handles both the 2 and 3 field // fasta format that poretools will output. The FAST5 path // is always the last field. std::vector<std::string> fields = split(seq->comment.s, ' '); read_to_path_map[seq->name.s] = fields.back(); } kseq_destroy(seq); gzclose(gz_fp); fclose(fp); // Sanity check that the first path actually points to a file if(read_to_path_map.size() > 0) { std::string first_read = read_to_path_map.begin()->first; std::string first_path = read_to_path_map.begin()->second; struct stat file_s; int ret = stat(first_path.c_str(), &file_s); if(ret != 0) { fprintf(stderr, "Error: could not find path to FAST5 for read %s\n", first_read.c_str()); fprintf(stderr, "Please make sure that this path is accessible: %s\n", first_path.c_str()); exit(EXIT_FAILURE); } } // Write the map as a fofn file so next time we don't have to parse // the entire fasta write_to_fofn(fasta_filename + FOFN_SUFFIX); }
// Close every read file that is still open: destroy its kseq parser, close
// the gz stream and set fp to NULL so the slot is skipped next time.
// The read_files entries themselves are not freed here.
void reads_parsing::close()
{
    for (int idx = 0; idx < nb_files; ++idx) {
        if (read_files[idx]->fp == NULL)
            continue; // already closed

        kseq_destroy(read_files[idx]->seq);
        gzclose(read_files[idx]->fp);
        read_files[idx]->fp = NULL;
    }
}
/*
 * Dump every record read from stdin as one tab-separated line:
 * name, comment, sequence, quality.
 *
 * A field whose recorded length is zero is printed as an empty string, so a
 * record without a comment or quality never passes a null or left-over
 * buffer to printf().
 *
 * Returns 0 on success, 1 if stdin cannot be wrapped in a gz stream.
 */
int main(void)
{
    gzFile fp;
    kseq_t *seq;

    fp = gzdopen(fileno(stdin), "r");
    if (fp == 0) {
        return 1;
    }
    seq = kseq_init(fp);
    while (kseq_read(seq) >= 0) {
        printf("%s\t%s\t%s\t%s\n",
               seq->name.s,
               seq->comment.l ? seq->comment.s : "",
               seq->seq.l ? seq->seq.s : "",
               seq->qual.l ? seq->qual.s : "");
    }
    kseq_destroy(seq);
    gzclose(fp);
    return 0;
}
/*
 * Pack the forward strand of the sequences read from fp_fa into
 * "<prefix>.bis.pac".
 *
 * fp_fa   open gz stream; it is read but not closed here
 * prefix  output file prefix, of any length
 *
 * Returns the packed length (bns->l_pac), or -1 if the output file name
 * cannot be allocated.
 */
int64_t dump_forward_pac(gzFile fp_fa, const char *prefix)
{
    extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
    kseq_t *seq;
    char *name;
    size_t name_size;
    bntseq_t *bns;
    uint8_t *pac = 0;
    int32_t m_seqs, m_holes;
    int64_t ret = -1, m_pac;
    bntamb1_t *q;
    FILE *fp;

    // size the file name from the prefix instead of using a fixed
    // 1024-byte buffer that a long prefix would overflow
    name_size = strlen(prefix) + sizeof(".bis.pac");
    name = (char*)malloc(name_size);
    if (name == 0) return ret;
    snprintf(name, name_size, "%s.bis.pac", prefix);

    // initialization
    seq = kseq_init(fp_fa);
    bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
    bns->seed = 11; // fixed seed for random generator
    srand48(bns->seed);
    m_seqs = m_holes = 8; m_pac = 0x10000;
    bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
    bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
    pac = calloc(m_pac/4, 1);
    q = bns->ambs;
    fp = xopen(name, "wb");
    free(name);

    // read sequences
    while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
    ret = bns->l_pac;
    { // finalize .pac file
        ubyte_t ct;
        err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
        // the following codes make the pac file size always (l_pac/4+1+1)
        if (bns->l_pac % 4 == 0) {
            ct = 0;
            err_fwrite(&ct, 1, 1, fp);
        }
        ct = bns->l_pac % 4;
        err_fwrite(&ct, 1, 1, fp);
        // close .pac file
        err_fflush(fp);
        err_fclose(fp);
    }
    /* re-dump forward bis bns, otherwise the .bis.ann and .bis.amb have twice as long pac */
    /* strcpy(name, prefix); strcat(name, ".bis"); */
    /* bis_bns_dump(bns, prefix); */
    bns_destroy(bns);
    kseq_destroy(seq);
    free(pac);
    return ret;
}
int main(int argc, char *argv[]) { bwaidx_t *idx; gzFile fp; kseq_t *ks; mem_opt_t *opt; if (argc < 3) { fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n"); return 1; } idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index if (NULL == idx) { fprintf(stderr, "Index load failed.\n"); exit(EXIT_FAILURE); } fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); if (NULL == fp) { fprintf(stderr, "Couldn't open %s : %s\n", strcmp(argv[2], "-") ? argv[2] : "stdin", errno ? strerror(errno) : "Out of memory"); exit(EXIT_FAILURE); } ks = kseq_init(fp); // initialize the FASTA/Q parser opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values while (kseq_read(ks) >= 0) { // read one sequence mem_alnreg_v ar; int i, k; ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits for (i = 0; i < ar.n; ++i) { // traverse each hit mem_aln_t a; if (ar.a[i].secondary >= 0) continue; // skip secondary alignments a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR // print alignment err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); for (k = 0; k < a.n_cigar; ++k) // print CIGAR err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); err_printf("\t%d\n", a.NM); // print edit distance free(a.cigar); // don't forget to deallocate CIGAR } free(ar.a); // and deallocate the hit list } free(opt); kseq_destroy(ks); err_gzclose(fp); bwa_idx_destroy(idx); return 0; }
static void md5_one(const char *fn) { hts_md5_context *md5_one, *md5_all; int l, i, k; gzFile fp; kseq_t *seq; unsigned char unordered[16], digest[16]; char hex[33]; for (l = 0; l < 16; ++l) unordered[l] = 0; fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "md5fa: %s: No such file or directory\n", fn); exit(1); } if (!(md5_all = hts_md5_init())) exit(1); if (!(md5_one = hts_md5_init())) { hts_md5_destroy(md5_all); exit(1); } seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { for (i = k = 0; i < seq->seq.l; ++i) { if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]); else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i]; } hts_md5_reset(md5_one); hts_md5_update(md5_one, (unsigned char*)seq->seq.s, k); hts_md5_final(digest, md5_one); hts_md5_hex(hex, digest); for (l = 0; l < 16; ++l) unordered[l] ^= digest[l]; printf("%s %s %s\n", hex, fn, seq->name.s); hts_md5_update(md5_all, (unsigned char*)seq->seq.s, k); } hts_md5_final(digest, md5_all); kseq_destroy(seq); hts_md5_hex(hex, digest); printf("%s %s >ordered\n", hex, fn); hts_md5_hex(hex, unordered); printf("%s %s >unordered\n", hex, fn); hts_md5_destroy(md5_all); hts_md5_destroy(md5_one); }
/*
 * Close a sequence reader and free its handle.
 *
 * The underlying input is closed first (bam_close() for BAM, otherwise the
 * gz stream of the kseq parser followed by the parser itself), then each of
 * the three sai streams that is open. A null handle is ignored.
 */
void bwa_seq_close(bwa_seqio_t *bs)
{
    int slot;

    if (!bs)
        return;

    if (!bs->is_bam) {
        gzclose(bs->ks->f->f);
        kseq_destroy(bs->ks);
    } else {
        bam_close(bs->fp);
    }

    for (slot = 0; slot < 3; slot++) {
        if (bs->sai[slot])
            fclose(bs->sai[slot]);
    }
    free(bs);
}
void read_fastq(std::string fastq_filename, std::vector<std::string>* fastq_names, std::vector<size_t>* fastq_lengths) { gzFile fastq_file = gzopen(fastq_filename.c_str(), "r"); kseq_t* seq; int l; seq = kseq_init(fastq_file); while((l = kseq_read(seq)) >= 0) { fastq_names->push_back((std::string) seq->name.s); fastq_lengths->push_back(seq->seq.l); } kseq_destroy(seq); gzclose(fastq_file); }