/*
 * Debug-time consistency check of the BAM files written for OUTPUT_PREFIX.
 *
 * WRITE_SPLITREAD: every consecutive record pair in "<prefix>_splitread.bam"
 *   must satisfy b2g_bam_pair_split().
 * WRITE_LOWQ: every pair in "<prefix>_lowqual.bam" must fail
 *   b2g_bams_highq().  If split reads were written to their own file the pair
 *   must also fail b2g_bam_pair_split(); otherwise a split pair is accepted
 *   in the low-quality file.
 *
 * All checks are assert()s and therefore disappear when NDEBUG is defined.
 * A file that cannot be opened, or that ends with an unpaired record, is
 * reported through assert() instead of being dereferenced / compared against
 * a stale mate.
 */
static void _check_quality(char *OUTPUT_PREFIX, int WRITE_LOWQ, int WRITE_SPLITREAD, int MAPPING_QUALITY, int MIN_ALIGNED_PCT, int IGNORE_DUPLICATES)
{
    bam1_t *b1 = bam_init1();
    bam1_t *b2 = bam_init1();

    if (WRITE_SPLITREAD) {
        samfile_t *split_file = b2g_samfile_open("%s_splitread.bam", "rb", 0, OUTPUT_PREFIX);

        assert(split_file != NULL);
        if (split_file != NULL) {
            while (-1 < samread(split_file, b1)) {
                /* Records are stored as mate pairs; a missing mate is an error. */
                if (samread(split_file, b2) < 0) {
                    assert(!"splitread file ends with an unpaired record");
                    break;
                }
                assert(b2g_bam_pair_split(b1, b2, MAPPING_QUALITY, MIN_ALIGNED_PCT, IGNORE_DUPLICATES));
            }
            samclose(split_file);
        }
    }

    if (WRITE_LOWQ) {
        samfile_t *lowq_file = b2g_samfile_open("%s_lowqual.bam", "rb", 0, OUTPUT_PREFIX);

        assert(lowq_file != NULL);
        if (lowq_file != NULL) {
            while (-1 < samread(lowq_file, b1)) {
                if (samread(lowq_file, b2) < 0) {
                    assert(!"lowqual file ends with an unpaired record");
                    break;
                }
                if (WRITE_SPLITREAD) {
                    assert(!b2g_bam_pair_split(b1, b2, MAPPING_QUALITY, MIN_ALIGNED_PCT, IGNORE_DUPLICATES));
                    assert(!b2g_bams_highq(b1, b2, MAPPING_QUALITY, MIN_ALIGNED_PCT, IGNORE_DUPLICATES));
                } else {
                    assert(!b2g_bams_highq(b1, b2, MAPPING_QUALITY, MIN_ALIGNED_PCT, IGNORE_DUPLICATES) ||
                           b2g_bam_pair_split(b1, b2, MAPPING_QUALITY, MIN_ALIGNED_PCT, IGNORE_DUPLICATES));
                }
            }
            samclose(lowq_file);
        }
    }

    bam_destroy1(b1);
    bam_destroy1(b2);
}
// Open every BAM file in bam_fnames, optionally seek to the matching entry of
// file_offsets, and prime the merge queue with the first record of each file.
//
// bam_fnames   - input BAM paths; an empty list leaves the object empty.
// file_offsets - per-file BGZF offsets; only honoured when it has exactly one
//                entry per file name, and only for entries > 0.
//
// Exits the process if a file cannot be opened or if no file yields a record.
BamMerge::BamMerge(const vector<string>& bam_fnames, vector<int64_t> file_offsets)
    : _bam_fnames(bam_fnames), _lines(less_bam(true)), _last_id(0)
{
    if (bam_fnames.empty())
        return;

    const bool have_offsets = (bam_fnames.size() == file_offsets.size());

    for (size_t i = 0; i < _bam_fnames.size(); ++i) {
        const char* fname = _bam_fnames[i].c_str();
        samfile_t* fp = samopen(fname, "rb", 0);
        if (fp == 0) {
            warn_msg(ERR_BAM_OPEN, fname);
            exit(1);
        }
        if (have_offsets && file_offsets[i] > 0)
            bgzf_seek(fp->x.bam, file_offsets[i], SEEK_SET);

        bam1_t* b = bam_init1();
        if (samread(fp, b) > 0) {
            // The queue entry's file index is the position fp takes in
            // _src_files, which equals the queue size before the push.
            _src_files.push_back(fp);
            CBamLine brec(_lines.size(), b, fp->header);
            _lines.push(brec);
        } else {
            // Nothing to merge from this file: release the record and the
            // handle (the handle is not tracked anywhere else).
            bam_destroy1(b);
            samclose(fp);
        }
    }

    if (_lines.size() == 0) {
        warn_msg("Warning: no input BAM records found.\n");
        exit(1);
    }
}
void samToOpenBed(char *samIn, FILE *f) /* Like samToOpenBed, but the output is the already open file f. */ { samfile_t *sf = samopen(samIn, "r", NULL); bam_header_t *bamHeader = sf->header; bam1_t one; ZeroVar(&one); int err; while ((err = samread(sf, &one)) >= 0) { int32_t tid = one.core.tid; if (tid < 0) continue; char *chrom = bamHeader->target_name[tid]; // Approximate here... can do better if parse cigar. int start = one.core.pos; int size = one.core.l_qseq; int end = start + size; boolean isRc = (one.core.flag & BAM_FREVERSE); char strand = '+'; if (isRc) { strand = '-'; reverseIntRange(&start, &end, bamHeader->target_len[tid]); } fprintf(f, "%s\t%d\t%d\t.\t0\t%c\n", chrom, start, end, strand); } if (err < 0 && err != -1) errnoAbort("samread err %d", err); samclose(sf); }
hash_table* hash_ids(const char* fn) { fprintf(stderr, "hashing ... \n"); hash_table* T = create_hash_table(); samfile_t* f = samopen(fn, "rb", NULL); if (f == NULL) { fprintf(stderr, "can't open bam file %s\n", fn); exit(1); } bam1_t* b = bam_init1(); uint32_t n = 0; while (samread(f, b) >= 0) { if (++n % 1000000 == 0) { fprintf(stderr, "\t%d reads\n", n); } inc_hash_table(T, bam1_qname(b), b->core.l_qname); } bam_destroy1(b); samclose(f); fprintf(stderr, "done.\n"); return T; }
/*
 * Compact buf and append records read from `in`.
 *
 * 1. Leading cells whose record pointer is NULL are squeezed out.
 * 2. The smallest non-negative rpos among the remaining cells is computed.
 * 3. Records are read and appended (growing the array by doubling) until the
 *    buffer holds at least BLOCK_SIZE more cells than it started with; the
 *    limit is extended by another BLOCK_SIZE whenever the record that hit it
 *    is mapped and starts at or before the smallest rpos seen.
 *
 * buf->x is set to the index of the first record whose tid differs from the
 * first one, or to buf->n when reading stopped before end of input without a
 * tid change; it stays -1 otherwise.  Returns the number of cells in buf.
 */
static int fill_buf(samfile_t *in, buffer_t *buf)
{
    bam1_t *rec = bam_init1();
    bam1_core_t *core = &rec->core;
    int lowest_rpos = 0x7fffffff;
    int first_used = 0;
    int k, status, prev_tid, limit;

    /* Drop the run of empty cells at the front of the buffer. */
    while (first_used < buf->n && !buf->buf[first_used].b)
        ++first_used;
    if (first_used >= buf->n) {
        buf->n = 0;
    } else if (first_used > 0) {
        memmove(buf->buf, buf->buf + first_used, sizeof(elem_t) * (buf->n - first_used));
        buf->n = buf->n - first_used;
    }

    /* Smallest valid rpos among the cells that are kept. */
    for (k = 0; k < buf->n; ++k) {
        elem_t *cell = &buf->buf[k];
        if (!cell->b || cell->rpos < 0)
            continue;
        if (cell->rpos < lowest_rpos)
            lowest_rpos = cell->rpos;
    }

    buf->x = -1;
    prev_tid = (buf->n != 0) ? buf->buf[0].b->core.tid : -1;
    limit = buf->n + BLOCK_SIZE;

    for (;;) {
        elem_t *cell;
        uint8_t *qual;
        int mapped;

        status = samread(in, rec);
        if (status < 0)
            break;
        qual = bam1_qual(rec);

        if (prev_tid < 0)
            prev_tid = core->tid;
        if (core->tid != prev_tid && buf->x < 0)
            buf->x = buf->n;

        if (buf->n >= buf->max) {
            /* Grow by doubling, starting at 8 cells. */
            buf->max = buf->max ? buf->max << 1 : 8;
            buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
        }

        cell = &buf->buf[buf->n++];
        cell->b = bam_dup1(rec);
        cell->rpos = -1;
        cell->score = 0;
        for (k = 0; k < core->l_qseq; ++k)
            cell->score += qual[k] + 1;
        cell->score = (double)cell->score / sqrt(core->l_qseq + 1);

        mapped = (core->tid >= 0 && core->tid < in->header->n_targets &&
                  !(core->flag & BAM_FUNMAP)) ? 1 : 0;
        if (!mapped)
            cell->score = -1;
        if (mapped && (core->flag & BAM_FREVERSE)) {
            cell->rpos = rec->core.pos + bam_calend(&rec->core, bam1_cigar(rec));
            if (lowest_rpos > cell->rpos)
                lowest_rpos = cell->rpos;
        }

        if (buf->n >= limit) {
            if (!mapped || core->pos > lowest_rpos)
                break;
            limit += BLOCK_SIZE;
        }
    }

    if (status >= 0 && buf->x < 0)
        buf->x = buf->n;

    bam_destroy1(rec);
    return buf->n;
}
/*
 * Count the records in the BAM file at `path`.
 * Returns 0 when the file cannot be opened.
 */
static int _count_reads(char *path)
{
    samfile_t *fp = b2g_samfile_open(path, "rb", 0);
    bam1_t *rec;
    int total;

    if (!fp)
        return 0;

    rec = bam_init1();
    for (total = 0; samread(fp, rec) > -1; total++)
        ;

    bam_destroy1(rec);
    samclose(fp);
    return total;
}
// Read the next alignment from the stream.
//
// Returns a newly allocated record in every case (the caller owns it and must
// release it); when nothing could be read the record is left as initialised
// and end_of_file is set.
//
// Any negative samread() status ends the stream: besides the regular
// end-of-file code (-1) this covers truncated/corrupt input, which previously
// never set end_of_file.
//
// Throws SAM_IO_Error(file_not_opened) if no file has been opened.
bam1_t * SAM_istream::read() throw (SAM_IO_Error)
{
    if (sam_file == NULL) {
        throw SAM_IO_Error(SAM_IO_Error::file_not_opened,
                           "tried to read from a not previously opened file");
    }

    bam1_t * b = bam_init1();
    const int bytes = samread(sam_file, b);
    if (bytes < 0)
        end_of_file = true;
    return b;
}
void test_cigar_to_spans() { char *sam_filename = "testdata/RUM.sam"; samfile_t *samfile = samopen(sam_filename, "r", NULL); bam1_t *rec = bam_init1(); struct SpanAssertion { int read_num; int num_spans; struct Span spans[10]; }; struct SpanAssertion cases[] = { { 102, 1, { { 12465667, 12465724 } } }, { 104, 1, { { 2095233, 2095289 } } }, { 128, 1, { { 152316, 152373 } } }, { 162, 1, { { 14232813, 14232886 } } }, { 172, 2, { { 3619619, 3619627 }, { 3619984, 3620048 } } }, { 642, 1, { { 15291546, 15291622 } } }, { 670, 2, { { 3950665, 3950724 }, { 3951436, 3951453 } } } }; int num_cases = sizeof(cases) / sizeof(struct SpanAssertion); int read_num = 0; int case_num = 0; CigarCursor curs; while (case_num < num_cases && samread(samfile, rec) > 0) { if (cases[case_num].read_num == read_num) { int num_spans = cases[case_num].num_spans; Span *span; init_cigar_cursor(&curs, rec); for (span = cases[case_num].spans; span < cases[case_num].spans + num_spans; span++) { assert_equals(1, next_span(&curs), "Should have found a span"); assert_equals(span->start, curs.start, "Start"); assert_equals(span->end, curs.end, "End"); } assert_equals(0, next_span(&curs), "No more spans"); case_num++; } read_num++; } if (case_num < num_cases) assert_equals(0, 1, "Ran out of records in sam file"); }
hash_table* hash_ids(const char* fn) { fprintf(stderr, "hashing ... \n"); hash_table* T = create_hash_table(); samfile_t* f = samopen(fn, "rb", NULL); if (f == NULL) { fprintf(stderr, "can't open bam file %s\n", fn); exit(1); } bam1_t* b = bam_init1(); uint32_t n = 0; char* qname = NULL; size_t qname_size = 0; while (samread(f, b) >= 0) { if (++n % 1000000 == 0) { fprintf(stderr, "\t%d reads\n", n); } if (qname_size < b->core.l_qname + 3) { qname_size = b->core.l_qname + 3; qname = realloc(qname, qname_size); } memcpy(qname, bam1_qname(b), b->core.l_qname); if (b->core.flag & BAM_FREAD2) { qname[b->core.l_qname] = '/'; qname[b->core.l_qname + 1] = '2'; qname[b->core.l_qname + 2] = '\0'; } else { qname[b->core.l_qname] = '/'; qname[b->core.l_qname + 1] = '1'; qname[b->core.l_qname + 2] = '\0'; } inc_hash_table(T, qname, b->core.l_qname + 2); } free(qname); bam_destroy1(b); samclose(f); fprintf(stderr, "done.\n"); return T; }
/*
 * showbam: pass every record of the BAM file named by argv[1] to fetch_func().
 * Returns 1 if no file name was given or the file cannot be opened, else 0.
 */
int main(int argc, char *argv[])
{
    samfile_t *fp;
    bam1_t *b;

    if (argc < 2) {
        fprintf(stderr, "Usage: showbam <in.bam>\n");
        return 1;
    }

    if ((fp = samopen(argv[1], "rb", 0)) == 0) {
        fprintf(stderr, "showbam: Fail to open BAM file %s\n", argv[1]);
        return 1;
    }

    b = bam_init1();
    while (samread(fp, b) >= 0)
        fetch_func(b);
    bam_destroy1(b);

    samclose(fp);
    return 0;
}
void edwSamRepeatAnalysis(char *inSam, char *outRa) /* edwSamRepeatAnalysis - Analyze result of alignment vs. RepeatMasker type libraries.. */ { /* Go through sam file, filling in hiLevelHash with count of each hi level repeat class we see. */ struct hash *hiLevelHash = hashNew(0); samfile_t *sf = samopen(inSam, "r", NULL); bam_header_t *bamHeader = sf->header; bam1_t one; ZeroVar(&one); int err; long long hit = 0, miss = 0; while ((err = samread(sf, &one)) >= 0) { int32_t tid = one.core.tid; if (tid < 0) { ++miss; continue; } ++hit; /* Parse out hiLevel classification from target, which is something like 7SLRNA#SINE/Alu * from which we'd want to extract SINE. The '/' is not present in all input. */ char *target = bamHeader->target_name[tid]; char *hashPos = strchr(target, '#'); if (hashPos == NULL) errAbort("# not found in target %s", target); char *hiLevel = cloneString(hashPos + 1); char *slashPos = strchr(hiLevel, '/'); if (slashPos != NULL) *slashPos = 0; hashIncInt(hiLevelHash, hiLevel); } samclose(sf); /* Output some basic stats as well as contents of hash */ FILE *f = mustOpen(outRa, "w"); double invTotal = 1.0 / (hit + miss); double mapRatio = (double)hit * invTotal; struct hashEl *hel, *helList = hashElListHash(hiLevelHash); slSort(&helList, hashElCmp); for (hel = helList; hel != NULL; hel = hel->next) { double hitRatio = ptToInt(hel->val) * invTotal; fprintf(f, "%s %g\n", hel->name, hitRatio); } fprintf(f, "total %g\n", mapRatio); carefulClose(&f); }
/*
 * Run a pileup over every record of fp.
 *
 * mask      - flag mask handed to bam_plbuf_set_mask()
 * func      - pileup callback
 * func_data - opaque pointer forwarded to the callback
 *
 * Always returns 0.
 */
int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
{
    bam1_t *aln = bam_init1();
    bam_plbuf_t *pileup = bam_plbuf_init(func, func_data);

    bam_plbuf_set_mask(pileup, mask);

    for (;;) {
        if (samread(fp, aln) < 0)
            break;
        bam_plbuf_push(aln, pileup);
    }

    /* A null record tells the buffer that the input is exhausted. */
    bam_plbuf_push(0, pileup);

    bam_plbuf_destroy(pileup);
    bam_destroy1(aln);
    return 0;
}
void test_cigar_to_spans2() { char *sam_filename = "testdata/cigar_bug.sam"; samfile_t *samfile = samopen(sam_filename, "r", NULL); bam1_t *rec = bam_init1(); struct SpanAssertion { int read_num; int num_spans; struct Span spans[10]; }; struct SpanAssertion cases[] = { { 0, 2, { { 46383142, 46383163 }, { 46384677, 46384749 } } } }; int num_cases = sizeof(cases) / sizeof(struct SpanAssertion); int read_num = 0; int case_num = 0; CigarCursor curs; while (case_num < num_cases && samread(samfile, rec) > 0) { if (cases[case_num].read_num == read_num) { int num_spans = cases[case_num].num_spans; Span *span; init_cigar_cursor(&curs, rec); for (span = cases[case_num].spans; span < cases[case_num].spans + num_spans; span++) { if (next_span(&curs)) { assert_equals(span->start, curs.start, "Start"); assert_equals(span->end, curs.end, "End"); } } assert_equals(0, next_span(&curs), "No more spans"); case_num++; } read_num++; } }
/*
 * Advance s_iter to the next alignment.
 *
 * The iterator's alignment object is created on first use and reused for
 * every subsequent read.  On success (samread() > 0) *s_alignment points at
 * that object; otherwise it is set to NULL.  The samread() status is returned
 * unchanged in both cases.
 */
int gt_samfile_iterator_next(GtSamfileIterator *s_iter, GtSamAlignment **s_alignment)
{
    int status;

    if (!s_iter->current_alignment)
        s_iter->current_alignment = gt_sam_alignment_new(s_iter->alphabet);

    status = samread(s_iter->samfile, s_iter->current_alignment->s_alignment);
    *s_alignment = (status > 0) ? s_iter->current_alignment : NULL;
    return status;
}
int main(int argc, char* argv[]) { samfile_t *ifile = NULL, *ofile = NULL; bam1_t *read = bam_init1(); int keep = 0; char *p = NULL; //Open input file, either SAM or BAM p = strrchr(argv[1], '.'); if(strcmp(p, ".bam") == 0) { ifile = samopen(argv[1], "rb", NULL); } else { ifile = samopen(argv[1], "r", NULL); } bam_header_t *head = ifile->header; //Open output file // ofile = samopen("AND_type.bam", "wb", ifile->header); ofile = samopen(argv[2], "wb", ifile->header); //Iterate through the lines while(samread(ifile, read) > 1) { keep = 0; //Is the read's mate on the same chromosome/contig? if(read->core.tid == read->core.mtid) { //Are the mates on opposite strands? if(read->core.flag & BAM_FREVERSE && !(read->core.flag & BAM_FMREVERSE)) { if(read->core.pos < read->core.mpos) { // Are mates 500 bp or less from the ends? if (read-> core.pos <= 500 && read->core.mpos > head->target_len[read->core.tid] - 500) keep=1; } } else if(!(read->core.flag & BAM_FREVERSE) && read->core.flag & BAM_FMREVERSE) { if(read->core.mpos < read->core.pos) { if (read-> core.mpos <= 500 && read->core.pos > head->target_len[read->core.tid] - 500) keep=1; } } } if(keep) samwrite(ofile, read); } bam_destroy1(read); samclose(ifile); samclose(ofile); return 0; }
/*
 * Distribute the records of fin over the output files in foutList.
 *
 * A record with tid > -1 goes to foutList[tid]; every other record goes to
 * the extra slot foutList[fin->header->n_targets].
 *
 * Returns the number of records written when reading ended with status -1,
 * and the negated count when it ended with any other negative status.
 */
int _walk_through_sam_and_split(samfile_t * fin, samfile_t **foutList)
{
    bam1_t *rec = bam_init1();
    int written = 0;
    int status;

    for (status = samread(fin, rec); status >= 0; status = samread(fin, rec)) {
        int slot = (rec->core.tid > -1) ? rec->core.tid : fin->header->n_targets;

        samwrite(foutList[slot], rec);
        written++;
    }

    bam_destroy1(rec);

    if (status >= -1)
        return written;
    return -1 * written;
}
static void scanSam(char *samIn, FILE *f, struct genomeRangeTree *grt, long long *retHit, long long *retMiss, long long *retTotalBasesInHits) /* Scan through sam file doing several things:counting how many reads hit and how many * miss target during mapping phase, copying those that hit to a little bed file, and * also defining regions covered in a genomeRangeTree. */ { samfile_t *sf = samopen(samIn, "r", NULL); bam_header_t *bamHeader = sf->header; bam1_t one; ZeroVar(&one); int err; long long hit = 0, miss = 0, totalBasesInHits = 0; while ((err = samread(sf, &one)) >= 0) { int32_t tid = one.core.tid; if (tid < 0) { ++miss; continue; } ++hit; char *chrom = bamHeader->target_name[tid]; // Approximate here... can do better if parse cigar. int start = one.core.pos; int size = one.core.l_qseq; int end = start + size; totalBasesInHits += size; boolean isRc = (one.core.flag & BAM_FREVERSE); char strand = '+'; if (isRc) { strand = '-'; reverseIntRange(&start, &end, bamHeader->target_len[tid]); } if (start < 0) start=0; if (f != NULL) fprintf(f, "%s\t%d\t%d\t.\t0\t%c\n", chrom, start, end, strand); genomeRangeTreeAdd(grt, chrom, start, end); } if (err < 0 && err != -1) errnoAbort("samread err %d", err); samclose(sf); *retHit = hit; *retMiss = miss; *retTotalBasesInHits = totalBasesInHits; }
/*
 * Read the next record from samio into sam.
 *
 * Any record previously held in sam->b is destroyed and replaced by a fresh
 * one.  When samio->bam_end_vfo is positive and the current BGZF offset has
 * reached it, reading stops.  On success the name, the decoded bases and the
 * character-encoded qualities are copied into sam, and the result is
 * reverse-complemented for records flagged BAM_FREVERSE.
 *
 * Returns 1 when a record was read, -1 otherwise.
 */
int32_t tmap_sam_io_read(tmap_sam_io_t *samio, tmap_sam_t *sam)
{
  char *qname;
  int32_t idx, n;

  if(NULL != sam->b) {
      bam_destroy1(sam->b);
  }
  sam->b = bam_init1();

  /* honour the optional end virtual file offset */
  if(samio->bam_end_vfo > 0) {
      BGZF* bgzf_fp = samio->fp->x.bam;
      if(bam_tell(bgzf_fp) >= samio->bam_end_vfo) {
          fprintf(stderr, "stopping at bam virtual file offset %lu\n", samio->bam_end_vfo);
          return -1;
      }
  }

  if(samread(samio->fp, sam->b) <= 0) {
      return -1;
  }

  /* read name */
  qname = bam1_qname(sam->b);
  n = strlen(qname);
  tmap_sam_io_update_string(&sam->name, qname, n);
  sam->name->s[n] = '\0';

  /* bases and qualities: qualities are copied raw, then converted in place */
  n = sam->b->core.l_qseq;
  tmap_sam_io_update_string(&sam->seq, NULL, n);
  tmap_sam_io_update_string(&sam->qual, (char*)bam1_qual(sam->b), n);
  for(idx = 0; idx < n; idx++) {
      sam->seq->s[idx] = bam_nt16_rev_table[bam1_seqi(bam1_seq(sam->b), idx)];
      sam->qual->s[idx] = QUAL2CHAR(sam->qual->s[idx]);
  }
  sam->qual->s[n] = '\0';
  sam->seq->s[n] = '\0';

  /* records on the reverse strand are stored reverse-complemented */
  if((sam->b->core.flag & BAM_FREVERSE)) {
      tmap_sam_reverse_compliment(sam);
  }

  return 1;
}
/*
 * Run a pileup over every record of fp, hiding filtered records.
 *
 * mask      - flags that exclude a record; a negative value selects
 *             BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, any other
 *             value has BAM_FUNMAP added to it
 * func      - pileup callback
 * func_data - opaque pointer forwarded to the callback
 *
 * Records matching the mask are marked BAM_FUNMAP before being pushed.
 * Always returns 0.
 */
int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
{
    bam1_t *aln = bam_init1();
    bam_plbuf_t *pileup = bam_plbuf_init(func, func_data);

    mask = (mask < 0)
         ? (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
         : (mask | BAM_FUNMAP);

    for (;;) {
        if (samread(fp, aln) < 0)
            break;
        /* the pileup buffer itself only drops unmapped reads, so excluded
         * records are flagged as unmapped here */
        if (aln->core.flag & mask)
            aln->core.flag |= BAM_FUNMAP;
        bam_plbuf_push(aln, pileup);
    }

    /* A null record tells the buffer that the input is exhausted. */
    bam_plbuf_push(0, pileup);

    bam_plbuf_destroy(pileup);
    bam_destroy1(aln);
    return 0;
}
/*
 * Call func(b, in, out, data) for every alignment of BAM file ifn, or only
 * for those overlapping region reg when reg is not NULL (requires an index).
 *
 * ifn  - input BAM path
 * ofn  - optional output BAM path; when given it is opened with the input
 *        header and passed to func as `out`
 * reg  - optional region string
 * data - opaque pointer forwarded to func
 *
 * Exits the process if a file cannot be opened, the index is missing, the
 * region names an unknown reference, or reading ends with a status other
 * than -1.  Returns the final read status (-1).
 */
int sam_fetch(char *ifn, char *ofn, char *reg, void *data, sam_fetch_f func)
{
    int ret = 0;
    samfile_t *in = samopen(ifn, "rb", 0);
    samfile_t *out = 0;

    if (in == 0) {
        fprintf(stderr, "[%s:%d] Failed to open \"%s\" for reading.\n", __func__, __LINE__, ifn);
        exit(1);
    }
    if (ofn) {
        out = samopen(ofn, "wb", in->header);
        if (out == 0) {
            fprintf(stderr, "[%s:%d] Failed to open \"%s\" for writing.\n", __func__, __LINE__, ofn);
            exit(1);
        }
    }

    if (reg) {
        bam_index_t *idx = bam_index_load(ifn);
        if (idx == 0) {
            fprintf(stderr, "[%s:%d] Random alignment retrieval only works for indexed BAM files.\n",
                    __func__, __LINE__);
            exit(1);
        }

        int tid, beg, end;
        bam_parse_region(in->header, reg, &tid, &beg, &end);
        if (tid < 0) {
            fprintf(stderr, "[%s:%d] Region \"%s\" specifies an unknown reference name. \n",
                    __func__, __LINE__, reg);
            exit(1);
        }

        bam_iter_t iter;
        bam1_t *b = bam_init1();
        iter = bam_iter_query(idx, tid, beg, end);
        while ((ret = bam_iter_read(in->x.bam, iter, b)) >= 0)
            func(b, in, out, data);
        bam_iter_destroy(iter);
        bam_destroy1(b);
        bam_index_destroy(idx);
    } else {
        bam1_t *b = bam_init1();
        while ((ret = samread(in, b)) >= 0)
            func(b, in, out, data);
        bam_destroy1(b);
    }

    if (out)
        samclose(out);
    samclose(in);

    if (ret != -1) { /* truncated is -2 */
        fprintf(stderr, "[%s:%d] Alignment retrieval failed due to truncated file\n",
                __func__, __LINE__);
        exit(1);
    }
    return ret;
}
void build_wiggles(const std::string& bam_filename, WiggleProcessor& processor) { samfile_t *bam_in = samopen(bam_filename.c_str(), "r", NULL); general_assert(bam_in != NULL, "Cannot open " + bam_filename + "!"); bam_hdr_t *header = bam_in->header; bool *used = new bool[header->n_targets]; memset(used, 0, sizeof(bool) * header->n_targets); int cur_tid = -1; //current tid; HIT_INT_TYPE cnt = 0; bam1_t *b = bam_init1(); Wiggle wiggle; while (samread(bam_in, b) >= 0) { if (bam_is_unmapped(b)) continue; if (b->core.tid != cur_tid) { if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); } cur_tid = b->core.tid; wiggle.name = header->target_name[cur_tid]; wiggle.length = header->target_len[cur_tid]; wiggle.read_depth.assign(wiggle.length, 0.0); } add_bam_record_to_wiggle(b, wiggle); ++cnt; if (cnt % 1000000 == 0) std::cout<< cnt<< std::endl; } if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); } for (int32_t i = 0; i < header->n_targets; i++) if (!used[i]) { wiggle.name = header->target_name[i]; wiggle.length = header->target_len[i]; wiggle.read_depth.clear(); processor.process(wiggle); } bam_destroy1(b); samclose(bam_in); delete[] used; }
// Advance to the next record.
//
// Reads sequentially with samread() when no region iterator is set, otherwise
// through bam_iter_read().  Updates _is_record_set and, on success, the
// record counter.  Returns false when no file is open or nothing was read.
bool
bam_streamer::
next()
{
    if (NULL == _bfp)
        return false;

    const int status = (NULL == _biter)
                     ? samread(_bfp, _brec._bp)
                     : bam_iter_read(_bfp->x.bam, _biter, _brec._bp);

    _is_record_set = (status >= 0);
    if (! _is_record_set)
        return _is_record_set;

    _record_no++;
    return _is_record_set;
}
// Advance to the next record.
//
// Reads sequentially with samread() when no region iterator is set, otherwise
// through sam_itr_next().  Updates _is_record_set and, on success, the record
// counter.  Returns false when no file is open or nothing was read.
bool
bam_streamer::
next()
{
    if (nullptr == _bfp)
        return false;

    const int status = (nullptr == _hitr)
                     ? samread(_bfp, _brec._bp)
                     : sam_itr_next(_bfp->file, _hitr, _brec._bp);

    _is_record_set = (status >= 0);
    if (! _is_record_set)
        return _is_record_set;

    _record_no++;
    return _is_record_set;
}
int add_dindel(const char *bam_in, const char *bam_out, const char *ref) { data_t_dindel tmp; int count = 0; bam1_t *b = NULL; if ((tmp.in = samopen(bam_in, "rb", 0)) == 0) { LOG_FATAL("Failed to open BAM file %s\n", bam_in); return 1; } if ((tmp.fai = fai_load(ref)) == 0) { LOG_FATAL("Failed to open reference file %s\n", ref); return 1; } /*warn_old_fai(ref);*/ if (!bam_out || bam_out[0] == '-') { tmp.out = bam_dopen(fileno(stdout), "w"); } else { tmp.out = bam_open(bam_out, "w"); } bam_header_write(tmp.out, tmp.in->header); b = bam_init1(); tmp.tid = -1; tmp.hpcount = 0; tmp.rlen = 0; while (samread(tmp.in, b) >= 0) { count++; dindel_fetch_func(b, &tmp); } bam_destroy1(b); if (tmp.hpcount) free(tmp.hpcount); samclose(tmp.in); bam_close(tmp.out); fai_destroy(tmp.fai); LOG_VERBOSE("Processed %d reads\n", count); return 0; }
/*
 * Write to stdout, as SAM, the header text of BAM file fn followed by every
 * record whose query name (key length core.l_qname) has the value 1 in T.
 *
 * Progress is reported on stderr every million reads.  Exits the process if
 * the input or stdout cannot be opened.
 */
void filter_by_id(const char* fn, hash_table* T)
{
    fprintf(stderr, "filtering ... \n");

    samfile_t* fin = samopen(fn, "rb", NULL);
    if (fin == NULL) {
        fprintf(stderr, "can't open bam file %s\n", fn);
        exit(1);
    }

    samfile_t* fout = samopen("-", "w", (void*)fin->header);
    if (fout == NULL) {
        fprintf(stderr, "can't open stdout, for some reason.\n");
        exit(1);
    }

    /* The header text is emitted explicitly before any record. */
    fputs(fin->header->text, stdout);

    bam1_t* b = bam_init1();
    uint32_t n = 0;

    while (samread(fin, b) >= 0) {
        if (++n % 1000000 == 0) {
            /* n is unsigned: print it with a matching conversion. */
            fprintf(stderr, "\t%u reads\n", (unsigned)n);
        }
        if (get_hash_table(T, bam1_qname(b), b->core.l_qname) == 1) {
            samwrite(fout, b);
        }
    }

    bam_destroy1(b);
    samclose(fout);
    samclose(fin);

    fprintf(stderr, "done.\n");
}
int main() { samfile_t *in = 0, *out = 0; int slx2sngr = 0; char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0; strcpy(in_mode, "r"); strcpy(out_mode, "w"); strcat(in_mode, "b"); printf("start******************************************************\n"); char *filename = "/ifs1/RD/shaohaojing/DAI/1000306/1000306_HUMgqbRLJDIAAPE_091015_I58_FC42UC6AAXX_L2_HUMgqbRLJDIAAPE_1.fq.bam.sort.bam"; if ((in = samopen(filename, in_mode, fn_list)) == 0) { fprintf(stderr, "[main_samview] fail to open file for reading.\n"); exit(0); } if (in->header == 0) { fprintf(stderr, "[main_samview] fail to read the header.\n"); exit(0); } if ((out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { fprintf(stderr, "[main_samview] fail to open file for writing.\n"); exit(0); } bam1_t *b = bam_init1(); int r; char *s; while((r = samread(in, b)) >= 0) { if (!__g_skip_aln(in->header, b)) { s = bam_format1_core(out->header, b, out->type>>2&3); } printf("%s\n", s); free(s); // printf("%s", b->data); // printf("\n++++++++++%d+++++++++++", r); // printf("********************\n"); }
/** * DATE: 2010-7-29 * FUNCTION: read a line from sam/bam file. * PARAMETER: line: the read data will stored in this varible. * RETURN: the line size when read successful. -1 when read failed */ int SamCtrl::readline(std::string &line) { if (m_in == 0) { return -1; } int ret = 0; // begin to read while ((ret = samread(m_in, m_b)) >= 0) { // when read failed continue if (__g_skip_aln(m_in->header, m_b)) { continue; } m_s = bam_format1_core(m_out->header, m_b, m_out->type>>2&3); // read the buffer line = m_s; // store into the line free(m_s); return line.size(); } return -1; }
int add_uniform(const char *bam_in, const char *bam_out, const int ins_qual, const int del_qual) { data_t_uniform tmp; uint8_t iq = ENCODE_Q(ins_qual+33); uint8_t dq = ENCODE_Q(del_qual+33); bam1_t *b = NULL; int count = 0; if ((tmp.in = samopen(bam_in, "rb", 0)) == 0) { LOG_FATAL("Failed to open BAM file %s\n", bam_in); return 1; } tmp.iq = iq; tmp.dq = dq; if (!bam_out || bam_out[0] == '-') { tmp.out = bam_dopen(fileno(stdout), "w"); } else { tmp.out = bam_open(bam_out, "w"); } bam_header_write(tmp.out, tmp.in->header); b = bam_init1(); while (samread(tmp.in, b) >= 0) { count++; uniform_fetch_func(b, &tmp); } bam_destroy1(b); samclose(tmp.in); bam_close(tmp.out); LOG_VERBOSE("Processed %d reads\n", count); return 0; }
/*
 * Reviewer notes for seq_density() -- documentation only, the code below is unchanged.
 *
 * Purpose: reads records from bam_file one at a time, adds their coverage to the score
 * array cptr, and records the start/end coordinates of covered data blocks in
 * databl_start/databl_end, until a record on a different chromosome is reached or the
 * input ends.
 *
 * State kept between calls: `fr` is function-static.  It holds the first read of the
 * next chromosome (filled by store_read() just before returning with file_status 1) and
 * is flushed into cptr on the following call while fr.written is 0.  fr.cigar is
 * allocated when *first_call is non-zero.
 *
 * Values assigned to bresults.file_status in this function:
 *    1   a record of another chromosome was read, or this was the first call; the
 *        record is stored in `fr` and chrom_index_next is set
 *    0   end of input after at least one earlier call; block end markers are written
 *  -10   end of input during the first call
 *  < -1  samread() status passed through after the "File truncated!" warning
 *   -5   current position is smaller than the previous one (input not sorted)
 *   -4   out-of-bounds read when `pedantic` is 1; also the value set after the loop
 *  rm.skip when quality_check() leaves a negative rm.skip
 *
 * A new bam1_t is created with bam_init1() at the top of every loop iteration and
 * destroyed at SKIP_READ or on the out-of-bounds `continue`.
 *
 * NOTE(review): the SKIP_READ check tests `!first_call` (the pointer) rather than
 *   `!*first_call`; with a non-NULL pointer the condition is always false, so
 *   error("Index overflow!") can never fire here -- confirm the intent before changing
 *   it, since nindex may legitimately reach INDEXLIMIT.
 * NOTE(review): the returns for end-of-input on the first call, for rm.skip < 0 and for
 *   the `pedantic` branch do not visibly release current_read; whether store_read() or
 *   free_samio() take ownership of the record cannot be seen from this function -- verify.
 */
/** * @brief Major file/read parsing function * * @param cptr Pointer to calloc initialized scores. Uses samtools API as opposed to seq_density -> faster * @param databl_start Pointer to genomic start positions of data blocks * @param databl_end Pointer to genomic end positions of data blocks * @param user_args Structure holding all user input * @param cs Pointer to chromosome_size structure holding chromosome dimensions * @param bam_file Pointer to samfile_t structure holding the file handler * @param first_call Is this the first call to this function? * @return Structure seq_block with results of the read scanning such as mapmass and file status * @details Major function looping through every read in file until a new chromosome starts. Updates cptr scores and returns statistics on reads * @note Very long. Should be refactored? * @todo */ seq_block_t seq_density(usersize *cptr, uint32_t *databl_start, uint32_t *databl_end, user_arguments_t *user_args, chromosome_size_t *cs, samfile_t *bam_file, int *first_call) /*reads all reads from bam until a new chromosome is reached and sets first read infos stored in the struct fr. * returns a flag about the parse status and writes the scores to cptr*/ { int abs_gen_end; uint32_t nindex=0,*ind_start,*ind_end,max_index=cs->min_indexspace; usersize * cptr_beg=cptr; seq_block_t bresults={0}; read_metrics_t rm={0}; static buffered_read_t fr={0}; if(*first_call)fr.cigar=(uint32_t *)Calloc(MAX_NCIGAR,uint32_t); int lpos=fr.pos,last_end=fr.pos+fr.l_seq; uint32_t BUFFERLIMIT=cs->max_pos+1;//there are no more than this many items calloc'ed uint32_t INDEXLIMIT=cs->min_indexspace; ind_start=databl_start;ind_end=databl_end;//note the start of pointers while(1){//heavy main loop will go through this n=[amount of reads] times! 
bam1_t * current_read=bam_init1(); /* ######### SCAN BLOCK ################## */ bresults.file_status=samread(bam_file,current_read); if (bresults.file_status == -1){// EOF if (*first_call){// EOF warning("No compatible read found for this settings!\n"); if(!bresults.paired && user_args->PAIRED)warning("No proper pair [flag 2] found in this file. \nSet 'paired_only' to FALSE\n"); bresults.file_status = -10; return bresults; } #if verbose==1 printf("EOF detected -> %d read(s) screened!\n",bresults.total_reads); #endif cs->max_pos=last_end;//set maximal absolut position *(databl_end++)=cs->max_pos;//completes the missing end entry to be on par with start cs->min_scorespace+=*(databl_end-1)-*(databl_start-1)+1; cs->min_indexspace=nindex; *databl_end=0;*databl_start=0;//end flag databl_start=ind_start;databl_end=ind_end;//reset pointer to the beginning free_samio(&fr,current_read); bresults.file_status = 0; return bresults; }else if(bresults.file_status<-1){ warning("File truncated!\n"); if (!*first_call)free_samio(&fr,current_read); return bresults; // truncated } /* MAIN QUALITY CHECKPOINT */ quality_check(&rm,current_read,user_args,&bresults,lpos); #if verbose==4 print_readinfo(&bresults,current_read,&rm,bam_file); #endif if(rm.skip<0){ bresults.file_status = rm.skip; return bresults; }else if(rm.skip)goto SKIP_READ; abs_gen_end=rm.genomic_end+user_args->EXTEND; /* ######### END SCAN ######################*/ /*################ BOUNDARY CHECK ######################*/ if(fr.chrom_index!=current_read->core.tid || *first_call){//Did we reach the end of the chromosome? If yes save data and return! 
if (!*first_call){ cs->max_pos=last_end;//set maximal absolut position *databl_end++=cs->max_pos;//completes the missing end entry to be on par with start cs->min_scorespace+=*(databl_end-1)-*(databl_start-1)+1; cs->min_indexspace=nindex; *databl_end=0;*databl_start=0;//end flag databl_start=ind_start;databl_end=ind_end;//reset pointer to the beginning } *first_call=0; bresults.file_status=1; bresults.chrom_index_next=current_read->core.tid; store_read(&fr,current_read,&rm); return bresults; } else if(lpos>current_read->core.pos){ bresults.file_status=-5;//something wrong with the positions warning("Last position>current position. File doesn't seem to be sorted!\n"); free_samio(&fr,current_read); return bresults; } else if(BUFFERLIMIT<abs_gen_end || abs_gen_end < 0){ //skip read if sequence out of bounce //possibly bad header with wrong chromosome margins or EXTEND too large! warning("BUFFER only %d\n But POS: %d cur_seq_len: %d EXTEND: %d -> %d \n GLOBAL %d\n", BUFFERLIMIT,current_read->core.pos,rm.read_length,user_args->EXTEND, current_read->core.pos+rm.read_length+user_args->EXTEND,abs_gen_end); #if pedantic==1 bresults.file_status=-4; return bresults; #endif bam_destroy1(current_read); continue; } /*##################################### WRITE ###############*/ cptr+=current_read->core.pos;//align pointer to current position if(user_args->READTHROUGH){ write_density_ungapped(cptr,rm.read_length,&bresults.maxScore); }else{ write_density_gapped(cptr,bam1_cigar(current_read),current_read->core.n_cigar,&bresults.maxScore);//minor speed panelty to ungapped } if(user_args->EXTEND>0){ if(rm.revcomp){ abs_gen_end-=user_args->EXTEND; if(cptr-user_args->EXTEND>cptr_beg)cptr-=user_args->EXTEND; else goto NOEXTEND; }else{ cptr=cptr_beg; cptr+=rm.genomic_end; } int k=0; for(;k<user_args->EXTEND;++k){++*cptr;++cptr;} } NOEXTEND: cptr=cptr_beg;//jump back to the beginning of the chromosome using the helper if(!fr.written){//flush the first read of the chromosome stored in the 
struct fr cptr+=fr.pos; if(user_args->READTHROUGH){write_density_ungapped(cptr,fr.tlen,&bresults.maxScore); }else{write_density_gapped(cptr,fr.cigar,fr.n_cigar,&bresults.maxScore);} if(user_args->EXTEND>0){ if(fr.revcomp){ if(cptr-user_args->EXTEND>cptr_beg)cptr-=user_args->EXTEND; else goto FRNOEXTEND; }else{ cptr=cptr_beg; cptr+=fr.genomic_end; } int k=0; for(;k<user_args->EXTEND;++k){++*cptr;++cptr;} } FRNOEXTEND: cptr=cptr_beg;//jump back to the beginning of the chromosome using the helper *databl_start++=fr.pos;++nindex;//genomic coordinate where the data block starts cs->min_pos=fr.pos; cs->min_scorespace=0;//will only be set based on the index cs->min_indexspace=nindex; cs->max_pos=abs_gen_end; lpos=fr.pos; last_end=fr.genomic_end;//load last read info of first read fr.written=1;//indicate that information has been used } /*################ INDEXING ######################*/ cptr=cptr_beg;//jump back to the beginning of the chromosome using the helper if((current_read->core.pos-last_end)>=user_args->COMPRESSION){//check whether there was a large block without any data -> Triggers a jump on the sequence! nindex++;//add one index if(max_index<=nindex){//check whether we are already over the allocated index space printf("Index space found: %d > Index space allocated: %d !!",nindex,max_index); error("Error in indexing allocation detected!"); } *databl_end++=last_end;//genomic coordinate where the data block ends | lags one index position behind start in the main loop! cs->min_scorespace+=*(databl_end-1)-*(databl_start-1)+1;//+1 because start=end is still one Bp! *databl_start++=current_read->core.pos;//genomic coordinate where the data block starts beginning with the current read } lpos=current_read->core.pos; last_end= user_args->READTHROUGH ? 
max(current_read->core.pos+rm.read_length,last_end) : max(abs_gen_end,last_end); SKIP_READ: if(nindex>=INDEXLIMIT && !first_call)error("Index overflow!\n"); bam_destroy1(current_read); }//end of bam index parsing skip tag section bresults.file_status=-4; return bresults;//can never happen }
void mapper( char *ref, int length, int start_base_pos, const char *bam ) { anal_t input; gzFile pRef; kseq_t * seq = NULL; char chr[8] = { 0, }; int ret; bam_plbuf_t *buf; bam1_t *b; /* fprintf( stderr, "ref: %s\n", ref ); fprintf( stderr, "length: %d\n", length ); fprintf( stderr, "start_base_pos: %d\n", start_base_pos ); fprintf( stderr, "bam: %s\n", bam ); */ input.beg = 0; input.end = 0x7fffffff; input.in = samopen(bam, "rb", 0); if (input.in == 0) { fprintf(stderr, "Fail to open BAM file %s\n", bam); return; } pRef = gzopen( ref, "r" ); fprintf( stderr, "ref : %s\n", ref ); fprintf( stderr, "pRef: %p\n", pRef ); if( pRef == NULL ) { fprintf( stderr, "ref : %s\n", ref ); fprintf( stderr, "pRef: %p\n", pRef ); return; } seq = kseq_init( pRef ); b = bam_init1(); // alloc memory size of bam1_t //fprintf( stderr, "%\pn", b ); buf = bam_plbuf_init(pileup_func, &input); // alloc memory bam_plbuf_set_mask(buf, -1); while ((ret = samread( input.in, b)) >= 0) { bam_plbuf_push(b, buf); //fprintf( stderr, "%x\n", b->core.flag ); if( b->core.flag & 0x0004 ) // unmapped { // do nothing /* qname1 = strtok(bam1_qname(b), ":\t\n "); qname2 = strtok(NULL, ":\t\n "); qname3 = atoi(qname2); fprintf( stderr, "%s:%10d:%s:%d\t%c:%d:%d:%d\n", qname1, qname3, "*", b->core.pos, '*', b->core.flag, b->core.qual, ret ); */ fprintf( stdout, "%s:%s:%d\t%c:0x%x:%d:%d\n", bam1_qname(b), "*", b->core.pos+1, '*', b->core.flag, b->core.qual, ret ); /* fprintf( stderr, "%s:%s:%d\t%c:0x%x:%d:%d\n", bam1_qname(b), "*", b->core.pos, '*', b->core.flag, b->core.qual, ret ); */ } else { // to find a base in the reference genome, seq. 
if( ( seq != NULL ) && ( strcmp( input.in->header->target_name[b->core.tid], chr ) == 0 ) ) { // already found that // fprintf( stderr, "found : %s\n", chr ); }else { if( find_chr(input.in->header->target_name[b->core.tid], seq, chr) < 0 ) { fprintf( stderr, "ERROR : cannot find chromosome %s\n", \ input.in->header->target_name[b->core.tid] ); }else { fprintf( stderr, "FOUND CHR : %s\n", chr ); } } // remove not aligned to the chromosome fprintf( stdout, "%s:%s:%d\t%c:%d:%d:%d\n", bam1_qname(b), input.in->header->target_name[b->core.tid], b->core.pos+1, seq->seq.s[b->core.pos], b->core.flag, b->core.qual, ret ); /* fprintf( stderr, "%s:%s:%d\t%c:%d:%d:%d\n", bam1_qname(b), input.in->header->target_name[b->core.tid], b->core.pos, seq->seq.s[b->core.pos], b->core.flag, b->core.qual, ret ); */ } } // for the last bases... // printf("pos:%d(%c), flag:%d qual: %d(ret %d)\n", // b->core.pos+1, seq->seq.s[b->core.pos], b->core.flag, b->core.qual, ret ); bam_plbuf_push(0, buf); bam_plbuf_destroy(buf); // release memory bam_destroy1(b); // release memory size of bam1_t samclose(input.in); kseq_destroy( seq ); gzclose( pRef ); return; }