boolean bamFileExists(char *fileOrUrl) /* Return TRUE if we can successfully open the bam file and its index file. */ { char *bamFileName = fileOrUrl; samfile_t *fh = samopen(bamFileName, "rb", NULL); boolean usingUrl = TRUE; usingUrl = (strstr(fileOrUrl, "tp://") || strstr(fileOrUrl, "https://")); if (fh != NULL) { #ifndef KNETFILE_HOOKS // When file is an URL, this caches the index file in addition to validating: // Since samtools's url-handling code saves the .bai file to the current directory, // chdir to a trash directory before calling bam_index_load, then chdir back. char *runDir = getCurrentDir(); char *samDir = getSamDir(); if (usingUrl) setCurrentDir(samDir); #endif//ndef KNETFILE_HOOKS bam_index_t *idx = bam_index_load(bamFileName); #ifndef KNETFILE_HOOKS if (usingUrl) setCurrentDir(runDir); #endif//ndef KNETFILE_HOOKS samclose(fh); if (idx == NULL) { warn("bamFileExists: failed to read index corresponding to %s", bamFileName); return FALSE; } free(idx); // Not freeMem, freez etc -- sam just uses malloc/calloc. return TRUE; } return FALSE; }
/* Load the BAM index for `indexname` via bam_index_load().
 * On failure Rf_error() is called with the offending name; on success the
 * loaded index is returned to the caller. */
static bam_index_t *_bam_tryindexload(const char *indexname)
{
    bam_index_t *loaded = bam_index_load(indexname);
    if (loaded != 0)
        return loaded;
    Rf_error("failed to load BAM index\n file: %s", indexname);
    return loaded;
}
struct metaBig* metaBigOpenWithTmpDir(char* fileOrUrlwSections, char* cacheDir, char* sectionsBed) /* load a file or URL with or without sectioning */ /* if it's a bam, load the index. */ { struct metaBig* mb; char* fullFileName = NULL; char* remoteDir = NULL; char* baseFileName = NULL; char* sections = NULL; AllocVar(mb); mb->originalFileName = cloneString(fileOrUrlwSections); /* first deal with filename and separate URL/file/sections */ mb->isRemote = parseMetaBigFileName(fileOrUrlwSections, &remoteDir, &fullFileName, &baseFileName, §ions); mb->fileName = fullFileName; mb->baseFileName = baseFileName; mb->remoteSiteAndDir = remoteDir; /* sniff the file */ mb->type = sniffBigFile(mb->fileName); /* depending on the type, open the files and get the chrom-size hash different ways */ if (mb->type == isaBigBed) { mb->big.bbi = bigBedFileOpen(mb->fileName); mb->chromSizeHash = bbiChromSizes(mb->big.bbi); mb->numReads = bigBedItemCount(mb->big.bbi); } #ifdef USE_HTSLIB else if (mb->type == isaBam) { mb->chromSizeHash = bamChromSizes(mb->fileName); mb->header = bamGetHeaderOnly(mb->fileName); mb->big.bam = sam_open(mb->fileName, "r"); /* Also need to load the index since it's a bam */ mb->idx = bam_index_load(mb->fileName); metaBigBamFlagCountsInit(mb); } #endif else if (mb->type == isaBigWig) { mb->big.bbi = bigWigFileOpenWithDir(mb->fileName, cacheDir); mb->chromSizeHash = bbiChromSizes(mb->big.bbi); } else { /* maybe I should free some stuff up here */ if (fullFileName) freeMem(fullFileName); if (remoteDir) freeMem(remoteDir); if (baseFileName) freeMem(baseFileName); if (sections) freeMem(sections); freez(&mb); return NULL; } if (sectionsBed && sections) { struct bed* regions = (fileExists(sectionsBed)) ? regionsLoad(sectionsBed) : parseSectionString(sectionsBed, mb->chromSizeHash); struct bed* subsets = subset_beds(sections, ®ions, mb->chromSizeHash); mb->sections = subsets; } else if (sectionsBed) { mb->sections = (fileExists(sectionsBed)) ? 
regionsLoad(sectionsBed) : parseSectionString(sectionsBed, mb->chromSizeHash); } else mb->sections = parseSectionString(sections, mb->chromSizeHash); return mb; }
// load index if it hasn't been set already: void bam_streamer:: _load_index() { if (NULL != _bidx) return; // use the BAM index to read a region of the BAM file if (! (_bfp->type&0x01)) { log_os << "ERROR: file must be in BAM format for region lookup: " << name() << "\n"; exit(EXIT_FAILURE); } /// TODO: Find out whether _bidx can be destroyed after the BAM /// iterator is created, in which case this could be a local /// variable. Until we know, _bidx should persist for the lifetime /// of _biter _bidx = bam_index_load(name()); // load BAM index if (NULL == _bidx) { log_os << "ERROR: BAM index is not available for file: " << name() << "\n"; exit(EXIT_FAILURE); } }
/**
 * Opens bam_file for ordered reading and loads its index.
 *
 * @param bam_file   path of the input; must end in ".bam" (case-insensitive)
 * @param intervals  intervals to restrict reading to; random access is enabled
 *                   only when this is non-empty and the index was loaded
 *
 * Terminates the process when the name is not a .bam name, when the file or
 * its header cannot be read, or when the index cannot be loaded.
 */
BAMOrderedReader::BAMOrderedReader(std::string bam_file, std::vector<GenomeInterval>& intervals)
:bam_file(bam_file), intervals(intervals), sam(0), hdr(0), idx(0), itr(0)
{
    const char* fname = bam_file.c_str();
    size_t len = strlen(fname);
    // Guard the length first: a name shorter than the suffix would make
    // fname+len-4 point before the start of the string.
    if ( len < 4 || strcasecmp(".bam",fname+len-4) )
    {
        fprintf(stderr, "[%s:%d %s] Not a BAM file: %s\n", __FILE__, __LINE__, __FUNCTION__, bam_file.c_str());
        exit(1);
    }

    sam = sam_open(bam_file.c_str(), "r");
    if (sam==0)
    {
        fprintf(stderr, "[%s:%d %s] fail to open %s\n", __FILE__, __LINE__, __FUNCTION__, bam_file.c_str());
        exit(1);
    }

    hdr = sam_hdr_read(sam);
    if (hdr==0)
    {
        fprintf(stderr, "[%s:%d %s] fail to read header of %s\n", __FILE__, __LINE__, __FUNCTION__, bam_file.c_str());
        exit(1);
    }

    s = bam_init1();

    idx = bam_index_load(bam_file.c_str());
    if (idx==0)
    {
        fprintf(stderr, "[%s:%d %s] fail to load index for %s\n", __FILE__, __LINE__, __FUNCTION__, bam_file.c_str());
        abort();
    }
    else
    {
        index_loaded = true;
    }

    str = {0,0,0};

    intervals_present =  intervals.size()!=0;
    interval_index = 0;

    random_access_enabled = intervals_present && index_loaded;
}
bool YTranscriptFetcher::fetchBAMTranscripts(const char* filename, const char *refName, unsigned int start, unsigned int end, std::vector<YTranscript*> *transcripts,std::set<std::string> *transcriptNames) { //Open the region in the bam file fetch_data_t data; fetch_data_t *d = &data; d->beg = start-1-buffer; d->end = end+buffer; d->transcripts = transcripts; d->requestedTranscripts = transcriptNames; d->in = samopen(filename, "rb", 0); if (d->in == 0) { fprintf(stderr, "Failed to open BAM file %s\n", filename); return 0; } bam_index_t *idx; idx = bam_index_load(filename); // load BAM index if (idx == 0) { fprintf(stderr, "BAM indexing file is not available.\n"); return 0; } bam_init_header_hash(d->in->header); d->tid = bam_get_tid(d->in->header, refName); if(d->tid == -1) { fprintf(stderr, "Reference id %s not found in BAM file",refName); return 0; } bam_fetch(d->in->x.bam, idx, d->tid, d->beg, d->end, d, fetch_func); bam_index_destroy(idx); samclose(d->in); return 1; }
uint calculate_cov_params(const char* const bam_name, const int32_t tid, const int32_t start, const int32_t stop) { bamFile fp = bam_open(bam_name, "r"); bam_index_t* fp_index = bam_index_load(bam_name); bam_plbuf_t *buf; covdata* cvdt = ckallocz(sizeof(covdata)); cvdt->tid = tid; cvdt->begin = start; cvdt->end = stop; cvdt->coverage = ckallocz((cvdt->end - cvdt->begin) * sizeof(uint32_t)); buf = bam_plbuf_init(pileup_func, cvdt); bam_fetch(fp, fp_index, tid, start, stop, buf, fetch_func); bam_plbuf_push(0, buf); bam_plbuf_destroy(buf); // calculate the mean coverage in the region of the putative deletion uint i, covsum; for(i = 0, covsum = 0; i < (cvdt->end - cvdt->begin); i++){ covsum += cvdt->coverage[i]; } uint avgcov = floor(covsum * 1.0/(cvdt->end - cvdt->begin)); ckfree(cvdt->coverage); ckfree(cvdt); bam_close(fp); bam_index_destroy(fp_index); return avgcov; }
// Load the BAM index for fileName into idx (declared outside this function).
// Returns 0 on success; prints a message to cerr and exits with status 1
// when the index cannot be loaded.
int MyBamWrap::myGetIndex(char * fileName)
{
    idx = bam_index_load(fileName);
    if (idx != 0)
        return 0;

    cerr<<"BAM indexing file is not available for "<<fileName<<endl;
    exit(1);
    return 0; // not reached
}
/* Populate table T with the reads of BAM file reads_fn that fall inside the
 * intervals of `is` and lie on the same strand as their interval.
 *
 * T is created with one slot per target sequence of the BAM header, and
 * T->seq_names receives a strdup'ed copy of every target name.  Intervals
 * whose sequence name is not in the header are skipped.  failf() is called
 * when the BAM file or its index cannot be opened. */
void hash_reads( table* T, const char* reads_fn, interval_stack* is )
{
    samfile_t* bam_in = samopen( reads_fn, "rb", NULL );
    if( bam_in == NULL ) {
        failf( "Can't open bam file '%s'.", reads_fn );
    }

    bam_index_t* bam_idx = bam_index_load( reads_fn );
    if( bam_idx == NULL ) {
        failf( "Can't open bam index '%s.bai'.", reads_fn );
    }

    bam_init_header_hash( bam_in->header );

    /* set up the table and copy the sequence names out of the header */
    table_create( T, bam_in->header->n_targets );
    T->seq_names = (char**)malloc( sizeof(char*) * bam_in->header->n_targets );
    size_t name_i = 0;
    while( name_i < bam_in->header->n_targets ) {
        T->seq_names[name_i] = strdup(bam_in->header->target_name[name_i]);
        name_i++;
    }

    log_puts( LOG_MSG, "hashing reads ... \n" );
    log_indent();

    bam1_t* rec = bam_init1();

    for( interval_stack::iterator ival = is->begin(); ival != is->end(); ival++ ) {
        int target_id = bam_get_tid( bam_in->header, ival->seqname );
        if( target_id < 0 ) continue;

        bam_iter_t region_iter = bam_iter_query( bam_idx, target_id, ival->start, ival->end );

        for( ;; ) {
            if( bam_iter_read( bam_in->x.bam, region_iter, rec ) < 0 ) break;
            /* only count reads on the interval's strand */
            if( bam1_strand(rec) == ival->strand ) {
                table_inc( T, rec );
            }
        }

        bam_iter_destroy(region_iter);
    }

    bam_destroy1(rec);

    log_unindent();
    log_printf( LOG_MSG, "done. (%zu unique reads hashed)\n", T->m );

    bam_index_destroy(bam_idx);
    samclose(bam_in);
}
int main(int argc, char *argv[]) { char *progname; char *bamfilename; int32_t tid; samfile_t *bamin; bam_index_t *bamidx; bam_plbuf_t *buf; bam1_t *bam_read; uint32_t next_pos = 1; progname = *argv; argv++; argc--; if (argc < 2) { printf("Usage: %s bam_file tid\n", progname); exit(1); } else { bamfilename = argv[0]; tid = strtol(argv[1], NULL, 10); } /* try to open bam file */ bamin = samopen(bamfilename, "rb", NULL); if (!bamin) { fprintf(stderr, "Error opening bamfile %s\n", bamfilename); exit(1); } /* try to open index */ bamidx = bam_index_load(bamfilename); if (!bamidx) { fprintf(stderr, "Error opening index for %s\n", bamfilename); exit(1); } bam_read = bam_init1(); buf = bam_plbuf_init(&pileup_func, &next_pos); /* disable maximum pileup depth */ bam_plp_set_maxcnt(buf->iter, INT_MAX); bam_fetch(bamin->x.bam, bamidx, tid, 0, INT_MAX, buf, &fetch_func); bam_plbuf_push(0, buf); /* finish pileup */ bam_plbuf_destroy(buf); bam_destroy1(bam_read); bam_index_destroy(bamidx); samclose(bamin); return 0; }
// Opens the BAM file named by bam_file_path and loads its index, storing the
// handles in fp and bamidx. Throws std::runtime_error if either step fails.
void init()
{
    const char* const path = bam_file_path.c_str();

    fp = samopen(path,"rb",0);
    if (!fp)
        throw std::runtime_error("samopen() error with " + bam_file_path);

    bamidx = bam_index_load(path);
    if (!bamidx)
        throw std::runtime_error("bam_index_load() error with " + bam_file_path);
}
static int load_discordant_reads(MEI_data& mei_data, std::vector<bam_info>& bam_sources, const std::string& chr_name, const SearchWindow& window, UserDefinedSettings* userSettings) { // Loop over associated bam files. for (size_t i = 0; i < bam_sources.size(); i++) { // Locate file. bam_info source = bam_sources.at(i); LOG_DEBUG(*logStream << time_log() << "Loading discordant reads from " << source.BamFile << std::endl); // Setup link to bamfile, its index and header. bamFile fp = bam_open(source.BamFile.c_str(), "r"); bam_index_t *idx = bam_index_load(source.BamFile.c_str()); if (idx == NULL) { LOG_WARN(*logStream << time_log() << "Failed to load index for " << source.BamFile.c_str() << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } bam_header_t *header = bam_header_read(fp); bam_init_header_hash(header); int tid = bam_get_tid(header, chr_name.c_str()); if (tid < 0) { LOG_WARN(*logStream << time_log() << "Could not find sequence in alignment file: '" << chr_name << "'" << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } mei_data.sample_names = get_sample_dictionary(header); // Save insert size of current bamfile in data object provided for callback function. // Note: the insert size should ideally be separate from the MEI_data object, tried to do // this using a std::pair object, which did not work. Suggestions are welcome here. mei_data.current_insert_size = source.InsertSize; mei_data.current_chr_name = chr_name; // Set up environment variable for callback function. std::pair<MEI_data*, UserDefinedSettings*> env = std::make_pair(&mei_data, userSettings); // Load discordant reads into mei_data. 
bam_fetch(fp, idx, tid, window.getStart(), window.getEnd(), &env, fetch_disc_read_callback); bam_index_destroy(idx); } return 0; }
/* Read alignments from BAM file `ifn` and call `func` on each of them.
 *
 * ifn   input BAM file name
 * ofn   optional output BAM file name; when non-NULL it is opened for writing
 *       with the input header and passed to `func` as `out`
 * reg   optional region string; when non-NULL only alignments in that region
 *       are visited (requires a BAM index), otherwise the whole file is read
 * data  opaque pointer handed to `func`
 * func  callback invoked as func(b, in, out, data) for every alignment
 *
 * Returns the final read status (-1 at normal end of input).  Any failure
 * (unopenable file, missing index, unknown region, truncated file) prints a
 * message to stderr and exits with status 1. */
int sam_fetch(char *ifn, char *ofn, char *reg, void *data, sam_fetch_f func)
{
	int ret = 0;
	samfile_t *in = samopen(ifn, "rb", 0);
	samfile_t *out = 0;
	/* in->header is needed below, so a failed open must be caught first */
	if (in == 0) {
		fprintf(stderr, "[%s:%d] Failed to open input file \"%s\".\n", __func__, __LINE__, ifn);
		exit(1);
	}
	if (ofn) {
		out = samopen(ofn, "wb", in->header);
		if (out == 0) {
			fprintf(stderr, "[%s:%d] Failed to open output file \"%s\".\n", __func__, __LINE__, ofn);
			exit(1);
		}
	}

	if (reg) {
		bam_index_t *idx = bam_index_load(ifn);
		if (idx == 0) {
			fprintf(stderr, "[%s:%d] Random alignment retrieval only works for indexed BAM files.\n",
							__func__, __LINE__);
			exit(1);
		}
		int tid, beg, end;
		bam_parse_region(in->header, reg, &tid, &beg, &end);
		if (tid < 0) {
			fprintf(stderr, "[%s:%d] Region \"%s\" specifies an unknown reference name. \n",
							__func__, __LINE__, reg);
			exit(1);
		}
		bam_iter_t iter;
		bam1_t *b = bam_init1();
		iter = bam_iter_query(idx, tid, beg, end);
		while ((ret = bam_iter_read(in->x.bam, iter, b)) >= 0) func(b, in, out, data);
		bam_iter_destroy(iter);
		bam_destroy1(b);
		bam_index_destroy(idx);
	} else {
		bam1_t *b = bam_init1();
		while ((ret = samread(in, b)) >= 0) func(b, in, out, data);
		bam_destroy1(b);
	}
	if (out) samclose(out);
	samclose(in);

	if (ret != -1) { /* truncated is -2 */
		fprintf(stderr, "[%s:%d] Alignment retrieval failed due to truncated file\n",
						__func__, __LINE__);
		exit(1);
	}

	return ret;
}
int main(int argc, char *argv[]) { tmpstruct_t tmp; if (argc == 1) { fprintf(stderr, "Usage: calDepth <in.bam> [region]\n"); return 1; } tmp.beg = 0; tmp.end = 0x7fffffff; tmp.in = samopen(argv[1], "rb", 0); if (tmp.in == 0) { fprintf(stderr, "Fail to open BAM file %s\n", argv[1]); return 1; } if (argc == 2) { // if a region is not specified sampileup(tmp.in, -1, pileup_func, &tmp); } else { int ref; bam_index_t *idx; bam_plbuf_t *buf; idx = bam_index_load(argv[1]); // load BAM index if (idx == 0) { fprintf(stderr, "BAM indexing file is not available.\n"); return 1; } bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region if (ref < 0) { fprintf(stderr, "Invalid region %s\n", argv[2]); return 1; } buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func); bam_plbuf_push(0, buf); // finalize pileup bam_index_destroy(idx); bam_plbuf_destroy(buf); } samclose(tmp.in); return 0; }
/* Open a BAM input for `data` and position it for reading.
 *
 * data      reader state; data->data is allocated here and data->idx is set
 *           to the loaded index
 * filename  BAM file name, or "-" to read from stdin
 *
 * Reads the header, loads the index from `filename`, resets data->ref_tid to
 * -1 and calls seekRegion().  Exits with status 1 on allocation failure, when
 * the input cannot be opened, or when the index cannot be loaded. */
void OpenBamFile(BamReaderData * data, char * filename) {
	// Allocate space
	data->data = (mplp_aux_t *) calloc(1, sizeof(mplp_aux_t));
	if (data->data == NULL) {
		fprintf(stderr, "[%s] fail to allocate memory for input.\n", __func__);
		exit(1);
	}

	// read the header and initialize data
	if (strcmp(filename, "-"))
		data->data->fp = bam_open(filename, "r");
	else
		data->data->fp = bam_dopen(fileno(stdin), "r");
	// the handle is used right below, so a failed open must stop here
	if (data->data->fp == NULL) {
		fprintf(stderr, "[%s] fail to open input.\n", __func__);
		exit(1);
	}
	data->data->conf = data->conf;
	data->data->h = bam_header_read(data->data->fp);

	// Load index
	data->idx = bam_index_load(filename);
	if (data->idx == 0) {
		fprintf(stderr, "[%s] fail to load index for input.\n", __func__);
		exit(1);
	}

	// Start reading
	data->ref_tid = -1;
	seekRegion(data);
}
void bamFetch(char *fileOrUrl, char *position, bam_fetch_f callbackFunc, void *callbackData,
		 samfile_t **pSamFile)
/* Open the .bam file, fetch items in the seq:start-end position range,
 * and call callbackFunc on each bam item retrieved from the file plus callbackData.
 * This handles BAM files with "chr"-less sequence names, e.g. from Ensembl.
 * The pSamFile parameter is optional.  If non-NULL it will be filled in, just for
 * the benefit of the callback function, with the open samFile.
 * The file is closed again (bamClose) before this function returns.
 * If the index cannot be loaded, a warning is issued and nothing is fetched. */
{
char *bamFileName = NULL;
samfile_t *fh = bamOpen(fileOrUrl, &bamFileName);
/* The substring "tp://" matches both "http://" and "ftp://". */
boolean usingUrl = (strstr(fileOrUrl, "tp://") || strstr(fileOrUrl, "https://"));
if (pSamFile != NULL)
    *pSamFile = fh;
#ifndef KNETFILE_HOOKS
// Since samtools' url-handling code saves the .bai file to the current directory,
// chdir to a trash directory before calling bam_index_load, then chdir back.
char *runDir = getCurrentDir();
char *samDir = getSamDir();
if (usingUrl)
    setCurrentDir(samDir);
#endif//ndef KNETFILE_HOOKS
bam_index_t *idx = bam_index_load(bamFileName);
#ifndef KNETFILE_HOOKS
if (usingUrl)
    setCurrentDir(runDir);
#endif//ndef KNETFILE_HOOKS
if (idx == NULL)
    warn("bam_index_load(%s) failed.", bamFileName);
else
    {
    bamFetchAlreadyOpen(fh, idx, bamFileName, position, callbackFunc, callbackData);
    /* Release the index with the library's own destructor: a plain free() of
     * the top-level struct would not release the tables the index owns. */
    bam_index_destroy(idx);
    }
bamClose(&fh);
}
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level) #endif { bamFile fpout, *fp; heap1_t *heap; bam_header_t *hout = 0; bam_header_t *hheaders = NULL; int i, j, *RG_len = 0; uint64_t idx = 0; char **RG = 0, mode[8]; bam_iter_t *iter = 0; if (headers) { tamFile fpheaders = sam_open(headers); if (fpheaders == 0) { const char *message = strerror(errno); fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hheaders = sam_header_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (bamFile*)calloc(n, sizeof(bamFile)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t)); // prepare RG tag if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(void*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // read the first for (i = 0; i != n; ++i) { bam_header_t *hin; fp[i] = bam_open(fn[i], "r"); if (fp[i] == 0) { int j; fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) bam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = bam_header_read(fp[i]); if (i == 0) { // the first BAM hout = hin; } else { // validate multiple baf int min_n_targets = hout->n_targets; if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets; for (j = 0; j < min_n_targets; ++j) if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) { fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n", hout->target_name[j], hin->target_name[j], fn[i]); return -1; } // If this input file has additional target reference sequences, // add them to the headers to be 
output if (hin->n_targets > hout->n_targets) { swap_header_targets(hout, hin); // FIXME Possibly we should also create @SQ text headers // for the newly added reference sequences } bam_header_destroy(hin); } } if (hheaders) { // If the text headers to be swapped in include any @SQ headers, // check that they are consistent with the existing binary list // of reference information. if (hheaders->n_targets > 0) { if (hout->n_targets != hheaders->n_targets) { fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers); if (!reg) return -1; } for (j = 0; j < hout->n_targets; ++j) if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); if (!reg) return -1; } } swap_header_text(hout, hheaders); bam_header_destroy(hheaders); } if (reg) { int tid, beg, end; if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { bam_index_t *idx; idx = bam_index_load(fn[i]); iter[i] = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } } for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b); h->idx = idx++; } else h->pos = HEAP_EMPTY; } if (flag & MERGE_UNCOMP) level = 0; else if (flag & MERGE_LEVEL1) level = 1; strcpy(mode, "w"); if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); if ((fpout = strcmp(out, "-")? 
bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; } bam_header_write(fpout, hout); bam_header_destroy(hout); #ifndef _PBGZF_USE if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256); #endif ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } bam_write1_core(fpout, &b->core, b->data_len, b->data); if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; free(heap->b->data); free(heap->b); heap->b = 0; } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i != n; ++i) { bam_iter_destroy(iter[i]); bam_close(fp[i]); } bam_close(fpout); free(fp); free(heap); free(iter); return 0; }
int main_depth(int argc, char *argv[]) #endif { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { switch (n) { case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold } } if (optind == argc) { fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n"); return 1; } // initialize the auxiliary data structures n = argc - optind; // the number of BAMs on the command line data = (aux_t **) calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = (aux_t *) calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator bam_index_destroy(idx); // the index is not needed any more; phase out of the memory } } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization n_plp = (int*) calloc(n, sizeof(int)); // n_plp[i] is the number of 
covering reads from the i-th BAM plp = (bam_pileup1_t **) calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); return 0; }
int main_samview(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; while ((c = getopt(argc, argv, "IbSl:t:")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSI] [-l level] <in.bam>|<in.sam> [region]\n"); return 1; } strcpy(moder, "r"); if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder, fn_ref); h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); if ((flag&4) == 0) { // SAM/BAM output htsFile *out; char modew[8]; strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&2) strcat(modew, "b"); out = hts_open("-", modew, 0); sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = bam_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while (bam_itr_next((BGZF*)in->fp, iter, b) >= 0) sam_write1(out, h, b); hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while (sam_read1(in, h, b) >= 0) sam_write1(out, h, b); sam_close(out); } bam_destroy1(b); bam_hdr_destroy(h); sam_close(in); return 0; }
// Entry point of the scorereads command.
//
// Parses the command-line options, then iterates over the records of
// opt::bam_file in batches of opt::batch_size. For every mapped record and
// every strand, the read's events are aligned (alignment_from_read), the pore
// model is optionally recalibrated (opt::calibrate), and the alignment is
// scored with model_score. Strands with a non-empty alignment and a score
// that is not greater than 0 are printed to std::cout as
// "<read name> <template|complement> <model name> <score>".
//
// Returns 0 after all records have been processed.
int scorereads_main(int argc, char** argv)
{
    parse_scorereads_options(argc, argv);
    omp_set_num_threads(opt::num_threads);

    Fast5Map name_map(opt::reads_file);
    ModelMap models;
    // Models are only read when a models fofn was supplied
    if (!opt::models_fofn.empty())
        models = read_models_fofn(opt::models_fofn);

    // Open the BAM and iterate over reads

    // load bam file
    // NOTE: failures here are only caught by assert
    htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r");
    assert(bam_fh != NULL);

    // load bam index file; the name passed is the BAM file name plus ".bai"
    std::string index_filename = opt::bam_file + ".bai";
    hts_idx_t* bam_idx = bam_index_load(index_filename.c_str());
    assert(bam_idx != NULL);

    // read the bam header
    bam_hdr_t* hdr = sam_hdr_read(bam_fh);

    // load reference fai file
    faidx_t *fai = fai_load(opt::genome_file.c_str());

    hts_itr_t* itr;

    // If processing a region of the genome, only emit events aligned to this window
    // (-1 is the initial value and is kept when no region is given)
    int clip_start = -1;
    int clip_end = -1;

    if(opt::region.empty()) {
        // No region: iterate from the start of the index
        // TODO: is this valid?
        itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0);
    } else {

        fprintf(stderr, "Region: %s\n", opt::region.c_str());
        itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str());
        // Also parse the region string into the clip window
        hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end);
    }

#ifndef H5_HAVE_THREADSAFE
    // Refuse to run multi-threaded when H5_HAVE_THREADSAFE is not defined
    if(opt::num_threads > 1) {
        fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n");
        fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n");
        exit(1);
    }
#endif

    // Initialize iteration: one pre-allocated record per batch slot
    std::vector<bam1_t*> records(opt::batch_size, NULL);
    for(size_t i = 0; i < records.size(); ++i) {
        records[i] = bam_init1();
    }

    int result;
    size_t num_reads_realigned = 0;   // records handled in completed batches
    size_t num_records_buffered = 0;  // records currently waiting in the batch

    do {
        assert(num_records_buffered < records.size());

        // read a record into the next slot in the buffer
        result = sam_itr_next(bam_fh, itr, records[num_records_buffered]);
        // only count the slot as filled when the read succeeded
        num_records_buffered += result >= 0;

        // realign if we've hit the max buffer size or reached the end of file
        if(num_records_buffered == records.size() || result < 0) {
            #pragma omp parallel for schedule(dynamic)
            for(size_t i = 0; i < num_records_buffered; ++i) {
                bam1_t* record = records[i];
                // index of this record counted over all batches so far
                size_t read_idx = num_reads_realigned + i;
                // skip records flagged as unmapped
                if( (record->core.flag & BAM_FUNMAP) == 0) {

                    //load read
                    std::string read_name = bam_get_qname(record);
                    std::string fast5_path = name_map.get_path(read_name);
                    SquiggleRead sr(read_name, fast5_path);

                    // When a read-name filter is set, skip reads that are not in it.
                    // Note the SquiggleRead above is constructed before this check.
                    // TODO: early exit when have processed all of the reads in readnames
                    if (!opt::readnames.empty() &&
                         std::find(opt::readnames.begin(), opt::readnames.end(), read_name) == opt::readnames.end() )
                            continue;

                    for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {
                        std::vector<EventAlignment> ao = alignment_from_read(sr, strand_idx, read_idx,
                                                                             models, fai, hdr,
                                                                             record, clip_start, clip_end);
                        // nothing aligned for this strand
                        if (ao.size() == 0)
                            continue;

                        // Update pore model based on alignment
                        if ( opt::calibrate )
                            recalibrate_model(sr, strand_idx, ao, false);

                        double score = model_score(sr, strand_idx, fai, ao, 500);
                        // positive scores are not reported
                        if (score > 0)
                            continue;
                        // the named critical section keeps output lines from interleaving
                        #pragma omp critical(print)
                        std::cout << read_name << " " << ( strand_idx ? "complement" : "template" )
                                  << " " << sr.pore_model[strand_idx].name << " " << score << std::endl;
                    }
                }
            }

            // batch done: advance the running count and start refilling from slot 0
            num_reads_realigned += num_records_buffered;
            num_records_buffered = 0;
        }

    } while(result >= 0);

    // cleanup records
    for(size_t i = 0; i < records.size(); ++i) {
        bam_destroy1(records[i]);
    }

    // cleanup
    sam_itr_destroy(itr);
    bam_hdr_destroy(hdr);
    fai_destroy(fai);
    sam_close(bam_fh);
    hts_idx_destroy(bam_idx);
    return 0;
}
void train_one_round(const Fast5Map& name_map, size_t round) { const PoreModelMap& current_models = PoreModelSet::get_models(opt::trained_model_type); // Initialize the training summary stats for each kmer for each model ModelTrainingMap model_training_data; for(auto current_model_iter = current_models.begin(); current_model_iter != current_models.end(); current_model_iter++) { // one summary entry per kmer in the model std::vector<StateSummary> summaries(current_model_iter->second.get_num_states()); model_training_data[current_model_iter->first] = summaries; } // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? 
itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_realigned = 0; size_t num_records_buffered = 0; Progress progress("[methyltrain]"); do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for for(size_t i = 0; i < num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_realigned + i; if( (record->core.flag & BAM_FUNMAP) == 0) { add_aligned_events(name_map, fai, hdr, record, read_idx, clip_start, clip_end, round, model_training_data); } } num_reads_realigned += num_records_buffered; num_records_buffered = 0; } if(opt::progress) { fprintf(stderr, "Realigned %zu reads in %.1lfs\r", num_reads_realigned, progress.get_elapsed_seconds()); } } while(result >= 0); assert(num_records_buffered == 0); progress.end(); // open the summary file std::stringstream summary_fn; summary_fn << "methyltrain" << opt::out_suffix << ".summary"; FILE* summary_fp = fopen(summary_fn.str().c_str(), "w"); fprintf(summary_fp, "model_short_name\tkmer\tnum_matches\tnum_skips\t" "num_stays\tnum_events_for_training\twas_trained\t" "trained_level_mean\ttrained_level_stdv\n"); 
// open the tsv file with the raw training data std::stringstream training_fn; training_fn << "methyltrain" << opt::out_suffix << ".round" << round << ".events.tsv"; std::ofstream training_ofs(training_fn.str()); // write out a header for the training data StateTrainingData::write_header(training_ofs); // iterate over models: template, complement_pop1, complement_pop2 for(auto model_training_iter = model_training_data.begin(); model_training_iter != model_training_data.end(); model_training_iter++) { // Initialize the trained model from the input model auto current_model_iter = current_models.find(model_training_iter->first); assert(current_model_iter != current_models.end()); std::string model_name = model_training_iter->first; std::string model_short_name = current_model_iter->second.metadata.get_short_name(); // Initialize the new model from the current model PoreModel updated_model = current_model_iter->second; uint32_t k = updated_model.k; const std::vector<StateSummary>& summaries = model_training_iter->second; // Generate the complete set of kmers std::string gen_kmer(k, 'A'); std::vector<std::string> all_kmers; for(size_t ki = 0; ki < summaries.size(); ++ki) { all_kmers.push_back(gen_kmer); mtrain_alphabet->lexicographic_next(gen_kmer); } assert(gen_kmer == std::string(k, 'A')); assert(all_kmers.front() == std::string(k, 'A')); assert(all_kmers.back() == std::string(k, 'T')); // Update means for each kmer #pragma omp parallel for for(size_t ki = 0; ki < summaries.size(); ++ki) { assert(ki < all_kmers.size()); std::string kmer = all_kmers[ki]; // write the observed values to a tsv file #pragma omp critical { for(size_t ei = 0; ei < summaries[ki].events.size(); ++ei) { summaries[ki].events[ei].write_tsv(training_ofs, model_short_name, kmer); } } bool is_m_kmer = kmer.find('M') != std::string::npos; bool update_kmer = opt::training_target == TT_ALL_KMERS || (is_m_kmer && opt::training_target == TT_METHYLATED_KMERS) || (!is_m_kmer && opt::training_target == 
TT_UNMETHYLATED_KMERS); bool trained = false; // only train if there are a sufficient number of events for this kmer if(update_kmer && summaries[ki].events.size() >= opt::min_number_of_events_to_train) { // train a mixture model where a minority of k-mers aren't methylated ParamMixture mixture; float incomplete_methylation_rate = 0.05f; std::string um_kmer = mtrain_alphabet->unmethylate(kmer); size_t um_ki = mtrain_alphabet->kmer_rank(um_kmer.c_str(), k); // Initialize the training parameters. If this is a kmer containing // a methylation site we train a two component mixture, otherwise // just fit a gaussian float major_weight = is_m_kmer ? 1 - incomplete_methylation_rate : 1.0f; mixture.log_weights.push_back(log(major_weight)); mixture.params.push_back(current_model_iter->second.get_parameters(ki)); if(is_m_kmer) { // add second unmethylated component mixture.log_weights.push_back(std::log(incomplete_methylation_rate)); mixture.params.push_back(current_model_iter->second.get_parameters(um_ki)); } if(opt::verbose > 1) { fprintf(stderr, "INIT__MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(), std::exp(mixture.log_weights[0]), mixture.params[0].level_mean, mixture.params[0].level_stdv, std::exp(mixture.log_weights[1]), mixture.params[1].level_mean, mixture.params[1].level_stdv); } ParamMixture trained_mixture = train_gaussian_mixture(summaries[ki].events, mixture); if(opt::verbose > 1) { fprintf(stderr, "TRAIN_MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(), std::exp(trained_mixture.log_weights[0]), trained_mixture.params[0].level_mean, trained_mixture.params[0].level_stdv, std::exp(trained_mixture.log_weights[1]), trained_mixture.params[1].level_mean, trained_mixture.params[1].level_stdv); } #pragma omp critical updated_model.states[ki] = trained_mixture.params[0]; if (model_stdv()) { ParamMixture ig_mixture; // weights ig_mixture.log_weights = 
trained_mixture.log_weights; // states ig_mixture.params.emplace_back(trained_mixture.params[0]); if(is_m_kmer) { ig_mixture.params.emplace_back(current_model_iter->second.get_parameters(um_ki)); } // run training auto trained_ig_mixture = train_invgaussian_mixture(summaries[ki].events, ig_mixture); LOG("methyltrain", debug) << "IG_INIT__MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " [" << std::fixed << std::setprecision(5) << ig_mixture.params[0].sd_mean << " " << ig_mixture.params[1].sd_mean << "]" << std::endl << "IG_TRAIN_MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " [" << trained_ig_mixture.params[0].sd_mean << " " << trained_ig_mixture.params[1].sd_mean << "]" << std::endl; // update state #pragma omp critical { updated_model.states[ki] = trained_ig_mixture.params[0]; } } trained = true; } #pragma omp critical { fprintf(summary_fp, "%s\t%s\t%d\t%d\t%d\t%zu\t%d\t%.2lf\t%.2lf\n", model_short_name.c_str(), kmer.c_str(), summaries[ki].num_matches, summaries[ki].num_skips, summaries[ki].num_stays, summaries[ki].events.size(), trained, updated_model.states[ki].level_mean, updated_model.states[ki].level_stdv); } // add the updated model into the collection (or replace what is already there) PoreModelSet::insert_model(opt::trained_model_type, updated_model); } } // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); fclose(summary_fp); }
int main_depth(int argc, char *argv[]) #endif { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure char *file_list = NULL, **fn = NULL; bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold case 'f': file_list = optarg; break; } } if (optind == argc && !file_list) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -b <bed> list of positions or regions\n"); fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n"); fprintf(stderr, " -l <int> minQLen\n"); fprintf(stderr, " -q <int> base quality threshold\n"); fprintf(stderr, " -Q <int> mapping quality threshold\n"); fprintf(stderr, " -r <chr:from-to> region\n"); fprintf(stderr, "\n"); return 1; } // initialize the auxiliary data structures if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; n = nfiles; argv = fn; optind = 0; } else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter htmp = 
bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator bam_index_destroy(idx); // the index is not needed any more; phase out of the memory } } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization bam_mplp_set_maxcnt(mplp,2147483647); // set max_depth to int max n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); if ( file_list ) { for (i=0; i<n; i++) free(fn[i]); free(fn); } return 
0; }
int main_bedcov(int argc, char *argv[]) { extern void bam_init_header_hash(bam_header_t*); gzFile fp; kstring_t str; kstream_t *ks; bam_index_t **idx; bam_header_t *h = 0; aux_t **aux; int *n_plp, dret, i, n, c, min_mapQ = 0; int64_t *cnt; const bam_pileup1_t **plp; while ((c = getopt(argc, argv, "Q:")) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; } } if (optind + 2 > argc) { fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n"); return 1; } memset(&str, 0, sizeof(kstring_t)); n = argc - optind - 1; aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(bam_index_t*)); for (i = 0; i < n; ++i) { aux[i] = calloc(1, sizeof(aux_t)); aux[i]->min_mapQ = min_mapQ; aux[i]->fp = bam_open(argv[i+optind+1], "r"); idx[i] = bam_index_load(argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } bgzf_set_cache_size(aux[i]->fp, 20); if (i == 0) h = bam_header_read(aux[0]->fp); } bam_init_header_hash(h); cnt = calloc(n, 8); fp = gzopen(argv[optind], "rb"); ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; int tid, beg, end, pos; bam_mplp_t mplp; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_get_tid(h, q); *p = '\t'; if (tid < 0) goto bed_error; for (q = p = p + 1; isdigit(*p); ++p); if (*p != '\t') goto bed_error; *p = 0; beg = atoi(q); *p = '\t'; for (q = p = p + 1; isdigit(*p); ++p); if (*p == '\t' || *p == 0) { int c = *p; *p = 0; end = atoi(q); *p = c; } else goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); aux[i]->iter = bam_iter_query(idx[i], tid, beg, end); } mplp = bam_mplp_init(n, read_bam, (void**)aux); bam_mplp_set_maxcnt(mplp, 64000); memset(cnt, 0, 8 * n); while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) if (pos >= beg && pos < end) for (i 
= 0; i < n; ++i) cnt[i] += n_plp[i]; for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } puts(str.s); bam_mplp_destroy(mplp); continue; bed_error: fprintf(stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); for (i = 0; i < n; ++i) { if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); bam_index_destroy(idx[i]); bam_close(aux[i]->fp); free(aux[i]); } bam_header_destroy(h); free(aux); free(idx); free(str.s); return 0; }
int main(int argc, char *argv[]) { DBAdaptor * dba; StatementHandle *sth; ResultRow * row; Vector * slices; int nSlices; htsFile * out; int argNum = 1; char *inFName = NULL; char *outFName = NULL; char *dbUser = "******"; char *dbPass = NULL; int dbPort = 3306; char *dbHost = "ens-staging.internal.sanger.ac.uk"; char *dbName = "homo_sapiens_core_71_37"; char *assName = "GRCh37"; char *chrName = "1"; int flags = 0; int threads = 1; initEnsC(argc, argv); while (argNum < argc) { char *arg = argv[argNum]; char *val; // Ones without a val go here if (!strcmp(arg, "-U") || !strcmp(arg,"--ucsc_naming")) { flags |= M_UCSC_NAMING; } else { // Ones with a val go in this block if (argNum == argc-1) { Bamcov_usage(); } val = argv[++argNum]; if (!strcmp(arg, "-i") || !strcmp(arg,"--in_file")) { StrUtil_copyString(&inFName,val,0); } else if (!strcmp(arg, "-o") || !strcmp(arg,"--out_file")) { StrUtil_copyString(&outFName,val,0); } else if (!strcmp(arg, "-h") || !strcmp(arg,"--host")) { StrUtil_copyString(&dbHost,val,0); } else if (!strcmp(arg, "-p") || !strcmp(arg,"--password")) { StrUtil_copyString(&dbPass,val,0); } else if (!strcmp(arg, "-P") || !strcmp(arg,"--port")) { dbPort = atoi(val); } else if (!strcmp(arg, "-n") || !strcmp(arg,"--name")) { StrUtil_copyString(&dbName,val,0); } else if (!strcmp(arg, "-u") || !strcmp(arg,"--user")) { StrUtil_copyString(&dbUser,val,0); } else if (!strcmp(arg, "-t") || !strcmp(arg,"--threads")) { threads = atoi(val); } else if (!strcmp(arg, "-a") || !strcmp(arg,"--assembly")) { StrUtil_copyString(&assName,val,0); } else if (!strcmp(arg, "-v") || !strcmp(arg,"--verbosity")) { verbosity = atoi(val); // Temporary } else if (!strcmp(arg, "-c") || !strcmp(arg,"--chromosome")) { StrUtil_copyString(&chrName,val,0); } else { fprintf(stderr,"Error in command line at %s\n\n",arg); Bamcov_usage(); } } argNum++; } if (verbosity > 0) { printf("Program for calculating read coverage in a BAM file \n" "Steve M.J. Searle. 
[email protected] Last update April 2013.\n"); } if (!inFName || !outFName) { Bamcov_usage(); } dba = DBAdaptor_new(dbHost,dbUser,dbPass,dbName,dbPort,NULL); //nSlices = getSlices(dba, destName); nSlices = 1; slices = Vector_new(); SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba); Slice *slice = SliceAdaptor_fetchByRegion(sa,NULL,chrName,POS_UNDEF,POS_UNDEF,1,NULL, 0); Vector_addElement(slices,slice); if (Vector_getNumElement(slices) == 0) { fprintf(stderr, "Error: No slices.\n"); exit(1); } htsFile *in = hts_open(inFName, "rb"); if (in == 0) { fprintf(stderr, "Fail to open BAM file %s\n", inFName); return 1; } hts_set_threads(in, threads); hts_idx_t *idx; idx = bam_index_load(inFName); // load BAM index if (idx == 0) { fprintf(stderr, "BAM index file is not available.\n"); return 1; } int i; for (i=0; i<Vector_getNumElement(slices); i++) { Slice *slice = Vector_getElementAt(slices,i); if (verbosity > 0) printf("Working on '%s'\n",Slice_getName(slice)); // if (verbosity > 0) printf("Stage 1 - retrieving annotation from database\n"); // Vector *genes = getGenes(slice, flags); if (verbosity > 0) printf("Stage 1 - calculating coverage\n"); calcCoverage(inFName, slice, in, idx, flags); } hts_idx_destroy(idx); hts_close(in); if (verbosity > 0) printf("Done\n"); return 0; }
/*
 * Multi-sample pileup driver.
 *
 * conf : run configuration (flags, region, BED filter, FASTA index, thresholds).
 * n    : number of input files.
 * fn   : the n input file names; "-" selects stdin.
 *
 * Opens every input, optionally restricts iteration to conf->reg, then walks
 * all pileup columns. With MPLP_GLF set, genotype-likelihood records (plus
 * optional indel calls) are written through the bcf writer opened on "-";
 * otherwise one tab-separated text line is printed per column with, for every
 * input: the raw depth, per-strand base counts, the deleted/inserted sequences
 * seen per strand, the number of reads used and filtered, and a dispersion
 * statistic of the within-read position of non-reference observations.
 *
 * Returns 0; exits with status 1 when an index cannot be loaded or the region
 * cannot be parsed.
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
    extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
    extern void bcf_call_del_rghash(void *rghash);
    mplp_aux_t **data;
    int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
    const bam_pileup1_t **plp;
    bam_mplp_t iter;
    bam_header_t *h = 0;
    char *ref;
    void *rghash = 0;

    bcf_callaux_t *bca = 0;
    bcf_callret1_t *bcr = 0;
    bcf_call_t bc;
    bcf_t *bp = 0;
    bcf_hdr_t *bh = 0;

    bam_sample_t *sm = 0;
    kstring_t buf;
    mplp_pileup_t gplp;

    memset(&gplp, 0, sizeof(mplp_pileup_t));
    memset(&buf, 0, sizeof(kstring_t));
    memset(&bc, 0, sizeof(bcf_call_t));
    data = calloc(n, sizeof(void*));
    plp = calloc(n, sizeof(void*));
    // NOTE(review): n_plp is an int array but is sized with sizeof(int*); harmless over-allocation where pointers are wider than int
    n_plp = calloc(n, sizeof(int*));
    sm = bam_smpl_init();

    // read the header and initialize data
    for (i = 0; i < n; ++i) {
        bam_header_t *h_tmp;
        data[i] = calloc(1, sizeof(mplp_aux_t));
        data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
        data[i]->conf = conf;
        h_tmp = bam_header_read(data[i]->fp);
        data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
        bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
        rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
        if (conf->reg) {
            // a region was requested: build an index-backed iterator for this input
            int beg, end;
            bam_index_t *idx;
            idx = bam_index_load(fn[i]);
            if (idx == 0) {
                fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
                exit(1);
            }
            if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
                fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
                exit(1);
            }
            // the region parsed against the first input defines the output window
            if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
            data[i]->iter = bam_iter_query(idx, tid, beg, end);
            bam_index_destroy(idx);
        }
        if (i == 0) h = h_tmp;
        else {
            // FIXME: to check consistency
            bam_header_destroy(h_tmp);
        }
    }
    // per-sample grouping buffers
    gplp.n = sm->n;
    gplp.n_plp = calloc(sm->n, sizeof(int));
    gplp.m_plp = calloc(sm->n, sizeof(int));
    gplp.plp = calloc(sm->n, sizeof(void*));

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
    // write the VCF header
    if (conf->flag & MPLP_GLF) {
        kstring_t s;
        bh = calloc(1, sizeof(bcf_hdr_t));
        s.l = s.m = 0; s.s = 0;
        bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
        // target names, each followed by a NUL byte
        for (i = 0; i < h->n_targets; ++i) {
            kputs(h->target_name[i], &s);
            kputc('\0', &s);
        }
        bh->l_nm = s.l;
        bh->name = malloc(s.l);
        memcpy(bh->name, s.s, s.l);
        s.l = 0;
        // sample names, each followed by a NUL byte
        for (i = 0; i < sm->n; ++i) {
            kputs(sm->smpl[i], &s); kputc('\0', &s);
        }
        bh->l_smpl = s.l;
        bh->sname = malloc(s.l);
        memcpy(bh->sname, s.s, s.l);
        bh->txt = malloc(strlen(BAM_VERSION) + 64);
        bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
        free(s.s);
        bcf_hdr_sync(bh);
        bcf_hdr_write(bp, bh);
        bca = bcf_call_init(-1., conf->min_baseQ);
        bcr = calloc(sm->n, sizeof(bcf_callret1_t));
        bca->rghash = rghash;
        bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
        bca->min_frac = conf->min_frac;
        bca->min_support = conf->min_support;
    }
    if (tid0 >= 0 && conf->fai) { // region is set
        ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
        ref_tid = tid0;
        for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
    } else ref_tid = -1, ref = 0;
    iter = bam_mplp_init(n, mplp_func, (void**)data);
    max_depth = conf->max_depth;
    if (max_depth * sm->n > 1<<20)
        fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
    if (max_depth * sm->n < 8000) {
        max_depth = 8000 / sm->n;
        fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
    }
    max_indel_depth = conf->max_indel_depth * sm->n;
    bam_mplp_set_maxcnt(iter, max_depth);

    // NOTE(review): storeSize and delStore are never read in this function
    int storeSize = 100;
    int delStore[2][100] = {{0},{0}};

    typedef char * mstring;

    while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
        if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
        if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
        if (tid != ref_tid) {
            // moved to another target: swap in its reference sequence (stays 0 when no FASTA index is configured)
            free(ref); ref = 0;
            if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
            for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
            ref_tid = tid;
        }
        if (conf->flag & MPLP_GLF) {
            int total_depth, _ref0, ref16;
            bcf1_t *b = calloc(1, sizeof(bcf1_t));
            for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
            group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
            _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
            ref16 = bam_nt16_table[_ref0];
            for (i = 0; i < gplp.n; ++i)
                bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
            bcf_call_combine(gplp.n, bcr, ref16, &bc);
            bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
                         (conf->flag&MPLP_FMT_SP), 0, 0);
            bcf_write(bp, bh, b);
            bcf_destroy(b);
            // call indels
            if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
                for (i = 0; i < gplp.n; ++i)
                    bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
                if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
                    b = calloc(1, sizeof(bcf1_t));
                    bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
                                 (conf->flag&MPLP_FMT_SP), bca, ref);
                    bcf_write(bp, bh, b);
                    bcf_destroy(b);
                }
            }
        } else {
            // text output: target, 1-based position, reference base ('N' when unavailable)
            printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
            for (i = 0; i < n; ++i) {
                int j;
                printf("\t%d\t", n_plp[i]);
                if (n_plp[i] == 0) {
                    printf("*\t*"); // FIXME: printf() is very slow...
                    if (conf->flag & MPLP_PRINT_POS) printf("\t*");
                } else {
                    //MDW start
                    //for each position in the pileup column
                    // NOTE(review): countiChars, ttj, insAllele, insAlleleCnt, sf and flag are declared but never used below
                    int charLen = 16;
                    int countChars[ charLen ][2];   // [4-bit base code][strand] counts of plain matches/mismatches
                    int countiChars[ charLen ][2];
                    int countGap[2]={0,0};
                    //double qvTotal=0;
                    int numStruck=0;   // reads rejected by the filters below
                    int numGood=0;     // reads that contributed a base or an indel
                    int tti;
                    int ttj;
                    mstring insAllele[100];
                    int insAlleleCnt[100];
                    int sf=0;
                    int flag=0;
                    //typedef char * string;
                    // Comma-separated indel sequences, one buffer per strand (0/1).
                    // NOTE(review): writes into these fixed 10000-element buffers (and qposP) are not bounds-checked
                    char insStr0[10000];
                    int iCnt0=0;
                    char insStr1[10000];
                    int iCnt1=0;
                    char delStr0[10000];
                    int dCnt0=0;
                    char delStr1[10000];
                    int dCnt1=0;

                    float qposP[10000];
                    int qposCnt=0;

                    //initialize with zeros
                    for(tti=0;tti<charLen;tti++){
                        countChars[tti][0]=0;
                        countChars[tti][1]=0;
                    }

                    // define repeat length here; look back up to 10 prior positions
                    // start one position away.
                    // NOTE(review): the loops below actually scan up to 15 positions, and index ref without
                    // checking that ref is non-NULL or that pos-tti / pos+tti stay inside [0, ref_len) - confirm
                    int replC=0; //
                    for(tti=1;tti<=15;tti++){ // check for greater than zero
                        if(toupper(ref[pos-1])==toupper(ref[pos-tti])){
                            replC++;
                        }else{ // breaks the chain at first non identical to current position not strict homopolymer
                            break;
                        }
                    }
                    int reprC=0; //
                    for(tti=1;tti<=15;tti++){ // check for greater than zero
                        if(toupper(ref[pos+1])==toupper(ref[pos+tti])){
                            reprC++;
                        }else{ // breaks the chain at first non identical to current position not strict homopolymer
                            break;
                        }
                    }
                    // repT is the longer of the left and right run lengths
                    int repT = replC;
                    if(replC < reprC){
                        repT=reprC;
                    }

                    for (j = 0; j < n_plp[i]; ++j){
                        const bam_pileup1_t *p = plp[i] + j;

                        /* SAME LOGIC AS pileup_seq() */

                        if(p->is_refskip){ // never count intron gaps in numStruck
                            continue;
                        }
                        if(p->is_del){ // skip deletion gap, after first position which is the first aligned char
                            continue;
                        }
                        // any single failing criterion strikes the read from this column
                        if(     p->b->core.qual < conf->min_mqToCount  || // mapping quality
                                conf->maxrepC < (repT) || // max homopolymer run, this will not
                                (!p->is_del && bam1_qual(p->b)[p->qpos] < conf->min_baseQ) || // base quality for matches
                                p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) || // trimEnd is 1-based
                                p->zf == 1 || // fusion tag
                                p->ih > conf->maxIH || // max hit index
                                (p->nmd > conf->maxNM) || // max mismatch
                                (conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs
                                (conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) || // optionally strike secondary
                                (conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) || // optionally strike dup
                                (conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY)) || // optionally strike secondary or dup
                                (conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) )) // optionally strike secondary, dup and QCfail
                        ){
                            numStruck++;
                            continue;
                        }

                        //printf("repT=%d: %d %c %c %c %c \n",repT,p->indel,ref[pos],ref[pos-1],ref[pos-2],ref[pos-3]);

                        // reads without an adjacent indel are tallied by base code and strand
                        if(!p->is_del && p->indel==0){
                            countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ bam1_strand(p->b) ] ++;
                            numGood++;
                        }else if(p->is_refskip){
                            // NOTE(review): unreachable - refskip reads already hit the 'continue' at the top of the loop
                            countGap[ bam1_strand(p->b) ]++;
                        }

                        if(p->indel<0){
                            // deletion follows: record the deleted reference bases, comma-terminated, per strand
                            numGood++;
                            if(bam1_strand(p->b) ==0){
                                for(tti=1;tti<= -p->indel; tti++) {
                                    // current spot, starting at 0 in store, because indel<0 refers to next position
                                    delStr0[dCnt0] =  ref[pos+tti];
                                    dCnt0++;
                                }
                                delStr0[dCnt0] = ',';
                                dCnt0++;
                            }else{
                                for(tti=1;tti<= -p->indel; tti++) {
                                    // current spot, starting at 0 in store, because indel<0 refers to next position
                                    delStr1[dCnt1] =  ref[pos+tti];
                                    dCnt1++;
                                }
                                delStr1[dCnt1] = ',';
                                dCnt1++;
                            }
                        }else if(p->indel>0){
                            // insertion follows: record the inserted read bases, comma-terminated, per strand
                            numGood++;
                            if(bam1_strand(p->b) ==0){
                                for(tti=1;tti<= p->indel; tti++) {
                                    // current spot, starting at 0 in store, because indel<0 refers to next position
                                    insStr0[iCnt0] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)];
                                    iCnt0++;
                                }
                                insStr0[iCnt0] = ',';
                                iCnt0++;
                            }else{
                                for(tti=1;tti<= p->indel; tti++) {
                                    // current spot, starting at 0 in store, because indel<0 refers to next position
                                    insStr1[iCnt1] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)];
                                    iCnt1++;
                                }
                                insStr1[iCnt1] = ',';
                                iCnt1++;
                            }
                        }
                        //calculate position of variant within aligned read - no soft clips
                        if( toupper(ref[pos]) != toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]) || p->indel>0 || p->indel<0  ){
                            //distance to end; calculate distance to end of aligned read.  removes soft clips.
                            int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd;
                            qposP[qposCnt] = distToEnd;
                            qposCnt++;
                            // printf("id=%s, pos=%d",bam1_qname(p->b),distToEnd);
                        }
                    } //

                    //print A,C,G,T, by +/-
                    // NOTE(review): a fifth pair is printed from code 7 - presumably meant as the N/other column; confirm against the base-code table
                    printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",
                           countChars[1][0],countChars[1][1],
                           countChars[2][0],countChars[2][1],
                           countChars[4][0],countChars[4][1],
                           countChars[8][0],countChars[8][1],
                           countChars[7][0],countChars[7][1]);

                    // four columns: deletions (strand 0, 1) then insertions (strand 0, 1)
                    putchar('\t');
                    for(tti=0;tti<dCnt0;tti++){
                        putchar(delStr0[tti]);
                    }
                    putchar('\t');
                    for(tti=0;tti<dCnt1;tti++){
                        putchar(delStr1[tti]);
                    }
                    putchar('\t');
                    for(tti=0;tti<iCnt0;tti++){
                        putchar(insStr0[tti]);
                    }
                    putchar('\t');
                    for(tti=0;tti<iCnt1;tti++){
                        putchar(insStr1[tti]);
                    }

                    printf("\t%d\t%d",numGood,numStruck);

                    // get non-ref qpos variation
                    float medqpos = -1;
                    float medAbsDev = -1;
                    if(qposCnt>0){
                        medqpos = median(qposCnt,qposP);
                        float absDev[qposCnt];
                        for(tti=0;tti<qposCnt;tti++){
                            // NOTE(review): abs() is the integer absolute value in C, so the float difference
                            // is truncated here; fabsf() may be what was intended - confirm
                            absDev[tti] = abs(medqpos - qposP[tti]);
                        }
                        // NOTE(review): absDev holds qposCnt entries but only qposCnt-1 are passed - confirm
                        medAbsDev = median(qposCnt-1,absDev);
                    }
                    printf("\t%f",medAbsDev);

                    ///END MDW
                }
            }
            putchar('\n');
        }
    }

    // teardown
    bcf_close(bp);
    bam_smpl_destroy(sm); free(buf.s);
    for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
    free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
    bcf_call_del_rghash(rghash);
    bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
    bam_mplp_destroy(iter);
    bam_header_destroy(h);
    for (i = 0; i < n; ++i) {
        bam_close(data[i]->fp);
        if (data[i]->iter) bam_iter_destroy(data[i]->iter);
        free(data[i]);
    }
    free(data); free(plp); free(ref); free(n_plp);
    return 0;
}
/// Loads the BAM index that belongs to @p fn and stores it in htsIndex_.
/// @throws std::runtime_error if no index could be loaded.
void LoadIndex(const std::string& fn)
{
    auto* const rawIndex = bam_index_load(fn.c_str());
    htsIndex_.reset(rawIndex);

    const bool missing = !htsIndex_;
    if (missing) {
        throw std::runtime_error("could not load BAI index data");
    }
}
SR_BamInStream* SR_BamInStreamAlloc(const char* bamFilename, uint32_t binLen, unsigned int numThreads, unsigned int buffCapacity, unsigned int reportSize, const SR_StreamMode* pStreamMode) { SR_BamInStream* pBamInStream = (SR_BamInStream*) calloc(1, sizeof(SR_BamInStream)); if (pBamInStream == NULL) SR_ErrQuit("ERROR: Not enough memory for a bam input stream object."); pBamInStream->bam_cur_status = -1; pBamInStream->fpBamInput = bam_open(bamFilename, "r"); if (pBamInStream->fpBamInput == NULL) SR_ErrQuit("ERROR: Cannot open bam file %s for reading.\n", bamFilename); if ((pStreamMode->controlFlag & SR_USE_BAM_INDEX) != 0) { pBamInStream->pBamIndex = bam_index_load(bamFilename); if (pBamInStream->pBamIndex == NULL) { SR_ErrMsg("WARNING: Cannot open bam index file for reading. Creating it......"); bam_index_build(bamFilename); SR_ErrMsg(" The bam index is created."); pBamInStream->pBamIndex = bam_index_load(bamFilename); } } pBamInStream->filterFunc = pStreamMode->filterFunc; pBamInStream->filterData = pStreamMode->filterData; pBamInStream->numThreads = numThreads; pBamInStream->reportSize = reportSize; pBamInStream->currRefID = NO_QUERY_YET; pBamInStream->currBinPos = NO_QUERY_YET; pBamInStream->binLen = binLen; pBamInStream->pNewNode = NULL; pBamInStream->pBamIterator = NULL; if (numThreads > 0) { pBamInStream->pRetLists = (SR_BamList*) calloc(numThreads, sizeof(SR_BamList)); if (pBamInStream->pRetLists == NULL) SR_ErrQuit("ERROR: Not enough memory for the storage of retrun alignment lists in the bam input stream object.\n"); pBamInStream->pAlgnTypes = (SR_AlgnType*) malloc(numThreads * reportSize * sizeof(SR_AlgnType)); if (pBamInStream->pAlgnTypes == NULL) SR_ErrQuit("ERROR: Not enough memory for the storage of pair alignment type in the bam input stream object.\n"); } else { pBamInStream->pRetLists = NULL; pBamInStream->pAlgnTypes = NULL; pBamInStream->reportSize = 0; } if ((pStreamMode->controlFlag & SR_PAIR_GENOMICALLY) == 0) { 
pBamInStream->pNameHashes[PREV_BIN] = kh_init(queryName); kh_resize(queryName, pBamInStream->pNameHashes[PREV_BIN], reportSize); } else { pBamInStream->pNameHashes[PREV_BIN] = NULL; pBamInStream->binLen = SR_MAX_BIN_LEN; } pBamInStream->pNameHashes[CURR_BIN] = kh_init(queryName); kh_resize(queryName, pBamInStream->pNameHashes[CURR_BIN], reportSize); pBamInStream->pMemPool = SR_BamMemPoolAlloc(buffCapacity); pBamInStream->bam_cur_status = 1; return pBamInStream; }
int methyltest_main(int argc, char** argv) { parse_methyltest_options(argc, argv); omp_set_num_threads(opt::num_threads); Fast5Map name_map(opt::reads_file); ModelMap models = read_models_fofn(opt::models_fofn, mtest_alphabet); // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize writers OutputHandles handles; handles.site_writer = fopen(std::string(opt::bam_file + ".methyltest.sites.bed").c_str(), "w"); handles.read_writer = fopen(std::string(opt::bam_file + ".methyltest.reads.tsv").c_str(), "w"); handles.strand_writer = fopen(std::string(opt::bam_file + ".methyltest.strand.tsv").c_str(), "w"); // Write a header to the reads.tsv file fprintf(handles.read_writer, "name\tsum_ll_ratio\tn_cpg\tcomplement_model\ttags\n"); // strand header fprintf(handles.strand_writer, "name\tsum_ll_ratio\tn_cpg\tmodel\n"); // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); 
++i) { records[i] = bam_init1(); } int result; size_t num_reads_processed = 0; size_t num_records_buffered = 0; Progress progress("[methyltest]"); do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for for(size_t i = 0; i < num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_processed + i; if( (record->core.flag & BAM_FUNMAP) == 0) { calculate_methylation_for_read(models, name_map, fai, hdr, record, read_idx, handles); } } num_reads_processed += num_records_buffered; num_records_buffered = 0; } } while(result >= 0); assert(num_records_buffered == 0); progress.end(); // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup fclose(handles.site_writer); fclose(handles.read_writer); fclose(handles.strand_writer); sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); return EXIT_SUCCESS; }
/* Multi-way pileup over n alignment files.
 *
 * conf  run-time options (flags, region, BED filter, reference index,
 *       depth limits and indel-calling parameters).
 * n     number of input files.
 * fn    the n input file names; "-" selects standard input.
 *
 * Every input is opened and its header is read; the header of the first
 * input is kept as the reference for target names and the others are
 * discarded. When conf->reg is set, the index of each input is loaded and a
 * region iterator is attached to it. The inputs are then walked position by
 * position with bam_mplp_auto(). For each position that passes the region and
 * BED filters the function either
 *   - writes BCF records to the stream opened on "-" (MPLP_GLF set): one
 *     record for the site and, unless MPLP_NO_INDEL is set or the total
 *     depth reaches the indel depth limit, one for a candidate indel; or
 *   - prints one tab-separated text line (MPLP_GLF not set) holding the
 *     target name, the 1-based position, the reference base and, per input,
 *     the depth, the bases, the qualities and optionally the mapping
 *     qualities and the positions within the reads.
 *
 * Terminates the process with status 1 when an input cannot be opened, its
 * header cannot be read, its index cannot be loaded or the region cannot be
 * parsed. Returns 0 otherwise.
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
    extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
    extern void bcf_call_del_rghash(void *rghash);
    mplp_aux_t **data;
    /* ref_len starts at 0 so it is never read uninitialized when no reference is loaded */
    int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len = 0, ref_tid = -1, max_depth, max_indel_depth;
    const bam_pileup1_t **plp;
    bam_mplp_t iter;
    bam_header_t *h = 0;
    char *ref;
    void *rghash = 0;

    bcf_callaux_t *bca = 0;
    bcf_callret1_t *bcr = 0;
    bcf_call_t bc;
    bcf_t *bp = 0;
    bcf_hdr_t *bh = 0;

    bam_sample_t *sm = 0;
    kstring_t buf;
    mplp_pileup_t gplp;

    memset(&gplp, 0, sizeof(mplp_pileup_t));
    memset(&buf, 0, sizeof(kstring_t));
    memset(&bc, 0, sizeof(bcf_call_t));
    data = calloc(n, sizeof(void*));
    plp = calloc(n, sizeof(void*));
    n_plp = calloc(n, sizeof(int)); /* one depth counter per input: the elements are ints, not pointers */
    sm = bam_smpl_init();

    // read the header and initialize data
    for (i = 0; i < n; ++i) {
        bam_header_t *h_tmp;
        data[i] = calloc(1, sizeof(mplp_aux_t));
        data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
        if (data[i]->fp == 0) {
            fprintf(stderr, "[%s] fail to open %d-th input.\n", __func__, i+1);
            exit(1);
        }
        data[i]->conf = conf;
        h_tmp = bam_header_read(data[i]->fp);
        if (h_tmp == 0) {
            fprintf(stderr, "[%s] fail to read the header of %d-th input.\n", __func__, i+1);
            exit(1);
        }
        data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
        bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
        rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
        if (conf->reg) {
            int beg, end;
            bam_index_t *idx;
            idx = bam_index_load(fn[i]);
            if (idx == 0) {
                fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
                exit(1);
            }
            if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
                fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
                exit(1);
            }
            // the region parsed against the first input defines the output window
            if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
            data[i]->iter = bam_iter_query(idx, tid, beg, end);
            bam_index_destroy(idx);
        }
        if (i == 0) h = h_tmp;
        else {
            // FIXME: to check consistency
            bam_header_destroy(h_tmp);
        }
    }

    // per-sample buffers used to regroup the per-file pileups
    gplp.n = sm->n;
    gplp.n_plp = calloc(sm->n, sizeof(int));
    gplp.m_plp = calloc(sm->n, sizeof(int));
    gplp.plp = calloc(sm->n, sizeof(void*));

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
    // write the VCF header
    if (conf->flag & MPLP_GLF) {
        kstring_t s;
        bh = calloc(1, sizeof(bcf_hdr_t));
        s.l = s.m = 0; s.s = 0;
        bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
        // target names, each followed by a NUL
        for (i = 0; i < h->n_targets; ++i) {
            kputs(h->target_name[i], &s);
            kputc('\0', &s);
        }
        bh->l_nm = s.l;
        bh->name = malloc(s.l);
        memcpy(bh->name, s.s, s.l);
        // sample names, each followed by a NUL (the buffer is reused)
        s.l = 0;
        for (i = 0; i < sm->n; ++i) {
            kputs(sm->smpl[i], &s);
            kputc('\0', &s);
        }
        bh->l_smpl = s.l;
        bh->sname = malloc(s.l);
        memcpy(bh->sname, s.s, s.l);
        bh->txt = malloc(strlen(BAM_VERSION) + 64);
        bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
        free(s.s);
        bcf_hdr_sync(bh);
        bcf_hdr_write(bp, bh);
        bca = bcf_call_init(-1., conf->min_baseQ);
        bcr = calloc(sm->n, sizeof(bcf_callret1_t));
        bca->rghash = rghash;
        bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
        bca->min_frac = conf->min_frac;
        bca->min_support = conf->min_support;
    }
    if (tid0 >= 0 && conf->fai) { // region is set
        ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
        ref_tid = tid0;
        for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
    } else ref_tid = -1, ref = 0;
    iter = bam_mplp_init(n, mplp_func, (void**)data);
    max_depth = conf->max_depth;
    if (max_depth * sm->n > 1<<20)
        fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
    if (max_depth * sm->n < 8000) {
        max_depth = 8000 / sm->n;
        fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
    }
    max_indel_depth = conf->max_indel_depth * sm->n;
    bam_mplp_set_maxcnt(iter, max_depth);
    while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
        if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
        if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
        if (tid != ref_tid) {
            // moved to another target: swap in its reference sequence
            free(ref); ref = 0;
            if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
            for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
            ref_tid = tid;
        }
        if (conf->flag & MPLP_GLF) {
            int total_depth, _ref0, ref16;
            bcf1_t *b = calloc(1, sizeof(bcf1_t));
            for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
            group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
            // index the lookup table with an unsigned value: a plain char
            // holding a byte >= 0x80 would otherwise yield a negative index
            _ref0 = (ref && pos < ref_len)? (unsigned char)ref[pos] : 'N';
            ref16 = bam_nt16_table[_ref0];
            for (i = 0; i < gplp.n; ++i)
                bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
            bcf_call_combine(gplp.n, bcr, ref16, &bc);
            bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
                         (conf->flag&MPLP_FMT_SP), 0, 0);
            bcf_write(bp, bh, b);
            bcf_destroy(b);
            // call indels
            if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth
                && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0)
            {
                for (i = 0; i < gplp.n; ++i)
                    bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
                if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
                    b = calloc(1, sizeof(bcf1_t));
                    bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
                                 (conf->flag&MPLP_FMT_SP), bca, ref);
                    bcf_write(bp, bh, b);
                    bcf_destroy(b);
                }
            }
        } else {
            printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
            for (i = 0; i < n; ++i) {
                int j;
                printf("\t%d\t", n_plp[i]);
                if (n_plp[i] == 0) {
                    // placeholders: one per column printed in the non-empty
                    // branch, so every row has the same number of columns
                    printf("*\t*"); // FIXME: printf() is very slow...
                    if (conf->flag & MPLP_PRINT_MAPQ) printf("\t*");
                    if (conf->flag & MPLP_PRINT_POS) printf("\t*");
                } else {
                    for (j = 0; j < n_plp[i]; ++j)
                        pileup_seq(plp[i] + j, pos, ref_len, ref);
                    putchar('\t');
                    // base qualities, offset by 33 and capped at '~' (126)
                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        int c = bam1_qual(p->b)[p->qpos] + 33;
                        if (c > 126) c = 126;
                        putchar(c);
                    }
                    if (conf->flag & MPLP_PRINT_MAPQ) {
                        putchar('\t');
                        // mapping qualities, same encoding
                        for (j = 0; j < n_plp[i]; ++j) {
                            int c = plp[i][j].b->core.qual + 33;
                            if (c > 126) c = 126;
                            putchar(c);
                        }
                    }
                    if (conf->flag & MPLP_PRINT_POS) {
                        putchar('\t');
                        // comma-separated 1-based positions within the reads
                        for (j = 0; j < n_plp[i]; ++j) {
                            if (j > 0) putchar(',');
                            printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
                        }
                    }
                }
            }
            putchar('\n');
        }
    }

    bcf_close(bp);
    bam_smpl_destroy(sm); free(buf.s);
    for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
    free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
    bcf_call_del_rghash(rghash);
    bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
    bam_mplp_destroy(iter);
    bam_header_destroy(h);
    for (i = 0; i < n; ++i) {
        bam_close(data[i]->fp);
        if (data[i]->iter) bam_iter_destroy(data[i]->iter);
        free(data[i]);
    }
    free(data); free(plp); free(ref); free(n_plp);
    return 0;
}
int main(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; htsFile *out; char modew[8]; int r = 0, exit_code = 0; while ((c = getopt(argc, argv, "IbDCSl:t:")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'D': flag |= 4; break; case 'C': flag |= 8; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSCSI] [-l level] <in.bam>|<in.sam>|<in.cram> [region]\n"); return 1; } strcpy(moder, "r"); if (flag&4) strcat(moder, "c"); else if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder); h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&8) strcat(modew, "c"); else if (flag&2) strcat(modew, "b"); out = hts_open("-", modew); /* CRAM output */ if (flag & 8) { // Parse input header and use for CRAM output out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text); // Create CRAM references arrays if (fn_ref) cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref); else // Attempt to fill out a cram->refs[] array from @SQ headers cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL); } sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = bam_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while ((r = bam_itr_next(in, iter, b)) >= 0) { if (sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } } 
hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while ((r = sam_read1(in, h, b)) >= 0) { if (sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } } sam_close(out); if (r < -1) { fprintf(stderr, "Error parsing input.\n"); exit_code = 1; } bam_destroy1(b); bam_hdr_destroy(h); sam_close(in); return exit_code; }