bam_hdr_t * setup_test(const char *bam0_header_text, const refseq_info_t *bam0_refseqs, int32_t bam0_n_refseqs, const char *bam1_header_text, const refseq_info_t *bam1_refseqs, int32_t bam1_n_refseqs, merged_header_t *merged_hdr) { bam_hdr_t* bam0 = NULL; bam_hdr_t* bam1 = NULL; int32_t i; bam0 = bam_hdr_init(); bam0->text = strdup(bam0_header_text); if (!bam0->text) goto fail; bam0->l_text = strlen(bam0_header_text); bam0->n_targets = 1; bam0->target_name = (char**)calloc(bam0_n_refseqs, sizeof(char*)); bam0->target_len = (uint32_t*)calloc(bam0_n_refseqs, sizeof(uint32_t)); for (i = 0; i < bam0_n_refseqs; i++) { bam0->target_name[i] = strdup(bam0_refseqs[i].name); if (!bam0->target_name[i]) goto fail; bam0->target_len[i] = bam0_refseqs[i].len; } if (populate_merged_header(bam0, merged_hdr)) goto fail; bam1 = bam_hdr_init(); if (!bam1) goto fail; bam1->text = strdup(bam1_header_text); if (!bam1->text) goto fail; bam1->l_text = strlen(bam1_header_text); bam1->n_targets = bam1_n_refseqs; bam1->target_name = (char**)calloc(bam1_n_refseqs, sizeof(char*)); bam1->target_len = (uint32_t*)calloc(bam1_n_refseqs, sizeof(uint32_t)); for (i = 0; i < bam1_n_refseqs; i++) { bam1->target_name[i] = strdup(bam1_refseqs[i].name); if (!bam1->target_name[i]) goto fail; bam1->target_len[i] = bam1_refseqs[i].len; } bam_hdr_destroy(bam0); return bam1; fail: bam_hdr_destroy(bam1); bam_hdr_destroy(bam0); return NULL; }
bam_hdr_t *getHeadFromFai(const char *fname){ std::vector<char *> chrs; std::vector<int> lengths; FILE *fp = aio::getFILE(fname,"r"); char buf[1024]; while(fgets(buf,1024,fp)){ char *tok = strtok(buf,"\t \n"); //fprintf(stderr,"tok: %s\n",tok); chrs.push_back(strdup(tok));//<-strdup so don't clean here tok = strtok(NULL,"\t \n"); if(!tok){ fprintf(stderr,"\t-> fai file looks malformed? last reference: %s\n",chrs.back()); fclose(fp); return NULL; } // fprintf(stderr,"tok: %s\n",tok); lengths.push_back(atoi(tok)); } bam_hdr_t *ret = bam_hdr_init(); ret->l_text = strlen(fname); ret->text =(char*) malloc(strlen(fname)+1); ret->text = strcpy(ret->text,fname); ret->n_targets = chrs.size(); ret->target_len = (uint32_t*) malloc(sizeof(uint32_t)*chrs.size()); ret->target_name = (char**) malloc(sizeof(char*)*chrs.size()); for(size_t i=0;i<chrs.size();i++){ ret->target_len[i] = lengths[i]; ret->target_name[i] =strdup(chrs[i]); } for(uint i=0;i<chrs.size();i++) free(chrs[i]); fclose(fp); return ret; }
std::shared_ptr<bam_hdr_t> bamql::appendProgramToHeader( const bam_hdr_t *original, const std::string &name, const std::string &id, const std::string &version, const std::string &args) { auto copy = std::shared_ptr<bam_hdr_t>(bam_hdr_init(), bam_hdr_destroy); if (!copy) { return copy; } std::stringstream text; text << original->text << "@PG\tPN:" << name << "\tID:" << id << "\tVN:" << version << "\tCL:\"" << args << "\"\n"; auto text_str = text.str(); copy->n_targets = original->n_targets; copy->ignore_sam_err = original->ignore_sam_err; copy->l_text = text_str.length(); copy->cigar_tab = nullptr; copy->sdict = nullptr; copy->text = strdup(text_str.c_str()); copy->target_len = (uint32_t *)calloc(original->n_targets, sizeof(uint32_t)); copy->target_name = (char **)calloc(original->n_targets, sizeof(char *)); for (int it = 0; it < original->n_targets; it++) { copy->target_len[it] = original->target_len[it]; copy->target_name[it] = strdup(original->target_name[it]); } return copy; }
void setup_test_1(bam_hdr_t** hdr_in) { *hdr_in = bam_hdr_init(); const char *test1 = "@HD\tVN:1.4\n" "@SQ\tSN:blah\n" "@RG\tID:fish\n"; (*hdr_in)->text = strdup(test1); (*hdr_in)->l_text = strlen(test1); }
HtslibBamHeaderManager:: HtslibBamHeaderManager( const std::vector<bam_header_info::chrom_info>& chromData) : _header(bam_hdr_init()) { _header->n_targets = chromData.size(); _header->target_len = (uint32_t*)calloc(_header->n_targets, sizeof(uint32_t)); _header->target_name = (char**)calloc(_header->n_targets, sizeof(char*)); for (int i = 0; i < _header->n_targets; ++i) { _header->target_len[i] = chromData[i].length; _header->target_name[i] = strdup(chromData[i].label.c_str()); } }
bam_hdr_t *bcf_hdr_2_bam_hdr_t (htsstuff *hs){ bam_hdr_t *ret = bam_hdr_init(); ret->l_text = 0; ret->text =NULL; const char **seqnames = NULL; int nseq; seqnames = bcf_hdr_seqnames(hs->hdr, &nseq); assert(seqnames); ret->n_targets = nseq; ret->target_len = (uint32_t*) malloc(sizeof(uint32_t)*nseq); ret->target_name = (char**) malloc(sizeof(char*)*nseq); for(size_t i=0;i<nseq;i++){ // fprintf(stderr,"i:%d is:%d\n",i,bcf_hdr_id2length()) ret->target_len[i] =0x7fffffff;// strlen(seqnames[i]); ret->target_name[i] =strdup(seqnames[i]); } free(seqnames); return ret; }
bam_hdr_t *cram_header_to_bam(SAM_hdr *h) { int i; bam_hdr_t *header = bam_hdr_init(); header->l_text = ks_len(&h->text); header->text = malloc(header->l_text+1); memcpy(header->text, ks_str(&h->text), header->l_text); header->text[header->l_text] = 0; header->n_targets = h->nref; header->target_name = (char **)calloc(header->n_targets, sizeof(char *)); header->target_len = (uint32_t *)calloc(header->n_targets, 4); for (i = 0; i < h->nref; i++) { header->target_name[i] = strdup(h->ref[i].name); header->target_len[i] = h->ref[i].len; } return header; }
int calcCoverage(char *fName, Slice *slice, htsFile *in, hts_idx_t *idx, int flags) { int ref; int begRange; int endRange; char region[1024]; char region_name[512]; if (Slice_getChrStart(slice) != 1) { fprintf(stderr, "Currently only allow a slice start position of 1\n"); return 1; } if (flags & M_UCSC_NAMING) { sprintf(region,"chr%s", Slice_getSeqRegionName(slice)); } else { sprintf(region,"%s", Slice_getSeqRegionName(slice)); } bam_hdr_t *header = bam_hdr_init(); header = bam_hdr_read(in->fp.bgzf); ref = bam_name2id(header, region); if (ref < 0) { fprintf(stderr, "Invalid region %s\n", region); exit(1); } sprintf(region,"%s:%ld-%ld", region_name, Slice_getSeqRegionStart(slice), Slice_getSeqRegionEnd(slice)); if (hts_parse_reg(region, &begRange, &endRange) == NULL) { fprintf(stderr, "Could not parse %s\n", region); exit(2); } bam_hdr_destroy(header); hts_itr_t *iter = sam_itr_queryi(idx, ref, begRange, endRange); bam1_t *b = bam_init1(); Coverage *coverage = calloc(Slice_getLength(slice),sizeof(Coverage)); long counter = 0; long overlapping = 0; long bad = 0; int startIndex = 0; while (bam_itr_next(in, iter, b) >= 0) { if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) { bad++; continue; } int end; //end = bam_calend(&b->core, bam1_cigar(b)); end = bam_endpos(b); // There is a special case for reads which have zero length and start at begRange (so end at begRange ie. before the first base we're interested in). // That is the reason for the || end == begRange test if (end == begRange) { continue; } counter++; if (!(counter%1000000)) { if (verbosity > 1) { printf("."); } fflush(stdout); } // Remember: b->core.pos is zero based! int cigInd; int refPos; int readPos; uint32_t *cigar = bam_get_cigar(b); for (cigInd = readPos = 0, refPos = b->core.pos; cigInd < b->core.n_cigar; ++cigInd) { int k; int lenCigBlock = cigar[cigInd]>>4; int op = cigar[cigInd]&0xf; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (k = 0; k < lenCigBlock; ++k) { //if (ref[refPos+k] == 0) break; // out of boundary coverage[refPos+k].coverage++; } if (k < lenCigBlock) break; refPos += lenCigBlock; readPos += lenCigBlock; } else if (op == BAM_CDEL) { for (k = 0; k < lenCigBlock; ++k) { // if (ref[refPos+k] == 0) break; coverage[refPos+k].coverage++; } if (k < lenCigBlock) break; refPos += lenCigBlock; } else if (op == BAM_CSOFT_CLIP) { readPos += lenCigBlock; } else if (op == BAM_CHARD_CLIP) { } else if (op == BAM_CINS) { readPos += lenCigBlock; } else if (op == BAM_CREF_SKIP) { refPos += lenCigBlock; } } #ifdef DONE int j; int done = 0; int hadOverlap = 0; for (j=startIndex; j < Vector_getNumElement(genes) && !done; j++) { Gene *gene = Vector_getElementAt(genes,j); if (!gene) { continue; } // Remember: b->core.pos is zero based! if (b->core.pos < Gene_getEnd(gene) && end >= Gene_getStart(gene)) { int k; int doneGene = 0; for (k=0; k<Gene_getTranscriptCount(gene) && !doneGene; k++) { Transcript *trans = Gene_getTranscriptAt(gene,k); if (b->core.pos < Transcript_getEnd(trans) && end >= Transcript_getStart(trans)) { int m; for (m=0; m<Transcript_getExonCount(trans) && !doneGene; m++) { Exon *exon = Transcript_getExonAt(trans,m); if (b->core.pos < Exon_getEnd(exon) && end >= Exon_getStart(exon)) { // Only count as overlapping once (could be that a read overlaps more than one gene) if (!hadOverlap) { overlapping++; hadOverlap = 1; } gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene)); gs->score++; doneGene = 1; } } } } } else if (Gene_getStart(gene) > end) { done = 1; } else if (Gene_getEnd(gene) < b->core.pos+1) { gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene)); printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", gs->score); if (verbosity > 1) { printf("Removing gene %s (index %d) with extent %d to %d\n", Gene_getStableId(gene), gs->index, Gene_getStart(gene), Gene_getEnd(gene)); } Vector_setElementAt(genes,j,NULL); // Magic (very important for speed) - move startIndex to first non null gene int n; startIndex = 0; for (n=0;n<Vector_getNumElement(genes);n++) { void *v = Vector_getElementAt(genes,n); if (v != NULL) { break; } startIndex++; } if (verbosity > 1) { printf("startIndex now %d\n",startIndex); } } } #endif } if (verbosity > 1) { printf("\n"); } #ifdef DONE // Print out read counts for what ever's left in the genes array int n; for (n=0;n<Vector_getNumElement(genes);n++) { Gene *gene = Vector_getElementAt(genes,n); if (gene != NULL) { gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene)); printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", gs->score); } } #endif printf("Read %ld reads. Number of bad reads (unmapped, qc fail, secondary, dup) %ld\n", counter, bad); long i; for (i=0; i< Slice_getLength(slice); i++) { printf("%ld %ld\n", i+1, coverage[i].coverage); } sam_itr_destroy(iter); bam_destroy1(b); return 1; }
/****************************************************************************** * * The main worker node function. * * int thread_id: the thread_id * char *fastq1: FIFO from which bowtie2 can get read1 * char *fastq2: FIFO from which bowtie2 can get read2 (if it exists) * *******************************************************************************/ void herd_worker_node(int thread_id, char *fastq1, char *fastq2) { int cmd_length = 1, max_qname = 0, status, strand; char *cmd, *last_qname = calloc(1, sizeof(char)); MPI_Header *packed_header; MPI_read *packed_read = calloc(1, sizeof(MPI_read)); bam_hdr_t *header; bam1_t *read1 = bam_init1(); bam1_t *read2 = bam_init1(); samFile *fp; #ifdef DEBUG MPI_Status stat; int current_p_size = 100; htsFile *of; bam_hdr_t *debug_header = bam_hdr_init(); bam1_t *debug_read = bam_init1(); global_header = bam_hdr_init(); void *p = calloc(100,1); char *oname = NULL; #else int i = 0; #endif time_t t0, t1; int swapped = 0; assert(last_qname); assert(packed_read); //Which strand should we be aligning to? if(config.directional) { strand = (thread_id-1) % 2; } else { strand = (thread_id-1) % 4; } packed_read->size = 0; packed_read->packed = NULL; //construct the bowtie2 command cmd_length += (int) strlen("bowtie2 -q --reorder") + 1; cmd_length += (int) strlen(config.bowtie2_options) + 1; cmd_length += (int) strlen("--norc -x") + 1; cmd_length += (int) strlen(config.genome_dir) + strlen("bisulfite_genome/CT_conversion/BS_CT") + 1; cmd_length += (int) 2*(strlen("-1 ") + strlen(fastq1)) + 3; if(config.paired) cmd_length += (int) strlen(fastq2); //This is likely unneeded. #ifdef DEBUG oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_X.bam"))); assert(oname); sprintf(oname, "%s%s_%i.bam", config.odir, config.basename, thread_id); if(!config.quiet) fprintf(stderr, "Writing output to %s\n", oname); of = sam_open(oname, "wb"); free(oname); #endif cmd = (char *) malloc(sizeof(char) * cmd_length); assert(cmd); if(strand == 0) { //OT Read#1 C->T, Read#2 G->A, Genome C->T only the + strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 1) { //OB Read#1 C->T, Read#2 G->A, Genome G->A only the - strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 2) { //CTOT Read#1 G->A, Read#2 C->T, Genome C->T, only the - strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 3) { //CTOB Read#1 G->A, Read#2 C->T, Genome G->A, only the + strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else { fprintf(stderr, "Oh shit, got strand %i!\n", strand); return; } //Start the process if(!config.quiet) fprintf(stderr, "Node %i executing: %s\n", thread_id, cmd); fflush(stderr); fp = sam_popen(cmd); header = sam_hdr_read(fp); #ifdef DEBUG sam_hdr_write(of, header); #endif #ifndef DEBUG packed_header = pack_header(header); if(thread_id == 1) { //Send the header MPI_Send((void *) &(packed_header->size), 1, MPI_INT, 0, 1, MPI_COMM_WORLD); status = MPI_Send((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD); if(status != MPI_SUCCESS) { fprintf(stderr, "MPI_Send returned %i\n", status); fflush(stderr); } } #else packed_header = pack_header(header); void *tmp_pointer = malloc(packed_header->size); assert(tmp_pointer); MPI_Request request; MPI_Isend((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &request); status = MPI_Recv(tmp_pointer, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &stat); if(status != MPI_SUCCESS) fprintf(stderr, "We seem to have not been able to send the message to ourselves!\n"); MPI_Wait(&request, &stat); unpack_header(debug_header, tmp_pointer); global_header = debug_header; free(tmp_pointer); #endif t0 = time(NULL); if(!config.quiet) fprintf(stderr, "Node %i began sending reads @%s", thread_id, ctime(&t0)); fflush(stderr); while(sam_read1(fp, header, read1) >= 0) { #ifdef DEBUG sam_write1(of, global_header, read1); #endif if(strcmp(bam_get_qname(read1), last_qname) == 0) { //Multimapper if(config.paired) { sam_read1(fp, header, read2); #ifdef DEBUG sam_write1(of, global_header, read2); #endif } continue; } else { if(read1->core.l_qname > max_qname) { max_qname = read1->core.l_qname + 10; last_qname = realloc(last_qname, sizeof(char) * max_qname); assert(last_qname); } strcpy(last_qname, bam_get_qname(read1)); } //Are paired-end reads in the wrong order? swapped = 0; if(config.paired) { if(read1->core.flag & BAM_FREAD2) { swapped = 1; sam_read1(fp, header, read2); packed_read = pack_read(read2, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else sam_write1(of, global_header, read2); if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); debug_read = unpack_read(debug_read, p); #endif } } //Send the read packed_read = pack_read(read1, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend(packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); #endif //Deal with paired-end reads if(config.paired && !swapped) { sam_read1(fp, header, read2); packed_read = pack_read(read2, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else sam_write1(of, global_header, read2); if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); debug_read = unpack_read(debug_read, p); #endif } #ifndef DEBUG i++; #endif } t1 = time(NULL); if(!config.quiet) fprintf(stderr, "Node %i finished sending reads @%s\t(%f sec elapsed)\n", thread_id, ctime(&t1), difftime(t1, t0)); fflush(stderr); //Notify the master node packed_read->size = 0; #ifndef DEBUG void *A = malloc(1); assert(A); MPI_Send(A, 1, MPI_BYTE, 0, 5, MPI_COMM_WORLD); free(A); #endif //Close things up bam_hdr_destroy(header); bam_destroy1(read1); bam_destroy1(read2); free(cmd); if(packed_read->packed != NULL) free(packed_read->packed); free(packed_read); if(packed_header->packed != NULL) free(packed_header->packed); free(packed_header); free(last_qname); sam_pclose(fp); //Remove the FIFO(s) unlink(fastq1); if(config.paired) unlink(fastq2); #ifdef DEBUG sam_close(of); bam_hdr_destroy(debug_header); bam_destroy1(debug_read); free(p); #endif if(!config.quiet) fprintf(stderr, "Exiting worker node %i\n", thread_id); fflush(stderr); };