예제 #1
0
파일: pair_reads.cpp 프로젝트: LTLA/csaw
SEXP get_leftovers (SEXP bam, SEXP index, SEXP processed) try { 
    BamFile bf(bam, index);
    BamRead br;

    if (!isString(processed)) { throw std::runtime_error("names of processed chromosomes should be strings"); }
    const int nchr=LENGTH(processed);
    std::set<std::string> already_there;
    for (int i=0; i<nchr; ++i) {
        already_there.insert(std::string(CHAR(STRING_ELT(processed, i))));        
    }

    // Getting the reads mapped to chromosomes we didn't look at due to 'restrict'.
    int leftovers=0;
    std::set<std::string>::iterator iat;
    for (int cid=0; cid<bf.header->n_targets; ++cid) {
        iat=already_there.find(std::string(bf.header->target_name[cid]));
        if (iat!=already_there.end()) { continue; }
        BamIterator biter(bf, cid, 0, bf.header->target_len[cid]);
        while (bam_itr_next(bf.in, biter.iter, br.read) >= 0){ ++leftovers; }
    } 
    
    // Also getting the unmapped guys. 
    BamIterator biter(bf);
    while (bam_itr_next(bf.in, biter.iter, br.read) >= 0){ ++leftovers; }
    return(ScalarInteger(leftovers));
} catch (std::exception &e) {
    return mkString(e.what());
}
예제 #2
0
/**
 * Reads next record, hides the random access of different regions from the user.
 */
bool BAMOrderedReader::read(bam1_t *s)
{
    if (random_access_enabled)
    {
        while(true)
        {
            if (itr && bam_itr_next(sam, itr, s)>=0)
            {
                return true;
            }
            else if (!initialize_next_interval())
            {
                return false;
            }
        }
    }
    else
    {
        if (bam_read1(sam->fp.bgzf, s)>=0)
        {
            //todo: filter via interval tree
            //if found in tree, return true else false
            return true;
        }
        else
        {
            return false;
        }
    }

    return false;
};
예제 #3
0
파일: pair_reads.cpp 프로젝트: LTLA/csaw
SEXP extract_pair_data(SEXP bam, SEXP index, SEXP chr, SEXP start, SEXP end, SEXP mapq, SEXP dedup, SEXP diagnostics) try {
    // Checking input values.
    if (!isInteger(mapq) || LENGTH(mapq)!=1) {
        throw std::runtime_error("mapping quality should be an integer scalar");
    }    
    const int minqual=asInteger(mapq);

    if (!isLogical(dedup) || LENGTH(dedup)!=1) {
        throw std::runtime_error("duplicate removal should be a logical scalar"); 
    }
    const bool rmdup=asLogical(dedup);

    if (!isLogical(diagnostics) || LENGTH(diagnostics)!=1) { 
        throw std::runtime_error("diagnostics specification should be a logical scalar"); 
    }
    const bool getnames=asLogical(diagnostics);

    // Initializing odds and ends.
    BamFile bf(bam, index);
    BamRead br;
    BamIterator biter(bf, chr, start, end);
    OutputContainer oc(getnames);
        
    typedef std::map<std::pair<int, std::string>, AlignData> Holder;
    std::deque<Holder> all_holders(4); // four holders, one for each strand/first combination; cut down searches.
    std::pair<int, std::string> current;
    Holder::iterator ith;
    int curpos, mate_pos;
    AlignData algn_data;
    bool am_mapped, is_first;

    bool mate_is_in;
    std::set<std::string> identical_pos;
    std::set<std::string>::iterator itip;
    int last_identipos=-1;

    while (bam_itr_next(bf.in, biter.iter, br.read) >= 0){
        ++oc.totals;
        curpos = (br.read->core).pos + 1; // Getting 1-indexed position.
        br.extract_data(algn_data);
        am_mapped=br.is_well_mapped(minqual, rmdup);

        /* Reasons to not add a read: */
       
//        // If we can see that it is obviously unmapped (IMPOSSIBLE for a sorted file).
//        if (((br.read -> core).flag & BAM_FUNMAP)!=0) { 
//            // We don't filter by additional mapping criteria, as we need to search 'holder' to pop out the partner and to store diagnostics.
//            continue;
//        } 
        
        // If it's a singleton.
        if (((br.read -> core).flag & BAM_FPAIRED)==0) {
            if (am_mapped) { oc.add_single(curpos, algn_data); }
            continue;
        }

        // Or, if we can see that its partner is obviously unmapped.
        if (((br.read -> core).flag & BAM_FMUNMAP)!=0) {
            if (am_mapped) { oc.add_onemapped(curpos, algn_data); }
            continue;
        }

        // Or if it's inter-chromosomal.
        is_first=(((br.read->core).flag & BAM_FREAD1)!=0);
        if (is_first==(((br.read->core).flag & BAM_FREAD2)!=0)) { 
            std::stringstream err;
            err << "read '" << bam_get_qname(br.read) << "' must be either first or second in the pair";
            throw std::runtime_error(err.str()); 
        }
      
        if ((br.read -> core).mtid!=(br.read -> core).tid) { 
            if (am_mapped) { oc.add_interchr(curpos, algn_data, bam_get_qname(br.read), is_first); } 
            continue;
        }

        /* Checking the map and adding it if it doesn't exist. */
        
        current.second.assign(bam_get_qname(br.read));
        mate_pos = (br.read -> core).mpos + 1; // 1-indexed position, again.
        mate_is_in=false;
        if (mate_pos < curpos) {
            mate_is_in=true;
        } else if (mate_pos == curpos) {
            // Identical mpos to curpos needs careful handling to figure out whether we've already seen it.
            if (curpos!=last_identipos) { 
                identical_pos.clear();
                last_identipos=curpos;
            }
            itip=identical_pos.lower_bound(current.second);
            if (itip!=identical_pos.end() && !(identical_pos.key_comp()(current.second, *itip))) {
                mate_is_in=true;
                identical_pos.erase(itip);
            } else {
                identical_pos.insert(itip, current.second);
            }
        }

        if (mate_is_in) {
            current.first = mate_pos;
            Holder& holder=all_holders[int(!is_first) + 2*int(bam_is_mrev(br.read))];
            ith=holder.find(current);

            if (ith != holder.end()) { 
                if (!am_mapped) {
                    // Searching to pop out the mate, to reduce the size of 'holder' for the remaining searches (and to store diagnostics).
                    oc.add_onemapped((ith->first).first, ith->second);
                    holder.erase(ith);
                    continue;
                }

                oc.add_genuine(curpos, algn_data, (ith->first).first, ith->second, is_first);
                holder.erase(ith);
            } else if (am_mapped) {
                // Only possible if the mate didn't get added because 'am_mapped' was false.
                oc.add_onemapped(curpos, algn_data);
            }
        } else if (am_mapped) {
            current.first = curpos;
            Holder& holder=all_holders[int(is_first) + 2*int(algn_data.is_reverse)];
            holder[current] = algn_data;
        }
    }

    // Leftovers treated as one_unmapped; marked as paired, but the mate is not in file.
    for (size_t h=0; h<all_holders.size(); ++h) { 
        Holder& holder=all_holders[h];
        for (ith=holder.begin(); ith!=holder.end(); ++ith) { 
            oc.add_onemapped((ith->first).first, ith->second);
        }
        holder.clear();
    }    

    // Storing all output.
    SEXP output=PROTECT(allocVector(VECSXP, getnames ? 9 : 2));
    try {
        SET_VECTOR_ELT(output, 0, allocVector(VECSXP, 2));
        SEXP left=VECTOR_ELT(output, 0);
        store_int_output(left, 0, oc.forward_pos_out);
        store_int_output(left, 1, oc.forward_len_out);
        
        SET_VECTOR_ELT(output, 1, allocVector(VECSXP, 2));
        SEXP right=VECTOR_ELT(output, 1);
        store_int_output(right, 0, oc.reverse_pos_out);
        store_int_output(right, 1, oc.reverse_len_out);
    
        if (getnames) {
            SET_VECTOR_ELT(output, 2, ScalarInteger(oc.totals));
            
            SET_VECTOR_ELT(output, 3, allocVector(VECSXP, 2));
            SEXP singles=VECTOR_ELT(output, 3);
            store_int_output(singles, 0, oc.single_pos);
            store_int_output(singles, 1, oc.single_len);

            SET_VECTOR_ELT(output, 4, allocVector(VECSXP, 2));
            SEXP first=VECTOR_ELT(output, 4);
            store_int_output(first, 0, oc.ufirst_pos);
            store_int_output(first, 1, oc.ufirst_len);
            
            SET_VECTOR_ELT(output, 5, allocVector(VECSXP, 2));
            SEXP second=VECTOR_ELT(output, 5);
            store_int_output(second, 0, oc.usecond_pos);
            store_int_output(second, 1, oc.usecond_len);

            SET_VECTOR_ELT(output, 6, allocVector(VECSXP, 2));
            SEXP onemap=VECTOR_ELT(output, 6);
            store_int_output(onemap, 0, oc.onemap_pos);
            store_int_output(onemap, 1, oc.onemap_len);

            SET_VECTOR_ELT(output, 7, allocVector(VECSXP, 3));
            SEXP interchr1=VECTOR_ELT(output, 7);
            store_int_output(interchr1, 0, oc.ifirst_pos);
            store_int_output(interchr1, 1, oc.ifirst_len);
            store_names(interchr1, 2, oc.interchr_names_1);

            SET_VECTOR_ELT(output, 8, allocVector(VECSXP, 3));
            SEXP interchr2=VECTOR_ELT(output, 8);
            store_int_output(interchr2, 0, oc.isecond_pos);
            store_int_output(interchr2, 1, oc.isecond_len);
            store_names(interchr2, 2, oc.interchr_names_2);
        }
    } catch (std::exception &e) {
        UNPROTECT(1);
        throw;
    }

    UNPROTECT(1);
    return output;
} catch (std::exception &e) {
    return mkString(e.what());
}
예제 #4
0
파일: samview.c 프로젝트: SamStudio8/htslib
int main_samview(int argc, char *argv[])
{
	samFile *in;
	char *fn_ref = 0;
	int flag = 0, c, clevel = -1, ignore_sam_err = 0;
	char moder[8];
	bam_hdr_t *h;
	bam1_t *b;

	while ((c = getopt(argc, argv, "IbSl:t:")) >= 0) {
		switch (c) {
		case 'S': flag |= 1; break;
		case 'b': flag |= 2; break;
		case 'l': clevel = atoi(optarg); flag |= 2; break;
		case 't': fn_ref = optarg; break;
		case 'I': ignore_sam_err = 1; break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "Usage: samview [-bSI] [-l level] <in.bam>|<in.sam> [region]\n");
		return 1;
	}
	strcpy(moder, "r");
	if ((flag&1) == 0) strcat(moder, "b");

	in = sam_open(argv[optind], moder, fn_ref);
	h = sam_hdr_read(in);
	h->ignore_sam_err = ignore_sam_err;
	b = bam_init1();

	if ((flag&4) == 0) { // SAM/BAM output
		htsFile *out;
		char modew[8];
		strcpy(modew, "w");
		if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
		if (flag&2) strcat(modew, "b");
		out = hts_open("-", modew, 0);
		sam_hdr_write(out, h);
		if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region
			int i;
			hts_idx_t *idx;
			if ((idx = bam_index_load(argv[optind])) == 0) {
				fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
				return 1;
			}
			for (i = optind + 1; i < argc; ++i) {
				hts_itr_t *iter;
				if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) {
					fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
					continue;
				}
				while (bam_itr_next((BGZF*)in->fp, iter, b) >= 0) sam_write1(out, h, b);
				hts_itr_destroy(iter);
			}
			hts_idx_destroy(idx);
		} else while (sam_read1(in, h, b) >= 0) sam_write1(out, h, b);
		sam_close(out);
	}

	bam_destroy1(b);
	bam_hdr_destroy(h);
	sam_close(in);
	return 0;
}
예제 #5
0
파일: test_view.c 프로젝트: pkrusche/vt
int main(int argc, char *argv[])
{
	samFile *in;
	char *fn_ref = 0;
	int flag = 0, c, clevel = -1, ignore_sam_err = 0;
	char moder[8];
	bam_hdr_t *h;
	bam1_t *b;
	htsFile *out;
	char modew[8];
	int r = 0, exit_code = 0;

	while ((c = getopt(argc, argv, "IbDCSl:t:")) >= 0) {
		switch (c) {
		case 'S': flag |= 1; break;
		case 'b': flag |= 2; break;
		case 'D': flag |= 4; break;
		case 'C': flag |= 8; break;
		case 'l': clevel = atoi(optarg); flag |= 2; break;
		case 't': fn_ref = optarg; break;
		case 'I': ignore_sam_err = 1; break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "Usage: samview [-bSCSI] [-l level] <in.bam>|<in.sam>|<in.cram> [region]\n");
		return 1;
	}
	strcpy(moder, "r");
	if (flag&4) strcat(moder, "c");
	else if ((flag&1) == 0) strcat(moder, "b");

	in = sam_open(argv[optind], moder);
	h = sam_hdr_read(in);
	h->ignore_sam_err = ignore_sam_err;
	b = bam_init1();

	strcpy(modew, "w");
	if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
	if (flag&8) strcat(modew, "c");
	else if (flag&2) strcat(modew, "b");
	out = hts_open("-", modew);

	/* CRAM output */
	if (flag & 8) {
	    // Parse input header and use for CRAM output
	    out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text);

	    // Create CRAM references arrays
	    if (fn_ref)
		cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref);
	    else
		// Attempt to fill out a cram->refs[] array from @SQ headers
		cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL);
	}

	sam_hdr_write(out, h);
	if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region
	    int i;
	    hts_idx_t *idx;
	    if ((idx = bam_index_load(argv[optind])) == 0) {
		fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
		return 1;
	    }
	    for (i = optind + 1; i < argc; ++i) {
		hts_itr_t *iter;
		if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) {
		    fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
		    continue;
		}
		while ((r = bam_itr_next(in, iter, b)) >= 0) {
		    if (sam_write1(out, h, b) < 0) {
			fprintf(stderr, "Error writing output.\n");
			exit_code = 1;
			break;
		    }
		}
		hts_itr_destroy(iter);
	    }
	    hts_idx_destroy(idx);
	} else while ((r = sam_read1(in, h, b)) >= 0) {
		if (sam_write1(out, h, b) < 0) {
			fprintf(stderr, "Error writing output.\n");
			exit_code = 1;
			break;
		}
	}
	sam_close(out);

	if (r < -1) {
	    fprintf(stderr, "Error parsing input.\n");
	    exit_code = 1;
	}

	bam_destroy1(b);
	bam_hdr_destroy(h);
	sam_close(in);
	return exit_code;
}
예제 #6
0
int calcCoverage(char *fName, Slice *slice, htsFile *in, hts_idx_t *idx, int flags) {
  int  ref;
  int  begRange;
  int  endRange;
  char region[1024];
  char region_name[512];


  if (Slice_getChrStart(slice) != 1) {
    fprintf(stderr, "Currently only allow a slice start position of 1\n");
    return 1;
  }
  if (flags & M_UCSC_NAMING) {
    sprintf(region,"chr%s", Slice_getSeqRegionName(slice));
  } else {
    sprintf(region,"%s", Slice_getSeqRegionName(slice));
  }
  bam_hdr_t *header = bam_hdr_init();
  header = bam_hdr_read(in->fp.bgzf);
  ref = bam_name2id(header, region);
  if (ref < 0) {
    fprintf(stderr, "Invalid region %s\n", region);
    exit(1);
  }
  sprintf(region,"%s:%ld-%ld", region_name,
                             Slice_getSeqRegionStart(slice),
                             Slice_getSeqRegionEnd(slice));
  if (hts_parse_reg(region, &begRange, &endRange) == NULL) {
    fprintf(stderr, "Could not parse %s\n", region);
    exit(2);
  }
  bam_hdr_destroy(header);


  hts_itr_t *iter = sam_itr_queryi(idx, ref, begRange, endRange);
  bam1_t *b = bam_init1();

  Coverage *coverage = calloc(Slice_getLength(slice),sizeof(Coverage));

  long counter = 0;
  long overlapping = 0;
  long bad = 0;
  int startIndex = 0;
  while (bam_itr_next(in, iter, b) >= 0) {
    if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) {
      bad++;
      continue;
    }

    int end;
    //end = bam_calend(&b->core, bam1_cigar(b));
    end = bam_endpos(b);

    // There is a special case for reads which have zero length and start at begRange (so end at begRange ie. before the first base we're interested in).
    // That is the reason for the || end == begRange test
    if (end == begRange) {
      continue;
    }
    counter++;

    if (!(counter%1000000)) {
      if (verbosity > 1) { printf("."); }
      fflush(stdout);
    }

// Remember: b->core.pos is zero based!
    int cigInd;
    int refPos;
    int readPos;
    uint32_t *cigar = bam_get_cigar(b);
    for (cigInd = readPos = 0, refPos = b->core.pos; cigInd < b->core.n_cigar; ++cigInd) {
      int k;
      int lenCigBlock = cigar[cigInd]>>4;
      int op          = cigar[cigInd]&0xf;

      if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
        for (k = 0; k < lenCigBlock; ++k) {
          //if (ref[refPos+k] == 0) break; // out of boundary
          coverage[refPos+k].coverage++;
        }
        if (k < lenCigBlock) break;
        refPos += lenCigBlock; readPos += lenCigBlock;
      } else if (op == BAM_CDEL) {
        for (k = 0; k < lenCigBlock; ++k) {
        //  if (ref[refPos+k] == 0) break;
          coverage[refPos+k].coverage++;
        }
        if (k < lenCigBlock) break;
        refPos += lenCigBlock;
      } else if (op == BAM_CSOFT_CLIP) {
        readPos += lenCigBlock;
      } else if (op == BAM_CHARD_CLIP) {
      } else if (op == BAM_CINS) {
         readPos += lenCigBlock;
      } else if (op == BAM_CREF_SKIP) {
         refPos += lenCigBlock;
      }
    }

#ifdef DONE
    int j;
    int done = 0;
    int hadOverlap = 0;
    
    for (j=startIndex; j < Vector_getNumElement(genes) && !done; j++) {
      Gene *gene = Vector_getElementAt(genes,j); 
      if (!gene) {
        continue;
      }
// Remember: b->core.pos is zero based!
      if (b->core.pos < Gene_getEnd(gene) && end >= Gene_getStart(gene)) {
        int k;

        int doneGene = 0;
        for (k=0; k<Gene_getTranscriptCount(gene) && !doneGene; k++) {
          Transcript *trans = Gene_getTranscriptAt(gene,k);

          if (b->core.pos < Transcript_getEnd(trans) && end >= Transcript_getStart(trans)) {
            int m;
     
            for (m=0; m<Transcript_getExonCount(trans) && !doneGene; m++) {
              Exon *exon = Transcript_getExonAt(trans,m);

              if (b->core.pos < Exon_getEnd(exon) && end >= Exon_getStart(exon)) {

                // Only count as overlapping once (could be that a read overlaps more than one gene)
                if (!hadOverlap) {
                  overlapping++;
                  hadOverlap = 1;
                }

                gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
                gs->score++;
                
                doneGene = 1;
              }
            }
          }
        }
      } else if (Gene_getStart(gene) > end) {
        done = 1;
      } else if (Gene_getEnd(gene) < b->core.pos+1) {
        gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
        printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), 
                                          Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", 
                                          gs->score);

        if (verbosity > 1) { 
          printf("Removing gene %s (index %d) with extent %d to %d\n", 
                 Gene_getStableId(gene), 
                 gs->index,
                 Gene_getStart(gene),
                 Gene_getEnd(gene));
        }
        Vector_setElementAt(genes,j,NULL);

        // Magic (very important for speed) - move startIndex to first non null gene
        int n;
        startIndex = 0;
        for (n=0;n<Vector_getNumElement(genes);n++) {
          void *v = Vector_getElementAt(genes,n);

          if (v != NULL) {
            break;
          }
          startIndex++;
        }
        if (verbosity > 1) { 
          printf("startIndex now %d\n",startIndex);
        }
      }
    }
#endif
  }
  if (verbosity > 1) { printf("\n"); }

#ifdef DONE
// Print out read counts for what ever's left in the genes array
  int n;
  for (n=0;n<Vector_getNumElement(genes);n++) {
    Gene *gene = Vector_getElementAt(genes,n);

    if (gene != NULL) {
      gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
      printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), 
                                        Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", 
                                        gs->score);
    }

  }
#endif

  printf("Read %ld reads. Number of bad reads (unmapped, qc fail, secondary, dup) %ld\n", counter, bad);

  long i;
  for (i=0; i< Slice_getLength(slice); i++) {
    printf("%ld %ld\n", i+1, coverage[i].coverage);
  }

  sam_itr_destroy(iter);
  bam_destroy1(b);


  return 1;
}