BamFilePrivate(const std::string& fn) : filename_(fn) , firstAlignmentOffset_(-1) { // ensure we've updated htslib verbosity with requested verbosity here hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity); // attempt open auto f = RawOpen(); #if !defined (PBBAM_NO_CHECK_EOF) || PBBAM_AUTOVALIDATE // sanity check on file const int eofCheck = bgzf_check_EOF(f->fp.bgzf); if (eofCheck <= 0 ) { // 1: EOF present & correct // 2: not seekable (e.g. reading from stdin) // 0: EOF absent // -1: some other error std::stringstream e; if (eofCheck == 0) e << fn << " : is missing EOF block" << std::endl; else e << fn << " : unknown error while checking EOF block" << std::endl; throw std::runtime_error(e.str()); } #endif // attempt fetch header std::unique_ptr<bam_hdr_t, internal::HtslibHeaderDeleter> hdr(sam_hdr_read(f.get())); header_ = internal::BamHeaderMemory::FromRawData(hdr.get()); // cache first alignment offset firstAlignmentOffset_ = bgzf_tell(f->fp.bgzf); }
/*
 * Helper for bam_header_read(): reports why parsing stopped, releases the
 * partially built header, and yields the null result the caller returns.
 */
static bam_header_t *bam_header_read_abort(bam_header_t *header, const char *reason)
{
    fprintf(stderr, "[bam_header_read] %s\n", reason);
    bam_header_destroy(header);
    return 0;
}

/*
 * Reads a binary BAM header from fp.
 *
 * A missing or uncheckable EOF marker only produces a diagnostic on stderr;
 * parsing continues regardless.
 *
 * Returns the newly created header, or 0 when the magic string is wrong, the
 * stream ends before the header is complete, a length field is negative, or
 * memory cannot be allocated. No header is leaked on the failure paths.
 */
bam_header_t *bam_header_read(bamFile fp)
{
    bam_header_t *header;
    char buf[4];
    int eof_state;
    int32_t i, name_len = 0;

    // check EOF
    eof_state = bgzf_check_EOF(fp);
    if (eof_state < 0) {
        // If the file is a pipe, checking the EOF marker will *always* fail
        // with ESPIPE.  Suppress the error message in this case.
        if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
    } else if (eof_state == 0) {
        fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
    }

    // read "BAM1"
    if (bam_read(fp, buf, 4) != 4 || strncmp(buf, "BAM\001", 4) != 0) {
        fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
        return 0;
    }

    header = bam_header_init();
    if (header == 0) {
        fprintf(stderr, "[bam_header_read] failed to allocate header.\n");
        return 0;
    }

    // read plain text
    if (bam_read(fp, &header->l_text, 4) != 4)
        return bam_header_read_abort(header, "truncated header (text length).");
    if (bam_is_be) bam_swap_endian_4p(&header->l_text);
    // size_t arithmetic so that "+ 1" cannot wrap around
    header->text = (char*)calloc((size_t)header->l_text + 1, 1);
    if (header->text == 0)
        return bam_header_read_abort(header, "failed to allocate header text.");
    if ((int64_t)bam_read(fp, header->text, header->l_text) != (int64_t)header->l_text)
        return bam_header_read_abort(header, "truncated header (text).");

    // read the number of reference sequences
    if (bam_read(fp, &header->n_targets, 4) != 4) {
        header->n_targets = 0; // keep the count consistent with the (absent) arrays
        return bam_header_read_abort(header, "truncated header (reference count).");
    }
    if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
    if (header->n_targets < 0) {
        header->n_targets = 0;
        return bam_header_read_abort(header, "invalid reference count.");
    }

    // read reference sequence names and lengths
    header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
    header->target_len = (uint32_t*)calloc(header->n_targets, 4);
    if (header->n_targets > 0 && (header->target_name == 0 || header->target_len == 0)) {
        // release whichever array was obtained and leave a consistent, empty table
        free(header->target_name);
        free(header->target_len);
        header->target_name = 0;
        header->target_len = 0;
        header->n_targets = 0;
        return bam_header_read_abort(header, "failed to allocate reference tables.");
    }
    for (i = 0; i != header->n_targets; ++i) {
        if (bam_read(fp, &name_len, 4) != 4)
            return bam_header_read_abort(header, "truncated header (reference name length).");
        if (bam_is_be) bam_swap_endian_4p(&name_len);
        if (name_len < 0)
            return bam_header_read_abort(header, "invalid reference name length.");
        // one spare zeroed byte guarantees the name is NUL-terminated
        header->target_name[i] = (char*)calloc((size_t)name_len + 1, 1);
        if (header->target_name[i] == 0)
            return bam_header_read_abort(header, "failed to allocate reference name.");
        if ((int64_t)bam_read(fp, header->target_name[i], name_len) != (int64_t)name_len)
            return bam_header_read_abort(header, "truncated header (reference name).");
        if (bam_read(fp, &header->target_len[i], 4) != 4)
            return bam_header_read_abort(header, "truncated header (reference length).");
        if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
    }

    bam_init_header_hash(header);
    return header;
}
// Opens a BGZF stream for `filename` using `mode`.
//
// A filename of "-" selects stdout when the mode starts with 'w'/'W' and stdin
// when it starts with 'r'/'R'; every other combination goes through
// bgzf_open().  When a file opened for reading is required to carry an EOF
// block (ourRequireEofBlock) and bgzf_check_EOF() reports 0, a message is
// written to std::cerr and close() is called.  Otherwise the current
// position is recorded in myStartPos.
BgzfFileType::BgzfFileType(const char * filename, const char * mode)
{
    const bool forWrite = (mode[0] == 'w') || (mode[0] == 'W');
    const bool forRead = (mode[0] == 'r') || (mode[0] == 'R');
    const bool useStdStream = (strcmp(filename, "-") == 0);

    if (useStdStream && forWrite)
    {
        // "-" for writing means stdout.
        bgzfHandle = bgzf_fdopen(fileno(stdout), mode);
    }
    else if (useStdStream && forRead)
    {
        // "-" for reading means stdin.
        bgzfHandle = bgzf_fdopen(fileno(stdin), mode);
    }
    else
    {
        bgzfHandle = bgzf_open(filename, mode);
    }

    myStartPos = 0;

    if (bgzfHandle != NULL)
    {
        // The EOF block is only inspected for read mode, and only when it
        // has been declared mandatory.
        const bool eofBlockMissing =
            forRead && ourRequireEofBlock && (bgzf_check_EOF(bgzfHandle) == 0);

        if (!eofBlockMissing)
        {
            // Nothing to complain about, so remember where the data starts.
            myStartPos = bgzf_tell(bgzfHandle);
        }
        else
        {
            std::cerr << "BGZF EOF marker is missing in " << filename << std::endl;
            // The mandatory block is absent, so give the file up again.
            close();
        }
    }

    myEOF = false;
}
/// Runs htslib's BGZF EOF check on an already opened file.
///
/// \param f  open file handle; it and its BGZF pointer are asserted non-null
/// \returns  the value of bgzf_check_EOF() for the handle, unmodified
int RawEOFCheck(const std::unique_ptr<samFile, internal::HtslibFileDeleter>& f) const
{
    assert(f);

    auto* const bgzfHandle = f->fp.bgzf;
    assert(bgzfHandle);

    return bgzf_check_EOF(bgzfHandle);
}
int main_quickcheck(int argc, char** argv) { int verbose = 0; hts_verbose = 0; const char* optstring = "v"; int opt; while ((opt = getopt(argc, argv, optstring)) != -1) { switch (opt) { case 'v': verbose++; break; default: usage_quickcheck(pysamerr); return 1; } } argc -= optind; argv += optind; if (argc < 1) { usage_quickcheck(stdout); return 1; } if (verbose >= 2) { fprintf(pysamerr, "verbosity set to %d\n", verbose); } if (verbose >= 4) { hts_verbose = 3; } int ret = 0; int i; for (i = 0; i < argc; i++) { char* fn = argv[i]; int file_state = 0; if (verbose >= 3) fprintf(pysamerr, "checking %s\n", fn); // attempt to open htsFile *hts_fp = hts_open(fn, "r"); if (hts_fp == NULL) { if (verbose >= 2) fprintf(pysamerr, "%s could not be opened for reading\n", fn); file_state |= 2; } else { if (verbose >= 3) fprintf(pysamerr, "opened %s\n", fn); // make sure we have sequence data const htsFormat *fmt = hts_get_format(hts_fp); if (fmt->category != sequence_data ) { if (verbose >= 2) fprintf(pysamerr, "%s was not identified as sequence data\n", fn); file_state |= 4; } else { if (verbose >= 3) fprintf(pysamerr, "%s is sequence data\n", fn); // check header bam_hdr_t *header = sam_hdr_read(hts_fp); if (header->n_targets <= 0) { if (verbose >= 2) fprintf(pysamerr, "%s had no targets in header\n", fn); file_state |= 8; } else { if (verbose >= 3) fprintf(pysamerr, "%s has %d targets in header\n", fn, header->n_targets); } // only check EOF on BAM for now // TODO implement and use hts_check_EOF() to include CRAM support if (fmt->format == bam) { if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) { if (verbose >= 2) fprintf(pysamerr, "%s was missing EOF block\n", fn); file_state |= 16; } else { if (verbose >= 3) fprintf(pysamerr, "%s has good EOF block\n", fn); } } } hts_close(hts_fp); } if (file_state > 0 && verbose >= 1) { fprintf(stdout, "%s\n", fn); } ret |= file_state; } return ret; }