Beispiel #1
0
    /// Opens the BAM file \p fn, sanity-checks it, and caches its header together
    /// with the bgzf_tell() position reached right after the header was read.
    ///
    /// \param fn  name of the file to open; also stored in filename_
    ///
    /// \throws std::runtime_error if the BGZF EOF check reports a missing block or
    ///         an error (unless that check is compiled out), or if the header
    ///         cannot be read
    BamFilePrivate(const std::string& fn)
        : filename_(fn)
        , firstAlignmentOffset_(-1)
    {
        // ensure we've updated htslib verbosity with requested verbosity here
        hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity);

        // attempt open
        auto f = RawOpen();

#if !defined (PBBAM_NO_CHECK_EOF) || PBBAM_AUTOVALIDATE
        // sanity check on file
        const int eofCheck = bgzf_check_EOF(f->fp.bgzf);
        if (eofCheck <= 0 ) {
            // 1:  EOF present & correct
            // 2:  not seekable (e.g. reading from stdin)
            // 0:  EOF absent
            // -1: some other error
            std::stringstream e;
            if (eofCheck == 0)
                e << fn << " : is missing EOF block" << std::endl;
            else
                e << fn << " : unknown error while checking EOF block" << std::endl;
            throw std::runtime_error(e.str());
        }
#endif

        // attempt fetch header; sam_hdr_read() returns a null pointer when the
        // header cannot be read, so fail here instead of passing it on
        std::unique_ptr<bam_hdr_t, internal::HtslibHeaderDeleter> hdr(sam_hdr_read(f.get()));
        if (!hdr) {
            std::stringstream e;
            e << fn << " : could not read header" << std::endl;
            throw std::runtime_error(e.str());
        }
        header_ = internal::BamHeaderMemory::FromRawData(hdr.get());

        // cache first alignment offset
        firstAlignmentOffset_ = bgzf_tell(f->fp.bgzf);
    }
Beispiel #2
0
bam_header_t *bam_header_read(bamFile fp)
{
	bam_header_t *header;
	char buf[4];
	int magic_len;
	int32_t i = 1, name_len;
	// check EOF
	i = bgzf_check_EOF(fp);
	if (i < 0) {
		// If the file is a pipe, checking the EOF marker will *always* fail
		// with ESPIPE.  Suppress the error message in this case.
		if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
	}
	else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
	// read "BAM1"
	magic_len = bam_read(fp, buf, 4);
	if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
		fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
		return 0;
	}
	header = bam_header_init();
	// read plain text and the number of reference sequences
	bam_read(fp, &header->l_text, 4);
	if (bam_is_be) bam_swap_endian_4p(&header->l_text);
	header->text = (char*)calloc(header->l_text + 1, 1);
	bam_read(fp, header->text, header->l_text);
	bam_read(fp, &header->n_targets, 4);
	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
	// read reference sequence names and lengths
	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
	for (i = 0; i != header->n_targets; ++i) {
		bam_read(fp, &name_len, 4);
		if (bam_is_be) bam_swap_endian_4p(&name_len);
		header->target_name[i] = (char*)calloc(name_len, 1);
		bam_read(fp, header->target_name[i], name_len);
		bam_read(fp, &header->target_len[i], 4);
		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
	}

	bam_init_header_hash(header);

	return header;
}
// Opens a BGZF stream for `filename` using `mode`.
//
// The filename "-" selects a standard stream: stdout when the mode starts with
// 'w'/'W', stdin when it starts with 'r'/'R'. Any other name is opened with
// bgzf_open(). When reading with ourRequireEofBlock set and bgzf_check_EOF()
// returns 0, a message is written to std::cerr and the file is closed again.
// Otherwise the position reported by bgzf_tell() is recorded in myStartPos.
BgzfFileType::BgzfFileType(const char * filename, const char * mode)
{
    const bool openForWrite = (mode[0] == 'w') || (mode[0] == 'W');
    const bool openForRead = (mode[0] == 'r') || (mode[0] == 'R');

    if((openForWrite || openForRead) && (strcmp(filename, "-") == 0))
    {
        // "-" stands for stdout when writing and for stdin when reading.
        bgzfHandle = bgzf_fdopen(fileno(openForWrite ? stdout : stdin), mode);
    }
    else
    {
        bgzfHandle = bgzf_open(filename, mode);
    }

    myStartPos = 0;
    if (bgzfHandle != NULL)
    {
        // The EOF block is only looked at for reads, and only when it is
        // required; the check itself runs last thanks to short-circuiting.
        const bool eofBlockMissing = openForRead && ourRequireEofBlock &&
                (bgzf_check_EOF(bgzfHandle) == 0);
        if (!eofBlockMissing)
        {
            // The file opened fine, so remember where it starts.
            myStartPos = bgzf_tell(bgzfHandle);
        }
        else
        {
            std::cerr << "BGZF EOF marker is missing in " << filename << std::endl;
            // The required block is absent, so give the file up again.
            close();
        }
    }

    myEOF = false;
}
Beispiel #4
0
 /// Runs bgzf_check_EOF() on the BGZF stream of an already opened file.
 ///
 /// \param f  file handle; both it and its f->fp.bgzf member are asserted
 ///           to be non-null
 /// \returns the value reported by bgzf_check_EOF(), passed through unchanged
 int RawEOFCheck(const std::unique_ptr<samFile, internal::HtslibFileDeleter>& f) const
 {
     assert(f != nullptr);
     const auto bgzfStream = f->fp.bgzf;
     assert(bgzfStream);
     return bgzf_check_EOF(bgzfStream);
 }
Beispiel #5
0
int main_quickcheck(int argc, char** argv)
{
    int verbose = 0;
    hts_verbose = 0;

    const char* optstring = "v";
    int opt;
    while ((opt = getopt(argc, argv, optstring)) != -1) {
        switch (opt) {
        case 'v':
            verbose++;
            break;
        default:
            usage_quickcheck(pysamerr);
            return 1;
        }
    }

    argc -= optind;
    argv += optind;

    if (argc < 1) {
        usage_quickcheck(stdout);
        return 1;
    }

    if (verbose >= 2) {
        fprintf(pysamerr, "verbosity set to %d\n", verbose);
    }

    if (verbose >= 4) {
        hts_verbose = 3;
    }

    int ret = 0;
    int i;

    for (i = 0; i < argc; i++) {
        char* fn = argv[i];
        int file_state = 0;

        if (verbose >= 3) fprintf(pysamerr, "checking %s\n", fn);

        // attempt to open
        htsFile *hts_fp = hts_open(fn, "r");
        if (hts_fp == NULL) {
            if (verbose >= 2) fprintf(pysamerr, "%s could not be opened for reading\n", fn);
            file_state |= 2;
        }
        else {
            if (verbose >= 3) fprintf(pysamerr, "opened %s\n", fn);
            // make sure we have sequence data
            const htsFormat *fmt = hts_get_format(hts_fp);
            if (fmt->category != sequence_data ) {
                if (verbose >= 2) fprintf(pysamerr, "%s was not identified as sequence data\n", fn);
                file_state |= 4;
            }
            else {
                if (verbose >= 3) fprintf(pysamerr, "%s is sequence data\n", fn);
                // check header
                bam_hdr_t *header = sam_hdr_read(hts_fp);
                if (header->n_targets <= 0) {
                    if (verbose >= 2) fprintf(pysamerr, "%s had no targets in header\n", fn);
                    file_state |= 8;
                }
                else {
                    if (verbose >= 3) fprintf(pysamerr, "%s has %d targets in header\n", fn, header->n_targets);
                }

                // only check EOF on BAM for now
                // TODO implement and use hts_check_EOF() to include CRAM support
                if (fmt->format == bam) {
                    if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) {
                        if (verbose >= 2) fprintf(pysamerr, "%s was missing EOF block\n", fn);
                        file_state |= 16;
                    }
                    else {
                        if (verbose >= 3) fprintf(pysamerr, "%s has good EOF block\n", fn);
                    }
                }
            }

            hts_close(hts_fp);
        }

        if (file_state > 0 && verbose >= 1) {
            fprintf(stdout, "%s\n", fn);
        }
        ret |= file_state;
    }

    return ret;
}