Exemplo n.º 1
0
/**
 * Constructor.
 *
 * Opens the file, checks that it is a VCF or BCF file, reads the header and
 * tries to load an index (BCF index for BCF, tabix index for bgzf compressed
 * VCF). The process exits with status 1 if the file cannot be opened, is not
 * VCF/BCF, or its header cannot be read.
 *
 * A missing index is not fatal: if intervals were requested but no index is
 * available, a message is printed and random access stays disabled.
 *
 * @file_name - file to read, "+" is translated to "-" before opening
 * @intervals - intervals of interest, random access is enabled only if this
 *              is not empty and an index was loaded
 */
BCFOrderedReader::BCFOrderedReader(std::string file_name, std::vector<GenomeInterval>& intervals)
{
    this->file_name = (file_name=="+")? "-" : file_name;
    file = NULL;
    hdr = NULL;
    idx = NULL;
    tbx = NULL;
    itr = NULL;

    this->intervals = intervals;
    interval_index = 0;
    index_loaded = false;

    file = hts_open(this->file_name.c_str(), "r");
    if (!file)
    {
        fprintf(stderr, "[%s:%d %s] Cannot open %s\n", __FILE__, __LINE__, __FUNCTION__, file_name.c_str());
        exit(1);
    }
    ftype = file->format;

    if (ftype.format!=vcf && ftype.format!=bcf)
    {
        fprintf(stderr, "[%s:%d %s] Not a VCF/BCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, file_name.c_str());
        exit(1);
    }

    s = {0, 0, 0};

    hdr = bcf_alt_hdr_read(file);
    if (!hdr)
    {
        fprintf(stderr, "[%s:%d %s] Cannot read header for %s\n", __FILE__, __LINE__, __FUNCTION__, file_name.c_str());
        exit(1);
    }

    intervals_present = intervals.size()!=0;

    //the index is looked up under the same (normalised) name that was opened
    if (ftype.format==bcf)
    {
        idx = bcf_index_load(this->file_name.c_str());
        index_loaded = (idx!=NULL);

        if (!index_loaded && intervals_present)
        {
            //deliberately not fatal, reading falls back to sequential access
            fprintf(stderr, "[E:%s] index cannot be loaded for %s for random access, ignoring specified intervals and reading from start.\n", __FUNCTION__, file_name.c_str());
        }
    }
    else if (ftype.format==vcf)
    {
        if (ftype.compression==bgzf)
        {
            tbx = tbx_index_load(this->file_name.c_str());
            index_loaded = (tbx!=NULL);

            if (!index_loaded && intervals_present)
            {
                //deliberately not fatal, reading falls back to sequential access
                fprintf(stderr, "[E:%s] index cannot be loaded for %s for random access, ignoring specified intervals and reading from start.\n", __FUNCTION__, file_name.c_str());
            }
        }
        else if (intervals_present)
        {
            //no index is attempted for a VCF that is not bgzf compressed
            fprintf(stderr, "[E:%s] no random access support for VCF file: %s\n", __FUNCTION__, file_name.c_str());
        }
    }

    random_access_enabled = intervals_present && index_loaded;
}
Exemplo n.º 2
0
/**
 * Constructor.
 *
 * For every file: determines the file type, opens it, reads its header and
 * loads its index. The process exits with status 1 if any of these steps
 * fails. The first file may be VCF, BCF or STDIN; every other file must be
 * VCF_GZ or BCF.
 *
 * @vcf_files   - files to read
 * @intervals   - if empty, will add the contigs found in the header files
 * @sync_by_pos - copied to the member of the same name
 */
BCFSyncedStreamReader::BCFSyncedStreamReader(std::vector<std::string>& vcf_files, std::vector<GenomeInterval>& intervals, bool sync_by_pos)
:vcf_files(vcf_files), intervals(intervals), sync_by_pos(sync_by_pos)
{
    nfiles = vcf_files.size();
    vcfs.resize(nfiles, 0);
    hdrs.resize(nfiles, 0);
    idxs.resize(nfiles, 0);
    tbxs.resize(nfiles, 0);
    itrs.resize(nfiles, 0);
    ftypes.resize(nfiles, -1);

    current_interval = "";
    current_pos1 = 0;

    buffer.resize(nfiles);
    s = {0, 0, 0};

    exists_selected_intervals = (intervals.size()!=0);
    for (uint32_t i=0; i<intervals.size(); ++i)
    {
        intervals_map[intervals[i].to_string()] = i;
    }
    intervals_index = 0;

    //1. check file type validity
    //2. loads indices
    //3. adds sequences found in all indexed files, this allows us to iterate through all sequences.
    for (int32_t i = 0; i<nfiles; ++i)
    {
        ftypes[i] = hts_file_type(vcf_files[i].c_str());
        vcfs[i] = bcf_open(vcf_files[i].c_str(), "r");
        if (vcfs[i]==NULL)
        {
            fprintf(stderr, "[E:%s:%d %s] Cannot open %s\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[i].c_str());
            exit(1);
        }

        //the type is checked before the header is read so that a file of the
        //wrong type is reported as such rather than as an unreadable header
        if (i==0)
        {
            if (!(ftypes[i] & (FT_VCF|FT_BCF|FT_STDIN)))
            {
                fprintf(stderr, "[E:%s:%d %s] %s not a VCF or BCF file\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[i].c_str());
                exit(1);
            }
        }
        else
        {
            if (!(ftypes[i] & (FT_VCF_GZ|FT_BCF_GZ)))
            {
                fprintf(stderr, "[E:%s:%d %s] %s not a VCF_GZ or BCF file\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[i].c_str());
                exit(1);
            }
        }

        hdrs[i] = bcf_alt_hdr_read(vcfs[i]);
        if (hdrs[i]==NULL)
        {
            fprintf(stderr, "[E:%s:%d %s] header cannot be read for %s\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[i].c_str());
            exit(1);
        }

        //an index is required for every file, including the first one
        if (!load_index(i))
        {
            fprintf(stderr, "[E:%s:%d %s] index cannot be loaded for %s\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[i].c_str());
            exit(1);
        }

        if (!exists_selected_intervals)
        {
            //add sequences from file i
            add_interval(i);
        }
    }
}
Exemplo n.º 3
0
/**
 * Constructor.
 *
 * For every file: opens it, checks that it is a VCF or BCF file and reads its
 * header. If intervals are specified, the index of every file is loaded too.
 * The contigs in the header of every file after the first must match those of
 * the first file in number, name and order. The process exits with status 1
 * if any of these steps fails or if more than one file refers to STDIN.
 *
 * @file_names  - files to read, "+" and "-" both denote STDIN; a "+" entry is
 *                rewritten to "-" in the vector passed in
 * @intervals   - intervals of interest, random access is used if not empty
 * @sync_by_pos - copied to the member of the same name
 */
BCFSyncedReader::BCFSyncedReader(std::vector<std::string>& file_names, std::vector<GenomeInterval>& intervals, bool sync_by_pos)
:file_names(file_names), intervals(intervals), sync_by_pos(sync_by_pos)
{
    nfiles = file_names.size();
    files.resize(nfiles, 0);
    hdrs.resize(nfiles, 0);
    idxs.resize(nfiles, 0);
    tbxs.resize(nfiles, 0);
    itrs.resize(nfiles, 0);
    ftypes.resize(nfiles);

    current_interval = "";
    current_pos1 = 0;

    buffer.resize(nfiles);
    s = {0, 0, 0};

    random_access = (intervals.size()!=0);
    for (size_t i=0; i<intervals.size(); ++i)
    {
        intervals_map[intervals[i].to_string()] = i;
    }
    intervals_index = 0;

    uint32_t no_stdins = 0;

    for (size_t i = 0; i<nfiles; ++i)
    {
        //inside this body file_names is the constructor parameter, not the member
        //NOTE(review): if the member is a copy it still holds "+" - confirm whether it should be normalised too
        if (file_names[i]=="+")
        {
            file_names[i] = "-";
        }

        //count each STDIN entry once, at its own position
        if (file_names[i]=="-")
        {
            ++no_stdins;
        }

        if (no_stdins>1)
        {
            fprintf(stderr, "[E:%s:%d %s] BCFSyncedReader does not support reading from more than one STDIN stream\n", __FILE__, __LINE__, __FUNCTION__);
            exit(1);
        }

        files[i] = hts_open(file_names[i].c_str(), "r");
        if (files[i]==NULL)
        {
            fprintf(stderr, "[%s:%d %s] Cannot open %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }
        ftypes[i] = files[i]->format;

        //check format
        if (ftypes[i].format!=vcf && ftypes[i].format!=bcf)
        {
            fprintf(stderr, "[E:%s:%d %s] %s not a VCF or BCF file\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }

        //read header
        hdrs[i] = bcf_alt_hdr_read(files[i]);
        if (!hdrs[i])
        {
            fprintf(stderr, "[E:%s:%d %s] header cannot be read for %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }

        //load index if intervals are specified
        if (random_access && !load_index(i))
        {
            fprintf(stderr, "[E:%s:%d %s] index cannot be loaded for %s for random access\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }

        //check contigs consistency against the first file
        if (i)
        {
            int32_t nseqs0 = 0;
            const char ** seqnames0 = bcf_hdr_seqnames(hdrs[0], &nseqs0);

            int32_t nseqs = 0;
            const char ** seqnames = bcf_hdr_seqnames(hdrs[i], &nseqs);

            if (nseqs0==0 || nseqs==0 || nseqs0!=nseqs)
            {
                fprintf(stderr, "[E:%s:%d %s] contigs in header not consistent with first file for %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
                exit(1);
            }

            for (int32_t j=0; j<nseqs; ++j)
            {
                if (strcmp(seqnames0[j], seqnames[j]))
                {
                    fprintf(stderr, "[E:%s:%d %s] contigs in header not consistent with first file for %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
                    exit(1);
                }
            }

            //only the arrays are released here, not the strings they point to
            free(seqnames0);
            free(seqnames);
        }
    }
}