Ejemplo n.º 1
0
filter_t *region_exact_filter_new(char *region_descriptor, int use_region_file, char *type, 
                                  const char *url, const char *species, const char *version) {
    assert(region_descriptor);
    assert(url);
    assert(species);
    assert(version);
    
    filter_t *filter = (filter_t*) malloc (sizeof(filter_t));
    filter->type = REGION;
    filter->filter_func = region_filter;
    filter->free_func = region_filter_free;
    filter->priority = 2;

    region_filter_args *filter_args = (region_filter_args*) malloc (sizeof(region_filter_args));
    if (use_region_file) {
        snprintf(filter->name, 11, "RegionFile");
        snprintf(filter->description, 64, "Regions read from '%s'", region_descriptor);
        if (ends_with(region_descriptor, ".gff")) {
            filter_args->regions = parse_regions_from_gff_file(region_descriptor, url, species, version);
        } else if (ends_with(region_descriptor, ".bed")) {
            filter_args->regions = parse_regions_from_bed_file(region_descriptor, url, species, version);
        } else {
            LOG_FATAL_F("Region file %s format not supported! Please use BED or GFF formats\n", region_descriptor);
        }
    } else {
        snprintf(filter->name, 11, "RegionList");
        snprintf(filter->description, 64, "Regions (could be more) %s", region_descriptor);
        filter_args->regions = parse_regions(region_descriptor, 1, url, species, version);
    }
    filter_args->type = type;
    filter->args = filter_args;

    return filter;
}
Ejemplo n.º 2
0
Archivo: tabix.c Proyecto: Illumina/akt
int main(int argc, char *argv[])
{
    int c, detect = 1, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0;
    tbx_conf_t conf = tbx_conf_gff;
    char *reheader = NULL;
    args_t args;
    memset(&args,0,sizeof(args_t));

    static const struct option loptions[] =
    {
        {"help", no_argument, NULL, 2},
        {"regions", required_argument, NULL, 'R'},
        {"targets", required_argument, NULL, 'T'},
        {"csi", no_argument, NULL, 'C'},
        {"zero-based", no_argument, NULL, '0'},
        {"print-header", no_argument, NULL, 'h'},
        {"only-header", no_argument, NULL, 'H'},
        {"begin", required_argument, NULL, 'b'},
        {"comment", required_argument, NULL, 'c'},
        {"end", required_argument, NULL, 'e'},
        {"force", no_argument, NULL, 'f'},
        {"preset", required_argument, NULL, 'p'},
        {"sequence", required_argument, NULL, 's'},
        {"skip-lines", required_argument, NULL, 'S'},
        {"list-chroms", no_argument, NULL, 'l'},
        {"reheader", required_argument, NULL, 'r'},
        {"version", no_argument, NULL, 1},
        {NULL, 0, NULL, 0}
    };

    char *tmp;
    while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0)
    {
        switch (c)
        {
            case 'R': args.regions_fname = optarg; break;
            case 'T': args.targets_fname = optarg; break;
            case 'C': do_csi = 1; break;
            case 'r': reheader = optarg; break;
            case 'h': args.print_header = 1; break;
            case 'H': args.print_header = 1; args.header_only = 1; break;
            case 'l': list_chroms = 1; break;
            case '0': conf.preset |= TBX_UCSC; detect = 0; break;
            case 'b':
                conf.bc = strtol(optarg,&tmp,10);
                if ( *tmp ) error("Could not parse argument: -b %s\n", optarg);
                detect = 0;
                break;
            case 'e':
                conf.ec = strtol(optarg,&tmp,10);
                if ( *tmp ) error("Could not parse argument: -e %s\n", optarg);
                detect = 0;
                break;
            case 'c': conf.meta_char = *optarg; detect = 0; break;
            case 'f': is_force = 1; break;
            case 'm':
                min_shift = strtol(optarg,&tmp,10);
                if ( *tmp ) error("Could not parse argument: -m %s\n", optarg);
                break;
            case 'p':
                detect = 0;
                if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff;
                else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed;
                else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam;
                else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf;
                else if (strcmp(optarg, "bcf") == 0) detect = 1; // bcf is autodetected, preset is not needed
                else if (strcmp(optarg, "bam") == 0) detect = 1; // same as bcf
                else error("The preset string not recognised: '%s'\n", optarg);
                break;
            case 's':
                conf.sc = strtol(optarg,&tmp,10);
                if ( *tmp ) error("Could not parse argument: -s %s\n", optarg);
                detect = 0;
                break;
            case 'S':
                conf.line_skip = strtol(optarg,&tmp,10);
                if ( *tmp ) error("Could not parse argument: -S %s\n", optarg);
                detect = 0;
                break;
            case 1:
                printf(
"tabix (htslib) %s\n"
"Copyright (C) 2017 Genome Research Ltd.\n", hts_version());
                return EXIT_SUCCESS;
            default: return usage();
        }
    }

    if ( optind==argc ) return usage();

    if ( list_chroms )
        return query_chroms(argv[optind]);

    if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname )
    {
        int nregs = 0;
        char **regs = NULL;
        if ( !args.header_only )
            regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs);
        return query_regions(&args, argv[optind], regs, nregs);
    }

    char *fname = argv[optind];
    int ftype = file_type(fname);
    if ( detect )  // no preset given
    {
        if ( ftype==IS_GFF ) conf = tbx_conf_gff;
        else if ( ftype==IS_BED ) conf = tbx_conf_bed;
        else if ( ftype==IS_SAM ) conf = tbx_conf_sam;
        else if ( ftype==IS_VCF )
        {
            conf = tbx_conf_vcf;
            if ( !min_shift && do_csi ) min_shift = 14;
        }
        else if ( ftype==IS_BCF )
        {
            if ( !min_shift ) min_shift = 14;
        }
        else if ( ftype==IS_BAM )
        {
            if ( !min_shift ) min_shift = 14;
        }
    }
    if ( do_csi )
    {
        if ( !min_shift ) min_shift = 14;
        min_shift *= do_csi;  // positive for CSIv2, negative for CSIv1
    }
    if ( min_shift!=0 && !do_csi ) do_csi = 1;

    if ( reheader )
        return reheader_file(fname, reheader, ftype, &conf);

    char *suffix = ".tbi";
    if ( do_csi ) suffix = ".csi";
    else if ( ftype==IS_BAM ) suffix = ".bai";
    else if ( ftype==IS_CRAM ) suffix = ".crai";

    char *idx_fname = calloc(strlen(fname) + 5, 1);
    strcat(strcpy(idx_fname, fname), suffix);

    struct stat stat_tbi, stat_file;
    if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
    {
        // Before complaining about existing index, check if the VCF file isn't
        // newer. This is a common source of errors, people tend not to notice
        // that tabix failed
        stat(fname, &stat_file);
        if ( stat_file.st_mtime <= stat_tbi.st_mtime )
            error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
    }
    free(idx_fname);

    if ( ftype==IS_CRAM )
    {
        if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
        return 0;
    }
    else if ( do_csi )
    {
        if ( ftype==IS_BCF )
        {
            if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
            return 0;
        }
        if ( ftype==IS_BAM )
        {
            if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
            return 0;
        }
        if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
        return 0;
    }
    else    // TBI index
    {
        if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
        return 0;
    }
    return 0;
}
int main(int argc, char* argv[])
{
  try
  {
    if (argc < 13 || argc % 2 != 1)
    {
      std::cerr << "usage: " << argv[0] << " number_of_threads region_file gff_or_bed_format extension bam_file bam_file_key hdf5_file "
                << "log_file write_warnings_to_stderr strand chr1 length1 ...\n"
        << "\ne.g. " << argv[0] << " /grail/annotations/HG19_SUM159_BRD4_-0_+0.gff gff"
        << "\n      /ifs/labs/bradner/bam/hg18/mm1s/04032013_D1L57ACXX_4.TTAGGC.hg18.bwt.sorted.bam 137 counts.hdf5 "
        << "\n      output/log.txt 1 _ chr1 247249719 chr2 242951149 chr3 199501827\n"
        << "\nstrand value of _ means use strand that is specified in region file (and use . if strand not specified in region file)."
        << "\nnumber of threads <= 0 means use a number of threads equal to the number of logical cpus."
        << "\nnote that this application is intended to be run from bamliquidator_batch.py -- see"
        << "\nhttps://github.com/BradnerLab/pipeline/wiki for more information"
        << std::endl;
      return 1;
    }

    const int number_of_threads = boost::lexical_cast<int>(argv[1]);
    const std::string region_file_path = argv[2];
    const std::string region_format = argv[3];
    const unsigned int extension = boost::lexical_cast<unsigned int>(argv[4]);
    const std::string bam_file_path = argv[5];
    const unsigned int bam_file_key = boost::lexical_cast<unsigned int>(argv[6]);
    const std::string hdf5_file_path = argv[7];
    const std::string log_file_path = argv[8];
    const bool write_warnings_to_stderr = boost::lexical_cast<bool>(argv[9]);
    const char strand = boost::lexical_cast<char>(argv[10]);
    const std::vector<std::pair<std::string, size_t>> chromosome_lengths = extract_chromosome_lengths(argc, argv, 11);

    tbb::task_scheduler_init init( number_of_threads <= 0 
                                 ? tbb::task_scheduler_init::automatic
                                 : number_of_threads); 

    Logger::configure(log_file_path, write_warnings_to_stderr);

    hid_t h5file = H5Fopen(hdf5_file_path.c_str(), H5F_ACC_RDWR, H5P_DEFAULT);
    if (h5file < 0)
    {
      Logger::error() << "Failed to open H5 file " << hdf5_file_path;
      return 3;
    }

    #ifdef time_region_parsing 
    boost::timer::cpu_timer timer; 
    #endif

    std::map<std::string, size_t> chromosome_to_length;
    for (auto& chr_length : chromosome_lengths)
    {
      chromosome_to_length[chr_length.first] = chr_length.second;
    }

    std::vector<Region> regions = parse_regions(region_file_path,
                                                region_format,
                                                bam_file_key,
                                                chromosome_to_length,
                                                strand);
    #ifdef time_region_parsing 
    timer.stop();
    std::cout << "parsing regions took" << timer.format() << std::endl;
    #endif
    if (regions.size() == 0)
    {
      Logger::warn() << "No valid regions detected in " << region_file_path;
      return 0;
    }

    liquidate_and_write(h5file, regions, extension, bam_file_path);
   
    H5Fclose(h5file);

    return 0;
  }
  catch(const std::exception& e)
  {
    Logger::error() << "Unhandled exception: " << e.what();

    return 4; 
  }
}