예제 #1
0
/**
 * Command-line entry point: declares and parses the program options, validates
 * them, then either writes the per-chunk command list (--commands) or loads the
 * data for one region/chunk and runs the selected analysis.
 *
 * Exit status set directly by this function: 1 when the command line cannot be
 * parsed, when the log file cannot be opened, or after printing --help.
 *
 * NOTE(review): several checks below read option values right after a
 * LOG.error() call (e.g. --vcf, --chunk), so LOG.error() is presumably fatal;
 * confirm against the logger implementation.
 *
 * @param argc number of command-line tokens
 * @param argv command-line tokens; also forwarded to data::writeCommands()
 */
int main(int argc, char ** argv) {
    data D;

    //-------------------------
    // 1. DECLARE ALL OPTIONS
    //-------------------------

    bpo::options_description opt_basic ("\33[33mBasic options\33[0m");
    opt_basic.add_options()
        ("help", "Produces this help")
        ("silent", "Silent mode on terminal")
        ("seed", bpo::value< int >()->default_value(time(NULL)), "Random number seed. Useful to replicate runs.");

    bpo::options_description opt_files ("\33[33mInput/Output files\33[0m");
    opt_files.add_options()
        ("log,L", bpo::value< string >()->default_value("fastQTL_date_time_UUID.log"), "Screen output is copied in this file.")
        ("vcf,V", bpo::value< string >(), "Genotypes in VCF format.")
        ("bed,B", bpo::value< string >(), "Phenotypes in BED format.")
        ("cov,C", bpo::value< string >(), "Covariates in TXT format.")
        ("grp,G", bpo::value< string >(), "Phenotype groups in TXT format.")
        ("out,O", bpo::value< string >(), "Output file.");

    bpo::options_description opt_exclusion ("\33[33mExclusion/Inclusion files\33[0m");
    opt_exclusion.add_options()
        ("exclude-samples", bpo::value< string >(), "List of samples to exclude.")
        ("include-samples", bpo::value< string >(), "List of samples to include.")
        ("exclude-sites", bpo::value< string >(), "List of sites to exclude.")
        ("include-sites", bpo::value< string >(), "List of sites to include.")
        ("exclude-phenotypes", bpo::value< string >(), "List of phenotypes to exclude.")
        ("include-phenotypes", bpo::value< string >(), "List of phenotypes to include.")
        ("exclude-covariates", bpo::value< string >(), "List of covariates to exclude.")
        ("include-covariates", bpo::value< string >(), "List of covariates to include.");

    bpo::options_description opt_parameters ("\33[33mParameters\33[0m");
    opt_parameters.add_options()
        ("normal", "Normal transform the phenotypes.")
        ("window,W", bpo::value< double >()->default_value(1e6), "Cis-window size.")
        ("threshold,T", bpo::value< double >()->default_value(1.0), "P-value threshold used in nominal pass of association")
        ("maf-threshold", bpo::value< double >()->default_value(0.0), "Minor allele frequency threshold used when parsing genotypes")
        ("ma-sample-threshold", bpo::value< int >()->default_value(0), "Minimum number of samples carrying the minor allele; used when parsing genotypes")
        ("global-af-threshold", bpo::value< double >()->default_value(0.0), "AF threshold for all samples in VCF (used to filter AF in INFO field)")
        ("interaction-maf-threshold", bpo::value< double >()->default_value(0.0), "MAF threshold for interactions, applied to lower and upper half of samples");

    bpo::options_description opt_modes ("\33[33mModes\33[0m");
    opt_modes.add_options()
        ("permute,P", bpo::value< vector < int > >()->multitoken(), "Permutation pass to calculate corrected p-values for molecular phenotypes.")
        ("psequence", bpo::value< string >(), "Permutation sequence.")
        ("map", bpo::value< string >(), "Map best QTL candidates per molecular phenotype.")
        ("map-full", "Scan full cis-window to discover independent signals.")
        ("interaction", bpo::value< string >(), "Test for interactions with variable specified in file.")
        ("report-best-only", bpo::bool_switch()->default_value(false), "Report best variant only (nominal mode)");

    bpo::options_description opt_parallel ("\33[33mParallelization\33[0m");
    opt_parallel.add_options()
        ("chunk,K", bpo::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed")
        ("commands", bpo::value< vector < string > >()->multitoken(), "Generates all commands")
        ("region,R", bpo::value< string >(), "Region of interest.");

    bpo::options_description descriptions;
    descriptions.add(opt_basic).add(opt_files).add(opt_exclusion).add(opt_parameters).add(opt_modes).add(opt_parallel);

    //-------------------
    // 2. PARSE OPTIONS
    //-------------------
    bpo::variables_map options;
    try {
        bpo::store(bpo::command_line_parser(argc, argv).options(descriptions).run(), options);
        bpo::notify(options);
    } catch ( const boost::program_options::error& e ) {
        cerr << "Error parsing command line :" << string(e.what()) << endl;
        // A malformed command line is a failure: report it with a non-zero
        // status so calling scripts/pipelines can detect it.
        exit(1);
    }

    //-----------------------
    // 3. PRINT HEADER/HELP
    //-----------------------
    // NOTE(review): --help is only honoured when --silent is absent, and it
    // terminates with status 1; kept as-is for backward compatibility.
    if (! options.count("silent")) {
        cout << endl;
        cout << "\33[33mF\33[0mast \33[33mQTL\33[0m" << endl;
        cout << "  * Authors : Olivier DELANEAU, Halit ONGEN, Alfonso BUIL & Manolis DERMITZAKIS" << endl;
        cout << "  * Contact : [email protected]" << endl;
        cout << "  * Webpage : http://fastqtl.sourceforge.net/" << endl;
        cout << "  * Version : v2.184_gtex" << endl;
        if (options.count("help")) { cout << descriptions<< endl; exit(1); }
    }

    //--------------
    // 4. LOG FILE
    //--------------
    struct timeval start_time, stop_time;
    gettimeofday(&start_time, 0);
    START_DATE = time(0);
    //localtime(&START_DATE);
    //string logfile = "fastQTL_" + sutils::date2str(&START_DATE, "%d%m%Y_%Hh%Mm%Ss") + "_" + putils::getRandomID() + ".log";
    // A log file is only opened when --log was given explicitly; with the
    // placeholder default value the file log is muted instead.
    if (!options["log"].defaulted()) {
        if (!LOG.open(options["log"].as < string > ())) {
            cerr << "Impossible to open log file[" << options["log"].as < string > () << "] check writing permissions!" << endl;
            exit(1);
        }
    } else LOG.muteL();
    if (options.count("silent")) LOG.muteC();

    //------------------------
    // 5. OPTIONS COMBINATIONS
    //------------------------
    if (!options.count("vcf")) LOG.error("Genotype data needs to be specified with --vcf [file.vcf]");
    if (!options.count("bed")) LOG.error("Phenotype data needs to be specified with --bed [file.bed]");
    if (!options.count("out")) LOG.error("Output needs to be specified with --out [file.out]");

    // Exactly one of --chunk / --commands / --region must be present.
    int nParallel = options.count("chunk") + options.count("commands") + options.count("region");
    if (nParallel != 1) LOG.error("Please, specify one of these options [--region, --chunk, --commands]");

    // --permute and --map are mutually exclusive; neither means nominal pass.
    int nMode = options.count("permute") + options.count("map");
    if (nMode > 1) LOG.error("Please, specify only one of these options [--permute, --map]");

    //---------------
    // 6. CHECK FILES
    //---------------
    if (!futils::isFile(options["vcf"].as < string > ())) LOG.error(options["vcf"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (!futils::isFile(options["bed"].as < string > ())) LOG.error(options["bed"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("cov") && !futils::isFile(options["cov"].as < string > ())) LOG.error(options["cov"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("interaction") && !futils::isFile(options["interaction"].as < string > ())) LOG.error(options["interaction"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("grp") && !futils::isFile(options["grp"].as < string > ())) LOG.error(options["grp"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("map") && !futils::isFile(options["map"].as < string > ())) LOG.error(options["map"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (!futils::createFile(options["out"].as < string > ())) LOG.error(options["out"].as < string > () + " is impossible to create, check writing permissions");

    //-----------------------------------
    // 7. CHECK INCLUSION/EXCLUSION FILES
    //-----------------------------------
    if (options.count("exclude-samples") && !futils::isFile(options["exclude-samples"].as < string > ())) LOG.error(options["exclude-samples"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("include-samples") && !futils::isFile(options["include-samples"].as < string > ())) LOG.error(options["include-samples"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("exclude-sites") && !futils::isFile(options["exclude-sites"].as < string > ())) LOG.error(options["exclude-sites"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("include-sites") && !futils::isFile(options["include-sites"].as < string > ())) LOG.error(options["include-sites"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("exclude-phenotypes") && !futils::isFile(options["exclude-phenotypes"].as < string > ())) LOG.error(options["exclude-phenotypes"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("include-phenotypes") && !futils::isFile(options["include-phenotypes"].as < string > ())) LOG.error(options["include-phenotypes"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("exclude-covariates") && !futils::isFile(options["exclude-covariates"].as < string > ())) LOG.error(options["exclude-covariates"].as < string > () + " is impossible to open, check file existence or reading permissions");
    if (options.count("include-covariates") && !futils::isFile(options["include-covariates"].as < string > ())) LOG.error(options["include-covariates"].as < string > () + " is impossible to open, check file existence or reading permissions");

    //----------------------------
    // 8. CHECK METHODS/PARAMETERS
    //----------------------------
    if (options.count("interaction")) {
        if (options.count("permute")) {
            LOG.println("\nPerform permutation-based interaction analysis (used to calculate corrected p-values for MPs)");
            vector < int > nPerm = options["permute"].as < vector < int > > ();
            if (nPerm.size() != 1) LOG.error("Interactions only work with a fixed number of permutations!");
            else {
                // Same positivity check as the plain --permute branch below.
                if (nPerm[0] <= 0) LOG.error("Permutation number needs to be positive integer");
                if (nPerm[0] < 50) LOG.warning("Permutation number seems to be low, check parameters");
                LOG.println("  * Perform " + sutils::int2str(nPerm[0]) + " permutations");
            }
        } else {
            LOG.println("\nPerform nominal interaction analysis");
        }
        LOG.println("  * Test interaction with term from [" + options["interaction"].as < string > () + "]");
    } else if (options.count("permute")) {
        LOG.println("\nPerform permutation based analysis (used to calculate corrected p-values for MPs)");
        vector < int > nPerm = options["permute"].as < vector < int > > ();
        if (nPerm.size() > 3 || nPerm.size() < 1) LOG.error ("Option --permute takes 1, 2 or 3 arguments");
        if (nPerm.size() == 1) {
            // Fixed number of permutations.
            if (nPerm[0] <= 0) LOG.error("Permutation number needs to be positive integer");
            if (nPerm[0] < 50) LOG.warning("Permutation number seems to be low, check parameters");
            LOG.println("  * Perform " + sutils::int2str(nPerm[0]) + " permutations");
        } else if (nPerm.size() == 2) {
            // Adaptive scheme: arg1 = lower bound, arg2 = upper bound.
            if (nPerm[0] <= 0 || nPerm[1] <= 0) LOG.error("Permutation number needs to be positive");
            if (nPerm[1] <= nPerm[0]) LOG.error("For adaptive permutation scheme, arg1 needs to be smaller than arg2!");
            LOG.println("  * Perform between " + sutils::int2str(nPerm[0]) + " and " + sutils::int2str(nPerm[1]) + " permutations");
        } else {
            // Adaptive scheme with early stop: arg2 < arg1 < arg3 is enforced.
            if (nPerm[0] <= 0 || nPerm[1] <= 0 || nPerm[2] <= 0) LOG.error("Permutation number needs to be positive");
            if (nPerm[2] <= nPerm[0]) LOG.error("For adaptive permutation scheme, arg1 needs to be smaller than arg3!");
            if (nPerm[0] <= nPerm[1]) LOG.error("For adaptive permutation scheme, arg2 needs to be smaller than arg1!");
            LOG.println("  * Perform between " + sutils::int2str(nPerm[0]) + " and " + sutils::int2str(nPerm[2]) + " permutations and stop when " + sutils::int2str(nPerm[1]) + " best associations are found");
        }
        if (options.count("grp")) LOG.println("  * Using MP groups from [" + options["grp"].as < string > () + "]");
    } else if (options.count("map")) {
        LOG.println("\nPerform conditional based analysis (used to map significant QTLs for MPs");
        LOG.println("  * Using per MP p-value threshold from [" + options["map"].as < string > () + "]");
        if (options.count("map-full")) LOG.println("  * Scanning all variants in cis and not only nominally significant ones");
    } else {
        LOG.println("\nPerform nominal analysis (used to get raw p-values of association)");
        double threshold = options["threshold"].as < double > ();
        if (threshold <= 0.0 || threshold > 1.0) LOG.error("Incorrect --threshold value  :  0 < X <= 1");
        LOG.println("  * Using p-value threshold = " + sutils::double2str(threshold, 10));
    }

    if (options["seed"].as < int > () < 0) LOG.error("Random number generator needs a positive seed value");
    else srand(options["seed"].as < int > ());
    LOG.println("  * Random number generator is seeded with " + sutils::int2str(options["seed"].as < int > ()));

    if (options["window"].as < double > () <= 0) LOG.error ("Incorrect value for option --window (null or negative value)");
    if (options["window"].as < double > () > 1e9) LOG.error ("Cis-window cannot be larger than 1e9bp");
    LOG.println("  * Considering variants within " + sutils::double2str(options["window"].as < double > ()) + " bp of the MPs");
    D.cis_window = options["window"].as < double > ();

    D.maf_threshold = options["maf-threshold"].as < double > ();
    if (D.maf_threshold < 0.0 || D.maf_threshold >= 0.5) LOG.error("Incorrect --maf-threshold value  :  0 <= X < 0.5");
    LOG.println("  * Using minor allele frequency threshold = " + sutils::double2str(D.maf_threshold, 4));

    D.ma_sample_threshold = options["ma-sample-threshold"].as < int > ();
    if (D.ma_sample_threshold < 0) LOG.error("Incorrect --ma-sample-threshold  :  0 <= X");
    LOG.println("  * Using minor allele sample threshold = " + sutils::int2str(D.ma_sample_threshold));

    D.global_af_threshold = options["global-af-threshold"].as < double > ();
    if (D.global_af_threshold < 0.0 || D.global_af_threshold >= 0.5) LOG.error("Incorrect --global-af-threshold value  :  0 <= X < 0.5");
    LOG.println("  * Using INFO field AF threshold = " + sutils::double2str(D.global_af_threshold, 4));

    D.interaction_maf_threshold = options["interaction-maf-threshold"].as < double > ();
    if (D.interaction_maf_threshold < 0.0 || D.interaction_maf_threshold >= 0.5) LOG.error("Incorrect --interaction-maf-threshold  :  0 <= X < 0.5");
    LOG.println("  * Applying interaction MAF AF threshold = " + sutils::double2str(D.interaction_maf_threshold, 4));

    if (options.count("chunk")) {
        vector < int > nChunk = options["chunk"].as < vector < int > > ();
        if (nChunk.size() != 2) LOG.error ("--chunk needs 2 integer arguments");
        // The chunk index is 1-based: it is passed as (index - 1) to
        // setPhenotypeRegion() further down, so 0 or a negative value would
        // produce a negative cluster index.
        if (nChunk[0] < 1) LOG.error ("arg1 for --chunk needs to be a positive integer");
        if (nChunk[0] > nChunk[1]) LOG.error ("arg1 for --chunk needs to be smaller or equal to arg2");
        LOG.println ("  * Chunk processed " + sutils::int2str(nChunk[0]) + " / " + sutils::int2str(nChunk[1]));
    } else if (options.count("commands")) {
        vector < string > nCommands = options["commands"].as < vector < string > > ();
        if (nCommands.size() != 2) LOG.error ("--commands needs 2 arguments");
        LOG.println ("  * " + nCommands[0] + " commands output in [" + nCommands[1] +"]");
    } else LOG.println ("  * Focus on region [" + options["region"].as < string > () +"]");

    //--------------------------------
    // 9. READ EXCLUDE / INCLUDE FILES
    //--------------------------------
    if (options.count("exclude-samples")) D.readSamplesToExclude(options["exclude-samples"].as < string > ());
    if (options.count("include-samples")) D.readSamplesToInclude(options["include-samples"].as < string > ());
    if (options.count("exclude-sites")) D.readGenotypesToExclude(options["exclude-sites"].as < string > ());
    if (options.count("include-sites")) D.readGenotypesToInclude(options["include-sites"].as < string > ());
    if (options.count("exclude-phenotypes")) D.readPhenotypesToExclude(options["exclude-phenotypes"].as < string > ());
    if (options.count("include-phenotypes")) D.readPhenotypesToInclude(options["include-phenotypes"].as < string > ());
    if (options.count("exclude-covariates")) D.readCovariatesToExclude(options["exclude-covariates"].as < string > ());
    if (options.count("include-covariates")) D.readCovariatesToInclude(options["include-covariates"].as < string > ());

    if (options.count("commands")) {
        //----------------------
        // 10. GENERATE COMMANDS
        //----------------------
        // --commands <nChunks> <output file>: the first token is the chunk count.
        int nChunks = atoi(options["commands"].as < vector < string > > ()[0].c_str());
        D.scanPhenotypes(options["bed"].as < string > ());
        if (options.count("grp")) D.readGroups(options["grp"].as < string > ());  // need to read this before determining chunks (clusterizePhenotypes)
        D.clusterizePhenotypes(nChunks);
        D.writeCommands(options["commands"].as < vector < string > > ()[1], nChunks, argc, argv);
    } else {
        //---------------
        // 11. SET REGION
        //---------------
        if (options.count("chunk")) {
            D.scanPhenotypes(options["bed"].as < string > ());
            if (options.count("grp")) D.readGroups(options["grp"].as < string > ());  // need to read this before determining chunks (clusterizePhenotypes)
            D.clusterizePhenotypes(options["chunk"].as < vector < int > > ()[1]);
            D.setPhenotypeRegion(options["chunk"].as < vector < int > > ()[0] - 1);
            D.clear();
        } else if (!D.setPhenotypeRegion(options["region"].as < string > ())) LOG.error("Impossible to interpret region [" + options["region"].as < string > () + "]");
        D.deduceGenotypeRegion(options["window"].as < double > ());

        //---------------
        // 12. READ FILES
        //---------------
        D.readPhenotypes(options["bed"].as < string > ());
        if (options.count("interaction")) D.readInteractions(options["interaction"].as < string > ());  // used by optional MAF filter, read first
        D.readGenotypesVCF(options["vcf"].as < string > ());
        if (options.count("cov")) D.readCovariates(options["cov"].as < string > ());
        if (options.count("map")) D.readThresholds(options["map"].as < string > ());
        if (options.count("grp")) D.readGroups(options["grp"].as < string > ());  // read groups again, this time limited to phenotypes

        //------------------------
        // 13. INITIALIZE ANALYSIS
        //------------------------
        D.imputeGenotypes();
        D.imputePhenotypes();
        if (options.count("normal")) D.normalTranformPhenotypes();
        D.initResidualizer();

        //-----------------
        // 14. RUN ANALYSIS
        //-----------------
        // Dispatch order: interaction > permutation (grouped, then plain)
        // > conditional mapping > nominal.
        if (options.count("interaction")) {
            if (options.count("permute")) {
                D.runPermutationInteraction(options["out"].as < string > (), options["permute"].as < vector < int > > ()[0]);
            } else if (options["report-best-only"].as<bool>()) {
                D.runNominalInteractionBest(options["out"].as < string > ());
            } else {
                D.runNominalInteraction(options["out"].as < string > (), options["threshold"].as < double > ());
            }
        } else if (options.count("permute") && options.count("grp"))
            D.runPermutationPerGroup(options["out"].as < string > (), options["permute"].as < vector < int > > ());
        else if (options.count("permute")) {
            if (options.count("psequence")) D.runPermutation(options["out"].as < string > (), options["psequence"].as < string > ());
            else D.runPermutation(options["out"].as < string > (), options["permute"].as < vector < int > > ());
        } else if (options.count("map")) {
            D.runMapping(options["out"].as < string > (), options.count("map-full"));
        } else if (options["report-best-only"].as<bool>()) {
            D.runNominalBest(options["out"].as < string > ());
        } else {
            D.runNominal(options["out"].as < string > (), options["threshold"].as < double > ());
        }
    }

    //----------------
    // 15. TERMINATION
    //----------------
    D.clear();
    gettimeofday(&stop_time, 0);
    int n_seconds = (int)floor(stop_time.tv_sec - start_time.tv_sec);
    LOG.println("\nRunning time: " + sutils::int2str(n_seconds) + " seconds");
    if (!options["log"].defaulted()) LOG.close();
}
예제 #2
0
/**
 * Entry point of the [quan] mode: quantifies genes and exons from BAM files.
 *
 * Declares the mode-specific options on top of the basic ones provided by
 * quan_data, parses `argv`, copies the validated settings into `D`, and then
 * reads the GTF annotation and the BAM files and prints the quantifications.
 *
 * Terminates the process with EXIT_FAILURE when the command line cannot be
 * parsed and with EXIT_SUCCESS after printing --help.
 *
 * NOTE(review): option values are read right after vrb.error() calls, so
 * vrb.error() is presumably fatal; confirm against the verbose/logger class.
 *
 * @param argv command-line tokens for this mode
 */
void quan_main(vector < string > & argv) {
    quan_data D;
    //-------------------------
    // 1. DECLARE ALL OPTIONS
    //-------------------------
    D.declareBasicOptions();
    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
    opt_files.add_options()
        ("gtf", boost::program_options::value< string >(), "Annotation in GTF format")
        ("bam", boost::program_options::value< vector < string > > ()->multitoken(), "Sequence data in BAM/SAM format.")
        ("samples",boost::program_options::value< vector < string > > ()->multitoken(), "Sample names or a file with sample names. [Optional]")
        ("out-prefix", boost::program_options::value< string >(), "Output file prefix.");

    boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
    opt_parameters.add_options()
        ("rpkm", "Print RPKM values.")
        ("debug", "Print debug info to stderr.")
        ("gene-types", boost::program_options::value< vector < string > > ()->multitoken(), "Gene types to quantify. (Requires gene_type attribute in GTF. It will also use transcript_type if present).")
        ("max-read-length", boost::program_options::value< unsigned int >()->default_value(1000), "Group genes separated by this much together. Set this larger than your read length");

    boost::program_options::options_description opt_filters ("\x1B[32mFilters\33[0m");
    opt_filters.add_options()
        ("filter-mapping-quality", boost::program_options::value< unsigned int >()->default_value(10), "Minimal mapping quality for a read to be considered.")
        ("filter-mismatch", boost::program_options::value< double >()->default_value(-1.0,"OFF"), "Maximum mismatches allowed in a read. If between 0 and 1 taken as the fraction of read length. (Requires NM attribute)")
        ("filter-mismatch-total", boost::program_options::value< double >()->default_value(-1.0,"OFF"), "Maximum total mismatches allowed in paired reads. If between 0 and 1 taken as the fraction of combined read length. (Requires NM attribute)")
        ("check-proper-pairing", "If provided only properly paired reads according to the aligner that are in correct orientation will be considered. Otherwise all pairs in correct orientation will be considered.")
        ("check-consistency", "If provided checks the consistency of split reads with annotation, rather than pure overlap of one of the blocks of the split read.")
        ("no-merge", "If provided overlapping mate pairs will not be merged.")
        ("legacy-options", "Exactly replicate Dermitzakis lab original quantification script. (DO NOT USE UNLESS YOU HAVE A GOOD REASON). Sets --no-merge as well.")
        ("filter-failed-qc", "Remove fastq reads that fail sequencing QC (as indicated by the sequencer)")
        ("filter-min-exon", boost::program_options::value< unsigned int >()->default_value(0), "Minimal exon length to consider. Exons smaller than this will not be printed out in the exon quantifications, but will still count towards gene quantifications.")
        ("filter-remove-duplicates", "Remove duplicate sequencing reads in the process.");

    boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
    opt_parallel.add_options()
        ("chunk", boost::program_options::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed")
        ("region", boost::program_options::value< string >(), "Region of interest.");

    D.option_descriptions.add(opt_files).add(opt_parameters).add(opt_filters).add(opt_parallel);

    //-------------------
    // 2. PARSE OPTIONS
    //-------------------
    // Parsed values are stored in D.options; no local variables_map is needed.
    try {
        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
        boost::program_options::notify(D.options);
    } catch ( const boost::program_options::error& e ) {
        cerr << "Error parsing [quan] command line :" << string(e.what()) << endl;
        // A malformed command line is a failure; signal it to the caller.
        exit(EXIT_FAILURE);
    }

    //---------------------
    // 3. PRINT HELP/HEADER
    //---------------------
    vrb.ctitle("QUANTIFY GENES AND EXONS FROM BAM FILES");
    if (D.options.count("help")) {
        cout << D.option_descriptions << endl;
        exit(EXIT_SUCCESS);
    }

    //-----------------
    // 4. COMMON CHECKS
    //-----------------
    if (!D.options.count("gtf")) vrb.error("Annotation data needs to be specified with --gtf [file.gtf]");
    if (!D.options.count("bam")) vrb.error("Sequence data needs to be specified with --bam [file.bam]");
    if (!D.options.count("out-prefix")) vrb.error("Output prefix needs to be specified with --out-prefix [prefix]");

    D.min_mapQ = D.options["filter-mapping-quality"].as < unsigned int > ();
    vrb.bullet("Minimum mapping quality: " + stb.str(D.min_mapQ));
    D.max_read_length = D.options["max-read-length"].as < unsigned int > ();
    vrb.bullet("Maximum read length: " + stb.str(D.max_read_length));

    // Receives the integral part from modf(); only the fractional part is used.
    double intpart;

    // Negative value = filter off. A non-integer value must lie below 1 and is
    // then flagged as a fraction; integer values are kept as absolute counts.
    D.max_mismatch_count_total = D.options["filter-mismatch-total"].as < double > ();
    if (D.max_mismatch_count_total >= 0 && modf(D.max_mismatch_count_total, &intpart) != 0.0) {
        if (D.max_mismatch_count_total > 1) vrb.error("--filter-mismatch-total cannot be greater than 1 when not an integer");
        else D.fraction_mmt = true;
    }
    if (D.max_mismatch_count_total >= 0) vrb.bullet("Maximum mismatch count per mate-pair: " + stb.str(D.max_mismatch_count_total));

    D.max_mismatch_count = D.options["filter-mismatch"].as < double > ();
    if (D.max_mismatch_count >= 0 && modf(D.max_mismatch_count, &intpart) != 0.0) {
        if (D.max_mismatch_count > 1) vrb.error("--filter-mismatch cannot be greater than 1 when not an integer");
        else D.fraction_mm = true;
    }
    // When no per-read limit is given but an absolute per-pair limit is, the
    // per-read limit inherits the per-pair value.
    if (D.max_mismatch_count < 0 && D.max_mismatch_count_total >= 0 && !D.fraction_mmt) D.max_mismatch_count = D.max_mismatch_count_total;
    if (D.max_mismatch_count >= 0) vrb.bullet("Maximum mismatch count per read: " + stb.str(D.max_mismatch_count));

    if (D.options.count("check-proper-pairing")) {
        vrb.bullet("Checking properly paired flag");
        D.proper_pair = true;
    }

    if (D.options.count("check-consistency")) {
        vrb.bullet("Checking if all blocks of a split read are consistent with the annotation");
        D.check_consistency = true;
    }

    if (D.options.count("filter-remove-duplicates")) {
        vrb.bullet("Filtering reads flagged as duplicate");
        D.dup_remove = true;
    }

    if (D.options.count("filter-failed-qc")) {
        vrb.bullet("Filtering reads flagged as failing QC");
        D.fail_qc = true;
    }

    if (D.options.count("no-merge")) {
        vrb.bullet("Not merging overlapping mate pairs");
        D.merge = false;
    }

    if (D.options.count("legacy-options")) {
        // Legacy mode implies --no-merge and a fixed minimum exon length of 2,
        // overriding --filter-min-exon.
        if (!D.options.count("no-merge")) vrb.bullet("Not merging overlapping mate pairs");
        D.min_exon = 2;
        vrb.bullet("Excluding exons smaller than " + stb.str(D.min_exon) );
        vrb.warning("You are using --legacy-options, do you know what you are doing?");
        D.old_wrong_split = true;
        D.merge = false;
    } else {
        D.min_exon = D.options["filter-min-exon"].as < unsigned int > ();
        vrb.bullet("Excluding exons smaller than " + stb.str(D.min_exon) );
    }

    if (D.options.count("gene-types")) {
        vector < string > t = D.options["gene-types"].as < vector < string > > ();
        D.gene_types = set < string > (t.begin(),t.end());
        // Print the de-duplicated, sorted gene types separated by spaces.
        const char* const delim = " ";
        ostringstream temp;
        copy(D.gene_types.begin(), D.gene_types.end(), ostream_iterator<string>(temp, delim));
        vrb.bullet("Genes included: " + temp.str());
    }

    if (D.options.count("debug")) D.debug = true;

    // Chunk k out of K; both stay at 1 when --chunk is not given.
    int k = 1;
    int K = 1;
    if (D.options.count("chunk")) {
        vector < int > nChunk = D.options["chunk"].as < vector < int > > ();
        if (nChunk.size() != 2 || nChunk[0] > nChunk[1]) vrb.error("Incorrect --chunk arguments!");
        vrb.bullet("Chunk = [" + stb.str(nChunk[0]) + "/" + stb.str(nChunk[1]) + "]");
        k = nChunk[0];
        K = nChunk[1];
    } else if (D.options.count("region")) vrb.bullet("Region = [" + D.options["region"].as < string > () +"]");

    // TODO: check parameter values

    //------------------------------------------
    // 5. READ FILES / INITIALIZE / RUN ANALYSIS
    //------------------------------------------

    D.processBasicOptions();
    D.bams = D.options["bam"].as < vector < string > > ();
    if (D.options.count("samples")) {
        vector < string > n = D.options["samples"].as <vector  < string > > ();
        D.read_Sample_Names(n);
    } else D.samples = D.bams;  // without --samples, the BAM paths double as sample names
    D.readGTF(D.options["gtf"].as < string > (),D.bams.size());
    if (D.options.count("region")) D.setRegion(D.options["region"].as < string >());
    if (D.options.count("chunk")) D.setChunk(k, K);
    D.readBams();
    D.printBEDcount(D.options["out-prefix"].as < string > ());
    if (D.options.count("rpkm")) D.printBEDrpkm(D.options["out-prefix"].as < string > ());
    D.printStats(D.options["out-prefix"].as < string > ());
}
예제 #3
0
// Entry point of the [trans] mode ("MAPPING QTL IN TRANS").
//
// Steps, in order: declare the options, parse argv, print the help when asked,
// check the mandatory options and the mode selection, set the parameters,
// optionally restrict the phenotypes to one chunk, read and pre-process the
// input files, and finally call runTransPass().
//
// argv: command-line tokens for this mode, handed to boost::program_options.
//
// Process exit: EXIT_FAILURE when the command line cannot be parsed,
// EXIT_SUCCESS after printing --help. Invalid option values are reported
// through vrb.error().
void trans_main(vector < string > & argv) {
	trans_data D;

	//-------------------------
	// 1. DECLARE ALL OPTIONS
	//-------------------------
	D.declareBasicOptions();  //Mandatory

	boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
	opt_files.add_options()
		("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF format.")
		("bed", boost::program_options::value< string >(), "Phenotypes in BED format.")
		("cov", boost::program_options::value< string >(), "Covariates in TXT format.")
		("out", boost::program_options::value< string >(), "Output file.");

	boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
	opt_parameters.add_options()
		("normal", "Quantile normalize phenotype data.")
		("window", boost::program_options::value< double >()->default_value(5e6, "5e6"), "Cis-window of variants to be excluded.")
		("threshold", boost::program_options::value< double >()->default_value(1e-5, "1e-5"), "P-value threshold below which hits are reported.")
		("bins", boost::program_options::value< unsigned int >()->default_value(1000), "Number of bins to use to categorize all p-values above --threshold.");

	boost::program_options::options_description opt_modes ("\x1B[32mAnalysis type\33[0m");
	opt_modes.add_options()
		("nominal", "MODE1: NOMINAL PASS [Pvalues are not adjusted].")
		("adjust", boost::program_options::value< string >(), "MODE2: ADJUSTED PASS [Pvalues are adjusted].")
		("permute", "MODE3: PERMUTATION PASS [Permute all phenotypes once].")
		("sample", boost::program_options::value< unsigned int >(), "MODE4: PERMUTATION PASS [Permute randomly chosen phenotypes multiple times].");

	boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
	opt_parallel.add_options()
		("chunk", boost::program_options::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed");

	D.option_descriptions.add(opt_files).add(opt_parameters).add(opt_modes).add(opt_parallel);

	//-------------------
	// 2. PARSE OPTIONS
	//-------------------
	try {
		boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
		boost::program_options::notify(D.options);
	} catch ( const boost::program_options::error& e ) {
		cerr << "Error parsing [trans] command line :" << string(e.what()) << endl;
		// A malformed command line is a failure: report it as such to the caller/shell.
		exit(EXIT_FAILURE);
	}

	//---------------------
	// 3. PRINT HELP/HEADER
	//---------------------
	vrb.ctitle("MAPPING QTL IN TRANS");
	if (D.options.count("help")) {
		cout << D.option_descriptions << endl;
		exit(EXIT_SUCCESS);
	}

	//-----------------
	// 4. COMMON CHECKS
	//-----------------
	if (!D.options.count("vcf")) vrb.error("Genotype data needs to be specified with --vcf [file.vcf]");
	if (!D.options.count("bed")) vrb.error("Phenotype data needs to be specified with --bed [file.bed]");
	if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");

	// Exactly one analysis mode has to be selected.
	const int nMode = D.options.count("nominal") + D.options.count("adjust") + D.options.count("permute") + D.options.count("sample");
	if (nMode != 1) vrb.error("Please, specify only one of these options [--nominal, --adjust, --permute, --sample]");

	const bool has_chunk = D.options.count("chunk") > 0;
	if (has_chunk && D.options.count("sample")) vrb.error("--chunk cannot be combined with --sample");

	//---------
	// 5. MODES
	//---------

	//MODE1: NOMINAL PASS NON ADJUSTED
	if (D.options.count("nominal")) {
		D.mode = TRANS_MODE1;
		vrb.bullet("TASK: Perform a full nominal pass, do not adjust p-values");
	}

	//MODE2: NOMINAL PASS ADJUSTED
	if (D.options.count("adjust")) {
		D.mode = TRANS_MODE2;
		vrb.bullet("TASK: Test and adjust p-values using [" + D.options["adjust"].as < string > () +"]");
	}

	//MODE3: PERMUTATION PASS
	if (D.options.count("permute")) {
		D.mode = TRANS_MODE3;
		vrb.bullet("TASK: Permute all phenotype once and test");
	}

	//MODE4: PERMUTATION PASS
	if (D.options.count("sample")) {
		D.mode = TRANS_MODE4;
		vrb.bullet("TASK: Permute randomly chosen phenotypes " + stb.str(D.options["sample"].as < unsigned int > ()) + " times and test");
	}

	//--------------
	// 6. SET PARAMS
	//--------------
	const double window = D.options["window"].as < double > ();
	if (window <= 0 || window > 1e9) vrb.error ("Incorrect cis-window size");
	vrb.bullet("Cis-window size is " + stb.str(static_cast < int > (window)) + " bp");
	D.cis_window = window;

	// --chunk expects two integers, the first not larger than the second.
	vector < int > chunk;
	if (has_chunk) {
		chunk = D.options["chunk"].as < vector < int > > ();
		if (chunk.size() != 2 || chunk[0] > chunk[1]) vrb.error("Incorrect --chunk arguments!");
		vrb.bullet("Chunk = [" + stb.str(chunk[0]) + "/" + stb.str(chunk[1]) + "]");
	}

	D.n_bins = D.options["bins"].as < unsigned int > ();
	vrb.bullet("#bins = " + stb.str(D.n_bins));

	const double threshold = D.options["threshold"].as < double > ();
	vrb.bullet("threshold = " + stb.str(threshold));

	// Basic options are processed once, before any input file is touched.
	D.processBasicOptions();

	// File names are copied so they do not depend on the option map afterwards.
	const string vcf_file = D.options["vcf"].as < string > ();
	const string bed_file = D.options["bed"].as < string > ();

	// In chunk mode the BED file is scanned first to select the phenotype lines
	// of the requested chunk; D is then cleared before the regular reading below.
	if (has_chunk) {
		D.scanPhenotypes(bed_file);
		D.setPhenotypeLines(chunk[0], chunk[1]);
		D.clear();
	}

	//---------------------------
	// 7. READ FILES & INITIALIZE
	//---------------------------
	const bool has_cov = D.options.count("cov") > 0;

	D.readSampleFromVCF(vcf_file);
	D.checkSampleInBED(bed_file);
	if (has_cov) D.checkSampleInCOV(D.options["cov"].as < string > ());

	D.readPhenotypes(bed_file);
	D.imputePhenotypes();

	if (has_cov) {
		D.readCovariates(D.options["cov"].as < string > ());
		D.residualizePhenotypes();
	}
	if (D.options.count("normal")) D.normalTranformPhenotypes();

	if (D.options.count("permute")) D.shufflePhenotypes();
	if (D.options.count("sample")) D.samplePhenotypes(D.options["sample"].as < unsigned int > ());
	if (D.options.count("adjust")) D.buildNullDistribution(D.options["adjust"].as < string > ());
	D.getCorrelationThreshold(threshold);

	D.normalizePhenotypes();

	//----------------
	// 8. RUN ANALYSIS
	//----------------
	D.runTransPass(vcf_file, D.options["out"].as < string > ());
}
예제 #4
0
// Entry point of the [extract] mode ("DATA EXTRACTION").
//
// Steps, in order: declare the options, parse argv, print the help when asked,
// check the options, parse the optional region, collect the sample lists of
// every provided input (BED files, VCF, covariates), merge them, read the data,
// impute missing values and write the output file.
//
// argv: command-line tokens for this mode, handed to boost::program_options.
//
// Any subset of --bed / --vcf / --cov may be given as long as at least one is
// present; --out is mandatory.
//
// Process exit: EXIT_FAILURE when the command line cannot be parsed,
// EXIT_SUCCESS after printing --help. Invalid option values are reported
// through vrb.error().
void extract_main(vector < string > & argv) {
	extract_data D;

	//-------------------------
	// 1. DECLARE ALL OPTIONS
	//-------------------------
	D.declareBasicOptions();

	boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
	opt_files.add_options()
		("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF format.")
		("bed", boost::program_options::value< vector < string > >()->multitoken(), "Phenotypes in BED format.")
		("cov", boost::program_options::value< string >(), "Covariates in TXT format.")
		("out", boost::program_options::value< string >(), "Output file.");

	boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
	opt_parallel.add_options()
		("region", boost::program_options::value< string >(), "Region of interest.");

	D.option_descriptions.add(opt_files).add(opt_parallel);

	//-------------------
	// 2. PARSE OPTIONS
	//-------------------
	try {
		boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
		boost::program_options::notify(D.options);
	} catch ( const boost::program_options::error& e ) {
		cerr << "Error parsing [extract] command line :" << string(e.what()) << endl;
		// A malformed command line is a failure: report it as such to the caller/shell.
		exit(EXIT_FAILURE);
	}

	//---------------------
	// 3. PRINT HELP/HEADER
	//---------------------
	vrb.ctitle("DATA EXTRACTION");
	if (D.options.count("help")) {
		cout << D.option_descriptions << endl;
		exit(EXIT_SUCCESS);
	}

	//-----------------
	// 4. COMMON CHECKS
	//-----------------
	const bool has_vcf = D.options.count("vcf") > 0;
	const bool has_bed = D.options.count("bed") > 0;
	const bool has_cov = D.options.count("cov") > 0;
	const bool has_region = D.options.count("region") > 0;

	if (!has_vcf && !has_bed && !has_cov) vrb.error("At least one input file has to be specified using either --vcf [file.vcf], --bed [file.bed] or --cov [file.txt]");
	// Checked up front: reading an absent option with as<T>() throws, which
	// would otherwise only happen after all the inputs have been read.
	if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
	if (!has_region) vrb.warning("Please use --region to speed up data extraction for phenotype and genotype data!");

	//--------------
	// 5. SET REGION
	//--------------
	if (has_region && !D.regionData.parse(D.options["region"].as < string > ()))
		vrb.error("Impossible to interpret region [" + D.options["region"].as < string > () + "]");

	//--------------
	// 6. READ FILES
	//--------------
	D.processBasicOptions();

	// --bed is optional (VCF-only or covariate-only extraction is allowed), so
	// the list stays empty when it was not given instead of reading an absent option.
	vector < string > bed_list;
	if (has_bed) bed_list = D.options["bed"].as < vector < string > > ();

	// First pass: gather the samples of every input, then merge the lists.
	for (size_t b = 0 ; b < bed_list.size() ; b ++) D.readSampleFromBED(bed_list[b]);
	if (has_vcf) D.readSampleFromVCF(D.options["vcf"].as < string > ());
	if (has_cov) D.readSampleFromCOV(D.options["cov"].as < string > ());
	D.mergeSampleLists();

	// Second pass: read the actual data.
	for (size_t b = 0 ; b < bed_list.size() ; b ++) D.readBED(bed_list[b]);
	if (has_vcf) D.readVCF(D.options["vcf"].as < string > ());
	if (has_cov) D.readCOV(D.options["cov"].as < string > ());

	D.imputeMissing();

	D.writeOUT(D.options["out"].as < string > ());
}