/**
 * Read the genotypes that fall inside one gene from a BGEN file.
 *
 * The gene file is loaded for the requested gene name, every range value
 * found for it is joined with "," and handed to the BGEN reader as its range
 * list.
 *
 * @param arg_fileName: character vector; element 0 is the BGEN file name
 * @param arg_geneFile: character vector; element 0 is the gene file to use
 * @param arg_geneName: character vector; element 0 is the gene we are
 * interested in (NOTE: only the first element is used).
 * @return whatever readBGEN2List() returns for the selected range list.
 * Raises an R error when no range could be found for the gene name.
 */
SEXP impl_readBGENToListByGene(SEXP arg_fileName, SEXP arg_geneFile,
                               SEXP arg_geneName) {
  SEXP ans = R_NilValue;
  bool rangeFound = false;

  {
    // Every C++ object lives inside this scope. R's error() leaves the
    // function with a longjmp and would skip destructors, so the error is
    // raised only after this scope has been closed.
    std::string FLAG_fileName = CHAR(STRING_ELT(arg_fileName, 0));
    std::string FLAG_geneFile = CHAR(STRING_ELT(arg_geneFile, 0));
    std::string FLAG_geneName = CHAR(STRING_ELT(arg_geneName, 0));

    OrderedMap<std::string, std::string> geneRange;
    loadGeneFile(FLAG_geneFile, FLAG_geneName, &geneRange);

    // Join all range values into one comma separated range list.
    std::string range;
    const int n = geneRange.size();
    for (int i = 0; i < n; ++i) {
      if (!range.empty()) {
        range += ",";
      }
      range += geneRange.valueAt(i);
    }

    REprintf("range = %s\n", range.c_str());

    if (!range.empty()) {
      rangeFound = true;
      // Open the BGEN file only when there is something to extract, so an
      // unknown gene name never leaves an opened file behind.
      BGenFile bin(FLAG_fileName.c_str());
      bin.setRangeList(range.c_str());
      ans = readBGEN2List(&bin);
    }
  }

  if (!rangeFound) {
    error("Please provide a valid gene name before we can continue.\n");
  }
  return ans;
}  // end impl_readBGENToListByGene
/**
 * Extract genotype matrices from a BGEN file, one per requested gene.
 *
 * @param arg_fileName: character vector; element 0 is the BGEN file name
 * @param arg_geneFile: character vector; element 0 is the gene file to use
 * @param arg_geneName: character vector of gene names; the returned list has
 * one element per name, filled with the result of readBGEN2Matrix().
 * @return a list named by gene. Extraction stops with a warning at the first
 * gene that has no range in the gene file; the elements from that gene
 * onwards are left as allocated.
 */
SEXP impl_readBGENToMatrixByGene(SEXP arg_fileName, SEXP arg_geneFile,
                                 SEXP arg_geneName) {
  SEXP ans = R_NilValue;

  std::string bgenPath = CHAR(STRING_ELT(arg_fileName, 0));
  std::string geneFilePath = CHAR(STRING_ELT(arg_geneFile, 0));
  std::vector<std::string> geneNames;
  extractStringArray(arg_geneName, &geneNames);

  // argument sanity checks
  if (bgenPath.empty()) {
    error("Please provide BGEN file name");
  }
  if (!geneNames.empty() && geneFilePath.empty()) {
    error("Please provide gene file name when extract genotype by gene");
  }

  const int geneCount = geneNames.size();
  Rprintf("%d region to be extracted.\n", geneCount);

  // allocate the return value and remember how many objects we protected
  int protectCount = 0;
  PROTECT(ans = allocVector(VECSXP, geneCount));
  ++protectCount;
  setListNames(geneNames, &ans);

  OrderedMap<std::string, std::string> geneRange;
  loadGeneFile(geneFilePath, geneNames, &geneRange);

  for (int idx = 0; idx < geneCount; ++idx) {
    const std::string& currentGene = geneNames[idx];
    const std::string& range = geneRange[currentGene];

    BGenFile reader(bgenPath);
    if (range.empty()) {
      // unknown gene: warn and hand back what has been collected so far
      warning("Gene name [ %s ] does not exists in provided gene file",
              currentGene.c_str());
      break;
    }

    // real working part
    reader.setRangeList(range.c_str());
    SET_VECTOR_ELT(ans, idx, readBGEN2Matrix(&reader));
  }

  UNPROTECT(protectCount);
  return ans;
}
int main(int argc, char** argv) { PARSE_PARAMETER(argc, argv); if (FLAG_help) { PARAMETER_HELP(); return 0; } welcome(); PARAMETER_STATUS(); if (FLAG_REMAIN_ARG.size() > 0) { fprintf(stderr, "Unparsed arguments: "); for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) { fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str()); } exit(1); } if (!FLAG_outPrefix.size()) FLAG_outPrefix = "rvtest"; if ((FLAG_inVcf.empty() ? 0 : 1) + (FLAG_inBgen.empty() ? 0 : 1) + (FLAG_inKgg.empty() ? 0 : 1) != 1) { fprintf(stderr, "Please provide one type of input file using: --inVcf, --inBgen or " "--inKgg\n"); exit(1); } // check new version if (!FLAG_noweb) { VersionChecker ver; if (ver.retrieveRemoteVersion("http://zhanxw.com/rvtests/version") < 0) { fprintf(stderr, "Retrieve remote version failed, use '--noweb' to skip.\n"); } else { ver.setLocalVersion(VERSION); if (ver.isRemoteVersionNewer()) { fprintf(stderr, "New version of rvtests is available:"); ver.printRemoteContent(); } } } // start logging Logger _logger((FLAG_outPrefix + ".log").c_str()); logger = &_logger; logger->info("Program version: %s", VERSION); logger->infoToFile("Git Version: %s", GIT_VERSION); logger->infoToFile("Parameters BEGIN"); PARAMETER_INSTANCE().WriteToFile(logger->getHandle()); logger->infoToFile("Parameters END"); logger->sync(); // set up multithreading #ifdef _OPENMP if (FLAG_numThread <= 0) { fprintf(stderr, "Invalid number of threads [ %d ], reset to single thread", FLAG_numThread); omp_set_num_threads(1); } else if (FLAG_numThread > omp_get_max_threads()) { int maxThreads = omp_get_max_threads(); fprintf(stderr, "Reduced your specified number of threads to the maximum of system " "limit [ %d ]", maxThreads); omp_set_num_threads(maxThreads); } else if (FLAG_numThread == 1) { // need to set to one thread, otherwise all CPUs may be used omp_set_num_threads(1); } else { logger->info("Set number of threads = [ %d ]", FLAG_numThread); omp_set_num_threads(FLAG_numThread); } #endif // start analysis 
time_t startTime = time(0); logger->info("Analysis started at: %s", currentTime().c_str()); GenotypeExtractor* ge = NULL; if (!FLAG_inVcf.empty()) { ge = new VCFGenotypeExtractor(FLAG_inVcf); } else if (!FLAG_inBgen.empty()) { ge = new BGenGenotypeExtractor(FLAG_inBgen, FLAG_inBgenSample); } else if (!FLAG_inKgg.empty()) { ge = new KGGGenotypeExtractor(FLAG_inKgg); } else { assert(false); } // set range filters here ge->setRangeList(FLAG_rangeList.c_str()); ge->setRangeFile(FLAG_rangeFile.c_str()); // set people filters here if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) { ge->excludeAllPeople(); ge->includePeople(FLAG_peopleIncludeID.c_str()); ge->includePeopleFromFile(FLAG_peopleIncludeFile.c_str()); } ge->excludePeople(FLAG_peopleExcludeID.c_str()); ge->excludePeopleFromFile(FLAG_peopleExcludeFile.c_str()); if (!FLAG_siteFile.empty()) { ge->setSiteFile(FLAG_siteFile); logger->info("Restrict analysis based on specified site file [ %s ]", FLAG_siteFile.c_str()); } if (FLAG_siteDepthMin > 0) { ge->setSiteDepthMin(FLAG_siteDepthMin); logger->info("Set site depth minimum to %d", FLAG_siteDepthMin); } if (FLAG_siteDepthMax > 0) { ge->setSiteDepthMax(FLAG_siteDepthMax); logger->info("Set site depth maximum to %d", FLAG_siteDepthMax); } if (FLAG_siteMACMin > 0) { ge->setSiteMACMin(FLAG_siteMACMin); logger->info("Set site minimum MAC to %d", FLAG_siteDepthMin); } if (FLAG_annoType != "") { ge->setAnnoType(FLAG_annoType.c_str()); logger->info("Set annotype type filter to %s", FLAG_annoType.c_str()); } std::vector<std::string> vcfSampleNames; ge->getPeopleName(&vcfSampleNames); logger->info("Loaded [ %zu ] samples from genotype files", vcfSampleNames.size()); DataLoader dataLoader; dataLoader.setPhenotypeImputation(FLAG_imputePheno); dataLoader.setCovariateImputation(FLAG_imputeCov); if (FLAG_multiplePheno.empty()) { dataLoader.loadPhenotype(FLAG_pheno, FLAG_mpheno, FLAG_phenoName); // // load phenotypes // std::map<std::string, double> phenotype; // if 
(FLAG_pheno.empty()) { // logger->error("Cannot do association when phenotype is missing!"); // return -1; // } // // check if alternative phenotype columns are used // if (!FLAG_mpheno.empty() && !FLAG_phenoName.empty()) { // logger->error("Please specify either --mpheno or --pheno-name"); // return -1; // } // if (!FLAG_mpheno.empty()) { // int col = atoi(FLAG_mpheno); // int ret = loadPedPhenotypeByColumn(FLAG_pheno.c_str(), &phenotype, // col); // if (ret < 0) { // logger->error("Loading phenotype failed!"); // return -1; // } // } else if (!FLAG_phenoName.empty()) { // int ret = loadPedPhenotypeByHeader(FLAG_pheno.c_str(), &phenotype, // FLAG_phenoName.c_str()); // if (ret < 0) { // logger->error("Loading phenotype failed!"); // return -1; // } // } else { // int col = 1; // default use the first phenotype // int ret = loadPedPhenotypeByColumn(FLAG_pheno.c_str(), &phenotype, // col); // if (ret < 0) { // logger->error("Loading phenotype failed!"); // return -1; // } // } // logger->info("Loaded [ %zu ] sample phenotypes.", phenotype.size()); // rearrange phenotypes // drop samples from phenotype or vcf matchPhenotypeAndVCF("missing phenotype", &dataLoader, ge); // // phenotype names (vcf sample names) arranged in the same order as in // VCF // std::vector<std::string> phenotypeNameInOrder; // std::vector<double> // phenotypeInOrder; // phenotype arranged in the same order as in VCF // rearrange(phenotype, vcfSampleNames, &vcfSampleToDrop, // &phenotypeNameInOrder, // &phenotypeInOrder, FLAG_imputePheno); // if (vcfSampleToDrop.size()) { // // exclude this sample from parsing VCF // ge->excludePeople(vcfSampleToDrop); // // output dropped samples // for (size_t i = 0; i < vcfSampleToDrop.size(); ++i) { // if (i == 0) // logger->warn( // "Total [ %zu ] samples are dropped from VCF file due to missing // " // "phenotype", // vcfSampleToDrop.size()); // if (i >= 10) { // logger->warn( // "Skip outputting additional [ %d ] samples with missing " // "phenotypes.", 
// ((int)vcfSampleToDrop.size() - 10)); // break; // } // logger->warn("Drop sample [ %s ] from VCF file due to missing // phenotype", // (vcfSampleToDrop)[i].c_str()); // } // // logger->warn("Drop %zu sample from VCF file since we don't have // their // // phenotypes", vcfSampleToDrop.size()); // } // if (phenotypeInOrder.size() != phenotype.size()) { // logger->warn( // "Drop [ %d ] samples from phenotype file due to missing genotypes // from " // "VCF files", // (int)(phenotype.size() - phenotypeInOrder.size())); // // We may output these samples by comparing keys of phenotype and // // phenotypeNameInOrder // } dataLoader.loadCovariate(FLAG_cov, FLAG_covName); matchCovariateAndVCF("missing covariate", &dataLoader, ge); // // load covariate // Matrix covariate; // HandleMissingCov handleMissingCov = COVARIATE_DROP; // if (FLAG_imputeCov) { // handleMissingCov = COVARIATE_IMPUTE; // } // if (FLAG_cov.empty() && !FLAG_covName.empty()) { // logger->info("Use phenotype file as covariate file [ %s ]", // FLAG_pheno.c_str()); // FLAG_cov = FLAG_pheno; // } // if (!FLAG_cov.empty()) { // logger->info("Begin to read covariate file."); // std::vector<std::string> columnNamesInCovariate; // std::set<std::string> sampleToDropInCovariate; // int ret = loadCovariate(FLAG_cov.c_str(), phenotypeNameInOrder, // FLAG_covName.c_str(), handleMissingCov, // &covariate, // &columnNamesInCovariate, // &sampleToDropInCovariate); // if (ret < 0) { // logger->error("Load covariate file failed !"); // exit(1); // } // // drop phenotype samples // if (!sampleToDropInCovariate.empty()) { // int idx = 0; // int n = phenotypeNameInOrder.size(); // for (int i = 0; i < n; ++i) { // if (sampleToDropInCovariate.count(phenotypeNameInOrder[i]) != // 0) { // need to drop // continue; // } // phenotypeNameInOrder[idx] = phenotypeNameInOrder[i]; // phenotypeInOrder[idx] = phenotypeInOrder[i]; // idx++; // } // phenotypeNameInOrder.resize(idx); // phenotypeInOrder.resize(idx); // logger->warn( // "[ 
%zu ] sample phenotypes are dropped due to lacking // covariates.", // sampleToDropInCovariate.size()); // } // // drop vcf samples; // for (std::set<std::string>::const_iterator iter = // sampleToDropInCovariate.begin(); // iter != sampleToDropInCovariate.end(); ++iter) { // ge->excludePeople(iter->c_str()); // } // } } else { dataLoader.loadMultiplePhenotype(FLAG_multiplePheno, FLAG_pheno, FLAG_cov); matchPhenotypeAndVCF("missing phenotype", &dataLoader, ge); matchCovariateAndVCF("missing covariate", &dataLoader, ge); } dataLoader.loadSex(); if (FLAG_sex) { dataLoader.useSexAsCovariate(); matchCovariateAndVCF("missing sex", &dataLoader, ge); } // // load sex // std::vector<int> sex; // if (loadSex(FLAG_pheno, phenotypeNameInOrder, &sex)) { // logger->error("Cannot load sex of samples from phenotype file"); // exit(1); // } // if (FLAG_sex) { // append sex in covariate // std::vector<int> index; // mark missing samples // int numMissing = findMissingSex(sex, &index); // logger->info("Futher exclude %d samples with missing sex", numMissing); // removeByIndex(index, &sex); // excludeSamplesByIndex(index, &ge, &phenotypeNameInOrder, // &phenotypeInOrder, // &covariate); // appendToMatrix("Sex", sex, &covariate); // } if (!FLAG_condition.empty()) { dataLoader.loadMarkerAsCovariate(FLAG_inVcf, FLAG_condition); matchCovariateAndVCF("missing in conditioned marker(s)", &dataLoader, ge); } // // load conditional markers // if (!FLAG_condition.empty()) { // Matrix geno; // std::vector<std::string> rowLabel; // if (loadMarkerFromVCF(FLAG_inVcf, FLAG_condition, &rowLabel, &geno) < 0) // { // logger->error("Load conditional markers [ %s ] from [ %s ] failed.", // FLAG_condition.c_str(), FLAG_inVcf.c_str()); // exit(1); // } // if (appendGenotype(&covariate, phenotypeNameInOrder, geno, rowLabel) < 0) // { // logger->error( // "Failed to combine conditional markers [ %s ] from [ %s ] failed.", // FLAG_condition.c_str(), FLAG_inVcf.c_str()); // exit(1); // } // } 
dataLoader.checkConstantCovariate(); // // check if some covariates are constant for all samples // // e.g. user may include covariate "1" in addition to intercept // // in such case, we will give a fatal error // for (int i = 0; i < covariate.cols; ++i) { // std::set<double> s; // s.clear(); // for (int j = 0; j < covariate.rows; ++j) { // s.insert(covariate(j,i)); // } // if (s.size() == 1) { // logger->error( // "Covariate [ %s ] equals [ %g ] for all samples, cannot fit " // "model...\n", // covariate.GetColumnLabel(i), *s.begin()); // exit(1); // } // } g_SummaryHeader = new SummaryHeader; g_SummaryHeader->recordCovariate(dataLoader.getCovariate()); // record raw phenotype g_SummaryHeader->recordPhenotype("Trait", dataLoader.getPhenotype().extractCol(0)); // adjust phenotype // bool binaryPhenotype; if (FLAG_qtl) { // binaryPhenotype = false; dataLoader.setTraitType(DataLoader::PHENOTYPE_QTL); logger->info("-- Force quantitative trait mode -- "); } else { if (dataLoader.detectPhenotypeType() == DataLoader::PHENOTYPE_BINARY) { logger->warn("-- Enabling binary phenotype mode -- "); dataLoader.setTraitType(DataLoader::PHENOTYPE_BINARY); } else { dataLoader.setTraitType(DataLoader::PHENOTYPE_QTL); } // binaryPhenotype = isBinaryPhenotype(phenotypeInOrder); // if (binaryPhenotype) { // logger->warn("-- Enabling binary phenotype mode -- "); // convertBinaryPhenotype(&phenotypeInOrder); // } } if (FLAG_useResidualAsPhenotype) { dataLoader.useResidualAsPhenotype(); g_SummaryHeader->recordEstimation(dataLoader.getEstimation()); } // // use residual as phenotype // if (FLAG_useResidualAsPhenotype) { // if (binaryPhenotype) { // logger->warn( // "WARNING: Skip transforming binary phenotype, although you want to // " // "use residual as phenotype!"); // } else { // if (covariate.cols > 0) { // LinearRegression lr; // Vector pheno; // Matrix covAndInt; // copy(phenotypeInOrder, &pheno); // copyCovariateAndIntercept(covariate.rows, covariate, &covAndInt); // if 
(!lr.FitLinearModel(covAndInt, pheno)) { // logger->error( // "Cannot fit model: [ phenotype ~ 1 + covariates ], now use the // " // "original phenotype"); // } else { // const int n = lr.GetResiduals().Length(); // for (int i = 0; i < n; ++i) { // phenotypeInOrder[i] = lr.GetResiduals()[i]; // } // covariate.Dimension(0, 0); // logger->info( // "DONE: Fit model [ phenotype ~ 1 + covariates ] and model " // "residuals will be used as responses."); // } // } else { // no covaraites // centerVector(&phenotypeInOrder); // logger->info("DONE: Use residual as phenotype by centerng it"); // } // } // } if (FLAG_inverseNormal) { dataLoader.inverseNormalizePhenotype(); g_SummaryHeader->setInverseNormalize(FLAG_inverseNormal); } // // phenotype transformation // if (FLAG_inverseNormal) { // if (binaryPhenotype) { // logger->warn( // "WARNING: Skip transforming binary phenotype, although you required // " // "inverse normalization!"); // } else { // logger->info("Now applying inverse normalize transformation."); // inverseNormalizeLikeMerlin(&phenotypeInOrder); // g_SummaryHeader->setInverseNormalize(FLAG_inverseNormal); // logger->info("DONE: inverse normalization transformation finished."); // } // } g_SummaryHeader->recordPhenotype("AnalyzedTrait", dataLoader.getPhenotype().extractCol(0)); if (dataLoader.getPhenotype().nrow() == 0) { logger->fatal("There are 0 samples with valid phenotypes, quitting..."); exit(1); } // if (phenotypeInOrder.empty()) { // logger->fatal("There are 0 samples with valid phenotypes, quitting..."); // exit(1); // } logger->info("Analysis begins with [ %d ] samples...", dataLoader.getPhenotype().nrow()); ////////////////////////////////////////////////////////////////////////////// // prepare each model bool singleVariantMode = FLAG_modelSingle.size() || FLAG_modelMeta.size(); bool groupVariantMode = (FLAG_modelBurden.size() || FLAG_modelVT.size() || FLAG_modelKernel.size()); if (singleVariantMode && groupVariantMode) { logger->error("Cannot 
support both single variant and region based tests"); exit(1); } ModelManager modelManager(FLAG_outPrefix); // set up models in qtl/binary modes if (dataLoader.isBinaryPhenotype()) { modelManager.setBinaryOutcome(); matchPhenotypeAndVCF("missing phenotype (not case/control)", &dataLoader, ge); } else { modelManager.setQuantitativeOutcome(); } // create models modelManager.create("single", FLAG_modelSingle); modelManager.create("burden", FLAG_modelBurden); modelManager.create("vt", FLAG_modelVT); modelManager.create("kernel", FLAG_modelKernel); modelManager.create("meta", FLAG_modelMeta); if (FLAG_outputRaw) { modelManager.create("outputRaw", "dump"); } const std::vector<ModelFitter*>& model = modelManager.getModel(); const std::vector<FileWriter*>& fOuts = modelManager.getResultFile(); const size_t numModel = model.size(); // TODO: optimize this to avoid data copying Matrix phenotypeMatrix; Matrix covariate; toMatrix(dataLoader.getPhenotype(), &phenotypeMatrix); toMatrix(dataLoader.getCovariate(), &covariate); // determine VCF file reading pattern // current support: // * line by line ( including range selection) // * gene by gene // * range by range std::string rangeMode = "Single"; if (FLAG_geneFile.size() && (FLAG_setFile.size() || FLAG_setList.size())) { logger->error("Cannot specify both gene file and set file."); exit(1); } if (!FLAG_gene.empty() && FLAG_geneFile.empty()) { logger->error("Please provide gene file for gene bases analysis."); exit(1); } OrderedMap<std::string, RangeList> geneRange; if (FLAG_geneFile.size()) { rangeMode = "Gene"; int ret = loadGeneFile(FLAG_geneFile.c_str(), FLAG_gene.c_str(), &geneRange); if (ret < 0 || geneRange.size() == 0) { logger->error("Error loading gene file or gene list is empty!"); return -1; } else { logger->info("Loaded [ %zu ] genes.", geneRange.size()); } } if (!FLAG_set.empty() && FLAG_setFile.empty()) { logger->error("Please provide set file for set bases analysis."); exit(1); } if (FLAG_setFile.size()) { 
rangeMode = "Range"; int ret = loadRangeFile(FLAG_setFile.c_str(), FLAG_set.c_str(), &geneRange); if (ret < 0 || geneRange.size() == 0) { logger->error("Error loading set file or set list is empty!"); return -1; } else { logger->info("Loaded [ %zu ] set to tests.", geneRange.size()); } } if (FLAG_setList.size()) { rangeMode = "Range"; int ret = appendListToRange(FLAG_setList, &geneRange); if (ret < 0) { logger->error("Error loading set list or set list is empty!"); return -1; } } DataConsolidator dc; dc.setSex(&dataLoader.getSex()); dc.setFormula(&dataLoader.getFormula()); dc.setGenotypeCounter(ge->getGenotypeCounter()); // load kinshp if needed by family models if (modelManager.hasFamilyModel() || (!FLAG_modelMeta.empty() && !FLAG_kinship.empty())) { logger->info("Family-based model specified. Loading kinship file..."); // process auto kinship if (dc.setKinshipSample(dataLoader.getPhenotype().getRowName()) || dc.setKinshipFile(DataConsolidator::KINSHIP_AUTO, FLAG_kinship) || dc.setKinshipEigenFile(DataConsolidator::KINSHIP_AUTO, FLAG_kinshipEigen) || dc.loadKinship(DataConsolidator::KINSHIP_AUTO)) { logger->error( "Failed to load autosomal kinship (you may use vcf2kinship to " "generate one)."); exit(1); } if (dc.setKinshipFile(DataConsolidator::KINSHIP_X, FLAG_xHemiKinship) || dc.setKinshipEigenFile(DataConsolidator::KINSHIP_X, FLAG_xHemiKinshipEigen) || dc.loadKinship(DataConsolidator::KINSHIP_X)) { logger->warn( "Autosomal kinship loaded, but no hemizygote region kinship " "provided, some sex chromosome tests will be skipped."); // keep the program going } } else if (!FLAG_kinship.empty() && FLAG_modelMeta.empty()) { logger->info( "Family-based model not specified. 
Options related to kinship will be " "ignored here."); } // set imputation method if (FLAG_impute.empty()) { logger->info("Impute missing genotype to mean (by default)"); dc.setStrategy(DataConsolidator::IMPUTE_MEAN); } else if (FLAG_impute == "mean") { logger->info("Impute missing genotype to mean"); dc.setStrategy(DataConsolidator::IMPUTE_MEAN); } else if (FLAG_impute == "hwe") { logger->info("Impute missing genotype by HWE"); dc.setStrategy(DataConsolidator::IMPUTE_HWE); } else if (FLAG_impute == "drop") { logger->info("Drop missing genotypes"); dc.setStrategy(DataConsolidator::DROP); } dc.setPhenotypeName(dataLoader.getPhenotype().getRowName()); // set up par region ParRegion parRegion(FLAG_xLabel, FLAG_xParRegion); dc.setParRegion(&parRegion); // genotype will be extracted and stored if (FLAG_freqUpper > 0) { ge->setSiteFreqMax(FLAG_freqUpper); logger->info("Set upper minor allele frequency limit to %g", FLAG_freqUpper); } if (FLAG_freqLower > 0) { ge->setSiteFreqMin(FLAG_freqLower); logger->info("Set lower minor allele frequency limit to %g", FLAG_freqLower); } // handle sex chromosome ge->setParRegion(&parRegion); ge->setSex(&dataLoader.getSex()); // use dosage instead GT if (!FLAG_dosageTag.empty()) { ge->setDosageTag(FLAG_dosageTag); logger->info("Use dosage genotype from VCF flag %s.", FLAG_dosageTag.c_str()); } // multi-allelic sites will be treats as ref/alt1, ref/alt2, ref/alt3.. 
// instead of ref/alt1 (biallelic) if (FLAG_multiAllele) { ge->enableMultiAllelicMode(); logger->info("Enable analysis using multiple allelic models"); } // genotype QC options if (FLAG_indvDepthMin > 0) { ge->setGDmin(FLAG_indvDepthMin); logger->info("Minimum GD set to %d (or marked as missing genotype).", FLAG_indvDepthMin); } if (FLAG_indvDepthMax > 0) { ge->setGDmax(FLAG_indvDepthMax); logger->info("Maximum GD set to %d (or marked as missing genotype).", FLAG_indvDepthMax); } if (FLAG_indvQualMin > 0) { ge->setGQmin(FLAG_indvQualMin); logger->info("Minimum GQ set to %d (or marked as missing genotype).", FLAG_indvQualMin); } // e.g. check colinearity and correlations between predictors dc.preRegressionCheck(phenotypeMatrix, covariate); // prepare PLINK files for BoltLMM model if (!FLAG_boltPlink.empty()) { if (dc.prepareBoltModel(FLAG_boltPlink, dataLoader.getPhenotype().getRowName(), dataLoader.getPhenotype())) { logger->error( "Failed to prepare inputs for BOLT-LMM association test model with " "this prefix [ %s ]!", FLAG_boltPlink.c_str()); exit(1); } } logger->info("Analysis started"); Result& buf = dc.getResult(); Matrix& genotype = dc.getOriginalGenotype(); // we have three modes: // * single variant reading, single variant test // * range variant reading, single variant test // * range variant reading, group variant test if (rangeMode == "Single" && singleVariantMode) { // use line by line mode buf.addHeader("CHROM"); buf.addHeader("POS"); if (FLAG_outputID) { buf.addHeader("ID"); } buf.addHeader("REF"); buf.addHeader("ALT"); buf.addHeader("N_INFORMATIVE"); // output headers for (size_t m = 0; m < model.size(); m++) { model[m]->writeHeader(fOuts[m], buf); } int variantProcessed = 0; while (true) { buf.clearValue(); int ret = ge->extractSingleGenotype(&genotype, &buf); if (ret == GenotypeExtractor::FILE_END) { // reach file end break; } if (ret == GenotypeExtractor::FAIL_FILTER) { continue; } if (ret != GenotypeExtractor::SUCCEED) { logger->error("Extract 
genotype failed at site: %s:%s!", buf["CHROM"].c_str(), buf["POS"].c_str()); continue; } if (genotype.cols == 0) { logger->warn("Extract [ %s:%s ] has 0 variants, skipping", buf["CHROM"].c_str(), buf["POS"].c_str()); continue; } ++variantProcessed; dc.consolidate(phenotypeMatrix, covariate, genotype); buf.updateValue("N_INFORMATIVE", toString(genotype.rows)); // logger->info("Test variant at site: %s:%s!", // buf["CHROM"].c_str(), buf["POS"].c_str()); // fit each model for (size_t m = 0; m != numModel; m++) { model[m]->reset(); model[m]->fit(&dc); model[m]->writeOutput(fOuts[m], buf); } } logger->info("Analyzed [ %d ] variants", variantProcessed); } else if (rangeMode != "Single" && singleVariantMode) { // read by gene/range model, single variant // test buf.addHeader(rangeMode); buf.addHeader("CHROM"); buf.addHeader("POS"); if (FLAG_outputID) { buf.addHeader("ID"); } buf.addHeader("REF"); buf.addHeader("ALT"); buf.addHeader("N_INFORMATIVE"); // output headers for (size_t m = 0; m < numModel; m++) { model[m]->writeHeader(fOuts[m], buf); } std::string geneName; RangeList rangeList; int variantProcessed = 0; for (size_t i = 0; i < geneRange.size(); ++i) { geneRange.at(i, &geneName, &rangeList); ge->setRange(rangeList); while (true) { buf.clearValue(); int ret = ge->extractSingleGenotype(&genotype, &buf); if (ret == GenotypeExtractor::FILE_END) { // reach end of this region break; } if (ret == GenotypeExtractor::FAIL_FILTER) { continue; } if (ret != GenotypeExtractor::SUCCEED) { logger->error("Extract genotype failed for gene %s!", geneName.c_str()); continue; } if (genotype.cols == 0) { logger->warn("Gene %s has 0 variants, skipping", geneName.c_str()); continue; } ++variantProcessed; dc.consolidate(phenotypeMatrix, covariate, genotype); buf.updateValue(rangeMode, geneName); buf.updateValue("N_INFORMATIVE", genotype.rows); // #pragma omp parallel for for (size_t m = 0; m != numModel; m++) { model[m]->reset(); model[m]->fit(&dc); model[m]->writeOutput(fOuts[m], buf); 
} } } logger->info("Analyzed [ %d ] variants from [ %d ] genes/regions", variantProcessed, (int)geneRange.size()); } else if (rangeMode != "Single" && groupVariantMode) { // read by gene/range mode, group variant // test buf.addHeader(rangeMode); buf.addHeader("RANGE"); buf.addHeader("N_INFORMATIVE"); buf.addHeader("NumVar"); buf.addHeader("NumPolyVar"); // output headers for (size_t m = 0; m < numModel; m++) { model[m]->writeHeader(fOuts[m], buf); } std::string geneName; RangeList rangeList; int variantProcessed = 0; ge->enableAutoMerge(); for (size_t i = 0; i < geneRange.size(); ++i) { geneRange.at(i, &geneName, &rangeList); ge->setRange(rangeList); buf.clearValue(); int ret = ge->extractMultipleGenotype(&genotype); if (ret != GenotypeExtractor::SUCCEED) { logger->error("Extract genotype failed for gene %s!", geneName.c_str()); continue; } if (genotype.cols == 0) { logger->info("Gene %s has 0 variants, skipping", geneName.c_str()); continue; } variantProcessed += genotype.cols; // genotype is people by marker dc.consolidate(phenotypeMatrix, covariate, genotype); buf.updateValue(rangeMode, geneName); buf.updateValue("RANGE", rangeList.toString()); buf.updateValue("N_INFORMATIVE", genotype.rows); buf.updateValue("NumVar", genotype.cols); buf.updateValue("NumPolyVar", dc.getFlippedToMinorPolymorphicGenotype().cols); // #ifdef _OPENMP // #pragma omp parallel for // #endif for (size_t m = 0; m != numModel; m++) { model[m]->reset(); model[m]->fit(&dc); model[m]->writeOutput(fOuts[m], buf); } } logger->info("Analyzed [ %d ] variants from [ %d ] genes/regions", variantProcessed, (int)geneRange.size()); } else { logger->error( "Unsupported reading mode and test modes! 
(need more parameters?)"); exit(1); } // Resource cleaning up modelManager.close(); delete g_SummaryHeader; time_t endTime = time(0); logger->info("Analysis ends at: %s", currentTime().c_str()); int elapsedSecond = (int)(endTime - startTime); logger->info("Analysis took %d seconds", elapsedSecond); fputs("RVTESTS finished successfully\n", stdout); return 0; }
int main(int argc, char** argv) { //////////////////////////////////////////////// BEGIN_PARAMETER_LIST(pl) ADD_PARAMETER_GROUP(pl, "Basic Input/Output") ADD_STRING_PARAMETER(pl, inVcf, "--inVcf", "Input VCF File") ADD_STRING_PARAMETER(pl, outPrefix, "--out", "Output prefix") ADD_BOOL_PARAMETER(pl, outputRaw, "--outputRaw", "Output genotypes, phenotype, covariates(if any) and " "collapsed genotype to tabular files") ADD_PARAMETER_GROUP(pl, "Specify Covariate") ADD_STRING_PARAMETER(pl, cov, "--covar", "Specify covariate file") ADD_STRING_PARAMETER( pl, covName, "--covar-name", "Specify the column name in covariate file to be included in analysis") ADD_BOOL_PARAMETER(pl, sex, "--sex", "Include sex (5th column in the PED file) as a covariate") ADD_PARAMETER_GROUP(pl, "Specify Phenotype") ADD_STRING_PARAMETER(pl, pheno, "--pheno", "Specify phenotype file") ADD_BOOL_PARAMETER(pl, inverseNormal, "--inverseNormal", "Transform phenotype like normal distribution") ADD_BOOL_PARAMETER( pl, useResidualAsPhenotype, "--useResidualAsPhenotype", "Fit covariate ~ phenotype, use residual to replace phenotype") ADD_STRING_PARAMETER(pl, mpheno, "--mpheno", "Specify which phenotype column to read (default: 1)") ADD_STRING_PARAMETER(pl, phenoName, "--pheno-name", "Specify which phenotype column to read by header") ADD_BOOL_PARAMETER(pl, qtl, "--qtl", "Treat phenotype as quantitative trait") ADD_STRING_PARAMETER( pl, multiplePheno, "--multiplePheno", "Specify aa template file for analyses of more than one phenotype") ADD_PARAMETER_GROUP(pl, "Specify Genotype") ADD_STRING_PARAMETER(pl, dosageTag, "--dosage", "Specify which dosage tag to use. (e.g. EC or DS)") ADD_PARAMETER_GROUP(pl, "Chromosome X Options") ADD_STRING_PARAMETER(pl, xLabel, "--xLabel", "Specify X chromosome label (default: 23|X)") ADD_STRING_PARAMETER(pl, xParRegion, "--xParRegion", "Specify PAR region (default: hg19), can be build " "number e.g. hg38, b37; or specify region, e.g. 
" "'60001-2699520,154931044-155260560'") ADD_PARAMETER_GROUP(pl, "People Filter") ADD_STRING_PARAMETER(pl, peopleIncludeID, "--peopleIncludeID", "List IDs of people that will be included in study") ADD_STRING_PARAMETER( pl, peopleIncludeFile, "--peopleIncludeFile", "From given file, set IDs of people that will be included in study") ADD_STRING_PARAMETER(pl, peopleExcludeID, "--peopleExcludeID", "List IDs of people that will be included in study") ADD_STRING_PARAMETER( pl, peopleExcludeFile, "--peopleExcludeFile", "From given file, set IDs of people that will be included in study") ADD_PARAMETER_GROUP(pl, "Site Filter") ADD_STRING_PARAMETER( pl, rangeList, "--rangeList", "Specify some ranges to use, please use chr:begin-end format.") ADD_STRING_PARAMETER( pl, rangeFile, "--rangeFile", "Specify the file containing ranges, please use chr:begin-end format.") ADD_STRING_PARAMETER(pl, siteFile, "--siteFile", "Specify the file containing sites to include, please " "use \"chr pos\" format.") ADD_INT_PARAMETER( pl, siteDepthMin, "--siteDepthMin", "Specify minimum depth(inclusive) to be included in analysis") ADD_INT_PARAMETER( pl, siteDepthMax, "--siteDepthMax", "Specify maximum depth(inclusive) to be included in analysis") ADD_INT_PARAMETER(pl, siteMACMin, "--siteMACMin", "Specify minimum Minor Allele Count(inclusive) to be " "included in analysis") ADD_STRING_PARAMETER(pl, annoType, "--annoType", "Specify annotation type that is followed by ANNO= in " "the VCF INFO field, regular expression is allowed ") ADD_PARAMETER_GROUP(pl, "Genotype Filter") ADD_INT_PARAMETER( pl, indvDepthMin, "--indvDepthMin", "Specify minimum depth(inclusive) of a sample to be included in analysis") ADD_INT_PARAMETER( pl, indvDepthMax, "--indvDepthMax", "Specify maximum depth(inclusive) of a sample to be included in analysis") ADD_INT_PARAMETER( pl, indvQualMin, "--indvQualMin", "Specify minimum depth(inclusive) of a sample to be included in analysis") ADD_PARAMETER_GROUP(pl, "Association Model") 
ADD_STRING_PARAMETER(pl, modelSingle, "--single", "Single variant tests, choose from: score, wald, exact, " "famScore, famLrt, famGrammarGamma, firth") ADD_STRING_PARAMETER(pl, modelBurden, "--burden", "Burden tests, choose from: cmc, zeggini, mb, exactCMC, " "rarecover, cmat, cmcWald") ADD_STRING_PARAMETER(pl, modelVT, "--vt", "Variable threshold tests, choose from: price, analytic") ADD_STRING_PARAMETER( pl, modelKernel, "--kernel", "Kernal-based tests, choose from: SKAT, KBAC, FamSKAT, SKATO") ADD_STRING_PARAMETER(pl, modelMeta, "--meta", "Meta-analysis related functions to generate summary " "statistics, choose from: score, cov, dominant, " "recessive") ADD_PARAMETER_GROUP(pl, "Family-based Models") ADD_STRING_PARAMETER(pl, kinship, "--kinship", "Specify a kinship file for autosomal analysis, use " "vcf2kinship to generate") ADD_STRING_PARAMETER(pl, xHemiKinship, "--xHemiKinship", "Provide kinship for the chromosome X hemizygote region") ADD_STRING_PARAMETER(pl, kinshipEigen, "--kinshipEigen", "Specify eigen decomposition results of a kinship file " "for autosomal analysis") ADD_STRING_PARAMETER( pl, xHemiKinshipEigen, "--xHemiKinshipEigen", "Specify eigen decomposition results of a kinship file for X analysis") ADD_PARAMETER_GROUP(pl, "Grouping Unit ") ADD_STRING_PARAMETER(pl, geneFile, "--geneFile", "Specify a gene file (for burden tests)") ADD_STRING_PARAMETER(pl, gene, "--gene", "Specify which genes to test") ADD_STRING_PARAMETER(pl, setList, "--setList", "Specify a list to test (for burden tests)") ADD_STRING_PARAMETER(pl, setFile, "--setFile", "Specify a list file (for burden tests, first 2 " "columns: setName chr:beg-end)") ADD_STRING_PARAMETER(pl, set, "--set", "Specify which set to test (1st column)") ADD_PARAMETER_GROUP(pl, "Frequency Cutoff") /*ADD_BOOL_PARAMETER(pl, freqFromFile, "--freqFromFile", "Obtain frequency * from external file")*/ // ADD_BOOL_PARAMETER(pl, freqFromControl, "--freqFromControl", "Calculate // frequency from case samples") 
ADD_DOUBLE_PARAMETER( pl, freqUpper, "--freqUpper", "Specify upper minor allele frequency bound to be included in analysis") ADD_DOUBLE_PARAMETER( pl, freqLower, "--freqLower", "Specify lower minor allele frequency bound to be included in analysis") ADD_PARAMETER_GROUP(pl, "Missing Data") ADD_STRING_PARAMETER( pl, impute, "--impute", "Impute missing genotype (default:mean): mean, hwe, and drop") ADD_BOOL_PARAMETER( pl, imputePheno, "--imputePheno", "Impute phenotype to mean of those have genotypes but no phenotypes") ADD_BOOL_PARAMETER(pl, imputeCov, "--imputeCov", "Impute each covariate to its mean, instead of drop " "samples with missing covariates") ADD_PARAMETER_GROUP(pl, "Conditional Analysis") ADD_STRING_PARAMETER(pl, condition, "--condition", "Specify markers to be conditions (specify range)") ADD_PARAMETER_GROUP(pl, "Auxiliary Functions") ADD_BOOL_PARAMETER(pl, noweb, "--noweb", "Skip checking new version") ADD_BOOL_PARAMETER(pl, help, "--help", "Print detailed help message") END_PARAMETER_LIST(pl); pl.Read(argc, argv); if (FLAG_help) { pl.Help(); return 0; } welcome(); pl.Status(); if (FLAG_REMAIN_ARG.size() > 0) { fprintf(stderr, "Unparsed arguments: "); for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) { fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str()); } exit(1); } if (!FLAG_outPrefix.size()) FLAG_outPrefix = "rvtest"; REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf"); // check new version if (!FLAG_noweb) { VersionChecker ver; if (ver.retrieveRemoteVersion("http://zhanxw.com/rvtests/version") < 0) { fprintf(stderr, "Retrieve remote version failed, use '--noweb' to skip.\n"); } else { ver.setLocalVersion(VERSION); if (ver.isRemoteVersionNewer()) { fprintf(stderr, "New version of rvtests is available:"); ver.printRemoteContent(); } } } // start logging Logger _logger((FLAG_outPrefix + ".log").c_str()); logger = &_logger; logger->info("Program version: %s", VERSION); logger->infoToFile("Git Version: %s", 
GIT_VERSION); logger->infoToFile("Parameters BEGIN"); pl.WriteToFile(logger->getHandle()); logger->infoToFile("Parameters END"); logger->sync(); // start analysis time_t startTime = time(0); logger->info("Analysis started at: %s", currentTime().c_str()); GenotypeExtractor ge(FLAG_inVcf); // set range filters here ge.setRangeList(FLAG_rangeList.c_str()); ge.setRangeFile(FLAG_rangeFile.c_str()); // set people filters here if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) { ge.excludeAllPeople(); ge.includePeople(FLAG_peopleIncludeID.c_str()); ge.includePeopleFromFile(FLAG_peopleIncludeFile.c_str()); } ge.excludePeople(FLAG_peopleExcludeID.c_str()); ge.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str()); if (FLAG_siteDepthMin > 0) { ge.setSiteDepthMin(FLAG_siteDepthMin); logger->info("Set site depth minimum to %d", FLAG_siteDepthMin); } if (FLAG_siteDepthMax > 0) { ge.setSiteDepthMax(FLAG_siteDepthMax); logger->info("Set site depth maximum to %d", FLAG_siteDepthMax); } if (FLAG_siteMACMin > 0) { ge.setSiteMACMin(FLAG_siteMACMin); logger->info("Set site minimum MAC to %d", FLAG_siteDepthMin); } if (FLAG_annoType != "") { ge.setAnnoType(FLAG_annoType.c_str()); logger->info("Set annotype type filter to %s", FLAG_annoType.c_str()); } std::vector<std::string> vcfSampleNames; ge.getPeopleName(&vcfSampleNames); logger->info("Loaded [ %zu ] samples from VCF files", vcfSampleNames.size()); DataLoader dataLoader; dataLoader.setPhenotypeImputation(FLAG_imputePheno); dataLoader.setCovariateImputation(FLAG_imputeCov); if (FLAG_multiplePheno.empty()) { dataLoader.loadPhenotype(FLAG_pheno, FLAG_mpheno, FLAG_phenoName); // // load phenotypes // std::map<std::string, double> phenotype; // if (FLAG_pheno.empty()) { // logger->error("Cannot do association when phenotype is missing!"); // return -1; // } // // check if alternative phenotype columns are used // if (!FLAG_mpheno.empty() && !FLAG_phenoName.empty()) { // logger->error("Please specify either --mpheno or 
--pheno-name"); // return -1; // } // if (!FLAG_mpheno.empty()) { // int col = atoi(FLAG_mpheno); // int ret = loadPedPhenotypeByColumn(FLAG_pheno.c_str(), &phenotype, // col); // if (ret < 0) { // logger->error("Loading phenotype failed!"); // return -1; // } // } else if (!FLAG_phenoName.empty()) { // int ret = loadPedPhenotypeByHeader(FLAG_pheno.c_str(), &phenotype, // FLAG_phenoName.c_str()); // if (ret < 0) { // logger->error("Loading phenotype failed!"); // return -1; // } // } else { // int col = 1; // default use the first phenotype // int ret = loadPedPhenotypeByColumn(FLAG_pheno.c_str(), &phenotype, // col); // if (ret < 0) { // logger->error("Loading phenotype failed!"); // return -1; // } // } // logger->info("Loaded [ %zu ] sample pheontypes.", phenotype.size()); // rearrange phenotypes // drop samples from phenotype or vcf matchPhenotypeAndVCF("missing phenotype", &dataLoader, &ge); // // phenotype names (vcf sample names) arranged in the same order as in // VCF // std::vector<std::string> phenotypeNameInOrder; // std::vector<double> // phenotypeInOrder; // phenotype arranged in the same order as in VCF // rearrange(phenotype, vcfSampleNames, &vcfSampleToDrop, // &phenotypeNameInOrder, // &phenotypeInOrder, FLAG_imputePheno); // if (vcfSampleToDrop.size()) { // // exclude this sample from parsing VCF // ge.excludePeople(vcfSampleToDrop); // // output dropped samples // for (size_t i = 0; i < vcfSampleToDrop.size(); ++i) { // if (i == 0) // logger->warn( // "Total [ %zu ] samples are dropped from VCF file due to missing // " // "phenotype", // vcfSampleToDrop.size()); // if (i >= 10) { // logger->warn( // "Skip outputting additional [ %d ] samples with missing " // "phenotypes.", // ((int)vcfSampleToDrop.size() - 10)); // break; // } // logger->warn("Drop sample [ %s ] from VCF file due to missing // phenotype", // (vcfSampleToDrop)[i].c_str()); // } // // logger->warn("Drop %zu sample from VCF file since we don't have // their // // phenotypes", 
vcfSampleToDrop.size()); // } // if (phenotypeInOrder.size() != phenotype.size()) { // logger->warn( // "Drop [ %d ] samples from phenotype file due to missing genotypes // from " // "VCF files", // (int)(phenotype.size() - phenotypeInOrder.size())); // // We may output these samples by comparing keys of phenotype and // // phenotypeNameInOrder // } dataLoader.loadCovariate(FLAG_cov, FLAG_covName); matchCovariateAndVCF("missing covariate", &dataLoader, &ge); // // load covariate // Matrix covariate; // HandleMissingCov handleMissingCov = COVARIATE_DROP; // if (FLAG_imputeCov) { // handleMissingCov = COVARIATE_IMPUTE; // } // if (FLAG_cov.empty() && !FLAG_covName.empty()) { // logger->info("Use phenotype file as covariate file [ %s ]", // FLAG_pheno.c_str()); // FLAG_cov = FLAG_pheno; // } // if (!FLAG_cov.empty()) { // logger->info("Begin to read covariate file."); // std::vector<std::string> columnNamesInCovariate; // std::set<std::string> sampleToDropInCovariate; // int ret = loadCovariate(FLAG_cov.c_str(), phenotypeNameInOrder, // FLAG_covName.c_str(), handleMissingCov, // &covariate, // &columnNamesInCovariate, // &sampleToDropInCovariate); // if (ret < 0) { // logger->error("Load covariate file failed !"); // exit(1); // } // // drop phenotype samples // if (!sampleToDropInCovariate.empty()) { // int idx = 0; // int n = phenotypeNameInOrder.size(); // for (int i = 0; i < n; ++i) { // if (sampleToDropInCovariate.count(phenotypeNameInOrder[i]) != // 0) { // need to drop // continue; // } // phenotypeNameInOrder[idx] = phenotypeNameInOrder[i]; // phenotypeInOrder[idx] = phenotypeInOrder[i]; // idx++; // } // phenotypeNameInOrder.resize(idx); // phenotypeInOrder.resize(idx); // logger->warn( // "[ %zu ] sample phenotypes are dropped due to lacking // covariates.", // sampleToDropInCovariate.size()); // } // // drop vcf samples; // for (std::set<std::string>::const_iterator iter = // sampleToDropInCovariate.begin(); // iter != sampleToDropInCovariate.end(); ++iter) 
{ // ge.excludePeople(iter->c_str()); // } // } } else { dataLoader.loadMultiplePhenotype(FLAG_multiplePheno, FLAG_pheno, FLAG_cov); matchPhenotypeAndVCF("missing phenotype", &dataLoader, &ge); matchCovariateAndVCF("missing covariate", &dataLoader, &ge); } dataLoader.loadSex(); if (FLAG_sex) { dataLoader.useSexAsCovariate(); matchCovariateAndVCF("missing sex", &dataLoader, &ge); } // // load sex // std::vector<int> sex; // if (loadSex(FLAG_pheno, phenotypeNameInOrder, &sex)) { // logger->error("Cannot load sex of samples from phenotype file"); // exit(1); // } // if (FLAG_sex) { // append sex in covariate // std::vector<int> index; // mark missing samples // int numMissing = findMissingSex(sex, &index); // logger->info("Futher exclude %d samples with missing sex", numMissing); // removeByIndex(index, &sex); // excludeSamplesByIndex(index, &ge, &phenotypeNameInOrder, // &phenotypeInOrder, // &covariate); // appendToMatrix("Sex", sex, &covariate); // } if (!FLAG_condition.empty()) { dataLoader.loadMarkerAsCovariate(FLAG_inVcf, FLAG_condition); matchCovariateAndVCF("missing in conditioned marker(s)", &dataLoader, &ge); } // // load conditional markers // if (!FLAG_condition.empty()) { // Matrix geno; // std::vector<std::string> rowLabel; // if (loadMarkerFromVCF(FLAG_inVcf, FLAG_condition, &rowLabel, &geno) < 0) // { // logger->error("Load conditional markers [ %s ] from [ %s ] failed.", // FLAG_condition.c_str(), FLAG_inVcf.c_str()); // exit(1); // } // if (appendGenotype(&covariate, phenotypeNameInOrder, geno, rowLabel) < 0) // { // logger->error( // "Failed to combine conditional markers [ %s ] from [ %s ] failed.", // FLAG_condition.c_str(), FLAG_inVcf.c_str()); // exit(1); // } // } dataLoader.checkConstantCovariate(); // // check if some covariates are constant for all samples // // e.g. 
user may include covariate "1" in addition to intercept // // in such case, we will give a fatal error // for (int i = 0; i < covariate.cols; ++i) { // std::set<double> s; // s.clear(); // for (int j = 0; j < covariate.rows; ++j) { // s.insert(covariate[j][i]); // } // if (s.size() == 1) { // logger->error( // "Covariate [ %s ] equals [ %g ] for all samples, cannot fit " // "model...\n", // covariate.GetColumnLabel(i), *s.begin()); // exit(1); // } // } g_SummaryHeader = new SummaryHeader; g_SummaryHeader->recordCovariate(dataLoader.getCovariate()); // record raw phenotype g_SummaryHeader->recordPhenotype("Trait", dataLoader.getPhenotype().extractCol(0)); // adjust phenotype // bool binaryPhenotype; if (FLAG_qtl) { // binaryPhenotype = false; dataLoader.setTraitType(DataLoader::PHENOTYPE_QTL); logger->info("-- Force quantitative trait mode -- "); } else { if (dataLoader.detectPhenotypeType() == DataLoader::PHENOTYPE_BINARY) { logger->warn("-- Enabling binary phenotype mode -- "); dataLoader.setTraitType(DataLoader::PHENOTYPE_BINARY); } else { dataLoader.setTraitType(DataLoader::PHENOTYPE_QTL); } // binaryPhenotype = isBinaryPhenotype(phenotypeInOrder); // if (binaryPhenotype) { // logger->warn("-- Enabling binary phenotype mode -- "); // convertBinaryPhenotype(&phenotypeInOrder); // } } if (FLAG_useResidualAsPhenotype) { dataLoader.useResidualAsPhenotype(); g_SummaryHeader->recordEstimation(dataLoader.getEstimation()); } // // use residual as phenotype // if (FLAG_useResidualAsPhenotype) { // if (binaryPhenotype) { // logger->warn( // "WARNING: Skip transforming binary phenotype, although you want to // " // "use residual as phenotype!"); // } else { // if (covariate.cols > 0) { // LinearRegression lr; // Vector pheno; // Matrix covAndInt; // copy(phenotypeInOrder, &pheno); // copyCovariateAndIntercept(covariate.rows, covariate, &covAndInt); // if (!lr.FitLinearModel(covAndInt, pheno)) { // logger->error( // "Cannot fit model: [ phenotype ~ 1 + covariates ], now 
use the // " // "original phenotype"); // } else { // const int n = lr.GetResiduals().Length(); // for (int i = 0; i < n; ++i) { // phenotypeInOrder[i] = lr.GetResiduals()[i]; // } // covariate.Dimension(0, 0); // logger->info( // "DONE: Fit model [ phenotype ~ 1 + covariates ] and model " // "residuals will be used as responses."); // } // } else { // no covaraites // centerVector(&phenotypeInOrder); // logger->info("DONE: Use residual as phenotype by centerng it"); // } // } // } if (FLAG_inverseNormal) { dataLoader.inverseNormalizePhenotype(); g_SummaryHeader->setInverseNormalize(FLAG_inverseNormal); } // // phenotype transformation // if (FLAG_inverseNormal) { // if (binaryPhenotype) { // logger->warn( // "WARNING: Skip transforming binary phenotype, although you required // " // "inverse normalization!"); // } else { // logger->info("Now applying inverse normalize transformation."); // inverseNormalizeLikeMerlin(&phenotypeInOrder); // g_SummaryHeader->setInverseNormalize(FLAG_inverseNormal); // logger->info("DONE: inverse normal transformation finished."); // } // } g_SummaryHeader->recordPhenotype("AnalyzedTrait", dataLoader.getPhenotype().extractCol(0)); if (dataLoader.getPhenotype().nrow() == 0) { logger->fatal("There are 0 samples with valid phenotypes, quitting..."); exit(1); } // if (phenotypeInOrder.empty()) { // logger->fatal("There are 0 samples with valid phenotypes, quitting..."); // exit(1); // } logger->info("Analysis begins with [ %d ] samples...", dataLoader.getPhenotype().nrow()); ////////////////////////////////////////////////////////////////////////////// // prepare each model bool singleVariantMode = FLAG_modelSingle.size() || FLAG_modelMeta.size(); bool groupVariantMode = (FLAG_modelBurden.size() || FLAG_modelVT.size() || FLAG_modelKernel.size()); if (singleVariantMode && groupVariantMode) { logger->error("Cannot support both single variant and region based tests"); exit(1); } ModelManager modelManager(FLAG_outPrefix); // set up models in 
qtl/binary modes if (dataLoader.isBinaryPhenotype()) { modelManager.setBinaryOutcome(); matchPhenotypeAndVCF("missing phenotype (not case/control)", &dataLoader, &ge); } else { modelManager.setQuantitativeOutcome(); } // create models modelManager.create("single", FLAG_modelSingle); modelManager.create("burden", FLAG_modelBurden); modelManager.create("vt", FLAG_modelVT); modelManager.create("kernel", FLAG_modelKernel); modelManager.create("meta", FLAG_modelMeta); if (FLAG_outputRaw) { modelManager.create("outputRaw", "dump"); } const std::vector<ModelFitter*>& model = modelManager.getModel(); const std::vector<FileWriter*>& fOuts = modelManager.getResultFile(); const size_t numModel = model.size(); // TODO: optimize this by avoidding data copying Matrix phenotypeMatrix; Matrix covariate; toMatrix(dataLoader.getPhenotype(), &phenotypeMatrix); toMatrix(dataLoader.getCovariate(), &covariate); // determine VCF file reading pattern // current support: // * line by line ( including range selection) // * gene by gene // * range by range std::string rangeMode = "Single"; if (FLAG_geneFile.size() && (FLAG_setFile.size() || FLAG_setList.size())) { logger->error("Cannot specify both gene file and set file."); exit(1); } if (!FLAG_gene.empty() && FLAG_geneFile.empty()) { logger->error("Please provide gene file for gene bases analysis."); exit(1); } OrderedMap<std::string, RangeList> geneRange; if (FLAG_geneFile.size()) { rangeMode = "Gene"; int ret = loadGeneFile(FLAG_geneFile.c_str(), FLAG_gene.c_str(), &geneRange); if (ret < 0 || geneRange.size() == 0) { logger->error("Error loading gene file or gene list is empty!"); return -1; } else { logger->info("Loaded [ %zu ] genes.", geneRange.size()); } } if (!FLAG_set.empty() && FLAG_setFile.empty()) { logger->error("Please provide set file for set bases analysis."); exit(1); } if (FLAG_setFile.size()) { rangeMode = "Range"; int ret = loadRangeFile(FLAG_setFile.c_str(), FLAG_set.c_str(), &geneRange); if (ret < 0 || geneRange.size() 
== 0) { logger->error("Error loading set file or set list is empty!"); return -1; } else { logger->info("Loaded [ %zu ] set to tests.", geneRange.size()); } } if (FLAG_setList.size()) { rangeMode = "Range"; int ret = appendListToRange(FLAG_setList, &geneRange); if (ret < 0) { logger->error("Error loading set list or set list is empty!"); return -1; } } DataConsolidator dc; dc.setSex(&dataLoader.getSex()); dc.setFormula(&dataLoader.getFormula()); dc.setGenotypeCounter(ge.getGenotypeCounter()); // load kinshp if needed by family models if (modelManager.hasFamilyModel() || (!FLAG_modelMeta.empty() && !FLAG_kinship.empty())) { logger->info("Family-based model specified. Loading kinship file..."); // process auto kinship if (dc.setKinshipSample(dataLoader.getPhenotype().getRowName()) || dc.setKinshipFile(DataConsolidator::KINSHIP_AUTO, FLAG_kinship) || dc.setKinshipEigenFile(DataConsolidator::KINSHIP_AUTO, FLAG_kinshipEigen) || dc.loadKinship(DataConsolidator::KINSHIP_AUTO)) { logger->error( "Failed to load autosomal kinship (you may use vcf2kinship to " "generate one)."); exit(1); } if (dc.setKinshipFile(DataConsolidator::KINSHIP_X, FLAG_xHemiKinship) || dc.setKinshipEigenFile(DataConsolidator::KINSHIP_X, FLAG_xHemiKinshipEigen) || dc.loadKinship(DataConsolidator::KINSHIP_X)) { logger->warn( "Autosomal kinship loaded, but no hemizygote region kinship " "provided, some sex chromosome tests will be skipped."); // keep the program going } } else if (!FLAG_kinship.empty() && FLAG_modelMeta.empty()) { logger->info( "Family-based model not specified. 
Options related to kinship will be " "ignored here."); } // set imputation method if (FLAG_impute.empty()) { logger->info("Impute missing genotype to mean (by default)"); dc.setStrategy(DataConsolidator::IMPUTE_MEAN); } else if (FLAG_impute == "mean") { logger->info("Impute missing genotype to mean"); dc.setStrategy(DataConsolidator::IMPUTE_MEAN); } else if (FLAG_impute == "hwe") { logger->info("Impute missing genotype by HWE"); dc.setStrategy(DataConsolidator::IMPUTE_HWE); } else if (FLAG_impute == "drop") { logger->info("Drop missing genotypes"); dc.setStrategy(DataConsolidator::DROP); } dc.setPhenotypeName(dataLoader.getPhenotype().getRowName()); // set up par region ParRegion parRegion(FLAG_xLabel, FLAG_xParRegion); dc.setParRegion(&parRegion); // genotype will be extracted and stored Matrix& genotype = dc.getOriginalGenotype(); if (FLAG_freqUpper > 0) { ge.setSiteFreqMax(FLAG_freqUpper); logger->info("Set upper minor allele frequency limit to %g", FLAG_freqUpper); } if (FLAG_freqLower > 0) { ge.setSiteFreqMin(FLAG_freqLower); logger->info("Set lower minor allele frequency limit to %g", FLAG_freqLower); } // handle sex chromosome ge.setParRegion(&parRegion); ge.setSex(&dataLoader.getSex()); // use dosage instead GT if (!FLAG_dosageTag.empty()) { ge.setDosageTag(FLAG_dosageTag); logger->info("Use dosage genotype from VCF flag %s.", FLAG_dosageTag.c_str()); } // genotype QC options if (FLAG_indvDepthMin > 0) { ge.setGDmin(FLAG_indvDepthMin); logger->info("Minimum GD set to %d (or marked as missing genotype).", FLAG_indvDepthMin); } if (FLAG_indvDepthMax > 0) { ge.setGDmax(FLAG_indvDepthMax); logger->info("Maximum GD set to %d (or marked as missing genotype).", FLAG_indvDepthMax); } if (FLAG_indvQualMin > 0) { ge.setGQmin(FLAG_indvQualMin); logger->info("Minimum GQ set to %d (or marked as missing genotype).", FLAG_indvQualMin); } dc.preRegressionCheck(phenotypeMatrix, covariate); logger->info("Analysis started"); Result& buf = dc.getResult(); // we have three 
modes: // * single variant reading, single variant test // * range variant reading, single variant test // * range variant reading, group variant test if (rangeMode == "Single" && singleVariantMode) { // use line by line mode buf.addHeader("CHROM"); buf.addHeader("POS"); buf.addHeader("REF"); buf.addHeader("ALT"); buf.addHeader("N_INFORMATIVE"); // output headers for (size_t m = 0; m < model.size(); m++) { model[m]->writeHeader(fOuts[m], buf); } int variantProcessed = 0; while (true) { buf.clearValue(); int ret = ge.extractSingleGenotype(&genotype, &buf); if (ret == GenotypeExtractor::FILE_END) { // reach file end break; } if (ret == GenotypeExtractor::FAIL_FILTER) { continue; } if (ret != GenotypeExtractor::SUCCEED) { logger->error("Extract genotype failed at site: %s:%s!", buf["CHROM"].c_str(), buf["POS"].c_str()); continue; } if (genotype.cols == 0) { logger->warn("Extract [ %s:%s ] has 0 variants, skipping", buf["CHROM"].c_str(), buf["POS"].c_str()); continue; } ++variantProcessed; dc.consolidate(phenotypeMatrix, covariate, genotype); buf.updateValue("N_INFORMATIVE", toString(genotype.rows)); // fit each model for (size_t m = 0; m != numModel; m++) { model[m]->reset(); model[m]->fit(&dc); model[m]->writeOutput(fOuts[m], buf); } } logger->info("Analyzed [ %d ] variants", variantProcessed); } else if (rangeMode != "Single" && singleVariantMode) { // read by gene/range model, single variant // test buf.addHeader(rangeMode); buf.addHeader("CHROM"); buf.addHeader("POS"); buf.addHeader("REF"); buf.addHeader("ALT"); buf.addHeader("N_INFORMATIVE"); // output headers for (size_t m = 0; m < numModel; m++) { model[m]->writeHeader(fOuts[m], buf); } std::string geneName; RangeList rangeList; int variantProcessed = 0; for (size_t i = 0; i < geneRange.size(); ++i) { geneRange.at(i, &geneName, &rangeList); ge.setRange(rangeList); while (true) { buf.clearValue(); int ret = ge.extractSingleGenotype(&genotype, &buf); if (ret == GenotypeExtractor::FILE_END) { // reach end of this 
region break; } if (ret == GenotypeExtractor::FAIL_FILTER) { continue; } if (ret != GenotypeExtractor::SUCCEED) { logger->error("Extract genotype failed for gene %s!", geneName.c_str()); continue; } if (genotype.cols == 0) { logger->warn("Gene %s has 0 variants, skipping", geneName.c_str()); continue; } ++variantProcessed; dc.consolidate(phenotypeMatrix, covariate, genotype); buf.updateValue(rangeMode, geneName); buf.updateValue("N_INFORMATIVE", genotype.rows); // #pragma omp parallel for for (size_t m = 0; m != numModel; m++) { model[m]->reset(); model[m]->fit(&dc); model[m]->writeOutput(fOuts[m], buf); } } } logger->info("Analyzed [ %d ] variants from [ %d ] genes/regions", variantProcessed, (int)geneRange.size()); } else if (rangeMode != "Single" && groupVariantMode) { // read by gene/range mode, group variant // test buf.addHeader(rangeMode); buf.addHeader("RANGE"); buf.addHeader("N_INFORMATIVE"); buf.addHeader("NumVar"); buf.addHeader("NumPolyVar"); // output headers for (size_t m = 0; m < numModel; m++) { model[m]->writeHeader(fOuts[m], buf); } std::string geneName; RangeList rangeList; int variantProcessed = 0; ge.enableAutoMerge(); for (size_t i = 0; i < geneRange.size(); ++i) { geneRange.at(i, &geneName, &rangeList); ge.setRange(rangeList); buf.clearValue(); int ret = ge.extractMultipleGenotype(&genotype); if (ret != GenotypeExtractor::SUCCEED) { logger->error("Extract genotype failed for gene %s!", geneName.c_str()); continue; } if (genotype.cols == 0) { logger->info("Gene %s has 0 variants, skipping", geneName.c_str()); continue; } variantProcessed += genotype.cols; // genotype is people by marker dc.consolidate(phenotypeMatrix, covariate, genotype); buf.updateValue(rangeMode, geneName); buf.updateValue("RANGE", rangeList.toString()); buf.updateValue("N_INFORMATIVE", genotype.rows); buf.updateValue("NumVar", genotype.cols); buf.updateValue("NumPolyVar", dc.getFlippedToMinorPolymorphicGenotype().cols); // #ifdef _OPENMP // #pragma omp parallel for // 
#endif for (size_t m = 0; m != numModel; m++) { model[m]->reset(); model[m]->fit(&dc); model[m]->writeOutput(fOuts[m], buf); } } logger->info("Analyzed [ %d ] variants from [ %d ] genes/regions", variantProcessed, (int)geneRange.size()); } else { logger->error( "Unsupported reading mode and test modes! (need more parameters?)"); exit(1); } // Resource cleaning up modelManager.close(); delete g_SummaryHeader; time_t endTime = time(0); logger->info("Analysis ends at: %s", currentTime().c_str()); int elapsedSecond = (int)(endTime - startTime); logger->info("Analysis took %d seconds", elapsedSecond); return 0; }