int maxcut_haplotyping(char* fragmentfile, char* variantfile, char* outputfile) { // IMP NOTE: all SNPs start from 1 instead of 0 and all offsets are 1+ fprintf_time(stderr, "Calling Max-Likelihood-Cut based haplotype assembly algorithm\n"); int snps = 0; int fragments = 0, iter = 0, components = 0; int i = 0, k = 0; int* slist; int flag = 0; float bestscore = 0, miscalls = 0; int hic_iter=0; struct SNPfrags* snpfrag = NULL; struct BLOCK* clist; char* HAP1; float HIC_LL_SCORE = -80; float OLD_HIC_LL_SCORE = -80; int converged_count=0, split_count, new_components, component; int new_fragments = 0; struct fragment* new_Flist; // READ FRAGMENT MATRIX fragments = get_num_fragments(fragmentfile); struct fragment* Flist; Flist = (struct fragment*) malloc(sizeof (struct fragment)* fragments); flag = read_fragment_matrix(fragmentfile, Flist, fragments); if (MAX_IS != -1){ // we are going to filter out some insert sizes new_fragments = 0; new_Flist = (struct fragment*) malloc(sizeof (struct fragment)* fragments); for(i = 0; i < fragments; i++){ if (Flist[i].isize < MAX_IS) new_Flist[new_fragments++] = Flist[i]; } Flist = new_Flist; fragments = new_fragments; } if (flag < 0) { fprintf_time(stderr, "unable to read fragment matrix file %s \n", fragmentfile); return -1; } //ADD EDGES BETWEEN SNPS snps = count_variants_vcf(variantfile); if (snps < 0) { fprintf_time(stderr, "unable to read variant file %s \n", variantfile); return -1; } snpfrag = (struct SNPfrags*) malloc(sizeof (struct SNPfrags)*snps); update_snpfrags(Flist, fragments, snpfrag, snps, &components); detect_long_reads(Flist,fragments); // 10/25/2014, edges are only added between adjacent nodes in each fragment and used for determining connected components... 
for (i = 0; i < snps; i++) snpfrag[i].elist = (struct edge*) malloc(sizeof (struct edge)*(snpfrag[i].edges+1)); if (LONG_READS ==0){ add_edges(Flist,fragments,snpfrag,snps,&components); }else if (LONG_READS >=1){ add_edges_fosmids(Flist,fragments,snpfrag,snps,&components); } for (i = 0; i < snps; i++) snpfrag[i].telist = (struct edge*) malloc(sizeof (struct edge)*(snpfrag[i].edges+1)); // this considers only components with at least two nodes fprintf_time(stderr, "fragments %d snps %d component(blocks) %d\n", fragments, snps, components); // BUILD COMPONENT LIST clist = (struct BLOCK*) malloc(sizeof (struct BLOCK)*components); generate_clist_structure(Flist, fragments, snpfrag, snps, components, clist); // READ VCF FILE read_vcffile(variantfile, snpfrag, snps); // INITIALIZE RANDOM HAPLOTYPES HAP1 = (char*) malloc(snps + 1); for (i = 0; i < snps; i++) { if (snpfrag[i].frags == 0 || (SNVS_BEFORE_INDELS && (strlen(snpfrag[i].allele0) != 1 || strlen(snpfrag[i].allele1) != 1))) { HAP1[i] = '-'; } else if (drand48() < 0.5) { HAP1[i] = '0'; } else { HAP1[i] = '1'; } } // for each block, we maintain best haplotype solution under MFR criterion // compute the component-wise score for 'initHAP' haplotype miscalls = 0; bestscore = 0; for (k = 0; k < components; k++) { clist[k].SCORE = 0; clist[k].bestSCORE = 0; for (i = 0; i < clist[k].frags; i++) { update_fragscore(Flist, clist[k].flist[i], HAP1); clist[k].SCORE += Flist[clist[k].flist[i]].currscore; } clist[k].bestSCORE = clist[k].SCORE; bestscore += clist[k].bestSCORE; miscalls += clist[k].SCORE; } fprintf_time(stderr, "processed fragment file and variant file: fragments %d variants %d\n", fragments, snps); int MAXIS = -1; if (HIC){ // determine the probability of an h-trans interaction for read for (i=0; i<fragments;i++){ Flist[i].htrans_prob = -80; if (Flist[i].isize > MAXIS) MAXIS = Flist[i].isize; } HTRANS_MAXBINS = MAXIS/HTRANS_BINSIZE + 1; }else{ HTRANS_MAXBINS = 0; } // read in file with estimated probabilities of 
Hi-C h-trans interactions with distance if (strcmp(HTRANS_DATA_INFILE, "None") != 0){ int num_bins = count_htrans_bins(HTRANS_DATA_INFILE); float* htrans_probs = (float*) malloc(sizeof(float) * num_bins); read_htrans_file(HTRANS_DATA_INFILE, htrans_probs, num_bins); for (i=0; i<fragments;i++){ Flist[i].htrans_prob = log10(htrans_probs[Flist[i].isize / HTRANS_BINSIZE]); } free(htrans_probs); } slist = (int*) malloc(sizeof (int)*snps); OLD_HIC_LL_SCORE = bestscore; for (hic_iter = 0; hic_iter < MAX_HIC_EM_ITER; hic_iter++){ if (VERBOSE) fprintf_time(stdout, "HIC ITER %d\n", hic_iter); for (k = 0; k < components; k++){ clist[k].iters_since_improvement = 0; } for (i=0; i<snps; i++){ snpfrag[i].post_hap = 0; } // RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE LIKELIHOOD for (iter = 0; iter < MAXITER; iter++) { if (VERBOSE) fprintf_time(stdout, "PHASING ITER %d\n", iter); converged_count = 0; for (k = 0; k < components; k++){ if(VERBOSE && iter == 0) fprintf_time(stdout, "component %d length %d phased %d %d...%d\n", k, clist[k].length, clist[k].phased, clist[k].offset, clist[k].lastvar); if (clist[k].SCORE > 0) converged_count += evaluate_cut_component(Flist, snpfrag, clist, k, slist, HAP1); else converged_count++; } if (converged_count == components) { //fprintf(stdout, "Haplotype assembly terminated early because no improvement seen in blocks after %d iterations\n", CONVERGE); break; } } // H-TRANS ESTIMATION FOR HIC if (MAX_HIC_EM_ITER > 1){ // Possibly break if we're done improving HIC_LL_SCORE = 0; for (k = 0; k < components; k++){ HIC_LL_SCORE += clist[k].bestSCORE; } if (HIC_LL_SCORE >= OLD_HIC_LL_SCORE){ break; } OLD_HIC_LL_SCORE = HIC_LL_SCORE; likelihood_pruning(snps, Flist, snpfrag, HAP1, 0); // prune for only very high confidence SNPs // estimate the h-trans probabilities for the next round estimate_htrans_probs(Flist, fragments, HAP1, snpfrag); } } // BLOCK SPLITTING new_components = components; if (SPLIT_BLOCKS){ split_count = 0; for (k=0; k<components; 
k++){ // attempt to split block split_count += split_block(HAP1, clist, k, Flist, snpfrag, &new_components); } if (split_count > 0){ // regenerate clist if necessary free(clist); clist = (struct BLOCK*) malloc(sizeof (struct BLOCK)*new_components); generate_clist_structure(Flist, fragments, snpfrag, snps, new_components, clist); } components = new_components; }else if(ERROR_ANALYSIS_MODE && !HIC){ for (k=0; k<components; k++){ // run split_block but don't actually split, just get posterior probabilities split_block(HAP1, clist, k, Flist, snpfrag, &new_components); } } // PRUNE SNPS if (!SKIP_PRUNE){ discrete_pruning(snps, fragments, Flist, snpfrag, HAP1); likelihood_pruning(snps, Flist, snpfrag, HAP1, CALL_HOMOZYGOUS); } // PRINT OUTPUT FILE fprintf_time(stderr, "OUTPUTTING PRUNED HAPLOTYPE ASSEMBLY TO FILE %s\n", outputfile); print_hapfile(clist, components, HAP1, Flist, fragments, snpfrag, variantfile, miscalls, outputfile); char assignfile[4096]; sprintf(assignfile,"%s.fragments",outputfile); if (OUTPUT_RH_ASSIGNMENTS ==1) fragment_assignments(Flist,fragments,snpfrag,HAP1,assignfile); // added 03/10/2018 to output read-haplotype assignments char outvcffile[4096]; sprintf(outvcffile,"%s.phased.VCF",outputfile); if (OUTPUT_VCF ==1) { fprintf_time(stderr, "OUTPUTTING PHASED VCF TO FILE %s\n", outvcffile); output_vcf(variantfile,snpfrag,snps,HAP1,Flist,fragments,outvcffile,0); } // FREE UP MEMORY for (i = 0; i < snps; i++) free(snpfrag[i].elist); for (i = 0; i < snps; i++) free(snpfrag[i].telist); component = 0; for (i = 0; i < snps; i++) { free(snpfrag[i].flist); free(snpfrag[i].alist); free(snpfrag[i].jlist); free(snpfrag[i].klist); if (snpfrag[i].component == i && snpfrag[i].csize > 1) // root node of component { free(clist[component].slist); component++; } } for (i = 0; i < components; i++) free(clist[i].flist); free(snpfrag); free(clist); free(Flist); return 0; }
// Entry point for the splitMultiAllelicVariants tool: reads a genotyped VCF,
// filters low-posterior genotypes and alleles, splits every multi-allelic
// record into bi-allelic records, and writes the result to <output_prefix>.vcf.
int main(int argc, char const *argv[]) {

    if (argc != 5) {
        std::cout << "USAGE: splitMultiAllelicVariants <variants> <output_prefix> <min_allele_posterior> <min_genotype_posterior>" << std::endl;
        return 1;
    }

    cout << "\n[" << Utils::getLocalTime() << "] Running BayesTyperUtils (" << BTU_VERSION << ") splitMultiAllelicVariants script ...\n" << endl;

    GenotypedVcfFileReader vcf_reader(string(argv[1]), true);
    // Keep only the GT and GPP FORMAT descriptors in the reader's metadata.
    Auxiliaries::removeNonRelevantFormatDescriptors(&(vcf_reader.metaData()), {"GT", "GPP"});

    // Output metadata: drop filters and the INFO/FORMAT descriptors that are
    // no longer meaningful after splitting (ACP is re-derived per record).
    auto output_meta_data = vcf_reader.metaData();
    output_meta_data.filterDescriptors().clear();
    output_meta_data.infoDescriptors().erase("ACP");
    output_meta_data.infoDescriptors().erase("AsmVar_ASQR");
    output_meta_data.infoDescriptors().erase("DNE");
    output_meta_data.formatDescriptors().erase("GPP");

    VcfFileWriter output_vcf(string(argv[2]) + ".vcf", output_meta_data, true);

    float min_allele_posterior = stof(argv[3]);
    float min_genotype_posterior = stof(argv[4]);

    Variant * cur_var;
    auto sample_ids = output_meta_data.sampleIds();

    // Running statistics reported at the end.
    uint num_variants = 0;
    uint num_alt_alleles = 0;
    uint num_missing_alleles = 0;
    uint num_called_split_variants = 0;
    uint num_filtered_alleles = 0;
    uint num_filtered_samples = 0;

    while (vcf_reader.getNextVariant(&cur_var)) {

        num_variants++;
        num_alt_alleles += cur_var->numAlts();

        // Collect samples whose genotype survives the posterior filter;
        // low-posterior genotypes are cleared (set to missing).
        vector<string> complete_sample_ids;
        complete_sample_ids.reserve(sample_ids.size());

        for (auto & sample_id: sample_ids) {

            Sample & cur_sample = cur_var->getSample(sample_id);

            if (cur_sample.callStatus() != Sample::CallStatus::Missing) {

                assert(cur_sample.callStatus() == Sample::CallStatus::Complete);

                auto genotype_gpp = Auxiliaries::getGenotypePosterior(cur_sample);
                // Sanity check: the called genotype's posterior is the maximum one.
                assert(genotype_gpp == Auxiliaries::getMaxGenotypePosterior(cur_sample));

                if (genotype_gpp.second) {

                    if (genotype_gpp.first < min_genotype_posterior) {

                        cur_sample.clearGenotypeEstimate();
                        assert(cur_sample.callStatus() == Sample::CallStatus::Missing);
                        num_filtered_samples++;

                    } else {

                        complete_sample_ids.push_back(sample_id);
                    }

                } else {

                    // No posterior available: only legal for zero-ploidy samples.
                    assert(cur_sample.ploidy() == Sample::Ploidy::Zeroploid);
                }
            }
        }

        // Peel off one alternative allele at a time until the record is
        // bi-allelic; each kept allele is written as its own record.
        while (cur_var->numAlts() > 1) {

            // The missing allele ("*"), if any, is expected to sort last —
            // the front allele handled here must be a real alternative.
            assert(!(cur_var->alt(0).isMissing()));

            // ACP: presumably the allele call posterior — confirm against
            // the BayesTyper INFO descriptor definitions.
            auto acp_value = cur_var->alt(0).info().getValue<float>("ACP");
            assert(acp_value.second);

            if (acp_value.first < min_allele_posterior) {

                num_filtered_alleles++;

            } else {

                // Copy the record and strip all alts except the front one.
                Variant * new_var = new Variant(*cur_var);

                vector<uint> alts_remove(new_var->numAlts() - 1);
                iota(alts_remove.begin(), alts_remove.end(), 1); // remove alts 1..n-1

                new_var->removeAlts(alts_remove);

                assert(new_var->numAlls() == 2);
                assert(new_var->numAlts() == 1);

                num_called_split_variants++;

                updateGenotypes(new_var, complete_sample_ids);
                writeVariant(&output_vcf, new_var);
            }

            cur_var->removeAlts({0});
        }

        // Exactly one alternative allele remains on the original record.
        assert(cur_var->numAlls() == 2);
        assert(cur_var->numAlts() == 1);

        // NOTE(review): unlike the loop above, ACP presence is asserted here
        // *before* the isMissing() check — this relies on missing alleles
        // also carrying an ACP value; verify against the VCF producer.
        auto acp_value = cur_var->alt(0).info().getValue<float>("ACP");
        assert(acp_value.second);

        if (cur_var->alt(0).isMissing()) {

            num_missing_alleles++;

        } else if (acp_value.first < min_allele_posterior) {

            num_filtered_alleles++;

        } else {

            num_called_split_variants++;

            updateGenotypes(cur_var, complete_sample_ids);
            writeVariant(&output_vcf, cur_var);
        }

        // Progress heartbeat every 100k input variants.
        if ((num_variants % 100000) == 0) {

            std::cout << "[" << Utils::getLocalTime() << "] Parsed " << num_variants << " variants" << endl;
        }

        delete cur_var;
    }

    // Final summary statistics.
    cout << "\n[" << Utils::getLocalTime() << "] Parsed " << num_variants << " variants and " << num_alt_alleles << " alternative alleles (" << num_missing_alleles << " excluded missing alleles)." << endl;
    cout << "\n[" << Utils::getLocalTime() << "] Filtered " << num_filtered_alleles << " alternative alleles with an allele posterior less than " << min_allele_posterior << "." << endl;
    cout << "[" << Utils::getLocalTime() << "] Filtered " << num_filtered_samples << " samples with a genotype posterior less than " << min_genotype_posterior << "." << endl;
    cout << "\n[" << Utils::getLocalTime() << "] Wrote " << num_called_split_variants << " called bi-allelic variants." << endl;
    cout << endl;

    return 0;
}
// Merge two or more genotyped VCFs that contain the SAME variants (same
// chrom/pos/ids/alleles, in the same order) but disjoint sample sets:
// the first file is the template; samples from the remaining files are
// added onto each template record, allele statistics (AN/AC/AF/ACP) are
// recomputed across all samples, and the result is written to
// <outfile_prefix>.vcf.
void merge(const vector<string> & in_vcf_filenames, const string & outfile_prefix) {

    assert(in_vcf_filenames.size() > 1);

    GenotypedVcfFileReader tmpl_vcf(in_vcf_filenames.front(), true);

    // Prepare output metadata
    VcfMetaData output_meta_data = tmpl_vcf.metaData();

    uint num_samples = tmpl_vcf.metaData().numSamples();

    vector<unique_ptr<GenotypedVcfFileReader> > in_vcfs;

    // Open the remaining input VCFs, accumulate their samples into the
    // output metadata, and require identical contigs/descriptors throughout.
    for (uint in_vcf_idx = 1; in_vcf_idx < in_vcf_filenames.size(); in_vcf_idx++) {

        in_vcfs.push_back(unique_ptr<GenotypedVcfFileReader> (new GenotypedVcfFileReader(in_vcf_filenames.at(in_vcf_idx), true)));
        num_samples += in_vcfs.back()->metaData().numSamples();

        for (auto & smpl_id : in_vcfs.back()->metaData().sampleIds()) {

            output_meta_data.addSample(smpl_id);
        }

        assert(tmpl_vcf.metaData().contigs() == in_vcfs.back()->metaData().contigs());
        assert(tmpl_vcf.metaData().infoDescriptors() == in_vcfs.back()->metaData().infoDescriptors());
        assert(tmpl_vcf.metaData().filterDescriptors() == in_vcfs.back()->metaData().filterDescriptors());
        assert(tmpl_vcf.metaData().formatDescriptors() == in_vcfs.back()->metaData().formatDescriptors());
    }

    cout << "[" << Utils::getLocalTime() << "] Running BayesTyperUtils (" << BTU_VERSION << ") merge on " << in_vcf_filenames.size() << " files with containing " << num_samples << " samples in total ...\n" << endl;

    // HC must exist in the input metadata; it is dropped from the output.
    assert(output_meta_data.infoDescriptors().erase("HC"));
    VcfFileWriter output_vcf(outfile_prefix + ".vcf", output_meta_data, true);

    // INFO keys expected to be identical between template and input records.
    vector<string> var_value_assert_keys = {"VT", "VCS", "VCI", "VCGS", "VCGI", "HCR", "AE", "ACO", "AsmVar_ASQR"};

    // NOTE(review): sample_ids is never used below — candidate for removal.
    auto sample_ids = output_meta_data.sampleIds();

    uint num_variants = 0;

    // Variants are read and merged in batches of cache_size records.
    uint cache_size = 10000;

    vector<Variant*> tmpl_var_cache(cache_size, nullptr);
    vector<vector<Variant*> > in_var_caches(in_vcfs.size(), vector<Variant*>(cache_size, nullptr));

    bool reached_last_var = false;

    while (!reached_last_var) {

        /* Fill cache */
        for (uint cache_idx = 0; cache_idx < cache_size; cache_idx++) {

            reached_last_var = !tmpl_vcf.getNextVariant(&tmpl_var_cache.at(cache_idx));

            if (reached_last_var) {

                // Shrink the batch to the records actually read; the loop
                // condition then terminates on the next check.
                cache_size = cache_idx;
                tmpl_var_cache.resize(cache_size);
            }
        }

        // Every other input must yield the same number of records.
        for (uint in_vcf_idx = 0; in_vcf_idx < in_vcfs.size(); in_vcf_idx++) {

            for (uint cache_idx = 0; cache_idx < cache_size; cache_idx++) {

                assert(in_vcfs.at(in_vcf_idx)->getNextVariant(&in_var_caches.at(in_vcf_idx).at(cache_idx)));
            }
        }

        /* Merge vars in cache */
        for (uint cache_idx = 0; cache_idx < cache_size; cache_idx++) {

            num_variants++;

            Variant * cur_tmpl_var = tmpl_var_cache.at(cache_idx);
            assert(cur_tmpl_var);

            assert(cur_tmpl_var->filters().size() == 1);
            assert((cur_tmpl_var->filters().front() == "PASS") or (cur_tmpl_var->filters().front() == "UV"));

            // Union of "alleles not covered" (ANC) indices across all inputs.
            set<ushort> alleles_not_covered;

            auto cur_tmpl_var_anc_values = cur_tmpl_var->info().getValue<string>("ANC");

            if (cur_tmpl_var_anc_values.second) {

                auto cur_tmpl_var_anc_values_split = Utils::splitString(cur_tmpl_var_anc_values.first, ',');

                for (auto &anc_value: cur_tmpl_var_anc_values_split) {

                    alleles_not_covered.insert(stoi(anc_value));
                }
            }

            for (uint in_vcf_idx = 0; in_vcf_idx < in_vcfs.size(); in_vcf_idx++) {

                Variant * cur_in_var = in_var_caches.at(in_vcf_idx).at(cache_idx);
                assert(cur_in_var);

                // Records must describe the same variant.
                assert(cur_tmpl_var->chrom() == cur_in_var->chrom());
                assert(cur_tmpl_var->pos() == cur_in_var->pos());
                assert(cur_tmpl_var->ids() == cur_in_var->ids());
                assert(cur_tmpl_var->numAlts() == cur_in_var->numAlts());

                for (auto & var_value_assert_key : var_value_assert_keys) {

                    assert(cur_tmpl_var->info().getValue(var_value_assert_key) == cur_in_var->info().getValue(var_value_assert_key));
                }

                assert(cur_in_var->filters().size() == 1);
                // UV status ("unsupported variant" — confirm meaning) must agree.
                assert((cur_tmpl_var->filters().front() == "UV") == (cur_in_var->filters().front() == "UV"));

                if (cur_in_var->filters().front() == "UV") {

                    assert(Utils::splitString(fetchValue<string>(cur_in_var->info(), "AE"), ',').size() == cur_in_var->numAlls());

                    // For UV records all allele statistics must match as well.
                    assert(cur_tmpl_var->info().getValue("AC") == cur_in_var->info().getValue("AC"));
                    assert(cur_tmpl_var->info().getValue("AF") == cur_in_var->info().getValue("AF"));
                    assert(cur_tmpl_var->info().getValue("AN") == cur_in_var->info().getValue("AN"));
                    assert(cur_tmpl_var->info().getValue("ACP") == cur_in_var->info().getValue("ACP"));
                    assert(cur_tmpl_var->info().getValue("ANC") == cur_in_var->info().getValue("ANC"));

                } else {

                    assert(cur_in_var->filters().front() == "PASS");
                }

                // HRS flag is propagated if set in any input.
                if (cur_in_var->info().getValue("HRS").second) {

                    cur_tmpl_var->info().addFlag("HRS");
                }

                auto cur_in_var_anc_values = cur_in_var->info().getValue<string>("ANC");

                if (cur_in_var_anc_values.second) {

                    auto cur_in_var_anc_values_split = Utils::splitString(cur_in_var_anc_values.first, ',');

                    for (auto &anc_value: cur_in_var_anc_values_split) {

                        alleles_not_covered.insert(stoi(anc_value));
                    }
                }

                for (uint all_idx = 0; all_idx < cur_tmpl_var->numAlls(); all_idx++) {

                    assert(cur_tmpl_var->allele(all_idx) == cur_in_var->allele(all_idx));
                }

                // Attach this input's samples to the template record.
                for (auto & smpl_id : in_vcfs.at(in_vcf_idx)->metaData().sampleIds()) {

                    cur_tmpl_var->addSample(smpl_id, cur_in_var->getSample(smpl_id));
                }

                delete cur_in_var;
            }

            // Write the merged ANC set back as a comma-separated string.
            if (!(alleles_not_covered.empty())) {

                JoiningString anc_elements(',');

                for (auto &allele: alleles_not_covered) {

                    anc_elements.join(to_string(allele));
                }

                cur_tmpl_var->info().setValue<string>("ANC", anc_elements.str());
            }

            // Recompute allele statistics over the now-combined sample set.
            auto allele_stats = Stats::calcAlleleStats(cur_tmpl_var);
            assert(!cur_tmpl_var->info().setValue<int>("AN", allele_stats.first.allele_count_sum));

            auto map_call_prob_and_var_qual = Stats::calcAlleleCallProbAndQualFromAllelePosteriors(cur_tmpl_var);
            assert(map_call_prob_and_var_qual.first.size() == cur_tmpl_var->numAlls());

            for (uint all_idx = 0; all_idx < cur_tmpl_var->numAlls(); all_idx++) {

                assert(!(cur_tmpl_var->allele(all_idx).info().setValue<float>("ACP", map_call_prob_and_var_qual.first.at(all_idx))));

                // AC/AF are only defined for alternative alleles (index > 0).
                if (all_idx > 0) {

                    assert(!cur_tmpl_var->allele(all_idx).info().setValue<int>("AC", allele_stats.first.allele_counts.at(all_idx)));
                    assert(!cur_tmpl_var->allele(all_idx).info().setValue<float>("AF", allele_stats.first.allele_freqs.at(all_idx)));
                }
            }

            cur_tmpl_var->setQual(make_pair(map_call_prob_and_var_qual.second, true));

            output_vcf.write(cur_tmpl_var);
            delete cur_tmpl_var;

            if (num_variants % 100000 == 0) {

                cout << "[" << Utils::getLocalTime() << "] Merged " << num_variants << " variant(s)" << endl;
            }
        }

        // Reset the caches for the next batch.
        tmpl_var_cache = vector<Variant*>(cache_size, nullptr);
        in_var_caches = vector<vector<Variant*> >(in_vcfs.size(), vector<Variant*>(cache_size, nullptr));
    }

    Variant * dummy_var;

    // All inputs must be exhausted exactly when the template is.
    for (auto & in_vcf : in_vcfs) {

        assert(!in_vcf->getNextVariant(&dummy_var));
    }

    cout << "\n[" << Utils::getLocalTime() << "] Completed merge of " << num_variants << " variant(s)" << endl;
    cout << endl;
}