// Beispiel #1 (Example #1)
// 0
/**
 * Run the Max-Likelihood-Cut based haplotype assembly algorithm.
 *
 * Reads a fragment matrix and a VCF, builds the SNP-fragment graph, runs the
 * iterative max-cut phasing (with optional Hi-C h-trans EM iterations), then
 * optionally splits/prunes blocks and writes the phased haplotypes.
 *
 * @param fragmentfile  fragment matrix file (one fragment per read / read pair)
 * @param variantfile   VCF file with the variants to phase
 * @param outputfile    prefix for the output haplotype / phased-VCF files
 * @return 0 on success, -1 if the fragment matrix or variant file cannot be read
 *
 * IMP NOTE: all SNPs start from 1 instead of 0 and all offsets are 1+
 */
int maxcut_haplotyping(char* fragmentfile, char* variantfile, char* outputfile) {
    fprintf_time(stderr, "Calling Max-Likelihood-Cut based haplotype assembly algorithm\n");

    int snps = 0;
    int fragments = 0, iter = 0, components = 0;
    int i = 0, k = 0;
    int* slist;
    int flag = 0;
    float bestscore = 0, miscalls = 0;
    int hic_iter=0;
    struct SNPfrags* snpfrag = NULL;
    struct BLOCK* clist;
    char* HAP1;
    float HIC_LL_SCORE = -80;
    float OLD_HIC_LL_SCORE = -80;
    int converged_count=0, split_count, new_components, component;

    int new_fragments = 0;
    struct fragment* new_Flist;

    // READ FRAGMENT MATRIX
    fragments = get_num_fragments(fragmentfile);
    struct fragment* Flist;
    Flist     = (struct fragment*) malloc(sizeof (struct fragment)* fragments);
    flag = read_fragment_matrix(fragmentfile, Flist, fragments);

    // BUGFIX: the read status used to be checked only *after* the MAX_IS
    // filtering below, so a failed read would filter uninitialized fragments.
    if (flag < 0) {
        fprintf_time(stderr, "unable to read fragment matrix file %s \n", fragmentfile);
        free(Flist);
        return -1;
    }

    if (MAX_IS != -1){
        // we are going to filter out fragments with insert size >= MAX_IS
        new_fragments = 0;
        new_Flist = (struct fragment*) malloc(sizeof (struct fragment)* fragments);
        for(i = 0; i < fragments; i++){
            if (Flist[i].isize < MAX_IS) new_Flist[new_fragments++] = Flist[i];
        }
        // BUGFIX: free the old top-level array (previously leaked). Kept
        // fragments were shallow-copied, so their internal buffers remain
        // valid through new_Flist. NOTE(review): internal buffers of the
        // *discarded* fragments still leak here; releasing them would need a
        // project-specific fragment destructor — confirm one exists.
        free(Flist);
        Flist = new_Flist;
        fragments = new_fragments;
    }

    //ADD EDGES BETWEEN SNPS
    snps = count_variants_vcf(variantfile);
    if (snps < 0) {
        fprintf_time(stderr, "unable to read variant file %s \n", variantfile);
        free(Flist);
        return -1;
    }

    snpfrag = (struct SNPfrags*) malloc(sizeof (struct SNPfrags)*snps);
    update_snpfrags(Flist, fragments, snpfrag, snps, &components);

    detect_long_reads(Flist,fragments);

    // 10/25/2014, edges are only added between adjacent nodes in each fragment and used for determining connected components...
    for (i = 0; i < snps; i++) snpfrag[i].elist = (struct edge*) malloc(sizeof (struct edge)*(snpfrag[i].edges+1));
    if (LONG_READS ==0){
        add_edges(Flist,fragments,snpfrag,snps,&components);
    }else if (LONG_READS >=1){
        add_edges_fosmids(Flist,fragments,snpfrag,snps,&components);
    }

    for (i = 0; i < snps; i++) snpfrag[i].telist = (struct edge*) malloc(sizeof (struct edge)*(snpfrag[i].edges+1));

    // this considers only components with at least two nodes
    fprintf_time(stderr, "fragments %d snps %d component(blocks) %d\n", fragments, snps, components);

    // BUILD COMPONENT LIST
    clist = (struct BLOCK*) malloc(sizeof (struct BLOCK)*components);
    generate_clist_structure(Flist, fragments, snpfrag, snps, components, clist);

    // READ VCF FILE
    read_vcffile(variantfile, snpfrag, snps);

    // INITIALIZE RANDOM HAPLOTYPES
    // '-' marks unphaseable SNPs: no covering fragments, or (optionally)
    // indels when SNVS_BEFORE_INDELS is set; all others start at random.
    HAP1 = (char*) malloc(snps + 1);
    for (i = 0; i < snps; i++) {
        if (snpfrag[i].frags == 0 || (SNVS_BEFORE_INDELS && (strlen(snpfrag[i].allele0) != 1 || strlen(snpfrag[i].allele1) != 1))) {
            HAP1[i] = '-';
        } else if (drand48() < 0.5) {
            HAP1[i] = '0';
        } else {
            HAP1[i] = '1';
        }
    }

    // for each block, we maintain best haplotype solution under MFR criterion
    // compute the component-wise score for 'initHAP' haplotype
    miscalls = 0;
    bestscore = 0;
    for (k = 0; k < components; k++) {
        clist[k].SCORE = 0;
        clist[k].bestSCORE = 0;
        for (i = 0; i < clist[k].frags; i++) {
            update_fragscore(Flist, clist[k].flist[i], HAP1);
            clist[k].SCORE += Flist[clist[k].flist[i]].currscore;
        }
        clist[k].bestSCORE = clist[k].SCORE;
        bestscore += clist[k].bestSCORE;
        miscalls += clist[k].SCORE;
    }

    fprintf_time(stderr, "processed fragment file and variant file: fragments %d variants %d\n", fragments, snps);

    // local maximum observed insert size (distinct from the MAX_IS global)
    int MAXIS = -1;

    if (HIC){

        // determine the probability of an h-trans interaction for read

        for (i=0; i<fragments;i++){

            Flist[i].htrans_prob = -80;

            if (Flist[i].isize > MAXIS)
                MAXIS = Flist[i].isize;
        }

        HTRANS_MAXBINS = MAXIS/HTRANS_BINSIZE + 1;
    }else{
        HTRANS_MAXBINS = 0;
    }

    // read in file with estimated probabilities of Hi-C h-trans interactions with distance
    if (strcmp(HTRANS_DATA_INFILE, "None") != 0){
        int num_bins        = count_htrans_bins(HTRANS_DATA_INFILE);
        float* htrans_probs = (float*) malloc(sizeof(float) * num_bins);
        read_htrans_file(HTRANS_DATA_INFILE, htrans_probs, num_bins);
        for (i=0; i<fragments;i++){
            Flist[i].htrans_prob = log10(htrans_probs[Flist[i].isize / HTRANS_BINSIZE]);
        }
        free(htrans_probs);
    }

    slist = (int*) malloc(sizeof (int)*snps);

    OLD_HIC_LL_SCORE = bestscore;
    // Outer EM loop: only iterates more than once for Hi-C data
    // (MAX_HIC_EM_ITER > 1); each pass re-estimates h-trans probabilities.
    for (hic_iter = 0; hic_iter < MAX_HIC_EM_ITER; hic_iter++){
        if (VERBOSE)
            fprintf_time(stdout, "HIC ITER %d\n", hic_iter);
        for (k = 0; k < components; k++){
            clist[k].iters_since_improvement = 0;
        }
        for (i=0; i<snps; i++){
            snpfrag[i].post_hap = 0;
        }
        // RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE LIKELIHOOD
        for (iter = 0; iter < MAXITER; iter++) {
            if (VERBOSE)
                fprintf_time(stdout, "PHASING ITER %d\n", iter);
            converged_count = 0;
            for (k = 0; k < components; k++){
                if(VERBOSE && iter == 0)
                    fprintf_time(stdout, "component %d length %d phased %d %d...%d\n", k, clist[k].length, clist[k].phased, clist[k].offset, clist[k].lastvar);
                if (clist[k].SCORE > 0)
                    converged_count += evaluate_cut_component(Flist, snpfrag, clist, k, slist, HAP1);
                else converged_count++;
            }

            if (converged_count == components) {
                //fprintf(stdout, "Haplotype assembly terminated early because no improvement seen in blocks after %d iterations\n", CONVERGE);
                break;
            }
        }

        // H-TRANS ESTIMATION FOR HIC
        if (MAX_HIC_EM_ITER > 1){

            // Possibly break if we're done improving (scores are
            // miscall-style costs, so "no decrease" means no improvement)
            HIC_LL_SCORE = 0;
            for (k = 0; k < components; k++){
                HIC_LL_SCORE += clist[k].bestSCORE;
            }
            if (HIC_LL_SCORE >= OLD_HIC_LL_SCORE){
                break;
            }
            OLD_HIC_LL_SCORE = HIC_LL_SCORE;

            likelihood_pruning(snps, Flist, snpfrag, HAP1, 0); // prune for only very high confidence SNPs
            // estimate the h-trans probabilities for the next round
            estimate_htrans_probs(Flist, fragments, HAP1, snpfrag);
        }
    }

    // BLOCK SPLITTING
    new_components = components;
    if (SPLIT_BLOCKS){
        split_count = 0;
        for (k=0; k<components; k++){
            // attempt to split block
            split_count += split_block(HAP1, clist, k, Flist, snpfrag, &new_components);
        }
        if (split_count > 0){
            // regenerate clist if necessary
            free(clist);
            clist = (struct BLOCK*) malloc(sizeof (struct BLOCK)*new_components);
            generate_clist_structure(Flist, fragments, snpfrag, snps, new_components, clist);
        }
        components = new_components;
    }else if(ERROR_ANALYSIS_MODE && !HIC){
        for (k=0; k<components; k++){
            // run split_block but don't actually split, just get posterior probabilities
            split_block(HAP1, clist, k, Flist, snpfrag, &new_components);
        }
    }

    // PRUNE SNPS
    if (!SKIP_PRUNE){
        discrete_pruning(snps, fragments, Flist, snpfrag, HAP1);
        likelihood_pruning(snps, Flist, snpfrag, HAP1, CALL_HOMOZYGOUS);
    }
    // PRINT OUTPUT FILE
    fprintf_time(stderr, "OUTPUTTING PRUNED HAPLOTYPE ASSEMBLY TO FILE %s\n", outputfile);
    print_hapfile(clist, components, HAP1, Flist, fragments, snpfrag, variantfile, miscalls, outputfile);
    char assignfile[4096];  sprintf(assignfile,"%s.fragments",outputfile);
    if (OUTPUT_RH_ASSIGNMENTS ==1) fragment_assignments(Flist,fragments,snpfrag,HAP1,assignfile); // added 03/10/2018 to output read-haplotype assignments
    char outvcffile[4096];  sprintf(outvcffile,"%s.phased.VCF",outputfile);
    if (OUTPUT_VCF ==1) {
    	fprintf_time(stderr, "OUTPUTTING PHASED VCF TO FILE %s\n", outvcffile);
	output_vcf(variantfile,snpfrag,snps,HAP1,Flist,fragments,outvcffile,0);
    }

    // FREE UP MEMORY
    for (i = 0; i < snps; i++) free(snpfrag[i].elist);
    for (i = 0; i < snps; i++) free(snpfrag[i].telist);
    component = 0;
    for (i = 0; i < snps; i++) {
        free(snpfrag[i].flist);
        free(snpfrag[i].alist);
        free(snpfrag[i].jlist);
        free(snpfrag[i].klist);

        if (snpfrag[i].component == i && snpfrag[i].csize > 1) // root node of component
        {
            free(clist[component].slist);
            component++;
        }
    }

    for (i = 0; i < components; i++) free(clist[i].flist);
    free(snpfrag);
    free(clist);
    free(Flist);

    return 0;
}
// Splits every multi-allelic variant in the input VCF into bi-allelic records.
// Alleles with an "ACP" (allele call posterior) below min_allele_posterior are
// dropped; sample genotypes with a posterior ("GPP") below
// min_genotype_posterior are cleared (set to missing) before writing.
int main(int argc, char const *argv[]) {

	if (argc != 5) {

		std::cout << "USAGE: splitMultiAllelicVariants <variants> <output_prefix> <min_allele_posterior> <min_genotype_posterior>" << std::endl;
		return 1;
	}

    cout << "\n[" << Utils::getLocalTime() << "] Running BayesTyperUtils (" << BTU_VERSION << ") splitMultiAllelicVariants script ...\n" << endl;

	// Keep only the GT and GPP FORMAT fields from the input metadata.
	GenotypedVcfFileReader vcf_reader(string(argv[1]), true);
 	Auxiliaries::removeNonRelevantFormatDescriptors(&(vcf_reader.metaData()), {"GT", "GPP"});

	auto output_meta_data = vcf_reader.metaData();

	// Drop descriptors that no longer apply to the split/filtered output.
	output_meta_data.filterDescriptors().clear();
	output_meta_data.infoDescriptors().erase("ACP");
	output_meta_data.infoDescriptors().erase("AsmVar_ASQR");
	output_meta_data.infoDescriptors().erase("DNE");
	output_meta_data.formatDescriptors().erase("GPP");

	VcfFileWriter output_vcf(string(argv[2]) + ".vcf", output_meta_data, true);

	float min_allele_posterior = stof(argv[3]);
	float min_genotype_posterior = stof(argv[4]);

	Variant * cur_var;
	auto sample_ids = output_meta_data.sampleIds();

	// Running counters for the summary printed at the end.
	uint num_variants = 0;
	uint num_alt_alleles = 0;
	uint num_missing_alleles = 0;
	uint num_called_split_variants = 0;

	uint num_filtered_alleles = 0;
	uint num_filtered_samples = 0;

	while (vcf_reader.getNextVariant(&cur_var)) {

		num_variants++;
		num_alt_alleles += cur_var->numAlts();

		// Samples that keep a confident, complete genotype call; genotypes
		// below min_genotype_posterior are cleared (become Missing).
		vector<string> complete_sample_ids;
		complete_sample_ids.reserve(sample_ids.size());

    	for (auto & sample_id: sample_ids) {

			Sample & cur_sample = cur_var->getSample(sample_id);

            if (cur_sample.callStatus() != Sample::CallStatus::Missing) {

            	assert(cur_sample.callStatus() == Sample::CallStatus::Complete);

            	auto genotype_gpp = Auxiliaries::getGenotypePosterior(cur_sample);
            	assert(genotype_gpp == Auxiliaries::getMaxGenotypePosterior(cur_sample));

            	// .second flags whether a posterior value is available
            	if (genotype_gpp.second) {

		            if (genotype_gpp.first < min_genotype_posterior) {

		                cur_sample.clearGenotypeEstimate();
		            	assert(cur_sample.callStatus() == Sample::CallStatus::Missing);

		                num_filtered_samples++;
		            
		            } else {

		            	complete_sample_ids.push_back(sample_id);
		            }

            	} else {

            		// no posterior only expected for zeroploid samples
            		assert(cur_sample.ploidy() == Sample::Ploidy::Zeroploid);
            	}
            } 
        }
    	
		// Peel off alt allele 0 repeatedly until the variant is bi-allelic;
		// each confident allele is emitted as its own bi-allelic copy.
		while (cur_var->numAlts() > 1) {

			assert(!(cur_var->alt(0).isMissing()));

			auto acp_value = cur_var->alt(0).info().getValue<float>("ACP");
			assert(acp_value.second);

			if (acp_value.first < min_allele_posterior) {

				num_filtered_alleles++;

			} else {

				// Copy the variant and strip all alts except allele 0.
				Variant * new_var = new Variant(*cur_var);

				vector<uint> alts_remove(new_var->numAlts() - 1);
				iota(alts_remove.begin(), alts_remove.end(), 1);

				new_var->removeAlts(alts_remove);
	
				assert(new_var->numAlls() == 2);
				assert(new_var->numAlts() == 1);

				num_called_split_variants++;

				// NOTE(review): new_var is presumably owned/deleted by
				// writeVariant — confirm, otherwise this leaks per split.
				updateGenotypes(new_var, complete_sample_ids);
				writeVariant(&output_vcf, new_var);
			}

			cur_var->removeAlts({0});
		}

		assert(cur_var->numAlls() == 2);
		assert(cur_var->numAlts() == 1);

		// NOTE(review): ACP is fetched (and asserted present) *before* the
		// missing-allele check below — so even missing alleles must carry ACP.
		auto acp_value = cur_var->alt(0).info().getValue<float>("ACP");
		assert(acp_value.second);

		if (cur_var->alt(0).isMissing()) {

			num_missing_alleles++;
			
		} else if (acp_value.first < min_allele_posterior) {

			num_filtered_alleles++;

		} else {

			num_called_split_variants++;

			updateGenotypes(cur_var, complete_sample_ids);
			writeVariant(&output_vcf, cur_var);
		}

		// progress report every 100k input variants
		if ((num_variants % 100000) == 0) {

			std::cout << "[" << Utils::getLocalTime() << "] Parsed " << num_variants << " variants" << endl;
		}

		delete cur_var;
	}

	cout << "\n[" << Utils::getLocalTime() << "] Parsed " << num_variants << " variants and " << num_alt_alleles << " alternative alleles (" << num_missing_alleles << " excluded missing alleles)." <<  endl;
	cout << "\n[" << Utils::getLocalTime() << "] Filtered " << num_filtered_alleles << " alternative alleles with an allele posterior less than " << min_allele_posterior << "." << endl;
	cout << "[" << Utils::getLocalTime() << "] Filtered " << num_filtered_samples << " samples with a genotype posterior less than " << min_genotype_posterior << "." << endl;

	cout << "\n[" << Utils::getLocalTime() << "] Wrote " << num_called_split_variants << " called bi-allelic variants." << endl;

	cout << endl;

	return 0;
}
	// Merges the sample columns of several genotyped VCFs that contain the
	// exact same variant sites (same chrom/pos/ids/alleles, asserted below)
	// into one output VCF. The first file serves as the template; "ANC"
	// values are unioned across inputs, and AN/AC/AF/ACP/QUAL are recomputed
	// from the merged sample set. Variants are processed in fixed-size caches.
	void merge(const vector<string> & in_vcf_filenames, const string & outfile_prefix) {

		assert(in_vcf_filenames.size() > 1);
		GenotypedVcfFileReader tmpl_vcf(in_vcf_filenames.front(), true);

		// Prepare output metadata
		VcfMetaData output_meta_data = tmpl_vcf.metaData();
		uint num_samples = tmpl_vcf.metaData().numSamples();

		vector<unique_ptr<GenotypedVcfFileReader> > in_vcfs;
		
		// Open the remaining inputs, add their samples to the output header,
		// and require their metadata to match the template's.
		for (uint in_vcf_idx = 1; in_vcf_idx < in_vcf_filenames.size(); in_vcf_idx++) {

			in_vcfs.push_back(unique_ptr<GenotypedVcfFileReader> (new GenotypedVcfFileReader(in_vcf_filenames.at(in_vcf_idx), true)));
			num_samples += in_vcfs.back()->metaData().numSamples();

			for (auto & smpl_id : in_vcfs.back()->metaData().sampleIds()) {

				output_meta_data.addSample(smpl_id);
			}

			assert(tmpl_vcf.metaData().contigs() == in_vcfs.back()->metaData().contigs());
			assert(tmpl_vcf.metaData().infoDescriptors() == in_vcfs.back()->metaData().infoDescriptors());
			assert(tmpl_vcf.metaData().filterDescriptors() == in_vcfs.back()->metaData().filterDescriptors());
			assert(tmpl_vcf.metaData().formatDescriptors() == in_vcfs.back()->metaData().formatDescriptors());
		}

		cout << "[" << Utils::getLocalTime() << "] Running BayesTyperUtils (" << BTU_VERSION << ") merge on " << in_vcf_filenames.size() << " files with containing " << num_samples << " samples in total ...\n" << endl;

		// "HC" must exist in the template header and is dropped from the output.
		assert(output_meta_data.infoDescriptors().erase("HC"));

		VcfFileWriter output_vcf(outfile_prefix + ".vcf", output_meta_data, true);

		// Per-variant INFO keys that must be identical across all inputs.
		vector<string> var_value_assert_keys = {"VT", "VCS", "VCI", "VCGS", "VCGI", "HCR", "AE", "ACO", "AsmVar_ASQR"};

        auto sample_ids = output_meta_data.sampleIds();

		uint num_variants = 0;

		// Variants are read and merged in caches of up to 10000; cache_size
		// shrinks on the final (partial) template fill.
		uint cache_size = 10000;
		vector<Variant*> tmpl_var_cache(cache_size, nullptr);
		vector<vector<Variant*> > in_var_caches(in_vcfs.size(), vector<Variant*>(cache_size, nullptr));

		bool reached_last_var = false;

		while (!reached_last_var) {

			/*
				Fill cache
			*/
			for (uint cache_idx = 0; cache_idx < cache_size; cache_idx++) {

				reached_last_var = !tmpl_vcf.getNextVariant(&tmpl_var_cache.at(cache_idx));

				if (reached_last_var) {

					// Shrink the cache to the variants actually read; the
					// loop then exits via the updated cache_size bound.
					cache_size = cache_idx;
					tmpl_var_cache.resize(cache_size);
				}
			}

			// Each input must supply at least as many variants as the template.
			for (uint in_vcf_idx = 0; in_vcf_idx < in_vcfs.size(); in_vcf_idx++) {

				for (uint cache_idx = 0; cache_idx < cache_size; cache_idx++) {

					assert(in_vcfs.at(in_vcf_idx)->getNextVariant(&in_var_caches.at(in_vcf_idx).at(cache_idx)));
				}
			}

			/*
				Merge vars in cache
			*/

			for (uint cache_idx = 0; cache_idx < cache_size; cache_idx++) {

				num_variants++;

				Variant * cur_tmpl_var = tmpl_var_cache.at(cache_idx);
				
				assert(cur_tmpl_var);
				assert(cur_tmpl_var->filters().size() == 1);
				assert((cur_tmpl_var->filters().front() == "PASS") or (cur_tmpl_var->filters().front() == "UV"));

				// Union of "ANC" allele indices across template and inputs
				// (presumably "alleles not covered" — verify against writer).
				set<ushort> alleles_not_covered;

				auto cur_tmpl_var_anc_values = cur_tmpl_var->info().getValue<string>("ANC");

				if (cur_tmpl_var_anc_values.second) {

					auto cur_tmpl_var_anc_values_split = Utils::splitString(cur_tmpl_var_anc_values.first, ',');

					for (auto &anc_value: cur_tmpl_var_anc_values_split) {

						alleles_not_covered.insert(stoi(anc_value));
					}
				}

				for (uint in_vcf_idx = 0; in_vcf_idx < in_vcfs.size(); in_vcf_idx++) {

					Variant * cur_in_var = in_var_caches.at(in_vcf_idx).at(cache_idx);
					assert(cur_in_var);

					// Inputs must be site-aligned with the template.
					assert(cur_tmpl_var->chrom() == cur_in_var->chrom());
					assert(cur_tmpl_var->pos() == cur_in_var->pos());

					assert(cur_tmpl_var->ids() == cur_in_var->ids());
					assert(cur_tmpl_var->numAlts() == cur_in_var->numAlts());

					for (auto & var_value_assert_key : var_value_assert_keys) {

						assert(cur_tmpl_var->info().getValue(var_value_assert_key) == cur_in_var->info().getValue(var_value_assert_key));
					}

					assert(cur_in_var->filters().size() == 1);
					assert((cur_tmpl_var->filters().front() == "UV") == (cur_in_var->filters().front() == "UV"));

					// "UV" variants must agree on all allele-level stats too.
					if (cur_in_var->filters().front() == "UV") {

						assert(Utils::splitString(fetchValue<string>(cur_in_var->info(), "AE"), ',').size() == cur_in_var->numAlls());

						assert(cur_tmpl_var->info().getValue("AC") == cur_in_var->info().getValue("AC"));
						assert(cur_tmpl_var->info().getValue("AF") == cur_in_var->info().getValue("AF"));
						assert(cur_tmpl_var->info().getValue("AN") == cur_in_var->info().getValue("AN"));
						assert(cur_tmpl_var->info().getValue("ACP") == cur_in_var->info().getValue("ACP"));
						assert(cur_tmpl_var->info().getValue("ANC") == cur_in_var->info().getValue("ANC"));

					} else {

						assert(cur_in_var->filters().front() == "PASS");
					}

					// propagate the HRS flag if any input carries it
					if (cur_in_var->info().getValue("HRS").second) {

						cur_tmpl_var->info().addFlag("HRS");
					}

					auto cur_in_var_anc_values = cur_in_var->info().getValue<string>("ANC");

					if (cur_in_var_anc_values.second) {

						auto cur_in_var_anc_values_split = Utils::splitString(cur_in_var_anc_values.first, ',');

						for (auto &anc_value: cur_in_var_anc_values_split) {

							alleles_not_covered.insert(stoi(anc_value));
						}
					}

					for (uint all_idx = 0; all_idx < cur_tmpl_var->numAlls(); all_idx++) {

						assert(cur_tmpl_var->allele(all_idx) == cur_in_var->allele(all_idx));
					}

					// Copy this input's sample columns onto the template variant.
					for (auto & smpl_id : in_vcfs.at(in_vcf_idx)->metaData().sampleIds()) {

						cur_tmpl_var->addSample(smpl_id, cur_in_var->getSample(smpl_id));
					}

					delete cur_in_var;
				}

				// Write the merged ANC union back as a comma-joined string.
				if (!(alleles_not_covered.empty())) {

					JoiningString anc_elements(',');

					for (auto &allele: alleles_not_covered) {

						anc_elements.join(to_string(allele));
					}

					cur_tmpl_var->info().setValue<string>("ANC", anc_elements.str());
				}

				// Recompute allele statistics over the merged sample set.
				// NOTE(review): setValue presumably returns whether the key
				// was newly created; the asserts expect overwrite — confirm.
        		auto allele_stats = Stats::calcAlleleStats(cur_tmpl_var);
            	assert(!cur_tmpl_var->info().setValue<int>("AN", allele_stats.first.allele_count_sum));

				auto map_call_prob_and_var_qual = Stats::calcAlleleCallProbAndQualFromAllelePosteriors(cur_tmpl_var);
				assert(map_call_prob_and_var_qual.first.size() == cur_tmpl_var->numAlls());

				for (uint all_idx = 0; all_idx < cur_tmpl_var->numAlls(); all_idx++) {

					assert(!(cur_tmpl_var->allele(all_idx).info().setValue<float>("ACP", map_call_prob_and_var_qual.first.at(all_idx))));
   		
                    // AC/AF apply only to alternative alleles (index > 0)
                    if (all_idx > 0) {

                        assert(!cur_tmpl_var->allele(all_idx).info().setValue<int>("AC", allele_stats.first.allele_counts.at(all_idx)));
                        assert(!cur_tmpl_var->allele(all_idx).info().setValue<float>("AF", allele_stats.first.allele_freqs.at(all_idx)));
                    }
                }

			    cur_tmpl_var->setQual(make_pair(map_call_prob_and_var_qual.second, true));
			    
				output_vcf.write(cur_tmpl_var);
				delete cur_tmpl_var;

				if (num_variants % 100000 == 0) {

					cout << "[" << Utils::getLocalTime() << "] Merged " << num_variants << " variant(s)" << endl;
				}
			}

			// Reset caches for the next round (using the possibly-shrunk size).
			tmpl_var_cache = vector<Variant*>(cache_size, nullptr);
			in_var_caches = vector<vector<Variant*> >(in_vcfs.size(), vector<Variant*>(cache_size, nullptr));
		}

		Variant * dummy_var;

		// All inputs must be exhausted exactly when the template is.
		for (auto & in_vcf : in_vcfs) {

			assert(!in_vcf->getNextVariant(&dummy_var));
		}

		cout << "\n[" << Utils::getLocalTime() << "] Completed merge of " << num_variants << " variant(s)" << endl;
		cout << endl;
	}