Ejemplo n.º 1
0
int maxcut_haplotyping(char* fragmentfile,char* variantfile,int snps,char* outputfile,int maxiter_hapcut)
{
	// IMP NOTE: all SNPs start from 1 instead of 0 and all offsets are 1+
	fprintf(stderr,"calling MAXCUT based haplotype assembly algorithm\n");
	int fragments=0,iter=0,components=0; int i=0,j=0,k=0,t=0,component;	int* slist;  int flag =0;
	float bestscore_mec = 0,calls=0, miscalls=0,ll = 0;
	char buffer[MAXBUF]; 

	/****************************** READ FRAGMENT MATRIX*************************************************/
	struct fragment* Flist; FILE* ff = fopen(fragmentfile,"r"); 
	if (ff == NULL) { fprintf(stderr,"couldn't open fragment file %s\n",fragmentfile); exit(0);}
	fragments =0; while ( fgets(buffer,MAXBUF,ff) != NULL) fragments++; fclose(ff);
	Flist = (struct fragment*)malloc(sizeof(struct fragment)*fragments); 
	flag = read_fragment_matrix(fragmentfile,Flist,fragments);
	if (flag < 0) { fprintf(stderr,"unable to read fragment matrix file %s \n",fragmentfile); return -1; } 

	if (VCFformat ==0) snps = count_variants(variantfile);
	else snps = count_variants_vcf(variantfile);
	if (snps < 0) { fprintf(stderr,"unable to read variant file %s \n",variantfile); return -1; } 
	fprintf(stderr,"processed fragment file and variant file: fragments %d variants %d\n",fragments,snps); 
	/****************************** READ FRAGMENT MATRIX*************************************************/

	struct SNPfrags* snpfrag = (struct SNPfrags*)malloc(sizeof(struct SNPfrags)*snps); 
	update_snpfrags(Flist,fragments,snpfrag,snps,&components);
	double MEM_ALLOC = 0; for (i=0;i<snps;i++) MEM_ALLOC += snpfrag[i].edges*0.002; MEM_ALLOC *= 0.016; 
	fprintf(stderr,"%f MB memory needs to be allocated for graph edges\n",MEM_ALLOC); // size of struct edge is 16/1000 bytes
	if (MEM_ALLOC >= MAX_MEMORY) 
	{
		fprintf(stderr,"\nstoring the HAPCUT graph structure requires more than %d MB of memory:\n 1. increase the maximum memory available using option \"--maxmem 12000\" where the memory is specified in megabytes OR \n 2. run the program with the options \"--longreads 1 \" to reduce the number of edges stored \n\n",MAX_MEMORY); 
		return -1;
	} 
	
	// too much memory allocated here for fosmid based data... 
	for (i=0;i<snps;i++) snpfrag[i].elist = (struct edge*)malloc(sizeof(struct edge)*snpfrag[i].edges);  
	for (i=0;i<snps;i++) snpfrag[i].telist = (struct edge*)malloc(sizeof(struct edge)*snpfrag[i].edges);  

	if (FOSMIDS ==0) add_edges(Flist,fragments,snpfrag,snps,&components);
	else if (FOSMIDS >=1) add_edges_fosmids(Flist,fragments,snpfrag,snps,&components);
	// this considers only components with at least two nodes
	fprintf(stderr,"fragments %d snps %d component(blocks) %d\n",fragments,snps,components); 
	
	struct BLOCK* clist = (struct BLOCK*)malloc(sizeof(struct BLOCK)*components); component =0;
	generate_clist_structure(Flist,fragments,snpfrag,snps,components,clist);

	/*****************************************************************************************************/

	char* HAP1 = (char*)malloc(snps+1); char* besthap_mec = (char*)malloc(snps+1);
	char* HAP2 = (char*)malloc(snps+1);
	struct tm  *ts1;   char       buf[80];	time_t     now;		
	slist = (int*)malloc(sizeof(int)*snps); char fn[1000];  

	if (VCFformat ==0) read_variantfile(variantfile,snpfrag,snps); else read_vcffile(variantfile,snpfrag,snps);

	/*****************************************************************************************************/
	if (RANDOM_START ==1)
	{				
		fprintf(stdout,"starting from a completely random solution SOLUTION \n");
		for (i=0;i<snps;i++) 
		{ 
			if (snpfrag[i].frags ==0) { HAP1[i] = '-'; HAP2[i] = '-'; } 
			else 
			{
				if (drand48() < 0.5) { HAP1[i] = '0'; HAP2[i] = '1'; }  else  {HAP1[i] = '1'; HAP2[i] = '0'; }
			}
		} 
	}
	for (i=0;i<snps;i++) { besthap_mec[i] = HAP1[i]; } 

	// for each block, we maintain best haplotype solution under MFR criterion 
	// compute the component-wise score for 'initHAP' haplotype 
	miscalls=0;bestscore_mec=0; 
	for (k=0;k<components;k++)
	{
		clist[k].MEC =0; clist[k].bestMEC =0; clist[k].calls =0;	clist[k].LL = 0; 
		for (i=0;i<clist[k].frags;i++) 
		{
			update_fragscore(Flist,clist[k].flist[i],HAP1); clist[k].MEC += Flist[clist[k].flist[i]].currscore;
			clist[k].LL += Flist[clist[k].flist[i]].ll; clist[k].calls += Flist[clist[k].flist[i]].calls;

		} 
		clist[k].bestMEC = clist[k].MEC; bestscore_mec += clist[k].bestMEC; miscalls += clist[k].MEC;	 clist[k].bestLL = clist[k].LL; 
	}
	
	//	annealing_haplotyping(Flist,fragments,snpfrag,snps,maxiter,HAP1,HAP2,clist,components,slist); return 1;
	//	annealing_haplotyping_full(Flist,fragments,snpfrag,snps,maxiter,HAP1,HAP2,0); return 1;
	

	/************************** RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE MEC SCORE*********************************/
	for (iter=0;iter<maxiter_hapcut;iter++)
	{
		mecscore(Flist,fragments,HAP1,&ll,&calls,&miscalls);
		time(&now); ts1 = localtime(&now); strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts1); 
		fprintf(stdout,"iter %d current haplotype MEC %f calls %d LL %f %s \n",iter,miscalls,(int)calls,ll,buf);  
		fprintf(stderr,"iter %d current haplotype MEC %f calls %d LL %f %s \n",iter,miscalls,(int)calls,ll,buf);  
		if ((iter%10==0 && iter > 0)) 
		{
			// new code added april 7 2012
			for (k=0;k<components;k++) find_bestvariant_segment(Flist,fragments,snpfrag,clist,k,HAP1,HAP2);

			sprintf(fn,"%s",outputfile);   // newfile for every update to score....
			//sprintf(fn,"%s.%f",outputfile,miscalls);   // newfile for every update to score....
			fprintf(stdout,"OUTPUTTING HAPLOTYPE ASSEMBLY TO FILE %s\n",fn); 	fprintf(stderr,"OUTPUTTING HAPLOTYPE ASSEMBLY TO FILE %s\n",fn); 
			//if (VCFformat ==1) print_haplotypes_vcf(clist,components,HAP1,Flist,fragments,snpfrag,snps,fn);
			print_hapfile(clist,components,HAP1,Flist,fragments,snpfrag,variantfile,miscalls,fn);   
			
			// do this only if some option is specified 
			if (PRINT_FRAGMENT_SCORES ==1) 
			{
				print_fragmentmatrix_MEC(Flist,fragments,HAP1,outputfile);
				//print_matrix(clist,components,HAP1,Flist,outputfile);
			}
		
		}

		for (k=0;k<components;k++) // COMPUTATION OF TREE FOR EACH COMPONENT 
		{
			//if ((k*50)%components ==0) fprintf(stderr,"#");
			if (iter ==0) fprintf(stdout,"\n component %d length %d phased %d %d...%d \n",k,clist[k].length,clist[k].phased,clist[k].offset,clist[k].lastvar); 
			// call function for each component only if MEC > 0 april 17 2012
			if (clist[k].MEC > 0) evaluate_cut_component(Flist,snpfrag,clist,k,slist,HAP1,iter);
		}
		for (i=0;i<snps;i++)
		{
			// commented out on april 6 4pm 2012
			//if (HAP1[i] == '0') HAP2[i] = '1'; else if (HAP1[i] == '1') HAP2[i] = '0'; else HAP2[i] = HAP1[i]; 
		}
	}
	/************************** RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE MEC SCORE*********************************/
	//	annealing_haplotyping_full(Flist,fragments,snpfrag,snps,maxiter+100,HAP1,HAP2,0); return 1;
	return 1;
}
Ejemplo n.º 2
0
/*
 * Assemble haplotypes from a fragment file and a VCF with the likelihood-based
 * max-cut procedure, then write the (optionally pruned / split) result.
 *
 * fragmentfile : path of the fragment matrix, parsed by read_fragment_matrix().
 * variantfile  : path of the VCF, counted by count_variants_vcf() and parsed by read_vcffile().
 * outputfile   : path handed to print_hapfile(); "<outputfile>.fragments" and
 *                "<outputfile>.phased.VCF" are derived from it for the optional outputs.
 *
 * Returns 0 on success and -1 when the fragment file or the variant file cannot be read.
 *
 * Behaviour is steered by identifiers that are not declared in this function
 * (MAX_IS, LONG_READS, HIC, HTRANS_DATA_INFILE, MAX_HIC_EM_ITER, MAXITER, SPLIT_BLOCKS,
 * SKIP_PRUNE, OUTPUT_VCF, ...); they are presumably program-wide settings.
 */
int maxcut_haplotyping(char* fragmentfile, char* variantfile, char* outputfile) {
    // IMP NOTE: all SNPs start from 1 instead of 0 and all offsets are 1+
    fprintf_time(stderr, "Calling Max-Likelihood-Cut based haplotype assembly algorithm\n");

    int snps = 0;
    int fragments = 0, iter = 0, components = 0;
    int i = 0, k = 0;
    int* slist;
    int flag = 0;
    float bestscore = 0, miscalls = 0;
    int hic_iter=0;
    struct SNPfrags* snpfrag = NULL;
    struct BLOCK* clist;
    char* HAP1;
    float HIC_LL_SCORE = -80;
    float OLD_HIC_LL_SCORE = -80;
    int converged_count=0, split_count, new_components, component;

    int new_fragments = 0;
    struct fragment* new_Flist;

    // READ FRAGMENT MATRIX
    fragments = get_num_fragments(fragmentfile);
    struct fragment* Flist;
    Flist     = (struct fragment*) malloc(sizeof (struct fragment)* fragments);
    flag = read_fragment_matrix(fragmentfile, Flist, fragments);

    // Check the read result BEFORE anything looks at Flist: the insert-size filter
    // below used to run first and could read entries that were never filled in.
    if (flag < 0) {
        fprintf_time(stderr, "unable to read fragment matrix file %s \n", fragmentfile);
        free(Flist);
        return -1;
    }

    if (MAX_IS != -1){
        // we are going to filter out some insert sizes
        new_fragments = 0;
        new_Flist = (struct fragment*) malloc(sizeof (struct fragment)* fragments);
        for(i = 0; i < fragments; i++){
            if (Flist[i].isize < MAX_IS) new_Flist[new_fragments++] = Flist[i];
        }
        // The kept entries were copied by value, so the old array itself can go.
        // NOTE(review): member arrays of the dropped fragments are not released here;
        // their layout is not visible from this function.
        free(Flist);
        Flist = new_Flist;
        fragments = new_fragments;
    }

    //ADD EDGES BETWEEN SNPS
    snps = count_variants_vcf(variantfile);
    if (snps < 0) {
        fprintf_time(stderr, "unable to read variant file %s \n", variantfile);
        free(Flist);
        return -1;
    }

    snpfrag = (struct SNPfrags*) malloc(sizeof (struct SNPfrags)*snps);
    update_snpfrags(Flist, fragments, snpfrag, snps, &components);

    detect_long_reads(Flist,fragments);

    // 10/25/2014, edges are only added between adjacent nodes in each fragment and used for determining connected components...
    for (i = 0; i < snps; i++) snpfrag[i].elist = (struct edge*) malloc(sizeof (struct edge)*(snpfrag[i].edges+1));
    if (LONG_READS ==0){
        add_edges(Flist,fragments,snpfrag,snps,&components);
    }else if (LONG_READS >=1){
        add_edges_fosmids(Flist,fragments,snpfrag,snps,&components);
    }

    // telist is sized from snpfrag[i].edges as it stands after the add_edges* call
    for (i = 0; i < snps; i++) snpfrag[i].telist = (struct edge*) malloc(sizeof (struct edge)*(snpfrag[i].edges+1));

    // this considers only components with at least two nodes
    fprintf_time(stderr, "fragments %d snps %d component(blocks) %d\n", fragments, snps, components);

    // BUILD COMPONENT LIST
    clist = (struct BLOCK*) malloc(sizeof (struct BLOCK)*components);
    generate_clist_structure(Flist, fragments, snpfrag, snps, components, clist);

    // READ VCF FILE
    read_vcffile(variantfile, snpfrag, snps);

    // INITIALIZE RANDOM HAPLOTYPES
    // '-' marks variants with no fragments (or non-SNVs when SNVS_BEFORE_INDELS is set);
    // all others get a random '0' or '1'.
    HAP1 = (char*) malloc(snps + 1);
    for (i = 0; i < snps; i++) {
        if (snpfrag[i].frags == 0 || (SNVS_BEFORE_INDELS && (strlen(snpfrag[i].allele0) != 1 || strlen(snpfrag[i].allele1) != 1))) {
            HAP1[i] = '-';
        } else if (drand48() < 0.5) {
            HAP1[i] = '0';
        } else {
            HAP1[i] = '1';
        }
    }
    HAP1[snps] = '\0'; // the buffer was sized snps+1 but the terminator was never written

    // for each block, we maintain best haplotype solution under MFR criterion
    // compute the component-wise score for 'initHAP' haplotype
    miscalls = 0;
    bestscore = 0;
    for (k = 0; k < components; k++) {
        clist[k].SCORE = 0;
        clist[k].bestSCORE = 0;
        for (i = 0; i < clist[k].frags; i++) {
            update_fragscore(Flist, clist[k].flist[i], HAP1);
            clist[k].SCORE += Flist[clist[k].flist[i]].currscore;
        }
        clist[k].bestSCORE = clist[k].SCORE;
        bestscore += clist[k].bestSCORE;
        miscalls += clist[k].SCORE;
    }

    fprintf_time(stderr, "processed fragment file and variant file: fragments %d variants %d\n", fragments, snps);

    int MAXIS = -1;

    if (HIC){

        // determine the probability of an h-trans interaction for read

        for (i=0; i<fragments;i++){

            Flist[i].htrans_prob = -80;

            if (Flist[i].isize > MAXIS)
                MAXIS = Flist[i].isize;
        }

        HTRANS_MAXBINS = MAXIS/HTRANS_BINSIZE + 1;
    }else{
        HTRANS_MAXBINS = 0;
    }

    // read in file with estimated probabilities of Hi-C h-trans interactions with distance
    if (strcmp(HTRANS_DATA_INFILE, "None") != 0){
        int num_bins        = count_htrans_bins(HTRANS_DATA_INFILE);
        float* htrans_probs = (float*) malloc(sizeof(float) * num_bins);
        read_htrans_file(HTRANS_DATA_INFILE, htrans_probs, num_bins);
        if (num_bins > 0 && htrans_probs != NULL){
            for (i=0; i<fragments;i++){
                // Keep the lookup inside the table: an insert size beyond the last bin
                // (or a negative one) previously indexed outside htrans_probs.
                // NOTE(review): clamping to the nearest bin is a defensive choice --
                // confirm it is the desired handling for out-of-range insert sizes.
                int bin = Flist[i].isize / HTRANS_BINSIZE;
                if (bin < 0) bin = 0;
                if (bin >= num_bins) bin = num_bins - 1;
                Flist[i].htrans_prob = log10(htrans_probs[bin]);
            }
        }else{
            fprintf_time(stderr, "no usable h-trans bins read from %s, keeping existing h-trans probabilities\n", HTRANS_DATA_INFILE);
        }
        free(htrans_probs);
    }

    slist = (int*) malloc(sizeof (int)*snps);

    OLD_HIC_LL_SCORE = bestscore;
    for (hic_iter = 0; hic_iter < MAX_HIC_EM_ITER; hic_iter++){
        if (VERBOSE)
            fprintf_time(stdout, "HIC ITER %d\n", hic_iter);
        for (k = 0; k < components; k++){
            clist[k].iters_since_improvement = 0;
        }
        for (i=0; i<snps; i++){
            snpfrag[i].post_hap = 0;
        }
        // RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE LIKELIHOOD
        for (iter = 0; iter < MAXITER; iter++) {
            if (VERBOSE)
                fprintf_time(stdout, "PHASING ITER %d\n", iter);
            converged_count = 0;
            for (k = 0; k < components; k++){
                if(VERBOSE && iter == 0)
                    fprintf_time(stdout, "component %d length %d phased %d %d...%d\n", k, clist[k].length, clist[k].phased, clist[k].offset, clist[k].lastvar);
                // components whose SCORE is not positive are counted as converged without a cut
                if (clist[k].SCORE > 0)
                    converged_count += evaluate_cut_component(Flist, snpfrag, clist, k, slist, HAP1);
                else converged_count++;
            }

            // stop as soon as every component is counted as converged
            if (converged_count == components) {
                //fprintf(stdout, "Haplotype assembly terminated early because no improvement seen in blocks after %d iterations\n", CONVERGE);
                break;
            }
        }

        // H-TRANS ESTIMATION FOR HIC
        if (MAX_HIC_EM_ITER > 1){

            // Possibly break if we're done improving
            HIC_LL_SCORE = 0;
            for (k = 0; k < components; k++){
                HIC_LL_SCORE += clist[k].bestSCORE;
            }
            if (HIC_LL_SCORE >= OLD_HIC_LL_SCORE){
                break;
            }
            OLD_HIC_LL_SCORE = HIC_LL_SCORE;

            likelihood_pruning(snps, Flist, snpfrag, HAP1, 0); // prune for only very high confidence SNPs
            // estimate the h-trans probabilities for the next round
            estimate_htrans_probs(Flist, fragments, HAP1, snpfrag);
        }
    }

    // BLOCK SPLITTING
    new_components = components;
    if (SPLIT_BLOCKS){
        split_count = 0;
        for (k=0; k<components; k++){
            // attempt to split block
            split_count += split_block(HAP1, clist, k, Flist, snpfrag, &new_components);
        }
        if (split_count > 0){
            // regenerate clist if necessary
            // NOTE(review): only the array is freed here; the per-component slist/flist
            // members of the old clist are presumably leaked -- confirm against split_block()
            free(clist);
            clist = (struct BLOCK*) malloc(sizeof (struct BLOCK)*new_components);
            generate_clist_structure(Flist, fragments, snpfrag, snps, new_components, clist);
        }
        components = new_components;
    }else if(ERROR_ANALYSIS_MODE && !HIC){
        for (k=0; k<components; k++){
            // run split_block but don't actually split, just get posterior probabilities
            split_block(HAP1, clist, k, Flist, snpfrag, &new_components);
        }
    }

    // PRUNE SNPS
    if (!SKIP_PRUNE){
        discrete_pruning(snps, fragments, Flist, snpfrag, HAP1);
        likelihood_pruning(snps, Flist, snpfrag, HAP1, CALL_HOMOZYGOUS);
    }
    // PRINT OUTPUT FILE
    fprintf_time(stderr, "OUTPUTTING PRUNED HAPLOTYPE ASSEMBLY TO FILE %s\n", outputfile);
    print_hapfile(clist, components, HAP1, Flist, fragments, snpfrag, variantfile, miscalls, outputfile);

    // derived file names: snprintf bounds the write, where sprintf could overrun the buffer
    char assignfile[4096];
    snprintf(assignfile, sizeof assignfile, "%s.fragments", outputfile);
    if (OUTPUT_RH_ASSIGNMENTS ==1) fragment_assignments(Flist,fragments,snpfrag,HAP1,assignfile); // added 03/10/2018 to output read-haplotype assignments
    char outvcffile[4096];
    snprintf(outvcffile, sizeof outvcffile, "%s.phased.VCF", outputfile);
    if (OUTPUT_VCF ==1) {
        fprintf_time(stderr, "OUTPUTTING PHASED VCF TO FILE %s\n", outvcffile);
        output_vcf(variantfile,snpfrag,snps,HAP1,Flist,fragments,outvcffile,0);
    }

    // FREE UP MEMORY
    for (i = 0; i < snps; i++) free(snpfrag[i].elist);
    for (i = 0; i < snps; i++) free(snpfrag[i].telist);
    component = 0;
    for (i = 0; i < snps; i++) {
        free(snpfrag[i].flist);
        free(snpfrag[i].alist);
        free(snpfrag[i].jlist);
        free(snpfrag[i].klist);

        if (snpfrag[i].component == i && snpfrag[i].csize > 1) // root node of component
        {
            free(clist[component].slist);
            component++;
        }
    }

    for (i = 0; i < components; i++) free(clist[i].flist);
    free(snpfrag);
    free(clist);
    free(Flist);
    free(slist); // previously leaked
    free(HAP1);  // previously leaked

    return 0;
}