int maxcut_haplotyping(char* fragmentfile,char* variantfile,int snps,char* outputfile,int maxiter_hapcut) { // IMP NOTE: all SNPs start from 1 instead of 0 and all offsets are 1+ fprintf(stderr,"calling MAXCUT based haplotype assembly algorithm\n"); int fragments=0,iter=0,components=0; int i=0,j=0,k=0,t=0,component; int* slist; int flag =0; float bestscore_mec = 0,calls=0, miscalls=0,ll = 0; char buffer[MAXBUF]; /****************************** READ FRAGMENT MATRIX*************************************************/ struct fragment* Flist; FILE* ff = fopen(fragmentfile,"r"); if (ff == NULL) { fprintf(stderr,"couldn't open fragment file %s\n",fragmentfile); exit(0);} fragments =0; while ( fgets(buffer,MAXBUF,ff) != NULL) fragments++; fclose(ff); Flist = (struct fragment*)malloc(sizeof(struct fragment)*fragments); flag = read_fragment_matrix(fragmentfile,Flist,fragments); if (flag < 0) { fprintf(stderr,"unable to read fragment matrix file %s \n",fragmentfile); return -1; } if (VCFformat ==0) snps = count_variants(variantfile); else snps = count_variants_vcf(variantfile); if (snps < 0) { fprintf(stderr,"unable to read variant file %s \n",variantfile); return -1; } fprintf(stderr,"processed fragment file and variant file: fragments %d variants %d\n",fragments,snps); /****************************** READ FRAGMENT MATRIX*************************************************/ struct SNPfrags* snpfrag = (struct SNPfrags*)malloc(sizeof(struct SNPfrags)*snps); update_snpfrags(Flist,fragments,snpfrag,snps,&components); double MEM_ALLOC = 0; for (i=0;i<snps;i++) MEM_ALLOC += snpfrag[i].edges*0.002; MEM_ALLOC *= 0.016; fprintf(stderr,"%f MB memory needs to be allocated for graph edges\n",MEM_ALLOC); // size of struct edge is 16/1000 bytes if (MEM_ALLOC >= MAX_MEMORY) { fprintf(stderr,"\nstoring the HAPCUT graph structure requires more than %d MB of memory:\n 1. increase the maximum memory available using option \"--maxmem 12000\" where the memory is specified in megabytes OR \n 2. run the program with the options \"--longreads 1 \" to reduce the number of edges stored \n\n",MAX_MEMORY); return -1; } // too much memory allocated here for fosmid based data... for (i=0;i<snps;i++) snpfrag[i].elist = (struct edge*)malloc(sizeof(struct edge)*snpfrag[i].edges); for (i=0;i<snps;i++) snpfrag[i].telist = (struct edge*)malloc(sizeof(struct edge)*snpfrag[i].edges); if (FOSMIDS ==0) add_edges(Flist,fragments,snpfrag,snps,&components); else if (FOSMIDS >=1) add_edges_fosmids(Flist,fragments,snpfrag,snps,&components); // this considers only components with at least two nodes fprintf(stderr,"fragments %d snps %d component(blocks) %d\n",fragments,snps,components); struct BLOCK* clist = (struct BLOCK*)malloc(sizeof(struct BLOCK)*components); component =0; generate_clist_structure(Flist,fragments,snpfrag,snps,components,clist); /*****************************************************************************************************/ char* HAP1 = (char*)malloc(snps+1); char* besthap_mec = (char*)malloc(snps+1); char* HAP2 = (char*)malloc(snps+1); struct tm *ts1; char buf[80]; time_t now; slist = (int*)malloc(sizeof(int)*snps); char fn[1000]; if (VCFformat ==0) read_variantfile(variantfile,snpfrag,snps); else read_vcffile(variantfile,snpfrag,snps); /*****************************************************************************************************/ if (RANDOM_START ==1) { fprintf(stdout,"starting from a completely random solution SOLUTION \n"); for (i=0;i<snps;i++) { if (snpfrag[i].frags ==0) { HAP1[i] = '-'; HAP2[i] = '-'; } else { if (drand48() < 0.5) { HAP1[i] = '0'; HAP2[i] = '1'; } else {HAP1[i] = '1'; HAP2[i] = '0'; } } } } for (i=0;i<snps;i++) { besthap_mec[i] = HAP1[i]; } // for each block, we maintain best haplotype solution under MFR criterion // compute the component-wise score for 'initHAP' haplotype miscalls=0;bestscore_mec=0; for (k=0;k<components;k++) { clist[k].MEC =0; clist[k].bestMEC =0; clist[k].calls =0; clist[k].LL = 0; for (i=0;i<clist[k].frags;i++) { update_fragscore(Flist,clist[k].flist[i],HAP1); clist[k].MEC += Flist[clist[k].flist[i]].currscore; clist[k].LL += Flist[clist[k].flist[i]].ll; clist[k].calls += Flist[clist[k].flist[i]].calls; } clist[k].bestMEC = clist[k].MEC; bestscore_mec += clist[k].bestMEC; miscalls += clist[k].MEC; clist[k].bestLL = clist[k].LL; } // annealing_haplotyping(Flist,fragments,snpfrag,snps,maxiter,HAP1,HAP2,clist,components,slist); return 1; // annealing_haplotyping_full(Flist,fragments,snpfrag,snps,maxiter,HAP1,HAP2,0); return 1; /************************** RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE MEC SCORE*********************************/ for (iter=0;iter<maxiter_hapcut;iter++) { mecscore(Flist,fragments,HAP1,&ll,&calls,&miscalls); time(&now); ts1 = localtime(&now); strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts1); fprintf(stdout,"iter %d current haplotype MEC %f calls %d LL %f %s \n",iter,miscalls,(int)calls,ll,buf); fprintf(stderr,"iter %d current haplotype MEC %f calls %d LL %f %s \n",iter,miscalls,(int)calls,ll,buf); if ((iter%10==0 && iter > 0)) { // new code added april 7 2012 for (k=0;k<components;k++) find_bestvariant_segment(Flist,fragments,snpfrag,clist,k,HAP1,HAP2); sprintf(fn,"%s",outputfile); // newfile for every update to score.... //sprintf(fn,"%s.%f",outputfile,miscalls); // newfile for every update to score.... fprintf(stdout,"OUTPUTTING HAPLOTYPE ASSEMBLY TO FILE %s\n",fn); fprintf(stderr,"OUTPUTTING HAPLOTYPE ASSEMBLY TO FILE %s\n",fn); //if (VCFformat ==1) print_haplotypes_vcf(clist,components,HAP1,Flist,fragments,snpfrag,snps,fn); print_hapfile(clist,components,HAP1,Flist,fragments,snpfrag,variantfile,miscalls,fn); // do this only if some option is specified if (PRINT_FRAGMENT_SCORES ==1) { print_fragmentmatrix_MEC(Flist,fragments,HAP1,outputfile); //print_matrix(clist,components,HAP1,Flist,outputfile); } } for (k=0;k<components;k++) // COMPUTATION OF TREE FOR EACH COMPONENT { //if ((k*50)%components ==0) fprintf(stderr,"#"); if (iter ==0) fprintf(stdout,"\n component %d length %d phased %d %d...%d \n",k,clist[k].length,clist[k].phased,clist[k].offset,clist[k].lastvar); // call function for each component only if MEC > 0 april 17 2012 if (clist[k].MEC > 0) evaluate_cut_component(Flist,snpfrag,clist,k,slist,HAP1,iter); } for (i=0;i<snps;i++) { // commented out on april 6 4pm 2012 //if (HAP1[i] == '0') HAP2[i] = '1'; else if (HAP1[i] == '1') HAP2[i] = '0'; else HAP2[i] = HAP1[i]; } } /************************** RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE MEC SCORE*********************************/ // annealing_haplotyping_full(Flist,fragments,snpfrag,snps,maxiter+100,HAP1,HAP2,0); return 1; return 1; }
int maxcut_haplotyping(char* fragmentfile, char* variantfile, char* outputfile) { // IMP NOTE: all SNPs start from 1 instead of 0 and all offsets are 1+ fprintf_time(stderr, "Calling Max-Likelihood-Cut based haplotype assembly algorithm\n"); int snps = 0; int fragments = 0, iter = 0, components = 0; int i = 0, k = 0; int* slist; int flag = 0; float bestscore = 0, miscalls = 0; int hic_iter=0; struct SNPfrags* snpfrag = NULL; struct BLOCK* clist; char* HAP1; float HIC_LL_SCORE = -80; float OLD_HIC_LL_SCORE = -80; int converged_count=0, split_count, new_components, component; int new_fragments = 0; struct fragment* new_Flist; // READ FRAGMENT MATRIX fragments = get_num_fragments(fragmentfile); struct fragment* Flist; Flist = (struct fragment*) malloc(sizeof (struct fragment)* fragments); flag = read_fragment_matrix(fragmentfile, Flist, fragments); if (MAX_IS != -1){ // we are going to filter out some insert sizes new_fragments = 0; new_Flist = (struct fragment*) malloc(sizeof (struct fragment)* fragments); for(i = 0; i < fragments; i++){ if (Flist[i].isize < MAX_IS) new_Flist[new_fragments++] = Flist[i]; } Flist = new_Flist; fragments = new_fragments; } if (flag < 0) { fprintf_time(stderr, "unable to read fragment matrix file %s \n", fragmentfile); return -1; } //ADD EDGES BETWEEN SNPS snps = count_variants_vcf(variantfile); if (snps < 0) { fprintf_time(stderr, "unable to read variant file %s \n", variantfile); return -1; } snpfrag = (struct SNPfrags*) malloc(sizeof (struct SNPfrags)*snps); update_snpfrags(Flist, fragments, snpfrag, snps, &components); detect_long_reads(Flist,fragments); // 10/25/2014, edges are only added between adjacent nodes in each fragment and used for determining connected components... for (i = 0; i < snps; i++) snpfrag[i].elist = (struct edge*) malloc(sizeof (struct edge)*(snpfrag[i].edges+1)); if (LONG_READS ==0){ add_edges(Flist,fragments,snpfrag,snps,&components); }else if (LONG_READS >=1){ add_edges_fosmids(Flist,fragments,snpfrag,snps,&components); } for (i = 0; i < snps; i++) snpfrag[i].telist = (struct edge*) malloc(sizeof (struct edge)*(snpfrag[i].edges+1)); // this considers only components with at least two nodes fprintf_time(stderr, "fragments %d snps %d component(blocks) %d\n", fragments, snps, components); // BUILD COMPONENT LIST clist = (struct BLOCK*) malloc(sizeof (struct BLOCK)*components); generate_clist_structure(Flist, fragments, snpfrag, snps, components, clist); // READ VCF FILE read_vcffile(variantfile, snpfrag, snps); // INITIALIZE RANDOM HAPLOTYPES HAP1 = (char*) malloc(snps + 1); for (i = 0; i < snps; i++) { if (snpfrag[i].frags == 0 || (SNVS_BEFORE_INDELS && (strlen(snpfrag[i].allele0) != 1 || strlen(snpfrag[i].allele1) != 1))) { HAP1[i] = '-'; } else if (drand48() < 0.5) { HAP1[i] = '0'; } else { HAP1[i] = '1'; } } // for each block, we maintain best haplotype solution under MFR criterion // compute the component-wise score for 'initHAP' haplotype miscalls = 0; bestscore = 0; for (k = 0; k < components; k++) { clist[k].SCORE = 0; clist[k].bestSCORE = 0; for (i = 0; i < clist[k].frags; i++) { update_fragscore(Flist, clist[k].flist[i], HAP1); clist[k].SCORE += Flist[clist[k].flist[i]].currscore; } clist[k].bestSCORE = clist[k].SCORE; bestscore += clist[k].bestSCORE; miscalls += clist[k].SCORE; } fprintf_time(stderr, "processed fragment file and variant file: fragments %d variants %d\n", fragments, snps); int MAXIS = -1; if (HIC){ // determine the probability of an h-trans interaction for read for (i=0; i<fragments;i++){ Flist[i].htrans_prob = -80; if (Flist[i].isize > MAXIS) MAXIS = Flist[i].isize; } HTRANS_MAXBINS = MAXIS/HTRANS_BINSIZE + 1; }else{ HTRANS_MAXBINS = 0; } // read in file with estimated probabilities of Hi-C h-trans interactions with distance if (strcmp(HTRANS_DATA_INFILE, "None") != 0){ int num_bins = count_htrans_bins(HTRANS_DATA_INFILE); float* htrans_probs = (float*) malloc(sizeof(float) * num_bins); read_htrans_file(HTRANS_DATA_INFILE, htrans_probs, num_bins); for (i=0; i<fragments;i++){ Flist[i].htrans_prob = log10(htrans_probs[Flist[i].isize / HTRANS_BINSIZE]); } free(htrans_probs); } slist = (int*) malloc(sizeof (int)*snps); OLD_HIC_LL_SCORE = bestscore; for (hic_iter = 0; hic_iter < MAX_HIC_EM_ITER; hic_iter++){ if (VERBOSE) fprintf_time(stdout, "HIC ITER %d\n", hic_iter); for (k = 0; k < components; k++){ clist[k].iters_since_improvement = 0; } for (i=0; i<snps; i++){ snpfrag[i].post_hap = 0; } // RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE LIKELIHOOD for (iter = 0; iter < MAXITER; iter++) { if (VERBOSE) fprintf_time(stdout, "PHASING ITER %d\n", iter); converged_count = 0; for (k = 0; k < components; k++){ if(VERBOSE && iter == 0) fprintf_time(stdout, "component %d length %d phased %d %d...%d\n", k, clist[k].length, clist[k].phased, clist[k].offset, clist[k].lastvar); if (clist[k].SCORE > 0) converged_count += evaluate_cut_component(Flist, snpfrag, clist, k, slist, HAP1); else converged_count++; } if (converged_count == components) { //fprintf(stdout, "Haplotype assembly terminated early because no improvement seen in blocks after %d iterations\n", CONVERGE); break; } } // H-TRANS ESTIMATION FOR HIC if (MAX_HIC_EM_ITER > 1){ // Possibly break if we're done improving HIC_LL_SCORE = 0; for (k = 0; k < components; k++){ HIC_LL_SCORE += clist[k].bestSCORE; } if (HIC_LL_SCORE >= OLD_HIC_LL_SCORE){ break; } OLD_HIC_LL_SCORE = HIC_LL_SCORE; likelihood_pruning(snps, Flist, snpfrag, HAP1, 0); // prune for only very high confidence SNPs // estimate the h-trans probabilities for the next round estimate_htrans_probs(Flist, fragments, HAP1, snpfrag); } } // BLOCK SPLITTING new_components = components; if (SPLIT_BLOCKS){ split_count = 0; for (k=0; k<components; k++){ // attempt to split block split_count += split_block(HAP1, clist, k, Flist, snpfrag, &new_components); } if (split_count > 0){ // regenerate clist if necessary free(clist); clist = (struct BLOCK*) malloc(sizeof (struct BLOCK)*new_components); generate_clist_structure(Flist, fragments, snpfrag, snps, new_components, clist); } components = new_components; }else if(ERROR_ANALYSIS_MODE && !HIC){ for (k=0; k<components; k++){ // run split_block but don't actually split, just get posterior probabilities split_block(HAP1, clist, k, Flist, snpfrag, &new_components); } } // PRUNE SNPS if (!SKIP_PRUNE){ discrete_pruning(snps, fragments, Flist, snpfrag, HAP1); likelihood_pruning(snps, Flist, snpfrag, HAP1, CALL_HOMOZYGOUS); } // PRINT OUTPUT FILE fprintf_time(stderr, "OUTPUTTING PRUNED HAPLOTYPE ASSEMBLY TO FILE %s\n", outputfile); print_hapfile(clist, components, HAP1, Flist, fragments, snpfrag, variantfile, miscalls, outputfile); char assignfile[4096]; sprintf(assignfile,"%s.fragments",outputfile); if (OUTPUT_RH_ASSIGNMENTS ==1) fragment_assignments(Flist,fragments,snpfrag,HAP1,assignfile); // added 03/10/2018 to output read-haplotype assignments char outvcffile[4096]; sprintf(outvcffile,"%s.phased.VCF",outputfile); if (OUTPUT_VCF ==1) { fprintf_time(stderr, "OUTPUTTING PHASED VCF TO FILE %s\n", outvcffile); output_vcf(variantfile,snpfrag,snps,HAP1,Flist,fragments,outvcffile,0); } // FREE UP MEMORY for (i = 0; i < snps; i++) free(snpfrag[i].elist); for (i = 0; i < snps; i++) free(snpfrag[i].telist); component = 0; for (i = 0; i < snps; i++) { free(snpfrag[i].flist); free(snpfrag[i].alist); free(snpfrag[i].jlist); free(snpfrag[i].klist); if (snpfrag[i].component == i && snpfrag[i].csize > 1) // root node of component { free(clist[component].slist); component++; } } for (i = 0; i < components; i++) free(clist[i].flist); free(snpfrag); free(clist); free(Flist); return 0; }