// not used for now april 9 2012
/*
 * Report the score of the current haplotype and run one refinement pass.
 *
 * Calls mecscore() on HAP1, prints the miscall count, call count and
 * log-likelihood it reported to both stdout and stderr, and then calls
 * find_bestvariant_segment() once for every component index in
 * [0, components).
 *
 * The snps and maxiter parameters are accepted but not used.
 * Always returns 1.
 */
int removevariants(struct fragment* Flist,int fragments,struct SNPfrags* snpfrag,int snps,int maxiter,char* HAP1,char* HAP2,struct BLOCK* clist,int components)
{
    float n_calls = 0, n_miscalls = 0, loglike = 0;
    float best_loglike;
    int comp;

    mecscore(Flist, fragments, HAP1, &loglike, &n_calls, &n_miscalls);
    best_loglike = loglike;

    /* Exactly one pass is made over the components. */
    fprintf(stdout, "MEC score %f %f LL %f bestLL %f \n", n_miscalls, n_calls, loglike, best_loglike);
    fprintf(stderr, "MEC score %f %f LL %f bestLL %f\n", n_miscalls, n_calls, loglike, best_loglike);

    comp = 0;
    while (comp < components) {
        find_bestvariant_segment(Flist, fragments, snpfrag, clist, comp, HAP1, HAP2);
        comp++;
    }

    return 1;
}
int maxcut_haplotyping(char* fragmentfile,char* variantfile,int snps,char* outputfile,int maxiter_hapcut) { // IMP NOTE: all SNPs start from 1 instead of 0 and all offsets are 1+ fprintf(stderr,"calling MAXCUT based haplotype assembly algorithm\n"); int fragments=0,iter=0,components=0; int i=0,j=0,k=0,t=0,component; int* slist; int flag =0; float bestscore_mec = 0,calls=0, miscalls=0,ll = 0; char buffer[MAXBUF]; /****************************** READ FRAGMENT MATRIX*************************************************/ struct fragment* Flist; FILE* ff = fopen(fragmentfile,"r"); if (ff == NULL) { fprintf(stderr,"couldn't open fragment file %s\n",fragmentfile); exit(0);} fragments =0; while ( fgets(buffer,MAXBUF,ff) != NULL) fragments++; fclose(ff); Flist = (struct fragment*)malloc(sizeof(struct fragment)*fragments); flag = read_fragment_matrix(fragmentfile,Flist,fragments); if (flag < 0) { fprintf(stderr,"unable to read fragment matrix file %s \n",fragmentfile); return -1; } if (VCFformat ==0) snps = count_variants(variantfile); else snps = count_variants_vcf(variantfile); if (snps < 0) { fprintf(stderr,"unable to read variant file %s \n",variantfile); return -1; } fprintf(stderr,"processed fragment file and variant file: fragments %d variants %d\n",fragments,snps); /****************************** READ FRAGMENT MATRIX*************************************************/ struct SNPfrags* snpfrag = (struct SNPfrags*)malloc(sizeof(struct SNPfrags)*snps); update_snpfrags(Flist,fragments,snpfrag,snps,&components); double MEM_ALLOC = 0; for (i=0;i<snps;i++) MEM_ALLOC += snpfrag[i].edges*0.002; MEM_ALLOC *= 0.016; fprintf(stderr,"%f MB memory needs to be allocated for graph edges\n",MEM_ALLOC); // size of struct edge is 16/1000 bytes if (MEM_ALLOC >= MAX_MEMORY) { fprintf(stderr,"\nstoring the HAPCUT graph structure requires more than %d MB of memory:\n 1. increase the maximum memory available using option \"--maxmem 12000\" where the memory is specified in megabytes OR \n 2. 
run the program with the options \"--longreads 1 \" to reduce the number of edges stored \n\n",MAX_MEMORY); return -1; } // too much memory allocated here for fosmid based data... for (i=0;i<snps;i++) snpfrag[i].elist = (struct edge*)malloc(sizeof(struct edge)*snpfrag[i].edges); for (i=0;i<snps;i++) snpfrag[i].telist = (struct edge*)malloc(sizeof(struct edge)*snpfrag[i].edges); if (FOSMIDS ==0) add_edges(Flist,fragments,snpfrag,snps,&components); else if (FOSMIDS >=1) add_edges_fosmids(Flist,fragments,snpfrag,snps,&components); // this considers only components with at least two nodes fprintf(stderr,"fragments %d snps %d component(blocks) %d\n",fragments,snps,components); struct BLOCK* clist = (struct BLOCK*)malloc(sizeof(struct BLOCK)*components); component =0; generate_clist_structure(Flist,fragments,snpfrag,snps,components,clist); /*****************************************************************************************************/ char* HAP1 = (char*)malloc(snps+1); char* besthap_mec = (char*)malloc(snps+1); char* HAP2 = (char*)malloc(snps+1); struct tm *ts1; char buf[80]; time_t now; slist = (int*)malloc(sizeof(int)*snps); char fn[1000]; if (VCFformat ==0) read_variantfile(variantfile,snpfrag,snps); else read_vcffile(variantfile,snpfrag,snps); /*****************************************************************************************************/ if (RANDOM_START ==1) { fprintf(stdout,"starting from a completely random solution SOLUTION \n"); for (i=0;i<snps;i++) { if (snpfrag[i].frags ==0) { HAP1[i] = '-'; HAP2[i] = '-'; } else { if (drand48() < 0.5) { HAP1[i] = '0'; HAP2[i] = '1'; } else {HAP1[i] = '1'; HAP2[i] = '0'; } } } } for (i=0;i<snps;i++) { besthap_mec[i] = HAP1[i]; } // for each block, we maintain best haplotype solution under MFR criterion // compute the component-wise score for 'initHAP' haplotype miscalls=0;bestscore_mec=0; for (k=0;k<components;k++) { clist[k].MEC =0; clist[k].bestMEC =0; clist[k].calls =0; clist[k].LL = 0; for 
(i=0;i<clist[k].frags;i++) { update_fragscore(Flist,clist[k].flist[i],HAP1); clist[k].MEC += Flist[clist[k].flist[i]].currscore; clist[k].LL += Flist[clist[k].flist[i]].ll; clist[k].calls += Flist[clist[k].flist[i]].calls; } clist[k].bestMEC = clist[k].MEC; bestscore_mec += clist[k].bestMEC; miscalls += clist[k].MEC; clist[k].bestLL = clist[k].LL; } // annealing_haplotyping(Flist,fragments,snpfrag,snps,maxiter,HAP1,HAP2,clist,components,slist); return 1; // annealing_haplotyping_full(Flist,fragments,snpfrag,snps,maxiter,HAP1,HAP2,0); return 1; /************************** RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE MEC SCORE*********************************/ for (iter=0;iter<maxiter_hapcut;iter++) { mecscore(Flist,fragments,HAP1,&ll,&calls,&miscalls); time(&now); ts1 = localtime(&now); strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts1); fprintf(stdout,"iter %d current haplotype MEC %f calls %d LL %f %s \n",iter,miscalls,(int)calls,ll,buf); fprintf(stderr,"iter %d current haplotype MEC %f calls %d LL %f %s \n",iter,miscalls,(int)calls,ll,buf); if ((iter%10==0 && iter > 0)) { // new code added april 7 2012 for (k=0;k<components;k++) find_bestvariant_segment(Flist,fragments,snpfrag,clist,k,HAP1,HAP2); sprintf(fn,"%s",outputfile); // newfile for every update to score.... //sprintf(fn,"%s.%f",outputfile,miscalls); // newfile for every update to score.... 
fprintf(stdout,"OUTPUTTING HAPLOTYPE ASSEMBLY TO FILE %s\n",fn); fprintf(stderr,"OUTPUTTING HAPLOTYPE ASSEMBLY TO FILE %s\n",fn); //if (VCFformat ==1) print_haplotypes_vcf(clist,components,HAP1,Flist,fragments,snpfrag,snps,fn); print_hapfile(clist,components,HAP1,Flist,fragments,snpfrag,variantfile,miscalls,fn); // do this only if some option is specified if (PRINT_FRAGMENT_SCORES ==1) { print_fragmentmatrix_MEC(Flist,fragments,HAP1,outputfile); //print_matrix(clist,components,HAP1,Flist,outputfile); } } for (k=0;k<components;k++) // COMPUTATION OF TREE FOR EACH COMPONENT { //if ((k*50)%components ==0) fprintf(stderr,"#"); if (iter ==0) fprintf(stdout,"\n component %d length %d phased %d %d...%d \n",k,clist[k].length,clist[k].phased,clist[k].offset,clist[k].lastvar); // call function for each component only if MEC > 0 april 17 2012 if (clist[k].MEC > 0) evaluate_cut_component(Flist,snpfrag,clist,k,slist,HAP1,iter); } for (i=0;i<snps;i++) { // commented out on april 6 4pm 2012 //if (HAP1[i] == '0') HAP2[i] = '1'; else if (HAP1[i] == '1') HAP2[i] = '0'; else HAP2[i] = HAP1[i]; } } /************************** RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE MEC SCORE*********************************/ // annealing_haplotyping_full(Flist,fragments,snpfrag,snps,maxiter+100,HAP1,HAP2,0); return 1; return 1; }