示例#1
0
/*
 * removevariants: print the current MEC statistics, then run
 * find_bestvariant_segment() once on every component.
 *
 * Marked in the original source as "not used for now" (april 9 2012).
 *
 * mecscore() is handed pointers to the log-likelihood, call count and miscall
 * count; the values it leaves there are what gets printed (to both stdout and
 * stderr).  The "best" log-likelihood is simply the value seen after that one
 * mecscore() call.
 *
 * `snps` and `maxiter` are accepted but not read by the body: exactly one
 * pass over the components is made regardless of `maxiter`.
 *
 * Always returns 1.
 */
int removevariants(struct fragment* Flist,int fragments,struct SNPfrags* snpfrag,int snps,int maxiter,char* HAP1,char* HAP2,struct BLOCK* clist,int components)
{
	float total_calls = 0;
	float total_miscalls = 0;
	float loglike = 0;
	float best_loglike = 0;
	int comp = 0;

	mecscore(Flist, fragments, HAP1, &loglike, &total_calls, &total_miscalls);
	best_loglike = loglike;

	/* Single reporting pass (the original loop bound was the constant 1). */
	fprintf(stdout,"MEC score %f %f  LL %f bestLL %f \n", total_miscalls, total_calls, loglike, best_loglike);
	fprintf(stderr,"MEC score %f %f LL %f bestLL %f\n", total_miscalls, total_calls, loglike, best_loglike);

	while (comp < components)
	{
		find_bestvariant_segment(Flist, fragments, snpfrag, clist, comp, HAP1, HAP2);
		comp++;
	}

	return 1;
}
示例#2
0
文件: hapcut.c 项目: arkyl/hapcut
/*
 * maxcut_haplotyping: driver for the MAXCUT based haplotype assembly.
 *
 * Work done, in order:
 *   1. count the lines of `fragmentfile`, allocate one struct fragment per
 *      line and load them with read_fragment_matrix();
 *   2. count the variants in `variantfile` with count_variants() or
 *      count_variants_vcf(), selected by the global VCFformat;
 *   3. build the per-variant structures and edge lists (update_snpfrags(),
 *      add_edges() / add_edges_fosmids()) and the component list
 *      (generate_clist_structure());
 *   4. if RANDOM_START == 1, seed HAP1/HAP2 with a random 0/1 assignment
 *      ('-' for variants with no fragments);
 *   5. compute the per-component MEC / LL / calls for that starting point;
 *   6. run `maxiter_hapcut` iterations; each one prints the current score and
 *      calls evaluate_cut_component() on every component whose MEC is > 0.
 *      On iterations 10, 20, 30, ... it first calls find_bestvariant_segment()
 *      on every component and writes the haplotypes to `outputfile`.
 *
 * Parameters:
 *   fragmentfile   - path of the fragment matrix file (read twice: once here
 *                    to count lines, once by read_fragment_matrix()).
 *   variantfile    - path of the variant file.
 *   snps           - value on entry is ignored; it is overwritten with the
 *                    count obtained from `variantfile`.
 *   outputfile     - path the haplotype assembly is written to.
 *   maxiter_hapcut - number of iterations of the main loop.
 *
 * Returns 1 on completion and -1 on any failure (unreadable input, graph
 * larger than MAX_MEMORY, failed allocation, over-long output path).
 *
 * NOTE(review): because output happens only when iter%10 == 0 && iter > 0,
 *   no output file is written at all when maxiter_hapcut <= 10 -- confirm
 *   this is intended.
 * NOTE(review): HAP1/HAP2 come from malloc() and are only written in the
 *   RANDOM_START == 1 branch; with any other setting they are passed to
 *   update_fragscore()/mecscore() uninitialised.
 * NOTE(review): nothing allocated here is released, on success or on the
 *   error paths.  The nested ownership (Flist / snpfrag / clist members) is
 *   not visible from this function, so no frees were added.
 */
int maxcut_haplotyping(char* fragmentfile,char* variantfile,int snps,char* outputfile,int maxiter_hapcut)
{
	// IMP NOTE: all SNPs start from 1 instead of 0 and all offsets are 1+
	fprintf(stderr,"calling MAXCUT based haplotype assembly algorithm\n");
	int fragments=0,iter=0,components=0;
	int i=0,k=0;
	int* slist;
	int flag=0;
	float calls=0, miscalls=0, ll=0;
	char buffer[MAXBUF];

	/****************************** READ FRAGMENT MATRIX*************************************************/
	struct fragment* Flist;
	FILE* ff = fopen(fragmentfile,"r");
	if (ff == NULL)
	{
		// report failure through the return value like every other error path
		// (this used to be exit(0), i.e. a "successful" process exit)
		fprintf(stderr,"couldn't open fragment file %s\n",fragmentfile);
		return -1;
	}
	// one fragment per line read by fgets
	fragments = 0;
	while (fgets(buffer,MAXBUF,ff) != NULL) fragments++;
	fclose(ff);

	Flist = (struct fragment*)malloc(sizeof(struct fragment)*fragments);
	// malloc(0) may legitimately return NULL, so only a non-empty request counts as a failure
	if (Flist == NULL && fragments > 0)
	{
		fprintf(stderr,"unable to allocate memory for %d fragments\n",fragments);
		return -1;
	}
	flag = read_fragment_matrix(fragmentfile,Flist,fragments);
	if (flag < 0) { fprintf(stderr,"unable to read fragment matrix file %s \n",fragmentfile); return -1; }

	if (VCFformat ==0) snps = count_variants(variantfile);
	else snps = count_variants_vcf(variantfile);
	if (snps < 0) { fprintf(stderr,"unable to read variant file %s \n",variantfile); return -1; }
	fprintf(stderr,"processed fragment file and variant file: fragments %d variants %d\n",fragments,snps);
	/****************************** READ FRAGMENT MATRIX*************************************************/

	struct SNPfrags* snpfrag = (struct SNPfrags*)malloc(sizeof(struct SNPfrags)*snps);
	if (snpfrag == NULL && snps > 0)
	{
		fprintf(stderr,"unable to allocate memory for %d variants\n",snps);
		return -1;
	}
	update_snpfrags(Flist,fragments,snpfrag,snps,&components);

	// Memory estimate in MB: edges * 0.002 * 0.016 per variant.  Presumably
	// 2 edge arrays (elist + telist) x 16 bytes per struct edge, scaled to
	// megabytes -- original note: "size of struct edge is 16/1000 bytes".
	double MEM_ALLOC = 0;
	for (i=0;i<snps;i++) MEM_ALLOC += snpfrag[i].edges*0.002;
	MEM_ALLOC *= 0.016;
	fprintf(stderr,"%f MB memory needs to be allocated for graph edges\n",MEM_ALLOC);
	if (MEM_ALLOC >= MAX_MEMORY)
	{
		fprintf(stderr,"\nstoring the HAPCUT graph structure requires more than %d MB of memory:\n 1. increase the maximum memory available using option \"--maxmem 12000\" where the memory is specified in megabytes OR \n 2. run the program with the options \"--longreads 1 \" to reduce the number of edges stored \n\n",MAX_MEMORY);
		return -1;
	}

	// too much memory allocated here for fosmid based data...
	for (i=0;i<snps;i++)
	{
		snpfrag[i].elist = (struct edge*)malloc(sizeof(struct edge)*snpfrag[i].edges);
		if (snpfrag[i].elist == NULL && snpfrag[i].edges > 0)
		{
			fprintf(stderr,"unable to allocate memory for the edge list of variant %d\n",i);
			return -1;
		}
	}
	for (i=0;i<snps;i++)
	{
		snpfrag[i].telist = (struct edge*)malloc(sizeof(struct edge)*snpfrag[i].edges);
		if (snpfrag[i].telist == NULL && snpfrag[i].edges > 0)
		{
			fprintf(stderr,"unable to allocate memory for the temporary edge list of variant %d\n",i);
			return -1;
		}
	}

	if (FOSMIDS ==0) add_edges(Flist,fragments,snpfrag,snps,&components);
	else if (FOSMIDS >=1) add_edges_fosmids(Flist,fragments,snpfrag,snps,&components);
	// this considers only components with at least two nodes
	fprintf(stderr,"fragments %d snps %d component(blocks) %d\n",fragments,snps,components);

	struct BLOCK* clist = (struct BLOCK*)malloc(sizeof(struct BLOCK)*components);
	if (clist == NULL && components > 0)
	{
		fprintf(stderr,"unable to allocate memory for %d components\n",components);
		return -1;
	}
	generate_clist_structure(Flist,fragments,snpfrag,snps,components,clist);

	/*****************************************************************************************************/

	char* HAP1 = (char*)malloc(snps+1);
	char* HAP2 = (char*)malloc(snps+1);
	struct tm  *ts1;
	char       buf[80];	// formatted timestamp for the progress lines
	time_t     now;
	char       fn[1000];	// local copy of the output file name
	slist = (int*)malloc(sizeof(int)*snps);
	if (HAP1 == NULL || HAP2 == NULL || (slist == NULL && snps > 0))
	{
		fprintf(stderr,"unable to allocate memory for haplotypes of %d variants\n",snps);
		return -1;
	}

	if (VCFformat ==0) read_variantfile(variantfile,snpfrag,snps); else read_vcffile(variantfile,snpfrag,snps);

	/*****************************************************************************************************/
	if (RANDOM_START ==1)
	{
		fprintf(stdout,"starting from a completely random solution SOLUTION \n");
		for (i=0;i<snps;i++)
		{
			if (snpfrag[i].frags ==0) { HAP1[i] = '-'; HAP2[i] = '-'; }
			else
			{
				// HAP2 is always the complement of HAP1 at covered variants
				if (drand48() < 0.5) { HAP1[i] = '0'; HAP2[i] = '1'; }  else  {HAP1[i] = '1'; HAP2[i] = '0'; }
			}
		}
	}

	// compute the component-wise score for the starting haplotype:
	// each component's MEC / LL / calls is the sum over its fragments, and the
	// starting values are also recorded as the component's best so far
	miscalls=0;
	for (k=0;k<components;k++)
	{
		clist[k].MEC =0; clist[k].bestMEC =0; clist[k].calls =0;	clist[k].LL = 0;
		for (i=0;i<clist[k].frags;i++)
		{
			update_fragscore(Flist,clist[k].flist[i],HAP1); clist[k].MEC += Flist[clist[k].flist[i]].currscore;
			clist[k].LL += Flist[clist[k].flist[i]].ll; clist[k].calls += Flist[clist[k].flist[i]].calls;
		}
		clist[k].bestMEC = clist[k].MEC; miscalls += clist[k].MEC;	 clist[k].bestLL = clist[k].LL;
	}

	/************************** RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE MEC SCORE*********************************/
	for (iter=0;iter<maxiter_hapcut;iter++)
	{
		mecscore(Flist,fragments,HAP1,&ll,&calls,&miscalls);

		// timestamp for the progress lines; fall back to an empty string if
		// the time cannot be converted or formatted
		time(&now); ts1 = localtime(&now);
		if (ts1 == NULL || strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts1) == 0) buf[0] = '\0';
		fprintf(stdout,"iter %d current haplotype MEC %f calls %d LL %f %s \n",iter,miscalls,(int)calls,ll,buf);
		fprintf(stderr,"iter %d current haplotype MEC %f calls %d LL %f %s \n",iter,miscalls,(int)calls,ll,buf);

		if ((iter%10==0 && iter > 0))
		{
			// new code added april 7 2012
			for (k=0;k<components;k++) find_bestvariant_segment(Flist,fragments,snpfrag,clist,k,HAP1,HAP2);

			// same file is overwritten on every update; bounded copy so a long
			// path cannot overrun fn
			int written = snprintf(fn,sizeof(fn),"%s",outputfile);
			if (written < 0 || (size_t)written >= sizeof(fn))
			{
				fprintf(stderr,"output file name %s is too long (limit %d characters)\n",outputfile,(int)sizeof(fn)-1);
				return -1;
			}
			fprintf(stdout,"OUTPUTTING HAPLOTYPE ASSEMBLY TO FILE %s\n",fn); 	fprintf(stderr,"OUTPUTTING HAPLOTYPE ASSEMBLY TO FILE %s\n",fn);
			print_hapfile(clist,components,HAP1,Flist,fragments,snpfrag,variantfile,miscalls,fn);

			// do this only if some option is specified
			if (PRINT_FRAGMENT_SCORES ==1)
			{
				print_fragmentmatrix_MEC(Flist,fragments,HAP1,outputfile);
			}
		}

		for (k=0;k<components;k++) // COMPUTATION OF TREE FOR EACH COMPONENT
		{
			if (iter ==0) fprintf(stdout,"\n component %d length %d phased %d %d...%d \n",k,clist[k].length,clist[k].phased,clist[k].offset,clist[k].lastvar);
			// call function for each component only if MEC > 0 april 17 2012
			if (clist[k].MEC > 0) evaluate_cut_component(Flist,snpfrag,clist,k,slist,HAP1,iter);
		}
		// HAP2 is not re-derived from HAP1 here (that step was commented out
		// on april 6 2012).
	}
	/************************** RUN THE MAX_CUT ALGORITHM ITERATIVELY TO IMPROVE MEC SCORE*********************************/
	return 1;
}