예제 #1
0
// Run the mutation algorithm to generate an improved consensus sequence
std::string run_mutation(const std::string& base, const std::vector<HMMInputData>& input)
{
    PROFILE_FUNC("run_mutation")
    std::string result = base;

    // assume models for all the reads have the same k
    assert(!input.empty());
    const uint32_t k = input[0].read->pore_model[input[0].strand].k;

    int iteration = 0;
    while(iteration++ < 10) {

        // Generate possible sequences
        PathConsVector paths = generate_mutations(result, k);

        // score them in the HMM
        score_paths(paths, input);

        // check if no improvement was made
        if(paths[0].path == result)
            break;
        result = paths[0].path;
    }

    return result;
}
예제 #2
0
int core(FILE *fout1,FILE *fout2,char *argv, int std_dev, int size_l, int size_r, uint64_t N,int dist){ //most important function
	mutseq_t *ret[2];
	gzFile fp;
	uint64_t total_len;
	kseq_t *seq;
	int l,n_ref,max_size,Q,n_errors;
	uint64_t i,j,counter_a,counter_b;
	char *q_string,*q2_string;
	fp = gzopen(argv, "r");
	seq = kseq_init(fp);
	total_len = n_ref = 0;
	frequency_A=frequency_T=frequency_G=frequency_C=0.;
	nA=nT=nG=nC=j=0;
	max_size = size_l > size_r? size_l : size_r; //determine maximum size of the read
	Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; //calculates quality score
	q_string = (char *)malloc((size_l+1)*sizeof(char)); //first quality string
	q2_string = (char *)malloc((size_r+1)*sizeof(char)); // second quality string
	for(int k=0;k<size_l;k++){q_string[k]=Q;};//prepare quality strings
	for(int k=0;k<size_r;k++){q2_string[k]=Q;};
	fprintf(stderr, "[%s] calculating the total length of the sequnce...\n",__func__);
	
	while ((l = kseq_read(seq)) >= 0){ //prints out basic info and calculates the total length of sequence
		     printf("[%s] name: %s\n",__func__,seq->name.s);
		     if(seq->comment.l) printf("[%s] comment: %s\n",__func__,seq->comment.s);
		     total_len+=l;
		     ++n_ref;
		     if (seq->qual.l) printf ("qual: %s\n",seq->qual.s);
	}
	
	if (total_len == 0){ //in case of empty file simulation is terminated, -1 is returned
		     printf("[%s] input file is empty!\n",__func__);
		     return -1;
	}
	
	fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)total_len);
	kseq_destroy(seq);
	gzclose(fp);
	fp = gzopen(argv, "r");
	seq = kseq_init(fp);
	
	
	while ((l = kseq_read(seq)) >= 0){
		if (l < dist + 3*std_dev){
			fprintf(stderr,"[%s] ERROR sequence to short for given parametars!\n",__func__); //check if sequence is too short for given parameters
			return -1;
		}
		for (i=0;i<l;i++){ //calculate the total number of A,T,G,C
		if(seq->seq.s[i]== 'A')
		     nA++;
		else if(seq->seq.s[i] == 'T')
		     nT++;
		else if(seq->seq.s[i] == 'G')
		     nG++;
		else if(seq->seq.s[i] == 'C')
		     nC++;
		else 
		     nN++;
    } 
	frequency_A = (double_t)nA/l; //calculates frequency of nucleotides
	frequency_C = (double_t)nC/l;
	frequency_G = (double_t)nG/l;
	frequency_T = (double_t)nT/l;
	frequency_N = (double_t)nN/l;
	j++;	  
    printf("[%s] frequency per sequence [%llu/%llu] A - %f | C - %f | G - %f | T - %f | *N - %f - unknown nucleotides (percentage) \n",__func__,(long long)j,(long long)n_ref,frequency_A*100,frequency_C*100,frequency_G*100,frequency_T*100,frequency_N*100);
    }
    
    
    //tot_seq = (char *)malloc((total_len+1)*sizeof(char)); //allocates enough memory space for sequence
	tot_seq = seq->seq.s;
	printf("[%s] transferring sequence into memory and generating errors...\n",__func__);
	counter_a=counter_b=0; //set counters to zero
	for(i=0;i<N;i++){
		double ran;
		int d,pos;
		uint64_t fp1_b,fp1_e,fp2_b,fp2_e,begin,end;
		char *read1,*read2;
		n_sub[0]=n_sub[1]=n_err[0]=n_err[1]=n_ind[0]=n_ind[1]=0; //set the number of substitutions, errors and indels to zero (this would be printed in header of the output files)
		
		do{
			ran = ran_normal();
			ran = ran * std_dev + dist;
			d = (int)(ran + 0.5); //calculates size of fragment
			d = d > max_size ? d : max_size; //avoid boundary failure
			pos = (int)((total_len-d+1)*drand48()); //calculates the position (first index) of the fragment with normal distribution
		}while(pos < 0 || pos >= total_len || (pos + d - 1) >= total_len);
		
		read_f = (char *)malloc((d+1+(int)((INDEL_EXT+0.5)*d))*sizeof(char));counter_a++; //allocates enough memory for the fragment
		//read_f = (char *)malloc(2000*sizeof(char));
		read_f[d+1]='\0'; //adds \0 to the end of the fragment
		strncpy(read_f,tot_seq+pos,d); //copies part of the sequence into fragment
		begin=pos; end=pos+d; //store the beginning and the ending of the fragment
		generate_mutations(d,size_l,size_r); //generates mutations
		int n_n=0;
		int n_indel = (int)(INDEL_FRAC * d); //total number of indels
		int curr_dist=d; //distance between two reads
		if((n_indel >= d) || ((GAP_SIZE*n_indel)>=d)){
			fprintf(stderr,"[%s] ERROR sequence too short for given parametars!\n",__func__); //checks if sequence is too short for given parameters
			return -1;
		}
		do{
			int type_indel,gap_size;
			char *fragment;
			type_indel = (drand48()>=INDEL_EXT)?1:0; //if 0 generates gaps otherwise adds new random nucleotides
			if (type_indel == 1){
				pos = (int)trunc(drand48()*(curr_dist-1)); //calculates the position of gap
				if (pos<size_l) n_ind[0]++;
				if (pos >= (d-size_r)) n_ind[1]++;
				gap_size = poisson_random_number(GAP_SIZE); //generates gaps
				if (gap_size){
			        generate_gaps(pos,gap_size);
				}
			    curr_dist=curr_dist-gap_size; //reduce size of fragment
		    }
			else{
				char fragment[4]={0x0}; //new nucleotides will be stored here
				char keeper[1000]={0x0}; //temporary string
				int pos;
				double r;
				char base;
				r = drand48();
				base=(r < 0.25)?'A':((r>=0.25 && r<0.5)?'T':(r>=0.25 && r<0.75)?'C':'G'); //generates new random nucleotide
				fragment[0]=base;fragment[1]='\0'; //adds nucleotide to fragment
				pos = (int)trunc(drand48()*(curr_dist-1)); //calculates the position for insertion
				if (pos<size_l) n_ind[0]++;
				if (pos >= (d-size_r)) n_ind[1]++;
				strcat(keeper,read_f+pos); 
				read_f[pos]='\0';
				strcat(read_f,fragment);
				strcat(read_f,keeper);//inserts nucleotides
				curr_dist++;
				
			}
			n_n++;
		}while(n_n<n_indel);
		read1=(char *)malloc((size_l+1)*sizeof(char)); //generates two pair end reads 
		n_errors=(int)(INDEL_FRAC*size_l); //calculates the total number of base calling errors
		read2=(char *)malloc((size_r+1)*sizeof(char));
		strncpy(read1,read_f,size_l+1);read1[size_l+1]='\0';
		int internal_counter=0;
		for(int k=(curr_dist-1);internal_counter<size_r;k--){
			*(read2+internal_counter)=get_complement(*(read_f + k)); //makes complement of first fread
			internal_counter++;
		}
		read2[internal_counter]='\0';
		int err_1,err_2;
		read1=simulate_BCER(size_l,read1,&err_1); //generates base calling errors
		read2=simulate_BCER(size_r,read2,&err_2);
		fprintf(fout1,"@%s_%llu_%llu_%d:%d:%d_%d:%d:%d_%llx/%d\n",seq->name.s,(long long)begin,(long long)end,err_1,n_sub[0],n_ind[0],(int)(size_r*ERR_RATE),n_sub[1],n_ind[1],(long long)counter_a,1); //prints results into the file
		fprintf(fout1,"%s\n+\n%s\n",read1,q_string);
		fprintf(fout2,"@%s_%llu_%llu_%d:%d:%d_%d:%d:%d_%llx/%d\n",seq->name.s,(long long)begin,(long long)end,err_2,n_sub[0],n_ind[0],(int)(size_r*ERR_RATE),n_sub[1],n_ind[1],(long long)counter_a,2);
		fprintf(fout2,"%s\n+\n%s\n",read2,q2_string);
		free(read1);
		free(read2);
		free(read_f);
	}
    kseq_destroy(seq);
	gzclose(fp);
	free(q_string);
	free(q2_string);
}