Exemple #1
0
int core(FILE *fout1,FILE *fout2,char *argv, int std_dev, int size_l, int size_r, uint64_t N,int dist){ //most important function
	mutseq_t *ret[2];
	gzFile fp;
	uint64_t total_len;
	kseq_t *seq;
	int l,n_ref,max_size,Q,n_errors;
	uint64_t i,j,counter_a,counter_b;
	char *q_string,*q2_string;
	fp = gzopen(argv, "r");
	seq = kseq_init(fp);
	total_len = n_ref = 0;
	frequency_A=frequency_T=frequency_G=frequency_C=0.;
	nA=nT=nG=nC=j=0;
	max_size = size_l > size_r? size_l : size_r; //determine maximum size of the read
	Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; //calculates quality score
	q_string = (char *)malloc((size_l+1)*sizeof(char)); //first quality string
	q2_string = (char *)malloc((size_r+1)*sizeof(char)); // second quality string
	for(int k=0;k<size_l;k++){q_string[k]=Q;};//prepare quality strings
	for(int k=0;k<size_r;k++){q2_string[k]=Q;};
	fprintf(stderr, "[%s] calculating the total length of the sequnce...\n",__func__);
	
	while ((l = kseq_read(seq)) >= 0){ //prints out basic info and calculates the total length of sequence
		     printf("[%s] name: %s\n",__func__,seq->name.s);
		     if(seq->comment.l) printf("[%s] comment: %s\n",__func__,seq->comment.s);
		     total_len+=l;
		     ++n_ref;
		     if (seq->qual.l) printf ("qual: %s\n",seq->qual.s);
	}
	
	if (total_len == 0){ //in case of empty file simulation is terminated, -1 is returned
		     printf("[%s] input file is empty!\n",__func__);
		     return -1;
	}
	
	fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)total_len);
	kseq_destroy(seq);
	gzclose(fp);
	fp = gzopen(argv, "r");
	seq = kseq_init(fp);
	
	
	while ((l = kseq_read(seq)) >= 0){
		if (l < dist + 3*std_dev){
			fprintf(stderr,"[%s] ERROR sequence to short for given parametars!\n",__func__); //check if sequence is too short for given parameters
			return -1;
		}
		for (i=0;i<l;i++){ //calculate the total number of A,T,G,C
		if(seq->seq.s[i]== 'A')
		     nA++;
		else if(seq->seq.s[i] == 'T')
		     nT++;
		else if(seq->seq.s[i] == 'G')
		     nG++;
		else if(seq->seq.s[i] == 'C')
		     nC++;
		else 
		     nN++;
    } 
	frequency_A = (double_t)nA/l; //calculates frequency of nucleotides
	frequency_C = (double_t)nC/l;
	frequency_G = (double_t)nG/l;
	frequency_T = (double_t)nT/l;
	frequency_N = (double_t)nN/l;
	j++;	  
    printf("[%s] frequency per sequence [%llu/%llu] A - %f | C - %f | G - %f | T - %f | *N - %f - unknown nucleotides (percentage) \n",__func__,(long long)j,(long long)n_ref,frequency_A*100,frequency_C*100,frequency_G*100,frequency_T*100,frequency_N*100);
    }
    
    
    //tot_seq = (char *)malloc((total_len+1)*sizeof(char)); //allocates enough memory space for sequence
	tot_seq = seq->seq.s;
	printf("[%s] transferring sequence into memory and generating errors...\n",__func__);
	counter_a=counter_b=0; //set counters to zero
	for(i=0;i<N;i++){
		double ran;
		int d,pos;
		uint64_t fp1_b,fp1_e,fp2_b,fp2_e,begin,end;
		char *read1,*read2;
		n_sub[0]=n_sub[1]=n_err[0]=n_err[1]=n_ind[0]=n_ind[1]=0; //set the number of substitutions, errors and indels to zero (this would be printed in header of the output files)
		
		do{
			ran = ran_normal();
			ran = ran * std_dev + dist;
			d = (int)(ran + 0.5); //calculates size of fragment
			d = d > max_size ? d : max_size; //avoid boundary failure
			pos = (int)((total_len-d+1)*drand48()); //calculates the position (first index) of the fragment with normal distribution
		}while(pos < 0 || pos >= total_len || (pos + d - 1) >= total_len);
		
		read_f = (char *)malloc((d+1+(int)((INDEL_EXT+0.5)*d))*sizeof(char));counter_a++; //allocates enough memory for the fragment
		//read_f = (char *)malloc(2000*sizeof(char));
		read_f[d+1]='\0'; //adds \0 to the end of the fragment
		strncpy(read_f,tot_seq+pos,d); //copies part of the sequence into fragment
		begin=pos; end=pos+d; //store the beginning and the ending of the fragment
		generate_mutations(d,size_l,size_r); //generates mutations
		int n_n=0;
		int n_indel = (int)(INDEL_FRAC * d); //total number of indels
		int curr_dist=d; //distance between two reads
		if((n_indel >= d) || ((GAP_SIZE*n_indel)>=d)){
			fprintf(stderr,"[%s] ERROR sequence too short for given parametars!\n",__func__); //checks if sequence is too short for given parameters
			return -1;
		}
		do{
			int type_indel,gap_size;
			char *fragment;
			type_indel = (drand48()>=INDEL_EXT)?1:0; //if 0 generates gaps otherwise adds new random nucleotides
			if (type_indel == 1){
				pos = (int)trunc(drand48()*(curr_dist-1)); //calculates the position of gap
				if (pos<size_l) n_ind[0]++;
				if (pos >= (d-size_r)) n_ind[1]++;
				gap_size = poisson_random_number(GAP_SIZE); //generates gaps
				if (gap_size){
			        generate_gaps(pos,gap_size);
				}
			    curr_dist=curr_dist-gap_size; //reduce size of fragment
		    }
			else{
				char fragment[4]={0x0}; //new nucleotides will be stored here
				char keeper[1000]={0x0}; //temporary string
				int pos;
				double r;
				char base;
				r = drand48();
				base=(r < 0.25)?'A':((r>=0.25 && r<0.5)?'T':(r>=0.25 && r<0.75)?'C':'G'); //generates new random nucleotide
				fragment[0]=base;fragment[1]='\0'; //adds nucleotide to fragment
				pos = (int)trunc(drand48()*(curr_dist-1)); //calculates the position for insertion
				if (pos<size_l) n_ind[0]++;
				if (pos >= (d-size_r)) n_ind[1]++;
				strcat(keeper,read_f+pos); 
				read_f[pos]='\0';
				strcat(read_f,fragment);
				strcat(read_f,keeper);//inserts nucleotides
				curr_dist++;
				
			}
			n_n++;
		}while(n_n<n_indel);
		read1=(char *)malloc((size_l+1)*sizeof(char)); //generates two pair end reads 
		n_errors=(int)(INDEL_FRAC*size_l); //calculates the total number of base calling errors
		read2=(char *)malloc((size_r+1)*sizeof(char));
		strncpy(read1,read_f,size_l+1);read1[size_l+1]='\0';
		int internal_counter=0;
		for(int k=(curr_dist-1);internal_counter<size_r;k--){
			*(read2+internal_counter)=get_complement(*(read_f + k)); //makes complement of first fread
			internal_counter++;
		}
		read2[internal_counter]='\0';
		int err_1,err_2;
		read1=simulate_BCER(size_l,read1,&err_1); //generates base calling errors
		read2=simulate_BCER(size_r,read2,&err_2);
		fprintf(fout1,"@%s_%llu_%llu_%d:%d:%d_%d:%d:%d_%llx/%d\n",seq->name.s,(long long)begin,(long long)end,err_1,n_sub[0],n_ind[0],(int)(size_r*ERR_RATE),n_sub[1],n_ind[1],(long long)counter_a,1); //prints results into the file
		fprintf(fout1,"%s\n+\n%s\n",read1,q_string);
		fprintf(fout2,"@%s_%llu_%llu_%d:%d:%d_%d:%d:%d_%llx/%d\n",seq->name.s,(long long)begin,(long long)end,err_2,n_sub[0],n_ind[0],(int)(size_r*ERR_RATE),n_sub[1],n_ind[1],(long long)counter_a,2);
		fprintf(fout2,"%s\n+\n%s\n",read2,q2_string);
		free(read1);
		free(read2);
		free(read_f);
	}
    kseq_destroy(seq);
	gzclose(fp);
	free(q_string);
	free(q2_string);
}
Exemple #2
0
// Sample reads from the contigs in `reffile` and write them in FASTA format.
//
// Every contig of at least `tlen` bases is loaded (tlen is one read length
// for single-end output, or rlen + insert + rlen for paired-end). Then
// (genome size * depth) / (bases per template) reads are drawn from random
// contig positions. If `out1` is non-NULL the run is paired-end: the second
// mate is taken `insert` bases (plus normally distributed noise scaled by
// `insert_stddev`) after the first, clamped to the contig, and written
// reverse-complemented to `out1`.
//
// Sequencing errors come from `flist` when it is non-NULL, otherwise from
// `err_rate` when it is >= 0; in paired-end mode both mates receive errors.
//
// Calls die() if no contig is long enough or if memory cannot be allocated.
// Returns num of bases printed
size_t sim_reads(seq_file_t *reffile, gzFile out0, gzFile out1,
                 FileList *flist, float err_rate,
                 size_t insert, double insert_stddev, size_t rlen, double depth)
{
  size_t i, chromcap = 16, nchroms, glen = 0, nreads, chr, pos0, pos1, tlen;
  read_t *chroms;

  tlen = rlen + (out1 == NULL ? 0 : insert + rlen);

  chroms = malloc(chromcap * sizeof(read_t));
  if(chroms == NULL)
    die("Out of memory allocating %zu contig slots", chromcap);
  nchroms = 0;

  // Load genome
  printf(" Loaded contigs:");
  while(1)
  {
    if(nchroms == chromcap) {
      // Grow through a temporary so the old array is not lost on failure
      size_t newcap = chromcap * 2;
      read_t *grown = realloc(chroms, newcap * sizeof(read_t));
      if(grown == NULL)
        die("Out of memory allocating %zu contig slots", newcap);
      chroms = grown;
      chromcap = newcap;
    }
    seq_read_alloc(&chroms[nchroms]);
    if(seq_read(reffile, &chroms[nchroms]) <= 0)
    { seq_read_dealloc(&chroms[nchroms]); break; }
    // Contigs too short to hold a whole template are dropped
    if(chroms[nchroms].seq.end < tlen) { seq_read_dealloc(&chroms[nchroms]); }
    else {
      seq_read_truncate_name(&chroms[nchroms]);
      printf(" %s[%zu]", chroms[nchroms].name.b, chroms[nchroms].seq.end);
      glen += chroms[nchroms].seq.end;
      nchroms++;
    }
  }
  printf("\n Genome size: %zu\n", glen);

  if(nchroms == 0) {
    die("No sequences long enough in ref genome file [min len: %zu]: %s",
        tlen, reffile->path);
  }

  // Sample
  nreads = (glen * depth) / (out1 == NULL ? rlen : (2 * rlen));
  char read0[rlen+1], read1[rlen+1];
  read0[rlen] = read1[rlen] = '\0';

  printf("Sampling %zu %sreads...\n", nreads,
         out1 == NULL ? "single " : "paired-end ");

  // Sample paired-end if out1 != NULL
  for(i = 0; i < nreads; i++)
  {
    chr = (nchroms == 1) ? 0 : rand_chrom(chroms, nchroms, glen);
    pos0 = random_uniform(chroms[chr].seq.end - (out1 == NULL ? rlen : tlen));
    pos1 = pos0;
    memcpy(read0, chroms[chr].seq.b+pos0, rlen);
    if(out1 != NULL) {
      // Compute the mate position as a double and clamp it into
      // [0, end - rlen] before converting: converting a negative or
      // oversized double to size_t is undefined behaviour.
      size_t max_pos1 = chroms[chr].seq.end - rlen;
      double mate = pos0 + rlen + insert + ran_normal()*insert_stddev;
      if(mate < 0) pos1 = 0;
      else if(mate >= (double)max_pos1) pos1 = max_pos1;
      else pos1 = (size_t)mate;
      memcpy(read1, chroms[chr].seq.b+pos1, rlen);
    }
    if(flist != NULL) {
      add_seq_error_profile(read0, rlen, flist);
      if(out1 != NULL)
        add_seq_error_profile(read1, rlen, flist);
    }
    else if(err_rate >= 0) {
      add_seq_error_rate(read0, rlen, err_rate);
      // The second mate gets errors too, as in the profile branch above
      if(out1 != NULL)
        add_seq_error_rate(read1, rlen, err_rate);
    }
    gzprintf(out0, ">r%zu:%s:%zu:%zu%s\n%.*s\n", i, chroms[chr].name.b,
                   pos0, pos1, (out1 != NULL ? "/1" : ""), (int)rlen, read0);
    if(out1 != NULL) {
      dna_revcmp(read1, rlen);
      gzprintf(out1, ">r%zu:%s:%zu:%zu/2\n%.*s\n", i, chroms[chr].name.b,
                     pos0, pos1, (int)rlen, read1);
    }
  }

  for(i = 0; i < nchroms; i++) seq_read_dealloc(&chroms[i]);
  free(chroms);

  size_t num_bases = nreads * rlen;
  if(out1 != NULL) num_bases *= 2;

  return num_bases;
}