int core(FILE *fout1,FILE *fout2,char *argv, int std_dev, int size_l, int size_r, uint64_t N,int dist){ //most important function mutseq_t *ret[2]; gzFile fp; uint64_t total_len; kseq_t *seq; int l,n_ref,max_size,Q,n_errors; uint64_t i,j,counter_a,counter_b; char *q_string,*q2_string; fp = gzopen(argv, "r"); seq = kseq_init(fp); total_len = n_ref = 0; frequency_A=frequency_T=frequency_G=frequency_C=0.; nA=nT=nG=nC=j=0; max_size = size_l > size_r? size_l : size_r; //determine maximum size of the read Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; //calculates quality score q_string = (char *)malloc((size_l+1)*sizeof(char)); //first quality string q2_string = (char *)malloc((size_r+1)*sizeof(char)); // second quality string for(int k=0;k<size_l;k++){q_string[k]=Q;};//prepare quality strings for(int k=0;k<size_r;k++){q2_string[k]=Q;}; fprintf(stderr, "[%s] calculating the total length of the sequnce...\n",__func__); while ((l = kseq_read(seq)) >= 0){ //prints out basic info and calculates the total length of sequence printf("[%s] name: %s\n",__func__,seq->name.s); if(seq->comment.l) printf("[%s] comment: %s\n",__func__,seq->comment.s); total_len+=l; ++n_ref; if (seq->qual.l) printf ("qual: %s\n",seq->qual.s); } if (total_len == 0){ //in case of empty file simulation is terminated, -1 is returned printf("[%s] input file is empty!\n",__func__); return -1; } fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)total_len); kseq_destroy(seq); gzclose(fp); fp = gzopen(argv, "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0){ if (l < dist + 3*std_dev){ fprintf(stderr,"[%s] ERROR sequence to short for given parametars!\n",__func__); //check if sequence is too short for given parameters return -1; } for (i=0;i<l;i++){ //calculate the total number of A,T,G,C if(seq->seq.s[i]== 'A') nA++; else if(seq->seq.s[i] == 'T') nT++; else if(seq->seq.s[i] == 'G') nG++; else if(seq->seq.s[i] == 'C') nC++; else nN++; } frequency_A = (double_t)nA/l; //calculates frequency of nucleotides frequency_C = (double_t)nC/l; frequency_G = (double_t)nG/l; frequency_T = (double_t)nT/l; frequency_N = (double_t)nN/l; j++; printf("[%s] frequency per sequence [%llu/%llu] A - %f | C - %f | G - %f | T - %f | *N - %f - unknown nucleotides (percentage) \n",__func__,(long long)j,(long long)n_ref,frequency_A*100,frequency_C*100,frequency_G*100,frequency_T*100,frequency_N*100); } //tot_seq = (char *)malloc((total_len+1)*sizeof(char)); //allocates enough memory space for sequence tot_seq = seq->seq.s; printf("[%s] transferring sequence into memory and generating errors...\n",__func__); counter_a=counter_b=0; //set counters to zero for(i=0;i<N;i++){ double ran; int d,pos; uint64_t fp1_b,fp1_e,fp2_b,fp2_e,begin,end; char *read1,*read2; n_sub[0]=n_sub[1]=n_err[0]=n_err[1]=n_ind[0]=n_ind[1]=0; //set the number of substitutions, errors and indels to zero (this would be printed in header of the output files) do{ ran = ran_normal(); ran = ran * std_dev + dist; d = (int)(ran + 0.5); //calculates size of fragment d = d > max_size ? d : max_size; //avoid boundary failure pos = (int)((total_len-d+1)*drand48()); //calculates the position (first index) of the fragment with normal distribution }while(pos < 0 || pos >= total_len || (pos + d - 1) >= total_len); read_f = (char *)malloc((d+1+(int)((INDEL_EXT+0.5)*d))*sizeof(char));counter_a++; //allocates enough memory for the fragment //read_f = (char *)malloc(2000*sizeof(char)); read_f[d+1]='\0'; //adds \0 to the end of the fragment strncpy(read_f,tot_seq+pos,d); //copies part of the sequence into fragment begin=pos; end=pos+d; //store the beginning and the ending of the fragment generate_mutations(d,size_l,size_r); //generates mutations int n_n=0; int n_indel = (int)(INDEL_FRAC * d); //total number of indels int curr_dist=d; //distance between two reads if((n_indel >= d) || ((GAP_SIZE*n_indel)>=d)){ fprintf(stderr,"[%s] ERROR sequence too short for given parametars!\n",__func__); //checks if sequence is too short for given parameters return -1; } do{ int type_indel,gap_size; char *fragment; type_indel = (drand48()>=INDEL_EXT)?1:0; //if 0 generates gaps otherwise adds new random nucleotides if (type_indel == 1){ pos = (int)trunc(drand48()*(curr_dist-1)); //calculates the position of gap if (pos<size_l) n_ind[0]++; if (pos >= (d-size_r)) n_ind[1]++; gap_size = poisson_random_number(GAP_SIZE); //generates gaps if (gap_size){ generate_gaps(pos,gap_size); } curr_dist=curr_dist-gap_size; //reduce size of fragment } else{ char fragment[4]={0x0}; //new nucleotides will be stored here char keeper[1000]={0x0}; //temporary string int pos; double r; char base; r = drand48(); base=(r < 0.25)?'A':((r>=0.25 && r<0.5)?'T':(r>=0.25 && r<0.75)?'C':'G'); //generates new random nucleotide fragment[0]=base;fragment[1]='\0'; //adds nucleotide to fragment pos = (int)trunc(drand48()*(curr_dist-1)); //calculates the position for insertion if (pos<size_l) n_ind[0]++; if (pos >= (d-size_r)) n_ind[1]++; strcat(keeper,read_f+pos); read_f[pos]='\0'; strcat(read_f,fragment); strcat(read_f,keeper);//inserts nucleotides curr_dist++; } n_n++; }while(n_n<n_indel); read1=(char *)malloc((size_l+1)*sizeof(char)); //generates two pair end reads n_errors=(int)(INDEL_FRAC*size_l); //calculates the total number of base calling errors read2=(char *)malloc((size_r+1)*sizeof(char)); strncpy(read1,read_f,size_l+1);read1[size_l+1]='\0'; int internal_counter=0; for(int k=(curr_dist-1);internal_counter<size_r;k--){ *(read2+internal_counter)=get_complement(*(read_f + k)); //makes complement of first fread internal_counter++; } read2[internal_counter]='\0'; int err_1,err_2; read1=simulate_BCER(size_l,read1,&err_1); //generates base calling errors read2=simulate_BCER(size_r,read2,&err_2); fprintf(fout1,"@%s_%llu_%llu_%d:%d:%d_%d:%d:%d_%llx/%d\n",seq->name.s,(long long)begin,(long long)end,err_1,n_sub[0],n_ind[0],(int)(size_r*ERR_RATE),n_sub[1],n_ind[1],(long long)counter_a,1); //prints results into the file fprintf(fout1,"%s\n+\n%s\n",read1,q_string); fprintf(fout2,"@%s_%llu_%llu_%d:%d:%d_%d:%d:%d_%llx/%d\n",seq->name.s,(long long)begin,(long long)end,err_2,n_sub[0],n_ind[0],(int)(size_r*ERR_RATE),n_sub[1],n_ind[1],(long long)counter_a,2); fprintf(fout2,"%s\n+\n%s\n",read2,q2_string); free(read1); free(read2); free(read_f); } kseq_destroy(seq); gzclose(fp); free(q_string); free(q2_string); }
// Returns num of bases printed size_t sim_reads(seq_file_t *reffile, gzFile out0, gzFile out1, FileList *flist, float err_rate, size_t insert, double insert_stddev, size_t rlen, double depth) { size_t i, chromcap = 16, nchroms, glen = 0, nreads, chr, pos0, pos1, tlen; read_t *chroms; tlen = rlen + (out1 == NULL ? 0 : insert + rlen); chroms = malloc(chromcap * sizeof(read_t)); nchroms = 0; // Load genome printf(" Loaded contigs:"); while(1) { if(nchroms == chromcap) chroms = realloc(chroms, (chromcap*=2)*sizeof(read_t)); seq_read_alloc(&chroms[nchroms]); if(seq_read(reffile, &chroms[nchroms]) <= 0) { seq_read_dealloc(&chroms[nchroms]); break; } if(chroms[nchroms].seq.end < tlen) { seq_read_dealloc(&chroms[nchroms]); } else { seq_read_truncate_name(&chroms[nchroms]); printf(" %s[%zu]", chroms[nchroms].name.b, chroms[nchroms].seq.end); glen += chroms[nchroms].seq.end; nchroms++; } } printf("\n Genome size: %zu\n", glen); if(nchroms == 0) { die("No sequences long enough in ref genome file [min len: %zu]: %s", tlen, reffile->path); } // Sample nreads = (glen * depth) / (out1 == NULL ? rlen : (2 * rlen)); char read0[rlen+1], read1[rlen+1]; read0[rlen] = read1[rlen] = '\0'; printf("Sampling %zu %sreads...\n", nreads, out1 == NULL ? "single " : "paired-end "); // Sample paired-end if out1 != NULL for(i = 0; i < nreads; i++) { chr = (nchroms == 1) ? 0 : rand_chrom(chroms, nchroms, glen); pos0 = random_uniform(chroms[chr].seq.end - (out1 == NULL ? rlen : tlen)); pos1 = pos0; memcpy(read0, chroms[chr].seq.b+pos0, rlen); if(out1 != NULL) { pos1 = pos0 + rlen + insert + ran_normal()*insert_stddev; if(pos1 + rlen > chroms[chr].seq.end) pos1 = chroms[chr].seq.end-rlen; memcpy(read1, chroms[chr].seq.b+pos1, rlen); } if(flist != NULL) { add_seq_error_profile(read0, rlen, flist); if(out1 != NULL) add_seq_error_profile(read1, rlen, flist); } else if(err_rate >= 0) { add_seq_error_rate(read0, rlen, err_rate); } gzprintf(out0, ">r%zu:%s:%zu:%zu%s\n%.*s\n", i, chroms[chr].name.b, pos0, pos1, (out1 != NULL ? "/1" : ""), (int)rlen, read0); if(out1 != NULL) { dna_revcmp(read1, rlen); gzprintf(out1, ">r%zu:%s:%zu:%zu/2\n%.*s\n", i, chroms[chr].name.b, pos0, pos1, (int)rlen, read1); } } for(i = 0; i < nchroms; i++) seq_read_dealloc(&chroms[i]); free(chroms); size_t num_bases = nreads * rlen; if(out1 != NULL) num_bases *= 2; return num_bases; }