// return true if current position in record is valid and usable bool site_crawler:: process_record_line(char* line) { static const unsigned MAX_WORD(50); // do a low-level tab parse: { char* p(line); _word[0]=p; _n_word=1; while (true) { if ((*p == '\n') || (*p == '\0')) break; if (*p == sep) { *p = '\0'; _word[_n_word++] = p+1; if (_n_word == MAX_WORD) break; } ++p; } // allow for optional extra columns in each file format: if (_n_word<_opt.sti().col_count()) { log_os << "ERROR: Consensus record has " << _n_word << " column(s) but expecting at least " << _opt.sti().col_count() << "\n"; dump_state(log_os); exit(EXIT_FAILURE); } } const vcf_pos last_vpos(vpos()); _chrom=_opt.sti().chrom(_word); _vpos.pos=_opt.sti().pos(_word); _is_site_allele_current=false; _is_indel_allele_current=false; _vpos.is_indel=(_opt.sti().get_is_indel(_word)); if (pos()<1) { log_os << "ERROR: gvcf record position less than 1. position: " << pos() << " "; dump_state(log_os); exit(EXIT_FAILURE); } if (_opt.is_region()) { // deal with vcf records after the region of interest: if (pos()>_opt.region_end) { _is_sample_begin_state = false; _is_sample_end_state = true; return true; } } else { if (pos()>static_cast<pos_t>(_ref_seg.end())) { log_os << "ERROR: allele file position exceeds final position in reference sequence segment . position: " << pos() << " ref_contig_end: " << _ref_seg.end() << "\n"; dump_state(log_os); exit(EXIT_FAILURE); } } if (! _opt.sti().get_nonindel_ref_length(pos(),is_indel(),_word,_locus_size)) { //log_os << "ERROR: failed to parse locus at pos: " << pos << "\n"; log_os << "WARNING: failed to parse locus at: " << vpos() << "\n"; dump_state(log_os); //exit(EXIT_FAILURE); _locus_size=0; } _locus_offset=0; // deal with vcf records which fully proceed the region of interest: if (_opt.is_region()) { if ((pos()+_locus_size-1)<_opt.region_begin) return false; } //const bool last_is_call(is_call); _is_call = _opt.sti().get_is_call(_word,pos(),_skip_call_begin_pos,_skip_call_end_pos); _n_total = _opt.sti().total(_word); if (is_indel()) { if (! _is_return_indels) { _vpos.pos=last_vpos.pos; _locus_size=0; return false; } else { _locus_size=0; } } if (is_any_call()) { _is_call=update_allele(); } // don't allow failed block read-through, so that we can get through indel-overlap errors //if(! is_call) { // if(_locus_size>1) _locus_size=1; //} if (! _is_sample_begin_state) { if (! (last_vpos < vpos()) ) { if (_opt.is_murdock_mode) { _vpos=last_vpos; _locus_size=0; return false; } else { log_os << "ERROR: unexpected position order in variant file. current_pos: " << pos() << " last " << last_vpos << "\n"; dump_state(log_os); exit(EXIT_FAILURE); } } } else { _is_sample_begin_state=false; } // deal with vcf records which partially overlap the region of interest: if (_opt.is_region()) { if (pos()<_opt.region_begin) return false; } return true; }
//if you want to simulate a true hom, pass in var with both alleles the same void simulator(int depth, int read_len, int kmer, double seq_err_per_base, int number_repetitions, int colour_indiv, int colour_allele1, int colour_allele2, int colour_ref_minus_site, VariantBranchesAndFlanks* var, int len_genome_minus_site, zygosity true_gt, GraphAndModelInfo* model_info, char* fasta, char* true_ml_gt_name, int working_colour1, int working_colour2, boolean using_1and2_nets, dBGraph* db_graph) //dBNode** genome_minus_site //boolean are_the_two_alleles_identical //char* filelist_net1, char* filelist_net2 //int working_colour_1net, int working_colour_2net { if (NUMBER_OF_COLOURS<4) { die("Cannot run the simulator with <4 colours. Recompile.\n"); } int count_passes = 0; int count_fails = 0; const gsl_rng_type * T; gsl_rng * r; // GLS setup: /* create a generator chosen by the environment variable GSL_RNG_TYPE */ gsl_rng_env_setup(); T = gsl_rng_default; r = gsl_rng_alloc (T); //put the alleles and reference into their colours: mark_allele_with_all_1s_or_more(var->one_allele, var->len_one_allele, colour_allele1); mark_allele_with_all_1s_or_more(var->other_allele, var->len_other_allele, colour_allele2); int i; for (i=0; i<number_repetitions; i++) { zero_path_except_two_alleles_and_ref(var->one_allele, var->len_one_allele, colour_allele1, colour_allele2, colour_ref_minus_site); zero_path_except_two_alleles_and_ref(var->other_allele, var->len_other_allele, colour_allele1, colour_allele2, colour_ref_minus_site); //give the each allele depth which is taken from a Poisson with mean = (D/2) * (R-k+1)/R * (1-k*epsilon) //printf("Depth %d, var->len one allele - k,er +1 = %d, and 1-kmer * seq_err = %f \n", depth, (var->len_one_allele)-kmer+1, 1-kmer*seq_err_per_base ) ; double exp_depth_on_allele1 = 0; if (true_gt==het) //het. So 1/3 of seq errors on the other allele end up on this one { exp_depth_on_allele1 = ((double) depth/2) * ( (double)(read_len+(var->len_one_allele)-kmer+1)/read_len) * (1-kmer*seq_err_per_base); // + ((double) depth/2) * //( (double)(read_len+(var->len_other_allele)-kmer+1)/read_len) * (kmer*seq_err_per_base/3); //some errors from the other allele give covg here } else if (true_gt==hom_one) {//hom exp_depth_on_allele1 = ((double) depth) * ( (double)(read_len+(var->len_one_allele)-kmer+1)/read_len) * (1-kmer*seq_err_per_base); } else if (true_gt==hom_other) { exp_depth_on_allele1 = ((double) depth) * ( (double)(read_len+(var->len_other_allele)-kmer+1)/read_len) * kmer*seq_err_per_base/3;// 1/3 of the errors on the true allele are on this one } double exp_depth_on_allele2 = 0; if (true_gt==het) //het. So 1/3 of seq errors are not a problem. So loss of covg is (1-k*(2/3)*e) { exp_depth_on_allele2 = exp_depth_on_allele1; } else if (true_gt==hom_one) { exp_depth_on_allele2 = ( (double)(read_len+(var->len_one_allele)-kmer+1)/read_len) * kmer*seq_err_per_base/3; } else if (true_gt==hom_other) { exp_depth_on_allele2 = ((double) depth) * ( (double)(read_len+(var->len_other_allele)-kmer+1)/read_len) * (1-kmer*seq_err_per_base); } double exp_depth_on_ref_minus_site = (double) depth * ((double)(len_genome_minus_site-kmer+1)/read_len) * (1-kmer*seq_err_per_base); //printf("exp ZAMMER %f %f %f\n", exp_depth_on_allele1, exp_depth_on_allele2, exp_depth_on_ref_minus_site); unsigned int sampled_covg_allele1 = gsl_ran_poisson (r, exp_depth_on_allele1); unsigned int sampled_covg_allele2 = gsl_ran_poisson (r, exp_depth_on_allele2); unsigned int sampled_covg_rest_of_genome = gsl_ran_poisson (r, exp_depth_on_ref_minus_site); printf("Sampled covgs on alleles 1,2 and genome are %d %d %d\n", sampled_covg_allele1, sampled_covg_allele2, sampled_covg_rest_of_genome); update_allele(var->one_allele, var->len_one_allele, colour_indiv, sampled_covg_allele1,read_len-kmer+1); update_allele(var->other_allele, var->len_other_allele, colour_indiv, sampled_covg_allele2, read_len-kmer+1); //update_allele(genome_minus_site,len_genome_minus_site, colour_indiv, sampled_covg_rest_of_genome); test(var, model_info, fasta, colour_ref_minus_site, colour_indiv, using_1and2_nets, working_colour1, working_colour2, &count_passes, &count_fails, db_graph, true_ml_gt_name); } //cleanup zero_allele(var->one_allele, var->len_one_allele, colour_indiv, colour_allele1, colour_allele2, colour_ref_minus_site); zero_allele(var->other_allele, var->len_other_allele, colour_indiv, colour_allele1, colour_allele2, colour_ref_minus_site); CU_ASSERT((double)count_passes/(double)(count_passes+count_fails) > 0.9 );//actually, we could set this to ==1 printf("Number of passes: %d, number of fails %d\n", count_passes, count_fails); gsl_rng_free (r); }