// return true if current position in record is valid and usable
bool
site_crawler::
process_record_line(char* line)
{
    static const unsigned MAX_WORD(50);

    // do a low-level tab parse:
    {
        char* p(line);
        _word[0]=p;
        _n_word=1;
        while (true) {
            if ((*p == '\n') || (*p == '\0')) break;
            if (*p == sep) {
                *p = '\0';
                _word[_n_word++] = p+1;
                if (_n_word == MAX_WORD) break;
            }
            ++p;
        }
        // allow for optional extra columns in each file format:
        if (_n_word<_opt.sti().col_count()) {
            log_os << "ERROR: Consensus record has " << _n_word << " column(s) but expecting at least " << _opt.sti().col_count() << "\n";
            dump_state(log_os);
            exit(EXIT_FAILURE);
        }
    }

    const vcf_pos last_vpos(vpos());
    _chrom=_opt.sti().chrom(_word);
    _vpos.pos=_opt.sti().pos(_word);

    _is_site_allele_current=false;
    _is_indel_allele_current=false;

    _vpos.is_indel=(_opt.sti().get_is_indel(_word));

    if (pos()<1) {
        log_os << "ERROR: gvcf record position less than 1. position: " << pos() << " ";
        dump_state(log_os);
        exit(EXIT_FAILURE);
    }

    if (_opt.is_region()) {
        // deal with vcf records after the region of interest:
        if (pos()>_opt.region_end) {
            _is_sample_begin_state = false;
            _is_sample_end_state = true;
            return true;
        }
    } else {
        if (pos()>static_cast<pos_t>(_ref_seg.end())) {
            log_os << "ERROR: allele file position exceeds final position in reference sequence segment . position: "
                   << pos() << " ref_contig_end: " << _ref_seg.end() << "\n";
            dump_state(log_os);
            exit(EXIT_FAILURE);
        }
    }

    if (! _opt.sti().get_nonindel_ref_length(pos(),is_indel(),_word,_locus_size)) {
        //log_os << "ERROR: failed to parse locus at pos: "  << pos << "\n";
        log_os << "WARNING: failed to parse locus at: "  << vpos() << "\n";
        dump_state(log_os);
        //exit(EXIT_FAILURE);
        _locus_size=0;
    }

    _locus_offset=0;

    // deal with vcf records which fully proceed the region of interest:
    if (_opt.is_region()) {
        if ((pos()+_locus_size-1)<_opt.region_begin) return false;
    }

    //const bool last_is_call(is_call);
    _is_call = _opt.sti().get_is_call(_word,pos(),_skip_call_begin_pos,_skip_call_end_pos);

    _n_total = _opt.sti().total(_word);

    if (is_indel()) {
        if (! _is_return_indels)
        {
            _vpos.pos=last_vpos.pos;
            _locus_size=0;
            return false;
        }
        else
        {
            _locus_size=0;
        }
    }

    if (is_any_call()) {
        _is_call=update_allele();
    }

    // don't allow failed block read-through, so that we can get through indel-overlap errors
    //if(! is_call) {
    //    if(_locus_size>1) _locus_size=1;
    //}

    if (! _is_sample_begin_state) {
        if (! (last_vpos < vpos()) ) {
            if (_opt.is_murdock_mode) {
                _vpos=last_vpos;
                _locus_size=0;
                return false;
            } else {
                log_os << "ERROR: unexpected position order in variant file. current_pos: "
                       << pos() << " last " << last_vpos << "\n";
                dump_state(log_os);
                exit(EXIT_FAILURE);
            }
        }
    } else {
        _is_sample_begin_state=false;
    }

    // deal with vcf records which partially overlap the region of interest:
    if (_opt.is_region()) {
        if (pos()<_opt.region_begin) return false;
    }

    return true;
}
Esempio n. 2
0
//Repeatedly simulate read coverage over the two alleles of a variant site and
//run test() on each simulated data set, then CU_ASSERT that more than 90% of
//the repetitions were counted as passes.
//
//If you want to simulate a true hom, pass in var with both alleles the same.
//
// depth                 - sequencing depth; scales the expected coverage of each allele
// read_len, kmer        - read length and kmer size used in the coverage model
// seq_err_per_base      - per-base sequencing error rate used in the coverage model
// number_repetitions    - number of simulate-and-test rounds to run
// colour_indiv          - colour that receives the sampled coverage
// colour_allele1/2      - colours the two alleles are marked in
// colour_ref_minus_site - colour for the reference minus the site (passed through to helpers)
// var                   - the variant; only one_allele/other_allele and their lengths are read here
// len_genome_minus_site - genome length excluding the site; used only for the (printed) genome coverage
// true_gt               - true genotype being simulated (het, hom_one or hom_other)
// model_info, fasta, true_ml_gt_name, working_colour1/2, using_1and2_nets, db_graph
//                       - passed straight through to test()
//
// NOTE(review): if no repetition is counted (e.g. number_repetitions<=0) the
// pass fraction is 0/0 and the final CU_ASSERT does not hold.
void simulator(int depth, int read_len, int kmer, double seq_err_per_base,
               int number_repetitions,  int colour_indiv,
               int colour_allele1, int colour_allele2, int colour_ref_minus_site,
               VariantBranchesAndFlanks* var,
               int len_genome_minus_site, zygosity true_gt,
               GraphAndModelInfo* model_info,
               char* fasta, char* true_ml_gt_name,
               int working_colour1, int working_colour2,
               boolean using_1and2_nets,
               dBGraph* db_graph)
               //dBNode** genome_minus_site
               //boolean are_the_two_alleles_identical
               //char* filelist_net1, char* filelist_net2
               //int working_colour_1net, int working_colour_2net
{

  if (NUMBER_OF_COLOURS<4)
    {
      die("Cannot run the simulator with <4 colours. Recompile.\n");
    }

  int count_passes = 0;
  int count_fails = 0;

  // GSL setup:
  /* create a generator chosen by the
     environment variable GSL_RNG_TYPE */
  gsl_rng_env_setup();
  const gsl_rng_type * T = gsl_rng_default;
  gsl_rng * r = gsl_rng_alloc (T);


  //put the alleles  and reference into their colours:
  mark_allele_with_all_1s_or_more(var->one_allele, var->len_one_allele,     colour_allele1);
  mark_allele_with_all_1s_or_more(var->other_allele, var->len_other_allele, colour_allele2);

  int i;
  for (i=0; i<number_repetitions; i++)
    {
      zero_path_except_two_alleles_and_ref(var->one_allele, var->len_one_allele, colour_allele1, colour_allele2, colour_ref_minus_site);
      zero_path_except_two_alleles_and_ref(var->other_allele, var->len_other_allele, colour_allele1, colour_allele2, colour_ref_minus_site);

      //Expected depth on each allele is the mean of the Poisson we sample from.
      //An allele that is truly present gets
      //     (its share of depth) * (R+L-k+1)/R * (1-k*epsilon)
      //and an allele that is truly absent only gets the 1/3 of the sequencing
      //errors on the true allele that happen to land on it:
      //     depth * (R+L-k+1)/R * k*epsilon/3
      //where R is the read length, L the allele length, k the kmer size and
      //epsilon the per-base error rate.
      double exp_depth_on_allele1 = 0;
      double exp_depth_on_allele2 = 0;
      if (true_gt==het)
        {
          //het: each allele gets half the depth. The extra coverage from errors
          //on the other allele is not included in the model.
          exp_depth_on_allele1 = ((double) depth/2) *
            ( (double)(read_len+(var->len_one_allele)-kmer+1)/read_len) * (1-kmer*seq_err_per_base);
          exp_depth_on_allele2 = exp_depth_on_allele1;
        }
      else if (true_gt==hom_one)
        {
          //hom for the first allele: allele 2 only sees error coverage
          exp_depth_on_allele1 = ((double) depth) *
            ( (double)(read_len+(var->len_one_allele)-kmer+1)/read_len) * (1-kmer*seq_err_per_base);
          //this term must scale with depth, exactly like the mirror-image
          //hom_other case for allele 1 below
          exp_depth_on_allele2 = ((double) depth) *
            ( (double)(read_len+(var->len_one_allele)-kmer+1)/read_len) * kmer*seq_err_per_base/3;
        }
      else if (true_gt==hom_other)
        {
          //hom for the second allele: allele 1 only sees error coverage
          exp_depth_on_allele1 = ((double) depth) *
            ( (double)(read_len+(var->len_other_allele)-kmer+1)/read_len) * kmer*seq_err_per_base/3;// 1/3 of the errors on the true allele are on this one
          exp_depth_on_allele2 = ((double) depth) *
            ( (double)(read_len+(var->len_other_allele)-kmer+1)/read_len) * (1-kmer*seq_err_per_base);
        }

      double exp_depth_on_ref_minus_site = (double) depth * ((double)(len_genome_minus_site-kmer+1)/read_len) * (1-kmer*seq_err_per_base);

      unsigned int sampled_covg_allele1 = gsl_ran_poisson (r, exp_depth_on_allele1);
      unsigned int sampled_covg_allele2 = gsl_ran_poisson (r, exp_depth_on_allele2);
      //the genome coverage is sampled and reported, but not applied to the graph
      unsigned int sampled_covg_rest_of_genome = gsl_ran_poisson (r, exp_depth_on_ref_minus_site);
      printf("Sampled covgs on alleles 1,2 and genome are  %u %u %u\n", sampled_covg_allele1, sampled_covg_allele2, sampled_covg_rest_of_genome);

      update_allele(var->one_allele, var->len_one_allele,
                    colour_indiv, sampled_covg_allele1, read_len-kmer+1);
      update_allele(var->other_allele, var->len_other_allele,
                    colour_indiv, sampled_covg_allele2, read_len-kmer+1);
      //update_allele(genome_minus_site,len_genome_minus_site,  colour_indiv, sampled_covg_rest_of_genome);

      test(var, model_info, fasta, colour_ref_minus_site,
           colour_indiv, using_1and2_nets,
           working_colour1, working_colour2,
           &count_passes, &count_fails, db_graph, true_ml_gt_name);

    }

  //cleanup
  zero_allele(var->one_allele, var->len_one_allele,     colour_indiv, colour_allele1, colour_allele2, colour_ref_minus_site);
  zero_allele(var->other_allele, var->len_other_allele, colour_indiv, colour_allele1, colour_allele2, colour_ref_minus_site);

  CU_ASSERT((double)count_passes/(double)(count_passes+count_fails) > 0.9 );//actually, we could set this to ==1
  printf("Number of passes: %d, number of fails %d\n", count_passes, count_fails);


  gsl_rng_free (r);

}