Esempio n. 1
0
/**
 * Runs Smith-Waterman for every CAL of every target read in the batch and
 * replaces the per-read CAL lists by lists of alignments that pass the
 * minimum score.
 *
 * Steps:
 *   1. batch->num_to_do is moved into batch->num_done and reset to zero.
 *   2. For each target read and each of its CALs a query (the read, reverse
 *      complemented when cal->strand == 1) and a reference (the CAL region
 *      widened by input->flank_length on both sides) are allocated.
 *      The CAL list of the read is then freed and its slot set to NULL.
 *   3. smith_waterman_mqmr() aligns all the pairs at once.
 *   4. Alignments with score >= input->min_score are wrapped in sw_output_t
 *      objects and appended to a fresh mapping list for their read;
 *      batch->targets, batch->num_targets and batch->num_to_do are rebuilt
 *      from the surviving alignments.
 *
 * @param input  Smith-Waterman server parameters (genome, flank length,
 *               minimum score, SW options).
 * @param batch  batch to process; its targets, mapping lists and counters
 *               are updated in place.
 */
void apply_sw(sw_server_input_t* input, aligner_batch_t *batch) {
  int tid = omp_get_thread_num();

  fastq_batch_t *fq_batch = batch->fq_batch;
  genome_t *genome = input->genome_p;
  size_t flank_length = input->flank_length;
  float min_score = input->min_score;
  sw_optarg_t *sw_optarg = &input->sw_optarg;

  size_t num_seqs = batch->num_targets;

  // what was pending becomes "done"; num_to_do is rebuilt while filtering
  batch->num_done = batch->num_to_do;
  batch->num_to_do = 0;

  // number of query/reference slots reserved below
  // NOTE(review): assumes num_to_do equals the total number of CALs over all
  // targets -- confirm against the producer of this batch
  size_t sw_total = batch->num_done;

  sw_multi_output_t *output = sw_multi_output_new(sw_total);
  char *q[sw_total], *r[sw_total];
  uint8_t strands[sw_total], chromosomes[sw_total];
  size_t starts[sw_total];
  size_t sw_count = 0, read_indices[sw_total];

  // initialize query and reference sequences to Smith-Waterman
  for (size_t i = 0; i < num_seqs; i++) {
    size_t index = batch->targets[i];
    array_list_t *cal_list = batch->mapping_lists[index];
    size_t num_cals = array_list_size(cal_list);

    // processing each CAL from this read
    for (size_t j = 0; j < num_cals; j++) {
      cal_t *cal = array_list_get(j, cal_list);
      read_indices[sw_count] = index;

      // query sequence, revcomp if necessary
      int read_len = fq_batch->data_indices[index + 1] - fq_batch->data_indices[index];
      q[sw_count] = calloc((read_len + 1), sizeof(char));
      memcpy(q[sw_count], &(fq_batch->seq[fq_batch->data_indices[index]]), read_len);
      if (cal->strand == 1) {
        seq_reverse_complementary(q[sw_count], read_len);
      }

      // reference sequence: the CAL region plus the flanks; the left flank
      // is clamped at position 0 so the unsigned subtraction cannot wrap
      size_t start;
      if (flank_length >= cal->start) {
        start = 0;
      } else {
        start = cal->start - flank_length;
      }
      size_t end = cal->end + flank_length;
      r[sw_count] = calloc(1, end - start + 2);
      genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
                                        cal->chromosome_id - 1, &start, &end, genome);

      // save some stuff, we'll use them after...
      // (start is stored after the genome call, which receives it by pointer)
      strands[sw_count] = cal->strand;
      chromosomes[sw_count] = cal->chromosome_id;
      starts[sw_count] = start;

      sw_count++;
    }

    // the CALs are no longer needed; the slot is refilled below with the
    // alignments that pass the filter
    array_list_free(cal_list, (void *)cal_free);
    batch->mapping_lists[index] = NULL;
  }

  // run Smith-Waterman on the pairs that were actually prepared
  smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);

  size_t num_targets = 0;
  // filter alignments by min_score
  for (size_t i = 0; i < sw_count; i++) {
    if (output->score_p[i] >= min_score) {
      // valid mapping: insert it in the list of its read for further processing
      size_t index = read_indices[i];
      array_list_t *mapping_list = batch->mapping_lists[index];
      if (mapping_list == NULL) {
        // first valid alignment of this read: create its list and
        // register the read as a target again
        mapping_list = array_list_new(1000,
                                      1.25f,
                                      COLLECTION_MODE_ASYNCHRONIZED);
        array_list_set_flag(0, mapping_list);

        batch->mapping_lists[index] = mapping_list;
        batch->targets[num_targets++] = index;
      }

      // NOTE(review): the argument after the raw score used to be an
      // uninitialized local (its normalisation was commented out); the raw
      // score that was compared against min_score is passed instead --
      // confirm whether downstream code expects a normalised value here
      sw_output_t *sw_output = sw_output_new(strands[i],
                                             chromosomes[i],
                                             starts[i],
                                             strlen(r[i]),
                                             strlen(output->query_map_p[i]),
                                             output->query_start_p[i],
                                             output->ref_start_p[i],
                                             output->score_p[i],
                                             output->score_p[i],
                                             output->query_map_p[i],
                                             output->ref_map_p[i]);
      array_list_insert(sw_output, mapping_list);

      batch->num_to_do++;
    }

    // free query and reference
    free(q[i]);
    free(r[i]);
  }
  batch->num_targets = num_targets;

  // update the per-thread counter of Smith-Waterman runs
  thr_sw_items[tid] += sw_count;

  // free
  sw_multi_output_free(output);
}
int apply_sw_bs(sw_server_input_t* input, batch_t *batch) {

  int sw_3_nucleotides = 0;

  /*
  sw_optarg_t *sw_optarg2 = &input->sw_optarg;

  printf("Matrix Table\n\tA\tC\tG\tT\tN\nA\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nC\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nG\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nT\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nN\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\n\n",
	 sw_optarg2->subst_matrix['A']['A'],
	 sw_optarg2->subst_matrix['C']['A'],
	 sw_optarg2->subst_matrix['G']['A'],
	 sw_optarg2->subst_matrix['T']['A'],
	 sw_optarg2->subst_matrix['N']['A'],

	 sw_optarg2->subst_matrix['A']['C'],
	 sw_optarg2->subst_matrix['C']['C'],
	 sw_optarg2->subst_matrix['G']['C'],
	 sw_optarg2->subst_matrix['T']['C'],
	 sw_optarg2->subst_matrix['N']['C'],

	 sw_optarg2->subst_matrix['A']['G'],
	 sw_optarg2->subst_matrix['C']['G'],
	 sw_optarg2->subst_matrix['G']['G'],
	 sw_optarg2->subst_matrix['T']['G'],
	 sw_optarg2->subst_matrix['N']['G'],

	 sw_optarg2->subst_matrix['A']['T'],
	 sw_optarg2->subst_matrix['C']['T'],
	 sw_optarg2->subst_matrix['G']['T'],
	 sw_optarg2->subst_matrix['T']['T'],
	 sw_optarg2->subst_matrix['N']['T'],

	 sw_optarg2->subst_matrix['A']['N'],
	 sw_optarg2->subst_matrix['C']['N'],
	 sw_optarg2->subst_matrix['G']['N'],
	 sw_optarg2->subst_matrix['T']['N'],
	 sw_optarg2->subst_matrix['N']['N']
	 );
  */

  if (sw_3_nucleotides == 0) {
    apply_sw_bs_4nt(input, batch);
  } else {
    
    //printf("START: apply_sw\n"); 
    int tid = omp_get_thread_num();
    mapping_batch_t *mapping_batch = batch->mapping_batch;
    cal_t *cal = NULL;
    array_list_t *cal_list = NULL, *mapping_list = NULL;
    
    array_list_t *fq_batch = mapping_batch->fq_batch;
    fastq_read_t *fq_read;
    
    // added by PP for bisulfite
    array_list_t *CT_fq_batch = mapping_batch->CT_fq_batch;
    array_list_t *GA_fq_batch = mapping_batch->GA_fq_batch;
    array_list_t *CT_rev_fq_batch = mapping_batch->CT_rev_fq_batch;
    array_list_t *GA_rev_fq_batch = mapping_batch->GA_rev_fq_batch;
    fastq_read_t *fq_read2;
    // end added by PP for bisulfite
    
    size_t start, end;
    size_t start2, end2;
    /*
    genome_t *genome = input->genome_p;
    */
    // added by PP for bisulfite
    genome_t *genome1 = input->genome1_p;
    genome_t *genome2 = input->genome2_p;
    // end added by PP for bisulfite
    
    size_t flank_length = input->flank_length;
    
    // SIMD support for Smith-Waterman
    float score, min_score = input->min_score;
    
    sw_output_t *sw_output;
    
    size_t read_index, num_cals;
    
    size_t num_targets = mapping_batch->num_targets;
    size_t new_num_targets = 0;
    // added by PP for bisulfite
    size_t num_targets2 = mapping_batch->num_targets2;
    size_t new_num_targets2 = 0;
    // added by PP for bisulfite
    
    // added by PP for bisulfite
    size_t sw_total1 = mapping_batch->num_to_do;
    size_t sw_total2 = mapping_batch->num_to_do2;
    size_t sw_total = sw_total1 + sw_total2;
    // end added by PP for bisulfite
    
    // set to zero
    mapping_batch->num_to_do = 0;
    // added by PP for bisulfite
    mapping_batch->num_to_do2 = 0;
    int g[sw_total];
    // end added by PP for bisulfite
    
    sw_optarg_t *sw_optarg = &input->sw_optarg;
    
    sw_multi_output_t *output = sw_multi_output_new(sw_total);
    char *q[sw_total], *r[sw_total];
    uint8_t strands[sw_total], chromosomes[sw_total];
    size_t starts[sw_total];
    size_t sw_count = 0, read_indices[sw_total], sw_count2 = 0;
    int read_len, ref_len, max_ref_len;
    
    //printf("num of sw to do: %i\n", sw_total);
    
    // initialize query and reference sequences to Smith-Waterman
    for (size_t i = 0; i < num_targets; i++) {
      //    printf("sw_server: target #%i of %i\n", i, num_seqs);
      read_index = mapping_batch->targets[i];
      
      // to use with the three nucleotides searches
      fq_read  = (fastq_read_t *) array_list_get(read_index, GA_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, GA_rev_fq_batch);
      
      //printf("read %lu = %s\n", read_index, fq_read->sequence);
      //printf("read %lu = %s\n", read_index, fq_read2->sequence);
      
      //    printf("sw_server: read #%i\n", read_index);
      
      cal_list = mapping_batch->mapping_lists[read_index];
      num_cals = array_list_size(cal_list);
      
      read_len = fq_read->length;
      //    max_ref_len = read_len + (read_len / 2);
      
      //printf("sw_server: num_cals = %i cals\n", num_cals);
      
      // processing each CAL from this read
      for(size_t j = 0; j < num_cals; j++) {
	
	// get cal and read index
	cal = array_list_get(j, cal_list);
	read_indices[sw_count] = read_index;
	
	if (flank_length >= cal->start) {
	  start = 0;
	} else {
	  start = cal->start - flank_length;
	}
	
	end = cal->end + flank_length;
	if (end >= genome1->chr_size[cal->chromosome_id - 1]) {
	  end = genome1->chr_size[cal->chromosome_id - 1] - 1;
	}
	
	ref_len = end - start + 2;
	//      if (ref_len < max_ref_len) {
	
	// query sequence, revcomp if necessary
	q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));
	
	// to use with the three nucleotides searches
	if (cal->strand == 0) {
	  memcpy(q[sw_count], fq_read->sequence, read_len);
	  //seq_reverse_complementary(q[sw_count], read_len);
	} else {
	  memcpy(q[sw_count], fq_read2->sequence, read_len);
	}
	
	//q[sw_count] = &(fq_batch->seq[fq_batch->data_indices[index]]);
	
	// reference sequence
	//printf("\tSW: %d.[chromosome:%d]-[strand:%d]-[start:%d, end:%d]\n", j, cal->chromosome_id, cal->strand, cal->start, cal->end);
	
	r[sw_count] = calloc(1, end - start + 2);
	
	// to use with the three nucleotides searches

	if (cal->strand == 0) {
	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start, &end, genome1);
	} else {

	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start, &end, genome2);

	  /*
	  start2 = genome1->chr_size[cal->chromosome_id - 1] - 1 - end;
	  end2   = genome1->chr_size[cal->chromosome_id - 1] - 1 - start;
	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start2, &end2, genome2);
	  */
	}

	/*
	genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
					  cal->chromosome_id - 1, &start, &end, genome1);
	*/
	
	// save some stuff, we'll use them after...
	strands[sw_count] = cal->strand;
	chromosomes[sw_count] = cal->chromosome_id;
	starts[sw_count] = start;

	/*
	printf("st = %lu\tend = %lu\n", cal->start, cal->end);
	printf("1\nseq %s\ngen %s\nstrand %2lu chromo %lu start %lu end %lu\n",
	       q[sw_count], r[sw_count], cal->strand, cal->chromosome_id, start, end);
	*/
	// increase counter
	sw_count++;
      }
      
      // free cal_list
      array_list_clear(cal_list, (void *) cal_free);
      //    batch->mapping_lists[index] = NULL;
    }
    ////////////////
    sw_count2 = sw_count;
    
    for (size_t i = 0; i < num_targets2; i++) {
      //    printf("sw_server: target #%i of %i\n", i, num_seqs);
      read_index = mapping_batch->targets2[i];
      
      // to use with the three nucleotides searches
      fq_read  = (fastq_read_t *) array_list_get(read_index, CT_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, CT_rev_fq_batch);
      
      //printf("read %lu = %s\n", read_index, fq_read->sequence);
      //printf("read %lu = %s\n", read_index, fq_read2->sequence);
      
      //    printf("sw_server: read #%i\n", read_index);
      
      cal_list = mapping_batch->mapping_lists2[read_index];
      num_cals = array_list_size(cal_list);
      
      read_len = fq_read->length;
      //    max_ref_len = read_len + (read_len / 2);
      
      //printf("sw_server: num_cals = %i cals\n", num_cals);
      
      // processing each CAL from this read
      for(size_t j = 0; j < num_cals; j++) {
	
	// get cal and read index
	cal = array_list_get(j, cal_list);
	read_indices[sw_count] = read_index;
	
	if (flank_length >= cal->start) {
	  start = 0;
	} else {
	  start = cal->start - flank_length;
	}
	
	end = cal->end + flank_length;
	if (end >= genome1->chr_size[cal->chromosome_id - 1]) {
	  end = genome1->chr_size[cal->chromosome_id - 1] - 1;
	}
	
	ref_len = end - start + 2;
	//      if (ref_len < max_ref_len) {
	
	// query sequence, revcomp if necessary
	q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));
	
	// to use with the three nucleotides searches
	if (cal->strand == 0) {
	  memcpy(q[sw_count], fq_read->sequence, read_len);
	  //seq_reverse_complementary(q[sw_count], read_len);
	} else {
	  memcpy(q[sw_count], fq_read2->sequence, read_len);
	}
	
	//q[sw_count] = &(fq_batch->seq[fq_batch->data_indices[index]]);
	
	// reference sequence
	//printf("\tSW: %d.[chromosome:%d]-[strand:%d]-[start:%d, end:%d]\n", j, cal->chromosome_id, cal->strand, cal->start, cal->end);
	
	r[sw_count] = calloc(1, end - start + 2);
	
	// to use with the three nucleotides searches

	if (cal->strand == 0) {
	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start, &end, genome2);
	} else {

	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start, &end, genome1);

	  /*
	  start2 = genome1->chr_size[cal->chromosome_id - 1] - 1 - end;
	  end2   = genome1->chr_size[cal->chromosome_id - 1] - 1 - start;
	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start2, &end2, genome1);
	  */
	}

	/*
	genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
					  cal->chromosome_id - 1, &start, &end, genome2);
	*/
	
	// save some stuff, we'll use them after...
	strands[sw_count] = cal->strand;
	chromosomes[sw_count] = cal->chromosome_id;
	starts[sw_count] = start;
	
	//printf("2\nseq %s\ngen %s\nstrand %2lu chromo %lu start %lu end %lu\n",
	//     q[sw_count], r[sw_count], cal->strand, cal->chromosome_id, start, end);

	// increase counter
	sw_count++;
      }
      
      // free cal_list
      array_list_clear(cal_list, (void *) cal_free);
      //    batch->mapping_lists[index] = NULL;
    }
    
    //printf("before smith_waterman: sw_total = %i, sw_count = %i, sw_count2 = %i\n", sw_total, sw_count, sw_count2);
    
    // run Smith-Waterman
    //  printf("before smith_waterman: sw_total = %i, sw_count = %i\n", sw_total, sw_count);
    smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);
    //  printf("after smith_waterman\n");
    
    
    for (size_t i = 0; i < sw_count; i++) {
      LOG_DEBUG_F("cal: start = %lu, strand = %i\n", starts[i], strands[i]);
      LOG_DEBUG_F("\tquery : %s\n", q[i]); 
      LOG_DEBUG_F("\tref.  : %s\n", r[i]); 
      LOG_DEBUG_F("\tquery map: %s (start: %i)\n", 
		  output->query_map_p[i], output->query_start_p[i]);
      LOG_DEBUG_F("\tref. map : %s (start: %i)\n", 
		  output->ref_map_p[i], output->ref_start_p[i]);
      LOG_DEBUG("\n");
    }
    
    
    //size_t mapp = 0, mapp2 = 0;
    
    
    
    double norm_score;
    // filter alignments by min_score
    for (size_t i = 0; i < sw_count2; i++) {
      
      read_index = read_indices[i];
      fq_read = (fastq_read_t *) array_list_get(read_index, GA_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, GA_rev_fq_batch);
      
      read_len = fq_read->length;
      norm_score = NORM_SCORE(output->score_p[i], read_len, input->match);
      
      if (norm_score >= min_score) {
	// valid mappings, 
	//insert in the list for further processing
	mapping_list = mapping_batch->mapping_lists[read_index];
	array_list_set_flag(0, mapping_list);
	
	if (array_list_size(mapping_list) == 0) {
	  mapping_batch->targets[new_num_targets++] = read_index;
	  
	  //mapp++;
	}
	
	sw_output = sw_output_new(strands[i],
				  chromosomes[i],
				  starts[i],
				  strlen(r[i]),
				  strlen(output->query_map_p[i]),
				  output->query_start_p[i],
				  output->ref_start_p[i],
				  output->score_p[i],
				  norm_score,
				  output->query_map_p[i],
				  output->ref_map_p[i]);
	array_list_insert(sw_output, mapping_list);
	
	mapping_batch->num_to_do++;
	
      }
      
      // free query and reference
      free(q[i]);
      free(r[i]);
    }
    mapping_batch->num_targets = new_num_targets;
    
    for (size_t i = sw_count2; i < sw_count; i++) {
      
      read_index = read_indices[i];
      fq_read = (fastq_read_t *) array_list_get(read_index, CT_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, CT_rev_fq_batch);
      
      read_len = fq_read->length;
      norm_score = NORM_SCORE(output->score_p[i], read_len, input->match);
      
      if (norm_score >= min_score) {
	// valid mappings, 
	//insert in the list for further processing
	mapping_list = mapping_batch->mapping_lists2[read_index];
	array_list_set_flag(0, mapping_list);
	
	if (array_list_size(mapping_list) == 0) {
	  mapping_batch->targets2[new_num_targets2++] = read_index;
	  
	  //mapp2++;
	}
	
	sw_output = sw_output_new(strands[i],
				  chromosomes[i],
				  starts[i],
				  strlen(r[i]),
				  strlen(output->query_map_p[i]),
				  output->query_start_p[i],
				  output->ref_start_p[i],
				  output->score_p[i],
				  norm_score,
				  output->query_map_p[i],
				  output->ref_map_p[i]);
	array_list_insert(sw_output, mapping_list);
	
	mapping_batch->num_to_do2++;
	
      }
      
      // free query and reference
      free(q[i]);
      free(r[i]);
    }
    mapping_batch->num_targets2 = new_num_targets2;
    
    // update counter
    //  thr_sw_items[tid] += sw_count;
    
    // free
    sw_multi_output_free(output);
    
    // go to the next stage
    
    /*
      printf("3 SW1         \t%3lu\tmapp               \t%3lu\tno map (discard) \t%3lu\n", 
      num_targets, mapp, num_targets - mapp);
      printf("3 SW2         \t%3lu\tmapp               \t%3lu\tno map (discard) \t%3lu\n", 
      num_targets2, mapp2, num_targets2 - mapp2);
    */
    
    //printf("END: apply_sw, (%d Smith-Waterman)\n", sw_total);
    
  }
  
  //return CONSUMER_STAGE;
  return BS_POST_PAIR_STAGE;
  
  //  printf("END: apply_sw, (%d Smith-Waterman, %d valids)\n", total, valids);
}
Esempio n. 3
0
void fill_gaps(mapping_batch_t *mapping_batch, sw_optarg_t *sw_optarg, 
	       genome_t *genome, int min_gap, int min_distance) {

  int sw_count = 0;

  fastq_read_t *read;
  array_list_t *fq_batch = mapping_batch->fq_batch;

  size_t read_index, read_len;

  cal_t *cal;
  array_list_t *cal_list = NULL;
  size_t num_cals, num_targets = mapping_batch->num_targets;

  char *revcomp_seq = NULL;

  seed_region_t *s, *prev_s, *new_s;
  linked_list_iterator_t* itr;

  cigar_code_t *cigar_code;

  size_t start, end;
  size_t gap_read_start, gap_read_end, gap_read_len;
  size_t gap_genome_start, gap_genome_end, gap_genome_len;

  int left_flank, right_flank;
  sw_prepare_t *sw_prepare;
  array_list_t *sw_prepare_list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  char *query,  *ref;
  int distance, first = 0, last = 0;

  //  LOG_DEBUG("\n\n P R E   -   P R O C E S S\n");

  // initialize query and reference sequences to Smith-Waterman
  for (size_t i = 0; i < num_targets; i++) {

    read_index = mapping_batch->targets[i];
    read = (fastq_read_t *) array_list_get(read_index, fq_batch);
    
    cal_list = mapping_batch->mapping_lists[read_index];
    num_cals = array_list_size(cal_list);
    
    if (num_cals <= 0) continue;

    read_len = read->length;

    min_distance = read_len*0.2;

    LOG_DEBUG_F(">>>>> read %s\n", read->id);
    //    printf(">>>>> read %s\n", read->id);

    // processing each CAL from this read
    for(size_t j = 0; j < num_cals; j++) {

      // get cal and read index
      cal = array_list_get(j, cal_list);
      LOG_DEBUG_F("CAL #%i of %i (strand %i), sr_list size = %i, sr_duplicate_list size = %i\n", 
		  j, num_cals, cal->strand, cal->sr_list->size, cal->sr_duplicate_list->size);

      prev_s = NULL;
      itr = linked_list_iterator_new(cal->sr_list);
      s = (seed_region_t *) linked_list_iterator_curr(itr);
      while (s != NULL) {
	{
	  // for debugging
	  size_t start = s->genome_start;// + 1;
	  size_t end = s->genome_end;// + 1;
	  size_t len = end - start + 1;
	  //	  printf(":::::::::: %lu - %lu = %i ::::::::::::\n", end, start, len );
	  char *ref = (char *) malloc((len + 1) * sizeof(char));
	  genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
					    &start, &end, genome);
	  ref[len] = '\0';
	  //
	  LOG_DEBUG_F("\tseed: [%i|%i - %i|%i] %s (len = %i)\n", 
		      s->genome_start, s->read_start, s->read_end, s->genome_end, ref, len);
	  free(ref);
	}

	// set the cigar for the current region
	gap_read_len = s->read_end - s->read_start + 1;
	cigar_code = cigar_code_new();
	cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
	s->info = (void *) cigar_code;

	cigar_code = NULL;
	sw_prepare = NULL;

	if ((prev_s == NULL && s->read_start != 0) || (prev_s != NULL)) {
	  distance = 0;
	  mapping_batch->num_gaps++;
	  if (prev_s == NULL) {
	    // gap at the first position
	    gap_read_start = 0;
	    gap_read_end = s->read_start - 1;

	    gap_genome_start = s->genome_start - s->read_start;
	    gap_genome_end = s->genome_start - 1;

	    gap_read_len = gap_read_end - gap_read_start + 1;
	    gap_genome_len = gap_genome_end - gap_genome_start + 1;

	    cal->start = gap_genome_start;

	    assert(gap_read_len != 0);
	    assert(gap_genome_len != 0);

	    if (gap_read_len > min_gap) {
	      // the gap is too big, may be there's another CAL to cover it
	      cigar_code = cigar_code_new();
	      cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code);	      
	    } else {
	      left_flank = 0;
	      right_flank = DOUBLE_FLANK;
	    }
	  } else {
	    assert(prev_s->read_end < s->read_start);

	    // gap in a middle position
	    gap_read_start = prev_s->read_end + 1;
	    gap_read_end = s->read_start - 1;

	    gap_genome_start = prev_s->genome_end + 1;
	    gap_genome_end = s->genome_start - 1;

	    gap_read_len = gap_read_end - gap_read_start + 1;
	    gap_genome_len = gap_genome_end - gap_genome_start + 1;

	    LOG_DEBUG_F("gap (read, genome) = (%i, %i)\n", gap_read_len, gap_genome_len);

	    if (gap_genome_len == 0) { printf("#@#: %s\n", read->id); }
	    assert(gap_genome_len != 0);

	    if (gap_read_len == 0) {
	      // there's a deletion just between two consecutives seeds
	      cigar_code = (cigar_code_t *)prev_s->info;

	      cigar_code_append_op(cigar_op_new(gap_genome_len, 'D'), cigar_code);
	      cigar_code->distance += gap_genome_len;

	      cigar_code_append_op(cigar_op_new(s->read_end - s->read_start + 1, 'M'), cigar_code);
	      cigar_code->distance += ((cigar_code_t *)s->info)->distance;

	      prev_s->read_end = s->read_end;
	      prev_s->genome_end = s->genome_end;

	      LOG_DEBUG_F("prev cigar = %s\n", new_cigar_code_string((cigar_code_t *)prev_s->info));

	      // continue loop...
	      linked_list_iterator_remove(itr);
	      s = linked_list_iterator_curr(itr);
	      continue;
	    }
	      
	    left_flank = SINGLE_FLANK;
	    right_flank = SINGLE_FLANK;
	  }

	  if (!cigar_code) {
	    // we have to try to fill this gap and get a cigar
	    if (gap_read_len == gap_genome_len) {
	      //    1) first, for from  begin -> end, and begin <- end
	      start = gap_genome_start;// + 1;
	      end = gap_genome_end;// + 1;
	      first = -1;
	      last = -1;
	      ref = (char *) malloc((gap_genome_len + 5) * sizeof(char));
	      genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
						&start, &end, genome);
	      // handle strand -
	      if (cal->strand) {
		if (revcomp_seq == NULL) {
		  revcomp_seq = strdup(read->sequence);
		  seq_reverse_complementary(revcomp_seq, read_len);
		}
		query = &revcomp_seq[gap_read_start];
	      } else {
		query = &read->sequence[gap_read_start];
	      }
	      
	      for (int k = 0; k < gap_read_len; k++) {
		if (query[k] != ref[k]) {
		  distance++;
		  if (first == -1) first = k;
		  last = k;
		}
	      }

	      if (distance < min_distance) {
		cigar_code = cigar_code_new();
		cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
		cigar_code_inc_distance(distance, cigar_code);
	      }
	    }
	    if (!cigar_code) {
	      //    2) second, prepare SW to run

	      // get query sequence, revcomp if necessary
	      size_t read_start = gap_read_start - left_flank;
	      size_t read_end = gap_read_end + right_flank;
	      int gap_read_len_ex = read_end - read_start + 1;
	      query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char));
	      // handle strand -
	      if (cal->strand) {
		if (revcomp_seq == NULL) {
		  revcomp_seq = strdup(read->sequence);
		  seq_reverse_complementary(revcomp_seq, read_len);
		}
		memcpy(query, &revcomp_seq[read_start], gap_read_len_ex);
	      } else {
		memcpy(query, &read->sequence[read_start], gap_read_len_ex);
	      }
	      query[gap_read_len_ex] = '\0';
	      
	      // get ref. sequence
	      size_t genome_start = gap_genome_start - left_flank;// + 1;
	      size_t genome_end = gap_genome_end + right_flank;// + 1;
	      int gap_genome_len_ex = genome_end - genome_start + 1;
	      ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));;
	      genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
						&genome_start, &genome_end, genome);	      
	      ref[gap_genome_len_ex] = '\0';

	      if (prev_s == NULL) {
		sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, FIRST_SW);
	      } else {
		sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, MIDDLE_SW);
	      }

	      array_list_insert(sw_prepare, sw_prepare_list);
	      
	      // increase counter
	      sw_count++;	  

	      LOG_DEBUG_F("query: %s\n", query);
	      LOG_DEBUG_F("ref  : %s\n", ref);
	      LOG_DEBUG_F("dist.: %i (min. %i) of %i (first = %i, last = %i)\n", 
			  distance, min_distance, gap_read_len, first, last);
	      LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", 
			  gap_read_start, gap_read_end, gap_genome_start, gap_genome_end,
			  gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1, 
			  read->id);

	    }
	  }
	  
	  // insert gap in the list
	  new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0);
	  new_s->info = (void *) cigar_code;
	  linked_list_iterator_insert(new_s, itr);

	  if (sw_prepare) {
	    sw_prepare->seed_region = new_s;
	    sw_prepare->cal = cal;
	    sw_prepare->read = read;
	  }
	}

	// continue loop...
	prev_s = s;
	linked_list_iterator_next(itr);
	s = linked_list_iterator_curr(itr);
      }

      // check for a gap at the last position
      sw_prepare = NULL;
      if (prev_s != NULL && prev_s->read_end < read_len - 1) { 
	cigar_code = NULL;
	mapping_batch->num_gaps++;
	//	mapping_batch->num_sws++;
	//	mapping_batch->num_ext_sws++;

	// gap at the last position
	gap_read_start = prev_s->read_end + 1;
	gap_read_end = read_len - 1;
	gap_read_len = gap_read_end - gap_read_start + 1;

	assert(gap_read_len != 0);

	gap_genome_len = gap_read_len;
	gap_genome_start = prev_s->genome_end + 1;
	gap_genome_end = gap_genome_start + gap_genome_len - 1;

	cal->end = gap_genome_end;

	assert(gap_genome_len != 0);

	//	LOG_DEBUG_F("\t\tgap_read_len = %i, gap_genome_len = %i\n", gap_read_len, gap_genome_len);
	//	LOG_DEBUG_F("\t\t%i : [%lu|%lu - %lu|%lu]\n", 
	//		    sw_count, gap_genome_start, gap_read_start, gap_read_end, gap_genome_end);

	if (gap_read_len > min_gap) {
	  // the gap is too big, may be there's another CAL to cover it
	  cigar_code = cigar_code_new();
	  cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code);	      
	} else {
	  // we have to try to fill this gap and get a cigar
	  
	  //    1) first, for from  begin -> end, and begin <- end
	  start = gap_genome_start;// + 1;
	  end = gap_genome_end;// + 1;
	  first = -1;
	  last = -1;
	  ref = (char *) malloc((gap_genome_len + 1) * sizeof(char));;
	  genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
					    &start, &end, genome);
	  // handle strand -
	  if (cal->strand) {
	    if (revcomp_seq == NULL) {
	      revcomp_seq = strdup(read->sequence);
	      seq_reverse_complementary(revcomp_seq, read_len);
	    }
	    query = &revcomp_seq[gap_read_start];
	  } else {
	    query = &read->sequence[gap_read_start];
	  }
	  
	  distance = 0;
	  for (int k = 0; k < gap_read_len; k++) {
	    if (query[k] != ref[k]) {
	      distance++;
	      if (first == -1) first = k;
	      last = k;
	    }
	  }
	  if (distance < min_distance) {
	    cigar_code = cigar_code_new();
	    cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
	    cigar_code_inc_distance(distance, cigar_code);
	  } else {
	    //    2) second, prepare SW to run

	    left_flank = DOUBLE_FLANK;
	    right_flank = 0;
	    
	    // get query sequence, revcomp if necessary
	    size_t read_start = gap_read_start - left_flank;
	    size_t read_end = gap_read_end + right_flank;
	    int gap_read_len_ex = read_end - read_start + 1;
	    query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char));
	    // handle strand -
	    if (cal->strand) {
	      if (revcomp_seq == NULL) {
		revcomp_seq = strdup(read->sequence);
		seq_reverse_complementary(revcomp_seq, read_len);
	      }
	      memcpy(query, &revcomp_seq[read_start], gap_read_len_ex);
	    } else {
	      memcpy(query, &read->sequence[read_start], gap_read_len_ex);
	    }
	    query[gap_read_len_ex] = '\0';
	    
	    // get ref. sequence
	    size_t genome_start = gap_genome_start - left_flank;// + 1;
	    size_t genome_end = gap_genome_end + right_flank;// + 1;
	    int gap_genome_len_ex = genome_end - genome_start + 1;
	    ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));;
	    genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
					      &genome_start, &genome_end, genome);
	    query[gap_genome_len_ex] = '\0';

	    sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, LAST_SW);
	    array_list_insert(sw_prepare, sw_prepare_list);
	    
	    // increase counter
	    sw_count++;	  

	    LOG_DEBUG_F("query: %s\n", query);
	    LOG_DEBUG_F("ref  : %s\n", ref);
	    LOG_DEBUG_F("dist.: %i (min. %i) of %i (first = %i, last = %i)\n", 
			distance, min_distance, gap_read_len, first, last);
	    LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", 
			gap_read_start, gap_read_end, gap_genome_start, gap_genome_end,
			gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1, 
			read->id);
	  }
	}
	
	// insert gap in the list
	new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0);
	new_s->info = (void *) cigar_code;
	linked_list_insert_last(new_s, cal->sr_list);

	if (sw_prepare) {
	  sw_prepare->seed_region = new_s;
	  sw_prepare->cal = cal;
	  sw_prepare->read = read;
	}
      }
      linked_list_iterator_free(itr);      
    }

    // free memory
    if (revcomp_seq) {
      free(revcomp_seq);
      revcomp_seq = NULL;
    }
  }

  //  display_sr_lists("ATER pre-process in fill_gaps", mapping_batch);

  LOG_DEBUG_F("\nR U N   S W (sw_count = %i, sw_prepare_list size = %i)\n", sw_count, array_list_size(sw_prepare_list));
  assert(sw_count == array_list_size(sw_prepare_list));

  char *q[sw_count], *r[sw_count];
  for (int i = 0; i < sw_count; i++) {
    sw_prepare = array_list_get(i, sw_prepare_list);
    q[i] = sw_prepare->query;
    r[i] = sw_prepare->ref;
  }
  sw_multi_output_t *output = sw_multi_output_new(sw_count);

  // run Smith-Waterman
  smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);
  
  LOG_DEBUG("P O S T   -   P R O C E S S\n");
  cigar_op_t* cigar_op;
  for (int i = 0; i < sw_count; i++) {
    sw_prepare = array_list_get(i, sw_prepare_list);
    s = sw_prepare->seed_region;

    int read_gap_len = s->read_end - s->read_start + 1;
    int genome_gap_len = s->genome_end - s->genome_start + 1;

    int read_gap_len_ex = read_gap_len_ex + sw_prepare->left_flank + sw_prepare->right_flank;
    int genome_gap_len_ex = genome_gap_len_ex + sw_prepare->left_flank + sw_prepare->right_flank;

    LOG_DEBUG_F("\tgap (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", 
		s->read_start, s->read_end, s->genome_start, s->genome_end,
		read_gap_len, genome_gap_len, sw_prepare->read->id);
    LOG_DEBUG_F("\tflanks (left, right) = (%i, %i)\n", sw_prepare->left_flank, sw_prepare->right_flank);
    LOG_DEBUG_F("\tquery : %s\n", sw_prepare->query);
    LOG_DEBUG_F("\tref   : %s\n", sw_prepare->ref);
    LOG_DEBUG_F("\tmquery: %s (start %i)\n", output->query_map_p[i], output->query_start_p[i]);
    LOG_DEBUG_F("\tmref  : %s (start %i)\n", output->ref_map_p[i], output->ref_start_p[i]);

    cigar_code_t *cigar_c = generate_cigar_code(output->query_map_p[i], output->ref_map_p[i],
						strlen(output->query_map_p[i]), output->query_start_p[i],
						output->ref_start_p[i], read_gap_len, genome_gap_len,
						&distance, sw_prepare->ref_type);
    LOG_DEBUG_F("\tscore : %0.2f, cigar: %s (distance = %i)\n", 
		output->score_p[i], new_cigar_code_string(cigar_c), distance);

    /*
    if (output->query_start_p[i] > 0 && output->ref_start_p[i] > 0 && 
	output->query_start_p[i] != output->ref_start_p[i]) { 
      LOG_DEBUG("both map start points > 0 and are different lengths");
      exit(-1);
    }
    */
    //    assert(output->query_start_p[i] == 0);
    //    assert(output->ref_start_p[i] == 0);

    cigar_op = cigar_code_get_op(0, cigar_c);
    if (cigar_op) {
      if (cigar_op->name == 'H') {
	if (output->ref_start_p[i] == 0) { 
	  cigar_op->name = 'I';
	} else {
	  cigar_op->name = 'M';
	}
      } else if (cigar_op->name == '=') cigar_op->name = 'M';
    }

    cigar_op = cigar_code_get_last_op(cigar_c);
    if (cigar_op && cigar_op->name == 'H') cigar_op->name = 'I';

    LOG_DEBUG_F("gap_read_len = %i, cigar_code_length (%s) = %i\n", 
		read_gap_len, new_cigar_code_string(cigar_c), cigar_code_nt_length(cigar_c));
    assert(read_gap_len == cigar_code_nt_length(cigar_c));

    /*
    if (cigar_code_get_num_ops(cigar_c) > 2) {
      if (sw_prepare->left_flank > 0) {
	cigar_op = cigar_code_get_op(0, cigar_c);
	assert(cigar_op->number >= sw_prepare->left_flank && cigar_op->name == 'M');
	cigar_op->number -= sw_prepare->left_flank;
      }
      if (sw_prepare->right_flank > 0) {
	cigar_op = cigar_code_get_last_op(cigar_c);
	assert(cigar_op->number >= sw_prepare->right_flank && cigar_op->name == 'M');
	cigar_op->number -= sw_prepare->right_flank;
      }
      init_cigar_string(cigar_c);
      LOG_DEBUG_F("\tnew cigar: %s\n", new_cigar_code_string(cigar_c));
    } else {
      assert(cigar_code_get_num_ops(cigar_c) == 1);
      if (sw_prepare->right_flank > 0) {
	cigar_op = cigar_code_get_last_op(cigar_c);
	assert(cigar_op->number >= sw_prepare->right_flank && cigar_op->name == 'M');
	cigar_op->number -= (sw_prepare->left_flank + sw_prepare->right_flank);
	if (cigar_op->number > read_gap_len) {
	  cigar_code_append_op(cigar_op_new(cigar_op->number - read_gap_len, 'D'), cigar_c);
	} else if (cigar_op->number < read_gap_len) {
	  cigar_code_append_op(cigar_op_new(read_gap_len - cigar_op->number, 'I'), cigar_c);
	} else{
	  init_cigar_string(cigar_c);
	}
	//	LOG_DEBUG_F("\tnew cigar: %s\n", new_cigar_code_string(cigar_c));
      }
    }
    */
    // and now set the cigar for this gap
    s->info = (void *) cigar_c;

    // free
    sw_prepare_free(sw_prepare);
  }

  display_sr_lists("END of fill_gaps", mapping_batch);
    
  // free memory
  sw_multi_output_free(output);
  array_list_free(sw_prepare_list, (void *) NULL);
}