Exemplo n.º 1
0
int apply_bwt(bwt_server_input_t* input, batch_t *batch) {
  //printf("APPLY BWT SERVER...\n");



  mapping_batch_t *mapping_batch = batch->mapping_batch;
  size_t num_reads = array_list_size(mapping_batch->fq_batch);
  size_t num_mappings;
  array_list_t *list;
  size_t *unmapped_indices = mapping_batch->targets;
  size_t num_unmapped = 0;
  size_t num_anchors;

  for (int i = 0; i < num_reads; i++) {
    fastq_read_t *read = array_list_get(i, mapping_batch->fq_batch);
    //printf("BWT: %s\n", read->id);
    list = mapping_batch->mapping_lists[i];    
    array_list_set_flag(0, list);
    num_mappings = bwt_map_inexact_read(read,
					input->bwt_optarg_p,
					input->bwt_index_p,
					list);
    if (array_list_get_flag(list) != 2) { //If flag 2, the read exceded the max number of mappings
      if (array_list_get_flag(list) == 1) {
	if (num_mappings > 0) {
	  num_anchors = bwt_search_pair_anchors(list, read->length);	
	  if (num_anchors == 0) {
	    array_list_set_flag(NOT_ANCHORS, list);
	  } 
	} else {
	  array_list_set_flag(NOT_ANCHORS, list);
	}
	//printf("tot anchors found %i %s\n", num_anchors, read->id);
	unmapped_indices[num_unmapped++] = i;
      } else if (num_mappings <= 0) {
	array_list_set_flag(0, list);
	//printf("Read NO Mapped %i %s\n", num_anchors, read->id);
	unmapped_indices[num_unmapped++] = i;
      }
    } else {
      array_list_set_flag(ALIGNMENTS_EXCEEDED, list);
    }
  }
  // array_list flag: 0 -> Not  BWT Anchors found
  //                  1 -> One  BWT Anchors found
  //                  2 -> Pair BWT Anchors found
  //                  3 -> Alignments found
  //                  4 -> Alignments exceded
  
  mapping_batch->num_targets = num_unmapped;
    
  if (batch->mapping_batch->num_targets > 0) {
    return CAL_STAGE;
  }

  return DNA_POST_PAIR_STAGE;
}
Exemplo n.º 2
0
int apply_sw_bs(sw_server_input_t* input, batch_t *batch) {

  int sw_3_nucleotides = 0;

  /*
  sw_optarg_t *sw_optarg2 = &input->sw_optarg;

  printf("Matrix Table\n\tA\tC\tG\tT\tN\nA\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nC\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nG\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nT\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nN\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\n\n",
	 sw_optarg2->subst_matrix['A']['A'],
	 sw_optarg2->subst_matrix['C']['A'],
	 sw_optarg2->subst_matrix['G']['A'],
	 sw_optarg2->subst_matrix['T']['A'],
	 sw_optarg2->subst_matrix['N']['A'],

	 sw_optarg2->subst_matrix['A']['C'],
	 sw_optarg2->subst_matrix['C']['C'],
	 sw_optarg2->subst_matrix['G']['C'],
	 sw_optarg2->subst_matrix['T']['C'],
	 sw_optarg2->subst_matrix['N']['C'],

	 sw_optarg2->subst_matrix['A']['G'],
	 sw_optarg2->subst_matrix['C']['G'],
	 sw_optarg2->subst_matrix['G']['G'],
	 sw_optarg2->subst_matrix['T']['G'],
	 sw_optarg2->subst_matrix['N']['G'],

	 sw_optarg2->subst_matrix['A']['T'],
	 sw_optarg2->subst_matrix['C']['T'],
	 sw_optarg2->subst_matrix['G']['T'],
	 sw_optarg2->subst_matrix['T']['T'],
	 sw_optarg2->subst_matrix['N']['T'],

	 sw_optarg2->subst_matrix['A']['N'],
	 sw_optarg2->subst_matrix['C']['N'],
	 sw_optarg2->subst_matrix['G']['N'],
	 sw_optarg2->subst_matrix['T']['N'],
	 sw_optarg2->subst_matrix['N']['N']
	 );
  */

  if (sw_3_nucleotides == 0) {
    apply_sw_bs_4nt(input, batch);
  } else {
    
    //printf("START: apply_sw\n"); 
    int tid = omp_get_thread_num();
    mapping_batch_t *mapping_batch = batch->mapping_batch;
    cal_t *cal = NULL;
    array_list_t *cal_list = NULL, *mapping_list = NULL;
    
    array_list_t *fq_batch = mapping_batch->fq_batch;
    fastq_read_t *fq_read;
    
    // added by PP for bisulfite
    array_list_t *CT_fq_batch = mapping_batch->CT_fq_batch;
    array_list_t *GA_fq_batch = mapping_batch->GA_fq_batch;
    array_list_t *CT_rev_fq_batch = mapping_batch->CT_rev_fq_batch;
    array_list_t *GA_rev_fq_batch = mapping_batch->GA_rev_fq_batch;
    fastq_read_t *fq_read2;
    // end added by PP for bisulfite
    
    size_t start, end;
    size_t start2, end2;
    /*
    genome_t *genome = input->genome_p;
    */
    // added by PP for bisulfite
    genome_t *genome1 = input->genome1_p;
    genome_t *genome2 = input->genome2_p;
    // end added by PP for bisulfite
    
    size_t flank_length = input->flank_length;
    
    // SIMD support for Smith-Waterman
    float score, min_score = input->min_score;
    
    sw_output_t *sw_output;
    
    size_t read_index, num_cals;
    
    size_t num_targets = mapping_batch->num_targets;
    size_t new_num_targets = 0;
    // added by PP for bisulfite
    size_t num_targets2 = mapping_batch->num_targets2;
    size_t new_num_targets2 = 0;
    // added by PP for bisulfite
    
    // added by PP for bisulfite
    size_t sw_total1 = mapping_batch->num_to_do;
    size_t sw_total2 = mapping_batch->num_to_do2;
    size_t sw_total = sw_total1 + sw_total2;
    // end added by PP for bisulfite
    
    // set to zero
    mapping_batch->num_to_do = 0;
    // added by PP for bisulfite
    mapping_batch->num_to_do2 = 0;
    int g[sw_total];
    // end added by PP for bisulfite
    
    sw_optarg_t *sw_optarg = &input->sw_optarg;
    
    sw_multi_output_t *output = sw_multi_output_new(sw_total);
    char *q[sw_total], *r[sw_total];
    uint8_t strands[sw_total], chromosomes[sw_total];
    size_t starts[sw_total];
    size_t sw_count = 0, read_indices[sw_total], sw_count2 = 0;
    int read_len, ref_len, max_ref_len;
    
    //printf("num of sw to do: %i\n", sw_total);
    
    // initialize query and reference sequences to Smith-Waterman
    for (size_t i = 0; i < num_targets; i++) {
      //    printf("sw_server: target #%i of %i\n", i, num_seqs);
      read_index = mapping_batch->targets[i];
      
      // to use with the three nucleotides searches
      fq_read  = (fastq_read_t *) array_list_get(read_index, GA_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, GA_rev_fq_batch);
      
      //printf("read %lu = %s\n", read_index, fq_read->sequence);
      //printf("read %lu = %s\n", read_index, fq_read2->sequence);
      
      //    printf("sw_server: read #%i\n", read_index);
      
      cal_list = mapping_batch->mapping_lists[read_index];
      num_cals = array_list_size(cal_list);
      
      read_len = fq_read->length;
      //    max_ref_len = read_len + (read_len / 2);
      
      //printf("sw_server: num_cals = %i cals\n", num_cals);
      
      // processing each CAL from this read
      for(size_t j = 0; j < num_cals; j++) {
	
	// get cal and read index
	cal = array_list_get(j, cal_list);
	read_indices[sw_count] = read_index;
	
	if (flank_length >= cal->start) {
	  start = 0;
	} else {
	  start = cal->start - flank_length;
	}
	
	end = cal->end + flank_length;
	if (end >= genome1->chr_size[cal->chromosome_id - 1]) {
	  end = genome1->chr_size[cal->chromosome_id - 1] - 1;
	}
	
	ref_len = end - start + 2;
	//      if (ref_len < max_ref_len) {
	
	// query sequence, revcomp if necessary
	q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));
	
	// to use with the three nucleotides searches
	if (cal->strand == 0) {
	  memcpy(q[sw_count], fq_read->sequence, read_len);
	  //seq_reverse_complementary(q[sw_count], read_len);
	} else {
	  memcpy(q[sw_count], fq_read2->sequence, read_len);
	}
	
	//q[sw_count] = &(fq_batch->seq[fq_batch->data_indices[index]]);
	
	// reference sequence
	//printf("\tSW: %d.[chromosome:%d]-[strand:%d]-[start:%d, end:%d]\n", j, cal->chromosome_id, cal->strand, cal->start, cal->end);
	
	r[sw_count] = calloc(1, end - start + 2);
	
	// to use with the three nucleotides searches

	if (cal->strand == 0) {
	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start, &end, genome1);
	} else {

	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start, &end, genome2);

	  /*
	  start2 = genome1->chr_size[cal->chromosome_id - 1] - 1 - end;
	  end2   = genome1->chr_size[cal->chromosome_id - 1] - 1 - start;
	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start2, &end2, genome2);
	  */
	}

	/*
	genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
					  cal->chromosome_id - 1, &start, &end, genome1);
	*/
	
	// save some stuff, we'll use them after...
	strands[sw_count] = cal->strand;
	chromosomes[sw_count] = cal->chromosome_id;
	starts[sw_count] = start;

	/*
	printf("st = %lu\tend = %lu\n", cal->start, cal->end);
	printf("1\nseq %s\ngen %s\nstrand %2lu chromo %lu start %lu end %lu\n",
	       q[sw_count], r[sw_count], cal->strand, cal->chromosome_id, start, end);
	*/
	// increase counter
	sw_count++;
      }
      
      // free cal_list
      array_list_clear(cal_list, (void *) cal_free);
      //    batch->mapping_lists[index] = NULL;
    }
    ////////////////
    sw_count2 = sw_count;
    
    for (size_t i = 0; i < num_targets2; i++) {
      //    printf("sw_server: target #%i of %i\n", i, num_seqs);
      read_index = mapping_batch->targets2[i];
      
      // to use with the three nucleotides searches
      fq_read  = (fastq_read_t *) array_list_get(read_index, CT_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, CT_rev_fq_batch);
      
      //printf("read %lu = %s\n", read_index, fq_read->sequence);
      //printf("read %lu = %s\n", read_index, fq_read2->sequence);
      
      //    printf("sw_server: read #%i\n", read_index);
      
      cal_list = mapping_batch->mapping_lists2[read_index];
      num_cals = array_list_size(cal_list);
      
      read_len = fq_read->length;
      //    max_ref_len = read_len + (read_len / 2);
      
      //printf("sw_server: num_cals = %i cals\n", num_cals);
      
      // processing each CAL from this read
      for(size_t j = 0; j < num_cals; j++) {
	
	// get cal and read index
	cal = array_list_get(j, cal_list);
	read_indices[sw_count] = read_index;
	
	if (flank_length >= cal->start) {
	  start = 0;
	} else {
	  start = cal->start - flank_length;
	}
	
	end = cal->end + flank_length;
	if (end >= genome1->chr_size[cal->chromosome_id - 1]) {
	  end = genome1->chr_size[cal->chromosome_id - 1] - 1;
	}
	
	ref_len = end - start + 2;
	//      if (ref_len < max_ref_len) {
	
	// query sequence, revcomp if necessary
	q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));
	
	// to use with the three nucleotides searches
	if (cal->strand == 0) {
	  memcpy(q[sw_count], fq_read->sequence, read_len);
	  //seq_reverse_complementary(q[sw_count], read_len);
	} else {
	  memcpy(q[sw_count], fq_read2->sequence, read_len);
	}
	
	//q[sw_count] = &(fq_batch->seq[fq_batch->data_indices[index]]);
	
	// reference sequence
	//printf("\tSW: %d.[chromosome:%d]-[strand:%d]-[start:%d, end:%d]\n", j, cal->chromosome_id, cal->strand, cal->start, cal->end);
	
	r[sw_count] = calloc(1, end - start + 2);
	
	// to use with the three nucleotides searches

	if (cal->strand == 0) {
	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start, &end, genome2);
	} else {

	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start, &end, genome1);

	  /*
	  start2 = genome1->chr_size[cal->chromosome_id - 1] - 1 - end;
	  end2   = genome1->chr_size[cal->chromosome_id - 1] - 1 - start;
	  genome_read_sequence_by_chr_index(r[sw_count], 0,
					    cal->chromosome_id - 1, &start2, &end2, genome1);
	  */
	}

	/*
	genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
					  cal->chromosome_id - 1, &start, &end, genome2);
	*/
	
	// save some stuff, we'll use them after...
	strands[sw_count] = cal->strand;
	chromosomes[sw_count] = cal->chromosome_id;
	starts[sw_count] = start;
	
	//printf("2\nseq %s\ngen %s\nstrand %2lu chromo %lu start %lu end %lu\n",
	//     q[sw_count], r[sw_count], cal->strand, cal->chromosome_id, start, end);

	// increase counter
	sw_count++;
      }
      
      // free cal_list
      array_list_clear(cal_list, (void *) cal_free);
      //    batch->mapping_lists[index] = NULL;
    }
    
    //printf("before smith_waterman: sw_total = %i, sw_count = %i, sw_count2 = %i\n", sw_total, sw_count, sw_count2);
    
    // run Smith-Waterman
    //  printf("before smith_waterman: sw_total = %i, sw_count = %i\n", sw_total, sw_count);
    smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);
    //  printf("after smith_waterman\n");
    
    
    for (size_t i = 0; i < sw_count; i++) {
      LOG_DEBUG_F("cal: start = %lu, strand = %i\n", starts[i], strands[i]);
      LOG_DEBUG_F("\tquery : %s\n", q[i]); 
      LOG_DEBUG_F("\tref.  : %s\n", r[i]); 
      LOG_DEBUG_F("\tquery map: %s (start: %i)\n", 
		  output->query_map_p[i], output->query_start_p[i]);
      LOG_DEBUG_F("\tref. map : %s (start: %i)\n", 
		  output->ref_map_p[i], output->ref_start_p[i]);
      LOG_DEBUG("\n");
    }
    
    
    //size_t mapp = 0, mapp2 = 0;
    
    
    
    double norm_score;
    // filter alignments by min_score
    for (size_t i = 0; i < sw_count2; i++) {
      
      read_index = read_indices[i];
      fq_read = (fastq_read_t *) array_list_get(read_index, GA_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, GA_rev_fq_batch);
      
      read_len = fq_read->length;
      norm_score = NORM_SCORE(output->score_p[i], read_len, input->match);
      
      if (norm_score >= min_score) {
	// valid mappings, 
	//insert in the list for further processing
	mapping_list = mapping_batch->mapping_lists[read_index];
	array_list_set_flag(0, mapping_list);
	
	if (array_list_size(mapping_list) == 0) {
	  mapping_batch->targets[new_num_targets++] = read_index;
	  
	  //mapp++;
	}
	
	sw_output = sw_output_new(strands[i],
				  chromosomes[i],
				  starts[i],
				  strlen(r[i]),
				  strlen(output->query_map_p[i]),
				  output->query_start_p[i],
				  output->ref_start_p[i],
				  output->score_p[i],
				  norm_score,
				  output->query_map_p[i],
				  output->ref_map_p[i]);
	array_list_insert(sw_output, mapping_list);
	
	mapping_batch->num_to_do++;
	
      }
      
      // free query and reference
      free(q[i]);
      free(r[i]);
    }
    mapping_batch->num_targets = new_num_targets;
    
    for (size_t i = sw_count2; i < sw_count; i++) {
      
      read_index = read_indices[i];
      fq_read = (fastq_read_t *) array_list_get(read_index, CT_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, CT_rev_fq_batch);
      
      read_len = fq_read->length;
      norm_score = NORM_SCORE(output->score_p[i], read_len, input->match);
      
      if (norm_score >= min_score) {
	// valid mappings, 
	//insert in the list for further processing
	mapping_list = mapping_batch->mapping_lists2[read_index];
	array_list_set_flag(0, mapping_list);
	
	if (array_list_size(mapping_list) == 0) {
	  mapping_batch->targets2[new_num_targets2++] = read_index;
	  
	  //mapp2++;
	}
	
	sw_output = sw_output_new(strands[i],
				  chromosomes[i],
				  starts[i],
				  strlen(r[i]),
				  strlen(output->query_map_p[i]),
				  output->query_start_p[i],
				  output->ref_start_p[i],
				  output->score_p[i],
				  norm_score,
				  output->query_map_p[i],
				  output->ref_map_p[i]);
	array_list_insert(sw_output, mapping_list);
	
	mapping_batch->num_to_do2++;
	
      }
      
      // free query and reference
      free(q[i]);
      free(r[i]);
    }
    mapping_batch->num_targets2 = new_num_targets2;
    
    // update counter
    //  thr_sw_items[tid] += sw_count;
    
    // free
    sw_multi_output_free(output);
    
    // go to the next stage
    
    /*
      printf("3 SW1         \t%3lu\tmapp               \t%3lu\tno map (discard) \t%3lu\n", 
      num_targets, mapp, num_targets - mapp);
      printf("3 SW2         \t%3lu\tmapp               \t%3lu\tno map (discard) \t%3lu\n", 
      num_targets2, mapp2, num_targets2 - mapp2);
    */
    
    //printf("END: apply_sw, (%d Smith-Waterman)\n", sw_total);
    
  }
  
  //return CONSUMER_STAGE;
  return BS_POST_PAIR_STAGE;
  
  //  printf("END: apply_sw, (%d Smith-Waterman, %d valids)\n", total, valids);
}
Exemplo n.º 3
0
void apply_sw(sw_server_input_t* input, aligner_batch_t *batch) {


  //  printf("START: apply_sw\n"); 
  int tid = omp_get_thread_num();

  cal_t *cal = NULL;
  array_list_t *cal_list = NULL, *mapping_list = NULL;//, *old_list = NULL, *new_list = NULL;
  fastq_batch_t *fq_batch = batch->fq_batch;

  size_t start, end;
  genome_t *genome = input->genome_p;
     
  size_t flank_length = input->flank_length;

  // SIMD support for Smith-Waterman
  float score, min_score = input->min_score;
  //  size_t curr_depth = 0;
  sw_output_t *sw_output;
  //  sw_simd_input_t *sw_sinput = sw_simd_input_new(SIMD_DEPTH);
  //  sw_simd_output_t *sw_soutput = sw_simd_output_new(SIMD_DEPTH);
  //sw_simd_context_t *context = sw_simd_context_new(input->match, input->mismatch, 
  //						    input->gap_open, input->gap_extend); 

  // for tracking the current read, cal being processed using sw_channel_t
  //sw_channel_t *channel;
  //sw_channel_t sw_channels[SIMD_DEPTH];
  //memset(sw_channels, 0, sizeof(sw_channels));
  
  //size_t header_len, read_len;
  //size_t strands[SIMD_DEPTH], chromosomes[SIMD_DEPTH], starts[SIMD_DEPTH];
  
  size_t index, num_cals;
  size_t total = 0, valids = 0;

  size_t num_seqs = batch->num_targets;

  // set to zero
  batch->num_done = batch->num_to_do;
  batch->num_to_do = 0;

  size_t sw_total = batch->num_done;
  /*
  // for all seqs pending to process !!
  size_t sw_total = 0;
  for (size_t i = 0; i < num_seqs; i++) {
    sw_total += array_list_size(batch->mapping_lists[batch->targets[i]]);
  }
  printf("number of sw to run: %d (vs num_done = %d)\n", sw_total, batch->num_done);
  */

  sw_optarg_t *sw_optarg = &input->sw_optarg;
    /*
  sw_optarg_t sw_optarg; //= sw_optarg_new(gap_open, gap_extend, matrix_filename);
  sw_optarg.gap_open = input->gap_open;
  sw_optarg.gap_extend = input->gap_extend;
  sw_optarg.subst_matrix['A']['A'] = input->match;    sw_optarg.subst_matrix['C']['A'] = input->mismatch; sw_optarg.subst_matrix['T']['A'] = input->mismatch; sw_optarg.subst_matrix['G']['A'] = input->mismatch;
  sw_optarg.subst_matrix['A']['C'] = input->mismatch; sw_optarg.subst_matrix['C']['C'] = input->match;    sw_optarg.subst_matrix['T']['C'] = input->mismatch; sw_optarg.subst_matrix['G']['C'] = input->mismatch;
  sw_optarg.subst_matrix['A']['G'] = input->mismatch; sw_optarg.subst_matrix['C']['T'] = input->mismatch; sw_optarg.subst_matrix['T']['T'] = input->match;    sw_optarg.subst_matrix['G']['T'] = input->mismatch;
  sw_optarg.subst_matrix['A']['T'] = input->mismatch; sw_optarg.subst_matrix['C']['G'] = input->mismatch; sw_optarg.subst_matrix['T']['G'] = input->mismatch; sw_optarg.subst_matrix['G']['G'] = input->match;
    */
  sw_multi_output_t *output = sw_multi_output_new(sw_total);
  char *q[sw_total], *r[sw_total];
  uint8_t strands[sw_total], chromosomes[sw_total];
  size_t starts[sw_total];
  size_t sw_count = 0, read_indices[sw_total];
  int read_len;

  // debugging: to kown how many reads are not mapped by SW score
  //  int unmapped_by_score[fq_batch->num_reads];
  //  memset(unmapped_by_score, 0, fq_batch->num_reads * sizeof(int));

  //  printf("num of sw to do: %i\n", sw_total);

  // initialize query and reference sequences to Smith-Waterman
  for (size_t i = 0; i < num_seqs; i++) {
    index = batch->targets[i];

    cal_list = batch->mapping_lists[index];
    num_cals = array_list_size(cal_list);

    //    printf("sw_server: read #%i with %i cals\n", index, num_cals);

    // processing each CAL from this read
    for(size_t j = 0; j < num_cals; j++) {

      // get cal and read index
      cal = array_list_get(j, cal_list);
      read_indices[sw_count] = index;

      // query sequence, revcomp if necessary
      read_len = fq_batch->data_indices[index + 1] - fq_batch->data_indices[index];
      q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));
      memcpy(q[sw_count], &(fq_batch->seq[fq_batch->data_indices[index]]), read_len);
      if (cal->strand == 1) {
	seq_reverse_complementary(q[sw_count], read_len);
      }
      //q[sw_count] = &(fq_batch->seq[fq_batch->data_indices[index]]);

      // reference sequence
      //printf("\tSW: %d.[chromosome:%d]-[strand:%d]-[start:%d, end:%d]\n", j, cal->chromosome_id, cal->strand, cal->start, cal->end);
  
      start = cal->start - flank_length;
      end = cal->end + flank_length;
      r[sw_count] = calloc(1, end - start + 2);
      genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
					cal->chromosome_id - 1, &start, &end, genome);

      // save some stuff, we'll use them after...
      strands[sw_count] = cal->strand;
      chromosomes[sw_count] = cal->chromosome_id;
      starts[sw_count] = start;


      //      printf("read #%i (sw #%i): query: %s (%i)\nref  : %s (%i)\n\n", index, sw_count, q[sw_count], strlen(q[sw_count]), r[sw_count], strlen(r[sw_count]));

      // increase counter
      sw_count++;
    }

    // free cal_list
    array_list_free(cal_list, (void *)cal_free);
    batch->mapping_lists[index] = NULL;
  }

  // run Smith-Waterman
  //  printf("before smith_waterman: number of sw = %i\n", sw_total);
  smith_waterman_mqmr(q, r, sw_total, sw_optarg, 1, output);
  //  printf("after smith_waterman\n");

  /*
  // debugging
  {
    FILE *fd = fopen("sw.out", "w");
    sw_multi_output_save(sw_total, output, fd);
    fclose(fd);
  }
  */

  size_t num_targets = 0;
  // filter alignments by min_score
  for (size_t i = 0; i < sw_total; i++) {

    //    score = output->score_p[i] / (strlen(output->query_map_p[i]) * input->match);
    //    if (score >= min_score) {
    /*
    printf("--------------------------------------------------------------\n");
    printf("Smith-Waterman results:\n");
    printf("id\t%s\n", &(batch->fq_batch->header[batch->fq_batch->header_indices[read_indices[i]]]));
    printf("ref\n%s\n", r[i]);
    printf("query\n%s\n", q[i]);
    printf("map\n%s\n", output->ref_map_p[i]);
    printf("ref: chr = %d, strand = %d, start = %d, len = %d\n", chromosomes[i], strands[i], starts[i], strlen(r[i]));
    printf("query-map-start = %d, ref-map-start = %d\n", 
	   output->query_start_p[i], output->ref_start_p[i]);
    printf("score = %0.2f (min. score = %0.2f)\n", output->score_p[i], min_score);
    printf("--------------------------------------------------------------\n");
    */
    if (output->score_p[i] >= min_score) {
      // valid mappings, 
      //insert in the list for further processing
      index = read_indices[i];
      if (batch->mapping_lists[index] == NULL) {
	mapping_list = array_list_new(1000, 
				      1.25f, 
				      COLLECTION_MODE_ASYNCHRONIZED);
	array_list_set_flag(0, mapping_list);
	
	batch->mapping_lists[index] = mapping_list;
	batch->targets[num_targets++] = index;
      }

      sw_output = sw_output_new(strands[i],
				chromosomes[i],
				starts[i],
				strlen(r[i]),
				strlen(output->query_map_p[i]),
				output->query_start_p[i],
				output->ref_start_p[i],
				output->score_p[i],
				score,
				output->query_map_p[i],
				output->ref_map_p[i]);
      array_list_insert(sw_output, mapping_list);

      batch->num_to_do++;

      // debugging
      //unmapped_by_score[index] = 1;
    }

    // free query and reference
    free(q[i]);
    free(r[i]);
  }
  batch->num_targets = num_targets;
  /*
  // debugging
  for (size_t i = 0; i < fq_batch->num_reads; i++) {
    if (unmapped_by_score[i] == 0) {
	unmapped_by_score_counter[tid]++;
	//printf("by score: %s\n", &(batch->fq_batch->header[batch->fq_batch->header_indices[index]]));
      }
  }
  */

  // update counter
  thr_sw_items[tid] += sw_count;

  // free
  sw_multi_output_free(output);

  //  printf("END: apply_sw, (%d Smith-Waterman, %d valids)\n", total, valids);
}
Exemplo n.º 4
0
size_t bwt_search_pair_anchors(array_list_t *list, unsigned int read_length) {
  bwt_anchor_t *bwt_anchor;
  int max_anchor_length = 0;
  

  bwt_anchor_t *bwt_anchor_back, *bwt_anchor_forw;
  int anchor_length_tmp, anchor_back, anchor_forw;
  int strand = 0, type = 0;
  int found_anchor = 0, found_double_anchor = 0;

  const int MIN_ANCHOR = 25;
  const int MIN_SINGLE_ANCHOR = 40;

  //const int MIN_DOUBLE_ANCHOR = MIN_ANCHOR*2;
  const int MAX_BWT_REGIONS = 50;
  const int MAX_BWT_ANCHOR_DISTANCE = 500000;

  array_list_t *anchor_list_tmp, *forward_anchor_list, *backward_anchor_list;
  cal_t *cal;
  int seed_size, gap_read, gap_genome;

  array_list_t *backward_anchor_list_0 = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *forward_anchor_list_0 = array_list_new(MAX_BWT_REGIONS, 1.25f , COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *backward_anchor_list_1 = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *forward_anchor_list_1 = array_list_new(MAX_BWT_REGIONS, 1.25f , COLLECTION_MODE_ASYNCHRONIZED);

  array_list_t *big_anchor_list = array_list_new(MAX_BWT_REGIONS, 1.25f , COLLECTION_MODE_ASYNCHRONIZED);

  //printf("Tot Anchors %i\n", array_list_size(list));
  for (int i = 0; i < array_list_size(list); i++) {
    bwt_anchor = array_list_get(i, list);
    if (bwt_anchor->strand == 1) {
      //printf("(-)bwt anchor %i:%lu-%lu (%i): \n", bwt_anchor->chromosome + 1, bwt_anchor->start, bwt_anchor->end, bwt_anchor->end - bwt_anchor->start + 1);
      if (bwt_anchor->type == FORWARD_ANCHOR) {
	array_list_insert(bwt_anchor, forward_anchor_list_1);
	//printf("FORW\n");
      } else {
	array_list_insert(bwt_anchor, backward_anchor_list_1);
	//printf("BACK\n");
      }
    } else {
      //printf("(+)bwt anchor %i:%lu-%lu (%i): \n", bwt_anchor->chromosome + 1, bwt_anchor->start, bwt_anchor->end, bwt_anchor->end - bwt_anchor->start + 1);
      if (bwt_anchor->type == FORWARD_ANCHOR) {
	array_list_insert(bwt_anchor, forward_anchor_list_0);
	//printf("FORW\n");
      } else {
	array_list_insert(bwt_anchor, backward_anchor_list_0);
	//printf("BACK\n");
      }
    }

    anchor_length_tmp = bwt_anchor->end - bwt_anchor->start + 1;
    if (anchor_length_tmp > MIN_SINGLE_ANCHOR && anchor_length_tmp > max_anchor_length) {
      max_anchor_length = anchor_length_tmp;
      found_anchor = 1;
      strand = bwt_anchor->strand;
      type = bwt_anchor->type;
    }
    
    if (read_length - anchor_length_tmp < 16) {
      array_list_insert(bwt_anchor, big_anchor_list);
    } 
    
  }
  
  array_list_clear(list, NULL);

  if (array_list_size(big_anchor_list) > 0) {
    for (int i = array_list_size(big_anchor_list) - 1; i >= 0; i--) {
      //printf("Insert cal %i\n", i);
      bwt_anchor = array_list_remove_at(i, big_anchor_list);
      size_t seed_size = bwt_anchor->end - bwt_anchor->start;

      if (bwt_anchor->type == FORWARD_ANCHOR) {
	cal = convert_bwt_anchor_to_CAL(bwt_anchor, 0, seed_size);
      } else {
	cal = convert_bwt_anchor_to_CAL(bwt_anchor, read_length - seed_size - 1, read_length - 1);
      }
      
      array_list_insert(cal, list);
    }
    array_list_set_flag(SINGLE_ANCHORS, list);
    
    goto exit;
  }

  for (int type = 1; type >= 0; type--) {
    if (!type) {
      forward_anchor_list = forward_anchor_list_1;
      backward_anchor_list = backward_anchor_list_1;
      //printf("Strand (+): %i-%i\n", array_list_size(forward_anchor_list), array_list_size(backward_anchor_list));
    } else { 
      forward_anchor_list = forward_anchor_list_0;
      backward_anchor_list = backward_anchor_list_0;
      //printf("Strand (-): %i-%i\n", array_list_size(forward_anchor_list), array_list_size(backward_anchor_list));
    }

    int *set_forward  = (int *)calloc(array_list_size(forward_anchor_list),  sizeof(int));
    int *set_backward = (int *)calloc(array_list_size(backward_anchor_list), sizeof(int));

    //Associate Anchors (+)/(-)
    for (int i = 0; i < array_list_size(forward_anchor_list); i++) { 
      if (set_forward[i]) { continue; }
      bwt_anchor_forw = array_list_get(i, forward_anchor_list);
      for (int j = 0; j < array_list_size(backward_anchor_list); j++) { 
	if (set_backward[j]) { continue; }
	bwt_anchor_back = array_list_get(j, backward_anchor_list);
	anchor_forw = (bwt_anchor_forw->end - bwt_anchor_forw->start + 1);
	anchor_back = (bwt_anchor_back->end - bwt_anchor_back->start + 1); 

	anchor_length_tmp = anchor_forw + anchor_back;

	//printf("\tCommpare %i:%lu-%lu with %i:%lu-%lu\n", bwt_anchor_forw->chromosome + 1, 
	//     bwt_anchor_forw->start, bwt_anchor_forw->end, bwt_anchor_back->chromosome + 1, 
	//     bwt_anchor_back->start, bwt_anchor_back->end);
	if (bwt_anchor_forw->chromosome == bwt_anchor_back->chromosome &&
	    abs(bwt_anchor_back->start - bwt_anchor_forw->end) <= MAX_BWT_ANCHOR_DISTANCE && 
	    anchor_forw >= MIN_ANCHOR && anchor_back >= MIN_ANCHOR) {
	  
	  if (bwt_anchor_back->start < bwt_anchor_forw->end) { continue; }
	  
	  gap_read = read_length - (anchor_forw + anchor_back);
	  gap_genome = bwt_anchor_back->start - bwt_anchor_forw->end;

	  //printf("anchor_forw = %i, anchor_back = %i, gap_read = %i, gap_genome = %i\n",
	  //	 anchor_forw, anchor_back, gap_read, gap_genome);
	  	  
	  int apply_flank = 0;
	  if (gap_read < 2 || gap_genome < 2) {
	    int gap;
	    if (gap_read < 0 && gap_genome < 0) {
	      gap = abs(gap_read) > abs(gap_genome) ? abs(gap_read) : abs(gap_genome);
	    } else if (gap_read < 0) {
	      gap = abs(gap_read);
	    } else if (gap_genome < 0) {
	      gap = abs(gap_genome);
	    } else {
	      gap = 2;
	    }
	    
	    int flank  = 5;
	    apply_flank = 1;
	    
	    if (abs(gap) >= flank*2) {
	      //Solve read overlap
	      flank = abs(gap)/2 + flank/2;
	    }
	    //printf("\tgap = %i, flank = %i\n", gap, flank);
	    if (flank >= anchor_forw) {
	      bwt_anchor_forw->end -= anchor_forw/2;	      
	    } else {
	      bwt_anchor_forw->end -= flank;
	    }

	    if (flank >= anchor_back) {
	      bwt_anchor_back->start += anchor_back/2;	    
	    } else {
	      bwt_anchor_back->start += flank;
	    }
	  } 
	  	  
	  cal = convert_bwt_anchor_to_CAL(bwt_anchor_forw, 0, bwt_anchor_forw->end - bwt_anchor_forw->start);
	  //printf("INSERT-1 (%i)[%i:%lu-%lu]\n", cal->strand, cal->chromosome_id, cal->start, cal->end);
	  array_list_insert(cal, list);
	  seed_size = bwt_anchor_back->end - bwt_anchor_back->start + 1;
	  //if (bwt_anchor_forw->end + read_length >= bwt_anchor_back->start) {	    
	  //seed_region_t *seed_region = seed_region_new(read_length - seed_size, read_length - 1,
	  //bwt_anchor_back->start, bwt_anchor_back->end, 1);
	  //cal->end = bwt_anchor_back->end;
	  //linked_list_insert_last(seed_region, cal->sr_list);	
	  //} else {
	  cal = convert_bwt_anchor_to_CAL(bwt_anchor_back, read_length - seed_size, read_length - 1);
	  //printf("INSERT-2 (%i)[%i:%lu-%lu]\n", cal->strand, cal->chromosome_id, cal->start, cal->end);
	  array_list_insert(cal, list);
	  if (array_list_size(list) > 5) { 
	    free(set_backward);
	    free(set_forward);	    
	    goto exit;
	  }

	  array_list_set_flag(DOUBLE_ANCHORS, list);
	  found_double_anchor = 1;
	  set_forward[i]  = 1;
	  set_backward[j] = 1;
	  break;
	}                                                                                                                      
      }         
    }
    free(set_backward);
    free(set_forward);
  }

  if (!found_double_anchor && found_anchor) { 
    //Not Double anchor found but one Yes!!
    if (strand == 1) {
      if (type == FORWARD_ANCHOR) {
	anchor_list_tmp = forward_anchor_list_1;
      } else {
	anchor_list_tmp =  backward_anchor_list_1;
      }
    } else {
      if (type == FORWARD_ANCHOR) {
	anchor_list_tmp =  forward_anchor_list_0;
      } else {
	anchor_list_tmp =  backward_anchor_list_0;
      }
    }

    //printf("LIST SIZE %i\n", array_list_size(anchor_list_tmp));
    for (int i = 0; i < array_list_size(anchor_list_tmp); i++) {
      bwt_anchor = array_list_get(i, anchor_list_tmp);
      size_t seed_size = bwt_anchor->end - bwt_anchor->start;
      //array_list_insert(bwt_anchor_new(bwt_anchor->strand, bwt_anchor->chromosome, 
      //			       bwt_anchor->start, bwt_anchor->end, bwt_anchor->type), anchor_list);
      if (bwt_anchor->type == FORWARD_ANCHOR) {
	//printf("------------------------> start %i\n", 0);
	cal = convert_bwt_anchor_to_CAL(bwt_anchor, 0, seed_size);
      } else {
	//printf("------------------------> start %i\n", read_length - seed_size);
	cal = convert_bwt_anchor_to_CAL(bwt_anchor, read_length - seed_size - 1, read_length - 1);
      }
      array_list_insert(cal, list);
    }
    array_list_set_flag(SINGLE_ANCHORS, list);
  } 

 exit:
  array_list_free(forward_anchor_list_1, (void *)bwt_anchor_free);
  array_list_free(backward_anchor_list_1,  (void *)bwt_anchor_free);
  array_list_free(forward_anchor_list_0,  (void *)bwt_anchor_free);
  array_list_free(backward_anchor_list_0,  (void *)bwt_anchor_free);
  array_list_free(big_anchor_list,  (void *)bwt_anchor_free);

  return array_list_size(list);
  
}
Exemplo n.º 5
0
int apply_bwt_rna(bwt_server_input_t* input, batch_t *batch) {

  LOG_DEBUG("========= APPLY BWT RNA =========\n");



  metaexons_t *metaexons = input->metaexons;
  mapping_batch_t *mapping_batch = batch->mapping_batch;
  size_t num_reads = array_list_size(mapping_batch->fq_batch);
  size_t num_mappings;
  array_list_t *list;
  size_t *unmapped_indices = mapping_batch->targets;
  size_t num_unmapped = 0;
  size_t num_anchors;

  extern pthread_mutex_t mutex_sp;
  extern st_bwt_t st_bwt;
  //pthread_mutex_lock(&mutex_sp);
  //extern size_t total_reads;
  //total_reads += num_reads;
  //pthread_mutex_unlock(&mutex_sp);
  
  for (int i = 0; i < num_reads; i++) {
    fastq_read_t *read = array_list_get(i, mapping_batch->fq_batch);

    //Rev-comp
    fastq_read_revcomp(read);

    //printf("BWT: %s\n", read->id);
    list = mapping_batch->mapping_lists[i];    
    
    array_list_set_flag(1, list);

    num_mappings = bwt_map_inexact_read(read,
					input->bwt_optarg_p,
					input->bwt_index_p,
					list);

    if (array_list_get_flag(list) != 2) { //If flag 2, the read exceded the max number of mappings
      if (array_list_get_flag(list) == 1) {
	if (num_mappings > 0) {
	  num_anchors = bwt_search_pair_anchors(list, read->length);	
	  if (num_anchors == 0) {
	    array_list_set_flag(NOT_ANCHORS, list);
	    unmapped_indices[num_unmapped++] = i;
	  }
	} else {
	  array_list_set_flag(NOT_ANCHORS, list);
	  unmapped_indices[num_unmapped++] = i;
	} 
	//printf("tot anchors found %i %s\n", num_anchors, read->id);
      } else if (num_mappings <= 0) {
	array_list_set_flag(0, list);
	//printf("Read NO Mapped %i %s\n", num_anchors, read->id);
	unmapped_indices[num_unmapped++] = i;
      } else {
	//Read Map, Metaexon Actualization
	array_list_set_flag(ALIGNMENTS_FOUND, list);

	pthread_mutex_lock(&mutex_sp);
	st_bwt.map_bwt++;
	pthread_mutex_unlock(&mutex_sp);

	for (int i = 0; i < num_mappings; i++) {
	  alignment_t *alignment = array_list_get(i, list);
	  metaexon_insert(0, alignment->chromosome,
			  alignment->position, alignment->position + read->length, 40,
			  METAEXON_NORMAL, NULL,
			  metaexons);
	  //alignment->alig_data = cigar_code_new_by_string(alignment->cigar);
	}
      }
    } else {
      array_list_set_flag(ALIGNMENTS_EXCEEDED, list);
    }
  
    if (array_list_get_flag(list) == DOUBLE_ANCHORS) {
      //printf("DOUBLE ANCHORS\n");
      for (int j = 0; j < array_list_size(list); j++) {
	//bwt_anchor_t *bwt_anchor_prev = array_list_get(j, list);
	cal_t *cal = array_list_get(j, list);
	metaexon_insert(0/*cal->strand*/, cal->chromosome_id - 1,
			cal->start, cal->end, 40,
			METAEXON_NORMAL, NULL,
			metaexons);
      }    
    } else if (array_list_get_flag(list) == SINGLE_ANCHORS) {
      for (int j = 0; j < array_list_size(list); j++) {
	//bwt_anchor_t *bwt_anchor = array_list_get(j, list);
	cal_t *cal = array_list_get(j, list);
	metaexon_t *metaexon;
	if (metaexon_search(0/*cal->strand*/, cal->chromosome_id - 1,
			    cal->start, cal->end, &metaexon,
			    metaexons)) {
	  metaexon_insert(0/*cal->strand*/, cal->chromosome_id - 1,
			  cal->start, cal->end, 40,
			  METAEXON_NORMAL, NULL,
			  metaexons);
	}
      }
    }
  }
  // array_list flag: 0 -> Not  BWT Anchors found
  //                  1 -> One  BWT Anchors found
  //                  2 -> Pair BWT Anchors found
  //                  3 -> Alignments found
  //                  4 -> Alignments exceded
  
  mapping_batch->num_targets = num_unmapped;

  LOG_DEBUG("========= APPLY BWT RNA END =========\n");
    
  if (batch->mapping_batch->num_targets > 0) {
    return RNA_CAL_STAGE;
  } else {
    return RNA_STAGE;
  }
    
}
Exemplo n.º 6
0
void apply_seeding(region_seeker_input_t* input, aligner_batch_t *batch) {

  //  printf("START: apply_seeding\n"); 

  char *seq;
  array_list_t *list = NULL;
  size_t index, num_mappings;
  fastq_batch_t *fq_batch = batch->fq_batch;
  size_t min_num_seeds = input->cal_optarg_p->min_num_seeds;
  size_t max_num_seeds = input->cal_optarg_p->max_num_seeds;
  size_t seed_size = input->cal_optarg_p->seed_size;
  size_t min_seed_size = input->cal_optarg_p->min_seed_size;
  size_t num_seqs = batch->num_targets;

  size_t num_outputs = 0;
  size_t *outputs = (size_t *) calloc(num_seqs, sizeof(size_t));

  // set to zero
  batch->num_done = 0;
  batch->num_to_do = 0;

  // omp parallel for !!
  for (size_t i = 0; i < num_seqs; i++) {
      
    index = batch->targets[i];
    list = batch->mapping_lists[index];
    //printf("region_seeker.c: apply_seeding: list #%i size = %i\n", i, array_list_size(list));
    
    seq = &(fq_batch->seq[fq_batch->data_indices[index]]);
    /*
    num_mappings = bwt_map_exact_seeds_seq(seq, seed_size, min_seed_size,
					   input->bwt_optarg_p, input->bwt_index_p, 
    					   list);
    */
    num_mappings = bwt_map_exact_seeds_seq_by_num(seq, min_num_seeds, max_num_seeds, 
						  seed_size, min_seed_size,
    						  input->bwt_optarg_p, input->bwt_index_p, 
    						  list);
    if (num_mappings > 0) {
      //      printf("\tregion_seeker.c: apply_seeding, setting flag to 2 for list %i\n", index);
      array_list_set_flag(2, list);
      outputs[num_outputs++] = index;
      batch->num_to_do += num_mappings;
      //    } else {
      //	if (strncmp("@rand", &(fq_batch->header[fq_batch->header_indices[index]]), 5)) {
      //	  printf("\tno seeds for read #%d: %s\n", 
      //		 index, &(fq_batch->header[fq_batch->header_indices[index]]));
      //	} 
      //   } else {
      //      printf("\tregion_seeker.c: apply_seeding: %s: list #%i, size = %i\n", &(fq_batch->header[fq_batch->header_indices[index]]), i, array_list_size(list));
    }
    //    printf("\tSEED  : read %d (%d items): %s\n", 
    //    	   index, num_mappings, &(fq_batch->header[fq_batch->header_indices[index]]));

    batch->num_done += 2 * (strlen(seq) / seed_size);
    batch->num_to_do += num_mappings;
  }

  // update batch
  batch->num_allocated_targets = num_seqs;
  batch->num_targets = num_outputs;
  if (batch->targets != NULL) free(batch->targets);
  batch->targets = outputs;

  // update counter
  thr_seeding_items[omp_get_thread_num()] += batch->num_done;
  
  //  printf("region_seeker.c: apply_seeding: num_outputs = %i\n", num_outputs);

  //  printf("END: apply_seeding, (seeding %d reads)\n", num_outputs);
}
Exemplo n.º 7
0
//====================================================================================
// apply_caling
//====================================================================================
int apply_caling(cal_seeker_input_t* input, batch_t *batch) {
  mapping_batch_t *mapping_batch = batch->mapping_batch;
  array_list_t *list = NULL;
  size_t read_index, num_cals;
  int min_seeds, max_seeds;


  cal_t *cal;
  array_list_t *cal_list;

  fastq_read_t *read;



  size_t num_chromosomes = input->genome->num_chromosomes + 1;
  size_t num_targets = mapping_batch->num_targets;
  size_t *targets = mapping_batch->targets;
  size_t new_num_targets = 0;
  array_list_t *region_list;
  bwt_anchor_t *bwt_anchor_back, *bwt_anchor_forw;
  linked_list_t *linked_list;
  int anchor_nt, gap_nt;
  seed_region_t *seed_region_start, *seed_region_end;
  //max_seeds = input->cal_optarg->num_seeds;
  
  //  size_t *new_targets = (size_t *) calloc(num_targets, sizeof(size_t));
  
  // set to zero
  mapping_batch->num_to_do = 0;

  for (size_t i = 0; i < num_targets; i++) {

    read_index = targets[i];
    read = array_list_get(read_index, mapping_batch->fq_batch); 
    region_list = mapping_batch->mapping_lists[read_index];
    // for debugging
    //    LOG_DEBUG_F("%s\n", ((fastq_read_t *) array_list_get(read_index, mapping_batch->fq_batch))->id);
    
    if (!list) {
      list = array_list_new(1000, 
			    1.25f, 
			    COLLECTION_MODE_ASYNCHRONIZED);
    }


    if (array_list_get_flag(region_list) == 0 || 
	array_list_get_flag(region_list) == 2) {
      //We have normal and extend seeds (anchors)
      max_seeds = (read->length / 15)*2 + 10;
      num_cals = bwt_generate_cal_list_linked_list(region_list,
						   input->cal_optarg,
						   &min_seeds, &max_seeds,
						   num_chromosomes,
						   list, read->length,
						   input->cal_optarg->min_cal_size, 0);
    } else {
      //We have double anchors with smaller distance between they
      //printf("Easy case... Two anchors and same distance between read gap and genome distance\n");
      num_cals = 0;
      for (int a = array_list_size(region_list) - 1; a >= 0; a -= 2) {
	max_seeds = 2;
	min_seeds = 2;
	bwt_anchor_back = array_list_remove_at(a, region_list);
	bwt_anchor_forw = array_list_remove_at(a - 1, region_list);

	linked_list = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED);

	
	//Seed for the first anchor
	anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start;
	//printf("\t seed0[%i-%i][%lu-%lu]\n", 0, anchor_nt - 1,
	//     bwt_anchor_forw->start, bwt_anchor_forw->end);
	seed_region_start = seed_region_new(0, anchor_nt - 1,
					    bwt_anchor_forw->start, bwt_anchor_forw->end, 0, 0, 0);

	//Seed for the first anchor
	gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start));
	//printf("\t gap_nt = %i, anchor_nt = %i\n", gap_nt, anchor_nt);
	//printf("\t seed1[%i-%i][%lu-%lu]\n", anchor_nt + gap_nt, read->length - 1, 
	//     bwt_anchor_back->start + 1, bwt_anchor_back->end);
	seed_region_end = seed_region_new(anchor_nt + gap_nt, read->length - 1,
					  bwt_anchor_back->start + 1, bwt_anchor_back->end, 1, 0, 0);

	//The reference distance is 0 and the read distance not
	//The read distance is 0 and the reference distance not
	//if (seed_region_start->genome_end > seed_region_end->genome_start || 
	//  seed_region_start->read_end > seed_region_end->read_start) { 
	//array_list_clear(region_list, NULL);
	//continue;
	if (seed_region_end->genome_start - seed_region_start->genome_end < 5 || 
	    seed_region_end->read_start - seed_region_start->read_end < 5) {
	  seed_region_start->genome_end -= 5;
	  seed_region_start->read_end -= 5;
	  seed_region_end->genome_start += 5;
	  seed_region_end->read_start += 5;
	}

	linked_list_insert(seed_region_start, linked_list);
	linked_list_insert_last(seed_region_end, linked_list);

	cal = cal_new(bwt_anchor_forw->chromosome + 1,
		      bwt_anchor_forw->strand,
		      bwt_anchor_forw->start,
		      bwt_anchor_back->end + 1,
		      2,
		      linked_list,
		      linked_list_new(COLLECTION_MODE_ASYNCHRONIZED));
	array_list_insert(cal, list);
	num_cals++;
      }
    }

    // for debugging
    LOG_DEBUG_F("read %s : num. cals = %i, min. seeds = %i, max. seeds = %i\n", 
		read->id, num_cals, min_seeds, max_seeds);


    /*    if (num_cals == 0) {
      int seed_size = 24;
      //First, Delete old regions
      array_list_clear(mapping_batch->mapping_lists[read_index], region_bwt_free);
      //Second, Create new regions with seed_size 24 and 1 Mismatch
      bwt_map_inexact_seeds_seq(read->sequence, seed_size, seed_size/2,
				bwt_optarg, bwt_index, 
				mapping_batch->mapping_lists[read_index]);

      num_cals = bwt_generate_cal_list_linked_list(mapping_batch->mapping_lists[mapping_batch->targets[i]], 
						   input->cal_optarg,
						   &min_seeds, &max_seeds,
						   num_chromosomes,
						   list, read->length);
						   }*/

    /*
    for (size_t j = 0; j < num_cals; j++) {
      cal = array_list_get(j, list);
      LOG_DEBUG_F("\tchr: %i, strand: %i, start: %lu, end: %lu, num_seeds = %i, num. regions = %lu\n", 
		  cal->chromosome_id, cal->strand, cal->start, cal->end, cal->num_seeds, cal->sr_list->size);
    }
    */
    //    printf("min_seeds = %i, max_seeds = %i, min_limit = %i, num_cals = %i\n", 
    //	   min_seeds, max_seeds, min_limit, array_list_size(list));

    // filter incoherent CALs
    int founds[num_cals], found = 0;
    for (size_t j = 0; j < num_cals; j++) {
      founds[j] = 0;
      cal = array_list_get(j, list);
      LOG_DEBUG_F("\tcal %i of %i: sr_list size = %i (cal->num_seeds = %i) %i:%lu-%lu\n", 
		  j, num_cals, cal->sr_list->size, cal->num_seeds,
		  cal->chromosome_id, cal->start, cal->end);
      if (cal->sr_list->size > 0) {
	int start = 0;
	for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) {
	  seed_region_t *s = list_item->item;
	  
	  LOG_DEBUG_F("\t\t:: star %lu > %lu s->read_start\n", start, s->read_start);
	  if (start > s->read_start) {
	    LOG_DEBUG("\t\t\t:: remove\n");
	    found++;
	    founds[j] = 1;
	  }
	  start = s->read_end + 1;
	}
      } else {
	found++;
	founds[j] = 1;
      }
    }
    if (found) {
      min_seeds = 100000;
      max_seeds = 0;
      cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
      for (size_t j = 0; j < num_cals; j++) {
	if (!founds[j]) {
	  cal = array_list_get(j, list);
	  cal->num_seeds = cal->sr_list->size;
	  if (cal->num_seeds > max_seeds) max_seeds = cal->num_seeds;
	  if (cal->num_seeds < min_seeds) min_seeds = cal->num_seeds;
	  array_list_insert(cal, cal_list);
	  array_list_set(j, NULL, list);
	}
      }
      array_list_free(list, (void *) cal_free);
      num_cals = array_list_size(cal_list);
      list = cal_list;
    }
  
    //    LOG_FATAL_F("num. cals = %i, min. seeds = %i, max. seeds = %i\n", num_cals, min_seeds, max_seeds);
    // filter CALs by the number of seeds

    cal_list = list;
    list = NULL;
    /*
    int min_limit = input->cal_optarg->min_num_seeds_in_cal;

    if (min_limit < 0) min_limit = max_seeds;
    //    min_limit -= 3;
    
    if (min_seeds == max_seeds || min_limit <= min_seeds) {
      cal_list = list;
      list = NULL;
    } else {
      cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
      for (size_t j = 0; j < num_cals; j++) {
	cal = array_list_get(j, list);
	if (cal->num_seeds >= min_limit) {
	  array_list_insert(cal, cal_list);
	  array_list_set(j, NULL, list);
	}
      }
      array_list_clear(list, (void *) cal_free);
      num_cals = array_list_size(cal_list);
    }
    */
    if (num_cals > MAX_CALS) {
      for (size_t j = num_cals - 1; j >= MAX_CALS; j--) {
	cal = (cal_t *) array_list_remove_at(j, cal_list);
	cal_free(cal);
      }
      num_cals = array_list_size(cal_list);
    }
    
    //    LOG_DEBUG_F("num. cals = %i, MAX_CALS = %i\n", num_cals, MAX_CALS);

    if (num_cals > 0 && num_cals <= MAX_CALS) {
      array_list_set_flag(2, cal_list);
      targets[new_num_targets++] = read_index;

      /*
      int count1 = 0, count2 = 0;
      // count number of sw to do

      // method #1
      //      printf("method #1\n");
      seed_region_t *s, *prev_s;
      linked_list_iterator_t* itr;
      for (size_t j = 0; j < num_cals; j++) {
	prev_s = NULL;
	cal = array_list_get(j, cal_list);
	itr = linked_list_iterator_new(cal->sr_list);
	s = (seed_region_t *) linked_list_iterator_curr(itr);
	while (s != NULL) {
	  if ((prev_s == NULL && s->read_start != 0) || (prev_s != NULL)) {
	    //	    printf("\t\t\tcase 1\n");
	    count1++;
	  }
	  prev_s = s;
	  linked_list_iterator_next(itr);
	  s = linked_list_iterator_curr(itr);
	}
	if (prev_s != NULL && prev_s->read_end < read->length - 1) { 
	  count1++;
	  //	  printf("\t\t\tcase 2 (%i < %i)\n", prev_s->read_end, read->length - 1);
	}
	linked_list_iterator_free(itr);
      }

      // method #2
      printf("method #2\n");
      for (size_t j = 0; j < num_cals; j++) {
	cal = array_list_get(j, cal_list);
	printf("\t: %i\n", j);
	if (cal->sr_list->size > 0) {
	  int start = 0;
	  for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) {
	    seed_region_t *s = list_item->item;
	    printf("\t\t[%i|%i - %i|%i]\n", s->genome_start, s->read_start, s->read_end, s->genome_end);
	    if (s->read_start != start) {
	      count2++;
	    }
	    start = s->read_end + 1;
	  }
	  if (start < read->length) { 
	    count2++;
	  }
	}
      }
      printf("count #1 = %i, count #2 = %i\n", count1, count2);
      assert(count1 == count2);

      mapping_batch->num_to_do += count1;
*/

      // we have to free the region list
      array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free);
      mapping_batch->mapping_lists[read_index] = cal_list;
    } else {
      array_list_set_flag(0, mapping_batch->mapping_lists[read_index]);
      // we have to free the region list
      array_list_clear(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free);
      if (cal_list) array_list_free(cal_list, (void *) cal_free);
      if (list) array_list_clear(list, (void *) cal_free);
    }

    /*    
    cal_list = list;
    list = NULL;
    array_list_set_flag(2, cal_list);
    //    mapping_batch->num_to_do += num_cals;
    targets[new_num_targets++] = read_index;
    
    // we have to free the region list
    array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free);
    mapping_batch->mapping_lists[read_index] = cal_list;
    */
    /*
    // filter CALs by the number of seeds
    int min_limit = input->cal_optarg->min_num_seeds_in_cal;
    if (min_limit < 0) min_limit = max_seeds;

    printf("min_seeds = %i, max_seeds = %i, min_limit = %i, num_cals = %i\n", 
	   min_seeds, max_seeds, min_limit, array_list_size(list));
    
    if (min_seeds == max_seeds || min_limit <= min_seeds) {
      cal_list = list;
      list = NULL;
    } else {
      cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
      for (size_t j = 0; j < num_cals; j++) {
	cal = array_list_get(j, list);
	if (cal->num_seeds >= min_limit) {
	  array_list_insert(cal, cal_list);
	  array_list_set(j, NULL, list);
	}
      }
      array_list_clear(list, (void *) cal_free);
      num_cals = array_list_size(cal_list);
      printf("************, num_cals = %i\n", num_cals);
    }

    if (num_cals > MAX_CALS) {
      for (size_t j = num_cals - 1; j >= MAX_CALS; j--) {
	cal = (cal_t *) array_list_remove_at(j, cal_list);
	cal_free(cal);
      }
      num_cals = array_list_size(cal_list);
    }

    if (num_cals > 0 && num_cals <= MAX_CALS) {
      array_list_set_flag(2, cal_list);
      mapping_batch->num_to_do += num_cals;
      targets[new_num_targets++] = read_index;
      
      // we have to free the region list
      array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free);
      mapping_batch->mapping_lists[read_index] = cal_list;
    } else {
      array_list_set_flag(0, mapping_batch->mapping_lists[read_index]);
      // we have to free the region list
      array_list_clear(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free);
      if (cal_list) array_list_free(cal_list, (void *) cal_free);
      if (list) array_list_clear(list, (void *) cal_free);
    }
    */
  } // end for 0 ... num_targets

  // update batch
  mapping_batch->num_targets = new_num_targets;

  //  LOG_DEBUG_F("num. SW to do: %i\n", 	mapping_batch->num_to_do);

  //  exit(-1);

  // free memory
  if (list) array_list_free(list, NULL);

  if (batch->mapping_mode == RNA_MODE) {
    return RNA_STAGE;
  }

  if (batch->pair_input->pair_mng->pair_mode != SINGLE_END_MODE) {
    return PRE_PAIR_STAGE;
  } else if (batch->mapping_batch->num_targets > 0) {
    return SW_STAGE;
  }
  
  return DNA_POST_PAIR_STAGE;
}
Exemplo n.º 8
0
int apply_seeding(region_seeker_input_t* input, batch_t *batch) {
  //printf("APPLY SEEDING...\n");


  //if (time_on) { start_timer(start); }


  mapping_batch_t *mapping_batch = batch->mapping_batch;

  size_t num_mappings;


  int seed_size = input->cal_optarg_p->seed_size;
  size_t min_seed_size = input->cal_optarg_p->min_seed_size;






  size_t num_targets = mapping_batch->num_targets;
  size_t *targets = mapping_batch->targets;
  size_t new_num_targets = 0;
  fastq_read_t *read;

  int min_intron_size = 40;

  int target;
  bwt_anchor_t *bwt_anchor = NULL;
  region_t *region;
  int gap_nt;
  int start_search;
  int end_search;

  // set to zero
  mapping_batch->num_to_do = 0;
  
  //TODO: omp parallel for !!
  /*if (batch->mapping_mode == 1000) {
    for (size_t i = 0; i < num_targets; i++) {
      //printf("Seq (i=%i)(target=%i): %s\n", i, targets[i], read->sequence);
      read = array_list_get(targets[i], mapping_batch->fq_batch);
      num_mappings = bwt_map_exact_seeds_seq(padding_left,
					     padding_right,
					     read->sequence,
					     seed_size,
					     min_seed_size,
					     input->bwt_optarg_p, 
					     input->bwt_index_p, 
					     mapping_batch->mapping_lists[targets[i]],
					     mapping_batch->extra_stage_id[targets[i]]);
      
      //printf("Num mappings %i\n", num_mappings);
      if (num_mappings > 0) {
	array_list_set_flag(2, mapping_batch->mapping_lists[targets[i]]);
	targets[new_num_targets++] = targets[i];
	mapping_batch->num_to_do += num_mappings;
      }
    }
    } else {*/
  
  //size_t new_num_targets = 0;
  //size_t *new_targets = (size_t *)malloc(array_list_size(fq_batch)*sizeof(size_t));
  array_list_t *array_list_aux = array_list_new(256, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  
  //Flag 0: The read has simple anchor or any, and need seeds and normal Cal_Seeker 
  //Flag 1: The read has double anchor and the gap is smaller than MIN_INTRON_SIZE. Cal_Seeker will be make one CAL
  //Flag 2: The read has double anchor but the gap is bigger than MIN_INTRON_SIZE. 
  for (size_t i = 0; i < num_targets; i++) {
    read = array_list_get(targets[i], mapping_batch->fq_batch);    
    //printf("Read Region %s: \n", read->id);
    /* if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 0 ||
	array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) {
      array_list_clear(mapping_batch->mapping_lists[targets[i]], bwt_anchor_free);
      continue;
      }
    */
    if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 0 || 
	array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) {
      //Flag 0 Case, Not anchors found, Make normal seeds      
      //      printf("***** Normal Case 0. Not anchors found!\n");
      for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j--) {
	bwt_anchor = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]);
	array_list_insert(bwt_anchor, array_list_aux);
      }
      num_mappings = 0;

      num_mappings = bwt_map_exact_seeds_seq(0,
					     0,
					     read->sequence,
					     seed_size,
					     min_seed_size,
					     input->bwt_optarg_p, 
					     input->bwt_index_p, 
					     mapping_batch->mapping_lists[targets[i]],
					     0);
      if (num_mappings > 0) {
	array_list_set_flag(0, mapping_batch->mapping_lists[targets[i]]);
	targets[new_num_targets++] = targets[i];
	//mapping_batch->num_to_do += num_mappings;
      }
    } else if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) {
      //Flag 1 Case, One anchor found, Make displacements seeds                  
      printf("***** Case 1. One anchor found!\n");
      for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j--) {
	bwt_anchor = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]);
	array_list_insert(bwt_anchor, array_list_aux);
      }

      int anchor_nt = bwt_anchor->end - bwt_anchor->start;
      int seed_id = 0;
      int seed_start, seed_end;
      int extra_seed;

      if ((bwt_anchor->type == FORWARD_ANCHOR && bwt_anchor->strand == 0) || 
	  (bwt_anchor->type == BACKWARD_ANCHOR && bwt_anchor->strand == 1 )) {
	start_search = anchor_nt + 1;
	end_search = read->length - 1;
	extra_seed = EXTRA_SEED_END;
      } else {
	start_search = 0;
	end_search = read->length - anchor_nt - 2;
	extra_seed = EXTRA_SEED_START;
      }

      printf("end_start %i - start_search %i = %i >= seed_size %i\n", end_search, start_search, end_search - start_search, seed_size);
      if (end_search - start_search >= seed_size) {
	printf("00 bwt_map_exact_seeds_between_coords --> searching from %i to %i\n", start_search, end_search);
	/*
		num_mappings = bwt_map_exact_seeds_between_coords(start_search,
							  end_search,
							  read->sequence, 
							  seed_size, min_seed_size,
							  input->bwt_optarg_p, 
							  input->bwt_index_p, 
							  mapping_batch->mapping_lists[targets[i]],
							  extra_seed, &seed_id);
	*/
      }

      if (bwt_anchor->type == FORWARD_ANCHOR) {
	seed_id = 0;
	seed_start = 0;
	seed_end = anchor_nt;
      } else {
	seed_id += 1;
	seed_start = read->length - anchor_nt - 1;
	seed_end = read->length - 1;
      }

      for (int j = 0; j < array_list_size(array_list_aux); j++) {
	bwt_anchor_t *bwt_anchor = array_list_get(j, array_list_aux);
	//	printf("\tCreate seed Anchor [%i:%lu|%i-%i|%lu]\n", bwt_anchor->chromosome + 1, bwt_anchor->start, 
	//	       seed_start,seed_end,bwt_anchor->end);
	region = region_bwt_new(bwt_anchor->chromosome + 1,
				bwt_anchor->strand,
				bwt_anchor->start,
				bwt_anchor->end,
				seed_start,
				seed_end,
				read->length,
				seed_id);	  
	array_list_insert(region, mapping_batch->mapping_lists[targets[i]]);
      } 
      array_list_clear(array_list_aux, (void *)bwt_anchor_free); 
      array_list_set_flag(0, mapping_batch->mapping_lists[targets[i]]);
      targets[new_num_targets++] = targets[i];
    } else {
      //Flag 2 Case, Pair of anchors found
      printf("***** Case 2. Double anchor found!\n");
      bwt_anchor_t *bwt_anchor;
      bwt_anchor_t *bwt_anchor_forw, *bwt_anchor_back;

      int read_nt, genome_nt;


      int distance;
      int found = 0;
      region_t *region;
      int seed_id = 0;
      //if (array_list_size(mapping_batch->mapping_lists[targets[i]]) > 2) {
      int *anchors_targets = (int *)calloc(array_list_size(mapping_batch->mapping_lists[targets[i]]), sizeof(int));
      int num = 0;

      //min_intron_size = 0;

      //Search if one anchor is at the same distance from the reference and the read
      for (int b = 0; b < array_list_size(mapping_batch->mapping_lists[targets[i]]); b += 2) {
	bwt_anchor_forw = array_list_get(b, mapping_batch->mapping_lists[targets[i]]);
	bwt_anchor_back = array_list_get(b + 1, mapping_batch->mapping_lists[targets[i]]);
	//printf("FORW=%i:%lu-%lu BACK=%i:%lu-%lu\n", bwt_anchor_forw->chromosome, bwt_anchor_forw->start, bwt_anchor_forw->end,
	//     bwt_anchor_back->chromosome, bwt_anchor_back->start, bwt_anchor_back->end);
	read_nt = read->length - ((bwt_anchor_forw->end - bwt_anchor_forw->start) + (bwt_anchor_back->end - bwt_anchor_back->start));
	genome_nt = bwt_anchor_back->start - bwt_anchor_forw->end;	  
	distance = abs(genome_nt - read_nt);
	//printf("\t%i:Distance %i\n", b, distance);
	if (distance < min_intron_size) {
	  found = 1;
	} else {
	  anchors_targets[num++] = b;
	}
      }

      if (found) {
	//printf("\tFound Exact Case... Delete other anchors\n");
	for (int t = num - 1; t >= 0; t--) {
	  target = anchors_targets[t];
	  //printf("\tDelete %i, %i-->\n", target, target + 1);
	  bwt_anchor = array_list_remove_at(target + 1, mapping_batch->mapping_lists[targets[i]]);
	  bwt_anchor_free(bwt_anchor);
	  
	  bwt_anchor = array_list_remove_at(target, mapping_batch->mapping_lists[targets[i]]);
	  bwt_anchor_free(bwt_anchor);
	}
	array_list_set_flag(1, mapping_batch->mapping_lists[targets[i]]);
      } else {
	//Seeding between anchors
	//printf("\tFound gap between anchors \n");
	array_list_t *anchors_forward = array_list_new(array_list_size(mapping_batch->mapping_lists[targets[i]]),
						       1.25f, COLLECTION_MODE_ASYNCHRONIZED);
	array_list_t *anchors_backward = array_list_new(array_list_size(mapping_batch->mapping_lists[targets[i]]),
							1.25f, COLLECTION_MODE_ASYNCHRONIZED);
	int big_gap = 0;
	int final_anchor_nt = 0;
	int anchor_nt;
	int anchor_type;
	int anchor_strand;
	for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j -= 2) {
	  bwt_anchor_back = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]);
	  array_list_insert(bwt_anchor_back, anchors_backward);

	  bwt_anchor_forw = array_list_remove_at(j - 1, mapping_batch->mapping_lists[targets[i]]);
	  array_list_insert(bwt_anchor_forw, anchors_forward);

	  if (bwt_anchor_forw->strand == 0) {
	    anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start;
	    gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start));
	  } else {
	    anchor_nt = bwt_anchor_back->end - bwt_anchor_back->start;
	    gap_nt = read->length - (anchor_nt + (bwt_anchor_forw->end - bwt_anchor_forw->start));
	  }
	  if (gap_nt < 0) { gap_nt = 0; }
	  //printf("Gap nt (%i - %i): %i\n", anchor_nt, bwt_anchor_back->end - bwt_anchor_back->start, gap_nt);
	  if (gap_nt > big_gap) {
	    big_gap = gap_nt;
	    final_anchor_nt = anchor_nt;
	    anchor_type = bwt_anchor_back->type;
	    anchor_strand = bwt_anchor_back->strand;
	  }
	}

	printf("%i, %i\n", big_gap - 2, seed_size);
	if (big_gap - 2 > seed_size) {
	  //if (anchor_type == FORWARD_ANCHOR && anchor_strand == 0 || 
	  //  anchor_type == BACKWARD_ANCHOR && anchor_strand == 1 ) {
	    start_search = final_anchor_nt + 1;
	    end_search = final_anchor_nt + big_gap - 1;
	    //} else {
	    // start_search = final_anchor_nt + big_gap - 1;
	    //end_search = final_anchor_nt + 1;
	    //}
	  
	    //printf("Seeding between anchors... gap=%i\n", big_gap);
	    printf("11 bwt_map_exact_seeds_between_coords --> searching from %i to %i\n", start_search, end_search);
	    /*
	    num_mappings = bwt_map_exact_seeds_between_coords(start_search,
							      end_search,
							      read->sequence, seed_size, min_seed_size,
							      input->bwt_optarg_p, 
							      input->bwt_index_p, 
							      mapping_batch->mapping_lists[targets[i]],
							      EXTRA_SEED_NONE,
							      &seed_id);
	    */
	}

	//printf("Making seeds anchors...\n");
	for (int a = 0; a < array_list_size(anchors_forward); a++) {
	  //Insert the last anchor. (Create new seed)
	  bwt_anchor_forw = array_list_get(a, anchors_forward);
	  bwt_anchor_back = array_list_get(a, anchors_backward);

	  anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start;
	  gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start));

	  //printf("\t --> Big Seed: %i, gap_nt: %i, anchor_nt = %i\n", a, gap_nt, anchor_nt);
	  if (gap_nt < 0) {
	    //gap_nt = 0;
	    bwt_anchor_forw->end   += gap_nt;
	    bwt_anchor_back->start -= gap_nt;
	    anchor_nt += gap_nt;
	    gap_nt = 0;
	  } else if (gap_nt == 0) {
	    bwt_anchor_forw->end -= 1;
	    bwt_anchor_back->start += 1;
	    anchor_nt -= 1;
	    gap_nt = 1;	    
	  }


	  region = region_bwt_new(bwt_anchor_forw->chromosome + 1,
				  bwt_anchor_forw->strand,
				  bwt_anchor_forw->start,
				  bwt_anchor_forw->end,
				  0,
				  anchor_nt,
				  read->length,
				  0);
	  //printf("Region: %i-%i\n", region->seq_start, region->seq_end);
	  array_list_insert(region, mapping_batch->mapping_lists[targets[i]]);

	  region = region_bwt_new(bwt_anchor_back->chromosome + 1,
				  bwt_anchor_back->strand,
				  bwt_anchor_back->start,
				  bwt_anchor_back->end,
				  anchor_nt + gap_nt,
				  read->length - 1,
				  read->length,
				  seed_id + 1);
	  //printf("Region: %i-%i\n", region->seq_start, region->seq_end);
	  array_list_insert(region, mapping_batch->mapping_lists[targets[i]]);

	  //printf("\tMaking seeds anchors end, %i seeds\n", array_list_size(mapping_batch->mapping_lists[targets[i]]));

	  bwt_anchor_free(bwt_anchor_back);
	  bwt_anchor_free(bwt_anchor_forw);
	}
	array_list_free(anchors_forward, NULL);
	array_list_free(anchors_backward, NULL);
	//printf("Making seeds anchors end, %i seeds\n", array_list_size(mapping_batch->mapping_lists[targets[i]]));

	array_list_set_flag(2, mapping_batch->mapping_lists[targets[i]]);
      }
      free(anchors_targets);
      targets[new_num_targets++] = targets[i];
    }
  }

  mapping_batch->num_targets = new_num_targets;

  array_list_free(array_list_aux, NULL);

  //if (time_on) { stop_timer(start, end, time); timing_add(time, REGION_SEEKER, timing); }

  //printf("APPLY SEEDING DONE!\n");
  
  return CAL_STAGE;

}
void *file_reader_2(void *input) {
  wf_input_file_t *wf_input = (wf_input_file_t *) input;
  FILE *fd = wf_input->file;
  batch_t *batch = wf_input->batch;
  int pair_mode = batch->pair_input->pair_mng->pair_mode;
  
  const int MAX_READS = 100;
  int num_reads = 0;
  batch_t *new_batch = NULL;

  size_t sizes_to_read[3], head_len, seq_len, num_items;
  size_t tot_size;
  char *buffer, *id, *sequence, *quality;
  size_t bytes;
  unsigned char type;
  array_list_t *reads = array_list_new(MAX_READS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  mapping_batch_t *mapping_batch = mapping_batch_new_2(MAX_READS, 
						       reads,
						       batch->pair_input->pair_mng);
  
  while (1) {
    //[size head][size seq][num items]
    bytes = fread(&type, sizeof(unsigned char), 1, fd);
    if (!bytes) { break; }
 
    //fastq_read_t *fq_read = file_fastq_read_new(&num_items, fd);
    fastq_read_t *fq_read = file_read_fastq_reads(&num_items, fd);
    if (fq_read == NULL) { /*printf("fq NULL\n");*/ break; }
    //printf("(num items %i)\nID : %s\nSEQ: %s\nQUA: %s\n", num_items, fq_read->id, fq_read->sequence, fq_read->quality);

    array_list_insert(fq_read, reads);
    
    mapping_batch->mapping_lists[num_reads] = array_list_new(50,
							     1.25f, 
							     COLLECTION_MODE_ASYNCHRONIZED);
    if (type == CAL_TYPE) {
      //exit(-1);
      //printf("\tCal Report\n");
      file_read_cals(num_items, mapping_batch->mapping_lists[num_reads], 
		     fq_read, fd);      
    } else if (type == META_ALIGNMENT_TYPE) {
      //printf("\tMeta Alignments Report\n");
      file_read_meta_alignments(num_items, mapping_batch->mapping_lists[num_reads], 
				fq_read, fd);            
      array_list_set_flag(BITEM_META_ALIGNMENTS,
			  mapping_batch->mapping_lists[num_reads]);    
    } else {
      //exit(-1);
      //printf("\tAlignments Report\n");
      file_read_alignments(num_items, mapping_batch->mapping_lists[num_reads], 
			   fq_read, fd);
    }

    //printf("W3 file read %i\n", array_list_size(mapping_batch->mapping_lists[num_reads]));
    num_reads++;
    if (num_reads >= MAX_READS) { break; }

  }

  tot_reads2 += num_reads;
  //printf("W3 Reads: %i | %i\n", tot_reads2, num_reads);
  //w3_r += num_reads;
  //printf("W3 Reads: %i\n", w3_r);

  if (num_reads) {
    mapping_batch->num_allocated_targets = num_reads;
    new_batch = batch_new(batch->bwt_input, batch->region_input, batch->cal_input, 
			  batch->pair_input, batch->preprocess_rna, batch->sw_input,
			  batch->writer_input, batch->mapping_mode, mapping_batch); 
  } else {
    mapping_batch_free(mapping_batch);
  }

  extern size_t reads_w3;
  reads_w3 += num_reads;

  return new_batch;

}
/*
fastq_read_t *file_fastq_read_new(size_t *num_items, FILE *fd) {

  size_t sizes_to_read[3], head_len, seq_len;

  head_len  = sizes_to_read[0];
  seq_len   = sizes_to_read[1];
  *num_items = sizes_to_read[2];
  
  int bytes = fread(sizes_to_read, sizeof(size_t), 3, fd);
  if (!bytes) { return NULL; }
  
  int tot_size = head_len + 2*seq_len;
  buffer = (char *)calloc(tot_size + 1, sizeof(char));
  bytes = fread(buffer, sizeof(char), tot_size, fd);
  if (!bytes) { 
    free(buffer);    
    return NULL; 
  }

  char *id = (char *)calloc(head_len + 1, sizeof(char));
  memcpy(id, buffer, head_len);
  //printf("ID : %s\n", id);

  char *sequence = (char *)calloc(seq_len + 1, sizeof(char));  
  memcpy(sequence, &buffer[head_len], seq_len);
  //printf("SEQ: %s\n", sequence);

  char *quality = (char *)calloc(seq_len + 1, sizeof(char));  
  memcpy(quality, &buffer[head_len + seq_len], seq_len);
  //printf("QUA: %s\n", quality);
  
  fastq_read_t *fq_read = fastq_read_new(id, sequence, quality);

  free(buffer);
  free(id);
  free(sequence);
  free(quality);


  return fq_read;

}

int file_cal_fill(size_t num_items, array_list_t *list, FILE *fd) {

  if (!num_items) { return 0; }
  
  bwt_anchor_t bwt_anchors[num_items];
  bytes = fread(bwt_anchors, sizeof(bwt_anchor_t), num_items, fd);
  if (!bytes) { LOG_FATAL("Corrupt file\n"); }
  
  for (int i = 0; i < num_items; i++) {
    //printf("[%i:%lu-%lu]\n", bwt_anchors[i].chromosome, bwt_anchors[i].start, bwt_anchors[i].end);
    size_t seed_size = bwt_anchors[i].end - bwt_anchors[i].start;
    cal_t *cal;
    if (bwt_anchors[i].type == FORWARD_ANCHOR) {
      cal = convert_bwt_anchor_to_CAL(&bwt_anchors[i], 0, seed_size);
    } else {
      cal = convert_bwt_anchor_to_CAL(&bwt_anchors[i], fq_read->length - seed_size - 1, fq_read->length - 1);
    }
    array_list_insert(cal, list); 
  }  
  
  return 0;

}

int file_meta_alignment_fill(size_t num_items, array_list_t *list, FILE *fd) {

  if (!num_items) { return 0; }

  simple_alignment_t simple_alignment[num_items];
  simple_alignment_t *simple_a;
  
  bytes = fread(simple_alignment, sizeof(simple_alignment_t), num_items, fd);
  if (!bytes) { LOG_FATAL("Corrupt file\n"); }
  
  size_t cigar_tot_len = 0;
  for (int i = 0; i < num_items; i++) {
    simple_a = &simple_alignment[i];
    //printf("ITEM %i: (%i)[%i:%lu] [%i-%i]\n", i, simple_a->map_strand, simple_a->map_chromosome,
    //     simple_a->map_start, simple_a->gap_start, simple_a->gap_end);
    cigar_tot_len += simple_a->cigar_len;
  }
    
  char cigar_buffer[cigar_tot_len];
  bytes = fread(cigar_buffer, sizeof(char), cigar_tot_len, fd);
  if (!bytes) { LOG_FATAL("Corrupt file\n"); }

  char cigars_test[num_items][1024];
  size_t actual_read = 0;
  for (int i = 0; i < num_items; i++) {
    simple_a = &simple_alignment[i];
    memcpy(&cigars_test[i], &cigar_buffer[actual_read], simple_a->cigar_len);
    cigars_test[i][simple_a->cigar_len] = '\0';
    actual_read += simple_a->cigar_len;
    //printf("CIGAR %i: %s\n", i, cigars_test[i]);
    size_t map_len = fq_read->length - simple_a->gap_start - simple_a->gap_end;
    //printf("SEED := len_read:%i - gap_read:%i - gap_end:%i = %i, SEED-END = %i\n", fq_read->length, 
    //     simple_a->gap_start, 
    //     simple_a->gap_end, 
    //     map_len, simple_a->gap_start + map_len);
    seed_region_t *s_region = seed_region_new(simple_a->gap_start, 
					      simple_a->gap_start + map_len - 1,
					      simple_a->map_start, 
					      simple_a->map_start + map_len,
					      0);
    
    //printf("Exit with seed [%i:%i]\n", s_region->read_start, s_region->read_end);
    
    linked_list_t *sr_list = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED);
    //s_region->info = cigar_code_new_by_string(cigars_test[i]);
    linked_list_insert(s_region, sr_list);
    
    cal_t *cal = cal_new(simple_a->map_chromosome, 
			 simple_a->map_strand,
			 simple_a->map_start,
			 simple_a->map_start + map_len,
			 1,
			 sr_list,
			 linked_list_new(COLLECTION_MODE_ASYNCHRONIZED));
    cal->info = cigar_code_new_by_string(cigars_test[i]);
    
    meta_alignment_t *meta_alignment = meta_alignment_new();
    array_list_insert(cal, meta_alignment->cals_list);
    array_list_insert(meta_alignment, list);
  }

  return 0;
}

int file_alignment_fill(size_t num_items, array_list_t *list, 
			fastq_read_t *fq_read, FILE *fd) {
  if (!num_items) { return 0; }
  
  alignment_aux_t alignments_aux[num_items];
  alignment_aux_t *alignment_a;
  
  bytes = fread(alignments_aux, sizeof(alignment_aux_t), num_items, fd);
  if (!bytes) { LOG_FATAL("Corrupt file\n"); }

  size_t cigar_tot_len = 0;
  for (int i = 0; i < num_items; i++) {
    alignment_a = &simple_alignment[i];
    //printf("ITEM %i: (%i)[%i:%lu] [%i-%i]\n", i, simple_a->map_strand, simple_a->map_chromosome,
    //     simple_a->map_start, simple_a->gap_start, simple_a->gap_end);
    cigar_tot_len += alignmment_a->cigar_len + alignment_a->optional_field_length;
  }

  char cigars_test[num_items][1024];
  char optional_fields[num_items][1024];
  size_t actual_read = 0;
  for (int i = 0; i < num_items; i++) {
    alignment_a = &alignments_aux[i];
    memcpy(&cigars_test[i], &cigar_buffer[actual_read], alignment_a->cigar_len);
    cigars_test[i][alignment_a->cigar_len] = '\0';
    actual_read += simple_a->cigar_len;
    
    char op;
    char op_value[1024];
    int c = 0;
    int hc_start = 0, hc_end;
    for (int j = 0; j < alignment_a->cigar_len; j++) {
      op = cigars_test[j];
      if (op < 58) {
	op_value[c++] = op;
      } else {
	op_value[c] = '\0';
	if (op == 'H') {
	  hc_start = atoi(op_value);
	}
	break;
      }
    }

    if (cigars_test[alignment_a->cigar_len - 1] == 'H') {
      for (int j = alignment_a->cigar_len - 2; j >= 0; j--) {
	op = cigars_test[j];
	if (op < 58) {
	  op_value[c++] = op;
	} else {
	  op_value[c] = '\0';
	  int len = strlen(op_value);
	  char op_val_aux[len];
	  int pos = len - 1;
	  for (int j = 0; j < len; j++) {	    
	    op_val_aux[j] = op_value[pos - j];
	  } 
	  hc_end = atoi(op_val_aux);
	  break;
	}
      }
    }

    memcpy(&optional_fields[i], &cigar_buffer[actual_read], alignment_a->optional_fields_length);
    optional_fields[i][alignment_a->optional_fields_length] = '0';
    actual_read += alignment_a->optional_fields_length;

    int header_len = strlen(fq_read->id);
    char header_id[header_len + 1];
    get_to_first_blank(fq_read->id, header_len, header_id);
    //char *header_match = (char *)malloc(sizeof(char)*header_len);
    //memcpy(header_match, header_id, header_len);

    int len_read = fq_read->length - (hc_start + hc_end);
    char *quality = (char *) calloc (len_read + 1, sizeof(char));
    strncpy(quality, fq_read->quality + hc_start, len_read);
    char *query = (char *) calloc (len_read + 1, sizeof(char));
    strncpy(query, fq_read->query + hc_start, len_read);

    //Revisar rna_Server get_to_first_blank header copy
    alignment_t *alignment = alignment_new();
    alignment_init_single_end(strdup(header_id),
			      query,
			      quality,
			      alignment_a->seq_strand, 
			      alignment_a->chromosome, 
			      alignment_a->position,
			      strdup(cigars_test[i]),
			      alignment_a->num_cigar_operations,
			      alignment_a->map_quality, 
			      1, 
			      num_items < 1,
			      alignment_a->optional_fields_length,
			      strdup(optional_fields[i]), 
			      alignment);
    
    array_list_insert(alignment, list);
  }  

  return 0;

}
*/
void *file_reader(void *input) {
  wf_input_file_t *wf_input = (wf_input_file_t *) input;
  FILE *fd = wf_input->file;
  batch_t *batch = wf_input->batch;
  int pair_mode = batch->pair_input->pair_mng->pair_mode;

  const int MAX_READS = 100;
  int num_reads = 0;
  batch_t *new_batch = NULL;

  size_t tot_size;
  size_t num_items;
  char *buffer, *id, *sequence, *quality;
  size_t bytes;
  unsigned char type;
  array_list_t *reads = array_list_new(MAX_READS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  mapping_batch_t *mapping_batch = mapping_batch_new_2(MAX_READS, 
						       reads,
						       batch->pair_input->pair_mng);  
  while (1) {
    //[type][size head][size seq][num items]
    bytes = fread(&type, sizeof(unsigned char), 1, fd);
    if (!bytes) { break; }
 
    fastq_read_t *fq_read = file_read_fastq_reads(&num_items, fd);
    if (fq_read == NULL) { break; }
    
    mapping_batch->mapping_lists[num_reads] = array_list_new(50,
							     1.25f, 
							     COLLECTION_MODE_ASYNCHRONIZED);
    //printf("(num items %i)\nID : %s\nSEQ: %s\nQUA: %s\n", num_items, fq_read->id, fq_read->sequence, fq_read->quality);

    array_list_insert(fq_read, reads);
    
    if (type == CAL_TYPE) {
      //printf("\tCal Report\n");
      file_read_cals(num_items, mapping_batch->mapping_lists[num_reads], 
		     fq_read, fd);      
      array_list_set_flag(BITEM_SINGLE_ANCHORS, 
			  mapping_batch->mapping_lists[num_reads]);
    } else if (type == META_ALIGNMENT_TYPE) {
      //printf("\tMeta Alignments Report\n");
      array_list_set_flag(BITEM_META_ALIGNMENTS, 
			  mapping_batch->mapping_lists[num_reads]);
      file_read_meta_alignments(num_items, mapping_batch->mapping_lists[num_reads], 
				fq_read, fd);            
    } else {
      //printf("\tAlignments Report\n");
      file_read_alignments(num_items, mapping_batch->mapping_lists[num_reads], 
			   fq_read, fd);
    }

    /*if (strcmp("@ENST00000496771@ENSG00000000003@processed_transcript@X@99887538@99891686@-1@KNOWN_518_447_1_0_0_0_4:0:0_3:0:0_3/1",
	       fq_read->id) == 0) {
      exit(-1);
      }*/
            
    num_reads++;
    if (num_reads >= MAX_READS) { break; }

  }

  //w2_r += num_reads;
  //printf("W2 Reads: %i\n", w2_r);

  if (num_reads) {
    mapping_batch->num_allocated_targets = num_reads;
    new_batch = batch_new(batch->bwt_input, batch->region_input, batch->cal_input, 
			  batch->pair_input, batch->preprocess_rna, batch->sw_input,
			  batch->writer_input, batch->mapping_mode, mapping_batch); 
  } else {
    //array_list_free(reads, NULL);
    mapping_batch_free(mapping_batch);
  }

  extern size_t reads_w2;
  reads_w2 += num_reads;


  return new_batch;

}