/**
 * Region-seeker pipeline stage.
 *
 * Drains input_p->unmapped_read_list_p until list_remove_item() returns NULL.
 * For every batch of reads it creates one mapping list per read, fills the
 * lists either through the GPU mapper (only when compiled with HPG_GPU) or
 * through bwt_map_exact_seeds_seq() in an OpenMP parallel loop, wraps the
 * result in a cal_batch_t and pushes it onto input_p->region_list_p.
 * When the input list is exhausted it unregisters itself as a writer of the
 * output list and, if statistics are enabled, records the number of batches
 * and reads processed.
 *
 * The array of mapping lists is handed over to cal_batch_new(); it is not
 * freed here.
 *
 * @param input_p  Stage configuration: input/output lists, BWT index and
 *                 options, CAL options, thread count and GPU settings.
 */
void region_seeker_server(region_seeker_input_t *input_p){

  printf("region_seeker_server(%d): START\n", omp_get_thread_num());

  // Accumulated for debugging only; the statistics below do not report it.
  size_t total_mappings = 0;
  size_t num_batches = 0;
  size_t total_reads = 0;

  // A thread count of zero is not a valid argument for omp_set_num_threads()
  // and would make the chunk-size computation below divide by zero.
  size_t num_threads = (input_p->region_threads > 0) ? input_p->region_threads : 1;

  omp_set_num_threads(num_threads);

  list_item_t *item_p = NULL;
  while ( (item_p = list_remove_item(input_p->unmapped_read_list_p)) != NULL ) {

    num_batches++;
    if (time_on) { timing_start(REGION_SEEKER, 0, timing_p); }

    fastq_batch_t *unmapped_batch_p = (fastq_batch_t *)item_p->data_p;
    size_t num_reads = unmapped_batch_p->num_reads;
    total_reads += num_reads;

    // calloc() also guards the num_reads * element-size multiplication.
    array_list_t **allocate_mapping_p = calloc(num_reads, sizeof(*allocate_mapping_p));
    if (allocate_mapping_p == NULL && num_reads > 0) {
      fprintf(stderr, "region_seeker_server: cannot allocate %zu mapping lists\n", num_reads);
      exit(EXIT_FAILURE);
    }

    if (input_p->gpu_enable) {
      //******************************* GPU PROCESS *********************************//
      for (size_t i = 0; i < num_reads; i++) {
        allocate_mapping_p[i] = array_list_new(1000,
                                               1.25f,
                                               COLLECTION_MODE_ASYNCHRONIZED);
      }
      // Without HPG_GPU the lists are forwarded empty: no mapper is run.
      #ifdef HPG_GPU
      total_mappings += bwt_map_exact_seed_batch_gpu(unmapped_batch_p,
                                                     input_p->bwt_optarg_p,
                                                     input_p->cal_optarg_p,
                                                     input_p->bwt_index_p,
                                                     input_p->gpu_context,
                                                     allocate_mapping_p);
      #endif
      //****************************************************************************//
    } else {
      //******************************* CPU PROCESS *********************************//
      // Roughly ten chunks per thread, never less than one read per chunk.
      size_t chunk = MAX(1, num_reads/(num_threads*10));

      #pragma omp parallel for reduction(+:total_mappings) schedule(dynamic, chunk)
      for (size_t i = 0; i < num_reads; i++) {
        allocate_mapping_p[i] = array_list_new(1000,
                                               1.25f,
                                               COLLECTION_MODE_ASYNCHRONIZED);

        total_mappings += bwt_map_exact_seeds_seq(&(unmapped_batch_p->seq[unmapped_batch_p->data_indices[i]]),
                                                  input_p->cal_optarg_p->seed_size,
                                                  input_p->cal_optarg_p->min_seed_size,
                                                  input_p->bwt_optarg_p, input_p->bwt_index_p, allocate_mapping_p[i]);
      }
      //****************************************************************************//
    }

    cal_batch_t *cal_batch_p = cal_batch_new(allocate_mapping_p, unmapped_batch_p);

    // Only the list node is released; its payload now belongs to cal_batch_p.
    list_item_free(item_p);
    list_item_t *cal_item_p = list_item_new(0, 0, cal_batch_p);

    if (time_on) { timing_stop(REGION_SEEKER, 0, timing_p); }

    list_insert_item(cal_item_p, input_p->region_list_p);

  } //End of while

  // Tell the consumers of region_list_p that this producer has finished.
  list_decr_writers(input_p->region_list_p);

  if (statistics_on) {
    statistics_set(REGION_SEEKER_ST, 0, num_batches, statistics_p);
    statistics_set(REGION_SEEKER_ST, 1, total_reads, statistics_p);
  }

  printf("region_seeker_server: END\n");

}
/**
 * Command-line test driver for the BWT mapper.
 *
 * Usage: <program> index-dirname seq num-errors
 *
 * Loads the index found in index-dirname, then
 *   1. maps seq with bwt_map_seq() and prints every alignment found, and
 *   2. encodes seq with replaceBases(), searches exact seeds with
 *      bwt_map_exact_seeds_seq() (seed size 18, minimum seed size 16) and
 *      prints every region found.
 *
 * Exits with -1 when the arguments are missing or num-errors is not a number.
 * The index, the options and the mapping lists are not released; the process
 * ends right after using them.
 */
int main(int argc, char *argv[]) {

  if (argc != 4) {
    printf("Error.\n");
    printf("Usage: %s index-dirname seq num-errors\n", argv[0]);
    exit(-1);
  }

  char *index_dirname = argv[1];
  char *seq = argv[2];

  // strtol() instead of atoi(): reject input that is not entirely a number
  // rather than silently mapping with 0 errors.
  char *parse_end = NULL;
  long parsed_errors = strtol(argv[3], &parse_end, 10);
  if (parse_end == argv[3] || *parse_end != '\0') {
    printf("Error.\n");
    printf("Usage: %s index-dirname seq num-errors\n", argv[0]);
    exit(-1);
  }
  int num_errors = (int) parsed_errors;

  // initializations
  initReplaceTable();

  bwt_optarg_t *bwt_optarg = bwt_optarg_new(num_errors, 1, 10000, 1, 0, 0);
  bwt_index_t *bwt_index = bwt_index_new(index_dirname);

  // seq: map the full sequence and dump the alignments
  {
    array_list_t *mapping_list = array_list_new(100000, 1.25f,
                                                COLLECTION_MODE_SYNCHRONIZED);

    size_t num_mappings = bwt_map_seq(seq, bwt_optarg,
                                      bwt_index, mapping_list);
    printf("seq: %s\n", seq);
    printf("num_mappings = %zu\n", num_mappings);
    for (size_t i = 0; i < num_mappings; i++) {
      printf("%zu\t---------------------\n", i);
      alignment_print(array_list_get(i, mapping_list));
    }
  }

  // seed: search exact seeds over the encoded sequence and dump the regions
  {
    array_list_t *mapping_list = array_list_new(100000, 1.25f,
                                                COLLECTION_MODE_SYNCHRONIZED);

    size_t len = strlen(seq);
    // Zero-filled and 10 bytes longer than the sequence, as in the original buffer.
    char *code_seq = calloc(len + 10, sizeof(char));
    if (code_seq == NULL) {
      fprintf(stderr, "Error: cannot allocate %zu bytes for the encoded sequence.\n", len + 10);
      exit(-1);
    }
    replaceBases(seq, code_seq, len);

    size_t num_mappings = bwt_map_exact_seeds_seq(code_seq, 18, 16,
                                                  bwt_optarg, bwt_index, mapping_list);

    for (size_t i = 0; i < num_mappings; i++) {
      region_t *region = array_list_get(i, mapping_list);
      printf("Region: chr = %lu, strand = %d, start = %lu, end = %lu\n",
             region->chromosome_id, region->strand, region->start, region->end);
    }

    free(code_seq);
  }

  printf("Done.\n");

  return 0;
}
Exemple #3
0
/**
 * Seeding stage of the mapping pipeline.
 *
 * Walks the reads listed in batch->mapping_batch->targets and rewrites each
 * read's mapping list in place, depending on the flag currently stored in
 * that list:
 *
 *  - Flag 0 or 1: every anchor in the list is released and the whole read is
 *    seeded with bwt_map_exact_seeds_seq(). The read is kept as a target (and
 *    its list flagged 0) only when at least one seed mapping was produced.
 *
 *  - Any other flag: the list is read as consecutive (forward, backward)
 *    anchor pairs stored at indices (b, b + 1).
 *      * If for at least one pair the genome gap and the read gap differ by
 *        less than min_intron_size, every pair that does not satisfy this is
 *        removed and freed, the remaining anchors stay in the list and the
 *        list is flagged 1.
 *      * Otherwise every pair is replaced by two regions created with
 *        region_bwt_new() and the list is flagged 2.
 *    In both cases the read is kept as a target.
 *
 * Kept targets are compacted to the front of mapping_batch->targets,
 * mapping_batch->num_targets is updated accordingly and
 * mapping_batch->num_to_do is reset to 0.
 *
 * The process is terminated if the scratch array for the anchor-pair scan
 * cannot be allocated.
 *
 * @param input  Seeder configuration; cal_optarg_p (seed sizes), bwt_optarg_p
 *               and bwt_index_p are read.
 * @param batch  Batch whose mapping_batch is updated in place.
 * @return       Always CAL_STAGE.
 */
int apply_seeding(region_seeker_input_t* input, batch_t *batch) {
  mapping_batch_t *mapping_batch = batch->mapping_batch;

  int seed_size = input->cal_optarg_p->seed_size;
  size_t min_seed_size = input->cal_optarg_p->min_seed_size;

  size_t num_targets = mapping_batch->num_targets;
  size_t *targets = mapping_batch->targets;
  size_t new_num_targets = 0;

  const int min_intron_size = 40;

  // set to zero
  mapping_batch->num_to_do = 0;

  //TODO: omp parallel for !!
  for (size_t i = 0; i < num_targets; i++) {
    // targets[] is compacted in place below, but only at positions <= i, so
    // the index of the current read can be cached safely.
    size_t read_index = targets[i];
    fastq_read_t *read = array_list_get(read_index, mapping_batch->fq_batch);
    array_list_t *mapping_list = mapping_batch->mapping_lists[read_index];
    int flag = array_list_get_flag(mapping_list);

    if (flag == 0 || flag == 1) {
      // Normal seeding. The anchors are not used any further, so release them
      // here; parking them in a side list that is later freed without an item
      // destructor would leak every anchor.
      // NOTE: flag 1 is deliberately routed here as well; the separate
      // "one anchor" branch that used to follow could never be reached and
      // has been removed.
      for (int j = (int) array_list_size(mapping_list) - 1; j >= 0; j--) {
        bwt_anchor_t *anchor = array_list_remove_at(j, mapping_list);
        bwt_anchor_free(anchor);
      }

      size_t num_mappings = bwt_map_exact_seeds_seq(0,
                                                    0,
                                                    read->sequence,
                                                    seed_size,
                                                    min_seed_size,
                                                    input->bwt_optarg_p,
                                                    input->bwt_index_p,
                                                    mapping_list,
                                                    0);
      if (num_mappings > 0) {
        array_list_set_flag(0, mapping_list);
        targets[new_num_targets++] = read_index;
      }
      continue;
    }

    // Pairs of anchors found
    printf("***** Case 2. Double anchor found!\n");

    int num_anchors = (int) array_list_size(mapping_list);

    // Start index of every pair whose genome/read gap mismatch is too large.
    int *distant_pairs = calloc(num_anchors, sizeof(*distant_pairs));
    if (distant_pairs == NULL && num_anchors > 0) {
      fprintf(stderr, "apply_seeding: cannot allocate %d anchor pair indices\n", num_anchors);
      exit(EXIT_FAILURE);
    }
    int num_distant = 0;
    int found = 0;

    // Search if one anchor pair is at the same distance in the reference and in the read
    for (int b = 0; b < num_anchors; b += 2) {
      bwt_anchor_t *anchor_forw = array_list_get(b, mapping_list);
      bwt_anchor_t *anchor_back = array_list_get(b + 1, mapping_list);

      int read_nt = read->length - ((anchor_forw->end - anchor_forw->start) + (anchor_back->end - anchor_back->start));
      int genome_nt = anchor_back->start - anchor_forw->end;
      int distance = abs(genome_nt - read_nt);

      if (distance < min_intron_size) {
        found = 1;
      } else {
        distant_pairs[num_distant++] = b;
      }
    }

    if (found) {
      // Keep only the close pairs. Remove from the highest index downwards
      // (and the backward anchor before the forward one) so that the
      // remaining recorded indices stay valid.
      for (int t = num_distant - 1; t >= 0; t--) {
        int pair_start = distant_pairs[t];

        bwt_anchor_t *anchor = array_list_remove_at(pair_start + 1, mapping_list);
        bwt_anchor_free(anchor);

        anchor = array_list_remove_at(pair_start, mapping_list);
        bwt_anchor_free(anchor);
      }
      array_list_set_flag(1, mapping_list);
    } else {
      // No close pair: turn every anchor pair into a pair of seed regions.
      array_list_t *anchors_forward = array_list_new(num_anchors,
                                                     1.25f, COLLECTION_MODE_ASYNCHRONIZED);
      array_list_t *anchors_backward = array_list_new(num_anchors,
                                                      1.25f, COLLECTION_MODE_ASYNCHRONIZED);
      int big_gap = 0;
      int final_anchor_nt = 0;
      // Stays 0 while the seeding of the gap itself is disabled (see below).
      int seed_id = 0;

      // Empty the mapping list pair by pair, remembering the widest read gap.
      for (int j = num_anchors - 1; j >= 0; j -= 2) {
        bwt_anchor_t *anchor_back = array_list_remove_at(j, mapping_list);
        array_list_insert(anchor_back, anchors_backward);

        bwt_anchor_t *anchor_forw = array_list_remove_at(j - 1, mapping_list);
        array_list_insert(anchor_forw, anchors_forward);

        int anchor_nt;
        int gap_nt;
        if (anchor_forw->strand == 0) {
          anchor_nt = anchor_forw->end - anchor_forw->start;
          gap_nt = read->length - (anchor_nt + (anchor_back->end - anchor_back->start));
        } else {
          anchor_nt = anchor_back->end - anchor_back->start;
          gap_nt = read->length - (anchor_nt + (anchor_forw->end - anchor_forw->start));
        }
        if (gap_nt < 0) { gap_nt = 0; }

        if (gap_nt > big_gap) {
          big_gap = gap_nt;
          final_anchor_nt = anchor_nt;
        }
      }

      printf("%i, %i\n", big_gap - 2, seed_size);
      if (big_gap - 2 > seed_size) {
        int start_search = final_anchor_nt + 1;
        int end_search = final_anchor_nt + big_gap - 1;

        // Only the search window is reported: no seeds are searched inside
        // the gap in this version.
        printf("11 bwt_map_exact_seeds_between_coords --> searching from %i to %i\n", start_search, end_search);
      }

      size_t num_pairs = array_list_size(anchors_forward);
      for (size_t a = 0; a < num_pairs; a++) {
        bwt_anchor_t *anchor_forw = array_list_get(a, anchors_forward);
        bwt_anchor_t *anchor_back = array_list_get(a, anchors_backward);

        int anchor_nt = anchor_forw->end - anchor_forw->start;
        int gap_nt = read->length - (anchor_nt + (anchor_back->end - anchor_back->start));

        if (gap_nt < 0) {
          // The anchors overlap in the read: trim both by the overlap.
          anchor_forw->end   += gap_nt;
          anchor_back->start -= gap_nt;
          anchor_nt += gap_nt;
          gap_nt = 0;
        } else if (gap_nt == 0) {
          // The anchors touch: open a one-nucleotide gap between them.
          anchor_forw->end -= 1;
          anchor_back->start += 1;
          anchor_nt -= 1;
          gap_nt = 1;
        }

        // Seed covering the start of the read, built from the forward anchor.
        region_t *region = region_bwt_new(anchor_forw->chromosome + 1,
                                          anchor_forw->strand,
                                          anchor_forw->start,
                                          anchor_forw->end,
                                          0,
                                          anchor_nt,
                                          read->length,
                                          0);
        array_list_insert(region, mapping_list);

        // Seed covering the end of the read, built from the backward anchor.
        region = region_bwt_new(anchor_back->chromosome + 1,
                                anchor_back->strand,
                                anchor_back->start,
                                anchor_back->end,
                                anchor_nt + gap_nt,
                                read->length - 1,
                                read->length,
                                seed_id + 1);
        array_list_insert(region, mapping_list);

        bwt_anchor_free(anchor_back);
        bwt_anchor_free(anchor_forw);
      }
      // The anchors were freed one by one above; release only the containers.
      array_list_free(anchors_forward, NULL);
      array_list_free(anchors_backward, NULL);

      array_list_set_flag(2, mapping_list);
    }

    free(distant_pairs);
    targets[new_num_targets++] = read_index;
  }

  mapping_batch->num_targets = new_num_targets;

  return CAL_STAGE;

}