Пример #1
0
/**
 * Bisulfite mapping stage that closes the gaps left between seeds and then
 * converts every CAL whose score passes the filter into an alignment.
 *
 * The function works on both mapping lists of the batch
 * (mapping_lists/targets and mapping_lists2/targets2). For each target read
 * the CAL list is freed with cal_free and replaced, in the same slot, by a
 * newly created list of alignments.
 *
 * @param input  stage input; genome1_p, genome2_p, sw_optarg and min_score
 *               are read from it
 * @param batch  batch whose mapping_batch is processed in place
 * @return BS_POST_PAIR_STAGE, the identifier of the next stage
 */
int apply_sw_bs_4nt(sw_server_input_t* input, batch_t *batch) {

  mapping_batch_t *mapping_batch = batch->mapping_batch;
  genome_t *genome1 = input->genome1_p;
  genome_t *genome2 = input->genome2_p;
  sw_optarg_t *sw_optarg = &input->sw_optarg;

  // NOTE(review): debugging leftover that dumps a hard-coded window of the
  // first chromosome of both genomes on every call; consider removing it.
  {
    // zero-filled so that printing the buffer with %s is well defined even
    // if the reader does not terminate it
    char r[1024] = {0};
    size_t start = 169312417;
    size_t end = start + 99;
    genome_read_sequence_by_chr_index(r, 0,
                                      0, &start, &end, genome2);
    printf("+++++++++++++ genome2 = %s \n", r);
    genome_read_sequence_by_chr_index(r, 0,
                                      0, &start, &end, genome1);
    printf("+++++++++++++ genome1 = %s \n", r);
  }

  // fill gaps between seeds; note that the order of the two genome arguments
  // is swapped between fill_gaps_bs and fill_end_gaps_bs, and swapped again
  // between the two passes (last argument 1, then 0)
  fill_gaps_bs(mapping_batch, sw_optarg, genome2, genome1, 20, 5, 1);
  merge_seed_regions_bs(mapping_batch, 1);
  fill_end_gaps_bs(mapping_batch, sw_optarg, genome1, genome2, 20, 400, 1);

  fill_gaps_bs(mapping_batch, sw_optarg, genome1, genome2, 20, 5, 0);
  merge_seed_regions_bs(mapping_batch, 0);
  fill_end_gaps_bs(mapping_batch, sw_optarg, genome2, genome1, 20, 400, 0);

  // now we can create the alignments
  fastq_read_t *read;
  array_list_t *fq_batch = mapping_batch->fq_batch;

  char *match_seq, *match_qual;
  size_t read_index, read_len, match_len, match_start;

  cal_t *cal;
  array_list_t *cal_list = NULL;
  size_t num_cals;

  seed_region_t *s;
  cigar_code_t *cigar_code;
  cigar_op_t *first_op;

  float norm_score, min_score = input->min_score;

  alignment_t *alignment;
  array_list_t *alignment_list;

  char *p, *optional_fields;
  int optional_fields_length, AS, NH;

  array_list_t **mapping_lists;
  size_t num_targets;
  size_t *targets;

  for (int bs_id = 0; bs_id < 2; bs_id++) {

    // select the mapping lists and targets handled in this pass
    if (bs_id == 0) {
      mapping_lists = mapping_batch->mapping_lists;
      num_targets = mapping_batch->num_targets;
      targets = mapping_batch->targets;
    } else {
      mapping_lists = mapping_batch->mapping_lists2;
      num_targets = mapping_batch->num_targets2;
      targets = mapping_batch->targets2;
    }

    for (size_t i = 0; i < num_targets; i++) {
      read_index = targets[i];
      read = (fastq_read_t *) array_list_get(read_index, fq_batch);

      cal_list = mapping_lists[read_index];
      num_cals = array_list_size(cal_list);

      if (num_cals <= 0) continue;

      read_len = read->length;

      alignment_list = array_list_new(num_cals, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

      // processing each CAL from this read
      for (size_t j = 0; j < num_cals; j++) {

        // get cal; CALs without seed regions produce no alignment
        cal = array_list_get(j, cal_list);
        if (cal->sr_list->size == 0) continue;

        // the cigar of the CAL hangs from its first seed region
        s = (seed_region_t *) linked_list_get_first(cal->sr_list);
        cigar_code = (cigar_code_t *) s->info;

        norm_score = cigar_code_get_score(read_len, cigar_code);
        LOG_DEBUG_F("score = %0.2f\n", norm_score);

        // filter by SW score
        if (norm_score > min_score) {

          // update cigar and sequence and quality strings
          cigar_code_update(cigar_code);
          LOG_DEBUG_F("\tcigar code = %s\n", new_cigar_code_string(cigar_code));
          match_len = cigar_code_nt_length(cigar_code);
          first_op = cigar_code_get_first_op(cigar_code);
          // a leading 'H' operation shifts the start of the matched part
          match_start = (first_op && first_op->name == 'H' ? first_op->number : 0);

          match_seq = (char *) malloc((match_len + 1) * sizeof(char));
          memcpy(match_seq, &read->sequence[match_start], match_len);
          match_seq[match_len] = 0;

          match_qual = (char *) malloc((match_len + 1) * sizeof(char));
          memcpy(match_qual, &read->quality[match_start], match_len);
          match_qual[match_len] = 0;

          // set optional fields: three records, each a 3-byte tag followed
          // by the raw bytes of an int
          optional_fields_length = 100;
          optional_fields = (char *) calloc(optional_fields_length, sizeof(char));

          p = optional_fields;

          // scale first, truncate afterwards: casting norm_score alone
          // would truncate it before the multiplication
          AS = (int) (norm_score * 100);
          // num_cals is a size_t; go through an int so that the bytes copied
          // below are the value itself on every byte order
          NH = (int) num_cals;

          memcpy(p, "ASi", 3);
          p += 3;
          memcpy(p, &AS, sizeof(int));
          p += sizeof(int);

          memcpy(p, "NHi", 3);
          p += 3;
          memcpy(p, &NH, sizeof(int));
          p += sizeof(int);

          memcpy(p, "NMi", 3);
          p += 3;
          memcpy(p, &cigar_code->distance, sizeof(int));
          p += sizeof(int);

          assert(read->length == cigar_code_nt_length(cigar_code));

          // create an alignment and insert it into the list
          alignment = alignment_new();

          // read id up to the first blank (see get_to_first_blank)
          size_t header_len = strlen(read->id);
          char *head_id = (char *) malloc(header_len + 1);

          get_to_first_blank(read->id, header_len, head_id);

          alignment_init_single_end(head_id, match_seq, match_qual,
                                    cal->strand, cal->chromosome_id - 1, cal->start - 1,
                                    new_cigar_code_string(cigar_code),
                                    cigar_code_get_num_ops(cigar_code),
                                    norm_score * 254, 1, (num_cals > 1),
                                    optional_fields_length, optional_fields, alignment);

          array_list_insert(alignment, alignment_list);

          LOG_DEBUG_F("creating alignment (bs_id = %i)...\n", bs_id);
        }
      }

      // free the cal list, and update the mapping list with the alignment list
      array_list_free(cal_list, (void *) cal_free);
      mapping_lists[read_index] = alignment_list;
    }
  }

  // go to the next stage
  return BS_POST_PAIR_STAGE;
}
Пример #2
0
/**
 * Extends the seeds of a CAL towards the uncovered parts of its read.
 *
 * Three steps, in seed-list order:
 *   1. the first seed is extended to the left when it does not start at
 *      read position 0;
 *   2. for every pair of consecutive seeds, the earlier one is extended to
 *      the right and, if a gap remains, the later one to the left;
 *   3. the last seed is extended to the right when it does not reach the
 *      end of the read.
 *
 * cal->start and cal->end are set from the genome coordinates of the first
 * and last seed respectively. Nothing is done when the seed list is empty.
 *
 * @param cal       CAL whose seed list is extended in place
 * @param sa_index  index handed over to extend_to_left / extend_to_right
 */
void extend_seeds(seed_cal_t *cal, sa_index3_t *sa_index) {
  alig_out_t alig_out;
  fastq_read_t *read = cal->read;

  linked_list_item_t *node;
  seed_t *left, *right;

  size_t from, to;
  int pending;

  int num_seeds = cal->seed_list->size;
  if (num_seeds <= 0) return;

  // step 1: leading seed
  left = linked_list_get_first(cal->seed_list);
  cal->start = left->genome_start;
  if (left->read_start > 0) {
    extend_to_left(left, left->read_start, cal, sa_index, &alig_out);
    update_seed_left(&alig_out, left, cal);
    cal->start = left->genome_start;
    cigar_clean(&alig_out.cigar);
  }

  // step 2: gaps between consecutive seeds; 'left' always trails 'right'
  right = left;
  for (node = cal->seed_list->first->next; node != NULL;
       left = right, node = node->next) {
    right = node->item;

    // read positions not covered by either seed
    from = left->read_end + 1;
    to = right->read_start - 1;
    pending = abs(to - from + 1);

    if (pending <= 0) continue;

    // try to close the gap from the left-hand seed first
    extend_to_right(left, pending, cal, sa_index, &alig_out);
    if (alig_out.match) {
      update_seed_right(&alig_out, left, cal);

      // the left-hand seed may have grown, so measure the gap again
      from = left->read_end + 1;
      to = right->read_start - 1;
      pending = abs(to - from + 1);
    }

    if (pending <= 0) continue;

    // whatever is left is tried from the right-hand seed
    extend_to_left(right, pending, cal, sa_index, &alig_out);
    if (alig_out.match) {
      update_seed_left(&alig_out, right, cal);
    }
  }

  // step 3: trailing seed (the last one visited above, or the only one)
  cal->end = right->genome_end;
  if (right->read_end < read->length - 1) {
    extend_to_right(right, read->length - right->read_end - 1, cal, sa_index, &alig_out);
    update_seed_right(&alig_out, right, cal);
    cal->end = right->genome_end;
    cigar_clean(&alig_out.cigar);
  }
}
Пример #3
0
/**
 * Tries to resolve the 'H' operations found at either end of the cigar of
 * each CAL by comparing the clipped read bases with the reference.
 *
 * For every target read and every CAL with at least one seed region, the
 * first and the last cigar operation are inspected. An 'H' operation longer
 * than min_H is compared base by base against the genome:
 *   - fewer than min_distance mismatches: the operation is renamed to 'M'
 *     and the mismatches are added to the cigar distance;
 *   - otherwise LOG_FATAL_F reports that Smith-Waterman would be required
 *     (that path is not implemented here).
 *
 * @param mapping_batch  batch whose CALs are updated in place
 * @param sw_optarg      not used by this function
 * @param genome         reference the gap sequences are read from
 * @param min_H          an 'H' operation must be longer than this to be handled
 * @param min_distance   mismatch count below which the gap becomes a match
 */
void fill_end_gaps(mapping_batch_t *mapping_batch, sw_optarg_t *sw_optarg, 
		   genome_t *genome, int min_H, int min_distance) {

  int sw_count = 0;

  fastq_read_t *fq_read;
  array_list_t *fq_batch = mapping_batch->fq_batch;

  size_t read_index, read_len;

  cal_t *cal;
  array_list_t *cal_list = NULL;
  size_t num_cals, num_targets = mapping_batch->num_targets;

  char *seq, *revcomp_seq = NULL;

  seed_region_t *s;

  cigar_op_t *cigar_op;
  cigar_code_t *cigar_code;

  size_t start, end;
  size_t gap_read_start, gap_read_end, gap_read_len;
  size_t gap_genome_start, gap_genome_end, gap_genome_len;

  int first, last, mode, distance;

  char *ref;

  for (size_t i = 0; i < num_targets; i++) {

    read_index = mapping_batch->targets[i];
    fq_read = (fastq_read_t *) array_list_get(read_index, fq_batch);

    cal_list = mapping_batch->mapping_lists[read_index];
    num_cals = array_list_size(cal_list);

    if (num_cals <= 0) continue;

    read_len = fq_read->length;
    // reverse complement of this read, built lazily and shared by its CALs
    revcomp_seq = NULL;

    // processing each CAL from this read
    for (size_t j = 0; j < num_cals; j++) {

      // get cal and read index
      cal = array_list_get(j, cal_list);
      if (cal->sr_list->size == 0) continue;

      s = (seed_region_t *) linked_list_get_first(cal->sr_list);
      cigar_code = (cigar_code_t *) s->info;
      LOG_DEBUG_F("CAL #%i of %i (strand %i), sr_list size = %i, cigar = %s (distance = %i)\n", 
		  (int) j, (int) num_cals, cal->strand, cal->sr_list->size, new_cigar_code_string(cigar_code), cigar_code->distance);

      // side 0 looks at the first cigar operation, side 1 at the last one
      for (int side = 0; side < 2; side++) {
	mode = NONE_POS;
	if (side == 0) {
	  if ((cigar_op = cigar_code_get_op(0, cigar_code)) &&
	      cigar_op->name == 'H' && cigar_op->number > min_H) {
	    LOG_DEBUG_F("%i%c\n", cigar_op->number, cigar_op->name);

	    mode = BEGIN_POS;
	    gap_read_start = 0;
	    gap_read_end = cigar_op->number - 1;
	    gap_genome_start = s->genome_start;
	    gap_genome_end = gap_genome_start + cigar_op->number - 1;
	  }
	} else {
	  if ((cigar_op = cigar_code_get_last_op(cigar_code)) &&
	      cigar_op->name == 'H' && cigar_op->number > min_H) {
	    LOG_DEBUG_F("%i%c\n", cigar_op->number, cigar_op->name);

	    mode = END_POS;
	    gap_read_start = read_len - cigar_op->number;
	    gap_read_end = read_len - 1;
	    gap_genome_end = s->genome_end;
	    gap_genome_start = gap_genome_end - cigar_op->number + 1;
	  }
	}

	if (mode == NONE_POS) continue;

	// get query sequence, revcomp if necessary
	if (cal->strand) {
	  if (revcomp_seq == NULL) {
	    revcomp_seq = strdup(fq_read->sequence);
	    seq_reverse_complementary(revcomp_seq, read_len);
	  }
	  seq = revcomp_seq;
	} else {
	  seq = fq_read->sequence;
	}

	gap_read_len = gap_read_end - gap_read_start + 1;

	// get ref. sequence
	start = gap_genome_start;
	end = gap_genome_end;
	gap_genome_len = end - start + 1;
	ref = (char *) malloc((gap_genome_len + 1) * sizeof(char));
	genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
					  &start, &end, genome);
	ref[gap_genome_len] = '\0';

	// count mismatches and remember where the first and last one fall
	first = -1; 
	last = -1;
	distance = 0;
	for (size_t g = 0, r = gap_read_start; g < gap_read_len; g++, r++) {
	  if (seq[r] != ref[g]) {
	    distance++;
	    if (first == -1) first = (int) g;
	    last = (int) g;
	  }
	}

	// the reference window is only needed for the comparison above;
	// releasing it here covers both outcomes below
	free(ref);

	if (distance < min_distance) {
	  // close enough: treat the clipped bases as a match with mismatches
	  cigar_op->name = 'M';
	  cigar_code->distance += distance;
	  continue;
	}

	// too many mismatches: Smith-Waterman would be needed, which this
	// function does not implement
	LOG_FATAL_F("here we must run SW: distance = %i: first = %i, last = %i, gaps (read, genome) = (%i, %i)\n", 
		    distance, first, last, (int) gap_read_len, (int) gap_genome_len);
      }
    }

    // the lazily built reverse complement belongs to this read only
    free(revcomp_seq);
    revcomp_seq = NULL;
  }
  LOG_DEBUG_F("sw_count = %i\n", sw_count);


  // debugging: dump the resulting cigar of every CAL
  for (size_t i = 0; i < num_targets; i++) {
    read_index = mapping_batch->targets[i];
    fq_read = (fastq_read_t *) array_list_get(read_index, fq_batch);

    LOG_DEBUG_F("Read %s\n", fq_read->id);

    cal_list = mapping_batch->mapping_lists[read_index];
    num_cals = array_list_size(cal_list);

    if (num_cals <= 0) continue;

    for (size_t j = 0; j < num_cals; j++) {

      // get cal and read index
      cal = array_list_get(j, cal_list);
      if (cal->sr_list->size == 0) continue;

      s = (seed_region_t *) linked_list_get_first(cal->sr_list);
      cigar_code = (cigar_code_t *) s->info;
      LOG_DEBUG_F("\tCAL #%i of %i (strand %i), sr_list size = %i, cigar = %s (distance = %i)\n", 
		  (int) j, (int) num_cals, cal->strand, cal->sr_list->size, new_cigar_code_string(cigar_code), cigar_code->distance);
    }
  }
}
Пример #4
0
/**
 * Runs the framework with two OpenMP sections: a reader and a writer.
 *
 * Reader section: repeatedly allocates a region, fills it with
 * bfwork_obtain_region and, when the call reports WANDER_REGION_CHANGED or
 * WANDER_READ_EOF, inserts it into the framework list and spawns a task that
 * runs every processing function of the context on it. The region's
 * write_lock is taken before the insertion and released by the task, so the
 * writer cannot emit a region before it has been processed.
 *
 * Writer section: takes the first region of the list, waits on its
 * write_lock, writes it to the output file, removes it from the list and
 * frees it. It keeps going while the reader is still active or the list is
 * not empty.
 *
 * @param fwork  framework to run; its regions list, locks, context and
 *               output file are used
 * @return NO_ERROR
 */
static  int
bfwork_run_threaded(bam_fwork_t *fwork)
{
	int err;
	bam_region_t *region;
	linked_list_t *regions;
	double times;

	// end_condition stays at 1 while the reader section is still running
	omp_lock_t end_condition_lock;
	int end_condition;

	// running total of processed reads, updated by the processing tasks
	omp_lock_t reads_lock;
	size_t reads;
	size_t reads_to_write;

	//Init lock
	omp_init_lock(&end_condition_lock);
	omp_init_lock(&reads_lock);
	{
		{
			printf("Running in multithreading mode with %d threads\n", omp_get_max_threads());
			end_condition = 1;
			reads = 0;
		}

		#pragma omp parallel sections private(err, region, regions, times, reads_to_write)
		{
			//Region read
			#pragma omp section
			{
				regions = fwork->regions_list;
				while(1)
				{
					//Create new current region
					region = (bam_region_t *)malloc(sizeof(bam_region_t));
					breg_init(region);

					//Fill region
#ifdef D_TIME_DEBUG
					times = omp_get_wtime();
#endif
					err = bfwork_obtain_region(fwork, region);
#ifdef D_TIME_DEBUG
					times = omp_get_wtime() - times;
					omp_set_lock(&region->lock);
					if(fwork->context->time_stats)
					if(region->size != 0)
						time_add_time_slot(D_FWORK_READ, fwork->context->time_stats, times / (double)region->size);
					omp_unset_lock(&region->lock);
#endif
					if(err)
					{
						if(err == WANDER_REGION_CHANGED || err == WANDER_READ_EOF)
						{
							//Until processed, this region can't be written
							omp_test_lock(&region->write_lock);

							//Add region to framework regions
							bfwork_region_insert(fwork, region);

							#pragma omp task untied firstprivate(region) private(err)
							{
								size_t i;
								size_t pf_l;
								double aux_time;

								//Process region
								omp_set_lock(&region->lock);
#ifdef D_TIME_DEBUG
								times = omp_get_wtime();
#endif
								//Run every processing function of the context
								pf_l = fwork->context->processing_f_l;
								for(i = 0; i < pf_l; i++)
								{
									fwork->context->processing_f[i](fwork, region);
								}
#ifdef D_TIME_DEBUG
								times = omp_get_wtime() - times;
								if(fwork->context->time_stats)
								if(region->size != 0)
									time_add_time_slot(D_FWORK_PROC_FUNC, fwork->context->time_stats, times / (double)region->size);
								aux_time = omp_get_wtime();
#endif
								omp_unset_lock(&region->lock);

								omp_set_lock(&reads_lock);
								reads += region->size;
								//reads is a size_t: cast to match the %lu conversion
								printf("Reads processed: %lu\r", (unsigned long)reads);
								omp_unset_lock(&reads_lock);

#ifdef D_TIME_DEBUG
								aux_time = omp_get_wtime() - aux_time;
								omp_set_lock(&region->lock);
								if(fwork->context->time_stats)
								if(region->size != 0)
									time_add_time_slot(D_FWORK_PROC, fwork->context->time_stats, (times + aux_time) / (double)region->size);
								omp_unset_lock(&region->lock);
#endif

								//Set this region as writable
								omp_unset_lock(&region->write_lock);
							}

							//End readings
							if(err == WANDER_READ_EOF)
								 break;
						}
						else
						{
							if(err == WANDER_READ_TRUNCATED)
							{
								LOG_WARN("Readed truncated read\n");
							}
							else
							{
								LOG_FATAL_F("Failed to read next region, error code: %d\n", err);
							}

							//This region was never inserted in the list, so the
							//write section will not free it: release it here
							breg_destroy(region, 1);
							free(region);
							break;
						}
					}
					else
					{
						//No more regions, end loop
						LOG_INFO("No more regions to read");

						//Same as above: the region is not in the list
						breg_destroy(region, 1);
						free(region);
						break;
					}
				}

				//Tell the write section that no more regions will arrive
				omp_set_lock(&end_condition_lock);
				end_condition = 0;
				omp_unset_lock(&end_condition_lock);
			}//End read section

			//Write section
			#pragma omp section
			{
				regions = fwork->regions_list;
				//end_condition_lock is held whenever the loop condition is
				//evaluated and released inside the loop body
				omp_set_lock(&end_condition_lock);
				while(end_condition || linked_list_size(regions) > 0)
				{
					omp_unset_lock(&end_condition_lock);
#ifdef D_TIME_DEBUG
					times = omp_get_wtime();
#endif

					//Get next region
					omp_set_lock(&fwork->regions_lock);
					region = linked_list_get_first(regions);
					omp_unset_lock(&fwork->regions_lock);
					if(region == NULL)
					{
						//Nothing to write yet: re-take the lock and poll again
						omp_set_lock(&end_condition_lock);
						continue;
					}

					//Wait region to be writable
					omp_set_lock(&region->write_lock);

					//Write region
					omp_set_lock(&fwork->output_file_lock);
					reads_to_write = region->size;
					breg_write_n(region, reads_to_write, fwork->output_file);
					omp_unset_lock(&fwork->output_file_lock);

					//Remove from list
					omp_set_lock(&fwork->regions_lock);
					if(linked_list_size(regions) == 1)	//Possible bug?
						linked_list_clear(regions, NULL);
					else
						linked_list_remove_first(regions);

					//Signal read section if regions list is full
					if(linked_list_size(regions) < (FWORK_REGIONS_MAX / 2) )
						omp_unset_lock(&fwork->free_slots);

					omp_unset_lock(&fwork->regions_lock);

#ifdef D_TIME_DEBUG
					times = omp_get_wtime() - times;
					omp_set_lock(&region->lock);
					if(fwork->context->time_stats)
					if(reads_to_write != 0)
						time_add_time_slot(D_FWORK_WRITE, fwork->context->time_stats, times / (double)reads_to_write);
					omp_unset_lock(&region->lock);
#endif

					//Free region
					breg_destroy(region, 1);
					free(region);

					omp_set_lock(&end_condition_lock);
				}
				omp_unset_lock(&end_condition_lock);
			}//End write section

		}//End sections

	}//End parallel

	//Lineskip
	printf("\n");

	//Free: every lock initialised above is destroyed
	omp_destroy_lock(&end_condition_lock);
	omp_destroy_lock(&reads_lock);

	return NO_ERROR;
}