Example #1
0
File: tests.c Project: xujl12/BLAST
void nw_test_no_mismatches()
{
  nw_aligner_t *nw = needleman_wunsch_new();
  alignment_t *result = alignment_create(256);

  int match = 1;
  int mismatch = -2;
  int gap_open = -4;
  int gap_extend = -1;

  bool no_start_gap_penalty = false, no_end_gap_penalty = false;
  bool no_gaps_in_a = false, no_gaps_in_b = false;
  bool no_mismatches = true, case_sensitive = true;

  scoring_t scoring;
  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
               no_start_gap_penalty, no_end_gap_penalty,
               no_gaps_in_a, no_gaps_in_b,
               no_mismatches, case_sensitive);

  needleman_wunsch_align("atc", "ac", &scoring, nw, result);
  ASSERT(strcmp(result->result_a, "atc") == 0 &&
         strcmp(result->result_b, "a-c") == 0);

  needleman_wunsch_align("cgatcga", "catcctcga", &scoring, nw, result);
  ASSERT(strcmp(result->result_a, "cgatc---ga") == 0 &&
         strcmp(result->result_b, "c-atcctcga") == 0);

  alignment_free(result);
  needleman_wunsch_free(nw);
}
void write_unmapped_read(fastq_read_t *fq_read, bam_file_t *bam_file) {
  static char aux[1024] = "";
  alignment_t *alig;
  size_t header_len;
  char *id;
  bam1_t *bam1;

  // calculating cigar
  //sprintf(aux, "%luX", fq_read->length);	       
  alig = alignment_new();	       
  //header_len = strlen(fq_read->id);
  //id = (char *) malloc(sizeof(char) * (header_len + 1));
  //get_to_first_blank(fq_read->id, header_len, id);
  //free(fq_read->id);
  
  alignment_init_single_end(strdup(fq_read->id), fq_read->sequence, fq_read->quality,
			    0, -1, -1, /*strdup(aux)*/"", 0, 0, 0, 0, 0, NULL, alig);
  
  bam1 = convert_to_bam(alig, 33);
  bam_fwrite(bam1, bam_file);
  bam_destroy1(bam1);
	       
  alig->sequence = NULL;
  alig->quality = NULL;
  alig->cigar = NULL;
  alignment_free(alig);	       

  //printf("\tWRITE : read %i (%d items): unmapped...done !!\n", i, num_items);
  
}
void write_mapped_read(array_list_t *array_list, bam_file_t *bam_file) {
  size_t num_items = array_list_size(array_list);
  alignment_t *alig;
  bam1_t *bam1;
  for (size_t j = 0; j < num_items; j++) {
    alig = (alignment_t *) array_list_get(j, array_list);

    //printf("\t******** %i(%i)\n", j, num_items);
    //printf("is null alig->name %i\n", (alig->query_name == NULL));
    //printf("name = %s\n", alig->query_name);
    //printf("read = %s\n", alig->sequence);
    //printf("\t-----> %s\n", alig->cigar);
    LOG_DEBUG("writting bam..\n");
    //alignment_print(alig);
    //exit(-1);
    if (alig != NULL) {
      bam1 = convert_to_bam(alig, 33);
      bam_fwrite(bam1, bam_file);
      bam_destroy1(bam1);	 
      alignment_free(alig);
    } else {
      LOG_FATAL_F("alig is NULL, num_items = %lu\n", num_items);
    }
    //printf("\t**************** %i(%i)\n", j, num_items);
  }
  if (array_list) { array_list_free(array_list, NULL); }
}
Example #4
0
File: tests.c Project: xujl12/BLAST
/* No gap is expected in the longer sequence */
void nw_test_no_gaps_in_longer()
{
  nw_aligner_t *nw = needleman_wunsch_new();
  alignment_t *aln = alignment_create(256);

  const char* seq_a = "aaaaacg";
  const char* seq_b = "acgt";

  int match = 1;
  int mismatch = -2;
  int gap_open = -4;
  int gap_extend = -1;

  bool no_start_gap_penalty = false, no_end_gap_penalty = false;
  bool no_gaps_in_a = true, no_gaps_in_b = false;
  bool no_mismatches = false, case_sensitive = true;

  scoring_t scoring;
  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
               no_start_gap_penalty, no_end_gap_penalty,
               no_gaps_in_a, no_gaps_in_b,
               no_mismatches, case_sensitive);

  needleman_wunsch_align(seq_a, seq_b, &scoring, nw, aln);

  // ASSERT(strcmp(aln->result_a, "aaaaacg") == 0 &&
  //        strcmp(aln->result_b, "acgt---") == 0);

  ASSERT(strcmp(aln->result_a, "aaaaacg-") == 0 &&
         strcmp(aln->result_b, "a----cgt") == 0);

  alignment_free(aln);
  needleman_wunsch_free(nw);
}
Example #5
0
File: tests.c Project: xujl12/BLAST
/* First sequence is aligned to the corresponding (equal) substring of the second
 * sequence because both gaps at start and at end are free */
void nw_test_free_gaps_at_ends()
{
  nw_aligner_t *nw = needleman_wunsch_new();
  alignment_t *result = alignment_create(256);

  const char* seq_a = "acg";
  const char* seq_b = "tttacgttt";

  int match = 1;
  int mismatch = -1;
  int gap_open = -4;
  int gap_extend = -1;

  bool no_start_gap_penalty = true, no_end_gap_penalty = true;
  bool no_gaps_in_a = false, no_gaps_in_b = false;
  bool no_mismatches = false, case_sensitive = true;

  scoring_t scoring;
  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
               no_start_gap_penalty, no_end_gap_penalty,
               no_gaps_in_a, no_gaps_in_b,
               no_mismatches, case_sensitive);

  needleman_wunsch_align(seq_a, seq_b, &scoring, nw, result);
  ASSERT(strcmp(result->result_a, "---acg---") == 0 &&
         strcmp(result->result_b, "tttacgttt") == 0);

  alignment_free(result);
  needleman_wunsch_free(nw);
}
Example #6
0
void call_decomp_destroy(CallDecomp *dc)
{
  alignment_free(dc->aln);
  needleman_wunsch_free(dc->nw_aligner);
  ctx_free(dc->scoring);
  bcf_destroy(dc->v);
  strbuf_dealloc(&dc->sbuf);
  ctx_free(dc);
}
Example #7
0
void align(char* seq_a, char* seq_b)
{
  // Variables to store alignment result
  sw_aligner_t *sw = smith_waterman_new();
  alignment_t *result = alignment_create(256);

  // Decide on scoring
  int match = 1;
  int mismatch = -2;
  int gap_open = -4;
  int gap_extend = -1;
  
  // Don't penalise gaps at the start
  // ACGATTT
  // ----TTT would score +3 (when match=+1)
  char no_start_gap_penalty = 1;
  
  // ..or gaps at the end e.g.
  // ACGATTT
  // ACGA--- would score +4 (when match=+1)
  char no_end_gap_penalty = 1;

  char no_gaps_in_a = 0, no_gaps_in_b = 0;
  char no_mismatches = 0;

  // Compare character case-sensitively (usually set to 0 for DNA etc)
  char case_sensitive = 0;

  scoring_t scoring;
  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
               no_start_gap_penalty, no_end_gap_penalty,
               no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive);

  // Add some special cases
  // x -> y means x in seq1 changing to y in seq2
  scoring_add_mutation(&scoring, 'a', 'c', -2); // a -> c give substitution score -2
  scoring_add_mutation(&scoring, 'c', 'a', -1); // c -> a give substitution score -1

  // We could also prohibit the aligning of characters not given as special cases
  // scoring.use_match_mismatch = 0;

  smith_waterman_align(seq_a, seq_b, &scoring, sw);

  while(smith_waterman_fetch(sw, result))
  {
    printf("seqA: %s [start:%zu]\n", result->result_a, result->pos_a);
    printf("seqB: %s [start:%zu]\n", result->result_b, result->pos_b);
    printf("alignment score: %i\n\n", result->score);
  }

  // Free memory for storing alignment results
  smith_waterman_free(sw);
  alignment_free(result);
}
Example #8
0
File: tests.c Project: xujl12/BLAST
void nw_test_no_mismatches_rand()
{
  nw_aligner_t *nw = needleman_wunsch_new();
  alignment_t *aln = alignment_create(256);

  int match = 1;
  int mismatch = -2;
  int gap_open = -4;
  int gap_extend = -1;

  bool no_start_gap_penalty = false, no_end_gap_penalty = false;
  bool no_gaps_in_a = false, no_gaps_in_b = false;
  bool no_mismatches = true, case_sensitive = true;

  scoring_t scoring;
  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
               no_start_gap_penalty, no_end_gap_penalty,
               no_gaps_in_a, no_gaps_in_b,
               no_mismatches, case_sensitive);

  char seqa[100], seqb[100];
  size_t i;

  // Run 50 random alignments
  for(i = 0; i < 50; i++)
  {
    make_rand_seq(seqa, sizeof(seqa));
    make_rand_seq(seqb, sizeof(seqb));
    needleman_wunsch_align(seqa, seqb, &scoring, nw, aln);
    // Check no mismatches
    char *a = aln->result_a, *b = aln->result_b;
    while(1) {
      ASSERT(*a == '-' || *b == '-' || *a == *b);
        // printf("Seq: '%s' '%s'\n", aln->result_a, aln->result_b);
        // exit(EXIT_FAILURE);
      if(!*a && !*b) break;
      a++; b++;
    }
  }

  alignment_free(aln);
  needleman_wunsch_free(nw);
}
Example #9
0
int main(int argc, char **argv)
{
  if(argc != 2) print_usage(argv);

  char *seq = argv[1];
  size_t seqlen = strlen(seq);

  // Go
  int match = 1, mismatch = -1, gap_open = -4, gap_extend = -1;

  bool no_start_gap_penalty = false, no_end_gap_penalty = false;
  bool no_gaps_in_a = true, no_gaps_in_b = true;
  bool no_mismatches = true, case_sensitive = true;

  scoring_t scoring;
  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
               no_start_gap_penalty, no_end_gap_penalty,
               no_gaps_in_a, no_gaps_in_b,
               no_mismatches, case_sensitive);

  // Alignment results stored here
  sw_aligner_t *sw = smith_waterman_new();
  alignment_t *aln = alignment_create(seqlen+1);

  smith_waterman_align(seq, seq, &scoring, sw);

  // Loop over results
  while(smith_waterman_fetch(sw, aln))
  {
    if(aln->pos_a < aln->pos_b) {
      fputs(aln->result_a, stdout);
      printf(" [%zu,%zu]\n", aln->pos_a, aln->pos_b);
    }
  }

  smith_waterman_free(sw);
  alignment_free(aln);

  return EXIT_SUCCESS;
}
Example #10
0
int main(int argc, char* argv[])
{
  #ifdef SEQ_ALIGN_VERBOSE
  printf("VERBOSE: on\n");
  #endif

  sw_set_default_scoring();
  cmd = cmdline_new(argc, argv, &scoring, SEQ_ALIGN_SW_CMD);

  // Align!
  sw = smith_waterman_new();
  result = alignment_create(256);

  if(cmd->seq1 != NULL)
  {
    // Align seq1 and seq2
    align(cmd->seq1, cmd->seq2, NULL, NULL);
  }

  // Align from files
  size_t i, num_of_file_pairs = cmdline_get_num_of_file_pairs(cmd);
  for(i = 0; i < num_of_file_pairs; i++)
  {
    const char *file1 = cmdline_get_file1(cmd, i);
    const char *file2 = cmdline_get_file2(cmd, i);
    if(file1 != NULL && *file1 == '\0' && file2 == NULL) {
      wait_on_keystroke = 1;
      file1 = "-";
    }
    align_from_file(file1, file2, &align_pair_from_file, !cmd->interactive);
  }

  // Free memory for storing alignment results
  smith_waterman_free(sw);
  alignment_free(result);

  cmdline_free(cmd);

  return EXIT_SUCCESS;
}
Example #11
0
File: tests.c Project: xujl12/BLAST
void sw_test_no_gaps_smith_waterman()
{
  sw_aligner_t *sw = smith_waterman_new();
  alignment_t *result = alignment_create(256);

  const char* seq_a = "gacag";
  const char* seq_b = "tgaagt";

  int match = 1;
  int mismatch = -2;
  int gap_open = -4;
  int gap_extend = -1;

  bool no_start_gap_penalty = false, no_end_gap_penalty = false;
  bool no_gaps_in_a = true, no_gaps_in_b = true;
  bool no_mismatches = false, case_sensitive = true;

  scoring_t scoring;
  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
               no_start_gap_penalty, no_end_gap_penalty,
               no_gaps_in_a, no_gaps_in_b,
               no_mismatches, case_sensitive);

  smith_waterman_align(seq_a, seq_b, &scoring, sw);

  smith_waterman_fetch(sw, result);
  ASSERT(strcmp(result->result_a, "ga") == 0 &&
         strcmp(result->result_b, "ga") == 0);

  smith_waterman_fetch(sw, result);
  ASSERT(strcmp(result->result_a, "ag") == 0 &&
         strcmp(result->result_b, "ag") == 0);

  alignment_free(result);
  smith_waterman_free(sw);
}
Example #12
0
int sa_sam_writer(void *data) {
  sa_wf_batch_t *wf_batch = (sa_wf_batch_t *) data;
  
  sa_mapping_batch_t *mapping_batch = (sa_mapping_batch_t *) wf_batch->mapping_batch;
  if (mapping_batch == NULL) {
    printf("bam_writer1: error, NULL mapping batch\n");
    return 0;
  }
  /*
  for (int i = 0; i < NUM_COUNTERS; i++) {
    counters[i] += mapping_batch->counters[i];
  }
  */
  #ifdef _TIMING
  for (int i = 0; i < NUM_TIMING; i++) {
    func_times[i] += mapping_batch->func_times[i];
  }
  #endif

  int num_mismatches, num_cigar_ops;
  size_t flag, pnext = 0, tlen = 0;
  char *cigar_string, *cigar_M_string, *rnext = "*";

  fastq_read_t *read;
  array_list_t *read_list = mapping_batch->fq_reads;

  array_list_t *mapping_list, *mate_list;
  FILE *out_file = (FILE *) wf_batch->writer_input->bam_file;

  sa_genome3_t *genome = wf_batch->sa_index->genome;

  size_t num_reads, num_mappings, num_mate_mappings;
  num_reads = mapping_batch->num_reads;

  if (mapping_batch->options->pair_mode != SINGLE_END_MODE) {
    // PAIR MODE
    int len;
    char *sequence, *quality;

    char *seq, *opt_fields;
    alignment_t *alig;
  
    for (size_t i = 0; i < num_reads; i++) {
      read = (fastq_read_t *) array_list_get(i, read_list);
      //      seq = read->sequence;
      /*
      if (i % 2 == 0)  {
	mate_list = mapping_batch->mapping_lists[i+1];
	num_mate_mappings = array_list_size(mate_list);
      } else {
	mate_list = mapping_list;
	num_mate_mappings = num_mappings;
      }
      */
      mapping_list = mapping_batch->mapping_lists[i];
      num_mappings = array_list_size(mapping_list);
      num_total_mappings += num_mappings;

      #ifdef _VERBOSE
      if (num_mappings > 1) {
	num_dup_reads++;
	num_total_dup_reads += num_mappings;
      }
      #endif
      
      if (num_mappings > 0) {
	num_mapped_reads++;
	if (num_mappings > 1) {
	  num_multihit_reads++;
	}
	for (size_t j = 0; j < num_mappings; j++) {
	  alig = (alignment_t *) array_list_get(j, mapping_list);
	  /*
	  // update alignment
	  alig->secondary_alignment = 0;
	  if (num_mate_mappings != 1) {
	    alig->is_mate_mapped = 0;
	    alig->is_paired_end_mapped = 0;
	    alig->mate_strand = 0;
	  }
	  */
	  if (alig->optional_fields) {
	    opt_fields = (char *) calloc(strlen(alig->optional_fields) + 100, sizeof(char));
	    sprintf(opt_fields, "NH:i:%i\t%s", num_mappings, alig->optional_fields);
	    //	    sprintf(opt_fields, "NH:i:%i\t%s\tXU:i:%i", num_mappings, alig->optional_fields, mapping_batch->status[i]);
	  } else {
	    opt_fields = (char *) calloc(100, sizeof(char));
	    sprintf(opt_fields, "NH:i:%i", num_mappings);
	    //	    sprintf(opt_fields, "NH:i:%i\tXU:i:%i", num_mappings, mapping_batch->status[i]);
	  }
	  /*
	  // update alignment
	  alig->secondary_alignment = 0;
	  if (num_mate_mappings != 1) {
	    alig->is_mate_mapped = 0;
	    alig->is_paired_end_mapped = 0;
	    alig->mate_strand = 0;
	  }
	  */
	  flag = 0;
	  if (alig->is_paired_end)                              flag += BAM_FPAIRED;
	  if (alig->is_paired_end_mapped)                       flag += BAM_FPROPER_PAIR;
	  if (!alig->is_seq_mapped)                             flag += BAM_FUNMAP;   
	  if ((!alig->is_mate_mapped) && (alig->is_paired_end)) flag += BAM_FMUNMAP;
	  if (alig->mate_strand)                                flag += BAM_FMREVERSE;
	  if (alig->pair_num == 1)	                        flag += BAM_FREAD1;
	  if (alig->pair_num == 2)                              flag += BAM_FREAD2;
	  if (alig->secondary_alignment)                        flag += BAM_FSECONDARY;
	  if (alig->fails_quality_check)                        flag += BAM_FQCFAIL;
	  if (alig->pc_optical_duplicate)                       flag += BAM_FDUP;
	  if (alig->seq_strand)                                 flag += BAM_FREVERSE;

	  fprintf(out_file, "%s\t%lu\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s\t%s\n", 
		  read->id,
		  flag,
		  genome->chrom_names[alig->chromosome],
		  alig->position + 1,
		  (num_mappings > 1 ? 0 : alig->mapq), //60, //(alig->map_quality > 3 ? 0 : alig->map_quality),
		  alig->cigar,
		  (alig->chromosome == alig->mate_chromosome ? "=" : genome->chrom_names[alig->mate_chromosome]),
		  alig->mate_position + 1,
		  alig->template_length,
		  alig->sequence,
		  alig->quality,
		  opt_fields
		  );

	  // free memory
	  free(opt_fields);
	  alignment_free(alig);	 
	} // end for num_mappings
      } else {
	num_unmapped_reads++;

	opt_fields = (char *) calloc(100, sizeof(char));
	sprintf(opt_fields, "XM:i:%i XU:i:%i", num_mappings, mapping_batch->status[i]);

	if (read->adapter) {
	  len = read->length + abs(read->adapter_length);
	  sequence = (char *) malloc(len + 1);
	  quality = (char *) malloc(len + 1);

	  if (read->adapter_length < 0) {
	    strcpy(quality, read->adapter_quality);
	    strcat(quality, read->quality);
	  } else {
	    strcpy(quality, read->quality);
	    strcat(quality, read->adapter_quality);
	  }
	  
	  if ((read->adapter_strand == 0 && read->adapter_length < 0) || 
	      (read->adapter_strand == 1 && read->adapter_length > 0)) {
	    strcpy(sequence, read->adapter);
	    strcat(sequence, read->sequence);
	  } else {
	    strcpy(sequence, read->sequence);
	    strcat(sequence, read->adapter);
	  }

	  sequence[len] = 0; 
	  quality[len] = 0; 
	} else {
	  sequence = read->sequence;
	  quality = read->quality;
	}

	fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\t%s\n", 
		read->id,
		sequence,
		quality,
		opt_fields
		);

	free(opt_fields);

	if (read->adapter) {
	  free(sequence);
	  free(quality);
	}
      }
      array_list_free(mapping_list, (void *) NULL);
    }
  } else {
    // SINGLE MODE
    int len, mapq;
    char *seq;
    seed_cal_t *cal;

    cigar_t *cigar;
    char *sequence, *revcomp, *quality;

    for (size_t i = 0; i < num_reads; i++) {
      read = (fastq_read_t *) array_list_get(i, read_list);
      mapping_list = mapping_batch->mapping_lists[i];
      num_mappings = array_list_size(mapping_list);
      num_total_mappings += num_mappings;

      #ifdef _VERBOSE
      if (num_mappings > 1) {
	num_dup_reads++;
	num_total_dup_reads += num_mappings;
      }
      #endif
      
      if (num_mappings > 0) {
	num_mapped_reads++;
	if (num_mappings > 1) {
	  num_multihit_reads++;
	}

	for (size_t j = 0; j < num_mappings; j++) {
	  cal = (seed_cal_t *) array_list_get(j, mapping_list);
	  
	  if (read->adapter) {
	    // sequences and cigar
	    len = read->length + abs(read->adapter_length);
	    sequence = (char *) malloc(len + 1);
	    revcomp = (char *) malloc(len + 1);
	    quality = (char *) malloc(len + 1);
	    cigar = cigar_new_empty();

	    if (read->adapter_length < 0) {
	      strcpy(quality, read->adapter_quality);
	      strcat(quality, read->quality);
	    } else {
	      strcpy(quality, read->quality);
	      strcat(quality, read->adapter_quality);
	    }
	    
	    if ( (cal->strand == 1 && 
		  ((read->adapter_strand == 0 && read->adapter_length > 0) || 
		   (read->adapter_strand == 1 && read->adapter_length < 0)))
		 ||
		 (cal->strand == 0 && 
		  ((read->adapter_strand == 0 && read->adapter_length < 0) ||
		   (read->adapter_strand == 1 && read->adapter_length > 0))) ) {
	      strcpy(sequence, read->adapter);
	      strcat(sequence, read->sequence);
	      strcpy(revcomp, read->adapter_revcomp);
	      strcat(revcomp, read->revcomp);
	      
	      cigar_append_op(abs(read->adapter_length), 'S', cigar);
	      cigar_concat(&cal->cigar, cigar);
	    } else {
	      strcpy(sequence, read->sequence);
	      strcat(sequence, read->adapter);
	      strcpy(revcomp, read->revcomp);
	      strcat(revcomp, read->adapter_revcomp);
	      
	      cigar_concat(&cal->cigar, cigar);
	      cigar_append_op(read->adapter_length, 'S', cigar);
	    }
	    sequence[len] = 0; 
	    revcomp[len] = 0; 
	    quality[len] = 0; 
	  } else {
	    // sequences and cigar
	    sequence = read->sequence;
	    revcomp = read->revcomp;
	    quality = read->quality;
	    cigar = &cal->cigar;
	  }

	  if (cal->strand) {
	    flag = 16;
	    seq = revcomp;
	  } else {
	    flag = 0;
	    seq = sequence;
	  }

	  /*
	  if (i == 0) {
	    flag += BAM_FSECONDARY;
	  }
	  */

	  cigar_string = cigar_to_string(cigar);
	  cigar_M_string = cigar_to_M_string(&num_mismatches, &num_cigar_ops, cigar);
	  if (num_mappings > 1) {
	    cal->mapq = 0;
	  }
	  fprintf(out_file, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%lu\t%i\t%s\t%s\tNH:i:%i\tNM:i:%i\n", 
		  read->id,
		  flag,
		  genome->chrom_names[cal->chromosome_id],
		  cal->start + 1,
		  (num_mappings == 1 ? cal->mapq : 0),
		  cigar_M_string,
		  rnext,
		  pnext,
		  tlen,
		  seq,
		  quality,
		  num_mappings,
		  num_mismatches
		  );

	  // free memory
	  free(cigar_M_string);
	  free(cigar_string);
	  seed_cal_free(cal);	 
	  if (read->adapter) {
	    free(sequence);
	    free(revcomp);
	    free(quality);
	    cigar_free(cigar);
	  }
	}
      } else {
	num_unmapped_reads++;

	if (read->adapter) {
	  // sequences and cigar
	  len = read->length + abs(read->adapter_length);
	  sequence = (char *) malloc(len + 1);
	  quality = (char *) malloc(len + 1);

	  if (read->adapter_length < 0) {
	    strcpy(quality, read->adapter_quality);
	    strcat(quality, read->quality);
	  } else {
	    strcpy(quality, read->quality);
	    strcat(quality, read->adapter_quality);
	  }
	  
	  if ((read->adapter_strand == 0 && read->adapter_length < 0) || 
	      (read->adapter_strand == 1 && read->adapter_length > 0)) {
	    strcpy(sequence, read->adapter);
	    strcat(sequence, read->sequence);
	  } else {
	    strcpy(sequence, read->sequence);
	    strcat(sequence, read->adapter);
	  }

	  sequence[len] = 0; 
	  quality[len] = 0; 
	} else {
	  // sequences
	  sequence = read->sequence;
	  quality = read->quality;
	}
	
	fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", 
		read->id,
		sequence,
		quality
		);

	if (read->adapter) {
	  free(sequence);
	  free(quality);
	}
      }
      
      array_list_free(mapping_list, (void *) NULL);
    } // end for num_reads
  }

  // free memory
  sa_mapping_batch_free(mapping_batch);

  if (wf_batch) sa_wf_batch_free(wf_batch);

  return 0;
}
int sam_writer(void *data) {
  batch_t *batch = (batch_t *) data;
  batch_writer_input_t *writer_input = batch->writer_input;
  //bam_file_t *bam_file = writer_input->bam_file;    
  FILE *out_file = (FILE *) writer_input->bam_file;
  genome_t *genome = writer_input->genome;
  fastq_read_t *read;
  mapping_batch_t *mapping_batch = (mapping_batch_t *) batch->mapping_batch;
  size_t num_reads = array_list_size(mapping_batch->fq_batch);
  //linked_list_t *linked_list = writer_input->list_p;
  array_list_t *read_list = mapping_batch->fq_batch;
  array_list_t *mapping_list;
  size_t num_mappings;
  size_t num_mapped_reads = 0;
  size_t total_mappings = 0;  
  alignment_t *alig;
  int flag, pnext = 0, tlen = 0;
  char rnext[4] = "*\0";

  extern st_bwt_t st_bwt;
  st_bwt.total_reads += num_reads;

  for (size_t i = 0; i < num_reads; i++) {
    read = (fastq_read_t *) array_list_get(i, read_list);
    mapping_list = mapping_batch->mapping_lists[i];
    num_mappings = array_list_size(mapping_list);
    total_mappings += num_mappings;
    //printf("%i.Read %s (num_mappings %i)\n", i, read->id, num_mappings);    
    if (num_mappings > 0) {
      if (num_mappings == 1) {
	st_bwt.single_alig++;
      } else {
	st_bwt.multi_alig++;
      }

      num_mapped_reads++;
      for (size_t j = 0; j < num_mappings; j++) {
	alig = (alignment_t *) array_list_get(j, mapping_list);
	flag = (alig->seq_strand ? 16 : 0);
	fprintf(out_file, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s\n", 
		alig->query_name, 
		flag,
		genome->chr_name[alig->chromosome],
		alig->position + 1,
		alig->map_quality,
		alig->cigar,
		rnext,
		pnext,
		tlen,
		alig->sequence,
		alig->quality);
	//alig->optional_fields);
	
	alignment_free(alig);	 
      }
      array_list_free(mapping_list, NULL);
    } else {
      total_mappings++;
      if (mapping_list) {
	array_list_free(mapping_list, NULL);
      }	 
      
      fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", 
	      read->id,
	      read->sequence,
	      read->quality);

    }
  }

  if (mapping_batch) {
    mapping_batch_free(mapping_batch);
  }
  
  if (batch) batch_free(batch);

  basic_statistics_add(num_reads, num_mapped_reads, total_mappings, 0, basic_st);
}
Example #14
0
static PyObject * nw_align_wrapper(PyObject *self, PyObject *args, PyObject *kw)
{
    const char *seq1, *seq2;
    // Decide on scoring
    int match = 1;
    int mismatch = -2;
    int gap_open = -4;
    int gap_extend = -1;
    
    // Don't penalise gaps at the start
    // ACGATTT
    // ----TTT would score +3 (when match=+1)
    int no_start_gap_penalty = 0;
    
    // ..or gaps at the end e.g.
    // ACGATTT
    // ACGA--- would score +4 (when match=+1)
    int no_end_gap_penalty = 0;

    int no_gaps_in_a = 0, no_gaps_in_b = 0;
    int no_mismatches = 0;

    // Compare character case-sensitively (usually set to 0 for DNA etc)
    int case_sensitive = 0;

    PyObject * matrix = NULL;

    static char *kwlist[] = {"seq1","seq2", "matrix", "match", "mismatch", "gap_open","gap_extend", "no_start_gap_penalty", "no_end_gap_penalty", "no_gaps_in_a", "no_gaps_in_b", "no_mismatches", "case_sensitive", NULL};
    PyObject *res = NULL;

    if(!PyArg_ParseTupleAndKeywords(args, kw, "ss|Oiiiiiiiiii", kwlist, &seq1, &seq2, &matrix, &match, &mismatch, &gap_open, &gap_extend,
                                                                 &no_start_gap_penalty, &no_end_gap_penalty, &no_gaps_in_a, &no_gaps_in_b, &no_mismatches, &case_sensitive))
        return NULL;
    alignment_t *result = alignment_create(256);
    
    // Variables to store alignment result
    nw_aligner_t *nw = needleman_wunsch_new();

    scoring_t scoring;
    scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
                no_start_gap_penalty, no_end_gap_penalty,
                no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive);

    // Add some special cases
    // x -> y means x in seq1 changing to y in seq2
    if(matrix != NULL)
    {
        PyObject * mapping = PyMapping_Items(matrix);
        if(mapping == NULL)
            goto error;
        int n = PySequence_Size(mapping);
        PyObject *item;
        int value;
        PyObject *key;
        char * char_a;
        char * char_b;
        int i;
        for(i = 0; i < n; i++)
        {
            item = PySequence_GetItem(mapping, i);
            if(item == NULL || !PyTuple_Check(item))
            {
                Py_XDECREF(item);
                Py_DECREF(mapping);
                goto error; 
            }
            
            if(!PyArg_ParseTuple(item, "Oi", &key, &value))
            {
                PyErr_SetString(PyExc_RuntimeError, "Values of matrix dict should be integers");
                Py_XDECREF(item);
                Py_DECREF(mapping);
                goto error;
            }
            if(!PyTuple_Check(key))
            {
                PyErr_SetString(PyExc_RuntimeError, "Keys of matrix dict should be tuples");
                Py_XDECREF(item);
                Py_DECREF(mapping);
                goto error;
            }
            if(!PyArg_ParseTuple(key, "ss", &char_a, &char_b))
            {
                PyErr_SetString(PyExc_RuntimeError, "Keys of matrix dict should be tuples with 2 characters as elements.");
                Py_XDECREF(item);
                Py_DECREF(mapping);
                goto error;
            }
            if(strlen(char_a) != 1 || strlen(char_b) != 1)
            {
                PyErr_SetString(PyExc_RuntimeError, "Character length should be 1");
                Py_XDECREF(item);
                Py_DECREF(mapping);
                goto error;
            }
            scoring_add_mutation(&scoring, case_sensitive ? *char_a : tolower(*char_a), case_sensitive ? *char_a : tolower(*char_b), value); // a -> c give substitution score -2
            Py_DECREF(item);
        }
    }

    // We could also prohibit the aligning of characters not given as special cases
    // scoring.use_match_mismatch = 0;

    needleman_wunsch_align(seq1, seq2, &scoring, nw, result);

    res = Py_BuildValue("ssi", result->result_a, result->result_b, result->score);

error:
    // Free memory for storing alignment results
    needleman_wunsch_free(nw);

    alignment_free(result);
    return res;
}
Example #15
0
void batch_writer(batch_writer_input_t* input_p) {

    struct timespec ts;
    ts.tv_sec = 1;
    ts.tv_nsec = 0;

    alignment_t **buffer_p;
    bam1_t* bam1_p;
    bam_header_t* bam_header_p;
    bam_file_t* bam_file_p;

    char* match_filename = input_p->match_filename;
    //char* mismatch_filename = input_p->mismatch_filename;

    char* splice_exact_filename = input_p->splice_exact_filename;
    char* splice_extend_filename = input_p->splice_extend_filename;

    list_t* list_p = input_p->list_p;

    printf("batch_writer (%i): START\n", omp_get_thread_num());

    list_item_t *item_p = NULL;
    write_batch_t* batch_p;

    FILE* fd;
    FILE* splice_exact_fd  = fopen(splice_exact_filename, "w");
    FILE* splice_extend_fd = fopen(splice_extend_filename, "w");

    //printf("HEADER FROM WRITE: %s\n", input_p->header_filename);
    bam_header_p = bam_header_new(HUMAN, NCBI37, input_p->header_filename);
    //bam_file_p = bam_fopen(match_filename);
    bam_file_p = bam_fopen_mode(match_filename, bam_header_p, "w");
    bam_fwrite_header(bam_header_p, bam_file_p);

    // main loop
    while ( (item_p = list_remove_item(list_p)) != NULL ) {

        if (time_on) {
            timing_start(BATCH_WRITER, 0, timing_p);
        }

        batch_p = (write_batch_t*) item_p->data_p;
        //printf("*********************************Extract one item*********************************\n");
        if (batch_p->flag == MATCH_FLAG || batch_p->flag == MISMATCH_FLAG) { //fd = match_fd;
            //printf("start write alignment. Total %d\n", batch_p->size);
            buffer_p = (alignment_t **)batch_p->buffer_p;
            for(int i = 0; i < batch_p->size; i++) {
                //alignment_print(buffer_p[i]);
                bam1_p = convert_to_bam(buffer_p[i], 33);
                bam_fwrite(bam1_p, bam_file_p);
                bam_destroy1(bam1_p);
                alignment_free(buffer_p[i]);
            }
        } else {
            if (batch_p->flag == SPLICE_EXACT_FLAG) {
                fd = splice_exact_fd;
            }
            else if (batch_p->flag == SPLICE_EXTEND_FLAG) {
                fd = splice_extend_fd;
            }
            else {
                fd = NULL;
            }

            if (fd != NULL) {
                //printf("start write batch, %i bytes...\n", batch_p->size);
                fwrite((char *)batch_p->buffer_p, batch_p->size, 1, fd);
                //printf("write done !!\n");
                //if (time_on) { stop_timer(t1_write, t2_write, write_time); }
            }
        }
        //printf("Free batch\n");
        write_batch_free(batch_p);
        list_item_free(item_p);

        if (time_on) {
            timing_stop(BATCH_WRITER, 0, timing_p);
        }
    } // end of batch loop

    //fclose(match_fd);
    //fclose(mismatch_fd);
    fclose(splice_exact_fd);
    fclose(splice_extend_fd);

    bam_fclose(bam_file_p);
    //bam_header_free(bam_header_p);
    printf("batch_writer: END\n");
}
Example #16
0
// Clean up pairwise aligner
static void nw_aligner_destroy()
{
  alignment_free(aln);
  needleman_wunsch_free(nw_aligner);
}
Example #17
0
int batch_decoder_run(batch_decoder_t *bd)
{
    int32 ctloffset, ctlcount, ctlincr;
    lineiter_t *li, *ali = NULL;

    search_run(bd->fwdtree);
    search_run(bd->fwdflat);

    ctloffset = cmd_ln_int32_r(bd->config, "-ctloffset");
    ctlcount = cmd_ln_int32_r(bd->config, "-ctlcount");
    ctlincr = cmd_ln_int32_r(bd->config, "-ctlincr");

    if (bd->alignfh)
        ali = lineiter_start(bd->alignfh);
    for (li = lineiter_start(bd->ctlfh); li; li = lineiter_next(li)) {
        alignment_t *al = NULL;
        char *wptr[4];
        int32 nf, sf, ef;

        if (li->lineno < ctloffset) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if ((li->lineno - ctloffset) % ctlincr != 0) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if (ctlcount != -1 && li->lineno >= ctloffset + ctlcount)
            break;
        if (ali)
            al = parse_alignment(ali->buf, search_factory_d2p(bd->sf));
        sf = 0;
        ef = -1;
        nf = str2words(li->buf, wptr, 4);
        if (nf == 0) {
            /* Do nothing. */
        }
        else if (nf < 0) {
            E_ERROR("Unexpected extra data in control file at line %d\n", li->lineno);
        }
        else
        {
            char *file, *uttid;
            file = wptr[0];
            uttid = NULL;
            if (nf > 1)
            sf = atoi(wptr[1]);
            if (nf > 2)
            ef = atoi(wptr[2]);
            if (nf > 3)
            uttid = wptr[3];
            /* Do actual decoding. */
            batch_decoder_decode(bd, file, uttid, sf, ef, al);
        }
        alignment_free(al);
        if (ali) ali = lineiter_next(ali);
    }
    featbuf_producer_shutdown(search_factory_featbuf(bd->sf));
    return 0;
}
Example #18
0
int sa_bam_writer(void *data) {
  sa_wf_batch_t *wf_batch = (sa_wf_batch_t *) data;
  
  sa_mapping_batch_t *mapping_batch = (sa_mapping_batch_t *) wf_batch->mapping_batch;
  if (mapping_batch == NULL) {
    printf("bam_writer1: error, NULL mapping batch\n");
    return 0;
  }

  //  for (int i = 0; i < NUM_COUNTERS; i++) {
  //    counters[i] += mapping_batch->counters[i];
  //  }

  #ifdef _TIMING
  for (int i = 0; i < NUM_TIMING; i++) {
    func_times[i] += mapping_batch->func_times[i];
  }
  #endif

  int flag, len;
  char *sequence, *quality;

  fastq_read_t *read;
  array_list_t *read_list = mapping_batch->fq_reads;

  bam1_t *bam1;
  alignment_t *alig;
  array_list_t *mapping_list;
  bam_file_t *out_file = wf_batch->writer_input->bam_file;

  sa_genome3_t *genome = wf_batch->sa_index->genome;

  size_t num_reads, num_mappings, num_mate_mappings;
  num_reads = mapping_batch->num_reads;
  for (size_t i = 0; i < num_reads; i++) {
    read = (fastq_read_t *) array_list_get(i, read_list);
    mapping_list = mapping_batch->mapping_lists[i];
    num_mappings = array_list_size(mapping_list);
    num_total_mappings += num_mappings;

    #ifdef _VERBOSE
    if (num_mappings > 1) {
      num_dup_reads++;
      num_total_dup_reads += num_mappings;
    }
    #endif

    if (num_mappings > 0) {
      num_mapped_reads++;
      if (num_mappings > 1) {
	num_multihit_reads++;
      }
      for (size_t j = 0; j < num_mappings; j++) {
	alig = (alignment_t *) array_list_get(j, mapping_list);

	// update alignment
	if (num_mappings > 1) {
	  alig->map_quality = 0;
	} else {
	  alig->map_quality = alig->mapq;
	}

	bam1 = convert_to_bam(alig, 33);
	bam_fwrite(bam1, out_file);
	bam_destroy1(bam1);
	alignment_free(alig);
      }
    } else {
      num_unmapped_reads++;

      if (read->adapter) {
	// sequences and cigar
	len = read->length + abs(read->adapter_length);
	sequence = (char *) malloc(len + 1);
	quality = (char *) malloc(len + 1);

	if (read->adapter_length < 0) {
	  strcpy(quality, read->adapter_quality);
	  strcat(quality, read->quality);
	} else {
	  strcpy(quality, read->quality);
	  strcat(quality, read->adapter_quality);
	}
	
	if ((read->adapter_strand == 0 && read->adapter_length < 0) || 
	    (read->adapter_strand == 1 && read->adapter_length > 0)) {
	  strcpy(sequence, read->adapter);
	  strcat(sequence, read->sequence);
	} else {
	  strcpy(sequence, read->sequence);
	  strcat(sequence, read->adapter);
	}
	sequence[len] = 0; 
	quality[len] = 0; 
      } else {
	// sequences
	sequence = read->sequence;
	quality = read->quality;
      }
      
      alig = alignment_new();       
      alignment_init_single_end(strdup(read->id), sequence, quality,
				0, -1, -1, /*strdup(aux)*/"", 0, 0, 0, 0, 0, NULL, alig);
      
      bam1 = convert_to_bam(alig, 33);
      bam_fwrite(bam1, out_file);
        
      // free memory
      bam_destroy1(bam1);
      alig->sequence = NULL;
      alig->quality = NULL;
      alig->cigar = NULL;
      alignment_free(alig);
      if (read->adapter) {
	free(sequence);
	free(quality);
      }
    }
    array_list_free(mapping_list, (void *) NULL);
  }

  // free memory
  sa_mapping_batch_free(mapping_batch);

  if (wf_batch) sa_wf_batch_free(wf_batch);

  return 0;
}
Example #19
0
void batch_writer2(batch_writer_input_t* input) {

    printf("START: batch_writer (%i): START, for file %s\n",
           omp_get_thread_num(), input->match_filename);

    bam1_t *bam1;
    bam_header_t *bam_header;
    bam_file_t *bam_file;
    alignment_t *alig;

    char* match_filename = input->match_filename;
    //  char* splice_filename = input->splice_filename;

    list_t *write_list = input->list_p;
    array_list_t *array_list;

    list_item_t *item = NULL;
    aligner_batch_t *batch = NULL;
    fastq_batch_t *fq_batch = NULL;

    FILE* fd;

    static char aux[10];

    size_t read_len;

    bam_header = bam_header_new(HUMAN, NCBI37, input->header_filename);
    bam_file = bam_fopen_mode(match_filename, bam_header, "w");

    bam_fwrite_header(bam_header, bam_file);

    size_t num_reads = 0, num_items = 0, total_mappings = 0;

    // main loop
    while ( (item = list_remove_item(write_list)) != NULL ) {

        //    if (array_list == NULL) printf("batch_writer.c...\n");

        batch = (aligner_batch_t *) item->data_p;
        fq_batch = batch->fq_batch;
        num_reads = batch->num_mapping_lists;

        for (size_t i = 0; i < num_reads; i++) {

            array_list = batch->mapping_lists[i];
            //      if (array_list == NULL) printf("READ %d, writer, list is NULL\n", i);

            //      printf("----> list == NULL ? %d\n", (array_list == NULL));
            num_items = (array_list == NULL ? 0 : array_list_size(array_list));
            //      printf("----> number of items = %d, num_items <= 0 ? %d\n", num_items, num_items <= 0);

            read_len = fq_batch->data_indices[i + 1] - fq_batch->data_indices[i] - 1;

            // mapped or not mapped ?
            if (num_items == 0) {

                //printf("\tWRITE : read %i (%d items): unmapped...\n", i, num_items);

                // calculating cigar
                sprintf(aux, "%luX", read_len);

                alig = alignment_new();
                alignment_init_single_end(&(fq_batch->header[fq_batch->header_indices[i]])+1,
                                          &(fq_batch->seq[fq_batch->data_indices[i]]),
                                          &(fq_batch->quality[fq_batch->data_indices[i]]),
                                          0,
                                          0,
                                          0,
                                          aux, 1, 255, 0, 0, alig);

                bam1 = convert_to_bam(alig, 33);
                bam_fwrite(bam1, bam_file);
                bam_destroy1(bam1);

                // some cosmetic stuff before freeing the alignment,
                // (in order to not free twice some fields)
                alig->query_name = NULL;
                alig->sequence = NULL;
                alig->quality = NULL;
                alig->cigar = NULL;
                alignment_free(alig);

                //	printf("\tWRITE : read %i (%d items): unmapped...done !!\n", i, num_items);

            } else {
                //	printf("\tWRITE : read %d (%d items): mapped...\n", i, num_items);
                for (size_t j = 0; j < num_items; j++) {
                    alig = (alignment_t *) array_list_get(j, array_list);
                    if (alig != NULL) {

                        bam1 = convert_to_bam(alig, 33);
                        bam_fwrite(bam1, bam_file);
                        bam_destroy1(bam1);

                        alignment_free(alig);
                    }
                }
                //	printf("\tWRITE : read %d (%d items): mapped...done !!\n", i, num_items);
            }
            if (array_list != NULL) array_list_free(array_list, NULL);
        }

        if (batch != NULL) aligner_batch_free(batch);
        if (item != NULL) list_item_free(item);

        if (time_on) {
            timing_stop(BATCH_WRITER, 0, timing_p);
        }
    } // end of batch loop
    bam_fclose(bam_file);
    printf("END: batch_writer (total mappings %lu)\n", total_mappings);
}