예제 #1
0
/**
 * Reads every record from reads_f, tries to map each usable read with
 * the mapper, and writes the outcome to the matching output file.
 *
 * Records are parsed as FASTQ or QSEQ depending on reads_format. A record
 * is skipped (and counted as an error) when the parser reports FASTQ_ERR
 * or when its length differs from mapper->seed_finder_fwd->read_len.
 * At most FASTQ_MAX_WARN warnings are printed in total for skipped
 * records; further skipped records are only counted.
 *
 * Output routing, by map_read.map_code and output_type:
 *   - MAP_CODE_NONE:   output_files[0] when output_type is
 *                      OUTPUT_TYPE_SINGLE, otherwise unmapped_out_file.
 *   - MAP_CODE_MULTI:  output_files when OUTPUT_TYPE_SINGLE, otherwise
 *                      multi_out_file.
 *   - MAP_CODE_UNIQUE: output_files in both cases; the last argument to
 *                      write_read is FALSE for OUTPUT_TYPE_SINGLE and TRUE
 *                      otherwise (NOTE(review): presumably selects
 *                      per-chromosome output -- confirm in write_read).
 *
 * A progress dot is printed to stderr every 1,000,000 accepted records and
 * a summary of all counters is printed to stderr once the input ends.
 *
 * NOTE(review): my_err is assumed to be fatal; if it returns, the loop
 * simply carries on with the next iteration.
 */
void map_reads(gzFile *output_files, gzFile multi_out_file,
               gzFile unmapped_out_file, gzFile reads_f, Mapper *mapper,
               int reads_format, int output_type) {
  FastqRead fastq_read;
  MapRead map_read;
  long warn_count, n_fastq_rec, n_fastq_err;
  long n_map_uniq, n_map_multi, n_map_none;

  /* scratch buffers for the forward and reverse nucleotides of one read */
  map_read.fwd_nucs = my_new(unsigned char, FASTQ_MAX_LINE);
  map_read.rev_nucs = my_new(unsigned char, FASTQ_MAX_LINE);

  n_map_uniq = n_map_multi = n_map_none = 0;
  warn_count = n_fastq_err = n_fastq_rec = 0;

  /* loop over all records in the reads file */
  while(TRUE) {
    long r = 0;

    /* read the next record using the parser for the requested format */
    if(reads_format == READS_FORMAT_FASTQ) {
      r = fastq_parse_read(&fastq_read, reads_f);
    }
    else if(reads_format == READS_FORMAT_QSEQ) {
      r = fastq_parse_qseq_read(&fastq_read, reads_f);
    }
    else {
      my_err("%s:%d: unknown read format", __FILE__, __LINE__);
    }

    if(r == FASTQ_END) {
      /* we have reached the end of the file */
      break;
    }

    if(r == FASTQ_ERR) {
      /* this record contains an error: warn (up to the limit) and skip */
      if(warn_count < FASTQ_MAX_WARN) {
        warn_count += 1;
        my_warn("%s:%d: skipping invalid fastq record:\n",
                __FILE__, __LINE__);
        fprintf(stderr, "  %s\n  %s\n  %s\n  %s\n", fastq_read.line1,
                fastq_read.line2, fastq_read.line3, fastq_read.line4);
      }
      n_fastq_err += 1;
    }
    else if(fastq_read.read_len != mapper->seed_finder_fwd->read_len) {
      /* read length does not match the length the mapper was set up for;
       * honour the same warning limit as for invalid records so that a
       * file full of wrong-length reads cannot flood stderr
       */
      if(warn_count < FASTQ_MAX_WARN) {
        warn_count += 1;
        my_warn("%s:%d: specified read length is %u, but got %d, "
                "skipping read\n",  __FILE__, __LINE__,
                mapper->seed_finder_fwd->read_len, fastq_read.read_len);
      }
      n_fastq_err += 1;
    }
    else if(r == FASTQ_OK) {
      n_fastq_rec += 1;

      read_from_fastq_record(&map_read, &fastq_read);

      /* progress indicator */
      if((n_fastq_rec % 1000000) == 0) {
        fprintf(stderr, ".");
      }

      /* try to map this read to genome */
      mapper_map_one_read(mapper, &map_read);

      if(map_read.map_code == MAP_CODE_NONE) {
        /* read does not map to genome */
        n_map_none += 1;
        if(output_type == OUTPUT_TYPE_SINGLE) {
          write_unmapped_read(output_files[0], &map_read);
        } else {
          write_unmapped_read(unmapped_out_file, &map_read);
        }
      }
      else if(map_read.map_code == MAP_CODE_MULTI) {
        /* read maps to multiple genomic locations */
        n_map_multi += 1;

        if(output_type == OUTPUT_TYPE_SINGLE) {
          write_read(output_files, mapper->chr_tab, &map_read, FALSE);
        } else {
          write_read(&multi_out_file, mapper->chr_tab, &map_read, FALSE);
        }
      }
      else if(map_read.map_code == MAP_CODE_UNIQUE) {
        /* read maps to single genomic location */
        n_map_uniq += 1;

        if(output_type == OUTPUT_TYPE_SINGLE) {
          write_read(output_files, mapper->chr_tab, &map_read, FALSE);
        } else {
          write_read(output_files, mapper->chr_tab, &map_read, TRUE);
        }
      }
      else {
        my_err("%s:%d: unknown mapping code", __FILE__, __LINE__);
      }
    } else {
      my_err("%s:%d: unknown fastq status", __FILE__, __LINE__);
    }
  }

  /* summary of everything that was read and mapped */
  fprintf(stderr, "\ndone\n");
  fprintf(stderr, "fastq errors: %ld\n", n_fastq_err);
  fprintf(stderr, "fastq records (without errors): %ld\n", n_fastq_rec);
  fprintf(stderr, "unmapped reads: %ld\n", n_map_none);
  fprintf(stderr, "uniquely mapping reads: %ld\n", n_map_uniq);
  fprintf(stderr, "multiply mapping reads: %ld\n", n_map_multi);

  my_free(map_read.fwd_nucs);
  my_free(map_read.rev_nucs);
}
/**
 * Writes one batch of reads to the BAM file, updates the statistics and
 * releases the batch.
 *
 * For every read in the batch's fq_batch, the matching entry of
 * mapping_batch->mapping_lists is inspected:
 *   - empty list: the read is written with write_unmapped_read and the
 *     (non-NULL) list is freed here; the read still adds one to
 *     total_mappings.
 *   - non-empty list: the list is handed to write_mapped_read and the
 *     single/multi alignment counter of the global st_bwt is bumped.
 *     NOTE(review): the list is not freed here, so write_mapped_read
 *     presumably takes ownership -- confirm.
 *
 * The totals are finally reported through basic_statistics_add.
 *
 * @param data  pointer to the batch_t to write; must not be NULL and must
 *              carry a valid mapping_batch and writer_input. The batch and
 *              its mapping batch are freed before returning.
 * @return      always 0.
 *
 * NOTE(review): st_bwt and writer_input->total_batches are updated without
 * any locking in this function; verify that only one writer runs at a time.
 */
int bam_writer(void *data) {
  extern st_bwt_t st_bwt;

  batch_t *batch = (batch_t *) data;
  mapping_batch_t *mapping_batch = (mapping_batch_t *) batch->mapping_batch;
  batch_writer_input_t *writer_input = batch->writer_input;
  bam_file_t *bam_file = writer_input->bam_file;

  size_t num_reads_b = array_list_size(mapping_batch->fq_batch);
  size_t num_mapped_reads = 0;
  size_t total_mappings = 0;

  writer_input->total_batches++;
  st_bwt.total_reads += num_reads_b;

  // The histogram is no longer needed once the batch reaches the writer.
  // Clear the pointer so that nothing releasing the batch later can free
  // it a second time (whether mapping_batch_free touches this field is
  // not visible from here).
  free(mapping_batch->histogram_sw);
  mapping_batch->histogram_sw = NULL;

  //
  // DNA/RNA mode
  //
  for (size_t i = 0; i < num_reads_b; i++) {
    size_t num_items = array_list_size(mapping_batch->mapping_lists[i]);
    fastq_read_t *fq_read =
      (fastq_read_t *) array_list_get(i, mapping_batch->fq_batch);

    total_mappings += num_items;

    // mapped or not mapped ?
    if (num_items == 0) {
      // an unmapped read still produces one record in the output
      total_mappings++;
      write_unmapped_read(fq_read, bam_file);
      if (mapping_batch->mapping_lists[i]) {
        array_list_free(mapping_batch->mapping_lists[i], NULL);
      }
    } else {
      num_mapped_reads++;

      if (num_items == 1) {
        st_bwt.single_alig++;
      } else {
        st_bwt.multi_alig++;
      }

      write_mapped_read(mapping_batch->mapping_lists[i], bam_file);
    }
  }

  // Both pointers were dereferenced above, so they are known to be
  // non-NULL at this point.
  mapping_batch_free(mapping_batch);
  batch_free(batch);

  basic_statistics_add(num_reads_b, num_mapped_reads, total_mappings, 0, basic_st);

  // The function is declared int, so it must not fall off the end.
  return 0;
}