Beispiel #1
0
/**
 * Load every sequence from a file into a freshly allocated array of strings.
 *
 * @param path      file to read; die() is called if it cannot be opened
 * @param seqs_ptr  out: receives the array of strdup()'d sequence strings.
 *                  The caller owns the array and each string in it.
 * @param cap_ptr   out: receives the allocated capacity of the array
 *                  (number of char* slots, always >= the return value)
 * @return number of sequences stored in *seqs_ptr
 */
int load_seqs(const char *path, char ***seqs_ptr, int *cap_ptr)
{
  int cap = 1024;
  // NOTE(review): my_malloc() takes file/line, so it presumably reports and
  // aborts on failure itself -- confirm
  char **seqs = my_malloc(sizeof(char*) * cap,__FILE__,__LINE__);

  read_t read;
  seq_read_alloc(&read);

  seq_file_t *file = seq_open(path);
  if(file == NULL) die("Cannot open file: %s.", path);
  int num = 0;

  // Only a strictly positive return means a read was loaded; stop on both
  // end-of-file and error instead of storing a partial/invalid entry
  while(seq_read(file, &read) > 0)
  {
    if(num == cap) {
      // Grow through a temporary so the array is not lost if realloc fails
      int new_cap = cap * 2;
      char **grown = realloc(seqs, sizeof(char*) * new_cap);
      if(grown == NULL) die("Out of memory loading: %s.", path);
      seqs = grown;
      cap = new_cap;
    }

    char *copy = strdup(read.seq.b);
    if(copy == NULL) die("Out of memory loading: %s.", path);
    seqs[num++] = copy;
  }

  seq_read_dealloc(&read);
  seq_close(file);

  *seqs_ptr = seqs;
  *cap_ptr = cap;

  return num;
}
Beispiel #2
0
// Read every entry from each of the given sequence files, append them to
// `rbuf`, and close each file once it has been consumed.
// Returns how many reads were added to `rbuf` by this call.
size_t seq_load_all_reads(seq_file_t **seq_files, size_t num_files,
                          ReadBuffer *rbuf)
{
  status("Loading sequences...");

  // Remember the starting length so we can report only what we added
  const size_t len_before = rbuf->len;
  size_t fidx;
  read_t scratch;

  seq_read_alloc(&scratch);

  for(fidx = 0; fidx < num_files; fidx++)
  {
    seq_file_t *sf = seq_files[fidx];
    status("  file: %s", sf->path);

    for(;;)
    {
      if(seq_read_primary(sf, &scratch) <= 0) break;

      // The read struct is copied into the buffer, so give `scratch` fresh
      // storage before the next read
      // NOTE(review): presumably the buffer now owns the pushed read's
      // memory -- confirm against read_buf_push()
      read_buf_push(rbuf, &scratch, 1);
      seq_read_alloc(&scratch);
    }

    seq_close(sf);
  }

  // Release the last, unused allocation
  seq_read_dealloc(&scratch);

  return rbuf->len - len_before;
}
Beispiel #3
0
/**
 * Print one line per read in a sequence file: the read name followed by the
 * `end` fields of its name, sequence and quality buffers.
 *
 * Expects exactly one argument (the file path). Exits with EXIT_FAILURE on a
 * wrong argument count, if the file cannot be opened, or if the read cannot
 * be allocated.
 */
int main(int argc, char **argv)
{
  if(argc != 2) exit(EXIT_FAILURE);

  // Check the file before allocating anything so failure paths hold nothing
  seq_file_t *f = seq_open(argv[1]);
  if(f == NULL) exit(EXIT_FAILURE);

  read_t *r = seq_read_alloc();
  if(r == NULL) {
    seq_close(f);
    exit(EXIT_FAILURE);
  }

  while(seq_read(f,r) > 0) {
    // %zu with explicit casts: the format must match the argument type exactly
    printf("%s\t[%zu,%zu,%zu]\n", r->name.b,
           (size_t)r->name.end, (size_t)r->seq.end, (size_t)r->qual.end);
  }

  seq_close(f);
  seq_read_destroy(r);
  return EXIT_SUCCESS;
}
Beispiel #4
0
// Load reads from a file, apply sequence error, dump as FASTQ to `gzout`.
// If err > 0 a uniform error rate is applied, otherwise errors are drawn from
// the profile in `flist`.
// Stops early (with a message on stderr) if writing to `gzout` fails.
// Return total number of bases written
size_t mutate_reads(seq_file_t *sfile, gzFile gzout, FileList *flist, float err)
{
  printf(" reading: %s\n", sfile->path);
  read_t r;
  seq_read_alloc(&r);
  size_t num_bases = 0;

  while(seq_read(sfile, &r) > 0) {
    if(err > 0) add_seq_error_rate(r.seq.b, r.seq.end, err);
    else add_seq_error_profile(r.seq.b, r.seq.end, flist);

    // Write the entry piecewise: gzprintf() has a bounded output length and
    // writes nothing at all when a long read exceeds it. gzputs() returns a
    // negative value on error, gzputc() returns -1.
    int ok = gzputc(gzout, '@') != -1 &&
             gzputs(gzout, r.name.b) >= 0 &&
             gzputc(gzout, '\n') != -1 &&
             gzputs(gzout, r.seq.b) >= 0 &&
             gzputs(gzout, "\n+\n") >= 0 &&
             gzputs(gzout, r.qual.b) >= 0 &&
             gzputc(gzout, '\n') != -1;

    if(!ok) {
      fprintf(stderr, "Error: cannot write read from: %s\n", sfile->path);
      break;
    }

    num_bases += r.seq.end;
  }

  seq_read_dealloc(&r);
  return num_bases;
}
Beispiel #5
0
/**
 * Initialise a FileList from an array of paths.
 *
 * Opens every path, detects its FASTQ format and records the matching
 * quality-score offset. Also allocates the list's read and its error array
 * (512 zeroed entries). Calls die() if a file cannot be opened, its FASTQ
 * format cannot be detected, or memory cannot be allocated.
 *
 * @param flist  list to initialise (fields are overwritten)
 * @param paths  array of `num` file paths
 * @param num    number of entries in `paths`
 */
void filelist_alloc(FileList *flist, char **paths, size_t num)
{
  size_t i;
  flist->num_files = num;
  flist->curr = 0;
  flist->files = malloc(num * sizeof(*flist->files));
  flist->fqoffsets = malloc(num * sizeof(*flist->fqoffsets));

  // malloc(0) may legitimately return NULL, so only treat NULL as failure
  // when something was actually requested
  if(num > 0 && (flist->files == NULL || flist->fqoffsets == NULL))
    die("Out of memory");

  for(i = 0; i < num; i++) {
    if((flist->files[i] = seq_open(paths[i])) == NULL)
      die("Cannot open: %s", paths[i]);
    int min, max, fmt;
    fmt = seq_guess_fastq_format(flist->files[i], &min, &max);
    if(fmt < 0) die("Cannot detect FASTQ format: %s", paths[i]);
    flist->fqoffsets[i] = FASTQ_OFFSET[fmt];
    printf(" profile: %s [offset: %i]\n", paths[i], FASTQ_OFFSET[fmt]);
  }

  seq_read_alloc(&flist->read);
  flist->filesready = 1;
  flist->errors_cap = 512;
  flist->errors_len = 0;
  flist->errors = calloc(flist->errors_cap, sizeof(size_t));
  if(flist->errors == NULL) die("Out of memory");
}
Beispiel #6
0
// Unit test for kograph_filter_extend().
// Builds a KOGraph from three hard-coded reads, looks up the three
// consecutive 11-mers of "CCCGACAGGGCAA" as graph nodes, and checks the runs
// reported when extending along them: first all three nodes in one call,
// then reverse-complemented and fed one node per call.
static void test_kmer_occur_filter()
{
  // Graph is allocated with kmer-size=11 and ncols=3
  dBGraph graph;
  const size_t kmer_size = 11, ncols = 3;
  size_t i;

  // Create graph
  db_graph_alloc(&graph, kmer_size, ncols, 1, 2000,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  //      xyz------->>>      y         >  <         X
  // TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA

  // NOTE(review): these macros are not #undef'd inside this function, so
  // they remain defined for whatever follows it in the file
  #define NUM_NODES 3
  #define NUM_READS 3

  // Input sequences; only the second is long enough to contain the query
  // k-mers (the first and third are 4 and 15 bases long)
  const char *tmp[NUM_READS]
  = {
    "AACA",
    "TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA",
    "TCTAGCATGTGTGTT"};

  read_t reads[NUM_READS];
  for(i = 0; i < NUM_READS; i++) {
    seq_read_alloc(&reads[i]);
    seq_read_set(&reads[i], tmp[i]);
  }

  KOGraph kograph = kograph_create(reads, NUM_READS, true, 0, 1, &graph);

  // One chrom entry is expected per input read
  TASSERT(kograph.nchroms == NUM_READS);
  TASSERT(kograph.koccurs != NULL);

  // Three run buffers passed to every kograph_filter_extend() call below
  KOccurRunBuffer koruns, koruns_tmp, koruns_ended;
  korun_buf_alloc(&koruns, 16);
  korun_buf_alloc(&koruns_tmp, 16);
  korun_buf_alloc(&koruns_ended, 16);

  // Check CCCGACAGGGCAA starts at CCCGACAGGGC
  // x=CCCGACAGGGC, y=CCGACAGGGCA, z=CGACAGGGCAA
  // X=GCCCTGTCGGG, Y=TGCCCTGTCGG, Z=TTGCCCTGTCG
  dBNode nodes[NUM_NODES];
  // Node i is looked up from the string starting at offset i of the 13-mer
  for(i = 0; i < NUM_NODES; i++)
    nodes[i] = db_graph_find_str(&graph, &"CCCGACAGGGCAA"[i]);

  korun_buf_reset(&koruns);
  korun_buf_reset(&koruns_ended);
  kograph_filter_extend(&kograph, nodes, NUM_NODES, true, 0, 0,
                        &koruns, &koruns_tmp, &koruns_ended);

  // Checks
  // Expect a single forward-strand run on chrom 1 covering positions 5..7
  TASSERT2(koruns.len == 1, "koruns.len: %zu", koruns.len);
  TASSERT(koruns.b[0].strand == STRAND_PLUS); // left-to-right with ref
  TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom);
  TASSERT2(koruns.b[0].first == 5, "offset: %zu", (size_t)koruns.b[0].first);
  TASSERT2(koruns.b[0].last == 7, "last: %zu", (size_t)koruns.b[0].last);

  // Test reverse
  db_nodes_reverse_complement(nodes, NUM_NODES);

  // Feed the reversed nodes one at a time; the sixth argument goes 0, 1, 2
  // NOTE(review): presumably the offset of the node within the query --
  // confirm against kograph_filter_extend()
  korun_buf_reset(&koruns);
  korun_buf_reset(&koruns_ended);
  kograph_filter_extend(&kograph, nodes, 1, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended);
  kograph_filter_extend(&kograph, nodes+1, 1, true, 0, 1, &koruns, &koruns_tmp, &koruns_ended);
  kograph_filter_extend(&kograph, nodes+2, 1, true, 0, 2, &koruns, &koruns_tmp, &koruns_ended);

  // Print out for debugging
  // printf("koruns: ");
  // koruns_print(koruns.b, koruns.len, kmer_size, stdout);
  // printf("\nkoruns_ended: ");
  // koruns_print(koruns_ended.b, koruns_ended.len, kmer_size, stdout);
  // printf("\n");

  // Check results match:
  // koruns: chromid:1:17-5:-, chromid:1:37-47:+
  // koruns_ended: chromid:1:34-24:-
  // Only the first entry of koruns is inspected in detail below
  TASSERT2(koruns.len == 2, "koruns.len: %zu", koruns.len);
  TASSERT2(koruns_ended.len == 1, "koruns_ended.len: %zu", koruns_ended.len);
  TASSERT(koruns.b[0].strand == STRAND_MINUS); // reverse complement of ref
  TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom);
  TASSERT2(koruns.b[0].first == 7, "offset: %zu", (size_t)koruns.b[0].first);
  TASSERT2(koruns.b[0].last == 5, "last: %zu", (size_t)koruns.b[0].last);

  // Release everything allocated by this test
  korun_buf_dealloc(&koruns);
  korun_buf_dealloc(&koruns_tmp);
  korun_buf_dealloc(&koruns_ended);

  for(i = 0; i < NUM_READS; i++) seq_read_dealloc(&reads[i]);
  kograph_dealloc(&kograph);

  db_graph_dealloc(&graph);
}
Beispiel #7
0
// Read sequences in pairs and call `align` on each pair.
// If path2 is NULL, read pairs of consecutive entries from the first file
// Otherwise read one entry from each file per pair
// Errors and warnings are reported on stderr; if either file cannot be
// opened the function returns without aligning anything.
void align_from_file(const char *path1, const char *path2,
                     void (align)(read_t *r1, read_t *r2),
                     bool use_zlib)
{
  seq_file_t *sf1, *sf2;

  if((sf1 = open_seq_file(path1, use_zlib)) == NULL)
  {
    fprintf(stderr, "Alignment Error: couldn't open file %s\n", path1);
    fflush(stderr);
    return;
  }

  if(path2 == NULL)
  {
    // Single input: both reads of a pair come from the same file handle
    sf2 = sf1;
  }
  else if((sf2 = open_seq_file(path2, use_zlib)) == NULL)
  {
    // Report the file that actually failed, and don't leak the first one
    fprintf(stderr, "Alignment Error: couldn't open file %s\n", path2);
    fflush(stderr);
    seq_close(sf1);
    return;
  }

  // fprintf(stderr, "File buffer %zu zlib: %i\n", sf1->in.size, seq_use_gzip(sf1));

  read_t read1, read2;
  seq_read_alloc(&read1);
  seq_read_alloc(&read2);

  // Loop while we can read a sequence from the first file
  unsigned long alignments;

  for(alignments = 0; seq_read(sf1, &read1) > 0; alignments++)
  {
    // Every first read needs a partner; stop at an unpaired entry
    if(seq_read(sf2, &read2) <= 0)
    {
      fprintf(stderr, "Alignment Error: Odd number of sequences - "
                      "I read in pairs!\n");
      fflush(stderr);
      break;
    }

    (align)(&read1, &read2);
  }

  // warn if no pairs were aligned
  if(alignments == 0)
  {
    fprintf(stderr, "Alignment Warning: empty input\n");
    fflush(stderr);
  }

  // Close files (sf2 aliases sf1 when path2 is NULL, so close it only once)
  seq_close(sf1);

  if(path2 != NULL)
    seq_close(sf2);

  // Free memory
  seq_read_dealloc(&read1);
  seq_read_dealloc(&read2);
}
Beispiel #8
0
// Simulate reads sampled from a reference genome.
//
// Loads every sequence from `reffile` that is at least one template long,
// then samples enough reads to reach `depth` coverage of the loaded genome.
// Reads are written as FASTA to `out0`; if `out1` is non-NULL, paired-end
// reads are generated and the reverse-complemented mate goes to `out1`.
// Errors are added from the profile in `flist` if it is non-NULL, otherwise
// at the uniform rate `err_rate` if that is >= 0 (to both mates).
// Calls die() if no sequence is long enough or memory runs out.
//
// Returns num of bases printed
size_t sim_reads(seq_file_t *reffile, gzFile out0, gzFile out1,
                 FileList *flist, float err_rate,
                 size_t insert, double insert_stddev, size_t rlen, double depth)
{
  size_t i, chromcap = 16, nchroms, glen = 0, nreads, chr, pos0, pos1, tlen;
  read_t *chroms;

  // Template length: a single read, or read + insert + read when paired
  tlen = rlen + (out1 == NULL ? 0 : insert + rlen);

  chroms = malloc(chromcap * sizeof(read_t));
  if(chroms == NULL) die("Out of memory");
  nchroms = 0;

  // Load genome
  printf(" Loaded contigs:");
  while(1)
  {
    if(nchroms == chromcap) {
      // Grow through a temporary so a failed realloc is detected before the
      // old pointer is overwritten
      read_t *grown = realloc(chroms, (chromcap*2) * sizeof(read_t));
      if(grown == NULL) die("Out of memory");
      chroms = grown;
      chromcap *= 2;
    }
    seq_read_alloc(&chroms[nchroms]);
    if(seq_read(reffile, &chroms[nchroms]) <= 0)
    { seq_read_dealloc(&chroms[nchroms]); break; }
    // Skip sequences too short to hold a whole template
    if(chroms[nchroms].seq.end < tlen) { seq_read_dealloc(&chroms[nchroms]); }
    else {
      seq_read_truncate_name(&chroms[nchroms]);
      printf(" %s[%zu]", chroms[nchroms].name.b, chroms[nchroms].seq.end);
      glen += chroms[nchroms].seq.end;
      nchroms++;
    }
  }
  printf("\n Genome size: %zu\n", glen);

  if(nchroms == 0) {
    die("No sequences long enough in ref genome file [min len: %zu]: %s",
        tlen, reffile->path);
  }

  // Sample
  nreads = (glen * depth) / (out1 == NULL ? rlen : (2 * rlen));
  char read0[rlen+1], read1[rlen+1];
  read0[rlen] = read1[rlen] = '\0';

  printf("Sampling %zu %sreads...\n", nreads,
         out1 == NULL ? "single " : "paired-end ");

  // Sample paired-end if out1 != NULL
  for(i = 0; i < nreads; i++)
  {
    chr = (nchroms == 1) ? 0 : rand_chrom(chroms, nchroms, glen);
    pos0 = random_uniform(chroms[chr].seq.end - (out1 == NULL ? rlen : tlen));
    pos1 = pos0;
    memcpy(read0, chroms[chr].seq.b+pos0, rlen);
    if(out1 != NULL) {
      // Pick the mate position in floating point and clamp it to
      // [0, end-rlen] before converting: a large negative deviation would
      // otherwise convert a negative double to size_t (undefined behaviour)
      double max_pos1 = (double)(chroms[chr].seq.end - rlen);
      double mate = (double)(pos0 + rlen + insert) + ran_normal()*insert_stddev;
      if(!(mate >= 0)) mate = 0;
      if(mate > max_pos1) mate = max_pos1;
      pos1 = (size_t)mate;
      memcpy(read1, chroms[chr].seq.b+pos1, rlen);
    }
    if(flist != NULL) {
      add_seq_error_profile(read0, rlen, flist);
      if(out1 != NULL)
        add_seq_error_profile(read1, rlen, flist);
    }
    else if(err_rate >= 0) {
      add_seq_error_rate(read0, rlen, err_rate);
      // The second mate needs errors too, as in the profile branch above
      if(out1 != NULL)
        add_seq_error_rate(read1, rlen, err_rate);
    }
    gzprintf(out0, ">r%zu:%s:%zu:%zu%s\n%.*s\n", i, chroms[chr].name.b,
                   pos0, pos1, (out1 != NULL ? "/1" : ""), (int)rlen, read0);
    if(out1 != NULL) {
      dna_revcmp(read1, rlen);
      gzprintf(out1, ">r%zu:%s:%zu:%zu/2\n%.*s\n", i, chroms[chr].name.b,
                     pos0, pos1, (int)rlen, read1);
    }
  }

  for(i = 0; i < nchroms; i++) seq_read_dealloc(&chroms[i]);
  free(chroms);

  size_t num_bases = nreads * rlen;
  if(out1 != NULL) num_bases *= 2;

  return num_bases;
}