Exemple #1
0
// Checks that create_sequence() hands back a Sequence whose four fields
// (label_full, label, sequence, quality) compare equal to the four arguments
// it was given, in that order. Two cases are exercised: a populated record
// and a record built entirely from empty strings.
void testCreateSequence() {
  // Populated record: full label, short label, ten bases, ten quality symbols.
  Sequence seq1 = create_sequence("label", "l", "AAAAAAAAAA", "!!!!!!!!!!");
  // Degenerate record: every field empty.
  Sequence seq2 = create_sequence("", "", "", "");

  // 1st argument -> label_full
  TAP_TEST(seq1.label_full == "label", TEST_CREATE_SEQUENCE_LABEL_FULL, "");
  TAP_TEST(seq2.label_full == "", TEST_CREATE_SEQUENCE_LABEL_FULL, "");

  // 2nd argument -> label
  TAP_TEST(seq1.label == "l", TEST_CREATE_SEQUENCE_LABEL, "");
  TAP_TEST(seq2.label == "", TEST_CREATE_SEQUENCE_LABEL, "");

  // 3rd argument -> sequence
  TAP_TEST(seq1.sequence == "AAAAAAAAAA", TEST_CREATE_SEQUENCE_SEQUENCE, "");
  TAP_TEST(seq2.sequence == "", TEST_CREATE_SEQUENCE_SEQUENCE, "");

  // 4th argument -> quality
  TAP_TEST(seq1.quality == "!!!!!!!!!!", TEST_CREATE_SEQUENCE_QUALITY, "");
  TAP_TEST(seq2.quality == "", TEST_CREATE_SEQUENCE_QUALITY, "");
}
Exemple #2
0
// Aligns one read and returns the best alignment found for it.
//
// bases / read_length describe the read; they are used to build a temporary
// bwa_seq_t that is released before returning on every path.
//
// Returns a pointer to an Alignment allocated with `new`, or NULL when the
// search reports no alignments (n_aln == 0).
// NOTE(review): the caller presumably owns the returned object and must
// delete it -- confirm against call sites.
Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) {
  bwa_seq_t* read = create_sequence(bases,read_length);

  // Run the gapped search; results land in read->aln / read->n_aln.
  bwa_cal_sa_reg_gap(0,bwts,1,read,&options);

  Alignment* best = NULL;
  if(read->n_aln != 0) {
    // The search clobbers the bases / read length, so restore them first.
    copy_bases_into_sequence(read,bases,read_length);

    // Select the best candidate and push its details into the read record.
    bwa_aln2seq(read->n_aln,read->aln,read);

    // Materialise the final alignment from the updated read record.
    best = new Alignment;
    *best = generate_final_alignment_from_sequence(read);
  }

  // Single release point for the temporary read, hit or miss.
  bwa_free_read_seq(1,read);

  return best;
}
Exemple #3
0
/* Reads data line after line from standard input, where every line
 * represents a single data point that can be a number or optionally
 * a number, any number of spaces, and a label, and returns the sequence
 * represented by this stream.
 *
 * The function is very tolerant about spaces and badly formatted lines:
 * blank lines and lines whose first token is not a complete, in-range
 * floating point number are silently skipped. Only the first
 * whitespace-delimited word after the value is kept as the label.
 *
 * Lines are read through a 4096 byte buffer, so a longer line is consumed
 * in several fgets() calls and each fragment is parsed on its own.
 *
 * Each label handed to sequence_add_sample() is a fresh strdup() copy
 * (or NULL when the line has no label).
 *
 * Every character passed to isspace() is converted to unsigned char first:
 * calling the <ctype.h> functions with a negative value other than EOF is
 * undefined behavior, and plain char may be signed, so bytes >= 0x80 in
 * the input would otherwise trigger it. */
struct sequence *datastream_to_sequence(void) {
    struct sequence *seq = create_sequence();
    char buf[4096];

    while(fgets(buf,sizeof(buf),stdin) != NULL) {
        char *endptr, *start, *label = NULL, *p = buf;
        double value;

        /* Skip initial spaces */
        while(*p && isspace((unsigned char)*p)) p++;
        if (*p == '\0') continue; /* Empty line, skip it */
        start = p;
        /* Find the end of the value */
        while(*p && !isspace((unsigned char)*p)) p++;
        if (*p) {
            /* Terminate the value token in place and step past it. */
            *p = '\0';
            p++;
        }
        /* Parse the value; the whole token must be consumed. */
        errno = 0;
        value = strtod(start,&endptr);
        if (*endptr != '\0' || errno != 0) continue; /* Bad float, skip it */
        /* Find the start of the label */
        while(*p && isspace((unsigned char)*p)) p++;
        if (*p != '\0') {
            /* We have an additional label, find the end */
            label = p;
            while(*p && !isspace((unsigned char)*p)) p++;
            *p = '\0';
        }
        sequence_add_sample(seq,value,label ? strdup(label) : NULL);
    }
    return seq;
}
Exemple #4
0
/* Consume standard input up to end of file, tallying how often each byte
 * value occurs, then turn the tally into a sequence of labelled samples.
 *
 * When opt_mode is ASPARK_MODE_TXTFREQ every byte is passed through
 * toupper() before being counted.
 *
 * When opt_mode is ASPARK_MODE_BINFREQ one sample is emitted for each of
 * the 256 byte values, labelled with the value in decimal. In every other
 * mode one sample is emitted for each character from ' '+1 through 'Z',
 * labelled with the character itself.
 *
 * Labels are strdup() copies handed to sequence_add_sample(). */
struct sequence *file_freq_to_sequence(void) {
    unsigned int freq[256] = {0};
    struct sequence *result;
    int ch;

    /* Counting pass. */
    for (;;) {
        ch = getc(stdin);
        if (ch == EOF) break;
        if (opt_mode == ASPARK_MODE_TXTFREQ) ch = toupper(ch);
        freq[ch]++;
    }

    /* Emission pass. */
    result = create_sequence();
    if (opt_mode != ASPARK_MODE_BINFREQ) {
        for (ch = ' '+1; ch <= 'Z'; ch++) {
            /* One-character label, built by hand. */
            char label[2];

            label[0] = (char)ch;
            label[1] = '\0';
            sequence_add_sample(result,freq[ch],strdup(label));
        }
        return result;
    }

    for (ch = 0; ch < 256; ch++) {
        char label[32];

        snprintf(label,sizeof(label),"%d",ch);
        sequence_add_sample(result,freq[ch],strdup(label));
    }
    return result;
}
Exemple #5
0
/* Convert a string in the form 1,2,3.4,5:label1,6:label2 into a sequence.
 *
 * The argument is split on ','; inside each item an optional ':' separates
 * the numeric value from its label. Labels are passed to
 * sequence_add_sample() as strdup() copies (NULL when absent).
 *
 * On error NULL is returned and errno set accordingly:
 *
 * EINVAL => Invalid data format (value not fully numeric, out of range,
 *           infinite or NaN).
 *
 * NULL is also returned if the working copy of 'arg' cannot be allocated;
 * in that case errno is whatever strdup() left (ENOMEM on POSIX). */
struct sequence *argument_to_sequence(const char *arg) {
    struct sequence *seq;
    char *copy, *start;

    /* Duplicate first: the parser writes NUL bytes into the string, and
     * failing here means nothing else has been allocated yet. */
    copy = strdup(arg);
    if (copy == NULL) return NULL;

    seq = create_sequence();
    start = copy;
    while(*start) {
        char *label, *end, *endptr;
        double value;

        /* Isolate the current item. */
        end = strchr(start,',');
        if (end) *end = '\0';
        /* Split "value:label" in place. */
        label = strchr(start,':');
        if (label) {
            *label = '\0';
            label++; /* skip the ':' */
        }
        errno = 0;
        value = strtod(start,&endptr);
        if (*endptr != '\0' || errno != 0 || isinf(value) || isnan(value)) {
            /* Release the working copy before bailing out; errno is set
             * after free() so the reported code is always EINVAL.
             * NOTE(review): 'seq' and the labels already added to it are
             * still leaked here, because no sequence destructor is visible
             * from this function -- release it here if one exists. */
            free(copy);
            errno = EINVAL;
            return NULL;
        }
        sequence_add_sample(seq,value,label ? strdup(label) : NULL);
        if (end)
            start = end+1;
        else
            break;
    }
    free(copy);
    return seq;
}
Exemple #6
0
void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count) 
{
  bwa_seq_t* sequence = create_sequence(bases, read_length);

  // Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln).
  // This method will destroy the contents of seq and rseq.
  bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options);

  paths = new bwt_aln1_t[sequence->n_aln];
  memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t));
  num_paths = sequence->n_aln;

  // Call aln2seq to initialize the type of match present.
  bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
  best_path_count = sequence->c1;
  second_best_path_count = sequence->c2;

  bwa_free_read_seq(1,sequence);
}
Exemple #7
0
/* Prepares the inputs for mmerge() from a linked list of sequences and
 * returns whatever mmerge() returns.
 *
 * The alphabet is collected with get_alphabet_set(), the list length with
 * get_size(), and every list node is turned into a fresh object via
 * create_sequence(node->seq, node->len) stored in a flat array. `super`
 * and `majority` are forwarded to mmerge() untouched.
 *
 * NOTE(review): the malloc() result is not checked and the array is not
 * released in this function; whether mmerge() takes ownership of it is not
 * visible from here. */
int mmerge_process (SequenceList list, int *super, int (*majority)(Sequence **, int, int *, int, int *, int **, int *)) {
  int alphabet[MAX_ALPHABET_SIZE];
  int alpha_len = get_alphabet_set(list, alphabet);
  int n = get_size(list);
  Sequence **copies = (Sequence**) malloc (n * sizeof(Sequence*));
  Sequence *cursor;
  int filled = 0;

  /* Walk the list once, appending one new sequence per node. */
  for (cursor = list; cursor; cursor = cursor->next)
    copies[filled++] = create_sequence (cursor->seq, cursor->len);

  return mmerge(copies, n, alphabet, alpha_len, majority, super);
}
// Generate 10 sequences, and launch 10 times getRandom(1).
// We should not have the same sequence 10 times (p < 10^{-10})
void testRandom() {
  list<Sequence> seqs;
  string seg_name = "seq";
  char id = '0';
  string sequence = "AA";

  for (int i = 0; i < 10; i++) {
    seqs.push_back(create_sequence("seq" + string_of_int(id), "seq" + string_of_int(id), sequence, ""));
    sequence += "A";
    id++;
  }

  SequenceSampler sampler(seqs);
  string first_random = sampler.getRandom(1).front().label;
  bool all_equal = true;
  for (int i = 0; i < 9 && all_equal; i++) {
    if (first_random != sampler.getRandom(1).front().label)
      all_equal = false;
  }

  TAP_TEST(all_equal == false, TEST_SAMPLER_RANDOM, "On the 10 trials, we drawn 10 times the same sequence");
}
// Exercises SequenceSampler::getLongest() and getLengthDistribution() on six
// sequences labelled seq1..seq6 whose lengths are 9, 5, 8, 10, 6 and 7.
// Two configurations are checked: getLongest(6, 11) and getLongest(2, 10).
// NOTE(review): the first argument looks like the number of sequences
// requested and the second the number of length buckets -- confirm against
// the SequenceSampler declaration.
void testLongest() {
  list<Sequence> seqs;

  seqs.push_back(create_sequence("seq1", "seq1", "AAAAAAAAA", ""));
  seqs.push_back(create_sequence("seq2", "seq2", "AAAAA", ""));
  seqs.push_back(create_sequence("seq3", "seq3", "AAAAAAAA", ""));
  seqs.push_back(create_sequence("seq4", "seq4", "AAAAAAAAAA", ""));
  seqs.push_back(create_sequence("seq5", "seq5", "AAAAAA", ""));
  seqs.push_back(create_sequence("seq6", "seq6", "AAAAAAA", ""));

  SequenceSampler s(seqs);

  // First configuration: ask for 6 sequences with a second argument of 11.
  list<Sequence> l1 = s.getLongest(6, 11);
  size_t *distrib = s.getLengthDistribution();

  // Expected distribution: nothing at indices 0..4, exactly one sequence at
  // each of indices 5..10 (one per distinct length 5..10).
  TAP_TEST(distrib[0] == 0
           && distrib[1] == 0 && distrib[2] == 0 && distrib[3] == 0 && distrib[4] == 0
           && distrib[5] == 1 && distrib[6] == 1 && distrib[7] == 1 && distrib[8] == 1
           && distrib[9] == 1 && distrib[10] == 1,
           TEST_SAMPLER_LENGTH, "");

  // All six sequences must come back, and the 4th character of each label
  // (the digit after "seq") must run '1', '2', ... in list order.
  char id = '1';
  TAP_TEST(l1.size() == 6, TEST_SAMPLER_LONGEST, "");
  for (list<Sequence>::const_iterator it = l1.begin(); it != l1.end(); it++) {
    TAP_TEST(it->label[3] == id, TEST_SAMPLER_LONGEST, "");
    id++;
  }

  // With only 10 buckets, the two longest sequences share the same bucket.
  // Due to their insertion order, the shorter will be sampled first
  l1 = s.getLongest(2, 10);
  distrib = s.getLengthDistribution();

  // Expected distribution now: indices 5..8 hold one sequence each and
  // index 9 holds two (the length-9 and length-10 sequences).
  TAP_TEST(distrib[0] == 0
           && distrib[1] == 0 && distrib[2] == 0 && distrib[3] == 0 && distrib[4] == 0
           && distrib[5] == 1 && distrib[6] == 1 && distrib[7] == 1 && distrib[8] == 1
           && distrib[9] == 2, TEST_SAMPLER_LENGTH, "");

  // Two results expected: the length-9 sequence first, then the length-10.
  TAP_TEST(l1.size() == 2, TEST_SAMPLER_LONGEST, "");
  TAP_TEST(l1.front().sequence.size() == 9, TEST_SAMPLER_LONGEST, "");
  // Copy of the second element of the result list.
  Sequence next = *(++l1.begin());
  TAP_TEST(next.sequence.size() == 10, TEST_SAMPLER_LONGEST, "label = " << next.label);
}
Exemple #10
0
void BWA::generate_alignments_from_paths(const char* bases, 
                                         const unsigned read_length, 
                                         bwt_aln1_t* paths, 
                                         const unsigned num_paths, 
                                         const unsigned best_count,
                                         const unsigned second_best_count,
                                         Alignment*& alignments, 
                                         unsigned& num_alignments) 
{
  bwa_seq_t* sequence = create_sequence(bases,read_length);

  sequence->aln = paths;
  sequence->n_aln = num_paths;

  // (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself.
  bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);

  // But overwrite key parts of the sequence in case the user passed back only a smaller subset
  // of the paths.
  sequence->c1 = best_count;
  sequence->c2 = second_best_count;
  sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;

  num_alignments = 0;
  for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++)
    num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1;

  alignments = new Alignment[num_alignments];
  unsigned alignment_idx = 0;

  for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
    // Stub in a 'working' path, so that only the desired alignment is local-aligned.
    const bwt_aln1_t* path = paths + path_idx;
    bwt_aln1_t working_path = *path;

    // Loop through all alignments, aligning each one individually.
    for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) {
      working_path.k = working_path.l = sa_idx;
      sequence->aln = &working_path;
      sequence->n_aln = 1;

      sequence->sa = sa_idx;
      sequence->strand = path->a;
      sequence->score = path->score;

      // Each time through bwa_refine_gapped, seq gets reversed.  Revert the reverse.
      // TODO: Fix the interface to bwa_refine_gapped so its easier to work with.
      if(alignment_idx > 0)
        seq_reverse(sequence->len, sequence->seq, 0);

      // Copy the local alignment data into the alignment object.
      *(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence);

      alignment_idx++;
    }
  }

  sequence->aln = NULL;
  sequence->n_aln = 0;

  bwa_free_read_seq(1,sequence);
}