// Unit test for create_sequence(): builds one populated Sequence and one made
// only of empty strings, then checks that each argument is found back,
// unchanged, in the corresponding field. The assertions below expect the
// arguments to map, in order, to label_full, label, sequence and quality.
void testCreateSequence() {
  // A fully populated sequence and an all-empty one (edge case).
  Sequence seq1 = create_sequence("label", "l", "AAAAAAAAAA", "!!!!!!!!!!");
  Sequence seq2 = create_sequence("", "", "", "");

  // First argument -> label_full.
  TAP_TEST(seq1.label_full == "label", TEST_CREATE_SEQUENCE_LABEL_FULL, "");
  TAP_TEST(seq2.label_full == "", TEST_CREATE_SEQUENCE_LABEL_FULL, "");

  // Second argument -> label.
  TAP_TEST(seq1.label == "l", TEST_CREATE_SEQUENCE_LABEL, "");
  TAP_TEST(seq2.label == "", TEST_CREATE_SEQUENCE_LABEL, "");

  // Third argument -> sequence.
  TAP_TEST(seq1.sequence == "AAAAAAAAAA", TEST_CREATE_SEQUENCE_SEQUENCE, "");
  TAP_TEST(seq2.sequence == "", TEST_CREATE_SEQUENCE_SEQUENCE, "");

  // Fourth argument -> quality.
  TAP_TEST(seq1.quality == "!!!!!!!!!!", TEST_CREATE_SEQUENCE_QUALITY, "");
  TAP_TEST(seq2.quality == "", TEST_CREATE_SEQUENCE_QUALITY, "");
}
// Aligns a single read and returns its best alignment.
//
// bases       - the read's bases, handed to create_sequence().
// read_length - number of bases in the read.
//
// Returns a heap-allocated Alignment (allocated with `new`, not released
// here), or NULL when no alignment was found for the read.
Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) {
  bwa_seq_t* read = create_sequence(bases, read_length);

  // Calculate the candidate paths for this one read.
  bwa_cal_sa_reg_gap(0, bwts, 1, read, &options);

  // Stays NULL when the search produced no alignment at all.
  Alignment* result = NULL;

  if (read->n_aln != 0) {
    // bwa_cal_sa_reg_gap destroys the bases / read length, so put them back
    // before going any further.
    copy_bases_into_sequence(read, bases, read_length);

    // Select the best alignment and propagate its information into the read.
    bwa_aln2seq(read->n_aln, read->aln, read);

    // Build the final alignment from the now fully populated read.
    result = new Alignment;
    *result = generate_final_alignment_from_sequence(read);
  }

  // The temporary read is released on both the hit and the no-hit path.
  bwa_free_read_seq(1, read);
  return result;
}
/* Reads data line after line from standard input, where every line
 * represents a single data point that can be a number or optionally
 * a number, any number of spaces, and a label, and returns the sequence
 * represented by this stream.
 *
 * The function is very tolerant about spaces and badly formatted lines:
 * empty lines and lines whose first token is not entirely a valid number
 * are silently skipped. Only the first whitespace-delimited word after the
 * number is used as label.
 *
 * Input is read through a 4096 byte buffer, so a longer line is seen by
 * fgets() as several consecutive pieces, each parsed as its own line.
 *
 * Returns the sequence obtained from create_sequence(), with one sample
 * added per accepted line. Labels are passed to sequence_add_sample() as
 * freshly strdup()ed strings (NULL when the line has no label). */
struct sequence *datastream_to_sequence(void) {
    struct sequence *seq = create_sequence();
    char buf[4096];

    while(fgets(buf,sizeof(buf),stdin) != NULL) {
        char *endptr, *start, *label = NULL, *p = buf;
        double value;

        /* Skip initial spaces.
         * The <ctype.h> functions require a value representable as
         * unsigned char (or EOF): a plain char holding a byte >= 0x80 is
         * negative where char is signed, hence the casts below. */
        while(*p && isspace((unsigned char)*p)) p++;
        if (*p == '\0') continue; /* Empty line, skip it */
        start = p;

        /* Find the end of the value and terminate it in place. */
        while(*p && !isspace((unsigned char)*p)) p++;
        if (*p) {
            *p = '\0';
            p++;
        }

        /* Parse the value: the whole token must be consumed. */
        errno = 0;
        value = strtod(start,&endptr);
        if (*endptr != '\0' || errno != 0) continue; /* Bad float, skip it */

        /* Find the start of the label */
        while(*p && isspace((unsigned char)*p)) p++;
        if (*p != '\0') {
            /* We have an additional label, find the end */
            label = p;
            while(*p && !isspace((unsigned char)*p)) p++;
            *p = '\0';
        }
        sequence_add_sample(seq,value,label ? strdup(label) : NULL);
    }
    return seq;
}
/* Consume standard input up to end of file, counting how many times each
 * byte value occurs, then turn the counters into a sequence of samples.
 *
 * While reading, bytes are folded to upper case when opt_mode is
 * ASPARK_MODE_TXTFREQ. When emitting, ASPARK_MODE_BINFREQ produces one
 * sample per byte value 0..255 labelled with its decimal code; any other
 * mode produces one sample per character from ' '+1 up to 'Z', labelled
 * with the character itself. */
struct sequence *file_freq_to_sequence(void) {
    unsigned int freq[256] = {0};
    struct sequence *result;
    int ch;

    /* Counting pass. */
    for (;;) {
        ch = getc(stdin);
        if (ch == EOF) break;
        if (opt_mode == ASPARK_MODE_TXTFREQ) ch = toupper(ch);
        freq[ch] += 1;
    }

    /* Emission pass. */
    result = create_sequence();
    if (opt_mode != ASPARK_MODE_BINFREQ) {
        /* Text flavour: label is the character itself. */
        for (ch = ' '+1; ch <= 'Z'; ch++) {
            char name[2];

            snprintf(name,sizeof(name),"%c",ch);
            sequence_add_sample(result,freq[ch],strdup(name));
        }
    } else {
        /* Binary flavour: label is the decimal byte value. */
        for (ch = 0; ch < 256; ch++) {
            char name[32];

            snprintf(name,sizeof(name),"%d",ch);
            sequence_add_sample(result,freq[ch],strdup(name));
        }
    }
    return result;
}
/* Convert a string in the form 1,2,3.4,5:label1,6:label2 into a sequence.
 *
 * Every comma separated item is a number optionally followed by ':' and a
 * label. The argument itself is never modified: parsing is done on a
 * private copy that is released before returning, on every path.
 *
 * On error NULL is returned and errno set accordingly:
 *
 * EINVAL => Invalid data format (not a number, trailing garbage, out of
 *           range, infinite or NaN value).
 * ENOMEM => The working copy of the argument could not be allocated. */
struct sequence *argument_to_sequence(const char *arg) {
    struct sequence *seq;
    char *copy, *start;

    /* Duplicate first, so that an allocation failure is detected before a
     * sequence exists (previously a NULL copy was dereferenced). */
    copy = strdup(arg);
    if (copy == NULL) {
        errno = ENOMEM;
        return NULL;
    }
    seq = create_sequence();

    start = copy;
    while(*start) {
        char *label, *end, *endptr;
        double value;

        /* Isolate the current item; the label is searched only inside it. */
        end = strchr(start,',');
        if (end) *end = '\0';
        label = strchr(start,':');
        if (label) {
            *label = '\0';
            label++; /* skip the ':' */
        }

        errno = 0;
        value = strtod(start,&endptr);
        if (*endptr != '\0' || errno != 0 || isinf(value) || isnan(value)) {
            /* Release the working copy: it used to leak on this path.
             * NOTE(review): seq and the samples added so far are still
             * leaked here; release them with the sequence destructor if
             * the project provides one. */
            free(copy);
            errno = EINVAL; /* Set after free() so it cannot be clobbered. */
            return NULL;
        }
        sequence_add_sample(seq,value,label ? strdup(label) : NULL);

        if (end)
            start = end+1;
        else
            break;
    }
    free(copy);
    return seq;
}
void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count) { bwa_seq_t* sequence = create_sequence(bases, read_length); // Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln). // This method will destroy the contents of seq and rseq. bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); paths = new bwt_aln1_t[sequence->n_aln]; memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t)); num_paths = sequence->n_aln; // Call aln2seq to initialize the type of match present. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); best_path_count = sequence->c1; second_best_path_count = sequence->c2; bwa_free_read_seq(1,sequence); }
/* Prepare the inputs for mmerge() from a linked list of sequences and run it.
 *
 * list     - head of the linked list of sequences (walked through ->next).
 * super    - forwarded untouched to mmerge() as its last argument.
 * majority - callback forwarded untouched to mmerge().
 *
 * Returns whatever mmerge() returns.
 *
 * NOTE(review): the result of malloc() is not checked, and neither the
 * array nor the sequences created here are released in this function;
 * confirm whether mmerge() takes ownership of them. */
int mmerge_process (SequenceList list, int *super, int (*majority)(Sequence **, int, int *, int, int *, int **, int *))
{
    int alphabet[MAX_ALPHABET_SIZE];
    int alpha_len = get_alphabet_set(list, alphabet);
    int n = get_size(list);
    Sequence **seq = (Sequence**) malloc (n * sizeof(Sequence*));
    Sequence *node;
    int i = 0;

    /* Build one new sequence per list node, in list order. */
    for (node = list; node; node = node->next)
        seq[i++] = create_sequence (node->seq, node->len);

    return mmerge(seq, n, alphabet, alpha_len, majority, super);
}
// Generate 10 sequences, and launch 10 times getRandom(1). // We should not have the same sequence 10 times (p < 10^{-10}) void testRandom() { list<Sequence> seqs; string seg_name = "seq"; char id = '0'; string sequence = "AA"; for (int i = 0; i < 10; i++) { seqs.push_back(create_sequence("seq" + string_of_int(id), "seq" + string_of_int(id), sequence, "")); sequence += "A"; id++; } SequenceSampler sampler(seqs); string first_random = sampler.getRandom(1).front().label; bool all_equal = true; for (int i = 0; i < 9 && all_equal; i++) { if (first_random != sampler.getRandom(1).front().label) all_equal = false; } TAP_TEST(all_equal == false, TEST_SAMPLER_RANDOM, "On the 10 trials, we drawn 10 times the same sequence"); }
void testLongest() { list<Sequence> seqs; seqs.push_back(create_sequence("seq1", "seq1", "AAAAAAAAA", "")); seqs.push_back(create_sequence("seq2", "seq2", "AAAAA", "")); seqs.push_back(create_sequence("seq3", "seq3", "AAAAAAAA", "")); seqs.push_back(create_sequence("seq4", "seq4", "AAAAAAAAAA", "")); seqs.push_back(create_sequence("seq5", "seq5", "AAAAAA", "")); seqs.push_back(create_sequence("seq6", "seq6", "AAAAAAA", "")); SequenceSampler s(seqs); list<Sequence> l1 = s.getLongest(6, 11); size_t *distrib = s.getLengthDistribution(); TAP_TEST(distrib[0] == 0 && distrib[1] == 0 && distrib[2] == 0 && distrib[3] == 0 && distrib[4] == 0 && distrib[5] == 1 && distrib[6] == 1 && distrib[7] == 1 && distrib[8] == 1 && distrib[9] == 1 && distrib[10] == 1, TEST_SAMPLER_LENGTH, ""); char id = '1'; TAP_TEST(l1.size() == 6, TEST_SAMPLER_LONGEST, ""); for (list<Sequence>::const_iterator it = l1.begin(); it != l1.end(); it++) { TAP_TEST(it->label[3] == id, TEST_SAMPLER_LONGEST, ""); id++; } // With only 10 buckets, the two longest sequences share the same bucket. // Due to their insertion order, the shorter will be sampled first l1 = s.getLongest(2, 10); distrib = s.getLengthDistribution(); TAP_TEST(distrib[0] == 0 && distrib[1] == 0 && distrib[2] == 0 && distrib[3] == 0 && distrib[4] == 0 && distrib[5] == 1 && distrib[6] == 1 && distrib[7] == 1 && distrib[8] == 1 && distrib[9] == 2, TEST_SAMPLER_LENGTH, ""); TAP_TEST(l1.size() == 2, TEST_SAMPLER_LONGEST, ""); TAP_TEST(l1.front().sequence.size() == 9, TEST_SAMPLER_LONGEST, ""); Sequence next = *(++l1.begin()); TAP_TEST(next.sequence.size() == 10, TEST_SAMPLER_LONGEST, "label = " << next.label); }
void BWA::generate_alignments_from_paths(const char* bases, const unsigned read_length, bwt_aln1_t* paths, const unsigned num_paths, const unsigned best_count, const unsigned second_best_count, Alignment*& alignments, unsigned& num_alignments) { bwa_seq_t* sequence = create_sequence(bases,read_length); sequence->aln = paths; sequence->n_aln = num_paths; // (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); // But overwrite key parts of the sequence in case the user passed back only a smaller subset // of the paths. sequence->c1 = best_count; sequence->c2 = second_best_count; sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; num_alignments = 0; for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++) num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1; alignments = new Alignment[num_alignments]; unsigned alignment_idx = 0; for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { // Stub in a 'working' path, so that only the desired alignment is local-aligned. const bwt_aln1_t* path = paths + path_idx; bwt_aln1_t working_path = *path; // Loop through all alignments, aligning each one individually. for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) { working_path.k = working_path.l = sa_idx; sequence->aln = &working_path; sequence->n_aln = 1; sequence->sa = sa_idx; sequence->strand = path->a; sequence->score = path->score; // Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse. // TODO: Fix the interface to bwa_refine_gapped so its easier to work with. if(alignment_idx > 0) seq_reverse(sequence->len, sequence->seq, 0); // Copy the local alignment data into the alignment object. *(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence); alignment_idx++; } } sequence->aln = NULL; sequence->n_aln = 0; bwa_free_read_seq(1,sequence); }