void align(char* seq_a, char* seq_b) { // Variables to store alignment result sw_aligner_t *sw = smith_waterman_new(); alignment_t *result = alignment_create(256); // Decide on scoring int match = 1; int mismatch = -2; int gap_open = -4; int gap_extend = -1; // Don't penalise gaps at the start // ACGATTT // ----TTT would score +3 (when match=+1) char no_start_gap_penalty = 1; // ..or gaps at the end e.g. // ACGATTT // ACGA--- would score +4 (when match=+1) char no_end_gap_penalty = 1; char no_gaps_in_a = 0, no_gaps_in_b = 0; char no_mismatches = 0; // Compare character case-sensitively (usually set to 0 for DNA etc) char case_sensitive = 0; scoring_t scoring; scoring_init(&scoring, match, mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty, no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive); // Add some special cases // x -> y means x in seq1 changing to y in seq2 scoring_add_mutation(&scoring, 'a', 'c', -2); // a -> c give substitution score -2 scoring_add_mutation(&scoring, 'c', 'a', -1); // c -> a give substitution score -1 // We could also prohibit the aligning of characters not given as special cases // scoring.use_match_mismatch = 0; smith_waterman_align(seq_a, seq_b, &scoring, sw); while(smith_waterman_fetch(sw, result)) { printf("seqA: %s [start:%zu]\n", result->result_a, result->pos_a); printf("seqB: %s [start:%zu]\n", result->result_b, result->pos_b); printf("alignment score: %i\n\n", result->score); } // Free memory for storing alignment results smith_waterman_free(sw); alignment_free(result); }
void align(char* seq_a, char* seq_b, int min_score) { // Decide on scoring int match = 1; int mismatch = -2; int gap_open = -4; int gap_extend = -1; // Compare character case-sensitively (usually set to 0 for DNA etc) char case_sensitive = 0; // Create scoring system SCORING_SYSTEM* scoring = scoring_create(match, mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty, case_sensitive); // Add some special cases // x -> y means x in seq1 changing to y in seq2 scoring_add_mutation(scoring, 'a', 'c', -2); // a -> c give substitution score -2 scoring_add_mutation(scoring, 'c', 'a', -1); // c -> a give substitution score -1 // We could also prohibit the aligning of characters not given as special cases // scoring->use_match_mismatch = 0; // Do alignment SW_COMPUTATION* smithwaterman = smith_waterman_align(seq_a, seq_b, scoring); // Allocate memory for storing result SW_LOCAL_ALIGNMENT* alignment = (SW_LOCAL_ALIGNMENT*) malloc(sizeof(SW_LOCAL_ALIGNMENT)); // Loop through results while(smith_waterman_get_hit(smithwaterman, alignment) && alignment->score >= min_score) { printf("seqA [%u]: %s\n", alignment->pos_a, alignment->result_a); printf("seqB [%u]: %s\n", alignment->pos_b, alignment->result_b); printf("score: %i\n\n", alignment->score); } // Free result free(alignment); // Free memory used to store scoring preferences scoring_free(scoring); }
/*
 * Register a substitution score for every ordered pair of characters in str.
 *
 * For each pair of positions (i, j) in str, the score is read with
 * ARR_LOOKUP(scores, len, i, j), where len = strlen(str), and passed to
 * scoring_add_mutation() as the score for str[i] -> str[j]. When
 * scoring->case_sensitive is zero both characters are lower-cased first.
 * Finally scoring->use_match_mismatch is set to use_match_mismatch.
 */
void scoring_add_mutations(scoring_t* scoring, const char *str,
                           const int *scores, char use_match_mismatch)
{
  size_t i, j, len = strlen(str);

  for(i = 0; i < len; i++)
  {
    // tolower() is only defined for unsigned char values and EOF, so go
    // through unsigned char before calling it on a plain char.
    char a = scoring->case_sensitive
               ? str[i] : (char)tolower((unsigned char)str[i]);

    for(j = 0; j < len; j++)
    {
      char b = scoring->case_sensitive
                 ? str[j] : (char)tolower((unsigned char)str[j]);
      int score = ARR_LOOKUP(scores, len, i, j);

      scoring_add_mutation(scoring, a, b, score);
    }
  }

  scoring->use_match_mismatch = use_match_mismatch;
}
/*
 * Load a substitution-score matrix from file into scoring.
 *
 * Layout, as parsed below:
 *  - Lines before the header that are empty, all whitespace, or start with
 *    '#' are skipped.
 *  - The first remaining line is the header. Its first character is the
 *    column separator. Digits and '-' are rejected as separators.
 *  - If the separator is whitespace, columns are split on any run of
 *    whitespace: every non-whitespace character of the header is a column
 *    character, and each row is a row character followed by one integer per
 *    column.
 *  - Otherwise the header alternates separator, character, separator, ...
 *    and each row is a row character followed by (separator, integer) pairs.
 *
 * Every parsed score is registered with
 * scoring_add_mutation(scoring, row_char, column_char, score). Characters are
 * lower-cased first unless case_sensitive is non-zero.
 *
 * file_path is only passed on to _loading_error() for its messages.
 *
 * NOTE(review): _loading_error() is presumably fatal (does not return) -
 * confirm. The break / return statements that follow its calls below only
 * matter if it does return.
 * NOTE(review): line_num is not advanced for skipped rows, so the line
 * numbers given to _loading_error() are approximate.
 */
void align_scoring_load_matrix(gzFile file, const char* file_path,
                               scoring_t* scoring, char case_sensitive)
{
  StrBuf* sbuf = strbuf_new(500);
  size_t read_length;
  int line_num = 0;

  // Read first line (column headings)
  while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
  {
    strbuf_chomp(sbuf);

    if(sbuf->end > 0 && sbuf->b[0] != '#' && // line is not empty, not comment
       !string_is_all_whitespace(sbuf->b))   // and not whitespace
    {
      // Found the header line
      if(sbuf->end < 2)
      {
        _loading_error("Too few column headings", file_path, line_num, 1);
      }
      break;
    }

    line_num++;
  }

  if(line_num == 0 && sbuf->end <= 0)
  {
    _loading_error("Empty file", file_path, -1, 0);
  }

  // If the separator character is whitespace,
  // the set of whitespace characters is used
  char sep = sbuf->b[0];

  if((sep >= (int)'0' && sep <= (int)'9') || sep == '-')
  {
    _loading_error("Numbers (0-9) and dashes (-) do not make good separators",
                   file_path, line_num, 0);
  }

  // One slot per header byte is always enough for the column characters;
  // the +1 keeps the request non-zero so NULL can only mean failure.
  char* characters = (char*)malloc(sbuf->end + 1);
  int num_of_chars = 0;

  if(characters == NULL)
  {
    _loading_error("Out of memory", file_path, line_num, 1);
    strbuf_free(sbuf);
    return;
  }

  if(isspace((unsigned char)sep))
  {
    // Header: every non-whitespace character is a column character
    char* next = sbuf->b;

    while((next = string_next_nonwhitespace(next+1)) != NULL)
    {
      characters[num_of_chars++]
        = case_sensitive ? *next : (char)tolower((unsigned char)*next);
    }

    // Now read lines below
    while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
    {
      strbuf_chomp(sbuf);

      char* from_char_pos = string_next_nonwhitespace(sbuf->b);

      if(from_char_pos == NULL || sbuf->b[0] == '#')
      {
        // skip this line
        continue;
      }

      char from_char = case_sensitive
                         ? *from_char_pos
                         : (char)tolower((unsigned char)*from_char_pos);

      // Scores start right after the row character. The row character may
      // be preceded by whitespace, so this is not necessarily sbuf->b+1.
      char* score_txt = from_char_pos + 1;
      int row_ok = 1;
      int i;

      for(i = 0; i < num_of_chars; i++)
      {
        char to_char = characters[i];

        if(!isspace((unsigned char)*score_txt))
        {
          _loading_error("Expected whitespace between elements - found character",
                         file_path, line_num, 1);
          row_ok = 0;
          break;
        }

        char* num_start = string_next_nonwhitespace(score_txt+1);

        if(num_start == NULL)
        {
          // Row ended before every column received a value; strtol() must
          // not be handed a NULL pointer.
          _loading_error("Missing number value on line", file_path, line_num, 1);
          row_ok = 0;
          break;
        }

        char* num_end = num_start;
        int score = (int)strtol(num_start, &num_end, 10);

        // If pointer to end of number string hasn't moved -> error
        if(num_end == num_start)
        {
          _loading_error("Missing number value on line", file_path, line_num, 1);
          row_ok = 0;
          break;
        }

        scoring_add_mutation(scoring, from_char, to_char, score);
        score_txt = num_end;
      }

      if(row_ok && *score_txt != '\0' && !string_is_all_whitespace(score_txt))
      {
        _loading_error("Too many columns on row", file_path, line_num, 1);
      }

      line_num++;
    }
  }
  else
  {
    // Header: separator, character, separator, character, ...
    size_t i;

    for(i = 0; i < sbuf->end; i += 2)
    {
      if(sbuf->b[i] != sep)
      {
        _loading_error("Separator missing from line", file_path, line_num, 1);
      }

      char c = case_sensitive
                 ? sbuf->b[i+1] : (char)tolower((unsigned char)sbuf->b[i+1]);
      characters[num_of_chars++] = c;
    }

    // Read rows
    while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
    {
      strbuf_chomp(sbuf);

      char from_char = case_sensitive
                         ? sbuf->b[0] : (char)tolower((unsigned char)sbuf->b[0]);

      if(sbuf->end == 0 || from_char == '#' ||
         string_is_all_whitespace(sbuf->b))
      {
        // skip this line
        continue;
      }

      // sbuf->b[0] is the row character; the (separator, score) pairs
      // start at the character after it.
      char* str_pos = sbuf->b + 1;
      int to_char_index = 0;

      while(*str_pos != '\0')
      {
        // Check the column count before indexing characters[]
        if(to_char_index >= num_of_chars)
        {
          _loading_error("Too many columns on row", file_path, line_num, 1);
          break;
        }

        char to_char = characters[to_char_index++];

        if(*str_pos != sep)
        {
          _loading_error("Separator missing from line", file_path, line_num, 1);
          break;
        }

        // Move past separator
        str_pos++;

        char* after_num_str = str_pos;
        int score = (int)strtol(str_pos, &after_num_str, 10);

        // If pointer to end of number string hasn't moved -> error
        if(str_pos == after_num_str)
        {
          _loading_error("Missing number value on line", file_path, line_num, 1);
          break;
        }

        scoring_add_mutation(scoring, from_char, to_char, score);
        str_pos = after_num_str;
      }

      line_num++;
    }
  }

  free(characters);
  strbuf_free(sbuf);
}
/*
 * Load pairwise substitution scores from file into scoring.
 *
 * Lines that are empty, all whitespace, or start with '#' are ignored. Every
 * other line holds two single characters and an integer score, in one of
 * two layouts:
 *  - whitespace separated: the second field of the line is whitespace;
 *  - separator separated: "a<sep>b<sep>score", where both separators must
 *    be the same character.
 *
 * Each pair is registered with scoring_add_mutation(scoring, a, b, score).
 * Both characters are lower-cased first unless case_sensitive is non-zero.
 * An error is reported through _loading_error() for malformed lines and when
 * the file yields no pairs at all.
 *
 * file_path is only passed on to _loading_error() for its messages.
 *
 * NOTE(review): _loading_error() is presumably fatal (does not return) -
 * confirm; parsing continues on the same line if it returns.
 */
void align_scoring_load_pairwise(gzFile file, const char* file_path,
                                 scoring_t* scoring, char case_sensitive)
{
  StrBuf* sbuf = strbuf_new(200);
  size_t read_length;
  int line_num = 0;

  char a, b;
  int score;

  int num_pairs_added = 0;

  while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
  {
    strbuf_chomp(sbuf);

    if(sbuf->end > 0 && sbuf->b[0] != '#' && // line is not empty, not comment
       !string_is_all_whitespace(sbuf->b))   // and not whitespace
    {
      if(read_length < 5)
      {
        _loading_error("Too few column headings", file_path, line_num, 0);
      }

      // <ctype.h> functions are only defined for unsigned char values and
      // EOF, so every plain char is converted before being passed to them.
      if(isspace((unsigned char)sbuf->b[1]))
      {
        // split by whitespace
        a = sbuf->b[0];

        // Skip the run of whitespace after the first character
        size_t char2_pos = 1;

        while(sbuf->b[char2_pos] != '\0' &&
              isspace((unsigned char)sbuf->b[char2_pos]))
        {
          char2_pos++;
        }

        if(char2_pos+2 >= sbuf->end ||
           !isspace((unsigned char)sbuf->b[char2_pos+1]))
        {
          _loading_error("Line too short", file_path, line_num, 0);
        }

        b = sbuf->b[char2_pos];

        if(!parse_entire_int(sbuf->b+char2_pos+2, &score))
        {
          _loading_error("Invalid number", file_path, line_num, 0);
        }
      }
      else
      {
        // Fixed layout: a, separator, b, separator, score
        if(sbuf->b[1] != sbuf->b[3])
        {
          _loading_error("Inconsistent separators used", file_path, line_num, 0);
        }

        a = sbuf->b[0];
        b = sbuf->b[2];

        if(!parse_entire_int(sbuf->b + 4, &score))
        {
          _loading_error("Invalid number", file_path, line_num, 0);
        }
      }

      if(!case_sensitive)
      {
        a = (char)tolower((unsigned char)a);
        b = (char)tolower((unsigned char)b);
      }

      scoring_add_mutation(scoring, a, b, score);
      num_pairs_added++;
    }

    line_num++;
  }

  strbuf_free(sbuf);

  if(num_pairs_added == 0)
  {
    _loading_error("No pairs added from file (file empty?)",
                   file_path, line_num, 0);
  }
}
/*
 * Python entry point for a Needleman-Wunsch alignment.
 *
 * Positional/keyword arguments (see kwlist): seq1 and seq2 are required
 * strings. Optional arguments are `matrix` followed by ten integers: match,
 * mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty,
 * no_gaps_in_a, no_gaps_in_b, no_mismatches and case_sensitive. Their
 * defaults are the initial values of the locals below.
 *
 * `matrix`, when given, is a mapping whose items are
 * ((char_a, char_b), score): both key elements must be strings of length 1
 * and score an integer. Each entry is registered with scoring_add_mutation()
 * as the score for char_a -> char_b. Both characters are lower-cased unless
 * case_sensitive is non-zero.
 *
 * Returns a new (result_a, result_b, score) tuple, or NULL with a Python
 * exception set if the arguments or the matrix are invalid.
 */
static PyObject * nw_align_wrapper(PyObject *self, PyObject *args, PyObject *kw)
{
  const char *seq1, *seq2;

  // Decide on scoring
  int match = 1;
  int mismatch = -2;
  int gap_open = -4;
  int gap_extend = -1;

  // Don't penalise gaps at the start
  // ACGATTT
  // ----TTT would score +3 (when match=+1)
  int no_start_gap_penalty = 0;

  // ..or gaps at the end e.g.
  // ACGATTT
  // ACGA--- would score +4 (when match=+1)
  int no_end_gap_penalty = 0;

  int no_gaps_in_a = 0, no_gaps_in_b = 0;
  int no_mismatches = 0;

  // Compare character case-sensitively (usually set to 0 for DNA etc)
  int case_sensitive = 0;

  PyObject * matrix = NULL;

  static char *kwlist[] = {"seq1","seq2", "matrix", "match", "mismatch",
                           "gap_open","gap_extend", "no_start_gap_penalty",
                           "no_end_gap_penalty", "no_gaps_in_a", "no_gaps_in_b",
                           "no_mismatches", "case_sensitive", NULL};

  PyObject *res = NULL;

  if(!PyArg_ParseTupleAndKeywords(args, kw, "ss|Oiiiiiiiiii", kwlist,
                                  &seq1, &seq2, &matrix, &match, &mismatch,
                                  &gap_open, &gap_extend,
                                  &no_start_gap_penalty, &no_end_gap_penalty,
                                  &no_gaps_in_a, &no_gaps_in_b,
                                  &no_mismatches, &case_sensitive))
    return NULL;

  // Variables to store alignment result
  alignment_t *result = alignment_create(256);
  nw_aligner_t *nw = needleman_wunsch_new();

  scoring_t scoring;
  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
               no_start_gap_penalty, no_end_gap_penalty,
               no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive);

  // Add the special cases from the matrix
  // x -> y means x in seq1 changing to y in seq2
  if(matrix != NULL)
  {
    // New reference: must be released on every path out of this block
    PyObject * mapping = PyMapping_Items(matrix);
    if(mapping == NULL) goto error;

    Py_ssize_t n = PySequence_Size(mapping);
    Py_ssize_t i;

    if(n < 0)
    {
      Py_DECREF(mapping);
      goto error;
    }

    for(i = 0; i < n; i++)
    {
      PyObject *item = PySequence_GetItem(mapping, i);
      PyObject *key = NULL; // borrowed from item
      const char *char_a = NULL, *char_b = NULL;
      const char *errmsg = NULL;
      int value = 0;
      int ok = 0;

      if(item == NULL)
      {
        // PySequence_GetItem has already set an exception
      }
      else if(!PyTuple_Check(item))
      {
        // Without this the function would return NULL with no exception set
        errmsg = "Items of matrix should be (key, value) tuples";
      }
      else if(!PyArg_ParseTuple(item, "Oi", &key, &value))
      {
        errmsg = "Values of matrix dict should be integers";
      }
      else if(!PyTuple_Check(key))
      {
        errmsg = "Keys of matrix dict should be tuples";
      }
      else if(!PyArg_ParseTuple(key, "ss", &char_a, &char_b))
      {
        errmsg = "Keys of matrix dict should be tuples with 2 characters as elements.";
      }
      else if(strlen(char_a) != 1 || strlen(char_b) != 1)
      {
        errmsg = "Character length should be 1";
      }
      else
      {
        ok = 1;
      }

      if(!ok)
      {
        if(errmsg != NULL) PyErr_SetString(PyExc_RuntimeError, errmsg);
        Py_XDECREF(item);
        Py_DECREF(mapping);
        goto error;
      }

      // tolower() is only defined for unsigned char values and EOF
      char from_c = case_sensitive
                      ? *char_a : (char)tolower((unsigned char)*char_a);
      // The second character must come from char_b in both branches
      char to_c = case_sensitive
                    ? *char_b : (char)tolower((unsigned char)*char_b);

      scoring_add_mutation(&scoring, from_c, to_c, value);

      Py_DECREF(item);
    }

    Py_DECREF(mapping);
  }

  // We could also prohibit the aligning of characters not given as special cases
  // scoring.use_match_mismatch = 0;

  needleman_wunsch_align(seq1, seq2, &scoring, nw, result);

  res = Py_BuildValue("ssi", result->result_a, result->result_b, result->score);

error:
  // Free memory for storing alignment results
  needleman_wunsch_free(nw);
  alignment_free(result);

  return res;
}