Example #1
0
/*
 * Run a Smith-Waterman alignment of seq_a against seq_b and print every hit
 * that smith_waterman_fetch() yields to stdout.
 *
 * seq_a, seq_b: the two sequences, passed unchanged to smith_waterman_align().
 *
 * The aligner and the result buffer are created at the top of the function
 * and released again before it returns.
 */
void align(char* seq_a, char* seq_b)
{
  // Aligner state plus a reusable buffer that receives each hit
  // (256 is the argument given to alignment_create; presumably an initial
  // capacity -- confirm against the library)
  sw_aligner_t *aligner = smith_waterman_new();
  alignment_t *hit = alignment_create(256);

  // Basic scoring scheme
  int match_score = 1, mismatch_score = -2;
  int gap_open_score = -4, gap_extend_score = -1;

  // Leading gaps are free:
  //   ACGATTT
  //   ----TTT   scores +3 (with match=+1)
  char free_start_gaps = 1;

  // Trailing gaps are free too:
  //   ACGATTT
  //   ACGA---   scores +4 (with match=+1)
  char free_end_gaps = 1;

  // Gaps are allowed in both sequences, and so are mismatches
  char forbid_gaps_a = 0;
  char forbid_gaps_b = 0;
  char forbid_mismatches = 0;

  // Case-insensitive comparison (the usual choice for DNA etc)
  char match_case = 0;

  scoring_t scoring;
  scoring_init(&scoring,
               match_score, mismatch_score, gap_open_score, gap_extend_score,
               free_start_gaps, free_end_gaps,
               forbid_gaps_a, forbid_gaps_b, forbid_mismatches, match_case);

  // Per-pair overrides; "x -> y" means x in seq1 replaced by y in seq2
  scoring_add_mutation(&scoring, 'a', 'c', -2); // a -> c scores -2
  scoring_add_mutation(&scoring, 'c', 'a', -1); // c -> a scores -1

  // Aligning characters that have no override could be prohibited with:
  // scoring.use_match_mismatch = 0;

  smith_waterman_align(seq_a, seq_b, &scoring, aligner);

  // Drain the aligner: print hits until fetch reports there are none left
  for(;;)
  {
    if(!smith_waterman_fetch(aligner, hit))
    {
      break;
    }

    printf("seqA: %s [start:%zu]\n", hit->result_a, hit->pos_a);
    printf("seqB: %s [start:%zu]\n", hit->result_b, hit->pos_b);
    printf("alignment score: %i\n\n", hit->score);
  }

  // Release the aligner and the result buffer
  smith_waterman_free(aligner);
  alignment_free(hit);
}
Example #2
0
/*
 * Smith-Waterman-align seq_a against seq_b and print hits to stdout until
 * either smith_waterman_get_hit() reports no further hit or a hit scores
 * below min_score.
 *
 * seq_a, seq_b: the two sequences, passed unchanged to smith_waterman_align().
 * min_score:    printing stops at the first hit whose score is lower.
 *
 * If the result holder cannot be allocated, a message is written to stderr
 * and the function returns without aligning.
 *
 * NOTE(review): no_start_gap_penalty and no_end_gap_penalty are not declared
 * inside this function; they must be visible from an enclosing scope --
 * confirm.
 * NOTE(review): the SW_COMPUTATION returned by smith_waterman_align() is not
 * released here -- confirm whether the library expects the caller to free it.
 */
void align(char* seq_a, char* seq_b, int min_score)
{
  // Allocate the result holder first so that an allocation failure is
  // detected before any other resource has been acquired
  SW_LOCAL_ALIGNMENT* alignment = malloc(sizeof *alignment);

  if(alignment == NULL)
  {
    fprintf(stderr, "align: out of memory\n");
    return;
  }

  // Decide on scoring
  int match = 1;
  int mismatch = -2;
  int gap_open = -4;
  int gap_extend = -1;

  // Compare character case-sensitively (usually set to 0 for DNA etc)
  char case_sensitive = 0;

  // Create scoring system
  SCORING_SYSTEM* scoring = scoring_create(match, mismatch,
                                           gap_open, gap_extend,
                                           no_start_gap_penalty,
                                           no_end_gap_penalty,
                                           case_sensitive);

  // Add some special cases
  // x -> y means x in seq1 changing to y in seq2
  scoring_add_mutation(scoring, 'a', 'c', -2); // a -> c give substitution score -2
  scoring_add_mutation(scoring, 'c', 'a', -1); // c -> a give substitution score -1

  // We could also prohibit the aligning of characters not given as special cases
  // scoring->use_match_mismatch = 0;

  // Do alignment
  SW_COMPUTATION* smithwaterman = smith_waterman_align(seq_a, seq_b, scoring);

  // Loop through results; the score test is only evaluated after a hit has
  // been fetched successfully
  while(smith_waterman_get_hit(smithwaterman, alignment) &&
        alignment->score >= min_score)
  {
    printf("seqA [%u]: %s\n", alignment->pos_a, alignment->result_a);
    printf("seqB [%u]: %s\n", alignment->pos_b, alignment->result_b);
    printf("score: %i\n\n", alignment->score);
  }

  // Free result
  free(alignment);

  // Free memory used to store scoring preferences
  scoring_free(scoring);
}
Example #3
0
/*
 * Register a substitution score for every ordered pair of characters in str.
 *
 * scoring:            scoring scheme to update.
 * str:                NUL-terminated list of characters; str[i] is the "from"
 *                     character and str[j] the "to" character of pair (i, j).
 * scores:             score table read through ARR_LOOKUP(scores, len, i, j)
 *                     with len = strlen(str) (presumably a len x len table --
 *                     confirm against the ARR_LOOKUP definition).
 * use_match_mismatch: stored into scoring->use_match_mismatch afterwards.
 *
 * When scoring->case_sensitive is zero, both characters are lower-cased
 * before being passed to scoring_add_mutation().
 */
void scoring_add_mutations(scoring_t* scoring, const char *str, const int *scores,
                           char use_match_mismatch)
{
  size_t i, j, len = strlen(str);
  char a, b;
  int score;

  for(i = 0; i < len; i++)
  {
    // <ctype.h> functions require a value representable as unsigned char;
    // passing a negative plain char is undefined behaviour
    a = scoring->case_sensitive ? str[i]
                                : (char)tolower((unsigned char)str[i]);

    for(j = 0; j < len; j++)
    {
      b = scoring->case_sensitive ? str[j]
                                  : (char)tolower((unsigned char)str[j]);
      score = ARR_LOOKUP(scores, len, i, j);

      scoring_add_mutation(scoring, a, b, score);
    }
  }

  scoring->use_match_mismatch = use_match_mismatch;
}
Example #4
0
/*
 * Load a substitution matrix from an open gzFile into `scoring`.
 *
 * file:           stream to read lines from.
 * file_path:      only used when reporting errors.
 * scoring:        receives one scoring_add_mutation() call per matrix cell.
 * case_sensitive: when zero, row and column characters are lower-cased.
 *
 * Layout, as parsed below:
 *  - Lines before the heading line that are empty, all whitespace, or start
 *    with '#' are skipped.
 *  - The first character of the heading line is the separator.
 *  - If the separator is whitespace, every non-whitespace character on the
 *    heading line is a column character, and each row is a row character
 *    followed by whitespace-separated integers.
 *  - Otherwise the heading line is <sep><char><sep><char>..., and each row is
 *    <row char><sep><int><sep><int>...
 *
 * Errors are reported through _loading_error().
 * NOTE(review): like the original, this code assumes _loading_error() does
 * not return -- confirm.
 *
 * line_num is a 0-based line index; every line read is counted, including
 * skipped ones, so reported positions match the file.
 */
void align_scoring_load_matrix(gzFile file, const char* file_path,
                               scoring_t* scoring, char case_sensitive)
{
    StrBuf* sbuf = strbuf_new(500);
    size_t read_length;
    int line_num = 0;

    // Read first line (column headings)
    while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
    {
        strbuf_chomp(sbuf);

        if(sbuf->end > 0 && sbuf->b[0] != '#' && // line is not empty, not comment
                !string_is_all_whitespace(sbuf->b)) // and not whitespace
        {
            // Found the heading line

            if(sbuf->end < 2)
            {
                _loading_error("Too few column headings", file_path, line_num, 1);
            }

            break;
        }

        line_num++;
    }

    if(line_num == 0 && sbuf->end <= 0)
    {
        _loading_error("Empty file", file_path, -1, 0);
    }

    // If the separator character is whitespace,
    // the set of whitespace characters is used
    char sep = sbuf->b[0];

    if((sep >= '0' && sep <= '9') || sep == '-')
    {
        _loading_error("Numbers (0-9) and dashes (-) do not make good separators",
                       file_path, line_num, 0);
    }

    // One byte per heading character is enough: there can never be more
    // headings than characters on the line. Never request zero bytes.
    char* characters = malloc(sbuf->end > 0 ? sbuf->end : 1);

    if(characters == NULL)
    {
        _loading_error("Out of memory", file_path, line_num, 1);
        strbuf_free(sbuf);
        return;
    }

    int num_of_chars = 0;

    if(isspace((unsigned char)sep))
    {
        char* next = sbuf->b;

        // Every non-whitespace character after the first is a column heading
        while((next = string_next_nonwhitespace(next+1)) != NULL)
        {
            characters[num_of_chars++]
                = case_sensitive ? *next : (char)tolower((unsigned char)*next);
        }

        // Now read lines below
        while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
        {
            strbuf_chomp(sbuf);
            line_num++;

            char* from_char_pos = string_next_nonwhitespace(sbuf->b);

            if(from_char_pos == NULL || sbuf->b[0] == '#')
            {
                // skip this line
                continue;
            }

            char from_char = case_sensitive
                             ? *from_char_pos
                             : (char)tolower((unsigned char)*from_char_pos);
            char to_char;

            // Scores start right after the row character, which may itself
            // be preceded by whitespace
            char* score_txt = from_char_pos + 1;
            int score;

            int i;
            for(i = 0; i < num_of_chars; i++)
            {
                to_char = characters[i];

                if(!isspace((unsigned char)*score_txt))
                {
                    _loading_error("Expected whitespace between elements - found character",
                                   file_path, line_num, 1);
                }

                char* num_start = string_next_nonwhitespace(score_txt+1);

                // Row ended before every column received a value
                if(num_start == NULL)
                {
                    _loading_error("Missing number value on line", file_path, line_num, 1);
                    break; // only reached if _loading_error() returns
                }

                char* num_end = num_start;
                score = (int)strtol(num_start, &num_end, 10);

                // If pointer to end of number string hasn't moved -> error
                if(num_end == num_start)
                {
                    _loading_error("Missing number value on line", file_path, line_num, 1);
                }

                scoring_add_mutation(scoring, from_char, to_char, score);

                score_txt = num_end;
            }

            if(*score_txt != '\0' && !string_is_all_whitespace(score_txt))
            {
                _loading_error("Too many columns on row", file_path, line_num, 1);
            }
        }
    }
    else
    {
        size_t i;

        // Heading line alternates separator, character, separator, ...
        for(i = 0; i < sbuf->end; i += 2)
        {
            if(sbuf->b[i] != sep)
            {
                _loading_error("Separator missing from line", file_path, line_num, 1);
            }

            // A trailing separator has no heading after it; do not register
            // the string terminator as a column character
            if(i + 1 >= sbuf->end)
            {
                break;
            }

            char c = case_sensitive ? sbuf->b[i+1]
                                    : (char)tolower((unsigned char)sbuf->b[i+1]);
            characters[num_of_chars++] = c;
        }

        int score;

        // Read rows
        while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
        {
            strbuf_chomp(sbuf);
            line_num++;

            char from_char = case_sensitive
                             ? sbuf->b[0]
                             : (char)tolower((unsigned char)sbuf->b[0]);

            if(sbuf->end == 0 || from_char == '#' ||
                    string_is_all_whitespace(sbuf->b))
            {
                // skip this line
                continue;
            }

            // The row character occupies b[0]; the first separator follows it
            char* str_pos = sbuf->b + 1;

            int to_char_index = 0;
            char to_char;

            while(*str_pos != '\0')
            {
                // Check the column count before indexing characters[]
                if(to_char_index >= num_of_chars)
                {
                    _loading_error("Too many columns on row", file_path, line_num, 1);
                    break; // only reached if _loading_error() returns
                }

                to_char = characters[to_char_index++];

                if(*str_pos != sep)
                {
                    _loading_error("Separator missing from line", file_path, line_num, 1);
                }

                // Move past separator
                str_pos++;

                char* after_num_str = str_pos;
                score = (int)strtol(str_pos, &after_num_str, 10);

                // If pointer to end of number string hasn't moved -> error
                if(str_pos == after_num_str)
                {
                    _loading_error("Missing number value on line", file_path, line_num, 1);
                }

                scoring_add_mutation(scoring, from_char, to_char, score);

                str_pos = after_num_str;
            }
        }
    }

    free(characters);
    strbuf_free(sbuf);
}
Example #5
0
/*
 * Load per-pair substitution scores from an open gzFile into `scoring`.
 *
 * file:           stream to read lines from.
 * file_path:      only used when reporting errors.
 * scoring:        receives one scoring_add_mutation() call per record.
 * case_sensitive: when zero, both characters are lower-cased first.
 *
 * Each record line holds two characters and an integer score, as parsed
 * below:
 *  - whitespace form: <a><whitespace...><b><one whitespace char><score>
 *  - separator form:  <a><sep><b><sep><score>, same separator both times
 * Lines that are empty, all whitespace, or start with '#' are skipped.
 *
 * Errors are reported through _loading_error(), including the case where no
 * pair at all was read.
 * NOTE(review): like the original, this code assumes _loading_error() does
 * not return -- confirm.
 */
void align_scoring_load_pairwise(gzFile file, const char* file_path,
                                 scoring_t* scoring, char case_sensitive)
{
    StrBuf* sbuf = strbuf_new(200);
    size_t read_length;
    int line_num = 0;

    char a, b;
    int score;

    int num_pairs_added = 0;

    while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
    {
        strbuf_chomp(sbuf);

        if(sbuf->end > 0 && sbuf->b[0] != '#' && // line is not empty, not comment
                !string_is_all_whitespace(sbuf->b)) // and not whitespace
        {
            // The shortest valid record ("a b 1" / "a,b,1") is 5 characters.
            // Test the chomped length, not the raw read length, so that the
            // fixed-index reads of b[1]..b[4] below stay inside the line.
            if(sbuf->end < 5)
            {
                _loading_error("Line too short", file_path, line_num, 0);
            }

            if(isspace((unsigned char)sbuf->b[1]))
            {
                // split by whitespace
                a = sbuf->b[0];

                size_t char2_pos;

                // Skip the run of whitespace between the two characters
                for(char2_pos = 1;
                        sbuf->b[char2_pos] != '\0' &&
                        isspace((unsigned char)sbuf->b[char2_pos]);
                        char2_pos++);

                if(char2_pos+2 >= sbuf->end ||
                        !isspace((unsigned char)sbuf->b[char2_pos+1]))
                {
                    _loading_error("Line too short", file_path, line_num, 0);
                }

                b = sbuf->b[char2_pos];

                if(!parse_entire_int(sbuf->b+char2_pos+2, &score))
                {
                    _loading_error("Invalid number", file_path, line_num, 0);
                }
            }
            else
            {
                // Separator form: the same separator must sit at index 1 and 3
                if(sbuf->b[1] != sbuf->b[3])
                {
                    _loading_error("Inconsistent separators used", file_path, line_num, 0);
                }

                a = sbuf->b[0];
                b = sbuf->b[2];

                if(!parse_entire_int(sbuf->b + 4, &score))
                {
                    _loading_error("Invalid number", file_path, line_num, 0);
                }
            }

            if(!case_sensitive)
            {
                // <ctype.h> functions need an unsigned char value
                a = (char)tolower((unsigned char)a);
                b = (char)tolower((unsigned char)b);
            }

            scoring_add_mutation(scoring, a, b, score);
            num_pairs_added++;
        }

        line_num++;
    }

    strbuf_free(sbuf);

    if(num_pairs_added == 0)
    {
        _loading_error("No pairs added from file (file empty?)",
                       file_path, line_num, 0);
    }
}
Example #6
0
/*
 * Python entry point: Needleman-Wunsch alignment of two strings.
 *
 * Positional / keyword arguments (see kwlist):
 *   seq1, seq2            sequences to align (required)
 *   matrix                optional mapping {(char_a, char_b): int score};
 *                         each entry is registered as a substitution score
 *                         for char_a (in seq1) -> char_b (in seq2)
 *   match, mismatch, gap_open, gap_extend,
 *   no_start_gap_penalty, no_end_gap_penalty,
 *   no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive
 *                         integers forwarded to scoring_init()
 *
 * Returns a new 3-tuple (aligned seq1, aligned seq2, score), or NULL with a
 * Python exception set on failure.
 */
static PyObject * nw_align_wrapper(PyObject *self, PyObject *args, PyObject *kw)
{
    const char *seq1, *seq2;
    // Decide on scoring
    int match = 1;
    int mismatch = -2;
    int gap_open = -4;
    int gap_extend = -1;
    
    // Don't penalise gaps at the start
    // ACGATTT
    // ----TTT would score +3 (when match=+1)
    int no_start_gap_penalty = 0;
    
    // ..or gaps at the end e.g.
    // ACGATTT
    // ACGA--- would score +4 (when match=+1)
    int no_end_gap_penalty = 0;

    int no_gaps_in_a = 0, no_gaps_in_b = 0;
    int no_mismatches = 0;

    // Compare character case-sensitively (usually set to 0 for DNA etc)
    int case_sensitive = 0;

    PyObject * matrix = NULL;

    static char *kwlist[] = {"seq1","seq2", "matrix", "match", "mismatch", "gap_open","gap_extend", "no_start_gap_penalty", "no_end_gap_penalty", "no_gaps_in_a", "no_gaps_in_b", "no_mismatches", "case_sensitive", NULL};
    PyObject *res = NULL;

    if(!PyArg_ParseTupleAndKeywords(args, kw, "ss|Oiiiiiiiiii", kwlist, &seq1, &seq2, &matrix, &match, &mismatch, &gap_open, &gap_extend,
                                                                 &no_start_gap_penalty, &no_end_gap_penalty, &no_gaps_in_a, &no_gaps_in_b, &no_mismatches, &case_sensitive))
        return NULL;
    alignment_t *result = alignment_create(256);
    
    // Variables to store alignment result
    nw_aligner_t *nw = needleman_wunsch_new();

    scoring_t scoring;
    scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
                no_start_gap_penalty, no_end_gap_penalty,
                no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive);

    // Add the caller-supplied special cases
    // x -> y means x in seq1 changing to y in seq2
    if(matrix != NULL)
    {
        PyObject * mapping = PyMapping_Items(matrix);
        if(mapping == NULL)
            goto error;

        // PySequence_Size returns a Py_ssize_t and -1 (exception set) on failure
        Py_ssize_t n = PySequence_Size(mapping);
        int failed = (n < 0);
        Py_ssize_t i;

        for(i = 0; !failed && i < n; i++)
        {
            PyObject *item = PySequence_GetItem(mapping, i);
            PyObject *key;      // borrowed from item
            int value;
            const char *char_a; // borrowed from key, valid while item is held
            const char *char_b;

            if(item == NULL || !PyTuple_Check(item))
            {
                // Never return NULL without an exception set
                if(!PyErr_Occurred())
                    PyErr_SetString(PyExc_RuntimeError, "Items of matrix should be (key, value) tuples");
                failed = 1;
            }
            else if(!PyArg_ParseTuple(item, "Oi", &key, &value))
            {
                PyErr_SetString(PyExc_RuntimeError, "Values of matrix dict should be integers");
                failed = 1;
            }
            else if(!PyTuple_Check(key))
            {
                PyErr_SetString(PyExc_RuntimeError, "Keys of matrix dict should be tuples");
                failed = 1;
            }
            else if(!PyArg_ParseTuple(key, "ss", &char_a, &char_b))
            {
                PyErr_SetString(PyExc_RuntimeError, "Keys of matrix dict should be tuples with 2 characters as elements.");
                failed = 1;
            }
            else if(strlen(char_a) != 1 || strlen(char_b) != 1)
            {
                PyErr_SetString(PyExc_RuntimeError, "Character length should be 1");
                failed = 1;
            }
            else
            {
                // Each side uses its own character; <ctype.h> functions need
                // an unsigned char value
                char from = case_sensitive ? *char_a : (char)tolower((unsigned char)*char_a);
                char to   = case_sensitive ? *char_b : (char)tolower((unsigned char)*char_b);
                scoring_add_mutation(&scoring, from, to, value);
            }

            Py_XDECREF(item);
        }

        // Release the items list on success and failure alike
        Py_DECREF(mapping);

        if(failed)
            goto error;
    }

    // We could also prohibit the aligning of characters not given as special cases
    // scoring.use_match_mismatch = 0;

    needleman_wunsch_align(seq1, seq2, &scoring, nw, result);

    res = Py_BuildValue("ssi", result->result_a, result->result_b, result->score);

error:
    // Free memory for storing alignment results (res stays NULL on failure)
    needleman_wunsch_free(nw);

    alignment_free(result);
    return res;
}