Пример #1
0
/**
 * Load a substitution score matrix from `file` into `scoring`.
 *
 * The first line that is not blank and does not start with '#' is the
 * heading.  Its first byte is the separator:
 *   - whitespace separator: every non-whitespace byte of the heading is a
 *     column character, and each following row is
 *     `<row char> <score> <score> ...` split on runs of whitespace;
 *   - any other separator: the heading is `<sep><char><sep><char>...` and
 *     each row is `<row char><sep><score><sep><score>...`.
 * Rows that are blank or start with '#' are skipped.  Each parsed score is
 * registered with scoring_add_mutation(scoring, row_char, col_char, score).
 *
 * @param file            open gzip stream to read from
 * @param file_path       path passed through to error reports
 * @param scoring         scoring object that receives the mutations
 * @param case_sensitive  if zero, characters are lower-cased before use
 *
 * `line_num` is the 0-based index of the line being processed and is what
 * gets reported on errors.
 *
 * NOTE(review): this code relies on _loading_error() not returning to its
 * caller (e.g. by exiting) -- confirm against its definition.
 */
void align_scoring_load_matrix(gzFile file, const char* file_path,
                               scoring_t* scoring, char case_sensitive)
{
    StrBuf* sbuf = strbuf_new(500);
    int line_num = 0;
    int found_header = 0;

    // Find the heading: first line that is not empty, a comment or whitespace
    while(strbuf_reset_gzreadline(sbuf, file) > 0)
    {
        strbuf_chomp(sbuf);

        if(sbuf->end > 0 && sbuf->b[0] != '#' &&
                !string_is_all_whitespace(sbuf->b))
        {
            if(sbuf->end < 2)
            {
                _loading_error("Too few column headings", file_path, line_num, 1);
            }

            found_header = 1;
            break;
        }

        line_num++;
    }

    // Also catches files that hold nothing but comments / blank lines
    if(!found_header)
    {
        _loading_error("Empty file", file_path, -1, 0);
    }

    // If the separator character is whitespace,
    // the set of whitespace characters is used
    char sep = sbuf->b[0];

    if((sep >= '0' && sep <= '9') || sep == '-')
    {
        _loading_error("Numbers (0-9) and dashes (-) do not make good separators",
                       file_path, line_num, 0);
    }

    // The heading cannot name more characters than it has bytes
    char* characters = malloc(sbuf->end);
    int num_of_chars = 0;

    if(characters == NULL)
    {
        _loading_error("Out of memory", file_path, -1, 1);
        strbuf_free(sbuf);
        return;
    }

    if(isspace((unsigned char)sep))
    {
        char* next = sbuf->b;

        while((next = string_next_nonwhitespace(next+1)) != NULL)
        {
            characters[num_of_chars++]
                = case_sensitive ? *next : (char)tolower((unsigned char)*next);
        }

        // Heading consumed: rows start on the next line
        line_num++;

        while(strbuf_reset_gzreadline(sbuf, file) > 0)
        {
            strbuf_chomp(sbuf);

            char* from_char_pos = string_next_nonwhitespace(sbuf->b);

            if(from_char_pos == NULL || sbuf->b[0] == '#')
            {
                // skip this line (but keep the line count in step)
                line_num++;
                continue;
            }

            char from_char = case_sensitive
                             ? *from_char_pos
                             : (char)tolower((unsigned char)*from_char_pos);

            // Scores begin right after the row label, which may be indented
            char* score_txt = from_char_pos+1;

            for(int i = 0; i < num_of_chars; i++)
            {
                if(!isspace((unsigned char)*score_txt))
                {
                    _loading_error("Expected whitespace between elements - found character",
                                   file_path, line_num, 1);
                }

                score_txt = string_next_nonwhitespace(score_txt+1);

                // Only trailing whitespace left: the row is short of values
                if(score_txt == NULL)
                {
                    _loading_error("Missing number value on line", file_path, line_num, 1);
                }

                char* strtol_last_char_ptr = score_txt;
                int score = (int)strtol(score_txt, &strtol_last_char_ptr, 10);

                // If pointer to end of number string hasn't moved -> error
                if(strtol_last_char_ptr == score_txt)
                {
                    _loading_error("Missing number value on line", file_path, line_num, 1);
                }

                scoring_add_mutation(scoring, from_char, characters[i], score);

                score_txt = strtol_last_char_ptr;
            }

            if(*score_txt != '\0' && !string_is_all_whitespace(score_txt))
            {
                _loading_error("Too many columns on row", file_path, line_num, 1);
            }

            line_num++;
        }
    }
    else
    {
        // Heading layout: <sep><char><sep><char>...
        for(size_t i = 0; i < sbuf->end; i += 2)
        {
            if(sbuf->b[i] != sep)
            {
                _loading_error("Separator missing from line", file_path, line_num, 1);
            }

            char c = sbuf->b[i+1];
            characters[num_of_chars++]
                = case_sensitive ? c : (char)tolower((unsigned char)c);
        }

        // Heading consumed: rows start on the next line
        line_num++;

        // Read rows
        while(strbuf_reset_gzreadline(sbuf, file) > 0)
        {
            strbuf_chomp(sbuf);

            if(sbuf->b[0] == '#' || string_is_all_whitespace(sbuf->b))
            {
                // skip this line (but keep the line count in step)
                line_num++;
                continue;
            }

            char from_char = case_sensitive
                             ? sbuf->b[0]
                             : (char)tolower((unsigned char)sbuf->b[0]);

            // Row layout: <row char><sep><score>..., so scanning starts
            // after the row label, on the first separator
            char* str_pos = sbuf->b + 1;
            int to_char_index = 0;

            while(*str_pos != '\0')
            {
                if(*str_pos != sep)
                {
                    _loading_error("Separator missing from line", file_path, line_num, 1);
                }

                // Move past separator
                str_pos++;

                char* after_num_str = str_pos;
                int score = (int)strtol(str_pos, &after_num_str, 10);

                // If pointer to end of number string hasn't moved -> error
                if(str_pos == after_num_str)
                {
                    _loading_error("Missing number value on line", file_path, line_num, 1);
                }

                // Check the bound before indexing characters[]
                if(to_char_index >= num_of_chars)
                {
                    _loading_error("Too many columns on row", file_path, line_num, 1);
                }

                scoring_add_mutation(scoring, from_char,
                                     characters[to_char_index], score);
                to_char_index++;

                str_pos = after_num_str;
            }

            line_num++;
        }
    }

    free(characters);
    strbuf_free(sbuf);
}
Пример #2
0
/**
 * Filter a VCF against reference sequences.
 *
 * Usage: [-s] <input.vcf[.gz]> [ref files...]
 * Reference sequences are loaded from the given files, or from stdin when
 * none are given, and indexed by name.  Header lines (starting with '#') are
 * copied to stdout.  A record is printed when its REF allele matches the
 * loaded sequence at POS (case-insensitively).  With -s, a record whose ALT
 * allele matches instead is printed with REF and ALT swapped.  Records on
 * unknown chromosomes, with a bad position, or matching neither allele are
 * dropped.
 *
 * NOTE(review): assumes print_usage() and die() do not return, and that
 * vcf_columns() fills fields[] with pointers to the start of each
 * tab-separated column of the line -- confirm against their definitions.
 */
int main(int argc, char **argv)
{
  // compiler complains about unused function without these lines
  (void)kh_clear_ghash;
  (void)kh_del_ghash;

  if(argc < 2) print_usage(usage, NULL);

  char swap_alleles = 0;

  int c;
  while((c = getopt(argc, argv, "s")) >= 0) {
    switch (c) {
      case 's': swap_alleles = 1; break;
      default: die("Unknown option: %c", c);
    }
  }

  if(optind == argc) print_usage(usage, "Not enough arguments");

  char *inputpath = argv[optind];
  char **refpaths = argv + optind + 1;
  size_t num_refs = argc - optind - 1;

  gzFile gzin = gzopen(inputpath, "r");
  if(gzin == NULL) die("Cannot read file: %s", inputpath);

  size_t i, nchroms = 0, capacity = 1024;
  khash_t(ghash) *genome = kh_init(ghash);
  read_t *reads = malloc(capacity * sizeof(*reads)), *r;
  int hret;
  khiter_t k;

  if(reads == NULL) die("Out of memory");

  for(i = 0; i < num_refs; i++) {
    fprintf(stderr, "Loading %s\n", refpaths[i]);
    load_reads(refpaths[i], &reads, &capacity, &nchroms);
  }

  if(num_refs == 0) {
    fprintf(stderr, "Loading from stdin\n");
    load_reads("-", &reads, &capacity, &nchroms);
  }

  if(nchroms == 0) die("No chromosomes loaded");

  // Index the loaded sequences by name; the first one wins on duplicates
  for(i = 0; i < nchroms; i++) {
    r = reads + i;
    fprintf(stderr, "Loaded: '%s'\n", r->name.b);
    k = kh_put(ghash, genome, r->name.b, &hret);
    if(hret == 0) warn("Duplicate read name (taking first): %s", r->name.b);
    else kh_value(genome, k) = r;
  }

  // Now read VCF
  StrBuf line;
  strbuf_alloc(&line, 1024);
  char *fields[9];
  char *chr;
  int pos, reflen, altlen;

  while(strbuf_reset_gzreadline(&line, gzin) > 0)
  {
    // Header / comment lines pass straight through
    if(line.b[0] == '#') {
      fputs(line.b, stdout);
      continue;
    }

    strbuf_chomp(&line);
    vcf_columns(line.b, fields);

    // Temporarily NUL-terminate the first two columns so they can be used
    // as C strings in place
    fields[1][-1] = fields[2][-1] = '\0';
    chr = line.b;
    pos = atoi(fields[1])-1;
    k = kh_get(ghash, genome, chr);

    if(k == kh_end(genome)) {
      // Warn while chr is still terminated so only the name is printed,
      // and never read kh_value() for a key that was not found
      warn("Cannot find chrom: %s", chr);
      continue;
    }

    r = kh_value(genome, k);

    // Restore the tabs so the line can be printed as a whole
    fields[1][-1] = fields[2][-1] = '\t';

    if(pos < 0) {
      warn("Bad line: %s\n", line.b);
      continue;
    }

    // Allele lengths from the distance between column starts (minus the tab)
    reflen = fields[4] - fields[3] - 1;
    altlen = fields[5] - fields[4] - 1;

    if((reflen == 1 && altlen == 1) || fields[3][0] == fields[4][0])
    {
      if((unsigned)pos + reflen <= r->seq.end &&
         strncasecmp(r->seq.b+pos,fields[3],reflen) == 0)
      {
        fputs(line.b, stdout);
        fputc('\n', stdout);
      }
      else if(swap_alleles && (unsigned)pos + altlen <= r->seq.end &&
              strncasecmp(r->seq.b+pos,fields[4],altlen) == 0)
      {
        // swap alleles in place: "REF\tALT" -> "ALT\tREF"
        char tmp[altlen], *ref = fields[3], *alt = fields[4];
        memcpy(tmp, alt, altlen);
        memmove(ref+altlen+1, ref, reflen);
        memcpy(ref, tmp, altlen);
        ref[altlen] = '\t';
        fputs(line.b, stdout);
        fputc('\n', stdout);
      }
    }
  }

  kh_destroy(ghash, genome);
  strbuf_dealloc(&line);
  gzclose(gzin);

  for(i = 0; i < nchroms; i++) seq_read_dealloc(reads+i);
  free(reads);

  fprintf(stderr, " Done.\n");

  return 0;
}
Пример #3
0
/**
 * Load pairwise substitution scores from `file` into `scoring`.
 *
 * Every line that is not empty, all whitespace, or starting with '#' must
 * describe one pair in one of two layouts:
 *   - `a<whitespace>b<ws>score` when the second byte is whitespace;
 *   - `a<sep>b<sep>score` otherwise, with the same single-byte separator at
 *     offsets 1 and 3.
 * Each pair is registered with scoring_add_mutation(scoring, a, b, score).
 * An error is reported if the file yields no pairs at all.
 *
 * @param file            open gzip stream to read from
 * @param file_path       path passed through to error reports
 * @param scoring         scoring object that receives the mutations
 * @param case_sensitive  if zero, both characters are lower-cased first
 *
 * NOTE(review): this code relies on _loading_error() not returning to its
 * caller -- confirm against its definition.
 */
void align_scoring_load_pairwise(gzFile file, const char* file_path,
                                 scoring_t* scoring, char case_sensitive)
{
    StrBuf* sbuf = strbuf_new(200);
    int line_num = 0;
    int num_pairs_added = 0;

    while(strbuf_reset_gzreadline(sbuf, file) > 0)
    {
        strbuf_chomp(sbuf);

        if(sbuf->end > 0 && sbuf->b[0] != '#' && // line is not empty, not comment
                !string_is_all_whitespace(sbuf->b)) // and not whitespace
        {
            char a, b;
            int score = 0;

            // Shortest valid line is 5 bytes ("a b 1").  Measure the chomped
            // length so the fixed offsets read below stay inside the line.
            if(sbuf->end < 5)
            {
                _loading_error("Line too short", file_path, line_num, 0);
            }

            if(isspace((unsigned char)sbuf->b[1]))
            {
                // split by whitespace
                a = sbuf->b[0];

                // Skip the run of whitespace between the two characters
                size_t char2_pos = 1;

                while(sbuf->b[char2_pos] != '\0' &&
                        isspace((unsigned char)sbuf->b[char2_pos]))
                {
                    char2_pos++;
                }

                if(char2_pos+2 >= sbuf->end ||
                        !isspace((unsigned char)sbuf->b[char2_pos+1]))
                {
                    _loading_error("Line too short", file_path, line_num, 0);
                }

                b = sbuf->b[char2_pos];

                if(!parse_entire_int(sbuf->b+char2_pos+2, &score))
                {
                    _loading_error("Invalid number", file_path, line_num, 0);
                }
            }
            else
            {
                // Fixed layout: a, sep, b, sep, score
                if(sbuf->b[1] != sbuf->b[3])
                {
                    _loading_error("Inconsistent separators used", file_path, line_num, 0);
                }

                a = sbuf->b[0];
                b = sbuf->b[2];

                if(!parse_entire_int(sbuf->b + 4, &score))
                {
                    _loading_error("Invalid number", file_path, line_num, 0);
                }
            }

            if(!case_sensitive)
            {
                a = (char)tolower((unsigned char)a);
                b = (char)tolower((unsigned char)b);
            }

            scoring_add_mutation(scoring, a, b, score);
            num_pairs_added++;
        }

        line_num++;
    }

    strbuf_free(sbuf);

    if(num_pairs_added == 0)
    {
        _loading_error("No pairs added from file (file empty?)",
                       file_path, line_num, 0);
    }
}