Beispiel #1
0
/**
 * Reads recombination rates from a tab-delimited file into a per-position
 * array.
 *
 * Every line must contain exactly 5 tab-separated fields. The third and
 * fourth fields are parsed as the start and end of a region (the start is
 * shifted by one because it is a 0-based UCSC coordinate) and the fifth
 * field as the rate of that region. Each position of the region is assigned
 * rate * scale. Positions before a region that were not covered by the
 * previous line, and positions after the last region, are set to DIST_NA.
 *
 * NOTE(review): the padding logic looks like it expects regions that are
 * sorted by position and do not overlap -- confirm against the input files.
 *
 * filename - path of the file to read
 * seq_len  - number of elements in the returned array
 * scale    - factor every rate is multiplied by
 *
 * Returns an array of seq_len floats allocated with g_new. Calls g_error
 * if the file cannot be opened, if a line does not have 5 fields, or if a
 * region does not fit inside positions 1..seq_len.
 */
float *dist_read_recomb_rates(char *filename,long seq_len, double scale) {
  long i, prev, start, end;
  FILE *fh;
  int n_tok;
  char **tok, *line;
  float *rates, rate;

  rates = g_new(float, seq_len);

  fh = fopen(filename, "r");
  if(fh == NULL) {
    g_error("read_recomb_rates: could not read file '%s'", filename);
  }

  /* prev is the first (1-based) position not yet assigned a value.
   * end starts at 0 so that the trailing padding below covers the whole
   * array, including index 0, when the file contains no lines.
   */
  prev = 1;
  end  = 0;
  while((line = util_fgets_line(fh)) != NULL) {
    tok = g_strsplit(line, "\t", 5);

    n_tok = 0;
    while(tok[n_tok] != NULL) { n_tok++; }
    if(n_tok != 5) {
      g_error("read_recomb_rates: expected 5 tokens per line, got %d", n_tok);
    }

    start = strtol(tok[2], NULL, 10) + 1; /* 0-based UCSC coords */
    end   = strtol(tok[3], NULL, 10);
    rate  = strtod(tok[4], NULL);

    /* the coordinates are used as array indices below, so refuse regions
     * that would read or write outside of the rates array
     */
    if((start < 1) || (start > seq_len) || (end < 0) || (end > seq_len)) {
      g_error("read_recomb_rates: region %ld-%ld is outside of "
              "sequence of length %ld", start, end, seq_len);
    }

    if(prev < start) {
      /* undefined region before this, pad with flags */
      for(i = prev-1; i < start; i++) {
        rates[i] = DIST_NA;
      }
    }
    for(i = start-1; i < end; i++) {
      rates[i] = rate * scale;
    }

    g_strfreev(tok);
    g_free(line);

    prev = end+1;
  }

  /* pad remainder of chromosome with flag */
  for(i = end; i < seq_len; i++) {
    rates[i] = DIST_NA;
  }

  fclose(fh);

  return rates;
}
Beispiel #2
0
/**
 * Reads an array of SeqFeature structures from a flatfile.
 *
 * Only lines that start with SEQ_FEAT_LINE_PREFIX are parsed; all other
 * lines are skipped. A feature line must have at least 8 tab-separated
 * fields. Fields 2 to 8 are read as: id, name, sequence name, start, end,
 * strand and score (the first field carries the prefix).
 *
 * filename - path of the file to read
 * n_read   - set to the number of SeqFeatures that were read
 *
 * Returns an array allocated with g_new that holds one element per
 * matching line. The name and c.seqname strings are copies made with
 * g_strdup. Calls g_error if the file cannot be opened, if fewer matching
 * lines are found than were counted, or if a feature line has fewer than
 * 8 fields.
 *
 * NOTE(review): this assumes util_fcount_lines_match leaves the file
 * positioned at its start -- confirm. Other readers in this code base also
 * set c.chr to NULL; it is not initialised here -- confirm whether it
 * should be.
 */
SeqFeature *seqfeat_read_flatfile(const char *filename, long *n_read) {
  char *line;
  char **toks;
  int n_toks;
  size_t prefix_len;
  long n_lines, i;
  SeqFeature *feats;

  FILE *fh;

  fh = fopen(filename, "r");
  if(fh == NULL) {
    g_error("%s:%d: could not read from file '%s'",
            __FILE__, __LINE__, filename);
  }

  /* count number of SEQFEAT lines in file  */
  n_lines = util_fcount_lines_match(fh, SEQ_FEAT_LINE_PREFIX);
  feats = g_new(SeqFeature, n_lines);

  prefix_len = strlen(SEQ_FEAT_LINE_PREFIX);

  i = 0;
  while(i < n_lines) {
    if((line = util_fgets_line(fh)) == NULL) {
      g_error("seqfeat_read_flatfile: Expected %ld %s lines, "
              "got %ld", n_lines, SEQ_FEAT_LINE_PREFIX, i);
    }
    if(strncmp(SEQ_FEAT_LINE_PREFIX, line, prefix_len) != 0) {
      /* not a feature line: release it before moving on to the next one */
      g_free(line);
      continue;
    }

    toks = g_strsplit(line, "\t", 8);

    n_toks = 0;
    while(toks[n_toks] != NULL) {
      n_toks++;
    }

    if(n_toks < 8) {
      g_error("seqfeat_read_flatfile: Expected %d toks per line, got %d",
              8, n_toks);
    }

    /* toks[0] is the line prefix and carries no feature data */
    feats[i].id      = strtol(toks[1], NULL, 10);
    feats[i].name    = g_strdup(toks[2]);
    feats[i].c.seqname = g_strdup(toks[3]);

    feats[i].c.start   = strtol(toks[4], NULL, 10);
    feats[i].c.end     = strtol(toks[5], NULL, 10);
    feats[i].c.strand  = strtol(toks[6], NULL, 10);
    feats[i].score = strtod(toks[7], NULL);

    feats[i].n_sub_feat = 0;
    feats[i].sub_feats = NULL;
    feats[i].attrib = NULL;

    g_strfreev(toks);
    g_free(line);

    i++;
  }

  *n_read = i;

  fclose(fh);

  return feats;
}
Beispiel #3
0
/**
 * Reads an array of SeqFeature structures from a BED file.
 *
 * Lines whose first character is ';' are treated as comments and skipped.
 * Every other line must have at least 3 tab-separated fields: sequence
 * name, start and end. The start is incremented by one when stored.
 * Optional fields 4, 5 and 6 supply the name, score and strand; when they
 * are absent the name is NULL, the score is 0.0 and the strand is
 * STRAND_NONE.
 *
 * filename - path of the BED file
 * n_read   - set to the number of SeqFeatures that were read
 *
 * Returns an array allocated with g_new with one slot per line of the
 * file, of which the first *n_read are filled in. Calls g_error if the
 * file cannot be opened or a line has fewer than 3 fields.
 */
SeqFeature *seqfeat_read_bed(const char *filename, long *n_read) {
  FILE *bed_fh;
  SeqFeature *feat_arr, *cur;
  char *buf;
  char **fields;
  int n_fields;
  long total_lines, line_no;

  bed_fh = fopen(filename, "r");
  if(bed_fh == NULL) {
    g_error("Could not read from BED file '%s'", filename);
  }

  total_lines = util_fcount_lines(bed_fh);
  *n_read = 0;

  /* One slot is reserved per line even though comment lines do not
   * produce a feature: a single allocation is fast, and the unused
   * slots only matter when a file has very many comment lines.
   */
  feat_arr = g_new(SeqFeature, total_lines);

  for(line_no = 0; line_no < total_lines; line_no++) {
    buf = util_fgets_line(bed_fh);
    if(buf == NULL) {
      /* ran out of lines earlier than counted; stop with what we have */
      break;
    }

    /* only lines that are not ';' comments describe a feature */
    if(buf[0] != ';') {
      fields = g_strsplit(buf, "\t", 12);

      for(n_fields = 0; fields[n_fields] != NULL; n_fields++) {
        /* just counting */
      }

      if(n_fields < 3) {
        g_error("Expected at least 3 toks per BED file line, got %d", 
                n_fields);
      }

      cur = &feat_arr[*n_read];

      /* mandatory columns: sequence name, start, end */
      cur->c.chr = NULL;
      cur->c.seqname = g_strdup(fields[0]);
      cur->c.start = strtol(fields[1], NULL, 10) + 1;
      cur->c.end = strtol(fields[2], NULL, 10);

      /* optional columns: name, score, strand */
      cur->name = (n_fields >= 4) ? g_strdup(fields[3]) : NULL;
      cur->score = (n_fields >= 5) ? strtod(fields[4], NULL) : 0.0;
      cur->c.strand = (n_fields >= 6) ? char_to_strand(fields[5][0])
                                      : STRAND_NONE;

      cur->n_sub_feat = 0;
      cur->sub_feats = NULL;
      cur->attrib = NULL;

      (*n_read)++;

      g_strfreev(fields);
    }

    g_free(buf);
  }

  fclose(bed_fh);

  return feat_arr;
}
Beispiel #4
0
/**
 *  Reads an array of SeqFeature structures from a file. The value
 *  pointed to by the n_read argument is set to the number of
 *  SeqFeatures read.
 *
 * The lines should be formatted as follows:
 *   <chr>\t<start>\t<end>\t[<strand>\t[<score>\t[<name>\t[<attrib1>\t[<attrib2>\t[...]]]]]]\n
 *
 * For example, the following line could define a repeat feature
 * with two extra attributes (with space instead of tab delimiters):
 *   chr12 1324151 1324222 0 0.0 AluSx Alu SINE
 *
 * At the very minimum, a feature must specify a chromosome, start,
 * and end, but can also specify an arbitrary number of named
 * attributes. If named attributes are to be specified, the strand, score
 * and name must also be specified. The name can be an empty string, in
 * which case the name is set to NULL. An empty or missing strand is
 * stored as STRAND_NONE and an empty or missing score as 0.0.
 *
 * filename     - path of the file to read
 * attrib_names - names of the attributes held in columns 7 onwards;
 *                only read when n_attrib > 0
 * n_attrib     - number of attribute columns expected on every line
 * n_read       - set to the number of SeqFeatures that were read
 *
 * Returns an array allocated with g_new with one element per line of
 * the file. Calls g_error if the file cannot be opened, if fewer lines
 * can be read than were counted, if a line has too few tokens, or if a
 * strand is not one of STRAND_FWD, STRAND_REV or STRAND_NONE. A single
 * warning is logged if any line has more tokens than expected.
 */
SeqFeature *seqfeat_read_file(const char *filename, char **attrib_names,
                              const int n_attrib, long *n_read) {
  char *line;
  char **tok;
  int n_tok, min_tok, max_tok, strand, j, more_tok_warning;
  long n_line, i;
  SeqFeature *feats;

  FILE *fh;

  fh = fopen(filename, "r");
  if(fh == NULL) {
    g_error("%s:%d: could not read from file '%s'", __FILE__, __LINE__,
            filename);
  }

  /* count number of lines in file  */
  n_line = util_fcount_lines(fh);
  feats = g_new(SeqFeature, n_line);

  /* determine maximum and minimum number of tokens per line */
  if(n_attrib > 0) {
    /* attributes follow the 6 fixed columns, so all of them are required */
    min_tok = 6 + n_attrib;
    max_tok = 6 + n_attrib;
  } else {
    min_tok = 3;
    max_tok = 6;
  }

  more_tok_warning = FALSE;

  i = 0;
  while(i < n_line) {
    if((line = util_fgets_line(fh)) == NULL) {
      g_error("%s:%d: Expected %ld lines but got only %ld", __FILE__,
              __LINE__, n_line, i);
    }

    /* split into one token more than allowed so that lines with extra
     * columns can be detected
     */
    tok = g_strsplit(line, "\t", max_tok+1);

    n_tok = 0;
    while(tok[n_tok] != NULL) {
      n_tok++;
    }

    if(n_tok < min_tok) {
      g_error("%s:%d: Expected between %d and %d tokens per line, got %d",
              __FILE__, __LINE__, min_tok, max_tok, n_tok);
    }
    if((n_tok > max_tok) && !more_tok_warning) {
      more_tok_warning = TRUE;
      g_warning("%s:%d: Some lines have more tokens than expected. "
                "Some feature attributes may be ignored because they "
                "were not specified",
                __FILE__, __LINE__);
    }

    /* read chromosome, start, end */
    feats[i].c.seqname = g_strdup(tok[0]);
    feats[i].c.chr = NULL;
    feats[i].c.start   = strtol(tok[1], NULL, 10);
    feats[i].c.end     = strtol(tok[2], NULL, 10);

    if(n_tok > 3) {
      /* read strand */
      strand = (tok[3][0] == '\0') ? STRAND_NONE : strtol(tok[3], NULL, 10);
      if((strand != STRAND_FWD) && (strand != STRAND_REV) &&
         (strand != STRAND_NONE)) {
        g_error("%s:%d: invalid strand (%s)", __FILE__, __LINE__, tok[3]);
      }
      feats[i].c.strand = strand;
    } else {
      /* no strand column: give the field a defined value, in the same
       * way that score and name get defaults below
       */
      feats[i].c.strand = STRAND_NONE;
    }

    if(n_tok > 4) {
      /* read score */
      feats[i].score = (tok[4][0] == '\0') ? 0.0 : strtod(tok[4], NULL);
    } else {
      feats[i].score = 0.0;
    }

    if(n_tok > 5) {
      /* read name */
      feats[i].name = (tok[5][0] == '\0') ?  NULL : g_strdup(tok[5]);
    } else {
      feats[i].name = NULL;
    }

    feats[i].n_sub_feat = 0;
    feats[i].sub_feats = NULL;
    feats[i].attrib = NULL;

    /* add attributes; min_tok guarantees tok[6 + j] exists */
    for(j = 0; j < n_attrib; j++) {
      seqfeat_add_attrib(&feats[i], attrib_names[j], tok[6 + j]);
    }

    g_strfreev(tok);
    g_free(line);

    i++;
  }

  *n_read = i;

  fclose(fh);

  return feats;
}