/** * Reads recombination rates from a file. */ float *dist_read_recomb_rates(char *filename,long seq_len, double scale) { long i, prev, start, end; FILE *fh; int n_tok; char **tok, *line; float *rates, rate; rates = g_new(float, seq_len); fh = fopen(filename, "r"); if(fh == NULL) { g_error("read_recomb_rates: could not read file '%s'", filename); } prev = 1; end = 1; while((line = util_fgets_line(fh)) != NULL) { tok = g_strsplit(line, "\t", 5); n_tok = 0; while(tok[n_tok] != NULL) { n_tok++; } if(n_tok != 5) { g_error("read_recomb_rates: expected 5 tokens per line, got %d", n_tok); } start = strtol(tok[2], NULL, 10) + 1; /* 0-based UCSC coords */ end = strtol(tok[3], NULL, 10); rate = strtod(tok[4], NULL); if(prev < start) { /* undefined region before this, pad with flags */ for(i = prev-1; i < start; i++) { rates[i] = DIST_NA; } } for(i = start-1; i < end; i++) { rates[i] = rate * scale; } g_strfreev(tok); g_free(line); prev = end+1; } /* pad remainder of chromomsome with flag */ for(i = end; i < seq_len; i++) { rates[i] = DIST_NA; } fclose(fh); return rates; }
/** * Reads an array of SeqFeature structures from a flatfile. * The value pointed to the num_red argument is set to the number * of SeqFeauture's read. */ SeqFeature *seqfeat_read_flatfile(const char *filename, long *n_read) { char *line; char **toks; int n_toks, prefix_len; long n_lines, i; SeqFeature *feats; FILE *fh; fh = fopen(filename, "r"); if(fh == NULL) { g_error("%s:%d: could not read from file '%s'", __FILE__, __LINE__, filename); } /* count number of SEQFEAT lines in file */ n_lines = util_fcount_lines_match(fh, SEQ_FEAT_LINE_PREFIX); feats = g_new(SeqFeature, n_lines); prefix_len = strlen(SEQ_FEAT_LINE_PREFIX); i = 0; while(i < n_lines) { if((line = util_fgets_line(fh)) == NULL) { g_error("seqfeat_read_flatfile: Expected %lu %s lines, " "got %lu", n_lines, SEQ_FEAT_LINE_PREFIX, i); } if(strncmp(SEQ_FEAT_LINE_PREFIX,line, prefix_len) != 0) { continue; } toks = g_strsplit(line, "\t", 8); n_toks = 0; while(toks[n_toks] != NULL) { n_toks++; } if(n_toks < 8) { g_error("gene_read_flatfile: Expected %d toks per gene line, got %d", 8, n_toks); } feats[i].id = strtol(toks[1], NULL, 10); feats[i].name = g_strdup(toks[2]); feats[i].c.seqname = g_strdup(toks[3]); feats[i].c.start = strtol(toks[4], NULL, 10); feats[i].c.end = strtol(toks[5], NULL, 10); feats[i].c.strand = strtol(toks[6], NULL, 10); feats[i].score = strtod(toks[7], NULL); feats[i].n_sub_feat = 0; feats[i].sub_feats = NULL; feats[i].attrib = NULL; g_strfreev(toks); g_free(line); i++; } *n_read = i; fclose(fh); return feats; }
/** * Reads an array of SeqFeature structures from a BED file. * The value pointed to the num_red argument is set to the number * of SeqFeauture's read. */ SeqFeature *seqfeat_read_bed(const char *filename, long *n_read) { char *line; char **toks; int n_toks; SeqFeature *feats; long n_lines, i; FILE *fh; fh = fopen(filename, "r"); if(fh == NULL) { g_error("Could not read from BED file '%s'", filename); } n_lines = util_fcount_lines(fh); *n_read = 0; /* allocating feature for every line may be excessive because of * comments at the beginning of the file. However, it is much faster * to allocate all at once -- and not much space will be wasted * unless there is an unusually large number of comment lines */ feats = g_new(SeqFeature, n_lines); for(i = 0; i < n_lines; i++) { if((line = util_fgets_line(fh)) == NULL) { /* Less lines than expected. File changed in length? */ break; } /* skip comment lines beginning with ';'*/ if(line[0] == ';') { g_free(line); continue; } toks = g_strsplit(line, "\t", 12); n_toks = 0; while(toks[n_toks] != NULL) { n_toks++; } if(n_toks < 3) { g_error("Expected at least 3 toks per BED file line, got %d", n_toks); } /* first three fields (chr, start, end) are required */ feats[*n_read].c.chr = NULL; feats[*n_read].c.seqname = g_strdup(toks[0]); feats[*n_read].c.start = strtol(toks[1], NULL, 10) + 1; feats[*n_read].c.end = strtol(toks[2], NULL, 10); /* optional fourth field is feature name */ if(n_toks >= 4) { feats[*n_read].name = g_strdup(toks[3]); } else { feats[*n_read].name = NULL; } /* optional fifth field is score */ if(n_toks >= 5) { feats[*n_read].score = strtod(toks[4], NULL); } else { feats[*n_read].score = 0.0; } /* optional sixth field is strand */ if(n_toks >= 6) { feats[*n_read].c.strand = char_to_strand(toks[5][0]); } else { feats[*n_read].c.strand = STRAND_NONE; } feats[*n_read].n_sub_feat = 0; feats[*n_read].sub_feats = NULL; feats[*n_read].attrib = NULL; *n_read += 1; g_strfreev(toks); g_free(line); } fclose(fh); return feats; }
/** * Reads an array of SeqFeature structures from a file. The value * pointed to the by the n_read argument is set to the number of * SeqFeatures read. * * The lines should be formatted as follows: * <chr>\t<start>\t<end>\t[<strand>\t[<score>\t[<name>\t[<attrib1>\t[<attrib2>\t[...]]]]]]\n * * For example, the following line could define a repeat feature * with two extra attributes (with space instead of tab delimiters): * chr12 1324151 1324222 0 0.0 AluSx Alu SINE * * At a the very minimum, a feature must specify a chromosome, start, * and end, but can also specify an arbitrary number of named * attributes. If named attributes are to be specified, the strand, score and * name must also be specified. The name can be an empty string, in * which case the name is set to NULL. The strand can be 0 (indicating * no strand). */ SeqFeature *seqfeat_read_file(const char *filename, char **attrib_names, const int n_attrib, long *n_read) { char *line; char **tok; int n_tok, min_tok, max_tok, strand, j, more_tok_warning; long n_line, i; SeqFeature *feats; FILE *fh; fh = fopen(filename, "r"); if(fh == NULL) { g_error("%s:%d: could not read from file '%s'", __FILE__, __LINE__, filename); } /* count number of lines in file */ n_line = util_fcount_lines(fh); feats = g_new(SeqFeature, n_line); /* determine maximum and minimum number of tokens per line */ if(n_attrib > 0) { min_tok = 6 + n_attrib; max_tok = 6 + n_attrib; } else { min_tok = 3; max_tok = 6; } more_tok_warning = FALSE; i = 0; while(i < n_line) { if((line = util_fgets_line(fh)) == NULL) { g_error("%s:%d: Expected %lu lines but got only %ld", __FILE__, __LINE__, n_line, i); } tok = g_strsplit(line, "\t", max_tok+1); n_tok = 0; while(tok[n_tok] != NULL) { n_tok++; } if(n_tok < min_tok) { g_error("%s:%d: Expected between %d and %d tokens per line, got %d", __FILE__, __LINE__, min_tok, max_tok, n_tok); } if((n_tok > max_tok) && !more_tok_warning) { more_tok_warning = TRUE; g_warning("%s:%d: Some lines have more tokens than expected. " "Some feature attributes may be ignored because they " "were not specified", __FILE__, __LINE__); } /* read chromosome, start, end */ feats[i].c.seqname = g_strdup(tok[0]); feats[i].c.chr = NULL; feats[i].c.start = strtol(tok[1], NULL, 10); feats[i].c.end = strtol(tok[2], NULL, 10); if(n_tok > 3) { /* read strand */ strand = (tok[3][0] == '\0') ? STRAND_NONE : strtol(tok[3], NULL, 10); if((strand != STRAND_FWD) && (strand != STRAND_REV) && (strand != STRAND_NONE)) { g_error("%s:%d: invalid strand (%s)", __FILE__, __LINE__, tok[3]); } feats[i].c.strand = strand; } if(n_tok > 4) { /* read score */ feats[i].score = (tok[4][0] == '\0') ? 0.0 : strtod(tok[4], NULL); } else { feats[i].score = 0.0; } if(n_tok > 5) { /* read name */ feats[i].name = (tok[5][0] == '\0') ? NULL : g_strdup(tok[5]); } else { feats[i].name = NULL; } feats[i].n_sub_feat = 0; feats[i].sub_feats = NULL; feats[i].attrib = NULL; /* add attributes */ for(j = 0; j < n_attrib; j++) { seqfeat_add_attrib(&feats[i], attrib_names[j], tok[6 + j]); } g_strfreev(tok); g_free(line); i++; } *n_read = i; fclose(fh); return feats; }