/* Print a CategoryMap to a file.  The output consists of the NCATS
   header, one line per feature type of every category range (type
   name, category number or number range and, when present, the
   comma-separated "conditioned on" list), followed by the
   LABELLING_PRECEDENCE and FILL_PRECEDENCE lists. */
void cm_print(CategoryMap *cm, FILE *F) {
  int cat, t, c, pos;
  List *order;

  fprintf(F, "NCATS = %d\n\n", cm->ncats);

  cat = 1;
  while (cat <= cm->ncats) {
    CategoryRange *range = cm->ranges[cat];

    for (t = 0; t < lst_size(range->feature_types); t++) {
      String *type = (String*)lst_get_ptr(range->feature_types, t);

      fprintf(F, "%-15s %d", type->chars, range->start_cat_no);
      if (range->end_cat_no > range->start_cat_no)
        fprintf(F, "-%d", range->end_cat_no);

      if (cm->conditioned_on[cat] != NULL) {
        fprintf(F, "\t");
        for (c = 0; c < lst_size(cm->conditioned_on[cat]); c++) {
          int is_last = (c + 1 == lst_size(cm->conditioned_on[cat]));
          fprintf(F, "%d%s", lst_get_int(cm->conditioned_on[cat], c),
                  is_last ? "" : ",");
        }
      }

      fprintf(F, "\n");
    }

    /* a range occupies consecutive category numbers; jump past it so
       that the same range is not printed more than once */
    cat = range->end_cat_no + 1;
  }

  /* reconstruct precedence lists: category numbers 0..ncats are sorted
     with compare_prec after pointing the file-level 'prec' at the
     relevant precedence array (presumably what compare_prec consults);
     categories whose precedence is -1 are left out of the output */
  order = lst_new_int(cm->ncats + 1);

  for (pos = 0; pos <= cm->ncats; pos++)
    lst_push_int(order, pos);
  prec = cm->labelling_precedence;
  lst_qsort(order, compare_prec);

  fprintf(F, "\nLABELLING_PRECEDENCE = ");
  for (pos = 0; pos <= cm->ncats; pos++) {
    int which = lst_get_int(order, pos);
    if (cm->labelling_precedence[which] == -1)
      continue;
    fprintf(F, "%d%s", which, pos < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");

  /* same procedure for the fill precedence, reusing the list */
  lst_clear(order);
  for (pos = 0; pos <= cm->ncats; pos++)
    lst_push_int(order, pos);
  prec = cm->fill_precedence;
  lst_qsort(order, compare_prec);

  fprintf(F, "FILL_PRECEDENCE = ");
  for (pos = 0; pos <= cm->ncats; pos++) {
    int which = lst_get_int(order, pos);
    if (cm->fill_precedence[which] == -1)
      continue;
    fprintf(F, "%d%s", which, pos < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");

  lst_free(order);
}
/* Given a list of 5' and 3' splice sites extracted from a group, check
   whether they form valid pairs in all species.  The list is sorted in
   place (ascending on the '+' strand, descending otherwise), and every
   5' splice site immediately followed by a 3' splice site is treated
   as one intron.  For each such pair the two dinucleotides are read
   from every sequence of the alignment (reverse complemented on the
   '-' strand) and their concatenation is tested against the allowed
   signals GTAG, GCAG and ATAC.  Both features of a failing pair are
   added to 'problems' as BAD_INTRON.  Returns 1 if all pairs are okay
   (or the list has fewer than two features), 0 otherwise. */
int are_introns_okay(List *intron_splice, MSA *msa, List *problems,
                     int offset5, int offset3) {
  char *splice_pairs[3] = {"GTAG", "GCAG", "ATAC"};
  char dinuc5[3], dinuc3[3], joined[5];
  char strand;
  int idx, seq, pos5, pos3;
  int all_ok = 1;

  dinuc5[2] = '\0';
  dinuc3[2] = '\0';

  if (lst_size(intron_splice) < 2)
    return 1;

  /* assume all same strand */
  strand = ((GFF_Feature*)lst_get_ptr(intron_splice, 0))->strand;

  if (strand == '+')
    lst_qsort(intron_splice, feature_comparator_ascending);
  else
    lst_qsort(intron_splice, feature_comparator_descending);

  idx = 0;
  while (idx < lst_size(intron_splice) - 1) {
    /* assume every 5' splice and immediately following 3' splice
       form a pair */
    GFF_Feature *site5 = lst_get_ptr(intron_splice, idx);
    GFF_Feature *site3 = lst_get_ptr(intron_splice, idx + 1);

    if (!str_starts_with_charstr(site5->feature, SPLICE_5) ||
        !str_starts_with_charstr(site3->feature, SPLICE_3)) {
      idx++;
      continue;
    }

    pos5 = site5->start - 1;
    if (strand == '-')
      pos5 += offset5;

    pos3 = site3->start - 1;
    if (strand == '+')
      pos3 += offset3;

    for (seq = 0; seq < msa->nseqs; seq++) {
      dinuc5[0] = ss_get_char_tuple(msa, msa->ss->tuple_idx[pos5], seq, 0);
      dinuc5[1] = ss_get_char_tuple(msa, msa->ss->tuple_idx[pos5 + 1], seq, 0);
      dinuc3[0] = ss_get_char_tuple(msa, msa->ss->tuple_idx[pos3], seq, 0);
      dinuc3[1] = ss_get_char_tuple(msa, msa->ss->tuple_idx[pos3 + 1], seq, 0);

      if (strand == '-') {
        msa_reverse_compl_seq(dinuc5, 2);
        msa_reverse_compl_seq(dinuc3, 2);
      }

      strcpy(joined, dinuc5);
      strcat(joined, dinuc3);

      if (is_signal(joined, 3, splice_pairs, msa->is_missing))
        continue;

      /* one bad species is enough to reject this intron */
      problem_add(problems, site5, BAD_INTRON, -1, -1);
      problem_add(problems, site3, BAD_INTRON, -1, -1);
      all_ok = 0;
      break;
    }

    /* the 3' site has been consumed; no need to look at it again */
    idx += 2;
  }

  return all_ok;
}
/* Scans a cds for gaps.  Returns NGAPS (no gap found), CLN_GAPS,
   NOVRLP_CLN_GAPS, or FSHIFT_BAD.  As soon as a gap whose length is
   not a multiple of three is met, scanning stops, FSHIFT_BAD is
   returned and *problem is set to a newly created FSHIFT problem
   covering that gap; *problem is not touched otherwise.  Doesn't try
   to check for compensatory indels, which is more complicated (this is
   left for a special-purpose function). */
int scan_for_gaps(GFF_Feature *feat, MSA *msa, Problem **problem) {
  int first = feat->start - 1;
  int last = feat->end - 1;
  int pos, seq, k;
  int touches_boundary = 0;
  int overlapping = 0;
  cds_gap_type status = NGAPS;
  List *gaps = lst_new_ptr(10);

  seq = 0;
  while (status != FSHIFT_BAD && seq < msa->nseqs) {
    for (pos = first; pos <= last; pos++) {
      int lo, hi;
      struct gap *g;

      if (ss_get_char_pos(msa, pos, seq, 0) != GAP_CHAR)
        continue;

      /* extend in both directions to the full run of gap characters,
         without leaving the cds; lo and hi are inclusive */
      lo = pos;
      while (lo - 1 >= first &&
             ss_get_char_pos(msa, lo - 1, seq, 0) == GAP_CHAR)
        lo--;

      hi = pos;
      while (hi + 1 <= last &&
             ss_get_char_pos(msa, hi + 1, seq, 0) == GAP_CHAR)
        hi++;

      if ((hi - lo + 1) % 3 != 0) {
        status = FSHIFT_BAD;
        *problem = problem_new(feat, FSHIFT, lo, hi);
        (*problem)->cds_gap = FSHIFT_BAD;
        break;
      }

      /* note whether gaps occur near a cds boundary (within 3 sites) */
      if (lo <= first + 3 || hi >= last - 3)
        touches_boundary = 1;

      if (status == NGAPS)
        status = CLN_GAPS;

      g = smalloc(sizeof(struct gap));
      g->start = lo;
      g->end = hi;
      lst_push_ptr(gaps, g);

      /* resume scanning just after this gap */
      pos = hi;
    }
    seq++;
  }

  if (status == CLN_GAPS) {
    /* now check for overlaps: after sorting, two neighbouring gaps
       that overlap without being identical rule out the
       "non-overlapping" classification */
    lst_qsort(gaps, gap_compare);

    for (k = 1; k < lst_size(gaps) && !overlapping; k++) {
      struct gap *prev = lst_get_ptr(gaps, k - 1);
      struct gap *cur = lst_get_ptr(gaps, k);
      int identical = (cur->start == prev->start && cur->end == prev->end);

      if (cur->start <= prev->end && !identical)
        overlapping = 1;
    }

    /* note that the boundary criterion is being confounded with the
       overlap criterion.  Doesn't seem worth fixing at the moment ... */
    status = (overlapping || touches_boundary) ? CLN_GAPS : NOVRLP_CLN_GAPS;
  }

  for (k = 0; k < lst_size(gaps); k++)
    sfree(lst_get_ptr(gaps, k));
  lst_free(gaps);

  return status;
}
/*
 * list_directory
 *
 * Returns, in *direntries, the names of the entries of directory listDir
 * that pass the (optional) restrictions filter.  "." and ".." are never
 * returned, and entries that cannot be lstat64()'ed are skipped.
 *
 * If startFile is not NULL, only entries that sort alphabetically
 * (strcmp order) after startFile are returned.  This lets a caller
 * continue a previous, truncated listing.
 *
 * If maxentries is greater than zero, at most maxentries names are
 * returned: the list is kept sorted while the directory is read so that
 * the first maxentries names in alphabetical order survive.  morefiles is
 * incremented for every qualifying entry that had to be left out, so a
 * non-zero value tells the caller they can continue reading.
 * If maxentries is zero or negative, every qualifying entry is returned
 * and the list is sorted once at the end.
 *
 * Note that the directory may change while we're reading it.  If it does,
 * files that have been added or removed since we started reading it may
 * not be accurately reflected.
 *
 * Returns 0 on success.  On failure a non-zero value is returned (-1 for
 * bad arguments or allocation failure, otherwise the value reported by
 * set_restrict(), samrerr(), readdir64_r() or the list routines).  When
 * a failure occurs after the list has been created, the list is freed
 * and *direntries is set to NULL.
 */
int
list_directory(
	ctx_t		*c,			/* ARGSUSED */
	int		maxentries,
	char		*listDir,		/* directory to list */
	char		*startFile,		/* if continuing, start here */
	char		*restrictions,
	uint32_t	*morefiles,		/* OUT */
	sqm_lst_t	**direntries)		/* OUT */
{
	int		rval = 0;
	int		st = 0;
	DIR		*curdir; /* Variable for directory system calls */
	dirent64_t	*entry;	/* Pointer to a directory entry */
	dirent64_t	*entryp;
	struct stat64	sout;
	restrict_t	filter = {0};
	char		*data;	/* Pointer to data item to add to list */
	node_t		*node;
	sqm_lst_t	*lstp = NULL;
	char		buf[MAXPATHLEN + 1];
	char		*fname;

	if (ISNULL(listDir, direntries, morefiles)) {
		return (-1);
	}

	*morefiles = 0;

	/* Set up wildcard restrictions */
	rval = set_restrict(restrictions, &filter);
	if (rval) {
		return (rval);
	}

	curdir = opendir(listDir); /* Set up to ask for directory entries */
	if (curdir == NULL) {
		return (samrerr(SE_NOSUCHPATH, listDir));
	}

	*direntries = lst_create(); /* Return results in this list */
	if (*direntries == NULL) {
		closedir(curdir);
		return (-1);	/* If allocation failed, samerr is set */
	}
	lstp = *direntries;

	/* buffer handed to readdir64_r(); sized to hold a full name */
	entry = mallocer(sizeof (struct dirent64) + MAXPATHLEN + 1);
	if (entry == NULL) {
		closedir(curdir);
		lst_free(*direntries);
		*direntries = NULL;
		return (-1);
	}

	/* Walk through directory entries */
	while ((rval = readdir64_r(curdir, entry, &entryp)) == 0) {
		if (entryp == NULL) {
			break;		/* end of directory */
		}

		fname = (char *)&(entry->d_name[0]);

		if ((strcmp(fname, ".") == 0) ||
		    (strcmp(fname, "..") == 0)) {
			continue;
		}

		/*
		 * If we were given a starting file, only return entries
		 * that sort after that file alphabetically.
		 */
		if (startFile != NULL) {
			if ((strcmp(fname, startFile)) <= 0) {
				continue;
			}
		}

		/* Create full pathname and get stat info */
		snprintf(buf, sizeof (buf), "%s/%s", listDir, fname);

		if (lstat64(buf, &sout) != 0) {
			continue; /* Ignore file which can't be stat'ed */
		}

		/*
		 * think about ways to avoid a double-stat in when we're
		 * fetching file details
		 */
		if (check_restrict_stat(fname, &sout, &filter)) {
			continue; /* Not this entry */
		}

		/* copy to allocated struct */
		data = copystr(fname);
		if (data == NULL) {
			rval = -1;
			break;	/* samerr already set */
		}

		/*
		 * caller wants all entries for the directory
		 * should there be a top-end limit, to avoid the case where
		 * the directory has millions of entries?
		 */
		if (maxentries <= 0) {
			rval = lst_append(lstp, data);
			if (rval != 0) {
				free(data);
				break;
			}
			continue;
		}

		/*
		 * Directory may have more entries than requested, so pre-sort
		 * the list so we return the first <n> sorted alphabetically.
		 */
		for (node = lstp->head; node != NULL; node = node->next) {
			st = strcmp(data, (char *)(node->data));
			if (st > 0) {
				continue;
			}

			if (st < 0) {
				rval = lst_ins_before(lstp, node, data);
				if (rval == 0) {
					/* the list owns the string now */
					data = NULL;
				}
			}

			/*
			 * Still holding the string means it is either a
			 * duplicate name (st == 0) or the insert failed;
			 * in both cases the copy must be released here.
			 */
			if (data != NULL) {
				free(data);
				data = NULL;
			}
			break;
		}

		/*
		 * A failed insert must end the walk now; otherwise the
		 * next readdir64_r() call would overwrite rval and the
		 * error would be lost.
		 */
		if (rval != 0) {
			break;
		}

		/* entry sorts higher than existing entries */
		if (data != NULL) {
			if (lstp->length < maxentries) {
				rval = lst_append(lstp, data);
				if (rval != 0) {
					free(data);
					break;
				}
			} else {
				/* no room for this entry */
				free(data);
				(*morefiles)++;
			}
		}

		/* Keep list to designated limits */
		if (lstp->length > maxentries) {
			/*
			 * pop off the last entry
			 *
			 * NOTE(review): if lst_remove() only unlinks the
			 * node and does not free node->data, the name held
			 * by the old tail is leaked here -- confirm against
			 * the list implementation.
			 */
			lst_remove(lstp, lstp->tail);
			(*morefiles)++;
		}
	}

	closedir(curdir);
	free(entry);

	if (rval) {
		/* On failure, don't return a partial list */
		lst_free_deep(*direntries);
		*direntries = NULL;
	} else if (maxentries <= 0) {
		/* the unlimited case was collected in directory order */
		lst_qsort(*direntries, node_cmp);
	}

	return (rval);
}
/*
 * list_dir
 *
 * Returns, in *direntries, the names of the entries of directory filepath
 * that pass the (optional) restrictions filter.  "." and ".." are never
 * returned, and entries that cannot be stat64()'ed are skipped.
 *
 * Reading stops once the list holds maxentries names; the names collected
 * up to that point (in directory order) are then sorted.  Because the
 * limit is tested after each append, one entry is still returned when
 * maxentries is zero or negative and a matching entry exists.
 *
 * Returns 0 on success.  On failure a non-zero value is returned (-1 for
 * allocation failure, otherwise the value reported by set_restrict(),
 * samrerr(), readdir64_r() or lst_append()).  When a failure occurs after
 * the list has been created, the list is freed and *direntries is set
 * to NULL.
 */
int
list_dir(
	ctx_t		*c,
	int		maxentries,
	char		*filepath,
	char		*restrictions,
	sqm_lst_t	**direntries) /* ARGSUSED */
{
	int		rval = 0;
	DIR		*curdir; /* Variable for directory system calls */
	struct dirent64	*entry;	/* Pointer to a directory entry */
	struct dirent64	*entryp;
	struct stat64	sout;
	restrict_t	filter = {0};
	char		*data;	/* Pointer to data item to add to list */
	char		fullpath[MAXPATHLEN];

	/* Set up wildcard restrictions */
	rval = set_restrict(restrictions, &filter);
	if (rval) {
		return (rval);
	}

	curdir = opendir(filepath); /* Set up to ask for directory entries */
	if (curdir == NULL) {
		return (samrerr(SE_NOSUCHPATH, filepath));
	}

	*direntries = lst_create(); /* Return results in this list */
	if (*direntries == NULL) {
		closedir(curdir);
		return (-1);	/* If allocation failed, samerr is set */
	}

	/* buffer handed to readdir64_r(); sized to hold a full name */
	entry = mallocer(sizeof (struct dirent64) + MAXPATHLEN + 1);
	if (entry == NULL) {
		closedir(curdir);
		lst_free(*direntries);
		*direntries = NULL;
		return (-1);
	}

	/* Walk through directory entries */
	while ((rval = readdir64_r(curdir, entry, &entryp)) == 0) {
		if (entryp == NULL) {
			break;		/* end of directory */
		}

		if ((strcmp(entry->d_name, ".") == 0) ||
		    (strcmp(entry->d_name, "..") == 0)) {
			continue;
		}

		/* Create full pathname and get stat info */
		snprintf(
		    fullpath, MAXPATHLEN, "%s/%s", filepath, entry->d_name);

		if (stat64(fullpath, &sout) != 0) {
			continue; /* Ignore file which can't be stat'ed */
		}

		if (check_restrict_stat(entry->d_name, &sout, &filter)) {
			continue; /* Not this entry */
		}

		data = copystr(entry->d_name); /* Copy data to allocated mem */
		if (data == NULL) {
			rval = -1;
			break;	/* samerr already set */
		}

		/*
		 * The list only takes ownership of the string when the
		 * append succeeds; on failure release it and give up so
		 * the caller does not get a silently incomplete listing.
		 */
		rval = lst_append(*direntries, data);
		if (rval != 0) {
			free(data);
			break;
		}

		if ((*direntries)->length >= maxentries) {
			break;	/* Keep list to designated limits */
		}
	}

	free(entry);

	if (rval) {
		lst_free_deep(*direntries); /* On failure, don't return list */
		*direntries = NULL;
	} else {
		lst_qsort(*direntries, node_cmp);
	}

	closedir(curdir);

	return (rval);
}