/* Function: MSAGetGC()
 * Date:     SRE, Fri Aug 13 13:25:57 1999 [St. Louis]
 *
 * Purpose:  Look up a miscellaneous #=GC column annotation by
 *           its tag name and hand back the stored string.
 *
 * Args:     msa  - alignment and its annotation
 *           tag  - name of the annotation
 *
 * Returns:  ptr to the annotation string, or NULL when the
 *           alignment has no #=GC index or the tag is not
 *           found in it. Caller does *not* free; the string
 *           is still managed by the msa object.
 */
char *
MSAGetGC(MSA *msa, char *tag)
{
  int gcpos;

  if (msa->gc_idx != NULL)
    {
      gcpos = GKIKeyIndex(msa->gc_idx, tag);
      if (gcpos >= 0)
        return msa->gc[gcpos];
    }
  return NULL;
}
/* Function: MSAAddGS()
 * Date:     SRE, Wed Jun  2 06:57:03 1999 [St. Louis]
 *
 * Purpose:  Store one unparsed #=GS markup line in the MSA
 *           structure, growing the tag tables as needed.
 *
 *           The same GS tag may show up more than once for a
 *           given sequence (for example, "DR PDB;" structure
 *           links in Pfam). Repeats are handled by appending
 *           to the existing string, separated by '\n'.
 *
 * Args:     msa    - multiple alignment structure
 *           tag    - markup tag (e.g. "AC")
 *           sqidx  - index of sequence to assoc markup with (0..nseq-1)
 *           value  - markup (e.g. "P00666")
 *
 * Returns:  (void)
 */
void
MSAAddGS(MSA *msa, char *tag, int sqidx, char *value)
{
  int    tagidx;                /* position of this tag in gs_tag[] / gs[] */
  int    n;                     /* counter over allocated sequence slots   */
  int    curlen;                /* string length after adding the '\n'     */
  char **cell;                  /* the annotation slot being written       */

  if (msa->gs_tag != NULL)
    {
      /* Tables already exist: is the tag known? */
      tagidx = GKIKeyIndex(msa->gs_idx, tag);
      if (tagidx < 0)
        {
          /* Unknown tag. Tables grow one entry at a time, so a new
           * tag always means a realloc and a fresh, NULL-filled row.
           */
          tagidx = GKIStoreKey(msa->gs_idx, tag);
          SQD_DASSERT1((tagidx == msa->ngs));
          msa->gs_tag = ReallocOrDie(msa->gs_tag, (msa->ngs+1) * sizeof(char *));
          msa->gs     = ReallocOrDie(msa->gs,     (msa->ngs+1) * sizeof(char **));
          msa->gs[msa->ngs] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
          n = 0;
          while (n < msa->nseqalloc)
            {
              msa->gs[msa->ngs][n] = NULL;
              n++;
            }
        }
    }
  else
    {
      /* Very first GS tag: create the index and one-entry tables. */
      msa->gs_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gs_idx, tag);
      SQD_DASSERT1((tagidx == 0));
      msa->gs_tag = MallocOrDie(sizeof(char *));
      msa->gs     = MallocOrDie(sizeof(char **));
      msa->gs[0]  = MallocOrDie(sizeof(char *) * msa->nseqalloc);
      n = 0;
      while (n < msa->nseqalloc)
        {
          msa->gs[0][n] = NULL;
          n++;
        }
    }

  /* A tag index equal to the current count means the tag was just added:
   * record its name and bump the count.
   */
  if (tagidx == msa->ngs)
    {
      msa->gs_tag[tagidx] = sre_strdup(tag, -1);
      msa->ngs++;
    }

  cell = &(msa->gs[tagidx][sqidx]);
  if (*cell != NULL)
    {
      /* Sequence already has this tag: append "\n" and then the value. */
      curlen = sre_strcat(cell, -1, "\n", 1);
      if (curlen < 0)
        Die("failed to sre_strcat()");
      if (sre_strcat(cell, curlen, value, -1) < 0)
        Die("failed to sre_strcat()");
    }
  else
    {
      /* First annotation of this sequence with this tag. */
      *cell = sre_strdup(value, -1);
    }
  return;
}
/* Function: MSAAppendGR()
 * Date:     SRE, Thu Jun  3 06:34:38 1999 [Madison]
 *
 * Purpose:  Store one unparsed #=GR markup line in the MSA
 *           structure, growing the tag tables as needed.
 *
 *           Repeated calls for the same tag and sequence
 *           concatenate the value strings -- this is what
 *           a parser of multiblock alignment files needs,
 *           for example.
 *
 * Args:     msa    - multiple alignment structure
 *           tag    - markup tag (e.g. "SS")
 *           sqidx  - index of seq to assoc markup with (0..nseq-1)
 *           value  - markup, one char per aligned column
 *
 * Returns:  (void)
 */
void
MSAAppendGR(MSA *msa, char *tag, int sqidx, char *value)
{
  int    tagidx;                /* position of this tag in gr_tag[] / gr[] */
  int    n;                     /* counter over allocated sequence slots   */
  char **row;                   /* freshly allocated per-sequence row      */

  if (msa->gr_tag != NULL)
    {
      /* Tables already exist: is the tag known? */
      tagidx = GKIKeyIndex(msa->gr_idx, tag);
      if (tagidx < 0)
        {
          /* Unknown tag. Tables grow one entry at a time, so a new
           * tag always means a realloc and a fresh, NULL-filled row.
           */
          tagidx = GKIStoreKey(msa->gr_idx, tag);
          SQD_DASSERT1((tagidx == msa->ngr));
          msa->gr_tag = ReallocOrDie(msa->gr_tag, (msa->ngr+1) * sizeof(char *));
          msa->gr     = ReallocOrDie(msa->gr,     (msa->ngr+1) * sizeof(char **));
          row         = MallocOrDie(sizeof(char *) * msa->nseqalloc);
          msa->gr[msa->ngr] = row;
          for (n = 0; n < msa->nseqalloc; n++)
            row[n] = NULL;
        }
    }
  else
    {
      /* Very first GR tag: build one-entry tables, then the index. */
      msa->gr_tag = MallocOrDie(sizeof(char *));
      msa->gr     = MallocOrDie(sizeof(char **));
      row         = MallocOrDie(sizeof(char *) * msa->nseqalloc);
      msa->gr[0]  = row;
      for (n = 0; n < msa->nseqalloc; n++)
        row[n] = NULL;
      msa->gr_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gr_idx, tag);
      SQD_DASSERT1((tagidx == 0));
    }

  /* A tag index equal to the current count means the tag was just added:
   * record its name and bump the count.
   */
  if (tagidx == msa->ngr)
    {
      msa->gr_tag[tagidx] = sre_strdup(tag, -1);
      msa->ngr++;
    }

  /* Append this block's markup to whatever is already stored. */
  sre_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1);
  return;
}
/* Function: MSAGetSeqidx()
 * Date:     SRE, Wed May 19 15:08:25 1999 [St. Louis]
 *
 * Purpose:  Map a sequence name onto its index in an MSA
 *           structure, registering the name if it is new.
 *
 *           1) If the caller's guess is in range and the name
 *              stored there matches, use it (pass -1 for "no guess").
 *           2) Otherwise look the name up in the msa's key index.
 *           3) If it is not there, store it in the index, expand
 *              the msa's allocation if the new index does not fit,
 *              save a copy of the name, and count the new sequence.
 *
 * Args:     msa   - alignment object
 *           name  - a sequence name
 *           guess - a guess at the right index, or -1 if no guess.
 *
 * Returns:  seqidx
 */
int
MSAGetSeqidx(MSA *msa, char *name, int guess)
{
  int idx;

  /* Cheap path: the caller's guess is right. */
  if (guess >= 0 && guess < msa->nseq)
    {
      if (strcmp(name, msa->sqname[guess]) == 0)
        return guess;
    }

  /* Otherwise consult the index; a negative result means a new name. */
  idx = GKIKeyIndex(msa->index, name);
  if (idx < 0)
    {
      idx = GKIStoreKey(msa->index, name);
      if (idx >= msa->nseqalloc)
        MSAExpand(msa);
      msa->sqname[idx] = sre_strdup(name, -1);
      msa->nseq++;
    }
  return idx;
}
/* Function: MSAAppendGC()
 * Date:     SRE, Thu Jun  3 06:25:14 1999 [Madison]
 *
 * Purpose:  Add an unparsed #=GC markup line to the MSA
 *           structure, allocating as necessary.
 *
 *           When called multiple times for the same tag,
 *           appends value strings together -- used when
 *           parsing multiblock alignment files, for
 *           example.
 *
 * Args:     msa   - multiple alignment structure
 *           tag   - markup tag (e.g. "CS")
 *           value - markup, one char per aligned column
 *
 * Returns:  (void)
 */
void
MSAAppendGC(MSA *msa, char *tag, char *value)
{
  int tagidx;

  /* Is this an unparsed tag name that we recognize?
   * If not, handle adding it to index, and reallocating
   * as needed.
   */
  if (msa->gc_tag == NULL)      /* first tag? init w/ malloc  */
    {
      msa->gc_tag = MallocOrDie(sizeof(char *));
      msa->gc     = MallocOrDie(sizeof(char *));
      msa->gc_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gc_idx, tag);
      SQD_DASSERT1((tagidx == 0));
      msa->gc[0]  = NULL;
    }
  else
    {                           /* new tag? */
      tagidx = GKIKeyIndex(msa->gc_idx, tag);
      if (tagidx < 0)
        {                       /* it's a new tag name; realloc */
          tagidx = GKIStoreKey(msa->gc_idx, tag);
          /* since we alloc in blocks of 1,
             we always realloc upon seeing a new tag. */
          SQD_DASSERT1((tagidx == msa->ngc));
          /* Both tables hold one string pointer per tag, so the element
           * size is sizeof(char *), the same size the malloc branch
           * above uses (not sizeof(char **)).
           */
          msa->gc_tag = ReallocOrDie(msa->gc_tag, (msa->ngc+1) * sizeof(char *));
          msa->gc     = ReallocOrDie(msa->gc,     (msa->ngc+1) * sizeof(char *));
          msa->gc[tagidx] = NULL;
        }
    }

  /* A tag index equal to the current count means the tag was just added:
   * record its name and bump the count.
   */
  if (tagidx == msa->ngc)
    {
      msa->gc_tag[tagidx] = sre_strdup(tag, -1);
      msa->ngc++;
    }

  /* Append this block's markup to whatever is already stored. */
  sre_strcat(&(msa->gc[tagidx]), -1, value, -1);
  return;
}
/* Function: ReadMSF()
 * Date:     SRE, Tue Jun  1 08:07:22 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open MSF format
 *           alignment file. (MSF is a single-alignment format.)
 *           Return the alignment, or NULL if we've already
 *           read the alignment.
 *
 *           The file is read in four stages: an optional
 *           "!!AA_/!!NA_MULTIPLE_ALIGNMENT" header line, a free
 *           text comment section ending at the "MSF: .. Type: ..
 *           Check: .." line, a name section ending at "//", and
 *           the sequence section.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no more alignments (nothing left to read, or
 *           the file ends right after the header line)
 *
 * Diagnostics:
 *           Will Die() here with a (potentially) useful message
 *           if a parsing error occurs.
 */
MSA *
ReadMSF(MSAFILE *afp)
{
  MSA    *msa;
  char   *s;
  int     alleged_alen;         /* length claimed on the MSF: line; parsed, not checked here   */
  int     alleged_type;         /* sequence type claimed on the MSF: line                      */
  int     alleged_checksum;     /* checksum claimed on the MSF: line; parsed, not checked here */
  int     header_type;          /* type announced by a "!!.._MULTIPLE_ALIGNMENT" header        */
  int     have_header_type;     /* TRUE if such a header line was seen                         */
  char   *tok;
  char   *sp;
  int     slen;
  int     sqidx;
  char   *name;
  char   *seq;

  if (feof(afp->f)) return NULL;
  if ((s = MSAFileGetLine(afp)) == NULL) return NULL;

  /* The first line is the header.
   * This is a new-ish GCG feature. Don't count on it, so
   * we can be a bit more tolerant towards non-GCG software
   * generating "MSF" files.
   *
   * The header is consumed *before* the MSA is allocated, so that
   * running out of input right after it returns NULL without
   * leaving an allocated alignment behind.
   */
  have_header_type = 0;
  header_type      = 0;
  if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0)
    {
      header_type      = kAmino;
      have_header_type = 1;
      if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
    }
  else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0)
    {
      header_type      = kRNA;
      have_header_type = 1;
      if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
    }

  msa = MSAAlloc(10, 0);
  if (have_header_type) msa->type = header_type;

  /* Now we're in the free text comment section of the MSF file.
   * It ends when we see the "MSF: Type: Check: .." line.
   * This line must be present.
   */
  do
    {
      if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
          Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
        {
          /* The pattern has three parenthesized fields: length, type,
           * checksum. They are read from sqd_parse[1], [2] and [3].
           * NOTE(review): this assumes sqd_parse[0] is the complete
           * match and subpatterns are numbered from 1 -- confirm
           * against Strparse().
           */
          alleged_alen = atoi(sqd_parse[1]);
          switch (*(sqd_parse[2])) {
          case 'N' : alleged_type = kRNA;      break;
          case 'P' : alleged_type = kAmino;    break;
          case 'X' : alleged_type = kOtherSeq; break;
          default  : alleged_type = kOtherSeq;
          }
          alleged_checksum = atoi(sqd_parse[3]);
          /* A type from the header line wins; only fill in if unset. */
          if (msa->type == kOtherSeq) msa->type = alleged_type;
          break;                /* we're done with comment section. */
        }
      if (! IsBlankline(s))
        MSAAddComment(msa, s);
    } while ((s = MSAFileGetLine(afp)) != NULL);

  /* Now we're in the name section.
   * GCG has a relatively poorly documented feature: only sequences that
   * appear in this list will be read from the alignment section. Commenting
   * out sequences in the name list (by preceding them with "!") is
   * allowed as a means of manually defining subsets of sequences in
   * the alignment section. We can support this feature reasonably
   * easily because of the hash table for names in the MSA: we
   * only add names to the hash table when we see 'em in the name section.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      while (*s == ' ' || *s == '\t') s++;      /* skip leading whitespace */

      if (*s == '\n' || *s == '\0') continue;   /* skip blank lines, with or without a newline */
      else if (*s == '!') MSAAddComment(msa, s);
      else if ((sp = strstr(s, "Name:")) != NULL)
        {
          /* We take the name and the weight, and that's it */
          sp  += 5;
          tok  = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
          if (tok == NULL)
            Die("No sequence name after Name: on line %d in name section of MSF file %s\n",
                afp->linenumber, afp->fname);
          sqidx = GKIStoreKey(msa->index, tok);
          if (sqidx >= msa->nseqalloc) MSAExpand(msa);
          msa->sqname[sqidx] = sre_strdup(tok, slen);
          msa->nseq++;

          if ((sp = strstr(sp, "Weight:")) == NULL)
            Die("No Weight: on line %d for %s in name section of MSF file %s\n",
                afp->linenumber, msa->sqname[sqidx], afp->fname);
          sp  += 7;
          tok  = sre_strtok(&sp, " \t", &slen);
          if (tok == NULL)
            Die("No value after Weight: on line %d for %s in name section of MSF file %s\n",
                afp->linenumber, msa->sqname[sqidx], afp->fname);
          msa->wgt[sqidx] = atof(tok);
          msa->flags     |= MSA_SET_WGT;
        }
      else if (strncmp(s, "//", 2) == 0)
        break;
      else
        {
          Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
              afp->linenumber, afp->fname, s);
          squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
          return NULL;
        }
    }

  /* And now we're in the sequence section.
   * As discussed above, if we haven't seen a sequence name, then we
   * don't include the sequence in the alignment.
   * Also, watch out for coordinate-only lines.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      sp = s;
      if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
      if ((seq  = sre_strtok(&sp, "\n", &slen)) == NULL) continue;

      /* The test for a coord line: digits starting both fields.
       * <ctype.h> functions need a value representable as unsigned
       * char, so convert before the call.
       */
      if (isdigit((unsigned char) *name) && isdigit((unsigned char) *seq))
        continue;

      /* It's not blank, and it's not a coord line: must be sequence */
      sqidx = GKIKeyIndex(msa->index, name);
      if (sqidx < 0) continue;  /* not a sequence we recognize */

      msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
    }

  /* We've left blanks in the aseqs; take them back out.
   * Compact each sequence in place, shrinking sqlen by one for
   * every space or tab dropped.
   */
  for (sqidx = 0; sqidx < msa->nseq; sqidx++)
    {
      if (msa->aseq[sqidx] == NULL)
        Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);

      for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
        {
          if (*s == ' ' || *s == '\t')
            {
              msa->sqlen[sqidx]--;
            }
          else
            {
              *sp = *s;
              sp++;
            }
        }
      *sp = '\0';
    }

  MSAVerifyParse(msa);          /* verifies, and also sets alen and wgt. */
  return msa;
}