/* Function: MSAGetSeqidx()
 * Date:     SRE, Wed May 19 15:08:25 1999 [St. Louis]
 *
 * Purpose:  Map a sequence name onto its index in an MSA.
 *
 *           The lookup is tried in three stages:
 *             1) if the caller's guess is a valid index and the name
 *                stored there matches, the guess is returned as is;
 *             2) otherwise the name is looked up in the MSA's key index;
 *             3) if it is not there either, the name is treated as new:
 *                it is stored in the key index, the MSA is expanded if
 *                the new index does not fit the current allocation,
 *                a copy of the name is saved in sqname[], and nseq
 *                is incremented.
 *
 * Args:     msa   - alignment object
 *           name  - a sequence name
 *           guess - a guess at the right index, or -1 if no guess.
 *
 * Returns:  seqidx
 */
int
MSAGetSeqidx(MSA *msa, char *name, int guess)
{
  int idx;
  int guess_in_range;

  /* Stage 1: cheap check of the caller's guess. */
  guess_in_range = (guess >= 0 && guess < msa->nseq);
  if (guess_in_range && strcmp(name, msa->sqname[guess]) == 0)
    {
      return guess;
    }

  /* Stage 2: look the name up in the key index. */
  idx = GKIKeyIndex(msa->index, name);

  /* Stage 3: unknown name; register it as a new sequence. */
  if (idx < 0)
    {
      idx = GKIStoreKey(msa->index, name);
      if (idx >= msa->nseqalloc)
	{
	  MSAExpand(msa);
	}
      msa->sqname[idx] = sre_strdup(name, -1);
      msa->nseq += 1;
    }

  return idx;
}
/* Function: ReadA2M()
 * Date:     SRE, Sun Jun  6 17:11:29 1999 [bus from Madison 1999 worm mtg]
 *
 * Purpose:  Parse an alignment read from an open A2M format
 *           alignment file. A2M is a single alignment format.
 *           Return the alignment, or NULL if we've already
 *           read the alignment.
 *
 *           Lines starting with '>' open a new sequence: the first
 *           whitespace-delimited token is its name, and whatever
 *           sre_strtok() yields from the rest of the line (up to the
 *           newline) is stored as its description. Every other line
 *           contributes its first whitespace-delimited token to the
 *           sequence opened most recently; such lines are ignored
 *           until the first '>' line has been seen.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA *  - an alignment object.
 *                    Caller responsible for an MSAFree()
 *           NULL if the file is at EOF or holds no '>' line.
 */
MSA *
ReadA2M(MSAFILE *afp)
{
  MSA  *msa;
  char *line;			/* current input line                     */
  char *seqname = NULL;		/* name on last '>' line; NULL until seen */
  char *description;
  char *residues;
  int   cur = 0;		/* index of the sequence being filled     */
  int   namelen;
  int   desclen;
  int   reslen;

  if (feof(afp->f))
    return NULL;

  msa = MSAAlloc(10, 0);

  for (;;)
    {
      line = MSAFileGetLine(afp);
      if (line == NULL)
	break;

      if (*line != '>')
	{
	  /* Sequence data. Skip it if no header has been seen yet,
	   * or if the line holds no token at all.
	   */
	  if (seqname == NULL)
	    continue;
	  residues = sre_strtok(&line, WHITESPACE, &reslen);
	  if (residues != NULL)
	    msa->sqlen[cur] = sre_strcat(&(msa->aseq[cur]), msa->sqlen[cur], residues, reslen);
	  continue;
	}

      /* Header line: step over the '>' and split name from description. */
      line++;
      seqname = sre_strtok(&line, WHITESPACE, &namelen);
      if (seqname == NULL)
	Die("Blank name in A2M file %s (line %d)\n", afp->fname, afp->linenumber);
      description = sre_strtok(&line, "\n", &desclen);

      /* Register the new sequence, growing the MSA when the index
       * falls outside the current allocation.
       */
      cur = GKIStoreKey(msa->index, seqname);
      if (cur >= msa->nseqalloc)
	MSAExpand(msa);
      msa->sqname[cur] = sre_strdup(seqname, namelen);
      if (description != NULL)
	MSASetSeqDescription(msa, cur, description);
      msa->nseq++;
    }

  /* No header line at all: nothing was parsed. */
  if (seqname == NULL)
    {
      MSAFree(msa);
      return NULL;
    }

  MSAVerifyParse(msa);
  return msa;
}
/* Function: ReadMSF()
 * Date:     SRE, Tue Jun  1 08:07:22 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open MSF format
 *           alignment file. (MSF is a single-alignment format.)
 *           Return the alignment, or NULL if we've already
 *           read the alignment.
 *
 *           The file is consumed in four phases:
 *             1) an optional "!!AA_MULTIPLE_ALIGNMENT" or
 *                "!!NA_MULTIPLE_ALIGNMENT" header line;
 *             2) free-text comments, up to the "MSF: Type: Check: .." line;
 *             3) the name section ("Name: ... Weight: ..." lines),
 *                terminated by a line starting with two slashes;
 *             4) the sequence section, up to end of file.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no more alignments
 *
 * Diagnostics:
 *           Will Die() here with a (potentially) useful message
 *           if a parsing error occurs.
 */
MSA *
ReadMSF(MSAFILE *afp)
{
  MSA    *msa;
  char   *s;
  int     alleged_type;
  char   *tok;
  char   *sp;
  int     slen;
  int     sqidx;
  char   *name;
  char   *seq;

  if (feof(afp->f)) return NULL;
  if ((s = MSAFileGetLine(afp)) == NULL) return NULL;

  /* The first line is the header.
   * This is a new-ish GCG feature. Don't count on it, so
   * we can be a bit more tolerant towards non-GCG software
   * generating "MSF" files.
   *
   * If the file ends right after the header, free the alignment
   * we just allocated before bailing out.
   */
  msa = MSAAlloc(10, 0);
  if      (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
    msa->type = kAmino;
    if ((s = MSAFileGetLine(afp)) == NULL) { MSAFree(msa); return NULL; }
  } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
    msa->type = kRNA;
    if ((s = MSAFileGetLine(afp)) == NULL) { MSAFree(msa); return NULL; }
  }

  /* Now we're in the free text comment section of the MSF file.
   * It ends when we see the "MSF: Type: Check: .." line.
   * This line must be present.
   */
  do
    {
      if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
	  Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
	{
	  /* Strparse() leaves the whole match in sqd_parse[0] and the
	   * parenthesized fields in sqd_parse[1..3]: alignment length,
	   * type letter, checksum. Only the type is used here; the
	   * alleged length and checksum are not verified.
	   */
	  switch (*(sqd_parse[2])) {
	  case 'N' : alleged_type = kRNA;      break;
	  case 'P' : alleged_type = kAmino;    break;
	  case 'X' : alleged_type = kOtherSeq; break;
	  default  : alleged_type = kOtherSeq;
	  }
	  /* A type taken from the "!!" header line takes precedence. */
	  if (msa->type == kOtherSeq) msa->type = alleged_type;
	  break;		/* we're done with comment section. */
	}
      if (! IsBlankline(s))
	MSAAddComment(msa, s);
    } while ((s = MSAFileGetLine(afp)) != NULL);

  /* Now we're in the name section.
   * GCG has a relatively poorly documented feature: only sequences that
   * appear in this list will be read from the alignment section. Commenting
   * out sequences in the name list (by preceding them with "!") is
   * allowed as a means of manually defining subsets of sequences in
   * the alignment section. We can support this feature reasonably
   * easily because of the hash table for names in the MSA: we
   * only add names to the hash table when we see 'em in the name section.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      while (*s == ' ' || *s == '\t') s++;         /* skip leading whitespace */

      if      (*s == '\n' || *s == '\0') continue;  /* skip blank lines */
      else if (*s == '!')  MSAAddComment(msa, s);
      else if ((sp = strstr(s, "Name:")) != NULL)
	{
	  /* We take the name and the weight, and that's it */
	  sp   += 5;
	  tok   = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
	  if (tok == NULL)
	    Die("No sequence name after Name: on line %d in name section of MSF file %s\n",
		afp->linenumber, afp->fname);
	  sqidx = GKIStoreKey(msa->index, tok);
	  if (sqidx >= msa->nseqalloc) MSAExpand(msa);
	  msa->sqname[sqidx] = sre_strdup(tok, slen);
	  msa->nseq++;

	  if ((sp = strstr(sp, "Weight:")) == NULL)
	    Die("No Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx], afp->fname);
	  sp += 7;
	  tok = sre_strtok(&sp, " \t", &slen);
	  if (tok == NULL)
	    Die("No value after Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx], afp->fname);
	  msa->wgt[sqidx] = atof(tok);
	  msa->flags |= MSA_SET_WGT;
	}
      else if (strncmp(s, "//", 2) == 0)
	break;
      else
	{
	  Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
	      afp->linenumber, afp->fname, s);
	  /* Die() is not expected to return; the lines below are a
	   * fallback kept from the original code.
	   */
	  squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
	  MSAFree(msa);
	  return NULL;
	}
    }

  /* And now we're in the sequence section.
   * As discussed above, if we haven't seen a sequence name, then we
   * don't include the sequence in the alignment.
   * Also, watch out for coordinate-only lines.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      sp  = s;
      if ((name = sre_strtok(&sp, " \t", NULL))  == NULL) continue;
      if ((seq  = sre_strtok(&sp, "\n", &slen))  == NULL) continue;

      /* The test for a coord line: digits starting both fields.
       * Go through unsigned char so a negative char value is never
       * handed to isdigit().
       */
      if (isdigit((unsigned char) *name) && isdigit((unsigned char) *seq))
	continue;

      /* It's not blank, and it's not a coord line: must be sequence */
      sqidx = GKIKeyIndex(msa->index, name);
      if (sqidx < 0) continue;	/* not a sequence we recognize */

      msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
    }

  /* We've left blanks in the aseqs; take them back out.
   * s reads, sp writes; sqlen is decremented once per dropped blank.
   */
  for (sqidx = 0; sqidx < msa->nseq; sqidx++)
    {
      if (msa->aseq[sqidx] == NULL)
	Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);

      for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
	{
	  if (*s == ' ' || *s == '\t') {
	    msa->sqlen[sqidx]--;
	  } else {
	    *sp = *s;
	    sp++;
	  }
	}
      *sp = '\0';
    }

  MSAVerifyParse(msa);		/* verifies, and also sets alen and wgt. */
  return msa;
}