/* Function: ReadPhylip() * Date: SRE, Fri Jun 18 12:59:37 1999 [Sanger Centre] * * Purpose: Parse an alignment from an open Phylip format * alignment file. Phylip is a single-alignment format. * Return the alignment, or NULL if we have no data. * * Args: afp - open alignment file * * Returns: MSA * - an alignment object * Caller responsible for an MSAFree() * NULL if no more alignments */ MSA * ReadPhylip(MSAFILE *afp) { MSA *msa; char *s, *s1, *s2; char name[11]; /* seq name max len = 10 char */ int nseq, alen; int idx; /* index of current sequence */ int slen; int nblock; if (feof(afp->f)) return NULL; /* Skip until we see a nonblank line; it's the header, * containing nseq/alen */ nseq = 0; alen = 0; while ((s = MSAFileGetLine(afp)) != NULL) { if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) Die("Failed to parse nseq/alen from first line of PHYLIP file %s\n", afp->fname); if (! IsInt(s1) || ! IsInt(s2)) Die("nseq and/or alen not an integer in first line of PHYLIP file %s\n", afp->fname); nseq = atoi(s1); alen = atoi(s2); break; } msa = MSAAlloc(nseq, 0); idx = 0; nblock = 0; while ((s = MSAFileGetLine(afp)) != NULL) { /* ignore blank lines. nonblank lines start w/ nonblank char */ if (isspace(*s)) continue; /* First block has seq names */ if (nblock == 0) { strncpy(name, s, 10); name[10] = '\0'; GKIStoreKey(msa->index, name); msa->sqname[idx] = sre_strdup(name, -1); s += 10; } /* be careful of trailing whitespace on lines */ if ((s1 = sre_strtok(&s, WHITESPACE, &slen)) == NULL) Die("Failed to parse sequence at line %d of PHYLIP file %s\n", afp->linenumber, afp->fname); msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], s1, slen); idx++; if (idx == nseq) { idx = 0; nblock++; } } msa->nseq = nseq; MSAVerifyParse(msa); /* verifies; sets alen, wgt; frees sqlen[] */ return msa; }
/* Function: ReadClustal() * Date: SRE, Sun Jun 6 17:53:49 1999 [bus from Madison, 1999 worm mtg] * * Purpose: Parse an alignment read from an open Clustal format * alignment file. Clustal is a single-alignment format. * Return the alignment, or NULL if we have no data. * * Args: afp - open alignment file * * Returns: MSA * - an alignment object * caller responsible for an MSAFree() * NULL if no more alignments * * Diagnostics: * Will Die() here with a (potentially) useful message * if a parsing error occurs. */ MSA * ReadClustal(MSAFILE *afp) { MSA *msa; char *s; int slen; int sqidx; char *name; char *seq; char *s2; if (feof(afp->f)) return NULL; /* Skip until we see the CLUSTAL header */ while ((s = MSAFileGetLine(afp)) != NULL) { if (strncmp(s, "CLUSTAL", 7) == 0 && strstr(s, "multiple sequence alignment") != NULL) break; } if (s == NULL) return NULL; msa = MSAAlloc(10, 0); /* Now we're in the sequence section. * As discussed above, if we haven't seen a sequence name, then we * don't include the sequence in the alignment. * Watch out for conservation markup lines that contain *.: chars */ while ((s = MSAFileGetLine(afp)) != NULL) { if ((name = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; if ((seq = sre_strtok(&s, WHITESPACE, &slen)) == NULL) continue; s2 = sre_strtok(&s, "\n", NULL); /* The test for a conservation markup line */ if (strpbrk(name, ".*:") != NULL && strpbrk(seq, ".*:") != NULL) continue; if (s2 != NULL) Die("Parse failed at line %d, file %s: possibly using spaces as gaps", afp->linenumber, afp->fname); /* It's not blank, and it's not a coord line: must be sequence */ sqidx = MSAGetSeqidx(msa, name, msa->lastidx+1); msa->lastidx = sqidx; msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen); } MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */ return msa; }
/* Function: ReadA2M() * Date: SRE, Sun Jun 6 17:11:29 1999 [bus from Madison 1999 worm mtg] * * Purpose: Parse an alignment read from an open A2M format * alignment file. A2M is a single alignment format. * Return the alignment, or NULL if we've already * read the alignment. * * Args: afp - open alignment file * * Returns: MSA * - an alignment object. * Caller responsible for an MSAFree() */ MSA * ReadA2M(MSAFILE *afp) { MSA *msa; char *buf; char *name; char *desc; char *seq; int idx; int len1, len2; if (feof(afp->f)) return NULL; name = NULL; msa = MSAAlloc(10, 0); idx = 0; while ((buf = MSAFileGetLine(afp)) != NULL) { if (*buf == '>') { buf++; /* skip the '>' */ if ((name = sre_strtok(&buf, WHITESPACE, &len1)) == NULL) Die("Blank name in A2M file %s (line %d)\n", afp->fname, afp->linenumber); desc = sre_strtok(&buf, "\n", &len2); idx = GKIStoreKey(msa->index, name); if (idx >= msa->nseqalloc) MSAExpand(msa); msa->sqname[idx] = sre_strdup(name, len1); if (desc != NULL) MSASetSeqDescription(msa, idx, desc); msa->nseq++; } else if (name != NULL) { if ((seq = sre_strtok(&buf, WHITESPACE, &len1)) == NULL) continue; msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], seq, len1); } } if (name == NULL) { MSAFree(msa); return NULL; } MSAVerifyParse(msa); return msa; }
/* Function: ReadMSF() * Date: SRE, Tue Jun 1 08:07:22 1999 [St. Louis] * * Purpose: Parse an alignment read from an open MSF format * alignment file. (MSF is a single-alignment format.) * Return the alignment, or NULL if we've already * read the alignment. * * Args: afp - open alignment file * * Returns: MSA * - an alignment object * caller responsible for an MSAFree() * NULL if no more alignments * * Diagnostics: * Will Die() here with a (potentially) useful message * if a parsing error occurs. */ MSA * ReadMSF(MSAFILE *afp) { MSA *msa; char *s; int alleged_alen; int alleged_type; int alleged_checksum; char *tok; char *sp; int slen; int sqidx; char *name; char *seq; if (feof(afp->f)) return NULL; if ((s = MSAFileGetLine(afp)) == NULL) return NULL; /* The first line is the header. * This is a new-ish GCG feature. Don't count on it, so * we can be a bit more tolerant towards non-GCG software * generating "MSF" files. */ msa = MSAAlloc(10, 0); if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) { msa->type = kAmino; if ((s = MSAFileGetLine(afp)) == NULL) return NULL; } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) { msa->type = kRNA; if ((s = MSAFileGetLine(afp)) == NULL) return NULL; } /* Now we're in the free text comment section of the MSF file. * It ends when we see the "MSF: Type: Check: .." line. * This line must be present. */ do { if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) && Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3)) { alleged_alen = atoi(sqd_parse[0]); switch (*(sqd_parse[1])) { case 'N' : alleged_type = kRNA; break; case 'P' : alleged_type = kAmino; break; case 'X' : alleged_type = kOtherSeq; break; default : alleged_type = kOtherSeq; } alleged_checksum = atoi(sqd_parse[3]); if (msa->type == kOtherSeq) msa->type = alleged_type; break; /* we're done with comment section. */ } if (! IsBlankline(s)) MSAAddComment(msa, s); } while ((s = MSAFileGetLine(afp)) != NULL); /* Now we're in the name section. * GCG has a relatively poorly documented feature: only sequences that * appear in this list will be read from the alignment section. Commenting * out sequences in the name list (by preceding them with "!") is * allowed as a means of manually defining subsets of sequences in * the alignment section. We can support this feature reasonably * easily because of the hash table for names in the MSA: we * only add names to the hash table when we see 'em in the name section. */ while ((s = MSAFileGetLine(afp)) != NULL) { while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */ if (*s == '\n') continue; /* skip blank lines */ else if (*s == '!') MSAAddComment(msa, s); else if ((sp = strstr(s, "Name:")) != NULL) { /* We take the name and the weigh, and that's it */ sp += 5; tok = sre_strtok(&sp, " \t", &slen); /* <sequence name> */ sqidx = GKIStoreKey(msa->index, tok); if (sqidx >= msa->nseqalloc) MSAExpand(msa); msa->sqname[sqidx] = sre_strdup(tok, slen); msa->nseq++; if ((sp = strstr(sp, "Weight:")) == NULL) Die("No Weight: on line %d for %s in name section of MSF file %s\n", afp->linenumber, msa->sqname[sqidx], afp->fname); sp += 7; tok = sre_strtok(&sp, " \t", &slen); msa->wgt[sqidx] = atof(tok); msa->flags |= MSA_SET_WGT; } else if (strncmp(s, "//", 2) == 0) break; else { Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n", afp->linenumber, afp->fname, s); squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */ return NULL; } } /* And now we're in the sequence section. * As discussed above, if we haven't seen a sequence name, then we * don't include the sequence in the alignment. * Also, watch out for coordinate-only lines. */ while ((s = MSAFileGetLine(afp)) != NULL) { sp = s; if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue; if ((seq = sre_strtok(&sp, "\n", &slen)) == NULL) continue; /* The test for a coord line: digits starting both fields */ if (isdigit((int) *name) && isdigit((int) *seq)) continue; /* It's not blank, and it's not a coord line: must be sequence */ sqidx = GKIKeyIndex(msa->index, name); if (sqidx < 0) continue; /* not a sequence we recognize */ msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen); } /* We've left blanks in the aseqs; take them back out. */ for (sqidx = 0; sqidx < msa->nseq; sqidx++) { if (msa->aseq[sqidx] == NULL) Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname); for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++) { if (*s == ' ' || *s == '\t') { msa->sqlen[sqidx]--; } else { *sp = *s; sp++; } } *sp = '\0'; } MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */ return msa; }