/* Function: ReadPhylip()
 * Date:     SRE, Fri Jun 18 12:59:37 1999 [Sanger Centre]
 *
 * Purpose:  Parse an alignment from an open Phylip format
 *           alignment file. Phylip is a single-alignment format.
 *           Return the alignment, or NULL if we have no data.
 *
 *           The first nonblank line is the header and must hold two
 *           integer fields (nseq, alen). Only nseq is used here; the
 *           second field is just checked to be an integer.
 *           Sequence lines then come in blocks of nseq lines. In the
 *           first block every line starts with a 10-character name
 *           field. Lines that start with whitespace are skipped.
 *
 * Args:     afp - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   Caller responsible for an MSAFree()
 *           NULL if the file is at EOF or holds no header line
 *
 * Diagnostics:
 *           Calls Die() if the header cannot be parsed, if nseq is
 *           not positive, if a first-block line is too short to hold
 *           the name field, or if a line carries no sequence.
 */
MSA *
ReadPhylip(MSAFILE *afp)
{
  MSA  *msa;
  char *s, *s1, *s2;
  char  name[11];               /* seq name max len = 10 char */
  int   nseq;
  int   have_header;            /* TRUE once the nseq/alen line was parsed */
  int   idx;                    /* index of current sequence */
  int   slen;
  int   nblock;

  if (feof(afp->f)) return NULL;

  /* Skip until we see a nonblank line; it's the header,
   * containing nseq/alen
   */
  nseq        = 0;
  have_header = 0;
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue;
      if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
        Die("Failed to parse nseq/alen from first line of PHYLIP file %s\n", afp->fname);
      if (! IsInt(s1) || ! IsInt(s2))
        Die("nseq and/or alen not an integer in first line of PHYLIP file %s\n", afp->fname);
      nseq        = atoi(s1);
      have_header = 1;
      break;
    }

  /* Nothing but blank lines: there is no alignment to return. */
  if (! have_header) return NULL;

  /* idx wraps back to 0 only when it reaches nseq, so a non-positive
   * nseq would let it run past the arrays sized by MSAAlloc(nseq, 0).
   */
  if (nseq <= 0)
    Die("Invalid number of sequences (%d) in first line of PHYLIP file %s\n",
        nseq, afp->fname);

  msa    = MSAAlloc(nseq, 0);
  idx    = 0;
  nblock = 0;
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      /* ignore blank lines. nonblank lines start w/ nonblank char.
       * <ctype.h> functions need an unsigned char value.
       */
      if (*s == '\0' || isspace((unsigned char) *s)) continue;

      /* First block has seq names */
      if (nblock == 0)
        {
          /* The name field is exactly 10 chars wide; on a shorter line
           * s += 10 would step over the terminating NUL.
           */
          if (strlen(s) < 10)
            Die("Failed to parse sequence at line %d of PHYLIP file %s\n",
                afp->linenumber, afp->fname);
          memcpy(name, s, 10);
          name[10] = '\0';
          GKIStoreKey(msa->index, name);
          msa->sqname[idx] = sre_strdup(name, -1);
          s += 10;
        }

      /* be careful of trailing whitespace on lines */
      if ((s1 = sre_strtok(&s, WHITESPACE, &slen)) == NULL)
        Die("Failed to parse sequence at line %d of PHYLIP file %s\n",
            afp->linenumber, afp->fname);
      msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], s1, slen);

      idx++;
      if (idx == nseq) { idx = 0; nblock++; }
    }
  msa->nseq = nseq;
  MSAVerifyParse(msa);          /* verifies; sets alen, wgt; frees sqlen[] */
  return msa;
}
/* Function: ReadClustal()
 * Date:     SRE, Sun Jun  6 17:53:49 1999 [bus from Madison, 1999 worm mtg]
 *
 * Purpose:  Read one alignment from an open Clustal format file.
 *           Clustal files hold a single alignment.
 *
 *           Lines are skipped until one starts with "CLUSTAL" and also
 *           contains "multiple sequence alignment". Every later line
 *           with at least two whitespace-separated fields is taken as
 *           <name> <sequence>, unless both fields contain one of the
 *           characters ".*:" (a conservation markup line).
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if the file is at EOF or no CLUSTAL header is found
 *
 * Diagnostics:
 *           Calls Die() when a sequence line has a third field
 *           (typically spaces used as gap characters).
 */
MSA *
ReadClustal(MSAFILE *afp)
{
  MSA  *aln;
  char *line;
  char *label;                  /* first field: sequence name      */
  char *residues;               /* second field: aligned residues  */
  char *extra;                  /* anything left after field two   */
  int   reslen;
  int   row;
  int   is_markup;

  if (feof(afp->f))
    return NULL;

  /* Hunt for the CLUSTAL header line; give up at EOF. */
  for (;;)
    {
      line = MSAFileGetLine(afp);
      if (line == NULL)
        return NULL;
      if (strncmp(line, "CLUSTAL", 7) != 0)
        continue;
      if (strstr(line, "multiple sequence alignment") == NULL)
        continue;
      break;
    }

  aln = MSAAlloc(10, 0);

  /* Sequence section: one <name> <sequence> pair per line. */
  while ((line = MSAFileGetLine(afp)) != NULL)
    {
      label = sre_strtok(&line, WHITESPACE, NULL);
      if (label == NULL)
        continue;               /* blank line */

      residues = sre_strtok(&line, WHITESPACE, &reslen);
      if (residues == NULL)
        continue;               /* single field: not a sequence line */

      extra = sre_strtok(&line, "\n", NULL);

      /* Conservation markup: both fields carry one of . * : */
      is_markup = (strpbrk(label, ".*:") != NULL) &&
                  (strpbrk(residues, ".*:") != NULL);
      if (is_markup)
        continue;

      if (extra != NULL)
        Die("Parse failed at line %d, file %s: possibly using spaces as gaps",
            afp->linenumber, afp->fname);

      /* Anything that reaches here is sequence data: append it to the
       * row for this name, using the previous row + 1 as the guess.
       */
      row = MSAGetSeqidx(aln, label, aln->lastidx + 1);
      aln->lastidx = row;
      aln->sqlen[row] = sre_strcat(&(aln->aseq[row]), aln->sqlen[row],
                                   residues, reslen);
    }

  MSAVerifyParse(aln);          /* verifies, and also sets alen and wgt. */
  return aln;
}
/* Function: ReadA2M()
 * Date:     SRE, Sun Jun  6 17:11:29 1999 [bus from Madison 1999 worm mtg]
 *
 * Purpose:  Read one alignment from an open A2M format file.
 *           A2M is a single alignment format.
 *
 *           A line starting with '>' opens a new sequence: the first
 *           field after '>' is the name, the rest of the line (if any)
 *           is the description. Every other line contributes its first
 *           whitespace-delimited field to the current sequence. Lines
 *           seen before the first '>' line are ignored.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object.
 *                   Caller responsible for an MSAFree()
 *           NULL if the file is at EOF or no '>' line was found
 *
 * Diagnostics:
 *           Calls Die() on a '>' line without a name.
 */
MSA *
ReadA2M(MSAFILE *afp)
{
  MSA  *aln;
  char *line;
  char *label   = NULL;         /* last name parsed; NULL until first '>' */
  char *comment;
  char *chunk;
  int   row     = 0;
  int   labellen;
  int   commentlen;
  int   chunklen;

  if (feof(afp->f))
    return NULL;

  aln = MSAAlloc(10, 0);

  while ((line = MSAFileGetLine(afp)) != NULL)
    {
      if (*line != '>')
        {
          /* Sequence data; only meaningful once a name has been seen. */
          if (label == NULL)
            continue;
          chunk = sre_strtok(&line, WHITESPACE, &chunklen);
          if (chunk == NULL)
            continue;
          aln->sqlen[row] = sre_strcat(&(aln->aseq[row]), aln->sqlen[row],
                                       chunk, chunklen);
          continue;
        }

      /* Name line. */
      line++;                   /* step over the '>' */
      label = sre_strtok(&line, WHITESPACE, &labellen);
      if (label == NULL)
        Die("Blank name in A2M file %s (line %d)\n", afp->fname, afp->linenumber);
      comment = sre_strtok(&line, "\n", &commentlen);

      row = GKIStoreKey(aln->index, label);
      if (row >= aln->nseqalloc)
        MSAExpand(aln);

      aln->sqname[row] = sre_strdup(label, labellen);
      if (comment != NULL)
        MSASetSeqDescription(aln, row, comment);
      aln->nseq++;
    }

  /* No name line at all: nothing was read. */
  if (label == NULL)
    {
      MSAFree(aln);
      return NULL;
    }

  MSAVerifyParse(aln);
  return aln;
}
/**
 * @brief Write alignment to file.
 *
 * @param[in] mseq
 * The mseq_t struct containing the aligned sequences
 * @param[in] pcAlnOutfile
 * The name of the output file; NULL writes to stdout
 * @param[in] outfmt
 * The alignment output format (defined in squid.h)
 * @param[in] iWrap
 * length of line for Clustal/Fasta format
 * @param[in] bResno
 * Passed on to WriteClustal() as 1 when TRUE and 0 otherwise
 * (presumably toggles residue numbering -- confirm in WriteClustal())
 *
 * @return Non-zero on error: unknown output format, no sequences,
 * output file cannot be opened, output file cannot be closed cleanly,
 * or an output format that is not handled here.
 *
 * @note We create a temporary squid MSA struct in here because we never
 * use it within clustal. We might be better of using the old clustal
 * output routines instead.
 *
 */
int
WriteAlignment(mseq_t *mseq, const char *pcAlnOutfile, int outfmt, int iWrap, bool bResno)
{
    int i; /* aux */
    MSA *msa; /* squid's alignment structure */
    FILE *pfOut = NULL;
    int key; /* MSA struct internal index for sequence */
    int alen; /* alignment length */
    int iRet = 0; /* return value; set to -1 on any late failure */
    bool use_stdout;

    assert(mseq!=NULL);

    if (MSAFILE_UNKNOWN == outfmt) {
        Log(&rLog, LOG_ERROR, "Unknown output format chosen");
        return -1;
    }

    /* The alignment length is read from mseq->seq[0] below, so there
     * must be at least one sequence. Checked before the output file is
     * opened so that no empty file is left behind. */
    if (mseq->nseqs < 1) {
        Log(&rLog, LOG_ERROR, "No sequences to write");
        return -1;
    }

    if (NULL == pcAlnOutfile) {
        pfOut = stdout;
        use_stdout = TRUE;
    } else {
        use_stdout = FALSE;
        if (NULL == (pfOut = fopen(pcAlnOutfile, "w"))) {
            Log(&rLog, LOG_ERROR, "Could not open file %s for writing", pcAlnOutfile);
            return -1;
        }
    }

    /* derive alignment length from first seq */
    alen = (int) strlen(mseq->seq[0]);

    msa = MSAAlloc(mseq->nseqs, alen);

    /* basic structure borrowed code from squid-1.9g/a2m.c:ReadA2M()
     * we actually create a copy of mseq. keeping the pointers becomes
     * messy when calling MSAFree()
     */
    for (i=0; i<mseq->nseqs; i++) {
        char *this_name = NULL; /* mseq sequence name */
        char *this_seq = NULL; /* mseq sequence */
        SQINFO *this_sqinfo = NULL; /* mseq sequence info */
        int iI;

        /* mseq->tree_order encodes to order in which sequences are listed
           in the guide-tree, if the user wants the sequence output in
           the input-order then mseq->tree_order==NULL, otherwise
           mseq->tree_order!=NULL, containing the indices of the
           sequences, FS, r274 -> */
        iI = (NULL == mseq->tree_order) ? i : mseq->tree_order[i];

        this_name = mseq->sqinfo[iI].name;
        this_seq = mseq->seq[iI];
        this_sqinfo = &mseq->sqinfo[iI];

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));

        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));

        if (this_sqinfo->flags & SQINFO_DESC) {
            /* FIXME never get here ... */
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        /* FIXME extend this by copying more stuff according to flags.
         * See MSAFileRead() in msa.c and used functions there
         *
         * Problem is that we never parse MSA information as we use squid'sSeqFile
         */

        msa->nseq++;

    } /* 0 <= i < mseq->nseqs */

    /* FIXME Would like to, but can't use MSAVerifyParse(msa) here, as it
     * will die on error. Need to implement our own version
     */

    /* The below is copy of MSAFileWrite() which originally only writes to stdout.
     */

    /* Be sloppy and make a2m and fasta the same. same for vienna (which is
       the same). same same. can can. boleh boleh */
    if (outfmt==SQFILE_FASTA)
        outfmt = MSAFILE_A2M;
    if (outfmt==SQFILE_VIENNA)
        outfmt = MSAFILE_VIENNA;

    switch (outfmt) {
    case MSAFILE_A2M:
        WriteA2M(pfOut, msa, iWrap);
        break;
    case MSAFILE_VIENNA:
        /* INT_MAX as line width: sequences are not wrapped */
        WriteA2M(pfOut, msa, INT_MAX);
        break;
    case MSAFILE_CLUSTAL:
        WriteClustal(pfOut, msa, iWrap, TRUE==bResno ? 1 : 0, mseq->seqtype);
        break;
    case MSAFILE_MSF:
        WriteMSF(pfOut, msa);
        break;
    case MSAFILE_PHYLIP:
        WritePhylip(pfOut, msa);
        break;
    case MSAFILE_SELEX:
        WriteSELEX(pfOut, msa);
        break;
    case MSAFILE_STOCKHOLM:
        WriteStockholm(pfOut, msa);
        break;
    default:
        Log(&rLog, LOG_FATAL, "internal error: %s",
            "invalid output format should have been detected before");
        /* Only reached if Log() returns; nothing was written. */
        iRet = -1;
    }

    if (use_stdout == FALSE) {
        /* fclose() flushes buffered output, so a failure here means the
         * file on disk may be incomplete. */
        if (0 != fclose(pfOut)) {
            Log(&rLog, LOG_ERROR, "Could not close file %s after writing", pcAlnOutfile);
            iRet = -1;
        } else if (0 == iRet) {
            Log(&rLog, LOG_INFO,
                "Alignment written to %s", pcAlnOutfile);
        }
    }
    MSAFree(msa);

    return iRet;
}
/* Function: ReadMSF()
 * Date:     SRE, Tue Jun  1 08:07:22 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open MSF format
 *           alignment file. (MSF is a single-alignment format.)
 *           Return the alignment, or NULL if we've already
 *           read the alignment.
 *
 *           The file is read in four steps: an optional
 *           "!!AA_MULTIPLE_ALIGNMENT" / "!!NA_MULTIPLE_ALIGNMENT" first
 *           line, free-text comments up to the "MSF: .. Type: .. Check: .."
 *           line, the name section up to "//", and the sequence section.
 *           The length and checksum on the MSF: line are not checked.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no more alignments
 *
 * Diagnostics:
 *           Will Die() here with a (potentially) useful message
 *           if a parsing error occurs.
 */
MSA *
ReadMSF(MSAFILE *afp)
{
  MSA    *msa;
  char   *s;
  int     header_type;          /* type announced by a "!!" first line   */
  int     have_header_type;     /* TRUE if such a first line was present */
  int     alleged_type;         /* type announced on the MSF: line       */
  char   *tok;
  char   *sp;
  int     slen;
  int     sqidx;
  char   *name;
  char   *seq;

  if (feof(afp->f)) return NULL;
  if ((s = MSAFileGetLine(afp)) == NULL) return NULL;

  /* The first line is the header.
   * This is a new-ish GCG feature. Don't count on it, so
   * we can be a bit more tolerant towards non-GCG software
   * generating "MSF" files.
   *
   * The alignment is allocated only after this step, so that the
   * early returns below have nothing to free.
   */
  have_header_type = 0;
  header_type      = kOtherSeq;
  if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
    header_type      = kAmino;
    have_header_type = 1;
  } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
    header_type      = kRNA;
    have_header_type = 1;
  }
  if (have_header_type && (s = MSAFileGetLine(afp)) == NULL) return NULL;

  msa = MSAAlloc(10, 0);
  if (have_header_type) msa->type = header_type;

  /* Now we're in the free text comment section of the MSF file.
   * It ends when we see the "MSF: Type: Check: .." line.
   * This line must be present.
   */
  do
    {
      if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
          Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
        {
          /* Three sub-matches are requested: length, type letter, checksum.
           * The original code read slot 3 for the checksum, so slot 0 is
           * taken to be the whole match and the type letter is slot 2.
           * NOTE(review): confirm the slot numbering against Strparse().
           */
          switch (*(sqd_parse[2])) {
          case 'N' : alleged_type = kRNA;      break;
          case 'P' : alleged_type = kAmino;    break;
          case 'X' : alleged_type = kOtherSeq; break;
          default  : alleged_type = kOtherSeq;
          }
          if (msa->type == kOtherSeq) msa->type = alleged_type;
          break;                /* we're done with comment section. */
        }
      if (! IsBlankline(s))
        MSAAddComment(msa, s);
    } while ((s = MSAFileGetLine(afp)) != NULL);

  /* Now we're in the name section.
   * GCG has a relatively poorly documented feature: only sequences that
   * appear in this list will be read from the alignment section. Commenting
   * out sequences in the name list (by preceding them with "!") is
   * allowed as a means of manually defining subsets of sequences in
   * the alignment section. We can support this feature reasonably
   * easily because of the hash table for names in the MSA: we
   * only add names to the hash table when we see 'em in the name section.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */

      if      (*s == '\n')   continue;                 /* skip blank lines */
      else if (*s == '!')    MSAAddComment(msa, s);
      else if ((sp  = strstr(s, "Name:")) != NULL)
        {
          /* We take the name and the weight, and that's it */
          sp   += 5;
          if ((tok = sre_strtok(&sp, " \t", &slen)) == NULL) /* <sequence name> */
            Die("No sequence name on line %d in name section of MSF file %s\n",
                afp->linenumber, afp->fname);
          sqidx = GKIStoreKey(msa->index, tok);
          if (sqidx >= msa->nseqalloc) MSAExpand(msa);
          msa->sqname[sqidx] = sre_strdup(tok, slen);
          msa->nseq++;

          if ((sp = strstr(sp, "Weight:")) == NULL)
            Die("No Weight: on line %d for %s in name section of MSF file %s\n",
                afp->linenumber, msa->sqname[sqidx],  afp->fname);
          sp += 7;
          if ((tok = sre_strtok(&sp, " \t", &slen)) == NULL)
            Die("No weight value on line %d for %s in name section of MSF file %s\n",
                afp->linenumber, msa->sqname[sqidx],  afp->fname);
          msa->wgt[sqidx] = atof(tok);
          msa->flags |= MSA_SET_WGT;
        }
      else if (strncmp(s, "//", 2) == 0)
        break;
      else
        {
          Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
              afp->linenumber, afp->fname, s);
          squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
          return NULL;
        }

    }

  /* And now we're in the sequence section.
   * Only names stored during the name section are accepted; lines for
   * any other name are skipped.
   * Also, watch out for coordinate-only lines.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      sp  = s;
      if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
      if ((seq  = sre_strtok(&sp, "\n", &slen)) == NULL) continue;

      /* The test for a coord line: digits starting both fields.
       * <ctype.h> functions need an unsigned char value.
       */
      if (isdigit((unsigned char) *name) && isdigit((unsigned char) *seq))
        continue;

      /* It's not blank, and it's not a coord line: must be sequence
       */
      sqidx = GKIKeyIndex(msa->index, name);
      if (sqidx < 0) continue;  /* not a sequence we recognize */

      msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
    }

  /* We've left blanks in the aseqs; take them back out.
   * Each sequence is compacted in place and sqlen is reduced by one
   * for every blank or tab removed.
   */
  for (sqidx = 0; sqidx <  msa->nseq; sqidx++)
    {
      if (msa->aseq[sqidx] == NULL)
        Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);

      for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
        {
          if (*s == ' ' || *s == '\t') {
            msa->sqlen[sqidx]--;
          } else {
            *sp = *s;
            sp++;
          }
        }
      *sp = '\0';
    }

  MSAVerifyParse(msa);          /* verifies, and also sets alen and wgt. */
  return msa;
}
/* Function: ReadSELEX()
 * Date:     SRE, Sun Jun  6 18:24:09 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open SELEX format
 *           alignment file. (SELEX is a single alignment format).
 *           Return the alignment, or NULL if we've already read the
 *           alignment or there's no alignment data in the file.
 *
 *           The file is read twice. The first pass counts blocks and
 *           sequences, stores the names from the first block, records
 *           the leftmost and rightmost used column of every block, and
 *           notes which #=CS, #=RF, #=SS and #=SA lines are present.
 *           The file is then rewound. The second pass reads the header
 *           lines (#=AU, #=ID, #=AC, #=DE, #=GA, #=TC, #=NC, #=SQ) and
 *           copies the sequence and annotation lines block by block.
 *           The result is built as an AINFO plus an array of aligned
 *           strings and converted with MSAFromAINFO() at the end.
 *
 * Limitations: SELEX is the only remaining multipass parser for
 *           alignment files. It cannot read from gzip or from stdin.
 *           It Die()'s here if you try. The reason for this
 *           that SELEX allows space characters as gaps, so we don't
 *           know the borders of an alignment block until we've seen
 *           the whole block. I could rewrite to allow single-pass
 *           parsing (by storing the whole block in memory) but
 *           since SELEX is now legacy, why bother.
 *
 *           Note that the interface is totally kludged: fastest
 *           possible adaptation of old ReadSELEX() to the new
 *           MSA interface.
 *
 *           Lines are read with fgets() into buffers of LINEBUFLEN
 *           chars and split with strtok().
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA *  - an alignment object
 *                    caller responsible for an MSAFree()
 *           NULL if no alignment data. When the file ends before the
 *           first data line, squid_errno is set to SQERR_NODATA.
 *
 * Diagnostics:
 *           Calls Die() on gzip/stdin input, when a block has a
 *           different number of sequences than the first block, and on
 *           malformed header lines. Calls Warn() when names differ
 *           between blocks or between #=SQ lines and the first block.
 */
MSA *
ReadSELEX(MSAFILE *afp)
{
  MSA     *msa;                 /* RETURN: mult seq alignment   */
  FILE    *fp;                  /* ptr to opened seqfile        */
  char   **aseqs;               /* aligned seqs                 */
  int      num = 0;             /* number of seqs read          */
  char     buffer[LINEBUFLEN];  /* input buffer for lines       */
  char     bufcpy[LINEBUFLEN];  /* strtok'able copy of buffer   */
  struct block_struc {          /** alignment data for a block: */
    int lcol;                   /* furthest left aligned sym    */
    int rcol;                   /* furthest right aligned sym   */
  } *blocks = NULL;
  int      blocknum;            /* number of blocks in file     */
  char    *nptr;                /* ptr to start of name on line */
  char    *sptr;                /* ptr into sequence on line    */
  int      currnum;             /* num. seqs in given block     */
  int      currblock;           /* index for blocks             */
  int      i;                   /* loop counter                 */
  int      seqidx;              /* counter for seqs             */
  int      alen;                /* length of alignment          */
  int      warn_names;          /* becomes TRUE if names don't match between blocks */
  int      headnum;             /* seqidx in per-sequence header info */
  int      currlen;             /* columns already filled by earlier blocks */
  int      count;               /* non-gap characters in one sequence */
  int      have_cs = 0;         /* set to 1 when a #=CS line is seen */
  int      have_rf = 0;         /* set to 1 when a #=RF line is seen */
  AINFO    base_ainfo, *ainfo;  /* hack: used to be passed ptr to AINFO */

  /* Convert from MSA interface to what old ReadSELEX() did:
   *     - copy our open fp, rather than opening file
   *     - verify that we're not reading a gzip or stdin
   */
  if (feof(afp->f)) return NULL;
  if (afp->do_gzip || afp->do_stdin)
    Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file");
  fp    = afp->f;
  ainfo = &base_ainfo;

  /***************************************************
   * First pass across file.
   * Count seqs, get names, determine column info
   * Determine what sorts of info are active in this file.
   ***************************************************/

  InitAinfo(ainfo);

  /* get first line of the block (non-comment, non-blank).
   * #=CS and #=RF lines met on the way are noted.
   * EOF here means the file has no data: return a null pointer.
   */
  do
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
        { squid_errno = SQERR_NODATA; return 0; }
      strcpy(bufcpy, buffer);
      if (*buffer == '#')
        {
          if      (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1;
          else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1;
        }
    }
  while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
         (strchr(commentsyms, *nptr) != NULL));

  blocknum   = 0;
  warn_names = FALSE;
  while (!feof(fp))
    {
      /* allocate for info about this block.
       * lcol starts above any possible column and rcol below, so the
       * first line seen sets both.
       */
      if (blocknum == 0)
        blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
      else
        blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
      blocks[blocknum].lcol = LINEBUFLEN+1;
      blocks[blocknum].rcol = -1;

      currnum = 0;
      while (nptr != NULL)      /* becomes NULL when this block ends. */
        {
          /* First block only: save names */
          if (blocknum == 0)
            {
              if (currnum == 0)
                ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
              else
                ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));

              ainfo->sqinfo[currnum].flags = 0;
              SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
            }
          else                  /* in each additional block: check names */
            {
              if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
                warn_names = TRUE;
            }
          currnum++;

          /* check rcol, lcol */
          if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
            {
              /* is this the furthest left we've seen word 2 in this block?
               * (offset of the second field within the line)
               */
              if (sptr - bufcpy < blocks[blocknum].lcol)
                blocks[blocknum].lcol = sptr - bufcpy;
              /* look for right side in buffer: walk back from the end of
               * the line over trailing whitespace
               */
              for (sptr = buffer + strlen(buffer) - 1;
                   strchr(WHITESPACE, *sptr) != NULL;
                   sptr --)
                /* do nothing */ ;
              if (sptr - buffer > blocks[blocknum].rcol)
                blocks[blocknum].rcol = sptr - buffer;
            }

          /* get the next line; blank line means end of block.
           * #=SS / #=SA lines flag the sequence just read (currnum-1);
           * #=CS / #=RF lines are noted for the whole alignment.
           * Other comment lines are skipped.
           */
          do
            {
              if (fgets(buffer, LINEBUFLEN, fp) == NULL)
                { nptr = NULL; break; }
              strcpy(bufcpy, buffer);

              if      (strncmp(buffer, "#=SS", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
              else if (strncmp(buffer, "#=SA", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
              else if (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1;
              else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1;

              if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL)
                break;
            } while (strchr(commentsyms, *nptr) != NULL);
        }

      /* check that number of sequences matches expected:
       * the first block defines num, later blocks must agree
       */
      if (blocknum == 0)
        num = currnum;
      else if (currnum != num)
        Die("Parse error in ReadSELEX()");
      blocknum++;

      /* get first line of next block
       * (non-comment, non-blank) */
      do
        {
          if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
          strcpy(bufcpy, buffer);
        }
      while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
             (strchr(commentsyms, *nptr) != NULL));
    }

  /***************************************************
   * Get ready for second pass:
   *   figure out the length of the alignment
   *   malloc space
   *   rewind the file
   ***************************************************/

  /* alignment length = sum of the column widths of all blocks */
  alen = 0;
  for (currblock = 0; currblock < blocknum; currblock++)
    alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

  rewind(fp);

  /* allocations. we can't use AllocateAlignment because of
   * the way we already used ainfo->sqinfo.
   * Every string gets alen+1 chars; cs/rf/ss/sa are allocated only
   * when the first pass saw the matching lines.
   */
  aseqs     = (char **) MallocOrDie (num * sizeof(char *));
  if (have_cs)
    ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
  if (have_rf)
    ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));

  for (i = 0; i < num; i++)
    {
      aseqs[i]     = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SS)
        ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SA)
        ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
    }

  ainfo->alen = alen;
  ainfo->nseq = num;
  ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);
  FSet(ainfo->wgt, num, 1.0);   /* weights are 1.0 unless a #=SQ line sets them */

  /***************************************************
   * Second pass across file. Parse header; assemble sequences
   ***************************************************/
  /* We've now made a complete first pass over the file. We know how
   * many blocks it contains, we know the number of seqs in the first
   * block, and we know every block has the same number of seqs;
   * so we can be a bit more cavalier about error-checking as we
   * make the second pass.
   */

  /* Look for header. The loop stops at the first #=CS line, #=RF line,
   * or non-comment line; that line stays in buffer/nptr for the block
   * loop below. Other comment lines are skipped.
   */
  headnum = 0;
  for (;;)
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
        Die("Parse error in ReadSELEX()");
      strcpy(bufcpy, buffer);
      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */

      if (strcmp(nptr, "#=AU") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->au = Strdup(sptr);
      else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->name = Strdup(sptr);
      else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->acc  = Strdup(sptr);
      else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->desc = Strdup(sptr);
      else if (strcmp(nptr, "#=GA") == 0)       /* two numbers: ga1 ga2 */
        {
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=GA line in ReadSELEX()");
          ainfo->ga1 = atof(sptr);

          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=GA line in ReadSELEX()");
          ainfo->ga2 = atof(sptr);

          ainfo->flags |= AINFO_GA;
        }
      else if (strcmp(nptr, "#=TC") == 0)       /* two numbers: tc1 tc2 */
        {
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=TC line in ReadSELEX()");
          ainfo->tc1 = atof(sptr);

          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=TC line in ReadSELEX()");
          ainfo->tc2 = atof(sptr);

          ainfo->flags |= AINFO_TC;
        }
      else if (strcmp(nptr, "#=NC") == 0)       /* two numbers: nc1 nc2 */
        {
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=NC line in ReadSELEX()");
          ainfo->nc1 = atof(sptr);

          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=NC line in ReadSELEX()");
          ainfo->nc2 = atof(sptr);

          ainfo->flags |= AINFO_NC;
        }
      else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */
        {
          /* #=SQ lines are matched to sequences by position (headnum);
           * a name mismatch only raises the warning flag.
           */
          /* first field is the name */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX()");
          if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;

          /* second field is the weight */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX()");
          if (!IsReal(sptr))
            Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
          ainfo->wgt[headnum] = atof(sptr);

          /* third field is database source id */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);

          /* fourth field is database accession number */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);

          /* fifth field is start..stop::olen */
          if ((sptr = strtok(NULL, ".:")) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);

          if ((sptr = strtok(NULL, ".:")) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);

          if ((sptr = strtok(NULL, ":\t ")) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);

          /* rest of line is optional description */
          if ((sptr = strtok(NULL, "\n")) != NULL)
            SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);

          headnum++;
        }
      else if (strcmp(nptr, "#=CS") == 0) break;
      else if (strcmp(nptr, "#=RF") == 0) break;
      else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
    }

  /* Block loop. On entry buffer/nptr hold the first line of the block.
   * Each line is copied into its destination string starting at offset
   * currlen, using the column range found for this block in pass one.
   */
  currlen = 0;
  for (currblock = 0 ; currblock < blocknum; currblock++)
    {
      /* parse the block */
      seqidx = 0;
      while (nptr != NULL)
        {
          /* Consensus structure */
          if (strcmp(nptr, "#=CS") == 0)
            {
              if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=CS line in ReadSELEX()");
            }

          /* Reference coordinates */
          else if (strcmp(nptr, "#=RF") == 0)
            {
              if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=RF line in ReadSELEX()");
            }
          /* Individual secondary structure: belongs to the sequence
           * read just before it, hence seqidx-1 */
          else if (strcmp(nptr, "#=SS") == 0)
            {
              if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=SS line in ReadSELEX()");
            }

          /* Side chain % surface accessibility code: also for seqidx-1 */
          else if (strcmp(nptr, "#=SA") == 0)
            {
              if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=SA line in ReadSELEX()");
            }
          /* Aligned sequence; avoid unparsed machine comments */
          else if (strncmp(nptr, "#=", 2) != 0)
            {
              if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in alignment line in ReadSELEX()");
              seqidx++;
            }

          /* get next line; nptr stays NULL at EOF or on a blank line,
           * which ends the block */
          for (;;)
            {
              nptr = NULL;
              if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* EOF */
              strcpy(bufcpy, buffer);
              if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
              if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
              if (strchr(commentsyms, *nptr) == NULL) break; /* data */
            }
        } /* end of a block */

      /* the next block continues after the columns just filled */
      currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

      /* get line 1 of next block */
      for (;;)
        {
          if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
          strcpy(bufcpy, buffer);
          if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
          if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */
          if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
        }
    } /* end of the file */

  /* Lengths in sqinfo are for raw sequence (ungapped),
   * and SS, SA are 0..rlen-1 not 0..alen-1.
   * Only the seqs with structures come out of here with lengths set.
   *
   * The ss / sa strings are compacted in place: only the positions
   * where the aligned sequence has a non-gap character are kept.
   */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      int apos, rpos;           /* position in aligned / raw coordinates */

      /* secondary structures */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
        {
          for (apos = rpos = 0; apos < alen; apos++)
            if (! isgap(aseqs[seqidx][apos]))
              {
                ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
                rpos++;
              }
          ainfo->sqinfo[seqidx].ss[rpos] = '\0';
        }
      /* Surface accessibility */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
        {
          for (apos = rpos = 0; apos < alen; apos++)
            if (! isgap(aseqs[seqidx][apos]))
              {
                ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
                rpos++;
              }
          ainfo->sqinfo[seqidx].sa[rpos] = '\0';
        }
    }

  /* NUL-terminate all the strings */
  if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
  if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
  for (seqidx = 0; seqidx < num; seqidx++)
    aseqs[seqidx][alen] = '\0';

  /* find raw sequence lengths for sqinfo: count the non-gap characters */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      count = 0;
      for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
        if (!isgap(*sptr)) count++;
      ainfo->sqinfo[seqidx].len    = count;
      ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
    }

  /***************************************************
   * Garbage collection and return
   ***************************************************/
  free(blocks);
  if (warn_names)
    Warn("sequences may be in different orders in blocks of %s?", afp->fname);

  /* Convert back to MSA structure. (Wasteful kludge.)
   * aseqs and ainfo are released right after the conversion.
   */
  msa = MSAFromAINFO(aseqs, ainfo);
  MSAVerifyParse(msa);
  FreeAlignment(aseqs, ainfo);
  return msa;
}