/* Function: MSAAddGS()
 * Date:     SRE, Wed Jun  2 06:57:03 1999 [St. Louis]
 *
 * Purpose:  Attach one unparsed #=GS (per-sequence) markup line to
 *           an MSA, growing the tag table when a new tag shows up.
 *
 *           The same GS tag may occur more than once for a given
 *           sequence (for example, "DR PDB;" structure links in
 *           Pfam). Such repeats are kept in a single string, with
 *           the individual values separated by '\n'.
 *
 * Args:     msa    - multiple alignment structure
 *           tag    - markup tag (e.g. "AC")
 *           sqidx  - index of the sequence the markup belongs to (0..nseq-1)
 *           value  - markup text (e.g. "P00666")
 *
 * Returns:  (void)
 */
void
MSAAddGS(MSA *msa, char *tag, int sqidx, char *value)
{
  int    tagidx;
  int    col;
  char **slot;

  if (msa->gs_tag != NULL)
    {
      /* Tag table already exists: look the tag up, and add one more
       * row when it is unknown. Rows are allocated one at a time, so
       * every new tag means a realloc.
       */
      tagidx = GKIKeyIndex(msa->gs_idx, tag);
      if (tagidx < 0)
        {
          tagidx = GKIStoreKey(msa->gs_idx, tag);
          SQD_DASSERT1((tagidx == msa->ngs));
          msa->gs_tag = ReallocOrDie(msa->gs_tag, (msa->ngs+1) * sizeof(char *));
          msa->gs     = ReallocOrDie(msa->gs, (msa->ngs+1) * sizeof(char **));
          msa->gs[msa->ngs] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
          col = 0;
          while (col < msa->nseqalloc)
            {
              msa->gs[msa->ngs][col] = NULL;
              col++;
            }
        }
    }
  else
    {
      /* Very first GS tag: create the key index and a one-row table. */
      msa->gs_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gs_idx, tag);
      SQD_DASSERT1((tagidx == 0));
      msa->gs_tag = MallocOrDie(sizeof(char *));
      msa->gs     = MallocOrDie(sizeof(char **));
      msa->gs[0]  = MallocOrDie(sizeof(char *) * msa->nseqalloc);
      col = 0;
      while (col < msa->nseqalloc)
        {
          msa->gs[0][col] = NULL;
          col++;
        }
    }

  /* A freshly indexed tag still needs its name recorded. */
  if (tagidx == msa->ngs)
    {
      msa->gs_tag[tagidx] = sre_strdup(tag, -1);
      msa->ngs++;
    }

  slot = &(msa->gs[tagidx][sqidx]);
  if (*slot != NULL)
    {
      /* Sequence already has this tag: append "\n" + value. */
      int len;

      len = sre_strcat(slot, -1, "\n", 1);
      if (len < 0)
        Die("failed to sre_strcat()");
      if (sre_strcat(slot, len, value, -1) < 0)
        Die("failed to sre_strcat()");
    }
  else
    {
      /* First annotation of this sequence with this tag. */
      *slot = sre_strdup(value, -1);
    }
}
/* Function: MSAAppendGR()
 * Date:     SRE, Thu Jun  3 06:34:38 1999 [Madison]
 *
 * Purpose:  Attach one unparsed #=GR (per-sequence, per-column)
 *           markup line to an MSA, growing the tag table when a
 *           new tag shows up.
 *
 *           Repeated calls with the same tag and sequence
 *           concatenate the value strings; this is what a parser
 *           of multiblock alignment files needs.
 *
 * Args:     msa    - multiple alignment structure
 *           tag    - markup tag (e.g. "SS")
 *           sqidx  - index of the sequence the markup belongs to (0..nseq-1)
 *           value  - markup, one char per aligned column
 *
 * Returns:  (void)
 */
void
MSAAppendGR(MSA *msa, char *tag, int sqidx, char *value)
{
  int tagidx;
  int col;

  if (msa->gr_tag != NULL)
    {
      /* Tag table already exists: look the tag up, and add one more
       * row when it is unknown. Rows are allocated one at a time, so
       * every new tag means a realloc.
       */
      tagidx = GKIKeyIndex(msa->gr_idx, tag);
      if (tagidx < 0)
        {
          tagidx = GKIStoreKey(msa->gr_idx, tag);
          SQD_DASSERT1((tagidx == msa->ngr));
          msa->gr_tag = ReallocOrDie(msa->gr_tag, (msa->ngr+1) * sizeof(char *));
          msa->gr     = ReallocOrDie(msa->gr, (msa->ngr+1) * sizeof(char **));
          msa->gr[msa->ngr] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
          col = 0;
          while (col < msa->nseqalloc)
            {
              msa->gr[msa->ngr][col] = NULL;
              col++;
            }
        }
    }
  else
    {
      /* Very first GR tag: build a one-row table, then the key index. */
      msa->gr_tag = MallocOrDie(sizeof(char *));
      msa->gr     = MallocOrDie(sizeof(char **));
      msa->gr[0]  = MallocOrDie(sizeof(char *) * msa->nseqalloc);
      col = 0;
      while (col < msa->nseqalloc)
        {
          msa->gr[0][col] = NULL;
          col++;
        }
      msa->gr_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gr_idx, tag);
      SQD_DASSERT1((tagidx == 0));
    }

  /* A freshly indexed tag still needs its name recorded. */
  if (tagidx == msa->ngr)
    {
      msa->gr_tag[tagidx] = sre_strdup(tag, -1);
      msa->ngr++;
    }

  /* Append this block's markup to whatever is already stored. */
  sre_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1);
}
/* Function: ReadPhylip()
 * Date:     SRE, Fri Jun 18 12:59:37 1999 [Sanger Centre]
 *
 * Purpose:  Parse an alignment from an open Phylip format
 *           alignment file. Phylip is a single-alignment format.
 *           Return the alignment, or NULL if we have no data.
 *
 *           The first nonblank line is the header and must hold two
 *           integers (nseq, alen). Only nseq is used here; the
 *           alignment length is established later by
 *           MSAVerifyParse(). In the first block each data line
 *           starts with a name field of at most 10 characters,
 *           followed by sequence; in later blocks the lines hold
 *           sequence only and are assigned to sequences in order.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   Caller responsible for an MSAFree()
 *           NULL if no more alignments (stream at EOF, or no header
 *           line found before EOF)
 *
 * Diagnostics:
 *           Die()s on a malformed header, on a non-positive nseq,
 *           and on a data line that has no sequence on it.
 */
MSA *
ReadPhylip(MSAFILE *afp)
{
  MSA  *msa;
  char *s, *s1, *s2;
  char  name[11];		/* seq name max len = 10 char */
  int   nseq;
  int   have_header;		/* TRUE once the nseq/alen line is parsed */
  int   idx;			/* index of current sequence */
  int   slen;
  int   nblock;

  if (feof(afp->f)) return NULL;

  /* Skip until we see a nonblank line; it's the header,
   * containing nseq/alen
   */
  nseq        = 0;
  have_header = 0;
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue;
      if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
	Die("Failed to parse nseq/alen from first line of PHYLIP file %s\n", afp->fname);
      if (! IsInt(s1) || ! IsInt(s2))
	Die("nseq and/or alen not an integer in first line of PHYLIP file %s\n", afp->fname);
      nseq        = atoi(s1);
      have_header = 1;
      break;
    }

  /* Nothing but blank lines up to EOF: there is no alignment here. */
  if (! have_header) return NULL;

  /* The per-sequence arrays are indexed 0..nseq-1 below; a zero or
   * negative count would make every store out of bounds.
   */
  if (nseq <= 0)
    Die("Invalid number of sequences (%d) in first line of PHYLIP file %s\n",
	nseq, afp->fname);

  msa    = MSAAlloc(nseq, 0);
  idx    = 0;
  nblock = 0;
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      /* ignore blank lines. nonblank lines start w/ nonblank char */
      if (isspace((unsigned char) *s)) continue;

      /* First block has seq names */
      if (nblock == 0)
	{
	  strncpy(name, s, 10);
	  name[10] = '\0';
	  GKIStoreKey(msa->index, name);
	  msa->sqname[idx] = sre_strdup(name, -1);
	  /* Skip exactly what was copied. A line shorter than the
	   * 10-char name field must not move s past its terminator.
	   */
	  s += strlen(name);
	}

      /* be careful of trailing whitespace on lines */
      if ((s1 = sre_strtok(&s, WHITESPACE, &slen)) == NULL)
	Die("Failed to parse sequence at line %d of PHYLIP file %s\n",
	    afp->linenumber, afp->fname);
      msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], s1, slen);

      idx++;
      if (idx == nseq) { idx = 0; nblock++; }
    }
  msa->nseq = nseq;
  MSAVerifyParse(msa);		/* verifies; sets alen, wgt; frees sqlen[] */
  return msa;
}
/* Function: MSAAppendGC()
 * Date:     SRE, Thu Jun  3 06:25:14 1999 [Madison]
 *
 * Purpose:  Add an unparsed #=GC (per-column) markup line to the MSA
 *           structure, allocating as necessary.
 *
 *           When called multiple times for the same tag,
 *           appends value strings together -- used when
 *           parsing multiblock alignment files, for
 *           example.
 *
 * Args:     msa   - multiple alignment structure
 *           tag   - markup tag (e.g. "CS")
 *           value - markup, one char per aligned column
 *
 * Returns:  (void)
 */
void
MSAAppendGC(MSA *msa, char *tag, char *value)
{
  int tagidx;

  /* Is this an unparsed tag name that we recognize?
   * If not, handle adding it to index, and reallocating
   * as needed.
   */
  if (msa->gc_tag == NULL)	/* first tag? init w/ malloc  */
    {
      msa->gc_tag = MallocOrDie(sizeof(char *));
      msa->gc     = MallocOrDie(sizeof(char *));
      msa->gc_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gc_idx, tag);
      SQD_DASSERT1((tagidx == 0));
      msa->gc[0]  = NULL;
    }
  else
    {
      /* new tag? */
      tagidx = GKIKeyIndex(msa->gc_idx, tag);
      if (tagidx < 0)
	{
	  /* it's a new tag name; realloc */
	  tagidx = GKIStoreKey(msa->gc_idx, tag);
	  /* since we alloc in blocks of 1,
	   * we always realloc upon seeing a new tag.
	   */
	  SQD_DASSERT1((tagidx == msa->ngc));
	  /* Both arrays hold one string (char *) per tag, so the
	   * element size is sizeof(char *), same as the malloc
	   * branch above.
	   */
	  msa->gc_tag = ReallocOrDie(msa->gc_tag, (msa->ngc+1) * sizeof(char *));
	  msa->gc     = ReallocOrDie(msa->gc, (msa->ngc+1) * sizeof(char *));
	  msa->gc[tagidx] = NULL;
	}
    }

  /* A freshly indexed tag still needs its name recorded. */
  if (tagidx == msa->ngc)
    {
      msa->gc_tag[tagidx] = sre_strdup(tag, -1);
      msa->ngc++;
    }

  /* Append this block's markup to whatever is already stored. */
  sre_strcat(&(msa->gc[tagidx]), -1, value, -1);
  return;
}
/* Function: MSAGetSeqidx()
 * Date:     SRE, Wed May 19 15:08:25 1999 [St. Louis]
 *
 * Purpose:  Map a sequence name to its index in an MSA structure.
 *
 *           The lookup proceeds in three stages:
 *             1) if the caller supplied a guess (>= 0) and the name
 *                stored at that index matches, use the guess;
 *             2) otherwise look the name up in the msa's key index;
 *             3) otherwise treat it as a new sequence: store the
 *                name in the key index, expand the msa if the new
 *                index does not fit, save sqname and bump nseq.
 *
 * Args:     msa   - alignment object
 *           name  - a sequence name
 *           guess - a guess at the right index, or -1 if no guess.
 *
 * Returns:  seqidx
 */
int
MSAGetSeqidx(MSA *msa, char *name, int guess)
{
  int found;

  /* Stage 1: a usable guess that names the same sequence. */
  if (guess >= 0 && guess < msa->nseq)
    {
      if (strcmp(name, msa->sqname[guess]) == 0)
        return guess;
    }

  /* Stage 2: known name. */
  found = GKIKeyIndex(msa->index, name);
  if (found < 0)
    {
      /* Stage 3: new name. */
      found = GKIStoreKey(msa->index, name);
      if (found >= msa->nseqalloc)
        MSAExpand(msa);
      msa->sqname[found] = sre_strdup(name, -1);
      msa->nseq++;
    }
  return found;
}
/* Function: ReadA2M()
 * Date:     SRE, Sun Jun  6 17:11:29 1999 [bus from Madison 1999 worm mtg]
 *
 * Purpose:  Read one alignment from an open A2M format alignment
 *           file. A2M is a single alignment format, so a second
 *           call on the same file yields NULL.
 *
 *           Lines starting with '>' open a new sequence: the first
 *           whitespace-delimited token is its name, the rest of the
 *           line (if any) its description. Every other line
 *           contributes its first token to the current sequence;
 *           such lines are ignored until a '>' line has been seen.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA *  - an alignment object.
 *                    Caller responsible for an MSAFree()
 *           NULL if the stream is at EOF or no '>' line was found.
 */
MSA *
ReadA2M(MSAFILE *afp)
{
  MSA  *msa;
  char *line;
  char *name = NULL;
  char *desc;
  char *seq;
  int   idx  = 0;
  int   toklen, desclen;

  if (feof(afp->f)) return NULL;

  msa = MSAAlloc(10, 0);

  while ((line = MSAFileGetLine(afp)) != NULL)
    {
      if (*line != '>')
        {
          /* Sequence data; only meaningful once a name was seen. */
          if (name == NULL) continue;
          seq = sre_strtok(&line, WHITESPACE, &toklen);
          if (seq == NULL) continue;
          msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], seq, toklen);
          continue;
        }

      /* Header line: step over the '>' and split name / description. */
      line++;
      name = sre_strtok(&line, WHITESPACE, &toklen);
      if (name == NULL)
        Die("Blank name in A2M file %s (line %d)\n", afp->fname, afp->linenumber);
      desc = sre_strtok(&line, "\n", &desclen);

      idx = GKIStoreKey(msa->index, name);
      if (idx >= msa->nseqalloc)
        MSAExpand(msa);

      msa->sqname[idx] = sre_strdup(name, toklen);
      if (desc != NULL)
        MSASetSeqDescription(msa, idx, desc);
      msa->nseq++;
    }

  /* No '>' line at all: nothing to return. */
  if (name == NULL)
    {
      MSAFree(msa);
      return NULL;
    }

  MSAVerifyParse(msa);
  return msa;
}
/**
 * @brief Write alignment to file.
 *
 * @param[in] mseq
 * The mseq_t struct containing the aligned sequences
 * @param[in] pcAlnOutfile
 * The name of the output file. If NULL, the alignment is written to stdout.
 * @param[in] outfmt
 * The alignment output format (defined in squid.h)
 * @param[in] iWrap
 * length of line for Clustal/Fasta format
 * @param[in] bResno
 * Handed to WriteClustal() as a 1/0 flag; only used for Clustal output
 * (presumably toggles residue numbering -- confirm against WriteClustal()).
 *
 * @return Non-zero on error: unknown output format, no sequences to
 * write, output file could not be opened, or output file could not be
 * closed (i.e. flushed) after writing.
 *
 * @note We create a temporary squid MSA struct in here because we never
 * use it within clustal. We might be better of using the old clustal
 * output routines instead.
 *
 */
int
WriteAlignment(mseq_t *mseq, const char *pcAlnOutfile, int outfmt, int iWrap, bool bResno)
{
    int i; /* aux */
    MSA *msa; /* squid's alignment structure */
    FILE *pfOut = NULL;
    int key; /* MSA struct internal index for sequence */
    int alen; /* alignment length */
    bool use_stdout;
    int iRet = 0; /* return value; set to -1 if closing the file fails */

    assert(mseq!=NULL);

    if (MSAFILE_UNKNOWN == outfmt) {
        Log(&rLog, LOG_ERROR, "Unknown output format chosen");
        return -1;
    }

    /* The alignment length is taken from mseq->seq[0] below, so there
     * has to be at least one sequence. Checked before the output file
     * is created so that we do not leave an empty file behind.
     */
    if (mseq->nseqs < 1) {
        Log(&rLog, LOG_ERROR, "No sequences to write");
        return -1;
    }

    if (NULL == pcAlnOutfile) {
        pfOut = stdout;
        use_stdout = TRUE;
    } else {
        use_stdout = FALSE;
        if (NULL == (pfOut = fopen(pcAlnOutfile, "w"))) {
            Log(&rLog, LOG_ERROR, "Could not open file %s for writing", pcAlnOutfile);
            return -1;
        }
    }

    /* derive alignment length from first seq */
    alen = (int) strlen(mseq->seq[0]);

    msa = MSAAlloc(mseq->nseqs, alen);

    /* basic structure borrowed code from squid-1.9g/a2m.c:ReadA2M()
     * we actually create a copy of mseq. keeping the pointers becomes
     * messy when calling MSAFree()
     */
    for (i=0; i<mseq->nseqs; i++) {
        char *this_name = NULL; /* mseq sequence name */
        char *this_seq = NULL; /* mseq sequence */
        SQINFO *this_sqinfo = NULL; /* mseq sequence info */
        int iI;

        /* mseq->tree_order encodes to order in which sequences are listed
         * in the guide-tree, if the user wants the sequence output in the
         * input-order then mseq->tree_order==NULL, otherwise
         * mseq->tree_order!=NULL, containing the indices of the sequences,
         * FS, r274 ->
         */
        iI = (NULL == mseq->tree_order) ? i : mseq->tree_order[i];

        this_name = mseq->sqinfo[iI].name;
        this_seq = mseq->seq[iI];
        this_sqinfo = &mseq->sqinfo[iI];

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));

        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));

        if (this_sqinfo->flags & SQINFO_DESC) {
            /* FIXME never get here ... */
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        /* FIXME extend this by copying more stuff according to flags.
         * See MSAFileRead() in msa.c and used functions there
         *
         * Problem is that we never parse MSA information as we use squid'sSeqFile
         */

        msa->nseq++;

    } /* 0 <= i < mseq->nseqs */

    /* FIXME Would like to, but can't use MSAVerifyParse(msa) here, as it
     * will die on error. Need to implement our own version
     */
#if 0
    MSAVerifyParse(msa);
#endif

    /* The below is copy of MSAFileWrite() which originally only writes to stdout.
     */

    /* Be sloppy and make a2m and fasta the same. same for vienna (which is
       the same). same same. can can. boleh boleh */
    if (outfmt==SQFILE_FASTA)
        outfmt = MSAFILE_A2M;
    if (outfmt==SQFILE_VIENNA)
        outfmt = MSAFILE_VIENNA;

    switch (outfmt) {
    case MSAFILE_A2M:
        /*WriteA2M(pfOut, msa, 0);*/
        WriteA2M(pfOut, msa, iWrap);
        break;
    case MSAFILE_VIENNA:
        /*WriteA2M(pfOut, msa, 1);*/
        WriteA2M(pfOut, msa, INT_MAX);
        break;
    case MSAFILE_CLUSTAL:
        WriteClustal(pfOut, msa, iWrap, TRUE==bResno ? 1 : 0, mseq->seqtype);
        break;
    case MSAFILE_MSF:
        WriteMSF(pfOut, msa);
        break;
    case MSAFILE_PHYLIP:
        WritePhylip(pfOut, msa);
        break;
    case MSAFILE_SELEX:
        WriteSELEX(pfOut, msa);
        break;
    case MSAFILE_STOCKHOLM:
        WriteStockholm(pfOut, msa);
        break;
    default:
        Log(&rLog, LOG_FATAL, "internal error: %s",
            "invalid output format should have been detected before");
    }

    if (use_stdout == FALSE) {
        /* fclose() flushes buffered output; a failure here means the
         * alignment may not have made it to disk, so report it instead
         * of claiming success.
         */
        if (0 != fclose(pfOut)) {
            Log(&rLog, LOG_ERROR, "Could not close file %s after writing", pcAlnOutfile);
            iRet = -1;
        } else {
            Log(&rLog, LOG_INFO, "Alignment written to %s", pcAlnOutfile);
        }
    }
    MSAFree(msa);

    return iRet;
}
/**
 * @brief Stripped down version of squid's alistat
 *
 * Copies the sequences into a temporary squid MSA and prints summary
 * statistics to stdout: number of sequences, residue counts, shortest
 * and longest dealigned length, alignment length and average identity.
 * Without sampling it also prints the identities of the most related
 * pair, the most unrelated pair and the most distant sequence.
 *
 * @param[in] prMSeq
 * The alignment to analyse. Must be flagged as aligned and contain at
 * least one sequence, otherwise a warning is logged and nothing is
 * printed.
 * @param[in] bSampling
 * For many sequences: samples from pool
 * @param[in] bReportAll
 * Report identities for all sequence pairs. Cannot be combined with
 * bSampling.
 *
 * @note Sequence case is not normalised here; the original author notes
 * that this project's PairwiseIdentity is case insensitive.
 *
 * @note With fewer than two sequences there is no pair to average over;
 * the average identity is then reported as 0.
 */
void
AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll)
{
    /*
     * bSampling = squid's do_fast
     * bReportAll = squid's allreport
     */
    float **ppdIdentMx; /* identity matrix (squid: imx) */
    const int iNumSample = 1000; /* sample size (squid: nsample) */
    MSA *msa; /* squid's alignment structure */
    int small, large;
    int bestj, worstj;
    float sum;
    float worst_worst, worst_best, best_best;
    float avgid;
    int i, j;
    int nres; /* number of residues */

    if (bSampling && bReportAll) {
        Log(&rLog, LOG_WARN,
            "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__);
        return;
    }
    if (FALSE == prMSeq->aligned) {
        Log(&rLog, LOG_WARN,
            "Sequences are not aligned. Skipping %s()", __FUNCTION__);
        return;
    }
    /* The alignment length is taken from prMSeq->seq[0] and the
     * average length divides by the number of sequences.
     */
    if (prMSeq->nseqs < 1) {
        Log(&rLog, LOG_WARN,
            "No sequences. Skipping %s()", __FUNCTION__);
        return;
    }

    /* silence gcc warnings about uninitialized variables */
    worst_worst = worst_best = best_best = 0.0;
    bestj = worstj = -1;

    /** mseq to squid msa
     *
     * FIXME code overlap with WriteAlignment. Make it a function and take
     * code there (contains more comments) as template
     *
     */
    msa = MSAAlloc(prMSeq->nseqs,
                   /* derive alignment length from first seq */
                   strlen(prMSeq->seq[0]));
    for (i=0; i<prMSeq->nseqs; i++) {
        int key; /* MSA struct internal index for sequence */
        char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */
        char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */
        SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence info */

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));
        if (this_sqinfo->flags & SQINFO_DESC) {
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        msa->nseq++;
    }

    /* Residue totals and shortest / longest dealigned length. */
    nres = 0;
    small = large = -1;
    for (i = 0; i < msa->nseq; i++) {
        int rlen; /* raw sequence length */
        rlen = DealignedLength(msa->aseq[i]);
        nres += rlen;
        if (small == -1 || rlen < small) small = rlen;
        if (large == -1 || rlen > large) large = rlen;
    }

    if (bSampling) {
        avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen,
                                            msa->nseq, iNumSample);
    } else {
        float best, worst;

        /* this might be slow...could use openmp inside squid */
        MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx);
        if (bReportAll) {
            printf(" %-15s %5s %7s %-15s %7s %-15s\n",
                   "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");
            printf(" --------------- ----- ------- --------------- ------- ---------------\n");
        }

        sum = 0.0;
        worst_best = 1.0;
        best_best = 0.0;
        worst_worst = 1.0;
        for (i = 0; i < msa->nseq; i++) {
            worst = 1.0;
            best = 0.0;
            /* Start every sequence without a partner. Otherwise a
             * sequence for which no j beats the initial best/worst
             * would be reported against the previous sequence's
             * partner, or against index -1 for the first one.
             */
            bestj = worstj = -1;
            for (j = 0; j < msa->nseq; j++) {
                /* closest seq to this one = best */
                if (i != j && ppdIdentMx[i][j] > best) {
                    best = ppdIdentMx[i][j];
                    bestj = j;
                }
                if (ppdIdentMx[i][j] < worst) {
                    worst = ppdIdentMx[i][j];
                    worstj = j;
                }
            }

            if (bReportAll) {
                printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",
                       msa->sqname[i],
                       DealignedLength(msa->aseq[i]),
                       best * 100.,
                       (bestj >= 0) ? msa->sqname[bestj] : "-",
                       worst * 100.,
                       (worstj >= 0) ? msa->sqname[worstj] : "-");
            }
            if (best > best_best) best_best = best;
            if (best < worst_best) worst_best = best;
            if (worst < worst_worst) worst_worst = worst;
            /* lower triangle only: every pair is counted once */
            for (j = 0; j < i; j++)
                sum += ppdIdentMx[i][j];
        }
        /* Number of pairs is n*(n-1)/2; computed in double so the
         * product cannot overflow int, and guarded so that a single
         * sequence does not produce 0/0.
         */
        if (msa->nseq > 1) {
            avgid = sum / (float) ((double) msa->nseq * (msa->nseq-1) / 2.0);
        } else {
            avgid = 0.0;
        }
        if (bReportAll)
            puts("");
        FMX2Free(ppdIdentMx);
    } /* else bSampling */

    /* Print output
     */
    if (msa->name != NULL)
        printf("Alignment name: %s\n", msa->name);
    /*printf("Format: %s\n", SeqfileFormat2String(afp->format));*/
    printf("Number of sequences: %d\n", msa->nseq);
    printf("Total # residues: %d\n", nres);
    printf("Smallest: %d\n", small);
    printf("Largest: %d\n", large);
    printf("Average length: %.1f\n", (float) nres / (float) msa->nseq);
    printf("Alignment length: %d\n", msa->alen);
    printf("Average identity: %.2f%%\n", 100.*avgid);

    if (! bSampling) {
        printf("Most related pair: %.2f%%\n", 100.*best_best);
        printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst);
        printf("Most distant seq: %.2f%%\n", 100.*worst_best);
    }

    /*
      char *cs;
      cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen);
      printf cs;
    */

    MSAFree(msa);
}
/* Function: MSAFromAINFO()
 * Date:     SRE, Mon Jun 14 11:22:24 1999 [St. Louis]
 *
 * Purpose:  Build a new-style MSA from the old aseq/ainfo pair, so
 *           that code still producing the old structures can feed
 *           code that expects an MSA.
 *
 *           Copied per sequence: aligned text, weight, name (also
 *           stored in the msa's key index), and -- when the matching
 *           SQINFO_* flag is set -- accession, description,
 *           secondary structure (ss) and surface accessibility (sa)
 *           strings, the latter two re-aligned to the sequence with
 *           MakeAlignedString().
 *           Copied for the whole alignment: name, desc, acc, au,
 *           cs (as ss_cons), rf, and the TC/NC/GA cutoff pairs when
 *           the matching AINFO_* flag is set.
 *
 * Args:     aseq  - [0..nseq-1][0..alen-1] alignment
 *           ainfo - old-style optional info
 *
 * Returns:  MSA *
 */
MSA *
MSAFromAINFO(char **aseq, AINFO *ainfo)
{
  MSA *msa;
  int  idx;			/* sequence counter          */
  int  k;			/* counter for array init    */

  msa = MSAAlloc(ainfo->nseq, ainfo->alen);

  idx = 0;
  while (idx < ainfo->nseq)
    {
      strcpy(msa->aseq[idx], aseq[idx]);
      msa->wgt[idx]    = ainfo->wgt[idx];
      msa->sqname[idx] = sre_strdup(ainfo->sqinfo[idx].name, -1);
      msa->sqlen[idx]  = msa->alen;
      GKIStoreKey(msa->index, msa->sqname[idx]);

      if (ainfo->sqinfo[idx].flags & SQINFO_ACC)
        {
          MSASetSeqAccession(msa, idx, ainfo->sqinfo[idx].acc);
        }

      if (ainfo->sqinfo[idx].flags & SQINFO_DESC)
        {
          MSASetSeqDescription(msa, idx, ainfo->sqinfo[idx].desc);
        }

      if (ainfo->sqinfo[idx].flags & SQINFO_SS)
        {
          /* The ss arrays are created the first time any sequence needs them. */
          if (msa->ss == NULL)
            {
              msa->ss    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
              msa->sslen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
              k = 0;
              while (k < msa->nseqalloc)
                {
                  msa->ss[k]    = NULL;
                  msa->sslen[k] = 0;
                  k++;
                }
            }
          MakeAlignedString(msa->aseq[idx], msa->alen,
                            ainfo->sqinfo[idx].ss, &(msa->ss[idx]));
          msa->sslen[idx] = msa->alen;
        }

      if (ainfo->sqinfo[idx].flags & SQINFO_SA)
        {
          /* Same lazy creation for the sa arrays. */
          if (msa->sa == NULL)
            {
              msa->sa    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
              msa->salen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
              k = 0;
              while (k < msa->nseqalloc)
                {
                  msa->sa[k]    = NULL;
                  msa->salen[k] = 0;
                  k++;
                }
            }
          MakeAlignedString(msa->aseq[idx], msa->alen,
                            ainfo->sqinfo[idx].sa, &(msa->sa[idx]));
          msa->salen[idx] = msa->alen;
        }

      idx++;
    }

  /* Alignment-wide annotation.
   * note that sre_strdup() returns NULL when passed NULL
   */
  msa->name    = sre_strdup(ainfo->name, -1);
  msa->desc    = sre_strdup(ainfo->desc, -1);
  msa->acc     = sre_strdup(ainfo->acc,  -1);
  msa->au      = sre_strdup(ainfo->au,   -1);
  msa->ss_cons = sre_strdup(ainfo->cs,   -1);
  msa->rf      = sre_strdup(ainfo->rf,   -1);

  /* Score cutoffs come in pairs; each pair is guarded by one flag. */
  if (ainfo->flags & AINFO_TC)
    {
      msa->cutoff[MSA_CUTOFF_TC1]        = ainfo->tc1;
      msa->cutoff[MSA_CUTOFF_TC2]        = ainfo->tc2;
      msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
      msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
    }
  if (ainfo->flags & AINFO_NC)
    {
      msa->cutoff[MSA_CUTOFF_NC1]        = ainfo->nc1;
      msa->cutoff[MSA_CUTOFF_NC2]        = ainfo->nc2;
      msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
      msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
    }
  if (ainfo->flags & AINFO_GA)
    {
      msa->cutoff[MSA_CUTOFF_GA1]        = ainfo->ga1;
      msa->cutoff[MSA_CUTOFF_GA2]        = ainfo->ga2;
      msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
      msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
    }

  msa->nseq = ainfo->nseq;
  msa->alen = ainfo->alen;
  return msa;
}
/* Function: ReadMSF()
 * Date:     SRE, Tue Jun  1 08:07:22 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open MSF format
 *           alignment file. (MSF is a single-alignment format.)
 *           Return the alignment, or NULL if we've already
 *           read the alignment.
 *
 *           The file is read in four stages: an optional
 *           "!!AA_MULTIPLE_ALIGNMENT" / "!!NA_MULTIPLE_ALIGNMENT"
 *           header line, a free text comment section ending at the
 *           "MSF: Type: Check: .." line, a name section ending at
 *           "//", and the sequence section.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no more alignments
 *
 * Diagnostics:
 *           Will Die() here with a (potentially) useful message
 *           if a parsing error occurs.
 */
MSA *
ReadMSF(MSAFILE *afp)
{
  MSA  *msa;
  char *s;
  int   alleged_alen;
  int   alleged_type;
  int   alleged_checksum;
  char *tok;
  char *sp;
  int   slen;
  int   sqidx;
  char *name;
  char *seq;

  if (feof(afp->f)) return NULL;
  if ((s = MSAFileGetLine(afp)) == NULL) return NULL;

  /* The first line is the header.
   * This is a new-ish GCG feature. Don't count on it, so
   * we can be a bit more tolerant towards non-GCG software
   * generating "MSF" files.
   */
  msa = MSAAlloc(10, 0);
  if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0)
    {
      msa->type = kAmino;
      if ((s = MSAFileGetLine(afp)) == NULL)
	{
	  MSAFree(msa);		/* nothing after the header: don't leak msa */
	  return NULL;
	}
    }
  else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0)
    {
      msa->type = kRNA;
      if ((s = MSAFileGetLine(afp)) == NULL)
	{
	  MSAFree(msa);		/* nothing after the header: don't leak msa */
	  return NULL;
	}
    }

  /* Now we're in the free text comment section of the MSF file.
   * It ends when we see the "MSF: Type: Check: .." line.
   * This line must be present.
   */
  do
    {
      if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
	  Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
	{
	  /* The pattern has three parenthesized fields: length, type
	   * letter, checksum. They are read from sqd_parse[1..3];
	   * slot 0 is taken to hold the complete matched text, as in
	   * Henry Spencer's regexp library -- NOTE(review): confirm
	   * against Strparse().
	   * The alleged length and checksum are parsed but not
	   * verified here.
	   */
	  alleged_alen = atoi(sqd_parse[1]);
	  switch (*(sqd_parse[2])) {
	  case 'N' : alleged_type = kRNA;      break;
	  case 'P' : alleged_type = kAmino;    break;
	  case 'X' : alleged_type = kOtherSeq; break;
	  default  : alleged_type = kOtherSeq;
	  }
	  alleged_checksum = atoi(sqd_parse[3]);
	  /* A type set from the "!!" header line takes precedence. */
	  if (msa->type == kOtherSeq) msa->type = alleged_type;
	  break;		/* we're done with comment section. */
	}
      if (! IsBlankline(s))
	MSAAddComment(msa, s);
    } while ((s = MSAFileGetLine(afp)) != NULL);

  /* Now we're in the name section.
   * GCG has a relatively poorly documented feature: only sequences that
   * appear in this list will be read from the alignment section. Commenting
   * out sequences in the name list (by preceding them with "!") is
   * allowed as a means of manually defining subsets of sequences in
   * the alignment section. We can support this feature reasonably
   * easily because of the hash table for names in the MSA: we
   * only add names to the hash table when we see 'em in the name section.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      while (*s == ' ' || *s == '\t') s++;	/* skip leading whitespace */

      if (*s == '\n') continue;			/* skip blank lines */
      else if (*s == '!') MSAAddComment(msa, s);
      else if ((sp = strstr(s, "Name:")) != NULL)
	{
	  /* We take the name and the weight, and that's it */
	  sp += 5;
	  tok = sre_strtok(&sp, " \t", &slen);	/* <sequence name> */
	  if (tok == NULL)
	    Die("No sequence name after Name: on line %d in name section of MSF file %s\n",
		afp->linenumber, afp->fname);
	  sqidx = GKIStoreKey(msa->index, tok);
	  if (sqidx >= msa->nseqalloc) MSAExpand(msa);
	  msa->sqname[sqidx] = sre_strdup(tok, slen);
	  msa->nseq++;

	  if ((sp = strstr(sp, "Weight:")) == NULL)
	    Die("No Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx], afp->fname);
	  sp += 7;
	  tok = sre_strtok(&sp, " \t", &slen);
	  if (tok == NULL)
	    Die("No weight value after Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx], afp->fname);
	  msa->wgt[sqidx] = atof(tok);
	  msa->flags |= MSA_SET_WGT;
	}
      else if (strncmp(s, "//", 2) == 0)
	break;
      else
	{
	  Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
	      afp->linenumber, afp->fname, s);
	  /* Only reached if Die() returns to its caller. */
	  squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
	  MSAFree(msa);
	  return NULL;
	}
    }

  /* And now we're in the sequence section.
   * As discussed above, if we haven't seen a sequence name, then we
   * don't include the sequence in the alignment.
   * Also, watch out for coordinate-only lines.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      sp = s;
      if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
      if ((seq  = sre_strtok(&sp, "\n", &slen)) == NULL) continue;

      /* The test for a coord line: digits starting both fields */
      if (isdigit((unsigned char) *name) && isdigit((unsigned char) *seq))
	continue;

      /* It's not blank, and it's not a coord line: must be sequence */
      sqidx = GKIKeyIndex(msa->index, name);
      if (sqidx < 0) continue;	/* not a sequence we recognize */

      msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
    }

  /* We've left blanks in the aseqs; take them back out.
   * s reads, sp writes; sqlen is reduced by one per blank dropped.
   */
  for (sqidx = 0; sqidx < msa->nseq; sqidx++)
    {
      if (msa->aseq[sqidx] == NULL)
	Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);

      for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
	{
	  if (*s == ' ' || *s == '\t') {
	    msa->sqlen[sqidx]--;
	  } else {
	    *sp = *s;
	    sp++;
	  }
	}
      *sp = '\0';
    }

  MSAVerifyParse(msa);		/* verifies, and also sets alen and wgt. */
  return msa;
}