/* Function: ReadA2M()
 * Date:     SRE, Sun Jun  6 17:11:29 1999 [bus from Madison 1999 worm mtg]
 *
 * Purpose:  Parse the alignment held in an open A2M format alignment
 *           file. A2M is a single alignment format, so there is at
 *           most one alignment per file.
 *
 *           A line starting with '>' opens a new sequence: the first
 *           whitespace-delimited token after the '>' is the sequence
 *           name, the remainder of the line (if any) is stored as its
 *           description. On every other line the first
 *           whitespace-delimited token is appended to the sequence
 *           opened most recently; such lines are ignored until the
 *           first '>' line has been seen.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object.
 *                   Caller responsible for an MSAFree().
 *           NULL  - if the file is already at EOF, or if no '>' line
 *                   was found.
 *
 *           Calls Die() if a '>' line carries no name.
 */
MSA *
ReadA2M(MSAFILE *afp)
{
  MSA  *aln;              /* alignment under construction            */
  char *line;             /* current input line                      */
  char *seqname = NULL;   /* name on the most recent '>' line        */
  int   cur     = 0;      /* index of the sequence being extended    */

  if (feof(afp->f))
    return NULL;

  aln = MSAAlloc(10, 0);

  for (;;)
    {
      char *token;        /* description or sequence fragment        */
      int   namelen;
      int   toklen;

      line = MSAFileGetLine(afp);
      if (line == NULL)
        break;

      if (*line != '>')
        {
          /* Sequence data; only meaningful once a header was seen. */
          if (seqname == NULL)
            continue;

          token = sre_strtok(&line, WHITESPACE, &toklen);
          if (token == NULL)
            continue;

          aln->sqlen[cur] = sre_strcat(&(aln->aseq[cur]), aln->sqlen[cur],
                                       token, toklen);
          continue;
        }

      /* Header line: step over the '>' and split name / description. */
      line++;
      seqname = sre_strtok(&line, WHITESPACE, &namelen);
      if (seqname == NULL)
        Die("Blank name in A2M file %s (line %d)\n", afp->fname, afp->linenumber);
      token = sre_strtok(&line, "\n", &toklen);

      cur = GKIStoreKey(aln->index, seqname);
      if (cur >= aln->nseqalloc)
        MSAExpand(aln);

      aln->sqname[cur] = sre_strdup(seqname, namelen);
      if (token != NULL)
        MSASetSeqDescription(aln, cur, token);
      aln->nseq++;
    }

  /* No header line at all: nothing was parsed. */
  if (seqname == NULL)
    {
      MSAFree(aln);
      return NULL;
    }

  MSAVerifyParse(aln);
  return aln;
}
/*****************************************************************
 * clustal.c test driver:
 * cc -DTESTDRIVE_CLUSTAL -g -O2 -Wall -o test clustal.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm
 *
 * Usage: test <alignment file>
 *
 * Opens the file named by the first command line argument as a
 * Clustal alignment file, then reads alignments with ReadClustal()
 * until it returns NULL, writing each one to stdout with
 * WriteClustal() and freeing it.
 */
int
main(int argc, char **argv)
{
  MSAFILE *afp;
  MSA     *msa;
  char    *file;

  /* Without this check argv[1] is NULL (or out of bounds when
   * argc == 0) and would be handed straight to MSAFileOpen().
   */
  if (argc < 2)
    Die("Usage: %s <alignment file>\n",
        (argc > 0 && argv[0] != NULL) ? argv[0] : "test");

  file = argv[1];

  if ((afp = MSAFileOpen(file, MSAFILE_CLUSTAL, NULL)) == NULL)
    Die("Couldn't open %s\n", file);

  while ((msa = ReadClustal(afp)) != NULL)
    {
      WriteClustal(stdout, msa);
      MSAFree(msa);
    }

  MSAFileClose(afp);
  exit(0);
}
/*****************************************************************
 * phylip.c test driver:
 *
 * Usage: test <alignment file>
 *
 * Opens the file named by the first command line argument with
 * format MSAFILE_UNKNOWN, prints the format recorded in the opened
 * MSAFILE, then reads alignments with ReadPhylip() until it returns
 * NULL, writing each one to stdout with WritePhylip() and freeing it.
 */
int
main(int argc, char **argv)
{
  MSAFILE *afp;
  MSA     *msa;
  char    *file;

  /* Without this check argv[1] is NULL (or out of bounds when
   * argc == 0) and would be handed straight to MSAFileOpen().
   */
  if (argc < 2)
    Die("Usage: %s <alignment file>\n",
        (argc > 0 && argv[0] != NULL) ? argv[0] : "test");

  file = argv[1];

  if ((afp = MSAFileOpen(file, MSAFILE_UNKNOWN, NULL)) == NULL)
    Die("Couldn't open %s\n", file);

  printf("format %d\n", afp->format);

  while ((msa = ReadPhylip(afp)) != NULL)
    {
      WritePhylip(stdout, msa);
      MSAFree(msa);
    }

  MSAFileClose(afp);
  exit(0);
}
/**
 * @brief Write alignment to file.
 *
 * @param[in] mseq
 * The mseq_t struct containing the aligned sequences. Must not be NULL
 * and must hold at least one sequence.
 * @param[in] pcAlnOutfile
 * The name of the output file. If NULL the alignment is written to
 * stdout.
 * @param[in] outfmt
 * The alignment output format (defined in squid.h)
 * @param[in] iWrap
 * length of line for Clustal/Fasta format
 * @param[in] bResno
 * Only used for Clustal output, where it is handed to WriteClustal()
 * as 1 (TRUE) or 0. Presumably toggles residue numbering -- confirm
 * against WriteClustal().
 *
 * @return Non-zero on error: unknown output format, no sequences,
 * output file could not be opened, or writing/closing the output file
 * failed.
 *
 * @note We create a temporary squid MSA struct in here because we never
 * use it within clustal. We might be better of using the old clustal
 * output routines instead.
 *
 */
int
WriteAlignment(mseq_t *mseq, const char *pcAlnOutfile, int outfmt, int iWrap, bool bResno)
{
    int i; /* aux */
    MSA *msa; /* squid's alignment structure */
    FILE *pfOut = NULL;
    int key; /* MSA struct internal index for sequence */
    int alen; /* alignment length */
    bool use_stdout;
    int iRet = 0; /* return value; set to -1 once an output error is seen */

    assert(mseq!=NULL);

    if (MSAFILE_UNKNOWN == outfmt) {
        Log(&rLog, LOG_ERROR, "Unknown output format chosen");
        return -1;
    }

    /* The alignment length is taken from the first sequence below, so
     * there has to be one. Checked before the output file is opened so
     * that no file is created (or truncated) and left open on failure.
     */
    if (mseq->nseqs < 1) {
        Log(&rLog, LOG_ERROR, "No sequences to write");
        return -1;
    }

    if (NULL == pcAlnOutfile) {
        pfOut = stdout;
        use_stdout = TRUE;
    } else {
        use_stdout = FALSE;
        if (NULL == (pfOut = fopen(pcAlnOutfile, "w"))) {
            Log(&rLog, LOG_ERROR, "Could not open file %s for writing", pcAlnOutfile);
            return -1;
        }
    }

    /* derive alignment length from first seq */
    alen = (int) strlen(mseq->seq[0]);

    msa = MSAAlloc(mseq->nseqs, alen);

    /* basic structure borrowed code from squid-1.9g/a2m.c:ReadA2M()
     * we actually create a copy of mseq. keeping the pointers becomes
     * messy when calling MSAFree()
     */
    for (i=0; i<mseq->nseqs; i++) {
        char *this_name = NULL; /* mseq sequence name */
        char *this_seq = NULL; /* mseq sequence */
        SQINFO *this_sqinfo = NULL; /* mseq sequence info */
        int iI;

        /* mseq->tree_order encodes to order in which sequences are listed
           in the guide-tree, if the user wants the sequence output in
           the input-order then mseq->tree_order==NULL, otherwise
           mseq->tree_order!=NULL, containing the indices of the
           sequences, FS, r274 -> */
        iI = (NULL == mseq->tree_order) ? i : mseq->tree_order[i];

        this_name = mseq->sqinfo[iI].name;
        this_seq = mseq->seq[iI];
        this_sqinfo = &mseq->sqinfo[iI];

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));

        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));

        if (this_sqinfo->flags & SQINFO_DESC) {
            /* FIXME never get here ... */
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        /* FIXME extend this by copying more stuff according to flags.
         * See MSAFileRead() in msa.c and used functions there
         *
         * Problem is that we never parse MSA information as we use squid'sSeqFile
         */

        msa->nseq++;

    } /* 0 <= i < mseq->nseqs */

    /* FIXME Would like to, but can't use MSAVerifyParse(msa) here, as it
     * will die on error. Need to implement our own version
     */
#if 0
    MSAVerifyParse(msa);
#endif

    /* The below is copy of MSAFileWrite() which originally only writes to stdout.
     */

    /* Be sloppy and make a2m and fasta the same. same for vienna (which is
       the same). same same. can can. boleh boleh */
    if (outfmt==SQFILE_FASTA)
        outfmt = MSAFILE_A2M;
    if (outfmt==SQFILE_VIENNA)
        outfmt = MSAFILE_VIENNA;

    switch (outfmt) {
    case MSAFILE_A2M:
        /*WriteA2M(pfOut, msa, 0);*/
        WriteA2M(pfOut, msa, iWrap);
        break;
    case MSAFILE_VIENNA:
        /*WriteA2M(pfOut, msa, 1);*/
        WriteA2M(pfOut, msa, INT_MAX);
        break;
    case MSAFILE_CLUSTAL:
        WriteClustal(pfOut, msa, iWrap, TRUE==bResno ? 1 : 0, mseq->seqtype);
        break;
    case MSAFILE_MSF:
        WriteMSF(pfOut, msa);
        break;
    case MSAFILE_PHYLIP:
        WritePhylip(pfOut, msa);
        break;
    case MSAFILE_SELEX:
        WriteSELEX(pfOut, msa);
        break;
    case MSAFILE_STOCKHOLM:
        WriteStockholm(pfOut, msa);
        break;
    default:
        Log(&rLog, LOG_FATAL, "internal error: %s",
            "invalid output format should have been detected before");
    }

    if (use_stdout == FALSE) {
        /* A write can fail while the writers run (error indicator set
         * on the stream) or only when the buffer is flushed by
         * fclose(); check both so a truncated file is not reported as
         * success.
         */
        if (ferror(pfOut)) {
            Log(&rLog, LOG_ERROR, "Error while writing to file %s", pcAlnOutfile);
            iRet = -1;
        }
        if (0 != fclose(pfOut)) {
            Log(&rLog, LOG_ERROR, "Could not close file %s", pcAlnOutfile);
            iRet = -1;
        }
        if (0 == iRet) {
            Log(&rLog, LOG_INFO, "Alignment written to %s", pcAlnOutfile);
        }
    }
    MSAFree(msa);

    return iRet;
}
/**
 * @brief Stripped down version of squid's alistat
 *
 * Copies the sequences into a temporary squid MSA struct and prints
 * summary statistics to stdout: number of sequences, residue counts,
 * smallest/largest/average dealigned length, alignment length and
 * average identity. Unless sampling is requested the extreme pairwise
 * identities are printed as well.
 *
 * Nothing is printed (a warning is logged instead) if both flags are
 * set, if the sequences are not aligned or if there are no sequences.
 *
 * @param[in] prMSeq
 * The alignment to analyse
 * @param[in] bSampling
 * For many sequences: samples from pool
 * @param[in] bReportAll
 * Report identities for all sequence pairs
 *
 * Don't have to worry about sequence case because our version of
 * PairwiseIdentity is case insensitive
 */
void
AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll) {

    /*
     * bSampling = squid's do_fast
     * bReportAll = squid's allreport
     */
    float **ppdIdentMx; /* identity matrix (squid: imx) */
    const int iNumSample = 1000; /* sample size (squid: nsample) */
    MSA *msa; /* squid's alignment structure */
    int small, large;
    int bestj, worstj;
    float sum;
    float worst_worst, worst_best, best_best;
    float avgid;
    int i, j;
    int nres; /* number of residues */

    if (bSampling && bReportAll) {
        Log(&rLog, LOG_WARN,
            "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__);
        return;
    }
    if (FALSE == prMSeq->aligned) {
        Log(&rLog, LOG_WARN,
            "Sequences are not aligned. Skipping %s()", __FUNCTION__);
        return;
    }
    /* The alignment length is derived from the first sequence and the
     * average length divides by the number of sequences, so an empty
     * set cannot be analysed.
     */
    if (prMSeq->nseqs < 1) {
        Log(&rLog, LOG_WARN,
            "No sequences. Skipping %s()", __FUNCTION__);
        return;
    }

    /* silence gcc warnings about uninitialized variables */
    worst_worst = worst_best = best_best = 0.0;
    bestj = worstj = -1;

    /** mseq to squid msa
     *
     * FIXME code overlap with WriteAlignment. Make it a function and take
     * code there (contains more comments) as template
     *
     */
    msa = MSAAlloc(prMSeq->nseqs,
                   /* derive alignment length from first seq */
                   strlen(prMSeq->seq[0]));
    for (i=0; i<prMSeq->nseqs; i++) {
        int key; /* MSA struct internal index for sequence */
        char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */
        char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */
        SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence info */

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));
        if (this_sqinfo->flags & SQINFO_DESC) {
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        msa->nseq++;
    }

    /* residue count and shortest/longest dealigned sequence */
    nres = 0;
    small = large = -1;
    for (i = 0; i < msa->nseq; i++) {
        int rlen; /* raw sequence length */
        rlen = DealignedLength(msa->aseq[i]);
        nres += rlen;
        if (small == -1 || rlen < small) small = rlen;
        if (large == -1 || rlen > large) large = rlen;
    }

    if (bSampling) {
        avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen,
                                            msa->nseq, iNumSample);

    } else {
        float best, worst;
        double dNumPairs; /* number of distinct sequence pairs */

        /* this might be slow...could use openmp inside squid */
        MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx);
        if (bReportAll) {
            printf(" %-15s %5s %7s %-15s %7s %-15s\n",
                   "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");
            printf(" --------------- ----- ------- --------------- ------- ---------------\n");
        }

        sum = 0.0;
        worst_best = 1.0;
        best_best = 0.0;
        worst_worst = 1.0;
        for (i = 0; i < msa->nseq; i++) {
            worst = 1.0;
            best = 0.0;

            /* Reset the partner indices for every row. The comparisons
             * below are strict, so a row whose identities are all 0.0
             * (or all 1.0) never assigns bestj (or worstj); without
             * the reset a stale index from the previous row -- or the
             * initial -1 -- would be used to index msa->sqname.
             * Default to the first sequence other than i; with a
             * single sequence there is no partner at all.
             */
            if (msa->nseq > 1) {
                bestj = worstj = (0 == i) ? 1 : 0;
            } else {
                bestj = worstj = -1;
            }

            for (j = 0; j < msa->nseq; j++) {
                /* closest seq to this one = best */
                if (i != j && ppdIdentMx[i][j] > best) {
                    best = ppdIdentMx[i][j];
                    bestj = j;
                }
                if (ppdIdentMx[i][j] < worst) {
                    worst = ppdIdentMx[i][j];
                    worstj = j;
                }
            }

            if (bReportAll) {
                printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",
                       msa->sqname[i],
                       DealignedLength(msa->aseq[i]),
                       best * 100.,
                       (bestj >= 0) ? msa->sqname[bestj] : "-",
                       worst * 100.,
                       (worstj >= 0) ? msa->sqname[worstj] : "-");
            }
            if (best > best_best) best_best = best;
            if (best < worst_best) worst_best = best;
            if (worst < worst_worst) worst_worst = worst;
            /* lower triangle only: every pair is counted once */
            for (j = 0; j < i; j++)
                sum += ppdIdentMx[i][j];
        }

        /* Pair count computed in double: nseq*(nseq-1) can overflow an
         * int. With a single sequence there are no pairs; report 0
         * rather than dividing by zero.
         */
        dNumPairs = (double) msa->nseq * (double) (msa->nseq - 1) / 2.0;
        if (dNumPairs > 0.0) {
            avgid = sum / (float) dNumPairs;
        } else {
            avgid = 0.0;
        }
        if (bReportAll)
            puts("");
        FMX2Free(ppdIdentMx);

    } /* else bSampling */

    /* Print output
     */
    if (msa->name != NULL)
        printf("Alignment name: %s\n", msa->name);
    /*printf("Format: %s\n", SeqfileFormat2String(afp->format));*/
    printf("Number of sequences: %d\n", msa->nseq);
    printf("Total # residues: %d\n", nres);
    printf("Smallest: %d\n", small);
    printf("Largest: %d\n", large);
    printf("Average length: %.1f\n", (float) nres / (float) msa->nseq);
    printf("Alignment length: %d\n", msa->alen);
    printf("Average identity: %.2f%%\n", 100.*avgid);

    if (! bSampling) {
        printf("Most related pair: %.2f%%\n", 100.*best_best);
        printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst);
        printf("Most distant seq: %.2f%%\n", 100.*worst_best);
    }

    /*
      char *cs;
      cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen);
      printf cs;
    */

    MSAFree(msa);
}