/* Function: MsaPwIdent * * Purpose: * * Args: *msa * * Returns: pairwise sequence identity of msa file */ float MsaPwIdent(MSA *msa) { float **idmx; /* identity matrix */ float sum, avgid; int i, j; MakeIdentityMx(msa->aseq, msa->nseq, &idmx); sum=0.0; for (i = 0; i < msa->nseq; i++) for (j = 0; j < i; j++) sum += idmx[i][j]; avgid = sum / (float) (msa->nseq * (msa->nseq-1)/2.0); FMX2Free(idmx); return avgid; }
/** * @brief Stripped down version of squid's alistat * * * @param[in] prMSeq * The alignment to analyse * @param[in] bSampling * For many sequences: samples from pool * @param[in] bReportAll * Report identities for all sequence pairs * * Don't have to worry about sequence case because our version of PairwiseIdentity is case insensitive */ void AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll) { /* * bSampling = squid's do_fast * bReportAll = squid's allreport */ float **ppdIdentMx; /* identity matrix (squid: imx) */ const int iNumSample = 1000; /* sample size (squid: nsample) */ MSA *msa; /* squid's alignment structure */ int small, large; int bestj, worstj; float sum; float worst_worst, worst_best, best_best; float avgid; int i, j; int nres; /* number of residues */ if (bSampling && bReportAll) { Log(&rLog, LOG_WARN, "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__); return; } if (FALSE == prMSeq->aligned) { Log(&rLog, LOG_WARN, "Sequences are not aligned. Skipping %s()", __FUNCTION__); return; } /* silence gcc warnings about uninitialized variables */ worst_worst = worst_best = best_best = 0.0; bestj = worstj = -1; /** mseq to squid msa * * FIXME code overlap with WriteAlignment. Make it a function and take * code there (contains more comments) as template * */ msa = MSAAlloc(prMSeq->nseqs, /* derive alignment length from first seq */ strlen(prMSeq->seq[0])); for (i=0; i<prMSeq->nseqs; i++) { int key; /* MSA struct internal index for sequence */ char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */ char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */ SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence name */ key = GKIStoreKey(msa->index, this_name); msa->sqname[key] = sre_strdup(this_name, strlen(this_name)); /* setting msa->sqlen[idx] and msa->aseq[idx] */ msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key], this_seq, strlen(this_seq)); if (this_sqinfo->flags & SQINFO_DESC) { MSASetSeqDescription(msa, key, this_sqinfo->desc); } msa->nseq++; } nres = 0; small = large = -1; for (i = 0; i < msa->nseq; i++) { int rlen; /* raw sequence length */ rlen = DealignedLength(msa->aseq[i]); nres += rlen; if (small == -1 || rlen < small) small = rlen; if (large == -1 || rlen > large) large = rlen; } if (bSampling) { avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen, msa->nseq, iNumSample); } else { float best, worst; /* this might be slow...could use openmp inside squid */ MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx); if (bReportAll) { printf(" %-15s %5s %7s %-15s %7s %-15s\n", "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)"); printf(" --------------- ----- ------- --------------- ------- ---------------\n"); } sum = 0.0; worst_best = 1.0; best_best = 0.0; worst_worst = 1.0; for (i = 0; i < msa->nseq; i++) { worst = 1.0; best = 0.0; for (j = 0; j < msa->nseq; j++) { /* closest seq to this one = best */ if (i != j && ppdIdentMx[i][j] > best) { best = ppdIdentMx[i][j]; bestj = j; } if (ppdIdentMx[i][j] < worst) { worst = ppdIdentMx[i][j]; worstj = j; } } if (bReportAll) { printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n", msa->sqname[i], DealignedLength(msa->aseq[i]), best * 100., msa->sqname[bestj], worst * 100., msa->sqname[worstj]); } if (best > best_best) best_best = best; if (best < worst_best) worst_best = best; if (worst < worst_worst) worst_worst = worst; for (j = 0; j < i; j++) sum += ppdIdentMx[i][j]; } avgid = sum / (float) (msa->nseq * (msa->nseq-1)/2.0); if (bReportAll) puts(""); FMX2Free(ppdIdentMx); } /* else bSampling */ /* Print output */ if (msa->name != NULL) printf("Alignment name: %s\n", msa->name); /*printf("Format: %s\n", SeqfileFormat2String(afp->format));*/ printf("Number of sequences: %d\n", msa->nseq); printf("Total # residues: %d\n", nres); printf("Smallest: %d\n", small); printf("Largest: %d\n", large); printf("Average length: %.1f\n", (float) nres / (float) msa->nseq); printf("Alignment length: %d\n", msa->alen); printf("Average identity: %.2f%%\n", 100.*avgid); if (! bSampling) { printf("Most related pair: %.2f%%\n", 100.*best_best); printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst); printf("Most distant seq: %.2f%%\n", 100.*worst_best); } /* char *cs; cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen); printf cs; */ MSAFree(msa); }