Пример #1
0
/* Function:		MsaPwIdent
 *
 * Purpose:		Average the pairwise identities of all distinct
 *			sequence pairs in an alignment. The identity matrix
 *			is built with MakeIdentityMx() and released with
 *			FMX2Free() before returning.
 *
 * Args:			*msa	alignment to analyse; only msa->aseq and
 *					msa->nseq are read here
 *
 * Returns:		mean of the strictly-lower-triangle entries of the
 *			identity matrix (one entry per unordered pair).
 *			An alignment with fewer than two sequences has no
 *			pairs; 1.0 is returned for it instead of 0/0.
 */
float MsaPwIdent(MSA *msa)
{
     float  **idmx; /* identity matrix */
     double sum;    /* double: a float accumulator stalls on very many pairs */
     double npairs; /* nseq*(nseq-1)/2, computed without int overflow */
     int i, j;

     /* No pairs to average over: avoid dividing zero by zero below. */
     if (msa->nseq < 2)
          return 1.0f;

     MakeIdentityMx(msa->aseq, msa->nseq, &idmx);
     sum = 0.0;
     for (i = 0; i < msa->nseq; i++)
          for (j = 0; j < i; j++)
               sum += idmx[i][j];
     FMX2Free(idmx);

     /* Convert before multiplying so large nseq cannot overflow an int. */
     npairs = (double) msa->nseq * (double) (msa->nseq - 1) / 2.0;

     return (float) (sum / npairs);
}
Пример #2
0
/**
 * @brief Stripped down version of squid's alistat
 *
 * Converts the alignment into a squid MSA structure, then prints summary
 * statistics to stdout: sequence count, residue counts, smallest/largest
 * dealigned length, alignment length and average pairwise identity.
 * Without sampling, the most related pair, the most unrelated pair and
 * the most distant sequence are printed as well.
 *
 * @param[in] prMSeq
 * The alignment to analyse
 * @param[in] bSampling
 * For many sequences: samples from pool
 * @param[in] bReportAll
 * Print one line per sequence naming its closest and its most distant
 * partner. Cannot be combined with bSampling.
 *
 * @note Nothing is printed (a warning is logged instead) if both flags
 * are set, if the sequences are not aligned, or if there are no
 * sequences at all. With a single sequence there are no pairs: the
 * average identity is reported as 100% and the pair statistics and the
 * per-sequence table are omitted.
 *
 * Don't have to worry about sequence case because our version of PairwiseIdentity is case insensitive
 */
void
AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll) {

    /*
     * bSampling = squid's do_fast
     * bReportAll = squid's allreport
     */
    float  **ppdIdentMx;  /* identity matrix (squid: imx) */
    const int iNumSample = 1000; /* sample size (squid: nsample) */


    MSA *msa; /* squid's alignment structure */
    int small, large;
    float worst_worst, worst_best, best_best;
    float avgid;
    int i, j;
    int nres; /* number of residues */
    bool bHavePairs; /* at least two sequences, i.e. something to compare */

    if (bSampling && bReportAll) {
        Log(&rLog, LOG_WARN,
            "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__);
        return;
    }
    if (FALSE == prMSeq->aligned) {
        Log(&rLog, LOG_WARN,
            "Sequences are not aligned. Skipping %s()", __FUNCTION__);
        return;
    }
    /* The alignment length is taken from seq[0] and the average length
     * divides by the sequence count, so an empty set cannot be handled.
     */
    if (prMSeq->nseqs < 1) {
        Log(&rLog, LOG_WARN,
            "No sequences to analyse. Skipping %s()", __FUNCTION__);
        return;
    }

    /* silence gcc warnings about uninitialized variables
     */
    worst_worst = worst_best = best_best = 0.0;


    /** mseq to squid msa
     *
     * FIXME code overlap with WriteAlignment. Make it a function and take
     * code there (contains more comments) as template
     *
     */
    msa  = MSAAlloc(prMSeq->nseqs,
                    /* derive alignment length from first seq */
                    strlen(prMSeq->seq[0]));
    for (i=0; i<prMSeq->nseqs; i++) {
        int key; /* MSA struct internal index for sequence */
        char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */
        char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */
        SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence info */

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));
        if (this_sqinfo->flags & SQINFO_DESC) {
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        msa->nseq++;
    }
    bHavePairs = (msa->nseq > 1);


    /* Residue totals and the shortest/longest dealigned length
     */
    nres = 0;
    small = large = -1;
    for (i = 0; i < msa->nseq; i++) {
        int rlen;		/* raw sequence length           */
        rlen  = DealignedLength(msa->aseq[i]);
        nres +=  rlen;
        if (small == -1 || rlen < small) small = rlen;
        if (large == -1 || rlen > large) large = rlen;
    }


    if (bSampling) {
        avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen,
                                            msa->nseq, iNumSample);

    } else if (! bHavePairs) {
        /* A lone sequence has no partner: report full identity rather
         * than dividing zero by zero, and skip the per-sequence table,
         * which would have no partner name to print.
         */
        avgid = 1.0;

    } else {
        double sum = 0.0; /* double: float stalls on very many pairs */
        /* converted before multiplying so large nseq cannot overflow */
        double npairs = (double) msa->nseq * (double) (msa->nseq - 1) / 2.0;

        /* this might be slow...could use openmp inside squid */
        MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx);
        if (bReportAll) {
            printf("  %-15s %5s %7s %-15s %7s %-15s\n",
                   "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");
            printf("  --------------- ----- ------- --------------- ------- ---------------\n");
        }

        worst_best  = 1.0;
        best_best   = 0.0;
        worst_worst = 1.0;
        for (i = 0; i < msa->nseq; i++) {
            float best = 0.0;
            float worst = 1.0;
            /* Partner indices are tracked per row and seeded from the
             * first other sequence, so they are always valid when
             * printed, even if all identities in the row are equal.
             */
            int bestj = -1;
            int worstj = -1;

            for (j = 0; j < msa->nseq; j++) {
                float ident;

                /* a sequence is never its own partner */
                if (i == j)
                    continue;
                ident = ppdIdentMx[i][j];

                /* closest seq to this one = best */
                if (bestj < 0 || ident > best)  {
                    best  = ident;
                    bestj = j;
                }
                if (worstj < 0 || ident < worst) {
                    worst = ident;
                    worstj = j;
                }
            }

            if (bReportAll)  {
                printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",
                       msa->sqname[i], DealignedLength(msa->aseq[i]),
                       best * 100.,  msa->sqname[bestj],
                       worst * 100., msa->sqname[worstj]);
            }
            if (best > best_best)    best_best = best;
            if (best < worst_best)   worst_best = best;
            if (worst < worst_worst) worst_worst = worst;
            /* lower triangle only: each unordered pair counted once */
            for (j = 0; j < i; j++)
                sum += ppdIdentMx[i][j];
        }
        avgid = (float) (sum / npairs);
        if (bReportAll)
            puts("");
        FMX2Free(ppdIdentMx);
    } /* else bSampling */



    /* Print output
     */
    if (msa->name != NULL)
        printf("Alignment name:      %s\n", msa->name);
    /*printf("Format:              %s\n",     SeqfileFormat2String(afp->format));*/
    printf("Number of sequences: %d\n", msa->nseq);
    printf("Total # residues:    %d\n", nres);
    printf("Smallest:            %d\n", small);
    printf("Largest:             %d\n", large);
    printf("Average length:      %.1f\n", (float) nres / (float) msa->nseq);
    printf("Alignment length:    %d\n", msa->alen);
    printf("Average identity:    %.2f%%\n", 100.*avgid);

    /* pair statistics exist only for the exhaustive comparison */
    if (! bSampling && bHavePairs) {
        printf("Most related pair:   %.2f%%\n", 100.*best_best);
        printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst);
        printf("Most distant seq:    %.2f%%\n", 100.*worst_best);
    }

    /*
    char *cs;
    cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen);
    printf cs;
    */

    MSAFree(msa);
}