Esempio n. 1
0
/**
 * @brief Stripped down version of squid's alistat: prints summary
 * statistics for an alignment to stdout.
 *
 * The sequences in prMSeq are copied into a temporary squid MSA
 * structure, which is freed again before returning. Reported are the
 * number of sequences, total/smallest/largest/average dealigned
 * length, the alignment length and the average pairwise identity.
 * Unless sampling is requested, the best and worst pairwise identities
 * are reported as well.
 *
 * The function logs a warning and returns without printing anything if
 * both bSampling and bReportAll are set, if the sequences are not
 * aligned, or if there are no sequences at all.
 *
 * @param[in] prMSeq
 * The alignment to analyse
 * @param[in] bSampling
 * For many sequences: estimate the average identity from a sample of
 * pairs instead of computing the full identity matrix
 * @param[in] bReportAll
 * Report best and worst identity partner for every sequence
 *
 * Don't have to worry about sequence case because our version of
 * PairwiseIdentity is case insensitive
 */
void
AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll) {

    /*
     * bSampling = squid's do_fast
     * bReportAll = squid's allreport
     */
    float  **ppdIdentMx;  /* identity matrix (squid: imx) */
    const int iNumSample = 1000; /* sample size (squid: nsample) */


    MSA *msa; /* squid's alignment structure */
    int small, large;
    float worst_worst, worst_best, best_best;
    float avgid;
    int i, j;
    int nres; /* number of residues */

    if (bSampling && bReportAll) {
        Log(&rLog, LOG_WARN,
            "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__);
        return;
    }
    if (FALSE == prMSeq->aligned) {
        Log(&rLog, LOG_WARN,
            "Sequences are not aligned. Skipping %s()", __FUNCTION__);
        return;
    }
    /* the alignment length is derived from the first sequence below,
     * so there has to be at least one
     */
    if (prMSeq->nseqs < 1) {
        Log(&rLog, LOG_WARN,
            "No sequences in alignment. Skipping %s()", __FUNCTION__);
        return;
    }

    /* silence gcc warnings about uninitialized variables
     */
    worst_worst = worst_best = best_best = 0.0;


    /** mseq to squid msa
     *
     * FIXME code overlap with WriteAlignment. Make it a function and take
     * code there (contains more comments) as template
     *
     */
    msa  = MSAAlloc(prMSeq->nseqs,
                    /* derive alignment length from first seq */
                    strlen(prMSeq->seq[0]));
    for (i=0; i<prMSeq->nseqs; i++) {
        int key; /* MSA struct internal index for sequence */
        char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */
        char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */
        SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence info */

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));
        if (this_sqinfo->flags & SQINFO_DESC) {
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        msa->nseq++;
    }


    /* residue count and shortest/longest dealigned sequence
     */
    nres = 0;
    small = large = -1;
    for (i = 0; i < msa->nseq; i++) {
        int rlen;		/* raw sequence length           */
        rlen  = DealignedLength(msa->aseq[i]);
        nres +=  rlen;
        if (small == -1 || rlen < small) small = rlen;
        if (large == -1 || rlen > large) large = rlen;
    }


    if (bSampling) {
        avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen,
                                            msa->nseq, iNumSample);

    } else {
        float best, worst;
        float sum;

        /* this might be slow...could use openmp inside squid */
        MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx);
        if (bReportAll) {
            printf("  %-15s %5s %7s %-15s %7s %-15s\n",
                   "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");
            printf("  --------------- ----- ------- --------------- ------- ---------------\n");
        }

        sum = 0.0;
        worst_best  = 1.0;
        best_best   = 0.0;
        worst_worst = 1.0;
        for (i = 0; i < msa->nseq; i++) {
            /* partner indices are per row; -1 means no partner beat
             * the initial best/worst value for this sequence
             */
            int bestj = -1;
            int worstj = -1;

            worst = 1.0;
            best  = 0.0;
            for (j = 0; j < msa->nseq; j++) {
                /* closest seq to this one = best */
                if (i != j && ppdIdentMx[i][j] > best)  {
                    best  = ppdIdentMx[i][j];
                    bestj = j;
                }
                if (ppdIdentMx[i][j] < worst) {
                    worst = ppdIdentMx[i][j];
                    worstj = j;
                }
            }

            if (bReportAll)  {
                /* print a placeholder instead of indexing sqname with
                 * -1 when no partner was found
                 */
                printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",
                       msa->sqname[i], DealignedLength(msa->aseq[i]),
                       best * 100.,  bestj  >= 0 ? msa->sqname[bestj]  : "-",
                       worst * 100., worstj >= 0 ? msa->sqname[worstj] : "-");
            }
            if (best > best_best)    best_best = best;
            if (best < worst_best)   worst_best = best;
            if (worst < worst_worst) worst_worst = worst;
            /* lower triangle only: each pair counted once */
            for (j = 0; j < i; j++)
                sum += ppdIdentMx[i][j];
        }
        if (msa->nseq > 1) {
            /* number of pairs computed in floating point so that
             * nseq*(nseq-1) cannot overflow int
             */
            avgid = sum / (float) ((double) msa->nseq * (msa->nseq-1) / 2.0);
        } else {
            /* a single sequence has no pairs; report full identity
             * instead of dividing 0 by 0
             */
            avgid = 1.0;
        }
        if (bReportAll)
            puts("");
        FMX2Free(ppdIdentMx);
    } /* else bSampling */



    /* Print output
     */
    if (msa->name != NULL)
        printf("Alignment name:      %s\n", msa->name);
    /*printf("Format:              %s\n",     SeqfileFormat2String(afp->format));*/
    printf("Number of sequences: %d\n", msa->nseq);
    printf("Total # residues:    %d\n", nres);
    printf("Smallest:            %d\n", small);
    printf("Largest:             %d\n", large);
    printf("Average length:      %.1f\n", (float) nres / (float) msa->nseq);
    printf("Alignment length:    %d\n", msa->alen);
    printf("Average identity:    %.2f%%\n", 100.*avgid);

    if (! bSampling) {
        printf("Most related pair:   %.2f%%\n", 100.*best_best);
        printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst);
        printf("Most distant seq:    %.2f%%\n", 100.*worst_best);
    }

    MSAFree(msa);
}
Esempio n. 2
0
/* Function: ReadInterleaved()
 * 
 * Purpose:  Read multiple aligned sequences from the interleaved
 *           alignment file seqfile. The file is read twice: a first
 *           pass counts the sequences and records, per block, the
 *           leftmost and rightmost columns holding sequence data; after
 *           allocating, a second pass copies names and aligned
 *           residues. The alignment is returned in ret_aseqs and the
 *           associated info is stored in ainfo.
 * 
 * Args:     seqfile:        name of alignment file to read.
 *           skip_header():  routine to skip the header of the file 
 *           parse_header(): routine to parse the header of the file
 *           is_dataline():  routine to determine if a line contains data
 *           ret_aseqs:      RETURN: 2D array of aligned sequences
 *           ainfo:          RETURN: optional alignment information
 *
 * Return:   Returns 1 on success. Returns 0 on failure; squid_errno is
 *           set to SQERR_NOFILE if the file cannot be opened and to
 *           SQERR_FORMAT on a malformed file. When skip_header() or
 *           parse_header() fail, squid_errno is left as they set it.
 *           The file and the per-block bookkeeping are released on
 *           every path. *ret_aseqs is only written on success.
 */
int
ReadInterleaved(char *seqfile, 
                int (*skip_header)(FILE *),
                int (*parse_header)(FILE *, AINFO *),
                int (*is_dataline)(char *, char *),
                char ***ret_aseqs, AINFO *ainfo)
{
  FILE    *fp;                  /* ptr to opened seqfile        */
  char     buffer[LINEBUFLEN];  /* input buffer for lines       */
  char   **aseqs;               /* aligned seqs                 */
  int      nseq;                /* number of seqs read          */
  int      alen;                /* width of alignment           */
  struct block_struc {          /** alignment data for a block: */
    int lcol;                   /* furthest left aligned sym    */
    int rcol;                   /* furthest right aligned sym   */
  } *blocks = NULL;             /* NULL so cleanup can always free it */
  int      blocknum;            /* number of blocks             */
  char    *sptr;                /* ptr into line during parsing */
  int      currblock;           /* index for blocks             */
  int      idx;                 /* counter for seqs             */
  int      currlen;
  int      inblock;             /* TRUE if in a block of data   */
  int      at_eof;              /* TRUE once fgets() returned NULL */
  int      last;                /* index of last non-space char in line */
  int      pos;
  int      status = 0;          /* return value; 1 only on success */


                        /* open the file for reading; skip header*/
  fp = fopen(seqfile, "r");
  if (fp == NULL) { squid_errno = SQERR_NOFILE; return 0; }
  if (! (*skip_header) (fp)) goto CLEANUP;

  /***************************************************
   * First pass across file. 
   * Determine # of seqs and width of alignment so we can alloc.
   ***************************************************/

  blocknum = 0;
  nseq     = 0; 
  alen     = 0;
  inblock  = FALSE;
  at_eof   = FALSE;
  /* Loop ends when fgets() returns NULL. Testing feof() instead would
   * loop forever on a read error, because the EOF flag is then never set.
   */
  while (!at_eof)
    {
                                /* allocate for info about this block. */
      if (blocknum == 0)
        blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
      else 
        blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
      blocks[blocknum].lcol = LINEBUFLEN+1;
      blocks[blocknum].rcol = -1;

      idx = 0;
      /*CONSTCOND*/
      while (1)                 /* breaks out when blank line or EOF is hit, see below */
        {
                                /* get a data line */
          do {
            if (fgets(buffer, LINEBUFLEN, fp) == NULL) 
              { at_eof = TRUE; goto BREAKOUT; }                   /* end of file  */
            if (inblock && is_blankline(buffer)) goto BREAKOUT;   /* end of block */
          } while (! (*is_dataline)(buffer, NULL));

          inblock = TRUE;
          if (blocknum == 0) nseq++; /* count nseq in first block */
          idx++;                     /* count # of seqs in subsequent blocks */

                                /* get rcol for this block: index of the
                                 * last non-space character; never steps
                                 * before the start of the buffer */
          last = (int) strlen(buffer) - 1;
          while (last > 0 && isspace((unsigned char) buffer[last]))
            last--;
          if (last > blocks[blocknum].rcol)
            blocks[blocknum].rcol = last;

                                /* get lcol for this block: start of the
                                 * second whitespace-separated field */
          sptr = buffer + strspn(buffer, WHITESPACE);      /* start of name */
          if (*sptr == '\0')
            { squid_errno = SQERR_FORMAT; goto CLEANUP; }
          sptr += strcspn(sptr, WHITESPACE);               /* end of name */
          sptr += strspn(sptr, WHITESPACE);                /* start of sequence */
          if (*sptr == '\0')
            { squid_errno = SQERR_FORMAT; goto CLEANUP; }
          if (sptr - buffer < blocks[blocknum].lcol) 
            blocks[blocknum].lcol = (int) (sptr - buffer);
        }

    BREAKOUT:                   /* end of a block */
      if (inblock) 
        {
          /* every block must hold as many lines as the first one */
          if (idx != nseq) { squid_errno = SQERR_FORMAT; goto CLEANUP; }
          alen += blocks[blocknum].rcol - blocks[blocknum].lcol + 1;
          blocknum++;
          inblock = FALSE;
        }
    }

  /***************************************************
   * Allocations; rewind file for second pass
   ***************************************************/

  AllocAlignment(nseq, alen, &aseqs, ainfo);
  rewind(fp);

  /* NOTE(review): from here on the failure paths release the file and
   * the block table but not the alignment allocated above; the matching
   * free routine is not visible in this part of the file.
   */

  /***************************************************
   * Parse file header, if any. 
   * Note that we needed to know the number of seqs
   * before attempting to parse the header, because we
   * needed to allocate the alignment and assoc. info.
   ***************************************************/

  if (! (*parse_header)(fp, ainfo)) goto CLEANUP;

 /***************************************************
  * Second pass across file: parse in the names, aseqs.
  ***************************************************/

  currlen = 0;
  for (currblock = 0 ; currblock < blocknum; currblock++)
    {
      for (idx = 0; idx < nseq; idx++)
        {
                        /* get next data line; once a name is known for
                         * this slot it is handed to is_dataline() */
          do {
            if (fgets(buffer, LINEBUFLEN, fp) == NULL) 
              { squid_errno = SQERR_FORMAT; goto CLEANUP; }
          } while (! (*is_dataline)(buffer, ainfo->sqinfo[idx].flags & SQINFO_NAME ?
                                    ainfo->sqinfo[idx].name : NULL));

                                /* find right boundary of name */
          sptr = buffer;
          while (*sptr && isspace((unsigned char) *sptr))  sptr++;
          if (ainfo->sqinfo[idx].flags & SQINFO_NAME) 
            while (*sptr && !isspace((unsigned char) *sptr)) sptr++; 
          else
            {                   /* first time we've seen name */
              pos = 0;
              while (*sptr && !isspace((unsigned char) *sptr) && pos < SQINFO_NAMELEN-1) {
                ainfo->sqinfo[idx].name[pos++] = *sptr;
                sptr++;
              }
              ainfo->sqinfo[idx].name[pos] = '\0';
              ainfo->sqinfo[idx].flags    |= SQINFO_NAME;
            }

                                /* parse alignment line */
          if (! copy_alignment_line(aseqs[idx], currlen, sptr - buffer,
                                    buffer, blocks[currblock].lcol, 
                                    blocks[currblock].rcol))
            { squid_errno = SQERR_FORMAT; goto CLEANUP; }
        }
      currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;
    }

                                /* Tidy up. */
  for (idx = 0; idx < nseq; idx++)
    {
      aseqs[idx][alen] = '\0';
      homogenize_gapsym(aseqs[idx], (char) '.');
      ainfo->sqinfo[idx].len    = DealignedLength(aseqs[idx]);
      ainfo->sqinfo[idx].flags |= SQINFO_LEN;
    }
  MingapAlignment(aseqs, ainfo);

  *ret_aseqs = aseqs;
  status = 1;

 /***************************************************
   * Garbage collection and return (success and failure)
   ***************************************************/
 CLEANUP:
  fclose(fp);
  free(blocks);
  return status;
}