Ejemplo n.º 1
0
/* Function: MSAAddGS()
 * Date:     SRE, Wed Jun  2 06:57:03 1999 [St. Louis]
 *
 * Purpose:  Attach one unparsed #=GS (per-sequence) markup line to
 *           an alignment, growing the tag tables when a tag is
 *           seen for the first time.
 *
 *           The same tag may occur more than once for a single
 *           sequence (Pfam "DR PDB;" structure links, for
 *           example). Repeated values are appended to the text
 *           already stored, separated by a '\n'.
 *
 * Args:     msa    - multiple alignment structure
 *           tag    - markup tag (e.g. "AC")
 *           sqidx  - index of the annotated sequence; it is used
 *                    unchecked to index a per-tag array that holds
 *                    msa->nseqalloc entries
 *           value  - markup (e.g. "P00666")
 *
 * Returns:  (void)
 */
void
MSAAddGS(MSA *msa, char *tag, int sqidx, char *value)
{
  int    tagidx;
  int    k;
  int    curlen;
  char **slot;

  if (msa->gs_tag != NULL)
    {
      /* Tag tables already exist: look the tag up, and grow every
       * table by exactly one entry when the tag is unknown.
       */
      tagidx = GKIKeyIndex(msa->gs_idx, tag);
      if (tagidx < 0)
        {
          tagidx = GKIStoreKey(msa->gs_idx, tag);
          SQD_DASSERT1((tagidx == msa->ngs));
          msa->gs_tag = ReallocOrDie(msa->gs_tag, (msa->ngs+1) * sizeof(char *));
          msa->gs     = ReallocOrDie(msa->gs, (msa->ngs+1) * sizeof(char **));
          msa->gs[msa->ngs] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
          k = 0;
          while (k < msa->nseqalloc)
            {
              msa->gs[msa->ngs][k] = NULL;
              k++;
            }
        }
    }
  else
    {
      /* Very first GS tag: create the key index and one-entry tables. */
      msa->gs_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gs_idx, tag);
      SQD_DASSERT1((tagidx == 0));
      msa->gs_tag = MallocOrDie(sizeof(char *));
      msa->gs     = MallocOrDie(sizeof(char **));
      msa->gs[0]  = MallocOrDie(sizeof(char *) * msa->nseqalloc);
      k = 0;
      while (k < msa->nseqalloc)
        {
          msa->gs[0][k] = NULL;
          k++;
        }
    }

  /* A tag index equal to the current count marks a tag that was just
   * added: record its name and count it.
   */
  if (tagidx == msa->ngs)
    {
      msa->gs_tag[tagidx] = sre_strdup(tag, -1);
      msa->ngs++;
    }

  slot = &(msa->gs[tagidx][sqidx]);
  if (*slot == NULL)
    {
      /* first annotation of this sequence with this tag */
      *slot = sre_strdup(value, -1);
      return;
    }

  /* Already annotated: append a newline, then the new value. */
  curlen = sre_strcat(slot, -1, "\n", 1);
  if (curlen < 0)
    Die("failed to sre_strcat()");
  if (sre_strcat(slot, curlen, value, -1) < 0)
    Die("failed to sre_strcat()");
  return;
}
Ejemplo n.º 2
0
/* Function: MSAAppendGR()
 * Date:     SRE, Thu Jun  3 06:34:38 1999 [Madison]
 *
 * Purpose:  Attach one unparsed #=GR (per-sequence, per-column)
 *           markup line to an alignment, growing the tag tables
 *           when a tag is seen for the first time.
 *
 *           Calling this repeatedly with the same tag and sequence
 *           concatenates the values, which is what a parser of
 *           multiblock alignment files needs.
 *
 * Args:     msa    - multiple alignment structure
 *           tag    - markup tag (e.g. "SS")
 *           sqidx  - index of the annotated sequence; it is used
 *                    unchecked to index a per-tag array that holds
 *                    msa->nseqalloc entries
 *           value  - markup, one char per aligned column
 *
 * Returns:  (void)
 */
void
MSAAppendGR(MSA *msa, char *tag, int sqidx, char *value)
{
  int tagidx;
  int k;

  if (msa->gr_tag != NULL)
    {
      /* Tag tables already exist: look the tag up, and grow every
       * table by exactly one entry when the tag is unknown.
       */
      tagidx = GKIKeyIndex(msa->gr_idx, tag);
      if (tagidx < 0)
        {
          tagidx = GKIStoreKey(msa->gr_idx, tag);
          SQD_DASSERT1((tagidx == msa->ngr));
          msa->gr_tag       = ReallocOrDie(msa->gr_tag, (msa->ngr+1) * sizeof(char *));
          msa->gr           = ReallocOrDie(msa->gr, (msa->ngr+1) * sizeof(char **));
          msa->gr[msa->ngr] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
          k = 0;
          while (k < msa->nseqalloc)
            {
              msa->gr[msa->ngr][k] = NULL;
              k++;
            }
        }
    }
  else
    {
      /* Very first GR tag: one-entry tables, then the key index. */
      msa->gr_tag = MallocOrDie(sizeof(char *));
      msa->gr     = MallocOrDie(sizeof(char **));
      msa->gr[0]  = MallocOrDie(sizeof(char *) * msa->nseqalloc);
      k = 0;
      while (k < msa->nseqalloc)
        {
          msa->gr[0][k] = NULL;
          k++;
        }
      msa->gr_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gr_idx, tag);
      SQD_DASSERT1((tagidx == 0));
    }

  /* A tag index equal to the current count marks a tag that was just
   * added: record its name and count it.
   */
  if (tagidx == msa->ngr)
    {
      msa->gr_tag[tagidx] = sre_strdup(tag, -1);
      msa->ngr++;
    }

  /* The slot starts out NULL and is handed to sre_strcat() by address;
   * the return value (new length) is not needed here.
   */
  sre_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1);
  return;
}
Ejemplo n.º 3
0
/* Function: ReadPhylip()
 * Date:     SRE, Fri Jun 18 12:59:37 1999 [Sanger Centre]
 *
 * Purpose:  Parse an alignment from an open Phylip format
 *           alignment file. Phylip is a single-alignment format.
 *           Return the alignment, or NULL if we have no data.
 *
 *           The first nonblank line is the header and must hold
 *           two positive integers, nseq and alen. The remaining
 *           nonblank lines come in blocks of nseq lines; in the
 *           first block each line starts with a 10-character
 *           name field, followed by sequence.
 *
 * Args:     afp - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   Caller responsible for an MSAFree()
 *           NULL if no more alignments (EOF already reached, or
 *           no header line was found)
 *
 * Diagnostics:
 *           Calls Die() on a malformed header, on a nseq/alen
 *           that is not positive, and on a line from which no
 *           sequence can be parsed.
 */
MSA *
ReadPhylip(MSAFILE *afp)
{
  MSA  *msa;
  char *s, *s1, *s2;
  char  name[11];		/* seq name max len = 10 char */
  int   nseq, alen;
  int   have_header;		/* TRUE once the nseq/alen line was parsed */
  int   idx;			/* index of current sequence */
  int   slen;
  int   nblock;
  
  if (feof(afp->f)) return NULL;

  /* Skip until we see a nonblank line; it's the header,
   * containing nseq/alen
   */
  nseq = 0; alen = 0; have_header = 0;
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue;
      if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
	Die("Failed to parse nseq/alen from first line of PHYLIP file %s\n", afp->fname);
      if (! IsInt(s1) || ! IsInt(s2))
	Die("nseq and/or alen not an integer in first line of PHYLIP file %s\n", afp->fname);
      nseq = atoi(s1);
      alen = atoi(s2);
      have_header = 1;
      break;
    }

  /* Only blank lines (or nothing) before EOF: there is no alignment. */
  if (! have_header) return NULL;

  /* idx cycles through 0..nseq-1 below; with nseq < 1 it would never
   * wrap and would index past the per-sequence arrays.
   */
  if (nseq < 1 || alen < 1)
    Die("nseq and alen must be positive in first line of PHYLIP file %s\n", afp->fname);

  msa = MSAAlloc(nseq, 0);
  idx    = 0;
  nblock = 0;
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      /* ignore blank lines. nonblank lines start w/ nonblank char */
      if (*s == '\0' || isspace((unsigned char) *s)) continue;
				/* First block has seq names */
      if (nblock == 0) {
	/* The name field is a fixed 10 chars; make sure the line really
	 * is that long before copying it and stepping over it.
	 */
	if (strlen(s) < 10)
	  Die("Failed to parse sequence at line %d of PHYLIP file %s\n", 
	      afp->linenumber, afp->fname);
	memcpy(name, s, 10);
	name[10] = '\0';
	GKIStoreKey(msa->index, name);
	msa->sqname[idx] = sre_strdup(name, -1);
	s += 10;		
      }
				/* be careful of trailing whitespace on lines */
      if ((s1 = sre_strtok(&s, WHITESPACE, &slen)) == NULL)
	Die("Failed to parse sequence at line %d of PHYLIP file %s\n", 
	    afp->linenumber, afp->fname);
      msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], s1, slen);

      idx++;
      if (idx == nseq) { idx = 0; nblock++; }
    }
  msa->nseq = nseq;
  MSAVerifyParse(msa);		/* verifies; sets alen, wgt; frees sqlen[] */
  return msa;
}
Ejemplo n.º 4
0
/* Function: MSAAppendGC()
 * Date:     SRE, Thu Jun  3 06:25:14 1999 [Madison]
 *
 * Purpose:  Add an unparsed #=GC (per-column) markup line to the
 *           MSA structure, allocating as necessary. 
 *           
 *           When called multiple times for the same tag,
 *           appends value strings together -- used when
 *           parsing multiblock alignment files, for
 *           example.
 *
 * Args:     msa   - multiple alignment structure
 *           tag   - markup tag (e.g. "CS")
 *           value - markup, one char per aligned column      
 *
 * Returns:  (void)
 */
void
MSAAppendGC(MSA *msa, char *tag, char *value)
{
  int tagidx;

  /* Is this an unparsed tag name that we recognize?
   * If not, handle adding it to index, and reallocating
   * as needed.
   */
  if (msa->gc_tag == NULL)	/* first tag? init w/ malloc  */
    {
      msa->gc_tag = MallocOrDie(sizeof(char *));
      msa->gc     = MallocOrDie(sizeof(char *));
      msa->gc_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gc_idx, tag);
      SQD_DASSERT1((tagidx == 0));
      msa->gc[0]  = NULL;
    }
  else
    {			/* new tag? */
      tagidx  = GKIKeyIndex(msa->gc_idx, tag); 
      if (tagidx < 0) {		/* it's a new tag name; realloc */
	tagidx = GKIStoreKey(msa->gc_idx, tag);
				/* since we alloc in blocks of 1,
				   we always realloc upon seeing 
				   a new tag. */
	SQD_DASSERT1((tagidx == msa->ngc));
				/* both tables hold one string per tag,
				   so the element size is that of a
				   char *, as in the mallocs above */
	msa->gc_tag = ReallocOrDie(msa->gc_tag, (msa->ngc+1) * sizeof(char *));
	msa->gc     = ReallocOrDie(msa->gc, (msa->ngc+1) * sizeof(char *));
	msa->gc[tagidx] = NULL;
      }
    }

  /* tagidx == ngc means the tag was just added: name it, count it */
  if (tagidx == msa->ngc) {
    msa->gc_tag[tagidx] = sre_strdup(tag, -1);
    msa->ngc++;
  }
  /* the new length returned by sre_strcat() is not needed here */
  sre_strcat(&(msa->gc[tagidx]), -1, value, -1);
  return;
}
Ejemplo n.º 5
0
/* Function: MSAGetSeqidx()
 * Date:     SRE, Wed May 19 15:08:25 1999 [St. Louis]
 *
 * Purpose:  Map a sequence name to its index in an MSA structure,
 *           registering the name if it has not been seen before.
 *
 *           The lookup proceeds in three steps:
 *           1) if <guess> is a valid index and names that very
 *              sequence, use it;
 *           2) otherwise ask the msa's key index;
 *           3) otherwise store the name as a new key, expand the
 *              allocation if the new index does not fit, save
 *              the name and bump msa->nseq.
 *
 * Args:     msa   - alignment object
 *           name  - a sequence name
 *           guess - a guess at the right index, or -1 if no guess.
 *
 * Returns:  seqidx
 */
int
MSAGetSeqidx(MSA *msa, char *name, int guess)
{
  int found;
  int guess_in_range;

  /* step 1: the caller's guess, if it is usable and correct */
  guess_in_range = (guess >= 0 && guess < msa->nseq);
  if (guess_in_range && strcmp(name, msa->sqname[guess]) == 0)
    return guess;

  /* step 2: a name we already know */
  found = GKIKeyIndex(msa->index, name);
  if (found < 0)
    {
      /* step 3: a new name */
      found = GKIStoreKey(msa->index, name);
      if (found >= msa->nseqalloc)
        MSAExpand(msa);

      msa->sqname[found] = sre_strdup(name, -1);
      msa->nseq++;
    }
  return found;
}
Ejemplo n.º 6
0
Archivo: a2m.c Proyecto: rforge/phyexe
/* Function: ReadA2M()
 * Date:     SRE, Sun Jun  6 17:11:29 1999 [bus from Madison 1999 worm mtg]
 *
 * Purpose:  Read the single alignment held in an open A2M format
 *           file. Every line starting with '>' opens a new
 *           sequence (name, then optional description); the
 *           first whitespace-delimited token of each other line
 *           is appended to the current sequence. Lines that
 *           precede the first '>' are ignored.
 *
 * Args:     afp - open alignment file
 *
 * Returns:  MSA *  - an alignment object. 
 *                    Caller responsible for an MSAFree()
 *           NULL if the file is already at EOF or holds no
 *           '>' line.
 */
MSA *
ReadA2M(MSAFILE *afp)
{
  MSA  *msa;
  char *line;
  char *name = NULL;	/* stays NULL until the first '>' line */
  char *desc;
  char *residues;
  int   idx = 0;
  int   namelen, desclen, reslen;

  if (feof(afp->f)) return NULL;

  msa = MSAAlloc(10, 0);
  for (line = MSAFileGetLine(afp); line != NULL; line = MSAFileGetLine(afp))
    {
      if (*line != '>')
        {
          /* sequence data; only meaningful once a name was seen */
          if (name == NULL) continue;
          residues = sre_strtok(&line, WHITESPACE, &reslen);
          if (residues == NULL) continue;
          msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], residues, reslen);
          continue;
        }

      /* header line: step over the '>', then name and description */
      line++;
      name = sre_strtok(&line, WHITESPACE, &namelen);
      if (name == NULL)
        Die("Blank name in A2M file %s (line %d)\n", afp->fname, afp->linenumber);
      desc = sre_strtok(&line, "\n", &desclen);

      idx = GKIStoreKey(msa->index, name);
      if (idx >= msa->nseqalloc)
        MSAExpand(msa);

      msa->sqname[idx] = sre_strdup(name, namelen);
      if (desc != NULL)
        MSASetSeqDescription(msa, idx, desc);
      msa->nseq++;
    }

  /* no '>' line at all: nothing was read */
  if (name == NULL)
    {
      MSAFree(msa);
      return NULL;
    }

  MSAVerifyParse(msa);
  return msa;
}
Ejemplo n.º 7
0
/**
 * @brief Write alignment to file.
 *
 * @param[in] mseq
 * The mseq_t struct containing the aligned sequences
 * @param[in] pcAlnOutfile
 * The name of the output file; NULL writes to stdout
 * @param[in] outfmt
 * The alignment output format (defined in squid.h)
 * @param[in] iWrap
 * length of line for Clustal/Fasta format
 * @param[in] bResno
 * Passed on to WriteClustal() as 1 (TRUE) or 0; only used for
 * Clustal output
 *
 * @return Non-zero on error: unknown output format, no sequences,
 * output file could not be opened, or closing it failed
 *
 * @note We create a temporary squid MSA struct in here because we never
 * use it within clustal. We might be better of using the old clustal
 * output routines instead.
 *
 */
int
WriteAlignment(mseq_t *mseq, const char *pcAlnOutfile, int outfmt, int iWrap, bool bResno)
{
    int i; /* aux */
    MSA *msa; /* squid's alignment structure */
    FILE *pfOut = NULL;
    int key; /* MSA struct internal index for sequence */
    int alen; /* alignment length */
    bool use_stdout;

    assert(mseq!=NULL);

    if (MSAFILE_UNKNOWN == outfmt) {
        Log(&rLog, LOG_ERROR, "Unknown output format chosen");
        return -1;
    }

    /* the alignment length is read from mseq->seq[0] below, so there
     * has to be at least one sequence. Checked before the output file
     * is opened so that nothing is created for an empty input.
     */
    if (mseq->nseqs < 1) {
        Log(&rLog, LOG_ERROR, "No sequences to write");
        return -1;
    }

    if (NULL == pcAlnOutfile) {
        pfOut = stdout;
        use_stdout = TRUE;
    } else {
        use_stdout = FALSE;
        if (NULL == (pfOut = fopen(pcAlnOutfile, "w"))) {
            Log(&rLog, LOG_ERROR, "Could not open file %s for writing", pcAlnOutfile);
            return -1;
        }
    }


    /* derive alignment length from first seq */
    alen = strlen(mseq->seq[0]);

    msa  = MSAAlloc(mseq->nseqs, alen);

    /* basic structure borrowed code from squid-1.9g/a2m.c:ReadA2M()
     * we actually create a copy of mseq. keeping the pointers becomes
     * messy when calling MSAFree()
     */
    for (i=0; i<mseq->nseqs; i++) {
        char *this_name = NULL; /* mseq sequence name */
        char *this_seq = NULL; /* mseq sequence */
        SQINFO *this_sqinfo = NULL; /* mseq sequence info */
        int iI;

        /* mseq->tree_order encodes to order in which sequences are listed in the guide-tree,
           if the user wants the sequence output in the input-order then mseq->tree_order==NULL,
           otherwise mseq->tree_order!=NULL, containing the indices of the sequences, FS, r274 ->  */
        iI = (NULL == mseq->tree_order) ? i : mseq->tree_order[i];

        this_name = mseq->sqinfo[iI].name;
        this_seq = mseq->seq[iI];
        this_sqinfo = &mseq->sqinfo[iI];

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));

        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));

        if (this_sqinfo->flags & SQINFO_DESC) {
            /* FIXME never get here ... */
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        /* FIXME extend this by copying more stuff according to flags.
         * See MSAFileRead() in msa.c and used functions there
         *
         * Problem is that we never parse MSA information as we use squid'sSeqFile
         */

        msa->nseq++;

    } /* 0 <= i < mseq->nseqs */


    /* FIXME Would like to, but can't use MSAVerifyParse(msa) here, as it
     * will die on error. Need to implement our own version
     */
#if 0
    MSAVerifyParse(msa);
#endif

    /* The below is copy of MSAFileWrite() which originally only writes to stdout.
     */

    /* Be sloppy and make a2m and fasta the same. same for vienna (which is
       the same). same same. can can. boleh boleh */
    if (outfmt==SQFILE_FASTA)
        outfmt = MSAFILE_A2M;
    if (outfmt==SQFILE_VIENNA)
        outfmt = MSAFILE_VIENNA;

    switch (outfmt) {
    case MSAFILE_A2M:
        WriteA2M(pfOut, msa, iWrap);
        break;
    case MSAFILE_VIENNA:
        /* INT_MAX as wrap length: effectively no line wrapping */
        WriteA2M(pfOut, msa, INT_MAX);
        break;
    case MSAFILE_CLUSTAL:
        WriteClustal(pfOut, msa, iWrap, TRUE==bResno ? 1 : 0, mseq->seqtype);
        break;
    case MSAFILE_MSF:
        WriteMSF(pfOut, msa);
        break;
    case MSAFILE_PHYLIP:
        WritePhylip(pfOut, msa);
        break;
    case MSAFILE_SELEX:
        WriteSELEX(pfOut, msa);
        break;
    case MSAFILE_STOCKHOLM:
        WriteStockholm(pfOut, msa);
        break;
    default:
        Log(&rLog, LOG_FATAL, "internal error: %s",
            "invalid output format should have been detected before");
    }

    if (use_stdout == FALSE) {
        /* fclose() flushes buffered output; a failure here means the
         * alignment did not make it to disk completely.
         */
        if (0 != fclose(pfOut)) {
            Log(&rLog, LOG_ERROR, "Could not close file %s after writing", pcAlnOutfile);
            MSAFree(msa);
            return -1;
        }
        Log(&rLog, LOG_INFO,
            "Alignment written to %s", pcAlnOutfile);
    }
    MSAFree(msa);

    return 0;
}
Ejemplo n.º 8
0
/**
 * @brief Stripped down version of squid's alistat
 *
 * Prints sequence count, residue totals, smallest/largest/average
 * dealigned length, alignment length and average pairwise identity
 * to stdout. Without sampling it also prints the best and worst
 * pairwise identities, and with bReportAll one line per sequence
 * naming its closest and most distant partner ("-" when a row of
 * the identity matrix offers no such partner).
 *
 * Nothing is printed (a warning is logged instead) when both
 * bSampling and bReportAll are set, when the sequences are not
 * aligned, or when there are no sequences.
 *
 * @param[in] prMSeq
 * The alignment to analyse
 * @param[in] bSampling
 * For many sequences: samples from pool
 * @param[in] bReportAll
 * Report identities for all sequence pairs
 *
 * Don't have to worry about sequence case because our version of PairwiseIdentity is case insensitive
 */
void
AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll) {

    /*
     * bSampling = squid's do_fast
     * bReportAll = squid's allreport
     */
    float  **ppdIdentMx;  /* identity matrix (squid: imx) */
    const int iNumSample = 1000; /* sample size (squid: nsample) */


    MSA *msa; /* squid's alignment structure */
    int small, large;
    int bestj, worstj;
    float sum;
    float worst_worst, worst_best, best_best;
    float avgid;
    int i, j;
    int nres; /* number of residues */

    if (bSampling && bReportAll) {
        Log(&rLog, LOG_WARN,
            "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__);
        return;
    }
    if (FALSE == prMSeq->aligned) {
        Log(&rLog, LOG_WARN,
            "Sequences are not aligned. Skipping %s()", __FUNCTION__);
        return;
    }
    /* the alignment length is read from seq[0] and the average length
     * divides by the number of sequences: both need at least one
     */
    if (prMSeq->nseqs < 1) {
        Log(&rLog, LOG_WARN,
            "No sequences. Skipping %s()", __FUNCTION__);
        return;
    }

    /* silence gcc warnings about uninitialized variables
     */
    worst_worst = worst_best = best_best = 0.0;


    /** mseq to squid msa
     *
     * FIXME code overlap with WriteAlignment. Make it a function and take
     * code there (contains more comments) as template
     *
     */
    msa  = MSAAlloc(prMSeq->nseqs,
                    /* derive alignment length from first seq */
                    strlen(prMSeq->seq[0]));
    for (i=0; i<prMSeq->nseqs; i++) {
        int key; /* MSA struct internal index for sequence */
        char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */
        char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */
        SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence info */

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));
        if (this_sqinfo->flags & SQINFO_DESC) {
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        msa->nseq++;
    }



    /* residue total and smallest/largest dealigned length */
    nres = 0;
    small = large = -1;
    for (i = 0; i < msa->nseq; i++) {
        int rlen;		/* raw sequence length           */
        rlen  = DealignedLength(msa->aseq[i]);
        nres +=  rlen;
        if (small == -1 || rlen < small) small = rlen;
        if (large == -1 || rlen > large) large = rlen;
    }


    if (bSampling) {
        avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen,
                                            msa->nseq, iNumSample);

    } else {
        float best, worst;

        /* this might be slow...could use openmp inside squid */
        MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx);
        if (bReportAll) {
            printf("  %-15s %5s %7s %-15s %7s %-15s\n",
                   "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");
            printf("  --------------- ----- ------- --------------- ------- ---------------\n");
        }

        sum = 0.0;
        worst_best  = 1.0;
        best_best   = 0.0;
        worst_worst = 1.0;
        for (i = 0; i < msa->nseq; i++) {
            worst = 1.0;
            best  = 0.0;
            /* partners are per row: start from "none" each time so that
             * a row without a partner neither inherits the previous
             * row's index nor ends up indexing sqname[] with -1
             */
            bestj = worstj = -1;
            for (j = 0; j < msa->nseq; j++) {
                /* closest seq to this one = best */
                if (i != j && ppdIdentMx[i][j] > best)  {
                    best  = ppdIdentMx[i][j];
                    bestj = j;
                }
                if (ppdIdentMx[i][j] < worst) {
                    worst = ppdIdentMx[i][j];
                    worstj = j;
                }
            }

            if (bReportAll)  {
                printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",
                       msa->sqname[i], DealignedLength(msa->aseq[i]),
                       best * 100.,  bestj  >= 0 ? msa->sqname[bestj]  : "-",
                       worst * 100., worstj >= 0 ? msa->sqname[worstj] : "-");
            }
            if (best > best_best)    best_best = best;
            if (best < worst_best)   worst_best = best;
            if (worst < worst_worst) worst_worst = worst;
            /* lower triangle only: each pair counted once */
            for (j = 0; j < i; j++)
                sum += ppdIdentMx[i][j];
        }
        if (msa->nseq > 1) {
            /* number of pairs computed in double so that nseq*(nseq-1)
             * cannot overflow an int
             */
            avgid = sum / (float) ((double) msa->nseq * (msa->nseq-1) / 2.0);
        } else {
            /* a single sequence has no pairs; avoid 0/0 */
            avgid = 0.0;
        }
        if (bReportAll)
            puts("");
        FMX2Free(ppdIdentMx);
    } /* else bSampling */



    /* Print output
     */
    if (msa->name != NULL)
        printf("Alignment name:      %s\n", msa->name);
    printf("Number of sequences: %d\n", msa->nseq);
    printf("Total # residues:    %d\n", nres);
    printf("Smallest:            %d\n", small);
    printf("Largest:             %d\n", large);
    printf("Average length:      %.1f\n", (float) nres / (float) msa->nseq);
    printf("Alignment length:    %d\n", msa->alen);
    printf("Average identity:    %.2f%%\n", 100.*avgid);

    if (! bSampling) {
        printf("Most related pair:   %.2f%%\n", 100.*best_best);
        printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst);
        printf("Most distant seq:    %.2f%%\n", 100.*worst_best);
    }

    MSAFree(msa);
}
Ejemplo n.º 9
0
/* Function: MSAFromAINFO()
 * Date:     SRE, Mon Jun 14 11:22:24 1999 [St. Louis]
 *
 * Purpose:  Build a new-style MSA structure from the old
 *           aseq/ainfo pair, so that code written against the
 *           old structures can be moved over step by step.
 *
 *           Per sequence: aligned text, weight and name are
 *           copied; accession, description, secondary structure
 *           (SS) and surface accessibility (SA) are carried over
 *           when the matching SQINFO_* flag is set. Per
 *           alignment: name, desc, acc, au, cs (as ss_cons) and
 *           rf are duplicated, and the TC/NC/GA cutoffs are
 *           copied when the matching AINFO_* flag is set.
 *
 * Args:     aseq  - [0..nseq-1][0..alen-1] alignment
 *           ainfo - old-style optional info
 *
 * Returns:  MSA *
 */
MSA *
MSAFromAINFO(char **aseq, AINFO *ainfo)
{
  MSA *msa;
  int  sq;			/* sequence index */
  int  k;			/* index for initializing ss/sa arrays */

  msa = MSAAlloc(ainfo->nseq, ainfo->alen);

  sq = 0;
  while (sq < ainfo->nseq)
    {
      strcpy(msa->aseq[sq], aseq[sq]);
      msa->wgt[sq]    = ainfo->wgt[sq];
      msa->sqname[sq] = sre_strdup(ainfo->sqinfo[sq].name, -1);
      msa->sqlen[sq]  = msa->alen;
      GKIStoreKey(msa->index, msa->sqname[sq]);

      if (ainfo->sqinfo[sq].flags & SQINFO_ACC)
        {
          MSASetSeqAccession(msa, sq, ainfo->sqinfo[sq].acc);
        }

      if (ainfo->sqinfo[sq].flags & SQINFO_DESC)
        {
          MSASetSeqDescription(msa, sq, ainfo->sqinfo[sq].desc);
        }

      if (ainfo->sqinfo[sq].flags & SQINFO_SS)
        {
          /* the ss arrays are created on demand, by the first
           * sequence that carries a secondary structure
           */
          if (msa->ss == NULL)
            {
              msa->ss    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
              msa->sslen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
              for (k = 0; k < msa->nseqalloc; k++)
                {
                  msa->sslen[k] = 0;
                  msa->ss[k]    = NULL;
                }
            }
          MakeAlignedString(msa->aseq[sq], msa->alen, ainfo->sqinfo[sq].ss, &(msa->ss[sq]));
          msa->sslen[sq] = msa->alen;
        }

      if (ainfo->sqinfo[sq].flags & SQINFO_SA)
        {
          /* same on-demand scheme for surface accessibility */
          if (msa->sa == NULL)
            {
              msa->sa    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
              msa->salen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
              for (k = 0; k < msa->nseqalloc; k++)
                {
                  msa->salen[k] = 0;
                  msa->sa[k]    = NULL;
                }
            }
          MakeAlignedString(msa->aseq[sq], msa->alen, ainfo->sqinfo[sq].sa, &(msa->sa[sq]));
          msa->salen[sq] = msa->alen;
        }

      sq++;
    }

  /* Alignment-wide optional text.
   * note that sre_strdup() returns NULL when passed NULL
   */
  msa->name    = sre_strdup(ainfo->name, -1);
  msa->desc    = sre_strdup(ainfo->desc, -1);
  msa->acc     = sre_strdup(ainfo->acc,  -1);
  msa->au      = sre_strdup(ainfo->au,   -1);
  msa->ss_cons = sre_strdup(ainfo->cs,   -1);
  msa->rf      = sre_strdup(ainfo->rf,   -1);

  /* Score cutoffs: each pair is copied only if it was set. */
  if (ainfo->flags & AINFO_TC)
    {
      msa->cutoff[MSA_CUTOFF_TC1]        = ainfo->tc1;
      msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
      msa->cutoff[MSA_CUTOFF_TC2]        = ainfo->tc2;
      msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
    }
  if (ainfo->flags & AINFO_NC)
    {
      msa->cutoff[MSA_CUTOFF_NC1]        = ainfo->nc1;
      msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
      msa->cutoff[MSA_CUTOFF_NC2]        = ainfo->nc2;
      msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
    }
  if (ainfo->flags & AINFO_GA)
    {
      msa->cutoff[MSA_CUTOFF_GA1]        = ainfo->ga1;
      msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
      msa->cutoff[MSA_CUTOFF_GA2]        = ainfo->ga2;
      msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
    }

  msa->nseq = ainfo->nseq;
  msa->alen = ainfo->alen;
  return msa;
}
/* Function: ReadMSF()
 * Date:     SRE, Tue Jun  1 08:07:22 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open MSF format
 *           alignment file. (MSF is a single-alignment format.)
 *           Return the alignment, or NULL if we've already
 *           read the alignment.
 *
 *           The file is read in four stages: an optional
 *           "!!AA_/!!NA_MULTIPLE_ALIGNMENT" header line, a free
 *           text comment section ending at the "MSF: .. Type: ..
 *           Check: .." line, a name section ending at "//", and
 *           the sequence section.
 *           
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no more alignments
 *
 * Diagnostics: 
 *           Will Die() here with a (potentially) useful message
 *           if a parsing error occurs.
 */
MSA *
ReadMSF(MSAFILE *afp)
{
  MSA    *msa;
  char   *s;
  int     alleged_type;
  char   *tok;
  char   *sp;
  int     slen;
  int     sqidx;
  char   *name;
  char   *seq;

  if (feof(afp->f)) return NULL;
  if ((s = MSAFileGetLine(afp)) == NULL) return NULL;

  /* The first line is the header.
   * This is a new-ish GCG feature. Don't count on it, so
   * we can be a bit more tolerant towards non-GCG software
   * generating "MSF" files.
   */
  msa = MSAAlloc(10, 0);
  if      (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
    msa->type = kAmino;
    if ((s = MSAFileGetLine(afp)) == NULL) { MSAFree(msa); return NULL; }
  } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
    msa->type = kRNA;
    if ((s = MSAFileGetLine(afp)) == NULL) { MSAFree(msa); return NULL; }
  }

  /* Now we're in the free text comment section of the MSF file.
   * It ends when we see the "MSF: Type: Check: .." line.
   * This line must be present. 
   */
  do
    {
      if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
	  Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
	{
	  /* The pattern has three groups: length, type, checksum.
	   * Only the type is used; length and checksum are matched
	   * but not verified. The type letter is the second group,
	   * read as sqd_parse[2] -- NOTE(review): this assumes
	   * sqd_parse[0] holds the whole match, as in SQUID's
	   * Strparse(); confirm against its definition.
	   */
	  switch (*(sqd_parse[2])) {
	  case 'N' : alleged_type = kRNA;      break;
	  case 'P' : alleged_type = kAmino;    break;  
	  case 'X' : alleged_type = kOtherSeq; break;
	  default  : alleged_type = kOtherSeq; 
	  }
	  /* a type set by the "!!" header line takes precedence */
	  if (msa->type == kOtherSeq) msa->type = alleged_type;
	  break;		/* we're done with comment section. */
	}
      if (! IsBlankline(s)) 
	MSAAddComment(msa, s);
    } while ((s = MSAFileGetLine(afp)) != NULL); 

  /* Now we're in the name section.
   * GCG has a relatively poorly documented feature: only sequences that
   * appear in this list will be read from the alignment section. Commenting
   * out sequences in the name list (by preceding them with "!") is
   * allowed as a means of manually defining subsets of sequences in
   * the alignment section. We can support this feature reasonably
   * easily because of the hash table for names in the MSA: we
   * only add names to the hash table when we see 'em in the name section.
   */
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */

      if      (*s == '\n' || *s == '\0') continue;     /* skip blank lines */
      else if (*s == '!')    MSAAddComment(msa, s);
      else if ((sp  = strstr(s, "Name:")) != NULL) 
	{
				/* We take the name and the weight, and that's it */
	  sp   += 5;
	  tok   = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
	  if (tok == NULL)
	    Die("No sequence name after Name: on line %d in name section of MSF file %s\n",
		afp->linenumber, afp->fname);
	  sqidx = GKIStoreKey(msa->index, tok);
	  if (sqidx >= msa->nseqalloc) MSAExpand(msa);
	  msa->sqname[sqidx] = sre_strdup(tok, slen);
	  msa->nseq++;

	  if ((sp = strstr(sp, "Weight:")) == NULL)
	    Die("No Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx],  afp->fname);
	  sp += 7;
	  tok = sre_strtok(&sp, " \t", &slen);
	  if (tok == NULL)
	    Die("No value after Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx],  afp->fname);
	  msa->wgt[sqidx] = atof(tok);
	  msa->flags |= MSA_SET_WGT;
	}
      else if (strncmp(s, "//", 2) == 0)
	break;
      else
	{
	  Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
	      afp->linenumber, afp->fname, s);
	  /* only reached if Die() returns; do not leak the alignment */
	  MSAFree(msa);
	  squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
	  return NULL;
	}

    }

  /* And now we're in the sequence section. 
   * As discussed above, if we haven't seen a sequence name, then we
   * don't include the sequence in the alignment.
   * Also, watch out for coordinate-only lines.
   */
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      sp  = s;
      if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
      if ((seq  = sre_strtok(&sp, "\n",  &slen)) == NULL) continue;
      
      /* The test for a coord line: digits starting both fields
       */
      if (isdigit((unsigned char) *name) && isdigit((unsigned char) *seq))
	continue;
  
      /* It's not blank, and it's not a coord line: must be sequence
       */
      sqidx = GKIKeyIndex(msa->index, name);
      if (sqidx < 0) continue;	/* not a sequence we recognize */
      
      msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen); 
    }
  
  /* We've left blanks in the aseqs; take them back out.
   * s reads, sp writes; sqlen is decremented for every blank dropped.
   */
  for (sqidx = 0; sqidx <  msa->nseq; sqidx++)
    {
      if (msa->aseq[sqidx] == NULL)
	Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
      
      for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
	{
	  if (*s == ' ' || *s == '\t') {
	    msa->sqlen[sqidx]--;
	  } else {
	    *sp = *s;
	    sp++;
	  }
	}
      *sp = '\0';
    }
  
  MSAVerifyParse(msa);		/* verifies, and also sets alen and wgt. */
  return msa;
}