Example #1
0
/**
 * @brief copies an mseq structure
 *
 * @param[out] prMSeqDest_p
 * Copy of mseq structure
 * @param[in]  prMSeqSrc
 * Source mseq structure to copy
 *
 * @note caller has to free copy by calling FreeMSeq()
 *
 */
void
CopyMSeq(mseq_t **prMSeqDest_p, mseq_t *prMSeqSrc)
{
    int i;
    assert(prMSeqSrc != NULL && prMSeqDest_p != NULL);

    NewMSeq(prMSeqDest_p);

    (*prMSeqDest_p)->nseqs = prMSeqSrc->nseqs;
    (*prMSeqDest_p)->seqtype = prMSeqSrc->seqtype;
    if (prMSeqSrc->filename!=NULL) {
        (*prMSeqDest_p)->filename = CkStrdup(prMSeqSrc->filename);
    }

    (*prMSeqDest_p)->seq =  (char **)
                            CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *));
    (*prMSeqDest_p)->orig_seq =  (char **)
                                 CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *));
    (*prMSeqDest_p)->sqinfo =  (SQINFO *)
                               CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(SQINFO));



    for (i=0; i<(*prMSeqDest_p)->nseqs; i++) {
        (*prMSeqDest_p)->seq[i] = CkStrdup(prMSeqSrc->seq[i]);
        (*prMSeqDest_p)->orig_seq[i] = CkStrdup(prMSeqSrc->orig_seq[i]);
        SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[i], &prMSeqSrc->sqinfo[i]);
    }
}
Example #2
0
/* Function: ReadMultipleRseqs()
 * 
 * Purpose:  Open a data file and
 *           parse it into an array of rseqs (raw, unaligned
 *           sequences).
 * 
 *           Caller is responsible for free'ing memory allocated
 *           to ret_rseqs, ret_weights, and ret_names.
 *           
 *           Weights are currently only supported for MSF format.
 *           Sequences read from all other formats will be assigned
 *           weights of 1.0. If the caller isn't interested in
 *           weights, it passes NULL as ret_weights.
 * 
 * Returns 1 on success. Returns 0 on failure and sets
 * squid_errno to indicate the cause.
 */
int
ReadMultipleRseqs(char              *seqfile,
		  int                fformat,
		  char            ***ret_rseqs,
		  SQINFO **ret_sqinfo,
		  int               *ret_num)
{
  SQINFO *sqinfo;               /* array of sequence optional info         */
  SQFILE *dbfp;                 /* open ptr for sequential access of file  */
  char  **rseqs;                /* sequence array                          */
  char  **aseqs;                /* aligned sequences, if file is aligned   */
  AINFO   ainfo;      /* alignment-associated information        */
  int     numalloced;           /* num of seqs currently alloced for       */
  int     idx;
  int     num;

  if (fformat == kSelex || fformat == kMSF || fformat == kClustal)
    {
      if (! ReadAlignment(seqfile, fformat, &aseqs, &ainfo)) return 0;
      if (! DealignAseqs(aseqs, ainfo.nseq, &rseqs))                return 0;

      /* copy the sqinfo array
       */
      num = ainfo.nseq;
      sqinfo= (SQINFO *) MallocOrDie (sizeof(SQINFO)*ainfo.nseq);
      for (idx = 0; idx < ainfo.nseq; idx++)
	SeqinfoCopy(&(sqinfo[idx]), &(ainfo.sqinfo[idx]));
      FreeAlignment(aseqs, &ainfo);
    }
  else
    {
				/* initial alloc */
      num        = 0;
      numalloced = 16;
      rseqs  = (char **) MallocOrDie (numalloced * sizeof(char *));
      sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));
      if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;      

      while (ReadSeq(dbfp, fformat, &rseqs[num], &(sqinfo[num])))
	{
	  num++;
	  if (num == numalloced) /* more seqs coming, alloc more room */
	    {
	      numalloced += 16;
	      rseqs  = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
	      sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
	    }
	}
      SeqfileClose(dbfp);
    }

  *ret_rseqs  = rseqs;
  *ret_sqinfo = sqinfo;
  *ret_num    = num;
  return 1;
}
Example #3
0
File: seq.c Project: rforge/phyexe
/**
 * @brief Appends an mseq structure to an already existing one.
 * filename will be left untouched.
 *
 * @param[in] prMSeqDest_p
 * MSeq structure to which to append to
 * @param[out] prMSeqToAdd
 * MSeq structure which is to append
 *
 * 
 */    
void
JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
{
    int iSrcSeqIndex;
    int iNewNSeq;
    
    assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
    assert(NULL != prMSeqToAdd);
    
    if (0 == prMSeqToAdd->nseqs) {
        Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
        return;
    }
    
    /* warn on seqtype mismatch and keep original seqtype */
    if ((*prMSeqDest_p)->seqtype != prMSeqToAdd->seqtype) {
        Log(&rLog, LOG_WARN, "Joining sequences of different type");
    }
    
    /* leave filename as it is */

    /*
     * copy new seq/s, orig_seq/s, sqinfo/s
     */
    iNewNSeq = (*prMSeqDest_p)->nseqs + prMSeqToAdd->nseqs;
    
    (*prMSeqDest_p)->seq =  (char **)
        CKREALLOC((*prMSeqDest_p)->seq, iNewNSeq * sizeof(char *));
    
    (*prMSeqDest_p)->orig_seq =  (char **)
        CKREALLOC((*prMSeqDest_p)->orig_seq, iNewNSeq * sizeof(char *));
    
    (*prMSeqDest_p)->sqinfo =  (SQINFO *)
        CKREALLOC((*prMSeqDest_p)->sqinfo, iNewNSeq * sizeof(SQINFO));
    
    
    for (iSrcSeqIndex=0; iSrcSeqIndex < prMSeqToAdd->nseqs; iSrcSeqIndex++) {
        int iDstSeqIndex = (*prMSeqDest_p)->nseqs++;
        
        (*prMSeqDest_p)->seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);
        
        (*prMSeqDest_p)->orig_seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);
        
        SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iDstSeqIndex],
                    & prMSeqToAdd->sqinfo[iSrcSeqIndex]);
    }

    (*prMSeqDest_p)->nseqs = iNewNSeq;
    
    (*prMSeqDest_p)->aligned = SeqsAreAligned(*prMSeqDest_p);
    
    return; 
}
Example #4
0
/**
 * @brief Creates a new sequence entry and appends it to an existing mseq
 * structure.
 *
 * @param[out] prMSeqDest_p
 * Already existing and initialised mseq structure
 * @param[in] pcSeqName
 * sequence name of the sequence to add
 * @param[in] pcSeqRes
 * the actual sequence (residues) to add
 *
 * @note Don't forget to update the align and type flag if necessary!
 *
 * FIXME allow adding of more features
 *
 */
void
AddSeq(mseq_t **prMSeqDest_p, char *pcSeqName, char *pcSeqRes)
{
    int iSeqIdx = 0;
    SQINFO sqinfo;

    assert(NULL != prMSeqDest_p);
    assert(NULL != pcSeqName);
    assert(NULL != pcSeqRes);

    iSeqIdx = (*prMSeqDest_p)->nseqs;

    (*prMSeqDest_p)->seq =  (char **)
                            CKREALLOC((*prMSeqDest_p)->seq, (iSeqIdx+1) * sizeof(char *));
    (*prMSeqDest_p)->orig_seq =  (char **)
                                 CKREALLOC((*prMSeqDest_p)->orig_seq, (iSeqIdx+1) * sizeof(char *));
    (*prMSeqDest_p)->sqinfo =  (SQINFO *)
                               CKREALLOC((*prMSeqDest_p)->sqinfo, (iSeqIdx+1) * sizeof(SQINFO));


    (*prMSeqDest_p)->seq[iSeqIdx] = CkStrdup(pcSeqRes);
    (*prMSeqDest_p)->orig_seq[iSeqIdx] = CkStrdup(pcSeqRes);

    /* should probably get ri of SqInfo altogether in the long run and just
       transfer the intersting members into our own struct
     */
    sqinfo.flags = 0; /* init */

    sqinfo.len = strlen(pcSeqRes);
    sqinfo.flags |= SQINFO_LEN;

    /* name is an array of SQINFO_NAMELEN length */
    strncpy(sqinfo.name, pcSeqName, SQINFO_NAMELEN-1);
    sqinfo.name[SQINFO_NAMELEN-1] = '\0';
    sqinfo.flags |= SQINFO_NAME;

    SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iSeqIdx],
                & sqinfo);

    (*prMSeqDest_p)->nseqs++;

    return;
}
Example #5
0
/**
 * @brief Sort sequences by length
 *
 * @param[out] prMSeq
 * mseq to sort by length
 * @param[out] cOrder
 * Sorting order. 'd' for descending, 'a' for ascending.
 *
 *
 */
void
SortMSeqByLength(mseq_t *prMSeq, const char cOrder)
{
    int *piSeqLen;
    int *piOrder;
    int iSeqIndex;
    mseq_t *prMSeqCopy = NULL;

    assert('a'==cOrder || 'd'==cOrder);

    Log(&rLog, LOG_WARN,
        "FIXME: This modifies sequence ordering. Might not be what user wants. Will change output order as well");

    piSeqLen = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
    piOrder = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
        piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
    }
    QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, cOrder, FALSE);

    CopyMSeq(&prMSeqCopy, prMSeq);
    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
        /* copy mseq entry
         */
        CKFREE(prMSeq->seq[iSeqIndex]);
        prMSeq->seq[iSeqIndex] = CkStrdup(prMSeqCopy->seq[piOrder[iSeqIndex]]);

        CKFREE(prMSeq->orig_seq[iSeqIndex]);
        prMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeqCopy->orig_seq[piOrder[iSeqIndex]]);

        SeqinfoCopy(&prMSeq->sqinfo[iSeqIndex], &prMSeqCopy->sqinfo[piOrder[iSeqIndex]]);
    }

    CKFREE(piSeqLen);
    CKFREE(piOrder);
    FreeMSeq(&prMSeqCopy);

    return;
}
Example #6
0
/**
 * @brief reads sequences from file
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to
 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error
 *
 * @note
 *  - Depends heavily on squid
 *  - Sequence file format will be guessed
 *  - If supported by squid, gzipped files can be read as well.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */
    int iSeqPos; /* sequence string position counter */

    assert(NULL!=seqfile);


    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt  &&
            (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     *
     */

    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }


    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     *
     */
    while (ReadSeq(dbfp, dbfp->format,
                   &cur_seq,
                   &cur_sqinfo)) {

        if (prMSeq->nseqs+1>iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            return -1;
        }
        if ((int)strlen(cur_seq)>iMaxSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, strlen(cur_seq), iMaxSeqLen);
            return -1;
        }
        if ((int)strlen(cur_seq)==0) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
                cur_sqinfo.name);
            return -1;
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
         */

        prMSeq->seq =  (char **)
                       CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);


        prMSeq->sqinfo =  (SQINFO *)
                          CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif
        /* always guess type from first seq. use squid function and
         * convert value
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
            switch (type)  {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType) {
                if (prMSeq->seqtype != iSeqType) {
                    Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                        SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                    prMSeq->seqtype = iSeqType;
                }
            }
            /* if type could not be determined and was not set return error */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                FreeSequence(cur_seq, &cur_sqinfo);
                SeqfileClose(dbfp);
                return -1;
            }
        }

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

    /*#if ALLOW_ONLY_PROTEIN
        if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
            Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
                  SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
        }
    #endif*/

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);


    /* keep original sequence as copy and convert "working" sequence
     *
     */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);


        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA
         */
        if (prMSeq->seqtype==SEQTYPE_DNA)
            ToDNA(prMSeq->seq[iSeqIdx]);
        if (prMSeq->seqtype==SEQTYPE_RNA)
            ToRNA(prMSeq->seq[iSeqIdx]);

        /* then check of each character
         */
        for (iSeqPos=0; iSeqPos<(int)strlen(prMSeq->seq[iSeqIdx]); iSeqPos++) {
            char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]);
            if (isgap(*res))
                continue;

            if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
                if (NULL == strchr(AMINO_ALPHABET, toupper(*res))) {
                    *res = AMINOACID_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_DNA) {
                if (NULL == strchr(DNA_ALPHABET, toupper(*res))) {
                    *res = NUCLEOTIDE_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_RNA) {
                if (NULL == strchr(RNA_ALPHABET, toupper(*res))) {
                    *res = NUCLEOTIDE_ANY;
                }
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);
    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;
}
Example #7
0
/**
 * @brief Appends an mseq structure to an already existing one.
 * filename will be left untouched.
 *
 * @param[in] prMSeqDest_p
 * MSeq structure to which to append to
 * @param[out] prMSeqToAdd
 * MSeq structure which is to append
 *
 */
void
JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
{
    int iSrcSeqIndex;
    int iNewNSeq;

    assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
    assert(NULL != prMSeqToAdd);

    if (0 == prMSeqToAdd->nseqs) {
        Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
        return;
    }

    /* warn on seqtype mismatch and keep original seqtype */
    if ((*prMSeqDest_p)->seqtype != prMSeqToAdd->seqtype) {
        Log(&rLog, LOG_WARN, "Joining sequences of different type");
    }

    /* leave filename as it is */

    /*
     * copy new seq/s, orig_seq/s, sqinfo/s
     */
    iNewNSeq = (*prMSeqDest_p)->nseqs + prMSeqToAdd->nseqs;

    (*prMSeqDest_p)->seq =  (char **)
                            CKREALLOC((*prMSeqDest_p)->seq, iNewNSeq * sizeof(char *));

    (*prMSeqDest_p)->orig_seq =  (char **)
                                 CKREALLOC((*prMSeqDest_p)->orig_seq, iNewNSeq * sizeof(char *));

    (*prMSeqDest_p)->sqinfo =  (SQINFO *)
                               CKREALLOC((*prMSeqDest_p)->sqinfo, iNewNSeq * sizeof(SQINFO));


    for (iSrcSeqIndex=0; iSrcSeqIndex < prMSeqToAdd->nseqs; iSrcSeqIndex++) {
        int iDstSeqIndex = (*prMSeqDest_p)->nseqs++;

        (*prMSeqDest_p)->seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);

        (*prMSeqDest_p)->orig_seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);

        SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iDstSeqIndex],
                    & prMSeqToAdd->sqinfo[iSrcSeqIndex]);
    }

    (*prMSeqDest_p)->nseqs = iNewNSeq;

#if 0
    /* 2nd arg is bIsProfile, which when set TRUE skips
     * the check for gaps. here always check for gaps,
     * so set FALSE (main reason is that it is easier), FS, r282 -> */
    /* had a problem at this stage, therefore dispense with gap check, FS, r290 -> */
    /* 3rd argument is dealignment flag, do not dealign profiles */
    (*prMSeqDest_p)->aligned = SeqsAreAligned(*prMSeqDest_p, TRUE/*FALSE*/, FALSE);
#else
    (*prMSeqDest_p)->aligned = TRUE;
#endif

    return;
}
Example #8
0
/* Function: include_alignment()
 * Date:     SRE, Sun Jul  5 15:25:13 1998 [St. Louis]
 *
 * Purpose:  Given the name of a multiple alignment file,
 *           align that alignment to the HMM, and add traces
 *           to an existing array of traces. If do_mapped
 *           is TRUE, we use the HMM's map file. If not,
 *           we use P7ViterbiAlignAlignment().
 *
 * Args:     seqfile  - name of alignment file
 *           hmm      - model to align to
 *           do_mapped- TRUE if we're to use the HMM's alignment map
 *           rsq      - RETURN: array of rseqs to add to
 *           dsq      - RETURN: array of dsq to add to
 *           sqinfo   - RETURN: array of SQINFO to add to
 *           tr       - RETURN: array of traces to add to
 *           nseq     - RETURN: number of seqs           
 *
 * Returns:  new, realloc'ed arrays for rsq, dsq, sqinfo, tr; nseq is
 *           increased to nseq+ainfo.nseq.
 */
void
include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
		  char ***rsq, char ***dsq, SQINFO **sqinfo, 
		  struct p7trace_s ***tr, int *nseq)
{
  int format;			/* format of alignment file */
  char **aseq;			/* aligned seqs             */
  char **newdsq;
  char **newrseq;
  AINFO ainfo;			/* info that goes with aseq */
  int   idx;			/* counter over aseqs       */
  struct p7trace_s *master;     /* master trace             */
  struct p7trace_s **addtr;     /* individual traces for aseq */

  if (! SeqfileFormat(seqfile, &format, NULL))
    switch (squid_errno) {
    case SQERR_NOFILE: 
      ajFatal("Alignment file %s could not be opened for reading", seqfile);
      /*FALLTHRU*/ /* a white lie to shut lint up */
    case SQERR_FORMAT: 
    default:           
      ajFatal("Failed to determine format of alignment file %s", seqfile);
    }
				/* read the alignment from file */
  if (! ReadAlignment(seqfile, format, &aseq, &ainfo))
    ajFatal("Failed to read aligned sequence file %s", seqfile);
  for (idx = 0; idx < ainfo.nseq; idx++)
    s2upper(aseq[idx]);
				/* Verify checksums before mapping */
  if (do_mapped && GCGMultchecksum(aseq, ainfo.nseq) != hmm->checksum)
    ajFatal("The checksums for alignment file %s and the HMM alignment map don't match.", 
	seqfile);
				/* Get a master trace */
  if (do_mapped) master = MasterTraceFromMap(hmm->map, hmm->M, ainfo.alen);
  else           master = P7ViterbiAlignAlignment(aseq, &ainfo, hmm);

				/* convert to individual traces */
  ImposeMasterTrace(aseq, ainfo.nseq, master, &addtr);
				/* add those traces to existing ones */
  *tr = MergeTraceArrays(*tr, *nseq, addtr, ainfo.nseq);
  
				/* additional bookkeeping: add to dsq, sqinfo */
  *rsq = ReallocOrDie((*rsq), sizeof(char *) * (*nseq + ainfo.nseq));
  DealignAseqs(aseq, ainfo.nseq, &newrseq);
  for (idx = *nseq; idx < *nseq + ainfo.nseq; idx++)
    (*rsq)[idx] = newrseq[idx - (*nseq)];
  free(newrseq);

  *dsq = ReallocOrDie((*dsq), sizeof(char *) * (*nseq + ainfo.nseq));
  DigitizeAlignment(aseq, &ainfo, &newdsq);
  for (idx = *nseq; idx < *nseq + ainfo.nseq; idx++)
    (*dsq)[idx] = newdsq[idx - (*nseq)];
  free(newdsq);
				/* unnecessarily complex, but I can't be bothered... */
  *sqinfo = ReallocOrDie((*sqinfo), sizeof(SQINFO) * (*nseq + ainfo.nseq));
  for (idx = *nseq; idx < *nseq + ainfo.nseq; idx++)
    SeqinfoCopy(&((*sqinfo)[idx]), &(ainfo.sqinfo[idx - (*nseq)]));
  
  *nseq = *nseq + ainfo.nseq;

				/* Cleanup */
  P7FreeTrace(master);
  FreeAlignment(aseq, &ainfo);
				/* Return */
  return;
}
Example #9
0
/* Function: ReadSeq()
 * 
 * Purpose:  Read next sequence from an open database file.
 *           Return the sequence and associated info.
 *           
 * Args:     fp      - open sequence database file pointer          
 *           format  - format of the file (previously determined
 *                      by call to SeqfileFormat())
 *           ret_seq - RETURN: sequence
 *           sqinfo  - RETURN: filled in w/ other information  
 *           
 * Return:   1 on success, 0 on failure.
 *           ret_seq and some field of sqinfo are allocated here,
 *           The preferred call mechanism to properly free the memory is:
 *           
 *           SQINFO sqinfo;
 *           char  *seq;
 *           
 *           ReadSeq(fp, format, &seq, &sqinfo);
 *           ... do something...
 *           FreeSequence(seq, &sqinfo);
 */
int
ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo)
{
  int    gotuw;
  int    apos, rpos;

  squid_errno = SQERR_OK;
  if (format < kMinFormat || format > kMaxFormat) 
    {
      squid_errno = SQERR_FORMAT;
      *ret_seq    = NULL;
      return 0;
    }

  /* Here's the hack for sequential access of sequences from
   * the multiple sequence alignment formats
   */
  if (format == kMSF || format == kSelex || format == kClustal) {
    if (V->ali_curridx >= V->ali_ainfo.nseq) 
      return 0; /* out of aseqs */

    SeqinfoCopy(sqinfo, &(V->ali_ainfo.sqinfo[V->ali_curridx]));

				/* copy and dealign the appropriate aligned seq */
    V->seq = MallocOrDie (sizeof(char) * (V->ali_ainfo.alen+1));
    for (rpos = apos = 0; apos < V->ali_ainfo.alen; apos++) 
      if (!isgap(V->ali_aseqs[V->ali_curridx][apos]))
	V->seq[rpos++] = V->ali_aseqs[V->ali_curridx][apos];
    V->seq[rpos]   = '\0'; 
    V->seqlen      = rpos;
    V->ali_curridx++;
  } 
  else {
    if (feof(V->f)) return 0;

    V->seq           = (char*) calloc (kStartLength+1, sizeof(char));
    V->maxseq        = kStartLength;
    V->seqlen        = 0;
    V->sqinfo        = sqinfo;
    V->sqinfo->flags = 0;
    V->dash_equals_n = (format == kEMBL) ? TRUE : FALSE;

    switch (format) {
    case kIG      : readIG(V);      break;
    case kStrider : readStrider(V); break;
    case kGenBank : readGenBank(V); break;
    case kPearson : readPearson(V); break;
    case kEMBL    : readEMBL(V);    break;
    case kZuker   : readZuker(V);   break;
    case kPIR     : readPIR(V);     break;
    case kSquid   : readSquid(V);   break;
    case kGCGdata : readGCGdata(V); break; 
	
    case kGCG:
      do {			/* skip leading comments on GCG file */
	gotuw = (strstr(V->sbuffer,"..") != NULL);
	if (gotuw) readUWGCG(V);
	getline2(V);
      } while (! feof(V->f));
      break;

    case kIdraw:   /* SRE: no attempt to read idraw postscript */
    default:
      squid_errno = SQERR_FORMAT;
      free(V->seq);
      return 0;
    }
    V->seq[V->seqlen] = 0; /* stick a string terminator on it */
  }

  /* Cleanup
   */
  sqinfo->len    = V->seqlen; 
  sqinfo->flags |= SQINFO_LEN;
  *ret_seq = V->seq;
  if (squid_errno == SQERR_OK) return 1; else return 0;
}