Пример #1
0
/**
 * @brief Appends the sequences of one mseq structure to an already
 * existing one. The filename of the destination is left untouched.
 *
 * @param[in,out] prMSeqDest_p
 * MSeq structure to append to. Its seq, orig_seq and sqinfo arrays are
 * grown and its nseqs and aligned members are updated.
 * @param[in] prMSeqToAdd
 * MSeq structure whose sequences are copied onto the end of the
 * destination.
 *
 * @note The destination keeps its own seqtype; a mismatch only triggers
 * a warning. If prMSeqToAdd holds no sequences, a warning is logged and
 * the destination is left unchanged.
 */
void
JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
{
    mseq_t *prDest;
    int iNumToAdd;
    int iOldNSeq;
    int iNewNSeq;
    int iSrcSeqIndex;

    assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
    assert(NULL != prMSeqToAdd);

    prDest = *prMSeqDest_p;

    /* Snapshot both counts before anything is modified. The copy loop
     * below must not be bounded by a live nseqs: if source and
     * destination are the same structure, a bound that grows with every
     * appended sequence would never be reached and the loop would write
     * past the end of the reallocated arrays.
     */
    iNumToAdd = prMSeqToAdd->nseqs;
    iOldNSeq = prDest->nseqs;

    if (0 == iNumToAdd) {
        Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
        return;
    }

    /* warn on seqtype mismatch and keep original seqtype */
    if (prDest->seqtype != prMSeqToAdd->seqtype) {
        Log(&rLog, LOG_WARN, "Joining sequences of different type");
    }

    /* leave filename as it is */

    /*
     * make room for the new seq/s, orig_seq/s, sqinfo/s
     */
    iNewNSeq = iOldNSeq + iNumToAdd;

    prDest->seq = (char **)
        CKREALLOC(prDest->seq, iNewNSeq * sizeof(char *));

    prDest->orig_seq = (char **)
        CKREALLOC(prDest->orig_seq, iNewNSeq * sizeof(char *));

    prDest->sqinfo = (SQINFO *)
        CKREALLOC(prDest->sqinfo, iNewNSeq * sizeof(SQINFO));

    /* copy every source entry into the slot following the old entries */
    for (iSrcSeqIndex = 0; iSrcSeqIndex < iNumToAdd; iSrcSeqIndex++) {
        int iDstSeqIndex = iOldNSeq + iSrcSeqIndex;

        prDest->seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);

        prDest->orig_seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);

        SeqinfoCopy(&prDest->sqinfo[iDstSeqIndex],
                    &prMSeqToAdd->sqinfo[iSrcSeqIndex]);
    }

    prDest->nseqs = iNewNSeq;

    /* re-evaluate the alignment status over the joined set */
    prDest->aligned = SeqsAreAligned(prDest);
}
Пример #2
0
/**
 * @brief Creates a new sequence entry and appends it to an existing mseq
 * structure.
 *
 * @param[in,out] prMSeqDest_p
 * Already existing and initialised mseq structure. Its seq, orig_seq and
 * sqinfo arrays are grown by one entry and nseqs is incremented.
 * @param[in] pcSeqName
 * sequence name of the sequence to add; at most SQINFO_NAMELEN-1
 * characters of it are stored
 * @param[in] pcSeqRes
 * the actual sequence (residues) to add; a copy is stored both as the
 * working sequence and as the original sequence
 *
 * @note Don't forget to update the align and type flag if necessary!
 *
 * FIXME allow adding of more features
 *
 */
void
AddSeq(mseq_t **prMSeqDest_p, char *pcSeqName, char *pcSeqRes)
{
    mseq_t *prDest;
    int iSeqIdx;
    SQINFO sqinfo;

    /* the pointed-to structure is dereferenced below, so check it as
     * well (same precondition as in JoinMSeqs())
     */
    assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
    assert(NULL != pcSeqName);
    assert(NULL != pcSeqRes);

    prDest = *prMSeqDest_p;

    /* the new entry goes right behind the existing ones */
    iSeqIdx = prDest->nseqs;

    prDest->seq = (char **)
        CKREALLOC(prDest->seq, (iSeqIdx+1) * sizeof(char *));
    prDest->orig_seq = (char **)
        CKREALLOC(prDest->orig_seq, (iSeqIdx+1) * sizeof(char *));
    prDest->sqinfo = (SQINFO *)
        CKREALLOC(prDest->sqinfo, (iSeqIdx+1) * sizeof(SQINFO));

    prDest->seq[iSeqIdx] = CkStrdup(pcSeqRes);
    prDest->orig_seq[iSeqIdx] = CkStrdup(pcSeqRes);

    /* should probably get rid of SqInfo altogether in the long run and
     * just transfer the interesting members into our own struct
     */
    sqinfo.flags = 0; /* init */

    sqinfo.len = strlen(pcSeqRes);
    sqinfo.flags |= SQINFO_LEN;

    /* name is an array of SQINFO_NAMELEN length; strncpy() does not
     * terminate on truncation, hence the explicit terminator
     */
    strncpy(sqinfo.name, pcSeqName, SQINFO_NAMELEN-1);
    sqinfo.name[SQINFO_NAMELEN-1] = '\0';
    sqinfo.flags |= SQINFO_NAME;

    SeqinfoCopy(&prDest->sqinfo[iSeqIdx], &sqinfo);

    prDest->nseqs++;
}
Пример #3
0
/**
 * @brief reads sequences from file
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to
 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence).
 * Any other value overrides the guessed type (with a warning if they
 * differ).
 * @param[in] iSeqFmt
 * int-encoded sequence file format handed to SeqfileOpen(). If it is
 * SQFILE_UNKNOWN and seqfile ends in ".gz" or is "-", FASTA is assumed.
 * @param[in] bIsProfile
 * passed on unchanged to SeqsAreAligned()
 * @param[in] bDealignInputSeqs
 * passed on unchanged to SeqsAreAligned()
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error
 *
 * @note
 *  - Depends heavily on squid
 *  - Sequence file format will be guessed
 *  - If supported by squid, gzipped files can be read as well.
 *  - On error the sequence file is closed and the sequence being
 *    processed is freed; orig_seq, tree_order and filename of prMSeq
 *    are only set on success.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */

    assert(NULL!=prMSeq);
    assert(NULL!=seqfile);


    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt  &&
            (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     *
     */

    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }


    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     *
     * Every error exit inside this loop goes through the read_error
     * label so that the current sequence and the file handle are
     * always released.
     */
    while (ReadSeq(dbfp, dbfp->format,
                   &cur_seq,
                   &cur_sqinfo)) {
        /* computed once; also keeps the %d conversions below matched
         * with an int argument */
        int iCurSeqLen = (int)strlen(cur_seq);

        if (prMSeq->nseqs+1>iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            goto read_error;
        }
        if (iCurSeqLen>iMaxSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, iCurSeqLen, iMaxSeqLen);
            goto read_error;
        }
        if (0 == iCurSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
                cur_sqinfo.name);
            goto read_error;
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
         */

        prMSeq->seq =  (char **)
                       CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);


        prMSeq->sqinfo =  (SQINFO *)
                          CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif
        /* always guess type from first seq. use squid function and
         * convert value
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
            switch (type)  {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType) {
                if (prMSeq->seqtype != iSeqType) {
                    Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                        SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                    prMSeq->seqtype = iSeqType;
                }
            }
            /* if type could not be determined and was not set return error */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                goto read_error;
            }
        }

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

    /*#if ALLOW_ONLY_PROTEIN
        if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
            Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
                  SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
        }
    #endif*/

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);


    /* keep original sequence as copy and convert "working" sequence
     *
     */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
        char *res; /* walks over the residues of the working sequence */

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);


        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA
         */
        if (prMSeq->seqtype==SEQTYPE_DNA) {
            ToDNA(prMSeq->seq[iSeqIdx]);
        }
        if (prMSeq->seqtype==SEQTYPE_RNA) {
            ToRNA(prMSeq->seq[iSeqIdx]);
        }

        /* then check of each character
         */
        for (res = prMSeq->seq[iSeqIdx]; '\0' != *res; res++) {
            int iUpper;

            if (isgap(*res)) {
                continue;
            }

            /* toupper() is only defined for unsigned char values and
             * EOF, so never hand it a possibly negative plain char */
            iUpper = toupper((unsigned char)*res);

            if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
                if (NULL == strchr(AMINO_ALPHABET, iUpper)) {
                    *res = AMINOACID_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_DNA) {
                if (NULL == strchr(DNA_ALPHABET, iUpper)) {
                    *res = NUCLEOTIDE_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_RNA) {
                if (NULL == strchr(RNA_ALPHABET, iUpper)) {
                    *res = NUCLEOTIDE_ANY;
                }
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);
    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;

read_error:
    /* only reached from inside the read loop, i.e. with a sequence
     * that ReadSeq() handed out and an open file handle */
    FreeSequence(cur_seq, &cur_sqinfo);
    SeqfileClose(dbfp);
    return -1;
}
Пример #4
0
/**
 * @brief Appends the sequences of one mseq structure to an already
 * existing one. The filename of the destination is left untouched.
 *
 * @param[in,out] prMSeqDest_p
 * MSeq structure to append to. Its seq, orig_seq and sqinfo arrays are
 * grown, nseqs is updated and aligned is set to TRUE.
 * @param[in] prMSeqToAdd
 * MSeq structure whose sequences are copied onto the end of the
 * destination.
 *
 * @note The destination keeps its own seqtype; a mismatch only triggers
 * a warning. If prMSeqToAdd holds no sequences, a warning is logged and
 * the destination is left unchanged.
 */
void
JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
{
    mseq_t *prDest;
    int iNumToAdd;
    int iOldNSeq;
    int iNewNSeq;
    int iSrcSeqIndex;

    assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
    assert(NULL != prMSeqToAdd);

    prDest = *prMSeqDest_p;

    /* Snapshot both counts before anything is modified. The copy loop
     * below must not be bounded by a live nseqs: if source and
     * destination are the same structure, a bound that grows with every
     * appended sequence would never be reached and the loop would write
     * past the end of the reallocated arrays.
     */
    iNumToAdd = prMSeqToAdd->nseqs;
    iOldNSeq = prDest->nseqs;

    if (0 == iNumToAdd) {
        Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
        return;
    }

    /* warn on seqtype mismatch and keep original seqtype */
    if (prDest->seqtype != prMSeqToAdd->seqtype) {
        Log(&rLog, LOG_WARN, "Joining sequences of different type");
    }

    /* leave filename as it is */

    /*
     * make room for the new seq/s, orig_seq/s, sqinfo/s
     */
    iNewNSeq = iOldNSeq + iNumToAdd;

    prDest->seq = (char **)
                  CKREALLOC(prDest->seq, iNewNSeq * sizeof(char *));

    prDest->orig_seq = (char **)
                       CKREALLOC(prDest->orig_seq, iNewNSeq * sizeof(char *));

    prDest->sqinfo = (SQINFO *)
                     CKREALLOC(prDest->sqinfo, iNewNSeq * sizeof(SQINFO));

    /* copy every source entry into the slot following the old entries */
    for (iSrcSeqIndex = 0; iSrcSeqIndex < iNumToAdd; iSrcSeqIndex++) {
        int iDstSeqIndex = iOldNSeq + iSrcSeqIndex;

        prDest->seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);

        prDest->orig_seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);

        SeqinfoCopy(&prDest->sqinfo[iDstSeqIndex],
                    &prMSeqToAdd->sqinfo[iSrcSeqIndex]);
    }

    prDest->nseqs = iNewNSeq;

#if 0
    /* 2nd arg is bIsProfile, which when set TRUE skips
     * the check for gaps. here always check for gaps,
     * so set FALSE (main reason is that it is easier), FS, r282 -> */
    /* had a problem at this stage, therefore dispense with gap check, FS, r290 -> */
    /* 3rd argument is dealignment flag, do not dealign profiles */
    prDest->aligned = SeqsAreAligned(prDest, TRUE/*FALSE*/, FALSE);
#else
    /* the joined set is flagged as aligned without re-checking */
    prDest->aligned = TRUE;
#endif
}