/**
 * @brief Appends the sequences of one mseq structure to an already
 * existing one. The destination's filename is left untouched.
 *
 * @param[in,out] prMSeqDest_p
 * Pointer to the MSeq structure to append to. Its seq, orig_seq and
 * sqinfo arrays are grown and its nseqs and aligned members are updated.
 * The pointer (*prMSeqDest_p) itself is never reassigned here.
 * @param[in] prMSeqToAdd
 * MSeq structure whose sequences are appended. Entries are copied with
 * CkStrdup() / SeqinfoCopy(); the structure itself is not modified
 * (unless it is the very same object as the destination).
 *
 * @note If prMSeqToAdd holds no sequences a warning is logged and the
 * destination is left completely untouched (including its aligned flag).
 * @note On a seqtype mismatch a warning is logged and the destination
 * keeps its original seqtype.
 */
void
JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
{
    mseq_t *prDest;
    int iNumToAdd;
    int iOldNSeq;
    int iNewNSeq;
    int iSrcSeqIndex;

    assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
    assert(NULL != prMSeqToAdd);

    prDest = *prMSeqDest_p;

    /* Snapshot both counts up front. The loop below must not re-read
     * prMSeqToAdd->nseqs: if source and destination are the same object,
     * a bound that grows with every appended sequence would never be
     * reached and the copy would run past the reallocated arrays. */
    iNumToAdd = prMSeqToAdd->nseqs;
    iOldNSeq = prDest->nseqs;

    if (0 == iNumToAdd) {
        Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
        return;
    }

    /* warn on seqtype mismatch and keep original seqtype */
    if (prDest->seqtype != prMSeqToAdd->seqtype) {
        Log(&rLog, LOG_WARN, "Joining sequences of different type");
    }

    /* leave filename as it is */

    /*
     * make room for, then copy, the new seq/s, orig_seq/s, sqinfo/s
     */
    iNewNSeq = iOldNSeq + iNumToAdd;

    prDest->seq = (char **)
        CKREALLOC(prDest->seq, iNewNSeq * sizeof(char *));
    prDest->orig_seq = (char **)
        CKREALLOC(prDest->orig_seq, iNewNSeq * sizeof(char *));
    prDest->sqinfo = (SQINFO *)
        CKREALLOC(prDest->sqinfo, iNewNSeq * sizeof(SQINFO));

    for (iSrcSeqIndex = 0; iSrcSeqIndex < iNumToAdd; iSrcSeqIndex++) {
        int iDstSeqIndex = iOldNSeq + iSrcSeqIndex;

        /* source arrays are read through prMSeqToAdd after the
         * reallocations above, so they are valid even when the source
         * is the destination itself */
        prDest->seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);
        prDest->orig_seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);
        SeqinfoCopy(&prDest->sqinfo[iDstSeqIndex],
                    &prMSeqToAdd->sqinfo[iSrcSeqIndex]);
    }
    prDest->nseqs = iNewNSeq;

    /* NOTE(review): ReadSequences() in this file passes three arguments
     * to SeqsAreAligned(); confirm this single-argument call matches the
     * prototype actually in use. */
    prDest->aligned = SeqsAreAligned(prDest);
}
/**
 * @brief Builds one new sequence entry from a name and a residue string
 * and appends it to an existing mseq structure.
 *
 * @param[out] prMSeqDest_p
 * Already existing and initialised mseq structure; its seq, orig_seq and
 * sqinfo arrays are grown by one element and nseqs is incremented
 * @param[in] pcSeqName
 * name of the sequence to add; stored truncated to SQINFO_NAMELEN-1
 * characters
 * @param[in] pcSeqRes
 * the residues of the sequence to add; copied into both seq and orig_seq
 *
 * @note The aligned flag and the sequence type of the destination are
 * not touched here - update them yourself if necessary!
 *
 * FIXME allow adding of more features
 */
void
AddSeq(mseq_t **prMSeqDest_p, char *pcSeqName, char *pcSeqRes)
{
    mseq_t *prDest;
    SQINFO rNewInfo;
    int iSlot;
    int iSlotsNeeded;

    assert(NULL != prMSeqDest_p);
    assert(NULL != pcSeqName);
    assert(NULL != pcSeqRes);

    prDest = *prMSeqDest_p;
    iSlot = prDest->nseqs;          /* index the new entry will occupy */
    iSlotsNeeded = iSlot + 1;

    /* grow all three parallel arrays by one element */
    prDest->seq = (char **)
        CKREALLOC(prDest->seq, iSlotsNeeded * sizeof(char *));
    prDest->orig_seq = (char **)
        CKREALLOC(prDest->orig_seq, iSlotsNeeded * sizeof(char *));
    prDest->sqinfo = (SQINFO *)
        CKREALLOC(prDest->sqinfo, iSlotsNeeded * sizeof(SQINFO));

    /* working copy and pristine copy start out identical */
    prDest->seq[iSlot] = CkStrdup(pcSeqRes);
    prDest->orig_seq[iSlot] = CkStrdup(pcSeqRes);

    /* In the long run SqInfo should probably go away altogether, with
     * only the interesting members transferred into a struct of our own.
     * For now fill in just the length and the name and flag exactly
     * those two members as valid. */
    rNewInfo.len = strlen(pcSeqRes);

    /* name is an array of SQINFO_NAMELEN length: bounded copy plus
     * explicit termination */
    strncpy(rNewInfo.name, pcSeqName, SQINFO_NAMELEN-1);
    rNewInfo.name[SQINFO_NAMELEN-1] = '\0';

    rNewInfo.flags = SQINFO_LEN | SQINFO_NAME;

    SeqinfoCopy(&prDest->sqinfo[iSlot], &rNewInfo);

    prDest->nseqs += 1;
}
/**
 * @brief Reads sequences from a file and appends them to a preallocated
 * mseq structure.
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to
 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence).
 * Any other value overrides the detected type (with a warning if the
 * two differ).
 * @param[in] iSeqFmt
 * Sequence file format, handed to SeqfileOpen(). If it is SQFILE_UNKNOWN
 * and the input is stdin ('-') or a name ending in '.gz', FASTA is
 * assumed because the format cannot be autodetected there.
 * @param[in] bIsProfile
 * Passed through unchanged to SeqsAreAligned().
 * @param[in] bDealignInputSeqs
 * Passed through unchanged to SeqsAreAligned().
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read. Sequences
 * already present in prMSeq count towards this limit.
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error. On error the sequence file is
 * closed and the sequence being processed is released; sequences that
 * were already stored in prMSeq stay there.
 *
 * @note
 * - Depends heavily on squid
 * - Sequence file format will be guessed
 * - If supported by squid, gzipped files can be read as well.
 * - Zero-length sequences are rejected as an error.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */

    assert(NULL!=prMSeq);
    assert(NULL!=seqfile);

    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt &&
        (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     */
    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }

    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     */
    while (ReadSeq(dbfp, dbfp->format, &cur_seq, &cur_sqinfo)) {
        /* measured once; also keeps the %d arguments below of type int */
        int iCurSeqLen = (int)strlen(cur_seq);

        /* every failure inside this loop leaves through read_failed so
         * that the current sequence and the file handle are released */
        if (prMSeq->nseqs+1>iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            goto read_failed;
        }
        if (iCurSeqLen>iMaxSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, iCurSeqLen, iMaxSeqLen);
            goto read_failed;
        }
        if (iCurSeqLen==0) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
                cur_sqinfo.name);
            goto read_failed;
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
         */
        prMSeq->seq = (char **)
            CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);

        prMSeq->sqinfo = (SQINFO *)
            CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif

        /* always guess type from first seq. use squid function and
         * convert value
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
            switch (type) {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType) {
                if (prMSeq->seqtype != iSeqType) {
                    Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                        SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                    prMSeq->seqtype = iSeqType;
                }
            }
            /* if type could not be determined and was not set return error */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                /* NOTE(review): the copies just stored at index nseqs are
                 * not counted in nseqs on this path - confirm the caller's
                 * cleanup does not need them released here. */
                goto read_failed;
            }
        }

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

/*#if ALLOW_ONLY_PROTEIN
    if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
        Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
            SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
    }
#endif*/

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);

    /* keep original sequence as copy and convert "working" sequence
     */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
        char *pcRes;

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);

        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA */
        if (prMSeq->seqtype==SEQTYPE_DNA)
            ToDNA(prMSeq->seq[iSeqIdx]);
        if (prMSeq->seqtype==SEQTYPE_RNA)
            ToRNA(prMSeq->seq[iSeqIdx]);

        /* then check of each character; walking to the terminator avoids
         * re-measuring the string on every iteration */
        for (pcRes = prMSeq->seq[iSeqIdx]; '\0' != *pcRes; pcRes++) {
            int iUpper;

            if (isgap(*pcRes))
                continue;

            /* <ctype.h> functions need an unsigned char value */
            iUpper = toupper((unsigned char)*pcRes);

            if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
                if (NULL == strchr(AMINO_ALPHABET, iUpper)) {
                    *pcRes = AMINOACID_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_DNA) {
                if (NULL == strchr(DNA_ALPHABET, iUpper)) {
                    *pcRes = NUCLEOTIDE_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_RNA) {
                if (NULL == strchr(RNA_ALPHABET, iUpper)) {
                    *pcRes = NUCLEOTIDE_ANY;
                }
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);
    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;

read_failed:
    /* common error exit for failures while a sequence is in flight */
    FreeSequence(cur_seq, &cur_sqinfo);
    SeqfileClose(dbfp);
    return -1;
}
/**
 * @brief Appends the sequences of one mseq structure to an already
 * existing one. The destination's filename is left untouched.
 *
 * @param[in,out] prMSeqDest_p
 * Pointer to the MSeq structure to append to. Its seq, orig_seq and
 * sqinfo arrays are grown and its nseqs member is updated; its aligned
 * flag is set to TRUE. The pointer (*prMSeqDest_p) itself is never
 * reassigned here.
 * @param[in] prMSeqToAdd
 * MSeq structure whose sequences are appended. Entries are copied with
 * CkStrdup() / SeqinfoCopy(); the structure itself is not modified
 * (unless it is the very same object as the destination).
 *
 * @note If prMSeqToAdd holds no sequences a warning is logged and the
 * destination is left completely untouched (including its aligned flag).
 * @note On a seqtype mismatch a warning is logged and the destination
 * keeps its original seqtype.
 */
void
JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
{
    mseq_t *prDest;
    int iNumToAdd;
    int iOldNSeq;
    int iNewNSeq;
    int iSrcSeqIndex;

    assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
    assert(NULL != prMSeqToAdd);

    prDest = *prMSeqDest_p;

    /* Snapshot both counts up front. The loop below must not re-read
     * prMSeqToAdd->nseqs: if source and destination are the same object,
     * a bound that grows with every appended sequence would never be
     * reached and the copy would run past the reallocated arrays. */
    iNumToAdd = prMSeqToAdd->nseqs;
    iOldNSeq = prDest->nseqs;

    if (0 == iNumToAdd) {
        Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
        return;
    }

    /* warn on seqtype mismatch and keep original seqtype */
    if (prDest->seqtype != prMSeqToAdd->seqtype) {
        Log(&rLog, LOG_WARN, "Joining sequences of different type");
    }

    /* leave filename as it is */

    /*
     * make room for, then copy, the new seq/s, orig_seq/s, sqinfo/s
     */
    iNewNSeq = iOldNSeq + iNumToAdd;

    prDest->seq = (char **)
        CKREALLOC(prDest->seq, iNewNSeq * sizeof(char *));
    prDest->orig_seq = (char **)
        CKREALLOC(prDest->orig_seq, iNewNSeq * sizeof(char *));
    prDest->sqinfo = (SQINFO *)
        CKREALLOC(prDest->sqinfo, iNewNSeq * sizeof(SQINFO));

    for (iSrcSeqIndex = 0; iSrcSeqIndex < iNumToAdd; iSrcSeqIndex++) {
        int iDstSeqIndex = iOldNSeq + iSrcSeqIndex;

        /* source arrays are read through prMSeqToAdd after the
         * reallocations above, so they are valid even when the source
         * is the destination itself */
        prDest->seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);
        prDest->orig_seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);
        SeqinfoCopy(&prDest->sqinfo[iDstSeqIndex],
                    &prMSeqToAdd->sqinfo[iSrcSeqIndex]);
    }
    prDest->nseqs = iNewNSeq;

#if 0
    /* 2nd arg is bIsProfile, which when set TRUE skips
     * the check for gaps. here always check for gaps,
     * so set FALSE (main reason is that it is easier), FS, r282 -> */
    /* had a problem at this stage, therefore dispense with gap check, FS, r290 -> */
    /* 3rd argument is dealignment flag, do not dealign profiles */
    prDest->aligned = SeqsAreAligned(prDest, TRUE/*FALSE*/, FALSE);
#else
    /* the alignment check above is disabled: the joined set is simply
     * declared aligned without being inspected */
    prDest->aligned = TRUE;
#endif
}