/**
 * @brief Create a deep copy of an mseq structure.
 *
 * A fresh structure is set up with NewMSeq() and then filled from the
 * source: sequence count, sequence type, filename (only if the source
 * has one), working sequences, original sequences and the per-sequence
 * info records.
 *
 * @param[out] prMSeqDest_p
 * Receives the newly created copy
 * @param[in] prMSeqSrc
 * Source mseq structure to copy
 *
 * @note Only the members listed above are transferred; every other
 * member keeps whatever NewMSeq() stored in it.
 * @note The caller has to free the copy by calling FreeMSeq()
 *
 */
void
CopyMSeq(mseq_t **prMSeqDest_p, mseq_t *prMSeqSrc)
{
    mseq_t *prCopy;
    int iSeq;

    assert(prMSeqSrc != NULL && prMSeqDest_p != NULL);

    NewMSeq(prMSeqDest_p);
    prCopy = *prMSeqDest_p;

    /* scalar members */
    prCopy->nseqs = prMSeqSrc->nseqs;
    prCopy->seqtype = prMSeqSrc->seqtype;
    if (NULL != prMSeqSrc->filename) {
        prCopy->filename = CkStrdup(prMSeqSrc->filename);
    }

    /* one slot per sequence in each of the three parallel arrays */
    prCopy->seq = (char **) CKMALLOC(prCopy->nseqs * sizeof(char *));
    prCopy->orig_seq = (char **) CKMALLOC(prCopy->nseqs * sizeof(char *));
    prCopy->sqinfo = (SQINFO *) CKMALLOC(prCopy->nseqs * sizeof(SQINFO));

    /* duplicate the per-sequence payload */
    for (iSeq = 0; iSeq < prCopy->nseqs; iSeq++) {
        prCopy->seq[iSeq] = CkStrdup(prMSeqSrc->seq[iSeq]);
        prCopy->orig_seq[iSeq] = CkStrdup(prMSeqSrc->orig_seq[iSeq]);
        SeqinfoCopy(&prCopy->sqinfo[iSeq], &prMSeqSrc->sqinfo[iSeq]);
    }
}
/* Function: ReadMultipleRseqs()
 *
 * Purpose:  Open a data file and parse it into an array of rseqs
 *           (raw, unaligned sequences) together with a parallel
 *           array of SQINFO records.
 *
 *           Files in an alignment format (kSelex, kMSF, kClustal)
 *           are read with ReadAlignment() and then dealigned; every
 *           other format is read one sequence at a time with
 *           ReadSeq().
 *
 * Args:     seqfile    - name of the file to read
 *           fformat    - format code of the file
 *           ret_rseqs  - RETURN: array of sequences
 *           ret_sqinfo - RETURN: array of per-sequence information
 *           ret_num    - RETURN: number of sequences read
 *
 *           Caller is responsible for free'ing memory allocated
 *           to ret_rseqs and ret_sqinfo.
 *
 * Returns:  1 on success. Returns 0 on failure, in which case the
 *           RETURN arguments are left untouched and nothing allocated
 *           here stays behind. (The failing squid routine is expected
 *           to have set squid_errno; this function does not set it
 *           itself.)
 */
int
ReadMultipleRseqs(char *seqfile, int fformat, char ***ret_rseqs,
                  SQINFO **ret_sqinfo, int *ret_num)
{
  SQINFO *sqinfo;               /* array of sequence optional info         */
  SQFILE *dbfp;                 /* open ptr for sequential access of file  */
  char  **rseqs;                /* sequence array                          */
  char  **aseqs;                /* aligned sequences, if file is aligned   */
  AINFO   ainfo;                /* alignment-associated information        */
  int     numalloced;           /* num of seqs currently alloced for       */
  int     idx;
  int     num;

  if (fformat == kSelex || fformat == kMSF || fformat == kClustal)
    {
      if (! ReadAlignment(seqfile, fformat, &aseqs, &ainfo)) return 0;
      if (! DealignAseqs(aseqs, ainfo.nseq, &rseqs))
        {
          /* the alignment is still ours: release it before bailing out */
          FreeAlignment(aseqs, &ainfo);
          return 0;
        }

      /* copy the sqinfo array */
      num    = ainfo.nseq;
      sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO)*ainfo.nseq);
      for (idx = 0; idx < ainfo.nseq; idx++)
        SeqinfoCopy(&(sqinfo[idx]), &(ainfo.sqinfo[idx]));
      FreeAlignment(aseqs, &ainfo);
    }
  else
    {
      /* Open the file before allocating anything, so that a failed
       * open has nothing to clean up.
       */
      if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;

      /* initial alloc */
      num        = 0;
      numalloced = 16;
      rseqs  = (char **) MallocOrDie (numalloced * sizeof(char *));
      sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));

      while (ReadSeq(dbfp, fformat, &rseqs[num], &(sqinfo[num])))
        {
          num++;
          if (num == numalloced) /* more seqs coming, alloc more room */
            {
              numalloced += 16;
              rseqs  = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
              sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
            }
        }
      SeqfileClose(dbfp);
    }

  *ret_rseqs  = rseqs;
  *ret_sqinfo = sqinfo;
  *ret_num    = num;
  return 1;
}
/**
 * @brief Appends an mseq structure to an already existing one.
 * filename will be left untouched.
 *
 * Every sequence of prMSeqToAdd is duplicated (working sequence,
 * original sequence and sequence info) and stored behind the sequences
 * already held by the destination. The destination keeps its own
 * sequence type; a warning is logged if the two types differ. Finally
 * the destination's aligned flag is recomputed with SeqsAreAligned().
 *
 * @param[in,out] prMSeqDest_p
 * MSeq structure to which to append to
 * @param[in] prMSeqToAdd
 * MSeq structure which is to append. Nothing happens (apart from a
 * warning) if it holds no sequences.
 *
 * @note Both sequence counts are captured before anything is appended,
 * so prMSeqToAdd may be the very structure *prMSeqDest_p points to.
 *
 */
void
JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
{
    mseq_t *prDest;
    int iSrcSeqIndex;
    int iOldNSeq;
    int iNumToAdd;
    int iNewNSeq;

    assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
    assert(NULL != prMSeqToAdd);

    if (0 == prMSeqToAdd->nseqs) {
        Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
        return;
    }
    prDest = *prMSeqDest_p;

    /* warn on seqtype mismatch and keep original seqtype */
    if (prDest->seqtype != prMSeqToAdd->seqtype) {
        Log(&rLog, LOG_WARN, "Joining sequences of different type");
    }

    /* leave filename as it is */

    /* Fix the counts up front. Re-reading prMSeqToAdd->nseqs while the
     * destination grows would never terminate (and would write past
     * the arrays) if source and destination are the same structure.
     */
    iOldNSeq = prDest->nseqs;
    iNumToAdd = prMSeqToAdd->nseqs;
    iNewNSeq = iOldNSeq + iNumToAdd;

    /*
     * copy new seq/s, orig_seq/s, sqinfo/s
     */
    prDest->seq = (char **)
        CKREALLOC(prDest->seq, iNewNSeq * sizeof(char *));
    prDest->orig_seq = (char **)
        CKREALLOC(prDest->orig_seq, iNewNSeq * sizeof(char *));
    prDest->sqinfo = (SQINFO *)
        CKREALLOC(prDest->sqinfo, iNewNSeq * sizeof(SQINFO));

    for (iSrcSeqIndex = 0; iSrcSeqIndex < iNumToAdd; iSrcSeqIndex++) {
        int iDstSeqIndex = iOldNSeq + iSrcSeqIndex;

        prDest->seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);
        prDest->orig_seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);
        SeqinfoCopy(&prDest->sqinfo[iDstSeqIndex],
                    &prMSeqToAdd->sqinfo[iSrcSeqIndex]);
    }

    prDest->nseqs = iNewNSeq;
    prDest->aligned = SeqsAreAligned(prDest);

    return;
}
/**
 * @brief Build a new sequence entry from a name and a residue string
 * and append it to an existing mseq structure.
 *
 * The residue string is duplicated twice, once as working sequence and
 * once as original sequence. The sequence info record carries only the
 * length and the name; the name is cut to fit the fixed-size name
 * buffer.
 *
 * @param[in,out] prMSeqDest_p
 * Already existing and initialised mseq structure
 * @param[in] pcSeqName
 * Name of the sequence to add
 * @param[in] pcSeqRes
 * The actual sequence (residues) to add
 *
 * @note Don't forget to update the align and type flag if necessary!
 *
 * FIXME allow adding of more features
 *
 */
void
AddSeq(mseq_t **prMSeqDest_p, char *pcSeqName, char *pcSeqRes)
{
    mseq_t *prMSeq;
    SQINFO rNewInfo;
    int iSlot;

    assert(NULL != prMSeqDest_p);
    assert(NULL != pcSeqName);
    assert(NULL != pcSeqRes);

    prMSeq = *prMSeqDest_p;
    iSlot = prMSeq->nseqs;

    /* make room for one more entry in each parallel array */
    prMSeq->seq = (char **)
        CKREALLOC(prMSeq->seq, (iSlot+1) * sizeof(char *));
    prMSeq->orig_seq = (char **)
        CKREALLOC(prMSeq->orig_seq, (iSlot+1) * sizeof(char *));
    prMSeq->sqinfo = (SQINFO *)
        CKREALLOC(prMSeq->sqinfo, (iSlot+1) * sizeof(SQINFO));

    prMSeq->seq[iSlot] = CkStrdup(pcSeqRes);
    prMSeq->orig_seq[iSlot] = CkStrdup(pcSeqRes);

    /* Should probably get rid of SqInfo altogether in the long run and
     * just transfer the interesting members into our own struct.
     * For now only length and name are filled in and flagged as valid.
     */
    rNewInfo.flags = SQINFO_LEN | SQINFO_NAME;
    rNewInfo.len = strlen(pcSeqRes);
    /* name is an array of SQINFO_NAMELEN length: truncate, then
     * terminate explicitly */
    strncpy(rNewInfo.name, pcSeqName, SQINFO_NAMELEN-1);
    rNewInfo.name[SQINFO_NAMELEN-1] = '\0';

    SeqinfoCopy(&prMSeq->sqinfo[iSlot], &rNewInfo);

    prMSeq->nseqs = iSlot + 1;

    return;
}
/**
 * @brief Reorder the sequences of an mseq by their length.
 *
 * The lengths stored in the sequence info records are handed to
 * QSortAndTrackIndex(), which fills in the order to use. The mseq is
 * then rebuilt entry by entry from a temporary copy of itself:
 * working sequence, original sequence and sequence info of every slot
 * are replaced by duplicates of the entry chosen for that slot.
 *
 * @param[in,out] prMSeq
 * mseq to sort by length
 * @param[in] cOrder
 * Sorting order. 'd' for descending, 'a' for ascending.
 *
 */
void
SortMSeqByLength(mseq_t *prMSeq, const char cOrder)
{
    mseq_t *prSnapshot = NULL;
    int *piLengths;
    int *piRank;
    int iSlot;

    assert('a'==cOrder || 'd'==cOrder);

    Log(&rLog, LOG_WARN, "FIXME: This modifies sequence ordering. Might not be what user wants. Will change output order as well");

    piLengths = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
    piRank = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));

    /* collect the key to sort on */
    iSlot = 0;
    while (iSlot < prMSeq->nseqs) {
        piLengths[iSlot] = prMSeq->sqinfo[iSlot].len;
        iSlot++;
    }
    QSortAndTrackIndex(piRank, piLengths, prMSeq->nseqs, cOrder, FALSE);

    /* the snapshot keeps every entry readable while the slots of
     * prMSeq are being overwritten */
    CopyMSeq(&prSnapshot, prMSeq);

    for (iSlot = 0; iSlot < prMSeq->nseqs; iSlot++) {
        const int iFrom = piRank[iSlot];

        CKFREE(prMSeq->seq[iSlot]);
        prMSeq->seq[iSlot] = CkStrdup(prSnapshot->seq[iFrom]);

        CKFREE(prMSeq->orig_seq[iSlot]);
        prMSeq->orig_seq[iSlot] = CkStrdup(prSnapshot->orig_seq[iFrom]);

        SeqinfoCopy(&prMSeq->sqinfo[iSlot], &prSnapshot->sqinfo[iFrom]);
    }

    CKFREE(piLengths);
    CKFREE(piRank);
    FreeMSeq(&prSnapshot);

    return;
}
/**
 * @brief reads sequences from file
 *
 * Every sequence read is appended to prMSeq (working sequence and
 * sequence info). The sequence type is guessed from the first sequence
 * and may be overridden by iSeqType. Once the whole file is read, the
 * aligned flag is set, a pristine copy of every sequence is kept in
 * orig_seq and the working sequences are normalised: DNA/RNA
 * conversion, then every non-gap residue that is not part of the
 * alphabet of the sequence type is replaced by the respective "any"
 * symbol.
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to
 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
 * @param[in] iSeqFmt
 * int-encoded sequence file format. If SQFILE_UNKNOWN and the file
 * name ends in .gz or is '-', FASTA is assumed.
 * @param[in] bIsProfile
 * Handed on to SeqsAreAligned()
 * @param[in] bDealignInputSeqs
 * Handed on to SeqsAreAligned()
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error
 *
 * @note
 *  - Depends heavily on squid
 *  - Sequence file format will be guessed
 *  - If supported by squid, gzipped files can be read as well.
 *  - On error the sequence being processed is freed and the file is
 *    closed; entries already stored in prMSeq stay where they are.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */

    assert(NULL!=prMSeq);
    assert(NULL!=seqfile);

    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt &&
        (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     *
     */
    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }

    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     *
     */
    while (ReadSeq(dbfp, dbfp->format, &cur_seq, &cur_sqinfo)) {
        /* computed once; also gives the log call below an int for %d */
        const int iCurSeqLen = (int)strlen(cur_seq);

        if (prMSeq->nseqs+1>iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            goto read_failed;
        }
        if (iCurSeqLen>iMaxSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, iCurSeqLen, iMaxSeqLen);
            goto read_failed;
        }
        if (iCurSeqLen==0) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
                cur_sqinfo.name);
            goto read_failed;
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
         */
        prMSeq->seq = (char **)
            CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);

        prMSeq->sqinfo = (SQINFO *)
            CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif

        /* always guess type from first seq. use squid function and
         * convert value
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
            switch (type) {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType && prMSeq->seqtype != iSeqType) {
                Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                    SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                prMSeq->seqtype = iSeqType;
            }

            /* if type could not be determined and was not set return error */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                goto read_failed;
            }
        }

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

    /* A protein-only restriction (ALLOW_ONLY_PROTEIN) used to be
     * checked at this point; it is disabled.
     */

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);

    /* keep original sequence as copy and convert "working" sequence
     *
     */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
        char *res;

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);

        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA */
        if (prMSeq->seqtype==SEQTYPE_DNA)
            ToDNA(prMSeq->seq[iSeqIdx]);
        if (prMSeq->seqtype==SEQTYPE_RNA)
            ToRNA(prMSeq->seq[iSeqIdx]);

        /* then check of each character. Walks up to the terminator
         * instead of calling strlen() on every iteration; the
         * replacement symbols are assumed to be non-NUL.
         */
        for (res = prMSeq->seq[iSeqIdx]; '\0' != *res; res++) {
            /* <ctype.h> functions need a value representable as
             * unsigned char */
            int iUpper;

            if (isgap(*res))
                continue;
            iUpper = toupper((unsigned char)*res);

            if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
                if (NULL == strchr(AMINO_ALPHABET, iUpper)) {
                    *res = AMINOACID_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_DNA) {
                if (NULL == strchr(DNA_ALPHABET, iUpper)) {
                    *res = NUCLEOTIDE_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_RNA) {
                if (NULL == strchr(RNA_ALPHABET, iUpper)) {
                    *res = NUCLEOTIDE_ANY;
                }
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);
    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;

read_failed:
    /* Only reached from inside the read loop, i.e. with a sequence
     * freshly handed out by ReadSeq() and the file still open.
     */
    FreeSequence(cur_seq, &cur_sqinfo);
    SeqfileClose(dbfp);
    return -1;
}
/**
 * @brief Appends an mseq structure to an already existing one.
 * filename will be left untouched.
 *
 * Every sequence of prMSeqToAdd is duplicated (working sequence,
 * original sequence and sequence info) and stored behind the sequences
 * already held by the destination. The destination keeps its own
 * sequence type; a warning is logged if the two types differ. The
 * joined set is unconditionally flagged as aligned.
 *
 * @param[in,out] prMSeqDest_p
 * MSeq structure to which to append to
 * @param[in] prMSeqToAdd
 * MSeq structure which is to append. Nothing happens (apart from a
 * warning) if it holds no sequences.
 *
 * @note Both sequence counts are captured before anything is appended,
 * so prMSeqToAdd may be the very structure *prMSeqDest_p points to.
 *
 */
void
JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
{
    mseq_t *prDest;
    int iSrcSeqIndex;
    int iOldNSeq;
    int iNumToAdd;
    int iNewNSeq;

    assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
    assert(NULL != prMSeqToAdd);

    if (0 == prMSeqToAdd->nseqs) {
        Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
        return;
    }
    prDest = *prMSeqDest_p;

    /* warn on seqtype mismatch and keep original seqtype */
    if (prDest->seqtype != prMSeqToAdd->seqtype) {
        Log(&rLog, LOG_WARN, "Joining sequences of different type");
    }

    /* leave filename as it is */

    /* Fix the counts up front. Re-reading prMSeqToAdd->nseqs while the
     * destination grows would never terminate (and would write past
     * the arrays) if source and destination are the same structure.
     */
    iOldNSeq = prDest->nseqs;
    iNumToAdd = prMSeqToAdd->nseqs;
    iNewNSeq = iOldNSeq + iNumToAdd;

    /*
     * copy new seq/s, orig_seq/s, sqinfo/s
     */
    prDest->seq = (char **)
        CKREALLOC(prDest->seq, iNewNSeq * sizeof(char *));
    prDest->orig_seq = (char **)
        CKREALLOC(prDest->orig_seq, iNewNSeq * sizeof(char *));
    prDest->sqinfo = (SQINFO *)
        CKREALLOC(prDest->sqinfo, iNewNSeq * sizeof(SQINFO));

    for (iSrcSeqIndex = 0; iSrcSeqIndex < iNumToAdd; iSrcSeqIndex++) {
        int iDstSeqIndex = iOldNSeq + iSrcSeqIndex;

        prDest->seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);
        prDest->orig_seq[iDstSeqIndex] =
            CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);
        SeqinfoCopy(&prDest->sqinfo[iDstSeqIndex],
                    &prMSeqToAdd->sqinfo[iSrcSeqIndex]);
    }

    prDest->nseqs = iNewNSeq;

    /* The aligned flag is set without inspecting the sequences.
     * Earlier revisions recomputed it with
     * SeqsAreAligned(*prMSeqDest_p, TRUE, FALSE): the 2nd argument is
     * bIsProfile, which when TRUE skips the check for gaps (r282 always
     * checked for gaps; that caused a problem at this stage, so the
     * gap check was dispensed with in r290); the 3rd argument is the
     * dealignment flag, profiles are not to be dealigned. (FS)
     */
    prDest->aligned = TRUE;

    return;
}
/* Function: include_alignment()
 * Date:     SRE, Sun Jul  5 15:25:13 1998 [St. Louis]
 *
 * Purpose:  Read a multiple alignment file, align that alignment to
 *           the HMM and append the results to the caller's arrays of
 *           raw sequences, digitized sequences, SQINFO records and
 *           traces. With do_mapped TRUE the master trace comes from
 *           the HMM's alignment map (after the alignment checksum has
 *           been compared with the one stored in the HMM); otherwise
 *           it comes from P7ViterbiAlignAlignment().
 *
 * Args:     seqfile  - name of alignment file
 *           hmm      - model to align to
 *           do_mapped- TRUE if we're to use the HMM's alignment map
 *           rsq      - RETURN: array of rseqs to add to
 *           dsq      - RETURN: array of dsq to add to
 *           sqinfo   - RETURN: array of SQINFO to add to
 *           tr       - RETURN: array of traces to add to
 *           nseq     - RETURN: number of seqs
 *
 * Returns:  new, realloc'ed arrays for rsq, dsq, sqinfo, tr; nseq is
 *           increased to nseq+ainfo.nseq.
 *           Failures to open, recognise or read the file, and a
 *           checksum mismatch, are reported through ajFatal().
 */
void
include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
                  char ***rsq, char ***dsq, SQINFO **sqinfo,
                  struct p7trace_s ***tr, int *nseq)
{
  int format;                   /* format of alignment file        */
  char **aseq;                  /* aligned seqs                    */
  char **newdsq;                /* digitized versions of aseq      */
  char **newrseq;               /* dealigned versions of aseq      */
  AINFO ainfo;                  /* info that goes with aseq        */
  int k;                        /* counter over aseqs              */
  int base;                     /* number of seqs held on entry    */
  struct p7trace_s *master;     /* master trace                    */
  struct p7trace_s **addtr;     /* individual traces for aseq      */

  if (! SeqfileFormat(seqfile, &format, NULL))
    {
      /* an unopenable file is reported first; the format message
       * follows in every case */
      if (squid_errno == SQERR_NOFILE)
        ajFatal("Alignment file %s could not be opened for reading", seqfile);
      ajFatal("Failed to determine format of alignment file %s", seqfile);
    }

  /* read the alignment from file */
  if (! ReadAlignment(seqfile, format, &aseq, &ainfo))
    ajFatal("Failed to read aligned sequence file %s", seqfile);
  for (k = 0; k < ainfo.nseq; k++)
    s2upper(aseq[k]);

  /* Verify checksums before mapping */
  if (do_mapped && GCGMultchecksum(aseq, ainfo.nseq) != hmm->checksum)
    ajFatal("The checksums for alignment file %s and the HMM alignment map don't match.",
            seqfile);

  /* Get a master trace */
  if (do_mapped)
    master = MasterTraceFromMap(hmm->map, hmm->M, ainfo.alen);
  else
    master = P7ViterbiAlignAlignment(aseq, &ainfo, hmm);

  /* convert to individual traces */
  ImposeMasterTrace(aseq, ainfo.nseq, master, &addtr);

  /* add those traces to existing ones */
  *tr = MergeTraceArrays(*tr, *nseq, addtr, ainfo.nseq);

  /* additional bookkeeping: add to rsq, dsq, sqinfo.
   * New entries go to positions base .. base+ainfo.nseq-1.
   */
  base = *nseq;

  *rsq = ReallocOrDie((*rsq), sizeof(char *) * (base + ainfo.nseq));
  DealignAseqs(aseq, ainfo.nseq, &newrseq);
  for (k = 0; k < ainfo.nseq; k++)
    (*rsq)[base + k] = newrseq[k];
  free(newrseq);                /* only the pointer array; strings moved */

  *dsq = ReallocOrDie((*dsq), sizeof(char *) * (base + ainfo.nseq));
  DigitizeAlignment(aseq, &ainfo, &newdsq);
  for (k = 0; k < ainfo.nseq; k++)
    (*dsq)[base + k] = newdsq[k];
  free(newdsq);                 /* only the pointer array; strings moved */

  /* unnecessarily complex, but I can't be bothered... */
  *sqinfo = ReallocOrDie((*sqinfo), sizeof(SQINFO) * (base + ainfo.nseq));
  for (k = 0; k < ainfo.nseq; k++)
    SeqinfoCopy(&((*sqinfo)[base + k]), &(ainfo.sqinfo[k]));

  *nseq = base + ainfo.nseq;

  /* Cleanup */
  P7FreeTrace(master);
  FreeAlignment(aseq, &ainfo);

  /* Return */
  return;
}
/* Function: ReadSeq()
 *
 * Purpose:  Read next sequence from an open database file.
 *           Return the sequence and associated info.
 *
 *           For the multiple alignment formats (kMSF, kSelex,
 *           kClustal) the next aligned sequence held by the file
 *           object is copied with its gaps removed. For all other
 *           formats the matching format reader fills the buffer.
 *
 * Args:     V       - open sequence database file pointer
 *           format  - format of the file (previously determined
 *                     by call to SeqfileFormat())
 *           ret_seq - RETURN: sequence
 *           sqinfo  - RETURN: filled in w/ other information
 *
 * Return:   1 on success, 0 on failure.
 *           ret_seq and some field of sqinfo are allocated here,
 *           The preferred call mechanism to properly free the memory is:
 *
 *           SQINFO sqinfo;
 *           char  *seq;
 *
 *           ReadSeq(fp, format, &seq, &sqinfo);
 *           ... do something...
 *           FreeSequence(seq, &sqinfo);
 *
 *           squid_errno is reset to SQERR_OK on entry and set to
 *           SQERR_FORMAT for a format this function cannot read.
 *           *ret_seq is set to NULL only when the format code is out
 *           of range; the other early 0 returns leave it untouched.
 */
int
ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo)
{
  int found_dots;               /* TRUE if the GCG line holds ".."    */
  int col;                      /* position in the aligned sequence   */
  int nres;                     /* residues copied so far             */
  int from_alignment;

  squid_errno = SQERR_OK;
  if (format < kMinFormat || format > kMaxFormat)
    {
      squid_errno = SQERR_FORMAT;
      *ret_seq    = NULL;
      return 0;
    }

  /* Here's the hack for sequential access of sequences from
   * the multiple sequence alignment formats
   */
  from_alignment = (format == kMSF || format == kSelex || format == kClustal);

  if (from_alignment)
    {
      if (V->ali_curridx >= V->ali_ainfo.nseq)
        return 0;               /* out of aseqs */

      SeqinfoCopy(sqinfo, &(V->ali_ainfo.sqinfo[V->ali_curridx]));

      /* copy and dealign the appropriate aligned seq */
      V->seq = MallocOrDie (sizeof(char) * (V->ali_ainfo.alen+1));
      nres = 0;
      for (col = 0; col < V->ali_ainfo.alen; col++)
        {
          if (!isgap(V->ali_aseqs[V->ali_curridx][col]))
            {
              V->seq[nres] = V->ali_aseqs[V->ali_curridx][col];
              nres++;
            }
        }
      V->seq[nres] = '\0';
      V->seqlen    = nres;
      V->ali_curridx++;
    }
  else
    {
      if (feof(V->f)) return 0;

      /* fresh, zeroed buffer and a reset sqinfo for the format reader */
      V->seq           = (char*) calloc (kStartLength+1, sizeof(char));
      V->maxseq        = kStartLength;
      V->seqlen        = 0;
      V->sqinfo        = sqinfo;
      V->sqinfo->flags = 0;
      if (format == kEMBL)
        V->dash_equals_n = TRUE;
      else
        V->dash_equals_n = FALSE;

      switch (format) {
      case kIG      : readIG(V);      break;
      case kStrider : readStrider(V); break;
      case kGenBank : readGenBank(V); break;
      case kPearson : readPearson(V); break;
      case kEMBL    : readEMBL(V);    break;
      case kZuker   : readZuker(V);   break;
      case kPIR     : readPIR(V);     break;
      case kSquid   : readSquid(V);   break;
      case kGCGdata : readGCGdata(V); break;

      case kGCG:
        do {
          /* skip leading comments on GCG file */
          found_dots = (strstr(V->sbuffer,"..") != NULL);
          if (found_dots)
            readUWGCG(V);
          getline2(V);
        } while (! feof(V->f));
        break;

      case kIdraw:   /* SRE: no attempt to read idraw postscript */
      default:
        squid_errno = SQERR_FORMAT;
        free(V->seq);
        return 0;
      }
      V->seq[V->seqlen] = '\0'; /* stick a string terminator on it */
    }

  /* Cleanup */
  sqinfo->len    = V->seqlen;
  sqinfo->flags |= SQINFO_LEN;
  *ret_seq       = V->seq;
  return (squid_errno == SQERR_OK) ? 1 : 0;
}