/* Function: make_alilist()
 *
 * Purpose:  Build an array that maps every raw (non-gap) symbol of s1
 *           to the raw index of the s2 symbol found in the same
 *           alignment column, or to -1 when s2 has a gap there.
 *           The array is allocated with strlen(s1) entries, which is
 *           an upper bound on the number of raw symbols in s1.
 *
 * Args:     s1           - aligned sequence the list is built for
 *           s2           - aligned sequence s1 is aligned to; it is
 *                          indexed by s1's columns, so it must be at
 *                          least as long as s1
 *           ret_s1_list  - RETURN: the constructed list (caller must free)
 *           ret_listlen  - RETURN: number of entries actually filled in
 *
 * Returns:  1. There is no failure return in this function.
 */
static int
make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
{
  int *map;                     /* the list under construction       */
  int  apos;                    /* alignment column                  */
  int  nraw1 = 0;               /* raw symbols of s1 seen so far     */
  int  nraw2 = 0;               /* raw symbols of s2 seen so far     */

  map = (int *) MallocOrDie (sizeof(int) * strlen(s1));

  for (apos = 0; s1[apos] != '\0'; apos++)
    {
      int s2_gapped = isgap(s2[apos]);

      /* A residue in s1 gets an entry: its partner index, or -1 */
      if (! isgap(s1[apos]))
        {
          map[nraw1] = s2_gapped ? -1 : nraw2;
          nraw1++;
        }

      /* A residue in s2 advances s2's raw coordinate */
      if (! s2_gapped)
        nraw2++;
    }

  *ret_listlen = nraw1;
  *ret_s1_list = map;
  return 1;
}
/* Function: make_ref_alilist()
 *
 * Purpose:  Like a plain alignment list, but restricted to reference
 *           ("canonical") columns. For each raw symbol of s1 that is
 *           flagged canonical, record the raw index of the s2 symbol in
 *           the same column, or -1 if s2 has a gap there.
 *
 *           Canonical flags are derived from k1 and ref: the i'th raw
 *           symbol of k1 is canonical when ref[] is nonzero at that
 *           symbol's column in k1. The flags are then looked up by raw
 *           position in s1.
 *
 * Args:     ref          - per-column flags, indexed by k1's columns
 *           k1           - aligned sequence used to build the flag array
 *           k2           - unused
 *           s1           - aligned sequence the list is built for
 *           s2           - aligned sequence s1 is aligned to
 *           ret_s1_list  - RETURN: the constructed list (caller must free)
 *           ret_listlen  - RETURN: number of entries in the list
 *
 * Returns:  1. There is no failure return in this function.
 */
/*ARGSUSED*/
static int
make_ref_alilist(int *ref, char *k1, char *k2, char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
{
  int *map;                     /* the list under construction            */
  int *is_canon;                /* per raw position: 1 if canonical       */
  int  apos;                    /* alignment column                       */
  int  raw1, raw2;              /* raw coordinates in s1 (or k1) and s2   */
  int  nlist;                   /* entries written to map so far          */

  /* Both arrays are sized by strlen(s1) */
  map      = (int *) MallocOrDie (sizeof(int) * strlen(s1));
  is_canon = (int *) MallocOrDie (sizeof(int) * strlen(s1));

  /* Pass 1: flag each raw symbol of k1 by its column's ref[] value */
  raw1 = 0;
  for (apos = 0; k1[apos] != '\0'; apos++)
    {
      if (isgap(k1[apos]))
        continue;
      is_canon[raw1] = (ref[apos] != 0);
      raw1++;
    }

  /* Pass 2: emit an entry only for canonical residues of s1, while
   * still advancing both raw coordinates for every residue seen.
   */
  raw1 = raw2 = nlist = 0;
  for (apos = 0; s1[apos] != '\0'; apos++)
    {
      if (! isgap(s1[apos]))
        {
          if (is_canon[raw1])
            {
              map[nlist] = isgap(s2[apos]) ? -1 : raw2;
              nlist++;
            }
          raw1++;
        }
      if (! isgap(s2[apos]))
        raw2++;
    }

  free(is_canon);
  *ret_listlen = nlist;
  *ret_s1_list = map;
  return 1;
}
/* Function: DealignAseqs()
 *
 * Purpose:  Produce gap-free copies of num aligned sequences. A new
 *           array of num strings is allocated; each string gets
 *           strlen(aligned)+1 bytes and holds the aligned sequence with
 *           every isgap() character removed.
 *
 * Args:     aseqs     - [0..num-1] aligned, NUL-terminated sequences
 *           num       - number of sequences
 *           ret_rseqs - RETURN: the new array of raw sequences; the
 *                       caller frees each string and the array
 *
 * Returns:  1. This function has no failure return.
 */
int
DealignAseqs(char **aseqs, int num, char ***ret_rseqs)
{
  char **raw;                   /* array of de-aligned copies */
  int    i;                     /* sequence index             */

  raw = (char **) MallocOrDie (num * sizeof(char *));

  for (i = 0; i < num; i++)
    {
      int   alen = strlen(aseqs[i]);    /* aligned length        */
      char *src  = aseqs[i];            /* read cursor           */
      char *dst;                        /* write cursor          */

      raw[i] = (char *) MallocOrDie ((alen + 1) * sizeof(char));
      dst    = raw[i];

      /* copy residues, drop gaps */
      while (*src != '\0')
        {
          if (!isgap(*src))
            *dst++ = *src;
          src++;
        }
      *dst = '\0';
    }

  *ret_rseqs = raw;
  return 1;
}
/** * note: naseq should be unit-offset */ static void encode(char *seq, char *naseq, int l, const char *res_codes) { /* code seq as ints .. use GAP_POS2 for gap */ register int i; bool seq_contains_unknown_char = FALSE; /*LOG_DEBUG("seq=%s naseq=%p l=%d", &(seq[1]), naseq, l); */ for (i=1; i<=l; i++) { char res = toupper(seq[i]); if (isgap(res)) { naseq[i] = GAP_POS2; /* gap in input */ } else { naseq[i] = res_index(res_codes, res); } /*LOG_DEBUG("Character '%c' at pos %d", res, i);*/ if (-1 == naseq[i]) { seq_contains_unknown_char = TRUE; /*LOG_DEBUG("Unknown character '%c' at pos %d", res, i);*/ } /*LOG_DEBUG("na_seq[%d]=%d", i, naseq[i]);*/ } if (TRUE == seq_contains_unknown_char) Log(&rLog, LOG_WARN, "Unknown character in seq '%s'", &(seq[1])); naseq[i] = END_MARK; return; }
// Returns how many of the first L bytes of Seq are not gap characters
// (as decided by isgap).
unsigned GetUngappedLength(const byte *Seq, unsigned L)
	{
	unsigned Count = 0;
	unsigned Pos = 0;
	while (Pos < L)
		{
		if (!isgap(Seq[Pos]))
			++Count;
		++Pos;
		}
	return Count;
	}
/**
 * @brief Decide whether the sequences in an mseq structure are aligned.
 *
 * A single sequence is defined to be aligned (with itself). Otherwise
 * the sequences count as aligned only if every sequence has the same
 * recorded length as its predecessor and at least one gap character
 * was found before the scan ended.
 *
 * @param[in] prMSeq
 * Sequences to check
 *
 * @return TRUE if sequences are aligned, FALSE if not
 */
bool
SeqsAreAligned(mseq_t *prMSeq)
{
    bool bSawGap = FALSE;
    int iSeq; /* sequence counter */
    int iPos; /* position within current sequence */

    /* A lone sequence is treated as a 1-profile and hence aligned */
    if (1 == prMSeq->nseqs) {
        return TRUE;
    }

    for (iSeq=0; iSeq<prMSeq->nseqs; iSeq++) {
        /* Only keep scanning for gaps until the first one turns up */
        if (FALSE == bSawGap) {
            for (iPos=0; iPos<prMSeq->sqinfo[iSeq].len; iPos++) {
                if (isgap(prMSeq->seq[iSeq][iPos])) {
                    bSawGap = TRUE;
                    break;
                }
            }
        }

        /* Any length mismatch with the previous sequence settles it */
        if (iSeq>0
            && prMSeq->sqinfo[iSeq].len != prMSeq->sqinfo[iSeq-1].len) {
            return FALSE;
        }
    }

    /* All lengths agree; aligned only if a gap was seen */
    if (TRUE == bSawGap) {
        return TRUE;
    }
    return FALSE;
}
/* Function: BlockRaggedEdgedAlignment()
 *
 * Purpose:  Mark exterior gaps so they can be told apart from interior
 *           ones. In every row, the run of gap characters at the left
 *           edge and the run at the right edge are overwritten with
 *           ','. Interior gaps are left alone.
 *
 * Args:     aseqs  - [0..nseq-1][0..alen-1] alignment to block
 *           nseq   - number of seqs in the alignment
 *           alen   - width of alignment, columns
 *
 * Return:   (void). Data in aseqs is changed.
 */
void
BlockRaggedEdgedAlignment(char **aseqs, int nseq, int alen)
{
  int i;

  for (i = 0; i < nseq; i++)
    {
      char *row   = aseqs[i];
      int   left  = 0;
      int   right = alen - 1;

      /* sweep inward from the left edge */
      while (left < alen && isgap(row[left]))
        {
          row[left] = ',';
          left++;
        }

      /* sweep inward from the right edge */
      while (right >= 0 && isgap(row[right]))
        {
          row[right] = ',';
          right--;
        }
    }
}
// Replaces the contents of s with the first L bytes of Seq, minus any
// byte that isgap() reports as a gap. Case is left untouched.
static void StripGaps(const byte *Seq, unsigned L, string &s)
	{
	s.clear();
	unsigned Pos = 0;
	while (Pos < L)
		{
		const char c = Seq[Pos];
		++Pos;
		if (isgap(c))
			continue;
		s.push_back(c);
		}
	}
/* Function: PairwiseIdentity()
 *
 * Purpose:  Fractional identity of two aligned sequences, computed as
 *           idents / MIN(len1, len2), where len1 and len2 count the
 *           non-gap symbols of s1 and s2 and idents counts columns in
 *           which s1 holds a non-gap symbol equal to s2's symbol.
 *           Columns are compared until either string ends.
 *
 *           The choice of denominator matters:
 *           idents/(idents+mismat) would give artifactual gappy
 *           alignments high "identities", and idents/(AVG|MAX)(len1,len2)
 *           would give fragment-to-full-length alignments artifactually
 *           low ones.
 *
 *           The comparison is exact and case-sensitive, so in nucleic
 *           acid alignments U/T (RNA/DNA) pairs count as mismatches.
 *
 * Returns:  the identity as a float; 0.0 when the shorter raw length
 *           is zero.
 */
float
PairwiseIdentity(char *s1, char *s2)
{
  int   nmatch = 0;             /* identical positions               */
  int   n1     = 0;             /* non-gap symbols in s1             */
  int   n2     = 0;             /* non-gap symbols in s2             */
  int   denom;                  /* MIN(n1, n2)                       */
  char *p;
  char *q;

  for (p = s1, q = s2; *p != '\0' && *q != '\0'; p++, q++)
    {
      if (!isgap(*q))
        n2++;

      if (isgap(*p))
        continue;
      n1++;
      if (*p == *q)
        nmatch++;
    }

  denom = (n2 < n1) ? n2 : n1;
  if (denom == 0)
    return 0.0;
  return (float) nmatch / (float) denom;
}
void AlignChime3SD(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3, ChimeHit2 &Hit) { if (opt_realign) { AlignChime3SDRealign(QSD3, ASD3, BSD3, Hit); return; } string Q3; string A3; string B3; const unsigned ColCount = QSD3.L; asserta(ASD3.L == ColCount && BSD3.L == ColCount); Q3.reserve(ColCount); A3.reserve(ColCount); B3.reserve(ColCount); const byte *QS = QSD3.Seq; const byte *AS = ASD3.Seq; const byte *BS = BSD3.Seq; for (unsigned Col = 0; Col < ColCount; ++Col) { byte q = toupper(QS[Col]); byte a = toupper(AS[Col]); byte b = toupper(BS[Col]); if (isgap(q) && isgap(a) && isgap(b)) continue; Q3.push_back(q); A3.push_back(a); B3.push_back(b); } AlignChime3(Q3, A3, B3, QSD3.Label, ASD3.Label, BSD3.Label, Hit); }
static void StripGapsAlloc(const SeqData &SDIn, SeqData &SDOut) { SDOut = SDIn; byte *s = myalloc(byte, SDIn.L); unsigned k = 0; for (unsigned i = 0; i < SDIn.L; ++i) { char c = SDIn.Seq[i]; if (!isgap(c)) s[k++] = toupper(c); } SDOut.Seq = s; SDOut.L = k; }
/* Function: dataline_MSF()
 *
 * Purpose:  Decide whether a line of an MSF file carries sequence data.
 *           Leading whitespace is skipped. A line that is then empty,
 *           or whose first character appears in commentsyms, is not
 *           data. A line that starts with expected_name is data.
 *           Otherwise the line is data if it contains any alphabetic
 *           or gap character (lines with only digits and spaces, such
 *           as MSF coordinate lines, are not).
 *
 * Args:     buf           - NUL-terminated line to classify
 *           expected_name - sequence name expected at the start of the
 *                           line, or NULL if unknown
 *
 * Returns:  1 if the line looks like data, 0 otherwise.
 */
static int
dataline_MSF(char *buf, char *expected_name)
{
  /* ctype arguments are cast to unsigned char; passing a negative
   * plain char is undefined behaviour.
   */
  while (*buf && isspace((unsigned char) *buf)) buf++;
  if (*buf == '\0' || strchr(commentsyms, *buf) != NULL)
    return 0;                   /* blank or comment */

  /* Compare the whole expected name as a prefix. The "== 0" belongs
   * outside the strncmp() call; inside it, it became the length
   * argument (0 or 1) and the name was never actually compared.
   */
  if (expected_name != NULL &&
      strncmp(buf, expected_name, strlen(expected_name)) == 0)
    return 1;                   /* matches expected seq name, definitely data */

  for (; *buf != '\0'; buf++)
    {                           /* MSF has coordinate lines to worry about */
      if (isspace((unsigned char) *buf)) continue;      /* no info from spaces */
      if (isalpha((unsigned char) *buf) || isgap(*buf))
        return 1;               /* has data on it */
    }
  return 0;
}
/* Function: dataline_clustal()
 *
 * Purpose:  Decide whether a line of a Clustal file carries sequence
 *           data. Leading whitespace is skipped. A line that is then
 *           empty, or whose first character appears in commentsyms, is
 *           not data. A line that starts with expected_name is data.
 *           Otherwise '*', '.' and ':' are ignored (they may belong to
 *           a consensus line) and the line is data if it contains an
 *           alphanumeric character or a non-space gap character.
 *
 * Args:     buf           - NUL-terminated line to classify
 *           expected_name - sequence name expected at the start of the
 *                           line, or NULL if unknown
 *
 * Returns:  1 if the line looks like data, 0 otherwise.
 */
static int
dataline_clustal(char *buf, char *expected_name)
{
  /* ctype arguments are cast to unsigned char; passing a negative
   * plain char is undefined behaviour.
   */
  while (*buf && isspace((unsigned char) *buf)) buf++;
  if (*buf == '\0' || strchr(commentsyms, *buf) != NULL)
    return 0;                   /* blank or comment */

  /* Compare the whole expected name as a prefix. The "== 0" belongs
   * outside the strncmp() call; inside it, it became the length
   * argument (0 or 1) and the name was never actually compared.
   */
  if (expected_name != NULL &&
      strncmp(buf, expected_name, strlen(expected_name)) == 0)
    return 1;                   /* matches expected seq name, definitely data */

  for (; *buf != '\0'; buf++)
    {                           /* Clustal has no coord lines to worry about */
      if (*buf == '*' || *buf == '.' || *buf == ':')
        continue;               /* possible consensus line */
      if (isalnum((unsigned char) *buf))
        return 1;               /* name or seq character */
      if (*buf != ' ' && isgap(*buf))
        return 1;               /* possible all-gap line */
    }
  return 0;
}
// Splits a list of intermediate-code elements into consecutive blocks.
//
// A block ends at element i when isgap(ic[i]) holds or when the next
// element is a LABEL. Each block records the half-open index range
// [begin, end) into ic. Elements after the last such boundary are not
// placed in any block.
//
// After splitting, every block whose first or second element is a FUN
// is inspected: the target of that block's last element is taken, and
// every block containing a LABEL element with that same target is
// marked valid. The indices of all valid blocks are then printed to
// stdout on one line, each followed by a space.
//
// Returns the vector of blocks.
vector<pblock::PBLOCK> pblock::split(vector<gen::quartral_element> ic)
{
    int last=0;
    PBLOCK tmp=PBLOCK();
    vector<pblock::PBLOCK> result;

    for (size_t i=0;i<ic.size();i++)
    {
        // Look at ic[i+1] only when it exists; on the last element the
        // unguarded read ran past the end of the vector.
        if (isgap(ic[i])||(i+1<ic.size()&&ic[i+1].op==gen::LABEL))
        {
            tmp.begin=last;
            tmp.end=i+1;
            last=i+1;
            result.push_back(tmp);
        }
    }

    for (size_t i=0;i<result.size();i++)
    {
        if (ic[result[i].begin].op==gen::FUN||(result[i].begin+1<ic.size()&&ic[result[i].begin+1].op==gen::FUN))
        {
            // Target named by the last element of this block
            string tmps = ic[result[i].end-1].target;
            for (size_t j=0;j<result.size();j++)
            {
                for (size_t k=result[j].begin;k<result[j].end;k++)
                {
                    if (ic[k].target==tmps&&ic[k].op==gen::LABEL)
                    {
                        result[j].valid=true;
                    }
                }
            }
        }
    }

    for (size_t i=0;i<result.size();i++)
    {
        if (result[i].valid==true)
        {
            cout<<i<<' ';
        }
    }
    cout<<endl;
    return result;
}
/**
 * @brief Removes all gap-characters from a sequence, in place.
 *
 * Non-gap characters are shifted towards the front of the buffer in
 * their original order and the string is re-terminated after the last
 * one kept.
 *
 * @param[in,out] seq
 * NUL-terminated sequence to dealign; must not be NULL
 *
 * @note seq will not be reallocated
 */
void
DealignSeq(char *seq)
{
    const char *src; /* read cursor */
    char *dst;       /* write cursor; never ahead of src */

    assert(seq!=NULL);

    /* Single pass driven by the terminator. Calling strlen() in the
     * loop condition rescanned the string on every iteration. */
    dst = seq;
    for (src = seq; *src != '\0'; src++) {
        if (! isgap(*src)) {
            *dst++ = *src;
        }
    }
    *dst = '\0';

    return;
}
/* Function: MSANogap()
 * Date:     SRE, Wed Nov 17 09:59:51 1999 [St. Louis]
 *
 * Purpose:  Drop every column of a multiple sequence alignment in
 *           which at least one sequence has a gap -- used for
 *           filtering before phylogenetic analysis. A per-column
 *           keep/discard flag array is built and handed to
 *           MSAShorterAlignment().
 *
 * Args:     msa - the alignment
 *
 * Returns:  (void). The alignment is modified, so if you want to keep
 *           the original for something, make a copy.
 */
void
MSANogap(MSA *msa)
{
  int *keep;                    /* TRUE/FALSE per column: keep it?     */
  int  col;                     /* column in original alignment        */
  int  row;                     /* sequence index                      */
  int  has_gap;                 /* TRUE once a gap is seen in column   */

  keep = MallocOrDie(sizeof(int) * msa->alen);

  for (col = 0; col < msa->alen; col++)
    {
      has_gap = FALSE;
      for (row = 0; row < msa->nseq && !has_gap; row++)
        {
          if (isgap(msa->aseq[row][col]))
            has_gap = TRUE;
        }
      keep[col] = has_gap ? FALSE : TRUE;
    }

  MSAShorterAlignment(msa, keep);
  free(keep);
  return;
}
/* Function: DigitizeAlignment()
 *
 * Purpose:  Turn an alignment into an array of digitized, unaligned
 *           sequences. For each row, gap columns are skipped and every
 *           other symbol is converted with SymbolIndex(). Each output
 *           sequence is 1-offset: a sentinel byte (Alphabet_iupac) is
 *           stored at position 0 and again right after the last
 *           digitized symbol.
 *
 * Args:     msa      - alignment to digitize
 *           ret_dsqs - RETURN: array of digitized unaligned sequences
 *
 * Return:   (void)
 *           dsqs is alloced here (msa->nseq rows of msa->alen+2 bytes).
 *           Free2DArray(dseqs, nseq).
 */
void
DigitizeAlignment(MSA *msa, char ***ret_dsqs)
{
  char **dsq;
  int    i;                     /* sequence index */

  dsq = (char **) MallocOrDie (sizeof(char *) * msa->nseq);

  for (i = 0; i < msa->nseq; i++)
    {
      char *out;                /* digitized sequence being filled */
      int   n;                  /* next free slot in out           */
      int   col;                /* column in the aligned sequence  */

      out    = (char *) MallocOrDie (sizeof(char) * (msa->alen+2));
      dsq[i] = out;

      out[0] = (char) Alphabet_iupac;   /* leading sentinel */
      n = 1;
      for (col = 0; col < msa->alen; col++)
        {
          if (isgap(msa->aseq[i][col]))
            continue;
          out[n] = SymbolIndex(msa->aseq[i][col]);
          n++;
        }
      out[n] = (char) Alphabet_iupac;   /* trailing sentinel */
    }

  *ret_dsqs = dsq;
}
/* Function: Seqtype() * * Purpose: Returns a (very good) guess about type of sequence: * kDNA, kRNA, kAmino, or kOtherSeq. * * Modified from, and replaces, Gilbert getseqtype(). */ int Seqtype(char *seq) { int saw; /* how many non-gap characters I saw */ char c; int po = 0; /* count of protein-only */ int nt = 0; /* count of t's */ int nu = 0; /* count of u's */ int na = 0; /* count of nucleotides */ int aa = 0; /* count of amino acids */ int no = 0; /* count of others */ /* Look at the first 300 non-gap characters */ for (saw = 0; *seq != '\0' && saw < 300; seq++) { c = sre_toupper((int) *seq); if (! isgap(c)) { if (strchr(protonly, c)) po++; else if (strchr(primenuc,c)) { na++; if (c == 'T') nt++; else if (c == 'U') nu++; } else if (strchr(aminos,c)) aa++; else if (isalpha(c)) no++; saw++; } } if (no > 0) return kOtherSeq; else if (po > 0) return kAmino; else if (na > aa) { if (nu > nt) return kRNA; else return kDNA; } else return kAmino; }
/**
 * @brief reads sequences from file
 *
 * Every sequence read is appended to prMSeq (sequence string plus a
 * copy of its SQINFO). The sequence type is guessed from the first
 * sequence and may be overridden by iSeqType. After reading, the
 * aligned flag is set via SeqsAreAligned(), an untouched copy of each
 * sequence is kept in orig_seq, and the working copies are normalised:
 * DNA/RNA are converted with ToDNA()/ToRNA() and every non-gap
 * character outside the alphabet of the sequence type is replaced by
 * the corresponding "any" symbol.
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to
 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
 * @param[in] iSeqFmt
 * int-encoded sequence file format passed to SeqfileOpen(). If it is
 * SQFILE_UNKNOWN and seqfile ends in ".gz" or is "-", FASTA is assumed.
 * @param[in] bIsProfile
 * Handed through to SeqsAreAligned()
 * @param[in] bDealignInputSeqs
 * Handed through to SeqsAreAligned()
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error. On error, sequences already
 * appended to prMSeq are left in place.
 *
 * @note
 *  - Depends heavily on squid
 *  - Sequence file format will be guessed
 *  - If supported by squid, gzipped files can be read as well.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */
    int iSeqPos; /* sequence string position counter */

    assert(NULL!=seqfile);

    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt &&
        (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     */
    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }

    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     */
    while (ReadSeq(dbfp, dbfp->format, &cur_seq, &cur_sqinfo)) {
        /* measured once; also gives the %d formats below an int */
        const int iCurLen = (int)strlen(cur_seq);

        /* Every error exit from this loop goes through read_error so
         * the current sequence is freed and the file is closed */
        if (prMSeq->nseqs+1>iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            goto read_error;
        }
        if (iCurLen>iMaxSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, iCurLen, iMaxSeqLen);
            goto read_error;
        }
        if (iCurLen==0) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
                cur_sqinfo.name);
            goto read_error;
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
         */
        prMSeq->seq = (char **)
            CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);

        prMSeq->sqinfo = (SQINFO *)
            CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif

        /* always guess type from first seq. use squid function and
         * convert value
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
            switch (type) {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType) {
                if (prMSeq->seqtype != iSeqType) {
                    Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                        SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                    prMSeq->seqtype = iSeqType;
                }
            }
            /* if type could not be determined and was not set return error */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                goto read_error;
            }
        }

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

/*#if ALLOW_ONLY_PROTEIN
    if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
        Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
            SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
    }
#endif*/

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);

    /* keep original sequence as copy and convert "working" sequence
     */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);

        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA */
        if (prMSeq->seqtype==SEQTYPE_DNA)
            ToDNA(prMSeq->seq[iSeqIdx]);
        if (prMSeq->seqtype==SEQTYPE_RNA)
            ToRNA(prMSeq->seq[iSeqIdx]);

        /* then check of each character; walk to the terminator rather
         * than re-running strlen() on every iteration */
        for (iSeqPos=0; prMSeq->seq[iSeqIdx][iSeqPos] != '\0'; iSeqPos++) {
            char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]);
            if (isgap(*res))
                continue;

            /* toupper() needs a value representable as unsigned char */
            if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
                if (NULL == strchr(AMINO_ALPHABET, toupper((unsigned char) *res))) {
                    *res = AMINOACID_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_DNA) {
                if (NULL == strchr(DNA_ALPHABET, toupper((unsigned char) *res))) {
                    *res = NUCLEOTIDE_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_RNA) {
                if (NULL == strchr(RNA_ALPHABET, toupper((unsigned char) *res))) {
                    *res = NUCLEOTIDE_ANY;
                }
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);
    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;

read_error:
    /* release the sequence currently held and the open input file */
    FreeSequence(cur_seq, &cur_sqinfo);
    SeqfileClose(dbfp);
    return -1;
}
/**
 * @brief Decide whether the sequences in an mseq structure are aligned.
 *
 * Decision table, as implemented:
 *  - exactly one sequence: aligned;
 *  - recorded lengths differ: not aligned (a debug message suggesting
 *    --dealign is logged if a gap had been seen by then);
 *  - equal lengths and at least one gap: aligned;
 *  - equal lengths, no gaps: aligned only if bIsProfile is TRUE.
 *
 * @param[in] prMSeq
 * Sequences to check
 * @param[in] bIsProfile
 * Caller's assertion that the input is a profile; decides the
 * equal-length, gap-free case
 * @param[in] bDealignInputSeqs
 * Not used by the current implementation
 *
 * @return TRUE if sequences are aligned, FALSE if not
 */
bool
SeqsAreAligned(mseq_t *prMSeq, bool bIsProfile, bool bDealignInputSeqs)
{
    bool bSawGap = FALSE;
    bool bEqualLen = TRUE;
    int iSeq; /* sequence counter */
    int iPos; /* position within current sequence */

    /* A lone sequence is treated as a 1-profile and hence aligned */
    if (1 == prMSeq->nseqs) {
        return TRUE;
    }

    for (iSeq=0; iSeq<prMSeq->nseqs; iSeq++) {
        /* Only keep scanning for gaps until the first one turns up */
        if (FALSE == bSawGap) {
            for (iPos=0; iPos<prMSeq->sqinfo[iSeq].len; iPos++) {
                if (isgap(prMSeq->seq[iSeq][iPos])) {
                    bSawGap = TRUE;
                    break;
                }
            }
        }

        /* A length mismatch with the previous sequence ends the scan */
        if (iSeq>0
            && prMSeq->sqinfo[iSeq].len != prMSeq->sqinfo[iSeq-1].len) {
            bEqualLen = FALSE;
            break;
        }
    }

    /* Unequal lengths can never be a profile */
    if (FALSE == bEqualLen) {
        if (TRUE == bSawGap) {
            Log(&rLog, LOG_FORCED_DEBUG, "Potential Problem: sequences (N=%d) don't have same lengths but contain gaps, consider using --dealign", prMSeq->nseqs);
        }
        return FALSE;
    }

    /* Equal lengths plus a gap: certainly a profile */
    if (TRUE == bSawGap) {
        return TRUE;
    }

    /* Equal lengths, no gaps: take the caller's word for it */
    if (TRUE == bIsProfile) {
        return TRUE;
    }
    return FALSE;
}
// Local three-way chimera scan.
//
// Q3, A3 and B3 are three rows of one alignment (query and two
// candidate parents); all three must have the same number of columns
// (asserted). The function
//   1. scores each informative column for "Q agrees with A" and for
//      "Q agrees with B",
//   2. builds running best-segment sums from the left and from the
//      right,
//   3. picks the crossover column ColX that maximizes a left segment
//      for one parent plus a right segment for the other,
//   4. recovers the segment ends ColLo..ColHi around ColX,
//   5. optionally rejects the hit if the segment covers too small a
//      fraction of the query (opt_queryfract), and
//   6. passes the ColLo..ColHi sub-alignment to AlignChimeGlobal3.
//
// Hit is cleared on entry. If no crossover with positive score is
// found, or the query-fraction test fails, the function returns
// without calling AlignChimeGlobal3.
//
// NOTE(review): LVA[0] and ColScoresA[ColCount-1] are accessed
// unconditionally, so this appears to assume ColCount >= 1 -- confirm
// that callers never pass empty strings.
void AlignChimeLocal3(const string &Q3, const string &A3, const string &B3,
  const string &QLabel, const string &ALabel, const string &BLabel,
  ChimeHit2 &Hit)
	{
	Hit.Clear();

	const byte *Q3Seq = (const byte *) Q3.c_str();
	const byte *A3Seq = (const byte *) A3.c_str();
	const byte *B3Seq = (const byte *) B3.c_str();

	const unsigned ColCount = SIZE(Q3);
	asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);

	// Per-column votes; columns that are skipped below keep score 0.
	vector<float> ColScoresA(ColCount, 0.0f);
	vector<float> ColScoresB(ColCount, 0.0f);

	// Penalty given to a column that does not support the parent.
	float ScoreN = -(float) opt_xn;
	// Number of non-gap columns in the query row.
	unsigned QL = 0;
	for (unsigned Col = 0; Col < ColCount; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (!isgap(q))
			++QL;

		// All three rows agree: column carries no information.
		if (q == a && q == b && a == b)
			continue;

		// Any gap in this column: skip.
		if (isgap(q) || isgap(a) || isgap(b))
			continue;

		// Also skip columns directly adjacent to a gap in any row.
		if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))
			continue;

		if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))
			continue;

		// +1 when Q matches only this parent, otherwise the penalty.
		if (q == a && q != b)
			ColScoresA[Col] = 1;
		else
			ColScoresA[Col] = ScoreN;

		if (q == b && q != a)
			ColScoresB[Col] = 1;
		else
			ColScoresB[Col] = ScoreN;
		}

	// LVx[Col]: this column's score plus the sum carried from the
	// left, where a negative carried sum is reset to zero.
	vector<float> LVA(ColCount, 0.0f);
	vector<float> LVB(ColCount, 0.0f);

	LVA[0] = ColScoresA[0];
	LVB[0] = ColScoresB[0];
	for (unsigned Col = 1; Col < ColCount; ++Col)
		{
		LVA[Col] = max(LVA[Col-1], 0.0f) + ColScoresA[Col];
		LVB[Col] = max(LVB[Col-1], 0.0f) + ColScoresB[Col];
		}

	// RVx[Col]: the same recurrence run from the right-hand end.
	vector<float> RVA(ColCount, 0.0f);
	vector<float> RVB(ColCount, 0.0f);

	RVA[ColCount-1] = ColScoresA[ColCount-1];
	RVB[ColCount-1] = ColScoresB[ColCount-1];
	// Signed index so the loop can terminate below zero.
	for (int Col = ColCount-2; Col >= 0; --Col)
		{
		RVA[Col] = max(RVA[Col+1], 0.0f) + ColScoresA[Col];
		RVB[Col] = max(RVB[Col+1], 0.0f) + ColScoresB[Col];
		}

	// Crossover search. FirstA == true means A on the left, B on the
	// right. Only sums strictly greater than zero can win.
	bool FirstA = true;
	float MaxSum = 0.0;
	unsigned ColX = UINT_MAX;
	for (unsigned Col = 1; Col < ColCount-1; ++Col)
		{
		float Sum = LVA[Col] + RVB[Col+1];
		if (Sum > MaxSum)
			{
			FirstA = true;
			MaxSum = Sum;
			ColX = Col;
			}
		}

	// Same search with the parents swapped; must beat the best A|B sum.
	for (unsigned Col = 1; Col < ColCount-1; ++Col)
		{
		float Sum = LVB[Col] + RVA[Col+1];
		if (Sum > MaxSum)
			{
			FirstA = false;
			MaxSum = Sum;
			ColX = Col;
			}
		}
	// No crossover with positive score.
	if (ColX == UINT_MAX)
		return;

	// Recover the segment ends by re-accumulating column scores
	// outward from the crossover until the stored sum is reached.
	// NOTE(review): the right-hand walks start at ColX+1 but compare
	// against RVB[ColX] / RVA[ColX], whereas the crossover search used
	// index Col+1 -- confirm this is intended.
	unsigned ColLo = UINT_MAX;
	unsigned ColHi = UINT_MAX;
	if (FirstA)
		{
		float Sum = 0.0f;
		for (int Col = ColX; Col >= 0; --Col)
			{
			Sum += ColScoresA[Col];
			if (Sum >= LVA[ColX])
				{
				ColLo = Col;
				break;
				}
			}
		asserta(Sum >= LVA[ColX]);
		Sum = 0.0f;
		for (unsigned Col = ColX+1; Col < ColCount; ++Col)
			{
			Sum += ColScoresB[Col];
			if (Sum >= RVB[ColX])
				{
				ColHi = Col;
				break;
				}
			}
		asserta(Sum >= RVB[ColX]);
		}
	else
		{
		float Sum = 0.0f;
		for (int Col = ColX; Col >= 0; --Col)
			{
			Sum += ColScoresB[Col];
			if (Sum >= LVB[ColX])
				{
				ColLo = Col;
				break;
				}
			}
		asserta(Sum >= LVB[ColX]);
		Sum = 0.0f;
		for (unsigned Col = ColX+1; Col < ColCount; ++Col)
			{
			Sum += ColScoresA[Col];
			if (Sum >= RVA[ColX])
				{
				ColHi = Col;
				break;
				}
			}
		asserta(Sum >= RVA[ColX]);
		}

	// Widen the crossover to the right over columns where all three
	// rows hold the same non-gap character.
	unsigned ColXHi = ColX;
	for (unsigned Col = ColX + 1; Col < ColCount; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (q == a && q == b && !isgap(q))
			ColXHi = Col;
		else
			break;
		}

	// Same widening to the left.
	unsigned ColXLo = ColX;
	for (int Col = (int) ColX - 1; Col >= 0; --Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (q == a && q == b && !isgap(q))
			ColXLo = Col;
		else
			break;
		}

	// Pairwise identities over the whole alignment: Id* counts equal
	// characters, N* counts columns where neither row has a gap.
	unsigned IdQA = 0;
	unsigned IdQB = 0;
	unsigned IdAB = 0;
	unsigned NQA = 0;
	unsigned NQB = 0;
	unsigned NAB = 0;
	for (unsigned Col = 0; Col < ColCount; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (!isgap(q) && !isgap(a))
			{
			++NQA;
			if (q == a)
				++IdQA;
			}

		if (!isgap(q) && !isgap(b))
			{
			++NQB;
			if (q == b)
				++IdQB;
			}

		if (!isgap(a) && !isgap(b))
			{
			++NAB;
			if (a == b)
				++IdAB;
			}
		}

	// NOTE(review): AlignChimeGlobal3, called at the end of this
	// function, begins with Hit.Clear(); these values are presumably
	// reset there -- confirm.
	Hit.PctIdQA = Pct(IdQA, NQA);
	Hit.PctIdQB = Pct(IdQB, NQB);
	Hit.PctIdAB = Pct(IdAB, NAB);

	// Identity counts in the left flank [ColLo, ColXLo).
	unsigned LIdQA = 0;
	unsigned LIdQB = 0;
	for (unsigned Col = ColLo; Col < ColXLo; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (!isgap(q) && !isgap(a))
			{
			if (q == a)
				++LIdQA;
			}

		if (!isgap(q) && !isgap(b))
			{
			if (q == b)
				++LIdQB;
			}
		}

	// Identity counts in the right flank (ColXHi, ColHi].
	unsigned RIdQA = 0;
	unsigned RIdQB = 0;
	for (unsigned Col = ColXHi+1; Col <= ColHi; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (!isgap(q) && !isgap(a))
			{
			if (q == a)
				++RIdQA;
			}

		if (!isgap(q) && !isgap(b))
			{
			if (q == b)
				++RIdQB;
			}
		}

	// Absolute A-vs-B identity differences per flank. MinIdDiff is not
	// read by any active code below.
	unsigned IdDiffL = max(LIdQA, LIdQB) - min(LIdQA, LIdQB);
	unsigned IdDiffR = max(RIdQA, RIdQB) - min(RIdQA, RIdQB);
	unsigned MinIdDiff = min(IdDiffL, IdDiffR);
	unsigned ColRange = ColHi - ColLo + 1;
	// Reject when the segment spans too small a fraction of the query's
	// non-gap length (only when the option is enabled).
	if (opt_queryfract > 0.0f && float(ColRange)/float(QL) < opt_queryfract)
		return;

//	double Div = Pct(MinIdDiff, QSD.L);

#if	TRACE
	{
	Log(" Col A Q B ScoreA ScoreB LVA LVB RVA RVB\n");
	Log("----- - - - ------- ------- ------- ------- ------- -------\n");
	for (unsigned Col = 0; Col < ColCount; ++Col)
		{
		if (ColScoresA[Col] == 0.0 && ColScoresB[Col] == 0.0)
			continue;

		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];
		Log("%5u %c %c %c", Col, a, q, b);

		if (ColScoresA[Col] == 0.0)
			Log(" %7.7s", "");
		else
			Log(" %7.1f", ColScoresA[Col]);

		if (ColScoresB[Col] == 0.0)
			Log(" %7.7s", "");
		else
			Log(" %7.1f", ColScoresB[Col]);

		Log(" %7.1f %7.1f %7.1f %7.1f", LVA[Col], LVB[Col], RVA[Col], RVB[Col]);

		Log("\n");
		}
	Log("\n");
	Log("MaxSum %.1f, ColLo %u, ColXLo %u, ColX %u, ColXHi %u, ColHi %u, AF %c\n",
	  MaxSum, ColLo, ColXLo, ColX, ColXHi, ColHi, tof(FirstA));
	Log(" LIdQA %u, LIdQB %u, RIdQA %u, RIdQB %u\n", LIdQA, LIdQB, RIdQA, RIdQB);
	}
#endif

	// Cut out the columns ColLo..ColHi (inclusive) of each row and hand
	// the sub-alignment to the global scorer.
	string Q3L;
	string A3L;
	string B3L;
	for (unsigned Col = ColLo; Col <= ColHi; ++Col)
		{
		char q = Q3[Col];
		char a = A3[Col];
		char b = B3[Col];

		Q3L += q;
		A3L += a;
		B3L += b;
		}

	AlignChimeGlobal3(Q3L, A3L, B3L, QLabel, ALabel, BLabel, Hit);

#if	0
// CS SNPs
	Hit.CS_LY = 0;
	Hit.CS_LN = 0;
	Hit.CS_RY = 0;
	Hit.CS_RN = 0;
	Hit.CS_LA = 0;
	Hit.CS_RA = 0;
	for (unsigned Col = ColLo; Col <= ColHi; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];
		if (q == a && q == b && a == b)
			continue;
		if (isgap(q) || isgap(a) || isgap(b))
			continue;
		if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))
			continue;
		if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))
			continue;

		if (!FirstA)
			swap(a, b);

		if (Col < ColXLo)
			{
			if (q == a && q != b)
				++Hit.CS_LY;
			else if (q == b && q != a)
				++Hit.CS_LN;
			else
				++Hit.CS_LA;
			}
		else if (Col > ColXHi)
			{
			if (q == b && q != a)
				++Hit.CS_RY;
			else if (q == a && q != b)
				++Hit.CS_RN;
			else
				++Hit.CS_RA;
			}
		}

	double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);
	double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);
	Hit.Score = ScoreL*ScoreR;

	//Hit.QSD = QSD;
	//if (FirstA)
	//	{
	//	Hit.ASD = ASD;
	//	Hit.BSD = BSD;
	//	Hit.PathQA = PathQA;
	//	Hit.PathQB = PathQB;
	//	}
	//else
	//	{
	//	Hit.ASD = BSD;
	//	Hit.BSD = ASD;
	//	}

	//Hit.ColLo = ColLo;
	//Hit.ColXLo = ColXLo;
	//Hit.ColXHi = ColXHi;
	//Hit.ColHi = ColHi;
	//Hit.Div = Div;

//	Hit.LogMe();
#endif
	}
void AlignChimeGlobal3(const string &Q3, const string &A3, const string &B3, const string &QLabel, const string &ALabel, const string &BLabel, ChimeHit2 &Hit) { Hit.Clear(); Hit.QLabel = QLabel; const byte *Q3Seq = (const byte *) Q3.c_str(); const byte *A3Seq = (const byte *) A3.c_str(); const byte *B3Seq = (const byte *) B3.c_str(); const unsigned ColCount = SIZE(Q3); asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount); #if TRACE Log("Q %5u %*.*s\n", ColCount, ColCount, ColCount, Q3Seq); Log("A %5u %*.*s\n", ColCount, ColCount, ColCount, A3Seq); Log("B %5u %*.*s\n", ColCount, ColCount, ColCount, B3Seq); #endif // Discard terminal gaps unsigned ColLo = UINT_MAX; unsigned ColHi = UINT_MAX; for (unsigned Col = 2; Col + 2 < ColCount; ++Col) { char q = Q3Seq[Col]; char a = A3Seq[Col]; char b = B3Seq[Col]; if (isacgt(q) && isacgt(a) && isacgt(b)) { if (ColLo == UINT_MAX) ColLo = Col; ColHi = Col; } } if (ColLo == UINT_MAX) return; unsigned QPos = 0; unsigned APos = 0; unsigned BPos = 0; unsigned DiffCount = 0; vector<unsigned> ColToQPos(ColLo, UINT_MAX); vector<unsigned> AccumCount(ColLo, UINT_MAX); vector<unsigned> AccumSameA(ColLo, UINT_MAX); vector<unsigned> AccumSameB(ColLo, UINT_MAX); vector<unsigned> AccumForA(ColLo, UINT_MAX); vector<unsigned> AccumForB(ColLo, UINT_MAX); vector<unsigned> AccumAbstain(ColLo, UINT_MAX); vector<unsigned> AccumAgainst(ColLo, UINT_MAX); unsigned SumSameA = 0; unsigned SumSameB = 0; unsigned SumSameAB = 0; unsigned Sum = 0; unsigned SumForA = 0; unsigned SumForB = 0; unsigned SumAbstain = 0; unsigned SumAgainst = 0; for (unsigned Col = ColLo; Col <= ColHi; ++Col) { char q = Q3Seq[Col]; char a = A3Seq[Col]; char b = B3Seq[Col]; if (isacgt(q) && isacgt(a) && isacgt(b)) { if (q == a) ++SumSameA; if (q == b) ++SumSameB; if (a == b) ++SumSameAB; if (q == a && q != b) ++SumForA; if (q == b && q != a) ++SumForB; if (a == b && q != a) ++SumAgainst; if (q != a && q != b) ++SumAbstain; ++Sum; } ColToQPos.push_back(QPos); 
AccumSameA.push_back(SumSameA); AccumSameB.push_back(SumSameB); AccumCount.push_back(Sum); AccumForA.push_back(SumForA); AccumForB.push_back(SumForB); AccumAbstain.push_back(SumAbstain); AccumAgainst.push_back(SumAgainst); if (q != '-') ++QPos; if (a != '-') ++APos; if (b != '-') ++BPos; } asserta(SIZE(ColToQPos) == ColHi+1); asserta(SIZE(AccumSameA) == ColHi+1); asserta(SIZE(AccumSameB) == ColHi+1); asserta(SIZE(AccumAbstain) == ColHi+1); asserta(SIZE(AccumAgainst) == ColHi+1); double IdQA = double(SumSameA)/Sum; double IdQB = double(SumSameB)/Sum; double IdAB = double(SumSameAB)/Sum; double MaxId = max(IdQA, IdQB); #if TRACE Log("IdQA=%.1f%% IdQB=%.1f%% IdAB=%.1f\n", IdQA*100.0, IdQB*100.0, IdAB*100.0); Log("\n"); Log(" x AQB IdAL IdBL IdAR IdBR DivAB DivBA YAL YBL YAR YBR AbL AbR ScoreAB ScoreAB XLo Xhi\n"); Log("----- --- ----- ----- ----- ----- ------ ------ ----- ----- ----- ----- ----- ----- ------- ------- ----- -----\n"); #endif unsigned BestXLo = UINT_MAX; unsigned BestXHi = UINT_MAX; double BestDiv = 0.0; double BestIdQM = 0.0; double BestScore = 0.0; // Find range of cols BestXLo..BestXHi that maximizes score bool FirstA = false; // NOTE: Must be < ColHi not <= because use Col+1 below for (unsigned Col = ColLo; Col < ColHi; ++Col) { char q = Q3Seq[Col]; char a = A3Seq[Col]; char b = B3Seq[Col]; unsigned SameAL = AccumSameA[Col]; unsigned SameBL = AccumSameB[Col]; unsigned SameAR = SumSameA - AccumSameA[Col]; unsigned SameBR = SumSameB - AccumSameB[Col]; double IdAB = double(SameAL + SameBR)/Sum; double IdBA = double(SameBL + SameAR)/Sum; unsigned ForAL = AccumForA[Col]; unsigned ForBL = AccumForB[Col]; unsigned ForAR = SumForA - AccumForA[Col+1]; unsigned ForBR = SumForB - AccumForB[Col+1]; unsigned AbL = AccumAbstain[Col]; unsigned AbR = SumAbstain - AccumAbstain[Col+1]; double ScoreAB = GetScore2(ForAL, ForBL, AbL)*GetScore2(ForBR, ForAR, AbR); double ScoreBA = GetScore2(ForBL, ForAL, AbL)*GetScore2(ForAR, ForBR, AbR); double DivAB = IdAB/MaxId; 
double DivBA = IdBA/MaxId; double MaxDiv = max(DivAB, DivBA); //if (MaxDiv > BestDiv) // { // BestDiv = MaxDiv; // BestXLo = Col; // BestXHi = Col; // FirstA = (DivAB > DivBA); // if (FirstA) // BestIdQM = IdAB; // else // BestIdQM = IdBA; // } //else if (MaxDiv == BestDiv) // BestXHi = Col; double MaxScore = max(ScoreAB, ScoreBA); if (MaxScore > BestScore) { BestScore = MaxScore; BestXLo = Col; BestXHi = Col; FirstA = (ScoreAB > ScoreBA); if (FirstA) BestIdQM = IdAB; else BestIdQM = IdBA; if (MaxDiv > BestDiv) BestDiv = MaxDiv; } else if (MaxScore == BestScore) { BestXHi = Col; if (MaxDiv > BestDiv) BestDiv = MaxDiv; } #if TRACE { Log("%5u", Col); char q = Q3Seq[Col]; char a = A3Seq[Col]; char b = B3Seq[Col]; Log(" %c%c%c", a, q, b); Log(" %5u", SameAL); Log(" %5u", SameBL); Log(" %5u", SameAR); Log(" %5u", SameBR); Log(" %5.4f", DivAB); Log(" %5.4f", DivBA); Log(" %5u", ForAL); Log(" %5u", ForBL); Log(" %5u", ForAR); Log(" %5u", ForBR); Log(" %5u", AbL); Log(" %5u", AbR); Log(" %7.4f", ScoreAB); Log(" %7.4f", ScoreBA); if (BestXLo != UINT_MAX) Log(" %5u", BestXLo); if (BestXHi != UINT_MAX) Log(" %5u", BestXHi); Log("\n"); } #endif } if (BestXLo == UINT_MAX) { #if TRACE Log("\n"); Log("No crossover found.\n"); #endif return; } #if TRACE Log("BestX col %u - %u\n", BestXLo, BestXHi); #endif // Find maximum region of identity within BestXLo..BestXHi unsigned ColXLo = (BestXLo + BestXHi)/2; unsigned ColXHi = ColXLo; unsigned SegLo = UINT_MAX; unsigned SegHi = UINT_MAX; for (unsigned Col = BestXLo; Col <= BestXHi; ++Col) { char q = Q3Seq[Col]; char a = A3Seq[Col]; char b = B3Seq[Col]; if (q == a && q == b) { if (SegLo == UINT_MAX) SegLo = Col; SegHi = Col; } else { unsigned SegLength = SegHi - SegLo + 1; unsigned BestSegLength = ColXHi - ColXLo + 1; if (SegLength > BestSegLength) { ColXLo = SegLo; ColXHi = SegHi; } SegLo = UINT_MAX; SegHi = UINT_MAX; } } unsigned SegLength = SegHi - SegLo + 1; unsigned BestSegLength = ColXHi - ColXLo + 1; if (SegLength > BestSegLength) 
{ ColXLo = SegLo; ColXHi = SegHi; } QPos = 0; for (unsigned x = 0; x < ColCount; ++x) { if (x == ColXLo) Hit.QXLo = QPos; else if (x == ColXHi) { Hit.QXHi = QPos; break; } char q = Q3Seq[x]; if (q != '-') ++QPos; } Hit.ColXLo = ColXLo; Hit.ColXHi = ColXHi; //if (FirstA) // { // Hit.LY = AccumForA[ColXLo]; // Hit.LN = AccumForB[ColXLo]; // Hit.RY = SumForB - AccumForB[ColXHi]; // Hit.RN = SumForA - AccumForA[ColXHi]; // } //else // { // Hit.LY = AccumForB[ColXLo]; // Hit.LN = AccumForA[ColXLo]; // Hit.RY = SumForA - AccumForA[ColXHi]; // Hit.RN = SumForB - AccumForB[ColXHi]; // } //Hit.LA = AccumAgainst[ColXLo]; //Hit.LD = AccumAbstain[ColXLo]; //Hit.RA = SumAgainst - AccumAgainst[ColXHi]; //Hit.RD = SumAbstain - AccumAbstain[ColXHi]; Hit.PctIdAB = IdAB*100.0; Hit.PctIdQM = BestIdQM*100.0; Hit.Div = (BestDiv - 1.0)*100.0; //Hit.QSD = QSD; Hit.Q3 = Q3; Hit.QLabel = QLabel; if (FirstA) { //Hit.ASD = ASD; //Hit.BSD = BSD; //Hit.PathQA = PathQA; //Hit.PathQB = PathQB; Hit.A3 = A3; Hit.B3 = B3; Hit.ALabel = ALabel; Hit.BLabel = BLabel; Hit.PctIdQA = IdQA*100.0; Hit.PctIdQB = IdQB*100.0; } else { Hit.A3 = B3; Hit.B3 = A3; Hit.ALabel = BLabel; Hit.BLabel = ALabel; Hit.PctIdQA = IdQB*100.0; Hit.PctIdQB = IdQA*100.0; } // CS SNPs Hit.CS_LY = 0; Hit.CS_LN = 0; Hit.CS_RY = 0; Hit.CS_RN = 0; Hit.CS_LA = 0; Hit.CS_RA = 0; //vector<float> Cons; //for (unsigned Col = 0; Col < ColCount; ++Col) // { // char q = Q3Seq[Col]; // char a = A3Seq[Col]; // char b = B3Seq[Col]; // if (q == a && q == b && a == b) // { // Cons.push_back(1.0f); // continue; // } // bool gapq = isgap(q); // bool gapa = isgap(a); // bool gapb = isgap(b); // if (!gapq && !gapa && !gapb) // { // if (q == a || q == b || a == b) // Cons.push_back(0.75); // else // Cons.push_back(0.5); // } // else // { // if (!gapa && (a == b || a == q)) // Cons.push_back(0.5f); // else if (!gapb && b == q) // Cons.push_back(0.5f); // else // Cons.push_back(0.0f); // } // } //float fLY = 0.0f; //float fLN = 0.0f; //float fLA = 0.0f; 
//float fRY = 0.0f; //float fRN = 0.0f; //float fRA = 0.0f; for (unsigned Col = ColLo; Col <= ColHi; ++Col) { char q = Q3Seq[Col]; char a = A3Seq[Col]; char b = B3Seq[Col]; if (q == a && q == b && a == b) continue; unsigned ngaps = 0; if (isgap(q)) ++ngaps; if (isgap(a)) ++ngaps; if (isgap(b)) ++ngaps; if (opt_skipgaps) { if (ngaps == 3) continue; } else { if (ngaps == 2) continue; } if (!FirstA) swap(a, b); //float AvgCons = (Cons[Col-2] + Cons[Col-1] + Cons[Col+1] + Cons[Col+2])/4; //if (Col < ColXLo) // { // if (q == a && q != b) // fLY += AvgCons; // else if (q == b && q != a) // fLN += AvgCons; // else // fLA += AvgCons; // } //else if (Col > ColXHi) // { // if (q == b && q != a) // fRY += AvgCons; // else if (q == a && q != b) // fRN += AvgCons; // else // fRA += AvgCons; // } if (opt_skipgaps2) { if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1]))) continue; if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1]))) continue; } //if (Col > 0 && isgap(Q3Seq[Col-1])) //continue; //if (Col + 1 < ColCount && isgap(Q3Seq[Col+1])) // continue; if (Col < ColXLo) { if (q == a && q != b) ++Hit.CS_LY; else if (q == b && q != a) ++Hit.CS_LN; else ++Hit.CS_LA; } else if (Col > ColXHi) { if (q == b && q != a) ++Hit.CS_RY; else if (q == a && q != b) ++Hit.CS_RN; else ++Hit.CS_RA; } } double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA); double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA); Hit.Score = ScoreL*ScoreR; extern bool g_UchimeDeNovo; //if (0)//g_UchimeDeNovo) // { // double AbQ = GetAbFromLabel(QLabel.c_str()); // double AbA = GetAbFromLabel(ALabel.c_str()); // double AbB = GetAbFromLabel(BLabel.c_str()); // if (AbQ > 0.0 && AbA > 0.0 && AbB > 0.0) // { // double MinAb = min(AbA, AbB); // double Ratio = MinAb/AbQ; // double t = Ratio - opt_abx; // // double Factor = 2.0/(1.0 + exp(-t)); // double Factor = min(Ratio, opt_abx)/opt_abx; // if (opt_verbose) // Log("Score %.4f Ab factor 
%.4f >%s\n", Hit.Score, Factor, QLabel.c_str()); // Hit.Score *= Factor; // } // } extern FILE *g_fUChimeAlns; if (g_fUChimeAlns != 0 && Hit.Div > 0.0) { void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit); WriteChimeHitX(g_fUChimeAlns, Hit); } }
/* Function: PrintFancyTrace() * * Purpose: Print an alignment of an HMM to a sequence, given a traceback. * Somewhat inspired by the output style of BLAST, except that * we're aligning to a complicated model that's difficult to * represent compactly. * * Arguments: ofp - where to print it (open FILE for writing, or stdout) * shmm - log-odds form HMM * tr - traceback from ViterbiTrace() * seq - sequence that is aligned * seqname - name of seq to print in left margin * from_pos - first position in seq that aligns (0..seqlen-1) * * Returns: (void) */ void PrintFancyTrace(FILE *ofp, struct shmm_s *shmm, struct trace_s *tr, char *seq, char *seqname, int from_pos) { char *model; /* display of model */ char *mline; /* display of match/mismatch */ char *rfline; /* display of reference seq */ char *csline; /* display of consensus struct */ char *aseq; /* display of aligned sequence */ int rpos; /* position in raw seq */ int apos; /* position in traceback/alignment */ float score; /* score for position */ float max_score; /* best score for position */ char bestsym; /* best match sym at position */ int idx; /* counter for alphabet */ int len; /* current length of display printed */ char buffer[CPL+1]; /* buffer for lines of display */ int startpos, endpos; /* Memory allocation. */ if ((rfline = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL || (csline = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL || (model = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL || (mline = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL || (aseq = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL) Die("memory allocation failed at %s:%d", __FILE__, __LINE__); memset(rfline, ' ', tr->tlen); memset(csline, ' ', tr->tlen); memset(model, ' ', tr->tlen); memset(mline, ' ', tr->tlen); memset(aseq, ' ', tr->tlen); /* Create the displays of model and aligned sequence. * Ignore BEGIN (apos == 0) and END (apos == N-1) in the traceback. 
*/ rpos = from_pos; for (apos = 1; apos < tr->tlen-1; apos++) { /* find best sym at this model position */ if (tr->statetype[apos] != INSERT) { max_score = -999; for (idx = 0; idx < 26; idx++) if (shmm->m_emit[idx][tr->nodeidx[apos]] > max_score) { max_score = shmm->m_emit[idx][tr->nodeidx[apos]]; bestsym = (char) ('A' + idx); } if (max_score > (int)(CUTOFF * INTSCALE)) model[apos] = toupper((int) bestsym); else model[apos] = tolower((int) bestsym); } else model[apos] = '.'; /* construct mline (match/mismatch display), rfline, and csline */ switch (tr->statetype[apos]) { case MATCH: score = shmm->m_emit[seq[rpos]-'A'][tr->nodeidx[apos]]; if (seq[rpos] == bestsym) mline[apos] = bestsym; else if (score > 0) mline[apos] = '+'; aseq[apos] = seq[rpos]; if (shmm->flags & HMM_REF) rfline[apos] = shmm->ref[tr->nodeidx[apos]]; if (shmm->flags & HMM_CS) csline[apos] = shmm->cs[tr->nodeidx[apos]]; rpos++; break; case INSERT: aseq[apos] = seq[rpos]; rpos++; break; case DELETE: aseq[apos] = '-'; if (shmm->flags & HMM_REF) rfline[apos] = shmm->ref[tr->nodeidx[apos]]; if (shmm->flags & HMM_CS) csline[apos] = shmm->cs[tr->nodeidx[apos]]; break; default: Die("Unrecognized statetype %d at %d in traceback", tr->statetype[apos], apos); } } /* Null terminate, and tack on asterisks to represent BEGIN and * END dummy states in model. */ model[0] = '*'; /* begin */ model[tr->tlen-1] = '*'; /* end */ model[tr->tlen] = '\0'; aseq[tr->tlen] = '\0'; mline[tr->tlen] = '\0'; csline[tr->tlen] = '\0'; rfline[tr->tlen] = '\0'; /* Print out the display. 
*/ fprintf(ofp, " Alignment to HMM consensus:\n"); buffer[CPL] = '\0'; len = 0; rpos = from_pos + 1; while (len < tr->tlen) { startpos = rpos; /* rf line reference coord line */ if (shmm->flags & HMM_REF) { strncpy(buffer, rfline+len, CPL); fprintf(ofp, " REF %s\n", buffer); } /* cs consensus structure line */ if (shmm->flags & HMM_CS) { strncpy(buffer, csline+len, CPL); fprintf(ofp, " CS %s\n", buffer); } /* model */ strncpy(buffer, model+len, CPL); fprintf(ofp, " %s\n", buffer); /* mline */ strncpy(buffer, mline+len, CPL); fprintf(ofp, " %s\n", buffer); /* get coords of this aseq block */ for (apos = len; aseq[apos] != '\0' && apos < len + CPL; apos++) if (! isgap(aseq[apos])) rpos++; endpos = rpos-1; /* aligned sequence */ strncpy(buffer, aseq+len, CPL); fprintf(ofp, " %10.10s %5d %s %5d\n", seqname, startpos, buffer, endpos); len += CPL; fprintf(ofp, "\n"); } /* Done. Free memory and return. */ fflush(ofp); free(model); free(aseq); free(mline); free(rfline); free(csline); return; }
void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit) { if (f == 0) return; if (Hit.Div <= 0.0) return; const string &Q3 = Hit.Q3; const string &A3 = Hit.A3; const string &B3 = Hit.B3; const byte *Q3Seq = (const byte *) Q3.c_str(); const byte *A3Seq = (const byte *) A3.c_str(); const byte *B3Seq = (const byte *) B3.c_str(); // Aligned unsigned ColCount = SIZE(Q3); asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount); unsigned LQ = GetUngappedLength(Q3Seq, ColCount); unsigned LA = GetUngappedLength(A3Seq, ColCount); unsigned LB = GetUngappedLength(B3Seq, ColCount); fprintf(f, "\n"); fprintf(f, "------------------------------------------------------------------------\n"); fprintf(f, "Query (%5u nt) %s\n", LQ, Hit.QLabel.c_str()); fprintf(f, "ParentA (%5u nt) %s\n", LA, Hit.ALabel.c_str()); fprintf(f, "ParentB (%5u nt) %s\n", LB, Hit.BLabel.c_str()); // Strip terminal gaps in query unsigned FromCol = UINT_MAX; unsigned ToCol = UINT_MAX; for (unsigned Col = 0; Col < ColCount; ++Col) { if (!isgap(Q3Seq[Col])) { if (FromCol == UINT_MAX) FromCol = Col; ToCol = Col; } } unsigned QPos = 0; unsigned APos = 0; unsigned BPos = 0; for (unsigned Col = 0; Col < FromCol; ++Col) { if (!isgap(A3Seq[Col])) ++APos; if (!isgap(B3Seq[Col])) ++BPos; } unsigned Range = ToCol - FromCol + 1; unsigned RowCount = (Range + 79)/80; unsigned RowFromCol = FromCol; for (unsigned RowIndex = 0; RowIndex < RowCount; ++RowIndex) { fprintf(f, "\n"); unsigned RowToCol = RowFromCol + 79; if (RowToCol > ToCol) RowToCol = ToCol; // A row fprintf(f, "A %5u ", APos + 1); for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) { char q = Q3Seq[Col]; char a = A3Seq[Col]; if (a != q) a = tolower(a); fprintf(f, "%c", a); if (!isgap(a)) ++APos; } fprintf(f, " %u\n", APos); // Q row fprintf(f, "Q %5u ", QPos + 1); for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) { char q = Q3Seq[Col]; fprintf(f, "%c", q); if (!isgap(q)) ++QPos; } fprintf(f, " %u\n", QPos); // B row fprintf(f, "B %5u ", BPos + 1); for 
(unsigned Col = RowFromCol; Col <= RowToCol; ++Col) { char q = Q3Seq[Col]; char b = B3Seq[Col]; if (b != q) b = tolower(b); fprintf(f, "%c", b); if (!isgap(b)) ++BPos; } fprintf(f, " %u\n", BPos); // Diffs fprintf(f, "Diffs "); for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) { char q = Q3Seq[Col]; char a = A3Seq[Col]; char b = B3Seq[Col]; char c = ' '; if (isgap(q) || isgap(a) || isgap(b)) c = ' '; else if (Col < Hit.ColXLo) { if (q == a && q == b) c = ' '; else if (q == a && q != b) c = 'A'; else if (q == b && q != a) c = 'b'; else if (a == b && q != a) c = 'N'; else c = '?'; } else if (Col > Hit.ColXHi) { if (q == a && q == b) c = ' '; else if (q == b && q != a) c = 'B'; else if (q == a && q != b) c = 'a'; else if (a == b && q != a) c = 'N'; else c = '?'; } fprintf(f, "%c", c); } fprintf(f, "\n"); // SNPs fprintf(f, "Votes "); for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) { char q = Q3Seq[Col]; char a = A3Seq[Col]; char b = B3Seq[Col]; bool PrevGap = Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])); bool NextGap = Col+1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])); char c = ' '; if (isgap(q) || isgap(a) || isgap(b) || PrevGap || NextGap) c = ' '; else if (Col < Hit.ColXLo) { if (q == a && q == b) c = ' '; else if (q == a && q != b) c = '+'; else if (q == b && q != a) c = '!'; else c = '0'; } else if (Col > Hit.ColXHi) { if (q == a && q == b) c = ' '; else if (q == b && q != a) c = '+'; else if (q == a && q != b) c = '!'; else c = '0'; } fprintf(f, "%c", c); } fprintf(f, "\n"); // LR row fprintf(f, "Model "); for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) { if (Col < Hit.ColXLo) fprintf(f, "A"); else if (Col >= Hit.ColXLo && Col <= Hit.ColXHi) fprintf(f, "x"); else fprintf(f, "B"); } fprintf(f, "\n"); RowFromCol += 80; } fprintf(f, "\n"); double PctIdBestP = max(Hit.PctIdQA, Hit.PctIdQB); double Div = (Hit.PctIdQM - PctIdBestP)*100.0/PctIdBestP; unsigned LTot = Hit.CS_LY 
+ Hit.CS_LN + Hit.CS_LA; unsigned RTot = Hit.CS_RY + Hit.CS_RN + Hit.CS_RA; double PctL = Pct(Hit.CS_LY, LTot); double PctR = Pct(Hit.CS_RY, RTot); fprintf(f, "Ids. QA %.1f%%, QB %.1f%%, AB %.1f%%, QModel %.1f%%, Div. %+.1f%%\n", Hit.PctIdQA, Hit.PctIdQB, Hit.PctIdAB, Hit.PctIdQM, Div); fprintf(f, "Diffs Left %u: N %u, A %u, Y %u (%.1f%%); Right %u: N %u, A %u, Y %u (%.1f%%), Score %.4f\n", LTot, Hit.CS_LN, Hit.CS_LA, Hit.CS_LY, PctL, RTot, Hit.CS_RN, Hit.CS_RA, Hit.CS_RY, PctR, Hit.Score); }
// Splits the intermediate code `ic` into consecutive blocks and annotates
// the blocks that are the target of a function block's final instruction.
//
// Pass 1: a block [begin, end) is closed after instruction i when ic[i]
// satisfies isgap() and is not a CALL/CALLFUN, or when the following
// instruction is a LABEL. Instructions after the last such boundary are not
// placed in any block (unchanged from the original behavior).
//
// Pass 2: for every block whose first or second instruction is FUN, each
// block containing a LABEL whose target equals the target of the function
// block's last instruction is marked valid and receives the function's name,
// the operands of its CREATE instructions, and a level.
//
// Finally the indices of the valid blocks are printed to cout on one line.
vector<pblock::PBLOCK> pblock::resplit(vector<gen::quartral_element> ic)
{
    int last = 0;
    PBLOCK tmp = PBLOCK();
    vector<pblock::PBLOCK> result;

    for (size_t i = 0; i < ic.size(); i++)
    {
        // The look-ahead at ic[i+1] must be bounds-checked: on the last
        // instruction there is no successor to inspect.
        if ((isgap(ic[i]) && ic[i].op != gen::CALL && ic[i].op != gen::CALLFUN)
            || (i + 1 < ic.size() && ic[i+1].op == gen::LABEL))
        {
            tmp.begin = last;
            tmp.end = i + 1;
            tmp.valid = false;
            last = i + 1;
            result.push_back(tmp);
        }
    }

    for (size_t i = 0; i < result.size(); i++)
    {
        const bool fun_first = ic[result[i].begin].op == gen::FUN;
        const bool fun_second = result[i].begin+1 < ic.size()
            && ic[result[i].begin+1].op == gen::FUN;
        if (!fun_first && !fun_second)
            continue;

        // The function name and level are read from the instruction after
        // `begin`. If FUN is the very last instruction there is nothing to
        // read, so skip the block instead of indexing past the end.
        if (!(result[i].begin+1 < ic.size()))
            continue;

        string tmps = ic[result[i].end-1].target;
        string fn = ic[result[i].begin+1].target;
        result[i].functionname = fn;

        for (size_t j = 0; j < result.size(); j++)
        {
            for (size_t k = result[j].begin; k < result[j].end; k++)
            {
                if (!(ic[k].target == tmps && ic[k].op == gen::LABEL))
                    continue;

                result[j].valid = true;
                for (size_t l = result[i].begin; l != result[i].end; l++)
                {
                    if (ic[l].op == gen::CREATE)
                    {
                        result[j].var.push_back(ic[l].first);
                    }
                }
                if (i != 0)
                {
                    result[j].level = helper::type2type::string2int(ic[result[i].begin+1].first);
                }
                else
                {
                    result[j].level = 0;
                }
                result[j].functionname = fn;
            }
        }
    }

    // Diagnostic output: indices of the blocks marked valid.
    for (size_t i = 0; i < result.size(); i++)
    {
        if (result[i].valid == true)
        {
            cout << i << ' ';
        }
    }
    cout << endl;
    return result;
}
/* Function: WriteMSF()
 *
 * Purpose:  Write an alignment (sequences, names, weights) to an open
 *           stream in GCG MSF format. The alignment must be flushed:
 *           every aligned sequence has the same length, padded with gaps.
 *           Sequences are written in blocks of 50 columns, grouped in
 *           tens, with gap characters shown as '.'.
 *
 * Args:     fp    - open stream to write to
 *           aseqs - aligned sequences
 *           ainfo - alignment info (nseq, alen, per-sequence names, weights)
 *
 * Return:   (void)
 */
void
WriteMSF(FILE   *fp,            /* open fp for writing         */
	 char  **aseqs,         /* aligned sequences           */
	 AINFO  *ainfo)
{
  char **cursor;		/* read position in each sequence       */
  char   chunk[51];		/* up to 50 symbols of one sequence     */
  int    width;			/* longest sequence name                */
  int    more;			/* nonzero if another block is needed   */
  int    i;			/* sequence counter                     */
  int    n;			/* symbols in the current chunk         */
  int    c;			/* column within the current chunk      */
  int    l;			/* length of one name                   */

  /* Point a cursor at the start of every sequence and find the widest
   * name, which sets the width of the name column.
   */
  cursor = (char **) MallocOrDie (ainfo->nseq * sizeof(char *));
  width  = 0;
  for (i = 0; i < ainfo->nseq; i++)
    {
      cursor[i] = aseqs[i];
      l = strlen(ainfo->sqinfo[i].name);
      if (l > width)
	width = l;
    }

  /* Title line. The length, type and checksum fields are fixed
   * placeholder values, not computed from the alignment.
   */
  fprintf(fp, "\n");
  fprintf(fp, " MSF: 000 Type: X Check: 0000 ..\n");
  fprintf(fp, "\n");

  /* One "Name:" line per sequence, then the "//" separator. */
  for (i = 0; i < ainfo->nseq; i++)
    fprintf(fp, " Name: %-*.*s Len: %5d Check: %5d Weight: %.4f\n",
	    width, width,
	    ainfo->sqinfo[i].name,
	    ainfo->alen,
	    GCGchecksum(aseqs[i], ainfo->alen),
	    ainfo->wgt[i]);
  fprintf(fp, "\n");
  fprintf(fp, "//\n");
  fprintf(fp, "\n");

  /* Sequence blocks. At least one block is always written; another
   * follows whenever some sequence has symbols left after its chunk.
   */
  do
    {
      more = 0;
      for (i = 0; i < ainfo->nseq; i++)
	{
	  fprintf(fp, "%-*.*s ", width, width, ainfo->sqinfo[i].name);

	  /* take the next (at most) 50 symbols of this sequence */
	  strncpy(chunk, cursor[i], 50);
	  chunk[50] = '\0';
	  n = strlen(chunk);

	  if (n == 50 && cursor[i][50] != '\0')
	    more = 1;
	  cursor[i] += n;

	  /* write them, a space after every tenth symbol */
	  for (c = 0; c < n; )
	    {
	      fputc(isgap(chunk[c]) ? '.' : chunk[c], fp);
	      c++;
	      if (c % 10 == 0)
		fputc(' ', fp);
	    }
	  fputc('\n', fp);
	}
      fputc('\n', fp);		/* blank line between blocks */
    }
  while (more);

  free(cursor);
}
/* Function: ViterbiAlignAlignment()
 * 
 * Purpose:  Align a multiple sequence alignment to an HMM without
 *           altering the multiple alignment. Each alignment column is
 *           treated as a unit: the score of a transition or emission
 *           for a column is the sum over all nseq sequences.
 *           
 * Args:     shmm   - HMM in integer log-odds score form
 *           aseq   - alignment, [0..nseq-1][0..alen-1]
 *           alen   - length of aligned sequences
 *           nseq   - number of aligned sequences
 *           ret_tr - RETURN: array of tracebacks. rpos field is
 *                    relative to aseq, not raw seq, similar to
 *                    Maxmodelmaker(); use DealignTrace() if you
 *                    want relative to raw sequence.
 *           ret_sc - RETURN: sum of log odds scores (the final
 *                    integer score divided by INTSCALE).
 *                    
 * Return:   (void)
 *           ret_tr is alloced here. Individuals must be free'd by FreeTrace(),
 *           then tr itself free'd by free().
 */
void
ViterbiAlignAlignment(struct shmm_s *shmm, char **aseq, int alen, int nseq, 
		      struct trace_s ***ret_tr, float *ret_sc)
{
  struct fvit_s **mx;           /* the viterbi calculation grid               */
  int    score;                 /* tmp variable for scores                    */
  int    i;                     /* counter for sequence position: 0,1..L      */
  int    k;                     /* counter for model position: 0,1..M         */
  int    idx;                   /* index for sequences                        */
  struct fvit_s *thisrow;       /* ptr to current row of mx                   */
  struct fvit_s *nextrow;       /* ptr to next row of mx                      */
  int  **matocc;                /* [0..alen+1][0..nseq-1], 1 for MATCH        */
  struct trace_s **tr;          /* array of tracebacks to return              */
  int   *tpos;                  /* index for position in indiv traces         */
  int    lastsub;               /* last state type in master trace            */

  /* A crucial extra component of this alignment algorithm:
   * at each matrix cell, we have to remember: for the best
   * path into the INSERT subcell, what state is each sequence in?
   * This is non-trivial because some gaps are assigned to
   * no states. When we calculate the score from an insert column,
   * where there are gaps we have to look up the previous state.
   *
   * Fortunately, we don't need to keep a full matrix of these,
   * or we'd be in serious memory problems. Use a rolling pointer
   * trick, keep two active rows "current" and "next".
   */
  char **cur_state;             /* [0..M+1][0..nseq-1]; MATCH, INSERT, or DELETE */
  char **nxt_state;             /* same, except keeps states for next row        */
  char **swap;                  /* used for swapping cur, nxt                    */

  /********************************************
   * Initial setup and allocations
   ********************************************/
				/* allocate the calculation matrix,
				   which is 0..alen+1 rows by 0..M+1 cols */
  mx        = (struct fvit_s **) MallocOrDie (sizeof(struct fvit_s *) * (alen+2));
  matocc    = (int  **)          MallocOrDie (sizeof(int *)           * (alen+2));
  cur_state = (char **)          MallocOrDie (sizeof(char *)          * (shmm->M+2));
  nxt_state = (char **)          MallocOrDie (sizeof(char *)          * (shmm->M+2));
  for (i = 0; i <= alen+1; i++)
    {
      mx[i]    = (struct fvit_s *) MallocOrDie (sizeof(struct fvit_s) * (shmm->M+2));
      matocc[i]= (int *)           MallocOrDie (sizeof(int)           * nseq);
    }
  for (k = 0; k <= shmm->M+1; k++)
    {
      cur_state[k] = (char *) MallocOrDie (sizeof(char) * nseq);
      nxt_state[k] = (char *) MallocOrDie (sizeof(char) * nseq);
    }

  /********************************************
   * Initialization
   ********************************************/
				/* initialize the first cell 0,0;
				   -99999999 marks an unreachable subcell */
  mx[0][0].score_m = 0;
  mx[0][0].score_d = -99999999;
  mx[0][0].score_i = -99999999;

				/* nxt_state becomes cur_state at the top of
				   the first row iteration (see swap below) */
  for (k = 0; k <= shmm->M+1; k++)
    for (idx = 0; idx < nseq; idx++)
      nxt_state[k][idx] = MATCH;

				/* initialize the top row */
  for (k = 1; k <= shmm->M+1; k++)
    {
      mx[0][k].score_m = -99999999;
      mx[0][k].score_i = -99999999;
    }

  /* Precalculate matocc (match occupancy).
   * 1 if symbol in column for this seq, 0 if not.
   * 1..alen, from 0..alen-1 alignments
   */
  for (idx = 0; idx < nseq; idx++)
    {
      matocc[0][idx] = matocc[alen+1][idx] = 1; /* dummies for BEGIN, END */
      for (i = 1; i <= alen; i++)
	matocc[i][idx] = isgap(aseq[idx][i-1]) ? 0 : 1;
    }

  /********************************************
   * Recursion: fill in the mx matrix
   ********************************************/
				/* Alignment is 0..alen-1, we index it here as
				   1..alen because of Viterbi matrix. */
  for (i = 0; i <= alen; i++)
    {
				/* get ptrs into current and next row. */
      thisrow = mx[i];
      nextrow = mx[i+1];
				/* initialize in the next row */
      nextrow[0].score_m = -99999999;
      nextrow[0].score_d = -99999999;

				/* roll the per-sequence state rows: what was
				   "next" for row i-1 is "current" for row i */
      swap = cur_state; cur_state = nxt_state; nxt_state = swap;

      for (k = 0; k <= shmm->M; k++)
	{ /* begin inner loop... this is where all the time is spent. */

				/* add in emission scores to the current cell,
				   summed over sequences with a symbol in
				   column i (row 0 is BEGIN: nothing emitted) */
	  if (i > 0)
	    for (idx = 0; idx < nseq; idx++)
	      if (matocc[i][idx])
		{
		  thisrow[k].score_m += shmm->m_emit[aseq[idx][i-1] - 'A'][k];
		  thisrow[k].score_i += shmm->i_emit[aseq[idx][i-1] - 'A'][k];
		}

				/* initialize with transitions out of delete state;
				   the insert and match sources below only
				   replace these if they score strictly higher */
				/* to delete: all nseq sequences take D->D */
	  thisrow[k+1].score_d = thisrow[k].score_d + shmm->t[9*k + Tdd] * nseq;
	  thisrow[k+1].tback_d = DELETE;

				/* to insert: only sequences with a symbol in
				   the next column pay D->I; the others
				   are recorded as still in DELETE */
	  nextrow[k].score_i = thisrow[k].score_d;
	  nextrow[k].tback_i = DELETE;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i+1][idx])
	      {
		nextrow[k].score_i += shmm->t[9*k + Tdi];
		nxt_state[k][idx] = INSERT;
	      }
	    else
	      nxt_state[k][idx] = DELETE;

				/* to match: D->M where the next column has
				   a symbol, D->D where it has a gap */
	  nextrow[k+1].score_m = thisrow[k].score_d;
	  nextrow[k+1].tback_m = DELETE;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i+1][idx])
	      nextrow[k+1].score_m += shmm-> t[9*k + Tdm];
	    else
	      nextrow[k+1].score_m += shmm-> t[9*k + Tdd];

				/* deal with transitions out of insert state;
				   each sequence's source state is looked up
				   in cur_state */
				/* to delete state. */
	  score = thisrow[k].score_i;
	  for (idx = 0; idx < nseq; idx++)
	    switch (cur_state[k][idx]) {
	    case MATCH:  score += shmm->t[9*k + Tmd]; break;
	    case DELETE: score += shmm->t[9*k + Tdd]; break;
	    case INSERT: score += shmm->t[9*k + Tid]; break;
	    }
	  if (score > thisrow[k+1].score_d)
	    {
	      thisrow[k+1].score_d = score;
	      thisrow[k+1].tback_d = INSERT;
	    }

				/* to insert state: sequences with a gap in
				   the next column make no transition and
				   keep their current state */
	  score = thisrow[k].score_i;
	  for (idx = 0; idx < nseq; idx++)
	    {
	      if (matocc[i+1][idx])
		switch (cur_state[k][idx]) {
		case MATCH:  score += shmm->t[9*k + Tmi]; break;
		case DELETE: score += shmm->t[9*k + Tdi]; break;
		case INSERT: score += shmm->t[9*k + Tii]; break;
		}
	    }
	  if (score > nextrow[k].score_i)
	    {
	      nextrow[k].score_i = score;
	      nextrow[k].tback_i = INSERT;
	      for (idx = 0; idx < nseq; idx++)
		if (matocc[i+1][idx])
		  nxt_state[k][idx] = INSERT;
		else
		  nxt_state[k][idx] = cur_state[k][idx];
	    }

				/* to match state: x->M where the next column
				   has a symbol, x->D where it has a gap */
	  score = thisrow[k].score_i;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i+1][idx])
	      switch (cur_state[k][idx]) {
	      case MATCH:  score += shmm->t[9*k + Tmm]; break;
	      case DELETE: score += shmm->t[9*k + Tdm]; break;
	      case INSERT: score += shmm->t[9*k + Tim]; break;
	      }
	    else
	      switch (cur_state[k][idx]) {
	      case MATCH:  score += shmm->t[9*k + Tmd]; break;
	      case DELETE: score += shmm->t[9*k + Tdd]; break;
	      case INSERT: score += shmm->t[9*k + Tid]; break;
	      }
	  if (score > nextrow[k+1].score_m)
	    {
	      nextrow[k+1].score_m = score;
	      nextrow[k+1].tback_m = INSERT;
	    }

				/* Transitions out of match state. Here a
				   sequence's source state follows directly
				   from matocc[i]: MATCH if it has a symbol in
				   column i, DELETE if it has a gap. */
				/* to delete */
	  score = thisrow[k].score_m;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i][idx])
	      score += shmm->t[9*k + Tmd];
	    else
	      score += shmm->t[9*k + Tdd];
	  if (score > thisrow[k+1].score_d)
	    {
	      thisrow[k+1].score_d = score;
	      thisrow[k+1].tback_d = MATCH;
	    }

				/* to insert */
	  score = thisrow[k].score_m;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i+1][idx])
	      {
		if (matocc[i][idx])
		  score += shmm->t[9*k + Tmi];
		else
		  score += shmm->t[9*k + Tdi];
	      }
	  if (score > nextrow[k].score_i)
	    {
	      nextrow[k].score_i = score;
	      nextrow[k].tback_i = MATCH;
	      for (idx = 0; idx < nseq; idx++)
		if (matocc[i+1][idx])
		  nxt_state[k][idx] = INSERT;
		else if (matocc[i][idx])
		  nxt_state[k][idx] = MATCH;
		else
		  nxt_state[k][idx] = DELETE;
	    }

				/* to match */
	  score = thisrow[k].score_m;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i][idx])
	      {
		if (matocc[i+1][idx])
		  score += shmm->t[9*k + Tmm];
		else
		  score += shmm->t[9*k + Tmd];
	      }
	    else
	      {
		if (matocc[i+1][idx])
		  score += shmm->t[9*k + Tdm];
		else
		  score += shmm->t[9*k + Tdd];
	      }
	  if (score > nextrow[k+1].score_m)
	    {
	      nextrow[k+1].score_m = score;
	      nextrow[k+1].tback_m = MATCH;
	    }
	} /* end loop over model positions k */
    } /* end loop over alignment positions i */

/*  PrintFragViterbiMatrix(mx, alen, shmm->M); */

  /* Fill stage finished.
   * mx now contains final score in mx[alen+1][M+1].
   * Trace back from there to get master alignment.
   */
  tr   = (struct trace_s **) MallocOrDie (sizeof(struct trace_s *) * nseq);
  tpos = (int *)             MallocOrDie (sizeof(int)              * nseq);

				/* Each trace is built backwards, starting
				   with the END dummy (node M+1, MATCH, no
				   sequence position), and reversed below. */
  for (idx = 0; idx < nseq; idx++)
    {
      AllocTrace(alen + shmm->M + 3, &(tr[idx]));
      tr[idx]->nodeidx[0]   = shmm->M+1;
      tr[idx]->statetype[0] = MATCH;
      tr[idx]->rpos[0]      = -1;
      tpos[idx]             = 1;
    }

  i = alen+1;
  k = shmm->M+1;
  lastsub= MATCH;
  while (i != 0 || k != 0)
    {
				/* step to the predecessor of the current
				   subcell: MATCH moves back one row and one
				   column, DELETE one column, INSERT one row */
      switch (lastsub) {
      case MATCH:  lastsub = mx[i][k].tback_m; i--; k--; break;
      case DELETE: lastsub = mx[i][k].tback_d;      k--; break;
      case INSERT: lastsub = mx[i][k].tback_i; i--;      break;
      default: Die("trace failed!");
      }

				/* append the new subcell to each sequence's
				   own trace */
      switch (lastsub) {
      case MATCH:
				/* a gap in a match column is a DELETE for
				   that sequence */
	for (idx = 0; idx < nseq; idx++)
	  if (matocc[i][idx])
	    {
	      tr[idx]->nodeidx[tpos[idx]]   = k;
	      tr[idx]->statetype[tpos[idx]] = MATCH;
	      tr[idx]->rpos[tpos[idx]]      = i-1;
	      tpos[idx]++;
	    }
	  else
	    {
	      tr[idx]->nodeidx[tpos[idx]]   = k;
	      tr[idx]->statetype[tpos[idx]] = DELETE;
	      tr[idx]->rpos[tpos[idx]]      = -1;
	      tpos[idx]++;
	    }
	break;

      case INSERT:
				/* a gap in an insert column adds nothing to
				   that sequence's trace */
	for (idx = 0; idx < nseq; idx++)
	  if (matocc[i][idx])
	    {
	      tr[idx]->nodeidx[tpos[idx]]   = k;
	      tr[idx]->statetype[tpos[idx]] = INSERT;
	      tr[idx]->rpos[tpos[idx]]      = i-1;
	      tpos[idx]++;
	    }
	break;

      case DELETE:
	for (idx = 0; idx < nseq; idx++)
	  {
	    tr[idx]->nodeidx[tpos[idx]]   = k;
	    tr[idx]->statetype[tpos[idx]] = DELETE;
	    tr[idx]->rpos[tpos[idx]]      = -1;
	    tpos[idx]++;
	  }
	break;

      default:
	Die("trace failed!");

      }	/* end switch across new subcell in traceback */
    } /* end traceback */

  for (idx = 0; idx < nseq; idx++)
    ReverseTrace(tr[idx], tpos[idx]);

  *ret_tr = tr;
  *ret_sc = (float) mx[alen+1][shmm->M+1].score_m / INTSCALE;

				/* tr is handed to the caller; everything
				   else allocated here is released */
  Free2DArray(matocc,    alen+2);
  Free2DArray(cur_state, shmm->M+2);
  Free2DArray(nxt_state, shmm->M+2);
  Free2DArray(mx,        alen+2);
  free(tpos);
}
/* Function: homogenize_gapsym()
 *
 * Purpose:  Rewrite a string in place so that every character
 *           recognized by isgap() is replaced by one chosen gap
 *           symbol. Non-gap characters are left untouched.
 *
 * Args:     s      - NUL-terminated string to edit in place
 *           gapsym - character stored at every gap position
 *
 * Returns:  (void)
 */
static void
homogenize_gapsym(char *s, char gapsym)
{
  char *cursor = s;

  while (*cursor != '\0')
    {
      if (isgap(*cursor))
        *cursor = gapsym;
      cursor++;
    }
}
/* Function: ReadSELEX()
 * Date:     SRE, Sun Jun 6 18:24:09 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open SELEX format
 *           alignment file. (SELEX is a single alignment format).
 *           Return the alignment, or NULL if we've already read the
 *           alignment or there's no alignment data in the file.
 *
 * Limitations: SELEX is the only remaining multipass parser for
 *           alignment files. It cannot read from gzip or from stdin.
 *           It Die()'s here if you try. The reason for this is
 *           that SELEX allows space characters as gaps, so we don't
 *           know the borders of an alignment block until we've seen
 *           the whole block. I could rewrite to allow single-pass
 *           parsing (by storing the whole block in memory) but
 *           since SELEX is now legacy, why bother.
 *
 *           Note that the interface is totally kludged: fastest
 *           possible adaptation of old ReadSELEX() to the new
 *           MSA interface.
 *
 *           Lines are read with fgets() into a LINEBUFLEN buffer;
 *           longer lines are not specially handled.
 *
 * Method:   Pass 1 counts blocks and sequences, records sequence
 *           names, notes which annotation lines (#=CS, #=RF, #=SS,
 *           #=SA) occur, and finds the leftmost/rightmost data
 *           column of each block. The file is then rewound. Pass 2
 *           parses the header (#=AU, #=ID, #=AC, #=DE, #=GA, #=TC,
 *           #=NC, #=SQ) and copies each block's columns into the
 *           preallocated strings with copy_alignment_line().
 *
 *           Malformed input that would index outside the arrays
 *           sized in pass 1 (extra sequences in a later block, more
 *           #=SQ lines than sequences, annotation lines that pass 1
 *           never allocated storage for, or a block with no
 *           sequence column) is reported with Die() rather than
 *           being written through. Die() is assumed not to return.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no alignment data (squid_errno is set to
 *           SQERR_NODATA when the file holds no data line).
 */
MSA *
ReadSELEX(MSAFILE *afp)
{
  MSA     *msa;                 /* RETURN: mult seq alignment   */
  FILE    *fp;                  /* ptr to opened seqfile        */
  char   **aseqs;               /* aligned seqs                 */
  int      num = 0;             /* number of seqs read          */
  char     buffer[LINEBUFLEN];  /* input buffer for lines       */
  char     bufcpy[LINEBUFLEN];  /* strtok'able copy of buffer   */
  struct block_struc {          /** alignment data for a block: */
    int lcol;                   /* furthest left aligned sym    */
    int rcol;                   /* furthest right aligned sym   */
  } *blocks = NULL;
  int      blocknum;            /* number of blocks in file     */
  char    *nptr;                /* ptr to start of name on line */
  char    *sptr;                /* ptr into sequence on line    */
  int      currnum;             /* num. seqs in given block     */
  int      currblock;           /* index for blocks             */
  int      i;                   /* loop counter                 */
  int      seqidx;              /* counter for seqs             */
  int      alen;                /* length of alignment          */
  int      warn_names;          /* becomes TRUE if names don't match between blocks */
  int      headnum;             /* seqidx in per-sequence header info */
  int      currlen;             /* columns already filled by earlier blocks */
  int      count;               /* raw (ungapped) residue count for one seq */
  int      have_cs = 0;         /* set if pass 1 saw a #=CS line */
  int      have_rf = 0;         /* set if pass 1 saw a #=RF line */
  AINFO    base_ainfo, *ainfo;  /* hack: used to be passed ptr to AINFO */

  /* Convert from MSA interface to what old ReadSELEX() did:
   *     - copy our open fp, rather than opening file
   *     - verify that we're not reading a gzip or stdin
   */
  if (feof(afp->f)) return NULL;
  if (afp->do_gzip || afp->do_stdin)
    Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file");
  fp    = afp->f;
  ainfo = &base_ainfo;

  /***************************************************
   * First pass across file.
   * Count seqs, get names, determine column info
   * Determine what sorts of info are active in this file.
   ***************************************************/

  InitAinfo(ainfo);
                                /* get first line of the block
                                 * (non-comment, non-blank) */
  do
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
        { squid_errno = SQERR_NODATA; return NULL; }
      strcpy(bufcpy, buffer);
      if (*buffer == '#')
        {
          if      (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1;
          else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1;
        }
    }
  while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
         (strchr(commentsyms, *nptr) != NULL));

  blocknum   = 0;
  warn_names = FALSE;
  while (!feof(fp))
    {
                                /* allocate for info about this block. */
      if (blocknum == 0)
        blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
      else
        blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
      blocks[blocknum].lcol = LINEBUFLEN+1;
      blocks[blocknum].rcol = -1;

      currnum = 0;
      while (nptr != NULL)      /* becomes NULL when this block ends. */
        {
                                /* First block only: save names */
          if (blocknum == 0)
            {
              if (currnum == 0)
                ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
              else
                ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));

              ainfo->sqinfo[currnum].flags = 0;
              SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
            }
          else                  /* in each additional block: check names */
            {
              /* sqinfo[] holds exactly num entries from block 0; a block
               * with more sequences would fail the count check below
               * anyway, so fail before indexing past the array.
               */
              if (currnum >= num)
                Die("Parse error in ReadSELEX()");
              if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
                warn_names = TRUE;
            }
          currnum++;

                                /* check rcol, lcol */
          if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
            {
                                /* is this the furthest left we've
                                   seen word 2 in this block? */
              if (sptr - bufcpy < blocks[blocknum].lcol)
                blocks[blocknum].lcol = sptr - bufcpy;
                                /* look for right side in buffer */
              for (sptr = buffer + strlen(buffer) - 1;
                   strchr(WHITESPACE, *sptr) != NULL;
                   sptr --)
                /* do nothing */ ;
              if (sptr - buffer > blocks[blocknum].rcol)
                blocks[blocknum].rcol = sptr - buffer;
            }

                                /* get the next line; blank line means end of block */
          do
            {
              if (fgets(buffer, LINEBUFLEN, fp) == NULL)
                { nptr = NULL; break; }
              strcpy(bufcpy, buffer);

              /* annotation lines belong to the sequence just read */
              if      (strncmp(buffer, "#=SS", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
              else if (strncmp(buffer, "#=SA", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
              else if (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1;
              else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1;

              if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL)
                break;
            }
          while (strchr(commentsyms, *nptr) != NULL);
        }

                                /* check that number of sequences matches expected */
      if (blocknum == 0)
        num = currnum;
      else if (currnum != num)
        Die("Parse error in ReadSELEX()");
      blocknum++;

                                /* get first line of next block
                                 * (non-comment, non-blank) */
      do
        {
          if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
          strcpy(bufcpy, buffer);
        }
      while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
             (strchr(commentsyms, *nptr) != NULL));
    }

  /***************************************************
   * Get ready for second pass:
   *   figure out the length of the alignment
   *   malloc space
   *   rewind the file
   ***************************************************/

  alen = 0;
  for (currblock = 0; currblock < blocknum; currblock++)
    {
      /* A block in which no line carried a sequence field still has
       * its initial lcol/rcol; its width would come out negative.
       */
      if (blocks[currblock].rcol < blocks[currblock].lcol)
        Die("Parse error in ReadSELEX()");
      alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;
    }

  rewind(fp);

  /* allocations. we can't use AllocateAlignment because of
   * the way we already used ainfo->sqinfo.
   */
  aseqs = (char **) MallocOrDie (num * sizeof(char *));
  if (have_cs)
    ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
  if (have_rf)
    ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));

  for (i = 0; i < num; i++)
    {
      aseqs[i] = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SS)
        ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SA)
        ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
    }

  ainfo->alen = alen;
  ainfo->nseq = num;
  ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);
  FSet(ainfo->wgt, num, 1.0);

  /***************************************************
   * Second pass across file. Parse header; assemble sequences
   ***************************************************/
  /* We've now made a complete first pass over the file. We know how
   * many blocks it contains, we know the number of seqs in the first
   * block, and we know every block has the same number of sequences;
   * so we can be a bit more cavalier about error-checking as we
   * make the second pass. We still refuse to index outside the
   * arrays that were sized from the first pass.
   */

                                /* Look for header */
  headnum = 0;
  for (;;)
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
        Die("Parse error in ReadSELEX()");
      strcpy(bufcpy, buffer);
      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */

      if (strcmp(nptr, "#=AU") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->au = Strdup(sptr);
      else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->name = Strdup(sptr);
      else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->acc = Strdup(sptr);
      else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->desc = Strdup(sptr);
      else if (strcmp(nptr, "#=GA") == 0)
        {
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=GA line in ReadSELEX()");
          ainfo->ga1 = atof(sptr);

          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=GA line in ReadSELEX()");
          ainfo->ga2 = atof(sptr);

          ainfo->flags |= AINFO_GA;
        }
      else if (strcmp(nptr, "#=TC") == 0)
        {
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=TC line in ReadSELEX()");
          ainfo->tc1 = atof(sptr);

          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=TC line in ReadSELEX()");
          ainfo->tc2 = atof(sptr);

          ainfo->flags |= AINFO_TC;
        }
      else if (strcmp(nptr, "#=NC") == 0)
        {
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=NC line in ReadSELEX()");
          ainfo->nc1 = atof(sptr);

          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=NC line in ReadSELEX()");
          ainfo->nc2 = atof(sptr);

          ainfo->flags |= AINFO_NC;
        }
      else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */
        {
          /* sqinfo[] and wgt[] have num entries; extra #=SQ lines
           * must not be written past them.
           */
          if (headnum >= num)
            Die("Parse error in #=SQ line in ReadSELEX(): too many #=SQ lines");

                                /* first field is the name */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX()");
          if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;

                                /* second field is the weight */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX()");
          if (!IsReal(sptr))
            Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
          ainfo->wgt[headnum] = atof(sptr);

                                /* third field is database source id */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);

                                /* fourth field is database accession number */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);

                                /* fifth field is start..stop::olen */
          if ((sptr = strtok(NULL, ".:")) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);

          if ((sptr = strtok(NULL, ".:")) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);

          if ((sptr = strtok(NULL, ":\t ")) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);

                                /* rest of line is optional description */
          if ((sptr = strtok(NULL, "\n")) != NULL)
            SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);

          headnum++;
        }
      else if (strcmp(nptr, "#=CS") == 0) break;
      else if (strcmp(nptr, "#=RF") == 0) break;
      else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
    }

  currlen = 0;
  for (currblock = 0 ; currblock < blocknum; currblock++)
    {
                                /* parse the block */
      seqidx = 0;
      while (nptr != NULL)
        {
                                /* Consensus structure */
          if (strcmp(nptr, "#=CS") == 0)
            {
              /* cs is only allocated when pass 1 saw a #=CS line */
              if (! have_cs ||
                  ! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=CS line in ReadSELEX()");
            }

                                /* Reference coordinates */
          else if (strcmp(nptr, "#=RF") == 0)
            {
              /* rf is only allocated when pass 1 saw a #=RF line */
              if (! have_rf ||
                  ! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=RF line in ReadSELEX()");
            }
                                /* Individual secondary structure */
          else if (strcmp(nptr, "#=SS") == 0)
            {
              /* Must follow a sequence in this block, and that sequence
               * must have had its ss string allocated (SQINFO_SS set in
               * pass 1).
               */
              if (seqidx == 0 ||
                  ! (ainfo->sqinfo[seqidx-1].flags & SQINFO_SS) ||
                  ! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=SS line in ReadSELEX()");
            }

                                /* Side chain % surface accessibility code */
          else if (strcmp(nptr, "#=SA") == 0)
            {
              /* Same requirements as #=SS, for the sa string */
              if (seqidx == 0 ||
                  ! (ainfo->sqinfo[seqidx-1].flags & SQINFO_SA) ||
                  ! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=SA line in ReadSELEX()");
            }
                                /* Aligned sequence; avoid unparsed machine comments */
          else if (strncmp(nptr, "#=", 2) != 0)
            {
              /* aseqs[] has num entries */
              if (seqidx >= num ||
                  ! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in alignment line in ReadSELEX()");
              seqidx++;
            }

                                /* get next line */
          for (;;)
            {
              nptr = NULL;
              if (fgets(buffer, LINEBUFLEN, fp) == NULL) break;       /* EOF */
              strcpy(bufcpy, buffer);
              if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
              if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
              if (strchr(commentsyms, *nptr) == NULL) break; /* data */
            }
        } /* end of a block */

      currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

                                /* get line 1 of next block */
      for (;;)
        {
          if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
          strcpy(bufcpy, buffer);
          if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
          if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
          if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
        }
    } /* end of the file */

  /* Lengths in sqinfo are for raw sequence (ungapped),
   * and SS, SA are 0..rlen-1 not 0..alen-1.
   * Only the seqs with structures come out of here with lengths set.
   */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      int apos, rpos;
                                /* secondary structures */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
        {
          /* compact in place: keep only columns where the seq has a residue */
          for (apos = rpos = 0; apos < alen; apos++)
            if (! isgap(aseqs[seqidx][apos]))
              {
                ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
                rpos++;
              }
          ainfo->sqinfo[seqidx].ss[rpos] = '\0';
        }
                                /* Surface accessibility */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
        {
          for (apos = rpos = 0; apos < alen; apos++)
            if (! isgap(aseqs[seqidx][apos]))
              {
                ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
                rpos++;
              }
          ainfo->sqinfo[seqidx].sa[rpos] = '\0';
        }
    }

                                /* NULL-terminate all the strings */
  if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
  if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
  for (seqidx = 0; seqidx < num; seqidx++)
    aseqs[seqidx][alen] = '\0';

                                /* find raw sequence lengths for sqinfo */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      count = 0;
      for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
        if (!isgap(*sptr)) count++;
      ainfo->sqinfo[seqidx].len    = count;
      ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
    }

  /***************************************************
   * Garbage collection and return
   ***************************************************/
  free(blocks);
  if (warn_names)
    Warn("sequences may be in different orders in blocks of %s?", afp->fname);

  /* Convert back to MSA structure. (Wasteful kludge.)
   */
  msa = MSAFromAINFO(aseqs, ainfo);
  MSAVerifyParse(msa);
  FreeAlignment(aseqs, ainfo);
  return msa;
}
/* Function: WriteMSF()
 * Date:     SRE, Mon May 31 11:25:18 1999 [St. Louis]
 *
 * Purpose:  Write an alignment in MSF format to an open file.
 *
 *           The output consists of a file-type line, any free-text
 *           comments, the MSF header line (name, length, type, date,
 *           checksum), one Name/Len/Check/Weight line per sequence,
 *           a "//" separator, and the sequences in blocks of 50
 *           columns printed in groups of 10.
 *
 * Args:     fp  - file that's open for writing.
 *           msa - alignment to write.
 *
 *                 Note that msa->type, usually optional, must be
 *                 set for WriteMSF to work. If it is kOtherSeq, it
 *                 is guessed with GuessAlignmentSeqtype() and the
 *                 guess is stored back into msa->type; if it still
 *                 cannot be determined, a fatal error is generated.
 *
 * Notes:    The header's "Type:" field is 'N' for both RNA and DNA
 *           alignments, matching the !!NA_MULTIPLE_ALIGNMENT line,
 *           and 'P' otherwise.
 *
 * Returns:  (void)
 */
void
WriteMSF(FILE *fp, MSA *msa)
{
  time_t     now;               /* current time as a time_t */
  struct tm *tmnow;             /* broken-down local time, or NULL on failure */
  char       date[64];          /* today's date in GCG's format "October 3, 1996 15:57" */
  char     **gcg_aseq;          /* aligned sequences with gaps converted to GCG format */
  char     **gcg_sqname;        /* sequence names with GCG-valid character sets */
  int        idx;               /* counter for sequences */
  char      *s;                 /* pointer into sqname or seq */
  int        len;               /* tmp variable for name lengths */
  int        namelen;           /* maximum name length used */
  int        pos;               /* position counter */
  char       buffer[51];        /* buffer for writing seq */
  int        i;                 /* another position counter */

  /*****************************************************************
   * Make copies of sequence names and sequences.
   *   GCG recommends that name characters should only contain
   *   alphanumeric characters, -, or _
   *   Some GCG and GCG-compatible software is sensitive to this.
   *   We silently convert all other characters to '_'.
   *
   *   For sequences, GCG allows only ~ and . for gaps.
   *   Otherwise, everything is interpreted as a residue;
   *   so squid's IUPAC-restricted chars are fine. ~ means
   *   an external gap. . means an internal gap.
   *****************************************************************/

                                /* make copies that we can edit */
  gcg_aseq   = MallocOrDie(sizeof(char *) * msa->nseq);
  gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
  for (idx = 0; idx < msa->nseq; idx++)
    {
      gcg_aseq[idx]   = sre_strdup(msa->aseq[idx],   msa->alen);
      gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
    }
                                /* alter names as needed */
  for (idx = 0; idx < msa->nseq; idx++)
    for (s = gcg_sqname[idx]; *s != '\0'; s++)
      {
        /* <ctype.h> functions need a value representable as unsigned
         * char; a plain (possibly negative) char is not safe to pass.
         */
        if (! isalnum((unsigned char) *s) && *s != '-' && *s != '_')
          *s = '_';
      }
                                /* alter gap chars in seq */
  for (idx = 0; idx < msa->nseq; idx++)
    {
      /* leading gaps are external */
      for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
        *s = '~';
      /* everything after the first residue starts out internal */
      for (; *s != '\0'; s++)
        if (isgap(*s)) *s = '.';
      /* then the trailing run is turned back into external gaps */
      for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
        gcg_aseq[idx][pos] = '~';
    }
                                /* calculate max namelen used */
  namelen = 0;
  for (idx = 0; idx < msa->nseq; idx++)
    if ((len = strlen(msa->sqname[idx])) > namelen)
      namelen = len;

  /*****************************************************
   * Write the MSF header
   *****************************************************/
                                /* required file type line */
  if (msa->type == kOtherSeq)
    msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);

  if      (msa->type == kRNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kDNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kOtherSeq)
    Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
  else
    Die("Invalid sequence type %d in WriteMSF()\n", msa->type);

                                /* free text comments */
  if (msa->ncomment > 0)
    {
      for (idx = 0; idx < msa->ncomment; idx++)
        fprintf(fp, "%s\n", msa->comment[idx]);
      fprintf(fp, "\n");
    }

                                /* required checksum line */
  now   = time(NULL);
  tmnow = localtime(&now);      /* may be NULL; strftime() must not see that */
  if (tmnow == NULL || strftime(date, 64, "%B %d, %Y %H:%M", tmnow) == 0)
    Die("What time is it on earth? strftime() failed in WriteMSF().\n");
  fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n",
          msa->name != NULL ? msa->name : "squid.msf",
          msa->alen,
          (msa->type == kRNA || msa->type == kDNA) ? 'N' : 'P',
          date,
          GCGMultchecksum(gcg_aseq, msa->nseq));
  fprintf(fp, "\n");

  /*****************************************************
   * Names/weights section
   *****************************************************/

  for (idx = 0; idx < msa->nseq; idx++)
    {
      fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n",
              namelen, namelen,
              gcg_sqname[idx],
              msa->alen,
              GCGchecksum(gcg_aseq[idx], msa->alen),
              msa->wgt[idx]);
    }
  fprintf(fp, "\n");
  fprintf(fp, "//\n");

  /*****************************************************
   * Write the sequences
   *****************************************************/

  for (pos = 0; pos < msa->alen; pos += 50)
    {
      fprintf(fp, "\n");        /* Blank line between sequence blocks */

                                /* Coordinate line */
      len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
      if (len > 10)
        fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "",
                pos+1,
                len + ((len-1)/10) - 12, "",
                pos + len);
      else
        fprintf(fp, "%*s %-6d\n", namelen, "", pos+1);

      for (idx = 0; idx < msa->nseq; idx++)
        {
          fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
                                /* get next line's worth of 50 from seq */
          strncpy(buffer, gcg_aseq[idx] + pos, 50);
          buffer[50] = '\0';    /* strncpy() does not terminate a full copy */
                                /* draw the sequence line */
          for (i = 0; i < len; i++)
            {
              if (! (i % 10)) fputc(' ', fp);   /* space before each group of 10 */
              fputc(buffer[i], fp);
            }
          fputc('\n', fp);
        }
    }

  Free2DArray((void **) gcg_aseq,   msa->nseq);
  Free2DArray((void **) gcg_sqname, msa->nseq);
  return;
}