/* Function: make_alilist()
 * 
 * Purpose:  Build an array that maps every raw (non-gap) symbol of s1
 *           to the raw index of the s2 symbol sharing its alignment
 *           column, or to -1 when s2 has a gap in that column.
 *           
 * Args:     s1          - aligned sequence the list is built for
 *           s2          - aligned sequence s1 is aligned to; it is read
 *                         at every column of s1, so it must be at least
 *                         as long as s1
 *           ret_s1_list - RETURN: the list (caller must free)
 *           ret_listlen - RETURN: number of entries filled in, i.e. the
 *                         number of non-gap symbols found in s1
 *           
 * Returns:  1. No failure path exists in this function itself.
 */
static int
make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
{
  int  *map;                    /* list under construction            */
  char *a1;                     /* cursor over the columns of s1      */
  char *a2;                     /* cursor over the same columns of s2 */
  int   nraw1 = 0;              /* raw symbols of s1 consumed so far  */
  int   nraw2 = 0;              /* raw symbols of s2 consumed so far  */

  /* The number of alignment columns is an upper bound on the number
   * of raw symbols in s1, so one slot per column always suffices.
   */
  map = (int *) MallocOrDie (sizeof(int) * strlen(s1));

  for (a1 = s1, a2 = s2; *a1 != '\0'; a1++, a2++)
    {
      if (isgap(*a2))
        {
          /* s2 has a gap here: any s1 symbol in this column is unpaired */
          if (! isgap(*a1))
            map[nraw1++] = -1;
        }
      else
        {
          /* s2 has a symbol here: pair it with s1's symbol, if any */
          if (! isgap(*a1))
            map[nraw1++] = nraw2;
          nraw2++;
        }
    }

  *ret_s1_list = map;
  *ret_listlen = nraw1;
  return 1;
}
/* Function: make_ref_alilist()
 * 
 * Purpose:  Like make_alilist(), but only raw symbols of s1 that are
 *           flagged as canonical get a list entry. A raw symbol i is
 *           canonical when ref[] is nonzero at the column where the
 *           i'th raw symbol of k1 sits.
 *           
 * Args:     ref         - per-column flags, indexed by column of k1
 *           k1          - aligned sequence used to translate columns of
 *                         ref[] into raw symbol positions
 *           k2          - not used by this function
 *           s1          - aligned sequence the list is built for
 *           s2          - aligned sequence s1 is aligned to
 *           ret_s1_list - RETURN: the list (caller must free)
 *           ret_listlen - RETURN: number of entries in the list
 *           
 * Returns:  1. No failure path exists in this function itself.
 *
 * NOTE(review): the flag array is sized from strlen(s1) but filled from
 * the raw symbols of k1; this presumably relies on k1 and s1 being the
 * same raw sequence -- confirm against the caller.
 */
/*ARGSUSED*/
static int
make_ref_alilist(int *ref, char *k1, char *k2,
		 char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
{
  int *map;                     /* list under construction                 */
  int *is_canon;                /* is_canon[i]: raw symbol i is countable  */
  int  i;                       /* alignment column                        */
  int  nraw1, nraw2;            /* raw symbols consumed so far in s1, s2   */
  int  nlist;                   /* entries written to the list so far      */

  map      = (int *) MallocOrDie (sizeof(int) * strlen(s1));
  is_canon = (int *) MallocOrDie (sizeof(int) * strlen(s1));

  /* Pass 1: walk k1 and translate the per-column flags in ref[] into
   * per-raw-symbol flags.
   */
  nraw1 = 0;
  for (i = 0; k1[i] != '\0'; i++)
    {
      if (isgap(k1[i]))
        continue;
      is_canon[nraw1++] = (ref[i] != 0);
    }

  /* Pass 2: walk the s1/s2 alignment. The raw counters advance on every
   * symbol, but the list position advances only for canonical symbols.
   */
  nraw1 = nraw2 = nlist = 0;
  for (i = 0; s1[i] != '\0'; i++)
    {
      if (! isgap(s1[i]))
        {
          if (is_canon[nraw1])
            map[nlist++] = isgap(s2[i]) ? -1 : nraw2;
          nraw1++;
        }

      if (! isgap(s2[i]))
        nraw2++;
    }

  free(is_canon);
  *ret_s1_list = map;
  *ret_listlen = nlist;
  return 1;
}
Exemple #3
0
/* Function: DealignAseqs()
 * 
 * Purpose:  Copy each of the (num) aligned sequences in aseqs into a
 *           newly allocated array, leaving out every gap character.
 *           The input sequences are not modified.
 * 
 * Args:     aseqs     - array of num NUL-terminated aligned sequences
 *           num       - number of sequences
 *           ret_rseqs - RETURN: array of num NUL-terminated raw
 *                       sequences; the caller frees each string and the
 *                       array itself
 * 
 * Returns:  1. No failure path exists in this function itself.
 */
int
DealignAseqs(char **aseqs, int num, char ***ret_rseqs)
{
  char **raw;                   /* array of de-aligned sequences      */
  char  *src;                   /* read cursor in the aligned seq     */
  char  *dst;                   /* write cursor in the de-aligned seq */
  int    i;                     /* sequence counter                   */

  raw = (char **) MallocOrDie (num * sizeof(char *));

  for (i = 0; i < num; i++)
    {
      /* The aligned length (+1 for the terminator) is an upper bound
       * on the de-aligned length.
       */
      raw[i] = (char *) MallocOrDie ((strlen(aseqs[i]) + 1) * sizeof(char));

      dst = raw[i];
      for (src = aseqs[i]; *src != '\0'; src++)
        {
          if (isgap(*src))
            continue;
          *dst++ = *src;
        }
      *dst = '\0';
    }

  *ret_rseqs = raw;
  return 1;
}
Exemple #4
0
/**
 * @brief Translate a unit-offset character sequence into residue codes.
 *
 * Each of seq[1..l] is upper-cased and written to naseq[1..l] as either
 * GAP_POS2 (for a gap character) or the value returned by res_index()
 * for that residue. naseq[l+1] is set to END_MARK. A single warning is
 * logged if any encoded position compares equal to -1.
 *
 * @param[in] seq
 * Unit-offset input sequence; seq[1..l] are read, and &seq[1] is printed
 * as a C string in the warning
 * @param[out] naseq
 * Unit-offset output; indices 1..l+1 are written, so it needs room for
 * at least l+2 elements
 * @param[in] l
 * Number of residues to encode
 * @param[in] res_codes
 * Residue code table handed to res_index()
 *
 * @note naseq should be unit-offset
 */
static void
encode(char *seq, char *naseq, int l, const char *res_codes)
{
    /* code seq as ints .. use GAP_POS2 for gap */
    register int i;
    bool seq_contains_unknown_char = FALSE;

    for (i=1; i<=l; i++) {
        /* toupper() is only defined for EOF and values representable as
         * unsigned char, so a plain (possibly negative) char must be
         * converted before the call.
         */
        char res = (char) toupper((unsigned char) seq[i]);

        if (isgap(res)) {
            naseq[i] = GAP_POS2; /* gap in input */
        } else {
            naseq[i] = res_index(res_codes, res);
        }

        /* NOTE(review): this comparison goes through a plain char; where
         * char is unsigned it can never equal -1 -- confirm whether that
         * matters for the supported platforms.
         */
        if (-1 == naseq[i]) {
            seq_contains_unknown_char = TRUE;
        }
    }

    if (TRUE == seq_contains_unknown_char)
        Log(&rLog, LOG_WARN, "Unknown character in seq '%s'", &(seq[1]));

    /* the loop leaves i == l+1: terminate the encoded sequence there */
    naseq[i] = END_MARK;

    return;
}
Exemple #5
0
// Returns how many of the first L entries of Seq are not flagged by isgap().
unsigned GetUngappedLength(const byte *Seq, unsigned L)
	{
	unsigned Count = 0;
	unsigned Pos = 0;
	while (Pos < L)
		{
		if (isgap(Seq[Pos]))
			{
			++Pos;
			continue;
			}
		++Count;
		++Pos;
		}
	return Count;
	}
Exemple #6
0
/**
 * @brief Decide whether the sequences in an mseq structure count as
 * aligned: all recorded lengths are equal and at least one gap
 * character occurs somewhere.
 *
 * @param[in] prMSeq
 * Sequences to check
 *
 * @return TRUE if sequences are aligned, FALSE if not. A structure
 * holding exactly one sequence is always reported as aligned.
 */
bool
SeqsAreAligned(mseq_t *prMSeq)
{
    bool bGapFound = FALSE;   /* at least one gap seen so far */
    bool bSameLength = TRUE;  /* no length mismatch seen so far */
    int iSeq;                 /* sequence counter */
    int iPos;                 /* position within the current sequence */

    /* A lone sequence is treated as a 1-profile, which is by definition
     * aligned with itself.
     */
    if (1 == prMSeq->nseqs) {
        return TRUE;
    }

    /* One pass over the sequences; a length mismatch settles the answer,
     * so the loop stops as soon as one is seen.
     */
    for (iSeq = 0; iSeq < prMSeq->nseqs && bSameLength; iSeq++) {

        /* keep looking for a gap only until the first one turns up */
        iPos = 0;
        while (!bGapFound && iPos < prMSeq->sqinfo[iSeq].len) {
            if (isgap(prMSeq->seq[iSeq][iPos])) {
                bGapFound = TRUE;
            }
            iPos++;
        }

        /* compare this length against the previous sequence's */
        if (iSeq > 0
            && prMSeq->sqinfo[iSeq].len != prMSeq->sqinfo[iSeq-1].len) {
            bSameLength = FALSE;
        }
    }
#if 0
    Log(&rLog, LOG_FORCED_DEBUG, "bSameLength=%d bGapFound=%d", bSameLength, bGapFound);
#endif

    return (bSameLength && bGapFound) ? TRUE : FALSE;
}
Exemple #7
0
/* Function: BlockRaggedEdgedAlignment()
 * 
 * Purpose:  Overwrite the leading and trailing runs of gap characters
 *           in every row of an alignment with ','. This is a hack for
 *           making Maxmodelmaker() ignore exterior gaps, and it relies
 *           on ',' never carrying any other meaning in an alignment.
 *           
 * Args:     aseqs  - [0..nseq-1][0..alen-1] alignment to block
 *           nseq   - number of seqs in the alignment
 *           alen   - width of alignment, columns
 *           
 * Return:   (void). Data in aseqs is changed in place.
 */
void
BlockRaggedEdgedAlignment(char **aseqs, int nseq, int alen)
{
  char *row;                    /* current aligned sequence          */
  int   i;                      /* sequence counter                  */
  int   left;                   /* scans inward from the left edge   */
  int   right;                  /* scans inward from the right edge  */

  for (i = 0; i < nseq; i++)
    {
      row = aseqs[i];

      /* leading exterior gaps */
      left = 0;
      while (left < alen && isgap(row[left]))
        {
          row[left] = ',';
          left++;
        }

      /* trailing exterior gaps; a row already fully converted by the
       * scan above stops at once, because ',' is not a gap symbol
       */
      right = alen - 1;
      while (right >= 0 && isgap(row[right]))
        {
          row[right] = ',';
          right--;
        }
    }
}
Exemple #8
0
// Replaces the contents of s with the first L characters of Seq,
// leaving out every character that isgap() flags.
static void StripGaps(const byte *Seq, unsigned L, string &s)
	{
	s.clear();
	unsigned Pos = 0;
	while (Pos < L)
		{
		const char Letter = Seq[Pos++];
		if (isgap(Letter))
			continue;
		s.push_back(Letter);
		}
	}
/* Function: PairwiseIdentity()
 * 
 * Purpose:  Fractional identity between two aligned sequences s1 and
 *           s2, computed as (idents / MIN(len1, len2)), where len1 and
 *           len2 count the non-gap symbols of each sequence and idents
 *           counts columns in which a non-gap symbol of s1 equals the
 *           symbol of s2. Only the columns both strings have are
 *           examined.
 *
 *           Why this denominator, among the many possible ones:
 *           idents/(idents+mismat) would give artifactual gappy
 *             alignments high "identities".
 *           idents/(AVG|MAX)(len1+len2) would give alignments of
 *             fragments to longer sequences artifactually low
 *             "identities".
 *           
 *           The comparison is an exact character match, so U/T in
 *           RNA/DNA alignments are counted as mismatches!
 *
 * Returns:  the identity, or 0.0 if the shorter sequence has no
 *           non-gap symbols.
 */
float
PairwiseIdentity(char *s1, char *s2)
{
  char *p;                      /* cursor in s1                       */
  char *q;                      /* cursor in s2                       */
  int   nmatch = 0;             /* identical positions                */
  int   n1     = 0;             /* non-gap symbols seen in s1         */
  int   n2     = 0;             /* non-gap symbols seen in s2         */
  int   denom;                  /* the smaller of n1 and n2           */

  for (p = s1, q = s2; *p != '\0' && *q != '\0'; p++, q++)
    {
      if (! isgap(*p))
        {
          n1++;
          if (*p == *q)
            nmatch++;
        }
      if (! isgap(*q))
        n2++;
    }

  denom = (n2 < n1) ? n2 : n1;
  if (denom == 0)
    return 0.0;
  return (float) nmatch / (float) denom;
}
Exemple #10
0
void AlignChime3SD(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3,
  ChimeHit2 &Hit)
	{
	if (opt_realign)
		{
		AlignChime3SDRealign(QSD3, ASD3, BSD3, Hit);
		return;
		}

	string Q3;
	string A3;
	string B3;

	const unsigned ColCount = QSD3.L;
	asserta(ASD3.L == ColCount && BSD3.L == ColCount);

	Q3.reserve(ColCount);
	A3.reserve(ColCount);
	B3.reserve(ColCount);

	const byte *QS = QSD3.Seq;
	const byte *AS = ASD3.Seq;
	const byte *BS = BSD3.Seq;
	for (unsigned Col = 0; Col < ColCount; ++Col)
		{
		byte q = toupper(QS[Col]);
		byte a = toupper(AS[Col]);
		byte b = toupper(BS[Col]);

		if (isgap(q) && isgap(a) && isgap(b))
			continue;

		Q3.push_back(q);
		A3.push_back(a);
		B3.push_back(b);
		}

	AlignChime3(Q3, A3, B3, QSD3.Label, ASD3.Label, BSD3.Label, Hit);
	}
Exemple #11
0
static void StripGapsAlloc(const SeqData &SDIn, SeqData &SDOut)
	{
	SDOut = SDIn;
	byte *s = myalloc(byte, SDIn.L);
	unsigned k = 0;
	for (unsigned i = 0; i < SDIn.L; ++i)
		{
		char c = SDIn.Seq[i];
		if (!isgap(c))
			s[k++] = toupper(c);
		}
	SDOut.Seq = s;
	SDOut.L = k;
	}
/* Function: dataline_MSF()
 *
 * Purpose:  Decide whether a line read from an MSF file carries
 *           sequence data.
 *
 * Args:     buf           - NUL-terminated line to examine
 *           expected_name - name of the sequence expected on this line,
 *                           or NULL if there is no expectation
 *
 * Returns:  1 if the line looks like data, 0 if it is blank, starts
 *           with a comment symbol, or contains nothing but whitespace
 *           and non-alphabetic, non-gap characters (e.g. a coordinate
 *           line).
 */
static int
dataline_MSF(char *buf, char *expected_name)
{
  /* skip leading whitespace; ctype functions need unsigned char values */
  while (*buf && isspace((unsigned char) *buf)) buf++;
  if (*buf == '\0' || strchr(commentsyms, *buf) != NULL) 
    return 0;			/* blank or comment */

  /* Compare the full length of the expected name with the start of the
   * line. (The "== 0" belongs to the strncmp() result, not to its
   * length argument.)
   */
  if (expected_name != NULL &&
      strncmp(buf, expected_name, strlen(expected_name)) == 0)
    return 1;			/* matches expected seq name, definitely data */

  for (; *buf != '\0'; buf++)
    {				/* MSF has coordinate lines to worry about */
      if (isspace((unsigned char) *buf))  continue;   /* no info from spaces */
      if (isalpha((unsigned char) *buf) || isgap(*buf))
	return 1;				      /* has data on it      */
    }
  return 0;
}
/* Function: dataline_clustal()
 *
 * Purpose:  Decide whether a line read from a Clustal file carries
 *           sequence data.
 *
 * Args:     buf           - NUL-terminated line to examine
 *           expected_name - name of the sequence expected on this line,
 *                           or NULL if there is no expectation
 *
 * Returns:  1 if the line looks like data, 0 if it is blank, starts
 *           with a comment symbol, or holds only spaces and the
 *           consensus symbols '*', '.' and ':'.
 */
static int 
dataline_clustal(char *buf, char *expected_name) 
{
  /* skip leading whitespace; ctype functions need unsigned char values */
  while (*buf && isspace((unsigned char) *buf)) buf++;
  if (*buf == '\0' || strchr(commentsyms, *buf) != NULL) 
    return 0;			/* blank or comment */

  /* Compare the full length of the expected name with the start of the
   * line. (The "== 0" belongs to the strncmp() result, not to its
   * length argument.)
   */
  if (expected_name != NULL &&
      strncmp(buf, expected_name, strlen(expected_name)) == 0)
    return 1;			/* matches expected seq name, definitely data */

  for (; *buf != '\0'; buf++)
    {				/* Clustal has no coord lines to worry about */
      if (*buf == '*' || *buf == '.' || *buf == ':') continue;  /* possible consensus line */
      if (isalnum((unsigned char) *buf)) return 1;  /* name or seq character   */
      if (*buf != ' ' && isgap(*buf))    return 1;  /* possible all-gap line   */
    }
  return 0;
}
Exemple #14
0
// Cuts the instruction sequence ic into blocks and returns them.
//
// A block [begin, end) is closed after element i whenever isgap(ic[i])
// holds or the element that follows is a LABEL. Elements after the last
// such cut are not put into any block.
//
// Afterwards, for every block whose first or second element is a FUN,
// the target of that block's last element is looked up: each block that
// contains a LABEL with the same target gets its `valid` flag set.
//
// Side effect: the indices of all blocks with `valid` set are printed to
// stdout on one line, separated by spaces.
vector<pblock::PBLOCK> pblock::split(vector<gen::quartral_element> ic)
{
	int last=0;
	PBLOCK tmp=PBLOCK();
	vector<pblock::PBLOCK> result;
	for (size_t i=0;i<ic.size();i++)
	{
		// The look-ahead must stay inside the vector: on the last
		// element there is no ic[i+1] to inspect.
		if (isgap(ic[i])||(i+1<ic.size()&&ic[i+1].op==gen::LABEL))
		{
			tmp.begin=last;tmp.end=i+1;
			last=i+1;
			result.push_back(tmp);
		}	
	}
	for (size_t i=0;i<result.size();i++)
	{
		if (ic[result[i].begin].op==gen::FUN||(result[i].begin+1<ic.size()&&ic[result[i].begin+1].op==gen::FUN))
		{
			// target named by the last element of this FUN block
			string tmps = ic[result[i].end-1].target;
			for (size_t j=0;j<result.size();j++)
			{
				for (size_t k=result[j].begin;k<result[j].end;k++)
				{
					if (ic[k].target==tmps&&ic[k].op==gen::LABEL)
					{
						result[j].valid=true;
					}
				}
			}					
		}
	}
	for (size_t i=0;i<result.size();i++)
	{
		if (result[i].valid==true)
		{
			cout<<i<<' ';
		}	
	}
	cout<<endl;
	return result;
}
Exemple #15
0
/**
 * @brief Removes all gap-characters from a sequence.
 *
 * The remaining characters are moved to the front of the string in
 * their original order and the string is re-terminated after them.
 *
 * @param[in,out] seq
 * NUL-terminated sequence to dealign; must not be NULL
 *
 * @note seq will not be reallocated
 */
void
DealignSeq(char *seq)
{
    char *src; /* read cursor */
    char *dst; /* write cursor, never ahead of src */

    assert(seq!=NULL);

    /* Single pass up to the terminator. Because dst never overtakes src,
     * a write can only land on a position that has already been read.
     */
    dst = seq;
    for (src = seq; *src != '\0'; src++) {
        if (! isgap(*src)) {
            *dst++ = *src;
        }
    }
    *dst = '\0';

    return;
}
Exemple #16
0
/* Function: MSANogap()
 * Date:     SRE, Wed Nov 17 09:59:51 1999 [St. Louis]
 *
 * Purpose:  Remove every column that contains a gap in any sequence
 *           from a multiple sequence alignment -- used for filtering
 *           before phylogenetic analysis.
 *
 * Args:     msa - the alignment
 *
 * Returns:  (void). The alignment is modified, so make a copy first if
 *           the original is still needed.
 */
void
MSANogap(MSA *msa)
{
  int *keep;			/* keep[col] is TRUE for columns to retain */
  int  col;			/* column in the original alignment        */
  int  row;			/* sequence index                          */

  keep = MallocOrDie(sizeof(int) * msa->alen);

  for (col = 0; col < msa->alen; col++)
    {
      /* a column stays unless some sequence has a gap in it */
      keep[col] = TRUE;
      for (row = 0; row < msa->nseq; row++)
	{
	  if (isgap(msa->aseq[row][col]))
	    {
	      keep[col] = FALSE;
	      break;
	    }
	}
    }

  MSAShorterAlignment(msa, keep);
  free(keep);
  return;
}
Exemple #17
0
/* Function: DigitizeAlignment() 
 * 
 * Purpose:  Given an alignment, return an array of digitized,
 *           unaligned sequences. (Tracebacks are always relative
 *           to digitized unaligned seqs, even if they are
 *           faked from an existing alignment in modelmakers.c.)
 *
 *           Each digitized sequence starts and ends with a sentinel
 *           byte holding the value of Alphabet_iupac; in between are
 *           the SymbolIndex() codes of the non-gap symbols.
 *           
 * Args:     msa      - alignment to digitize
 *           ret_dsqs - RETURN: array of digitized unaligned sequences
 *           
 * Return:   (void)
 *           dsqs is alloced here. Free2DArray(dseqs, nseq).
 */ 
void
DigitizeAlignment(MSA *msa, char ***ret_dsqs)
{
  char **digitized;		/* array being built                  */
  char  *out;			/* digitized seq currently filled in  */
  int    i;			/* counter for sequences              */
  int    col;			/* position in aligned seq            */
  int    n;			/* next free position in out          */

  digitized = (char **) MallocOrDie (sizeof(char *) * msa->nseq);

  for (i = 0; i < msa->nseq; i++)
    {
      /* alen symbols at most, plus one sentinel at each end */
      out = (char *) MallocOrDie (sizeof(char) * (msa->alen+2));
      digitized[i] = out;

      out[0] = (char) Alphabet_iupac;	/* sentinel byte at start */
      n = 1;
      for (col = 0; col < msa->alen; col++)
	{
	  if (isgap(msa->aseq[i][col]))
	    continue;			/* skip gaps */
	  out[n++] = SymbolIndex(msa->aseq[i][col]);
	}
      out[n] = (char) Alphabet_iupac;	/* sentinel byte at end */
    }

  *ret_dsqs = digitized;
}
Exemple #18
0
/* Function: Seqtype()
 * 
 * Purpose:  Returns a (very good) guess about type of sequence:
 *           kDNA, kRNA, kAmino, or kOtherSeq.
 *           
 *           Modified from, and replaces, Gilbert getseqtype().
 */
int
Seqtype(char *seq)
{
  int  saw;			/* how many non-gap characters I saw */
  char c;
  int  po = 0;			/* count of protein-only */
  int  nt = 0;			/* count of t's */
  int  nu = 0;			/* count of u's */
  int  na = 0;			/* count of nucleotides */
  int  aa = 0;			/* count of amino acids */
  int  no = 0;			/* count of others */
  
  /* Look at the first 300 non-gap characters
   */
  for (saw = 0; *seq != '\0' && saw < 300; seq++)
    {
      c = sre_toupper((int) *seq);
      if (! isgap(c)) 
	{
	  if (strchr(protonly, c)) po++;
	  else if (strchr(primenuc,c)) {
	    na++;
	    if (c == 'T') nt++;
	    else if (c == 'U') nu++;
	  }
	  else if (strchr(aminos,c)) aa++;
	  else if (isalpha(c)) no++;
	  saw++;
	}
    }

  if (no > 0) return kOtherSeq;
  else if (po > 0) return kAmino;
  else if (na > aa) {
    if (nu > nt) return kRNA;
    else return kDNA;
    }
  else return kAmino;
}
Exemple #19
0
/**
 * @brief reads sequences from file
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to
 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
 * @param[in] iSeqFmt
 * int-encoded sequence file format. If SQFILE_UNKNOWN and seqfile
 * is '-' or ends in '.gz', FASTA is assumed.
 * @param[in] bIsProfile
 * Passed on to SeqsAreAligned()
 * @param[in] bDealignInputSeqs
 * Passed on to SeqsAreAligned()
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error. On every error return taken after
 * the file was opened, the file is closed and the sequence that was
 * being read is freed; sequences already stored in prMSeq stay there.
 *
 * @note
 *  - Depends heavily on squid
 *  - Sequence file format will be guessed
 *  - If supported by squid, gzipped files can be read as well.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */
    int iSeqPos; /* sequence string position counter */
    int iSeqLen; /* length of the sequence currently being handled */

    assert(NULL!=seqfile);


    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt  &&
            (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     *
     */

    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }


    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     *
     */
    while (ReadSeq(dbfp, dbfp->format,
                   &cur_seq,
                   &cur_sqinfo)) {

        /* measured once; used by the limit checks and their messages */
        iSeqLen = (int)strlen(cur_seq);

        /* Each rejection below logs first (the message uses cur_sqinfo),
         * then releases the current sequence and the open file so that
         * the error return does not leak them.
         */
        if (prMSeq->nseqs+1>iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            FreeSequence(cur_seq, &cur_sqinfo);
            SeqfileClose(dbfp);
            return -1;
        }
        if (iSeqLen>iMaxSeqLen) {
            /* the %d conversions need int arguments, not size_t */
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, iSeqLen, iMaxSeqLen);
            FreeSequence(cur_seq, &cur_sqinfo);
            SeqfileClose(dbfp);
            return -1;
        }
        if (iSeqLen==0) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
                cur_sqinfo.name);
            FreeSequence(cur_seq, &cur_sqinfo);
            SeqfileClose(dbfp);
            return -1;
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
         */

        prMSeq->seq =  (char **)
                       CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);


        prMSeq->sqinfo =  (SQINFO *)
                          CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif
        /* always guess type from first seq. use squid function and
         * convert value
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
            switch (type)  {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType) {
                if (prMSeq->seqtype != iSeqType) {
                    Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                        SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                    prMSeq->seqtype = iSeqType;
                }
            }
            /* if type could not be determined and was not set return error.
             * NOTE(review): the copy stored in prMSeq->seq[0] above is left
             * in place here while nseqs is still 0 -- confirm the caller
             * cleans it up.
             */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                FreeSequence(cur_seq, &cur_sqinfo);
                SeqfileClose(dbfp);
                return -1;
            }
        }

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

    /*#if ALLOW_ONLY_PROTEIN
        if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
            Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
                  SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
        }
    #endif*/

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);


    /* keep original sequence as copy and convert "working" sequence
     *
     */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);


        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA
         */
        if (prMSeq->seqtype==SEQTYPE_DNA)
            ToDNA(prMSeq->seq[iSeqIdx]);
        if (prMSeq->seqtype==SEQTYPE_RNA)
            ToRNA(prMSeq->seq[iSeqIdx]);

        /* then check of each character; the length is taken once up
         * front instead of calling strlen() on every iteration
         */
        iSeqLen = (int)strlen(prMSeq->seq[iSeqIdx]);
        for (iSeqPos=0; iSeqPos<iSeqLen; iSeqPos++) {
            char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]);
            if (isgap(*res))
                continue;

            /* toupper() needs an unsigned-char value, hence the casts */
            if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
                if (NULL == strchr(AMINO_ALPHABET, toupper((unsigned char) *res))) {
                    *res = AMINOACID_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_DNA) {
                if (NULL == strchr(DNA_ALPHABET, toupper((unsigned char) *res))) {
                    *res = NUCLEOTIDE_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_RNA) {
                if (NULL == strchr(RNA_ALPHABET, toupper((unsigned char) *res))) {
                    *res = NUCLEOTIDE_ANY;
                }
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);
    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;
}
Exemple #20
0
/**
 * @brief Decide whether the sequences in an mseq structure count as
 * aligned. They do if all recorded lengths are equal and either at
 * least one gap character occurs or the caller declares the input to
 * be a profile.
 *
 * @param[in] prMSeq
 * Sequences to check
 * @param[in] bIsProfile
 * If TRUE, equal-length sequences without any gap are still reported
 * as aligned
 * @param[in] bDealignInputSeqs
 * Not used by the active code path
 *
 * @return TRUE if sequences are aligned, FALSE if not. A structure
 * holding exactly one sequence is always reported as aligned.
 */
bool
SeqsAreAligned(mseq_t *prMSeq, bool bIsProfile, bool bDealignInputSeqs)
{
    bool bGapFound = FALSE;   /* at least one gap seen so far */
    bool bSameLength = TRUE;  /* no length mismatch seen so far */
    int iSeq;                 /* sequence counter */
    int iPos;                 /* position within the current sequence */

    /* A lone sequence is treated as a 1-profile, which is by definition
     * aligned with itself.
     */
    if (1 == prMSeq->nseqs) {
        return TRUE;
    }

    /* One pass over the sequences; the loop stops at the first length
     * mismatch, so bGapFound then only reflects the sequences up to and
     * including the mismatching one.
     */
    for (iSeq = 0; iSeq < prMSeq->nseqs && bSameLength; iSeq++) {

        /* keep looking for a gap only until the first one turns up */
        iPos = 0;
        while (!bGapFound && iPos < prMSeq->sqinfo[iSeq].len) {
            if (isgap(prMSeq->seq[iSeq][iPos])) {
                bGapFound = TRUE;
            }
            iPos++;
        }

        /* compare this length against the previous sequence's */
        if (iSeq > 0
            && prMSeq->sqinfo[iSeq].len != prMSeq->sqinfo[iSeq-1].len) {
            bSameLength = FALSE;
        }
    }
#if 0
    Log(&rLog, LOG_FORCED_DEBUG, "bSameLength=%d bGapFound=%d", bSameLength, bGapFound);
#endif

    /* sequences of differing lengths can never be a profile */
    if (!bSameLength) {
        if (bGapFound) {
            Log(&rLog, LOG_FORCED_DEBUG, "Potential Problem: sequences (N=%d) don't have same lengths but contain gaps, consider using --dealign", prMSeq->nseqs);
        }
        return FALSE;
    }

    /* equal lengths plus at least one gap: certainly a profile */
    if (bGapFound) {
        return TRUE;
    }

    /* equal lengths, no gaps: it is a profile only if the user says so */
    return (TRUE == bIsProfile) ? TRUE : FALSE;
}
Exemple #21
0
void AlignChimeLocal3(const string &Q3, const string &A3, const string &B3,
  const string &QLabel, const string &ALabel, const string &BLabel,
  ChimeHit2 &Hit)
	{
	Hit.Clear();

	const byte *Q3Seq = (const byte *) Q3.c_str();
	const byte *A3Seq = (const byte *) A3.c_str();
	const byte *B3Seq = (const byte *) B3.c_str();

	const unsigned ColCount = SIZE(Q3);
	asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);

	vector<float> ColScoresA(ColCount, 0.0f);
	vector<float> ColScoresB(ColCount, 0.0f);

	float ScoreN = -(float) opt_xn;
	unsigned QL = 0;
	for (unsigned Col = 0; Col < ColCount; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (!isgap(q))
			++QL;

		if (q == a && q == b && a == b)
			continue;

		if (isgap(q) || isgap(a) || isgap(b))
			continue;

		if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))
			continue;

		if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))
			continue;

		if (q == a && q != b)
			ColScoresA[Col] = 1;
		else
			ColScoresA[Col] = ScoreN;

		if (q == b && q != a)
			ColScoresB[Col] = 1;
		else
			ColScoresB[Col] = ScoreN;
		}

	vector<float> LVA(ColCount, 0.0f);
	vector<float> LVB(ColCount, 0.0f);

	LVA[0] = ColScoresA[0];
	LVB[0] = ColScoresB[0];
	for (unsigned Col = 1; Col < ColCount; ++Col)
		{
		LVA[Col] = max(LVA[Col-1], 0.0f) + ColScoresA[Col];
		LVB[Col] = max(LVB[Col-1], 0.0f) + ColScoresB[Col];
		}

	vector<float> RVA(ColCount, 0.0f);
	vector<float> RVB(ColCount, 0.0f);

	RVA[ColCount-1] = ColScoresA[ColCount-1];
	RVB[ColCount-1] = ColScoresB[ColCount-1];
	for (int Col = ColCount-2; Col >= 0; --Col)
		{
		RVA[Col] = max(RVA[Col+1], 0.0f) + ColScoresA[Col];
		RVB[Col] = max(RVB[Col+1], 0.0f) + ColScoresB[Col];
		}

	bool FirstA = true;
	float MaxSum = 0.0;
	unsigned ColX = UINT_MAX;
	for (unsigned Col = 1; Col < ColCount-1; ++Col)
		{
		float Sum = LVA[Col] + RVB[Col+1];
		if (Sum > MaxSum)
			{
			FirstA = true;
			MaxSum = Sum;
			ColX = Col;
			}
		}

	for (unsigned Col = 1; Col < ColCount-1; ++Col)
		{
		float Sum = LVB[Col] + RVA[Col+1];
		if (Sum > MaxSum)
			{
			FirstA = false;
			MaxSum = Sum;
			ColX = Col;
			}
		}
	if (ColX == UINT_MAX)
		return;

	unsigned ColLo = UINT_MAX;
	unsigned ColHi = UINT_MAX;
	if (FirstA)
		{
		float Sum = 0.0f;
		for (int Col = ColX; Col >= 0; --Col)
			{
			Sum += ColScoresA[Col];
			if (Sum >= LVA[ColX])
				{
				ColLo = Col;
				break;
				}
			}
		asserta(Sum >= LVA[ColX]);
		Sum = 0.0f;
		for (unsigned Col = ColX+1; Col < ColCount; ++Col)
			{
			Sum += ColScoresB[Col];
			if (Sum >= RVB[ColX])
				{
				ColHi = Col;
				break;
				}
			}
		asserta(Sum >= RVB[ColX]);
		}
	else
		{
		float Sum = 0.0f;
		for (int Col = ColX; Col >= 0; --Col)
			{
			Sum += ColScoresB[Col];
			if (Sum >= LVB[ColX])
				{
				ColLo = Col;
				break;
				}
			}
		asserta(Sum >= LVB[ColX]);
		Sum = 0.0f;
		for (unsigned Col = ColX+1; Col < ColCount; ++Col)
			{
			Sum += ColScoresA[Col];
			if (Sum >= RVA[ColX])
				{
				ColHi = Col;
				break;
				}
			}
		asserta(Sum >= RVA[ColX]);
		}

	unsigned ColXHi = ColX;
	for (unsigned Col = ColX + 1; Col < ColCount; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];
		
		if (q == a && q == b && !isgap(q))
			ColXHi = Col;
		else
			break;
		}

	unsigned ColXLo = ColX;
	for (int Col = (int) ColX - 1; Col >= 0; --Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];
		
		if (q == a && q == b && !isgap(q))
			ColXLo = Col;
		else
			break;
		}

	unsigned IdQA = 0;
	unsigned IdQB = 0;
	unsigned IdAB = 0;
	unsigned NQA = 0;
	unsigned NQB = 0;
	unsigned NAB = 0;
	for (unsigned Col = 0; Col < ColCount; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (!isgap(q) && !isgap(a))
			{
			++NQA;
			if (q == a)
				++IdQA;
			}

		if (!isgap(q) && !isgap(b))
			{
			++NQB;
			if (q == b)
				++IdQB;
			}

		if (!isgap(a) && !isgap(b))
			{
			++NAB;
			if (a == b)
				++IdAB;
			}
		}

	Hit.PctIdQA = Pct(IdQA, NQA);
	Hit.PctIdQB = Pct(IdQB, NQB);
	Hit.PctIdAB = Pct(IdAB, NAB);

	unsigned LIdQA = 0;
	unsigned LIdQB = 0;
	for (unsigned Col = ColLo; Col < ColXLo; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (!isgap(q) && !isgap(a))
			{
			if (q == a)
				++LIdQA;
			}

		if (!isgap(q) && !isgap(b))
			{
			if (q == b)
				++LIdQB;
			}
		}

	unsigned RIdQA = 0;
	unsigned RIdQB = 0;
	for (unsigned Col = ColXHi+1; Col <= ColHi; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (!isgap(q) && !isgap(a))
			{
			if (q == a)
				++RIdQA;
			}

		if (!isgap(q) && !isgap(b))
			{
			if (q == b)
				++RIdQB;
			}
		}

	unsigned IdDiffL = max(LIdQA, LIdQB) - min(LIdQA, LIdQB);
	unsigned IdDiffR = max(RIdQA, RIdQB) - min(RIdQA, RIdQB);
	unsigned MinIdDiff = min(IdDiffL, IdDiffR);
	unsigned ColRange = ColHi - ColLo + 1;
	if (opt_queryfract > 0.0f && float(ColRange)/float(QL) < opt_queryfract)
		return;

//	double Div = Pct(MinIdDiff, QSD.L);

#if	TRACE
	{
	Log("  Col  A Q B   ScoreA   ScoreB      LVA      LVB      RVA      RVB\n");
	Log("-----  - - -  -------  -------  -------  -------  -------  -------\n");
	for (unsigned Col = 0; Col < ColCount; ++Col)
		{
		if (ColScoresA[Col] == 0.0 && ColScoresB[Col] == 0.0)
			continue;

		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];
		Log("%5u  %c %c %c", Col, a, q, b);

		if (ColScoresA[Col] == 0.0)
			Log("  %7.7s", "");
		else
			Log("  %7.1f", ColScoresA[Col]);

		if (ColScoresB[Col] == 0.0)
			Log("  %7.7s", "");
		else
			Log("  %7.1f", ColScoresB[Col]);

		Log("  %7.1f  %7.1f  %7.1f  %7.1f", LVA[Col], LVB[Col], RVA[Col], RVB[Col]);

		Log("\n");
		}
	Log("\n");
	Log("MaxSum %.1f, ColLo %u, ColXLo %u, ColX %u, ColXHi %u, ColHi %u, AF %c\n",
	  MaxSum, ColLo, ColXLo, ColX, ColXHi, ColHi, tof(FirstA));
	Log("  LIdQA %u, LIdQB %u, RIdQA %u, RIdQB %u\n", LIdQA, LIdQB, RIdQA, RIdQB);
	}
#endif

	string Q3L;
	string A3L;
	string B3L;
	for (unsigned Col = ColLo; Col <= ColHi; ++Col)
		{
		char q = Q3[Col];
		char a = A3[Col];
		char b = B3[Col];

		Q3L += q;
		A3L += a;
		B3L += b;
		}

	AlignChimeGlobal3(Q3L, A3L, B3L, QLabel, ALabel, BLabel, Hit);

#if	0
// CS SNPs
	Hit.CS_LY = 0;
	Hit.CS_LN = 0;
	Hit.CS_RY = 0;
	Hit.CS_RN = 0;
	Hit.CS_LA = 0;
	Hit.CS_RA = 0;
	for (unsigned Col = ColLo; Col <= ColHi; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];
		if (q == a && q == b && a == b)
			continue;
		if (isgap(q) || isgap(a) || isgap(b))
			continue;
		if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))
			continue;
		if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))
			continue;

		if (!FirstA)
			swap(a, b);

		if (Col < ColXLo)
			{
			if (q == a && q != b)
				++Hit.CS_LY;
			else if (q == b && q != a)
				++Hit.CS_LN;
			else
				++Hit.CS_LA;
			}
		else if (Col > ColXHi)
			{
			if (q == b && q != a)
				++Hit.CS_RY;
			else if (q == a && q != b)
				++Hit.CS_RN;
			else
				++Hit.CS_RA;
			}
		}

	double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);
	double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);
	Hit.Score = ScoreL*ScoreR;

	//Hit.QSD = QSD;
	//if (FirstA)
	//	{
	//	Hit.ASD = ASD;
	//	Hit.BSD = BSD;
	//	Hit.PathQA = PathQA;
	//	Hit.PathQB = PathQB;
	//	}
	//else
	//	{
	//	Hit.ASD = BSD;
	//	Hit.BSD = ASD;
	//	}

	//Hit.ColLo = ColLo;
	//Hit.ColXLo = ColXLo;
	//Hit.ColXHi = ColXHi;
	//Hit.ColHi = ColHi;
	//Hit.Div = Div;

//	Hit.LogMe();
#endif
	}
Exemple #22
0
// AlignChimeGlobal3
//
// Evaluates a three-way alignment (query Q3 and candidate parents A3, B3;
// all three strings must have the same number of columns) as a two-parent
// chimera model and stores the result in Hit.
//
// Steps, in order:
//   1. Hit.Clear() is called and Hit.QLabel is set.
//   2. ColLo..ColHi is set to the first/last column for which q, a and b all
//      satisfy isacgt(). The two outermost columns at each end are never
//      considered. If there is no such column the function returns here.
//   3. Running (cumulative) identity and vote counts are built per column.
//   4. Every column in ColLo..ColHi-1 is tried as a crossover; both parent
//      orders are scored with GetScore2() and the range BestXLo..BestXHi of
//      columns sharing the best score is recorded. If no column scores
//      above 0.0 the function returns here.
//   5. Inside BestXLo..BestXHi the longest run of columns with q == a == b
//      becomes the crossover region ColXLo..ColXHi (defaulting to the single
//      midpoint column).
//   6. Hit is filled in (crossover coordinates, identities, divergence,
//      sequences and labels, swapped so that Hit's "A" is the left parent).
//   7. Per-side votes (CS_LY/LN/LA, CS_RY/RN/RA) are counted and
//      Hit.Score = GetScore2(left) * GetScore2(right).
//   8. If g_fUChimeAlns is open and Hit.Div > 0, the hit is written with
//      WriteChimeHitX().
void AlignChimeGlobal3(const string &Q3, const string &A3, const string &B3,
  const string &QLabel, const string &ALabel, const string &BLabel,
  ChimeHit2 &Hit)
	{
	Hit.Clear();
	Hit.QLabel = QLabel;

	const byte *Q3Seq = (const byte *) Q3.c_str();
	const byte *A3Seq = (const byte *) A3.c_str();
	const byte *B3Seq = (const byte *) B3.c_str();

	const unsigned ColCount = SIZE(Q3);
	asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);

#if	TRACE
	Log("Q %5u %*.*s\n", ColCount, ColCount, ColCount, Q3Seq);
	Log("A %5u %*.*s\n", ColCount, ColCount, ColCount, A3Seq);
	Log("B %5u %*.*s\n", ColCount, ColCount, ColCount, B3Seq);
#endif

// Discard terminal gaps: ColLo/ColHi are the first/last column where all
// three rows hold a nucleotide. UINT_MAX means "not found yet".
	unsigned ColLo = UINT_MAX;
	unsigned ColHi = UINT_MAX;
	for (unsigned Col = 2; Col + 2 < ColCount; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (isacgt(q) && isacgt(a) && isacgt(b))
			{
			if (ColLo == UINT_MAX)
				ColLo = Col;
			ColHi = Col;
			}
		}

	if (ColLo == UINT_MAX)
		return;

	unsigned QPos = 0;

// The Accum* vectors are indexed by alignment column. Entries for columns
// before ColLo are padding (UINT_MAX) and must never be read.
	vector<unsigned> ColToQPos(ColLo, UINT_MAX);
	vector<unsigned> AccumCount(ColLo, UINT_MAX);
	vector<unsigned> AccumSameA(ColLo, UINT_MAX);
	vector<unsigned> AccumSameB(ColLo, UINT_MAX);
	vector<unsigned> AccumForA(ColLo, UINT_MAX);
	vector<unsigned> AccumForB(ColLo, UINT_MAX);
	vector<unsigned> AccumAbstain(ColLo, UINT_MAX);
	vector<unsigned> AccumAgainst(ColLo, UINT_MAX);

	unsigned SumSameA = 0;		// q == a
	unsigned SumSameB = 0;		// q == b
	unsigned SumSameAB = 0;		// a == b
	unsigned Sum = 0;			// columns where q, a, b are all nucleotides
	unsigned SumForA = 0;		// q == a, q != b
	unsigned SumForB = 0;		// q == b, q != a
	unsigned SumAbstain = 0;	// q differs from both a and b
	unsigned SumAgainst = 0;	// a == b, q differs
	for (unsigned Col = ColLo; Col <= ColHi; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (isacgt(q) && isacgt(a) && isacgt(b))
			{
			if (q == a)
				++SumSameA;
			if (q == b)
				++SumSameB;
			if (a == b)
				++SumSameAB;
			if (q == a && q != b)
				++SumForA;
			if (q == b && q != a)
				++SumForB;
			if (a == b && q != a)
				++SumAgainst;
			if (q != a && q != b)
				++SumAbstain;
			++Sum;
			}

	// Each Accum*[Col] is the running total up to and including Col.
		ColToQPos.push_back(QPos);
		AccumSameA.push_back(SumSameA);
		AccumSameB.push_back(SumSameB);
		AccumCount.push_back(Sum);
		AccumForA.push_back(SumForA);
		AccumForB.push_back(SumForB);
		AccumAbstain.push_back(SumAbstain);
		AccumAgainst.push_back(SumAgainst);

		if (q != '-')
			++QPos;
		}

	asserta(SIZE(ColToQPos) == ColHi+1);
	asserta(SIZE(AccumSameA) == ColHi+1);
	asserta(SIZE(AccumSameB) == ColHi+1);
	asserta(SIZE(AccumAbstain) == ColHi+1);
	asserta(SIZE(AccumAgainst) == ColHi+1);

// Sum >= 1 here because column ColLo itself was counted.
	double IdQA = double(SumSameA)/Sum;
	double IdQB = double(SumSameB)/Sum;
	double IdAB = double(SumSameAB)/Sum;
	double MaxId = max(IdQA, IdQB);

#if	TRACE
	Log("IdQA=%.1f%% IdQB=%.1f%% IdAB=%.1f\n", IdQA*100.0, IdQB*100.0, IdAB*100.0);
	Log("\n");
	Log("    x  AQB   IdAL   IdBL   IdAR   IdBR   DivAB   DivBA    YAL    YBL    YAR    YBR    AbL    AbR  ScoreAB  ScoreAB    XLo    Xhi\n");
	Log("-----  ---  -----  -----  -----  -----  ------  ------  -----  -----  -----  -----  -----  -----  -------  -------  -----  -----\n");
#endif
	unsigned BestXLo = UINT_MAX;
	unsigned BestXHi = UINT_MAX;
	double BestDiv = 0.0;
	double BestIdQM = 0.0;
	double BestScore = 0.0;

// Find range of cols BestXLo..BestXHi that maximizes score.
// FirstA is true when the best model is "A on the left, B on the right".
	bool FirstA = false;

// NOTE: Must be < ColHi not <= because use Col+1 below
	for (unsigned Col = ColLo; Col < ColHi; ++Col)
		{
		unsigned SameAL = AccumSameA[Col];
		unsigned SameBL = AccumSameB[Col];
		unsigned SameAR = SumSameA - AccumSameA[Col];
		unsigned SameBR = SumSameB - AccumSameB[Col];

	// Identity of the query to each of the two possible chimeric models.
	// These deliberately shadow the outer IdAB (parent-to-parent identity).
		double IdAB = double(SameAL + SameBR)/Sum;
		double IdBA = double(SameBL + SameAR)/Sum;

	// Left counts include column Col; right counts start after column Col+1.
		unsigned ForAL = AccumForA[Col];
		unsigned ForBL = AccumForB[Col];
		unsigned ForAR = SumForA - AccumForA[Col+1];
		unsigned ForBR = SumForB - AccumForB[Col+1];
		unsigned AbL = AccumAbstain[Col];
		unsigned AbR = SumAbstain - AccumAbstain[Col+1];

		double ScoreAB = GetScore2(ForAL, ForBL, AbL)*GetScore2(ForBR, ForAR, AbR);
		double ScoreBA = GetScore2(ForBL, ForAL, AbL)*GetScore2(ForAR, ForBR, AbR);
	
		double DivAB = IdAB/MaxId;
		double DivBA = IdBA/MaxId;
		double MaxDiv = max(DivAB, DivBA);

		double MaxScore = max(ScoreAB, ScoreBA);
		if (MaxScore > BestScore)
			{
		// New best: restart the tie range at this column.
			BestScore = MaxScore;
			BestXLo = Col;
			BestXHi = Col;
			FirstA = (ScoreAB > ScoreBA);
			if (FirstA)
				BestIdQM = IdAB;
			else
				BestIdQM = IdBA;
			if (MaxDiv > BestDiv)
				BestDiv = MaxDiv;
			}
		else if (MaxScore == BestScore)
			{
		// Tie with the current best: extend the range to this column.
			BestXHi = Col;
			if (MaxDiv > BestDiv)
				BestDiv = MaxDiv;
			}

#if	TRACE
		{
		Log("%5u", Col);
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];
		Log("  %c%c%c", a, q, b);
		Log("  %5u", SameAL);
		Log("  %5u", SameBL);
		Log("  %5u", SameAR);
		Log("  %5u", SameBR);
		Log("  %5.4f", DivAB);
		Log("  %5.4f", DivBA);
		Log("  %5u", ForAL);
		Log("  %5u", ForBL);
		Log("  %5u", ForAR);
		Log("  %5u", ForBR);
		Log("  %5u", AbL);
		Log("  %5u", AbR);
		Log("  %7.4f", ScoreAB);
		Log("  %7.4f", ScoreBA);
		if (BestXLo != UINT_MAX)
			Log("  %5u", BestXLo);
		if (BestXHi != UINT_MAX)
			Log("  %5u", BestXHi);
		Log("\n");
		}
#endif
		}

	if (BestXLo == UINT_MAX)
		{
#if	TRACE
		Log("\n");
		Log("No crossover found.\n");
#endif
		return;
		}
#if	TRACE
	Log("BestX col %u - %u\n", BestXLo, BestXHi);
#endif

// Find maximum region of identity within BestXLo..BestXHi.
// Default is the single midpoint column. When no segment is open
// (SegLo == SegHi == UINT_MAX) the unsigned expression SegHi - SegLo + 1
// evaluates to 1, which can never beat the current best length (>= 1).
	unsigned ColXLo = (BestXLo + BestXHi)/2;
	unsigned ColXHi = ColXLo;
	unsigned SegLo = UINT_MAX;
	unsigned SegHi = UINT_MAX;
	for (unsigned Col = BestXLo; Col <= BestXHi; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];

		if (q == a && q == b)
			{
			if (SegLo == UINT_MAX)
				SegLo = Col;
			SegHi = Col;
			}
		else
			{
			unsigned SegLength = SegHi - SegLo + 1;
			unsigned BestSegLength = ColXHi - ColXLo + 1;
			if (SegLength > BestSegLength)
				{
				ColXLo = SegLo;
				ColXHi = SegHi;
				}
			SegLo = UINT_MAX;
			SegHi = UINT_MAX;
			}
		}
// Close out a segment that runs up to BestXHi.
	unsigned SegLength = SegHi - SegLo + 1;
	unsigned BestSegLength = ColXHi - ColXLo + 1;
	if (SegLength > BestSegLength)
		{
		ColXLo = SegLo;
		ColXHi = SegHi;
		}

// Convert the crossover columns to query positions (count of non-'-'
// query symbols before the column). The two tests are independent so that
// a single-column crossover (ColXLo == ColXHi) sets both QXLo and QXHi;
// with "else if" QXHi was left unset in that case.
	QPos = 0;
	for (unsigned x = 0; x < ColCount; ++x)
		{
		if (x == ColXLo)
			Hit.QXLo = QPos;
		if (x == ColXHi)
			{
			Hit.QXHi = QPos;
			break;
			}
		char q = Q3Seq[x];
		if (q != '-')
			++QPos;
		}

	Hit.ColXLo = ColXLo;
	Hit.ColXHi = ColXHi;

	Hit.PctIdAB = IdAB*100.0;
	Hit.PctIdQM = BestIdQM*100.0;

	Hit.Div = (BestDiv - 1.0)*100.0;

	Hit.Q3 = Q3;
	Hit.QLabel = QLabel;
// Store the parents so that Hit's "A" is always the left-hand parent of
// the best model.
	if (FirstA)
		{
		Hit.A3 = A3;
		Hit.B3 = B3;
		Hit.ALabel = ALabel;
		Hit.BLabel = BLabel;
		Hit.PctIdQA = IdQA*100.0;
		Hit.PctIdQB = IdQB*100.0;
		}
	else
		{
		Hit.A3 = B3;
		Hit.B3 = A3;
		Hit.ALabel = BLabel;
		Hit.BLabel = ALabel;
		Hit.PctIdQA = IdQB*100.0;
		Hit.PctIdQB = IdQA*100.0;
		}

// CS SNPs: count votes on each side of the crossover region.
	Hit.CS_LY = 0;
	Hit.CS_LN = 0;
	Hit.CS_RY = 0;
	Hit.CS_RN = 0;
	Hit.CS_LA = 0;
	Hit.CS_RA = 0;

	for (unsigned Col = ColLo; Col <= ColHi; ++Col)
		{
		char q = Q3Seq[Col];
		char a = A3Seq[Col];
		char b = B3Seq[Col];
	// Columns where all three rows agree carry no information.
		if (q == a && q == b && a == b)
			continue;

		unsigned ngaps = 0;
		if (isgap(q))
			++ngaps;
		if (isgap(a))
			++ngaps;
		if (isgap(b))
			++ngaps;

		if (opt_skipgaps)
			{
			if (ngaps == 3)
				continue;
			}
		else
			{
			if (ngaps == 2)
				continue;
			}

	// From here on, a is the left parent and b the right parent.
		if (!FirstA)
			swap(a, b);

	// Optionally ignore columns adjacent to a gap in any row.
		if (opt_skipgaps2)
			{
			if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))
				continue;
			if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))
				continue;
			}

	// Y = agrees with the model, N = agrees with the other parent,
	// A = anything else. Columns inside ColXLo..ColXHi are not counted.
		if (Col < ColXLo)
			{
			if (q == a && q != b)
				++Hit.CS_LY;
			else if (q == b && q != a)
				++Hit.CS_LN;
			else
				++Hit.CS_LA;
			}
		else if (Col > ColXHi)
			{
			if (q == b && q != a)
				++Hit.CS_RY;
			else if (q == a && q != b)
				++Hit.CS_RN;
			else
				++Hit.CS_RA;
			}
		}

	double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);
	double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);
	Hit.Score = ScoreL*ScoreR;

	extern FILE *g_fUChimeAlns;
	if (g_fUChimeAlns != 0 && Hit.Div > 0.0)
		{
		void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit);
		WriteChimeHitX(g_fUChimeAlns, Hit);
		}
	}
Exemple #23
0
/* Function: PrintFancyTrace()
 * 
 * Purpose:  Print an alignment of an HMM to a sequence, given a traceback.
 *           Somewhat inspired by the output style of BLAST, except that
 *           we're aligning to a complicated model that's difficult to
 *           represent compactly. 
 *
 * Arguments: ofp       - where to print it (open FILE for writing, or stdout)
 *            shmm      - log-odds form HMM
 *            tr        - traceback from ViterbiTrace()
 *            seq       - sequence that is aligned
 *            seqname   - name of seq to print in left margin
 *            from_pos  - first position in seq that aligns (0..seqlen-1)
 *
 * Returns:  (void)
 */
void
PrintFancyTrace(FILE             *ofp,
		struct shmm_s    *shmm,
		struct trace_s   *tr,
		char             *seq,
		char             *seqname,
		int               from_pos)
{
  char *model;                  /* display of model                */
  char *mline;			/* display of match/mismatch       */
  char *rfline;			/* display of reference seq        */
  char *csline;			/* display of consensus struct     */
  char *aseq;                   /* display of aligned sequence     */
  int   rpos;                   /* position in raw seq             */
  int   apos;                   /* position in traceback/alignment */
  float score;			/* score for position              */
  float max_score;		/* best score for position         */
  char  bestsym;		/* best match sym at position      */
  int   idx;			/* counter for alphabet            */
  int   len;			/* current length of display printed */
  char  buffer[CPL+1];          /* buffer for lines of display     */ 
  int   startpos, endpos;

  /* Memory allocation.
   */
  if ((rfline = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL ||
      (csline = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL ||
      (model  = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL ||
      (mline  = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL ||
      (aseq   = (char *) malloc (sizeof(char) * (tr->tlen + 1))) == NULL)
    Die("memory allocation failed at %s:%d", __FILE__, __LINE__);
  memset(rfline, ' ', tr->tlen);
  memset(csline, ' ', tr->tlen);
  memset(model,  ' ', tr->tlen);
  memset(mline,  ' ', tr->tlen);
  memset(aseq,   ' ', tr->tlen);

  /* Create the displays of model and aligned sequence.
   * Ignore BEGIN (apos == 0) and END (apos == N-1) in the traceback.
   */
  rpos = from_pos;
  for (apos = 1; apos < tr->tlen-1; apos++)
    {
				/* find best sym at this model position */
      if (tr->statetype[apos] != INSERT)
	{
	  max_score = -999;
	  for (idx = 0; idx < 26; idx++)
	    if (shmm->m_emit[idx][tr->nodeidx[apos]] > max_score)
	      { 
		max_score = shmm->m_emit[idx][tr->nodeidx[apos]];
		bestsym   = (char) ('A' + idx);
	      }
	  if (max_score > (int)(CUTOFF * INTSCALE))
	    model[apos] = toupper((int) bestsym);
	  else
	    model[apos] = tolower((int) bestsym);
	}
      else
	model[apos] = '.';

				/* construct mline (match/mismatch display), rfline, and csline */
      switch (tr->statetype[apos]) {
      case MATCH:
	score = shmm->m_emit[seq[rpos]-'A'][tr->nodeidx[apos]];
	if (seq[rpos] == bestsym) 
	  mline[apos] = bestsym;
	else if (score > 0)       
	  mline[apos] = '+';
	aseq[apos] = seq[rpos];
	if (shmm->flags & HMM_REF) rfline[apos] = shmm->ref[tr->nodeidx[apos]];
	if (shmm->flags & HMM_CS)  csline[apos] = shmm->cs[tr->nodeidx[apos]];
	rpos++;
	break;

      case INSERT:
	aseq[apos] = seq[rpos];
	rpos++;
	break;

      case DELETE:
	aseq[apos] = '-';
	if (shmm->flags & HMM_REF) rfline[apos] = shmm->ref[tr->nodeidx[apos]];
	if (shmm->flags & HMM_CS)  csline[apos] = shmm->cs[tr->nodeidx[apos]];
	break;

      default: Die("Unrecognized statetype %d at %d in traceback", 
		   tr->statetype[apos], apos);
      }
    }
  /* Null terminate, and tack on asterisks to represent BEGIN and
   * END dummy states in model.
   */
  model[0]          = '*';	/* begin */
  model[tr->tlen-1] = '*';      /* end   */
  model[tr->tlen]   = '\0';
  aseq[tr->tlen]    = '\0';
  mline[tr->tlen]   = '\0';
  csline[tr->tlen]   = '\0';
  rfline[tr->tlen]   = '\0';

  /* Print out the display.
   */
  fprintf(ofp, "  Alignment to HMM consensus:\n");
  buffer[CPL] = '\0';
  len = 0; 
  rpos     = from_pos + 1;
  while (len < tr->tlen)	
    {
      startpos = rpos;
				/* rf line reference coord line */
      if (shmm->flags & HMM_REF)
	{
	  strncpy(buffer, rfline+len, CPL);
	  fprintf(ofp, "               REF %s\n", buffer);
	}
				/* cs consensus structure line */
      if (shmm->flags & HMM_CS)
	{
	  strncpy(buffer, csline+len, CPL);
	  fprintf(ofp, "                CS %s\n", buffer);
	}
				/* model */
      strncpy(buffer, model+len, CPL);
      fprintf(ofp, "                   %s\n", buffer);
				/* mline */
      strncpy(buffer, mline+len, CPL);
      fprintf(ofp, "                   %s\n", buffer);
				/* get coords of this aseq block */
      for (apos = len; aseq[apos] != '\0' && apos < len + CPL; apos++)
	if (! isgap(aseq[apos]))
	  rpos++;
      endpos = rpos-1;

				/* aligned sequence */
      strncpy(buffer, aseq+len, CPL);
      fprintf(ofp, "  %10.10s %5d %s %5d\n", seqname, startpos, buffer, endpos);          

      len += CPL;
      fprintf(ofp, "\n");
    }

  /* Done. Free memory and return.
   */
  fflush(ofp);
  free(model);
  free(aseq);
  free(mline);
  free(rfline);
  free(csline);
  return;
}
Exemple #24
0
void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit)
	{
	if (f == 0)
		return;

	if (Hit.Div <= 0.0)
		return;

	const string &Q3 = Hit.Q3;
	const string &A3 = Hit.A3;
	const string &B3 = Hit.B3;

	const byte *Q3Seq = (const byte *) Q3.c_str();
	const byte *A3Seq = (const byte *) A3.c_str();
	const byte *B3Seq = (const byte *) B3.c_str();

// Aligned
	unsigned ColCount = SIZE(Q3);
	asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);

	unsigned LQ = GetUngappedLength(Q3Seq, ColCount);
	unsigned LA = GetUngappedLength(A3Seq, ColCount);
	unsigned LB = GetUngappedLength(B3Seq, ColCount);

	fprintf(f, "\n");
	fprintf(f, "------------------------------------------------------------------------\n");
	fprintf(f, "Query   (%5u nt) %s\n", LQ, Hit.QLabel.c_str());
	fprintf(f, "ParentA (%5u nt) %s\n", LA, Hit.ALabel.c_str());
	fprintf(f, "ParentB (%5u nt) %s\n", LB, Hit.BLabel.c_str());

// Strip terminal gaps in query
	unsigned FromCol = UINT_MAX;
	unsigned ToCol = UINT_MAX;
	for (unsigned Col = 0; Col < ColCount; ++Col)
		{
		if (!isgap(Q3Seq[Col]))
			{
			if (FromCol == UINT_MAX)
				FromCol = Col;
			ToCol = Col;
			}
		}

	unsigned QPos = 0;
	unsigned APos = 0;
	unsigned BPos = 0;
	for (unsigned Col = 0; Col < FromCol; ++Col)
		{
		if (!isgap(A3Seq[Col]))
			++APos;
		if (!isgap(B3Seq[Col]))
			++BPos;
		}

	unsigned Range = ToCol - FromCol + 1;
	unsigned RowCount = (Range + 79)/80;
	unsigned RowFromCol = FromCol;
	for (unsigned RowIndex = 0; RowIndex < RowCount; ++RowIndex)
		{
		fprintf(f, "\n");
		unsigned RowToCol = RowFromCol + 79;
		if (RowToCol > ToCol)
			RowToCol = ToCol;

	// A row
		fprintf(f, "A %5u ", APos + 1);
		for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)
			{
			char q = Q3Seq[Col];
			char a = A3Seq[Col];
			if (a != q)
				a = tolower(a);
			fprintf(f, "%c", a);
			if (!isgap(a))
				++APos;
			}
		fprintf(f, " %u\n", APos);

	// Q row
		fprintf(f, "Q %5u ", QPos + 1);
		for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)
			{
			char q = Q3Seq[Col];
			fprintf(f, "%c", q);
			if (!isgap(q))
				++QPos;
			}
		fprintf(f, " %u\n", QPos);

	// B row
		fprintf(f, "B %5u ", BPos + 1);
		for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)
			{
			char q = Q3Seq[Col];
			char b = B3Seq[Col];
			if (b != q)
				b = tolower(b);
			fprintf(f, "%c", b);
			if (!isgap(b))
				++BPos;
			}
		fprintf(f, " %u\n", BPos);

	// Diffs
		fprintf(f, "Diffs   ");
		for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)
			{
			char q = Q3Seq[Col];
			char a = A3Seq[Col];
			char b = B3Seq[Col];

			char c = ' ';
			if (isgap(q) || isgap(a) || isgap(b))
				c = ' ';
			else if (Col < Hit.ColXLo)
				{
				if (q == a && q == b)
					c = ' ';
				else if (q == a && q != b)
					c = 'A';
				else if (q == b && q != a)
					c = 'b';
				else if (a == b && q != a)
					c = 'N';
				else
					c = '?';
				}
			else if (Col > Hit.ColXHi)
				{
				if (q == a && q == b)
					c = ' ';
				else if (q == b && q != a)
					c = 'B';
				else if (q == a && q != b)
					c = 'a';
				else if (a == b && q != a)
					c = 'N';
				else
					c = '?';
				}

			fprintf(f, "%c", c);
			}
		fprintf(f, "\n");

	// SNPs
		fprintf(f, "Votes   ");
		for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)
			{
			char q = Q3Seq[Col];
			char a = A3Seq[Col];
			char b = B3Seq[Col];

			bool PrevGap = Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1]));
			bool NextGap = Col+1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1]));

			char c = ' ';
			if (isgap(q) || isgap(a) || isgap(b) || PrevGap || NextGap)
				c = ' ';
			else if (Col < Hit.ColXLo)
				{
				if (q == a && q == b)
					c = ' ';
				else if (q == a && q != b)
					c = '+';
				else if (q == b && q != a)
					c = '!';
				else
					c = '0';
				}
			else if (Col > Hit.ColXHi)
				{
				if (q == a && q == b)
					c = ' ';
				else if (q == b && q != a)
					c = '+';
				else if (q == a && q != b)
					c = '!';
				else
					c = '0';
				}

			fprintf(f, "%c", c);
			}
		fprintf(f, "\n");

	// LR row
		fprintf(f, "Model   ");
		for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)
			{
			if (Col < Hit.ColXLo)
				fprintf(f, "A");
			else if (Col >= Hit.ColXLo && Col <= Hit.ColXHi)
				fprintf(f, "x");
			else
				fprintf(f, "B");
			}

		fprintf(f, "\n");

		RowFromCol += 80;
		}
	fprintf(f, "\n");

	double PctIdBestP = max(Hit.PctIdQA, Hit.PctIdQB);
	double Div = (Hit.PctIdQM - PctIdBestP)*100.0/PctIdBestP;

	unsigned LTot = Hit.CS_LY + Hit.CS_LN + Hit.CS_LA;
	unsigned RTot = Hit.CS_RY + Hit.CS_RN + Hit.CS_RA;

	double PctL = Pct(Hit.CS_LY, LTot);
	double PctR = Pct(Hit.CS_RY, RTot);

	fprintf(f,
	  "Ids.  QA %.1f%%, QB %.1f%%, AB %.1f%%, QModel %.1f%%, Div. %+.1f%%\n",
	  Hit.PctIdQA,
	  Hit.PctIdQB,
	  Hit.PctIdAB,
	  Hit.PctIdQM,
	  Div);

	fprintf(f,
	  "Diffs Left %u: N %u, A %u, Y %u (%.1f%%); Right %u: N %u, A %u, Y %u (%.1f%%), Score %.4f\n",
	  LTot, Hit.CS_LN, Hit.CS_LA, Hit.CS_LY, PctL,
	  RTot, Hit.CS_RN, Hit.CS_RA, Hit.CS_RY, PctR,
	  Hit.Score);
	}
Exemple #25
0
// Splits the intermediate-code sequence ic into blocks and annotates the
// blocks that hold a function's end label.
//
// Pass 1: a block [begin, end) is closed after element i when ic[i] is a
//         "gap" instruction (per isgap(), defined elsewhere) other than
//         CALL/CALLFUN, or when the following element is a LABEL. Elements
//         after the last such boundary are not placed in any block.
// Pass 2: for every block whose first or second element is a FUN, the
//         target of the block's last element is looked up as a LABEL in all
//         blocks. Each block containing that label is marked valid and
//         receives the function name, the operands of every CREATE in the
//         FUN block, and a level (0 when the FUN block is the first block,
//         otherwise parsed from the second element's first operand).
// Pass 3: the indices of all valid blocks are printed to cout on one line.
//
// Returns the list of blocks.
vector<pblock::PBLOCK> pblock::resplit(vector<gen::quartral_element> ic)
{
	int last=0;
	PBLOCK tmp=PBLOCK();
	vector<pblock::PBLOCK> result;
	for (size_t i=0;i<ic.size();i++)
	{
		bool endsBlock=isgap(ic[i])&&ic[i].op!=gen::CALL&&ic[i].op!=gen::CALLFUN;
		// Only look ahead when there is a next element; the unguarded
		// ic[i+1] read past the end of the vector on the last iteration.
		bool labelFollows=(i+1<ic.size())&&ic[i+1].op==gen::LABEL;
		if (endsBlock||labelFollows)
		{
			tmp.begin=last;tmp.end=i+1;tmp.valid=false;
			last=i+1;
			result.push_back(tmp);
		}	
	}
	for (size_t i=0;i<result.size();i++)
	{
		// The function name and level are always read from the block's
		// second element, so a block with no second element inside ic
		// cannot be processed (and previously indexed out of range when
		// its only element was a FUN).
		const size_t second=result[i].begin+1;
		if (second>=ic.size())
		{
			continue;
		}
		if (ic[result[i].begin].op==gen::FUN||ic[second].op==gen::FUN)
		{
			string tmps = ic[result[i].end-1].target;
			string fn = ic[second].target;
			result[i].functionname=fn;
			for (size_t j=0;j<result.size();j++)
			{
				for (size_t k=result[j].begin;k<result[j].end;k++)
				{
					if (ic[k].target==tmps&&ic[k].op==gen::LABEL)
					{
						result[j].valid=true;
						for (size_t l=result[i].begin;l!=result[i].end;l++)
						{
							if (ic[l].op==gen::CREATE)
							{
								result[j].var.push_back(ic[l].first);
							}
						}
						if (i!=0)
						{
							result[j].level=helper::type2type::string2int(ic[second].first);
						}
						else
						{
							result[j].level=0;
						}
						result[j].functionname=fn;
					}
				}
			}					
		}
	}
	for (size_t i=0;i<result.size();i++)
	{
		if (result[i].valid==true)
		{
			cout<<i<<' ';
		}	
	}
	cout<<endl;
	return result;
}
Exemple #26
0
/* Function: WriteMSF()
 * 
 * Purpose:  Emit an alignment (sequences, names, weights) to an
 *           open stream in GCG MSF layout. All aligned sequences
 *           are expected to be flushed, i.e. gap-padded to one
 *           common length.
 *
 *           Output is a placeholder title line, one "Name:" line
 *           per sequence, a "//" divider, then the sequences in
 *           blocks of up to 50 symbols per line, with a space
 *           after every 10th symbol and gaps shown as '.'.
 * 
 * Return:   (void)
 */
void
WriteMSF(FILE   *fp,            /* open fp for writing           */
         char  **aseqs,         /* aligned sequences             */
         AINFO  *ainfo)
{
  enum { LINE_SYMS = 50, GROUP_SYMS = 10 };

  char **cursor;                /* read position in each sequence   */
  int    name_width;            /* longest sequence name            */
  int    this_len;              /* length of the name being checked */
  int    more_blocks;           /* nonzero if another block follows */
  int    s;                     /* sequence index                   */
  int    n;                     /* symbols written on this line     */

                                /* one read cursor per sequence, each
                                   starting at the head of its sequence;
                                   find the widest name on the way */
  cursor = (char **) MallocOrDie (ainfo->nseq * sizeof(char *));

  name_width = 0;
  for (s = 0; s < ainfo->nseq; s++)
    {
      cursor[s] = aseqs[s];
      this_len  = strlen(ainfo->sqinfo[s].name);
      if (this_len > name_width)
        name_width = this_len;
    }

  /*****************************************************
   * Title line (the length, type and checksum fields
   * are fixed placeholder text, not computed values)
   *****************************************************/
  fprintf(fp, "\n");
  fprintf(fp, "    MSF:  000  Type: X  Check: 0000  ..\n");
  fprintf(fp, "\n");

  /*****************************************************
   * One descriptor line per sequence, then the divider
   *****************************************************/
  for (s = 0; s < ainfo->nseq; s++)
    fprintf(fp, "  Name: %-*.*s  Len:  %5d  Check:  %5d  Weight: %.4f\n",
            name_width, name_width,
            ainfo->sqinfo[s].name,
            ainfo->alen,
            GCGchecksum(aseqs[s], ainfo->alen),
            ainfo->wgt[s]);
  fprintf(fp, "\n");
  fprintf(fp, "//\n");
  fprintf(fp, "\n");

  /*****************************************************
   * Sequence blocks; at least one block is always written
   *****************************************************/
  do
    {
      more_blocks = 0;
      for (s = 0; s < ainfo->nseq; s++)
        {
          fprintf(fp, "%-*.*s  ", name_width, name_width, 
                  ainfo->sqinfo[s].name);

                                /* write up to one line's worth of
                                   symbols straight from the sequence */
          for (n = 0; n < LINE_SYMS && cursor[s][n] != '\0'; n++)
            {
              if (isgap(cursor[s][n])) fputc('.', fp);
              else fputc(cursor[s][n], fp);
              if ((n + 1) % GROUP_SYMS == 0) fputc(' ', fp);
            }
          fputc('\n', fp);

                                /* a full line followed by more symbols
                                   means another block is needed */
          if (n == LINE_SYMS && cursor[s][LINE_SYMS] != '\0')
            more_blocks = 1;

          cursor[s] += n;
        }
                                /* blank line closes the block */
      fputc('\n', fp);
    }
  while (more_blocks);

  free(cursor);
}
Exemple #27
0
/* Function: ViterbiAlignAlignment()
 * 
 * Purpose:  Align a multiple sequence alignment to an HMM without
 *           altering the multiple alignment.
 *           
 * Args:     shmm   - HMM in integer log-odds score form
 *           aseq   - alignment, [0..nseq-1][0..alen-1]
 *           alen   - length of aligned sequences
 *           nseq   - number of aligned sequences
 *           ret_tr - RETURN: array of tracebacks. rpos field is
 *                    relative to aseq, not raw seq, similar to
 *                    Maxmodelmaker(); use DealignTrace() if you
 *                    want relative to raw sequence.
 *           ret_sc - RETURN: sum of log odds scores.
 *           
 * Return:   (void)
 *           ret_tr is alloced here. Individuals must be free'd by FreeTrace(),
 *           then tr itself free'd by free().
 */
void
ViterbiAlignAlignment(struct shmm_s *shmm, char **aseq, int alen, int nseq,
		      struct trace_s ***ret_tr, float *ret_sc)
{
  struct fvit_s **mx;           /* the viterbi calculation grid       */
  int    score;	                /* tmp variable for scores            */
  int    i;			/* counter for sequence position: 0,1..L */
  int    k;			/* counter for model position: 0,1..M */
  int    idx;			/* index for sequences                */
  struct fvit_s *thisrow;       /* ptr to current row of mx           */
  struct fvit_s *nextrow;       /* ptr to next row of mx              */
  int  **matocc;                /* [0..alen+1][0..nseq-1], 1 for MATCH*/
  struct trace_s **tr;          /* array of tracebacks to return      */
  int   *tpos;                  /* index for position in indiv traces */
  int    lastsub;		/* last state type in master trace    */

  /* A crucial extra component of this alignment algorithm:
   * at each matrix cell, we have to remember: for the best
   * path into the INSERT subcell, what state is each sequence in?
   * This is non-trivial because some gaps are assigned to
   * no states. When we calculate the score from an insert column,
   * where there are gaps we have to look up the previous state.
   *
   * Fortunately, we don't need to keep a full matrix of these,
   * or we'd be in serious memory problems. Use a rolling pointer
   * trick, keep two active rows "current" and "next".
   */
  char **cur_state;             /* [0..M+1][0..nseq-1]; MATCH, INSERT, or DELETE */ 
  char **nxt_state;             /* same, except keeps states for next row        */
  char **swap;                  /* used for swapping cur, nxt                    */

  /********************************************
   * Initial setup and allocations
   ********************************************/
				/* allocate the calculation matrix,
				   which is 0..alen+1 rows by 0..M+1 cols */
  mx        = (struct fvit_s **) MallocOrDie (sizeof(struct fvit_s *) * (alen+2));
  matocc    = (int  **)          MallocOrDie (sizeof(int *)           * (alen+2));
  cur_state = (char **)          MallocOrDie (sizeof(char *)          * (shmm->M+2));
  nxt_state = (char **)          MallocOrDie (sizeof(char *)          * (shmm->M+2));
  for (i = 0; i <= alen+1; i++)
    {
      mx[i]    = (struct fvit_s *) MallocOrDie (sizeof(struct fvit_s) * (shmm->M+2));
      matocc[i]= (int *)           MallocOrDie (sizeof(int)           * nseq);
    }
  for (k = 0; k <= shmm->M+1; k++)
    {
      cur_state[k] = (char *) MallocOrDie (sizeof(char) * nseq);
      nxt_state[k] = (char *) MallocOrDie (sizeof(char) * nseq);
    }

  /********************************************
   * Initialization
   ********************************************/
				/* initialize the first cell 0,0 */
  mx[0][0].score_m = 0;
  mx[0][0].score_d = -99999999;
  mx[0][0].score_i = -99999999;

  for (k = 0; k <= shmm->M+1; k++)
    for (idx = 0; idx < nseq; idx++)
      nxt_state[k][idx] = MATCH;

				/* initialize the top row */
  for (k = 1; k <= shmm->M+1; k++)
    {
      mx[0][k].score_m = -99999999;
      mx[0][k].score_i = -99999999;
    }

  /* Precalculate matocc (match occupancy). 
   * 1 if symbol in column for this seq, 0 if not. 
   * 1..alen, from 0..alen-1 alignments
   */
  for (idx = 0; idx < nseq; idx++)
    {
      matocc[0][idx] = matocc[alen+1][idx] = 1; /* dummies for BEGIN, END */
      for (i = 1; i <= alen; i++)
	matocc[i][idx] = isgap(aseq[idx][i-1]) ? 0 : 1;
    }

  /********************************************
   * Recursion: fill in the mx matrix
   ********************************************/
				/* Alignment is 0..alen-1, we index it 
				   here as 1..alen because of Viterbi matrix. */
  for (i = 0; i <= alen; i++)
    {
				/* get ptrs into current and next row. */
      thisrow = mx[i];
      nextrow = mx[i+1];
				/* initialize in the next row */
      nextrow[0].score_m = -99999999;
      nextrow[0].score_d = -99999999;

      swap = cur_state; cur_state = nxt_state; nxt_state = swap;

      for (k = 0; k <= shmm->M; k++)
	{ /* begin inner loop... this is where all the time is spent. */

				/* add in emission scores to the current cell. */
	  if (i > 0)
	    for (idx = 0; idx < nseq; idx++)
	      if (matocc[i][idx])
		{
		  thisrow[k].score_m += shmm->m_emit[aseq[idx][i-1] - 'A'][k];
		  thisrow[k].score_i += shmm->i_emit[aseq[idx][i-1] - 'A'][k];
		}
				/* initialize with transitions out of delete state */
				/* to delete */
	  thisrow[k+1].score_d = thisrow[k].score_d + shmm->t[9*k + Tdd] * nseq;
	  thisrow[k+1].tback_d = DELETE;
				/* to insert */
	  nextrow[k].score_i = thisrow[k].score_d;
	  nextrow[k].tback_i = DELETE;
	  for (idx = 0; idx < nseq; idx++) 
	    if (matocc[i+1][idx])
	      {
		nextrow[k].score_i += shmm->t[9*k + Tdi];
		nxt_state[k][idx]  = INSERT;
	      }
	    else
	      nxt_state[k][idx] = DELETE;
				/* to match */
	  nextrow[k+1].score_m = thisrow[k].score_d;
	  nextrow[k+1].tback_m = DELETE;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i+1][idx])
	      nextrow[k+1].score_m += shmm-> t[9*k + Tdm];
	    else
	      nextrow[k+1].score_m += shmm-> t[9*k + Tdd];

	  
				/* deal with transitions out of insert state */
				/* to delete state. */
	  score = thisrow[k].score_i;
	  for (idx = 0; idx < nseq; idx++)
	    switch (cur_state[k][idx]) {
	    case MATCH:  score += shmm->t[9*k + Tmd]; break;
	    case DELETE: score += shmm->t[9*k + Tdd]; break;
	    case INSERT: score += shmm->t[9*k + Tid]; break;
	    }
	  if (score > thisrow[k+1].score_d) 
	    {
	      thisrow[k+1].score_d = score;
	      thisrow[k+1].tback_d = INSERT;
	    }
				/* to insert state */
	  score = thisrow[k].score_i;
	  for (idx = 0; idx < nseq; idx++)
	    {
	      if (matocc[i+1][idx])
		switch (cur_state[k][idx]) {
		case MATCH:  score += shmm->t[9*k + Tmi]; break;
		case DELETE: score += shmm->t[9*k + Tdi]; break;
		case INSERT: score += shmm->t[9*k + Tii]; break;
		}
	    }
	  if (score > nextrow[k].score_i) 
	    {
	      nextrow[k].score_i = score;
	      nextrow[k].tback_i = INSERT;
	      for (idx = 0; idx < nseq; idx++)
		if (matocc[i+1][idx])
		  nxt_state[k][idx] = INSERT;
		else
		  nxt_state[k][idx] = cur_state[k][idx];
	    }
				/* to match state */
	  score = thisrow[k].score_i;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i+1][idx])
	      switch (cur_state[k][idx]) {
	      case MATCH:  score += shmm->t[9*k + Tmm]; break;
	      case DELETE: score += shmm->t[9*k + Tdm]; break;
	      case INSERT: score += shmm->t[9*k + Tim]; break;
	      }
	    else
	      switch (cur_state[k][idx]) {
	      case MATCH:  score += shmm->t[9*k + Tmd]; break;
	      case DELETE: score += shmm->t[9*k + Tdd]; break;
	      case INSERT: score += shmm->t[9*k + Tid]; break;
	      }
	  if (score > nextrow[k+1].score_m) 
	    {
	      nextrow[k+1].score_m = score;
	      nextrow[k+1].tback_m = INSERT;
	    }

	  /* Transitions out of match state.
	   */
				/* to delete */
	  score = thisrow[k].score_m;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i][idx])
	      score += shmm->t[9*k + Tmd];
	    else
	      score += shmm->t[9*k + Tdd];
	  if (score > thisrow[k+1].score_d)
	    {
	      thisrow[k+1].score_d = score;
	      thisrow[k+1].tback_d = MATCH;
	    }
				/* to insert */
	  score = thisrow[k].score_m;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i+1][idx])
	      {
		if (matocc[i][idx])
		  score += shmm->t[9*k + Tmi];
		else
		  score += shmm->t[9*k + Tdi];
	      }
	  if (score > nextrow[k].score_i)
	    {
	      nextrow[k].score_i = score;
	      nextrow[k].tback_i = MATCH;
	      for (idx = 0; idx < nseq; idx++)
		if (matocc[i+1][idx])
		  nxt_state[k][idx] = INSERT;
		else if (matocc[i][idx])
		  nxt_state[k][idx] = MATCH;
		else
		  nxt_state[k][idx] = DELETE;
	    }
				/* to match */
	  score = thisrow[k].score_m;
	  for (idx = 0; idx < nseq; idx++)
	    if (matocc[i][idx])
	      {
		if (matocc[i+1][idx])
		  score += shmm->t[9*k + Tmm];
		else
		  score += shmm->t[9*k + Tmd];
	      }
	    else
	      {
		if (matocc[i+1][idx])
		  score += shmm->t[9*k + Tdm];
		else
		  score += shmm->t[9*k + Tdd];
	      }
	  if (score > nextrow[k+1].score_m)
	    {
	      nextrow[k+1].score_m = score;
	      nextrow[k+1].tback_m = MATCH;
	    }

	} /* end loop over model positions k */

    } /* end loop over alignment positions i */

/*  PrintFragViterbiMatrix(mx, alen, shmm->M); */

  /* Fill stage finished.
   * mx now contains final score in mx[alen+1][M+1].
   * Trace back from there to get master alignment.
   */
  tr   = (struct trace_s **) MallocOrDie (sizeof(struct trace_s *) * nseq);
  tpos = (int *)             MallocOrDie (sizeof(int)              * nseq);
  for (idx = 0; idx < nseq; idx++)
    {
      AllocTrace(alen + shmm->M + 3, &(tr[idx]));
      tr[idx]->nodeidx[0]   = shmm->M+1;
      tr[idx]->statetype[0] = MATCH;
      tr[idx]->rpos[0]      = -1;
      tpos[idx]        = 1;
    }
  i      = alen+1;
  k      = shmm->M+1;
  lastsub= MATCH;
	       
  while (i != 0 || k != 0)
    {
      switch (lastsub) {
      case MATCH:  lastsub = mx[i][k].tback_m; i--; k--; break;
      case DELETE: lastsub = mx[i][k].tback_d;      k--; break;
      case INSERT: lastsub = mx[i][k].tback_i; i--;      break;
      default: Die("trace failed!");
      }

      switch (lastsub) {
      case MATCH:
	for (idx = 0; idx < nseq; idx++)
	  if (matocc[i][idx]) 
	    {
	      tr[idx]->nodeidx[tpos[idx]]   = k;
	      tr[idx]->statetype[tpos[idx]] = MATCH;
	      tr[idx]->rpos[tpos[idx]]      = i-1;
	      tpos[idx]++;
	    }
	  else
	    {
	      tr[idx]->nodeidx[tpos[idx]]   = k;
	      tr[idx]->statetype[tpos[idx]] = DELETE;
	      tr[idx]->rpos[tpos[idx]]      = -1;
	      tpos[idx]++;
	    }
	break;
      case INSERT:
	for (idx = 0; idx < nseq; idx++)
	  if (matocc[i][idx])
	    {
	      tr[idx]->nodeidx[tpos[idx]]   = k;
	      tr[idx]->statetype[tpos[idx]] = INSERT;
	      tr[idx]->rpos[tpos[idx]]      = i-1;
	      tpos[idx]++;
	    }
	break;
      case DELETE:
	for (idx = 0; idx < nseq; idx++)
	  {
	    tr[idx]->nodeidx[tpos[idx]]   = k;
	    tr[idx]->statetype[tpos[idx]] = DELETE;
	    tr[idx]->rpos[tpos[idx]]      = -1;
	    tpos[idx]++;
	  }
	break;
      default: Die("trace failed!");
      }	/* end switch across new subcell in traceback */
    } /* end traceback */

  for (idx = 0; idx < nseq; idx++)
    ReverseTrace(tr[idx], tpos[idx]);

  *ret_tr = tr;
  *ret_sc = (float) mx[alen+1][shmm->M+1].score_m / INTSCALE;

  Free2DArray(matocc, alen+2);
  Free2DArray(cur_state, shmm->M+2);
  Free2DArray(nxt_state, shmm->M+2);
  Free2DArray(mx, alen+2);
  free(tpos);
}
/* Function: homogenize_gapsym()
 * 
 * Purpose:  Rewrite a string in place so that every character
 *           recognized by isgap() becomes the same gap symbol.
 *           Non-gap characters are left as they are.
 *
 * Args:     s      - NUL-terminated string to modify
 *           gapsym - character stored at every gap position
 *
 * Returns:  (void)
 */
static void 
homogenize_gapsym(char *s, char gapsym)
{
  char *cursor = s;		/* walks the string up to its terminator */

  while (*cursor != '\0')
    {
      if (isgap(*cursor))
	*cursor = gapsym;
      cursor++;
    }
}
Exemple #29
0
/* Function: ReadSELEX()
 * Date:     SRE, Sun Jun  6 18:24:09 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open SELEX format
 *           alignment file. (SELEX is a single alignment format).
 *           Return the alignment, or NULL if we've already read the
 *           alignment or there's no alignment data in the file.
 *           
 * Limitations: SELEX is the only remaining multipass parser for
 *           alignment files. It cannot read from gzip or from stdin.
 *           It Die()'s here if you try. The reason for this is
 *           that SELEX allows space characters as gaps, so we don't
 *           know the borders of an alignment block until we've seen
 *           the whole block. I could rewrite to allow single-pass
 *           parsing (by storing the whole block in memory) but
 *           since SELEX is now legacy, why bother.
 *           
 *           Note that the interface is totally kludged: fastest
 *           possible adaptation of old ReadSELEX() to the new
 *           MSA interface.  
 *
 *           Malformed files are rejected with Die() before they can
 *           index outside the per-sequence arrays: a later block with
 *           more sequences than the first, more #=SQ header lines than
 *           sequences, a #=SS/#=SA line with no preceding sequence in
 *           its block, or a #=CS/#=RF line the first pass did not
 *           allocate for. Die() is relied on not to return, as the
 *           rest of this function already does.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA *  - an alignment object
 *                    caller responsible for an MSAFree()
 *           NULL if no alignment data. When the first read finds no
 *           line at all, squid_errno is also set to SQERR_NODATA.
 */
MSA *
ReadSELEX(MSAFILE *afp)
{
  MSA     *msa;                 /* RETURN: mult seq alignment   */
  FILE    *fp;                  /* ptr to opened seqfile        */
  char   **aseqs;               /* aligned seqs                 */
  int      num = 0;             /* number of seqs read          */
  char     buffer[LINEBUFLEN];  /* input buffer for lines       */
  char     bufcpy[LINEBUFLEN];  /* strtok'able copy of buffer   */
  struct block_struc {          /** alignment data for a block: */
    int lcol;                   /* furthest left aligned sym    */
    int rcol;                   /* furthest right aligned sym   */
  } *blocks = NULL;
  int      blocknum;            /* number of blocks in file     */
  char    *nptr;                /* ptr to start of name on line */
  char    *sptr;                /* ptr into sequence on line    */
  int      currnum;             /* num. seqs in given block     */
  int      currblock;           /* index for blocks             */
  int      i;                   /* loop counter                 */
  int      seqidx;              /* counter for seqs             */
  int      alen;                /* length of alignment          */
  int      warn_names;          /* becomes TRUE if names don't match between blocks */
  int      headnum;             /* seqidx in per-sequence header info */
  int      currlen;             /* alignment columns filled by previous blocks */
  int      count;               /* raw (ungapped) residue count for one seq */
  int      have_cs = 0;         /* 1 if first pass saw a #=CS line */
  int      have_rf = 0;         /* 1 if first pass saw a #=RF line */
  AINFO    base_ainfo, *ainfo;  /* hack: used to be passed ptr to AINFO */


  /* Convert from MSA interface to what old ReadSELEX() did:
   *     - copy our open fp, rather than opening file
   *     - verify that we're not reading a gzip or stdin
   */
  if (feof(afp->f)) return NULL;
  if (afp->do_gzip || afp->do_stdin)
    Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file"); 
  fp    = afp->f;
  ainfo = &base_ainfo;

  /***************************************************
   * First pass across file. 
   * Count seqs, get names, determine column info
   * Determine what sorts of info are active in this file.
   ***************************************************/

  InitAinfo(ainfo);
                                /* get first line of the block 
                                 * (non-comment, non-blank) */
  do
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
        { squid_errno = SQERR_NODATA; return NULL; }
      strcpy(bufcpy, buffer);
      if (*buffer == '#')
        {
          if      (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
          else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;
        }
    }
  while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
         (strchr(commentsyms, *nptr) != NULL));

  blocknum   = 0;
  warn_names = FALSE;
  while (!feof(fp))
    {
                                /* allocate for info about this block. */
      if (blocknum == 0)
        blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
      else 
        blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
      blocks[blocknum].lcol = LINEBUFLEN+1;
      blocks[blocknum].rcol = -1;
        
      currnum = 0;
      while (nptr != NULL)      /* becomes NULL when this block ends. */
        {
                                /* First block only: save names */
          if (blocknum == 0)
            {
              if (currnum == 0)
                ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
              else 
                ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));

              ainfo->sqinfo[currnum].flags = 0;
              SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
            }
          else                  /* in each additional block: check names */
            {
                                /* sqinfo[] only has num entries (sized by
                                 * the first block); stop before reading
                                 * past it instead of waiting for the
                                 * count check after the block. */
              if (currnum >= num)
                Die("Parse error in ReadSELEX(): too many sequences in block %d", blocknum+1);
              if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
                warn_names = TRUE;
            }
          currnum++;

                                /* check rcol, lcol */
          if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
            {
                                /* is this the furthest left we've
                                   seen word 2 in this block? */
              if (sptr - bufcpy < blocks[blocknum].lcol) 
                blocks[blocknum].lcol = sptr - bufcpy;
                                /* look for right side in buffer */
              for (sptr = buffer + strlen(buffer) - 1;  
                   strchr(WHITESPACE, *sptr) != NULL;
                   sptr --)
                /* do nothing */ ;
              if (sptr - buffer > blocks[blocknum].rcol)
                blocks[blocknum].rcol = sptr - buffer;
            }

                                /* get the next line; blank line means end of block */
          do
            {
              if (fgets(buffer, LINEBUFLEN, fp) == NULL) 
                { nptr = NULL; break; }
              strcpy(bufcpy, buffer);

                                /* per-sequence markup belongs to the
                                 * sequence line just counted */
              if      (strncmp(buffer, "#=SS",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
              else if (strncmp(buffer, "#=SA",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
              else if (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
              else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;

              if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) 
                break;
            } while (strchr(commentsyms, *nptr) != NULL);
        }


                                /* check that number of sequences matches expected */
      if (blocknum == 0)
        num = currnum;
      else if (currnum != num)
        Die("Parse error in ReadSELEX()");
      blocknum++;

                                /* get first line of next block 
                                 * (non-comment, non-blank) */
      do
        {
          if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
          strcpy(bufcpy, buffer);
        }
      while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
             (strchr(commentsyms, *nptr) != NULL));
    }

  
  /***************************************************
   * Get ready for second pass:
   *   figure out the length of the alignment
   *   malloc space
   *   rewind the file
   ***************************************************/

  alen = 0;
  for (currblock = 0; currblock < blocknum; currblock++)
    alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

  rewind(fp);

  /* allocations. we can't use AllocateAlignment because of
   * the way we already used ainfo->sqinfo.
   */
  aseqs     = (char **) MallocOrDie (num * sizeof(char *));
  if (have_cs) 
    ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
  if (have_rf) 
    ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));

  
  
  for (i = 0; i < num; i++)
    {
      aseqs[i]     = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SS)
        ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SA)
        ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
    }
  
  ainfo->alen = alen;
  ainfo->nseq = num; 
  ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);
  FSet(ainfo->wgt, num, 1.0);

  /***************************************************
   * Second pass across file. Parse header; assemble sequences
   ***************************************************/
  /* We've now made a complete first pass over the file. We know how
   * many blocks it contains, we know the number of seqs in the first
   * block, and we know every block has the same number of seqs;
   * so we can be a bit more cavalier about error-checking as we
   * make the second pass. Array indexes taken from the file are
   * still bounds-checked, though.
   */

  /* Look for header
   */
  headnum = 0;
  for (;;)
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
        Die("Parse error in ReadSELEX()");
      strcpy(bufcpy, buffer);
      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */

      if (strcmp(nptr, "#=AU") == 0  && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->au = Strdup(sptr);
      else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->name = Strdup(sptr);
      else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->acc  = Strdup(sptr);
      else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
        ainfo->desc = Strdup(sptr);
      else if (strcmp(nptr, "#=GA") == 0)
        {
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
            Die("Parse error in #=GA line in ReadSELEX()");
          ainfo->ga1 = atof(sptr);

          if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
            Die("Parse error in #=GA line in ReadSELEX()");
          ainfo->ga2 = atof(sptr);

          ainfo->flags |= AINFO_GA;
        }
      else if (strcmp(nptr, "#=TC") == 0)
        {
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
            Die("Parse error in #=TC line in ReadSELEX()");
          ainfo->tc1 = atof(sptr);

          if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
            Die("Parse error in #=TC line in ReadSELEX()");
          ainfo->tc2 = atof(sptr);

          ainfo->flags |= AINFO_TC;
        }
      else if (strcmp(nptr, "#=NC") == 0)
        {
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
            Die("Parse error in #=NC line in ReadSELEX()");
          ainfo->nc1 = atof(sptr);

          if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
            Die("Parse error in #=NC line in ReadSELEX()");
          ainfo->nc2 = atof(sptr);

          ainfo->flags |= AINFO_NC;
        }
      else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */
        {
                                /* sqinfo[] and wgt[] hold num entries;
                                 * an extra #=SQ line must not write
                                 * past them. */
          if (headnum >= num)
            Die("Parse error in #=SQ line in ReadSELEX(): more #=SQ lines than sequences");

                                /* first field is the name */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX()");
          if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;

                                /* second field is the weight */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX()");
          if (!IsReal(sptr)) 
            Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
          ainfo->wgt[headnum] = atof(sptr);

                                /* third field is database source id */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);

                                /* fourth field is database accession number */
          if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);

                                /* fifth field is start..stop::olen */
          if ((sptr = strtok(NULL, ".:")) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);

          if ((sptr = strtok(NULL, ".:")) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);
          
          if ((sptr = strtok(NULL, ":\t ")) == NULL)
            Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
          SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);

                                /* rest of line is optional description */
          if ((sptr = strtok(NULL, "\n")) != NULL)
            SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);
          
          headnum++;
        }
      else if (strcmp(nptr, "#=CS") == 0) break;
      else if (strcmp(nptr, "#=RF") == 0) break;
      else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
    }
  

  currlen = 0;
  for (currblock = 0 ; currblock < blocknum; currblock++)
    {
                                /* parse the block */
      seqidx = 0;
      while (nptr != NULL)
        {
                                /* Consensus structure */
          if (strcmp(nptr, "#=CS") == 0)
            {
                                /* cs is only allocated when the first
                                 * pass saw a #=CS line; like the
                                 * termination code below, this relies
                                 * on it being NULL otherwise. */
              if (ainfo->cs == NULL)
                Die("Parse error in #=CS line in ReadSELEX()");
              if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1, 
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=CS line in ReadSELEX()");
            }

                                /* Reference coordinates */
          else if (strcmp(nptr, "#=RF") == 0)
            {
                                /* same allocation rule as for cs */
              if (ainfo->rf == NULL)
                Die("Parse error in #=RF line in ReadSELEX()");
              if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1, 
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=RF line in ReadSELEX()");
            }
                                /* Individual secondary structure */
          else if (strcmp(nptr, "#=SS") == 0)
            {
                                /* annotates the preceding sequence, so
                                 * there has to be one in this block */
              if (seqidx == 0)
                Die("Parse error in #=SS line in ReadSELEX()");
              if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, 
                                        blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=SS line in ReadSELEX()");
            }

                                /* Side chain % surface accessibility code */
          else if (strcmp(nptr, "#=SA") == 0)
            {
                                /* annotates the preceding sequence, so
                                 * there has to be one in this block */
              if (seqidx == 0)
                Die("Parse error in #=SA line in ReadSELEX()");
              if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
                                        buffer, blocks[currblock].lcol, 
                                        blocks[currblock].rcol, (char) '.'))
                Die("Parse error in #=SA line in ReadSELEX()");
            }
                                /* Aligned sequence; avoid unparsed machine comments */
          else if (strncmp(nptr, "#=", 2) != 0)
            {
                                /* aseqs[] holds num rows */
              if (seqidx >= num)
                Die("Parse error in alignment line in ReadSELEX()");
              if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1, 
                                        buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
                Die("Parse error in alignment line in ReadSELEX()");
              seqidx++;
            }

                                /* get next line */
          for (;;)
            {
              nptr = NULL;
              if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* EOF */
              strcpy(bufcpy, buffer);
              if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
              if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
              if (strchr(commentsyms, *nptr) == NULL) break; /* data */
            }
        } /* end of a block */

      currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

                                /* get line 1 of next block */
      for (;;)
        {
          if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
          strcpy(bufcpy, buffer);
          if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
          if (strncmp(buffer, "#=", 2) == 0)       break; /* machine comment */
          if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
        }
    } /* end of the file */

  /* Lengths in sqinfo are for raw sequence (ungapped),
   * and SS, SA are 0..rlen-1 not 0..alen-1.
   * Only the seqs with structures come out of here with lengths set.
   */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      int apos, rpos;
                                /* secondary structures */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
        {
                                /* compact in place: keep only the columns
                                 * where this sequence has a residue */
          for (apos = rpos = 0; apos < alen; apos++)
            if (! isgap(aseqs[seqidx][apos]))
              {
                ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
                rpos++;
              }
          ainfo->sqinfo[seqidx].ss[rpos] = '\0';
        }
                                /* Surface accessibility */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
        {
          for (apos = rpos = 0; apos < alen; apos++)
            if (! isgap(aseqs[seqidx][apos]))
              {
                ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
                rpos++;
              }
          ainfo->sqinfo[seqidx].sa[rpos] = '\0';
        }
    }

                                /* NULL-terminate all the strings */
  if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
  if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
  for (seqidx = 0; seqidx < num; seqidx++)
    aseqs[seqidx][alen]            = '\0';
  
                                /* find raw sequence lengths for sqinfo */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      count = 0;
      for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
        if (!isgap(*sptr)) count++;
      ainfo->sqinfo[seqidx].len    = count;
      ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
    }


  /***************************************************
   * Garbage collection and return
   ***************************************************/
  free(blocks);
  if (warn_names) 
    Warn("sequences may be in different orders in blocks of %s?", afp->fname);

  /* Convert back to MSA structure. (Wasteful kludge.)
   */
  msa = MSAFromAINFO(aseqs, ainfo);
  MSAVerifyParse(msa);
  FreeAlignment(aseqs, ainfo);
  return msa;
}
/* Function: WriteMSF()
 * Date:     SRE, Mon May 31 11:25:18 1999 [St. Louis]
 *
 * Purpose:  Write an alignment in MSF format to an open file.
 *
 * Args:     fp    - file that's open for writing.
 *           msa   - alignment to write. 
 *
 *                   msa->type must resolve to kRNA, kDNA, or kAmino.
 *                   If it is kOtherSeq on entry, it is overwritten in
 *                   the caller's msa with the result of
 *                   GuessAlignmentSeqtype(); if it is still kOtherSeq
 *                   after that, or is any other value, a fatal error
 *                   is generated.
 *
 * Returns:  (void)
 */
void
WriteMSF(FILE *fp, MSA *msa)
{
  time_t now;			/* current time as a time_t */
  struct tm *now_tm;		/* broken-down local time   */
  char   date[64];		/* today's date in GCG's format "October 3, 1996 15:57" */
  char **gcg_aseq;              /* aligned sequences with gaps converted to GCG format */
  char **gcg_sqname;		/* sequence names with GCG-valid character sets */
  int    idx;			/* counter for sequences         */
  char  *s;                     /* pointer into sqname or seq    */
  int    len;			/* tmp variable for name lengths */
  int    namelen;		/* maximum name length used      */
  int    pos;			/* position counter              */
  char   buffer[51];		/* buffer for writing seq        */
  int    i;			/* another position counter */
  char   typechar;		/* 'N' or 'P' for the checksum line */

  /*****************************************************************
   * Make copies of sequence names and sequences.
   *   GCG recommends that name characters should only contain
   *   alphanumeric characters, -, or _
   *   Some GCG and GCG-compatible software is sensitive to this.
   *   We silently convert all other characters to '_'.
   *   
   *   For sequences, GCG allows only ~ and . for gaps.
   *   Otherwise, everything is interpreted as a residue;
   *   so squid's IUPAC-restricted chars are fine. ~ means
   *   an external gap. . means an internal gap.
   *****************************************************************/ 
   
				/* make copies that we can edit */
  gcg_aseq   = MallocOrDie(sizeof(char *) * msa->nseq);
  gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
  for (idx = 0; idx < msa->nseq; idx++)
    {
      gcg_aseq[idx]   = sre_strdup(msa->aseq[idx],   msa->alen);
      gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
    }
				/* alter names as needed  */
  for (idx = 0; idx < msa->nseq; idx++)
    for (s = gcg_sqname[idx]; *s != '\0'; s++)
      if (! isalnum((int) *s) && *s != '-' && *s != '_')
	*s = '_';
				/* alter gap chars in seq  */
  for (idx = 0; idx < msa->nseq; idx++)
    {
				/* leading run of gaps is external */
      for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
	*s = '~';
				/* provisionally mark the rest internal */
      for (; *s != '\0'; s++)
	if (isgap(*s)) *s = '.';
				/* trailing run of gaps is external too;
				 * '.' written above still satisfies isgap() */
      for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
	gcg_aseq[idx][pos] = '~';
    }
				/* calculate max namelen used */
  namelen = 0;
  for (idx = 0; idx < msa->nseq; idx++)
    if ((len = strlen(msa->sqname[idx])) > namelen) 
      namelen = len;

  /*****************************************************
   * Write the MSF header
   *****************************************************/
				/* required file type line */
  if (msa->type == kOtherSeq)
    msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);

  if      (msa->type == kRNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kDNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kOtherSeq) 
    Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n"); 
  else    
    Die("Invalid sequence type %d in WriteMSF()\n", msa->type); 

				/* The "Type:" field has to agree with the
				 * file type line above: both nucleic acid
				 * types are 'N', not just RNA. */
  typechar = (msa->type == kRNA || msa->type == kDNA) ? 'N' : 'P';

				/* free text comments */
  if (msa->ncomment > 0)
    {
      for (idx = 0; idx < msa->ncomment; idx++)
	fprintf(fp, "%s\n", msa->comment[idx]);
      fprintf(fp, "\n");
    }
				/* required checksum line */
  now    = time(NULL);
  now_tm = localtime(&now);	/* may be NULL; strftime() must not see that */
  if (now_tm == NULL || strftime(date, 64, "%B %d, %Y %H:%M", now_tm) == 0)
    Die("What time is it on earth? strftime() failed in WriteMSF().\n");
  fprintf(fp, " %s  MSF: %d  Type: %c  %s  Check: %d  ..\n", 
	  msa->name != NULL ? msa->name : "squid.msf",
	  msa->alen,
	  typechar,
	  date,
	  GCGMultchecksum(gcg_aseq, msa->nseq));
  fprintf(fp, "\n");

  /*****************************************************
   * Names/weights section
   *****************************************************/

  for (idx = 0; idx < msa->nseq; idx++)
    {
      fprintf(fp, " Name: %-*.*s  Len:  %5d  Check: %4d  Weight: %.2f\n",
	      namelen, namelen,
	      gcg_sqname[idx],
	      msa->alen,
	      GCGchecksum(gcg_aseq[idx], msa->alen),
	      msa->wgt[idx]);
    }
  fprintf(fp, "\n");
  fprintf(fp, "//\n");

  /*****************************************************
   * Write the sequences
   *****************************************************/

  for (pos = 0; pos < msa->alen; pos += 50)
    {
      fprintf(fp, "\n");	/* Blank line between sequence blocks */

				/* Coordinate line */
      len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
      if (len > 10)
				/* pad so the end coordinate is right-justified
				 * over the last residue, allowing for the
				 * space inserted every 10 columns */
	fprintf(fp, "%*s  %-6d%*s%6d\n", namelen, "", 
		pos+1,
		len + ((len-1)/10) - 12, "",
		pos + len);
      else
	fprintf(fp, "%*s  %-6d\n", namelen, "", pos+1);

      for (idx = 0; idx < msa->nseq; idx++)
	{
	  fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
				/* get next line's worth of 50 from seq */
	  strncpy(buffer, gcg_aseq[idx] + pos, 50);
	  buffer[50] = '\0';
				/* draw the sequence line */
	  for (i = 0; i < len; i++)
	    {
	      if (! (i % 10)) fputc(' ', fp);
	      fputc(buffer[i], fp);
	    }
	  fputc('\n', fp);
	}
    }

  Free2DArray((void **) gcg_aseq,   msa->nseq);
  Free2DArray((void **) gcg_sqname, msa->nseq);
  return;
}