コード例 #1
0
/* Function: P7ReadNullModel()
 *
 * Purpose:  Read a null (background) model from a text file.
 *           The file is consumed as a sequence of words:
 *             1. an alphabet keyword, compared against "NUCLEIC" and
 *                "AMINO" after being passed through s2upper();
 *             2. Alphabet_size floating point values, stored in null[];
 *             3. one more floating point value, stored in *ret_p1.
 *
 * Args:     rndfile - name of the null model file to open
 *           null    - RETURN: filled with Alphabet_size values; caller
 *                     provides the storage
 *           ret_p1  - RETURN: the final value read from the file
 *
 * Returns:  (void)
 *           Calls Die() if the file cannot be opened, if the alphabet in
 *           the file conflicts with an already-set Alphabet_type, or if
 *           any expected word is missing or unrecognized.
 */
void
P7ReadNullModel(char *rndfile, float *null, float *ret_p1)
{
  FILE *nullfp;
  char *word;
  int   sym;
  int   filetype = 0;

  nullfp = fopen(rndfile, "r");
  if (nullfp == NULL)
    Die("Failed to open null model file %s\n", rndfile);

  /* First word names the alphabet the model was built for. */
  word = Getword(nullfp, sqdARG_STRING);
  if (word == NULL)
    goto FAILURE;
  s2upper(word);
  if (strcmp(word, "NUCLEIC") == 0)
    filetype = hmmNUCLEIC;
  else if (strcmp(word, "AMINO") == 0)
    filetype = hmmAMINO;
  else
    goto FAILURE;

  /* Adopt the file's alphabet if none is set yet; otherwise the two
   * must agree.
   */
  if (Alphabet_type == 0)
    {
      SetAlphabet(filetype);
    }
  else if (Alphabet_type != filetype)
    {
      Die("Alphabet type conflict; null model in %s is inappropriate\n", rndfile);
    }

  /* One value per alphabet symbol... */
  sym = 0;
  while (sym < Alphabet_size)
    {
      word = Getword(nullfp, sqdARG_FLOAT);
      if (word == NULL)
        goto FAILURE;
      null[sym] = atof(word);
      sym++;
    }

  /* ...followed by the trailing p1 value. */
  word = Getword(nullfp, sqdARG_FLOAT);
  if (word == NULL)
    goto FAILURE;
  *ret_p1 = atof(word);

  fclose(nullfp);
  return;

FAILURE:
  /* Shared exit for every parse failure: release the stream, then abort. */
  fclose(nullfp);
  Die("%s is not in HMMER null model file format", rndfile);
}
コード例 #2
0
/* prior_getword()
 *
 * Fetch the next word of the given type from a prior file, and Die()
 * instead of handing a NULL pointer back to the parser when Getword()
 * has nothing to return.
 */
static char *
prior_getword(FILE *fp, int type, char *prifile)
{
  char *word = Getword(fp, type);

  if (word == NULL)
    Die("%s is not in HMMER prior file format", prifile);
  return word;
}

/* Function: P7ReadPrior()
 * 
 * Purpose:  Input a prior from disk file.
 *
 *           The file is read word by word, in this order:
 *             - strategy keyword (only "DIRICHLET" is accepted)
 *             - alphabet keyword ("AMINO" or "NUCLEIC"); must agree
 *               with the current Alphabet_type
 *             - transition prior:  count, then per component a mixture
 *               coefficient followed by 7 terms
 *             - match emission prior:  count, then per component a
 *               mixture coefficient followed by Alphabet_size terms
 *             - insert emission prior: same layout as match emissions
 *           Keywords are passed through s2upper() before comparison.
 *
 * Args:     prifile - name of the prior file to open
 *
 * Returns:  the prior structure obtained from P7AllocPrior(), filled in.
 *           Calls Die() if the file cannot be opened, if a word is
 *           missing, or if a keyword or component count is rejected.
 */
struct p7prior_s *
P7ReadPrior(char *prifile) 
{
  FILE             *fp;
  struct p7prior_s *pri;
  char             *sptr;
  int               q, x;

  if ((fp = fopen(prifile, "r")) == NULL)
    Die("Failed to open HMMER prior file %s\n", prifile);
  pri = P7AllocPrior();

  /* First entry is the strategy: 
   * Only standard Dirichlet prior (simple or mixture) is supported in Plan7 so far
   */
  sptr = prior_getword(fp, sqdARG_STRING, prifile);
  s2upper(sptr);
  if      (strcmp(sptr, "DIRICHLET") == 0) pri->strategy = PRI_DCHLET;
  else Die("No such prior strategy %s; failed to parse file %s", sptr, prifile);

  /* Second entry is the alphabet type:
   * Amino or Nucleic
   */
  sptr = prior_getword(fp, sqdARG_STRING, prifile);
  s2upper(sptr);
  if (strcmp(sptr, "AMINO") == 0)
    { 
      if (Alphabet_type != hmmAMINO)
        Die("HMM and/or sequences are DNA/RNA; can't use protein prior %s", prifile);
    }
  else if (strcmp(sptr, "NUCLEIC") == 0)
    {
      if (Alphabet_type != hmmNUCLEIC)
        Die("HMM and/or sequences are protein; can't use DNA/RNA prior %s", prifile);
    }
  else 
    Die("Alphabet \"%s\" in prior file %s isn't valid.", sptr, prifile);

  /* State transition priors:
   * # of mixtures.
   * then for each mixture:
   *    prior P(q)
   *    7 Dirichlet terms (presumably Tmm, Tmi, Tmd, Tim, Tii, Tdm, Tdd --
   *    confirm against the p7prior_s definition)
   *
   * NOTE(review): the message for the lower bound says "at least one",
   * but the test only rejects negative counts; left as-is so that files
   * accepted today are still accepted. Same for mnum and inum below.
   */
  pri->tnum = atoi(prior_getword(fp, sqdARG_INT, prifile));
  if (pri->tnum < 0)
    Die("%d is bad; need at least one state transition mixture component", pri->tnum);
  if (pri->tnum > MAXDCHLET)
    Die("%d is bad, too many transition components (MAXDCHLET = %d)\n", pri->tnum, MAXDCHLET);
  for (q = 0; q < pri->tnum; q++)
    {
      pri->tq[q]    = (float) atof(prior_getword(fp, sqdARG_FLOAT, prifile));
      for (x = 0; x < 7; x++) 
        pri->t[q][x] = (float) atof(prior_getword(fp, sqdARG_FLOAT, prifile));
    }

  /* Match emission priors:
   * # of mixtures.
   * then for each mixture:
   *    prior P(q)
   *    Dirichlet terms for Alphabet_size symbols in Alphabet
   */
  pri->mnum = atoi(prior_getword(fp, sqdARG_INT, prifile));
  if (pri->mnum < 0)
    Die("%d is bad; need at least one match emission mixture component", pri->mnum);
  if (pri->mnum > MAXDCHLET)
    Die("%d is bad; too many match components (MAXDCHLET = %d)\n", pri->mnum, MAXDCHLET);

  for (q = 0; q < pri->mnum; q++)
    {
      pri->mq[q] = (float) atof(prior_getword(fp, sqdARG_FLOAT, prifile));
      for (x = 0; x < Alphabet_size; x++) 
        pri->m[q][x] = (float) atof(prior_getword(fp, sqdARG_FLOAT, prifile));
    }
  
  /* Insert emission priors:
   * # of mixtures.
   * then for each mixture component:
   *    prior P(q)
   *    Dirichlet terms for Alphabet_size symbols in Alphabet
   */
  pri->inum = atoi(prior_getword(fp, sqdARG_INT, prifile));
  if (pri->inum < 0)
    Die("%d is bad; need at least one insert emission mixture component", pri->inum);
  if (pri->inum > MAXDCHLET)
    Die("%d is bad; too many insert components (MAXDCHLET = %d)\n", pri->inum,  MAXDCHLET);
  for (q = 0; q < pri->inum; q++)
    {
      pri->iq[q]  = (float) atof(prior_getword(fp, sqdARG_FLOAT, prifile));
      for (x = 0; x < Alphabet_size; x++) 
        pri->i[q][x] = (float) atof(prior_getword(fp, sqdARG_FLOAT, prifile));
    }

  fclose(fp);
  return pri;
}
コード例 #3
0
/* Function: include_alignment()
 * Date:     SRE, Sun Jul  5 15:25:13 1998 [St. Louis]
 *
 * Purpose:  Read a multiple alignment from a file, derive one trace
 *           per aligned sequence relative to the HMM, and append the
 *           new sequences and traces to the caller's existing arrays.
 *           The master trace comes from MasterTraceFromMap() when
 *           do_mapped is TRUE (after the alignment checksum has been
 *           compared with hmm->checksum), and from
 *           P7ViterbiAlignAlignment() otherwise.
 *
 * Args:     seqfile   - name of alignment file
 *           hmm       - model to align to
 *           do_mapped - TRUE to use the HMM's alignment map
 *           rsq       - RETURN: array of raw sequences, extended
 *           dsq       - RETURN: array of digitized sequences, extended
 *           sqinfo    - RETURN: array of SQINFO, extended
 *           tr        - RETURN: array of traces, extended
 *           nseq      - RETURN: sequence count, increased by the number
 *                       of sequences in the alignment
 *
 * Returns:  (void)
 *           rsq, dsq and sqinfo are resized through ReallocOrDie(); tr is
 *           replaced by the result of MergeTraceArrays(). Any failure to
 *           open, identify or read the file, or a checksum mismatch in
 *           mapped mode, is reported through ajFatal().
 */
void
include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
                  char ***rsq, char ***dsq, SQINFO **sqinfo,
                  struct p7trace_s ***tr, int *nseq)
{
  AINFO              ainfo;     /* info that goes with aseq            */
  char             **aseq;      /* aligned seqs                        */
  char             **rawseqs;   /* dealigned copies of aseq            */
  char             **digseqs;   /* digitized copies of aseq            */
  struct p7trace_s  *master;    /* master trace                        */
  struct p7trace_s **newtr;     /* individual traces for aseq          */
  int                format;    /* format of alignment file            */
  int                nold;      /* # of seqs held before this call     */
  int                i;         /* counter over aligned seqs           */

  /* Work out the file format. A missing file gets its own message
   * first; every failure ends with the generic format message.
   */
  if (! SeqfileFormat(seqfile, &format, NULL))
    {
      if (squid_errno == SQERR_NOFILE)
        ajFatal("Alignment file %s could not be opened for reading", seqfile);
      ajFatal("Failed to determine format of alignment file %s", seqfile);
    }

  /* Read the alignment and pass every row through s2upper(). */
  if (! ReadAlignment(seqfile, format, &aseq, &ainfo))
    ajFatal("Failed to read aligned sequence file %s", seqfile);
  for (i = 0; i < ainfo.nseq; i++)
    s2upper(aseq[i]);

  /* Master trace: from the HMM's map (checksum verified first), or by
   * aligning the alignment to the model.
   */
  if (do_mapped)
    {
      if (GCGMultchecksum(aseq, ainfo.nseq) != hmm->checksum)
        ajFatal("The checksums for alignment file %s and the HMM alignment map don't match.", 
                seqfile);
      master = MasterTraceFromMap(hmm->map, hmm->M, ainfo.alen);
    }
  else
    {
      master = P7ViterbiAlignAlignment(aseq, &ainfo, hmm);
    }

  /* Expand the master trace to one trace per sequence and append them
   * to the caller's traces.
   */
  nold = *nseq;
  ImposeMasterTrace(aseq, ainfo.nseq, master, &newtr);
  *tr = MergeTraceArrays(*tr, nold, newtr, ainfo.nseq);

  /* Append raw sequences; only the pointer array from DealignAseqs()
   * is freed, the strings themselves now belong to *rsq.
   */
  *rsq = ReallocOrDie((*rsq), sizeof(char *) * (nold + ainfo.nseq));
  DealignAseqs(aseq, ainfo.nseq, &rawseqs);
  for (i = 0; i < ainfo.nseq; i++)
    (*rsq)[nold + i] = rawseqs[i];
  free(rawseqs);

  /* Append digitized sequences the same way. */
  *dsq = ReallocOrDie((*dsq), sizeof(char *) * (nold + ainfo.nseq));
  DigitizeAlignment(aseq, &ainfo, &digseqs);
  for (i = 0; i < ainfo.nseq; i++)
    (*dsq)[nold + i] = digseqs[i];
  free(digseqs);

  /* Append copies of the per-sequence annotation. */
  *sqinfo = ReallocOrDie((*sqinfo), sizeof(SQINFO) * (nold + ainfo.nseq));
  for (i = 0; i < ainfo.nseq; i++)
    SeqinfoCopy(&((*sqinfo)[nold + i]), &(ainfo.sqinfo[i]));

  *nseq = nold + ainfo.nseq;

  /* Cleanup */
  P7FreeTrace(master);
  FreeAlignment(aseq, &ainfo);
  return;
}
コード例 #4
0
ファイル: sqio.c プロジェクト: obbila/CustomWise
/* Function: WriteSeq()
 *
 * Purpose:  Write one sequence to an open stream in the requested
 *           output format. A format-specific header is written first;
 *           the residues are then emitted by a shared line formatter
 *           driven by four layout settings (residues per line, optional
 *           coordinate prefix, leading indent, and spacing interval),
 *           with a format-specific terminator appended to the final
 *           line.
 *
 * Args:     outf    - open stream to write to
 *           outform - format code (kGenBank, kPIR, kEMBL, ...);
 *                     unrecognized codes are written like kPearson
 *           seq     - NUL-terminated sequence. For kZuker it is passed
 *                     to s2upper(), so the caller's buffer is modified.
 *           sqinfo  - annotation for the sequence; optional fields are
 *                     only read when their SQINFO_* flag is set
 *
 * Returns:  number of sequence lines written by the shared formatter.
 *           kSelex and kRaw write a single line and return 1.
 *           kClustal and kMSF are refused with a warning and also
 *           return 1.
 *
 * Notes:    The sequence length used throughout is sqinfo->len when
 *           SQINFO_LEN is set, else strlen(seq).
 *           The terminator is only written together with the last
 *           sequence line, so a zero-length sequence produces a header
 *           with no terminator.
 */
int
WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo)
{
  int   numline = 0;            /* TRUE to prefix lines w/ coords     */
  int   lines = 0, spacer = 0, width  = 50, tab = 0;
  int   i, j, l, l1, ibase;
  char  endstr[10];             /* terminator for the last line       */
  char  s[100];                 /* buffer for sequence  */
  char  ss[100];                /* buffer for structure */
  int   checksum = 0;
  int   seqlen;   
  int   which_case;    /* 0 = do nothing. 1 = upper case. 2 = lower case */
  int   dostruc;                /* TRUE to print structure lines*/

  which_case = 0;
  dostruc    = FALSE;           
  seqlen     = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq);

                                /* intercept Selex-format requests - SRE */
  if (outform == kSelex) {
    fprintf(outf, "%10s %s\n", sqinfo->name, seq);
    return 1;
  }

  if (outform == kClustal || outform == kMSF) {
    Warn("Tried to write Clustal or MSF with WriteSeq() -- bad, bad.");
    return 1;
  }

  strcpy( endstr,"");
  l1 = 0;

  /* 10Nov91: write this out in all possible formats: */
  checksum = GCGchecksum(seq, seqlen);

  switch (outform) {

    case kUnknown:    /* no header, just sequence */
      strcpy(endstr,"\n"); /* end w/ extra blank line */
      break;

    case kGenBank:
      fprintf(outf,"LOCUS       %s       %d bp\n", 
              (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name,
              seqlen);
      fprintf(outf,"DEFINITION  %s\n", 
              (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
      fprintf(outf,"ACCESSION   %s\n", 
              (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
      fprintf(outf,"ORIGIN      \n");
      spacer = 11;
      numline = 1;
      strcpy(endstr, "\n//");
      break;

    case kGCGdata:
      fprintf(outf, ">>>>%s  9/95  ASCII  Len: %d\n", sqinfo->name, seqlen);
      fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
      break;

    case kPIR:
      fprintf(outf, "ENTRY          %s\n", 
              (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
      fprintf(outf, "TITLE          %s\n", 
              (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
      fprintf(outf, "ACCESSION      %s\n",
              (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
      /* Report seqlen, not sqinfo->len: the latter is only meaningful
       * when SQINFO_LEN is set, and seqlen already accounts for that.
       */
      fprintf(outf, "SUMMARY                                #Length %d  #Checksum  %d\n",
              seqlen, checksum);
      fprintf(outf, "SEQUENCE\n");
      fprintf(outf, "                  5        10        15        20        25        30\n");
      spacer  = 2;              /* spaces after every residue */
      numline = 1;              /* number lines w/ coords     */
      width   = 30;             /* 30 aa per line             */
      strcpy(endstr, "\n///");
      break;

    case kSquid:
      fprintf(outf, "NAM  %s\n", sqinfo->name);
      if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN))
        fprintf(outf, "SRC  %s %s %d..%d::%d\n",
                (sqinfo->flags & SQINFO_ID)    ? sqinfo->id     : "-",
                (sqinfo->flags & SQINFO_ACC)   ? sqinfo->acc    : "-",
                (sqinfo->flags & SQINFO_START) ? sqinfo->start  : 0,
                (sqinfo->flags & SQINFO_STOP)  ? sqinfo->stop   : 0,
                (sqinfo->flags & SQINFO_OLEN)  ? sqinfo->olen   : 0);
      if (sqinfo->flags & SQINFO_DESC)
        fprintf(outf, "DES  %s\n", sqinfo->desc);
      if (sqinfo->flags & SQINFO_SS)
        {
          fprintf(outf, "SEQ  +SS\n");
          dostruc = TRUE;       /* print structure lines too */
        }
      else
        fprintf(outf, "SEQ\n");
      numline = 1;                /* number seq lines w/ coords  */
      strcpy(endstr, "\n++");
      break;

    case kEMBL:
      fprintf(outf,"ID   %s\n",
              (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
      fprintf(outf,"AC   %s\n",
              (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
      fprintf(outf,"DE   %s\n", 
              (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
      fprintf(outf,"SQ             %d BP\n", seqlen);
      strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
      tab = 5;     /** added 31jan91 */
      spacer = 11; /** added 31jan91 */
      break;

    case kGCG:
      fprintf(outf,"%s\n", sqinfo->name);
      if (sqinfo->flags & SQINFO_ACC)
        fprintf(outf,"ACCESSION   %s\n", sqinfo->acc); 
      if (sqinfo->flags & SQINFO_DESC)
        fprintf(outf,"DEFINITION  %s\n", sqinfo->desc);
      fprintf(outf,"    %s  Length: %d  (today)  Check: %d  ..\n", 
              sqinfo->name, seqlen, checksum);
      spacer = 11;
      numline = 1;
      strcpy(endstr, "\n");  /* this is insurance to help prevent misreads at eof */
      break;

    case kStrider: /* ?? map ?*/
      fprintf(outf,"; ### from DNA Strider ;-)\n");
      fprintf(outf,"; DNA sequence  %s, %d bases, %d checksum.\n;\n", 
              sqinfo->name, seqlen, checksum);
      strcpy(endstr, "\n//");
      break;

                        /* SRE: Don had Zuker default to Pearson, which is not
                           intuitive or helpful, since Zuker's MFOLD can't read
                           Pearson format. More useful to use kIG */
    case kZuker:
      which_case = 1;                   /* MFOLD requires upper case. */
      /*FALLTHRU*/
    case kIG:
      fprintf(outf,";%s %s\n", 
              sqinfo->name,
              (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
      fprintf(outf,"%s\n", sqinfo->name);
      strcpy(endstr,"1"); /* == linear dna */
      break;

    case kRaw:                  /* Raw: just print the whole sequence. */
      fprintf(outf, "%s\n", seq);
      return 1;

    default :
    case kPearson:
      fprintf(outf,">%s  %s\n", sqinfo->name,
              (sqinfo->flags & SQINFO_DESC)  ? sqinfo->desc   : "");
      break;
    }

  if (which_case == 1) s2upper(seq);
  if (which_case == 2) s2lower(seq);

  /* Shared line formatter.
   *   i     - index of the next residue in seq
   *   l     - next write position in the line buffers (counts spacers)
   *   l1    - residues placed on the current line (spacers not counted)
   *   ibase - 1-based coordinate of the first residue on the line
   * With the widths and spacers set above, a line never exceeds 60
   * characters, so the 100-byte buffers are sufficient.
   */
  width = MIN(width,100);
  for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) {
    if (l1 < 0) l1 = 0;
    else if (l1 == 0) {
      /* start of a new output line: coordinate prefix, then indent */
      if (numline) fprintf(outf,"%8d ",ibase);
      for (j=0; j<tab; j++) fputc(' ',outf);
      }
    /* insert a blank at buffer positions 0, spacer, 2*spacer, ... */
    if ((spacer != 0) && ((l+1) % spacer == 1)) 
      { s[l] = ' '; ss[l] = ' '; l++; }
    s[l]  = seq[i];
    ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.';
    l++; i++;
    l1++;                 /* don't count spaces for width*/
    if (l1 == width || i == seqlen) {
      /* line is full, or the sequence is exhausted: flush it */
      s[l] = ss[l] = '\0';
      l = 0; l1 = 0;
      if (dostruc)
        {
          /* sequence line, then an aligned structure line beneath it;
           * the terminator goes on the structure line */
          fprintf(outf, "%s\n", s);
          if (numline) fprintf(outf,"         ");
          for (j=0; j<tab; j++) fputc(' ',outf);
          if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr);
          else fprintf(outf,"%s\n",ss);
        }
      else
        {
          if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr);
          else fprintf(outf,"%s\n",s);
        }
      lines++;
      ibase = i+1;
      }
    }
  return lines;
}