Exemple #1
0
/* utest_basic()
 * An MSA to ex{e,o}rcise past demons.
 *   1. seq2 gives an I->end transition.
 *   2. seq1 contains degenerate Z,X, exercising symbol counting
 *      of degenerate residues.
 */
static void
utest_basic(void)
{
  char         *failmsg      = "failure in build.c::utest_basic() unit test";
  char          msafile[16]  = "p7tmpXXXXXX"; /* tmpfile name template */
  FILE         *ofp          = NULL;
  ESL_ALPHABET *abc          = esl_alphabet_Create(eslAMINO);
  ESL_MSAFILE  *afp          = NULL;
  ESL_MSA      *msa          = NULL;
  P7_HMM       *hmm          = NULL;
  float         symfrac      = 0.5;

  if (esl_tmpfile_named(msafile, &ofp) != eslOK) esl_fatal(failmsg);
  fprintf(ofp, "# STOCKHOLM 1.0\n");
  fprintf(ofp, "#=GC RF --xxxxxxxxxxxxxxxx-xxx-x--\n");
  fprintf(ofp, "seq1    --ACDEFGHIKLMNPZXS-TVW-Yyy\n");
  fprintf(ofp, "seq2    aaACDEFGHIKLMNPQRS-TVWw---\n");
  fprintf(ofp, "seq3    aaAC-EFGHIKLMNPQRS-TVW-Y--\n");
  fprintf(ofp, "seq4    aaAC-EFGHIKLMNPQRS-TVW-Y--\n");
  fprintf(ofp, "//\n");
  fclose(ofp);

  if (esl_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp) != eslOK) esl_fatal(failmsg);
  if (esl_msafile_Read(afp, &msa)                                           != eslOK) esl_fatal(failmsg);
  if (p7_Fastmodelmaker(msa, symfrac, NULL, &hmm, NULL)                     != eslOK) esl_fatal(failmsg);
  
  p7_hmm_Destroy(hmm);
  esl_msa_Destroy(msa);
  esl_msafile_Close(afp);
  esl_alphabet_Destroy(abc);
  remove(msafile);
  return;
}
Exemple #2
0
/* Create an SSI index file for open MSA file <afp>.
 * Both name and accession of MSAs are stored as keys.
 */
static void
create_ssi_index(ESL_GETOPTS *go, ESLX_MSAFILE *afp)
{
  ESL_NEWSSI *ns      = NULL;
  ESL_MSA    *msa     = NULL;
  int         nali    = 0;
  char       *ssifile = NULL;
  uint16_t    fh;
  int         status;

  if (afp->bf->mode_is != eslBUFFER_FILE &&
      afp->bf->mode_is != eslBUFFER_ALLFILE &&
      afp->bf->mode_is != eslBUFFER_MMAP)
    esl_fatal("<msafile> must be a regular file to be SSI indexed");

  esl_sprintf(&ssifile, "%s.ssi", afp->bf->filename);

  status = esl_newssi_Open(ssifile, FALSE, &ns);
  if      (status == eslENOTFOUND)   esl_fatal("failed to open SSI index %s", ssifile);
  else if (status == eslEOVERWRITE)  esl_fatal("SSI index %s already exists; delete or rename it", ssifile);
  else if (status != eslOK)          esl_fatal("failed to create a new SSI index");

  if (esl_newssi_AddFile(ns, afp->bf->filename, afp->format, &fh) != eslOK)
    esl_fatal("Failed to add MSA file %s to new SSI index\n", afp->bf->filename);

  printf("Working...    "); 
  fflush(stdout);
  
  while ((status = eslx_msafile_Read(afp, &msa)) != eslEOF)
    {
      if (status != eslOK) 
	eslx_msafile_ReadFailure(afp, status);

      nali++;

      if (! msa->name)
	esl_fatal("Every alignment in file must have a name to be indexed. Failed to find name of alignment #%d\n", nali);

      if (esl_newssi_AddKey(ns, msa->name, fh, msa->offset, 0, 0) != eslOK) 
	esl_fatal("Failed to add key %s to SSI index", msa->name);

      if (msa->acc && esl_newssi_AddAlias(ns, msa->acc, msa->name) != eslOK)
	esl_fatal("Failed to add secondary key %s to SSI index", msa->acc);
      
      esl_msa_Destroy(msa);
    }
  
  if (esl_newssi_Write(ns) != eslOK) 
    esl_fatal("Failed to write keys to ssi file %s\n", ssifile);

  printf("done.\n");

  if (ns->nsecondary) printf("Indexed %d alignments (%ld names and %ld accessions).\n", nali, (long) ns->nprimary, (long) ns->nsecondary);
  else                printf("Indexed %d alignments (%ld names).\n", nali, (long) ns->nprimary);
  printf("SSI index written to file %s\n", ssifile);

  free(ssifile);
  esl_newssi_Close(ns);
  return;
}  
/* The "basic" utest is a minimal driver for making a small DNA profile and a small DNA sequence,
 * then running Viterbi and Forward. It's useful for dumping DP matrices and profiles for debugging.
 */
static void
utest_basic(ESL_GETOPTS *go)
{
  char           *query= "# STOCKHOLM 1.0\n\nseq1 GAATTC\nseq2 GAATTC\n//\n";
  int             fmt  = eslMSAFILE_STOCKHOLM;
  char           *targ = "GAATTC";
  ESL_ALPHABET   *abc  = NULL;
  ESL_MSA        *msa  = NULL;
  P7_HMM         *hmm  = NULL;
  P7_PROFILE     *gm   = NULL;
  P7_BG          *bg   = NULL;
  P7_PRIOR       *pri  = NULL;	
  ESL_DSQ        *dsq  = NULL;
  P7_GMX         *gx   = NULL;
  P7_TRACE        *tr  = NULL;
  int             L    = strlen(targ);
  float           vsc, vsc2, fsc;

  if ((abc = esl_alphabet_Create(eslDNA))          == NULL)  esl_fatal("failed to create alphabet");
  if ((pri = p7_prior_CreateNucleic())             == NULL)  esl_fatal("failed to create prior");
  if ((msa = esl_msa_CreateFromString(query, fmt)) == NULL)  esl_fatal("failed to create MSA");
  if (esl_msa_Digitize(abc, msa, NULL)             != eslOK) esl_fatal("failed to digitize MSA");
  if (p7_Fastmodelmaker(msa, 0.5, NULL, &hmm, NULL) != eslOK) esl_fatal("failed to create GAATTC model");
  if (p7_ParameterEstimation(hmm, pri)             != eslOK) esl_fatal("failed to parameterize GAATTC model");
  if (p7_hmm_SetConsensus(hmm, NULL)               != eslOK) esl_fatal("failed to make consensus");
  if ((bg = p7_bg_Create(abc))                     == NULL)  esl_fatal("failed to create DNA null model");
  if ((gm = p7_profile_Create(hmm->M, abc))        == NULL)  esl_fatal("failed to create GAATTC profile");
  if (p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL)!= eslOK) esl_fatal("failed to config profile");
  if (p7_profile_Validate(gm, NULL, 0.0001)        != eslOK) esl_fatal("whoops, profile is bad!");
  if (esl_abc_CreateDsq(abc, targ, &dsq)           != eslOK) esl_fatal("failed to create GAATTC digital sequence");
  if ((gx = p7_gmx_Create(gm->M, L))               == NULL)  esl_fatal("failed to create DP matrix");
  if ((tr = p7_trace_Create())                     == NULL)  esl_fatal("trace creation failed");

  p7_GViterbi   (dsq, L, gm, gx, &vsc);
  if (esl_opt_GetBoolean(go, "-v")) printf("Viterbi score: %.4f\n", vsc);
  if (esl_opt_GetBoolean(go, "-v")) p7_gmx_Dump(stdout, gx, p7_DEFAULT);

  p7_GTrace     (dsq, L, gm, gx, tr);
  p7_trace_Score(tr, dsq, gm, &vsc2);
  if (esl_opt_GetBoolean(go, "-v")) p7_trace_Dump(stdout, tr, gm, dsq);
  
  if (esl_FCompare(vsc, vsc2, 1e-5) != eslOK)  esl_fatal("trace score and Viterbi score don't agree.");

  p7_GForward   (dsq, L, gm, gx, &fsc);
  if (esl_opt_GetBoolean(go, "-v")) printf("Forward score: %.4f\n", fsc);
  if (esl_opt_GetBoolean(go, "-v")) p7_gmx_Dump(stdout, gx, p7_DEFAULT);

  p7_trace_Destroy(tr);
  p7_gmx_Destroy(gx);
  free(dsq);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  esl_msa_Destroy(msa);
  p7_prior_Destroy(pri);
  esl_alphabet_Destroy(abc);
  return;
}
Exemple #4
0
static void
read_test_msas_text(char *a2mfile, char *stkfile)
{
  char msg[]         = "A2M msa text-mode read unit test failed";
  ESL_MSAFILE  *afp1 = NULL;
  ESL_MSAFILE  *afp2 = NULL;
  ESL_MSA      *msa1, *msa2, *msa3, *msa4;
  FILE         *a2mfp, *stkfp;
  char          a2mfile2[32] = "esltmpa2m2XXXXXX";
  char          stkfile2[32] = "esltmpstk2XXXXXX";

  /*                     vvvv-- everything's the same as the digital utest except these NULLs  */
  if ( esl_msafile_Open(NULL, a2mfile, NULL, eslMSAFILE_A2M,       NULL, &afp1) != eslOK)  esl_fatal(msg);
  if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK)  esl_fatal(msg);
  if ( esl_msafile_a2m_Read      (afp1, &msa1)                                  != eslOK)  esl_fatal(msg);
  if ( esl_msafile_stockholm_Read(afp2, &msa2)                                  != eslOK)  esl_fatal(msg);
  if ( esl_msa_Compare(msa1, msa2)                                              != eslOK)  esl_fatal(msg);
  if ( esl_msafile_a2m_Read      (afp1, &msa3)                                  != eslEOF) esl_fatal(msg);
  if ( esl_msafile_stockholm_Read(afp2, &msa3)                                  != eslEOF) esl_fatal(msg);
  esl_msafile_Close(afp2);
  esl_msafile_Close(afp1);

  if ( esl_tmpfile_named(a2mfile2, &a2mfp)                        != eslOK) esl_fatal(msg);
  if ( esl_tmpfile_named(stkfile2, &stkfp)                        != eslOK) esl_fatal(msg);
  if ( esl_msafile_a2m_Write      (a2mfp, msa2)                   != eslOK) esl_fatal(msg);
  if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_PFAM)  != eslOK) esl_fatal(msg);
  fclose(a2mfp);
  fclose(stkfp);
  if ( esl_msafile_Open(NULL, a2mfile2, NULL, eslMSAFILE_A2M,       NULL, &afp1) != eslOK) esl_fatal(msg);
  if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
  if ( esl_msafile_a2m_Read      (afp1, &msa3)                                   != eslOK) esl_fatal(msg);
  if ( esl_msafile_stockholm_Read(afp2, &msa4)                                   != eslOK) esl_fatal(msg);
  if ( esl_msa_Compare(msa3, msa4)                                               != eslOK) esl_fatal(msg);

  remove(a2mfile2);
  remove(stkfile2);
  esl_msafile_Close(afp2);
  esl_msafile_Close(afp1);

  esl_msa_Destroy(msa1);
  esl_msa_Destroy(msa2);
  esl_msa_Destroy(msa3);  
  esl_msa_Destroy(msa4);
}
Exemple #5
0
/* Function:  p7_tracealign_MSA()
 * Synopsis:  Convert array of traces (for a previous MSA) to a new MSA.
 * Incept:    SRE, Mon Mar  2 18:18:22 2009 [Casa de Gatos]
 *
 * Purpose:   Identical to <p7_tracealign_Seqs()> except that the trace 
 *            array <tr> accompanies a digital multiple alignment <premsa>, 
 *            rather than an array of digital sequences. 
 *            
 *            This gets used in <p7_Builder()>, where we've
 *            constructed an array of faux traces directly from an
 *            input alignment, and we want to reconstruct the 
 *            MSA that corresponds to what HMMER actually used
 *            to build its model (after trace doctoring to be
 *            compatible with Plan 7, and with <#=RF> annotation
 *            on assigned consensus columns).
 *
 * Xref:      J4/102.
 */
int
p7_tracealign_MSA(const ESL_MSA *premsa, P7_TRACE **tr, int M, int optflags, ESL_MSA **ret_postmsa)
{
  const ESL_ALPHABET *abc  = premsa->abc;
  ESL_MSA      *msa        = NULL;	/* RETURN: new MSA */
  int          *inscount   = NULL;	/* array of max gaps between aligned columns */
  int          *matmap     = NULL;      /* matmap[k] = apos of match k matmap[1..M] = [1..alen] */
  int          *matuse     = NULL;      /* TRUE if an alignment column is associated with match state k [1..M] */
  int           idx;                    /* counter over sequences */
  int           alen;		        /* width of alignment */
  int           status;

  if ((status = map_new_msa(tr, premsa->nseq, M, optflags, &inscount, &matuse, &matmap, &alen)) != eslOK) return status;
 
  if (optflags & p7_DIGITIZE) { if ((status = make_digital_msa(NULL, premsa, tr, premsa->nseq, matuse, matmap, M, alen, optflags, &msa)) != eslOK) goto ERROR; }
  else                        { if ((status = make_text_msa   (NULL, premsa, tr, premsa->nseq, matuse, matmap, M, alen, optflags, &msa)) != eslOK) goto ERROR; }

  if ((status = annotate_rf(msa, M, matuse, matmap))                          != eslOK) goto ERROR;
  if ((status = annotate_posterior_probability(msa, tr, matmap, M, optflags)) != eslOK) goto ERROR;

  if (optflags & p7_DIGITIZE) rejustify_insertions_digital(     msa, inscount, matmap, matuse, M);
  else                        rejustify_insertions_text   (abc, msa, inscount, matmap, matuse, M);


  /* Transfer information from old MSA to new */
  esl_msa_SetName     (msa, premsa->name);
  esl_msa_SetDesc     (msa, premsa->desc);
  esl_msa_SetAccession(msa, premsa->acc);

  for (idx = 0; idx < premsa->nseq; idx++)
    {
      esl_msa_SetSeqName       (msa, idx, premsa->sqname[idx]);
      if (msa->sqacc)  esl_msa_SetSeqAccession  (msa, idx, premsa->sqacc[idx]);
      if (msa->sqdesc) esl_msa_SetSeqDescription(msa, idx, premsa->sqdesc[idx]);
      msa->wgt[idx] = premsa->wgt[idx];
    }

  if (premsa->flags & eslMSA_HASWGTS)
    msa->flags |= eslMSA_HASWGTS;

  free(inscount);
  free(matmap);
  free(matuse);
  *ret_postmsa = msa;
  return eslOK;

 ERROR:
  if (msa      != NULL) esl_msa_Destroy(msa);
  if (inscount != NULL) free(inscount);
  if (matmap   != NULL) free(matmap);
  if (matuse   != NULL) free(matuse);
  *ret_postmsa = NULL;
  return status;
}
Exemple #6
0
int
main(int argc, char **argv)
{
  char        *filename   = argv[1];
  int          fmt        = eslMSAFILE_UNKNOWN; 
  int          type       = eslUNKNOWN;
  ESL_ALPHABET *abc       = NULL;
  ESL_MSAFILE *afp        = NULL;
  ESL_MSA     *msa        = NULL;
  double       maxid      = 0.62; /* cluster at 62% identity: the BLOSUM62 rule */
  int         *assignment = NULL;
  int         *nin        = NULL;
  int          nclusters;
  int          c, i;		  
  int          status;

  /* Open; guess alphabet; set to digital mode */
  status = esl_msafile_Open(filename, fmt, NULL, &afp);
  if (status == eslENOTFOUND)    esl_fatal("Alignment file %s isn't readable", filename);
  else if (status == eslEFORMAT) esl_fatal("Couldn't determine format of %s",  filename);
  else if (status != eslOK)      esl_fatal("Alignment file open failed (error code %d)", status);

  status = esl_msafile_GuessAlphabet(afp, &type);
  if      (status == eslEAMBIGUOUS) esl_fatal("Couldn't guess alphabet from first alignment in %s", filename);
  else if (status == eslEFORMAT)    esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nBad line is: %s\n",
					       afp->linenumber, afp->fname, afp->errbuf, afp->buf);
  else if (status == eslENODATA)    esl_fatal("Alignment file %s contains no data?", filename);
  else if (status != eslOK)         esl_fatal("Failed to guess alphabet (error code %d)\n", status);

  abc = esl_alphabet_Create(type);
  esl_msafile_SetDigital(afp, abc);

  /* read one alignment */
  status = esl_msa_Read(afp, &msa);
  if      (status == eslEFORMAT)  esl_fatal("alignment file %s: %s\n", afp->fname, afp->errbuf);
  else if (status != eslOK)       esl_fatal("Alignment file read failed with error code %d\n", status);

  /* do the clustering */
  esl_msacluster_SingleLinkage(msa, maxid, &assignment, &nin, &nclusters);

  printf("%d clusters at threshold of %f fractional identity\n", nclusters, maxid);
  for (c = 0; c < nclusters; c++) {
    printf("cluster %d:\n", c);
    for (i = 0; i < msa->nseq; i++) if (assignment[i] == c) printf("  %s\n", msa->sqname[i]);
    printf("(%d sequences)\n\n", nin[c]);
  }

  esl_msa_Destroy(msa);
  esl_msafile_Close(afp);
  free(assignment);
  free(nin);
  return 0;
}
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  char         *msafile = esl_opt_GetArg(go, 1);
  ESL_ALPHABET *abc     = NULL;
  int           infmt   = eslMSAFILE_UNKNOWN;
  ESLX_MSAFILE *afp     = NULL;
  ESL_MSA      *msa     = NULL;
  FILE         *ofp     = stdout;
  int           nali    = 0;
  int           namewidth;
  double        pid;
  int           nid, n;
  int           i,j;
  int           status;

  /* allow user to assert the input MSA alphabet */
  if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
  else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); 

  /* allow user to assert the input MSA format */
  if (esl_opt_IsOn(go, "--informat") &&
      (infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
    esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat"));

  /* digital open */
  if ( ( status = eslx_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK)
    eslx_msafile_OpenFailure(afp, status);

  while ((status = eslx_msafile_Read(afp, &msa)) == eslOK)
    {	
      nali++;

      namewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq);

      for (i = 0; i < msa->nseq; i++)
	for (j = i+1; j < msa->nseq; j++)
	  {
	    esl_dst_XPairId(abc, msa->ax[i], msa->ax[j], &pid, &nid, &n);
	    fprintf(ofp, "%-*s %-*s %6.2f %6d %6d\n", namewidth, msa->sqname[i], namewidth, msa->sqname[j], pid*100.0, nid, n);
	  }

      esl_msa_Destroy(msa);
    }
  if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); 

  eslx_msafile_Close(afp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go        = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  ESL_RANDOMNESS *rng       = esl_randomness_Create(0);
  char           *msafile   = esl_opt_GetArg(go, 1);
  int             fmt       = eslMSAFILE_UNKNOWN;
  ESL_ALPHABET   *abc       = NULL;
  ESLX_MSAFILE   *afp       = NULL;
  ESL_MSA        *msa       = NULL;
  int             textmode  = esl_opt_GetBoolean(go, "--text");
  int             nali      = 0;
  int             status;

  /* If you know the alphabet you want, create it - you'll pass it to eslx_msafile_Open() */
  if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
  else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); 

  /* Open in text or digital mode.
   *   To let the Open() function autoguess the format, you pass <infmt=eslMSAFILE_UNKNOWN>. 
   *   To let it autoguess the alphabet, you set <abc=NULL> and pass <&abc>.
   *   To open in text mode instead of digital, you pass <NULL> for the alphabet argument.
   * eslx_msafile_OpenFailure() is a convenience, printing various diagnostics of any
   * open failure to <stderr>. You can of course handle your own diagnostics instead.
   */
  if (textmode) status = eslx_msafile_Open(NULL, msafile, NULL, fmt, NULL, &afp);
  else          status = eslx_msafile_Open(&abc, msafile, NULL, fmt, NULL, &afp);
  if (status != eslOK)   eslx_msafile_OpenFailure(afp, status);
  
  fmt = afp->format;

  while ((status = eslx_msafile_Read(afp, &msa)) == eslOK)
    {	
      /* if digital MSA: msa->ax[idx=0..nseq-1][acol=1..alen] is the alignment data; 
       * if text MSA:  msa->aseq[idx=0..nseq-1][acol=0..alen-1] */
      nali++;
      
      /* permute it */
      esl_msashuffle_PermuteSequenceOrder(rng, msa);

      eslx_msafile_Write(stdout, msa, fmt);
      esl_msa_Destroy(msa);
    }
  if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); /* a convenience, like eslx_msafile_OpenFailure() */

  esl_alphabet_Destroy(abc);
  eslx_msafile_Close(afp);
  esl_randomness_Destroy(rng);
  esl_getopts_Destroy(go);
  exit(0);
}
Exemple #9
0
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the MSAs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire MSA file in a single pass, outputting MSAs
 * that are in our keylist. 
 * 
 * Note that with an SSI index, you get the MSAs in the order they
 * appear in the <keyfile>, but without an SSI index, you get MSAs in
 * the order they occur in the MSA file.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, int outfmt, char *keyfile, ESLX_MSAFILE *afp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  ESL_MSA        *msa    = NULL;
  int             nali   = 0;
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;
  
  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) 
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
	esl_fatal("Failed to read MSA name on line %d of file %s\n", efp->linenumber, keyfile);
      
      status = esl_keyhash_Store(keys, key, keylen, &keyidx);
      if (status == eslEDUP) esl_fatal("MSA key %s occurs more than once in file %s\n", key, keyfile);
	
      if (afp->ssi) { onefetch(go, ofp, outfmt, key, afp);  nali++; }

    }

  if (! afp->ssi)
    {
      while ((status = eslx_msafile_Read(afp, &msa)) != eslEOF)
	{
	  if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
	  nali++;

	  if (msa->name == NULL) 
	    esl_fatal("Every alignment in file must have a name to be retrievable. Failed to find name of alignment #%d\n", nali);

	  if ( (esl_keyhash_Lookup(keys, msa->name, -1, NULL) == eslOK) ||
	       (msa->acc != NULL && esl_keyhash_Lookup(keys, msa->acc, -1, NULL) == eslOK))
	    eslx_msafile_Write(ofp, msa, outfmt);

	  esl_msa_Destroy(msa);
	}
    }
  
  if (ofp != stdout) printf("\nRetrieved %d alignments.\n", nali);
  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}
Exemple #10
0
/* Function:  p7_tracealign_Seqs()
 * Synopsis:  Convert array of traces (for a sequence array) to a new MSA.
 * Incept:    SRE, Tue Oct 21 19:40:33 2008 [Janelia]
 *
 * Purpose:   Convert an array of <nseq> traces <tr[0..nseq-1]>,
 *            corresponding to an array of digital sequences
 *            <sq[0..nseq-1]> aligned to a model of
 *            length <M>, to a new multiple sequence alignment.
 *            The new alignment structure is allocated here, and returned
 *            in <*ret_msa>.
 *            
 *            As a special case, the traces may contain I->D and D->I
 *            transitions. This feature is used by <hmmalign --mapali>
 *            to reconstruct an input alignment without modification
 *            from trace doctoring.
 *            
 *            <optflags> controls some optional behaviors in producing
 *            the alignment, as follows:
 *            
 *            <p7_DIGITIZE>: creates the MSA in digital mode, as
 *            opposed to a default text mode. 
 *            
 *            <p7_ALL_CONSENSUS_COLS>: create a column for every
 *            consensus column in the model, even if it means having
 *            all gap characters (deletions) in a column; this
 *            guarantees that the alignment will have at least <M>
 *            columns. The default is to only show columns that have
 *            at least one residue in them.
 *            
 *            <p7_TRIM>: trim off any residues that get assigned to
 *            flanking N,C states (in profile traces) or I_0 and I_M
 *            (in core traces).
 *            
 *            The <optflags> can be combined by logical OR; for
 *            example, <p7_DIGITIZE | p7_ALL_CONSENSUS_COLS>.
 *            
 * Args:      sq       - array of digital sequences, 0..nseq-1
 *            tr       - array of tracebacks, 0..nseq-1
 *            nseq     - number of sequences
 *            M        - length of model sequences were aligned to
 *            optflags - flags controlling optional behaviours.
 *            ret_msa  - RETURN: new multiple sequence alignment
 *
 * Returns:   <eslOK> on success, and <*ret_msa> points to a new
 *            <ESL_MSA> object. Caller is responsible for free'ing
 *            this new MSA with <esl_msa_Destroy()>.
 *
 * Throws:    <eslEMEM> on allocation failure; <*ret_msa> is <NULL>.
 * 
 * Notes:     * why a text mode, when most of HMMER works in digital
 *              sequences and alignments? Text mode MSAs are created
 *              for output, whereas digital mode MSAs are created for
 *              internal use. Text mode allows HMMER's output 
 *              conventions to be used for match vs. insert columns:
 *              lowercase/. for residues/gaps in inserts, uppercase/-
 *              for residues/gaps in match columns.
 *
 *            * why not pass HMM as an argument, so we can transfer
 *              column annotation? In <p7_tophits_Alignment()>, the
 *              HMM is unavailable -- because of constraints of what's
 *              made available to the master process in an MPI
 *              implementation. (We could make the HMM an optional 
 *              argument.)
 */
int
p7_tracealign_Seqs(ESL_SQ **sq, P7_TRACE **tr, int nseq, int M, int optflags, ESL_MSA **ret_msa)
{
  ESL_MSA      *msa        = NULL;	/* RETURN: new MSA */
  const ESL_ALPHABET *abc  = sq[0]->abc;
  int          *inscount   = NULL;	/* array of max gaps between aligned columns */
  int          *matmap     = NULL;      /* matmap[k] = apos of match k matmap[1..M] = [1..alen] */
  int          *matuse     = NULL;      /* TRUE if an alignment column is associated with match state k [1..M] */
  int           idx;                    /* counter over sequences */
  int           alen;		        /* width of alignment */
  int           status;

  if ((status = map_new_msa(tr, nseq, M, optflags, &inscount, &matuse, &matmap, &alen)) != eslOK) return status;

  if (optflags & p7_DIGITIZE) { if ((status = make_digital_msa(sq, NULL, tr, nseq, matuse, matmap, M, alen, optflags, &msa)) != eslOK) goto ERROR; }
  else                        { if ((status = make_text_msa   (sq, NULL, tr, nseq, matuse, matmap, M, alen, optflags, &msa)) != eslOK) goto ERROR; }

  if ((status = annotate_rf(msa, M, matuse, matmap))                          != eslOK) goto ERROR;
  if ((status = annotate_posterior_probability(msa, tr, matmap, M, optflags)) != eslOK) goto ERROR;

  if (optflags & p7_DIGITIZE) rejustify_insertions_digital(     msa, inscount, matmap, matuse, M);
  else                        rejustify_insertions_text   (abc, msa, inscount, matmap, matuse, M);

  for (idx = 0; idx < nseq; idx++)
    {
      esl_msa_SetSeqName(msa, idx, sq[idx]->name);
      if (sq[idx]->acc[0]  != '\0') esl_msa_SetSeqAccession  (msa, idx, sq[idx]->acc);
      if (sq[idx]->desc[0] != '\0') esl_msa_SetSeqDescription(msa, idx, sq[idx]->desc);
      msa->wgt[idx] = 1.0;
      if (msa->sqlen != NULL) msa->sqlen[idx] = sq[idx]->n;
    }

  free(inscount);
  free(matmap);
  free(matuse);
  *ret_msa = msa;
  return eslOK;

 ERROR:
  if (msa      != NULL) esl_msa_Destroy(msa);
  if (inscount != NULL) free(inscount);
  if (matmap   != NULL) free(matmap);
  if (matuse   != NULL) free(matuse);
  *ret_msa = NULL;
  return status;
}
Exemple #11
0
int 
main(int argc, char **argv)
{
  char         *filename = argv[1];
  int           fmt      = eslMSAFILE_A2M;
  ESL_MSAFILE  *afp      = NULL;
  ESL_MSA      *msa      = NULL;
  int           status;

  if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
  if ( (status = esl_msafile_a2m_Read(afp, &msa))                         != eslOK) esl_msafile_ReadFailure(afp, status);

  printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);

  esl_msafile_a2m_Write(stdout, msa);
  esl_msa_Destroy(msa);
  esl_msafile_Close(afp);
  exit(0);
}
Exemple #12
0
/* make_post_msa()
 * 
 * Optionally, we can return the alignment we actually built the model
 * from (including RF annotation on assigned consensus columns, and any
 * trace doctoring to enforce Plan7 consistency). 
 */
static int
make_post_msa(P7_BUILDER *bld, const ESL_MSA *premsa, const P7_HMM *hmm, P7_TRACE **tr, ESL_MSA **opt_postmsa)
{
  ESL_MSA  *postmsa  = NULL;
  int       optflags = p7_DEFAULT;
  int       status;

  if (opt_postmsa == NULL) return eslOK;

  /* someday we might want to transfer more info from HMM to postmsa */
  if ((status = p7_tracealign_MSA(premsa, tr, hmm->M, optflags, &postmsa)) != eslOK) goto ERROR;
  
  *opt_postmsa = postmsa;
  return eslOK;
  
 ERROR:
  if (postmsa != NULL) esl_msa_Destroy(postmsa);
  return status;
}
Exemple #13
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS        *go          = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  char               *filename    = esl_opt_GetArg(go, 1);
  int                 infmt       = eslMSAFILE_UNKNOWN;
  ESL_ALPHABET       *abc         = NULL;
  ESL_MSAFILE        *afp         = NULL;
  ESL_MSA            *msa         = NULL;
  int                 status;

  if      (esl_opt_GetBoolean(go, "-1"))      infmt = eslMSAFILE_A2M;  /* override format autodetection */

  if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
  else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); 

  /* Text mode: pass NULL for alphabet.
   * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed 
   */
  if   (esl_opt_GetBoolean(go, "-t"))  status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
  else                                 status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
  if (status != eslOK) esl_msafile_OpenFailure(afp, status);

  if ((status = esl_msafile_a2m_Read(afp, &msa)) != eslOK)
    esl_msafile_ReadFailure(afp, status);

  printf("alphabet:       %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
  printf("# of seqs:      %d\n", msa->nseq);
  printf("# of cols:      %d\n", (int) msa->alen);
  printf("\n");

  if (! esl_opt_GetBoolean(go, "-q"))
    esl_msafile_a2m_Write(stdout, msa);

  esl_msa_Destroy(msa);
  esl_msafile_Close(afp);
  if (abc) esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  exit(0);
}
Exemple #14
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
  ESL_ALPHABET   *abc     = esl_alphabet_Create(eslAMINO);
  ESL_MSA        *msa     = esl_msa_CreateFromString("\
# STOCKHOLM 1.0\n\
\n\
seq0  AAAAAAAAAA\n\
seq1  AAAAAAAAAA\n\
seq2  AAAAAAAAAC\n\
seq3  AAAAAAAADD\n\
seq4  AAAAAAAEEE\n\
seq5  AAAAAAFFFF\n\
seq6  AAAAAGGGGG\n\
seq7  AAAAHHHHHH\n\
seq8  AAAIIIIIII\n\
seq9  AAKKKKKKKK\n\
seq10 ALLLLLLLLL\n\
seq11 MMMMMMMMMM\n\
//",   eslMSAFILE_STOCKHOLM);


  utest_SingleLinkage(go, msa, 1.0, 11, 10);    /* at 100% id, only seq0/seq1 cluster */
  utest_SingleLinkage(go, msa, 0.5,  6,  5);    /* at 50% id, seq0-seq6 cluster       */
  utest_SingleLinkage(go, msa, 0.0,  1,  0);    /* at 0% id, everything clusters      */

  /* Do the same tests, but now with a digital MSA */
  esl_msa_Digitize(abc, msa, NULL);
  utest_SingleLinkage(go, msa, 1.0, 11, 10);    /* at 100% id, only seq0/seq1 cluster */
  utest_SingleLinkage(go, msa, 0.5,  6,  5);    /* at 50% id, seq0-seq6 cluster       */
  utest_SingleLinkage(go, msa, 0.0,  1,  0);    /* at 0% id, everything clusters      */

  esl_msa_Destroy(msa);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
Exemple #15
0
int main(int argc, char **argv)
{
  ESLX_MSAFILE *afp; 
  ESL_MSA      *msa;
  ESL_DMATRIX  *P;
  int           i,j;
  double        min, avg, max;
  int           status;

  if ((status = eslx_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
    eslx_msafile_OpenFailure(afp, status);
  if ((status = eslx_msafile_Read(afp, &msa)) != eslOK)
    eslx_msafile_ReadFailure(afp, status);

  esl_dst_CPairIdMx(msa->aseq, msa->nseq, &P);

  min = 1.0;
  max = 0.0;
  avg = 0.0;
  for (i = 0; i < msa->nseq; i++)
    for (j = i+1; j < msa->nseq; j++)
      {
	avg += P->mx[i][j];
	if (P->mx[i][j] < min) min = P->mx[i][j];
	if (P->mx[i][j] > max) max = P->mx[i][j];
      }
  avg /= (double) (msa->nseq * (msa->nseq-1) / 2);

  printf("Average pairwise %% id:  %.1f%%\n", avg * 100.);
  printf("Minimum pairwise %% id:  %.1f%%\n", min * 100.);
  printf("Maximum pairwise %% id:  %.1f%%\n", max * 100.);

  esl_dmatrix_Destroy(P);
  esl_msa_Destroy(msa);
  eslx_msafile_Close(afp);
  return 0;
}
Exemple #16
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* application configuration       */
  ESL_ALPHABET *abc     = NULL;	/* biological alphabet             */
  char         *alifile = NULL;	/* alignment file name             */
  int           fmt;		/* format code for alifiles        */
  ESL_MSAFILE  *afp     = NULL;	/* open alignment file             */
  ESL_MSA      *msa     = NULL;	/* multiple sequence alignment     */
  int           status;		/* easel return code               */

  int           do_info = TRUE;                /* TRUE if -i */
  int           do_max = FALSE;                /* TRUE if -x */
  int           do_ffreq = FALSE;              /* TRUE if --ffreq */
  int           do_fmin  = FALSE;              /* TRUE if --fmin */
  float         fthresh = 0.;                  /* <x> from -f <x> */
  int           do_remove_bps = FALSE;         /* TRUE if -r */
  int           do_consistent = FALSE;         /* TRUE if -c */
  int           do_indi2cons = FALSE;          /* TRUE if --indi <x> */
  int           have_cons;                     /* TRUE if first alignment has consensus sequence */
  int           do_newcons = FALSE;            /* TRUE if we're creating a new consensus structure
						* and outputing a new alignment (if -x -f -c or --indi)
						*/
  int           do_a = FALSE;                  /* TRUE if -a */
  char         *indi;                          /* for <x> from --indi <x> */
  int           nindi_read;                    /* number of individual sequence SS lines we've read for current alignment */

  int           a;		               /* counter over seqs               */
  int           i, i2;		               /* counter over residues */
  int           j, j2;		               /* counter over residues */
  int           nali;                          /* counter over alignments */
  int         **bp = NULL;                     /* bp[i][j] is number of individual bps exist between aln cols i and j */
  int          *cur_ct = NULL;                 /* ct array of basepairs for current sequence */
  int          *cons_ct = NULL;                /* ct array of basepairs for SS_cons being created */
  int          *xcons_ct = NULL;               /* ct array of basepairs for existing SS_cons */
  int          *ngaps = NULL;                  /* number of gaps in each alignment position */
  FILE         *ofp;		               /* output file (default is stdout) */
  int           be_verbose = FALSE;            /* TRUE to print extra info */
  int           seqthresh;                     /* sequence number threshold for defining a bp as consensus (int) ((fthresh * nseq) + 0.5)*/
  char         *sscons = NULL;                 /* the new SS_cons line */
  FILE         *lfp = NULL;                    /* file to list sequences with conflicting bps to */
  int           nlist = 0;                     /* number of sequences listed to list file */
  int          *nconflictsA;                   /* number of conflicting bps in seq a's individual structure annotation */
  int           nconflicts_total = 0;          /* total number of conflicts */
  int           nconflicts_list = 0;           /* total number of conflicts in sequences listed to file <x> from -l <x> */
  int           noverlaps_total = 0;           /* total number of overlaps */
  int           nconsistent_total = 0;         /* total number of consistent bps */
  int           nbps_total = 0;                /* total number of bps */
  int          *nconsistentA;                  /* number of consistent bps in seq a's individual structure annotation */
  int          *noverlapsA;                    /* number of bps in seq a's indi structure that overlap with consensus structure */
  int          *nbpsA;                         /* number of bps in seq a's indi structure that overlap with consensus structure */
  int           ncons_bps = 0;                 /* number of bps in consensus structure */
  int           max_noverlaps_aidx;
  int           max_nconsistent_aidx;
  int           max_nbps_aidx;
  int          *removebp;                      /* removebp[i] is TRUE remove consensus bp [i]:xcons_ct[i] */
  int          *has_conflict;    
  int          *nmates_l2r;                    /* half matrix, nmate_l2r[i] = <x>, i < nmate_l2r[i], there are <x> different right mates j for i */
  int          *nmates_r2l;                    /* half matrix, nmate_r2l[j] = <x>, j < nmate_r2l[j], there are <x> different left  mates i for j */

  int           lmax;                          /* with -l, maximum number of conflicts to allow */
  int           namewidth = 18;                 /* length of 'SS_cons(consensus)' */
  char         *namedashes = NULL;             /* to store underline for seq name */

  /* --fmin related variables */
  int nbps = 0;
  int prev_nbps = -1;
  float fmin;
  int inconsistent_flag;
  int pknot_flag;
  int k,l;

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\nwhere basic options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      puts("\noptions for defining a new consensus structure (all of these require -o):");
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      puts("\noptions for listing sequences based on structure:");
      esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 1) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  alifile  = esl_opt_GetArg(go, 1);

  fmt = eslMSAFILE_STOCKHOLM;

  /***********************************************
   * Open the MSA file; determine alphabet; set for digital input
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--dna"))  abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))  abc = esl_alphabet_Create(eslRNA);

  if ( (status = esl_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK)
    esl_msafile_OpenFailure(afp, status);

  /* open output file */
  if (esl_opt_GetString(go, "-o") != NULL) {
    if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) 
	esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o"));
  } else ofp = NULL;
  if (esl_opt_GetString(go, "-l") != NULL) { 
    if ((lfp = fopen(esl_opt_GetString(go, "-l"), "w")) == NULL) 
	esl_fatal("Failed to open -l output file %s\n", esl_opt_GetString(go, "-l"));
  }

  /* determine if we're creating a structure */
  do_max = esl_opt_GetBoolean(go, "-x");
    
  if(!(esl_opt_IsDefault(go, "--ffreq"))) { 
    do_ffreq = TRUE; 
    fthresh = esl_opt_GetReal(go, "--ffreq"); 
  }
  if(!(esl_opt_IsDefault(go, "--fmin"))) { 
    do_fmin = TRUE; 
  }
  do_remove_bps = esl_opt_GetBoolean(go, "-r"); 
  do_consistent = esl_opt_GetBoolean(go, "-c");
  if(!(esl_opt_IsDefault(go, "--indi"))) { 
    do_indi2cons = TRUE; 
  }
  if(do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { 
    do_newcons = TRUE;
  }
  do_a = esl_opt_GetBoolean(go, "-a");
  if(do_a || do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { 
    do_info = FALSE;
  }

  /***********************************************
   * Read MSAs one at a time.
   ***********************************************/
  nali = 0;
  have_cons = FALSE;
  lmax = esl_opt_GetInteger(go, "--lmax");
  if(esl_opt_GetBoolean(go, "-v")) be_verbose = TRUE; 

  while ((status = esl_msafile_Read(afp, &msa)) != eslEOF)
    {
      if (status != eslOK) esl_msafile_ReadFailure(afp, status);
      nali++;

      /* determine max length name */
      namewidth = 18; /* length of 'SS_cons(consensus)' */
      for(i = 0; i < msa->nseq; i++) namewidth = ESL_MAX(namewidth, strlen(msa->sqname[i]));
      if(namedashes != NULL) { free(namedashes); }
      ESL_ALLOC(namedashes, sizeof(char) * namewidth+1);
      namedashes[namewidth] = '\0';
      for(i = 0; i < namewidth; i++) namedashes[i] = '-';

      ESL_ALLOC(sscons, sizeof(char) * (msa->alen+1));
      ESL_ALLOC(cur_ct, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(cons_ct, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(xcons_ct, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(bp, sizeof(int *) * (msa->alen+1));
      ESL_ALLOC(removebp, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(has_conflict, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(nmates_l2r, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(nmates_r2l, sizeof(int) * (msa->alen+1));
      esl_vec_ISet(cur_ct, (msa->alen+1), 0);
      esl_vec_ISet(cons_ct, (msa->alen+1), 0);
      esl_vec_ISet(xcons_ct, (msa->alen+1), 0);
      esl_vec_ISet(removebp, (msa->alen+1), FALSE);
      esl_vec_ISet(has_conflict, (msa->alen+1), FALSE);
      esl_vec_ISet(nmates_l2r, (msa->alen+1), 0);
      esl_vec_ISet(nmates_r2l, (msa->alen+1), 0);

      ESL_ALLOC(nconflictsA, sizeof(int) * msa->nseq);
      ESL_ALLOC(noverlapsA, sizeof(int) * msa->nseq);
      ESL_ALLOC(nconsistentA, sizeof(int) * msa->nseq);
      ESL_ALLOC(nbpsA, sizeof(int) * msa->nseq);
      esl_vec_ISet(nconflictsA, msa->nseq, 0);
      esl_vec_ISet(noverlapsA, msa->nseq, 0);
      esl_vec_ISet(nconsistentA, msa->nseq, 0);
      esl_vec_ISet(nbpsA, msa->nseq, 0);

      max_noverlaps_aidx = max_nconsistent_aidx = max_nbps_aidx = 0;
      nconsistent_total = nbps_total = noverlaps_total = nconflicts_total = nconflicts_list = 0;
      for(i = 1; i <= msa->alen; i++) { 
	ESL_ALLOC(bp[i], sizeof(int) * (msa->alen+1));
	esl_vec_ISet(bp[i], (msa->alen+1), 0);
      }

      /* make sure we have ss_cons and indi ss if we need it */
      if(msa->ss_cons == NULL && do_remove_bps) esl_fatal("-r requires all alignments have SS_cons annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_max)             esl_fatal("-x requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_consistent)      esl_fatal("-c requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_indi2cons)       esl_fatal("--indi requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_ffreq)           esl_fatal("--ffreq requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_fmin)            esl_fatal("--fmin requires all alignments have individual SS annotation, alignment %d does not.", nali);

      if(msa->ss_cons != NULL) { 
	if((status = esl_wuss2ct(msa->ss_cons, msa->alen, xcons_ct)) != eslOK) { 
	  esl_fatal("Existing SS_cons for alignment %d is invalid.", nali);
	}
	ncons_bps = 0;
	for(i = 1; i <= msa->alen; i++) 
	  if(xcons_ct[i] != 0 && i < xcons_ct[i]) 
	    ncons_bps++;

	if(nali > 1 && !have_cons)
	  esl_fatal("the first aln has SS_cons but aln %d lacks it, if one has it, they all must.", nali); 
	if(nali == 1) have_cons = TRUE;
      }
      else if (lfp != NULL) { 
	esl_fatal("the -l option requires existing SS_cons annotation, aln %d lacks it.", nali); 
      }
      else if (do_remove_bps) { 
	esl_fatal("the -r option requires existing SS_cons annotation, aln %d lacks it.", nali); 
      }
      else if (do_consistent) { 
	esl_fatal("the -c option requires existing SS_cons annotation, aln %d lacks it.", nali); 
      }
      else { 
	if(nali > 1 && have_cons)
	  esl_fatal("the first aln does not have SS_cons but aln %d does, if one has it, they all must.", nali); 
      }

      if(do_info) { 
	printf("# Per-sequence basepair information:\n"); 
	printf("# Alignment file: %s\n", alifile);
	printf("# Alignment idx:  %d\n", nali);
	if(msa->name != NULL) { printf("# Alignment name: %s\n", msa->name); }
	if(have_cons) { 
	  printf("#\n");
	  printf("# indibp: number of basepairs in the individual sequence SS annotation\n");
	  printf("# ovrlap: number of indibp basepairs that also exist as consensus basepairs\n");
	  printf("# cnsist: number of indibp basepairs that do not conflict with any consensus basepairs\n");
	  printf("# cnflct: number of indibp basepairs that conflict with >= 1 consensus basepairs\n");
	  printf("#\n");
	  printf("# A conflict exists between two basepairs in different structures, one between columns i and j\n");
	  printf("# and the other between columns k and l, if (i == k and j != l) or (j == l and i != k).\n");
	  printf("#\n");
	  printf("# %-*s  %6s  %6s  %6s  %6s\n", namewidth, "seqname", "indibp", "ovrlap", "cnsist", "cnflct");
	  printf("# %-*s  %6s  %6s  %6s  %6s\n", namewidth, namedashes, "------", "------", "-----", "------");
	}
	else { 
	  printf("# %-*s  %6s\n", namewidth, "seqname", "nbp");
	  printf("# %-*s  %6s\n", namewidth, namedashes, "------");
	}
      }

      nindi_read = 0;
      for (a = 0; a < msa->nseq; a++) { 
	if(msa->ss != NULL && msa->ss[a] != NULL) { 
	  if((status = esl_wuss2ct(msa->ss[a], msa->alen, cur_ct)) != eslOK) { 
	    esl_fatal("SS annotation for sequence %d, aln %d  is invalid.\n", (a+1), nali);
	  }
	  nindi_read++;
	  for(i = 1; i <= msa->alen; i++) { 
	    if(i < cur_ct[i]) { 
	      bp[i][cur_ct[i]]++;
	      if(bp[i][cur_ct[i]] == 1) { 
		nmates_l2r[i]++;
		nmates_r2l[cur_ct[i]]++;
	      }
	    }
	  }

	  for(i = 1; i <= msa->alen; i++) { 
	    if(cur_ct[i] != 0 && i < cur_ct[i]) { 
	      if(xcons_ct[i] == cur_ct[i]) noverlapsA[a]++;
	      if((xcons_ct[i] != 0) && (xcons_ct[i] != cur_ct[i])) { 
		if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], i, cur_ct[i], i, xcons_ct[i]); }
		nconflictsA[a]++;
		/* indi bp i:cur_ct[i] conflicts with i:xcons_ct[i] */
		removebp[i]           = TRUE;
		removebp[xcons_ct[i]] = TRUE;
	      }
	      else if((xcons_ct[cur_ct[i]] != 0) && (xcons_ct[cur_ct[i]] != i) && (cur_ct[xcons_ct[cur_ct[i]]] == 0)) { 
		if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], xcons_ct[i], cur_ct[xcons_ct[i]], xcons_ct[cur_ct[i]], cur_ct[i]); }
		nconflictsA[a]++;
		/* indi bp i:cur_ct[i] conflicts with xcons_ct[cur_ct[i]]:cur_ct[i] */
		removebp[cur_ct[i]] = TRUE;
		removebp[xcons_ct[cur_ct[i]]] = TRUE;
	      }
	      else nconsistentA[a]++;
	    }		  
	  }
	  if(nconflictsA[a] > lmax) { 
	    if(lfp != NULL) fprintf(lfp, "%s\n", msa->sqname[a]); 
	    nconflicts_list += nconflictsA[a];
	    nlist++;
	  }
	  nbpsA[a] = nconflictsA[a] + nconsistentA[a];
	  nconflicts_total += nconflictsA[a];
	  nconsistent_total += nconsistentA[a];
	  noverlaps_total += noverlapsA[a];
	  nbps_total += nbpsA[a];

	  if(do_info && have_cons)  printf("  %-*s  %6d  %6d  %6d  %6d\n", namewidth, msa->sqname[a], nbpsA[a], noverlapsA[a], nconsistentA[a], nconflictsA[a]); 
	  if(do_info && !have_cons) printf("  %-*s  %6d\n", namewidth, msa->sqname[a], nbpsA[a]);
	  if(nbpsA[a] > nbpsA[max_nbps_aidx]) max_nbps_aidx = a;
	  if((noverlapsA[a] > noverlapsA[max_noverlaps_aidx]) || ((noverlapsA[a] == noverlapsA[max_noverlaps_aidx]) && (nbpsA[a] > nbpsA[max_noverlaps_aidx]))) max_noverlaps_aidx = a;
	  if((nconsistentA[a] > nconsistentA[max_nconsistent_aidx]) || ((nconsistentA[a] == nconsistentA[max_nconsistent_aidx]) && (nbpsA[a] > nbpsA[max_nconsistent_aidx]))) max_nconsistent_aidx = a;
	}
	else if(do_newcons || esl_opt_GetBoolean(go, "-a")) { esl_fatal("No SS annotation for sequence %d, aln %d.\n", (a+1), nali); }
      }

      if(do_info && have_cons) { 
	if(nindi_read > 0) printf("\n"); 
	printf("  %-*s  %6d  %6d  %6d  %6d\n", namewidth, "SS_cons(consensus)", ncons_bps, ncons_bps, ncons_bps, 0); 
	if(nindi_read > 0) { 
	  printf("\n# %6d/%6d (%.3f) overlap\n", noverlaps_total, nbps_total, nbps_total > 0 ? (float) noverlaps_total / (float) nbps_total : 0.);
	  printf("# %6d/%6d (%.3f) consistent\n", nconsistent_total, nbps_total, nbps_total > 0 ? (float) nconsistent_total / (float) nbps_total: 0.);
	  printf("# %6d/%6d (%.3f) conflict\n", nconflicts_total, nbps_total, nbps_total > 0 ? (float) nconflicts_total / (float) nbps_total: 0.);
	}
	else { 
	  printf("# No sequences in the alignment have GR SS annotation.\n");
	}
      }

      if(lfp != NULL) { 
	printf("# %d/%d sequences with %.3f individual bps on avg that conflict with SS_cons written to %s\n", nlist, msa->nseq, (float) nconflicts_list / (float) nlist, esl_opt_GetString(go, "-l")); 
      }

      /* determine number of gaps per alignment column */
      if((status = get_gaps_per_column(msa, &ngaps)) != eslOK) goto ERROR;

      /* -x: determine max bp structure OR
       * -a: list all conflicts in individual structures */
      if(do_max || do_a) { 
	for(i = 1; i <= msa->alen; i++) { 
	  if(nmates_l2r[i] > 1) {/* list the conflicts */
	    has_conflict[i] = TRUE;
	    for(j = 1; j <= msa->alen; j++) { 
	      if(bp[i][j] > 0) { 
		if(do_a) printf("More than 1 right mates for left  mate %4d   %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, i, j, bp[i][j], msa->nseq - ngaps[i], (float) bp[i][j] / (float) (msa->nseq - ngaps[i])); 
		has_conflict[j] = TRUE;
	      }
	    }
	  }
	}
	for(i = 1; i <= msa->alen; i++) { 
	  if(nmates_r2l[i] > 1) {/* list the conflicts */
	    has_conflict[i] = TRUE;
	    for(j = 1; j <= msa->alen; j++) { 
	      if(bp[j][i] > 0) { 
		if(do_a) printf("More than 1 left  mates for right mate %4d   %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, j, i, bp[j][i], msa->nseq - ngaps[i], (float) bp[j][i] / (float) (msa->nseq - ngaps[i])); 
		has_conflict[j] = TRUE;
	      }
	    }
	  }
	}
	for(i = 1; i <= msa->alen; i++) { 
	  /*printf("conflict[%4d]: %d\n", i, has_conflict[i]);*/
	  if(nmates_l2r[i] == 1 && (!(has_conflict[i]))) { 
	    j = i+1; 
	    while(bp[i][j] == 0) j++;
	    cons_ct[i] = j;
	    cons_ct[j] = i;
	  }
	}

	/* remove pseudoknotted bps greedily */
	for(i = 1; i <= msa->alen; i++) { 
	  j = cons_ct[i]; 
	  if(j != 0 && i < j) { 
	    for(i2 = i+1; i2 <= msa->alen; i2++) { 
	      j2 = cons_ct[i2];
	      if(j2 != 0 && i2 < j2) { 
		if((i2 < j) && (j < j2)) { 
		  /*printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);*/
		  /* note: remove both if they have equal number of sequences */
		  if(bp[i][j] <= bp[i2][j2]) { 
		    /*printf("rm %4d:%4d\n", i, j);*/
		    cons_ct[cons_ct[i]] = 0;
		    cons_ct[i]          = 0;
		  }
		  if(bp[i][j] >= bp[i2][j2]) { 
		    /*printf("rm %4d:%4d\n", i2, j2);*/
		    cons_ct[cons_ct[i2]] = 0;
		    cons_ct[i2]          = 0;
		  }
		}
	      }
	    }
	  }
	}
      }
      
      /***************************************/
      /*PARANOID, second check for knots 
      for(i = 1; i <= msa->alen; i++) { 
	j = cons_ct[i]; 
	if(j != 0 && i < j) { 
	  printf("BP: %4d:%4d\n", i, j);
	  for(i2 = 1; i2 <= msa->alen; i2++) { 
	    j2 = cons_ct[i2];
	    if(j2 != 0 && i2 < j2) { 
	      if((i2 < j) && (j < j2)) { 
		if((i < i2)) { 
		  printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);
		}
	      }
	    }
	  }
	}
      }
      ******************************************/

      /***************************************/
      /*PARANOID, check cons_ct for consistency
      for(i = 1; i <= msa->alen; i++) { 
	if(cons_ct[i] != 0) { 
	  if(cons_ct[cons_ct[i]] != i) { printf("ERROR: i: %4d cons_ct[i]: %4d cons_ct[cons_ct[i]]: %4d\n", i, cons_ct[i], cons_ct[cons_ct[i]]); }
	}
      }
      */
      /*PARANOID, write out SS_cons 
      for(i = 1; i <= msa->alen; i++) { 
	if(i < cons_ct[i]) printf("<"); 
	else if(cons_ct[i] != 0) { printf(">"); }
	else printf(".");
      }
      printf("\n");
      */
      /***************************************/

      /* textize alignment */
      if((status = esl_msa_Textize(msa)) != eslOK) esl_fatal("ERROR textizing alignment %d\n", nali); 

      /* --fmin */
      if(do_fmin) { 
	/* define ss_cons */
	prev_nbps = -1;
	fthresh = 0.99;
	inconsistent_flag = pknot_flag = FALSE;
	printf("# Defining consensus structure:\n");
	printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n");
	printf("# if > <x> individual SS contain i:j as a pair\n");
	printf("# We'll search for minimal <x> that gives a consistent consensus structure.\n");
	printf("# A consistent structure has each position involved in 0 or 1 basepairs.\n");
	printf("#\n");
	printf("# Alignment file: %s\n", alifile);
	printf("# Alignment idx:  %d\n", nali);
	printf("# Number of seqs: %d\n", msa->nseq);
	printf("#\n");
	printf("# %5s  %23s  %6s\n", "<x>", "nseq-required-with-bp", "numbps");
	printf("# %5s  %23s  %6s\n", "-----", "-----------------------", "------");
	while(fthresh >= 0.00 && (inconsistent_flag == FALSE) && (pknot_flag == FALSE)) { 
	  nbps = 0;
	  seqthresh = (int) (fthresh * msa->nseq);
	  /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/
	  esl_vec_ISet(cons_ct, msa->alen+1, 0);
	  for(i = 1; i <= msa->alen; i++) { 
	    for(j = i+1; j <= msa->alen; j++) { 
	      if(bp[i][j] > seqthresh) { 
		if(cons_ct[i] != 0 || cons_ct[j] != 0) { 
		  inconsistent_flag = TRUE;
		}
		/* check for pseudoknots */
		for(k = i+1; k < j; k++) { 
		  l = cons_ct[k];
		  if((k < l) && (l > j)) { 
		    pknot_flag = TRUE;
		  }
		  if((k > l) && (l != 0) && (l < i)) { 
		    pknot_flag = TRUE;
		  }
		}
		cons_ct[i] = j;
		cons_ct[j] = i;
		nbps++;
	      }
	    }
	  }
	  if(inconsistent_flag) 
	    printf("  %.3f  %23d  %s\n", fthresh, seqthresh+1, "inconsistent");
	  else if(pknot_flag) 
	    printf("  %.3f  %23d  %s\n", fthresh, seqthresh+1, "pseudoknotted");
	  else { 
	    if(nbps != prev_nbps) { 
	      printf("  %.3f  %23d  %6d\n", fthresh, seqthresh+1, nbps);
	    }
	    fmin = fthresh;
	  }
	  fthresh -= 0.01;
	  prev_nbps = nbps;
	}
	fthresh = fmin;
	esl_vec_ISet(cons_ct, msa->alen+1, 0);
      }

      /* --ffreq: determine structure by defining consensus bps that occur in <x> fraction of indi structures */
      if(do_ffreq || do_fmin) { 
	if(do_fmin)  { printf("#\n# <x> determined to be %.3f\n", fthresh); }
	if(do_ffreq) { 
	  printf("# Defining consensus structure:\n");
	  printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n");
	  printf("# if > %f individual SS contain i:j as a pair\n", fthresh);
	}
	esl_vec_ISet(cons_ct, msa->alen+1, 0);
	/* define ss_cons */
	  seqthresh = (int) (fthresh * msa->nseq);
	  /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/
	  for(i = 1; i <= msa->alen; i++) { 
	    for(j = i+1; j <= msa->alen; j++) { 
	      if(bp[i][j] > seqthresh) { 
		if(cons_ct[i] != 0) { 
		  esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", i, i, cons_ct[i], i, j);
		}
		if(cons_ct[j] != 0) { 
		  esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", j, j, cons_ct[j], i, j);
		}
		cons_ct[i] = j;
		cons_ct[j] = i;
	      }
	    }
	  }
	}

      /* -r: redefine consensus struct by removing any bps that conflict with individual structures */
      if(do_remove_bps) { 
	for(i = 1; i <= msa->alen; i++) { 
	  if(!(removebp[i])) {
	    cons_ct[i]          = xcons_ct[i];
	    cons_ct[cons_ct[i]] = i;
	  }
	  else {
	    printf("# Removing consensus bp: %d:%d\n", i, xcons_ct[i]);
	    cons_ct[xcons_ct[i]] = 0; 
	    cons_ct[i]           = 0; 
	  }
	}
      }
      
      /* -c:     define consensus structure as indi sequence with highest number of consistent bps with structure  OR */
      /* --indi: define consensus structure as indi sequence <x> from --indi <x> */
      if(do_consistent || do_indi2cons) {
	if(do_indi2cons) { 
	  indi = esl_opt_GetString(go, "--indi");
	  for(a = 0; a < msa->nseq; a++) { 
	    if(strcmp(indi, msa->sqname[a]) == 0) break;
	  }
	  if(a == msa->nseq) esl_fatal("ERROR, could not find a sequence named %s in the alignment.\n", indi);
	}
	else { /* do_consistent */
	  a = max_nconsistent_aidx;
	}
	if(msa->ss == NULL || msa->ss[a] == NULL) esl_fatal("ERROR, no individual SS annotation for %s in the alignment.\n", msa->sqname[a]);
	if((status = esl_wuss2ct(msa->ss[a], msa->alen, cons_ct)) != eslOK) { 
	  esl_fatal("Second pass... SS annotation for sequence %d, aln %d  is invalid.\n", (a), nali);
	}	
	printf("# Defined new SS_cons as SS annotation for %s (%d basepairs)\n", msa->sqname[a], nbpsA[a]);
	if(esl_opt_GetBoolean(go, "--rfc") || esl_opt_GetBoolean(go, "--rfindi")) {
	  if(msa->rf != NULL) { free(msa->rf); msa->rf = NULL; }
	  if((status = esl_strcat(&(msa->rf), -1, msa->aseq[a], msa->alen)) != eslOK) goto ERROR;
	  printf("# Defined new RF as %s sequence\n", msa->sqname[a]);
	}
      }
      
      /* write out alignment with new SS_cons */
      if(do_newcons) { 
	if((status = esl_ct2wuss(cons_ct, msa->alen, sscons)) != eslOK) goto ERROR;
	if(msa->ss_cons != NULL) { free(msa->ss_cons); msa->ss_cons = NULL; }
	if((status = esl_strcat(&(msa->ss_cons), -1, sscons, msa->alen)) != eslOK) goto ERROR;
	status = esl_msafile_Write(ofp, msa, (esl_opt_GetBoolean(go, "--pfam") ? eslMSAFILE_PFAM : eslMSAFILE_STOCKHOLM));
	if      (status == eslEMEM) esl_fatal("Memory error when outputting alignment\n");
	else if (status != eslOK)   esl_fatal("Writing alignment file failed with error %d\n", status);
      }
      
      free(sscons);
      free(cur_ct);
      free(cons_ct);
      free(xcons_ct);
      for(i = 1; i <= msa->alen; i++) free(bp[i]);
      free(bp);
      esl_msa_Destroy(msa);
    }
  if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile);

  /* Cleanup, normal return
   */
  if(lfp != NULL) fclose(lfp);
  if(ofp != NULL) { 
    printf("# Alignment(s) saved to file %s\n", esl_opt_GetString(go, "-o"));
    fclose(ofp);
  }
  esl_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;

 ERROR:
  if(afp) esl_msafile_Close(afp);
  if(go)  esl_getopts_Destroy(go);
  if(msa) esl_msa_Destroy(msa);
  if(lfp) fclose(lfp);
  if(ofp) fclose(ofp);
  esl_fatal("ERROR\n");
  return 1;
  
}
Exemple #17
0
static int
thread_loop(ESL_THREADS *obj, ESL_WORK_QUEUE *queue, struct cfg_s *cfg)
{
  int          status    = eslOK;
  int          sstatus   = eslOK;
  int          processed = 0;
  WORK_ITEM   *item;
  void        *newItem;

  int           next     = 1;
  PENDING_ITEM *top      = NULL;
  PENDING_ITEM *empty    = NULL;
  PENDING_ITEM *tmp      = NULL;

  char        errmsg[eslERRBUFSIZE];

  esl_workqueue_Reset(queue);
  esl_threads_WaitForStart(obj);

  status = esl_workqueue_ReaderUpdate(queue, NULL, &newItem);
  if (status != eslOK) esl_fatal("Work queue reader failed");
      
  /* Main loop: */
  item = (WORK_ITEM *) newItem;
  while (sstatus == eslOK) {
    sstatus = esl_msa_Read(cfg->afp, &item->msa);
    if (sstatus == eslOK) {
      item->nali = ++cfg->nali;
      if (set_msa_name(cfg, errmsg, item->msa) != eslOK) p7_Fail("%s\n", errmsg);
    }
    if (sstatus == eslEOF && processed < cfg->nali) sstatus = eslOK;
	  
    if (sstatus == eslOK) {
      status = esl_workqueue_ReaderUpdate(queue, item, &newItem);
      if (status != eslOK) esl_fatal("Work queue reader failed");

      /* process any results */
      item = (WORK_ITEM *) newItem;
      if (item->processed == TRUE) {
	++processed;

	/* try to keep the input output order the same */
	if (item->nali == next) {
	  sstatus = output_result(cfg, errmsg, item->nali, item->msa, item->hmm, item->postmsa, item->entropy);
	  if (sstatus != eslOK) p7_Fail(errmsg);

	  p7_hmm_Destroy(item->hmm);
	  esl_msa_Destroy(item->msa);
	  esl_msa_Destroy(item->postmsa);

	  ++next;

	  /* output any pending msa as long as the order
	   * remains the same as read in.
	   */
	  while (top != NULL && top->nali == next) {
	    sstatus = output_result(cfg, errmsg, top->nali, top->msa, top->hmm, top->postmsa, top->entropy);
	    if (sstatus != eslOK) p7_Fail(errmsg);

	    p7_hmm_Destroy(top->hmm);
	    esl_msa_Destroy(top->msa);
	    esl_msa_Destroy(top->postmsa);

	    tmp = top;
	    top = tmp->next;

	    tmp->next = empty;
	    empty     = tmp;
	    
	    ++next;
	  }
	} else {
	  /* queue up the msa so the sequence order is the same in
	   * the .sto and .hmm
	   */
	  if (empty != NULL) {
	    tmp   = empty;
	    empty = tmp->next;
	  } else {
	    ESL_ALLOC(tmp, sizeof(PENDING_ITEM));
	  }

	  tmp->nali     = item->nali;
	  tmp->hmm      = item->hmm;
	  tmp->msa      = item->msa;
	  tmp->postmsa  = item->postmsa;
	  tmp->entropy  = item->entropy;

	  /* add the msa to the pending list */
	  if (top == NULL || tmp->nali < top->nali) {
	    tmp->next = top;
	    top       = tmp;
	  } else {
	    PENDING_ITEM *ptr = top;
	    while (ptr->next != NULL && tmp->nali > ptr->next->nali) {
	      ptr = ptr->next;
	    }
	    tmp->next = ptr->next;
	    ptr->next = tmp;
	  }
	}

	item->nali      = 0;
	item->processed = FALSE;
	item->hmm       = NULL;
	item->msa       = NULL;
	item->postmsa   = NULL;
	item->entropy   = 0.0;
      }
    }
  }

  if (top != NULL) esl_fatal("Top is not empty\n");

  while (empty != NULL) {
    tmp   = empty;
    empty = tmp->next;
    free(tmp);
  }

  status = esl_workqueue_ReaderUpdate(queue, item, NULL);
  if (status != eslOK) esl_fatal("Work queue reader failed");

  if (sstatus == eslEOF)
    {
      /* wait for all the threads to complete */
      esl_threads_WaitForFinish(obj);
      esl_workqueue_Complete(queue);  
    }

  return sstatus;

 ERROR:
  return eslEMEM;
}
Exemple #18
0
static void
mpi_worker(const ESL_GETOPTS *go, struct cfg_s *cfg)
{
  int           xstatus = eslOK;
  int           status;
  int           type;
  P7_BUILDER   *bld         = NULL;
  ESL_MSA      *msa         = NULL;
  ESL_MSA      *postmsa     = NULL;
  ESL_MSA     **postmsa_ptr = (cfg->postmsafile != NULL) ? &postmsa : NULL;
  P7_HMM       *hmm         = NULL;
  P7_BG        *bg          = NULL;
  char         *wbuf        = NULL;	/* packed send/recv buffer  */
  void         *tmp;			/* for reallocation of wbuf */
  int           wn          = 0;	/* allocation size for wbuf */
  int           sz, n;		        /* size of a packed message */
  int           pos;
  char          errmsg[eslERRBUFSIZE];

  /* After master initialization: master broadcasts its status.
   */
  MPI_Bcast(&xstatus, 1, MPI_INT, 0, MPI_COMM_WORLD);
  if (xstatus != eslOK) return; /* master saw an error code; workers do an immediate normal shutdown. */
  ESL_DPRINTF2(("worker %d: sees that master has initialized\n", cfg->my_rank));
  
  /* Master now broadcasts worker initialization information (alphabet type) 
   * Workers returns their status post-initialization.
   * Initial allocation of wbuf must be large enough to guarantee that
   * we can pack an error result into it, because after initialization,
   * errors will be returned as packed (code, errmsg) messages.
   */
  MPI_Bcast(&type, 1, MPI_INT, 0, MPI_COMM_WORLD);
  if (xstatus == eslOK) { if ((cfg->abc = esl_alphabet_Create(type))      == NULL)    xstatus = eslEMEM; }
  if (xstatus == eslOK) { wn = 4096;  if ((wbuf = malloc(wn * sizeof(char))) == NULL) xstatus = eslEMEM; }
  if (xstatus == eslOK) { if ((bld = p7_builder_Create(go, cfg->abc))     == NULL)    xstatus = eslEMEM; }
  MPI_Reduce(&xstatus, &status, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD); /* everyone sends xstatus back to master */
  if (xstatus != eslOK) {
    if (wbuf != NULL) free(wbuf);
    if (bld  != NULL) p7_builder_Destroy(bld);
    return; /* shutdown; we passed the error back for the master to deal with. */
  }

  bg = p7_bg_Create(cfg->abc);

  ESL_DPRINTF2(("worker %d: initialized\n", cfg->my_rank));

                      /* source = 0 (master); tag = 0 */
  while (esl_msa_MPIRecv(0, 0, MPI_COMM_WORLD, cfg->abc, &wbuf, &wn, &msa) == eslOK) 
    {
      /* Build the HMM */
      ESL_DPRINTF2(("worker %d: has received MSA %s (%d columns, %d seqs)\n", cfg->my_rank, msa->name, msa->alen, msa->nseq));
      if ((status = p7_Builder(bld, msa, bg, &hmm, NULL, NULL, NULL, postmsa_ptr)) != eslOK) { strcpy(errmsg, bld->errbuf); goto ERROR; }
      ESL_DPRINTF2(("worker %d: has produced an HMM %s\n", cfg->my_rank, hmm->name));

      /* Calculate upper bound on size of sending status, HMM, and optional postmsa; make sure wbuf can hold it. */
      n = 0;
      if (MPI_Pack_size(1,    MPI_INT, MPI_COMM_WORLD, &sz) != 0)     goto ERROR;   n += sz;
      if (p7_hmm_MPIPackSize( hmm,     MPI_COMM_WORLD, &sz) != eslOK) goto ERROR;   n += sz;
      if (esl_msa_MPIPackSize(postmsa, MPI_COMM_WORLD, &sz) != eslOK) goto ERROR;   n += sz;
      if (n > wn) { ESL_RALLOC(wbuf, tmp, sizeof(char) * n); wn = n; }
      ESL_DPRINTF2(("worker %d: has calculated that HMM will pack into %d bytes\n", cfg->my_rank, n));

      /* Send status, HMM, and optional postmsa back to the master */
      pos = 0;
      if (MPI_Pack       (&status, 1, MPI_INT, wbuf, wn, &pos, MPI_COMM_WORLD) != 0)     goto ERROR;
      if (p7_hmm_MPIPack (hmm,                 wbuf, wn, &pos, MPI_COMM_WORLD) != eslOK) goto ERROR;
      if (esl_msa_MPIPack(postmsa,             wbuf, wn, &pos, MPI_COMM_WORLD) != eslOK) goto ERROR;
      MPI_Send(wbuf, pos, MPI_PACKED, 0, 0, MPI_COMM_WORLD);
      ESL_DPRINTF2(("worker %d: has sent HMM to master in message of %d bytes\n", cfg->my_rank, pos));

      esl_msa_Destroy(msa);     msa     = NULL;
      esl_msa_Destroy(postmsa); postmsa = NULL;
      p7_hmm_Destroy(hmm);      hmm     = NULL;
    }

  if (wbuf != NULL) free(wbuf);
  p7_builder_Destroy(bld);
  return;

 ERROR:
  ESL_DPRINTF2(("worker %d: fails, is sending an error message, as follows:\n%s\n", cfg->my_rank, errmsg));
  pos = 0;
  MPI_Pack(&status, 1,                MPI_INT,  wbuf, wn, &pos, MPI_COMM_WORLD);
  MPI_Pack(errmsg,  eslERRBUFSIZE,    MPI_CHAR, wbuf, wn, &pos, MPI_COMM_WORLD);
  MPI_Send(wbuf, pos, MPI_PACKED, 0, 0, MPI_COMM_WORLD);
  if (wbuf != NULL) free(wbuf);
  if (msa  != NULL) esl_msa_Destroy(msa);
  if (hmm  != NULL) p7_hmm_Destroy(hmm);
  if (bld  != NULL) p7_builder_Destroy(bld);
  return;
}
Exemple #19
0
/* mpi_master()
 * The MPI version of hmmbuild.
 * Follows standard pattern for a master/worker load-balanced MPI program (J1/78-79).
 * 
 * A master can only return if it's successful. 
 * Errors in an MPI master come in two classes: recoverable and nonrecoverable.
 * 
 * Recoverable errors include all worker-side errors, and any
 * master-side error that do not affect MPI communication. Error
 * messages from recoverable messages are delayed until we've cleanly
 * shut down the workers.
 * 
 * Unrecoverable errors are master-side errors that may affect MPI
 * communication, meaning we cannot count on being able to reach the
 * workers and shut them down. Unrecoverable errors result in immediate
 * p7_Fail()'s, which will cause MPI to shut down the worker processes
 * uncleanly.
 */
static void
mpi_master(const ESL_GETOPTS *go, struct cfg_s *cfg)
{
  int         xstatus       = eslOK;	/* changes from OK on recoverable error */
  int         status;
  int         have_work     = TRUE;	/* TRUE while alignments remain  */
  int         nproc_working = 0;	        /* number of worker processes working, up to nproc-1 */
  int         wi;          	        /* rank of next worker to get an alignment to work on */
  char       *buf           = NULL;	/* input/output buffer, for packed MPI messages */
  int         bn            = 0;
  ESL_MSA    *msa           = NULL;
  P7_HMM     *hmm           = NULL;
  P7_BG      *bg            = NULL;
  ESL_MSA   **msalist       = NULL;
  ESL_MSA    *postmsa       = NULL;
  int        *msaidx        = NULL;
  char        errmsg[eslERRBUFSIZE];
  MPI_Status  mpistatus; 
  int         n;
  int         pos;

  double      entropy;
  
  /* Master initialization: including, figure out the alphabet type.
   * If any failure occurs, delay printing error message until we've shut down workers.
   */
  if (xstatus == eslOK) { if ((status = init_master_cfg(go, cfg, errmsg)) != eslOK) xstatus = status; }
  if (xstatus == eslOK) { bn = 4096; if ((buf = malloc(sizeof(char) * bn)) == NULL) { sprintf(errmsg, "allocation failed"); xstatus = eslEMEM; } }
  if (xstatus == eslOK) { if ((msalist = malloc(sizeof(ESL_MSA *) * cfg->nproc)) == NULL) { sprintf(errmsg, "allocation failed"); xstatus = eslEMEM; } }
  if (xstatus == eslOK) { if ((msaidx  = malloc(sizeof(int)       * cfg->nproc)) == NULL) { sprintf(errmsg, "allocation failed"); xstatus = eslEMEM; } }
  MPI_Bcast(&xstatus, 1, MPI_INT, 0, MPI_COMM_WORLD);
  if (xstatus != eslOK) {  MPI_Finalize(); p7_Fail(errmsg); }
  ESL_DPRINTF1(("MPI master is initialized\n"));

  bg = p7_bg_Create(cfg->abc);

  for (wi = 0; wi < cfg->nproc; wi++) { msalist[wi] = NULL; msaidx[wi] = 0; } 

  /* Worker initialization:
   * Because we've already successfully initialized the master before we start
   * initializing the workers, we don't expect worker initialization to fail;
   * so we just receive a quick OK/error code reply from each worker to be sure,
   * and don't worry about an informative message. 
   */
  MPI_Bcast(&(cfg->abc->type), 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Reduce(&xstatus, &status, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
  if (status != eslOK) { MPI_Finalize(); p7_Fail("One or more MPI worker processes failed to initialize."); }
  ESL_DPRINTF1(("%d workers are initialized\n", cfg->nproc-1));


  /* Main loop: combining load workers, send/receive, clear workers loops;
   * also, catch error states and die later, after clean shutdown of workers.
   * 
   * When a recoverable error occurs, have_work = FALSE, xstatus !=
   * eslOK, and errmsg is set to an informative message. No more
   * errmsg's can be received after the first one. We wait for all the
   * workers to clear their work units, then send them shutdown signals,
   * then finally print our errmsg and exit.
   * 
   * Unrecoverable errors just crash us out with p7_Fail().
   */
  wi = 1;
  while (have_work || nproc_working)
    {
      if (have_work) 
	{
	  if ((status = esl_msa_Read(cfg->afp, &msa)) == eslOK) 
	    {
	      cfg->nali++;  
	      ESL_DPRINTF1(("MPI master read MSA %s\n", msa->name == NULL? "" : msa->name));
	    }
	  else 
	    {
	      have_work = FALSE;
	      if      (status == eslEFORMAT)  { xstatus = eslEFORMAT; snprintf(errmsg, eslERRBUFSIZE, "Alignment file parse error:\n%s\n", cfg->afp->errbuf); }
	      else if (status == eslEINVAL)   { xstatus = eslEFORMAT; snprintf(errmsg, eslERRBUFSIZE, "Alignment file parse error:\n%s\n", cfg->afp->errbuf); }
	      else if (status != eslEOF)      { xstatus = status;     snprintf(errmsg, eslERRBUFSIZE, "Alignment file read unexpectedly failed with code %d\n", status); }
	      ESL_DPRINTF1(("MPI master has run out of MSAs (having read %d)\n", cfg->nali));
	    } 
	}

      if ((have_work && nproc_working == cfg->nproc-1) || (!have_work && nproc_working > 0))
	{
	  if (MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &mpistatus) != 0) { MPI_Finalize(); p7_Fail("mpi probe failed"); }
	  if (MPI_Get_count(&mpistatus, MPI_PACKED, &n)                != 0) { MPI_Finalize(); p7_Fail("mpi get count failed"); }
	  wi = mpistatus.MPI_SOURCE;
	  ESL_DPRINTF1(("MPI master sees a result of %d bytes from worker %d\n", n, wi));

	  if (n > bn) {
	    if ((buf = realloc(buf, sizeof(char) * n)) == NULL) p7_Fail("reallocation failed");
	    bn = n; 
	  }
	  if (MPI_Recv(buf, bn, MPI_PACKED, wi, 0, MPI_COMM_WORLD, &mpistatus) != 0) { MPI_Finalize(); p7_Fail("mpi recv failed"); }
	  ESL_DPRINTF1(("MPI master has received the buffer\n"));

	  /* If we're in a recoverable error state, we're only clearing worker results;
           * just receive them, don't unpack them or print them.
           * But if our xstatus is OK, go ahead and process the result buffer.
	   */
	  if (xstatus == eslOK)	
	    {
	      pos = 0;
	      if (MPI_Unpack(buf, bn, &pos, &xstatus, 1, MPI_INT, MPI_COMM_WORLD)     != 0) { MPI_Finalize();  p7_Fail("mpi unpack failed");}
	      if (xstatus == eslOK) /* worker reported success. Get the HMM. */
		{
		  ESL_DPRINTF1(("MPI master sees that the result buffer contains an HMM\n"));
		  if (p7_hmm_MPIUnpack(buf, bn, &pos, MPI_COMM_WORLD, &(cfg->abc), &hmm) != eslOK) {  MPI_Finalize(); p7_Fail("HMM unpack failed"); }
		  ESL_DPRINTF1(("MPI master has unpacked the HMM\n"));

		  if (cfg->postmsafile != NULL) {
		    if (esl_msa_MPIUnpack(cfg->abc, buf, bn, &pos, MPI_COMM_WORLD, &postmsa) != eslOK) { MPI_Finalize(); p7_Fail("postmsa unpack failed");}
		  } 

		  entropy = p7_MeanMatchRelativeEntropy(hmm, bg);
		  if ((status = output_result(cfg, errmsg, msaidx[wi], msalist[wi], hmm, postmsa, entropy)) != eslOK) xstatus = status;

		  esl_msa_Destroy(postmsa); postmsa = NULL;
		  p7_hmm_Destroy(hmm);      hmm     = NULL;
		}
	      else	/* worker reported an error. Get the errmsg. */
		{
		  if (MPI_Unpack(buf, bn, &pos, errmsg, eslERRBUFSIZE, MPI_CHAR, MPI_COMM_WORLD) != 0) { MPI_Finalize(); p7_Fail("mpi unpack of errmsg failed"); }
		  ESL_DPRINTF1(("MPI master sees that the result buffer contains an error message\n"));
		}
	    }
	  esl_msa_Destroy(msalist[wi]);
	  msalist[wi] = NULL;
	  msaidx[wi]  = 0;
	  nproc_working--;
	}

      if (have_work)
	{   
	  ESL_DPRINTF1(("MPI master is sending MSA %s to worker %d\n", msa->name == NULL ? "":msa->name, wi));
	  if (esl_msa_MPISend(msa, wi, 0, MPI_COMM_WORLD, &buf, &bn) != eslOK) p7_Fail("MPI msa send failed");
	  msalist[wi] = msa;
	  msaidx[wi]  = cfg->nali; /* 1..N for N alignments in the MSA database */
	  msa = NULL;
	  wi++;
	  nproc_working++;
	}
    }
  
  /* On success or recoverable errors:
   * Shut down workers cleanly. 
   */
  ESL_DPRINTF1(("MPI master is done. Shutting down all the workers cleanly\n"));
  for (wi = 1; wi < cfg->nproc; wi++) 
    if (esl_msa_MPISend(NULL, wi, 0, MPI_COMM_WORLD, &buf, &bn) != eslOK) p7_Fail("MPI msa send failed");

  free(buf);
  free(msaidx);
  free(msalist);
  p7_bg_Destroy(bg);

  if (xstatus != eslOK) { MPI_Finalize(); p7_Fail(errmsg); }
  else                  return;
}
Exemple #20
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS *go;		/* application configuration       */
  int          kstatus, tstatus;/* return code from Easel routine  */
  int          fmt;		/* expected format of kfile, tfile */
  char        *kfile, *tfile;   /* known, test structure file      */
  ESL_MSAFILE *kfp, *tfp;       /* open kfile, tfile               */
  ESL_MSA     *ka,  *ta; 	/* known, trusted alignment        */
  int64_t      klen, tlen;	/* lengths of dealigned seqs       */
  int          i;		/* counter over sequences          */
  int          apos;		/* counter over alignment columns  */
  int          rfpos;		/* counter over consensus (non-gap RF) columns  */
  int       is_rfpos;            /* TRUE if current apos is a consensus pos, FALSE if not */
  int          uapos;		/* counter over unaligned residue positions */
  int          nali;            /* number of alignment we're on in each file */

  int        **kp;              /* [0..i..nseq-1][1..r..sq->n] = x known non-gap RF position of residue r in sequence i */
  int        **tp;              /* [0..i..nseq-1][1..r..sq->n] = x predicted non-gap RF position of residue r in sequence i */
  /* for both kp and pp, if x <= 0, residue r for seq i is not aligned to a non-gap RF position, but rather as an 'insert'
   * after non-gap RF position (x * -1) 
   */
  int        *km_pos;          /* [0..rflen] = x, in known aln,     number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */
  int        *ki_pos;          /* [0..rflen] = x, in known aln,     number of residues inserted after non-gap RF column x */
  int        *tm_pos;          /* [0..rflen] = x, in predicted aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */
  int        *ti_pos;          /* [0..rflen] = x, in predicted aln, number of residues inserted after non-gap RF column x */
  int    *cor_tm_pos;          /* [0..rflen] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF column x; special case: mct[0] = 0 */
  int    *cor_ti_pos;          /* [0..rflen] = x, in predicted aln, number of correctly predicted residues inserted after non-gap RF column x */

  int        *km_seq;          /* [0..i..nseq-1] = x, in known aln,     number of residues aligned to non-gap RF columns in seq i; */
  int        *ki_seq;          /* [0..i..nseq-1] = x, in known aln,     number of residues inserted in seq i */
  int        *tm_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of residues aligned to non-gap RF columns in seq i; */
  int        *ti_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of residues inserted in seq i */
  int    *cor_tm_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF columns in seq i */
  int    *cor_ti_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues inserted in seq i */

  int     *seqlen;             /* [0..i..nseq-1] = x, unaligned seq i has length x */
  ESL_ALPHABET *abc = NULL;    /* alphabet for all alignments */
  int      rflen, t_rflen;     /* non-gap RF length (consensus lengths) */
  int   status;
  char *namedashes;
  int ni;
  int namewidth = 8; /* length of 'seq name' */
  int cor_tm, cor_ti, km, ki; /* correct predicted match, correct predicted insert, total match, total insert */
  char *mask = NULL;
  int masklen;
  ESL_DSQ *ks;
  ESL_DSQ *ts;
  FILE *dfp = NULL; /* for --c2dfile */

  /* variables needed for -p and related options */
  int do_post = FALSE; /* TRUE if -p enabled */
  int do_post_for_this_rfpos = FALSE; /* set for each consensus position, always TRUE unless --mask-p2xm */
  int p;               /* counter over integerized posteriors */
  int *ptm = NULL;     /* [0..p..10] number of total   matches with posterior value p (10="*")*/
  int *pti = NULL;     /* [0..p..10] number of total   inserts with posterior value p */
  int *cor_ptm = NULL; /* [0..p..10] number of correct matches with posterior value p */
  int *cor_pti = NULL; /* [0..p..10] number of correct inserts with posterior value p */
  int npostvals = 11;  /* number of posterior values 0-9, * */
  int ppidx;           /* index of PP */
  char ppchars[11] = "0123456789*";
  int cm_cor_ptm, cm_cor_pti, cm_ptm, cm_pti, cm_incor_ptm, cm_incor_pti; /* cumulative counts of posteriors */
  // int tot_cor_ptm, tot_cor_pti, tot_ptm, tot_pti;       /* total counts of posteriors */
  // int tot_incor_ptm,tot_incor_pti;                      // SRE: commented out; don't seem to be used; need to silence compiler warning
  char errbuf[eslERRBUFSIZE];

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\n where options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      exit(EXIT_SUCCESS);
    }

  if (esl_opt_ArgNumber(go) != 2) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  kfile = esl_opt_GetArg(go, 1);
  tfile = esl_opt_GetArg(go, 2);
  
  fmt = eslMSAFILE_STOCKHOLM;

  /***********************************************
   * Open the two Stockholm files.
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);

  if ( (kstatus = esl_msafile_Open(&abc, kfile, NULL, fmt, NULL, &kfp)) != eslOK) esl_msafile_OpenFailure(kfp, kstatus);
  if ( (tstatus = esl_msafile_Open(&abc, tfile, NULL, fmt, NULL, &tfp)) != eslOK) esl_msafile_OpenFailure(tfp, tstatus);

  do_post = esl_opt_GetBoolean(go, "-p");

  /* read the mask file if --p-mask is enabled */
  if(! esl_opt_IsDefault(go, "--p-mask")) { 
    if((status = read_mask_file(esl_opt_GetString(go, "--p-mask"), errbuf, &mask, &masklen)) != eslOK) esl_fatal(errbuf);
  }
  /* open the c2dfile for output, if nec */
  if (esl_opt_IsOn(go, "--c2dfile")) { 
    if ((dfp = fopen(esl_opt_GetString(go, "--c2dfile"), "w")) == NULL) esl_fatal("Failed to open --c2dfile output file %s\n", esl_opt_GetString(go, "--c2dfile"));
  }

  /***********************************************
   * Do alignment comparisons, one seq at a time;
   * this means looping over all seqs in all alignments.
   ***********************************************/
  nali = 0;
  while ( (kstatus = esl_msafile_Read(kfp, &ka)) != eslEOF)
    {
      if (  kstatus                               != eslOK) esl_msafile_ReadFailure(kfp, kstatus);
      if ( (tstatus = esl_msafile_Read(tfp, &ta)) != eslOK) esl_msafile_ReadFailure(tfp, tstatus);

      nali++;
      if((nali > 1) && (esl_opt_IsOn(go, "--c2dfile"))) esl_fatal("--c2dfile is only meant for msafiles with single alignments"); 

      /* Sanity check on alignment
       */
      if (ka->nseq != ta->nseq)
	esl_fatal("trusted, test alignments don't have same seq #\n");
      if (ka->rf == NULL)
	esl_fatal("trusted alignment has no reference annotation\n");
      if (ta->rf == NULL)
	esl_fatal("test alignment has no reference annotation\n");

      /* make sure the sequences are all identical */
      ESL_ALLOC(seqlen, sizeof(int) * ka->nseq);
      for(i = 0; i < ka->nseq; i++) { 
	if(strcmp(ka->sqname[i], ta->sqname[i]) != 0) esl_fatal("sequence %d of trusted alignment %s has different name than seq %d of predicted alignment %s\n", (i+1), ka->sqname[i], (i+1), ta->sqname[i]); 
	ESL_ALLOC(ks, sizeof(ESL_DSQ) * (ka->alen+2));
	memcpy(ks, ka->ax[i], (ka->alen+2) * sizeof(ESL_DSQ));
	esl_abc_XDealign(ka->abc, ks, ka->ax[i], &klen);

	ESL_ALLOC(ts, sizeof(ESL_DSQ) * (ta->alen+2));
	memcpy(ts, ta->ax[i], (ta->alen+2) * sizeof(ESL_DSQ));
	esl_abc_XDealign(ta->abc, ts, ta->ax[i], &tlen);

	if (tlen != klen)
	  esl_fatal("dealigned sequence mismatch, seq %d, when dealigned, is %d residues in the known alignment, but %d residues in the trusted alignment.", (i+1), klen, tlen);

	if (memcmp(ks, ts, sizeof(ESL_DSQ) * klen) != 0) 
	  esl_fatal("dealigned sequence mismatch, seq %d %s, when dealigned, are not identical.", (i+1), ka->sqname[i]);

	seqlen[i] = tlen;
	free(ks);
	free(ts);
      }

      /* determine non-gap RF length */
      rflen = 0;
      for(apos = 1; apos <= ka->alen; apos++) { 
	if((! esl_abc_CIsGap    (ka->abc, ka->rf[apos-1])) && 
	   (! esl_abc_CIsMissing(ka->abc, ka->rf[apos-1]))) rflen++;
      }
      t_rflen = 0;
      for(apos = 1; apos <= ta->alen; apos++) { 
	if((! esl_abc_CIsGap       (ta->abc, ta->rf[apos-1])) && 
	   (! esl_abc_CIsMissing   (ta->abc, ta->rf[apos-1]))) t_rflen++;
      }
      if(t_rflen != rflen) esl_fatal("Trusted alignment non-gap RF length (%d) != predicted alignment non-gap RF length (%d).\n", rflen, t_rflen);

      /* if -p, make sure the test alignment has posterior probabilities, and allocate our counters for correct/incorrect per post value */
      if(do_post) { 
	if(! esl_opt_IsDefault(go, "--p-mask")) {
	  if(masklen != rflen) { 
	    esl_fatal("Length of mask in %s (%d) not equal to non-gap RF len of alignments (%d)\n", esl_opt_GetString(go, "--p-mask"), masklen, rflen);
	  }
	}
	if(ta->pp == NULL) esl_fatal("-p requires \"#=GR PP\" annotation in the test alignment, but none exists");
	ESL_ALLOC(ptm,     sizeof(int) * npostvals);
	ESL_ALLOC(pti,     sizeof(int) * npostvals);
	ESL_ALLOC(cor_ptm, sizeof(int) * npostvals);
	ESL_ALLOC(cor_pti, sizeof(int) * npostvals);
	esl_vec_ISet(ptm, npostvals, 0);
	esl_vec_ISet(pti, npostvals, 0);
	esl_vec_ISet(cor_ptm, npostvals, 0);
	esl_vec_ISet(cor_pti, npostvals, 0);
      }

      /* allocate and initialize our counters */
      ESL_ALLOC(kp, sizeof(int *) * ka->nseq);
      ESL_ALLOC(tp, sizeof(int *) * ta->nseq);
      for(i = 0; i < ka->nseq; i++) { 
	ESL_ALLOC(kp[i], sizeof(int) * (seqlen[i]+1));
	ESL_ALLOC(tp[i], sizeof(int) * (seqlen[i]+1));
	esl_vec_ISet(kp[i], seqlen[i]+1, -987654321);
	esl_vec_ISet(tp[i], seqlen[i]+1, -987654321);
      }

      ESL_ALLOC(km_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(ki_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(tm_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(ti_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(cor_tm_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(cor_ti_pos, sizeof(int) * (rflen+1));
      esl_vec_ISet(km_pos, rflen+1, 0);
      esl_vec_ISet(ki_pos, rflen+1, 0);
      esl_vec_ISet(tm_pos, rflen+1, 0);
      esl_vec_ISet(ti_pos, rflen+1, 0);
      esl_vec_ISet(cor_tm_pos, rflen+1, 0);
      esl_vec_ISet(cor_ti_pos, rflen+1, 0);

      ESL_ALLOC(km_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(ki_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(tm_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(ti_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(cor_tm_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(cor_ti_seq, sizeof(int) * ka->nseq);
      esl_vec_ISet(km_seq, ka->nseq, 0);
      esl_vec_ISet(ki_seq, ka->nseq, 0);
      esl_vec_ISet(tm_seq, ka->nseq, 0);
      esl_vec_ISet(ti_seq, ka->nseq, 0);
      esl_vec_ISet(cor_tm_seq, ka->nseq, 0);
      esl_vec_ISet(cor_ti_seq, ka->nseq, 0);

      /* determine non-gap RF location of each residue in known alignment */
      for(i = 0; i < ka->nseq; i++) { 
	uapos = rfpos = 0;
	for(apos = 1; apos <= ka->alen; apos++) { 
	  is_rfpos = FALSE;
	  if((! esl_abc_CIsGap       (ka->abc, ka->rf[apos-1])) &&
	     (! esl_abc_CIsMissing   (ka->abc, ka->rf[apos-1]))) { 
	    rfpos++; is_rfpos = TRUE;
	  }
	  if(esl_abc_XIsResidue(ka->abc, ka->ax[i][apos])) { 
	    uapos++;
	    kp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos);
	    if(is_rfpos) { km_pos[rfpos]++; km_seq[i]++; }
	    else         { ki_pos[rfpos]++; ki_seq[i]++; }
	  }
	}
      }

      /* determine non-gap RF location of each residue in predicted alignment */
      for(i = 0; i < ta->nseq; i++) { 
	uapos = rfpos = 0;
	for(apos = 1; apos <= ta->alen; apos++) { 
	  is_rfpos = FALSE;
	  if((! esl_abc_CIsGap       (abc, ta->rf[apos-1])) && 
	     (! esl_abc_CIsMissing   (abc, ta->rf[apos-1]))) { 
	    rfpos++; is_rfpos = TRUE;
	    if(do_post) { 
	      do_post_for_this_rfpos = (mask != NULL && mask[rfpos-1] == '0') ? FALSE : TRUE;
	    }
	  }
	  if(esl_abc_XIsResidue(ta->abc, ta->ax[i][apos])) { 
	    uapos++;
	    tp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos);
	    if(do_post) { 
	      if(esl_abc_CIsGap(abc, ta->pp[i][(apos-1)])) esl_fatal("gap PP value for nongap residue: ali: %d seq: %d apos: %d\n", nali, i, apos);
	      ppidx = get_pp_idx(abc, ta->pp[i][(apos-1)]);
	      if(ppidx == -1) esl_fatal("unrecognized PP value (%c) for nongap residue: ali: %d seq: %d apos: %d\n", ta->pp[i][(apos-1)], nali, i, apos);
	    }
	    if(is_rfpos) { 
	      tm_pos[rfpos]++; tm_seq[i]++; 
	      if(do_post_for_this_rfpos) ptm[ppidx]++;
	    }
	    else { 
	      ti_pos[rfpos]++; ti_seq[i]++; 
	      if(do_post) pti[ppidx]++;
	    }
	    if(kp[i][uapos] == tp[i][uapos]) { /* correctly predicted this residue */
	      if(is_rfpos) { 
		cor_tm_seq[i]++; cor_tm_pos[rfpos]++; 
		if(do_post_for_this_rfpos) cor_ptm[ppidx]++;
	      } 
	      else {
		cor_ti_seq[i]++; cor_ti_pos[rfpos]++; 
		if(do_post) cor_pti[ppidx]++;
	      } 
	    }
	  }
	}
      }
      if((! (esl_opt_GetBoolean(go, "-c"))) && (! esl_opt_GetBoolean(go, "-p"))) { 
	/* print per sequence statistics */
	/* determine the longest name in msa */
	for(ni = 0; ni < ka->nseq; ni++) namewidth = ESL_MAX(namewidth, strlen(ka->sqname[ni]));
	ESL_ALLOC(namedashes, sizeof(char) * namewidth+1);
	namedashes[namewidth] = '\0';
	for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-';
	
	printf("# %-*s  %6s  %28s  %28s  %28s\n", namewidth, "seq name", "len",    "match columns", "insert columns", "all columns");
	printf("# %-*s  %6s  %28s  %28s  %28s\n", namewidth, namedashes, "------", "----------------------------", "----------------------------", "----------------------------");
	for(i = 0; i < ta->nseq; i++) { 
	  printf("  %-*s  %6d  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)\n", namewidth, ka->sqname[i], seqlen[i],
		 cor_tm_seq[i], km_seq[i], (km_seq[i] == 0) ? 0. : ((float) cor_tm_seq[i] / (float) km_seq[i]), 
		 cor_ti_seq[i], ki_seq[i], (ki_seq[i] == 0) ? 0. : ((float) cor_ti_seq[i] / (float) ki_seq[i]), 
		 (cor_tm_seq[i] + cor_ti_seq[i]), (km_seq[i] + ki_seq[i]), ((float) (cor_tm_seq[i] + cor_ti_seq[i]) / ((float) km_seq[i] + ki_seq[i]))); 
	}
	cor_tm = esl_vec_ISum(cor_tm_seq, ka->nseq);
	cor_ti = esl_vec_ISum(cor_ti_seq, ka->nseq);
	km = esl_vec_ISum(km_seq, ka->nseq);
	ki = esl_vec_ISum(ki_seq, ka->nseq);
	
	printf("# %-*s  %6s  %28s  %28s  %28s\n", namewidth, namedashes, "-----", "----------------------------", "----------------------------", "----------------------------");
	printf("# %-*s  %6s  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)\n",
	       namewidth, "*all*", "-", 
	       cor_tm, km, ((float) cor_tm / (float) km), 
	       cor_ti, ki, ((float) cor_ti / (float) ki), 
	       (cor_tm+cor_ti), (km+ki), (((float) (cor_tm + cor_ti))/ ((float) (km + ki)))); 
	free(namedashes);
	for(i = 0; i < ka->nseq; i++) { 
	  free(kp[i]); 
	  free(tp[i]); 
	}
      }
      else if(esl_opt_GetBoolean(go, "-c")) { /* print per column statistics */
	printf("# %5s  %20s  %20s  %20s\n", "rfpos", "match", "insert", "both");
	printf("# %5s  %20s  %20s  %20s\n", "-----", "--------------------", "--------------------", "--------------------");
	for(rfpos = 0; rfpos <= rflen; rfpos++) { 
	  printf("  %5d  %4d / %4d  (%.3f)  %4d / %4d  (%.3f)  %4d / %4d  (%.3f)\n", rfpos, 
		 
		 cor_tm_pos[rfpos], km_pos[rfpos], (km_pos[rfpos] == 0) ? 0. : ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), 
		 cor_ti_pos[rfpos], ki_pos[rfpos], (ki_pos[rfpos] == 0) ? 0. : ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), 
		 (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]), (km_pos[rfpos] + ki_pos[rfpos]), ((float) (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]) / ((float) km_pos[rfpos] + ki_pos[rfpos]))); 
	}
      }
      else if(do_post) { /* do posterior output */
	if(mask == NULL) { 
	  printf("# %2s  %29s  %29s\n", "",   "      match columns          ", "      insert columns         ");
	  printf("# %2s  %29s  %29s\n", "",   "-----------------------------", "-----------------------------") ;
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "PP", "ncorrect", "ntotal",   "fractcor",  "ncorrect", "ntotal",   "fractcor");
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------");
	}
	else { 
	  printf("# %2s  %29s  %29s\n", "",   " match columns within mask   ", "      insert columns         ");
	  printf("# %2s  %29s  %29s\n", "",   "-----------------------------", "-----------------------------") ;
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "PP", "ncorrect", "ntotal",   "fractcor",  "ncorrect", "ntotal",   "fractcor");
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------");
	}
	cm_ptm = cm_pti = cm_cor_ptm = cm_cor_pti = cm_incor_ptm = cm_incor_pti = 0;
	//tot_ptm = esl_vec_ISum(ptm, npostvals);
	//tot_pti = esl_vec_ISum(pti, npostvals);
	//tot_cor_ptm = esl_vec_ISum(cor_ptm, npostvals);
	//tot_cor_pti = esl_vec_ISum(cor_pti, npostvals);
	//tot_incor_ptm = tot_ptm - tot_cor_ptm;
	//tot_incor_pti = tot_pti - tot_cor_pti;
	for(p = (npostvals-1); p >= 0; p--) { 
	  cm_cor_ptm += cor_ptm[p];
	  cm_cor_pti += cor_pti[p];
	  cm_ptm     += ptm[p];
	  cm_pti     += pti[p];
	  cm_incor_ptm += ptm[p] - cor_ptm[p];
	  cm_incor_pti += pti[p] - cor_pti[p];
	  printf("  %2c  %8d / %8d (%.5f)  %8d / %8d (%.5f)\n", 
		 ppchars[p], cor_ptm[p], ptm[p], 
		 (ptm[p] == 0) ? 0. : (float) cor_ptm[p] / (float) ptm[p], 
		 cor_pti[p], pti[p], 
		 (pti[p] == 0) ? 0. : (float) cor_pti[p] / (float) pti[p]);
	}
      }

      /* handle --c2dfile */
      if (dfp != NULL) { 
	/* match stats, 4 fields, CMYK color values */
	for(rfpos = 1; rfpos <= rflen; rfpos++) { 
	  if(km_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.);
	  }
	  else { 
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 
		    0., /* cyan */
		    1. - ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), /* magenta, fraction incorrect */
		    1. - ((float) km_pos[rfpos] / ta->nseq), /* yellow, 1 - fraction of seqs with residue in column */
		    0.);
	  }		 
	}	
	fprintf(dfp, "//\n");
	/* insert stats, 4 fields, CMYK color values */
	rfpos = 0; /* special case, combine insert posn 0 and 1 together */
	if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */
	  fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.);
	}
	else { 
	  fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 
		  0., /* cyan */
		  1. - ((float) (cor_ti_pos[0] + cor_ti_pos[1]) / ((float) (ki_pos[0] + ki_pos[1]))), /* magenta, fraction correct */
		  0.,
		  0.);
	}
	/* insert stats posn 2..rflen */
	for(rfpos = 2; rfpos <= rflen; rfpos++) { 
	  if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.);
	  }
	  else { 
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 
		    0., /* cyan */
		    1. - ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), /* magenta, fraction correct */
		    0.,
		    0.);
	  }
	} 
	fprintf(dfp, "//\n");
      }
      
      if(ptm != NULL) free(ptm);
      if(pti != NULL) free(pti);
      if(cor_ptm != NULL) free(cor_ptm);
      if(cor_ptm != NULL) free(cor_pti);
      free(kp);
      free(tp);
      free(km_seq);
      free(ki_seq);
      free(tm_seq);
      free(ti_seq);
      free(cor_tm_seq);
      free(cor_ti_seq);
      free(km_pos);
      free(ki_pos);
      free(tm_pos);
      free(ti_pos);
      free(cor_tm_pos);
      free(cor_ti_pos);
      free(seqlen);
      esl_msa_Destroy(ka);
      esl_msa_Destroy(ta);
    }

  if(mask != NULL) free(mask);
  if(dfp != NULL) { 
    fclose(dfp);
    printf("# Draw file of per-column stats saved to file: %s\n", esl_opt_GetString(go, "--c2dfile"));
  }
	   
  if(abc) esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  esl_msafile_Close(tfp);
  esl_msafile_Close(kfp);
  return 0;

 ERROR:
  return status;
}
Exemple #21
0
/* Function:  esl_msafile_a2m_Read()
 * Synopsis:  Read a UCSC A2M format alignment.
 *
 * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
 *            for UCSC A2M (SAM) format. Create a new MSA,
 *            and return a ptr to it in <*ret_msa>. Caller is responsible
 *            for freeing this <ESL_MSA>.
 *            
 *            The <msa> has a reference line (<msa->rf[]>) that
 *            corresponds to the uppercase/lowercase columns in the
 *            alignment: consensus (uppercase) columns are marked 'X',
 *            and insert (lowercase) columns are marked '.' in the RF
 *            annotation line.
 *
 *            This input parser can deal both with "dotless" A2M, and
 *            full A2M format with dots.
 *            
 * Args:      afp     - open <ESL_MSAFILE>
 *            ret_msa - RETURN: newly parsed <ESL_MSA>
 *
 * Returns:   <eslOK> on success. <*ret_msa> is set to the newly
 *            allocated MSA, and <afp> is at EOF.
 *
 *            <eslEOF> if no (more) alignment data are found in
 *            <afp>, and <afp> is returned at EOF. 
 *
 *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
 *            <NULL>. <afp> contains information sufficient for
 *            constructing useful diagnostic output: 
 *            | <afp->errmsg>       | user-directed error message     |
 *            | <afp->linenumber>   | line # where error was detected |
 *            | <afp->line>         | offending line (not NUL-term)   |
 *            | <afp->n>            | length of offending line        |
 *            | <afp->bf->filename> | name of the file                |
 *            and <afp> is poised at the start of the following line,
 *            so (in principle) the caller could try to resume
 *            parsing.
 *            
 * Throws:    <eslEMEM> - an allocation failed.
 *            <eslESYS> - a system call such as fread() failed
 *            <eslEINCONCEIVABLE> - "impossible" corruption 
 *            On these, <*ret_msa> is returned <NULL>, and the state of
 *            <afp> is undefined.
 */
int
esl_msafile_a2m_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
{
  ESL_MSA  *msa        = NULL;
  char    **csflag     = NULL;	/* csflag[i][pos] is TRUE if aseq[i][pos] was uppercase consensus   */
  int      *nins       = NULL;	/* # of inserted residues before each consensus col [0..ncons-1]    */
  int      *this_nins  = NULL;	/* # of inserted residues before each consensus residue in this seq */
  int       nseq       = 0;
  int       ncons      = 0;
  int       idx;
  int64_t   thislen;
  int64_t   spos;
  int       this_ncons;
  int       cpos, bpos;
  char     *p, *tok;
  esl_pos_t n,  toklen;
  int       status;

  ESL_DASSERT1( (afp->format == eslMSAFILE_A2M) );

  afp->errmsg[0] = '\0';	

#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(csflag, sizeof(char *) * msa->sqalloc);
  for (idx = 0; idx < msa->sqalloc; idx++) csflag[idx] = NULL; 

  /* skip leading blank lines in file */
  while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  if      (status != eslOK) goto ERROR; /* includes normal EOF */

  /* tolerate sloppy space at start of name/desc line */
  while (n && isspace(*p)) { p++; n--; }    
  if (*p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected A2M name/desc line starting with >");    

  do {	/* for each record starting in '>': */
    p++; n--; 			/* advance past > */
    
    if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen))   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for A2M record");
    if (nseq >= msa->sqalloc) {
      int old_sqalloc = msa->sqalloc;
      if ( (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
      ESL_REALLOC(csflag, sizeof(char *) * msa->sqalloc);
      for (idx = old_sqalloc; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
    }

    if (     (status = esl_msa_SetSeqName       (msa, nseq, tok, toklen)) != eslOK) goto ERROR;
    if (n && (status = esl_msa_SetSeqDescription(msa, nseq, p,   n))      != eslOK) goto ERROR;

    /* now for each sequence line... */
    thislen = 0;		/* count of lowercase, uppercase, and '-': w/o dots, on first pass */
    this_ncons = 0;		/* count of uppercase + '-': number of consensus columns in alignment: must match for all seqs */
    if (nseq) {
      for (cpos = 0; cpos <= ncons; cpos++) // A little tricksy. <this_nins> is allocated on first seq, when nseq=0. 
	this_nins[cpos] = 0;                // cppcheck gets confused and erroneously calls "possible null pointer deference"; ignore it.
    }

    while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
      {				
	while (n && isspace(*p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
	if (n  == 0)   continue;	       /* tolerate and skip blank lines */
	if (*p == '>') break;

	ESL_REALLOC(csflag[nseq], sizeof(char) * (thislen + n + 1)); /* might be an overalloc by a bit, depending on whitespace on line */
	if (nseq == 0) {
	  ESL_REALLOC(this_nins, sizeof(int) * (this_ncons + n + 1));
	  for (cpos = this_ncons; cpos <= this_ncons+n; cpos++)
	    this_nins[cpos] = 0;
	}

	for (spos = thislen, bpos = 0; bpos < n; bpos++)
	  {
	    if      (p[bpos] == 'O')   continue;
	    else if (isupper(p[bpos])) { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
	    else if (islower(p[bpos])) { csflag[nseq][spos++] = FALSE; this_nins[this_ncons]++; }
	    else if (p[bpos] == '-')   { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
	    if (ncons && this_ncons > ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg,  "unexpected # of consensus residues, didn't match previous seq(s)");
	  }
	csflag[nseq][spos] = TRUE; /* need a sentinel, because of the way the padding functions work */

#ifdef eslAUGMENT_ALPHABET
	if (msa->abc)   { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[nseq]),   &thislen, p, n); } 
#endif
	if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[nseq]), &thislen, p, n); }
	if (status == eslEINVAL)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
	else if (status != eslOK)  goto ERROR;
	ESL_DASSERT1( (spos == thislen) );
      }	
    if (status != eslOK && status != eslEOF) goto ERROR; /* exception thrown by esl_msafile_GetLine() */
    /* status == OK: then *p == '>'. status == eslEOF: we're eof.  status == anything else: error */
    /* Finished reading a sequence record. */
    
    if (nseq == 0) 
      {
	ncons = this_ncons;
	ESL_ALLOC(nins, sizeof(int) * (ncons+1));
	for (cpos = 0; cpos <= ncons; cpos++)
	  nins[cpos] = this_nins[cpos];
      } 
    else 
      {
	if (this_ncons != ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s)");
	for (cpos = 0; cpos <= ncons; cpos++) 
	  nins[cpos]      = ESL_MAX(nins[cpos], this_nins[cpos]);
      }
    nseq++;
  } while (status == eslOK);
  
  /* Now we have nseq *unaligned* sequences in ax/aseq[0..nseq-1]; call the length slen, though we don't explicitly store it
   * csflag[idx][spos] tells us whether each unaligned residue is an insertion or consensus, for spos==0..slen-1.
   * nins[0..ncons] tells us the max number of inserted residues before each consensus column
   * This is sufficient information to reconstruct each aligned sequence.
   */
  msa->nseq = nseq;
#ifdef eslAUGMENT_ALPHABET
  if (msa->abc)  { if ((status = a2m_padding_digital(msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
#endif
  if (!msa->abc) { if ((status = a2m_padding_text   (msa, csflag, nins, ncons)) != eslOK) goto ERROR; }

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  *ret_msa  = msa;
  free(nins);
  free(this_nins);
  for (idx = 0; idx < msa->nseq; idx++) free(csflag[idx]);
  free(csflag);
  return eslOK;
  
 ERROR:
  if (nins)      free(nins);
  if (this_nins) free(this_nins);
  if (csflag) {
    for (idx = 0; idx < msa->nseq; idx++) 
      if (csflag[idx]) free(csflag[idx]);
    free(csflag);
  }
  if (msa) esl_msa_Destroy(msa);
  return status;
}
Exemple #22
0
int 
main(int argc, char **argv)
{
  ESL_STOPWATCH *w;
  ESL_GETOPTS   *go;
  char          *msafile;
  ESLX_MSAFILE  *afp;
  ESL_MSA       *msa;
  int            do_gsc;
  int            do_pb;
  int            do_blosum;
  int            maxN;
  double         maxid;
  double         cpu;
  int            status;

  /* Process command line
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") == TRUE) {
    puts(usage); 
    puts("\n  where options are:");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
    return 0;
  }
  do_blosum = esl_opt_GetBoolean(go, "--blosum");
  do_gsc    = esl_opt_GetBoolean(go, "--gsc");
  do_pb     = esl_opt_GetBoolean(go, "--pb");
  maxid     = esl_opt_GetReal   (go, "--id");
  maxN      = esl_opt_GetInteger(go, "--maxN");
  if (esl_opt_ArgNumber(go) != 1) {
    puts("Incorrect number of command line arguments.");
    puts(usage);
    return 1;
  }
  if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("failed to parse cmd line: %s", go->errbuf);
  esl_getopts_Destroy(go);

  w = esl_stopwatch_Create();

  /* Weight one or more alignments from input file
   */
  if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
    eslx_msafile_OpenFailure(afp, status);

  while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) 
    {
      if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
      if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; }

      esl_stopwatch_Start(w);

      if      (do_gsc) 	  esl_msaweight_GSC(msa);
      else if (do_pb) 	  esl_msaweight_PB(msa);
      else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid);

      esl_stopwatch_Stop(w);
      cpu = w->user;
      printf("%-20s %6d  %6d  %.3f\n", msa->name, msa->alen, msa->nseq, cpu);
      esl_msa_Destroy(msa);
    } 
  eslx_msafile_Close(afp);

  esl_stopwatch_Destroy(w);
  return eslOK;
}
/* Step 2. Extract the training set and test set.
 */
static int
separate_sets(struct cfg_s *cfg, ESL_MSA *msa, ESL_MSA **ret_trainmsa, ESL_STACK **ret_teststack)
{      
  ESL_MSA   *trainmsa  = NULL;
  ESL_MSA   *test_msa  = NULL;
  ESL_STACK *teststack = NULL;
  ESL_SQ    *sq        = NULL;
  int *assignment = NULL;
  int *nin        = NULL;
  int *useme      = NULL;
  int  nc         = 0;
  int  c;
  int  ctrain;			/* index of the cluster that becomes the training alignment */
  int  ntrain;			/* number of seqs in the training alignment */
  int  nskip;
  int  i;
  int  status;

  if ((teststack = esl_stack_PCreate()) == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);

  if ((status = esl_msacluster_SingleLinkage(msa, cfg->idthresh1, &assignment, &nin, &nc)) != eslOK) goto ERROR;
  ctrain = esl_vec_IArgMax(nin, nc);
  ntrain = esl_vec_IMax(nin, nc);

  for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] == ctrain) ? 1 : 0;
  if ((status = esl_msa_SequenceSubset(msa, useme, &trainmsa)) != eslOK) goto ERROR;

  /* If all the seqs went into the training msa, none are left for testing; we're done here */
  if (trainmsa->nseq == msa->nseq) {
    free(useme);
    free(assignment);
    free(nin);
    *ret_trainmsa  = trainmsa;
    *ret_teststack = teststack;
    return eslOK;
  }

  /* Put all the other sequences into an MSA of their own; from these, we'll
   * choose test sequences.
   */
  for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] != ctrain) ? 1 : 0;
  if ((status = esl_msa_SequenceSubset(msa, useme, &test_msa))                             != eslOK) goto ERROR;

  /* Cluster those test sequences. */
  free(nin);         nin        = NULL;
  free(assignment);  assignment = NULL;
  if ((status = esl_msacluster_SingleLinkage(test_msa, cfg->idthresh2, &assignment, &nin, &nc)) != eslOK) goto ERROR;
  for (c = 0; c < nc; c++)
    {
      nskip = esl_rnd_Roll(cfg->r, nin[c]); /* pick a random seq in this cluster to be the test. */
      for (i=0; i < test_msa->nseq; i++)
	if (assignment[i] == c) {
	  if (nskip == 0) {
	    esl_sq_FetchFromMSA(test_msa, i, &sq);
	    esl_stack_PPush(teststack, (void *) sq);
	    break;
	  } else nskip--;
	}
    }

  esl_msa_Destroy(test_msa);
  free(useme);
  free(nin);
  free(assignment);

  *ret_trainmsa  = trainmsa;
  *ret_teststack = teststack;
  return eslOK;

 ERROR:
  if (useme      != NULL) free(useme);
  if (assignment != NULL) free(assignment);
  if (nin        != NULL) free(nin);
  esl_msa_Destroy(trainmsa); 
  esl_msa_Destroy(test_msa); 
  while (esl_stack_PPop(teststack, (void **) &sq) == eslOK) esl_sq_Destroy(sq);
  esl_stack_Destroy(teststack);
  *ret_trainmsa  = NULL;
  *ret_teststack = NULL;
  return status;
}
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* command line configuration      */
  struct cfg_s  cfg;     	/* application configuration       */
  char         *basename= NULL;	/* base of the output file names   */
  char         *alifile = NULL;	/* alignment file name             */
  char         *dbfile  = NULL;	/* name of seq db file             */
  char          outfile[256];	/* name of an output file          */
  int           alifmt;		/* format code for alifile         */
  int           dbfmt;		/* format code for dbfile          */
  ESL_MSAFILE  *afp     = NULL;	/* open alignment file             */
  ESL_MSA      *origmsa = NULL;	/* one multiple sequence alignment */
  ESL_MSA      *msa     = NULL;	/* MSA after frags are removed     */
  ESL_MSA      *trainmsa= NULL;	/* training set, aligned           */
  ESL_STACK    *teststack=NULL; /* test set: stack of ESL_SQ ptrs  */
  int           status;		/* easel return code               */
  int           nfrags;		/* # of fragments removed          */
  int           ntestdom;       /* # of test domains               */
  int           ntest;		/* # of test sequences created     */
  int           nali;		/* number of alignments read       */
  double        avgid;
  
  
  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration:   %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h"))                    cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go)                  != 3)     cmdline_failure(argv[0], "Incorrect number of command line arguments\n");
  basename = esl_opt_GetArg(go, 1); 
  alifile  = esl_opt_GetArg(go, 2);
  dbfile   = esl_opt_GetArg(go, 3);
  alifmt   = eslMSAFILE_STOCKHOLM;
  dbfmt    = eslSQFILE_FASTA;

  /* Set up the configuration structure shared amongst functions here */
  if (esl_opt_IsDefault(go, "--seed"))   cfg.r = esl_randomness_CreateTimeseeded();
  else                                   cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));
  cfg.abc       = NULL;		          /* until we open the MSA file, below */
  cfg.fragfrac  = esl_opt_GetReal(go, "-F");
  cfg.idthresh1 = esl_opt_GetReal(go, "-1");
  cfg.idthresh2 = esl_opt_GetReal(go, "-2");
  cfg.test_lens = NULL;
  cfg.ntest     = 0;

  /* Open the output files */ 
  if (snprintf(outfile, 256, "%s.msa", basename) >= 256)  esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.fa",  basename) >= 256)  esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.pos", basename) >= 256)  esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.neg", basename) >= 256)  esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.tbl", basename) >= 256)  esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp     = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile);

  /* Open the MSA file; determine alphabet */
  status = esl_msafile_Open(alifile, alifmt, NULL, &afp);
  if      (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile);
  else if (status == eslEFORMAT)   esl_fatal("Couldn't determine format of alignment %s\n", alifile);
  else if (status != eslOK)        esl_fatal("Alignment file open failed with error %d\n", status);

  if      (esl_opt_GetBoolean(go, "--amino"))   cfg.abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     cfg.abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     cfg.abc = esl_alphabet_Create(eslRNA);
  else {
    int type;
    status = esl_msafile_GuessAlphabet(afp, &type);
    if (status == eslEAMBIGUOUS)    esl_fatal("Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", alifile);
    else if (status == eslEFORMAT)  esl_fatal("Alignment file parse failed: %s\n", afp->errbuf);
    else if (status == eslENODATA)  esl_fatal("Alignment file %s is empty\n", alifile);
    else if (status != eslOK)       esl_fatal("Failed to read alignment file %s\n", alifile);
    cfg.abc = esl_alphabet_Create(type);
  }
  esl_msafile_SetDigital(afp, cfg.abc);

  if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq);
  else                           esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);

  /* Open and process the dbfile; make sure it's in the same alphabet */
  process_dbfile(&cfg, dbfile, dbfmt);

  /* Read and process MSAs one at a time  */
  nali = 0;
  while ((status = esl_msa_Read(afp, &origmsa)) == eslOK)
    {
      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets   (&cfg, msa, &trainmsa, &teststack);
      ntestdom = esl_stack_ObjectCount(teststack);

      if (ntestdom >= 2) 
	{
	  esl_stack_Shuffle(cfg.r, teststack);
	  synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

	  esl_msa_MinimGaps(trainmsa, NULL, NULL);
	  esl_msa_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

	  esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */
	  fprintf(cfg.tblfp, "%-20s  %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
	  nali++;
	}

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }
  if      (status == eslEFORMAT)  esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", 
					    afp->linenumber, afp->fname, afp->errbuf, afp->buf);	
  else if (status != eslEOF)      esl_fatal("Alignment file read failed with error code %d\n", status);
  else if (nali   == 0)           esl_fatal("No alignments found in file %s\n", alifile);

  if (nali > 0)
    synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  esl_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}
Exemple #25
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  char         *msafile = esl_opt_GetArg(go, 1);
  int           infmt   = eslMSAFILE_UNKNOWN;
  int           outfmt  = eslMSAFILE_UNKNOWN;
  ESL_ALPHABET *abc     = NULL;
  ESL_MSAFILE  *afp     = NULL;
  ESL_MSA      *msa     = NULL;
  int           nali    = 0;
  int           status;

  /* Alphabet specification from cmdline? */
  if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
  else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);

  /* MSA input file format from cmdline? */
  if (esl_opt_IsOn(go, "--informat") &&
      (infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
    esl_fatal("Your --informat, %s, is not a recognized multiple alignment file format",
	      esl_opt_GetString(go, "--informat"));

  /* Open in digital mode. Autoguess alphabet, format if we haven't set them already. */
  if (( status = esl_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK)
    esl_msafile_OpenFailure(afp, status);
  
  /* Reverse complementation only makes sense for alphabets that have abc->complement set */
  if (! abc->complement)
    esl_fatal("File %s appears to use the %s alphabet.\nThat alphabet cannot be reverse complemented.\n",
	      msafile, esl_abc_DecodeType(abc->type));

  /* Set the output format, if requested. 
   * By default, write in the same format we read in. 
   * Remember, it's afp->format that gets set; infmt was only a hint.
   */
  if ( esl_opt_IsOn(go, "--outformat") )
    {
      outfmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"));
      if (outfmt == eslMSAFILE_UNKNOWN)
	esl_fatal("Your --outformat, %s, is not a recognized multiple alignment file format.\n",
		  esl_opt_GetString(go, "--outformat"));
    }
  else outfmt = afp->format;

  /* Here we go. */
  while ((status = esl_msafile_Read(afp, &msa)) == eslOK)
    {	
      nali++;

      status = esl_msa_ReverseComplement(msa);

      esl_msafile_Write(stdout, msa, outfmt);

      esl_msa_Destroy(msa);
    }
   if (nali   == 0)      esl_fatal("No alignments found in input file %s\n", msafile);
   if (status != eslEOF) esl_msafile_ReadFailure(afp, status);

   esl_msafile_Close(afp);
   esl_alphabet_Destroy(abc);
   esl_getopts_Destroy(go);
   return 0;
}
Exemple #26
0
int
main(int argc, char **argv)
{
  ESL_ALPHABET *aa_abc = NULL,
               *nt_abc = NULL;
  ESL_MSA      *msa1   = NULL,
               *msa2   = NULL, 
               *msa3   = NULL,
               *msa4   = NULL,
               *msa5   = NULL;
  double uniform[5] = { 1.0, 1.0, 1.0, 1.0, 1.0 };
  double wgt2[5]    = { 0.833333, 0.833333, 0.833333, 0.833333, 1.66667 }; /* GSC, PB give same answer */
  double gsc3[4]    = { 1.125000, 0.875000, 0.875000, 1.125000 };
  double pb3[4]     = { 1.066667, 1.066667, 0.800000, 1.066667 };
  double blosum3[4] = { 1.333333, 0.666667, 0.666667, 1.333333 };
  double gsc4[4]    = { 0.760870, 0.760870, 1.086957, 1.391304 };
  double pb4[4]     = { 0.800000, 0.800000, 1.000000, 1.400000 };
  double blosum4[4] = { 0.666667, 0.666667, 1.333333, 1.333333 };
  
  if ((aa_abc = esl_alphabet_Create(eslAMINO)) == NULL)  esl_fatal("failed to create amino alphabet");
  if ((nt_abc = esl_alphabet_Create(eslDNA))   == NULL)  esl_fatal("failed to create DNA alphabet");

  /* msa1: all sequences identical. Any weighting method should assign uniform weights.
   * msa2: "contrived" example of [Henikoff94b]. "Correct" solution is 1==2, 3==4, and 5==2x other weights.
   * msa3: the "nitrogenase segments" example of [Henikoff94b].
   * msa4: alignment that makes the same distances as Figure 4 from [Gerstein94]
   * msa5: gap pathology. no information here, so weighting methods should resort to uniform weights.
   */
  if ((msa1 = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nseq1 AAAAA\nseq2 AAAAA\nseq3 AAAAA\nseq4 AAAAA\nseq5 AAAAA\n//\n", 
				       eslMSAFILE_STOCKHOLM)) == NULL) esl_fatal("msa 1 creation failed");
  if ((msa2 = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nseq1 AAAAA\nseq2 AAAAA\nseq3 CCCCC\nseq4 CCCCC\nseq5 TTTTT\n//\n",
				       eslMSAFILE_STOCKHOLM)) == NULL) esl_fatal("msa 2 creation failed");
  if ((msa3 = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nNIFE_CLOPA GYVGS\nNIFD_AZOVI GFDGF\nNIFD_BRAJA GYDGF\nNIFK_ANASP GYQGG\n//\n",
				       eslMSAFILE_STOCKHOLM)) == NULL) esl_fatal("msa 3 creation failed");
  if ((msa4 = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nA  AAAAAAAAAA\nB  TTAAAAAAAA\nC  ATAAAACCCC\nD  GGGAAGGGGG\n//\n",
				       eslMSAFILE_STOCKHOLM)) == NULL) esl_fatal("msa 4 creation failed");
  if ((msa5 = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nA  A----\nB  -C---\nC  --G--\nD  ---T-\nE  ----T\n//\n",
				       eslMSAFILE_STOCKHOLM)) == NULL) esl_fatal("msa 5 creation failed");

  utest_GSC(aa_abc, msa1, uniform);
  utest_GSC(nt_abc, msa1, uniform);
  utest_GSC(aa_abc, msa2, wgt2);
  utest_GSC(nt_abc, msa2, wgt2);
  utest_GSC(aa_abc, msa3, gsc3);
  /* no nt test on msa3: it's protein-only */
  utest_GSC(aa_abc, msa4, gsc4);
  utest_GSC(nt_abc, msa4, gsc4);
  utest_GSC(aa_abc, msa5, uniform);
  utest_GSC(aa_abc, msa5, uniform);

  utest_PB(aa_abc, msa1, uniform);
  utest_PB(nt_abc, msa1, uniform);
  utest_PB(aa_abc, msa2, wgt2);
  utest_PB(nt_abc, msa2, wgt2);
  utest_PB(aa_abc, msa3, pb3);
  /* no nt test on msa3: it's protein-only */
  utest_PB(aa_abc, msa4, pb4);
  utest_PB(nt_abc, msa4, pb4);
  utest_PB(aa_abc, msa5, uniform);
  utest_PB(nt_abc, msa5, uniform);

  utest_BLOSUM(aa_abc, msa1, 0.62, uniform);
  utest_BLOSUM(nt_abc, msa1, 0.62, uniform);
  utest_BLOSUM(aa_abc, msa2, 0.62, wgt2);
  utest_BLOSUM(nt_abc, msa2, 0.62, wgt2);
  utest_BLOSUM(aa_abc, msa3, 0.62, blosum3);
  /* no nt test on msa3: it's protein-only */
  utest_BLOSUM(aa_abc, msa4, 0.62, blosum4);
  utest_BLOSUM(nt_abc, msa4, 0.62, blosum4);
  utest_BLOSUM(aa_abc, msa5, 0.62, uniform);
  utest_BLOSUM(nt_abc, msa5, 0.62, uniform);

  /* BLOSUM weights have the peculiar property of going flat at maxid=0.0 (everyone
   * clusters) or maxid=1.0 (nobody clusters).
   */
  utest_BLOSUM(aa_abc, msa4, 0.0,  uniform);
  utest_BLOSUM(aa_abc, msa4, 1.0,  uniform);

  esl_msa_Destroy(msa1);
  esl_msa_Destroy(msa2);
  esl_msa_Destroy(msa3);
  esl_msa_Destroy(msa4);
  esl_msa_Destroy(msa5);
  esl_alphabet_Destroy(aa_abc);
  esl_alphabet_Destroy(nt_abc);
  exit(0);
}
Exemple #27
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS  *go;
  char         *msafile;
  ESLX_MSAFILE *afp;
  ESL_MSA      *msa;
  int           do_gsc;
  int           do_pb;
  int           do_blosum;
  int           maxN;
  double        maxid;
  int           nsmall, nbig;
  int           i;
  int           status;

  /* Process command line  */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("%s", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("%s", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") == TRUE){
    puts(usage); 
    puts("\n  where options are:");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
    return 0;
  }
  do_blosum = esl_opt_GetBoolean(go, "--blosum");
  do_gsc    = esl_opt_GetBoolean(go, "--gsc");
  do_pb     = esl_opt_GetBoolean(go, "--pb");
  maxid     = esl_opt_GetReal   (go, "--id");
  maxN      = esl_opt_GetInteger(go, "--maxN");
  if (esl_opt_ArgNumber(go) != 1) {
    puts("Incorrect number of command line arguments.");
    puts(usage);
    return 1;
  }
  if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("%s", go->errbuf);
  esl_getopts_Destroy(go);

  /* Weight one or more alignments from input file
   */
  if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
    eslx_msafile_OpenFailure(afp, status);

  while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF)
    {
      if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
      if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; }

      if      (do_gsc) 	  esl_msaweight_GSC(msa);
      else if (do_pb) 	  esl_msaweight_PB(msa);
      else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid);

      for (nsmall = 0, nbig = 0, i = 0; i < msa->nseq; i++) {
	if (msa->wgt[i] < 0.2) nsmall++;
	if (msa->wgt[i] > 5.0) nbig++;
      }

      printf("%-20s  %5d %5d %8.4f  %8.4f  %5d  %5d\n", 
	     msa->name, 
	     msa->nseq, 
	     msa->alen,
	     esl_vec_DMin(msa->wgt, msa->nseq),
	     esl_vec_DMax(msa->wgt, msa->nseq),
	     nsmall,
	     nbig);
      esl_msa_Destroy(msa);
    } 
  eslx_msafile_Close(afp);
  return eslOK;
}
static int
profillic_esl_msafile_profile_Read(ESLX_MSAFILE *afp, ESL_MSA **ret_msa, ProfileType * profile_ptr )
{
  /// \note Right now this isn't actually using the open file pointer; for convenience I just use the profile.fromFile( <filename> ) method.
  /// \todo Use convenience fns in esl_buffer.h; see eg hmmer-3.1/easel/esl_msafile_stockholm.c for examples...
  ESL_MSA                 *msa      = NULL;
  string profile_string;
  char *buf;
  long len;
  int                      seqidx;
  int                      status;
  char       errmsg2[eslERRBUFSIZE];

  ESL_DASSERT1((afp->format == eslMSAFILE_PROFILLIC));

  const char * const seqname = "Galosh Profile Consensus";
  const char * const msaname = "Galosh Profile";
  uint32_t profile_length;
  galosh::Sequence<typename ProfileType::ProfileResidueType> consensus_sequence;
  stringstream tmp_consensus_output_stream;

  uint32_t pos_i;

  if (profile_ptr == NULL)  { ESL_EXCEPTION(eslEINCONCEIVABLE, "profile_ptr is NULL in profillic_esl_msafile_profile_Read(..)!"); }
  //if (feof(afp->bf->fp))  { status = eslEOF; goto ERROR; }
  afp->errmsg[0] = '\0';

  // Read in the galosh profile (from profillic)
  //fseek( afp->bf->fp, 0, SEEK_END ); // go to the end
  //len = afp->bf->ftell( afp->bf->fp ); // get the position at the end (length)
  //fseek( afp->bf->fp, 0, SEEK_SET ); // go to the beginning again.

  //ESL_ALLOC_CPP( char, buf, sizeof( char ) * len ); //malloc buffer
  //fread( buf, len, 1, afp->bf->fp ); //read into buffer

  //profile_string = buf;
  //profile_ptr->fromString( profile_string );
  profile_ptr->fromFile( afp->bf->filename );
  //if (buf)      free(buf);
  // \todo WHY WON'T THIS WORK?  See HACKs in profillic-hmmbuild.cpp to work around it.
  //fseek( afp->bf->fp, 0, SEEK_END ); // go to the end (to signal there's no more profiles in the file, the next time we come to this function)

  // Calculate the consensus sequence.
  profile_length = profile_ptr->length();
  consensus_sequence.reinitialize( profile_length );
  for( pos_i = 0; pos_i < profile_length; pos_i++ ) {
    consensus_sequence[ pos_i ] =
      ( *profile_ptr )[ pos_i ][ galosh::Emission::Match ].maximumValueType();
  }
  tmp_consensus_output_stream << consensus_sequence;

  /* Allocate a growable MSA, and auxiliary parse data coupled to the MSA allocation */
#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }


  // Set first-and-only seq to the consensus.  This should set sqlen[0] to the profile's length and set ax to have length 1 and ax[0] to be the sequence itself.  Also msa->sqname[0] to the "name" of that consensus sequence.

  /* if nec, make room for the new seq */
  if (msa->nseq >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) return status; 
  seqidx = msa->nseq; // 0
  msa->nseq++; // = 1
  status = esl_strdup(seqname, -1, &(msa->sqname[seqidx]));
  // NOTE: Could add description of this "sequence" here, using esl_msa_SetSeqDescription(msa, seqidx, desc).
#ifdef eslAUGMENT_ALPHABET
  if (msa->flags & eslMSA_DIGITAL)
    {
      // NOTE (profillic): There was a bug in this; it had said .."esl_abc_dsqcat(msa->abc, " where it should have said .."esl_abc_dsqcat(msa->abc->inmap, "
      if((status = esl_abc_dsqcat(msa->abc->inmap, &(msa->ax[seqidx]), &(msa->sqlen[seqidx]), tmp_consensus_output_stream.str().c_str(), profile_length)) != eslOK) {
        /* invalid char(s), get informative error message */
        if (esl_abc_ValidateSeq(msa->abc, tmp_consensus_output_stream.str().c_str(), profile_length, afp->errmsg) != eslOK) 
          ESL_XFAIL(eslEFORMAT, errmsg2, "%s (line %d): %s", msa->sqname[0], afp->linenumber, afp->errmsg);
      }
    }
#endif
  if (! (msa->flags & eslMSA_DIGITAL))
    {
      status = esl_strcat(&(msa->aseq[seqidx]), 0, tmp_consensus_output_stream.str().c_str(), profile_length);
      msa->sqlen[seqidx] = profile_length;
    } 
  msa->alen = profile_length;

  /// \todo OR read in a fasta file of sequences too.
  /// \todo (Optional?) Set msa->name to the name of the profile (file?)
  esl_strdup(msaname, -1, &(msa->name));
  /// \todo make sure eslMSA_HASWGTS is FALSE .. OR set it to TRUE and set msa->wgt[idx] to 1.0.
  /// \note Could have secondary structure (per sequence) too. msa->ss[0]. msa->sslen[0] should be the same as msa->sqlen[0].
  /// \todo Investigate what msa->sa and msa->pp are for.

  /* Give the newly parsed MSA a good
   * going-over, and finalize the fields of the MSA data structure.
   * verify_parse will fill in errmsg if it sees a problem.
   */
  //if (verify_parse(msa, afp->errmsg) != eslOK) { status = eslEFORMAT; goto ERROR; } 

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  if (ret_msa != NULL) *ret_msa = msa; else esl_msa_Destroy(msa);
  return eslOK;

 ERROR:
  if (msa != NULL)      esl_msa_Destroy(msa);
  if (ret_msa != NULL) *ret_msa = NULL;
  return status;
}
Exemple #29
0
/* Function:  p7_tophits_Alignment()
 * Synopsis:  Create a multiple alignment of all the included domains.
 *
 * Purpose:   Create a multiple alignment of all domains marked
 *            "includable" in the top hits list <th>, and return it in
 *            <*ret_msa>.
 *            
 *            Use of <optflags> is identical to <optflags> in <p7_tracealign_Seqs()>.
 *            Possible flags include <p7_DIGITIZE>, <p7_ALL_CONSENSUS_COLS>,
 *            and <p7_TRIM>; they may be OR'ed together. Otherwise, pass
 *            <p7_DEFAULT> to set no flags.
 *
 *            Caller may optionally provide <inc_sqarr>, <inc_trarr>, and
 *            <inc_n> to include additional sequences in the alignment
 *            (the jackhmmer query, for example). Otherwise, pass <NULL, NULL, 0>.
 *
 * Returns:   <eslOK> on success, and <*ret_msa> points to a new MSA that
 *            the caller is responsible for freeing.
 *
 *            Returns <eslFAIL> if there are no reported domains that
 *            satisfy reporting thresholds, in which case <*ret_msa>
 *            is <NULL>.
 *
 * Throws:    <eslEMEM> on allocation failure; <eslECORRUPT> on 
 *            unexpected internal data corruption.
 *
 * Xref:      J4/29: incept.
 *            J4/76: added inc_sqarr, inc_trarr, inc_n, optflags 
 */
int
p7_tophits_Alignment(const P7_TOPHITS *th, const ESL_ALPHABET *abc, 
         ESL_SQ **inc_sqarr, P7_TRACE **inc_trarr, int inc_n,
         int optflags, ESL_MSA **ret_msa)
{
  ESL_SQ   **sqarr = NULL;
  P7_TRACE **trarr = NULL;
  ESL_MSA   *msa   = NULL;
  int        ndom  = 0;
  int        h, d, y;
  int        M;
  int        status;

  /* How many domains will be included in the new alignment? 
   * We also set model size M here; every alignment has a copy.
   */
  for (h = 0; h < th->N; h++)
    if (th->hit[h]->flags & p7_IS_INCLUDED)
    {
        for (d = 0; d < th->hit[h]->ndom; d++)
          if (th->hit[h]->dcl[d].is_included)
            ndom++;
    }

  if (inc_n+ndom == 0) { status = eslFAIL; goto ERROR; }

  if (inc_n)     M = inc_trarr[0]->M;          
  else           M = th->hit[0]->dcl[0].ad->M;
  
  /* Allocation */
  ESL_ALLOC(sqarr, sizeof(ESL_SQ *)   * (ndom + inc_n));
  ESL_ALLOC(trarr, sizeof(P7_TRACE *) * (ndom + inc_n));
  /* Inclusion of preexisting seqs, traces: make copy of pointers */
  for (y = 0; y < inc_n;        y++) { sqarr[y] = inc_sqarr[y];  trarr[y] = inc_trarr[y]; }
  for (;      y < (ndom+inc_n); y++) { sqarr[y] = NULL;          trarr[y] = NULL; }

  /* Make faux sequences, traces from hit list */
  y = inc_n;
  for (h = 0; h < th->N; h++)
    if (th->hit[h]->flags & p7_IS_INCLUDED)
    {
        for (d = 0; d < th->hit[h]->ndom; d++)
          if (th->hit[h]->dcl[d].is_included)
          {
              if ((status = p7_alidisplay_Backconvert(th->hit[h]->dcl[d].ad, abc, &(sqarr[y]), &(trarr[y]))) != eslOK) goto ERROR;
              y++;
          }
    }
  
  /* Make the multiple alignment */
  if ((status = p7_tracealign_Seqs(sqarr, trarr, inc_n+ndom, M, optflags, NULL, &msa)) != eslOK) goto ERROR;

  /* Clean up */
  for (y = inc_n; y < ndom+inc_n; y++) esl_sq_Destroy(sqarr[y]);
  for (y = inc_n; y < ndom+inc_n; y++) p7_trace_Destroy(trarr[y]);
  free(sqarr);
  free(trarr);
  *ret_msa = msa;
  return eslOK;
  
 ERROR:
  if (sqarr != NULL) { for (y = inc_n; y < ndom+inc_n; y++) if (sqarr[y] != NULL) esl_sq_Destroy(sqarr[y]);   free(sqarr); }
  if (trarr != NULL) { for (y = inc_n; y < ndom+inc_n; y++) if (trarr[y] != NULL) p7_trace_Destroy(trarr[y]); free(trarr); }
  if (msa   != NULL) esl_msa_Destroy(msa);
  *ret_msa = NULL;
  return status;
}
Exemple #30
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS  *go;
  char         *msafile;
  ESLX_MSAFILE *afp;
  ESL_MSA      *msa;
  float        *sqd;
  int          status;
  int          nbad;
  int          nali    = 0;
  int          nbadali = 0;
  int          nwgt    = 0;
  int          nbadwgt = 0;
  int i;
  int be_quiet;
  int do_gsc;
  int do_pb;
  int do_blosum;
  double maxid;
  double tol;
  int    maxN;

  /* Process command line
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") == TRUE) {
    puts(usage); 
    puts("\n  where options are:");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
    return 0;
  }
  be_quiet  = esl_opt_GetBoolean(go, "-q");
  do_blosum = esl_opt_GetBoolean(go, "--blosum");
  do_gsc    = esl_opt_GetBoolean(go, "--gsc");
  do_pb     = esl_opt_GetBoolean(go, "--pb");
  maxid     = esl_opt_GetReal   (go, "--id");
  tol       = esl_opt_GetReal   (go, "--tol");
  maxN      = esl_opt_GetInteger(go, "--maxN");
  if (esl_opt_ArgNumber(go) != 1) {
    puts("Incorrect number of command line arguments.");
    puts(usage);
    return 1;
  }
  msafile = esl_opt_GetArg(go, 1);
  esl_getopts_Destroy(go);

  /* Weight one or more alignments from input file
   */
  if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
    eslx_msafile_OpenFailure(afp, status);

  while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF)
    {
      if (status != eslOK) eslx_msafile_ReadFailure(afp, status);

      if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; }

      nali++;
      nwgt += msa->nseq;
      ESL_ALLOC(sqd, sizeof(float) * msa->nseq);

      if (do_gsc) {
	esl_msaweight_GSC(msa);
	GSCWeights(msa->aseq, msa->nseq, msa->alen, sqd);
      } else if (do_pb) {
	esl_msaweight_PB(msa);
	PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, sqd);
      } else if (do_blosum) {
	esl_msaweight_BLOSUM(msa, maxid);
	BlosumWeights(msa->aseq, msa->nseq, msa->alen, maxid, sqd);
	/* workaround SQUID bug: BLOSUM weights weren't renormalized to sum to nseq. */
	esl_vec_FNorm (sqd, msa->nseq);
	esl_vec_FScale(sqd, msa->nseq, (float) msa->nseq);	
      }

      if (! be_quiet) {
	for (i = 0; i < msa->nseq; i++)
	  fprintf(stdout, "%-20s  %.3f  %.3f\n",
		  msa->sqname[i], msa->wgt[i], sqd[i]);
      }
	
      nbad = 0;
      for (i = 0; i < msa->nseq; i++)
	if (esl_DCompare((double) sqd[i], msa->wgt[i], tol) != eslOK) 
	  nbad++;
      if (nbad > 0) nbadali++;
      nbadwgt += nbad;

      if (nbad > 0) printf("%-20s  :: alignment shows %d weights that differ (out of %d) \n", 
			   msa->name, nbad, msa->nseq);

      esl_msa_Destroy(msa);
      free(sqd);
    } 
  eslx_msafile_Close(afp);

  if (nbadali == 0) 
    printf("OK: all weights identical between squid and Easel in %d alignment(s)\n", nali);
  else {
    printf("%d of %d weights mismatched at (> %f fractional difference)\n",
	   nbadwgt, nwgt, tol);
    printf("involving %d of %d total alignments\n", nbadali, nali);
  }
  return eslOK;

 ERROR:
  return status;
}