Пример #1
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go  = NULL;
  ESL_RANDOMNESS *r   = NULL;
  int             be_verbose;

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("%s", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") == TRUE) {
    esl_usage(stdout, argv[0], usage);
    puts("\n  where options are:");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
    exit(0);
  }
  if (esl_opt_ArgNumber(go) != 0) {
    printf("Incorrect number of command line arguments.\n");
    esl_usage(stdout, argv[0], usage);
    exit(1);
  }
  be_verbose = esl_opt_GetBoolean(go, "-v");

  if (esl_opt_GetBoolean(go, "-r")) {
    r = esl_randomness_CreateTimeseeded();
    if (be_verbose) printf("seed = %ld\n", esl_randomness_GetSeed(r));
  } else r = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));

  utest_LinearRegression(r, TRUE,  be_verbose);
  utest_LinearRegression(r, FALSE, be_verbose);
  
  esl_getopts_Destroy(go);
  esl_randomness_Destroy(r);
  exit(0);
}
/* Function:  esl_dst_XAverageId()
 * Synopsis:  Calculate avg identity for digital MSA 
 * Incept:    SRE, Fri May 18 15:19:14 2007 [Janelia]
 *
 * Purpose:   Calculates the average pairwise fractional identity in
 *            a digital multiple sequence alignment <ax>, consisting of <N>
 *            aligned digital sequences of identical length.
 *            
 *            If an exhaustive calculation would require more than
 *            <max_comparisons> pairwise comparisons, then instead of
 *            looking at all pairs, calculate the average over a
 *            stochastic sample of <max_comparisons> random pairs.
 *            This allows the routine to work efficiently even on very
 *            deep MSAs.
 *            
 *            Each fractional pairwise identity (range $[0..$ pid $..1]$
 *            is calculated using <esl_dsq_XPairId()>.
 *
 * Returns:   <eslOK> on success, and <*ret_id> contains the average
 *            fractional identity.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if any of the aligned sequence pairs aren't 
 *            of the same length.
 *            In either case, <*ret_id> is set to 0.
 */
int
esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_id)
{
  int    status;
  double id;
  double sum;
  int    i,j,n;
  
  if (N <= 1) { *ret_id = 1.; return eslOK; }
  *ret_id = 0.;

  /* Is N small enough that we can average over all pairwise comparisons? 
     watch out for numerical overflow in this: Pfam N's easily overflow when squared
   */
  if (N <= max_comparisons &&
      N <= sqrt(2. * max_comparisons) &&
      (N * (N-1) / 2) <= max_comparisons)
    {
      for (i = 0; i < N; i++)
	for (j = i+1; j < N; j++)
	  {
	    if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
	    sum += id;
	  }
      sum /= (double) (N * (N-1) / 2);
    }

  /* If nseq is large, calculate average over a stochastic sample. */
  else				
    {
      ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded();

      for (n = 0; n < max_comparisons; n++)
	{
	  do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
	  if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
	  sum += id;
	}
      sum /= (double) max_comparisons;
      esl_randomness_Destroy(r);
    }

  *ret_id = sum;
  return eslOK;
}
/* Function:  esl_dst_CAverageId()
 * Synopsis:  Calculate avg identity for multiple alignment
 * Incept:    SRE, Fri May 18 15:02:38 2007 [Janelia]
 *
 * Purpose:   Calculates the average pairwise fractional identity in
 *            a multiple sequence alignment <as>, consisting of <N>
 *            aligned character sequences of identical length.
 *            
 *            If an exhaustive calculation would require more than
 *            <max_comparisons> pairwise comparisons, then instead of
 *            looking at all pairs, calculate the average over a
 *            stochastic sample of <max_comparisons> random pairs.
 *            This allows the routine to work efficiently even on very
 *            deep MSAs.
 *            
 *            Each fractional pairwise identity (range $[0..$ pid $..1]$
 *            is calculated using <esl_dsq_CPairId()>.
 *
 * Returns:   <eslOK> on success, and <*ret_id> contains the average
 *            fractional identity.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if any of the aligned sequence pairs aren't 
 *            of the same length.
 *            In either case, <*ret_id> is set to 0.
 */
int
esl_dst_CAverageId(char **as, int N, int max_comparisons, double *ret_id)
{
  int    status;
  double id;
  double sum;
  int    i,j,n;
  
  if (N <= 1) { *ret_id = 1.; return eslOK; }
  *ret_id = 0.;

  /* Is nseq small enough that we can average over all pairwise comparisons? */
  if ((N * (N-1) / 2) <= max_comparisons)
    {
      for (i = 0; i < N; i++)
	for (j = i+1; j < N; j++)
	  {
	    if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
	    sum += id;
	  }
      id /= (double) (N * (N-1) / 2);
    }

  /* If nseq is large, calculate average over a stochastic sample. */
  else				
    {
      ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded();

      for (n = 0; n < max_comparisons; n++)
	{
	  do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
	  if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
	  sum += id;
	}
      id /= (double) max_comparisons;
      esl_randomness_Destroy(r);
    }

  *ret_id = id;
  return eslOK;
}
Пример #4
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go         = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
  ESL_RANDOMNESS *r          = NULL;
  char           *bitfile    = esl_opt_GetString (go, "--bitfile");
  int             nbins      = esl_opt_GetInteger(go, "-b");
  int             n          = esl_opt_GetInteger(go, "-n");
  int             be_verbose = esl_opt_GetBoolean(go, "-v");
  int             seed       = esl_opt_GetInteger(go, "-s");

  if (esl_opt_GetBoolean(go, "-r")) r = esl_randomness_CreateTimeseeded();
  else                              r = esl_randomness_Create(seed);

  utest_random(seed, n, nbins, be_verbose);
  utest_choose(r,    n, nbins, be_verbose);

  if (bitfile != NULL) save_bitfile(bitfile, r, n);

  esl_randomness_Destroy(r);
  esl_getopts_Destroy(go);
  return 0;
}
Пример #5
0
int main(void)
{
  ESL_RANDOMNESS *r   = esl_randomness_CreateTimeseeded();
  double a            = -3.;
  double b            = 1.;
  double xori         = -20.;
  double xstep        = 1.0;
  double setsigma     = 1.0;		/* sigma on all points */
  int    n            = 100;
  double *x           = malloc(sizeof(double) * n);
  double *y           = malloc(sizeof(double) * n);
  double *sigma       = malloc(sizeof(double) * n);
  int    i;
  double  ae, be, siga, sigb, cov_ab, cc, Q;
  
  /* Simulate some linear data, with Gaussian noise added to y_i */
  for (i = 0; i < n; i++) {
    sigma[i] = setsigma;
    x[i]     = xori + i*xstep;
    y[i]     = esl_rnd_Gaussian(r, a + b*x[i], sigma[i]);
  }
  
  if (esl_stats_LinearRegression(x, y, sigma, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK)
    esl_fatal("linear regression failed");

  printf("estimated intercept a = %8.4f   [true = %8.4f]\n", ae, a);
  printf("estimated slope b     = %8.4f   [true = %8.4f]\n", be, b);
  printf("estimated sigma on a  = %8.4f\n",                  siga);
  printf("estimated sigma on b  = %8.4f\n",                  sigb);
  printf("estimated cov(a,b)    = %8.4f\n",                  cov_ab);
  printf("correlation coeff     = %8.4f\n",                  cc);
  printf("P-value               = %8.4f\n",                  Q);

  free(x);  free(y);  free(sigma); 
  esl_randomness_Destroy(r);
  exit(0);
}
Пример #6
0
/* The esl_random() unit test:
 * a binned frequency test.
 */
static void
utest_random(long seed, int n, int nbins, int be_verbose)
{
  ESL_RANDOMNESS *r      = NULL;
  int            *counts = NULL;
  double          X2p    = 0.;
  int             i;
  double          X2, exp, diff;

  if ((counts = malloc(sizeof(int) * nbins)) == NULL) esl_fatal("malloc failed");
  esl_vec_ISet(counts, nbins, 0);

  /* This contrived call sequence exercises CreateTimeseeded() and
   * Init(), while leaving us a reproducible chain. Because it's
   * reproducible, we know this test succeeds, despite being
   * statistical in nature.
   */
  if ((r = esl_randomness_CreateTimeseeded()) == NULL)  esl_fatal("randomness create failed");
  if (esl_randomness_Init(r, seed)            != eslOK) esl_fatal("randomness init failed");

  for (i = 0; i < n; i++)
    counts[esl_rnd_Roll(r, nbins)]++;

  /* X^2 value: \sum (o_i - e_i)^2 / e_i */
  for (X2 = 0., i = 0; i < nbins; i++) {
    exp  = (double) n / (double) nbins;
    diff = (double) counts[i] - exp;
    X2 +=  diff*diff/exp;
  }
  if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi squared eval failed");
  if (be_verbose) printf("random():  \t%g\n", X2p);
  if (X2p < 0.01) esl_fatal("chi squared test failed");

  esl_randomness_Destroy(r);
  free(counts);
  return;
}
Пример #7
0
int
main(int argc, char **argv)
{
  int status;
  ESL_RANDOMNESS  *r   = esl_randomness_CreateTimeseeded();
  ESL_ALPHABET    *abc = esl_alphabet_Create(eslAMINO);
  ESL_SCOREMATRIX *S   = NULL;
  ESL_DSQ         *x   = NULL;	/* iid query */
  ESL_DSQ         *y   = NULL;	/* iid target */
  double    lambda;
  double    bg[20];		/* iid background probabilities */
  int       L;			/* query length */
  int       M;			/* target length */
  int       nseq;		/* number of target seqs to simulate */
  int       i;
  int       gop; 
  int       gex;
  char     *mxfile     = "PMX";
  int       raw_sc;

  /* Configuration 
   */
  L = 400;			/* query length */
  M = 400;			/* target length */
  nseq = 50000;
  gop = -11;
  gex = -1;
  lambda = 0.3207;

  ESL_ALLOC(x, sizeof(ESL_DSQ) * (L+2));
  ESL_ALLOC(y, sizeof(ESL_DSQ) * (M+2));

  /* Input an amino acid score matrix from a file. */
  if (mxfile != NULL) {
    ESL_FILEPARSER  *efp = NULL;
    if ( esl_fileparser_Open(mxfile, &efp)  != eslOK) esl_fatal("failed to open score file %s", mxfile);
    if ( esl_sco_Read(efp, abc, &S)         != eslOK) esl_fatal("failed to read matrix from %s", mxfile);
    esl_fileparser_Close(efp);
  } else {			/* default = BLOSUM62 */
    S = esl_scorematrix_Create(abc);
    esl_scorematrix_SetBLOSUM62(S);
  }
  esl_composition_BL62(bg);

  esl_rsq_xIID(r, bg, 20, L, x);
  
  for (i = 0; i < nseq; i++)
    {
      esl_rsq_xIID(r, bg, 20, M, y);
      esl_swat_Score(x, L, y, M, S, gop, gex, &raw_sc);
      printf("%d\n", raw_sc);
    }
  
  free(x);
  free(y);
  esl_scorematrix_Destroy(S);
  esl_alphabet_Destroy(abc);
  esl_randomness_Destroy(r);
  exit(0);

 ERROR:
  exit(status);
}
Пример #8
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* command line configuration      */
  struct cfg_s  cfg;     	/* application configuration       */
  char         *basename= NULL;	/* base of the output file names   */
  char         *alifile = NULL;	/* alignment file name             */
  char         *dbfile  = NULL;	/* name of seq db file             */
  char          outfile[256];	/* name of an output file          */
  int           alifmt;		/* format code for alifile         */
  int           dbfmt;		/* format code for dbfile          */
  ESL_MSAFILE  *afp     = NULL;	/* open alignment file             */
  ESL_MSA      *origmsa = NULL;	/* one multiple sequence alignment */
  ESL_MSA      *msa     = NULL;	/* MSA after frags are removed     */
  ESL_MSA      *trainmsa= NULL;	/* training set, aligned           */
  ESL_STACK    *teststack=NULL; /* test set: stack of ESL_SQ ptrs  */
  int           status;		/* easel return code               */
  int           nfrags;		/* # of fragments removed          */
  int           ntestdom;       /* # of test domains               */
  int           ntest;		/* # of test sequences created     */
  int           nali;		/* number of alignments read       */
  double        avgid;
  
  
  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration:   %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h"))                    cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go)                  != 3)     cmdline_failure(argv[0], "Incorrect number of command line arguments\n");
  basename = esl_opt_GetArg(go, 1); 
  alifile  = esl_opt_GetArg(go, 2);
  dbfile   = esl_opt_GetArg(go, 3);
  alifmt   = eslMSAFILE_STOCKHOLM;
  dbfmt    = eslSQFILE_FASTA;

  /* Set up the configuration structure shared amongst functions here */
  if (esl_opt_IsDefault(go, "--seed"))   cfg.r = esl_randomness_CreateTimeseeded();
  else                                   cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));
  cfg.abc       = NULL;		          /* until we open the MSA file, below */
  cfg.fragfrac  = esl_opt_GetReal(go, "-F");
  cfg.idthresh1 = esl_opt_GetReal(go, "-1");
  cfg.idthresh2 = esl_opt_GetReal(go, "-2");
  cfg.test_lens = NULL;
  cfg.ntest     = 0;

  /* Open the output files */ 
  if (snprintf(outfile, 256, "%s.msa", basename) >= 256)  esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.fa",  basename) >= 256)  esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.pos", basename) >= 256)  esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.neg", basename) >= 256)  esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.tbl", basename) >= 256)  esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp     = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile);

  /* Open the MSA file; determine alphabet */
  status = esl_msafile_Open(alifile, alifmt, NULL, &afp);
  if      (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile);
  else if (status == eslEFORMAT)   esl_fatal("Couldn't determine format of alignment %s\n", alifile);
  else if (status != eslOK)        esl_fatal("Alignment file open failed with error %d\n", status);

  if      (esl_opt_GetBoolean(go, "--amino"))   cfg.abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     cfg.abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     cfg.abc = esl_alphabet_Create(eslRNA);
  else {
    int type;
    status = esl_msafile_GuessAlphabet(afp, &type);
    if (status == eslEAMBIGUOUS)    esl_fatal("Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", alifile);
    else if (status == eslEFORMAT)  esl_fatal("Alignment file parse failed: %s\n", afp->errbuf);
    else if (status == eslENODATA)  esl_fatal("Alignment file %s is empty\n", alifile);
    else if (status != eslOK)       esl_fatal("Failed to read alignment file %s\n", alifile);
    cfg.abc = esl_alphabet_Create(type);
  }
  esl_msafile_SetDigital(afp, cfg.abc);

  if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq);
  else                           esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);

  /* Open and process the dbfile; make sure it's in the same alphabet */
  process_dbfile(&cfg, dbfile, dbfmt);

  /* Read and process MSAs one at a time  */
  nali = 0;
  while ((status = esl_msa_Read(afp, &origmsa)) == eslOK)
    {
      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets   (&cfg, msa, &trainmsa, &teststack);
      ntestdom = esl_stack_ObjectCount(teststack);

      if (ntestdom >= 2) 
	{
	  esl_stack_Shuffle(cfg.r, teststack);
	  synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

	  esl_msa_MinimGaps(trainmsa, NULL, NULL);
	  esl_msa_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

	  esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */
	  fprintf(cfg.tblfp, "%-20s  %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
	  nali++;
	}

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }
  if      (status == eslEFORMAT)  esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", 
					    afp->linenumber, afp->fname, afp->errbuf, afp->buf);	
  else if (status != eslEOF)      esl_fatal("Alignment file read failed with error code %d\n", status);
  else if (nali   == 0)           esl_fatal("No alignments found in file %s\n", alifile);

  if (nali > 0)
    synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  esl_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}
Пример #9
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* command line configuration      */
  struct cfg_s  cfg;     	/* application configuration       */
  char         *basename= NULL;	/* base of the output file names   */
  char         *alifile = NULL;	/* alignment file name             */
  char         *dbfile  = NULL;	/* name of seq db file             */
  char          outfile[256];	/* name of an output file          */
  int           alifmt;		/* format code for alifile         */
  int           dbfmt;		/* format code for dbfile          */
  ESLX_MSAFILE  *afp    = NULL;	/* open alignment file             */
  ESL_MSA      *origmsa = NULL;	/* one multiple sequence alignment */
  ESL_MSA      *msa     = NULL;	/* MSA after frags are removed     */
  ESL_MSA      *trainmsa= NULL;	/* training set, aligned           */
  ESL_STACK    *teststack=NULL; /* test set: stack of ESL_SQ ptrs  */
  int           status;		/* easel return code               */
  int           nfrags;		/* # of fragments removed          */
  int           ntestdom;       /* # of test domains               */
  int           ntest;		/* # of test sequences created     */
  int           nali;		/* number of alignments read       */
  double        avgid;
  
  
  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration:   %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h"))                    cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go)                  != 3)     cmdline_failure(argv[0], "Incorrect number of command line arguments\n");
  basename = esl_opt_GetArg(go, 1); 
  alifile  = esl_opt_GetArg(go, 2);
  dbfile   = esl_opt_GetArg(go, 3);
  alifmt   = eslMSAFILE_STOCKHOLM;
  dbfmt    = eslSQFILE_FASTA;

  /* Set up the configuration structure shared amongst functions here */
  if (esl_opt_IsDefault(go, "--seed"))   cfg.r = esl_randomness_CreateTimeseeded();
  else                                   cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));
  cfg.abc        = NULL;		          /* until we open the MSA file, below */
  cfg.fragfrac   = esl_opt_GetReal(go, "-F");
  cfg.idthresh1  = esl_opt_GetReal(go, "-1");
  cfg.idthresh2  = esl_opt_GetReal(go, "-2");
  cfg.test_lens  = NULL;
  cfg.ntest      = 0;
  cfg.max_ntest  = (esl_opt_IsOn(go, "--maxtest")  ? esl_opt_GetInteger(go, "--maxtest")  : 0); 
  cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0); 

  /* Open the output files */ 
  if (snprintf(outfile, 256, "%s.msa", basename) >= 256)  esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.fa",  basename) >= 256)  esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.pos", basename) >= 256)  esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.neg", basename) >= 256)  esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.tbl", basename) >= 256)  esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp     = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile);
  if (esl_opt_GetBoolean(go, "--pid")) {
    if (snprintf(outfile, 256, "%s.pid", basename) >= 256)  esl_fatal("Failed to construct %%id table file name");
    if ((cfg.pidfp   = fopen(outfile, "w"))        == NULL) esl_fatal("Failed to open %%id table file %s\n", outfile);
  } else cfg.pidfp   = NULL;

  /* Open the MSA file, digital mode; determine alphabet */
  if      (esl_opt_GetBoolean(go, "--amino"))   cfg.abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     cfg.abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     cfg.abc = esl_alphabet_Create(eslRNA);

  status = eslx_msafile_Open(&(cfg.abc), alifile, NULL, alifmt, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq);
  else                           esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);

  /* Open and process the dbfile; make sure it's in the same alphabet */
  process_dbfile(&cfg, dbfile, dbfmt);

  /* Read and process MSAs one at a time  */
  nali = 0;
  while ((status = eslx_msafile_Read(afp, &origmsa)) != eslEOF)
    {
      if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
      esl_msa_ConvertDegen2X(origmsa); 
      esl_msa_Hash(origmsa);

      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets   (&cfg, msa, &trainmsa, &teststack);

      if ( esl_stack_ObjectCount(teststack) >= 2) 
	{
	  /* randomize test domain order, and apply size limit if any */
	  esl_stack_Shuffle(cfg.r, teststack);
	  if (cfg.max_ntest) pstack_select_topn(&teststack, cfg.max_ntest);
	  ntestdom =  esl_stack_ObjectCount(teststack);

	  /* randomize training set alignment order, and apply size limit if any */
	  esl_msashuffle_PermuteSequenceOrder(cfg.r, trainmsa);
	  if (cfg.max_ntrain) msa_select_topn(&trainmsa, cfg.max_ntrain);
	  esl_msa_MinimGaps(trainmsa, NULL, NULL, FALSE);
	  
	  if (esl_opt_GetBoolean(go, "--pid")) write_pids(cfg.pidfp, origmsa, trainmsa, teststack);

	  synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

	  eslx_msafile_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

	  esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */
	  fprintf(cfg.tblfp, "%-20s  %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
	  nali++;
	}

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }
  if  (nali == 0) esl_fatal("No alignments found in file %s\n", alifile);
  
  synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  if (cfg.pidfp) fclose(cfg.pidfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  eslx_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}