Exemplo n.º 1
0
/* Open the source sequence database for negative subseqs;
 * upon return, cfg->dbfp is open (digital, SSI indexed);
 * cfg->db_maxL and cfg->db_nseq are set.
 */
static int
process_dbfile(struct cfg_s *cfg, char *dbfile, int dbfmt)
{
  ESL_SQ     *sq    = esl_sq_CreateDigital(cfg->abc);
  int         status;
  
  /* Open the sequence file in digital mode */
  status = esl_sqfile_OpenDigital(cfg->abc, dbfile, dbfmt, NULL, &(cfg->dbfp));
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", dbfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", dbfile);
  else if (status == eslEINVAL)    esl_fatal("Can't autodetect stdin or .gz.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  /* Read info on each sequence */
  cfg->db_nseq   = 0;
  cfg->db_maxL   = 0;
  while ((status = esl_sqio_ReadInfo(cfg->dbfp, sq)) == eslOK) {
    cfg->db_maxL = ESL_MAX(sq->L, cfg->db_maxL);
    cfg->db_nseq++;
    esl_sq_Reuse(sq);
  }
  if (status != eslEOF) esl_fatal("Something went wrong with reading the seq db");

  /* Open SSI index */
  if (esl_sqfile_OpenSSI(cfg->dbfp, NULL) != eslOK) esl_fatal("Failed to open SSI index file");
  if (cfg->dbfp->data.ascii.ssi->nprimary != cfg->db_nseq)     esl_fatal("oops, nprimary != nseq");

  esl_sq_Destroy(sq);
  return eslOK;
}
Exemplo n.º 2
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go        = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  ESL_ALPHABET *abc       = esl_alphabet_Create(eslAMINO);
  ESL_SQ       *sq        = esl_sq_CreateDigital(abc);
  ESL_SQFILE   *sqfp      = NULL;
  ESL_HMM      *hmm       = create_test_hmm(abc);
  ESL_HMM      *bg        = create_null_hmm(abc);
  ESL_HMX      *hmx       = esl_hmx_Create(400, hmm->M);
  int           format    = eslSQFILE_UNKNOWN;
  char         *seqfile   = esl_opt_GetArg(go, 1);
  float         fwdsc, nullsc;
  int           status;

  status = esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file.");
  else if (status == eslEFORMAT)   esl_fatal("Format unrecognized.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {   
      esl_hmx_GrowTo(hmx, sq->n, hmm->M);

      esl_hmm_Forward(sq->dsq, sq->n, hmm,  hmx, &fwdsc);
      esl_hmm_Forward(sq->dsq, sq->n, bg, hmx, &nullsc);

      printf("%-16s %5d  %11.4f %8.4f    %11.4f %8.4f    %11.4f %8.4f\n", sq->name, (int) sq->n,
	     fwdsc  * eslCONST_LOG2R, (fwdsc  * eslCONST_LOG2R) / sq->n,
	     nullsc * eslCONST_LOG2R, (nullsc * eslCONST_LOG2R) / sq->n,
	     (fwdsc - nullsc) * eslCONST_LOG2R, (fwdsc-nullsc) * eslCONST_LOG2R / sq->n);

      esl_sq_Reuse(sq);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s)\n%s\n",
					   sqfp->filename, sqfp->get_error(sqfp));     
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
					   status, sqfp->filename);
 
  
  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);
  esl_hmm_Destroy(hmm);
  esl_hmm_Destroy(bg);
  esl_hmx_Destroy(hmx);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
Exemplo n.º 3
0
int main(int argc, char **argv)
{
	ESL_GETOPTS   *go	= esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
	char	*hmmfile	= esl_opt_GetArg(go, 1);
	char	*seqfile	= esl_opt_GetArg(go, 2);
	ESL_STOPWATCH *w	= esl_stopwatch_Create();
	ESL_RANDOMNESS*r	= esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
	ESL_ALPHABET*abc	= NULL;
	P7_HMMFILE	*hfp	= NULL;
	P7_HMM		*hmm	= NULL;
	P7_BG		*bg		= NULL;
	P7_PROFILE	*gm1, *gm2;
	int			L		= 2000;		// esl_opt_GetInteger(go, "-L");
	int			N		= esl_opt_GetInteger(go, "-N");
	int			MaxPart	= esl_opt_GetInteger(go, "-M");
	__m128		resdata[10];
	float		*sc1 	= (float*)	(resdata+0);	// uses 1 __m128s
	ESL_DSQ		*dsq	= NULL;
	int			i, j;
	ESL_SQFILE   *sqfp	= NULL;
	DATA_STREAM *dstream;
	struct timeb tbstart, tbend;
	int sumlengths = 0;

	if (p7_hmmfile_Open(hmmfile, NULL, &hfp)!= eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
	if (p7_hmmfile_Read(hfp, &abc, &hmm)	!= eslOK) p7_Fail("Failed to read HMM");

	bg = p7_bg_Create(abc);
	p7_bg_SetLength(bg, L);

	gm1 = p7_profile_Create(hmm->M, abc);
	gm2 = p7_profile_Create(hmm->M, abc);
	p7_ProfileConfig(hmm, bg, gm1, L, p7_UNILOCAL);
	p7_ProfileConfig(hmm, bg, gm2, L, p7_UNILOCAL);

	dstream = p7_ViterbiStream_Create(gm1);
	p7_ViterbiStream_Setup(dstream, L+100, MaxPart); // use max L
	dstream->L = L;

	// No. of partitions computed without full parallelism ( == no. of threads active while some are idle)
	int Niters_part	= dstream->Npartitions % NTHREADS;
	// No. of Model lines that could be computed but are wasted by idle threads waiting on the end
	int Nwasted_threads	= dstream->partition * ((NTHREADS-Niters_part) % NTHREADS);
	// No. of lines in the last partition that go beyond M. It's wasted comp time by a single thread
	int Nwasted_leftover= (dstream->partition - gm1->M % dstream->partition) % dstream->partition;
	// Total number of wasted lines
	int wastedcomp = Nwasted_threads + Nwasted_leftover;
	// Total number of lines computed and waited for
	int totalcomp = wastedcomp + gm1->M; // same as: roundtop(gm1->M, dstream->partition * NTHREADS);

	printf("Viterbi Stream Word with %d Threads, model %s: Modelsize %d, #Segms: %d, SeqL: %d, Nseqs %d, Part %d, #Parts %d\n",
			NTHREADS, hmmfile, gm1->M, (int) ceil(gm1->M/8.0), L, 8*N, dstream->partition, dstream->Npartitions);
	printf("Total Comp Lines: %d | Wasted Comp Lines: %d\n", totalcomp, wastedcomp);

	// for ViterbiFilter
	P7_OPROFILE *om = p7_oprofile_Create(hmm->M, gm1->abc);
	p7_oprofile_Convert(gm1, om);
	P7_OMX		*ox	= p7_omx_Create(hmm->M, 0, 0);

	dsq_cmp_t **seqsdb= calloc(8*N+64, sizeof(dsq_cmp_t*));
if(0)
{	ESL_SQ* sq = esl_sq_CreateDigital(abc);
	if (esl_sqfile_OpenDigital(abc, seqfile, eslSQFILE_FASTA, NULL, &sqfp) != eslOK)
	{	p7_Fail("Failed to open sequence file\n");	return -1; }

	for (j = 0; j < 8*N; j++)
	{
		int res = esl_sqio_Read(sqfp, sq);
		if (res != eslOK) { printf("ATENCAO: faltam sequencias\n"); break; }
		int len = sq->n;
		dsq = sq->dsq;
		seqsdb[j] = malloc(sizeof(dsq_cmp_t));
		seqsdb[j]->length = len;
		seqsdb[j]->seq = malloc((len+4)*sizeof(ESL_DSQ));
		memcpy(seqsdb[j]->seq, dsq, len+2);
		sumlengths += len;
		esl_sq_Reuse(sq);
	}

	ftime(&tbstart);
	N = j/8;
	printf("N = %d\n", N);
		// Sort sequences by length
	qsort(seqsdb, N*8, sizeof(dsq_cmp_t*), compare_seqs);
}
else if(0)
	for (i = 0; i < N; i++)
	{
		for (j = 0; j < 8; j++)
		{
			int len = L - rand()%1000;
			seqsdb[i*8+j] = malloc(sizeof(dsq_cmp_t));
			seqsdb[i*8+j]->seq = malloc(len+4);
			seqsdb[i*8+j]->length = len;
		    esl_rsq_xfIID(r, bg->f, abc->K, len, seqsdb[i*8+j]->seq);
			sumlengths += len;
		}
	}

//	double sumerrors = 0;
	float* results = (float*) alloc_m128_aligned64(N*2+2);	
	ftime(&tbstart);

	for (j = 0; j < N; j++)
	for (i = 0; i < N; i++)
	{
	//	if (i % 10000 == 0) printf("START %d\n", i);

		p7_ViterbiStream(dstream, seqsdb+i*8, sc1);

	//	memcpy(results+i*8, sc1, 32);
	}
	ftime(&tbend);

	double secs = TIMEDIFF(tbstart,tbend);
//	printf("Qsort time: %6.3f | Viterbi time: %6.3f\n", TIMEDIFF(tbqsort,tbstart), secs);
	w->elapsed = w->user = secs;
	esl_stopwatch_Display(stdout, w, "# Opt CPU time: ");
	printf("# %.0fM cells in %.1f Mc/s\n", (sumlengths * (double) gm1->M) / 1e6, (sumlengths * (double) gm1->M * 1e-6) / secs);

if(0)	// compare results against base version
	for (i = 0; i < 1000 && i < N; i++)
	{
		int maxll = 0; float sc2;
		for (j = 0; j < 8; j++)
			if (maxll < seqsdb[i*8+j]->length)
				maxll = seqsdb[i*8+j]->length;

//		for (j = 0; j < 8; j++)	printf("%d ", seqsdb[i*8+j]->length); printf("\n");
//		if (i % 10 == 0) printf("i %d\n", i);

		p7_oprofile_ReconfigRestLength(om, maxll);
		p7_ReconfigLength(gm2, maxll);	// fazer Reconfig aqui para emular compl o VitStream

		for (j = 0; j < 8; j++)
		{
//			p7_ReconfigLength(gm2, seqsdb[i*8+j]->length);
//			p7_Viterbi_unilocal(seqsdb[i*8+j]->seq, seqsdb[i*8+j]->length, gm2, &sc3);
//			p7_Viterbi_unilocal_word(seqsdb[i*8+j]->seq, seqsdb[i*8+j]->length, gm2, &sc2);

//			p7_oprofile_ReconfigLength(om, seqsdb[i*8+j]->length);
			p7_ViterbiFilter(seqsdb[i*8+j]->seq, seqsdb[i*8+j]->length, om, ox, &sc2);
			
			//sumerrors += fabs(sc1[j]- sc2);
			if (fabs(results[i*8+j] - sc2) > 0.00001)
			{	printf("%3d-%d L %4d: VS %f d %f\t| Base SerI %f\n", i, j, seqsdb[i*8+j]->length, 
						results[i*8+j], fabs(results[i*8+j] - sc2), sc2);
				getc(stdin);
			}
		}
	}

	return 0;
}
Exemplo n.º 4
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  char           *hmmfile = esl_opt_GetArg(go, 1);
  char           *seqfile = esl_opt_GetArg(go, 2);
  ESL_ALPHABET   *abc     = NULL;
  P7_HMMFILE     *hfp     = NULL;
  P7_HMM         *hmm     = NULL;
  P7_BG          *bg      = NULL;
  P7_PROFILE     *gm      = NULL;
  P7_GMX         *gx1     = NULL;
  P7_GMX         *gx2     = NULL;
  ESL_SQ         *sq      = NULL;
  ESL_SQFILE     *sqfp    = NULL;
  P7_TRACE       *tr      = NULL;
  int             format  = eslSQFILE_UNKNOWN;
  char            errbuf[eslERRBUFSIZE];
  float           fsc, bsc, vsc;
  float           accscore;
  int             status;

  /* Read in one HMM */
  if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
  if (p7_hmmfile_Read(hfp, &abc, &hmm)            != eslOK) p7_Fail("Failed to read HMM");
  p7_hmmfile_Close(hfp);
 
  /* Read in one sequence */
  sq     = esl_sq_CreateDigital(abc);
  status = esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp);
  if      (status == eslENOTFOUND) p7_Fail("No such file.");
  else if (status == eslEFORMAT)   p7_Fail("Format unrecognized.");
  else if (status == eslEINVAL)    p7_Fail("Can't autodetect stdin or .gz.");
  else if (status != eslOK)        p7_Fail("Open failed, code %d.", status);
  if  (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence");
  esl_sqfile_Close(sqfp);
 
  /* Configure a profile from the HMM */
  bg = p7_bg_Create(abc);
  p7_bg_SetLength(bg, sq->n);
  gm = p7_profile_Create(hmm->M, abc);
  p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); /* multihit local: H3 default */
  
  /* Allocations */
  gx1 = p7_gmx_Create(gm->M, sq->n);
  gx2 = p7_gmx_Create(gm->M, sq->n);
  tr  = p7_trace_CreateWithPP();
  p7_FLogsumInit();

  /* Run Forward, Backward; do OA fill and trace */
  p7_GForward (sq->dsq, sq->n, gm, gx1, &fsc);
  p7_GBackward(sq->dsq, sq->n, gm, gx2, &bsc);
  p7_GDecoding(gm, gx1, gx2, gx2);                   /* <gx2> is now the posterior decoding matrix */
  p7_GOptimalAccuracy(gm, gx2, gx1, &accscore);	     /* <gx1> is now the OA matrix */
  p7_GOATrace(gm, gx2, gx1, tr);

  if (esl_opt_GetBoolean(go, "-d")) p7_gmx_Dump(stdout, gx2, p7_DEFAULT);
  if (esl_opt_GetBoolean(go, "-m")) p7_gmx_Dump(stdout, gx1, p7_DEFAULT);

  p7_trace_Dump(stdout, tr, gm, sq->dsq);
  if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) p7_Die("trace fails validation:\n%s\n", errbuf);

  printf("fwd = %.4f nats\n", fsc);
  printf("bck = %.4f nats\n", bsc);
  printf("acc = %.4f (%.2f%%)\n", accscore, accscore * 100. / (float) sq->n);

  p7_trace_Reuse(tr);

  p7_GViterbi(sq->dsq, sq->n, gm, gx1, &vsc);
  p7_GTrace  (sq->dsq, sq->n, gm, gx1, tr);
  p7_trace_SetPP(tr, gx2);
  p7_trace_Dump(stdout, tr, gm, sq->dsq);

  printf("vit = %.4f nats\n", vsc);
  printf("acc = %.4f\n", p7_trace_GetExpectedAccuracy(tr));

  /* Cleanup */
  esl_sq_Destroy(sq);
  p7_trace_Destroy(tr);
  p7_gmx_Destroy(gx1);
  p7_gmx_Destroy(gx2);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
Exemplo n.º 5
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS   *go          = NULL;
  ESL_ALPHABET  *nt_abc      = esl_alphabet_Create(eslDNA);
  ESL_ALPHABET  *aa_abc      = esl_alphabet_Create(eslAMINO);
  ESL_GENCODE   *gcode       = NULL;
  ESL_GENCODE_WORKSTATE *wrk    = NULL;
  char          *seqfile     = NULL;
  int            informat    = eslSQFILE_UNKNOWN;
  ESL_SQFILE    *sqfp        = NULL;
  int            status;
  
  /***************************************************************** 
   * command line parsing
   *****************************************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  
  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\n where options are:");
      esl_opt_DisplayHelp(stdout, go, /*docgroup=*/0, /*indent=*/2, /*textwidth=*/80);

      puts("\nAvailable NCBI genetic code tables (for -c <id>):");
      esl_gencode_DumpAltCodeTable(stdout);

      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 1) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
 
  seqfile = esl_opt_GetArg(go, 1);

  if (esl_opt_IsOn(go, "--informat") &&
      (informat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
    esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); 

  status = esl_sqfile_OpenDigital(nt_abc, seqfile, informat, /*env=*/NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("Failed to find (or open) sequence file %s", seqfile);
  else if (status == eslEFORMAT)   esl_fatal("Failed to recognize format of sequence file %s", seqfile);
  else if (status != eslOK)        esl_fatal("Failure in opening sequence file %s; code %d", seqfile, status);

  /* A limitation. The esl_sqio_ReadWindow() interface needs to use SSI positioning
   * to read reverse complement, and that doesn't work on nonrewindable streams.
   */
  if ( esl_opt_GetBoolean(go, "-W") && ! esl_sqfile_IsRewindable(sqfp) && ! esl_opt_GetBoolean(go, "--watson"))
    esl_fatal("esl-translate can't read reverse complement from a nonrewindable stream (stdin pipe, .gz file, etc).");

  /* Set up the genetic code. Default = NCBI 1, the standard code; allow ORFs to start at any aa
   */
  gcode = esl_gencode_Create(nt_abc, aa_abc);
  esl_gencode_Set(gcode, esl_opt_GetInteger(go, "-c"));  // default = 1, the standard genetic code

  if      (esl_opt_GetBoolean(go, "-m"))   esl_gencode_SetInitiatorOnlyAUG(gcode);
  else if (! esl_opt_GetBoolean(go, "-M")) esl_gencode_SetInitiatorAny(gcode);      // note this is the default, if neither -m or -M are set


  /* Set up the workstate structure, which contains both stateful 
   * info about our position in <sqfp> and the DNA <sq>, as well as
   * one-time config info from options
   */
  wrk = esl_gencode_WorkstateCreate(go, gcode);


  /* The two styles of main processing loop:
   */
  if (esl_opt_GetBoolean(go, "-W"))  do_by_windows(gcode, wrk, sqfp);
  else                               do_by_sequences(gcode, wrk, sqfp);


  esl_gencode_WorkstateDestroy(wrk);
  esl_sqfile_Close(sqfp);
  esl_gencode_Destroy(gcode);
  esl_alphabet_Destroy(aa_abc);
  esl_alphabet_Destroy(nt_abc);
  esl_getopts_Destroy(go);
  return 0;
}
Exemplo n.º 6
0
int main(int argc, char **argv)
{
	ESL_GETOPTS   *go	= esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
	char	*hmmfile	= esl_opt_GetArg(go, 1);
	char	*seqfile	= esl_opt_GetArg(go, 2);
	ESL_STOPWATCH *w	= esl_stopwatch_Create();
	ESL_RANDOMNESS*r	= esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
	ESL_ALPHABET*abc	= NULL;
	P7_HMMFILE	*hfp	= NULL;
	P7_HMM		*hmm	= NULL;
	P7_BG		*bg		= NULL;
	P7_PROFILE	*gm1, *gm2;
	int			L		= esl_opt_GetInteger(go, "-L");
	int			N		= esl_opt_GetInteger(go, "-N") / SSE16_NVALS;
	int			MaxPart	= esl_opt_GetInteger(go, "-M");
	int			NROUNDS	= esl_opt_GetInteger(go, "-R");
   	int			check	= esl_opt_GetBoolean(go, "-c");
	__m128		resdata[10];
	int			i, j;
	float		*sc1 	= (float*) resdata;
	ESL_SQFILE   *sqfp	= NULL;
	DATA_COPS16 *dcops;
	struct timeb tbstart, tbend;
	int sumlengths = 0;
	float* results = NULL;

	srand(time(NULL));
	if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
	if (p7_hmmfile_Read(hfp, &abc, &hmm)	 != eslOK) p7_Fail("Failed to read HMM");

	bg = p7_bg_Create(abc);
	p7_bg_SetLength(bg, L);
	gm1 = p7_profile_Create(hmm->M, abc);
	gm2 = p7_profile_Create(hmm->M, abc);
	p7_ProfileConfig(hmm, bg, gm1, L, p7_UNILOCAL);
	p7_ProfileConfig(hmm, bg, gm2, L, p7_UNILOCAL);

	dcops = p7_ViterbiCOPSw_Create(gm1);
	p7_ViterbiCOPSW_Setup(dcops, L+100, MaxPart); // use max L
	dcops->L = L;

    int dbsize = SSE16_NVALS*N;
	SEQ **seqsdb= calloc(dbsize+1, sizeof(SEQ*));
	int equallength = 1;

	if (esl_sqfile_OpenDigital(abc, seqfile, eslSQFILE_FASTA, NULL, &sqfp) == eslOK)
	{	// Use Sequence file
		ESL_SQ* sq = esl_sq_CreateDigital(abc);
        int maxseqs, len=0;
        
        if (esl_opt_IsDefault(go, "-N"))    // N not specified in cmdline
            maxseqs = INT_MAX;   // no limit
        else
            maxseqs = SSE16_NVALS*N;      // use cmdline limit

		for (j = 0; j < maxseqs && esl_sqio_Read(sqfp, sq) == eslOK; j++)
		{
		 	if (equallength && sq->n != len && j > 0)
                equallength = 0;
               
			len = sq->n;
			if (j > dbsize)
			{	seqsdb = realloc(seqsdb, 2*(dbsize+1)*sizeof(SEQ*));
				dbsize *= 2;
			}
            
			ESL_DSQ* dsq = sq->dsq;
			seqsdb[j] = malloc(sizeof(SEQ));
			seqsdb[j]->length = len;
			seqsdb[j]->seq = malloc((len+4)*sizeof(ESL_DSQ));
			memcpy(seqsdb[j]->seq, dsq, len+2);
			sumlengths += len;
			esl_sq_Reuse(sq);
		}
		N = j/SSE16_NVALS;
	}
    else	// Not found database. Generate random sequences
        for (i = 0; i < N; i++)
			for (j = 0; j < SSE16_NVALS; j++)
			{
				int len = L; // - rand()%1000;
				seqsdb[i*SSE16_NVALS+j] = malloc(sizeof(SEQ));
				seqsdb[i*SSE16_NVALS+j]->seq = malloc(len+4);
				seqsdb[i*SSE16_NVALS+j]->length = len;
				esl_rsq_xfIID(r, bg->f, abc->K, len, seqsdb[i*SSE16_NVALS+j]->seq);
				sumlengths += len;
			}

   	printf("Viterbi COPS Word with %d threads, model %s. ModelLen: %d, #Segms: %d, SeqL.: %d, #seqs: %d, Partition: %d, #parts: %d\n",
			NTHREADS, hmmfile, gm1->M, (int) ceil(gm1->M/SSE16_NVALS), L, SSE16_NVALS*N*NROUNDS, dcops->partition, dcops->Npartitions);
            
/*	// No. of partitions computed without full parallelism ( == no. of threads active while some are idle)
	int Niters_part	= dcops->Npartitions % NTHREADS;
	// No. of Model lines that could be computed but are wasted by idle threads waiting on the end
	int Nwasted_threads	= dcops->partition * ((NTHREADS-Niters_part) % NTHREADS);
	// No. of lines in the last partition that go beyond M. It's wasted comp time by a single thread
	int Nwasted_leftover= (dcops->partition - gm1->M % dcops->partition) % dcops->partition;
	// Total number of wasted lines
	int wastedcomp = Nwasted_threads + Nwasted_leftover;
	// Total number of lines computed and waited for
	int totalcomp = wastedcomp + gm1->M; // same as: roundtop(gm1->M, dcops->partition * NTHREADS);
	printf("Total Comp Lines: %d | Wasted Comp Lines: %d\n", totalcomp, wastedcomp);
*/

   	if (check) results = (float*) alloc_m128_aligned64((N+1)*2);

	ftime(&tbstart);
    
	if (!equallength)
	{	// Sort sequences by length
		qsort(seqsdb, N*SSE16_NVALS, sizeof(SEQ*), compare_seqs);
	}

	for (j = 0; j < NROUNDS; j++) 
        for (i = 0; i < N; i++)
        {
        //	if (i % 1000 == 0) printf("Seq %d\n", i);

            p7_ViterbiCOPSw_run(dcops, seqsdb+i*SSE16_NVALS, sc1);

         	if (check) memcpy(results+i*SSE16_NVALS, sc1, 32);	// 32 bytes indeed! SSE16_NVALS floats
        }
        
	ftime(&tbend);

	double secs = TIMEDIFF(tbstart,tbend);
	w->elapsed = w->user = secs;
	esl_stopwatch_Display(stdout, w, "# Opt CPU time: ");
	double compmillioncells = NROUNDS * (double) sumlengths * (double) hmm->M * 1e-6;
	printf("# %.0fM cells in %.1f Mc/s\n", compmillioncells, compmillioncells / secs);

	if (check)
    {   P7_OPROFILE *om = p7_oprofile_Create(hmm->M, gm1->abc);
		p7_oprofile_Convert(gm1, om);
		P7_OMX		*ox	= p7_omx_Create(hmm->M, 0, 0);
		printf("Compare results against base version\n");

        for (i = 0; i < N; i++)
        {
            int maxll = 0; float sc2;
            for (j = 0; j < SSE16_NVALS; j++)
                if (maxll < seqsdb[i*SSE16_NVALS+j]->length)
                    maxll = seqsdb[i*SSE16_NVALS+j]->length;

            p7_oprofile_ReconfigRestLength(om, maxll);
    //      p7_ReconfigLength(gm2, maxll);	// emulate the lock-step inter-sequence reconfigs

            for (j = 0; j < SSE16_NVALS; j++)
            {
    //			p7_ReconfigLength(gm2, seqsdb[i*SSE16_NVALS+j]->length);
    //			p7_Viterbi_unilocal(seqsdb[i*SSE16_NVALS+j]->seq, seqsdb[i*SSE16_NVALS+j]->length, gm2, &sc3);
    //			p7_Viterbi_unilocal_word(seqsdb[i*SSE16_NVALS+j]->seq, seqsdb[i*SSE16_NVALS+j]->length, gm2, &sc2);

    //			p7_oprofile_ReconfigLength(om, seqsdb[i*SSE16_NVALS+j]->length);
                p7_ViterbiFilter(seqsdb[i*SSE16_NVALS+j]->seq, seqsdb[i*SSE16_NVALS+j]->length, om, ox, &sc2);
   				sc2 += 1.0;	// -2.0nat optimization, Local to Unilocal mode		
  
                if (fabs(results[i*SSE16_NVALS+j] - sc2) > 0.0001)
                {	printf("Seq %d Len %4d: %f - %f\tdiff: %f\n", i*SSE16_NVALS+j, seqsdb[i*SSE16_NVALS+j]->length, 
							results[i*SSE16_NVALS+j], sc2, fabs(results[i*SSE16_NVALS+j] - sc2));
                }
            }
        }
    }
    
	return 0;
}
Exemplo n.º 7
0
/* serial_master()
 * The serial version of hmmsearch.
 * For each query HMM in <hmmdb> search the database for hits.
 * 
 * A master can only return if it's successful. All errors are handled
 * immediately and fatally with p7_Fail().
 */
static int
serial_master(ESL_GETOPTS *go, struct cfg_s *cfg)
{
  FILE            *ofp      = stdout;	         /* output file for results (default stdout)        */
  FILE            *tblfp    = NULL;		 /* output stream for tabular per-seq (--tblout)    */
  FILE            *dfamtblfp    = NULL;            /* output stream for tabular Dfam format (--dfamtblout)    */
  FILE            *aliscoresfp  = NULL;            /* output stream for alignment scores (--aliscoresout)    */

//  P7_HMM          *hmm        = NULL;              /* one HMM query                                   */
//  P7_SCOREDATA    *scoredata  = NULL;

  int              seqfmt   = eslSQFILE_UNKNOWN; /* format of seqfile                               */
  ESL_SQFILE      *sqfp     = NULL;              /* open seqfile                                    */
  P7_HMMFILE      *hfp      = NULL;		 /* open HMM database file                          */
  ESL_ALPHABET    *abc      = NULL;              /* sequence alphabet                               */
  P7_OPROFILE     *om       = NULL;		 /* target profile                                  */
  ESL_STOPWATCH   *w        = NULL;              /* timing                                          */
  ESL_SQ          *qsq      = NULL;		 /* query sequence                                  */
  int              nquery   = 0;
  int              textw;
  int              status   = eslOK;
  int              hstatus  = eslOK;
  int              sstatus  = eslOK;
  int              i;

  int              ncpus    = 0;

  int              infocnt  = 0;
  WORKER_INFO     *info     = NULL;
#ifdef HMMER_THREADS
  P7_OM_BLOCK     *block    = NULL;
  ESL_THREADS     *threadObj= NULL;
  ESL_WORK_QUEUE  *queue    = NULL;
#endif
  char             errbuf[eslERRBUFSIZE];

  double window_beta = -1.0 ;
  int window_length  = -1;
  if (esl_opt_IsUsed(go, "--w_beta")) { if (  ( window_beta   = esl_opt_GetReal(go, "--w_beta") )  < 0 || window_beta > 1  ) esl_fatal("Invalid window-length beta value\n"); }
  if (esl_opt_IsUsed(go, "--w_length")) { if (( window_length = esl_opt_GetInteger(go, "--w_length")) < 4  ) esl_fatal("Invalid window length value\n"); }


  w = esl_stopwatch_Create();

  if (esl_opt_GetBoolean(go, "--notextw")) textw = 0;
  else                                     textw = esl_opt_GetInteger(go, "--textw");

  /* If caller declared an input format, decode it */
  if (esl_opt_IsOn(go, "--qformat")) {
    seqfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--qformat"));
    if (seqfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--qformat"));
  }

  /* validate options if running as a daemon */
//  if (esl_opt_IsOn(go, "--daemon")) {
    /* running as a daemon, the input format must be type daemon */
//    if (seqfmt != eslSQFILE_UNKNOWN && seqfmt != eslSQFILE_DAEMON)
//      esl_fatal("Input format %s not supported.  Must be daemon\n", esl_opt_GetString(go, "--qformat"));
//    seqfmt = eslSQFILE_DAEMON;

//    if (strcmp(cfg->seqfile, "-") != 0) esl_fatal("Query sequence file must be '-'\n");
//  }

  /* Open the target profile database to get the sequence alphabet */
  status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem, trying to open HMM file %s.\n%s\n",                  cfg->hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, cfg->hmmfile, errbuf);  
  if (! hfp->is_pressed)           p7_Fail("Failed to open binary auxfiles for %s: use hmmpress first\n",             hfp->fname);

  hstatus = p7_oprofile_ReadMSV(hfp, &abc, &om);
  if      (hstatus == eslEFORMAT)   p7_Fail("bad format, binary auxfiles, %s:\n%s",     cfg->hmmfile, hfp->errbuf);
  else if (hstatus == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile);
  else if (hstatus != eslOK)        p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); 

  p7_oprofile_Destroy(om);
  p7_hmmfile_Close(hfp);

  /* Open the query sequence database */
  status = esl_sqfile_OpenDigital(abc, cfg->seqfile, seqfmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n",      cfg->seqfile);
  else if (status == eslEFORMAT)   p7_Fail("Sequence file %s is empty or misformatted\n",        cfg->seqfile);
  else if (status == eslEINVAL)    p7_Fail("Can't autodetect format of a stdin or .gz seqfile");
  else if (status != eslOK)        p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->seqfile);
  if (sqfp->format > 100) // breaking the law!  That range is reserved for msa, for aligned formats
    p7_Fail("%s contains a multiple sequence alignment; expect unaligned sequences, like FASTA\n",   cfg->seqfile);
  qsq = esl_sq_CreateDigital(abc);


  /* Open the results output files */
  if (esl_opt_IsOn(go, "-o"))          { if ((ofp      = fopen(esl_opt_GetString(go, "-o"),          "w")) == NULL)  esl_fatal("Failed to open output file %s for writing\n",                 esl_opt_GetString(go, "-o")); }
  if (esl_opt_IsOn(go, "--tblout"))    { if ((tblfp    = fopen(esl_opt_GetString(go, "--tblout"),    "w")) == NULL)  esl_fatal("Failed to open tabular per-seq output file %s for writing\n", esl_opt_GetString(go, "--tblfp")); }
  if (esl_opt_IsOn(go, "--dfamtblout"))    { if ((dfamtblfp    = fopen(esl_opt_GetString(go, "--dfamtblout"),"w"))    == NULL)  esl_fatal("Failed to open tabular dfam output file %s for writing\n", esl_opt_GetString(go, "--dfamtblout")); }
  if (esl_opt_IsOn(go, "--aliscoresout"))  { if ((aliscoresfp  = fopen(esl_opt_GetString(go, "--aliscoresout"),"w")) == NULL)  esl_fatal("Failed to open alignment scores output file %s for writing\n", esl_opt_GetString(go, "--aliscoresout")); }
 
  output_header(ofp, go, cfg->hmmfile, cfg->seqfile);

#ifdef HMMER_THREADS
  /* initialize thread data */
  if (esl_opt_IsOn(go, "--cpu")) ncpus = esl_opt_GetInteger(go, "--cpu");
  else                           esl_threads_CPUCount(&ncpus);

  if (ncpus > 0)
    {
      threadObj = esl_threads_Create(&pipeline_thread);
      queue = esl_workqueue_Create(ncpus * 2);
    }
#endif

  infocnt = (ncpus == 0) ? 1 : ncpus;
  ESL_ALLOC(info, sizeof(*info) * infocnt);

  for (i = 0; i < infocnt; ++i)
    {
      info[i].bg    = p7_bg_Create(abc);
#ifdef HMMER_THREADS
      info[i].queue = queue;
#endif
    }

#ifdef HMMER_THREADS
  for (i = 0; i < ncpus * 2; ++i)
    {
      block = p7_oprofile_CreateBlock(BLOCK_SIZE);
      if (block == NULL)    esl_fatal("Failed to allocate sequence block");

      status = esl_workqueue_Init(queue, block);
      if (status != eslOK)  esl_fatal("Failed to add block to work queue");
    }
#endif

  /* Outside loop: over each query sequence in <seqfile>. */
  while ((sstatus = esl_sqio_Read(sqfp, qsq)) == eslOK)
  {
      if (sstatus == eslEMEM)                 p7_Fail("Memory allocation error reading sequence file\n", status);
      if (sstatus == eslEINCONCEIVABLE)       p7_Fail("Unexpected error %d reading sequence file\n", status);
     // if (qsq->L > NHMMER_MAX_RESIDUE_COUNT)  p7_Fail("Input sequence %s in file %s exceeds maximum length of %d bases.\n",  qsq->name, cfg->seqfile, NHMMER_MAX_RESIDUE_COUNT);

      nquery++;
      esl_stopwatch_Start(w);	                          

      /* Open the target profile database */
      status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, NULL);
      if (status != eslOK)        p7_Fail("Unexpected error %d in opening hmm file %s.\n",           status, cfg->hmmfile);  
  
#ifdef HMMER_THREADS
      /* if we are threaded, create a lock to prevent multiple readers */
      if (ncpus > 0)
      {
        status = p7_hmmfile_CreateLock(hfp);
        if (status != eslOK) p7_Fail("Unexpected error %d creating lock\n", status);
      }
#endif

      if (fprintf(ofp, "Query:       %s  [L=%ld]\n", qsq->name, (long) qsq->n) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
      if (qsq->acc[0]  != 0 && fprintf(ofp, "Accession:   %s\n", qsq->acc)     < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
      if (qsq->desc[0] != 0 && fprintf(ofp, "Description: %s\n", qsq->desc)    < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");

      for (i = 0; i < infocnt; ++i)
      {
        /* Create processing pipeline and hit list */
        info[i].th  = p7_tophits_Create();
        info[i].pli = p7_pipeline_Create(go, 100, 100, TRUE, p7_SCAN_MODELS); /* M_hint = 100, L_hint = 100 are just dummies for now */
        info[i].pli->hfp = hfp;  /* for two-stage input, pipeline needs <hfp> */


        p7_pli_NewSeq(info[i].pli, qsq);
        info[i].qsq = qsq;

        if (  esl_opt_IsUsed(go, "--toponly") )
          info[i].pli->strand = p7_STRAND_TOPONLY;
        else if (  esl_opt_IsUsed(go, "--bottomonly") )
          info[i].pli->strand = p7_STRAND_BOTTOMONLY;
        else
          info[i].pli->strand = p7_STRAND_BOTH;


        #ifdef HMMER_THREADS
          if (ncpus > 0) esl_threads_AddThread(threadObj, &info[i]);
        #endif
      }

#ifdef HMMER_THREADS
      if (ncpus > 0)  hstatus = thread_loop(threadObj, queue, hfp);
      else	      hstatus = serial_loop(info, hfp);
#else
      hstatus = serial_loop(info, hfp);
#endif
      switch(hstatus)
      {
        case eslEFORMAT:   p7_Fail("bad file format in HMM file %s",             cfg->hmmfile);	  break;
        case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets",   cfg->hmmfile);	  break;
        case eslEOF:
        case eslOK:   /* do nothing */
          break;
        default: 	   p7_Fail("Unexpected error in reading HMMs from %s",   cfg->hmmfile);
      }



      /* merge the results of the search results */
      for (i = 1; i < infocnt; ++i)
      {
        p7_tophits_Merge(info[0].th, info[i].th);
        p7_pipeline_Merge(info[0].pli, info[i].pli);

        p7_pipeline_Destroy(info[i].pli);
        p7_tophits_Destroy(info[i].th);
      }


      /* modify e-value to account for number of models */
      for (i = 0; i < info->th->N ; i++)
      {
        info->th->unsrt[i].lnP         += log((float)info->pli->nmodels);
        info->th->unsrt[i].dcl[0].lnP   = info->th->unsrt[i].lnP;
        info->th->unsrt[i].sortkey      = -1.0 * info->th->unsrt[i].lnP;
      }


      /* it's possible to have duplicates based on how viterbi ranges can overlap */
      p7_tophits_SortByModelnameAndAlipos(info->th);
      p7_tophits_RemoveDuplicates(info->th, info->pli->use_bit_cutoffs);

      /* Print results */
      p7_tophits_SortBySortkey(info->th);
      p7_tophits_Threshold(info->th, info->pli);

      //tally up total number of hits and target coverage
      info->pli->n_output = info->pli->pos_output = 0;
      for (i = 0; i < info->th->N; i++) {
          if ( (info->th->hit[i]->flags & p7_IS_REPORTED) || info->th->hit[i]->flags & p7_IS_INCLUDED) {
              info->pli->n_output++;
              info->pli->pos_output += abs(info->th->hit[i]->dcl[0].jali - info->th->hit[i]->dcl[0].iali) + 1;
          }
      }




      p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
      p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");

      if (tblfp)     p7_tophits_TabularTargets(tblfp,    qsq->name, qsq->acc, info->th, info->pli, (nquery == 1));
      if (dfamtblfp) p7_tophits_TabularXfam(dfamtblfp,   qsq->name, NULL, info->th, info->pli);
      if (aliscoresfp) p7_tophits_AliScores(aliscoresfp, qsq->name, info->th );


      esl_stopwatch_Stop(w);
      info->pli->nseqs = 1;
      p7_pli_Statistics(ofp, info->pli, w);
      if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
      fflush(ofp);

      p7_hmmfile_Close(hfp);
      p7_pipeline_Destroy(info->pli);
      p7_tophits_Destroy(info->th);
      esl_sq_Reuse(qsq);
  }



  if      (sstatus == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
					    sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
  else if (sstatus != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
					    sstatus, sqfp->filename);

  /* Terminate outputs - any last words?
   */
  if (tblfp)    p7_tophits_TabularTail(tblfp,    "hmmscan", p7_SCAN_MODELS, cfg->seqfile, cfg->hmmfile, go);
  if (ofp)      { if (fprintf(ofp, "[ok]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }

  /* Cleanup - prepare for successful exit
   */
  for (i = 0; i < infocnt; ++i)
    p7_bg_Destroy(info[i].bg);

#ifdef HMMER_THREADS
  if (ncpus > 0)
    {
      esl_workqueue_Reset(queue);
      while (esl_workqueue_Remove(queue, (void **) &block) == eslOK)
        p7_oprofile_DestroyBlock(block);
      esl_workqueue_Destroy(queue);
      esl_threads_Destroy(threadObj);
    }
#endif

  free(info);

  esl_sq_Destroy(qsq);
  esl_stopwatch_Destroy(w);
  esl_alphabet_Destroy(abc);
  esl_sqfile_Close(sqfp);

  if (ofp != stdout) fclose(ofp);
  if (tblfp)         fclose(tblfp);
  if (dfamtblfp)     fclose(dfamtblfp);
  if (aliscoresfp)   fclose(aliscoresfp);

  return eslOK;

 ERROR:

 if (ofp != stdout) fclose(ofp);
 if (tblfp)         fclose(tblfp);
 if (dfamtblfp)     fclose(dfamtblfp);
 if (aliscoresfp)   fclose(aliscoresfp);

 return status;
}
Exemplo n.º 8
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  char           *hmmfile = esl_opt_GetArg(go, 1);
  char           *seqfile = esl_opt_GetArg(go, 2);
  ESL_ALPHABET   *abc     = NULL;
  P7_HMMFILE     *hfp     = NULL;
  P7_HMM         *hmm     = NULL;
  P7_BG          *bg      = NULL;
  ESL_SQFILE     *sqfp    = NULL;
  int             format  = eslSQFILE_UNKNOWN;
  ESL_SQ         *sq      = NULL;
  float           nullsc, filtersc, H;
  int             status;
 
  /* Read one HMM from <hmmfile> */
  if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
  if (p7_hmmfile_Read(hfp, &abc, &hmm)     != eslOK) p7_Fail("Failed to read HMM");
  p7_hmmfile_Close(hfp);

  /* Open <seqfile> for reading */
  status = esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file.");
  else if (status == eslEFORMAT)   esl_fatal("Format unrecognized.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  sq = esl_sq_CreateDigital(abc);
  bg = p7_bg_Create(abc);
  p7_bg_SetFilterByHMM(bg, hmm);

  H = esl_vec_FEntropy(bg->f, bg->abc->K);
  printf("bg iid H = %.4f\n", H);

  H = esl_vec_FEntropy(bg->mcomp, bg->abc->K);
  printf("modelcomp H = %.4f\n", H);

  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {
      p7_bg_SetLength(bg, sq->n);

      p7_bg_NullOne    (bg, sq->dsq, sq->n, &nullsc);
      p7_bg_FilterScore(bg, sq->dsq, sq->n, &filtersc);

      printf("%-20s %5d %8.5f %8.5f %8.5f\n", sq->name, (int) sq->n, nullsc, filtersc, filtersc-nullsc);

      esl_sq_Reuse(sq);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n",
					    sqfp->filename, sqfp->linenumber, sqfp->errbuf);     
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
					    status, sqfp->filename);

  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
Exemplo n.º 9
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* application configuration       */
  char         *hmmfile = NULL;	/* HMM file name                   */
  char         *seqfile = NULL; /* sequence file name              */
  char         *mapfile = NULL; /* optional mapped MSA file name   */
  int           infmt   = eslSQFILE_UNKNOWN;
  int           outfmt  = eslMSAFILE_STOCKHOLM;
  P7_HMMFILE   *hfp     = NULL;	/* open HMM file                   */
  ESL_SQFILE   *sqfp    = NULL;	/* open sequence file              */
  char         *outfile = NULL;	  /* output filename               */
  FILE         *ofp     = stdout; /* output stream                 */
  ESL_SQ      **sq      = NULL;	/* array of sequences              */
  void         *p       = NULL;	/* tmp ptr for reallocation        */
  int           nseq    = 0;	/* # of sequences in <seqfile>     */
  int           mapseq  = 0;	/* # of sequences in mapped MSA    */
  int           totseq  = 0;	/* # of seqs in all sources        */
  ESL_ALPHABET *abc     = NULL;	/* alphabet (set from the HMM file)*/
  P7_HMM       *hmm     = NULL;
  P7_TRACE    **tr      = NULL;	/* array of tracebacks             */
  ESL_MSA      *msa     = NULL;	/* resulting multiple alignment    */
  int           msaopts = 0;	/* flags to p7_tracealign_Seqs()   */
  int           idx;		/* counter over seqs, traces       */
  int           status;		/* easel/hmmer return code         */
  char          errbuf[eslERRBUFSIZE];

  /* Parse the command line
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n",       go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help   (argv[0], go);
  if (esl_opt_ArgNumber(go) != 2)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");        

  hmmfile = esl_opt_GetArg(go, 1);
  seqfile = esl_opt_GetArg(go, 2);

  if (strcmp(hmmfile, "-") == 0 && strcmp(seqfile, "-") == 0) 
    cmdline_failure(argv[0], "Either <hmmfile> or <seqfile> may be '-' (to read from stdin), but not both.\n");

  msaopts |= p7_ALL_CONSENSUS_COLS; /* default as of 3.1 */
  if (esl_opt_GetBoolean(go, "--trim"))    msaopts |= p7_TRIM;

  /* If caller declared an input format, decode it 
   */
  if (esl_opt_IsOn(go, "--informat")) {
    infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
    if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat"));
  }

  /* Determine output alignment file format */
  outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"));
  if (outfmt == eslMSAFILE_UNKNOWN)    cmdline_failure(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat"));

  /* Open output stream */
  if ( (outfile = esl_opt_GetString(go, "-o")) != NULL) 
  {
    if ((ofp = fopen(outfile, "w")) == NULL)
      cmdline_failure(argv[0], "failed to open -o output file %s for writing\n", outfile);
  }


  /* If caller forced an alphabet on us, create the one the caller wants  
   */
  if      (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);

  /* Read one HMM, and make sure there's only one.
   */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",                       status, hmmfile, errbuf);  

  status = p7_hmmfile_Read(hfp, &abc, &hmm);
  if      (status == eslEFORMAT)   p7_Fail("Bad file format in HMM file %s:\n%s\n",          hfp->fname, hfp->errbuf);
  else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type));
  else if (status == eslEOF)       p7_Fail("Empty HMM file %s? No HMM data found.\n",        hfp->fname);
  else if (status != eslOK)        p7_Fail("Unexpected error in reading HMMs from %s\n",     hfp->fname);

  status = p7_hmmfile_Read(hfp, &abc, NULL);
  if      (status != eslEOF)       p7_Fail("HMM file %s does not contain just one HMM\n",    hfp->fname);
  p7_hmmfile_Close(hfp);


  /* We're going to build up two arrays: sequences and traces.
   * If --mapali option is chosen, the first set of sequences/traces is from the provided alignment
   */
  if ( (mapfile = esl_opt_GetString(go, "--mapali")) != NULL)
  {
    map_alignment(mapfile, hmm, &sq, &tr, &mapseq);
  }
  totseq = mapseq;

  /* Read digital sequences into an array (possibly concat'ed onto mapped seqs)
   */
  status = esl_sqfile_OpenDigital(abc, seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n",          seqfile);
  else if (status == eslEFORMAT)   p7_Fail("Sequence file %s is empty or misformatted\n",            seqfile);
  else if (status != eslOK)        p7_Fail("Unexpected error %d opening sequence file %s\n", status, seqfile);

  ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq + 1));
  sq[totseq] = esl_sq_CreateDigital(abc);
  nseq = 0;
  while ((status = esl_sqio_Read(sqfp, sq[totseq+nseq])) == eslOK)
  {
    nseq++;
    ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq+nseq+1));
    sq[totseq+nseq] = esl_sq_CreateDigital(abc);
  }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", 
					   sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);
  esl_sqfile_Close(sqfp);
  totseq += nseq;


  /* Remaining initializations, including trace array allocation
   */
  ESL_RALLOC(tr, p, sizeof(P7_TRACE *) * totseq);
  for (idx = mapseq; idx < totseq; idx++)
    tr[idx] = p7_trace_CreateWithPP();

  p7_tracealign_computeTraces(hmm, sq, mapseq, totseq - mapseq, tr);

  p7_tracealign_Seqs(sq, tr, totseq, hmm->M, msaopts, hmm, &msa);

  eslx_msafile_Write(ofp, msa, outfmt);

  for (idx = 0; idx <= totseq; idx++) esl_sq_Destroy(sq[idx]);    /* including sq[nseq] because we overallocated */
  for (idx = 0; idx <  totseq; idx++) p7_trace_Destroy(tr[idx]); 
  free(sq);
  free(tr);
  esl_msa_Destroy(msa);
  p7_hmm_Destroy(hmm);
  if (ofp != stdout) fclose(ofp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return eslOK;

 ERROR:
  return status;
}
Exemplo n.º 10
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  ESL_ALPHABET   *abc     = esl_alphabet_Create(eslAMINO);
  char           *qfile   = esl_opt_GetArg(go, 1);
  char           *tfile   = esl_opt_GetArg(go, 2);
  ESL_SQFILE     *qfp     = NULL;
  ESL_SQFILE     *tfp     = NULL;
  ESL_SQ         *qsq     = esl_sq_CreateDigital(abc);
  ESL_SQ         *tsq     = esl_sq_CreateDigital(abc);
  ESL_SCOREMATRIX *S      = esl_scorematrix_Create(abc);
  ESL_DMATRIX     *Q      = NULL;
  P7_BG           *bg     = p7_bg_Create(abc);
  P7_HMM          *hmm    = NULL;
  P7_PROFILE      *gm     = NULL;
  P7_REFMX        *vit    = p7_refmx_Create(200, 400); /* will grow as needed */
  double          *fa     = malloc(sizeof(double) * abc->K);
  double          popen   = 0.02;
  double          pextend = 0.4;
  double          lambda;
  float           vsc;
  float           nullsc;
  int             status;

  esl_composition_BL62(fa);
  esl_vec_D2F(fa, abc->K, bg->f);
  esl_scorematrix_Set("BLOSUM62", S);
  esl_scorematrix_ProbifyGivenBG(S, fa, fa, &lambda, &Q); 
  esl_scorematrix_JointToConditionalOnQuery(abc, Q);

  if (esl_sqfile_OpenDigital(abc, qfile, eslSQFILE_UNKNOWN, NULL, &qfp) != eslOK) esl_fatal("failed to open %s", qfile);
  if (esl_sqio_Read(qfp, qsq) != eslOK) esl_fatal("failed to read query seq");

  p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm);  
  p7_hmm_SetComposition(hmm);
  p7_hmm_SetConsensus(hmm, qsq); 
  
  gm = p7_profile_Create(hmm->M, abc);
  p7_profile_ConfigUnilocal(gm, hmm, bg, 400);
  
  if (esl_sqfile_OpenDigital(abc, tfile, eslSQFILE_UNKNOWN, NULL, &tfp) != eslOK) esl_fatal("failed to open %s", tfile);
  while ((status = esl_sqio_Read(tfp, tsq)) == eslOK)
    {
      p7_bg_SetLength     (bg, tsq->n);
      p7_profile_SetLength(gm, tsq->n);
      
      p7_ReferenceViterbi(tsq->dsq, tsq->n, gm, vit, NULL, &vsc);    
      p7_bg_NullOne(bg, tsq->dsq, tsq->n, &nullsc);

      printf("%.4f %-25s %-25s\n", (vsc - nullsc) / eslCONST_LOG2, tsq->name, gm->name);
   
      esl_sq_Reuse(tsq);
      p7_refmx_Reuse(vit);
    }
  
  p7_refmx_Destroy(vit);
  p7_profile_Destroy(gm);
  p7_hmm_Destroy(hmm);
  p7_bg_Destroy(bg);
  esl_dmatrix_Destroy(Q);
  esl_scorematrix_Destroy(S);
  free(fa);
  esl_sq_Destroy(qsq);
  esl_sq_Destroy(tsq);
  esl_sqfile_Close(qfp);
  esl_sqfile_Close(tfp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}