Exemple #1
0
/* Fetch in a random sequence of length <L> from the the pre-digitized
 * concatenated sequence database, select a random subseq, shuffle it
 * by the chosen algorithm; set dsq[1..L] to the resulting randomized
 * segment.
 *
 * If <logfp> is non-NULL, append one or more "<sqname> <from> <to>"
 * fields to current line, to record where the random segment was
 * selected from. This is useful in cases where we want to track back
 * the origin of a high-scoring segment, in case the randomization
 * wasn't good enough to obscure the identity of a segment.
 *
 */
static int
set_random_segment(ESL_GETOPTS *go, struct cfg_s *cfg, FILE *logfp, ESL_DSQ *dsq, int L)
{
    ESL_SQ  *sq           = esl_sq_CreateDigital(cfg->abc);
    int      minDPL       = esl_opt_GetInteger(go, "--minDPL");
    int      db_dependent = (esl_opt_GetBoolean(go, "--iid") == TRUE ? FALSE : TRUE);
    char    *pkey         = NULL;
    int      start, end;
    int64_t  Lseq;
    int      status;

    if (L==0) return eslOK;
    if (L > cfg->db_maxL) esl_fatal("can't fetch a segment of length %d; database max is %d\n", L, cfg->db_maxL);

    /* fetch a random subseq from the source database */
    esl_sq_GrowTo(sq, L);
    if (db_dependent)
    {
        do {
            if (pkey != NULL) free(pkey);
            if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &Lseq, &pkey) != eslOK)
                esl_fatal("failed to look up a random seq");
        } while (Lseq < L);

        start = 1 + esl_rnd_Roll(cfg->r, Lseq-L);
        end   = start + L - 1;
        if (esl_sqio_FetchSubseq(cfg->dbfp, pkey, start, end, sq) != eslOK) esl_fatal("failed to fetch subseq");
        esl_sq_ConvertDegen2X(sq);
    }

    /* log sequence source info: <name> <start> <end> */
    if (logfp != NULL && db_dependent)
        fprintf(logfp, " %-15s %5d %5d", pkey, start, end);

    /* Now apply the appropriate randomization algorithm */
    if      (esl_opt_GetBoolean(go, "--mono"))    status = esl_rsq_XShuffle  (cfg->r, sq->dsq, L, sq->dsq);
    else if (esl_opt_GetBoolean(go, "--di")) {
        if (L < minDPL)                             status = esl_rsq_XShuffle  (cfg->r, sq->dsq, L, sq->dsq);
        else                                        status = esl_rsq_XShuffleDP(cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq);
    }
    else if (esl_opt_GetBoolean(go, "--markov0")) status = esl_rsq_XMarkov0  (cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq);
    else if (esl_opt_GetBoolean(go, "--markov1")) status = esl_rsq_XMarkov1  (cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq);
    else if (esl_opt_GetBoolean(go, "--reverse")) status = esl_rsq_XReverse  (sq->dsq, L, sq->dsq);
    else if (esl_opt_GetBoolean(go, "--iid"))     status = esl_rsq_xIID      (cfg->r, cfg->fq, cfg->abc->K, L, sq->dsq);
    else                                          status = eslEINCONCEIVABLE;
    if (status != eslOK) esl_fatal("esl's shuffling failed");

    memcpy(dsq, sq->dsq+1, sizeof(ESL_DSQ) * L);
    esl_sq_Destroy(sq);
    free(pkey);
    return eslOK;
}
Exemple #2
0
/* seq_generation()
 *
 * Generating sequences.
 */
static int
seq_generation(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt)
{
  ESL_ALPHABET *abc = NULL;
  ESL_SQ       *sq  = NULL;
  double       *fq  = NULL;
  int           alphatype = eslUNKNOWN;   // static checkers can't see that 1 of --rna, --dna, --amino must be true
  int           N         = esl_opt_GetInteger(go, "-N");
  int           L         = esl_opt_GetInteger(go, "-L");
  int           i;
  int           status;

  if (L <= 0) esl_fatal("To generate sequences, set -L option (length of generated seqs) > 0 ");
  if (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
  if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
  if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
  abc = esl_alphabet_Create(alphatype);
  sq  = esl_sq_CreateDigital(abc);
  esl_sq_GrowTo(sq, L);

  /* Pick the iid frequency distribution to use */
  ESL_ALLOC(fq, sizeof(double) * abc->K);
  switch (alphatype) {
  case eslRNA:
  case eslDNA:    esl_vec_DSet(fq, 4, 0.25); break;
  case eslAMINO:  esl_composition_SW34(fq);  break;
  default:        esl_vec_DSet(fq, abc->K, 1.0 / (double) abc->K); break;
  }
    
  /* generate */
  for (i = 0; i < N; i++)
    {
      esl_rsq_xIID(r, fq, abc->K, L, sq->dsq);
      if (N > 1) esl_sq_FormatName(sq, "random%d", i);
      else       esl_sq_SetName(sq, "random");
      sq->n = L;
      esl_sqio_Write(ofp, sq, outfmt, FALSE);
    }

  free(fq);
  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  return eslOK;

 ERROR:
  if (fq != NULL) free(fq);
  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  return status;
}
int
main(int argc, char **argv)
{
  int status;
  ESL_RANDOMNESS  *r   = esl_randomness_CreateTimeseeded();
  ESL_ALPHABET    *abc = esl_alphabet_Create(eslAMINO);
  ESL_SCOREMATRIX *S   = NULL;
  ESL_DSQ         *x   = NULL;	/* iid query */
  ESL_DSQ         *y   = NULL;	/* iid target */
  double    lambda;
  double    bg[20];		/* iid background probabilities */
  int       L;			/* query length */
  int       M;			/* target length */
  int       nseq;		/* number of target seqs to simulate */
  int       i;
  int       gop; 
  int       gex;
  char     *mxfile     = "PMX";
  int       raw_sc;

  /* Configuration 
   */
  L = 400;			/* query length */
  M = 400;			/* target length */
  nseq = 50000;
  gop = -11;
  gex = -1;
  lambda = 0.3207;

  ESL_ALLOC(x, sizeof(ESL_DSQ) * (L+2));
  ESL_ALLOC(y, sizeof(ESL_DSQ) * (M+2));

  /* Input an amino acid score matrix from a file. */
  if (mxfile != NULL) {
    ESL_FILEPARSER  *efp = NULL;
    if ( esl_fileparser_Open(mxfile, &efp)  != eslOK) esl_fatal("failed to open score file %s", mxfile);
    if ( esl_sco_Read(efp, abc, &S)         != eslOK) esl_fatal("failed to read matrix from %s", mxfile);
    esl_fileparser_Close(efp);
  } else {			/* default = BLOSUM62 */
    S = esl_scorematrix_Create(abc);
    esl_scorematrix_SetBLOSUM62(S);
  }
  esl_composition_BL62(bg);

  esl_rsq_xIID(r, bg, 20, L, x);
  
  for (i = 0; i < nseq; i++)
    {
      esl_rsq_xIID(r, bg, 20, M, y);
      esl_swat_Score(x, L, y, M, S, gop, gex, &raw_sc);
      printf("%d\n", raw_sc);
    }
  
  free(x);
  free(y);
  esl_scorematrix_Destroy(S);
  esl_alphabet_Destroy(abc);
  esl_randomness_Destroy(r);
  exit(0);

 ERROR:
  exit(status);
}