Example #1
0
/* Function: esl_keyhash_Store()
 * Synopsis: Store a key and get a key index for it.
 *
 * Purpose:  Store a string <key> of length <n> in the key index hash table <kh>.
 *           Associate it with a unique key index, counting from
 *           0. It's this index that lets us map the hashed keys to
 *           integer-indexed C arrays, clumsily emulating Perl's
 *           hashes. Optionally returns the index through <opt_index>.
 *           
 *           <key>, <n> follow the standard idiom for strings and
 *           unterminated buffers.
 *
 * Returns:  <eslOK> on success; stores <key> in <kh>; <opt_index> is 
 *           returned, set to the next higher index value.
 *           Returns <eslEDUP> if <key> was already stored in the table;
 *           <opt_index> is set to the existing index for <key>.
 *
 * Throws:   <eslEMEM> on allocation failure, and sets <opt_index> to -1.
 */
int
esl_keyhash_Store(ESL_KEYHASH *kh, const char *key, esl_pos_t n, int *opt_index)
{
  uint32_t val = jenkins_hash(key, n, kh->hashsize);
  int idx;
  int status;
  
  if (n == -1) n = strlen(key);

  /* Was this key already stored?  */
  for (idx = kh->hashtable[val]; idx != -1; idx = kh->nxt[idx])
    if (esl_memstrcmp(key, n, kh->smem + kh->key_offset[idx]))
      { 
	if (opt_index != NULL) *opt_index = idx; 
	return eslEDUP; 
      }

  /* Reallocate key ptr/index memory if needed */
  if (kh->nkeys == kh->kalloc) 
    { 
      ESL_REALLOC(kh->key_offset, sizeof(int)*kh->kalloc*2);
      ESL_REALLOC(kh->nxt,        sizeof(int)*kh->kalloc*2);
      kh->kalloc *= 2;
    }

  /* Reallocate key string memory if needed */
  while (kh->sn + n + 1 > kh->salloc)
    {
      ESL_REALLOC(kh->smem, sizeof(char) * kh->salloc * 2);
      kh->salloc *= 2;
    }

  /* Copy the key, assign its index */
  idx                 = kh->nkeys;
  kh->key_offset[idx] = kh->sn;
  kh->sn             += n+1;
  esl_memstrcpy(key, n, kh->smem + kh->key_offset[idx]);
  kh->nkeys++;

  /* Insert new element at head of the approp linked list in hashtable */
  kh->nxt[idx]       = kh->hashtable[val];
  kh->hashtable[val] = idx;

  /* Time to upsize? If we're 3x saturated, expand the hash table */
  if (kh->nkeys > 3*kh->hashsize)
    if ((status = key_upsize(kh)) != eslOK) goto ERROR;

  if (opt_index != NULL) *opt_index = idx;
  return eslOK;

 ERROR:
  if (opt_index != NULL) *opt_index = -1;
  return status;
}
Example #2
0
/* regurgitate_pfam_as_pfam()
 * 
 * Given an open Pfam formatted msafile, read the next alignment and
 * regurgitate it, after modifying it as necessary (change dna to rna,
 * wussify SS, etc) in Pfam format.
 * 
 * Returns <eslOK> on success. 
 * Returns <eslEOF> if there are no more alignments in <afp>.
 * Returns <eslEFORMAT> if parse fails because of a file format
 * problem, in which case afp->errmsg is set to contain a formatted
 * message that indicates the cause of the problem.
 */
static int
regurgitate_pfam_as_pfam(ESLX_MSAFILE *afp, FILE *ofp, char *gapsym, int force_lower, int force_upper, int force_rna, int force_dna, int iupac_to_n, int x_is_bad, int wussify, int dewuss, int fullwuss, char *rfrom, char *rto)
{
  char      *p;
  esl_pos_t  n;
  char      *first_seqname = NULL;
  char      *gx      = NULL;
  char      *seqname = NULL;
  char      *tag     = NULL;
  char      *text    = NULL;
  esl_pos_t  gxlen, namelen, taglen, textlen;
  int        nseq_read = 0;
  int        parse_gc_and_gr;
  int        flushpoint = 10000;
  int        exp_alen = -1;
  char      *buf      = NULL;
  esl_pos_t  pos, pos2;
  int        status;


  parse_gc_and_gr = (wussify || dewuss || fullwuss) ? TRUE : FALSE; /* should we parse out GR/GC lines and check if they're SS lines? */
  afp->errmsg[0] = '\0';
   
  /* Check the magic Stockholm header line.
   * We have to skip blank lines here, else we perceive
   * trailing blank lines in a file as a format error when
   * reading in multi-record mode.
   */
  /* Check the magic Stockholm header line, allowing blank lines */
  do { 
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) return eslEOF; 
    else if (status != eslOK)  esl_fatal("small mem parse error. problem reading line %d of msafile", (int) afp->linenumber);
    fprintf(ofp, "%.*s\n", (int) afp->n, afp->line);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||                 /* skip blank lines             */
	       (esl_memstrpfx(afp->line, afp->n, "#")                         /* and skip comment lines       */
	   && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));            /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  /* Read the alignment file one line at a time.  */
  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) 
    {
      if ((int) afp->linenumber % flushpoint == 0) fflush(ofp);
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */
    
      if      (!n)                          fprintf(ofp, "\n");
      else if (esl_memstrpfx(p, n, "//")) { fprintf(ofp, "//\n"); break; } /* normal way out */
      else if (*p == '#') 
	{
	  if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GC")) 
	    {  	/* parse line into temporary strings */
	      if (esl_memtok(&p, &n, " \t",  &gx,   &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t",  &tag,  &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n,  " \t", &text, &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
	      pos = text - afp->line; /* pos: position of first aligned char on line; total width of annotation tag w/spaces */
	
	      /* verify alignment length */
	      if      (exp_alen == -1)      exp_alen = textlen;
	      else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);
	
	      /* we need to make a writable string copy of the annotation, to edit it */
	      ESL_REALLOC(buf, sizeof(char) * (textlen+1));
	      esl_memstrcpy(text, textlen, buf);
	     
	      if (esl_memstrcmp(tag, taglen, "SS_cons")) 
		{
		  if      (wussify)  esl_kh2wuss(buf, buf);
		  else if (dewuss)   esl_wuss2kh(buf, buf);
		  else if (fullwuss) 
		    { 
		      status = esl_wuss_full(buf, buf);
		      if      (status == eslESYNTAX) esl_fatal("Bad SS_cons line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
		      else if (status != eslOK)      esl_fatal("Conversion of SS_cons line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
		    }
		}		  
	      fprintf(ofp, "#=GC %.*s%*s%s\n", (int) taglen, tag, (int) (pos-taglen-5), "", buf);
	    }
	  else if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GR") == 0) 
	    { 
	      /* parse line into temporary strings */
	      if (esl_memtok(&p, &n, " \t", &gx,      &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t", &tag,     &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      pos = tag   - afp->line;
	      if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      pos2 = text - afp->line;

	      /* we need to make a writable string copy of the annotation, to edit it */
	      ESL_REALLOC(buf, sizeof(char) * (textlen+1));
	      esl_memstrcpy(text, textlen, buf);

	      /* verify alignment length */
	      if      (exp_alen == -1)      exp_alen = textlen;
	      else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);
	
	      if (esl_memstrcmp(tag, taglen, "SS") == 0) 
		{
		  if      (wussify)  esl_kh2wuss(buf, buf);
		  else if (dewuss)   esl_wuss2kh(buf, buf);
		  else if (fullwuss) { 
		    status = esl_wuss_full(buf, buf);
		    if      (status == eslESYNTAX) esl_fatal("Bad SS line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
		    else if (status != eslOK)      esl_fatal("Conversion of SS line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
		  }
		}		  

	      fprintf(ofp, "#=GR %.*s%*s%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen-5), "", (int) taglen, tag, (int) (pos2-pos-taglen), "", buf);
	    }
	  else { /* '#' prefixed line that is not #=GR (or it is #=GR and wussify,dewuss,fullwuss are all FALSE) */
	    fprintf(ofp, "%.*s\n", (int) afp->n, afp->line); /* print the line */
	  }
	} /* end of 'if (*s == '#')' */ 
      else 
	{ /* sequence line */
	  if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
	  pos = text - afp->line;

	  /* verify alignment length */
	  if     (exp_alen == -1)      exp_alen = textlen;
	  else if(exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);
      
	  /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */
	  if (nseq_read == 0) { if ((status = esl_memstrdup(seqname, namelen, &(first_seqname))) != eslOK) goto ERROR; }
	  else if (esl_memstrcmp(seqname, namelen, first_seqname)) { ESL_XFAIL(eslEFORMAT, afp->errmsg, "parse failed (line %d): two seqs named %s. Alignment appears to be in Stockholm format. Reformat to Pfam with esl-reformat.", (int) afp->linenumber, seqname); }
	  nseq_read++;
      
	  /* we need to make a writable string copy of the annotation, to edit it */
	  ESL_REALLOC(buf, sizeof(char) * (textlen+1));
	  esl_memstrcpy(text, textlen, buf);

	  /* make adjustments as necessary */
	  if (rfrom)       symconvert(buf, rfrom, rto);
	  if (gapsym)      symconvert(buf, "-_.", gapsym);
	  if (force_lower) symconvert(buf,
				      "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
				      "abcdefghijklmnopqrstuvwxyz");
	  if (force_upper) symconvert(buf,
				      "abcdefghijklmnopqrstuvwxyz",
				      "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
	  if (force_rna)   symconvert(buf, "Tt", "Uu");
	  if (force_dna)   symconvert(buf, "Uu", "Tt");
	  if (iupac_to_n)  symconvert(buf, 
				      "RYMKSWHBVDrymkswhbvd",
				      "NNNNNNNNNNnnnnnnnnnn");
	  if (x_is_bad)    symconvert(buf,   "Xx", "Nn");
	  /* print it out */
	  fprintf(ofp, "%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen), "", buf);
	}
    }

  /* If we saw a normal // end, we would've successfully read a line,
   * so when we get here, status (from the line read) should be eslOK.
   */ 
  if (status != eslOK) esl_fatal("--small parse failed (line %d): didn't find // at end of alignment", (int) afp->linenumber);
  if (first_seqname) free(first_seqname);
  if (buf)           free(buf);
  return eslOK;

 ERROR:
  return status;
}
/* Function:  fm_alphabetCreate()
 *
 * Synopsis:   Produce an alphabet for FMindex.
 *
 * Purpose:   Produce an alphabet for FMindex. This may end up being
 *            replaced with easel alphabet functions, but the easel
 *            requirement of having a gap-character between
 *            cannonical and degenerate symbols poses a problem
 *            from a bit-packing perspective
 *
 * Args:      meta      - metadata object already initialized with the alphabet type.
 *                        This will hold the alphabet (and corresponding reverse alphabet)
 *                        created here.
 *            alph_bits - pointer to an int that this function sets equal to the
 *                        number of bits required to store the alphabet (log of alph size)
 *
 * Returns:   <eslOK> on success.
 */
int
fm_alphabetCreate (FM_METADATA *meta, uint8_t *alph_bits) {

	int i = 0;
	int status;

	if ( meta->alph_type ==  fm_DNA) {
      meta->alph_size = 4;
      if (alph_bits) *alph_bits = 2;
	} else if ( meta->alph_type ==  fm_DNA_full) {
      meta->alph_size = 15;
      if (alph_bits) *alph_bits = 4;
	} else if ( meta->alph_type ==  fm_AMINO) {
	    meta->alph_size = 26;
      if (alph_bits) *alph_bits = 5;
	} else {
      esl_fatal("Unknown alphabet type\n%s", "");
	}

	ESL_ALLOC(meta->alph, (1+meta->alph_size)*sizeof(char));
	ESL_ALLOC(meta->inv_alph, 256*sizeof(char));

	if ( meta->alph_type ==  fm_DNA || meta->alph_type ==  fm_DNA_full)
	  ESL_ALLOC(meta->compl_alph, (1+meta->alph_size)*sizeof(int));



	if ( meta->alph_type ==  fm_DNA) {
		esl_memstrcpy("ACGT", 4, meta->alph);
		for (i=0; i<4; i++)
		  meta->compl_alph[i] = 3-i;
	} else if ( meta->alph_type ==  fm_DNA_full) {
		esl_memstrcpy("ACGTRYMKSWHBVDN", 15, meta->alph);
	  meta->compl_alph[0] = 3;    /* A->T */
		meta->compl_alph[1] = 2;    /* C->G */
	  meta->compl_alph[2] = 1;    /* G->C */
	  meta->compl_alph[3] = 0;    /* T->A */
	  meta->compl_alph[4] = 5;    /* R->Y */
	  meta->compl_alph[5] = 4;    /* Y->R */
	  meta->compl_alph[6] = 7;    /* M->K */
	  meta->compl_alph[7] = 6;    /* K->M */
	  meta->compl_alph[8] = 8;    /* S  S */
	  meta->compl_alph[9] = 9;    /* W  W */
	  meta->compl_alph[10]= 13;   /* H->D */
	  meta->compl_alph[11]= 12;   /* B->V */
	  meta->compl_alph[12]= 11;   /* V->B */
	  meta->compl_alph[13]= 10;   /* D->H */
	  meta->compl_alph[14]= 14;   /* N  N */
	} else if ( meta->alph_type ==  fm_AMINO) {
		esl_memstrcpy("ACDEFGHIKLMNPQRSTVWYBJZOUX", meta->alph_size, meta->alph);
	}

	for (i=0; i<256; i++)
	  meta->inv_alph[i] = -1;

	for (i=0; i<meta->alph_size; i++) {
	  meta->inv_alph[tolower(meta->alph[i])] = meta->inv_alph[toupper(meta->alph[i])] = i;

	  //special case for RNA, equate U to T:
	  if (   (meta->alph_type ==  fm_DNA || meta->alph_type ==  fm_DNA_full) && toupper(meta->alph[i]) == 'T')
	    meta->inv_alph['u'] = meta->inv_alph['U'] = i;
	}



	return eslOK;

ERROR:
    esl_fatal("error allocating space for alphabet\n");
    return eslFAIL;
}