Esempio n. 1
0
/* regurgitate_pfam_as_afa()
 * 
 * Given an open Pfam formatted msafile, read the next alignment and 
 * regurgitate it in aligned FASTA (AFA) format without storing
 * it in a esl_msa data structure.
 * 
 * We need to do two passes through the file because in Pfam
 * sequence accessions (#=GS <seqname> AC) and sequence descriptions
 * (#=GS <seqname> DE) appear altogether before any aligned sequence
 * data, while in AFA they appear on the same line as the sequence
 * name (accession, then description).
 *
 * Example: 
 * # STOCKHOLM 1.0
 * #=GS tRNA1 AC RF00005-1
 * #=GS tRNA2 AC RF00005-2
 * #=GS tRNA1 DE first tRNA
 * #=GS tRNA2 DE second tRNA
 * 
 * tRNA1 GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCA
 * tRNA2 UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGACUCCCCGUAUCGGAG
 * 
 * converts to AFA:
 * >tRNA1 RF00005-1 first tRNA
 * GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAU
 * CCACAGAAUUCGCA
 * >tRNA2 RF00005-2 second tRNA
 * UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGAC
 * UCCCCGUAUCGGAG
 * 
 * In the first pass, output the sequence names and accessions we find
 * as '#=GS <seqname> AC' lines in the Pfam alignment to an accession
 * tmpfile, and output sequence names and descriptions we find as 
 * as '#=GS <seqname> DE' lines in the Pfam alignment to a description
 * tmpfile.
 *
 * In the second pass, rewind all (up to 3) files: <ac_tmpfile>,
 * <de_tmpfile> and the Pfam alignment file and start reading them
 * again.  As we're reading them, output the accessions, descriptions
 * and aligned sequence data in the proper order to an aligned FASTA
 * file.
 * 
 * Set <ret_reached_eof> as TRUE if the alignment read and reformatted
 * appears to be the only one remaining in afp.  Set <ret_reached_eof>
 * as FALSE if afp appears to include at least one more alignment.
 * 
 * Returns void. Dies upon any input error.
 */
static void
regurgitate_pfam_as_afa(ESLX_MSAFILE *afp, FILE *ofp, char *alifile, char *gapsym, int force_lower, int force_upper, int force_rna, int force_dna, int iupac_to_n, int x_is_bad, char *rename, char *rfrom, char *rto, int *ret_reached_eof)
{
  char      *p = NULL;
  esl_pos_t  n = 0;
  esl_pos_t  gslen, seqnamelen, taglen;
  char      *seqname = NULL;
  char      *first_seqname = NULL;
  char      *tag = NULL;
  char      *gs = NULL;
  int        nseq_read = 0;
  int        reached_eof;
  /* variables related to reading accessions */
  char       ac_tmpfile[16] = "esltmpXXXXXX";
  FILE      *ac_fp = NULL; /* file ptr for accession tmpfile */
  char      *ac_buf = NULL;	/* buffer for line input w/ sre_fgets()      */
  int        ac_buflen = 0;	/* current allocated length for buf          */
  char      *ac_s = NULL;	        
  char      *ac_seqname = NULL;
  char      *ac = NULL;
  int        have_ac = FALSE;
  /* variables related to reading descriptions */
  char       de_tmpfile[16] = "esltmpXXXXXX";
  FILE      *de_fp = NULL; /* file ptr for description tmpfile */
  char      *de_buf = NULL;	/* buffer for line input w/ sre_fgets()      */
  int        de_buflen = 0;	/* current allocated length for buf          */
  char      *de_s = NULL;	        
  char      *de_seqname = NULL;
  char      *de = NULL;
  int        have_de = FALSE;
  /* variables related to printing out sequences */
  char      *aseq = NULL;
  esl_pos_t  aseqlen = 0;
  int64_t    apos;
  char       aseqbuf[61];
  int        cpl = 60;	     /* number of residues per afa seq line */
  int        acpl;       /* actual number of character per line */
  int        status;
  
  afp->errmsg[0] = '\0';
   
  /**************************************************************************************************
   * First pass, go through each line of the Pfam file and output all GS DE and AC annotation to tmpfiles
   **************************************************************************************************/

  /* Check the magic Stockholm header line, allowing blank lines */
  do { 
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) return; 
    else if (status != eslOK)  esl_fatal("small mem parse error. problem reading line %d of msafile", (int) afp->linenumber);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||                 /* skip blank lines             */
	       (esl_memstrpfx(afp->line, afp->n, "#")                         /* and skip comment lines       */
	   && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));            /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) 
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if (esl_memstrpfx(p, n, "#=GS"))
	{ /* only lines we need to check are AC and DE lines, we don't even check other lines for validity */
	  if (esl_memtok(&p, &n, " \t", &gs,      &gslen)      != eslOK) esl_fatal("small mem parse failed (line %d) in a way that can't happen",                      (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <seqname>, <tag>, annotation", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &tag,     &taglen)     != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <tag>, annotation",            (int) afp->linenumber);
	  if (! esl_memstrcmp(gs, gslen, "#=GS"))                        esl_fatal("small mem parse failed (line %d): faux #=GS line?",                                (int) afp->linenumber);

	  if (esl_memstrcmp(tag, taglen, "AC"))
	    { 
	      if (! ac_fp && esl_tmpfile(ac_tmpfile, &ac_fp) != eslOK) esl_fatal("small mem parse failed, unable to open accession tmpfile");
	      fprintf(ac_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p);
	    }
	  if (esl_memstrcmp(tag, taglen, "DE"))
	    { 
	      if (! de_fp && esl_tmpfile(de_tmpfile, &de_fp) != eslOK) esl_fatal("small mem parse failed, unable to open description tmpfile");
	      fprintf(de_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p);
	    }
	}
      else if (esl_memstrpfx(p, n, "//")) break;
    }
  if      (status == eslEOF) esl_fatal("small mem parse failed (line %d): missing // terminator", (int) afp->linenumber);
  else if (status != eslOK)  esl_fatal("small mem parse failed (line %d) with code %d", (int) afp->linenumber, status);
  
  /* The regurgitate_*() functions are limited, and only deal with single-record Pfam files. 
   * If there appears to be more data in the file, drop the reached_eof flag.
   */
  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) 
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */
      if    (esl_memstrpfx(p, n, "# STOCKHOLM 1.")) break;
      if    (n && ! esl_memstrpfx(p, n, "#"))       esl_fatal("small mem parse failed (line %d): unexpected data", (int) afp->linenumber);
    }
  if      (status == eslOK)  reached_eof = FALSE;
  else if (status == eslEOF) reached_eof = TRUE;
  else esl_fatal("--small parse error. problem reading line %d of msafile", (int) afp->linenumber);

  /*****************************************************************
   * Pass 1 complete; rewind (close/reopen) all files
   *****************************************************************/

  eslx_msafile_Close(afp);
  if ((status = eslx_msafile_Open(NULL, alifile, NULL, eslMSAFILE_PFAM, NULL, &afp)) != eslOK)
    esl_fatal("--small, second pass, unable to open file %s for reading", alifile);
  
  if (ac_fp) { /* open the tmpfile with the seq accessions */
    rewind(ac_fp);
    if((status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp)) != eslOK) esl_fatal("--small accession tmpfile parse failed");
    ac_s = ac_buf;
    if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
    if (esl_strtok_adv(&ac_s, "\n\r",    &ac,         NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
  }
  if (de_fp) { /* open the tmpfile with the seq descriptions */
    rewind(de_fp);
    if((status = esl_fgets(&(de_buf), &(de_buflen), de_fp)) != eslOK) esl_fatal("--small description tmpfile parse failed");
    de_s = de_buf;
    if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
    if (esl_strtok_adv(&de_s, "\n\r",    &de,         NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
  }

  /******************************************************************************************
   * Pass 2, step through files, outputting appropriately
   ******************************************************************************************/

  do { 
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) return; 
    else if (status != eslOK)  esl_fatal("small mem parse pass 2 error. problem reading line %d of msafile", (int) afp->linenumber);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||                 /* skip blank lines             */
	       (esl_memstrpfx(afp->line, afp->n, "#")                         /* and skip comment lines       */
	   && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));            /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse pass 2 failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) 
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if      (!n || *p == '#')           continue;	    /* skip blank lines, comments */
      else if (esl_memstrpfx(p, n, "//")) break;	    /* end of alignment: end of record */
      else 
	{ /* sequence line. parse line into temporary strings */
	  if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no seq name", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &aseq,    &aseqlen)    != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no aseq",     (int) afp->linenumber);

	  /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */
	  if (nseq_read == 0) { if ((status = esl_memstrdup(seqname, seqnamelen, &(first_seqname))) != eslOK) esl_fatal("small mem parse failed: unable to copy seqname"); }
	  else if (esl_memstrcmp(seqname, seqnamelen, first_seqname)) esl_fatal("--small parse pass 2 failed (line %d): two seqs named %s. Alignment appears to be in interleaved Stockholm (not Pfam) format.", (int) afp->linenumber, seqname); 
	  nseq_read++;

	  /* determine if we have an accession and/or description for this sequence */
	  have_de = have_ac = FALSE;
	  if (ac_seqname && (esl_memstrcmp(seqname, seqnamelen, ac_seqname))) have_ac = TRUE;
	  if (de_seqname && (esl_memstrcmp(seqname, seqnamelen, de_seqname))) have_de = TRUE;

	  if (rename) fprintf(ofp, ">%s.%d%s%s%s%s\n",          rename, nseq_read, (have_ac ? " " : "") , (have_ac ? ac : ""), (have_de ? " " : "") , (have_de ? de : "")); 
	  else        fprintf(ofp, ">%.*s%s%s%s%s\n", (int) seqnamelen, seqname,   (have_ac ? " " : "") , (have_ac ? ac : ""), (have_de ? " " : "") , (have_de ? de : "")); 

	  /* load next ac, de */
	  if (have_ac) {
	    status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp);
	    if      (status == eslEOF) ac_seqname = NULL;
	    else if (status == eslOK) { 
	      ac_s = ac_buf;
	      if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
	      if (esl_strtok_adv(&ac_s, "\n\r",    &ac,         NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
	    }
	  }
	  if (have_de) {
	    status = esl_fgets(&(de_buf), &(de_buflen), de_fp);
	    if(status == eslEOF) de_seqname = NULL;
	    else if (status == eslOK) { 
	      de_s = de_buf;
	      if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
	      if (esl_strtok_adv(&de_s, "\n\r",    &de,         NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
	    }
	  }

	  /* now print sequence, after converting symbols as nec */
	  /* remember, aseq itself is part of an ESL_BUFFER and you
	     can't write to it, so symconverts have to be on the
	     copy */
	  for (apos = 0; apos < aseqlen; apos += cpl)
	    {
	      acpl = (aseqlen - apos > cpl ? cpl : aseqlen - apos);
	      strncpy(aseqbuf, aseq + apos, acpl);
	      aseqbuf[acpl] = '\0';

	      if (rfrom)       symconvert(aseqbuf, rfrom, rto);
	      if (gapsym)      symconvert(aseqbuf, "-_.", gapsym);
	      if (force_lower) symconvert(aseqbuf,
					  "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
					  "abcdefghijklmnopqrstuvwxyz");
	      if (force_upper) symconvert(aseqbuf,
					  "abcdefghijklmnopqrstuvwxyz",
					  "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
	      if (force_rna)   symconvert(aseqbuf, "Tt", "Uu");
	      if (force_dna)   symconvert(aseqbuf, "Uu", "Tt");
	      if (iupac_to_n)  symconvert(aseqbuf, 
					  "RYMKSWHBVDrymkswhbvd",
					  "NNNNNNNNNNnnnnnnnnnn");
	      if (x_is_bad)    symconvert(aseqbuf,   "Xx", "Nn");

	      fprintf(ofp, "%s\n", aseqbuf);	      
	    }
	}
    }
  /* If we saw a normal // end, we would've successfully read a line,
   * so when we get here, status (from the line read) should be eslOK.
   */ 
  if (status != eslOK) esl_fatal("--small parse pass 2 failed (line %d): didn't find // at end of alignment", (int) afp->linenumber);
  if (ac_seqname)      esl_fatal("--small parse pass 2 failed, sequence %s with #=GS AC line does not exist in alignment or is in different order.", ac_seqname);
  if (de_seqname)      esl_fatal("--small parse pass 2 failed, sequence %s with #=GS DE line does not exist in alignment or is in different order.", de_seqname);

  if (ac_fp) fclose(ac_fp);
  if (de_fp) fclose(de_fp);
  eslx_msafile_Close(afp);

  if (first_seqname) free(first_seqname);
  if (ac_buf)        free(ac_buf);
  if (de_buf)        free(de_buf);

  *ret_reached_eof = reached_eof;
  return;
}
/* Function:  main()
 * Synopsis:  break input sequence set into chunks, for each one building the
 *            Burrows-Wheeler transform and corresponding FM-index. Maintain requisite
 *            meta data.
 * Notes:     Currently depends on the divsufsort-lite code of Yuta Mori, though this
 *            could easily be replaced.
 */
int
main(int argc, char **argv) 
{
  char tmp_filename[16] = "fmtmpXXXXXX";
  FILE *fptmp          = NULL;
  FILE *fp             = NULL;
  uint8_t *T           = NULL;
  uint8_t *BWT         = NULL;
  int *SA              = NULL; //what I write will be 32-bit ints, but I need to keep this as int so it'll work with libdivsufsort
  uint32_t *SAsamp     = NULL;
  uint32_t *occCnts_sb = NULL; // same indexing as above
  uint32_t *cnts_sb    = NULL;
  uint16_t *occCnts_b  = NULL; // this is logically a 2D array, but will be indexed as occ_cnts[alph_size*index + char]  (instead of occ_cnts[index][char])
  uint16_t *cnts_b     = NULL;
  FM_METADATA *meta    = NULL;

  clock_t t1, t2;
  struct tms ts1, ts2;

  long i,j,c;
  int status = eslOK;

  int chars_per_byte;
  int num_freq_cnts_sb ;
  int num_freq_cnts_b ;
  int num_SA_samples ;

  int             infmt     = eslSQFILE_UNKNOWN;
  int             alphatype = eslUNKNOWN;
  int             alphaguess =eslUNKNOWN;
  ESL_ALPHABET   *abc       = NULL;
  ESL_SQ         *sq        = NULL;
  ESL_SQFILE     *sqfp      = NULL;

  ESL_SQ       *tmpsq = NULL;
  ESL_SQ_BLOCK *block = NULL;

  char *fname_in = NULL;
  char *fname_out= NULL;
  int block_size = 50000000;
  int sq_cnt = 0;
  int use_tmpsq = 0;
  uint64_t block_length;
  uint64_t total_char_count = 0;

  int max_block_size;

  int numblocks = 0;
  uint32_t numseqs = 0;


  int allocedseqs = 1000;
  uint32_t seq_offset = 0;
  uint32_t ambig_offset = 0;
  uint32_t overlap = 0;
  uint16_t seq_cnt;
  uint16_t ambig_cnt;

  uint32_t prev_numseqs = 0;

  int compressed_bytes;
  uint32_t term_loc;

  ESL_GETOPTS     *go  = NULL;    /* command line processing                 */

  uint8_t        ambig_repl = 0;
  int            in_ambig_run = 0;
  FM_AMBIGLIST   ambig_list;

  ESL_ALLOC (meta, sizeof(FM_METADATA));
  if (meta == NULL)
    esl_fatal("unable to allocate memory to store FM meta data\n");

  ESL_ALLOC (meta->ambig_list, sizeof(FM_AMBIGLIST));
  if (meta->ambig_list == NULL)
      esl_fatal("unable to allocate memory to store FM ambiguity data\n");
  fm_initAmbiguityList(meta->ambig_list);


  meta->alph_type   = fm_DNA;
  meta->freq_SA     = 8;
  meta->freq_cnt_b  = 256;
  meta->freq_cnt_sb = pow(2,16); //65536 - that's the # values in a short
  meta->seq_count = 0;
  ESL_ALLOC (meta->seq_data, allocedseqs * sizeof(FM_SEQDATA));
  if (meta->seq_data == NULL )
    esl_fatal("unable to allocate memory to store FM sequence data\n");


  process_commandline(argc, argv, &go, &fname_in, &fname_out);

  if (esl_opt_IsOn(go, "--bin_length")) meta->freq_cnt_b = esl_opt_GetInteger(go, "--bin_length");
  if ( meta->freq_cnt_b < 32 || meta->freq_cnt_b >4096 ||  (meta->freq_cnt_b & (meta->freq_cnt_b - 1))  ) // test power of 2
    esl_fatal("bin_length must be a power of 2, at least 128, and at most 4096\n");

  if (esl_opt_IsOn(go, "--sa_freq")) meta->freq_SA = esl_opt_GetInteger(go, "--sa_freq");
  if ( (meta->freq_SA & (meta->freq_SA - 1))  )  // test power of 2
    esl_fatal ("SA_freq must be a power of 2\n");


  if (esl_opt_IsOn(go, "--block_size")) block_size = 1000000 * esl_opt_GetInteger(go, "--block_size");
  if ( block_size <=0  )
    esl_fatal ("block_size must be a positive number\n");

  //start timer
  t1 = times(&ts1);

  output_header(stdout, go, fname_in, fname_out);

  if (esl_opt_GetString(go, "--informat") != NULL) {
    infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
    if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat");
  }

  status = esl_sqfile_Open(fname_in, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", fname_in);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", fname_in);
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  meta->fwd_only = 0;

  if (esl_opt_IsUsed(go, "--alph")) {
    meta->alph    = esl_opt_GetString(go, "--alph") ;
    if ( esl_strcmp(meta->alph, "dna")==0  ||  esl_strcmp(meta->alph, "rna")==0) {
      meta->alph_type = fm_DNA;
      alphatype = eslDNA;
    } else if (esl_strcmp(meta->alph, "dna_full")==0  || esl_strcmp(meta->alph, "rna_full")==0) {
      meta->alph_type = fm_DNA_full;
      alphatype = eslDNA;
    } else if (esl_strcmp(meta->alph, "amino")==0) {
      meta->alph_type = fm_AMINO;
      alphatype = eslAMINO;
      meta->fwd_only = 1;
    } else {
      esl_fatal("Unknown alphabet type. Try 'dna', 'dna_full', or 'amino'\n%s", "");
    }
  } else {
    esl_sqfile_GuessAlphabet(sqfp, &alphaguess);

    if (alphaguess == eslDNA || alphaguess == eslRNA) {
      meta->alph_type = fm_DNA;
      alphatype = eslDNA;
    } else if (alphaguess == eslAMINO) {
      meta->alph_type = fm_AMINO;
      alphatype = eslAMINO;
      meta->fwd_only = 1;
    } else {
      esl_fatal("Unknown alphabet type. Try 'dna', 'dna_full', or 'amino'\n%s", "");
    }
  }


  if (esl_opt_IsOn(go, "--fwd_only") )
    meta->fwd_only = 1;

  meta->alph = NULL;



  //getInverseAlphabet
  fm_alphabetCreate(meta, &(meta->charBits));
  chars_per_byte = 8/meta->charBits;

    //shift inv_alph up one, to make space for '$' at 0
  for (i=0; i<256; i++)
    if ( meta->inv_alph[i] >= 0)
      meta->inv_alph[i]++;


  abc     = esl_alphabet_Create(alphatype);
  sq      = esl_sq_CreateDigital(abc);
  tmpsq   =  esl_sq_CreateDigital(abc);

  esl_sqfile_SetDigital(sqfp, abc);
  block = esl_sq_CreateDigitalBlock(FM_BLOCK_COUNT, abc);
  block->complete = FALSE;
//  max_block_size = FM_BLOCK_OVERLAP+block_size+1  + block_size*.2; // +1 for the '$'
  max_block_size = FM_BLOCK_OVERLAP+block_size+1  + block_size; // temporary hack to avoid memory over-runs (see end of 1101_fmindex_benchmarking/00NOTES)

  if (alphatype == fm_DNA)
    fm_initAmbiguityList(&ambig_list);


  /* Allocate BWT, Text, SA, and FM-index data structures, allowing storage of maximally large sequence*/
  ESL_ALLOC (T, max_block_size * sizeof(uint8_t));
  ESL_ALLOC (BWT, max_block_size * sizeof(uint8_t));
  ESL_ALLOC (SA, max_block_size * sizeof(int));
  ESL_ALLOC (SAsamp,     (1+floor((double)max_block_size/meta->freq_SA) ) * sizeof(uint32_t));

  ESL_ALLOC (occCnts_sb, (1+ceil((double)max_block_size/meta->freq_cnt_sb)) *  meta->alph_size * sizeof(uint32_t)); // every freq_cnt_sb positions, store an array of ints
  ESL_ALLOC (cnts_sb,    meta->alph_size * sizeof(uint32_t));
  ESL_ALLOC (occCnts_b,  ( 1+ceil((double)max_block_size/meta->freq_cnt_b)) *  meta->alph_size * sizeof(uint16_t)); // every freq_cnt_b positions, store an array of 8-byte ints
  ESL_ALLOC (cnts_b,     meta->alph_size * sizeof(uint16_t));

  if((T == NULL)  || (BWT == NULL)  || (SA==NULL) || (SAsamp==NULL) || (BWT==NULL) || (cnts_b==NULL) || (occCnts_b==NULL) || (cnts_sb==NULL) || (occCnts_sb==NULL) ) {
    esl_fatal( "%s: Cannot allocate memory.\n", argv[0]);
  }


  // Open a temporary file, to which FM-index data will be written
  if (esl_tmpfile(tmp_filename, &fptmp) != eslOK) esl_fatal("unable to open fm-index tmpfile");


  /* Main loop: */
  while (status == eslOK ) {

    //reset block as an empty vessel
    for (i=0; i<block->count; i++)
        esl_sq_Reuse(block->list + i);

    if (use_tmpsq) {
        esl_sq_Copy(tmpsq , block->list);
        block->complete = FALSE;  //this lets ReadBlock know that it needs to append to a small bit of previously-read seqeunce
        block->list->C = FM_BLOCK_OVERLAP; // overload the ->C value, which ReadBlock uses to determine how much
                                               // overlap should be retained in the ReadWindow step
    } else {
        block->complete = TRUE;
    }

    status = esl_sqio_ReadBlock(sqfp, block, block_size, -1, alphatype != eslAMINO);
    if (status == eslEOF) continue;
    if (status != eslOK)  ESL_XEXCEPTION(status, "failure reading sequence block");

    seq_offset = numseqs;
    ambig_offset = meta->ambig_list->count;

    if (block->complete || block->count == 0) {
        use_tmpsq = FALSE;
    } else {
        /* The final sequence on the block was a probably-incomplete window of the active sequence.
         * Grab a copy of the end for use in the next pass, to ensure we don't miss hits crossing
         * the boundary between two blocks.
         */
        esl_sq_Copy(block->list + (block->count - 1) , tmpsq);
        use_tmpsq = TRUE;
    }

    block->first_seqidx = sq_cnt;
    sq_cnt += block->count - (use_tmpsq ? 1 : 0);// if there's an incomplete sequence read into the block wait to count it until it's complete.


    /* Read dseqs from block into text element T.
    *  Convert the dsq from esl-alphabet to fm-alphabet (1..k for alphabet of size k).
    *  (a) collapsing upper/lower case for appropriate sorting.
    *  (b) reserving 0 for '$', which must be lexicographically smallest
    *      (these will later be shifted to 0-based alphabet, once SA has been built)
    *
    */
    block_length = 0;
    for (i=0; i<block->count; i++) {

      //start a new block, with space for the name
      allocateSeqdata(meta, block->list+i, numseqs, &allocedseqs);

      //meta data
      meta->seq_data[numseqs].target_id       = block->first_seqidx + i ;
      meta->seq_data[numseqs].target_start    = block->list[i].start;
      meta->seq_data[numseqs].fm_start        = block_length;

      if (block->list[i].name == NULL) meta->seq_data[numseqs].name[0] = '\0';
          else  strcpy(meta->seq_data[numseqs].name, block->list[i].name );
      if (block->list[i].acc == NULL) meta->seq_data[numseqs].acc[0] = '\0';
          else  strcpy(meta->seq_data[numseqs].acc, block->list[i].acc );
      if (block->list[i].source == NULL) meta->seq_data[numseqs].source[0] = '\0';
          else  strcpy(meta->seq_data[numseqs].source, block->list[i].source );
      if (block->list[i].desc == NULL) meta->seq_data[numseqs].desc[0] = '\0';
          else  strcpy(meta->seq_data[numseqs].desc, block->list[i].desc );

      for (j=1; j<=block->list[i].n; j++) {
        c = abc->sym[block->list[i].dsq[j]];
        if ( meta->alph_type == fm_DNA) {
          if (meta->inv_alph[c] == -1) {
            // replace ambiguity characters by rotating through A,C,G, and T.
            c = meta->alph[ambig_repl];
            ambig_repl = (ambig_repl+1)%4;

            if (!in_ambig_run) {
              fm_addAmbiguityRange(meta->ambig_list, block_length, block_length);
              in_ambig_run=1;
            } else {
              meta->ambig_list->ranges[meta->ambig_list->count - 1].upper = block_length;
            }
          } else {
            in_ambig_run=0;
          }
        } else if (meta->inv_alph[c] == -1) {
          esl_fatal("requested alphabet doesn't match input text\n");
        }

        T[block_length] = meta->inv_alph[c];

        block_length++;
        if (j>block->list[i].C) total_char_count++; // add to total count, only if it's not redundant with earlier read
        meta->seq_data[numseqs].length++;

      }
      numseqs++;
    }

    T[block_length] = 0; // last character 0 is effectively '$' for suffix array
    block_length++;

    seq_cnt = numseqs-seq_offset;
    ambig_cnt = meta->ambig_list->count - ambig_offset;

    //build and write FM-index for T.  This will be a BWT on the reverse of the sequence, required for reverse-traversal of the BWT
    buildAndWriteFMIndex(meta, seq_offset, ambig_offset, seq_cnt, ambig_cnt, (uint32_t)block->list[0].C, T, BWT, SA, SAsamp,
        occCnts_sb, cnts_sb, occCnts_b, cnts_b, block_length, fptmp);


    if ( ! meta->fwd_only ) {
      //build and write FM-index for un-reversed T  (used to find reverse hits using forward traversal of the BWT
      buildAndWriteFMIndex(meta, seq_offset, ambig_offset, seq_cnt, ambig_cnt, 0, T, BWT, SA, NULL,
          occCnts_sb, cnts_sb, occCnts_b, cnts_b, block_length, fptmp);
    }

    prev_numseqs = numseqs;

    numblocks++;
  }


  esl_sqfile_Close(sqfp);
  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  esl_sq_Destroy(tmpsq);
  esl_sq_DestroyBlock(block);

  meta->seq_count = numseqs;
  meta->block_count = numblocks;



    /* Finished writing the FM-index data to a temporary file. Now write
     * metadata to fname_out, than append FM-index data from temp file
     */
  if((fp = fopen(fname_out, "wb")) == NULL)
    esl_fatal( "%s: Cannot open file `%s': ", argv[0], fname_out);


    //write out meta data
  if( fwrite(&(meta->fwd_only),     sizeof(meta->fwd_only),     1, fp) != 1 ||
      fwrite(&(meta->alph_type),    sizeof(meta->alph_type),    1, fp) != 1 ||
      fwrite(&(meta->alph_size),    sizeof(meta->alph_size),    1, fp) != 1 ||
      fwrite(&(meta->charBits),     sizeof(meta->charBits),     1, fp) != 1 ||
      fwrite(&(meta->freq_SA),      sizeof(meta->freq_SA),      1, fp) != 1 ||
      fwrite(&(meta->freq_cnt_sb),  sizeof(meta->freq_cnt_sb),  1, fp) != 1 ||
      fwrite(&(meta->freq_cnt_b),   sizeof(meta->freq_cnt_b),   1, fp) != 1 ||
      fwrite(&(meta->block_count),  sizeof(meta->block_count),  1, fp) != 1 ||
      fwrite(&(meta->seq_count),    sizeof(meta->seq_count),    1, fp) != 1 ||
      fwrite(&(meta->ambig_list->count),  sizeof(meta->ambig_list->count),    1, fp) != 1 ||
      fwrite(&total_char_count,     sizeof(total_char_count),   1, fp) != 1
  )
    esl_fatal( "%s: Error writing meta data for FM index.\n", argv[0]);


  for (i=0; i<meta->seq_count; i++) {
    if( fwrite(&(meta->seq_data[i].target_id),    sizeof(meta->seq_data[i].target_id),          1, fp) != 1 ||
        fwrite(&(meta->seq_data[i].target_start), sizeof(meta->seq_data[i].target_start),       1, fp) != 1 ||
        fwrite(&(meta->seq_data[i].fm_start),     sizeof(meta->seq_data[i].fm_start),  1, fp) != 1 ||
        fwrite(&(meta->seq_data[i].length),       sizeof(meta->seq_data[i].length), 1, fp) != 1 ||
        fwrite(&(meta->seq_data[i].name_length),  sizeof(meta->seq_data[i].name_length), 1, fp) != 1 ||
        fwrite(&(meta->seq_data[i].acc_length),   sizeof(meta->seq_data[i].acc_length), 1, fp) != 1 ||
        fwrite(&(meta->seq_data[i].source_length),sizeof(meta->seq_data[i].source_length), 1, fp) != 1 ||
        fwrite(&(meta->seq_data[i].desc_length),  sizeof(meta->seq_data[i].desc_length), 1, fp) != 1 ||
        fwrite(meta->seq_data[i].name,            sizeof(char),    meta->seq_data[i].name_length+1  , fp) !=  meta->seq_data[i].name_length+1 ||
        fwrite(meta->seq_data[i].acc,             sizeof(char),    meta->seq_data[i].acc_length+1   , fp) !=  meta->seq_data[i].acc_length+1 ||
        fwrite(meta->seq_data[i].source,          sizeof(char),    meta->seq_data[i].source_length+1, fp) !=  meta->seq_data[i].source_length+1 ||
        fwrite(meta->seq_data[i].desc,            sizeof(char),    meta->seq_data[i].desc_length+1  , fp) !=  meta->seq_data[i].desc_length+1
    )
      esl_fatal( "%s: Error writing meta data for FM index.\n", argv[0]);
  }

  for (i=0; i<meta->ambig_list->count; i++) {
    if( fwrite(&(meta->ambig_list->ranges[i].lower), sizeof(meta->ambig_list->ranges[i].lower),       1, fp) != 1 ||
        fwrite(&(meta->ambig_list->ranges[i].upper), sizeof(meta->ambig_list->ranges[i].upper),       1, fp) != 1
    )
      esl_fatal( "%s: Error writing ambiguity data for FM index.\n", argv[0]);
  }


  /* now append the FM-index data in fptmp to the desired output file, fp */
  rewind(fptmp);
  for (i=0; i<numblocks; i++) {

    for(j=0; j< (meta->fwd_only?1:2); j++ ) { //do this once or twice, once for forward-T index, and possibly once for reversed
    //first, read
    if(fread(&block_length, sizeof(block_length), 1, fptmp) !=  1)
      esl_fatal( "%s: Error reading block_length in FM index.\n", argv[0]);
    if(fread(&term_loc, sizeof(term_loc), 1, fptmp) !=  1)
      esl_fatal( "%s: Error reading terminal location in FM index.\n", argv[0]);
    if(fread(&seq_offset, sizeof(seq_offset), 1, fptmp) !=  1)
      esl_fatal( "%s: Error reading seq_offset in FM index.\n", argv[0]);
    if(fread(&ambig_offset, sizeof(ambig_offset ), 1, fptmp) !=  1)
      esl_fatal( "%s: Error reading ambig_offset in FM index.\n", argv[0]);
    if(fread(&overlap, sizeof(overlap), 1, fptmp) !=  1)
      esl_fatal( "%s: Error reading overlap in FM index.\n", argv[0]);
    if(fread(&seq_cnt, sizeof(seq_cnt), 1, fptmp) !=  1)
      esl_fatal( "%s: Error reading seq_cnt in FM index.\n", argv[0]);
    if(fread(&ambig_cnt, sizeof(ambig_cnt), 1, fptmp) !=  1)
      esl_fatal( "%s: Error reading ambig_cnt in FM index.\n", argv[0]);


    compressed_bytes =   ((chars_per_byte-1+block_length)/chars_per_byte);
    num_freq_cnts_b  = 1+ceil((double)block_length/meta->freq_cnt_b);
    num_freq_cnts_sb = 1+ceil((double)block_length/meta->freq_cnt_sb);
    num_SA_samples   = 1+floor((double)block_length/meta->freq_SA);


    //j==0 test cause T and SA to be written only for forward sequence
    if(j==0 && fread(T, sizeof(uint8_t), compressed_bytes, fptmp) != compressed_bytes)
      esl_fatal( "%s: Error reading T in FM index.\n", argv[0]);
    if(fread(BWT, sizeof(uint8_t), compressed_bytes, fptmp) != compressed_bytes)
      esl_fatal( "%s: Error reading BWT in FM index.\n", argv[0]);
    if(j==0 && fread(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fptmp) != (size_t)num_SA_samples)
      esl_fatal( "%s: Error reading SA in FM index.\n", argv[0]);
    if(fread(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fptmp) != (size_t)num_freq_cnts_b)
      esl_fatal( "%s: Error reading occCnts_b in FM index.\n", argv[0]);
    if(fread(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fptmp) != (size_t)num_freq_cnts_sb)
      esl_fatal( "%s: Error reading occCnts_sb in FM index.\n", argv[0]);



    //then, write
    if(fwrite(&block_length, sizeof(block_length), 1, fp) !=  1)
      esl_fatal( "%s: Error writing block_length in FM index.\n", argv[0]);
    if(fwrite(&term_loc, sizeof(term_loc), 1, fp) !=  1)
      esl_fatal( "%s: Error writing terminal location in FM index.\n", argv[0]);
    if(fwrite(&seq_offset, sizeof(seq_offset), 1, fp) !=  1)
      esl_fatal( "%s: Error writing seq_offset in FM index.\n", argv[0]);
    if(fwrite(&ambig_offset, sizeof(ambig_offset), 1, fp) !=  1)
      esl_fatal( "%s: Error writing ambig_offset in FM index.\n", argv[0]);
    if(fwrite(&overlap, sizeof(overlap), 1, fp) !=  1)
      esl_fatal( "%s: Error writing overlap in FM index.\n", argv[0]);
    if(fwrite(&seq_cnt, sizeof(seq_cnt), 1, fp) !=  1)
      esl_fatal( "%s: Error writing seq_cnt in FM index.\n", argv[0]);
    if(fwrite(&ambig_cnt, sizeof(ambig_cnt), 1, fp) !=  1)
      esl_fatal( "%s: Error writing ambig_cnt in FM index.\n", argv[0]);


    if(j==0 && fwrite(T, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes)
      esl_fatal( "%s: Error writing T in FM index.\n", argv[0]);
    if(fwrite(BWT, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes)
      esl_fatal( "%s: Error writing BWT in FM index.\n", argv[0]);
    if(j==0 && fwrite(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fp) != (size_t)num_SA_samples)
      esl_fatal( "%s: Error writing SA in FM index.\n", argv[0]);
    if(fwrite(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fp) != (size_t)num_freq_cnts_b)
      esl_fatal( "%s: Error writing occCnts_b in FM index.\n", argv[0]);
    if(fwrite(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fp) != (size_t)num_freq_cnts_sb)
      esl_fatal( "%s: Error writing occCnts_sb in FM index.\n", argv[0]);

    }
  }


  fprintf (stderr, "Number of characters in index:  %ld\n", (long)total_char_count);
  fprintf (stderr, "Number of FM-index blocks:      %ld\n", (long)meta->block_count);


  fclose(fp);
  fclose(fptmp);
  free(T);
  free(BWT);
  free(SA);
  free(SAsamp);
  free(occCnts_b);
  free(cnts_b);
  free(occCnts_sb);
  free(cnts_sb);

  fm_metaDestroy(meta);

  esl_getopts_Destroy(go);


  // compute and print the elapsed time in millisec
  t2 = times(&ts2);
  {
    double clk_ticks = sysconf(_SC_CLK_TCK);
    double elapsedTime = (t2-t1)/clk_ticks;

    fprintf (stderr, "run time:  %.2f seconds\n", elapsedTime);
  }


  return (eslOK);


ERROR:
  /* Deallocate memory. */
  if (fp)         fclose(fp);
  if (T)          free(T);
  if (BWT)        free(BWT);
  if (SA)         free(SA);
  if (SAsamp)     free(SAsamp);
  if (occCnts_b)  free(occCnts_b);
  if (cnts_b)     free(cnts_b);
  if (occCnts_sb) free(occCnts_sb);
  if (cnts_sb)    free(cnts_sb);
  if (ambig_list.ranges) free(ambig_list.ranges);

  fm_metaDestroy(meta);
  esl_getopts_Destroy(go);


  esl_sqfile_Close(sqfp);
  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  if (tmpsq) esl_sq_Destroy(tmpsq);
  if (block) esl_sq_DestroyBlock(block);

  fprintf (stderr, "failure during memory allocation\n");

  exit(status);

}