// Read the next batch of strings from  stream  and create a hash
//  table index of their  Kmer_Len -mers.  Return  1  if successful;
//  0 otherwise (e.g., when no strings were loaded).  The batch ends
//  when end-of-file is encountered, or when  Max_Hash_Strings  strings,
//  Max_Hash_Data_Len  bytes of string data, or the hash table load
//  limit ( Max_Hash_Load ) have been reached.   first_frag_id  is the
//  internal ID of the first fragment in the hash table.
int
Build_Hash_Index(gkStream *stream, int32 first_frag_id, gkFragment *myRead) {
  String_Ref_t  ref;
  uint64  total_len;
  static int64  max_extra_ref_ct = 0;
  static int64  old_ref_len, new_ref_len;
  int  frag_status;
  uint64   hash_entry_limit;

  Hash_String_Num_Offset = first_frag_id;
  String_Ct = Extra_String_Ct = 0;
  Extra_String_Subcount = MAX_EXTRA_SUBCOUNT;
  total_len = 0;
  if  (Data == NULL) {
    Extra_Data_Len = Data_Len = Max_Hash_Data_Len + AS_READ_MAX_NORMAL_LEN;
    Data = (char *) safe_realloc (Data, Data_Len);
    Quality_Data = (char *) safe_realloc (Quality_Data, Data_Len);
    old_ref_len = Data_Len / (HASH_KMER_SKIP + 1);
    Next_Ref = (String_Ref_t *) safe_realloc (Next_Ref, old_ref_len * sizeof (String_Ref_t));
  }

  memset (Next_Ref, '\377', old_ref_len * sizeof (String_Ref_t));
  memset (Hash_Table, 0, HASH_TABLE_SIZE * sizeof (Hash_Bucket_t));
  memset (Hash_Check_Array, 0, HASH_TABLE_SIZE * sizeof (Check_Vector_t));

  Extra_Ref_Ct = 0;
  Hash_Entries = 0;
  hash_entry_limit = Max_Hash_Load * HASH_TABLE_SIZE * ENTRIES_PER_BUCKET;

#if 0
  fprintf(stderr, "HASH LOADING STARTED: fragID   %12"F_U64P"\n", first_frag_id);
  fprintf(stderr, "HASH LOADING STARTED: strings  %12"F_U64P" out of %12"F_U64P" max.\n", String_Ct, Max_Hash_Strings);
  fprintf(stderr, "HASH LOADING STARTED: length   %12"F_U64P" out of %12"F_U64P" max.\n", total_len, Max_Hash_Data_Len);
  fprintf(stderr, "HASH LOADING STARTED: entries  %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit,
          (100.0 * Hash_Entries) / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
#endif

  while  (String_Ct < Max_Hash_Strings
          && total_len < Max_Hash_Data_Len
          && Hash_Entries < hash_entry_limit
          && (frag_status
              = Read_Next_Frag (Sequence_Buffer, Quality_Buffer, stream,
                                myRead, & Last_Hash_Frag_Read, minLibToHash, maxLibToHash))) {
    int  extra, len;
    size_t  new_len;

    if  (frag_status == DELETED_FRAG) {
      Sequence_Buffer [0] = '\0';
      Quality_Buffer [0] = '\0';
    }

    String_Start [String_Ct] = total_len;
    len = strlen (Sequence_Buffer);
    String_Info [String_Ct] . length = len;
    String_Info [String_Ct] . lfrag_end_screened = FALSE;
    String_Info [String_Ct] . rfrag_end_screened = FALSE;
    new_len = total_len + len + 1;
    extra = new_len % (HASH_KMER_SKIP + 1);
    if  (extra > 0)
      new_len += 1 + HASH_KMER_SKIP - extra;

    if  (new_len > Data_Len) {
      Data_Len = (size_t) (Data_Len * MEMORY_EXPANSION_FACTOR);
      if  (new_len > Data_Len)
        Data_Len = new_len;
      if  (Data_Len > Extra_Data_Len) {
        Data = (char *) safe_realloc (Data, Data_Len);
        Extra_Data_Len = Data_Len;
      }
      Quality_Data = (char *) safe_realloc (Quality_Data, Data_Len);
      new_ref_len = Data_Len / (HASH_KMER_SKIP + 1);
      Next_Ref = (String_Ref_t *) safe_realloc
        (Next_Ref, new_ref_len * sizeof (String_Ref_t));
      memset (Next_Ref + old_ref_len, '\377',
              (new_ref_len - old_ref_len) * sizeof (String_Ref_t));
      old_ref_len = new_ref_len;
    }

    strcpy (Data + total_len, Sequence_Buffer);
    memcpy (Quality_Data + total_len, Quality_Buffer, len + 1);
    total_len = new_len;

    Put_String_In_Hash (String_Ct);

    String_Ct ++;

    if ((String_Ct % 100000) == 0)
      fprintf (stderr, "String_Ct:%12"F_U64P"/%12"F_U32P"  totalLen:%12"F_U64P"/%12"F_U64P"  Hash_Entries:%12"F_U64P"/%12"F_U64P"  Load: %.2f%%\n",
               String_Ct,    Max_Hash_Strings,
               total_len,    Max_Hash_Data_Len,
               Hash_Entries,
               hash_entry_limit,
               100.0 * Hash_Entries / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
  }

  if  (String_Ct == 0)
    return  0;

  fprintf(stderr, "HASH LOADING STOPPED: strings  %12"F_U64P" out of %12"F_U32P" max.\n", String_Ct, Max_Hash_Strings);
  fprintf(stderr, "HASH LOADING STOPPED: length   %12"F_U64P" out of %12"F_U64P" max.\n", total_len, Max_Hash_Data_Len);
  fprintf(stderr, "HASH LOADING STOPPED: entries  %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit,
          100.0 * Hash_Entries / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));

  Used_Data_Len = total_len;

  if  (Extra_Ref_Ct > max_extra_ref_ct) {
    max_extra_ref_ct *= MEMORY_EXPANSION_FACTOR;
    if  (Extra_Ref_Ct > max_extra_ref_ct)
      max_extra_ref_ct = Extra_Ref_Ct;
    fprintf (stderr,
             "### realloc  Extra_Ref_Space  max_extra_ref_ct = " F_S64 "\n",
             max_extra_ref_ct);
    Extra_Ref_Space = (String_Ref_t *) safe_realloc (Extra_Ref_Space,
                                                     max_extra_ref_ct * sizeof (String_Ref_t));
  }

  if  (Kmer_Skip_File != NULL)
    Mark_Skip_Kmers ();

  // Coalesce reference chain into adjacent entries in  Extra_Ref_Space
  Extra_Ref_Ct = 0;
  for  (int32 i = 0;  i < HASH_TABLE_SIZE;  i ++)
    for  (int32 j = 0;  j < Hash_Table [i] . Entry_Ct;  j ++) {
      ref = Hash_Table [i] . Entry [j];
      if  (! getStringRefLast(ref) && ! getStringRefEmpty(ref)) {
        Extra_Ref_Space [Extra_Ref_Ct] = ref;
        setStringRefStringNum(Hash_Table [i] . Entry [j], (String_Ref_t)(Extra_Ref_Ct >> OFFSET_BITS));
        setStringRefOffset   (Hash_Table [i] . Entry [j], (String_Ref_t)(Extra_Ref_Ct & OFFSET_MASK));
        Extra_Ref_Ct ++;
        do {
          ref = Next_Ref [(String_Start [getStringRefStringNum(ref)] + getStringRefOffset(ref)) / (HASH_KMER_SKIP + 1)];
          Extra_Ref_Space [Extra_Ref_Ct ++] = ref;
        }  while  (! getStringRefLast(ref));
      }
    }
Example #2
0
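// Load the next batch of reads, with IDs from  bgnID  through  endID
//  (inclusive), from  gkpStore  and create a hash table index of their
//  G.Kmer_Len -mers.  The batch ends when  endID  has been processed, or
//  when  G.Max_Hash_Strings  strings,  G.Max_Hash_Data_Len  bytes of
//  string data, or the hash table load limit ( G.Max_Hash_Load ) have
//  been reached.   bgnID  is the internal ID of the first fragment in
//  the hash table.  If no reads are processed ( String_Ct  stays 0),
//  endID  is returned.
//  NOTE(review): an earlier version of this comment promised a return of
//  1 on success and 0 otherwise; that does not match the  endID  return
//  described above -- confirm the return convention against the callers.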
int
Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
  String_Ref_t  ref;
  uint64  total_len;
  uint64   hash_entry_limit;

  fprintf(stderr, "Build_Hash_Index from "F_U32" to "F_U32"\n", bgnID, endID);

  Hash_String_Num_Offset = bgnID;
  String_Ct              = 0;
  Extra_String_Ct        = 0;
  Extra_String_Subcount  = MAX_EXTRA_SUBCOUNT;
  total_len              = 0;

  //if (Data == NULL) {
  //  Extra_Data_Len    = Max_Hash_Data_Len + AS_MAX_READLEN;
  //  Data_Len          = Max_Hash_Data_Len + AS_MAX_READLEN;
  //
  //  basesData         = new char [Data_Len];
  //  qualsData         = new char [Data_Len];
  //
  //  old_ref_len       = Data_Len / (HASH_KMER_SKIP + 1);
  //  nextRef           = new String_Ref_t [old_ref_len];
  //}

  //memset(nextRef,         0xff, old_ref_len     * sizeof(String_Ref_t));

  memset(Hash_Table,       0x00, HASH_TABLE_SIZE * sizeof(Hash_Bucket_t));
  memset(Hash_Check_Array, 0x00, HASH_TABLE_SIZE * sizeof(Check_Vector_t));

  Extra_Ref_Ct     = 0;
  Hash_Entries     = 0;
  hash_entry_limit = G.Max_Hash_Load * HASH_TABLE_SIZE * ENTRIES_PER_BUCKET;

#if 0
  fprintf(stderr, "HASH LOADING STARTED: fragID   %12"F_U64P"\n", first_frag_id);
  fprintf(stderr, "HASH LOADING STARTED: strings  %12"F_U64P" out of %12"F_U64P" max.\n", String_Ct, G.Max_Hash_Strings);
  fprintf(stderr, "HASH LOADING STARTED: length   %12"F_U64P" out of %12"F_U64P" max.\n", total_len, G.Max_Hash_Data_Len);
  fprintf(stderr, "HASH LOADING STARTED: entries  %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit,
         (100.0 * Hash_Entries) / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
#endif

  //  Compute an upper limit on the number of bases we will load.  The number of Hash_Entries
  //  can't be computed here, so the real loop below could end earlier than expected - and we
  //  don't use a little bit of memory.

  uint32  nSkipped  = 0;
  uint32  nShort    = 0;
  uint32  nLoadable = 0;

  uint64  maxAlloc = 0;
  uint32  curID    = 0;  //  The last ID loaded into the hash

  for (curID=bgnID; ((String_Ct <  G.Max_Hash_Strings) &&
                     (total_len <  G.Max_Hash_Data_Len) &&
                     (curID     <= endID)); curID++) {
    gkRead *read = gkpStore->gkStore_getRead(curID);

    if ((read->gkRead_libraryID() < G.minLibToHash) ||
        (read->gkRead_libraryID() > G.maxLibToHash)) {
      nSkipped++;
      continue;
    }

    if (read->gkRead_sequenceLength() < G.Min_Olap_Len) {
      nShort++;
      continue;
    }

    nLoadable++;

    maxAlloc += read->gkRead_sequenceLength() + 1;
  }

  fprintf(stderr, "Found "F_U32" reads with length "F_U64" to load; "F_U32" skipped by being too short; "F_U32" skipped per library restriction\n",
          nLoadable, maxAlloc, nShort, nSkipped);

  //  This should be less than what the user requested on the command line

  if (maxAlloc >= G.Max_Hash_Data_Len + AS_MAX_READLEN)
    fprintf(stderr, "maxAlloc = "F_U64" G.Max_Hash_Data_Len = "F_U64"  AS_MAX_READLEN = %u\n", maxAlloc, G.Max_Hash_Data_Len, AS_MAX_READLEN);
  assert(maxAlloc < G.Max_Hash_Data_Len + AS_MAX_READLEN);

  //  Allocate space, then fill it.

  uint64 nextRef_Len = maxAlloc / (HASH_KMER_SKIP + 1);
  Extra_Data_Len = Data_Len  = maxAlloc;

  basesData = new char         [Data_Len];
  qualsData = new char         [Data_Len];
  nextRef   = new String_Ref_t [nextRef_Len];

  memset(nextRef, 0xff, sizeof(String_Ref_t) * nextRef_Len);

  gkReadData   *readData = new gkReadData;

  for (curID=bgnID; ((String_Ct    <  G.Max_Hash_Strings) &&
                     (total_len    <  G.Max_Hash_Data_Len) &&
                     (Hash_Entries <  hash_entry_limit) &&
                     (curID        <= endID)); curID++, String_Ct++) {

    //  Load sequence if it exists, otherwise, add an empty read.
    //  Duplicated in Process_Overlaps().

    String_Start[String_Ct]                    = UINT64_MAX;

    String_Info[String_Ct].length              = 0;
    String_Info[String_Ct].lfrag_end_screened  = TRUE;
    String_Info[String_Ct].rfrag_end_screened  = TRUE;

    gkRead  *read = gkpStore->gkStore_getRead(curID);

    if ((read->gkRead_libraryID() < G.minLibToHash) ||
        (read->gkRead_libraryID() > G.maxLibToHash))
      continue;

    uint32 len = read->gkRead_sequenceLength();

    if (len < G.Min_Olap_Len)
      continue;

    gkpStore->gkStore_loadReadData(read, readData);

    char   *seqptr   = readData->gkReadData_getSequence();
    char   *qltptr   = readData->gkReadData_getQualities();

    //  Note where we are going to store the string, and how long it is

    String_Start[String_Ct]                    = total_len;

    String_Info[String_Ct].length              = len;
    String_Info[String_Ct].lfrag_end_screened  = FALSE;
    String_Info[String_Ct].rfrag_end_screened  = FALSE;

    //  Store it.

    for (uint32 i=0; i<len; i++, total_len++) {
      basesData[total_len] = tolower(seqptr[i]);
      qualsData[total_len] = qltptr[i];
    }

    basesData[total_len] = 0;
    qualsData[total_len] = 0;

    total_len++;

    //  Skipping kmers is totally untested.
#if 0
    if (HASH_KMER_SKIP > 0) {
      uint32 extra   = new_len % (HASH_KMER_SKIP + 1);

      if (extra > 0)
        new_len += 1 + HASH_KMER_SKIP - extra;
    }
#endif

    //  The space for sequence and quality data was computed ahead of time,
    //  so running past it means trouble: report the sizes and fail instead
    //  of allocating more space.

    if (total_len > maxAlloc)
      fprintf(stderr, "total_len="F_U64"  len="F_U32"  maxAlloc="F_U64"\n", total_len, len, maxAlloc);
    assert(total_len <= maxAlloc);

    //  What is Extra_Data_Len?  It's set to Data_Len if we would have reallocated here.

    Put_String_In_Hash(curID, String_Ct);

    if ((String_Ct % 100000) == 0)
      fprintf (stderr, "String_Ct:%12"F_U64P"/%12"F_U32P"  totalLen:%12"F_U64P"/%12"F_U64P"  Hash_Entries:%12"F_U64P"/%12"F_U64P"  Load: %.2f%%\n",
               String_Ct,    G.Max_Hash_Strings,
               total_len,    G.Max_Hash_Data_Len,
               Hash_Entries,
               hash_entry_limit,
               100.0 * Hash_Entries / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
  }

  curID--;  //  We always stop on the read after we loaded.

  delete readData;

  fprintf(stderr, "HASH LOADING STOPPED: strings  %12"F_U64P" out of %12"F_U32P" max.\n", String_Ct, G.Max_Hash_Strings);
  fprintf(stderr, "HASH LOADING STOPPED: length   %12"F_U64P" out of %12"F_U64P" max.\n", total_len, G.Max_Hash_Data_Len);
  fprintf(stderr, "HASH LOADING STOPPED: entries  %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit,
          100.0 * Hash_Entries / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));

  if (String_Ct == 0) {
    fprintf(stderr, "HASH LOADING STOPPED: no strings added?\n");
    return(endID);
  }

  Used_Data_Len = total_len;

  //fprintf(stderr, "Extra_Ref_Ct = "F_U64"  Max_Extra_Ref_Space = "F_U64"\n", Extra_Ref_Ct, Max_Extra_Ref_Space);

  if (Extra_Ref_Ct > Max_Extra_Ref_Space) {
    int32          newSize  = (Max_Extra_Ref_Space == 0) ? 16 * 1024 : Max_Extra_Ref_Space * 2;

    while (newSize < Extra_Ref_Ct)
      newSize *= 2;

    String_Ref_t  *newSpace = new String_Ref_t [newSize];

    memcpy(newSpace, Extra_Ref_Space, sizeof(String_Ref_t) * Max_Extra_Ref_Space);

    delete [] Extra_Ref_Space;

    Max_Extra_Ref_Space = newSize;    //  Former max_extra_ref_ct
    Extra_Ref_Space     = newSpace;
  }


  if (G.Kmer_Skip_File != NULL)
    Mark_Skip_Kmers();


  // Coalesce reference chain into adjacent entries in  Extra_Ref_Space
  Extra_Ref_Ct = 0;
  for (int32 i = 0;  i < HASH_TABLE_SIZE;  i ++)
    for (int32 j = 0;  j < Hash_Table[i].Entry_Ct;  j ++) {
      ref = Hash_Table[i].Entry[j];
      if (! getStringRefLast(ref) && ! getStringRefEmpty(ref)) {
        Extra_Ref_Space[Extra_Ref_Ct] = ref;
        setStringRefStringNum(Hash_Table[i].Entry[j], (String_Ref_t)(Extra_Ref_Ct >> OFFSET_BITS));
        setStringRefOffset  (Hash_Table[i].Entry[j], (String_Ref_t)(Extra_Ref_Ct & OFFSET_MASK));
        Extra_Ref_Ct ++;
        do {
          ref = nextRef[(String_Start[getStringRefStringNum(ref)] + getStringRefOffset(ref)) / (HASH_KMER_SKIP + 1)];
          Extra_Ref_Space[Extra_Ref_Ct ++] = ref;
        }  while (! getStringRefLast(ref));
      }
    }