//  Read the next batch of strings from stream and create a hash
//  table index of their Kmer_Len -mers.  Return 1 if successful;
//  0 otherwise.  The batch ends when either end-of-file is encountered
//  or Max_Hash_Strings have been read in.  first_frag_id is the
//  internal ID of the first fragment in the hash table.

int
Build_Hash_Index(gkStream *stream, int32 first_frag_id, gkFragment *myRead) {
  String_Ref_t  ref;
  uint64        total_len;

  static int64  max_extra_ref_ct = 0;
  static int64  old_ref_len, new_ref_len;

  int           frag_status;
  uint64        hash_entry_limit;

  Hash_String_Num_Offset = first_frag_id;
  String_Ct = Extra_String_Ct = 0;
  Extra_String_Subcount = MAX_EXTRA_SUBCOUNT;
  total_len = 0;

  if (Data == NULL) {
    Extra_Data_Len = Data_Len = Max_Hash_Data_Len + AS_READ_MAX_NORMAL_LEN;

    Data         = (char *) safe_realloc (Data, Data_Len);
    Quality_Data = (char *) safe_realloc (Quality_Data, Data_Len);

    old_ref_len = Data_Len / (HASH_KMER_SKIP + 1);
    Next_Ref    = (String_Ref_t *) safe_realloc (Next_Ref, old_ref_len * sizeof (String_Ref_t));
  }

  memset (Next_Ref, '\377', old_ref_len * sizeof (String_Ref_t));
  memset (Hash_Table, 0, HASH_TABLE_SIZE * sizeof (Hash_Bucket_t));
  memset (Hash_Check_Array, 0, HASH_TABLE_SIZE * sizeof (Check_Vector_t));

  Extra_Ref_Ct = 0;
  Hash_Entries = 0;

  hash_entry_limit = Max_Hash_Load * HASH_TABLE_SIZE * ENTRIES_PER_BUCKET;

#if 0
  fprintf(stderr, "HASH LOADING STARTED: fragID  %12"F_U64P"\n", first_frag_id);
  fprintf(stderr, "HASH LOADING STARTED: strings %12"F_U64P" out of %12"F_U64P" max.\n", String_Ct, Max_Hash_Strings);
  fprintf(stderr, "HASH LOADING STARTED: length  %12"F_U64P" out of %12"F_U64P" max.\n", total_len, Max_Hash_Data_Len);
  fprintf(stderr, "HASH LOADING STARTED: entries %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit, (100.0 * Hash_Entries) / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
#endif

  while (String_Ct < Max_Hash_Strings &&
         total_len < Max_Hash_Data_Len &&
         Hash_Entries < hash_entry_limit &&
         (frag_status = Read_Next_Frag (Sequence_Buffer, Quality_Buffer, stream, myRead, & Last_Hash_Frag_Read, minLibToHash, maxLibToHash))) {
    int     extra, len;
    size_t  new_len;

    if (frag_status == DELETED_FRAG) {
      Sequence_Buffer [0] = '\0';
      Quality_Buffer [0] = '\0';
    }

    String_Start [String_Ct] = total_len;

    len = strlen (Sequence_Buffer);

    String_Info [String_Ct] . length             = len;
    String_Info [String_Ct] . lfrag_end_screened = FALSE;
    String_Info [String_Ct] . rfrag_end_screened = FALSE;

    new_len = total_len + len + 1;
    extra   = new_len % (HASH_KMER_SKIP + 1);

    if (extra > 0)
      new_len += 1 + HASH_KMER_SKIP - extra;

    if (new_len > Data_Len) {
      Data_Len = (size_t) (Data_Len * MEMORY_EXPANSION_FACTOR);

      if (new_len > Data_Len)
        Data_Len = new_len;

      if (Data_Len > Extra_Data_Len) {
        Data           = (char *) safe_realloc (Data, Data_Len);
        Extra_Data_Len = Data_Len;
      }

      Quality_Data = (char *) safe_realloc (Quality_Data, Data_Len);

      new_ref_len = Data_Len / (HASH_KMER_SKIP + 1);
      Next_Ref    = (String_Ref_t *) safe_realloc (Next_Ref, new_ref_len * sizeof (String_Ref_t));

      memset (Next_Ref + old_ref_len, '\377', (new_ref_len - old_ref_len) * sizeof (String_Ref_t));

      old_ref_len = new_ref_len;
    }

    strcpy (Data + total_len, Sequence_Buffer);
    memcpy (Quality_Data + total_len, Quality_Buffer, len + 1);

    total_len = new_len;

    Put_String_In_Hash (String_Ct);

    String_Ct ++;

    if ((String_Ct % 100000) == 0)
      fprintf (stderr, "String_Ct:%12"F_U64P"/%12"F_U32P"  totalLen:%12"F_U64P"/%12"F_U64P"  Hash_Entries:%12"F_U64P"/%12"F_U64P"  Load: %.2f%%\n",
               String_Ct, Max_Hash_Strings,
               total_len, Max_Hash_Data_Len,
               Hash_Entries, hash_entry_limit,
               100.0 * Hash_Entries / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
  }

  if (String_Ct == 0)
    return 0;

  fprintf(stderr, "HASH LOADING STOPPED: strings %12"F_U64P" out of %12"F_U32P" max.\n", String_Ct, Max_Hash_Strings);
  fprintf(stderr, "HASH LOADING STOPPED: length  %12"F_U64P" out of %12"F_U64P" max.\n", total_len, Max_Hash_Data_Len);
  fprintf(stderr, "HASH LOADING STOPPED: entries %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit, 100.0 * Hash_Entries / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));

  Used_Data_Len = total_len;

  if (Extra_Ref_Ct > max_extra_ref_ct) {
    max_extra_ref_ct *= MEMORY_EXPANSION_FACTOR;

    if (Extra_Ref_Ct > max_extra_ref_ct)
      max_extra_ref_ct = Extra_Ref_Ct;

    fprintf (stderr, "### realloc  Extra_Ref_Space  max_extra_ref_ct = " F_S64 "\n", max_extra_ref_ct);

    Extra_Ref_Space = (String_Ref_t *) safe_realloc (Extra_Ref_Space, max_extra_ref_ct * sizeof (String_Ref_t));
  }

  if (Kmer_Skip_File != NULL)
    Mark_Skip_Kmers ();

  //  Coalesce reference chain into adjacent entries in Extra_Ref_Space

  Extra_Ref_Ct = 0;

  for (int32 i = 0; i < HASH_TABLE_SIZE; i ++)
    for (int32 j = 0; j < Hash_Table [i] . Entry_Ct; j ++) {
      ref = Hash_Table [i] . Entry [j];

      if (! getStringRefLast(ref) && ! getStringRefEmpty(ref)) {
        Extra_Ref_Space [Extra_Ref_Ct] = ref;

        setStringRefStringNum(Hash_Table [i] . Entry [j], (String_Ref_t)(Extra_Ref_Ct >> OFFSET_BITS));
        setStringRefOffset   (Hash_Table [i] . Entry [j], (String_Ref_t)(Extra_Ref_Ct & OFFSET_MASK));

        Extra_Ref_Ct ++;

        do {
          ref = Next_Ref [(String_Start [getStringRefStringNum(ref)] + getStringRefOffset(ref)) / (HASH_KMER_SKIP + 1)];

          Extra_Ref_Space [Extra_Ref_Ct ++] = ref;
        } while (! getStringRefLast(ref));
      }
    }
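#if 0
//  Added illustration, not part of the original overlapper code: a minimal
//  sketch of the index packing used by the coalescing loop above.  When a
//  bucket entry heads a chain, its StringNum/Offset fields are reused to hold
//  the position where the chain's contiguous run starts in Extra_Ref_Space:
//  the high bits go into the StringNum field and the low OFFSET_BITS bits
//  into the Offset field.  The names and width below (DEMO_OFFSET_BITS, the
//  demo pack/unpack helpers) are simplified stand-ins, not the real
//  String_Ref_t accessors or layout.

#include <stdint.h>
#include <assert.h>

static const uint64_t  DEMO_OFFSET_BITS = 31;
static const uint64_t  DEMO_OFFSET_MASK = (UINT64_C(1) << DEMO_OFFSET_BITS) - 1;

//  Split an Extra_Ref_Space index into the two bucket-entry fields.
static void
demoPackExtraRefIndex(uint64_t extraRefCt, uint64_t &stringNumField, uint64_t &offsetField) {
  stringNumField = extraRefCt >> DEMO_OFFSET_BITS;   //  high bits -> StringNum field
  offsetField    = extraRefCt &  DEMO_OFFSET_MASK;   //  low bits  -> Offset field
}

//  Recover the Extra_Ref_Space index from the two fields.
static uint64_t
demoUnpackExtraRefIndex(uint64_t stringNumField, uint64_t offsetField) {
  return((stringNumField << DEMO_OFFSET_BITS) | offsetField);
}

static void
demoCheckRoundTrip(void) {
  uint64_t  hi = 0, lo = 0;

  demoPackExtraRefIndex(3000000000ULL, hi, lo);
  assert(demoUnpackExtraRefIndex(hi, lo) == 3000000000ULL);
}
#endif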
//  Read the next batch of reads from gkpStore and create a hash
//  table index of their G.Kmer_Len -mers.  The batch covers reads
//  bgnID..endID, and ends early once G.Max_Hash_Strings reads or
//  G.Max_Hash_Data_Len bases have been loaded.  bgnID is the
//  internal ID of the first read in the hash table.

int
Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
  String_Ref_t  ref;
  uint64        total_len;
  uint64        hash_entry_limit;

  fprintf(stderr, "Build_Hash_Index from "F_U32" to "F_U32"\n", bgnID, endID);

  Hash_String_Num_Offset = bgnID;
  String_Ct              = 0;
  Extra_String_Ct        = 0;
  Extra_String_Subcount  = MAX_EXTRA_SUBCOUNT;
  total_len              = 0;

  //if (Data == NULL) {
  //  Extra_Data_Len = Max_Hash_Data_Len + AS_MAX_READLEN;
  //  Data_Len       = Max_Hash_Data_Len + AS_MAX_READLEN;
  //
  //  basesData = new char [Data_Len];
  //  qualsData = new char [Data_Len];
  //
  //  old_ref_len = Data_Len / (HASH_KMER_SKIP + 1);
  //  nextRef     = new String_Ref_t [old_ref_len];
  //}

  //memset(nextRef, 0xff, old_ref_len * sizeof(String_Ref_t));

  memset(Hash_Table,       0x00, HASH_TABLE_SIZE * sizeof(Hash_Bucket_t));
  memset(Hash_Check_Array, 0x00, HASH_TABLE_SIZE * sizeof(Check_Vector_t));

  Extra_Ref_Ct = 0;
  Hash_Entries = 0;

  hash_entry_limit = G.Max_Hash_Load * HASH_TABLE_SIZE * ENTRIES_PER_BUCKET;

#if 0
  fprintf(stderr, "HASH LOADING STARTED: fragID  %12"F_U64P"\n", first_frag_id);
  fprintf(stderr, "HASH LOADING STARTED: strings %12"F_U64P" out of %12"F_U64P" max.\n", String_Ct, G.Max_Hash_Strings);
  fprintf(stderr, "HASH LOADING STARTED: length  %12"F_U64P" out of %12"F_U64P" max.\n", total_len, G.Max_Hash_Data_Len);
  fprintf(stderr, "HASH LOADING STARTED: entries %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit, (100.0 * Hash_Entries) / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
#endif

  //  Compute an upper limit on the number of bases we will load.  The number of Hash_Entries
  //  can't be computed here, so the real loop below could end earlier than expected - and we
  //  don't use a little bit of memory.

  uint32  nSkipped  = 0;
  uint32  nShort    = 0;
  uint32  nLoadable = 0;
  uint64  maxAlloc  = 0;
  uint32  curID     = 0;  //  The last ID loaded into the hash

  for (curID=bgnID; ((String_Ct  < G.Max_Hash_Strings) &&
                     (total_len  < G.Max_Hash_Data_Len) &&
                     (curID     <= endID)); curID++) {
    gkRead  *read = gkpStore->gkStore_getRead(curID);

    if ((read->gkRead_libraryID() < G.minLibToHash) ||
        (read->gkRead_libraryID() > G.maxLibToHash)) {
      nSkipped++;
      continue;
    }

    if (read->gkRead_sequenceLength() < G.Min_Olap_Len) {
      nShort++;
      continue;
    }

    nLoadable++;
    maxAlloc += read->gkRead_sequenceLength() + 1;
  }

  fprintf(stderr, "Found "F_U32" reads with length "F_U64" to load; "F_U32" skipped by being too short; "F_U32" skipped per library restriction\n",
          nLoadable, maxAlloc, nShort, nSkipped);

  //  This should be less than what the user requested on the command line

  if (maxAlloc >= G.Max_Hash_Data_Len + AS_MAX_READLEN)
    fprintf(stderr, "maxAlloc = "F_U64"  G.Max_Hash_Data_Len = "F_U64"  AS_MAX_READLEN = %u\n", maxAlloc, G.Max_Hash_Data_Len, AS_MAX_READLEN);
  assert(maxAlloc < G.Max_Hash_Data_Len + AS_MAX_READLEN);

  //  Allocate space, then fill it.
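  //  Added note (not original commentary): the pre-scan above lets the buffers
  //  below be sized exactly instead of grown on the fly as in the old version.
  //  With hypothetical numbers -- three loadable reads of lengths 1000, 1500
  //  and 2000 -- maxAlloc is 1001 + 1501 + 2001 = 4503 bytes, since each read
  //  is stored NUL-terminated.  With HASH_KMER_SKIP == 0 the nextRef array
  //  then needs one String_Ref_t per base (4503 entries); a non-zero
  //  HASH_KMER_SKIP would shrink it by a factor of (HASH_KMER_SKIP + 1),
  //  because nextRef is indexed by position / (HASH_KMER_SKIP + 1).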
  uint64  nextRef_Len = maxAlloc / (HASH_KMER_SKIP + 1);

  Extra_Data_Len = Data_Len = maxAlloc;

  basesData = new char         [Data_Len];
  qualsData = new char         [Data_Len];
  nextRef   = new String_Ref_t [nextRef_Len];

  memset(nextRef, 0xff, sizeof(String_Ref_t) * nextRef_Len);

  gkReadData  *readData = new gkReadData;

  for (curID=bgnID; ((String_Ct    < G.Max_Hash_Strings) &&
                     (total_len    < G.Max_Hash_Data_Len) &&
                     (Hash_Entries < hash_entry_limit) &&
                     (curID       <= endID)); curID++, String_Ct++) {

    //  Load sequence if it exists, otherwise, add an empty read.
    //  Duplicated in Process_Overlaps().

    String_Start[String_Ct]                   = UINT64_MAX;
    String_Info[String_Ct].length             = 0;
    String_Info[String_Ct].lfrag_end_screened = TRUE;
    String_Info[String_Ct].rfrag_end_screened = TRUE;

    gkRead  *read = gkpStore->gkStore_getRead(curID);

    if ((read->gkRead_libraryID() < G.minLibToHash) ||
        (read->gkRead_libraryID() > G.maxLibToHash))
      continue;

    uint32  len = read->gkRead_sequenceLength();

    if (len < G.Min_Olap_Len)
      continue;

    gkpStore->gkStore_loadReadData(read, readData);

    char  *seqptr = readData->gkReadData_getSequence();
    char  *qltptr = readData->gkReadData_getQualities();

    //  Note where we are going to store the string, and how long it is

    String_Start[String_Ct]                   = total_len;
    String_Info[String_Ct].length             = len;
    String_Info[String_Ct].lfrag_end_screened = FALSE;
    String_Info[String_Ct].rfrag_end_screened = FALSE;

    //  Store it.

    for (uint32 i=0; i<len; i++, total_len++) {
      basesData[total_len] = tolower(seqptr[i]);
      qualsData[total_len] = qltptr[i];
    }

    basesData[total_len] = 0;
    qualsData[total_len] = 0;

    total_len++;

    //  Skipping kmers is totally untested.

#if 0
    if (HASH_KMER_SKIP > 0) {
      uint32  extra = new_len % (HASH_KMER_SKIP + 1);

      if (extra > 0)
        new_len += 1 + HASH_KMER_SKIP - extra;
    }
#endif

    //  Trouble - allocate more space for sequence and quality data.
    //  This was computed ahead of time!

    if (total_len > maxAlloc)
      fprintf(stderr, "total_len="F_U64" len="F_U32" maxAlloc="F_U64"\n", total_len, len, maxAlloc);
    assert(total_len <= maxAlloc);

    //  What is Extra_Data_Len?  It's set to Data_Len if we would have reallocated here.

    Put_String_In_Hash(curID, String_Ct);

    if ((String_Ct % 100000) == 0)
      fprintf (stderr, "String_Ct:%12"F_U64P"/%12"F_U32P"  totalLen:%12"F_U64P"/%12"F_U64P"  Hash_Entries:%12"F_U64P"/%12"F_U64P"  Load: %.2f%%\n",
               String_Ct, G.Max_Hash_Strings,
               total_len, G.Max_Hash_Data_Len,
               Hash_Entries, hash_entry_limit,
               100.0 * Hash_Entries / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
  }

  curID--;  //  We always stop on the read after we loaded.

  delete readData;

  fprintf(stderr, "HASH LOADING STOPPED: strings %12"F_U64P" out of %12"F_U32P" max.\n", String_Ct, G.Max_Hash_Strings);
  fprintf(stderr, "HASH LOADING STOPPED: length  %12"F_U64P" out of %12"F_U64P" max.\n", total_len, G.Max_Hash_Data_Len);
  fprintf(stderr, "HASH LOADING STOPPED: entries %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit, 100.0 * Hash_Entries / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));

  if (String_Ct == 0) {
    fprintf(stderr, "HASH LOADING STOPPED: no strings added?\n");
    return(endID);
  }

  Used_Data_Len = total_len;

  //fprintf(stderr, "Extra_Ref_Ct = "F_U64"  Max_Extra_Ref_Space = "F_U64"\n", Extra_Ref_Ct, Max_Extra_Ref_Space);

  if (Extra_Ref_Ct > Max_Extra_Ref_Space) {
    int32  newSize = (Max_Extra_Ref_Space == 0) ? 16 * 1024 : Max_Extra_Ref_Space * 2;

    while (newSize < Extra_Ref_Ct)
      newSize *= 2;

    String_Ref_t  *newSpace = new String_Ref_t [newSize];

    memcpy(newSpace, Extra_Ref_Space, sizeof(String_Ref_t) * Max_Extra_Ref_Space);

    delete [] Extra_Ref_Space;

    Max_Extra_Ref_Space = newSize;   //  Former max_extra_ref_ct
    Extra_Ref_Space     = newSpace;
  }

  if (G.Kmer_Skip_File != NULL)
    Mark_Skip_Kmers();

  //  Coalesce reference chain into adjacent entries in Extra_Ref_Space

  Extra_Ref_Ct = 0;

  for (int32 i = 0; i < HASH_TABLE_SIZE; i ++)
    for (int32 j = 0; j < Hash_Table[i].Entry_Ct; j ++) {
      ref = Hash_Table[i].Entry[j];

      if (! getStringRefLast(ref) && ! getStringRefEmpty(ref)) {
        Extra_Ref_Space[Extra_Ref_Ct] = ref;

        setStringRefStringNum(Hash_Table[i].Entry[j], (String_Ref_t)(Extra_Ref_Ct >> OFFSET_BITS));
        setStringRefOffset   (Hash_Table[i].Entry[j], (String_Ref_t)(Extra_Ref_Ct & OFFSET_MASK));

        Extra_Ref_Ct ++;

        do {
          ref = nextRef[(String_Start[getStringRefStringNum(ref)] + getStringRefOffset(ref)) / (HASH_KMER_SKIP + 1)];

          Extra_Ref_Space[Extra_Ref_Ct ++] = ref;
        } while (! getStringRefLast(ref));
      }
    }
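#if 0
//  Added illustration, not part of the original overlapper code: a minimal
//  sketch of the coalescing pattern above, with plain integers standing in
//  for String_Ref_t.  A chain threaded through a next[] array (terminated by
//  a sentinel, standing in for getStringRefLast()) is copied into a
//  contiguous run in space[], and the caller gets back the index where the
//  run starts -- which is what the loop above packs into the bucket entry's
//  StringNum/Offset fields.  All names here are hypothetical.

#include <stdint.h>

static const uint64_t  DEMO_CHAIN_END = UINT64_MAX;   //  sentinel marking the end of a chain

//  Copy the chain starting at 'head' into space[], beginning at *spaceCt.
//  Returns the index of the first copied element (the start of the run).
static uint64_t
demoCoalesceChain(uint64_t head, const uint64_t *next, uint64_t *space, uint64_t *spaceCt) {
  uint64_t  runStart = *spaceCt;

  for (uint64_t pos = head; pos != DEMO_CHAIN_END; pos = next[pos])
    space[(*spaceCt)++] = pos;   //  append each chain element to the contiguous run

  return(runStart);
}
#endif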