//  Set the  empty  bit to true for the hash table entry
//  corresponding to string  s  whose hash key is  key .
//  Also set global  String_Info . left/right_end_screened
//  true if the entry occurs near the left/right end, resp.,
//  of the string in the hash table.  If not found, add an
//  entry to the hash table and mark it empty.
static
void
Hash_Mark_Empty(uint64 key, char * s) {
  String_Ref_t  h_ref;
  char  * t;
  unsigned char  key_check;
  int64  ct, probe;
  int64  sub;
  int  i, shift;

  sub = HASH_FUNCTION (key);
  key_check = KEY_CHECK_FUNCTION (key);
  probe = PROBE_FUNCTION (key);

  ct = 0;
  do {
    for  (i = 0;  i < Hash_Table [sub] . Entry_Ct;  i ++)
      if  (Hash_Table [sub] . Check [i] == key_check) {
        h_ref = Hash_Table [sub] . Entry [i];
        t = Data + String_Start [getStringRefStringNum(h_ref)] + getStringRefOffset(h_ref);
        if  (strncmp (s, t, Kmer_Len) == 0) {
          if  (! getStringRefEmpty(Hash_Table [sub] . Entry [i]))
            Mark_Screened_Ends_Chain (Hash_Table [sub] . Entry [i]);
          setStringRefEmpty(Hash_Table [sub] . Entry [i], TRUELY_ONE);
          return;
        }
      }
    assert (i == Hash_Table [sub] . Entry_Ct);
    if  (Hash_Table [sub] . Entry_Ct < ENTRIES_PER_BUCKET) {
      // Not found
      if  (Use_Hopeless_Check) {
        Hash_Table [sub] . Entry [i] = Add_Extra_Hash_String (s);
        setStringRefEmpty(Hash_Table [sub] . Entry [i], TRUELY_ONE);
        Hash_Table [sub] . Check [i] = key_check;
        Hash_Table [sub] . Entry_Ct ++;
        Hash_Table [sub] . Hits [i] = 0;
        Hash_Entries ++;
        shift = HASH_CHECK_FUNCTION (key);
        Hash_Check_Array [sub] |= (((Check_Vector_t) 1) << shift);
      }
      return;
    }
    sub = (sub + probe) % HASH_TABLE_SIZE;
  }  while  (++ ct < HASH_TABLE_SIZE);

  fprintf (stderr, "ERROR:  Hash table full\n");
  assert (FALSE);
}
//  Insert string subscript  i  into the global hash table.
//  Sequence and information about the string are in
//  global variables  Data, String_Start, String_Info, ....
static
void
Put_String_In_Hash(int i) {
  String_Ref_t  ref = 0;
  char  * p, * window;
  int  kmers_inserted = 0;
  int  skip_ct;
  uint64  key, key_is_bad;
  int  j;

  if  (String_Info [i] . length < Kmer_Len)
    return;

  p = window = Data + String_Start [i];
  key = key_is_bad = 0;
  for  (j = 0;  j < Kmer_Len;  j ++) {
    key_is_bad |= (uint64) (Char_Is_Bad [(int) * p]) << j;
    key |= (uint64) (Bit_Equivalent [(int) * (p ++)]) << (2 * j);
  }

  setStringRefStringNum(ref, i);
  if  (i > MAX_STRING_NUM) {
    fprintf (stderr, "Too many strings for hash table--exiting\n");
    exit (1);
  }
  setStringRefOffset(ref, TRUELY_ZERO);
  skip_ct = 0;
  setStringRefEmpty(ref, TRUELY_ZERO);

  if  (key_is_bad == false) {
    Hash_Insert (ref, key, window);
    kmers_inserted ++;
  }

  while  ((* p) != '\0') {
    window ++;

    {
      String_Ref_t newoff = getStringRefOffset(ref) + 1;
      assert(newoff < OFFSET_MASK);
      setStringRefOffset(ref, newoff);
    }

    if  (++ skip_ct > HASH_KMER_SKIP)
      skip_ct = 0;

    key_is_bad >>= 1;
    key_is_bad |= (uint64) (Char_Is_Bad [(int) * p]) << (Kmer_Len - 1);
    key >>= 2;
    key |= (uint64) (Bit_Equivalent [(int) * (p ++)]) << (2 * (Kmer_Len - 1));

    if  (skip_ct == 0 && ! key_is_bad) {
      Hash_Insert (ref, key, window);
      kmers_inserted ++;
    }
  }
}
//  Add string  s  as an extra hash table string and return
//  a single reference to the beginning of it.
static
String_Ref_t
Add_Extra_Hash_String(const char * s) {
  String_Ref_t  ref = 0;
  String_Ref_t  sub = 0;
  size_t  new_len;
  int  len;
   
  new_len = Used_Data_Len + Kmer_Len;
  if  (Extra_String_Subcount < MAX_EXTRA_SUBCOUNT) {
    sub = String_Ct + Extra_String_Ct - 1;
  } else {
    sub = String_Ct + Extra_String_Ct;
    if  (sub >= String_Start_Size) {
      String_Start_Size *= MEMORY_EXPANSION_FACTOR;
      if  (sub >= String_Start_Size)
        String_Start_Size = sub;
      String_Start = (int64 *) safe_realloc (String_Start,
                                             String_Start_Size * sizeof (int64));
    }
    String_Start [sub] = Used_Data_Len;
    Extra_String_Ct ++;
    Extra_String_Subcount = 0;
    new_len ++;
  }

  if  (new_len >= Extra_Data_Len) {
    Extra_Data_Len = (size_t) (Extra_Data_Len * MEMORY_EXPANSION_FACTOR);
    if  (new_len > Extra_Data_Len)
      Extra_Data_Len = new_len;
    Data = (char *) safe_realloc (Data, Extra_Data_Len);
  }
  strncpy (Data + String_Start [sub] + Kmer_Len * Extra_String_Subcount,
           s, Kmer_Len + 1);
  Used_Data_Len = new_len;

  setStringRefStringNum(ref, sub);
  if  (sub > MAX_STRING_NUM) {
    fprintf (stderr, "Too many skip kmer strings for hash table.\n"
             "Try skipping hopeless check (-z option)\n"
             "Exiting\n");
    exit (1);
  }
  setStringRefOffset(ref, (String_Ref_t)Extra_String_Subcount * (String_Ref_t)Kmer_Len);

  assert(Extra_String_Subcount * Kmer_Len < OFFSET_MASK);

  setStringRefLast(ref, TRUELY_ONE);
  setStringRefEmpty(ref, TRUELY_ONE);

  Extra_String_Subcount ++;

  return  ref;
}
Exemplo n.º 4
0
//  Add string  s  as an extra hash table string and return
//  a single reference to the beginning of it.
static
String_Ref_t
Add_Extra_Hash_String(const char *s) {
  String_Ref_t  ref = 0;
  String_Ref_t  sub = 0;

  int  len;

  uint32 new_len = Used_Data_Len + G.Kmer_Len;

  if (Extra_String_Subcount < MAX_EXTRA_SUBCOUNT) {
    sub = String_Ct + Extra_String_Ct - 1;

  } else {
    sub = String_Ct + Extra_String_Ct;

    if (sub >= String_Start_Size) {
      uint64  n = max(sub * 1.1, String_Start_Size * 1.5);

      //fprintf(stderr, "REALLOC String_Start from "F_U64" to "F_U64"\n", String_Start_Size, n);
      resizeArray(String_Start, String_Start_Size, String_Start_Size, n);
    }

    String_Start[sub] = Used_Data_Len;

    Extra_String_Ct++;
    Extra_String_Subcount = 0;
    new_len++;
  }

  if (new_len >= Extra_Data_Len) {
    uint64  n = max(new_len * 1.1, Extra_Data_Len * 1.5);

    //fprintf(stderr, "REALLOC basesData from "F_U64" to "F_U64"\n", Extra_Data_Len, n);
    resizeArray(basesData, Extra_Data_Len, Extra_Data_Len, n);
  }

  strncpy(basesData + String_Start[sub] + G.Kmer_Len * Extra_String_Subcount, s, G.Kmer_Len + 1);

  Used_Data_Len = new_len;

  setStringRefStringNum(ref, sub);

  if (sub > MAX_STRING_NUM) {
    fprintf(stderr, "Too many skip kmer strings for hash table.\n");
    fprintf(stderr, "Try skipping hopeless check (-z option)\n");
    fprintf(stderr, "Exiting\n");
    exit (1);
  }

  setStringRefOffset(ref, (String_Ref_t)Extra_String_Subcount * (String_Ref_t)G.Kmer_Len);

  assert(Extra_String_Subcount * G.Kmer_Len < OFFSET_MASK);

  setStringRefLast(ref,  (uint64)1);
  setStringRefEmpty(ref, TRUELY_ONE);

  Extra_String_Subcount++;

  return(ref);
}
Exemplo n.º 5
0
//  Insert string subscript  i  into the global hash table.
//  Sequence and information about the string are in
//  global variables  basesData, String_Start, String_Info, ....
static
void
Put_String_In_Hash(uint32 curID, uint32 i) {
  String_Ref_t  ref = 0;
  int           skip_ct;
  uint64        key;
  uint64        key_is_bad;
  int           j;

  uint32        kmers_skipped  = 0;
  uint32        kmers_bad      = 0;
  uint32        kmers_inserted = 0;

  char *p      = basesData + String_Start[i];
  char *window = basesData + String_Start[i];

  key = key_is_bad = 0;

  for (uint32 j=0;  j<G.Kmer_Len; j ++) {
    key_is_bad |= (uint64) (Char_Is_Bad[(int) * p]) << j;
    key        |= (uint64) (Bit_Equivalent[(int) * (p ++)]) << (2 * j);
  }

  setStringRefStringNum(ref, i);

  if (i > MAX_STRING_NUM)
    fprintf (stderr, "Too many strings for hash table--exiting\n"), exit(1);

  setStringRefOffset(ref, TRUELY_ZERO);

  skip_ct = 0;

  setStringRefEmpty(ref, TRUELY_ZERO);

  if (key_is_bad == false) {
    Hash_Insert(ref, key, window);
    kmers_inserted++;

  } else {
    kmers_bad++;
  }

  while (*p != 0) {
    window++;

    String_Ref_t newoff = getStringRefOffset(ref) + 1;
    assert(newoff < OFFSET_MASK);

    setStringRefOffset(ref, newoff);

    if (++skip_ct > HASH_KMER_SKIP)
      skip_ct = 0;

    key_is_bad >>= 1;
    key_is_bad |= (uint64) (Char_Is_Bad[(int) * p]) << (G.Kmer_Len - 1);

    key >>= 2;
    key  |= (uint64) (Bit_Equivalent[(int) * (p ++)]) << (2 * (G.Kmer_Len - 1));

    if (skip_ct > 0) {
      kmers_skipped++;
      continue;
    }

    if (key_is_bad) {
      kmers_bad++;
      continue;
    }

    Hash_Insert(ref, key, window);
    kmers_inserted++;
  }

  //fprintf(stderr, "STRING %u skipped %u bad %u inserted %u\n",
  //        curID, kmers_skipped, kmers_bad, kmers_inserted);
}