Example #1
DocTable AllocateDocTable(void) {
  DocTableRecord *dt = (DocTableRecord *) malloc(sizeof(DocTableRecord));
  Verify333(dt != NULL);

  dt->docid_to_docname = AllocateHashTable(1024);
  Verify333(dt->docid_to_docname != NULL);
  dt->docname_to_docid = AllocateHashTable(1024);
  Verify333(dt->docname_to_docid != NULL);
  dt->max_id = 0;

  return dt;
}
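
A note on teardown, since the example above only shows allocation: here is a minimal sketch of the matching FreeDocTable, assuming the two-argument FreeHashTable(table, value_free_fn) used elsewhere on this page. The ownership split (heap-allocated docname strings in one table, plain ids in the other) and the NoOpFree helper are assumptions, not part of the original codebase.

static void NoOpFree(HTValue_t freeme) { }  // ids are not heap pointers

void FreeDocTable(DocTable table) {
  Verify333(table != NULL);
  // docid_to_docname is assumed to own the docname strings.
  FreeHashTable(table->docid_to_docname, &free);
  // docname_to_docid stores plain ids; nothing to free per value.
  FreeHashTable(table->docname_to_docid, &NoOpFree);
  free(table);
}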
Example #2
static void TestFindOrInsert() {
  struct HashTable* ht;
  int i;
  int iterations = 1000000;
  int range = 30;         /* random numbers in [0, 29] */

  ht = AllocateHashTable(4, 0);    /* value is 4 bytes, 0: don't copy keys */

  /* We'll test how good rand() is as a random number generator */
  for (i = 0; i < iterations; ++i) {
    int key = rand() % range;
    HTItem* bck = HashFindOrInsert(ht, key, 0);     /* initialize to 0 */
    bck->data++;                   /* found one more of them */
  }

  for (i = 0; i < range; ++i) {
    HTItem* bck = HashFind(ht, i);
    if (bck) {
      printf("%3d: %d\n", bck->key, bck->data);
    } else {
      printf("%3d: 0\n", i);
    }
  }

  FreeHashTable(ht);
}
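
One usage note: TestFindOrInsert() calls rand() without seeding it, so every run prints identical counts. A minimal driver (standard library only; it assumes it lives in the same file, since the test function is static):

#include <stdlib.h>
#include <time.h>

int main(void) {
  srand((unsigned) time(NULL));  /* seed once so each run draws a fresh sequence */
  TestFindOrInsert();
  return 0;
}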
Example #3
static void ResizeHashtable(HashTable ht) {
  // Resize when the load factor reaches 3 or more.
  if (ht->num_elements < 3 * ht->num_buckets)
    return;

  // This is the resize case.  Allocate a new hashtable,
  // iterate over the old hashtable, do the surgery on
  // the old hashtable record and free up the new hashtable
  // record.
  HashTable newht = AllocateHashTable(ht->num_buckets * 9);

  // Give up if out of memory.
  if (newht == NULL)
    return;

  // Loop through the old ht with an iterator,
  // inserting into the new HT.
  HTIter it = HashTableMakeIterator(ht);
  if (it == NULL) {
    // Give up if out of memory.
    FreeHashTable(newht, &HTNullFree);
    return;
  }

  while (!HTIteratorPastEnd(it)) {
    HTKeyValue item, dummy;

    Verify333(HTIteratorGet(it, &item) == 1);
    if (InsertHashTable(newht, item, &dummy) != 1) {
      // failure, free up everything, return.
      HTIteratorFree(it);
      FreeHashTable(newht, &HTNullFree);
      return;
    }
    HTIteratorNext(it);
  }

  // Worked!  Free the iterator.
  HTIteratorFree(it);

  // Sneaky: swap the structures, then free the new table,
  // and we're done.
  {
    HashTableRecord tmp;

    tmp = *ht;
    *ht = *newht;
    *newht = tmp;
    FreeHashTable(newht, &HTNullFree);
  }

  return;
}
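
The "sneaky" swap at the end is the key trick: callers hold a pointer to the HashTableRecord, so the resize must not move the record itself. Swapping record contents in place, then freeing the now-stale newht, keeps every outstanding handle valid. A toy, self-contained illustration of the same idiom:

#include <stdio.h>

typedef struct { int num_buckets; } Record;

int main(void) {
  Record old_rec = { 64 }, new_rec = { 576 };
  Record *handle = &old_rec;   /* what callers hold onto */

  /* Swap contents, not pointers: the handle itself never changes. */
  Record tmp = *handle;
  *handle = new_rec;
  new_rec = tmp;

  printf("%d\n", handle->num_buckets);  /* prints 576 */
  return 0;
}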
Example #4
HashTable BuildWordHT(char *filename) {
  char *filecontent;
  HashTable tab;
  HWSize_t filelen, i;

  if (filename == NULL)
    return NULL;

  // STEP 6.
  // Use ReadFile() to slurp in the file contents.  If the
  // file turns out to be empty (i.e., its length is 0),
  // or you couldn't read the file at all, return NULL to indicate
  // failure.
  filecontent = ReadFile(filename, &filelen);
  if (filecontent == NULL || filelen == 0)
    return NULL;

  // Verify that the file contains only ASCII text.  We won't try to index any
  // files that contain non-ASCII text; unfortunately, this means we aren't
  // Unicode friendly.
  for (i = 0; i < filelen; i++) {
    if ((filecontent[i] == '\0') ||
        ((unsigned char) filecontent[i] > ASCII_UPPER_BOUND)) {
      free(filecontent);
      return NULL;
    }
  }

  // Great!  Let's split the file up into words.  We'll allocate the hash
  // table that will store the WordPositions structures associated with each
  // word.  Since our hash table dynamically grows, we'll start with a small
  // number of buckets.
  tab = AllocateHashTable(64);

  // Loop through the file, splitting it into words and inserting a record for
  // each word.
  LoopAndInsert(tab, filecontent);

  // If we found no words, return NULL instead of a
  // zero-sized hashtable.
  if (NumElementsInHashTable(tab) == 0) {
    FreeHashTable(tab, &WordHTFree);
    tab = NULL;
  }

  // Now that we've finished parsing the document, we can free up the
  // filecontent buffer and return our built-up table.
  free(filecontent);
  filecontent = NULL;
  return tab;
}
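
A hedged usage sketch for BuildWordHT, assuming the same headers as above and that WordHTFree and NumElementsInHashTable are visible at the call site:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "usage: %s <file to index>\n", argv[0]);
    return EXIT_FAILURE;
  }

  HashTable tab = BuildWordHT(argv[1]);
  if (tab == NULL) {  /* empty, unreadable, or non-ASCII file */
    fprintf(stderr, "couldn't index %s\n", argv[1]);
    return EXIT_FAILURE;
  }

  printf("indexed %u distinct words\n", (unsigned) NumElementsInHashTable(tab));
  FreeHashTable(tab, &WordHTFree);
  return EXIT_SUCCESS;
}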
Example #5
static void TestInsert() {
  struct HashTable* ht;
  HTItem* bck;

  ht = AllocateHashTable(1, 0);    /* value is 1 byte, 0: don't copy keys */

  HashInsert(ht, PTR_KEY(ht, "January"), 31);  /* 0: don't overwrite old val */
  bck = HashInsert(ht, PTR_KEY(ht, "February"), 28);
  bck = HashInsert(ht, PTR_KEY(ht, "March"), 31);

  bck = HashFind(ht, PTR_KEY(ht, "February"));
  assert(bck);
  assert(bck->data == 28);

  FreeHashTable(ht);
}
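
One behavior the test above doesn't exercise: HashFind returns NULL for a key that was never inserted, so absence checks need no sentinel. A small companion test under the same headers (the key names are arbitrary):

static void TestMissingKey() {
  struct HashTable* ht = AllocateHashTable(1, 0);

  HashInsert(ht, PTR_KEY(ht, "January"), 31);

  /* A key that was never inserted comes back as NULL, not a bucket. */
  assert(HashFind(ht, PTR_KEY(ht, "Smarch")) == NULL);
  assert(HashFind(ht, PTR_KEY(ht, "January")) != NULL);

  FreeHashTable(ht);
}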
Example #6
gar_list* gar_index(void* gar, size_t length)
{
	gar_list* list = malloc(sizeof(gar_list));
	char* block = gar;              /* current 512-byte tar header block */
	size_t size;
	char name[100];
	unsigned long i;

	name[99] = '\0';
	list->gar = gar;
	list->length = length;
	list->ht = AllocateHashTable(0, 1);   /* 0: store values directly, 1: copy keys */
	for (; length >= 512; block += 512, length -= 512)
		if (!*block)                  /* all-zero block: end of archive */
			return list;
		else
		{
			/* ustar header: size field is octal ASCII at offset 124,
			   type flag at offset 156 ('0' or NUL = regular file). */
			size = strtol(block + 124, NULL, 8);
			if (block[156] == '0' || block[156] == '\0')
			{
				/* Copy the entry name (skipping a fixed 5-byte prefix,
				   apparently specific to this archive layout), mapping
				   '/' to '.' and stopping at a space or NUL. */
				for (i = 5; i < 99; i++)
					switch ((name[i - 5] = block[i]))
					{
						case '/':
							name[i - 5] = '.';
							break;
						case ' ':
							name[i - 5] = '\0';
							/* fall through */
						case '\0':
							goto copied;
					}
				copied:
				/* Store the address of the file data (header + 512). */
				HashInsert(list->ht, PTR_KEY(list->ht, name),
				           (ulong)block + 512);
			}
			/* Skip the file contents, rounded up to whole 512-byte blocks. */
			size = size ? ((size - 1) / 512 + 1) * 512 : 0;
			if (length < size)
				break;
			block += size;
			length -= size;
		}
	/* Truncated or malformed archive: clean up and signal failure. */
	FreeHashTable(list->ht);
	free(list);
	return NULL;
}
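
For orientation, the hard-coded offsets in gar_index line up with the standard ustar tar header layout. The struct below is purely illustrative (the function reads raw bytes and never declares such a type):

/* ustar header fields that gar_index touches (illustrative only). */
struct ustar_header_view {
	char name[100];   /* offset   0: entry name                      */
	char pad1[24];    /* offsets 100-123: mode, uid, gid             */
	char size[12];    /* offset 124: file size, octal ASCII          */
	char pad2[20];    /* offsets 136-155: mtime, checksum            */
	char typeflag;    /* offset 156: '0' or NUL = regular file       */
};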
Example #7
int main(int argc, char *argv[])
{

	paper_rec_t DedupeRecord;
	dd_uint64_t Unique_CRID;	/* Unique CR_ID = (C_ID << 16) | CR_ID */

	long victim_index = 0, cache_size, window_size, bloom_filter_size;
	long i, j=0, temp_index;
	int Initial_Flag = 0, cache_algorithm;

	dd_uint8_t *sha1_value=NULL;
	int nLen = 0;
	long max_counter=0;
	HTItem *chunk_item, *item;
	long byte_len, temp, offset, ver, temp1; /* to read a trace file */

	unsigned long hash1, hash2;
	/* Heap Data structure variables */
	Cache_Memory Dataitem;
	std::vector<Cache_Memory>::iterator itr;

	unsigned long writeCounter = 0;
	unsigned long access_counter;
	long file_position;
	FILE *fp1, *fp;
	size_t keySize = 0;
	clock_t start = clock();
	time_t begin,end;
	time(&begin);
	if (argc < 5) {
		fprintf(stderr, "usage: %s <Cache Size> <Window Size> <Cache Algorithm (0, 1, 2)> <Trace File>\n", argv[0]);
		fprintf(stderr, "       - Cache Size: Dedupe read cache size in terms of # of data chunks (e.g. 500 chunks = 4MB (500*8KB))\n");
		fprintf(stderr, "       - Window Size: Future sliding window size in terms of TIMES of the cache size.\n");
		fprintf(stderr, "       - Cache Algorithm: 0 (Belady MIN), 1 (Belady MIN with a future window), 2 (Lookahead read cache)\n");
		fprintf(stderr, "       - Trace File: Trace file name with path\n");
		exit(1);
	}
 
	cache_size = atol(argv[1]);
	assert(cache_size > 0);		/* cache size must be positive */
	window_size = atol(argv[2]);
	assert(window_size > 0);	/* window size must be positive */
	cache_algorithm = atoi(argv[3]);
	assert((cache_algorithm == 0)||(cache_algorithm == 1)||(cache_algorithm == 2)); /* there are only three selections */

	/* The Bloom filter uses two hash functions, so size the counter array
	   at twice the cache size; calloc zeroes the counters before use. */
	bloom_filter_size = cache_size * 2;
	bloom_filter = (long *) calloc(bloom_filter_size, sizeof(long));
	assert(bloom_filter != NULL);

	ht_cache = AllocateHashTable(SHA1_VALUE_LENGTH, 1);
	heap = newMinHeap((u32)cache_size);
	if((fp1 = fopen(argv[4], "rb")) == NULL){	//for reading data one by one
		DEBUG_INFO("File open error....1\n");
		exit (-1);
	}

	if((fp = fopen(argv[4], "rb")) == NULL){	//for searching its future reference distance
		DEBUG_INFO("File open error....2\n");
		exit (-1);
	}

	long c = 0;
	u32 itemIndex;

	keySize = sizeof(DedupeRecord.fp_bytes);
	DEBUG_INFO("Record Size is: %zu\n", keySize);
	while (1)
	{
		/* Read the next trace record; stop at EOF or on a short read. */
		if (fread(&DedupeRecord, sizeof(struct _paper_rec_t), 1, fp1) != 1)
			break;

		c++;
		if (c % 1000 == 0)
			printf("Reading Chunks: %ld\n", c);

		if (c % 10000 == 0 && totalAccess > 0) {
			printf("Cache hit ratio: %.3f%% = %lu / %lu\n",
			       (double) (Hit_Count * 100) / (double) totalAccess,
			       Hit_Count, totalAccess);
		}

		file_position = ftell(fp1);

		/* Initially fill the cache; during this warm-up phase we do not
		   count the cache hit ratio. */
		if (Initial_Flag == 0) {
			chunk_item = HashFind(ht_cache, PTR_KEY(ht_cache, DedupeRecord.fp_bytes));

			/* Update the counting Bloom filter for this chunk. */
			hash1 = hash_djb2(DedupeRecord.fp_bytes, keySize) % bloom_filter_size;
			hash2 = hash_sdbm(DedupeRecord.fp_bytes, keySize) % bloom_filter_size;
			max_counter = bloom_filter[hash1]++;
			if ((bloom_filter[hash2]++) > max_counter)
				max_counter = bloom_filter[hash2];

			if (chunk_item) {	/* cache hit */
				itemIndex = (u32) chunk_item->data;
				DEBUG_INFO("Index being updated is %u:\n", itemIndex);
				heapUpdate(heap, max_counter, itemIndex, &ht_cache);
			}
			else {	/* cache miss: insert into the heap and heapify */
				heapInsert(heap, DedupeRecord.fp_bytes, max_counter, &ht_cache);
				cache_counter++;
			}

			if (cache_counter == cache_size) {
				DEBUG_INFO("\n#### Cache initialization complete~!!####\n");
				Initial_Flag = 1;
#ifdef DEBUG
				printf("Heap size is: %zu\n", cache_heap.size());
#endif
			}
		}
		else { /* Once the cache is initially full, start measuring the cache hit ratio. */
			totalAccess++;
			if ((totalAccess % 100) == 0)
				DEBUG_INFO("[CHECK] Current Access Number: %lu\n", totalAccess);

			Unique_CRID = (DedupeRecord.cmc_id << 16) | DedupeRecord.creg_id;

			chunk_item = HashFind(ht_cache, PTR_KEY(ht_cache, DedupeRecord.fp_bytes));

			if (chunk_item) {	/* cache hit */
				Hit_Count++;
				DEBUG_INFO("Cache Hit\n");

				/* Update the counting Bloom filter for this chunk. */
				hash1 = hash_djb2(DedupeRecord.fp_bytes, keySize) % bloom_filter_size;
				hash2 = hash_sdbm(DedupeRecord.fp_bytes, keySize) % bloom_filter_size;
				max_counter = bloom_filter[hash1]++;
				if ((bloom_filter[hash2]++) > max_counter)
					max_counter = bloom_filter[hash2];

				itemIndex = (u32) chunk_item->data;
				DEBUG_INFO("Index being updated is %u:\n", itemIndex);
				assert(itemIndex <= (u32) cache_size);
				/* Refresh this chunk's heap priority with the new count. */
				heapUpdate(heap, max_counter, itemIndex, &ht_cache);
			}
			else {	/* cache miss: evict the minimum-priority chunk */
				heapPopMin(heap, &sha1_value, &access_counter, &ht_cache);
				if (!sha1_value)
					ERROR("SHA1 Value in main is NULL\n");

				/* Decrement the evicted chunk's Bloom filter counters,
				   hashing the full fingerprint (keySize bytes; note that
				   sizeof(sha1_value) would only be the pointer size). */
				hash1 = hash_djb2(sha1_value, keySize) % bloom_filter_size;
				hash2 = hash_sdbm(sha1_value, keySize) % bloom_filter_size;
				bloom_filter[hash1]--;
				bloom_filter[hash2]--;

				free(sha1_value);

				/* Increment the Bloom filter counters for the new chunk. */
				hash1 = hash_djb2(DedupeRecord.fp_bytes, keySize) % bloom_filter_size;
				hash2 = hash_sdbm(DedupeRecord.fp_bytes, keySize) % bloom_filter_size;
				max_counter = bloom_filter[hash1]++;
				if ((bloom_filter[hash2]++) > max_counter)
					max_counter = bloom_filter[hash2];

				heapInsert(heap, DedupeRecord.fp_bytes, max_counter, &ht_cache);

				if (cache_algorithm == LOOKAHEAD) {
					/* If other chunks in the current CR appear within the
					   future window, add them to the cache as well. */
					Check_Unique_CRID(fp, Unique_CRID, file_position, 0,
					                  cache_size, window_size * cache_size,
					                  bloom_filter_size);
				}
			}
		} /* else */

	} //while

	printf("\n###################################################################\n");
	printf("Cache hit ratio: %.3f = %lu / %lu \n", (double) (Hit_Count * 100) / (double) totalAccess , Hit_Count, totalAccess);
	printf("Cache size: %ld, window size: %ld\n", cache_size, window_size*cache_size); 
	printf("Dedupe trace: %s\n", argv[4]);
	printf("###################################################################\n");
	fclose(fp1);
	fclose(fp);

	FreeHashTable(ht_cache);
	deleteMinHeap(heap);
	time(&end);
	printf("###################################################################\n");
	printf("Total CPU time: %f s (wall time: %.0f s)\n",
	       ((double) (clock() - start)) / CLOCKS_PER_SEC, difftime(end, begin));
	printf("###################################################################\n");
	return 0;
}
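
The two-hash counting-Bloom-filter update in main() appears three times; below is a hedged refactoring sketch of that pattern. The helper name and parameter types are mine, and it consistently returns the larger of the two freshly incremented counts, whereas the inline version mixes pre- and post-increment values.

static long bf_increment(long *bf, long bf_size, dd_uint8_t *key, size_t len) {
	unsigned long h1 = hash_djb2(key, len) % bf_size;
	unsigned long h2 = hash_sdbm(key, len) % bf_size;
	long c1 = ++bf[h1];
	long c2 = ++bf[h2];
	return (c1 > c2) ? c1 : c2;   /* used as the chunk's heap priority */
}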
Example #8
// our main function; here, we demonstrate how to use some
// of the hash table functions
int main(int argc, char **argv) {
  ExampleValuePtr evp;
  HashTable ht;
  HTIter iter;
  HTKeyValue kv, old_kv;
  HTKey_t i;

  // allocate a hash table with 10,000 initial buckets
  ht = AllocateHashTable(10000);
  Verify333(ht != NULL);

  // insert 20,000 elements (load factor = 2.0)
  for (i = 0; i < 20000; i++) {
    evp = (ExampleValuePtr) malloc(sizeof(ExampleValue));
    Verify333(evp != NULL);
    evp->num = i;

    // make sure HT has the right # of elements in it to start
    Verify333(NumElementsInHashTable(ht) == (HWSize_t) i);

    // insert a new element
    kv.key = FNVHashInt64((HTValue_t)i);
    kv.value = (HTValue_t)evp;
    Verify333(InsertHashTable(ht, kv, &old_kv) == 1);

    // make sure hash table has right # of elements post-insert
    Verify333(NumElementsInHashTable(ht) == (HWSize_t) (i+1));
  }

  // look up a few values
  Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)100), &kv) == 1);
  Verify333(kv.key == FNVHashInt64((HTValue_t)100));
  Verify333(((ExampleValuePtr) kv.value)->num == 100);

  Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)18583), &kv) == 1);
  Verify333(kv.key == FNVHashInt64((HTValue_t)18583));
  Verify333(((ExampleValuePtr) kv.value)->num == 18583);

  // make sure non-existent value cannot be found
  Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)20000), &kv) == 0);

  // delete a value
  Verify333(RemoveFromHashTable(ht, FNVHashInt64((HTValue_t)100), &kv) == 1);
  Verify333(kv.key == FNVHashInt64((HTValue_t)100));
  Verify333(((ExampleValuePtr) kv.value)->num == 100);
  ExampleValueFree(kv.value);   // since we malloc'ed it, we must free it

  // make sure it's deleted
  Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)100), &kv) == 0);
  Verify333(NumElementsInHashTable(ht) == (HWSize_t) 19999);

  // loop through using an iterator
  i = 0;
  iter = HashTableMakeIterator(ht);
  Verify333(iter != NULL);

  while (HTIteratorPastEnd(iter) == 0) {
    Verify333(HTIteratorGet(iter, &kv) == 1);
    Verify333(kv.key != FNVHashInt64((HTValue_t)100));   // we deleted it

    // advance the iterator
    HTIteratorNext(iter);
    i++;
  }
  Verify333(i == 19999);

  // free the iterator
  HTIteratorFree(iter);

  // free the hash table
  FreeHashTable(ht, &ExampleValueFree);
  return 0;
}
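
A small cross-check tying this test to Example #3: 20,000 elements in 10,000 buckets is a load factor of 2.0, below the resize-at-3 threshold ResizeHashtable uses, so a table following that policy would never grow during this run.

#include <assert.h>

int main(void) {
  int num_elements = 20000, num_buckets = 10000;
  /* The same predicate ResizeHashtable checks before bailing out early. */
  assert(num_elements < 3 * num_buckets);   /* load factor 2.0 < 3 */
  return 0;
}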