Exemplo n.º 1
0
/*
 * Print every bucket of a hash set, in ascending bucket-index order.
 *
 * Calls printBucket() once for each index from 0 up to (but not
 * including) hashset->numberBuckets, passing the set, the index and the
 * output stream through unchanged.
 */
void printHashset(struct HashSet *hashset, FILE *file)
{
	int index = 0;

	/* The bound is re-read on every pass, exactly like a for-loop test. */
	while (index < hashset->numberBuckets) {
		printBucket(hashset, index, file);
		index++;
	}
}
Exemplo n.º 2
0
/***********************************************************************
* Suffix Array Induced Sorting (SAIS) is a linear time/space suffix
* array construction algorithm which competes with BPR2 for top
* time/space requirements.  TODO: reference paper here.
*
* Currently this code is being changed to enable testing of run removal
* on the code.  Run removal enforces stricter conditions on the input
* sequence which allows for some of the logic to be simplified.  This
* experimental approach is particularly useful for inputs with small
* alphabets in the general case, but definitionally saves on sequences
* which are known to have many runs of identical values in the sequence.
*
* @source       : The sequence to construct the suffix array on.  It is
*                 only ever read through runsRem, as
*                 source[runsRem.S[i]].
* @sourceLength : Not read by this function.
* @runsRem      : Index sequence into source; runsRem.size entries are
*                 processed.  Presumably the positions that survive run
*                 removal -- confirm against the caller.
* @alphabetSize : Number of buckets allocated.  Every symbol
*                 source[runsRem.S[i]] is used as a bucket index, so it
*                 must be strictly less than this value.
*                 NOTE(review): the original comment described this as
*                 "not actually alphabet size, but highest value seen in
*                 alphabet"; if that is literally true, the highest
*                 symbol indexes one past the end of the bucket arrays.
*                 Confirm with callers.
*
* Returns a sequence obtained from initSequence(runsRem.size) holding
* the contents of all buckets concatenated in ascending bucket order.
* All working storage allocated here with malloc/calloc is released
* before returning.
*
* The function prints a diagnostic and calls exit(1) if one of its
* internal sanity checks fails.
*
* Notes
* 0 = undefined, 1 = L, 2 = S, 3 = {LMS, M}
***********************************************************************/
sequence SAIS(const u8 *source, const size_t sourceLength, 
                        const sequence runsRem, const u8 alphabetSize){
//DECLARATIONS//////////////////////////////////////////////////////////
  sequence toReturn, sanityCheck;
  size_t **bucket;
  size_t **oldBucket;
  size_t i;
  size_t *bucketSize;
  size_t *bucketFrontCounter;
  size_t *bucketEndCounter;

  unsigned char *LMSandLS;

  (void)sourceLength; //not needed: all reads go through runsRem

  //Inputs shorter than two entries have a trivial answer.  They are
  //handled up front because the general code computes runsRem.size-1
  //and runsRem.size-2, which wrap around for these sizes and lead to
  //out-of-bounds accesses on LMSandLS.
  if(runsRem.size < 2){
    toReturn = initSequence(runsRem.size);
    toReturn.size = 0;
    if(runsRem.size == 1) toReturn.S[toReturn.size++] = 0;
    return toReturn;
  }

////INITIALIZATION//////////////////////////////////////////////////////
  LMSandLS  = malloc(sizeof(*LMSandLS) * runsRem.size);
  bucket    = malloc(sizeof(*bucket) * alphabetSize);
  oldBucket = malloc(sizeof(*oldBucket) * alphabetSize);
  bucketSize = calloc(alphabetSize, sizeof(*bucketSize));
  bucketFrontCounter = calloc(alphabetSize, sizeof(*bucketFrontCounter));
  bucketEndCounter = calloc(alphabetSize, sizeof(*bucketEndCounter));
  //sanityCheck records every index written during one pass so that a
  //second write of the same index can be detected.
  sanityCheck = initSequence(runsRem.size);
  memset(sanityCheck.S, 0, sizeof(*sanityCheck.S) * sanityCheck.size);
  sanityCheck.size = 0;

  /*prescan for buckets************************************************/
  //calculate bucket sizes: one slot per occurrence of each symbol
  for(i = 0; i < runsRem.size; i++) bucketSize[source[runsRem.S[i]]]++;

  //allocate each bucket and its previous-iteration copy, zero-filled
  for(i = 0; i < alphabetSize; i++){
    bucket[i] = calloc(bucketSize[i], sizeof(**bucket));
    oldBucket[i] = calloc(bucketSize[i], sizeof(**oldBucket));
  }

#ifdef DEBUG
  //first place where bucket data can be printed
  fprintf(stderr, "%lu\n", runsRem.size);
  printBucket(bucket, alphabetSize, bucketSize);
#endif


//OPERATION/////////////////////////////////////////////////////////////
  /*set up L, S, and LMS metadata**************************************/
  /*The paper stipulates an additional universally minimal character
   * which is definitionally LMS, but here it is simulated.*/

  //Assign characters' values right to left (end to beginning) for L, S,
  //and LMS.  The last position is always L; every other position is L
  //when its symbol is greater than its right neighbour's and S
  //otherwise (equal neighbours fall into the S branch).
  size_t loopUntil = runsRem.size - 2;
  LMSandLS[runsRem.size-1] = _L_;
  for(i = loopUntil; i != ((size_t)0)-1; i--)
    LMSandLS[i] = source[runsRem.S[i]] > source[runsRem.S[i+1]] ? _L_ : _S_;

  //Walk left to right: skip a run of L, relabel the first non-L
  //position as LMS, skip the S positions that follow, repeat.
  //Positions at or beyond loopUntil are never relabelled.
  i=0;
  while(1){
    while(i < loopUntil && LMSandLS[i] == _L_) i++;
    if(i >= loopUntil) break;
    LMSandLS[i++] = _LMS_;
    while(i < loopUntil && LMSandLS[i] == _S_) i++;
    if(i >= loopUntil) break;
  }

#ifdef DEBUG
  printLMSandLS(LMSandLS, runsRem.size);

  fprintf(stderr, "\n\nAdding to buckets\n\n");
#endif

/*PRIMEER***************************************Add entries to buckets*/
  //This is supposed to prepare the data to be induce sorted.

  memcpy(bucketEndCounter, bucketSize, sizeof(*bucketEndCounter) * alphabetSize);

  //The last position is pinned to slot 0 of its symbol's bucket.
  bucket[source[runsRem.S[runsRem.size-1]]][0] = runsRem.size-1;
  //bucketFrontCounter[bucketLocation]++;

  //LMS type right-to-left scan -- Add LMS entries to the ends of
  //various buckets going from right to left.  The result is partially
  //full buckets with LMS entries in ascending order.
  for(i = runsRem.size-1; i != ((size_t)0)-1; i--){
    if(LMSandLS[i] == _LMS_){
      const unsigned char symbol = source[runsRem.S[i]];
      bucket[symbol][--bucketEndCounter[symbol]] = i;
    }
  }


/*LOOP OVER UNTIL COMPLETE*********************************************/
//TODO: there's some really ugly ways to make this run faster.
  //Bounds checking was used in place of checking for negative values so
  //that -1 didn't have to be used, allowing architecturally maximal
  //string length.  Please refer to the paper for the reasoning behind
  //the two scans.
  //The two scans are repeated until a full pass leaves every bucket
  //identical to its state after the previous pass.
  char goOn;
  do{
    goOn = 0;
    memset(sanityCheck.S, 0, sizeof(*sanityCheck.S) * runsRem.size);
    sanityCheck.size = 0;

    //step 3 of setting up SA
    //Right-to-left scan: buckets from last to first, slots from last to
    //first.  Predecessors typed L or LMS are written to the END of
    //their symbol's bucket.
    memcpy(bucketEndCounter, bucketSize, sizeof(*bucketEndCounter) * alphabetSize);
    for(i = alphabetSize-1; i != ((size_t)0)-1 ; i--){
      for(size_t j = bucketSize[i]-1; j != ((size_t)0)-1; j--){
        //A zero entry is either an unfilled slot (buckets start
        //zeroed) or index 0, which has no predecessor.
        if(!bucket[i][j]) continue;
        const size_t target = bucket[i][j]-1;

        if(LMSandLS[target] == _L_ || LMSandLS[target] == _LMS_){
          char violation = 0;
          //Writing slot 0 of the last position's bucket would clobber
          //the pinned final index.
          if(source[runsRem.S[target]] == source[runsRem.S[runsRem.size-1]] && bucketEndCounter[source[runsRem.S[target]]]-1 == 0){
            violation = 1;
            printf("Trying to write over something you're blatently not supposed to write over.\n");
          }
          //Each index may be written at most once per pass.
          for(size_t k = 0; k < sanityCheck.size; k++)
            if(sanityCheck.S[k] == target){
              violation=1;
              printf("trying to write a %zu a second time.\n", target);
            }

          if(violation){
            printf("KILL YO SELF in r to l\n");
            exit(1);
          }
          const unsigned char symbol = source[runsRem.S[target]];
          bucket[symbol][--bucketEndCounter[symbol]] = target;
          sanityCheck.S[sanityCheck.size++] = target;
        }
      }
    }

#ifdef DEBUG
    printBucket(bucket, alphabetSize, bucketSize);
#endif

    //Left-to-right scan: buckets and slots in ascending order.
    //Predecessors typed S are written to the FRONT of their symbol's
    //bucket.
    memset(bucketFrontCounter, 0, sizeof(*bucketFrontCounter) * alphabetSize);
    //bucket[source[runsRem.S[runsRem.size-1]]][0] = runsRem.size-1;
    bucketFrontCounter[source[runsRem.S[runsRem.size-1]]] = 1;//protect last index

    for(size_t b = 0; b < alphabetSize; b++){
      for(size_t j = 0; j < bucketSize[b]; j++){
        if(!bucket[b][j]) continue;
        const size_t target = bucket[b][j]-1;

        if(LMSandLS[target] == _S_){
          char violation = 0;
          //NOTE(review): this guard reads bucketEndCounter, which this
          //scan never updates (it writes through bucketFrontCounter).
          //It looks copied from the scan above -- confirm whether that
          //is intended.
          if(source[runsRem.S[target]] == source[runsRem.S[runsRem.size-1]] && bucketEndCounter[source[runsRem.S[target]]]-1 == 0){
            violation = 1;
            printf("Trying to write over something you're blatently not supposed to write over.\n");
          }
          for(size_t k = 0; k < sanityCheck.size; k++)
            if(sanityCheck.S[k] == target){
              violation=1;
              printf("trying to write a %zu a second time.\n", target);
            }

          if(violation){
            printf("KILL YO SELF in l to r\n");
            exit(1);
          }
          const unsigned char symbol = source[runsRem.S[target]];
          bucket[symbol][bucketFrontCounter[symbol]++] = target;
          sanityCheck.S[sanityCheck.size++] = target;
        }
      }
    }

    //Convergence test: at the first bucket that differs from the
    //previous pass, snapshot that bucket and all later ones, then
    //request another pass.  Empty buckets are skipped so that memcmp
    //and memcpy never receive a zero-length (possibly NULL) buffer.
    for(i = 0; i < alphabetSize; i ++)
      if(bucketSize[i] && memcmp(bucket[i], oldBucket[i], bucketSize[i] * sizeof(**bucket))){
        for(size_t j = i; j < alphabetSize; j++)
          if(bucketSize[j])
            memcpy(oldBucket[j], bucket[j], sizeof(**bucket) * bucketSize[j]);
        goOn = 1;
        break;
      }

  }while(goOn);

#ifdef DEBUG
    printBucket(bucket, alphabetSize, bucketSize);
#endif

//CLEAN UP//////////////////////////////////////////////////////////////

  free(LMSandLS);

  //Flatten the buckets, in ascending bucket order, into the result.
  toReturn = initSequence(runsRem.size);
  toReturn.size = 0;
  for(i = 0; i < alphabetSize; i++)
    for(size_t j = 0; j < bucketSize[i]; j++)
      toReturn.S[toReturn.size++] = bucket[i][j];

  //Release all working storage allocated above.
  for(i = 0; i < alphabetSize; i++){
    free(bucket[i]);
    free(oldBucket[i]);
  }
  free(bucket);
  free(oldBucket);
  free(bucketSize);
  free(bucketFrontCounter);
  free(bucketEndCounter);
  //NOTE(review): sanityCheck came from initSequence() and is not
  //released here because its matching deallocator is not visible from
  //this function; release it with whatever pairs with initSequence().

  return toReturn;
}