/*
 * Print every bucket of a hash set.
 *
 * Walks the bucket indices from 0 up to (but not including)
 * hashset->numberBuckets and hands each index to printBucket together
 * with the destination stream.
 *
 * @hashset : the hash set whose buckets are printed.
 * @file    : stream passed through unchanged to printBucket.
 */
void printHashset(struct HashSet *hashset, FILE *file) {
    int idx = 0;

    /* The bucket count is re-read on every pass, as in a plain for loop. */
    while (idx < hashset->numberBuckets) {
        printBucket(hashset, idx, file);
        idx++;
    }
}
/*********************************************************************** * Suffix Array Induced Sorting (SAIS) is a linear time/space suffix * array construction algorithm which competes with BPR2 for top * time/space requirements. TODO: reference paper here. * * Currently this code is being changes to enable testing of run-removal * on the code. Run removal enforces stricter conditions on the input * sequence which allows for some of the logic to be simplified. This * experimental approach is particularly useful for inputs with small * alphabets in the general case, but definitionally saves on sequences * which are known to have many runs of identical values in the sequence. * * @source :The sequence to construct the suffix array on. * @runsRem : * * @alphabetSize : not actually alphabet size, but highest value seen in * alphabet. Might change in future. * * * Notes * 0 = undefined, 1 = L, 2 = S, 3 = {LMS, M} ***********************************************************************/ sequence SAIS(const u8 *source, const size_t sourceLength, const sequence runsRem, const u8 alphabetSize){ //DECLARATIONS////////////////////////////////////////////////////////// sequence toReturn, sanityCheck; size_t **bucket; size_t **oldBucket; size_t i; size_t *bucketSize; size_t *bucketFrontCounter; size_t *bucketEndCounter; unsigned char *LMSandLS; ////INITIALIZATION////////////////////////////////////////////////////// LMSandLS = malloc(sizeof(unsigned char) * runsRem.size); bucket = malloc(sizeof(*bucket) * alphabetSize); oldBucket = malloc(sizeof(*oldBucket) * alphabetSize); bucketSize = calloc(sizeof(size_t)* alphabetSize, 1); bucketFrontCounter = calloc(sizeof(size_t)* alphabetSize, 1); bucketEndCounter = calloc(sizeof(size_t)* alphabetSize, 1); sanityCheck = initSequence(runsRem.size); memset(sanityCheck.S, 0, sizeof(*sanityCheck.S) * sanityCheck.size); sanityCheck.size = 0; /*prescan for buckets************************************************/ //calculate bucket sizes for(i 
= 0; i < runsRem.size; i++) bucketSize[source[runsRem.S[i]]]++; //calculate bucket start and stops for(short i = 0; i < alphabetSize; i++){ bucket[i] = calloc(sizeof(size_t), bucketSize[i]); oldBucket[i] = calloc(sizeof(size_t), bucketSize[i]); } #ifdef DEBUG //first place where bucket data can be printed fprintf(stderr, "%lu\n", runsRem.size); printBucket(bucket, alphabetSize, bucketSize); #endif //OPERATION///////////////////////////////////////////////////////////// /*set up L, S, and LMS metadata**************************************/ /*The paper stipulates an additional universally minimal character * which is definitionally LMS, but here it is simulated.*/ //Assign characters' values right to left (end to beginning) for L, S, //and LMS size_t loopUntil = runsRem.size - 2; LMSandLS[runsRem.size-1] = _L_; for(i = loopUntil; i != ((size_t)0)-1; i--) LMSandLS[i] = source[runsRem.S[i]] > source[runsRem.S[i+1]] ? _L_ : _S_; i=0; while(1){ while(i < loopUntil && LMSandLS[i] == _L_) i++; if(i >= loopUntil) break; LMSandLS[i++] = _LMS_; while(i < loopUntil && LMSandLS[i] == _S_) i++; if(i >= loopUntil) break; } #ifdef DEBUG printLMSandLS(LMSandLS, runsRem.size); fprintf(stderr, "\n\nAdding to buckets\n\n"); #endif /*PRIMEER***************************************Add entries to buckets*/ //This is supposed to prepare the data to be induce sorted. memcpy(bucketEndCounter, bucketSize, sizeof(*bucketEndCounter) * alphabetSize); bucket[source[runsRem.S[runsRem.size-1]]][0] = runsRem.size-1; //bucketFrontCounter[bucketLocation]++; //LMS type right-to-left scan -- Add LMS entries to the ends of //various buckets going from right to left. The result is partially //full buckets with LMS entries in acending order. 
for(size_t i = runsRem.size-1; i != ((size_t)0)-1; i--){ if(LMSandLS[i] == _LMS_){ const unsigned char target = source[runsRem.S[i]]; bucket[target][--bucketEndCounter[target]] = i; } } /*LOOP OVER UNTIL COMPLETE*********************************************/ //TODO: there's some really ugly ways to make this run faster. //L type left-to-right scan, not exactly a direct reasoning for this, //please refer to the paper. Bounds checking was used in place of //checking for negative values so that -1 didn't have to be used, //allowing architentually maximal string length. char goOn; do{ goOn = 0; memset(sanityCheck.S, 0, sizeof(*sanityCheck.S) * runsRem.size); sanityCheck.size = 0; //step 3 of setting up SA //L type right to left scan. memcpy(bucketEndCounter, bucketSize, sizeof(*bucketEndCounter) * alphabetSize); for(i = alphabetSize-1; i != ((size_t)0)-1 ; i--){ for(size_t j = bucketSize[i]-1; j != ((size_t)0)-1; j--){ if(!bucket[i][j]) continue; const size_t target = bucket[i][j]-1; if(LMSandLS[target] == _L_ || LMSandLS[target] == _LMS_){ char KILLYOSELF = 0; if(source[runsRem.S[target]] == source[runsRem.S[runsRem.size-1]] && bucketEndCounter[source[runsRem.S[target]]]-1 == 0){ KILLYOSELF = 1; printf("Trying to write over something you're blatently not supposed to write over.\n"); } for(size_t k = 0; k < sanityCheck.size; k++) if(sanityCheck.S[k] == target){ KILLYOSELF=1; printf("trying to write a %lu a second time.\n", target); } if(KILLYOSELF){ printf("KILL YO SELF in r to l\n"); exit(1); } const unsigned char target2 = source[runsRem.S[target]]; bucket[target2][--bucketEndCounter[target2]] = target; sanityCheck.S[sanityCheck.size++] = target; } } } #ifdef DEBUG printBucket(bucket, alphabetSize, bucketSize); #endif //S type left to right scan. 
memset(bucketFrontCounter, 0, sizeof(*bucketFrontCounter) * alphabetSize); //bucket[source[runsRem.S[runsRem.size-1]]][0] = runsRem.size-1; bucketFrontCounter[source[runsRem.S[runsRem.size-1]]] = 1;//protect last index for(int i = 0; i < alphabetSize; i++){ for(size_t j = 0; j < bucketSize[i]; j++){ if(!bucket[i][j]) continue; const size_t target = bucket[i][j]-1; if(LMSandLS[target] == _S_){ char KILLYOSELF = 0; if(source[runsRem.S[target]] == source[runsRem.S[runsRem.size-1]] && bucketEndCounter[source[runsRem.S[target]]]-1 == 0){ KILLYOSELF = 1; printf("Trying to write over something you're blatently not supposed to write over.\n"); } for(size_t k = 0; k < sanityCheck.size; k++) if(sanityCheck.S[k] == target){ KILLYOSELF=1; printf("trying to write a %lu a second time.\n", target); } if(KILLYOSELF){ printf("KILL YO SELF in l to r\n"); exit(1); } const unsigned char target2 = source[runsRem.S[target]]; bucket[target2][bucketFrontCounter[target2]++] = target; sanityCheck.S[sanityCheck.size++] = target; } } } for(i = 0; i < alphabetSize; i ++) if(memcmp(bucket[i], oldBucket[i], bucketSize[i] * sizeof(size_t))){ for(size_t j = i; j < alphabetSize; j++) if(bucketSize[j]) memcpy(oldBucket[j], bucket[j], sizeof(*bucket) * bucketSize[j]); goOn = 1; break; } }while(goOn); #ifdef DEBUG printBucket(bucket, alphabetSize, bucketSize); #endif //CLEAN UP////////////////////////////////////////////////////////////// free(LMSandLS); toReturn = initSequence(runsRem.size); toReturn.size = 0; for(i = 0; i < alphabetSize; i++) for(size_t j = 0; j < bucketSize[i]; j++) toReturn.S[toReturn.size++] = bucket[i][j]; return toReturn; }