/*
 * Self-test for getCodeword() using the `dict` dictionary that is declared
 * outside this function.
 *
 * Two lookups are performed:
 *   1. the one-symbol string {65} is expected to map to code 65;
 *   2. the four-symbol string {0, 1, 2, 3} is expected to map to code 256
 *      (presumably because the caller has already added that string to
 *      `dict` -- confirm against the test setup).
 *
 * Returns 0 when the second lookup is correct and 1 when it is not. A
 * mismatch on the first lookup is printed but intentionally does not affect
 * the return value (the early return was disabled in the original code).
 */
int test_getCodeword(){
    int result = 0;

    MyString* S = newString();
    appendString(S, 65);
    int code = getCodeword(dict, S);
    if(code != 65){
        printf("FAIL:\tgetCodeword: string \"65\" should correspond to code 65, was %d.\n", code);
        /* Not fatal: keep going so the second lookup is still exercised. */
    }
    deleteString(S);

    S = newString();
    appendString(S, 0);
    appendString(S, 1);
    appendString(S, 2);
    appendString(S, 3);
    code = getCodeword(dict, S);
    if(code != 256){
        printf("FAIL:\tgetCodeword: string \"0123\" should correspond to code 256, was %d.\n", code);
        result = 1;
    }

    /* Release the second string on every exit path; it used to be leaked. */
    deleteString(S);
    return result;
}
// Fill the global neighborLookup table for every word of length wordLength
// that starts inside the query (PSSMatrix.queryCodes).
//
// For each query position the word's codeword is computed; if the table entry
// for that codeword currently records zero neighbours, the neighbours are
// gathered with wordLookupSM_getNeighbours() and their codewords are copied
// into a freshly allocated array stored in the entry.
void neighbourLookup_build(struct PSSMatrix PSSMatrix, struct scoreMatrix scoreMatrix, int4 wordLength)
{
    int4 position;

    // Scratch buffer reused for every query position; one slot per possible word.
    struct neighbour *scratch =
        (struct neighbour *)global_malloc(sizeof(struct neighbour) * proteinLookup_numWords);

    for (position = 0; position < PSSMatrix.length - wordLength + 1; position++) {
        int4 word = getCodeword(PSSMatrix.queryCodes + position, wordLength);
        int4 found;
        int4 i;

        // Only entries that still record zero neighbours are (re)computed.
        if (neighborLookup[word].numNeighbours != 0)
            continue;

        found = 0;
        wordLookupSM_getNeighbours(PSSMatrix.queryCodes, scoreMatrix, position,
                                   &found, scratch);

        neighborLookup[word].numNeighbours = found;
        neighborLookup[word].neighbours = (int4 *)global_malloc(sizeof(int4) * found);

        // Keep only the codeword of each neighbour.
        for (i = 0; i < found; i++)
            neighborLookup[word].neighbours[i] = scratch[i].codeword;
    }

    free(scratch);
}
// Get all the neighbours for given query window void wordLookupSM_getNeighbours(char *codes, struct scoreMatrix scoreMatrix, int4 queryPosition, int4 *numNeighbours, struct neighbour *neighbours) { int4 codeword, score = 0, count = queryPosition, containsWild = 0; // Get score for aligning the best match codes to the query window while (count < queryPosition + parameters_wordSize) { if (codes[count] >= encoding_numRegularLetters) containsWild = 1; // score += PSSMatrix.matrix[count][PSSMatrix.bestMatchCodes[count]]; score += scoreMatrix.matrix[codes[count]][codes[count]]; // printf("%d %c %d\n", count, encoding_getLetter(codes[count]), // scoreMatrix.matrix[codes[count]][codes[count]]); count++; } // printf("score: %d\n", score); // If a word containing wildcards only consider nearest neighbour if high // scoring if (!containsWild || score >= parameters_T) { // Convert query word codes to codeword codeword = getCodeword(codes + queryPosition, parameters_wordSize); // Automatically add the query word itself to list of neighbours neighbours[*numNeighbours].codeword = codeword; neighbours[*numNeighbours].score = score; neighbours[*numNeighbours].position = 0; (*numNeighbours)++; // Recursively find remaining neighbours wordLookupSM_findNeighbours(codes, scoreMatrix, queryPosition, numNeighbours, neighbours); } }
int main(int argc, char* argv[]){ if ( argc != 3 ) // argc should be 3 for correct execution { printf( "usage: %s input_filename output_filename\n", argv[0] ); } //read text file into input array OriginalData* orig = readOriginalData(argv[1]); //allocate the same amount of compressed data (lets hope it is enough) CompressedData* compressed = newCompressedData(orig->dataLength, 8); //initializez the dictionary and the symbols in the dictionary Dictionary* dict = newDictionary(8); initDictionary(dict); printf("Compressing...\n"); writeToCompressedData(compressed, dict->clearCode); MyString* S = newString(); int codeword; int dictReturn; while(hasNextSymbol(orig)){ appendString(S, (uint8_t) nextSymbol(orig)); if(getCodeword(dict, S) != -1){ ; } else { //output S minus the last char S->length--; codeword = getCodeword(dict, S); writeToCompressedData(compressed, codeword); S->length++; //great for debugging : // printf("wrote a %d bit codeword: %d\n", compressed->bitWidth, codeword); // printf("Saved new word: %d \t", dict->wordCount); // printString(S); // printf("\n"); //add S to the dictionary dictReturn = addToDictionary(dict, S); if(dictReturn == -2){ //for debugging : // printf("--------------------------Increased Bit Width\n"); //increase the bit width by one compressed->bitWidth++; dict->bitWidth *= 2; } if(dictReturn == -1){ //dictionary is full, clear it and write a clearCode clearDictionary(dict); writeToCompressedData(compressed, dict->clearCode); compressed->bitWidth = compressed->rootBitWidth+1; //step one symbol back in original data orig->nextSymbol--; orig->dataLeft++; //make S empty S->length = 0; }else{ //"delete" the last character (it will still be in the data) S->length--; S->data[0] = S->data[S->length];//make S start with the "deleted" character S->length = 1; //make the length of S 1, now S is the earlier last character } } } //write the last codeword codeword = getCodeword(dict, S); writeToCompressedData(compressed, codeword); //write EOI 
writeToCompressedData(compressed, dict->endOfInformation); writeCompressedDataToFile(compressed, argv[2]); printf("File %s compressed succesfully to %s\n", argv[1], argv[2]); return 0; }