/** Opens appropriate files and handles them @param argc number of command line arguments @param argv array of command line arguments @return the exit status of the program */ int main( int argc, char *argv[] ) { char *wordFile = "words.txt"; FILE *input; FILE *output; if ( argc < 3 || argc > 4 ) { fprintf( stderr, "usage: pack <input.txt> <compressed.raw> [wordfile.txt]\n" ); exit( 1 ); } input = fopen( argv[ 1 ], "rb" ); if ( !input ) { printf( "usage: readFile <input-file>\n" ); exit( 1 ); } output = fopen( argv[ 2 ], "rw" ); if ( !output ) { printf( "usage: readFile <input-file>\n" ); exit( 1 ); } if ( argc == 4 ) { wordFile = argv[ 3 ]; } WordList *wordList = readWordList( wordFile ); #ifdef DEBUG // Report the entire contents of the word list, once it's built. printf( "---- word list -----\n" ); for ( int i = 0; i < wordList->len; i++ ) printf( "%d == %s\n", i, wordList->words[ i ] ); printf( "--------------------\n" ); #endif // ... // Read the contents of the whole file into one big buffer. This could be more // efficient, but it simplifies the rest of the program. char *buffer = readFile( input ); // Write out codes for everything in the buffer. int pos = 0; PendingBits pending = { 0, 0 }; while ( buffer[ pos ] ) { // Get the next code. int code = bestCode( wordList, buffer + pos ); #ifdef DEBUG printf( "%d <- %s\n", code, wordList->words[ code ] ); #endif // Write it out and move ahead by the number of characters we just encoded. readCode( &pending, input ); pos += strlen( wordList->words[ code ] ); } // Write out any remaining bits in the last, partial byte. flushBits( &pending, output ); // ... fclose( input ); fclose( output ); return EXIT_SUCCESS; }
// Compress STDIN into stream of codes // First byte sent in the form [MAXBITS (6 bits)][E_FLAG][P_FLAG] // // The only special code sent is ESCAPE for -e; // everything else is derived in decode. // // Pruning performed as soon as the table is full int encode(int MAXBITS, int E_FLAG, int P_FLAG) { // Send option args encoded as: // MAXBITS: 6 bits (since max value is 20) // E_FLAG: 1 bit // P_FLAG: 1 bit putBits(6, MAXBITS); putBits(1, E_FLAG); putBits(1, P_FLAG); int next_code = 0; // == number of codes assigned == # elts in ARRAY int nBits = 1; // #bits required to send NEXT code if (E_FLAG) next_code = 2; // already assigned 0 to QUIT // 1 to ESCAPE // ============== INITIALIZE TRIE ================ Trie t = createT(); if (!E_FLAG) { // initialize all one-char strings for (int K = 0; K < 256; K++) insertT(t, K, next_code++, 0); nBits = 8; } // ================ ENCODE INPUT ================= Trie C = t; // last node visited int K; while ((K = getchar()) != EOF) { Trie child = getT(C, K); if (child != NULL) { // increment NAP and go down trie sawT(child); C = child; } else { // ============ PUTBITS ========================== if (C == t) { // new 1-char string if (!E_FLAG) DIE_FORMAT("E_FLAG false, yet (EMPTY, K=%d) not in table\n", K); putBits(nBits, ESCAPE); putBits(CHAR_BIT, K); } else { // Output code C putBits(nBits, getCodeT(C)); } // =========== INSERT ============================== // insert new code if table not full if (next_code < (1 << MAXBITS)) { insertT(C, K, next_code++, 1); } // =========== UPDATE NBITS ======================= // Prune as soon as last slot taken if (next_code == (1 << MAXBITS)) { if (P_FLAG) { next_code = prune(&t, E_FLAG); nBits = get_nbits(next_code); } else ; } // Increase NBITS only when #codes assigned // exceeds it else if (next_code > (1 << nBits)) nBits++; // ============ RESET C ===== if (C == t) // new single-char, so skip continue; else { C = getT(t, K); if (C == NULL) { // (EMPTY, K) not in table if (!E_FLAG) DIE_FORMAT("E_FLAG false, yet (EMPTY, K=%d) not in table\n", K); ungetc(K, stdin); // single-char on next insert C = t; } else sawT(C); // increment NAP } } } // Put leftover known prefix if (C != t) { putBits(nBits, getCodeT(C)); } flushBits(); destroyT(t); return 0; }
int encode(int args[]) { //freopen("hash.h", "r", stdin); Hash *h = malloc(sizeof(Hash)); int size = 1 << (args[1]+1); unsigned int time =1; initialize(h, size, args[3], time); int c=EMPTY, k, latestCode; int bits=(args[3])?3:9; printf("%d %d %d|", args[1], args[2], args[3]); while((k = getchar()) != EOF) if(findInHash(h, c , (char)k) != EMPTY) c = findInHash(h, c , (char)k); else { if(args[3] && findInHash(h, EMPTY, (char)k) == EMPTY) //Time to send escape sequence { if (c!=EMPTY) { putBits(bits, c); h->shadowArray[c]->time=(++time); } putBits(bits, ESCAPE); putBits(CHAR_BIT, k); if(h->numElements < ( 1 << args[1])) { latestCode = insert(h, EMPTY, (char)k, 1); if(latestCode>= (1<<(bits))) { putBits(bits, INCR); bits++; } } c = EMPTY; } else { putBits(bits, c); h->shadowArray[c]->time=(++time); if(h->numElements < ( 1 << args[1])) { latestCode = insert(h, c, (char)k, 1); if(latestCode>= (1<<(bits))) { putBits(bits, INCR); bits++; } } c = findInHash(h, EMPTY, (char)k); } if(args[2] && h->numElements == ( 1 << args[1])-1) { putBits(bits, RESET); putBits(bits, c); prune(args, &h, time); //bitchange c=EMPTY; } } if (c!=EMPTY) putBits(bits, c); flushBits(); //printHash(h); freeHash (h); return 0; }
int main(int argc, char **argv) { clock_t begin, end; double time_spent; begin = clock(); int NUM_FINDCODE_CALLS = 0; int NUM_HASHCELLS_VISITED = 0; int c = 0; //char in (pref, char) int prefix = EMPTY_CODE; int index = EMPTY_CODE; int MAXBITS = 15; //default char DUMP_TO[PATH_MAX]; char INIT_FROM[PATH_MAX]; int PRUNE = -1; Table *stringTable = initStringTable(MAXBITS); //setting params from cmd line argv's STARTING FROM 2 BC ENCODE/DECODE for (int i = 2; i < argc; ++i) { if (strcmp(argv[i], "-m") == 0) { //ERROR CHECK FOR IF NEXT ARGV IS NOT INTEGER STRING??? MAXBITS = atoi(argv[i + 1]); i++; } else if (strcmp(argv[i], "-o") == 0) { if(i + 1 == argc) { fprintf(stderr, "usage: encode [-m MAXBITS | -o NAME | -i NAME | -p USED]* OR decode [-o NAME]*\n"); exit(EXIT_FAILURE); //EDGE CASE: IF -O -O HI, THEN DO WE DUMP TO '-O' AND SIGNAL ERROR ON HI? OR DO WE DUMP TO -O, AND OVERRIDE WITH DUMPING TO 'HI'? } strncpy(DUMP_TO, argv[i + 1], strlen(argv[i + 1])); i++; } else if (strcmp(argv[i], "-i") == 0) { if(i + 1 == argc) { fprintf(stderr, "usage: encode [-m MAXBITS | -o NAME | -i NAME | -p USED]* OR decode [-o NAME]*\n"); exit(EXIT_FAILURE); } strncpy(INIT_FROM, argv[i + 1], strlen(argv[i + 1])); i++; } else if (strcmp(argv[i], "-p") == 0) { if(i + 1 == argc) { fprintf(stderr, "usage: encode [-m MAXBITS | -o NAME | -i NAME | -p USED]* OR decode [-o NAME]*\n"); exit(EXIT_FAILURE); } PRUNE = atoi(argv[i + 1]); //test if not an integer; what if prune is 0? w i++; if(PRUNE){} } else { fprintf(stderr, "usage: encode [-m MAXBITS | -o NAME | -i NAME | -p USED]* OR decode [-o NAME]*\n"); exit(EXIT_FAILURE); } } fprintf(stderr, "argv[0]: %s\n", argv[0]); if(strcmp(argv[1], "encode") == 0) { //ENCODE while((c = getchar()) != EOF) { index = findCode(prefix, c, stringTable, &NUM_FINDCODE_CALLS, &NUM_HASHCELLS_VISITED); if (index != -1) //(pref, char) found in stringTable { prefix = index; (stringTable->table[prefix]->useCount)++; continue; } else { //not found in stringTable putBits(stringTable->nBits, prefix); insertToTable(prefix, c, &stringTable, PRUNE); prefix = findCode(0, c, stringTable, &NUM_FINDCODE_CALLS, &NUM_HASHCELLS_VISITED); //start search again at finalChar of newly found sequence (stringTable->table[prefix]->useCount)++; } } putBits(stringTable->nBits, prefix); flushBits(); // printTable(stringTable, "encode"); // fprintf(stderr, "\nNUM_FINDCODE_CALLS: %d\nNUM_HASHCELLS_VISITED: %d\nAVG_SEARCH_RATIO: %d\n", NUM_FINDCODE_CALLS, NUM_HASHCELLS_VISITED, NUM_HASHCELLS_VISITED / NUM_FINDCODE_CALLS); } else if(strcmp(argv[1], "decode") == 0) { //DECODE char ENCODE_OR_DECODE[25] = "decode"; int oldIndex = 0, firstCharOfNewIndex = 0; //used to build stringTable, 1 step behind encode int newIndex = 0; int cascadingIndex = 0; //used to print a code string recursively int kwkwk = 0; //used in kwkwk case int prunedThisPass = 0; int HIT_THE_LIMIT = 0; while( (newIndex = cascadingIndex = getBits(stringTable->nBits)) != EOF) { //nBits was incremented just before to accomodate increased nBits in ENCODE that hasn't been automatically detected in DECODE //and we decrement it now bc it will be automatically incremented in INSERTTABLE //but if == MAXBITS, then we did not artificially increment nBits last round if(stringTable->last + 1 == stringTable->size && !HIT_THE_LIMIT ) {//2nd condition happens when table is already maxSize, in which case no more doubling or pruning happens if(!prunedThisPass) //when pruning results in an at-limit table stringTable->nBits--; if(stringTable->size != 1<<stringTable->nBits){ fprintf(stderr, "size: %d, nBits: %d\n", stringTable->size, stringTable->nBits); exit(EXIT_FAILURE); } else { } } if(newIndex > stringTable->last) //newIndex is an unknown code (kwkwk abnormal case) { kwkwk = 1; if(newIndex < stringTable->size) stringTable->table[newIndex]->useCount++; //useCount usually incremented in printRecursive function, which will not receive newIndex in this case cascadingIndex = oldIndex; } printRecursive(&cascadingIndex, stringTable); if (kwkwk == 1) { printf("%c", firstCharOfNewIndex); //output should be oldCode, oldCode, firstCharOfOldCode kwkwk = 0; } if(cascadingIndex >= stringTable->size) { fprintf(stderr, "cascadingIndex: %d, size: %d\n", cascadingIndex, stringTable->size); exit(EXIT_FAILURE); } firstCharOfNewIndex = stringTable->table[cascadingIndex]->c; //finalK = char(c) if(oldIndex != 0 && !prunedThisPass) { //every time except first time, and after pruning, add new code to table insertToTable(oldIndex, firstCharOfNewIndex, &stringTable, PRUNE); } prunedThisPass = 0; oldIndex = newIndex; if(stringTable->last + 1 >= stringTable->size) { //decode is one step behind encode; deocde inserts new code every time it reads a code, starting from 2nd code //encode adds a code at every code starting from 1st code. CURRENT CODE + firstChar of NEXT CODE was the last code //encode tried to add to table and PRUNED or DOUBLE'd //=>PRUNING or DOUBLING:: //if DOUBLING: next code will increment nBits //if PRUNING: next code is encoded from a PRUNED table, not CURRENT table (and we don't insert next round) if(stringTable->nBits != stringTable->MAXBITS) { if(stringTable->nBits == 8){ exit(EXIT_FAILURE); } stringTable->nBits++; } else { //next insert exceeds table size, and table size is MAXBITS size, so PRUNE HIT_THE_LIMIT = 1; if(PRUNE != -1) { prune(&stringTable, PRUNE, ENCODE_OR_DECODE); prunedThisPass = 1; if (stringTable->last + 1 == stringTable->size) { fprintf(stderr, "HERE!!! last: %d, size: %d\n", stringTable->last, stringTable->size); char temp[10] = "encode"; printTable(stringTable, temp); } if(stringTable->last + 1 != 1<<stringTable->MAXBITS) HIT_THE_LIMIT = 0; } } } } // printTable(stringTable, "decode"); } moses(stringTable); end = clock(); time_spent = (double)(end - begin) / CLOCKS_PER_SEC; fprintf(stderr, "time_spent: %f\n", time_spent); }
void encode(FILE* infile, FILE* outfile, unsigned int maxBits, unsigned int window, bool eFlag) { stringTable* table = stringTableNew(maxBits, eFlag); pruneInfo* pi = pruneInfoNew(maxBits); // write maxBits, window, and eFlag to outfile putBits(NBITS_MAXBITS, maxBits, outfile); putBits(NBITS_WINDOW, window, outfile); eFlag ? putBits(NBITS_EFLAG, 1, outfile) : putBits(NBITS_EFLAG, 0, outfile); // the string table is populated with (c, k) pairs; c is the code for the // prefix of the entry, k is the char appended to the end of the prefix unsigned int c = EMPTY_PREFIX; int k; unsigned char nbits = (eFlag) ? 2 : 9; // number of bits sent per code while((k = fgetc(infile)) != EOF) { tableElt* elt = stringTableHashSearch(table, c, k); if(elt) { c = elt->code; } else if(c == EMPTY_PREFIX) { // we're escaping k, so leave the prefix empty escapeChar(table, pi, k, &nbits, outfile); table = checkPrune(table, pi, window, &c, &nbits, outfile); } else { putBits(nbits, c, outfile); pruneInfoSawCode(pi, c); stringTableAdd(table, c, k, NULL); table = checkPrune(table, pi, window, &c, &nbits, outfile); checkNbits(&nbits, table, outfile); tableElt* kCode = stringTableHashSearch(table, EMPTY_PREFIX, k); if(kCode) { c = kCode->code; } else { escapeChar(table, pi, k, &nbits, outfile); c = EMPTY_PREFIX; // since we escaped k, we now have no prefix checkPrune(table, pi, window, &c, &nbits, outfile); } } } if(c != EMPTY_PREFIX) putBits(nbits, c, outfile); putBits(nbits, STOP_CODE, outfile); flushBits(outfile); stringTableDelete(table); pruneInfoDelete(pi); }
// encodes the input stream by taking advantage of lzw algorithm // also implements logic to prune the trie structure used to store the string // table, and escapes single character codes void encode(int e, int m, int p) { Trie st; createT(&st, e); int c = EMPTY; // same as (EMPTY, K), index of the prefix // int value of char k we are testing to see if c,k exists in the table int k; // number of codes you have inserted, 256 without escape flag... int codeCount = (e) ? 3 : 259; int bitCount = (e) ? 2 : 9; int maxbits = (m<=8 || m>20) ? 12 : m; int maxcodes = (1 << maxbits); int firstRead = false; // if first read of k when e flag is present int pruneCount = 0; printf("%02d:%d:%d:", maxbits, p, e); while((k = getchar())!= EOF) { st[c].appearances++; int ck = searchT(&st, c, k, e); // will increment c's appearance once // if ck is not in the table if(ck<0) { // if prune flag & reached maxcodes, do a prune before next insert // into the table, putBits 0 to indicate a prune has occurred // a prune should likewise happen in decode if(c!=EMPTY) { putBits(bitCount, c); } // add ck to the table as long as (e && c == EMPTY) is false // we will add (empty, k) to the table after this condition // !e and c==EMPTY will never happen, bc all chars will have been // added as children to empty // prune right before we reach maxcodes, we would have lost the next // code we insert anyways, now we won't lose k if(p&&(codeCount+1==maxcodes)) { putBits(bitCount, 0); pruneCount++; Trie newst; createT(&newst, e); int oldCodeCount = codeCount; codeCount=pruneT(&st, &newst, e, oldCodeCount); destroyT(&st, oldCodeCount); st = newst; bitCount=codeLength(codeCount); c=EMPTY; ungetc(k, stdin); continue; } // if(!e || c!=EMPTY) { if(codeCount<maxcodes) { if(tableFilled(codeCount+1)) { int newSize = (codeCount+1)*2; expandT(&st, newSize); bitCount++; } addT(&st, c, k, codeCount); codeCount++; } } // if escape flag is on and k is not yet added to the table if(e && searchT(&st, EMPTY, k, e) < 0) { putBits(bitCount, ESC); // 1 is the index of escape character putBits(8, k); if(codeCount<maxcodes) { if(codeLength(codeCount+1)-codeLength(codeCount)) { int newSize = (codeCount+1)*2; expandT(&st, newSize); bitCount++; } addT(&st, EMPTY, k, codeCount); codeCount++; } firstRead=true; // encode escaped something, don't unget(k) // if this happens } c = EMPTY; // make c empty again if(!firstRead) { ungetc(k, stdin); // put k back to start reading } // a new character else { firstRead = false; } } else { c=ck; // set c to index of next code } } if(c!=EMPTY) { putBits(bitCount, c); } putBits(bitCount, EOFILE); // puts EOF flushBits(); destroyT(&st, codeCount); }