/// Program entry point.
///
/// Expects exactly two user arguments: an input file name and an output name.
/// With any other argument count the program does nothing and exits with 0.
///
/// The type reported by get_type() for the input file is used unless it is
/// empty or find_type() rejects it; in that case the type "original" is used.
/// The resulting type is passed through type_location() to read_color(), and
/// print_highlight() is then invoked with the input and output names.
int main (int argc, char** argv)
{
    // Guard clause: anything but "<program> <input> <output>" is ignored.
    if (argc != 3) {
        return 0;
    }

    std::string input_path = argv [1];
    std::string output_path = argv [2];

    // find_type() is only consulted for a non-empty type, as before.
    std::string kind = get_type (input_path);
    const bool usable_kind = !kind.empty () && find_type (kind);
    if (!usable_kind) {
        kind = "original";
    }

    read_color (type_location (kind));
    print_highlight (input_path, output_path);
    return 0;
}
void test_compress(const char *data, size_t length) { size_t skip_table_size = length; if (skip_table_size > 0x10000) { skip_table_size = 0x10000; } uint16_t *skip_table = (uint16_t*)malloc(skip_table_size * sizeof(uint16_t)); memset(skip_table, 0xFF, skip_table_size * sizeof(uint16_t)); uint16_t *hash_table = (uint16_t*)malloc(HASH_TABLE_SIZE * sizeof(uint16_t)); memset(hash_table, 0xFF, HASH_TABLE_SIZE * sizeof(uint16_t)); int scans = 0; int skips = 0; int hashes = 0; unsigned last_match_pos = 0; unsigned last_match_len = 0; unsigned last_end_hash = 0; const char *base = data; size_t base_length = length; unsigned int min_forget_dist = 16000; unsigned forget_length = 0x10000 - min_forget_dist; for (;;) { unsigned block_length = forget_length; if (base_length < block_length) { block_length = (unsigned)base_length; } for (unsigned pos = 0; pos < block_length; pos++) { if (base_length - pos < 3) { // No full trigraph left, not worth finding a match. continue; } const char *cur_str = &base[pos]; unsigned max_match_len = (unsigned)(base_length - pos); // Get the linked list of the previous occourences of the trigraph at the // current position. unsigned hash = hash3(&base[pos]); uint16_t prev = hash_table[hash]; hashes++; hash_table[hash] = (uint16_t)pos; #if DO_LOG putchar('\n'); print_highlight(base, base_length, pos, 1, 30, printf("|> ")); #endif int t = 0; if (prev == 0xFFFF) { #if DO_LOG printf("No previous match for [%.3s]\n", &base[pos]); #endif // The trigraph hasn't been seen yet, so there can be no match. // Initialize linked list with end node and continue scanning. skip_table[pos] = 0xFFFF; last_match_len = 0; continue; } // Link to the chain. 
skip_table[pos] = (uint16_t)(pos - prev); unsigned end_dist = 0; unsigned end_hash = 0; unsigned end_pos = 0; int dbg_hash_pos; unsigned target_len = 3; unsigned best_pos = 0; unsigned best_len = 0; uint16_t check_pos = prev; if (last_match_len > 3 && base_length - pos > 3) { int len = last_match_len - 1; end_hash = last_end_hash; if (end_hash == HASH_TABLE_SIZE) { last_match_pos++; last_match_len--; #if DO_LOG printf("Last match proven optimal\n"); #endif continue; } end_pos = hash_table[end_hash]; if (end_pos == 0xFFFF) { last_match_pos++; last_match_len--; #if DO_LOG printf("Last match proven optimal\n"); #endif continue; } end_dist = len - 2; target_len = len + 1; best_pos = last_match_pos + 1; best_len = len; #if DO_LOG printf("Searching with [%.3s]..%d..[%.3s]\n", &base[pos], end_dist, &base[pos + last_match_len - 3]); #endif // Synchronize the begin and end trigraphs so that they are separated // by the searched for amount of bytes. for (;;) { int diff = end_pos - check_pos - end_dist; if (diff == 0) break; else if (diff > 0) { uint16_t skip = skip_table[end_pos]; skips++; if (skip > end_pos) break; end_pos -= skip; } else { uint16_t skip = skip_table[check_pos]; skips++; if (skip > check_pos) break; check_pos -= skip; } } } else { #if DO_LOG printf("Searching with [%.3s]\n", &base[pos]); #endif } for (;;) { // Calculate the maximum bound for the match. unsigned check_len = (unsigned)(pos - check_pos); unsigned max_check_len = check_len < max_match_len ? check_len : max_match_len; scans++; // Match as far as possible. 
const char *check_str = &base[check_pos]; unsigned len; for (len = 0; len < max_check_len; len++) { if (cur_str[len] != check_str[len]) break; } if (len >= target_len) { #if DO_LOG print_highlight(base, base_length, pos, len, 10, printf("%d: ", t)); print_highlight(base, base_length, check_pos, len, 10, printf("%d: ", t)); #endif t++; best_len = len; best_pos = check_pos; if (pos + len >= base_length - 1) { // We have found the longest possible match, nothing to search for. end_hash = HASH_TABLE_SIZE; break; } // We found a new longest match, update end trigraph accordingly. end_hash = hash3(&base[pos + len - 2]); end_pos = hash_table[end_hash]; hashes++; end_dist = len - 2; target_len = len + 1; if (end_pos == 0xFFFF) { // The buffer does not contain any occourences of the end trigraph, // so we have found the longest match there is. break; } dbg_hash_pos = (int)(pos + len - 2); #if DO_LOG printf("Searching with [%.3s]..%d..[%.3s]\n", &base[pos], end_dist, &base[dbg_hash_pos]); #endif } else { // In this case we hit a false match candidate. Carry on. } { // Move to the next match candidate. uint16_t skip = skip_table[check_pos]; skips++; if (skip > check_pos) break; check_pos -= skip; } if (best_len > 0) { // Synchronize the begin and end trigraphs so that they are separated // by the searched for amount of bytes. for (;;) { int diff = end_pos - check_pos - end_dist; if (diff == 0) break; else if (diff > 0) { uint16_t skip = skip_table[end_pos]; skips++; if (skip > end_pos) break; end_pos -= skip; } else { uint16_t skip = skip_table[check_pos]; skips++; if (skip > check_pos) break; check_pos -= skip; } } // Could not find a potential match anymore. 
if ((unsigned)(end_pos - check_pos) != end_dist) break; } } last_match_pos = best_pos; last_match_len = best_len; last_end_hash = end_hash; } base_length -= block_length; if (base_length == 0) break; // Need to rebase for (unsigned i = 0; i < HASH_TABLE_SIZE; i++) { if (hash_table[i] < block_length) { // Fell off the range of the compressor, "forget" the pointer. hash_table[i] = 0xFFFF; } else { // Still valid, adjust base. hash_table[i] -= (uint16_t)block_length; } } base += block_length; } free(skip_table); free(hash_table); printf("Hashes: %d\n", hashes); printf("Skips: %d\n", skips); printf("Scans: %d\n", scans); }