// compression void compress(FILE *inputFile, FILE *outputFile) { int prefix = getc(inputFile); if (prefix == EOF) { return; } int character; int nextCode; int index; // LZW starts out with a dictionary of 256 characters (in the case of 8 codeLength) and uses those as the "standard" // character set. nextCode = 256; // next code is the next available string code dictionaryInit(); // while (there is still data to be read) while ((character = getc(inputFile)) != (unsigned)EOF) { // ch = read a character; // if (dictionary contains prefix+character) if ((index = dictionaryLookup(prefix, character)) != -1) prefix = index; // prefix = prefix+character else { // ...no, try to add it // encode s to output file writeBinary(outputFile, prefix); // add prefix+character to dictionary if (nextCode < dictionarySize) dictionaryAdd(prefix, character, nextCode++); // prefix = character prefix = character; //... output the last string after adding the new one } } // encode s to output file writeBinary(outputFile, prefix); // output the last code if (leftover > 0) fputc(leftoverBits << 4, outputFile); // free the dictionary here dictionaryDestroy(); }
/**
 * Load records from a binary dictionary file and add them to dict.
 *
 * Layout as read here: one int holding the record count, then for each
 * record one int holding the byte length followed by that many bytes of
 * string data (no terminator stored). Integers are read in the host's own
 * int representation.
 *
 * @param dict     dictionary that receives the records
 * @param filename path of the dictionary file
 * @return 0 on success; DIC_ERROR if the file cannot be opened or is
 *         malformed; MEMORY_ERROR on allocation failure; otherwise the
 *         negative value returned by dictionaryAdd() for a rejected record
 */
int dictionaryLoadFromFile(Dictionary* dict, char* filename)
{
    FILE* inputFile = NULL;
    int count, len, index, retn = 0;
    char* s = NULL;
    void* grown;

    /* Validate before fopen(): a NULL filename must never reach fopen().
     * NOTE(review): checkPointer is defined elsewhere and presumably returns
     * early on NULL -- opening the file only afterwards avoids leaking the
     * stream in that case. */
    checkPointer(dict);
    checkPointer(filename);

    inputFile = fopen(filename, "rb");
    if (!inputFile) {
        fprintf(stderr, "Error[DIC]: could not open file \"%s\".\n", filename);
        return DIC_ERROR;
    }

    if (fseek(inputFile, 0, SEEK_SET)) {
        fprintf(stderr, "Error[DIC]: could not seek file \"%s\".\n", filename);
        retn = DIC_ERROR;
        goto ldexit;
    }

    /* A negative count can only come from a corrupt file. */
    if (sizeof(int) != fread(&count, 1, sizeof(int), inputFile) || count < 0) {
        fprintf(stderr, "Error[DIC]: broken dictionary file \"%s\".\n", filename);
        retn = DIC_ERROR;
        goto ldexit;
    }

    if (count > dict->maxSize) {
        /* Grow through a temporary so the old buffers stay owned by dict if
         * realloc fails; maxSize is raised only once both arrays are grown. */
        grown = realloc(dict->strings, (size_t)count * sizeof(Word*));
        if (!grown) {
            fprintf(stderr, "Error[DIC]: not enough memory.\n");
            retn = MEMORY_ERROR;
            goto ldexit;
        }
        dict->strings = grown;

        grown = realloc(dict->index, (size_t)count * sizeof(char*));
        if (!grown) {
            fprintf(stderr, "Error[DIC]: not enough memory.\n");
            retn = MEMORY_ERROR;
            goto ldexit;
        }
        dict->index = grown;

        dict->maxSize = count;
    }

    for (index = 0; index < count; index++) {
        /* Reject a short read and a negative length alike: a negative len
         * would otherwise corrupt the allocation size and the s[len] store. */
        if (sizeof(int) != fread(&len, 1, sizeof(int), inputFile) || len < 0) {
            fprintf(stderr, "Error[DIC]: broken dictionary file \"%s\".\n", filename);
            retn = DIC_ERROR;
            goto ldexit;
        }

        s = malloc((size_t)len + 1);
        if (!s) {
            fprintf(stderr, "Error[DIC]: not enough memory.\n");
            retn = MEMORY_ERROR;
            goto ldexit;
        }

        if ((size_t)len != fread(s, 1, (size_t)len, inputFile)) {
            fprintf(stderr, "Error[DIC]: broken dictionary file \"%s\".\n", filename);
            retn = DIC_ERROR;
            goto ldexit;
        }
        s[len] = 0;

        retn = dictionaryAdd(dict, s);
        if (retn < 0) {
            fprintf(stderr, "Error[DIC]: could not add a record\n");
            goto ldexit;
        }
        retn = 0;

        /* The buffer stays with the caller after dictionaryAdd(). */
        free(s);
        s = NULL;
    }

    /* Anything still readable after the last record is unexpected. */
    if (fread(&index, 1, 1, inputFile) != 0) {
        fprintf(stderr, "Warning[DIC]: some extra data at the end of the dictionary file \"%s\".\n", filename);
    }

ldexit:
    /* s is non-NULL only when an error path left a record buffer behind. */
    free(s);
    if (fclose(inputFile) == EOF) {
        fprintf(stderr, "Warning[DIC]: closing dictionary file \"%s\" failed.\n", filename);
    }
    return retn;
}
/** * indexer main() * * @param argc Command line argument count * @param argv command line parameters * @return 0 if ok, 1 on failure * @todo file date comparison */ int main(int argc, char** argv) { FILE* curFile; int curFileNumber; int filesIndexed = 0; Dictionary files, words; PairDictionary pairs; char* curWord = malloc(MAX_WORD_LENGTH + 1); State state = SPACE; int curWordIndex = 0, added = 0; int ch, da; int retn = 0; if ((argc == 1) || ((argc == 2) && (strcmp(argv[1], "--help") == 0)) || ((argc == 2) && (strcmp(argv[1], "-?") == 0))) { printf("Creates a database for the 'search' program.\n\n"); printf("INDEXER [file1] [file2] [file3] ...\n"); printf("INDEXER { [--help] | [-?] } \n\n"); printf("\t--help, -? Show help (default)\n"); return EXIT_SUCCESS; } if (dictionaryInit(&files)) return BASE_ERROR; if (dictionaryInit(&words)) return BASE_ERROR; if (pairDictionaryInit(&pairs)) return BASE_ERROR; if ((file_exists("files.db")) && !dictionaryLoadFromFile(&files, "files.db")) { if (!dictionaryLoadFromFile(&words, "words.db")) { if (!pairDictionaryLoadFromFile(&pairs, "pairs.db")) { //printf("Current search base: %i files, %i words, %i pairs.\n", files.count, words.count, pairs.count); } else { fprintf(stderr, "Warning: corrupt pairs table.\n"); pairDictionaryClear(&pairs); } } else { fprintf(stderr, "Warning: corrupt words database.\n"); dictionaryClear(&words); } } else { printf("Creating new search base.\n"); dictionaryClear(&files); } for (curFileNumber = 1; curFileNumber < argc; curFileNumber++) { curWord[MAX_WORD_LENGTH] = 0; if ((curFile = fopen(argv[curFileNumber], "r"))) { int curFileId; if ((curFileId = dictionaryGetByString(&files, argv[curFileNumber])) != DIC_NO_RECORD) { pairDictionaryRemoveByFileId(&pairs, curFileId); } else curFileId = dictionaryAdd(&files, argv[curFileNumber]); if (curFileId < 0) { fprintf(stderr, "Error: could not add to the dictionary.\n"); retn = INDEXER_ERROR; goto erexit; } curWordIndex = 0; state = SPACE; while((ch = 
fgetc(curFile)) != EOF) { if (!isspace(ch)) { if (state != SKIP) { state = WORD; curWord[curWordIndex++] = ch; if (curWordIndex == MAX_WORD_LENGTH) { state = SKIP; added = 0; } } } else { if ((state == WORD) || ((state == SKIP) && !added)) { if (curWordIndex < MAX_WORD_LENGTH) curWord[curWordIndex] = 0; else curWord[MAX_WORD_LENGTH] = 0; da = dictionaryAdd(&words, curWord); if (da < 0) { fprintf(stderr, "Error: could not add to the dictionary.\n"); retn = INDEXER_ERROR; goto erexit; } if (pairDictionaryAdd(&pairs, curFileId, da) < 0) { fprintf(stderr, "Error: could not add to the pair dictionary.\n"); retn = INDEXER_ERROR; goto erexit; } curWordIndex = 0; } state = SPACE; } } if (state == WORD) { curWord[curWordIndex] = 0; da = dictionaryAdd(&words, curWord); if (da < 0) { fprintf(stderr, "Error: could not add to the dictionary.\n"); retn = INDEXER_ERROR; goto erexit; } pairDictionaryAdd(&pairs, curFileId, da); curWordIndex = 0; } filesIndexed++; if (curFile) fclose(curFile); } else { printf("Warning: file '%s' does not exist.\n", argv[curFileNumber]); } } dictionarySaveToFile(&files, "files.db"); dictionarySaveToFile(&words, "words.db"); pairDictionarySaveToFile(&pairs, "pairs.db"); erexit: if (curWord) free(curWord); dictionaryFinalize(&files); dictionaryFinalize(&words); pairDictionaryFinalize(&pairs); printf("Current search base: %i files, %i words, %i pairs. %i files indexed.\n", files.count, words.count, pairs.count, filesIndexed); return retn; }