/**
 * Closes a bit input / output file buffer.
 *
 * A buffer whose mode is 'w' (output) is flushed with BFflush() before
 * the underlying stream is closed.
 *
 * @param stream  The file buffer to close.
 * @return        1 if fclose() on the underlying stream succeeded, 0 otherwise
 *                (the result of the preceding flush is not taken into account).
 */
int
BFclose(BFile *stream)
{
  int closed_ok;

  if ('w' == stream->mode) {
    BFflush(stream);
  }

  closed_ok = (0 == fclose(stream->fd));
  return closed_ok ? 1 : 0;
}
/**
 * Checks a huffcoded attribute for errors by decompressing it.
 *
 * This function assumes that compute_code_lengths() has been called
 * beforehand and made sure that the _uncompressed_ token sequence is
 * used by CL access functions.
 *
 * Every decoded item is compared with the id returned by cl_cpos2id() for the
 * same corpus position; at every position that is a multiple of SYNCHRONIZATION
 * the current offset of the bit file is compared with the next value read from
 * the sync file.
 *
 * Every failure is reported with Rprintf() and then signalled with
 * rcqp_receive_error(1). Once both input files are open, they are closed
 * before the error is signalled. The "you can delete the file" hint is only
 * printed when the whole token stream has been validated.
 *
 * NOTE(review): rcqp_receive_error() is presumably non-returning; the explicit
 * returns merely keep this function safe should it return after all.
 *
 * @param attr   The attribute to check.
 * @param fname  Base filename to use for the three compressed-attribute files.
 *               Can be NULL, in which case the filenames in the attribute are used.
 */
void
decode_check_huff(Attribute *attr, char *fname)
{
  BFile bfd;
  FILE *sync;
  HCD hc;

  int pos, size, sync_offset, offset;
  int l, v;
  int item, true_item;
  int n_hcd, n_huf, n_sync;   /* snprintf() results, used to detect truncated paths */
  int failed = 0;             /* set when validation fails after both files were opened */
  unsigned char bit;

  char hcd_path[CL_MAX_LINE_LENGTH];
  char huf_path[CL_MAX_LINE_LENGTH];
  char sync_path[CL_MAX_LINE_LENGTH];

  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  if (fname) {
    n_hcd  = snprintf(hcd_path,  sizeof hcd_path,  "%s.hcd", fname);
    n_huf  = snprintf(huf_path,  sizeof huf_path,  "%s.huf", fname);
    n_sync = snprintf(sync_path, sizeof sync_path, "%s.huf.syn", fname);
  }
  else {
    char *path;

    /* each name is copied right after it has been obtained, as in the original code */
    path = component_full_name(attr, CompHuffSeq, NULL);
    assert(path && (cderrno == CDA_OK));
    n_huf = snprintf(huf_path, sizeof huf_path, "%s", path);

    path = component_full_name(attr, CompHuffCodes, NULL);
    assert(path && (cderrno == CDA_OK));
    n_hcd = snprintf(hcd_path, sizeof hcd_path, "%s", path);

    path = component_full_name(attr, CompHuffSync, NULL);
    assert(path && (cderrno == CDA_OK));
    n_sync = snprintf(sync_path, sizeof sync_path, "%s", path);
  }

  /* refuse to work with a truncated file name instead of overflowing the buffers */
  if (n_hcd < 0 || (size_t)n_hcd >= sizeof hcd_path ||
      n_huf < 0 || (size_t)n_huf >= sizeof huf_path ||
      n_sync < 0 || (size_t)n_sync >= sizeof sync_path) {
    Rprintf("ERROR: file name too long (limit is %d characters). Aborted.\n",
            (int)(sizeof hcd_path) - 1);
    rcqp_receive_error(1);
    return;
  }

  Rprintf("- reading code descriptor block from %s\n", hcd_path);
  if (!ReadHCD(hcd_path, &hc)) {
    Rprintf("ERROR: reading %s failed. Aborted.\n", hcd_path);
    rcqp_receive_error(1);
    return;
  }

  Rprintf("- reading compressed item sequence from %s\n", huf_path);
  if (!BFopen(huf_path, "r", &bfd)) {
    Rprintf("ERROR: can't open file %s. Aborted.\n", huf_path);
    perror(huf_path);
    rcqp_receive_error(1);
    return;
  }

  Rprintf("- reading sync (mod %d) from %s\n", SYNCHRONIZATION, sync_path);
  if ((sync = fopen(sync_path, "r")) == NULL) {
    Rprintf("ERROR: can't open file %s. Aborted.\n", sync_path);
    perror(sync_path);
    BFclose(&bfd);              /* the bit file is already open at this point */
    rcqp_receive_error(1);
    return;
  }

  size = cl_max_cpos(attr);
  if (size != hc.length) {
    Rprintf("ERROR: wrong corpus size (%d tokens) in %s (correct size: %d)\n",
            hc.length, hcd_path, size);
    failed = 1;
    goto finish;
  }

  for (pos = 0; pos < hc.length; pos++) {

    if ((pos % SYNCHRONIZATION) == 0) {
      /* need to get offset before flushing (because flushing fills the
         bit buffer and advances offset to the following byte!) */
      offset = BFposition(&bfd);
      if (pos > 0)
        BFflush(&bfd);
      sync_offset = -1;         /* make sure we get an error if read below fails */
      NreadInt(&sync_offset, sync);
      if (offset != sync_offset) {
        Rprintf("ERROR: wrong sync offset %d (true offset %d) at cpos %d. Aborted.\n",
                sync_offset, offset, pos);
        failed = 1;
        goto finish;
      }
    }

    if (!BFread(&bit, 1, &bfd)) {
      Rprintf("ERROR reading file %s. Aborted.\n", huf_path);
      failed = 1;
      goto finish;
    }

    v = (bit ? 1 : 0);
    l = 1;

    /* keep appending bits until the value is a valid code of length l */
    while (v < hc.min_code[l]) {
      if (!BFread(&bit, 1, &bfd)) {
        /* used to be a bare return, which left both files open and did not
           signal the error to the caller */
        Rprintf("ERROR reading file %s. Aborted.\n", huf_path);
        failed = 1;
        goto finish;
      }
      v <<= 1;
      if (bit)
        v++;
      l++;
    }

    item = hc.symbols[hc.symindex[l] + v - hc.min_code[l]];
    true_item = cl_cpos2id(attr, pos);

    if (item != true_item) {
      /* the message announces an abort, so actually stop here: otherwise the
         user would be told below that the uncompressed data can be deleted */
      Rprintf("ERROR: wrong token (id=%d) at cpos %d (correct id=%d). Aborted.\n",
              item, pos, true_item);
      failed = 1;
      goto finish;
    }
  }

finish:
  fclose(sync);
  BFclose(&bfd);

  if (failed) {
    rcqp_receive_error(1);
    return;
  }

  /* tell the user it's safe to delete the CORPUS component now */
  Rprintf("!! You can delete the file <%s> now.\n",
          component_full_name(attr, CompCorpus, NULL));
}
/**
 * Compresses the token stream of a p-attribute.
 *
 * Three files are created: the compressed token stream, the descriptor block,
 * and a sync file.
 *
 * The function computes Huffman code lengths from the item frequencies,
 * derives the codes (min_code / symindex / symbols tables in the HCD), writes
 * the HCD with WriteHCD() and then encodes every corpus position. At every
 * position that is a multiple of SYNCHRONIZATION the bit file is flushed (except
 * at position 0) and its current offset is written to the sync file.
 *
 * Failures are reported with Rprintf() / cdperror() followed by
 * rcqp_receive_error(1).
 * NOTE(review): rcqp_receive_error() is presumably non-returning - confirm.
 *
 * @param attr   The attribute to compress.
 * @param hc     Location for the resulting Huffmann code descriptor block.
 *               The symbol table lives in a work buffer that is released before
 *               this function returns, so hc->symbols is NULL on return.
 * @param fname  Base filename for the resulting files. If NULL, the file names
 *               are taken from the attribute's components.
 * @return       Always 1.
 */
int
compute_code_lengths(Attribute *attr, HCD *hc, char *fname)
{
  int id, i, h;

  int *heap = NULL;
  unsigned *codelength = NULL;  /* was char[], probably to save space; but that's
                                   unnecessary and makes gcc complain */

  int issued_codes[MAXCODELEN];
  int next_code[MAXCODELEN];

  long sum_bits;

  Rprintf("COMPRESSING TOKEN STREAM of %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* I need the following components:
   * - CompCorpus
   * - CompCorpusFreqs
   * - CompLexicon
   * - CompLexiconIdx
   * and want to force the CL to use them rather than compressed data.
   */
  if (ensure_component(attr, CompCorpus, 0) == NULL) {
    Rprintf("Computation of huffman codes needs the CORPUS component\n");
    rcqp_receive_error(1);
  }

  if (ensure_component(attr, CompLexicon, 0) == NULL) {
    Rprintf("Computation of huffman codes needs the LEXICON component\n");
    rcqp_receive_error(1);
  }

  if (ensure_component(attr, CompLexiconIdx, 0) == NULL) {
    Rprintf("Computation of huffman codes needs the LEXIDX component\n");
    rcqp_receive_error(1);
  }

  if (ensure_component(attr, CompCorpusFreqs, 0) == NULL) {
    Rprintf("Computation of huffman codes needs the FREQS component.\n"
            "Run 'makeall -r %s -c FREQS %s %s' in order to create it.\n",
            corpus->registry_dir, corpus->registry_name, attr->any.name);
    rcqp_receive_error(1);
  }

  /*
   * strongly follows Witten/Moffat/Bell: ``Managing Gigabytes'',
   * pp. 335ff.
   */

  hc->size = cl_max_id(attr);       /* the size of the attribute (nr of items) */
  if ((hc->size <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_id() failed");
    rcqp_receive_error(1);
  }

  hc->length = cl_max_cpos(attr);   /* the length of the attribute (nr of tokens) */
  if ((hc->length <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_cpos() failed");
    rcqp_receive_error(1);
  }

  hc->symbols = NULL;

  hc->min_codelen = 100;
  hc->max_codelen = 0;

  memset((char *)hc->lcount, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->min_code, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->symindex, '\0', MAXCODELEN * sizeof(int));

  memset((char *)issued_codes, '\0', MAXCODELEN * sizeof(int));

  codelength = (unsigned *)cl_calloc(hc->size, sizeof(unsigned));

  /* =========================================== make & initialize the heap */

  heap = (int *)cl_malloc(hc->size * 2 * sizeof(int));

  for (i = 0; i < hc->size; i++) {
    heap[i] = hc->size + i;
    heap[hc->size+i] = get_id_frequency(attr, i) + 1;
    /* add-one trick needed to avoid unsupported Huffman codes > 31 bits for very large corpora of ca. 2 billion words:
       theoretical optimal code length for hapax legomena in such corpora is ca. 31 bits, and the Huffman algorithm
       sometimes generates 32-bit codes; with add-one trick, the theoretical optimal code length is always <= 30 bits */
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0)
    fprintf(protocol, "Allocated heap with %d cells for %d items\n\n", hc->size * 2, hc->size);
  if (do_protocol > 2)
    print_heap(heap, hc->size, "After Initialization");
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 1 */

  h = hc->size;

  /*
   * we address the heap in the following manner: when we start array
   * indices at 1, the left child is at 2i, and the right child is at
   * 2i+1. So we maintain this scheme and decrement just before
   * adressing the array.
   */

  /*
   * construct the initial min-heap
   */

  for (i = hc->size/2; i > 0; i--) {
    /* do:
     * bottom up, left to right,
     * for each root of each subtree, sift if necessary
     */
    sift(heap, h, i);
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 2) {
    print_heap(heap, hc->size, "Initial Min-Heap");
    fprintf(protocol, "\n");
  }
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 2 */

  /* smallest item at top of heap now, remove the two smallest items
   * and sift, find second smallest by removing top and sifting, as
   * long as we have more than one root */

  while (h > 1) {
    int pos[2];

    for (i = 0; i < 2; i++) {
      /* remove topmost (i.e. smallest) item */
      pos[i] = heap[0];

      /* remove and sift, to reobtain heap integrity: move ``last''
       * item to top of heap and sift */
      heap[0] = heap[--h];
      sift(heap, h, 1);
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 3) {
      fprintf(protocol, "Removed smallest item %d with freq %d\n", pos[0], heap[pos[0]]);
      fprintf(protocol, "Removed 2nd smallest item %d with freq %d\n", pos[1], heap[pos[1]]);
    }
    /* ============================== PROTOCOL ============================== */

    /*
     * pos[0] and pos[1] contain pointers to the two smallest items
     * now. since h was decremented twice, h and h+1 are now empty and
     * become the accumulated freq of pos[i]. The individual
     * frequencies are not needed any more, so pointers to h+1 (the
     * acc freq) are stored there instead (tricky, since freq cell
     * becomes pointer cell). So, what happens here, is to include a
     * new element in the heap.
     */

    heap[h] = h+1;
    heap[h+1] = heap[pos[0]] + heap[pos[1]];  /* accumulated freq */
    heap[pos[0]] = heap[pos[1]] = h+1;        /* pointers! */
    h++;                                      /* we put a new element into heap */

    /*
     * now, swap it up until we reobtain heap integrity
     */
    {
      register int parent, current;

      current = h;
      parent = current >> 1;

      while ((parent > 0) && (heap[heap[parent-1]] > heap[heap[current-1]])) {
        int tmp;

        tmp = heap[parent-1];
        heap[parent-1] = heap[current-1];
        heap[current-1] = tmp;

        current = parent;
        parent = current >> 1;
      }
    }
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 3)
    fprintf(protocol, "\n");
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 3 */

  /* compute the code lengths. We don't have any freqs in heap any
   * more, only pointers to parents */

  heap[0] = -1U;

  /* root has a depth of 0 */
  heap[1] = 0;

  /* we trust in what they say on p. 345 */
  for (i = 2; i < hc->size * 2; i++)
    heap[i] = heap[heap[i]]+1;

  /* collect the lengths */

  sum_bits = 0L;

  for (i = 0; i < hc->size; i++) {
    int cl = heap[i+hc->size];

    /* widen before multiplying: length * frequency can exceed the int range */
    sum_bits += (long)cl * get_id_frequency(attr, i);

    codelength[i] = cl;
    if (cl == 0)
      continue;

    if (cl > hc->max_codelen)
      hc->max_codelen = cl;

    if (cl < hc->min_codelen)
      hc->min_codelen = cl;

    /* lcount has room for MAXCODELEN counters only; an over-long code is
       rejected below (max_codelen check), so it must not be counted here */
    if (cl < MAXCODELEN)
      hc->lcount[cl]++;
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0) {
    fprintf(protocol, "Minimal code length: %3d\n", hc->min_codelen);
    fprintf(protocol, "Maximal code length: %3d\n", hc->max_codelen);
    fprintf(protocol, "Compressed code len: %10ld bits, %10ld (+1) bytes\n\n\n",
            sum_bits, sum_bits/8);
  }
  /* ============================== PROTOCOL ============================== */

  if (hc->max_codelen >= MAXCODELEN) {
    Rprintf("Error: Huffman codes too long (%d bits, current maximum is %d bits).\n",
            hc->max_codelen, MAXCODELEN-1);
    Rprintf(" Please contact the CWB development team for assistance.\n");
    rcqp_receive_error(1);
  }

  if ((hc->max_codelen == 0) && (hc->min_codelen == 100)) {
    /* no item received a non-zero code length */
    Rprintf("Problem: No output generated -- no items?\n");
  }
  else {

    hc->min_code[hc->max_codelen] = 0;

    for (i = hc->max_codelen-1; i > 0; i--)
      hc->min_code[i] = (hc->min_code[i+1] + hc->lcount[i+1]) >> 1;

    hc->symindex[hc->min_codelen] = 0;
    for (i = hc->min_codelen+1; i <= hc->max_codelen; i++)
      hc->symindex[i] = hc->symindex[i-1] + hc->lcount[i-1];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 0) {
      int sum_codes = 0;

      fprintf(protocol, " CL #codes MinCode SymIdx\n");
      fprintf(protocol, "----------------------------------------\n");

      for (i = hc->min_codelen; i <= hc->max_codelen; i++) {
        sum_codes += hc->lcount[i];
        fprintf(protocol, "%3d %7d %7d %7d\n",
                i, hc->lcount[i], hc->min_code[i], hc->symindex[i]);
      }

      fprintf(protocol, "----------------------------------------\n");
      fprintf(protocol, " %7d\n", sum_codes);
    }
    /* ============================== PROTOCOL ============================== */

    for (i = 0; i < MAXCODELEN; i++)
      next_code[i] = hc->min_code[i];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "\n");
      fprintf(protocol, " Item f(item) CL Bits Code, String\n");
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* compute and issue codes */

    hc->symbols = heap + hc->size;

    for (i = 0; i < hc->size; i++) {

      /* we store the code for item i in heap[i] */
      heap[i] = next_code[codelength[i]];
      next_code[codelength[i]]++;

      /* ============================== PROTOCOL ============================== */
      if (do_protocol > 1) {
        fprintf(protocol, "%7d %7d %3d %10d ",
                i,
                get_id_frequency(attr, i),
                codelength[i],
                codelength[i] * get_id_frequency(attr, i));

        bprintf(heap[i], codelength[i], protocol);

        fprintf(protocol, " %7d %s\n",
                heap[i], get_string_of_id(attr, i));
      }
      /* ============================== PROTOCOL ============================== */

      /* and put the item itself in the second half of the table */
      heap[hc->size+hc->symindex[codelength[i]]+issued_codes[codelength[i]]] = i;
      issued_codes[codelength[i]]++;
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* The work itself -- encode the attribute data */

    {
      char hcd_path[CL_MAX_LINE_LENGTH];
      char huf_path[CL_MAX_LINE_LENGTH];
      char sync_path[CL_MAX_LINE_LENGTH];

      Component *corp;

      BFile bfd;
      FILE *sync;

      int cl, code, pos;
      int n_hcd, n_huf, n_sync;     /* snprintf() results, used to detect truncated paths */
      int sync_closed, huf_closed;

      corp = ensure_component(attr, CompCorpus, 0);
      assert(corp);

      if (fname) {
        n_hcd  = snprintf(hcd_path,  sizeof hcd_path,  "%s.hcd", fname);
        n_huf  = snprintf(huf_path,  sizeof huf_path,  "%s.huf", fname);
        n_sync = snprintf(sync_path, sizeof sync_path, "%s.huf.syn", fname);
      }
      else {
        char *path;

        /* no (cderrno == CDA_OK) condition in the asserts below, since
           component_full_name doesn't (re)set cderrno */
        path = component_full_name(attr, CompHuffSeq, NULL);
        assert(path);
        n_huf = snprintf(huf_path, sizeof huf_path, "%s", path);

        path = component_full_name(attr, CompHuffCodes, NULL);
        assert(path);
        n_hcd = snprintf(hcd_path, sizeof hcd_path, "%s", path);

        path = component_full_name(attr, CompHuffSync, NULL);
        assert(path);
        n_sync = snprintf(sync_path, sizeof sync_path, "%s", path);
      }

      /* refuse to work with a truncated file name instead of overflowing the buffers */
      if (n_hcd < 0 || (size_t)n_hcd >= sizeof hcd_path ||
          n_huf < 0 || (size_t)n_huf >= sizeof huf_path ||
          n_sync < 0 || (size_t)n_sync >= sizeof sync_path) {
        Rprintf("ERROR: file name too long (limit is %d characters). Aborted.\n",
                (int)(sizeof hcd_path) - 1);
        rcqp_receive_error(1);
      }

      Rprintf("- writing code descriptor block to %s\n", hcd_path);
      if (!WriteHCD(hcd_path, hc)) {
        Rprintf("ERROR: writing %s failed. Aborted.\n", hcd_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing compressed item sequence to %s\n", huf_path);

      if (!BFopen(huf_path, "w", &bfd)) {
        Rprintf("ERROR: can't create file %s\n", huf_path);
        perror(huf_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing sync (every %d tokens) to %s\n", SYNCHRONIZATION, sync_path);

      if ((sync = fopen(sync_path, "w")) == NULL) {
        Rprintf("ERROR: can't create file %s\n", sync_path);
        perror(sync_path);
        rcqp_receive_error(1);
      }

      for (i = 0; i < hc->length; i++) {

        /* SYNCHRONIZE */

        if ((i % SYNCHRONIZATION) == 0) {
          if (i > 0)
            BFflush(&bfd);
          pos = BFposition(&bfd);
          NwriteInt(pos, sync);
        }

        id = cl_cpos2id(attr, i);
        if ((id < 0) || (cderrno != CDA_OK)) {
          cdperror("(aborting) cl_cpos2id() failed");
          rcqp_receive_error(1);
        }
        else {

          assert((id >= 0) && (id < hc->size) && "Internal Error");

          cl = codelength[id];
          code = heap[id];

          if (!BFwriteWord((unsigned int)code, cl, &bfd)) {
            Rprintf("Error writing code for ID %d (%d, %d bits) at position %d. Aborted.\n",
                    id, code, cl, i);
            rcqp_receive_error(1);
          }
        }
      }

      /* close both files first, then report: a failed close of a written file
         means buffered data may not have reached the disk */
      sync_closed = (fclose(sync) == 0);
      huf_closed = BFclose(&bfd);
      if (!sync_closed || !huf_closed) {
        Rprintf("ERROR: can't write file %s. Aborted.\n",
                sync_closed ? huf_path : sync_path);
        rcqp_receive_error(1);
      }
    }
  }

  /* hc->symbols points into heap (if it was set at all): don't hand a
     dangling pointer back to the caller */
  hc->symbols = NULL;

  free(codelength);
  free(heap);

  return 1;
}
/**
 * Checks a compressed reversed index for errors by decompressing it.
 *
 * This function assumes that compress_reversed_index() has been called
 * beforehand and made sure that the _uncompressed_ index is used by CL
 * access functions.
 *
 * For every item id, the gaps decoded from the compressed data file are
 * summed up and each resulting position is compared with the next position
 * delivered by the item's position stream. The index (offsets) file is only
 * opened and closed again; nothing is read from it here.
 *
 * Failures are reported and then handed to compressrdx_cleanup(1).
 * NOTE(review): compressrdx_cleanup() is presumably non-returning - confirm.
 *
 * @param attr       The attribute to check the index of.
 * @param output_fn  Base name for the compressed RDX files to be read
 *                   (if this is NULL, filenames will be taken from the
 *                   attribute).
 */
void
decompress_check_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];

  int nr_elements;
  int element_freq;
  int corpus_size;
  int pos, gap;

  int b;
  int i, k;
  int n_data, n_index;    /* snprintf() results, used to detect truncated file names */

  BFile data_file;
  FILE *index_file;

  PositionStream PStream;
  int true_pos;

  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  if (output_fn) {
    n_data  = snprintf(data_fname,  sizeof data_fname,  "%s.crc", output_fn);
    n_index = snprintf(index_fname, sizeof index_fname, "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    n_data = snprintf(data_fname, sizeof data_fname, "%s", s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    n_index = snprintf(index_fname, sizeof index_fname, "%s", s);
  }

  /* refuse to work with a truncated file name instead of overflowing the buffers */
  if (n_data < 0 || (size_t)n_data >= sizeof data_fname ||
      n_index < 0 || (size_t)n_index >= sizeof index_fname) {
    Rprintf("ERROR: file name too long (limit is %d characters)\n",
            (int)(sizeof data_fname) - 1);
    compressrdx_cleanup(1);
  }

  if (!BFopen(data_fname, "r", &data_file)) {
    Rprintf("ERROR: can't open file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index from %s\n", data_fname);

  if ((index_file = fopen(index_fname, "r")) == NULL) {
    Rprintf("ERROR: can't open file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index offsets from %s\n", index_fname);

  for (i = 0; i < nr_elements; i++) {

    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    b = compute_ba(element_freq, corpus_size);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);

    /* positions are stored as gaps, so they are accumulated starting from 0 */
    pos = 0;
    for (k = 0; k < element_freq; k++) {

      gap = read_golomb_code_bf(b, &data_file);
      pos += gap;

      if (1 != cl_read_stream(PStream, &true_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }
      if (pos != true_pos) {
        Rprintf("ERROR: wrong occurrence of token #%d at cpos %d (correct cpos: %d). Aborted.\n",
                i, pos, true_pos);
        compressrdx_cleanup(1);
      }
    }

    cl_delete_stream(&PStream);
    /* flushed after every item, mirroring the BFflush() per item in compress_reversed_index() */
    BFflush(&data_file);
  }

  fclose(index_file);
  BFclose(&data_file);

  /* tell the user it's safe to delete the REVCORP and REVCIDX components now */
  Rprintf("!! You can delete the file <%s> now.\n",
          component_full_name(attr, CompRevCorpus, NULL));
  Rprintf("!! You can delete the file <%s> now.\n",
          component_full_name(attr, CompRevCorpusIdx, NULL));

  return;
}
/**
 * Compresses the reversed index of a p-attribute.
 *
 * For every item id, the current offset of the compressed data file is
 * written to the index file; then the item's corpus positions are read from
 * its position stream and written to the data file as Golomb-coded gaps
 * (the first gap is relative to position 0). The data file is flushed after
 * every item.
 *
 * Failures are reported and then handed to compressrdx_cleanup(1).
 * NOTE(review): compressrdx_cleanup() is presumably non-returning - confirm.
 *
 * @param attr       The attribute to compress the index of.
 * @param output_fn  Base name for the compressed RDX files to be written
 *                   (if this is NULL, filenames will be taken from the
 *                   attribute).
 */
void
compress_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];

  int nr_elements;
  int element_freq;
  int corpus_size;
  int last_pos, gap, fpos;

  int b;
  int i, k;
  int n_data, n_index;            /* snprintf() results, used to detect truncated file names */
  int index_closed, data_closed;

  BFile data_file;
  FILE *index_file = NULL;

  PositionStream PStream;
  int new_pos;

  Rprintf("COMPRESSING INDEX of %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  /* ensure that we do NOT use the compressed index while building the
   * compressed index (yeah, a nasty thing that). That is, load the
   * .corpus.rev and .corpus.rdx components in order to force
   * subsequent CL calls to use the uncompressed data.
   */
  if (ensure_component(attr, CompRevCorpus, 0) == NULL) {
    Rprintf("Index compression requires the REVCORP component\n");
    compressrdx_cleanup(1);
  }

  if (ensure_component(attr, CompRevCorpusIdx, 0) == NULL) {
    Rprintf("Index compression requires the REVCIDX component\n");
    compressrdx_cleanup(1);
  }

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  if (output_fn) {
    n_data  = snprintf(data_fname,  sizeof data_fname,  "%s.crc", output_fn);
    n_index = snprintf(index_fname, sizeof index_fname, "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    n_data = snprintf(data_fname, sizeof data_fname, "%s", s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    n_index = snprintf(index_fname, sizeof index_fname, "%s", s);
  }

  /* refuse to work with a truncated file name instead of overflowing the buffers */
  if (n_data < 0 || (size_t)n_data >= sizeof data_fname ||
      n_index < 0 || (size_t)n_index >= sizeof index_fname) {
    Rprintf("ERROR: file name too long (limit is %d characters)\n",
            (int)(sizeof data_fname) - 1);
    compressrdx_cleanup(1);
  }

  if (!BFopen(data_fname, "w", &data_file)) {
    Rprintf("ERROR: can't create file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index to %s\n", data_fname);

  if ((index_file = fopen(index_fname, "wb")) == NULL) {
    Rprintf("ERROR: can't create file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index offsets to %s\n", index_fname);

  for (i = 0; i < nr_elements; i++) {

    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    b = compute_ba(element_freq, corpus_size);

    /* record where this item's codes start in the data file */
    fpos = BFposition(&data_file);
    NwriteInt(fpos, index_file);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);

    last_pos = 0;
    for (k = 0; k < element_freq; k++) {
      if (1 != cl_read_stream(PStream, &new_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }

      gap = new_pos - last_pos;
      last_pos = new_pos;

      if (debug_cwb_compress_rdx)
        fprintf(debug_output, "%8d: gap=%4d, b=%4d\n", codepos, gap, b);

      write_golomb_code(gap, b, &data_file);
      codepos++;
    }

    cl_delete_stream(&PStream);
    BFflush(&data_file);
  }

  /* close both files first, then report: a failed close of a written file
     means buffered data may not have reached the disk */
  index_closed = (fclose(index_file) == 0);
  data_closed = BFclose(&data_file);
  if (!index_closed || !data_closed) {
    Rprintf("ERROR: can't write file %s\n", index_closed ? data_fname : index_fname);
    compressrdx_cleanup(1);
  }

  return;
}