/**
 * Reads bit data from a file into an unsigned int.
 *
 * Reads nbits bits from the stream and stores them in *data as an
 * unsigned integer in the platform's native byte order. The first bit
 * read is the most significant of the nbits bits; all higher bits of
 * *data are set to zero, so the result does not depend on the previous
 * content of *data.
 *
 * The bits are collected in a 4-byte buffer laid out MSB first (network
 * byte order) and then combined with shifts, so no byte-order conversion
 * call is needed. Like the rest of this code, this assumes that an
 * unsigned int holds at least 32 bits.
 *
 * @param data    Pointer to the location for the read bit data.
 *                Only written when the function succeeds.
 * @param nbits   Number of bits to read (0 to 32 inclusive).
 * @param stream  The BFile buffer to use.
 * @return        Boolean: 1 for all OK, 0 for a problem (nbits out of
 *                range, or a failed BFread()).
 */
int
BFreadWord(unsigned int *data, int nbits, BFile *stream)
{
  /* zero-initialised so that bytes not covered by nbits contribute 0 */
  unsigned char buf[4] = { 0, 0, 0, 0 };
  int bytes, rest, i;

  if ((nbits > 32) || (nbits < 0)) {
    Rprintf("bitio.o/BFreadWord: nbits (%d) not in legal bounds\n", nbits);
    return 0;
  }

  bytes = nbits / 8;  /* number of complete bytes to read */
  rest  = nbits % 8;  /* leading bits that do not fill a whole byte */

  /* The incomplete leading byte sits directly in front of the complete
   * bytes. rest != 0 implies bytes <= 3, so the index is within buf.
   * NOTE(review): this relies on BFread() storing the bits right-aligned
   * in the target byte -- BFread() is not visible here, please confirm. */
  if (rest && !BFread(buf + 3 - bytes, rest, stream))
    return 0;

  /* the complete bytes occupy the tail of the buffer */
  for (i = 4 - bytes; i < 4; i++) {
    if (!BFread(buf + i, 8, stream))
      return 0;
  }

  /* combine the MSB-first buffer into a native integer */
  *data = ((unsigned int)buf[0] << 24)
        | ((unsigned int)buf[1] << 16)
        | ((unsigned int)buf[2] << 8)
        |  (unsigned int)buf[3];

  return 1;
}
/**
 * Reads one Golomb-coded value from a bit stream.
 *
 * The code consists of a unary part (a run of 1-bits terminated by a
 * 0-bit, giving the quotient q) followed by a binary part (the
 * remainder r). The remainder is read with lb = ub - 1 bits first; if the
 * value read so far is not below nr_sc = 2^ub - b, one more bit is read
 * and nr_sc is subtracted.
 *
 * NOTE(review): for b == 1, log2(b) is 0, so (assuming nint() rounds to
 * the nearest integer) ub == 0 and nr_sc == 0; the "r >= nr_sc" test is
 * then always true and one bit is read after the unary part. This is the
 * behaviour of the original code and is kept unchanged -- please confirm
 * that the encoder writes a matching bit.
 *
 * @param b   The Golomb parameter; must be >= 1.
 * @param bf  The BFile to read from.
 * @return    The decoded value (r + q * b), or -1 if b is invalid or a
 *            read from the stream fails.
 */
int
read_golomb_code_am(int b, BFile *bf)
{
  int q, i, nr_sc, lb, ub, gap;
  unsigned int r;
  unsigned char bit = 0;
  double ldb;

  /* log2() of a non-positive number and the shift below would be invalid */
  if (b < 1)
    return -1;

  ldb = log2(b * 1.0);
  ub = nint(ceil(ldb));
  lb = ub - 1;

  /* read unary part: count 1-bits up to the terminating 0-bit */
  q = 0;
  for (;;) {
    if (!BFread(&bit, 1, bf))
      return -1;              /* do not loop on a stale bit after a failed read */
    if (!bit)
      break;
    q++;
  }

  /* unsigned shift: 1 << 31 would overflow a signed int for very large b */
  nr_sc = (int)((1u << ub) - (unsigned int)b);

  /* read binary part, bitwise */
  r = 0;
  for (i = 0; i < lb; i++) {
    if (!BFread(&bit, 1, bf))
      return -1;
    r <<= 1;
    r |= bit;
  }

  if (debug_cwb_compress_rdx)
    fprintf(debug_output, "%8d: Read r=%5d [%3d/%3d] #sc=%4d, ",
            codepos, (int)r, lb, ub, nr_sc);

  /* values >= nr_sc use the long code: one extra bit, offset by nr_sc */
  if (r >= (unsigned int)nr_sc) {
    if (!BFread(&bit, 1, bf))
      return -1;
    r <<= 1;
    r |= bit;
    r -= (unsigned int)nr_sc;
  }

  gap = (int)r + q * b;

  if (debug_cwb_compress_rdx)
    fprintf(debug_output, "final r=%d\tgap=%d\n", (int)r, gap);

  return gap;
}
/**
 * Checks a huffcoded attribute for errors by decompressing it.
 *
 * This function assumes that compute_code_lengths() has been called
 * beforehand and made sure that the _uncompressed_ token sequence is
 * used by CL access functions.
 *
 * The compressed sequence is decoded token by token; every decoded item
 * is compared with the id returned by cl_cpos2id(), and at every
 * multiple of SYNCHRONIZATION the current bit-file position is compared
 * with the next entry of the sync file.
 *
 * On any error (unreadable file, wrong corpus size, wrong sync offset,
 * read failure, wrong token) a message is printed, all files opened by
 * this function are closed, rcqp_receive_error(1) is called and the
 * function returns without printing the "You can delete ..." message.
 *
 * NOTE(review): the code descriptor filled in by ReadHCD() is not
 * released here (as in the original code); whether it owns memory
 * cannot be seen from this function.
 *
 * @param attr   The attribute to check.
 * @param fname  Base filename to use for the three compressed-attribute files.
 *               Can be NULL, in which case the filenames in the attribute are used.
 */
void
decode_check_huff(Attribute *attr, char *fname)
{
  BFile bfd;
  FILE *sync;
  HCD hc;

  int pos, size, sync_offset, offset;
  int l, v;
  int item, true_item;
  int len_hcd, len_huf, len_sync;   /* snprintf() results, for truncation checks */

  unsigned char bit;

  char hcd_path[CL_MAX_LINE_LENGTH];
  char huf_path[CL_MAX_LINE_LENGTH];
  char sync_path[CL_MAX_LINE_LENGTH];

  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* Build the three file names; snprintf() never writes past the buffers. */
  if (fname) {
    len_hcd  = snprintf(hcd_path,  sizeof hcd_path,  "%s.hcd", fname);
    len_huf  = snprintf(huf_path,  sizeof huf_path,  "%s.huf", fname);
    len_sync = snprintf(sync_path, sizeof sync_path, "%s.huf.syn", fname);
  }
  else {
    char *path;

    path = component_full_name(attr, CompHuffSeq, NULL);
    assert(path && (cderrno == CDA_OK));
    len_huf = snprintf(huf_path, sizeof huf_path, "%s", path);

    path = component_full_name(attr, CompHuffCodes, NULL);
    assert(path && (cderrno == CDA_OK));
    len_hcd = snprintf(hcd_path, sizeof hcd_path, "%s", path);

    path = component_full_name(attr, CompHuffSync, NULL);
    assert(path && (cderrno == CDA_OK));
    len_sync = snprintf(sync_path, sizeof sync_path, "%s", path);
  }

  if (len_hcd  < 0 || (size_t)len_hcd  >= sizeof hcd_path ||
      len_huf  < 0 || (size_t)len_huf  >= sizeof huf_path ||
      len_sync < 0 || (size_t)len_sync >= sizeof sync_path) {
    Rprintf("ERROR: file name too long. Aborted.\n");
    rcqp_receive_error(1);
    return;
  }

  /* rcqp_receive_error() is not visible from here; the explicit returns
   * below make sure we never continue with invalid handles should it
   * return to its caller. */

  Rprintf("- reading code descriptor block from %s\n", hcd_path);
  if (!ReadHCD(hcd_path, &hc)) {
    Rprintf("ERROR: reading %s failed. Aborted.\n", hcd_path);
    rcqp_receive_error(1);
    return;
  }

  Rprintf("- reading compressed item sequence from %s\n", huf_path);
  if (!BFopen(huf_path, "r", &bfd)) {
    Rprintf("ERROR: can't open file %s. Aborted.\n", huf_path);
    perror(huf_path);
    rcqp_receive_error(1);
    return;
  }

  Rprintf("- reading sync (mod %d) from %s\n", SYNCHRONIZATION, sync_path);
  /* the sync file is read with NreadInt(), i.e. as binary data */
  if ((sync = fopen(sync_path, "rb")) == NULL) {
    Rprintf("ERROR: can't open file %s. Aborted.\n", sync_path);
    perror(sync_path);
    BFclose(&bfd);
    rcqp_receive_error(1);
    return;
  }

  size = cl_max_cpos(attr);
  if (size != hc.length) {
    Rprintf("ERROR: wrong corpus size (%d tokens) in %s (correct size: %d)\n",
            hc.length, hcd_path, size);
    goto fail;
  }

  for (pos = 0; pos < hc.length; pos++) {

    if ((pos % SYNCHRONIZATION) == 0) {
      /* need to get offset before flushing (because flushing fills the
       * bit buffer and advances offset to the following byte!) */
      offset = BFposition(&bfd);
      if (pos > 0)
        BFflush(&bfd);
      sync_offset = -1;       /* make sure we get an error if read below fails */
      NreadInt(&sync_offset, sync);
      if (offset != sync_offset) {
        Rprintf("ERROR: wrong sync offset %d (true offset %d) at cpos %d. Aborted.\n",
                sync_offset, offset, pos);
        goto fail;
      }
    }

    if (!BFread(&bit, 1, &bfd)) {
      Rprintf("ERROR reading file %s. Aborted.\n", huf_path);
      goto fail;
    }

    /* extend the code bit by bit until it is a valid code of length l */
    v = (bit ? 1 : 0);
    l = 1;
    while (v < hc.min_code[l]) {
      if (!BFread(&bit, 1, &bfd)) {
        Rprintf("ERROR reading file %s. Aborted.\n", huf_path);
        goto fail;
      }
      v <<= 1;
      if (bit)
        v++;
      l++;
    }

    /* map the code (length l, value v) to the item it encodes */
    item = hc.symbols[hc.symindex[l] + v - hc.min_code[l]];

    true_item = cl_cpos2id(attr, pos);
    if (item != true_item) {
      /* a mismatch means the compressed data is wrong: stop here, so the
       * user is not told below that the uncompressed file may be deleted */
      Rprintf("ERROR: wrong token (id=%d) at cpos %d (correct id=%d). Aborted.\n",
              item, pos, true_item);
      goto fail;
    }
  }

  fclose(sync);
  BFclose(&bfd);

  /* tell the user it's safe to delete the CORPUS component now */
  Rprintf("!! You can delete the file <%s> now.\n",
          component_full_name(attr, CompCorpus, NULL));

  return;                     /* errors are signalled via rcqp_receive_error(), so there's no return value */

fail:
  /* common error exit once both input files are open */
  fclose(sync);
  BFclose(&bfd);
  rcqp_receive_error(1);
  return;
}