/** * Main function for cwb-huffcode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int R_cwb_huffcode (char *corpus_name, char *registry_dir) { char *registry_directory = registry_dir; char *output_fn = NULL; char *attr_name = DEFAULT_ATT_NAME; Attribute *attr; HCD hc; Rprintf("Corpus: %s\n", corpus_name); int i_want_to_believe = 0; /* skip error checks? */ int all_attributes = 0; protocol = NULL; /* 'delayed' init (see top of file) */ /* ------------------------------------------------- PARSE ARGUMENTS */ /* parse arguments */ all_attributes++; corpus_id_cwb_huffcode = corpus_name; Rprintf("Corpus 1: %s\n", corpus_name); if ((corpus = cl_new_corpus(registry_directory, corpus_id_cwb_huffcode)) == NULL) { Rprintf( "Corpus %s not found in registry %s . Aborted.\n", corpus_id_cwb_huffcode, (registry_directory ? registry_directory : central_corpus_directory())); rcqp_receive_error(1); } Rprintf("Corpus 2: %s\n", corpus_name); if (all_attributes) { for (attr = corpus->attributes; attr; attr = attr->any.next) if (attr->any.type == ATT_POS) { compute_code_lengths(attr, &hc, output_fn); if (! i_want_to_believe) decode_check_huff(attr, output_fn); } } else { if ((attr = cl_new_attribute(corpus, attr_name, ATT_POS)) == NULL) { Rprintf( "Attribute %s.%s doesn't exist. Aborted.\n", corpus_id_cwb_huffcode, attr_name); rcqp_receive_error(1); } compute_code_lengths(attr, &hc, output_fn); if (! i_want_to_believe) decode_check_huff(attr, output_fn); } Rprintf("Corpus 3: %s\n", corpus_name); cl_delete_corpus(corpus); return(0); }
/** Open disk files for the s-attribute being encoded (must have been declared first). */ void sencode_open_files(void) { char buf[CL_MAX_LINE_LENGTH]; sprintf(buf, RNG_RNG, new_satt.dir, new_satt.name); if ((new_satt.fd = fopen(buf, "wb")) == NULL) { perror(buf); rcqp_receive_error(1); } if (new_satt.store_values) { sprintf(buf, RNG_AVS, new_satt.dir, new_satt.name); if ((new_satt.avs = fopen(buf, "w")) == NULL) { perror(buf); rcqp_receive_error(1); } sprintf(buf, RNG_AVX, new_satt.dir, new_satt.name); if ((new_satt.avx = fopen(buf, "wb")) == NULL) { perror(buf); rcqp_receive_error(1); } } new_satt.ready = 1; }
/**
 * Close the disk files for the s-attribute being encoded.
 *
 * Does nothing unless new_satt.ready is set. The RNG file is always closed;
 * the AVS and AVX files are closed only if their handles are non-NULL.
 * A failing fclose() is reported with perror() and rcqp_receive_error(1).
 * Clears new_satt.ready at the end.
 */
void
sencode_close_files(void)
{
  if (!new_satt.ready)
    return;

  if (fclose(new_satt.fd) == EOF) {
    perror("Error writing RNG file");
    rcqp_receive_error(1);
  }

  if (new_satt.avs && fclose(new_satt.avs) == EOF) {
    perror("Error writing AVS file");
    rcqp_receive_error(1);
  }

  if (new_satt.avx && fclose(new_satt.avx) == EOF) {
    perror("Error writing AVX file");
    rcqp_receive_error(1);
  }

  new_satt.ready = 0;
}
/**
 * Reports an "internal error" condition of the server.
 *
 * Prints two lines naming the failing function and the reason, then
 * signals the error through rcqp_receive_error(1).
 *
 * @param function  String: name of the calling function, that is,
 *                  the point where the error was raised.
 * @param reason    String with any other explanatory details about the error.
 */
void
cqiserver_internal_error(char *function, char *reason)
{
  Rprintf("CQPserver: internal error in %s()\n", function);
  Rprintf("CQPserver: ''%s''\n", reason);

  rcqp_receive_error(1);
}
/**
 * Releases the global corpus (if one is loaded) and then signals an error.
 *
 * @param error_code  Value handed on to rcqp_receive_error().
 */
void
decode_cleanup(int error_code)
{
  if (corpus) {
    cl_delete_corpus(corpus);
  }

  rcqp_receive_error(error_code);
}
/** * Prints a usage message and exits the program. * * @param msg A message about the error. * @param error_code Value to be returned by the program when it exits. */ void huffcode_usage(char *msg, int error_code) { if (msg) Rprintf( "Usage error: %s\n", msg); Rprintf( "\n"); Rprintf( "Usage: %s [options] <corpus>\n\n", progname); Rprintf( "Compress the token sequence of a positional attribute. Creates .huf, .hcd,\n"); Rprintf( "and .huf.syn files, which replace the corresponding .corpus files. After\n"); Rprintf( "running this tool successfully, the .corpus files can be deleted.\n"); Rprintf( "\n"); Rprintf( "Options:\n"); Rprintf( " -P <att> compress attribute <att> [default: word]\n"); Rprintf( " -A compress all positional attributes\n"); Rprintf( " -r <dir> set registry directory\n"); Rprintf( " -f <file> set output file prefix (creates <file>.huf, ...)\n"); Rprintf( " -v verbose mode (shows protocol) [may be repeated]\n"); /* Rprintf( " -d debug mode (not implemented)\n"); *//* TODO -d / -D distinct as in cwb-compress-rdx? */ Rprintf( " -T skip validation pass ('I trust you')\n"); Rprintf( " -h this help page\n\n"); Rprintf( "Part of the IMS Open Corpus Workbench v" VERSION "\n\n"); if (corpus) cl_delete_corpus(corpus); rcqp_receive_error(error_code); }
/**
 * Safely reallocates memory.
 *
 * A NULL block is allocated with malloc() rather than realloc(), because
 * some OSs don't fall back to malloc() if block == NULL. If the
 * (re)allocation fails for a non-zero size, an out-of-memory message is
 * printed and rcqp_receive_error(1) is called. A NULL result for a request
 * of 0 bytes is passed back silently.
 *
 * @see           cl_malloc
 * @param block   Pointer to the block to be reallocated (may be NULL)
 * @param bytes   Number of bytes to allocate to the resized memory block
 * @return        Pointer to the block of reallocated memory
 */
void *
cl_realloc(void *block, size_t bytes)
{
  void *new_block;

  if (block == NULL)
    new_block = malloc(bytes);
  else
    new_block = realloc(block, bytes);

  /* reallocating to 0 bytes may legitimately yield NULL, so only a failed
   * request for a non-zero size counts as out-of-memory */
  if (new_block == NULL && bytes != 0) {
    Rprintf("CL: Out of memory. (killed)\n");
    /* size_t is not necessarily long: convert explicitly to match the format */
    Rprintf("CL: [cl_realloc(block at %p to %lu bytes)]\n", block, (unsigned long)bytes);
    Rprintf("\n");              /* for CQP's child mode */
    rcqp_receive_error(1);
  }

  return new_block;
}
/** * print usage message and exit */ void sencode_usage(void) { Rprintf( "\n"); Rprintf( "Usage: %s [options] (-S <att> | -V <att>)\n", progname); Rprintf( "\n"); Rprintf( "Adds s-attributes with computed start and end points to a corpus\n"); Rprintf( "\n"); Rprintf( "Options:\n"); Rprintf( " -B strip leading/trailing blanks from annotations\n"); Rprintf( " -d <dir> directory for output files\n"); Rprintf( " -f <file> read input from <file> [default: stdin]\n"); Rprintf( " -M create list of regions in memory (resolving overlaps)\n"); Rprintf( " -r <dir> set registry directory <dir>\n"); Rprintf( " -C <id> work on corpus <id> (with -a option)\n"); Rprintf( " -a add to existing annotation (resolving overlaps, implies -M)\n"); Rprintf( " -m treat annotations as feature set (or 'multi-value') attribute\n"); Rprintf( " -s (with -m) check that format of set annotations is consistent\n"); Rprintf( " -q silent mode ('be quiet')\n"); Rprintf( " -D debug mode\n"); Rprintf( " -S <att> generate s-attribute <att>\n"); Rprintf( " -V <att> generate s-attribute <att> with annotations\n"); Rprintf( "Part of the IMS Open Corpus Workbench v" VERSION "\n\n"); rcqp_receive_error(2); }
/**
 * Converts a regular expression to a DFA. Public function.
 *
 * Parses the expression, forms and merges the states, and then fills in
 * the automaton: its dimensions, a transition table in which every cell
 * initially points to the error state, and the table of final states.
 *
 * @param rxs        The regular expression.
 * @param automaton  Pointer to the DFA object to write to.
 */
void
regex2dfa(char *rxs, DFA *automaton)
{
  int start_state;
  int row, col;
  int state_no, shift_no, next_class, target;
  State state;

  searchstr = rxs;
  init();
  start_state = Parse();

  if (ERRORS > 0)
    Rprintf("%d error(s)\n", ERRORS);
  if (start_state == -1)
    rcqp_receive_error(1);

  FormState(start_state);
  MergeStates();

  automaton->Max_States = Ss;
  automaton->Max_Input = Environment[eep].MaxPatIndex + 1;
  automaton->E_State = automaton->Max_States;   /* error state = one past the last real state */

  if (show_dfa)                 /* TODO: Use a module-internal debug variable (for encapsulation). */
    WriteStates();

  /* transition table: one row per state, every cell preset to the error state */
  automaton->TransTable = (int **)cl_malloc(sizeof(int *) * automaton->Max_States);
  for (row = 0; row < Ss; row++) {
    automaton->TransTable[row] = (int *)cl_malloc(sizeof(int) * automaton->Max_Input);
    for (col = 0; col < automaton->Max_Input; col++) {
      automaton->TransTable[row][col] = automaton->E_State;
    }
  }

  /* table of final states (with one extra slot for the error state), all False at first */
  automaton->Final = (Boolean *)cl_malloc(sizeof(Boolean) * (Ss + 1));
  for (row = 0; row <= automaton->Max_States; row++) {
    automaton->Final[row] = False;
  }

  /* only states whose class number equals the running class counter are processed */
  next_class = 0;
  for (state_no = 0; state_no < Ss; state_no++) {
    state = &STab[state_no];
    if (state->Class == next_class) {
      next_class++;
      if (state->Empty)
        automaton->Final[state->Class] = True;
      for (shift_no = 0; shift_no < state->Shifts; shift_no++) {
        target = state->ShList[shift_no].RHS;
        automaton->TransTable[state->Class][atoi(state->ShList[shift_no].LHS->Name)] = STab[target].Class;
      }
    }
  }
}
/**
 * Writes an integer to file, converting to network byte order.
 *
 * Other than the byte order conversion (htonl), this is the same as
 * fwrite(&val, sizeof(int), 1, fd) . A failed write is reported with
 * perror() and rcqp_receive_error(1).
 *
 * @param val  The integer to write.
 * @param fd   File handle to write to.
 */
void
NwriteInt(int val, FILE *fd)
{
  int net_order = htonl(val);
  size_t n_written = fwrite(&net_order, sizeof(int), 1, fd);

  if (n_written != 1) {
    perror("File write error");
    rcqp_receive_error(1);
  }
}
/**
 * Reads an integer from file, converting from network byte order.
 *
 * A failed read is reported with perror() and rcqp_receive_error(1).
 *
 * @param val  Location to put the resulting int.
 * @param fd   File handle to read from
 */
void
NreadInt(int *val, FILE *fd)
{
  int net_order;
  size_t n_read = fread(&net_order, sizeof(int), 1, fd);

  if (n_read != 1) {
    perror("File read error");
    rcqp_receive_error(1);
  }

  *val = ntohl(net_order);
}
/**
 * Main function for cwb-atoi.
 *
 * Flags -n / -l select the byte order (little_endian = 0 / 1); -h or any
 * other flag prints the usage text and calls rcqp_receive_error(1).
 * A non-flag argument is opened as the input file; without one, input is
 * read from stdin. The selected stream is handed to process_fd().
 *
 * @param argc  Number of command-line arguments.
 * @param argv  Command-line arguments.
 * @return      0
 */
int
main(int argc, char **argv)
{
  FILE *input = stdin;          /* default case: we are reading from stdin */
  int k;

  for (k = 1; k < argc; k++) {
    char *arg = argv[k];

    if (arg[0] != '-') {
      /* not a flag: treat it as the input file name */
      input = fopen(arg, "rb");
      if (input == NULL) {
        Rprintf("%s: Couldn't open %s\n", argv[0], arg);
        rcqp_receive_error(1);
      }
    }
    else if (arg[1] == 'n') {
      little_endian = 0;
    }
    else if (arg[1] == 'l') {
      little_endian = 1;
    }
    else {
      /* -h and every unrecognised flag: show usage */
      Rprintf("\n");
      Rprintf("Usage: %s [options] [file]\n", argv[0]);
      Rprintf("Reads one integer per line from ASCII file <file> or from standard input\n");
      Rprintf("and writes values to standard output as 32-bit integers in network format\n");
      Rprintf("(the format used by CWB binary data files).\n");
      Rprintf("Options:\n");
      Rprintf(" -n convert to network format [default]\n");
      Rprintf(" -l convert to little endian format\n");
      Rprintf("Part of the IMS Open Corpus Workbench v" VERSION "\n\n");
      rcqp_receive_error(1);
    }
  }

  /* now process either input file or stdin */
  process_fd(input);
  return 0;
}
/**
 * Main function for cwb-itoa.
 *
 * Flags -n / -l select the byte order (little_endian = 0 / 1); -h or any
 * other flag prints the usage text and calls rcqp_receive_error(1).
 * A non-flag argument is opened as the input file; without one, input is
 * read from stdin. The selected stream is handed to process_fd().
 *
 * @param argc  Number of command-line arguments.
 * @param argv  Command-line arguments.
 * @return      0
 */
int
main(int argc, char **argv)
{
  FILE *input;
  int k;

  input = stdin;                /* initialisation removed from declaration for Gnuwin32 compatibility */

  for (k = 1; k < argc; k++) {
    char *arg = argv[k];

    if (arg[0] != '-') {
      /* not a flag: treat it as the input file name */
      input = fopen(arg, "rb");
      if (input == NULL) {
        Rprintf("%s: Couldn't open %s\n", argv[0], arg);
        rcqp_receive_error(1);
      }
    }
    else if (arg[1] == 'n') {
      little_endian = 0;
    }
    else if (arg[1] == 'l') {
      little_endian = 1;
    }
    else {
      /* -h and every unrecognised flag: show usage */
      Rprintf("\n");
      Rprintf("Usage: %s [options] [file]\n", argv[0]);
      Rprintf("Reads 32bit integers in network format from CWB binary data file <file>\n");
      Rprintf("or from standard input and prints the values as ASCII numbers on standard\n");
      Rprintf("output (one number per line).\n");
      Rprintf("Options:\n");
      Rprintf(" -n read integers in network format [default]\n");
      Rprintf(" -l read integers in little endian format\n");
      Rprintf("Part of the IMS Open Corpus Workbench v" VERSION "\n\n");
      rcqp_receive_error(1);
    }
  }

  /* now process either input file or stdin */
  process_fd(input);
  return 0;
}
/**
 * Releases global resources and then signals an error.
 *
 * Deletes the global corpus if one is loaded and closes the debug output
 * stream if it is open, before calling rcqp_receive_error().
 *
 * @param error_code  Value handed on to rcqp_receive_error().
 */
void
compressrdx_cleanup(int error_code)
{
  if (corpus != NULL) {
    cl_delete_corpus(corpus);
  }

  if (debug_output) {
    fclose(debug_output);
  }

  rcqp_receive_error(error_code);
}
/**
 * Pushes an entry onto the stack addressed by SP.
 *
 * If SP has already reached Stack + STACK_MAX, an "Expression too complex"
 * error is raised through REGEX2DFA_ERROR() and rcqp_receive_error(1).
 *
 * @param Tag  Tag to store in the new stack entry.
 * @param Q    Value to store in the new stack entry.
 */
void
PUSH(StackTag Tag, int Q)
{
  if (SP >= Stack + STACK_MAX) {
    REGEX2DFA_ERROR("Expression too complex ... aborting.");
    rcqp_receive_error(1);
  }

  /* fill the slot SP points at, then advance to the next free slot */
  SP->Tag = Tag;
  SP->Q = Q;
  ++SP;
}
/** * Sends the current CL error value to the client. * * This function takes the current contents of of the CL library's global * cl_errno error value and sends it to the client. * * It takes the CL error consant and translates it into the corresponding * CQI_CL_ERROR_* constant. * * NB: This function shuts down the server with an error condition if cl_errno * does not actually contain an error condition. * * @see cl_errno */ void send_cl_error(void) { int cmd; switch (cl_errno) { case CDA_EATTTYPE: cmd = CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE; break; case CDA_EIDORNG: case CDA_EIDXORNG: case CDA_EPOSORNG: cmd = CQI_CL_ERROR_OUT_OF_RANGE; break; case CDA_EPATTERN: case CDA_EBADREGEX: cmd = CQI_CL_ERROR_REGEX; break; case CDA_ENODATA: cmd = CQI_CL_ERROR_CORPUS_ACCESS; break; case CDA_ENOMEM: cmd = CQI_CL_ERROR_OUT_OF_MEMORY; break; case CDA_EOTHER: case CDA_ENYI: cmd = CQI_CL_ERROR_INTERNAL; break; case CDA_OK: Rprintf( "CQPserver: send_cl_error() called with cderrno == CDA_OK\n"); rcqp_receive_error(1); default: Rprintf( "CQPserver: send_cl_error() unknown value in cderrno\n"); rcqp_receive_error(1); } if (server_debug) Rprintf( "CQi: CL error, returning 0x%04X\n", cmd); cqi_command(cmd); return; }
/** * Prints a message describing how to use the program to STDERR and then exits. */ void describecorpus_usage(void) { Rprintf( "\n"); Rprintf( "Usage: %s [flags] <corpus> [<corpus> ...] \n", progname); Rprintf( "Options:\n"); Rprintf( " -r <dir> use registry directory <dir>\n"); Rprintf( " -s show statistics (attribute & lexicon size)\n"); Rprintf( " -d show details (about component files)\n"); Rprintf( " -h this help page\n"); Rprintf( "Part of the IMS Open Corpus Workbench v" VERSION "\n\n"); rcqp_receive_error(2); }
/**
 * Safely allocates memory malloc-style.
 *
 * This function allocates a block of memory of the requested size and
 * tests for malloc() failure: if the system is out of memory, an error
 * message is printed and rcqp_receive_error(1) is called.
 *
 * @param bytes  Number of bytes to allocate
 * @return       Pointer to the block of allocated memory
 */
void *
cl_malloc(size_t bytes)
{
  void *block = malloc(bytes);

  if (block == NULL) {
    Rprintf("CL: Out of memory. (killed)\n");
    /* size_t is not necessarily long: convert explicitly to match the format */
    Rprintf("CL: [cl_malloc(%lu)]\n", (unsigned long)bytes);
    Rprintf("\n");              /* for CQP's child mode */
    rcqp_receive_error(1);
  }

  return block;
}
/**
 * Safely duplicates a string.
 *
 * If strdup() fails, an out-of-memory message is printed and
 * rcqp_receive_error(1) is called.
 *
 * @see            cl_malloc
 * @param string   Pointer to the original string
 * @return         Pointer to the newly duplicated string
 */
char *
cl_strdup(char *string)
{
  char *new_string = strdup(string);

  if (new_string == NULL) {
    Rprintf("CL: Out of memory. (killed)\n");
    /* %p expects a void pointer, and size_t is not necessarily long:
     * convert both arguments explicitly to match the format */
    Rprintf("CL: [cl_strdup(addr=%p, len=%lu)]\n",
            (void *)string, (unsigned long)strlen(string));
    Rprintf("\n");              /* for CQP's child mode */
    rcqp_receive_error(1);
  }

  return new_string;
}
/**
 * Writes an array of integers to file, converting to network byte order.
 *
 * Other than the byte order conversion, this is the same as
 * fwrite(vals, sizeof(int), nr_vals, fd) . The values are written one at
 * a time; a failed write is reported with perror() and rcqp_receive_error(1).
 *
 * @param vals     Pointer to the location of the block of integers to write.
 * @param nr_vals  Number of integers to write.
 * @param fd       File handle to write to.
 */
void
NwriteInts(int *vals, int nr_vals, FILE *fd)
{
  int idx = 0;

  /* one fwrite() per value: we rely on the stream's buffering */
  while (idx < nr_vals) {
    int net_order = htonl(vals[idx]);

    if (fwrite(&net_order, sizeof(int), 1, fd) != 1) {
      perror("File write error");
      rcqp_receive_error(1);
    }
    idx++;
  }
}
/**
 * Reads an array of integers from file, converting from network byte order.
 *
 * The values are read one at a time; a failed read is reported with
 * perror() and rcqp_receive_error(1).
 *
 * @param vals     Pointer to location to put the resulting array of ints.
 *                 (This memory must have been allocated by the caller.)
 * @param nr_vals  Number of integers to read.
 * @param fd       File handle to read from
 */
void
NreadInts(int *vals, int nr_vals, FILE *fd)
{
  int idx = 0;

  /* one fread() per value: we rely on the stream's buffering */
  while (idx < nr_vals) {
    int net_order;

    if (fread(&net_order, sizeof(int), 1, fd) != 1) {
      perror("File read error");
      rcqp_receive_error(1);
    }
    vals[idx] = ntohl(net_order);
    idx++;
  }
}
/**
 * Safely allocates memory calloc-style.
 *
 * If calloc() fails, an out-of-memory message is printed and
 * rcqp_receive_error(1) is called.
 *
 * @see                    cl_malloc
 * @param nr_of_elements   Number of elements to allocate
 * @param element_size     Size of each element
 * @return                 Pointer to the block of allocated memory
 */
void *
cl_calloc(size_t nr_of_elements, size_t element_size)
{
  void *block = calloc(nr_of_elements, element_size);

  if (block == NULL) {
    Rprintf("CL: Out of memory. (killed)\n");
    /* size_t is not necessarily long: convert explicitly to match the format */
    Rprintf("CL: [cl_calloc(%lu*%lu bytes)]\n",
            (unsigned long)nr_of_elements, (unsigned long)element_size);
    Rprintf("\n");              /* for CQP's child mode */
    rcqp_receive_error(1);
  }

  return block;
}
/**
 * Prints an error message (prefixed with the current value of LINE and
 * terminated by a newline) via Rprintf / Rvprintf, and calls
 * rcqp_receive_error(1) once the error count reaches MAX_ERRORS.
 *
 * @param Format  printf-style format string for the message.
 * @param ...     Arguments for the format string.
 */
static void
REGEX2DFA_ERROR(char *Format, ...)
{
  va_list AP;

  Rprintf("[%d] ", LINE);

  va_start(AP, Format);
  Rvprintf(Format, AP);
  va_end(AP);

  /* terminate the message line; formatting '\n' with "%d" would print
   * the character code "10" instead of a line break */
  Rprintf("\n");

  if (++ERRORS == MAX_ERRORS) {
    Rprintf("regex2dfa: Reached the %d error limit.\n", MAX_ERRORS);
    rcqp_receive_error(1);
  }
}
/**
 * Prints basic information about a corpus (via Rprintf).
 *
 * Shows name, registry file, home directory and info file, the corpus size
 * in tokens (taken from the 'word' attribute), and the number of positional,
 * structural and alignment attributes.
 *
 * @param corpus                The corpus to report on.
 * @param with_attribute_names  Boolean: iff true, the counts of each type of attribute
 *                              are followed by a list of attribute names.
 */
void
describecorpus_show_basic_info (Corpus *corpus, int with_attribute_names)
{
  Attribute *word_att, *att;
  int n_positional = 0;
  int n_structural = 0;
  int n_alignment = 0;
  int n_tokens;
  char *suffix;

  /* the count lines end in a colon only if a list of names follows */
  if (with_attribute_names)
    suffix = ":";
  else
    suffix = "";

  Rprintf("description: %s\n", corpus->name);
  Rprintf("registry file: %s/%s\n", corpus->registry_dir, corpus->registry_name);
  Rprintf("home directory: %s/\n", corpus->path);
  Rprintf("info file: %s\n", (corpus->info_file) ? corpus->info_file : "(none)");

  word_att = cl_new_attribute(corpus, "word", ATT_POS);
  if (word_att == NULL) {
    Rprintf("ERROR: 'word' attribute is missing. Aborted.\n");
    rcqp_receive_error(1);
  }

  n_tokens = cl_max_cpos(word_att);
  Rprintf("size (tokens): ");
  if (n_tokens < 0)
    Rprintf("ERROR\n");
  else
    Rprintf("%d\n", n_tokens);
  Rprintf("\n");

  /* count the attributes of each type */
  for (att = corpus->attributes; att != NULL; att = att->any.next) {
    if (att->any.type == ATT_POS)
      n_positional++;
    else if (att->any.type == ATT_STRUC)
      n_structural++;
    else if (att->any.type == ATT_ALIGN)
      n_alignment++;
  }

  Rprintf("%3d positional attributes%s\n", n_positional, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_POS);

  Rprintf("%3d structural attributes%s\n", n_structural, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_STRUC);

  Rprintf("%3d alignment attributes%s\n", n_alignment, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_ALIGN);

  Rprintf("\n");
}
/** * Prints a message describing how to use the program to STDERR and then exits. */ void alignencode_usage(void) { Rprintf( "\n"); Rprintf( "Usage: %s [options] <alignment_file>\n\n", progname); Rprintf( "\n"); Rprintf( "Adds an alignment attribute to an existing CWB corpus\n"); Rprintf( "\n"); Rprintf( "Options:\n"); Rprintf( " -d <dir> write data file(s) to directory <dir>\n"); Rprintf( " -D write files to corpus data directory\n"); Rprintf( " -C compatibility mode (creates .alg file)\n"); /* Rprintf( " -R reverse alignment (target -> source)\n"); */ /* -R option disabled ... need to re-order alignment file for reverse alignment */ Rprintf( " -r <reg> use registry directory <reg>\n"); Rprintf( " -v verbose mode\n"); Rprintf( " -h this help page\n\n"); Rprintf( "Part of the IMS Open Corpus Workbench v" VERSION "\n\n"); rcqp_receive_error(1); }
/** * Write data about a region to disk files (as defined in global variable new_satt). */ void sencode_write_region(int start, int end, char *annot) { if (!new_satt.ready) sencode_open_files(); if (new_satt.store_values && (LH == NULL)) LH = cl_new_lexhash(0); /* write start & end positions of region */ NwriteInt(start, new_satt.fd); NwriteInt(end, new_satt.fd); /* store annotation for -V attribute */ if (new_satt.store_values) { int offset, id; cl_lexhash_entry entry; entry = cl_lexhash_find(LH, annot); if (entry == NULL) { /* must add string to hash and to avs file */ entry = cl_lexhash_add(LH, annot); entry->data.integer = new_satt.offset; new_satt.offset += strlen(annot) + 1; /* increment range offset */ if (0 > fprintf(new_satt.avs, "%s%c", annot, 0)) { perror("Error writing to AVS file"); rcqp_receive_error(1); } } id = entry->id; offset = entry->data.integer; NwriteInt(new_satt.num, new_satt.avx); NwriteInt(offset, new_satt.avx); } new_satt.num++; /* increment region number */ new_satt.last_cpos = end; }
/**
 * Compresses the token stream of a p-attribute.
 *
 * Three files are created: the compressed token stream, the descriptor block,
 * and a sync file.
 *
 * The Huffman code lengths are computed with an in-place heap (first half of
 * the array: heap of pointers, second half: frequencies, later parent pointers
 * and finally the symbol table), following Witten/Moffat/Bell.
 *
 * @param attr   The attribute to compress.
 * @param hc     Location for the resulting Huffmann code descriptor block.
 *               NB: hc->symbols points into a work buffer that is released
 *               before returning, so it is reset to NULL at the end.
 * @param fname  Base filename for the resulting files; if NULL, the
 *               attribute's standard component file names are used.
 * @return       Always 1.
 */
int
compute_code_lengths(Attribute *attr, HCD *hc, char *fname)
{
  int id, i, h;

  int *heap = NULL;
  unsigned *codelength = NULL;  /* was char[], probably to save space; but that's unnecessary and makes gcc complain */

  int issued_codes[MAXCODELEN];
  int next_code[MAXCODELEN];

  long sum_bits;

  Rprintf("COMPRESSING TOKEN STREAM of %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* I need the following components:
   * - CompCorpus
   * - CompCorpusFreqs
   * - CompLexicon
   * - CompLexiconIdx
   * and want to force the CL to use them rather than compressed data.
   */
  if (ensure_component(attr, CompCorpus, 0) == NULL) {
    Rprintf("Computation of huffman codes needs the CORPUS component\n");
    rcqp_receive_error(1);
  }

  if (ensure_component(attr, CompLexicon, 0) == NULL) {
    Rprintf("Computation of huffman codes needs the LEXICON component\n");
    rcqp_receive_error(1);
  }

  if (ensure_component(attr, CompLexiconIdx, 0) == NULL) {
    Rprintf("Computation of huffman codes needs the LEXIDX component\n");
    rcqp_receive_error(1);
  }

  if (ensure_component(attr, CompCorpusFreqs, 0) == NULL) {
    Rprintf("Computation of huffman codes needs the FREQS component.\n"
            "Run 'makeall -r %s -c FREQS %s %s' in order to create it.\n",
            corpus->registry_dir, corpus->registry_name, attr->any.name);
    rcqp_receive_error(1);
  }

  /*
   * strongly follows Witten/Moffat/Bell: ``Managing Gigabytes'',
   * pp. 335ff.
   */

  hc->size = cl_max_id(attr);                   /* the size of the attribute (nr of items) */
  if ((hc->size <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_id() failed");
    rcqp_receive_error(1);
  }

  hc->length = cl_max_cpos(attr);               /* the length of the attribute (nr of tokens) */
  if ((hc->length <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_cpos() failed");
    rcqp_receive_error(1);
  }

  hc->symbols = NULL;

  hc->min_codelen = 100;
  hc->max_codelen = 0;

  memset((char *)hc->lcount, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->min_code, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->symindex, '\0', MAXCODELEN * sizeof(int));

  memset((char *)issued_codes, '\0', MAXCODELEN * sizeof(int));

  codelength = (unsigned *)cl_calloc(hc->size, sizeof(unsigned));

  /* =========================================== make & initialize the heap */

  heap = (int *)cl_malloc(hc->size * 2 * sizeof(int));

  for (i = 0; i < hc->size; i++) {
    heap[i] = hc->size + i;
    heap[hc->size+i] = get_id_frequency(attr, i) + 1;
    /* add-one trick needed to avoid unsupported Huffman codes > 31 bits for very large corpora of ca. 2 billion words:
       theoretical optimal code length for hapax legomena in such corpora is ca. 31 bits, and the Huffman algorithm
       sometimes generates 32-bit codes; with add-one trick, the theoretical optimal code length is always <= 30 bits */
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0)
    fprintf(protocol, "Allocated heap with %d cells for %d items\n\n", hc->size * 2, hc->size);
  if (do_protocol > 2)
    print_heap(heap, hc->size, "After Initialization");
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 1 */

  h = hc->size;

  /*
   * we address the heap in the following manner: when we start array
   * indices at 1, the left child is at 2i, and the right child is at
   * 2i+1. So we maintain this scheme and decrement just before
   * adressing the array.
   */

  /*
   * construct the initial min-heap
   */

  for (i = hc->size/2; i > 0; i--) {
    /* do:
     * bottom up, left to right,
     * for each root of each subtree, sift if necessary
     */
    sift(heap, h, i);
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 2) {
    print_heap(heap, hc->size, "Initial Min-Heap");
    fprintf(protocol, "\n");
  }
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 2 */

  /* smallest item at top of heap now, remove the two smallest items
   * and sift, find second smallest by removing top and sifting, as
   * long as we have more than one root */

  while (h > 1) {
    int pos[2];

    for (i = 0; i < 2; i++) {
      /* remove topmost (i.e. smallest) item */
      pos[i] = heap[0];

      /* remove and sift, to reobtain heap integrity: move ``last''
       * item to top of heap and sift */
      heap[0] = heap[--h];
      sift(heap, h, 1);
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 3) {
      fprintf(protocol, "Removed smallest item %d with freq %d\n", pos[0], heap[pos[0]]);
      fprintf(protocol, "Removed 2nd smallest item %d with freq %d\n", pos[1], heap[pos[1]]);
    }
    /* ============================== PROTOCOL ============================== */

    /*
     * pos[0] and pos[1] contain pointers to the two smallest items
     * now. since h was decremented twice, h and h+1 are now empty and
     * become the accumulated freq of pos[i]. The individual
     * frequencies are not needed any more, so pointers to h+1 (the
     * acc freq) are stored there instead (tricky, since freq cell
     * becomes pointer cell). So, what happens here, is to include a
     * new element in the heap.
     */

    heap[h] = h+1;
    heap[h+1] = heap[pos[0]] + heap[pos[1]];    /* accumulated freq */
    heap[pos[0]] = heap[pos[1]] = h+1;          /* pointers! */
    h++;                                        /* we put a new element into heap */

    /*
     * now, swap it up until we reobtain heap integrity
     */
    {
      register int parent, current;

      current = h;
      parent = current >> 1;

      while ((parent > 0) && (heap[heap[parent-1]] > heap[heap[current-1]])) {
        int tmp;

        tmp = heap[parent-1];
        heap[parent-1] = heap[current-1];
        heap[current-1] = tmp;

        current = parent;
        parent = current >> 1;
      }
    }
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 3)
    fprintf(protocol, "\n");
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 3 */

  /* compute the code lengths. We don't have any freqs in heap any
   * more, only pointers to parents */

  heap[0] = -1U;

  /* root has a depth of 0 */
  heap[1] = 0;

  /* we trust in what they say on p. 345 */
  for (i = 2; i < hc->size * 2; i++)
    heap[i] = heap[heap[i]]+1;

  /* collect the lengths */
  sum_bits = 0L;

  for (i = 0; i < hc->size; i++) {
    int cl = heap[i+hc->size];

    /* widen before multiplying: length * frequency can exceed the int range */
    sum_bits += (long)cl * get_id_frequency(attr, i);

    codelength[i] = cl;
    if (cl == 0)
      continue;

    if (cl > hc->max_codelen)
      hc->max_codelen = cl;

    if (cl < hc->min_codelen)
      hc->min_codelen = cl;

    /* lcount has MAXCODELEN slots; an over-long code is still recorded in
     * max_codelen and reported below, but must not be counted out of bounds */
    if (cl < MAXCODELEN)
      hc->lcount[cl]++;
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0) {
    fprintf(protocol, "Minimal code length: %3d\n", hc->min_codelen);
    fprintf(protocol, "Maximal code length: %3d\n", hc->max_codelen);
    fprintf(protocol, "Compressed code len: %10ld bits, %10ld (+1) bytes\n\n\n", sum_bits, sum_bits/8);
  }
  /* ============================== PROTOCOL ============================== */

  if (hc->max_codelen >= MAXCODELEN) {
    Rprintf("Error: Huffman codes too long (%d bits, current maximum is %d bits).\n", hc->max_codelen, MAXCODELEN-1);
    Rprintf(" Please contact the CWB development team for assistance.\n");
    rcqp_receive_error(1);
  }

  if ((hc->max_codelen == 0) && (hc->min_codelen == 100)) {
    Rprintf("Problem: No output generated -- no items?\n");
  }
  else {
    hc->min_code[hc->max_codelen] = 0;

    for (i = hc->max_codelen-1; i > 0; i--)
      hc->min_code[i] = (hc->min_code[i+1] + hc->lcount[i+1]) >> 1;

    hc->symindex[hc->min_codelen] = 0;
    for (i = hc->min_codelen+1; i <= hc->max_codelen; i++)
      hc->symindex[i] = hc->symindex[i-1] + hc->lcount[i-1];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 0) {
      int sum_codes = 0;

      fprintf(protocol, " CL #codes MinCode SymIdx\n");
      fprintf(protocol, "----------------------------------------\n");

      for (i = hc->min_codelen; i <= hc->max_codelen; i++) {
        sum_codes += hc->lcount[i];
        fprintf(protocol, "%3d %7d %7d %7d\n", i, hc->lcount[i], hc->min_code[i], hc->symindex[i]);
      }

      fprintf(protocol, "----------------------------------------\n");
      fprintf(protocol, " %7d\n", sum_codes);
    }
    /* ============================== PROTOCOL ============================== */

    for (i = 0; i < MAXCODELEN; i++)
      next_code[i] = hc->min_code[i];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "\n");
      fprintf(protocol, " Item f(item) CL Bits Code, String\n");
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* compute and issue codes */

    hc->symbols = heap + hc->size;

    for (i = 0; i < hc->size; i++) {
      /* we store the code for item i in heap[i] */
      heap[i] = next_code[codelength[i]];
      next_code[codelength[i]]++;

      /* ============================== PROTOCOL ============================== */
      if (do_protocol > 1) {
        fprintf(protocol, "%7d %7d %3d %10d ",
                i,
                get_id_frequency(attr, i),
                codelength[i],
                codelength[i] * get_id_frequency(attr, i));

        bprintf(heap[i], codelength[i], protocol);

        fprintf(protocol, " %7d %s\n", heap[i], get_string_of_id(attr, i));
      }
      /* ============================== PROTOCOL ============================== */

      /* and put the item itself in the second half of the table */
      heap[hc->size+hc->symindex[codelength[i]]+issued_codes[codelength[i]]] = i;
      issued_codes[codelength[i]]++;
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* The work itself -- encode the attribute data */
    {
      char *path;

      char hcd_path[CL_MAX_LINE_LENGTH];
      char huf_path[CL_MAX_LINE_LENGTH];
      char sync_path[CL_MAX_LINE_LENGTH];
      int hcd_len, huf_len, sync_len;

      Component *corp;

      BFile bfd;
      FILE *sync;

      int cl, code, pos;

      corp = ensure_component(attr, CompCorpus, 0);
      assert(corp);

      /* build the three output paths with bounded writes */
      if (fname) {
        path = fname;

        hcd_len = snprintf(hcd_path, sizeof(hcd_path), "%s.hcd", path);
        huf_len = snprintf(huf_path, sizeof(huf_path), "%s.huf", path);
        sync_len = snprintf(sync_path, sizeof(sync_path), "%s.huf.syn", path);
      }
      else {
        path = component_full_name(attr, CompHuffSeq, NULL);
        assert(path);           /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        huf_len = snprintf(huf_path, sizeof(huf_path), "%s", path);

        path = component_full_name(attr, CompHuffCodes, NULL);
        assert(path);           /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        hcd_len = snprintf(hcd_path, sizeof(hcd_path), "%s", path);

        path = component_full_name(attr, CompHuffSync, NULL);
        assert(path);           /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        sync_len = snprintf(sync_path, sizeof(sync_path), "%s", path);
      }

      /* a truncated path would silently address the wrong file */
      if (hcd_len < 0 || (size_t)hcd_len >= sizeof(hcd_path) ||
          huf_len < 0 || (size_t)huf_len >= sizeof(huf_path) ||
          sync_len < 0 || (size_t)sync_len >= sizeof(sync_path)) {
        Rprintf("ERROR: output file name too long (limit is %d characters). Aborted.\n",
                (int)(sizeof(hcd_path) - 1));
        rcqp_receive_error(1);
      }

      Rprintf("- writing code descriptor block to %s\n", hcd_path);
      if (!WriteHCD(hcd_path, hc)) {
        Rprintf("ERROR: writing %s failed. Aborted.\n", hcd_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing compressed item sequence to %s\n", huf_path);

      if (!BFopen(huf_path, "w", &bfd)) {
        Rprintf("ERROR: can't create file %s\n", huf_path);
        perror(huf_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing sync (every %d tokens) to %s\n", SYNCHRONIZATION, sync_path);

      /* the sync file holds raw integers, so it must be opened in binary mode */
      if ((sync = fopen(sync_path, "wb")) == NULL) {
        Rprintf("ERROR: can't create file %s\n", sync_path);
        perror(sync_path);
        rcqp_receive_error(1);
      }

      for (i = 0; i < hc->length; i++) {

        /* SYNCHRONIZE */

        if ((i % SYNCHRONIZATION) == 0) {
          if (i > 0)
            BFflush(&bfd);
          pos = BFposition(&bfd);
          NwriteInt(pos, sync);
        }

        id = cl_cpos2id(attr, i);
        if ((id < 0) || (cderrno != CDA_OK)) {
          cdperror("(aborting) cl_cpos2id() failed");
          rcqp_receive_error(1);
        }
        else {

          assert((id >= 0) && (id < hc->size) && "Internal Error");

          cl = codelength[id];
          code = heap[id];

          if (!BFwriteWord((unsigned int)code, cl, &bfd)) {
            Rprintf("Error writing code for ID %d (%d, %d bits) at position %d. Aborted.\n",
                    id, code, cl, i);
            rcqp_receive_error(1);
          }

        }

      }

      fclose(sync);
      BFclose(&bfd);
    }
  }

  free(codelength);

  /* hc->symbols points into heap: do not leave it dangling after the free */
  hc->symbols = NULL;
  free(heap);

  return 1;
}
/** * Main function for cwb-s-encode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int main(int argc, char **argv) { int input_line; int start, end; char *annot; char buf[CL_MAX_LINE_LENGTH]; Attribute *att; int V_switch, values, S_annotations_dropped; int i, N; progname = argv[0]; sencode_parse_options(argc, argv); /* -a mode: read existing regions into memory */ if (add_to_existing) { if (corpus == NULL) { Rprintf( "Error: You have to specify source corpus (-C <corpus>) for -a switch.\n"); rcqp_receive_error(1); } att = cl_new_attribute(corpus, new_satt.name, ATT_STRUC); if ((att != NULL) && (cl_max_struc(att) > 0)) { V_switch = new_satt.store_values; values = cl_struc_values(att); if (V_switch && (!values)) { Rprintf( "Error: Existing regions of -V attribute have no annotations.\n"); rcqp_receive_error(1); } else if ((!V_switch) && values) { Rprintf( "Error: Existing regions of -S attributes have annotations.\n"); rcqp_receive_error(1); } if (!silent) Rprintf("[Loading previous <%s> regions]\n", new_satt.name); N = cl_max_struc(att); for (i = 0; i < N; i++) { cl_struc2cpos(att, i, &start, &end); annot = cl_struc2str(att, i); SL_insert(start, end, annot); } } else { if (!silent) Rprintf("[No <%s> regions defined (skipped)]\n", new_satt.name); } } /* loop reading input (stdin or -f <file>) */ if (in_memory && (!silent)) Rprintf("[Reading input data]\n"); input_line = 0; S_annotations_dropped = 0; while (fgets(buf, CL_MAX_LINE_LENGTH, text_fd)) { input_line++; /* check for buffer overflow */ if (strlen(buf) >= (CL_MAX_LINE_LENGTH - 1)) { Rprintf( "BUFFER OVERFLOW, input line #%d is too long:\n>> %s", input_line, buf); rcqp_receive_error(1); } if (! 
sencode_parse_line(buf, &start, &end, &annot)) { Rprintf( "FORMAT ERROR on line #%d:\n>> %s", input_line, buf); rcqp_receive_error(1); } if (new_satt.store_values && (annot == NULL)) { Rprintf( "MISSING ANNOTATION on line #%d:\n>> %s", input_line, buf); rcqp_receive_error(1); } if ((!new_satt.store_values) && (annot != NULL)) { if (! S_annotations_dropped) Rprintf( "WARNING: Annotation for -S attribute ignored on line #%d (warning issued only once):\n>> %s", input_line, buf); S_annotations_dropped++; } if ((start <= new_satt.last_cpos) || (end < start)) { Rprintf( "RANGE INCONSISTENCY on line #%d:\n>> %s(end of previous region was %d)\n", input_line, buf, new_satt.last_cpos); rcqp_receive_error(1); } if (annot != NULL && set_att != set_none) { /* convert set annotation into standard syntax */ annot = sencode_check_set(annot); if (annot == NULL) { Rprintf( "SET ANNOTATION SYNTAX ERROR on line #%d:\n>> %s", input_line, buf); rcqp_receive_error(1); } } /* debugging output */ if (debug) { Rprintf( "[%d, %d]", start, end); if (annot != NULL) Rprintf( " <%s>", annot); Rprintf( "\n"); } /* in -M mode, store this region in memory; otherwise write it to the disk files */ if (in_memory) SL_insert(start, end, annot); else sencode_write_region(start, end, annot); cl_free(annot); } /* in -M mode, write data to disk now that we have finished looping across input data */ if (in_memory) { SL item; if (!silent) Rprintf("[Creating encoded disk file(s)]\n"); SL_rewind(); while ((item = SL_next()) != NULL) sencode_write_region(item->start, item->end, item->annot); } /* close files */ sencode_close_files(); if (S_annotations_dropped > 0) Rprintf( "Warning: %d annotation values dropped for -S attribute '%s'.\n", S_annotations_dropped, new_satt.name); rcqp_receive_error(0); }
/** * Parse options and set global variables */ void sencode_parse_options(int argc, char **argv) { int c; extern char *optarg; extern int optind; /* by default, output files are written to current directory */ char *directory = "."; /* may need to set registry if source corpus is specified */ char *registry = NULL; /* source corpus _may_ be set with the -C switch */ char *corpus_name = NULL; /* if text_fd is unspecified, stdin will be used */ text_fd = NULL; /* make sure either -S or -V is used: reset new_satt.name now & check after getopt */ new_satt.name = NULL; while((c = getopt(argc, argv, "+qBd:f:msDS:V:r:C:Mah")) != EOF) switch(c) { /* q: be silent (quiet) */ case 'q': silent++; break; /* B: strip blanks */ case 'B': strip_blanks_in_values++; break; /* d: directory for generated data files */ case 'd': directory = optarg; break; /* f: read input from file */ case 'f': if (text_fd) { Rprintf( "Error: -f option used twice\n\n"); rcqp_receive_error(1); } if ((text_fd = fopen(optarg, "r")) == NULL) { perror("Can't open input file"); rcqp_receive_error(1); } break; /* M: compile list in memory, then write to disk */ case 'M': in_memory++; break; /* a: add to existing attribute (implies -M) */ case 'a': add_to_existing++; in_memory++; break; /* r: registry directory */ case 'r': registry = optarg; break; /* C: source corpus */ case 'C': corpus_name = optarg; break; /* m: set ('multi-value') attribute */ case 'm': set_att = set_any; /* don't know yet whether it's '|'-delimited or "split on whitespace" */ break; /* s: strict syntax checks on set attribute */ case 's': set_syntax_strict++; break; /* D: debug mode */ case 'D': debug++; break; /* S: s-attribute without annotations */ case 'S': sencode_declare_new_satt(optarg, directory, 0); if (optind < argc) { Rprintf( "Error: -S <att> must be last flag on command line.\n\n"); rcqp_receive_error(1); } break; /* V: s-attribute with annotations */ case 'V': sencode_declare_new_satt(optarg, directory, 1); if (optind < 
argc) { Rprintf( "Error: -V <att> must be last flag on command line.\n\n"); rcqp_receive_error(1); } break; /* default or -h: error */ case 'h': default: sencode_usage(); break; } /* now, check the default and obligatory values */ if (!text_fd) text_fd = stdin; if (new_satt.name == NULL) { Rprintf( "Error: either -S or -V flag must be specified.\n\n"); rcqp_receive_error(1); } if (optind < argc) { Rprintf( "Error: extra arguments.\n\n"); rcqp_receive_error(1); } /* if -C <corpus> was specified, open source corpus */ if (corpus_name != NULL) { corpus = cl_new_corpus(registry, corpus_name); if (corpus == NULL) { Rprintf( "Error: Can't find corpus <%s>!\n", corpus_name); rcqp_receive_error(1); } } }
/**
 * Checks a huffcoded attribute for errors by decompressing it.
 *
 * This function assumes that compute_code_lengths() has been called
 * beforehand and made sure that the _uncompressed_ token sequence is
 * used by CL access functions.
 *
 * Every decoded token is compared with the ID delivered by cl_cpos2id(),
 * and at every SYNCHRONIZATION-th corpus position the current offset in the
 * compressed stream is compared with the value stored in the sync file.
 *
 * Any failure is reported with Rprintf() and then signalled through
 * rcqp_receive_error(1); files opened so far are closed before the error is
 * signalled. The hint that the uncompressed corpus file may be deleted is
 * printed only if the whole attribute validated without error.
 *
 * @param attr   The attribute to check.
 * @param fname  Base filename to use for the three compressed-attribute files.
 *               Can be NULL, in which case the filenames in the attribute are used.
 */
void decode_check_huff(Attribute *attr, char *fname)
{
  BFile bfd;
  FILE *sync;
  HCD hc;

  int pos, size, sync_offset, offset;
  int l, v;
  int item, true_item;
  unsigned char bit;
  int failed = 0;               /* set as soon as any validation step fails */

  char hcd_path[CL_MAX_LINE_LENGTH];
  char huf_path[CL_MAX_LINE_LENGTH];
  char sync_path[CL_MAX_LINE_LENGTH];

  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* bounded formatting: an over-long name is truncated instead of overrunning the buffer */
  if (fname) {
    snprintf(hcd_path, sizeof(hcd_path), "%s.hcd", fname);
    snprintf(huf_path, sizeof(huf_path), "%s.huf", fname);
    snprintf(sync_path, sizeof(sync_path), "%s.huf.syn", fname);
  }
  else {
    char *path;

    /* only the returned path is asserted (no cderrno test), matching
       compute_code_lengths(): component_full_name doesn't (re)set cderrno */
    path = component_full_name(attr, CompHuffSeq, NULL);
    assert(path);
    snprintf(huf_path, sizeof(huf_path), "%s", path);

    path = component_full_name(attr, CompHuffCodes, NULL);
    assert(path);
    snprintf(hcd_path, sizeof(hcd_path), "%s", path);

    path = component_full_name(attr, CompHuffSync, NULL);
    assert(path);
    snprintf(sync_path, sizeof(sync_path), "%s", path);
  }

  Rprintf("- reading code descriptor block from %s\n", hcd_path);
  if (!ReadHCD(hcd_path, &hc)) {
    Rprintf( "ERROR: reading %s failed. Aborted.\n", hcd_path);
    rcqp_receive_error(1);
    return;                     /* in case the error handler returns */
  }

  Rprintf("- reading compressed item sequence from %s\n", huf_path);
  if (!BFopen(huf_path, "r", &bfd)) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", huf_path);
    perror(huf_path);
    rcqp_receive_error(1);
    return;
  }

  Rprintf("- reading sync (mod %d) from %s\n", SYNCHRONIZATION, sync_path);
  if ((sync = fopen(sync_path, "r")) == NULL) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", sync_path);
    perror(sync_path);
    BFclose(&bfd);              /* the compressed stream is already open */
    rcqp_receive_error(1);
    return;
  }

  /* from here on both files are open: all failures leave through `cleanup` */

  size = cl_max_cpos(attr);
  if (size != hc.length) {
    Rprintf( "ERROR: wrong corpus size (%d tokens) in %s (correct size: %d)\n",
             hc.length, hcd_path, size);
    failed = 1;
    goto cleanup;
  }

  for (pos = 0; pos < hc.length; pos++) {

    if ((pos % SYNCHRONIZATION) == 0) {
      /* need to get offset before flushing (because flushing fills the bit
         buffer and advances offset to the following byte!) */
      offset = BFposition(&bfd);
      if (pos > 0)
        BFflush(&bfd);
      sync_offset = -1;         /* make sure we get an error if read below fails */
      NreadInt(&sync_offset, sync);
      if (offset != sync_offset) {
        Rprintf( "ERROR: wrong sync offset %d (true offset %d) at cpos %d. Aborted.\n",
                 sync_offset, offset, pos);
        failed = 1;
        goto cleanup;
      }
    }

    if (!BFread(&bit, 1, &bfd)) {
      Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
      failed = 1;
      goto cleanup;
    }

    v = (bit ? 1 : 0);
    l = 1;

    /* append bits until the value read so far reaches min_code for its length */
    while (v < hc.min_code[l]) {
      if (!BFread(&bit, 1, &bfd)) {
        Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
        failed = 1;
        goto cleanup;
      }
      v <<= 1;
      if (bit)
        v++;
      l++;
    }

    /* look up the symbol for the code (v, l) and compare with the uncompressed data */
    item = hc.symbols[hc.symindex[l] + v - hc.min_code[l]];
    true_item = cl_cpos2id(attr, pos);

    if (item != true_item) {
      Rprintf( "ERROR: wrong token (id=%d) at cpos %d (correct id=%d). Aborted.\n",
               item, pos, true_item);
      /* a mismatch must abort, otherwise the delete hint below would still be printed */
      failed = 1;
      goto cleanup;
    }
  }

cleanup:
  fclose(sync);
  BFclose(&bfd);

  if (failed) {
    rcqp_receive_error(1);
    return;
  }

  /* tell the user it's safe to delete the CORPUS component now */
  Rprintf("!! You can delete the file <%s> now.\n",
          component_full_name(attr, CompCorpus, NULL));

  return;                       /* errors are signalled via rcqp_receive_error(), so there's no return value */
}