Exemplo n.º 1
0
/**
 * R-callable entry point for cwb-huffcode.
 *
 * Opens the requested corpus, runs compute_code_lengths() on every
 * positional attribute (each followed by decode_check_huff() unless the
 * check is disabled), and finally releases the corpus again.
 *
 * @param corpus_name   ID of the corpus to compress.
 * @param registry_dir  Registry directory handed to cl_new_corpus(); may be
 *                      NULL, in which case the "not found" message reports
 *                      central_corpus_directory() instead.
 * @return              0 on success; 1 if a fatal error was reported and
 *                      rcqp_receive_error() handed control back.
 */
int 
R_cwb_huffcode (char *corpus_name, char *registry_dir)
{
  char *registry_directory = registry_dir;
  char *output_fn = NULL;
  char *attr_name = DEFAULT_ATT_NAME;
  Attribute *attr;

  HCD hc;

  Rprintf("Corpus: %s\n", corpus_name);

  int i_want_to_believe = 0;        /* skip error checks? */
  int all_attributes = 0;

  protocol = NULL;                /* 'delayed' init (see top of file) */

  /* The R interface has no option parsing: always process all p-attributes. */
  all_attributes++;
  corpus_id_cwb_huffcode = corpus_name;

  Rprintf("Corpus 1: %s\n", corpus_name);
  
  if ((corpus = cl_new_corpus(registry_directory, corpus_id_cwb_huffcode)) == NULL) {
   Rprintf( "Corpus %s not found in registry %s . Aborted.\n", 
            corpus_id_cwb_huffcode,
            (registry_directory ? registry_directory
               : central_corpus_directory()));
    rcqp_receive_error(1);
    return 1;                     /* never dereference a NULL corpus below */
  }

  Rprintf("Corpus 2: %s\n", corpus_name);

  if (all_attributes) {
    for (attr = corpus->attributes; attr; attr = attr->any.next)
      if (attr->any.type == ATT_POS) {
        compute_code_lengths(attr, &hc, output_fn);
        if (! i_want_to_believe)
          decode_check_huff(attr, output_fn);
      }
  }
  else {
    if ((attr = cl_new_attribute(corpus, attr_name, ATT_POS)) == NULL) {
     Rprintf( "Attribute %s.%s doesn't exist. Aborted.\n", 
              corpus_id_cwb_huffcode, attr_name);
      rcqp_receive_error(1);
      cl_delete_corpus(corpus);
      corpus = NULL;
      return 1;                   /* never pass a NULL attribute on */
    }
    compute_code_lengths(attr, &hc, output_fn);
    if (! i_want_to_believe)
      decode_check_huff(attr, output_fn);
  }
  
  Rprintf("Corpus 3: %s\n", corpus_name);

  cl_delete_corpus(corpus);
  corpus = NULL;                  /* do not leave the global pointing at freed data */
  
  return(0);
}
Exemplo n.º 2
0
/** Open disk files for the s-attribute being encoded (must have been declared first). */
void
sencode_open_files(void)
{
  char buf[CL_MAX_LINE_LENGTH];

  sprintf(buf, RNG_RNG, new_satt.dir, new_satt.name);
  if ((new_satt.fd = fopen(buf, "wb")) == NULL) {
    perror(buf);
    rcqp_receive_error(1);
  }

  if (new_satt.store_values) {
    sprintf(buf, RNG_AVS, new_satt.dir, new_satt.name);
    if ((new_satt.avs = fopen(buf, "w")) == NULL) {
      perror(buf);
      rcqp_receive_error(1);
    }

    sprintf(buf, RNG_AVX, new_satt.dir, new_satt.name);
    if ((new_satt.avx = fopen(buf, "wb")) == NULL) {
      perror(buf);
      rcqp_receive_error(1);
    }
  }

  new_satt.ready = 1;
}
Exemplo n.º 3
0
/**
 * Close the disk files for the s-attribute being encoded.
 *
 * Does nothing unless new_satt.ready is set. The RNG file is always closed;
 * the AVS and AVX files are closed only if their handles are non-NULL.
 * Each handle is reset to NULL after closing, so a later open/close cycle
 * that does not reopen the AVS/AVX files cannot fclose() a stale pointer.
 * A failing fclose() is reported with perror() and passed to
 * rcqp_receive_error(1).
 */
void
sencode_close_files(void)
{
  if (new_satt.ready) {
    if (EOF == fclose(new_satt.fd)) {
      perror("Error writing RNG file");
      rcqp_receive_error(1);
    }
    new_satt.fd = NULL;

    if (new_satt.avs) {
      if (EOF == fclose(new_satt.avs)) {
        perror("Error writing AVS file");
        rcqp_receive_error(1);
      }
      new_satt.avs = NULL;
    }

    if (new_satt.avx) {
      if (EOF == fclose(new_satt.avx)) {
        perror("Error writing AVX file");
        rcqp_receive_error(1);
      }
      new_satt.avx = NULL;
    }

    new_satt.ready = 0;
  }
}
Exemplo n.º 4
0
/**
 * Reports an "internal error" condition of the server.
 *
 * Two lines are printed with Rprintf(): the first names the function in
 * which the error was raised, the second quotes the explanation. Afterwards
 * error code 1 is handed to rcqp_receive_error().
 *
 * @param function  Name of the calling function, i.e. the point where the
 *                  error was raised.
 * @param reason    Any further explanatory details about the error.
 */
void
cqiserver_internal_error(char *function, char *reason)
{
  /* where did it happen? */
  Rprintf("CQPserver: internal error in %s()\n", function);

  /* what went wrong? */
  Rprintf("CQPserver: ''%s''\n", reason);

  rcqp_receive_error(1);
}
Exemplo n.º 5
0
/**
 * Cleans up memory prior to an error-prompted exit.
 *
 * Deletes the global corpus object (if any) and resets the global pointer,
 * so that no later cleanup can delete the same corpus a second time. Then
 * hands the error code to rcqp_receive_error().
 *
 * @param error_code  Value passed on to rcqp_receive_error().
 */
void
decode_cleanup(int error_code)
{
  if (corpus != NULL) {
    cl_delete_corpus(corpus);
    corpus = NULL;
  }
  rcqp_receive_error(error_code);
}
Exemplo n.º 6
0
/**
 * Prints a usage message, releases the corpus and reports an error code.
 *
 * The global corpus object is deleted (if any) and the global pointer is
 * reset to NULL so it cannot be deleted twice. Finally error_code is handed
 * to rcqp_receive_error().
 *
 * @param msg         A message about the error; may be NULL, in which case
 *                    no "Usage error" line is printed.
 * @param error_code  Value passed on to rcqp_receive_error().
 */
void 
huffcode_usage(char *msg, int error_code)
{
  if (msg)
    Rprintf( "Usage error: %s\n", msg);
  Rprintf( "\n");
  Rprintf( "Usage:  %s [options] <corpus>\n\n", progname);
  Rprintf( "Compress the token sequence of a positional attribute. Creates .huf, .hcd,\n");
  Rprintf( "and .huf.syn files, which replace the corresponding .corpus files. After\n");
  Rprintf( "running this tool successfully, the .corpus files can be deleted.\n");
  Rprintf( "\n");
  Rprintf( "Options:\n");
  Rprintf( "  -P <att>  compress attribute <att> [default: word]\n");
  Rprintf( "  -A        compress all positional attributes\n");
  Rprintf( "  -r <dir>  set registry directory\n");
  Rprintf( "  -f <file> set output file prefix (creates <file>.huf, ...)\n");
  Rprintf( "  -v        verbose mode (shows protocol) [may be repeated]\n");
  /* TODO: a -d debug option is not implemented; make -d / -D distinct as in cwb-compress-rdx? */
  Rprintf( "  -T        skip validation pass ('I trust you')\n");
  Rprintf( "  -h        this help page\n\n");
  Rprintf( "Part of the IMS Open Corpus Workbench v" VERSION "\n\n");

  if (corpus) {
    cl_delete_corpus(corpus);
    corpus = NULL;                /* avoid a dangling global pointer */
  }

  rcqp_receive_error(error_code);
}
Exemplo n.º 7
0
Arquivo: macros.c Projeto: rforge/rcwb
/**
 * Safely reallocates memory.
 *
 * A NULL block is allocated with malloc(), because some systems do not fall
 * back to malloc() when realloc() is given a NULL pointer. If the allocation
 * fails for a non-zero size, an out-of-memory message is printed and error
 * code 1 is handed to rcqp_receive_error(). A NULL result for a request of
 * 0 bytes is not treated as an error.
 *
 * @see cl_malloc
 * @param block  Pointer to the block to be reallocated (may be NULL)
 * @param bytes  Number of bytes to allocate to the resized memory block
 * @return       Pointer to the block of reallocated memory
 */
void *
cl_realloc(void *block, size_t bytes)
{
  void *new_block;

  if (block == NULL) 
    new_block = malloc(bytes);	/* some OSs don't fall back to malloc() if block == NULL */
  else
    new_block = realloc(block, bytes);

  /* Reallocating to 0 bytes may legitimately yield NULL; only a failed
   * request for a non-zero size counts as out of memory. */
  if (new_block == NULL && bytes != 0) {
    Rprintf( "CL: Out of memory. (killed)\n");
    /* size_t is printed through an explicit unsigned long conversion so the
     * argument type always matches the conversion specifier */
    Rprintf( "CL: [cl_realloc(block at %p to %lu bytes)]\n", block, (unsigned long) bytes);
    Rprintf("\n");		/* for CQP's child mode */
    rcqp_receive_error(1);
  }
  return new_block;
}
Exemplo n.º 8
0
/**
 * Prints the usage message via Rprintf() and then hands error code 2
 * to rcqp_receive_error().
 */
void
sencode_usage(void)
{
  /* static part of the help text, printed line by line after the usage line */
  static const char *const help_text[] = {
    "\n",
    "Adds s-attributes with computed start and end points to a corpus\n",
    "\n",
    "Options:\n",
    "  -B        strip leading/trailing blanks from annotations\n",
    "  -d <dir>  directory for output files\n",
    "  -f <file> read input from <file> [default: stdin]\n",
    "  -M        create list of regions in memory (resolving overlaps)\n",
    "  -r <dir>  set registry directory <dir>\n",
    "  -C <id>   work on corpus <id> (with -a option)\n",
    "  -a        add to existing annotation (resolving overlaps, implies -M)\n",
    "  -m        treat annotations as feature set (or 'multi-value') attribute\n",
    "  -s        (with -m) check that format of set annotations is consistent\n",
    "  -q        silent mode ('be quiet')\n",
    "  -D        debug mode\n",
    "  -S <att>  generate s-attribute <att>\n",
    "  -V <att>  generate s-attribute <att> with annotations\n"
  };
  const int n_lines = (int)(sizeof(help_text) / sizeof(help_text[0]));
  int line;

  Rprintf("\n");
  Rprintf("Usage:  %s [options] (-S <att> | -V <att>)\n", progname);

  for (line = 0; line < n_lines; line++)
    Rprintf("%s", help_text[line]);

  Rprintf("Part of the IMS Open Corpus Workbench v" VERSION "\n\n");
  rcqp_receive_error(2);
}
Exemplo n.º 9
0
/**
 * Converts a regular expression to a DFA. Public function.
 *
 * The expression is parsed, states are formed and merged, and the result is
 * written into the caller's DFA object: Max_States, Max_Input, E_State, a
 * freshly allocated transition table and a freshly allocated table of final
 * states. If parsing yields -1, error code 1 is handed to
 * rcqp_receive_error().
 *
 * @param rxs         The regular expression.
 * @param automaton   Pointer to the DFA object to write to.
 */
void
regex2dfa(char *rxs, DFA *automaton)
{
  int start_state;
  int row, col;
  int state, shift, n_classes, target;
  State entry;

  searchstr = rxs;
  init();
  start_state = Parse();

  if (ERRORS > 0)
    Rprintf( "%d error(s)\n", ERRORS);
  if (start_state == -1)
    rcqp_receive_error(1);

  FormState(start_state);
  MergeStates();

  automaton->Max_States = Ss;
  automaton->Max_Input = Environment[eep].MaxPatIndex + 1;
  automaton->E_State = automaton->Max_States;

  /* TODO: Use a module-internal debug variable (for encapsulation). */
  if (show_dfa)
    WriteStates();

  /* Transition table: one row per state, every cell initially leads to E_State. */
  automaton->TransTable = (int **)cl_malloc(sizeof(int *) * automaton->Max_States);
  for (row = 0; row < Ss; row++) {
    automaton->TransTable[row] = (int *)cl_malloc(sizeof(int) * automaton->Max_Input);
    for (col = 0; col < automaton->Max_Input; col++) {
      automaton->TransTable[row][col] = automaton->E_State;
    }
  }

  /* Table of final states: one extra slot for index Max_States, all False at first. */
  automaton->Final = (Boolean *)cl_malloc(sizeof(Boolean) * (Ss + 1));
  for (row = 0; row <= automaton->Max_States; row++) {
    automaton->Final[row] = False;
  }

  /* Only the first state encountered for each class number (in increasing
   * order) contributes its final flag and its shifts to the tables. */
  n_classes = 0;
  for (state = 0; state < Ss; state++) {
    entry = &STab[state];
    if (entry->Class == n_classes) {
      n_classes++;

      if (entry->Empty)
        automaton->Final[entry->Class] = True;

      for (shift = 0; shift < entry->Shifts; shift++) {
        target = entry->ShList[shift].RHS;
        /* the symbol name is a decimal number that is used as the column index */
        automaton->TransTable[entry->Class][atoi(entry->ShList[shift].LHS->Name)] =
          STab[target].Class;
      }
    }
  }
}
Exemplo n.º 10
0
Arquivo: storage.c Projeto: cran/rcqp
/**
 * Writes a single integer to file in network byte order.
 *
 * Apart from the byte order conversion this is equivalent to
 * fwrite(&val, sizeof(int), 1, fd). A failed write is reported with
 * perror() and error code 1 is handed to rcqp_receive_error().
 *
 * @param val  The integer to write.
 * @param fd   File handle to write to.
 */
void
NwriteInt(int val, FILE *fd)
{
  int net_order = htonl(val);
  size_t n_written = fwrite(&net_order, sizeof(int), 1, fd);

  if (n_written != 1) {
    perror("File write error");
    rcqp_receive_error(1);
  }
}
Exemplo n.º 11
0
Arquivo: storage.c Projeto: cran/rcqp
/**
 * Reads a single integer from file, converting it from network byte order.
 *
 * Error checking is done here: if the int cannot be read, the failure is
 * reported with perror() and error code 1 is handed to rcqp_receive_error().
 *
 * @param val  Location to put the resulting int.
 * @param fd   File handle to read from
 */
void
NreadInt(int *val, FILE *fd)
{
  int net_order;
  size_t n_read = fread(&net_order, sizeof(int), 1, fd);

  if (n_read != 1) {
    perror("File read error");
    rcqp_receive_error(1);
  }

  *val = ntohl(net_order);
}
Exemplo n.º 12
0
/**
 * Main function for cwb-atoi.
 *
 * Options starting with '-' select the output byte order (-n network,
 * -l little endian); any other option prints the help text and reports an
 * error. A non-option argument names the input file; without one, input is
 * read from stdin. If several file names are given, only the last one is
 * processed, and files opened earlier are closed again instead of being
 * leaked.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       0 after the input has been processed.
 */
int
main(int argc, char **argv)
{
  FILE *fd; 
  int i;
  char *progname = argv[0];

  /* default case: we are reading from stdin */
  fd = stdin;

  for (i = 1; i < argc; i++) {
    if (argv[i][0] == '-') {
      switch (argv[i][1]) {
      case 'n':
        little_endian = 0;
        break;
      case 'l':
        little_endian = 1;
        break;
      case 'h':
      default:
        Rprintf( "\n");
        Rprintf( "Usage:  %s [options] [file]\n", argv[0]);
        Rprintf( "Reads one integer per line from ASCII file <file> or from standard input\n");
        Rprintf( "and writes values to standard output as 32-bit integers in network format\n");
        Rprintf( "(the format used by CWB binary data files).\n");
        Rprintf( "Options:\n");
        Rprintf( "  -n  convert to network format [default]\n");
        Rprintf( "  -l  convert to little endian format\n");
        Rprintf( "Part of the IMS Open Corpus Workbench v" VERSION "\n\n");
        rcqp_receive_error(1);
        break;
      }
    }
    else {
      FILE *input = fopen(argv[i], "rb");

      if (input == NULL) {
        Rprintf( "%s: Couldn't open %s\n", progname, argv[i]);
        rcqp_receive_error(1);
      }
      else {
        /* a later file name replaces an earlier one: release the old handle */
        if (fd != stdin)
          fclose(fd);
        fd = input;
      }
    }
  }
  /* now process either input file or stdin */
  process_fd(fd);
  return 0;
}
Exemplo n.º 13
0
Arquivo: cwb-itoa.c Projeto: cran/rcqp
/**
 * Main function for cwb-itoa.
 *
 * Options starting with '-' select the input byte order (-n network,
 * -l little endian); any other option prints the help text and reports an
 * error. A non-option argument names the input file; without one, input is
 * read from stdin. If several file names are given, only the last one is
 * processed, and files opened earlier are closed again instead of being
 * leaked.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       0 after the input has been processed.
 */
int
main(int argc, char **argv)
{
  FILE *fd;
  int i;
  char *progname = argv[0];

  fd = stdin;  /* initialisation removed from declaration for Gnuwin32 compatibility */

  for (i = 1; i < argc; i++) {
    if (argv[i][0] == '-') {
      switch (argv[i][1]) {
      case 'n':
        little_endian = 0;
        break;
      case 'l':
        little_endian = 1;
        break;
      case 'h':
      default:
        Rprintf( "\n");
        Rprintf( "Usage:  %s [options] [file]\n", argv[0]);
        Rprintf( "Reads 32bit integers in network format from CWB binary data file <file>\n");
        Rprintf( "or from standard input and prints the values as ASCII numbers on standard\n");
        Rprintf( "output (one number per line).\n");
        Rprintf( "Options:\n");
        Rprintf( "  -n  read integers in network format [default]\n");
        Rprintf( "  -l  read integers in little endian format\n");
        Rprintf( "Part of the IMS Open Corpus Workbench v" VERSION "\n\n");
        rcqp_receive_error(1);
        break;
      }
    }
    else {
      FILE *input = fopen(argv[i], "rb");

      if (input == NULL) {
        Rprintf( "%s: Couldn't open %s\n", progname, argv[i]);
        rcqp_receive_error(1);
      }
      else {
        /* a later file name replaces an earlier one: release the old handle */
        if (fd != stdin)
          fclose(fd);
        fd = input;
      }
    }
  }

  /* now process either input file or stdin */
  process_fd(fd);
  return 0;
}
Exemplo n.º 14
0
/**
 * Cleans up memory prior to an error-prompted exit.
 *
 * Deletes the global corpus object and closes the debug output stream, if
 * they are set, and resets both global pointers so that neither resource
 * can be released a second time. Then hands the error code to
 * rcqp_receive_error().
 *
 * @param error_code  Value passed on to rcqp_receive_error().
 */
void
compressrdx_cleanup(int error_code)
{
  if (corpus) {
    cl_delete_corpus(corpus);
    corpus = NULL;
  }

  if (debug_output != NULL) {
    fclose(debug_output);
    debug_output = NULL;
  }

  rcqp_receive_error(error_code);
}
Exemplo n.º 15
0
/**
 * Pushes one entry onto the stack that SP points into.
 *
 * If the stack already holds STACK_MAX entries, an error is reported and
 * nothing is stored, so the stack array is never written out of bounds
 * even when the error handler hands control back.
 *
 * @param Tag  Tag to store in the new stack entry.
 * @param Q    Value to store in the Q field of the new stack entry.
 */
void
PUSH(StackTag Tag, int Q)
{
  if (SP >= Stack + STACK_MAX) 
    {
      REGEX2DFA_ERROR("Expression too complex ... aborting.");
      rcqp_receive_error(1);
      return;                   /* stack is full: do not write past its end */
    }
  SP->Tag = Tag;
  SP->Q = Q; 
  SP++;
}
Exemplo n.º 16
0
/**
 * Sends the current CL error value to the client.
 *
 * This function takes the current contents of the CL library's global
 * cl_errno error value, translates the CL error constant into the
 * corresponding CQI_CL_ERROR_* constant and sends that to the client
 * with cqi_command().
 *
 * NB: If cl_errno is CDA_OK or holds an unknown value, a message is printed
 * and error code 1 is handed to rcqp_receive_error(); nothing is sent to
 * the client in that case.
 *
 * @see cl_errno
 */
void
send_cl_error(void)
{
  int cmd;
  
  switch (cl_errno) {
  case CDA_EATTTYPE:
    cmd = CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE;
    break;
  case CDA_EIDORNG:
  case CDA_EIDXORNG:
  case CDA_EPOSORNG:
    cmd = CQI_CL_ERROR_OUT_OF_RANGE;
    break;
  case CDA_EPATTERN:
  case CDA_EBADREGEX:
    cmd = CQI_CL_ERROR_REGEX;
    break;
  case CDA_ENODATA:
    cmd = CQI_CL_ERROR_CORPUS_ACCESS;
    break;
  case CDA_ENOMEM:
    cmd = CQI_CL_ERROR_OUT_OF_MEMORY;
    break;
  case CDA_EOTHER:
  case CDA_ENYI:
    cmd = CQI_CL_ERROR_INTERNAL;
    break;
  case CDA_OK:
    Rprintf( "CQPserver: send_cl_error() called with cderrno == CDA_OK\n");
    rcqp_receive_error(1);
    return;     /* no fallthrough into default, no use of the unset cmd */
  default:
    Rprintf( "CQPserver: send_cl_error() unknown value in cderrno\n");
    rcqp_receive_error(1);
    return;     /* cmd was never assigned on this path */
  }

  if (server_debug)
    Rprintf( "CQi: CL error, returning 0x%04X\n", cmd);
  cqi_command(cmd);
}
Exemplo n.º 17
0
/**
 * Prints a message describing how to use the program via Rprintf() and
 * then hands error code 2 to rcqp_receive_error().
 */
void
describecorpus_usage(void)
{
  /* one entry per line of the option list */
  static const char *const option_help[] = {
    "  -r <dir>  use registry directory <dir>\n",
    "  -s        show statistics (attribute & lexicon size)\n",
    "  -d        show details (about component files)\n",
    "  -h        this help page\n"
  };
  const int n_options = (int)(sizeof(option_help) / sizeof(option_help[0]));
  int opt;

  Rprintf("\n");
  Rprintf("Usage:  %s [flags] <corpus> [<corpus> ...] \n", progname);
  Rprintf("Options:\n");

  for (opt = 0; opt < n_options; opt++)
    Rprintf("%s", option_help[opt]);

  Rprintf("Part of the IMS Open Corpus Workbench v" VERSION "\n\n");
  rcqp_receive_error(2);
}
Exemplo n.º 18
0
Arquivo: macros.c Projeto: rforge/rcwb
/**
 * Safely allocates memory malloc-style.
 *
 * This function allocates a block of memory of the requested size and
 * tests for malloc() failure. On failure an out-of-memory message is
 * printed and error code 1 is handed to rcqp_receive_error().
 *
 * @param bytes  Number of bytes to allocate
 * @return       Pointer to the block of allocated memory
 */
void *
cl_malloc(size_t bytes)
{
  void *block;

  block = malloc(bytes);
  if (block == NULL) {
    Rprintf( "CL: Out of memory. (killed)\n");
    /* size_t is printed through an explicit unsigned long conversion so the
     * argument type always matches the conversion specifier */
    Rprintf( "CL: [cl_malloc(%lu)]\n", (unsigned long) bytes);
    Rprintf("\n");		/* for CQP's child mode */
    rcqp_receive_error(1);
  }
  return block;
}
Exemplo n.º 19
0
Arquivo: macros.c Projeto: rforge/rcwb
/**
 * Safely duplicates a string.
 *
 * If strdup() fails, an out-of-memory message is printed and error code 1
 * is handed to rcqp_receive_error().
 *
 * @see cl_malloc
 * @param string  Pointer to the original string (must not be NULL, since it
 *                is passed straight to strdup())
 * @return        Pointer to the newly duplicated string
 */
char *
cl_strdup(char *string)
{
  char *new_string;

  new_string = strdup(string);
  if (new_string == NULL) {
    Rprintf( "CL: Out of memory. (killed)\n");
    /* %p requires a void pointer; the size_t length is converted explicitly
     * so that it matches %lu */
    Rprintf( "CL: [cl_strdup(addr=%p, len=%lu)]\n",
             (void *) string, (unsigned long) strlen(string));
    Rprintf("\n");		/* for CQP's child mode */
    rcqp_receive_error(1);
  }
  return new_string;
}
Exemplo n.º 20
0
Arquivo: storage.c Projeto: cran/rcqp
/**
 * Writes an array of integers to file in network byte order.
 *
 * Apart from the byte order conversion this is equivalent to
 * fwrite(vals, sizeof(int), nr_vals, fd). The values are converted and
 * written one at a time; a failed write is reported with perror() and
 * error code 1 is handed to rcqp_receive_error().
 *
 * @param vals     Pointer to the location of the block of integers to write.
 * @param nr_vals  Number of integers to write.
 * @param fd       File handle to write to.
 */
void
NwriteInts(int *vals, int nr_vals, FILE *fd)
{
  int idx = 0;

  /* one small fwrite() per value; this relies on the stream's buffering */
  while (idx < nr_vals) {
    int net_order = htonl(vals[idx]);

    if (fwrite(&net_order, sizeof(int), 1, fd) != 1) {
      perror("File write error");
      rcqp_receive_error(1);
    }
    idx++;
  }
}
Exemplo n.º 21
0
Arquivo: storage.c Projeto: cran/rcqp
/**
 * Reads an array of integers from file, converting from network byte order.
 *
 * Error checking is done here: if one of the requested ints cannot be read,
 * the failure is reported with perror() and error code 1 is handed to
 * rcqp_receive_error().
 *
 * @param vals     Pointer to location to put the resulting array of ints.
 *                 (This memory must have been allocated by the caller.)
 * @param nr_vals  Number of integers to read.
 * @param fd       File handle to read from
 */
void
NreadInts(int *vals, int nr_vals, FILE *fd)
{
  int net_order;
  int idx = 0;

  /* one small fread() per value; this relies on the stream's buffering */
  while (idx < nr_vals) {
    if (fread(&net_order, sizeof(int), 1, fd) != 1) {
      perror("File read error");
      rcqp_receive_error(1);
    }
    vals[idx] = ntohl(net_order);
    idx++;
  }
}
Exemplo n.º 22
0
Arquivo: macros.c Projeto: rforge/rcwb
/**
 * Safely allocates memory calloc-style.
 *
 * If calloc() fails, an out-of-memory message is printed and error code 1
 * is handed to rcqp_receive_error().
 *
 * @see cl_malloc
 * @param nr_of_elements  Number of elements to allocate
 * @param element_size    Size of each element
 * @return                Pointer to the block of allocated memory
 */
void *
cl_calloc(size_t nr_of_elements, size_t element_size)
{
  void *block;

  block = calloc(nr_of_elements, element_size);
  if (block == NULL) {
    Rprintf( "CL: Out of memory. (killed)\n");
    /* size_t values are printed through explicit unsigned long conversions
     * so the argument types always match the conversion specifiers */
    Rprintf( "CL: [cl_calloc(%lu*%lu bytes)]\n",
             (unsigned long) nr_of_elements, (unsigned long) element_size);
    Rprintf("\n");		/* for CQP's child mode */
    rcqp_receive_error(1);
  }
  return block;
}
Exemplo n.º 23
0
/**
 * Prints an error message and reports a fatal error once the error limit
 * has been reached.
 *
 * The message is prefixed with the current value of LINE in square brackets
 * and terminated with a newline. Each call increments ERRORS; when the count
 * reaches MAX_ERRORS, a final message is printed and error code 1 is handed
 * to rcqp_receive_error().
 *
 * @param Format  printf-style format string for the message.
 * @param ...     Arguments for the format string.
 */
static void
REGEX2DFA_ERROR(char *Format, ...)
{
  va_list AP;
  
  Rprintf( "[%d] ", LINE);
  va_start(AP, Format);
  Rvprintf( Format, AP);
  va_end(AP);
  Rprintf("\n");        /* terminate the message line (was printing the number 10) */
  if (++ERRORS == MAX_ERRORS) {
    Rprintf( "regex2dfa: Reached the %d error limit.\n",
             MAX_ERRORS);
    rcqp_receive_error(1);
  }
}
Exemplo n.º 24
0
/**
 * Prints basic information about a corpus via Rprintf().
 *
 * The report covers name, registry file, home directory, info file and size
 * in tokens, followed by the number of positional, structural and alignment
 * attributes. If the 'word' attribute is missing, an error is printed and
 * error code 1 is handed to rcqp_receive_error().
 *
 * @param corpus                The corpus to report on.
 * @param with_attribute_names  Boolean: iff true, the counts of each type of attribute
 *                              are followed by a list of attribute names.
 */
void
describecorpus_show_basic_info (Corpus *corpus, int with_attribute_names)
{
  Attribute *word_att, *att;
  int n_pos = 0, n_struc = 0, n_align = 0;
  int n_tokens;
  char *suffix = "";

  /* counts are followed by a colon only when a name list comes after them */
  if (with_attribute_names)
    suffix = ":";

  Rprintf("description:    %s\n", corpus->name);
  Rprintf("registry file:  %s/%s\n", corpus->registry_dir, corpus->registry_name);
  Rprintf("home directory: %s/\n", corpus->path);
  Rprintf("info file:      %s\n", (corpus->info_file) ? corpus->info_file : "(none)");

  word_att = cl_new_attribute(corpus, "word", ATT_POS);
  if (word_att == NULL) {
    Rprintf( "ERROR: 'word' attribute is missing. Aborted.\n");
    rcqp_receive_error(1);
  }

  n_tokens = cl_max_cpos(word_att);
  Rprintf("size (tokens):  ");
  if (n_tokens < 0)
    Rprintf("ERROR\n");
  else
    Rprintf("%d\n", n_tokens);
  Rprintf("\n");

  /* tally the attributes by type; other types are ignored */
  for (att = corpus->attributes; att != NULL; att = att->any.next) {
    if (att->any.type == ATT_POS)
      p_atts_increment: n_pos++;
    else if (att->any.type == ATT_STRUC)
      n_struc++;
    else if (att->any.type == ATT_ALIGN)
      n_align++;
  }

  Rprintf("%3d positional attributes%s\n", n_pos, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_POS);

  Rprintf("%3d structural attributes%s\n", n_struc, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_STRUC);

  Rprintf("%3d alignment  attributes%s\n", n_align, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_ALIGN);

  Rprintf("\n");
}
Exemplo n.º 25
0
/**
 * Prints a message describing how to use the program via Rprintf() and
 * then hands error code 1 to rcqp_receive_error().
 */
void
alignencode_usage(void)
{
  /* static part of the help text, printed line by line after the usage line.
   * The -R option (reverse alignment, target -> source) is disabled and not
   * listed: the alignment file would need to be re-ordered for it. */
  static const char *const help_text[] = {
    "\n",
    "Adds an alignment attribute to an existing CWB corpus\n",
    "\n",
    "Options:\n",
    "  -d <dir> write data file(s) to directory <dir>\n",
    "  -D       write files to corpus data directory\n",
    "  -C       compatibility mode (creates .alg file)\n",
    "  -r <reg> use registry directory <reg>\n",
    "  -v       verbose mode\n",
    "  -h       this help page\n\n"
  };
  const int n_lines = (int)(sizeof(help_text) / sizeof(help_text[0]));
  int line;

  Rprintf("\n");
  Rprintf("Usage: %s [options] <alignment_file>\n\n", progname);

  for (line = 0; line < n_lines; line++)
    Rprintf("%s", help_text[line]);

  Rprintf("Part of the IMS Open Corpus Workbench v" VERSION "\n\n");
  rcqp_receive_error(1);
}
Exemplo n.º 26
0
/**
 * Write data about a region to disk files (as defined in global variable new_satt).
 *
 * Opens the output files on first use. The start and end positions are
 * written to the RNG file. If new_satt.store_values is set, the annotation
 * string is written to the AVS file the first time it is seen (a lexicon
 * hash remembers the offset of each string), and the pair (region number,
 * string offset) is written to the AVX file. Afterwards the region counter
 * and new_satt.last_cpos are updated.
 *
 * @param start  First position of the region.
 * @param end    Last position of the region; stored in new_satt.last_cpos.
 * @param annot  Annotation string of the region; only used when
 *               new_satt.store_values is set.
 */
void
sencode_write_region(int start, int end, char *annot)
{
  if (!new_satt.ready)
    sencode_open_files();
  if (new_satt.store_values && (LH == NULL))
    LH = cl_new_lexhash(0);

  /* write start & end positions of region */
  NwriteInt(start, new_satt.fd);
  NwriteInt(end, new_satt.fd);

  /* store annotation for -V attribute */
  if (new_satt.store_values) {
    int offset;
    cl_lexhash_entry entry;

    entry = cl_lexhash_find(LH, annot);
    if (entry == NULL) {
      /* must add string to hash and to avs file */
      entry = cl_lexhash_add(LH, annot);
      entry->data.integer = new_satt.offset;
      new_satt.offset += strlen(annot) + 1; /* increment range offset */
      /* the %c with argument 0 writes the terminating NUL byte after the string */
      if (0 > fprintf(new_satt.avs, "%s%c", annot, 0)) {
        perror("Error writing to AVS file");
        rcqp_receive_error(1);
      }
    }
    offset = entry->data.integer;

    NwriteInt(new_satt.num, new_satt.avx);
    NwriteInt(offset, new_satt.avx);
  }

  new_satt.num++;   /* increment region number */
  new_satt.last_cpos = end;
}
Exemplo n.º 27
0
/**
 * Compresses the token stream of a p-attribute.
 *
 * Three files are created: the compressed token stream, the descriptor block,
 * and a sync file.
 *
 * @param attr  The attribute to compress.
 * @param hc    Location for the resulting Huffmann code descriptor block.
 * @param fname Base filename for the resulting files.
 */
int 
compute_code_lengths(Attribute *attr, HCD *hc, char *fname)
{
  int id, i, h;

  int nr_codes = 0;

  int *heap = NULL;
  unsigned *codelength = NULL;        /* was char[], probably to save space; but that's unnecessary and makes gcc complain */

  int issued_codes[MAXCODELEN];
  int next_code[MAXCODELEN];

  long sum_bits;


  Rprintf("COMPRESSING TOKEN STREAM of %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* I need the following components:
   * - CompCorpus
   * - CompCorpusFreqs
   * - CompLexicon
   * - CompLexiconIdx
   * and want to force the CL to use them rather than compressed data. 
   */

  {
    Component *comp;

    if ((comp = ensure_component(attr, CompCorpus, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the CORPUS component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexicon, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXION component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexiconIdx, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXIDX component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompCorpusFreqs, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the FREQS component.\n"
              "Run 'makeall -r %s -c FREQS %s %s' in order to create it.\n",
              corpus->registry_dir, corpus->registry_name, attr->any.name);
      rcqp_receive_error(1);
    }

  }

  /*
   * strongly follows Witten/Moffat/Bell: ``Managing Gigabytes'', 
   * pp. 335ff.
   */

  hc->size = cl_max_id(attr);                /* the size of the attribute (nr of items) */
  if ((hc->size <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_id() failed");
    rcqp_receive_error(1);
  }

  hc->length = cl_max_cpos(attr); /* the length of the attribute (nr of tokens) */
  if ((hc->length <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_cpos() failed");
    rcqp_receive_error(1);
  }

  hc->symbols = NULL;
  hc->min_codelen = 100;
  hc->max_codelen = 0;

  memset((char *)hc->lcount, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->min_code, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->symindex, '\0', MAXCODELEN * sizeof(int));

  memset((char *)issued_codes, '\0', MAXCODELEN * sizeof(int));

  codelength = (unsigned *)cl_calloc(hc->size, sizeof(unsigned));


  /* =========================================== make & initialize the heap */

  heap = (int *)cl_malloc(hc->size * 2 * sizeof(int));

  for (i = 0; i < hc->size; i++) {
    heap[i] = hc->size + i;
    heap[hc->size+i] = get_id_frequency(attr, i) + 1;
    /* add-one trick needed to avoid unsupported Huffman codes > 31 bits for very large corpora of ca. 2 billion words:
       theoretical optimal code length for hapax legomena in such corpora is ca. 31 bits, and the Huffman algorithm 
       sometimes generates 32-bit codes; with add-one trick, the theoretical optimal code length is always <= 30 bits */    
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0)
    fprintf(protocol, "Allocated heap with %d cells for %d items\n\n",
            hc->size * 2, hc->size);
  if (do_protocol > 2)
    print_heap(heap, hc->size, "After Initialization");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 1 */


  h = hc->size;

  /*
   * we address the heap in the following manner: when we start array
   * indices at 1, the left child is at 2i, and the right child is at
   * 2i+1. So we maintain this scheme and decrement just before
   * adressing the array. 
   */

  /*
   * construct the initial min-heap
   */

  for (i = hc->size/2; i > 0; i--) {

    /* do:
     * bottom up, left to right,
     * for each root of each subtree, sift if necessary
     */

    sift(heap, h, i);
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 2) {
    print_heap(heap, hc->size, "Initial Min-Heap");
    fprintf(protocol, "\n");
  }
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 2 */

  /* smallest item at top of heap now, remove the two smallest items
   * and sift, find second smallest by removing top and sifting, as
   * long as we have more than one root */



  while (h > 1) {
    
    int pos[2];

    for (i = 0; i < 2; i++) {

      /* remove topmost (i.e. smallest) item */

      pos[i] = heap[0];

      /* remove and sift, to reobtain heap integrity: move ``last''
       * item to top of heap and sift */

      heap[0] = heap[--h];
      
      sift(heap, h, 1);
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 3) {
      fprintf(protocol, "Removed     smallest item %d with freq %d\n",
              pos[0], heap[pos[0]]);
      fprintf(protocol, "Removed 2nd smallest item %d with freq %d\n",
              pos[1], heap[pos[1]]);
    }
    /* ============================== PROTOCOL ============================== */

    /*
     * pos[0] and pos[1] contain pointers to the two smallest items
     * now. since h was decremented twice, h and h+1 are now empty and
     * become the accumulated freq of pos[i]. The individual
     * frequencies are not needed any more, so pointers to h+1 (the
     * acc freq) are stored there instead (tricky, since freq cell
     * becomes pointer cell). So, what happens here, is to include a
     * new element in the heap. */

    heap[h] = h+1;
    heap[h+1] = heap[pos[0]] + heap[pos[1]]; /* accumulated freq */
    heap[pos[0]] = heap[pos[1]] = h+1; /* pointers! */
    h++;                        /* we put a new element into heap */

    /*
     * now, swap it up until we reobtain heap integrity
     */

    {
      register int parent, current;
      
      current = h;
      
      parent = current >> 1;

      while ((parent > 0) &&
             (heap[heap[parent-1]] > heap[heap[current-1]])) {

        int tmp;

        tmp = heap[parent-1];
        heap[parent-1] = heap[current-1];
        heap[current-1] = tmp;

        current = parent;
        parent = current >> 1;
      }
    }
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 3)
    fprintf(protocol, "\n");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 3 */

  /* compute the code lengths. We don't have any freqs in heap any
   * more, only pointers to parents */

  heap[0] = -1U;

  /* root has a depth of 0 */

  heap[1] = 0;

  /* we trust in what they say on p. 345 */

  for (i = 2; i < hc->size * 2; i++)
    heap[i] = heap[heap[i]]+1;


  /* collect the lengths */

  sum_bits = 0L;

  for (i = 0; i < hc->size; i++) {

    int cl = heap[i+hc->size];

    sum_bits += cl * get_id_frequency(attr, i);

    codelength[i] = cl;
    if (cl == 0)
      continue;

    if (cl > hc->max_codelen)
      hc->max_codelen = cl;

    if (cl < hc->min_codelen)
      hc->min_codelen = cl;

    hc->lcount[cl]++;
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0) {

    fprintf(protocol, "Minimal code length: %3d\n", hc->min_codelen);
    fprintf(protocol, "Maximal code length: %3d\n", hc->max_codelen);
    fprintf(protocol, "Compressed code len: %10ld bits, %10ld (+1) bytes\n\n\n",
            sum_bits, sum_bits/8);

  }
  /* ============================== PROTOCOL ============================== */

  if (hc->max_codelen >= MAXCODELEN) {
    Rprintf( "Error: Huffman codes too long (%d bits, current maximum is %d bits).\n", hc->max_codelen, MAXCODELEN-1);
    Rprintf( "       Please contact the CWB development team for assistance.\n");
    rcqp_receive_error(1);
  }

  if ((hc->max_codelen == 0) && (hc->min_codelen == 100)) {

    Rprintf( "Problem: No output generated -- no items?\n");
    nr_codes = 0;
  }
  else {

    hc->min_code[hc->max_codelen] = 0;
    
    for (i = hc->max_codelen-1; i > 0; i--)
      hc->min_code[i] = (hc->min_code[i+1] + hc->lcount[i+1]) >> 1;

    hc->symindex[hc->min_codelen] = 0;
    for (i = hc->min_codelen+1; i <= hc->max_codelen; i++)
      hc->symindex[i] = hc->symindex[i-1] + hc->lcount[i-1];


    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 0) {

      int sum_codes = 0;

      fprintf(protocol, " CL  #codes  MinCode   SymIdx\n");
      fprintf(protocol, "----------------------------------------\n");

      for (i = hc->min_codelen; i <= hc->max_codelen; i++) {
        sum_codes += hc->lcount[i];
        fprintf(protocol, "%3d %7d  %7d  %7d\n", 
                i, hc->lcount[i], hc->min_code[i], hc->symindex[i]);
      }

      fprintf(protocol, "----------------------------------------\n");
      fprintf(protocol, "    %7d\n", sum_codes);
    }
    /* ============================== PROTOCOL ============================== */


    for (i = 0; i < MAXCODELEN; i++)
      next_code[i] = hc->min_code[i];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "\n");
      fprintf(protocol, "   Item   f(item)  CL      Bits     Code, String\n");
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* compute and issue codes */
    
    hc->symbols = heap + hc->size;

    for (i = 0; i < hc->size; i++) {

      /* we store the code for item i in heap[i] */
      heap[i] = next_code[codelength[i]];
      next_code[codelength[i]]++;

      /* ============================== PROTOCOL ============================== */
      if (do_protocol > 1) {
        fprintf(protocol, "%7d  %7d  %3d  %10d ",
                i,
                get_id_frequency(attr, i),
                codelength[i],
                codelength[i] * get_id_frequency(attr, i));

        bprintf(heap[i], codelength[i], protocol);

        fprintf(protocol, "  %7d  %s\n",
                heap[i], get_string_of_id(attr, i));
      }
      /* ============================== PROTOCOL ============================== */

      /* and put the item itself in the second half of the table */
      heap[hc->size+hc->symindex[codelength[i]]+issued_codes[codelength[i]]] = i;
      issued_codes[codelength[i]]++;
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */


    /* The work itself -- encode the attribute data */

    {
      char *path;

      char hcd_path[CL_MAX_LINE_LENGTH];
      char huf_path[CL_MAX_LINE_LENGTH];
      char sync_path[CL_MAX_LINE_LENGTH];

      Component *corp;

      BFile bfd;
      FILE *sync;

      int cl, code, pos;

      corp = ensure_component(attr, CompCorpus, 0);
      assert(corp);

      if (fname) {
        path = fname;

        sprintf(hcd_path, "%s.hcd", path);
        sprintf(huf_path, "%s.huf", path);
        sprintf(sync_path, "%s.huf.syn", path);
      }
      else {
        path = component_full_name(attr, CompHuffSeq, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(huf_path, path);

        path = component_full_name(attr, CompHuffCodes, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(hcd_path, path);

        path = component_full_name(attr, CompHuffSync, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(sync_path, path);

      }

      Rprintf("- writing code descriptor block to %s\n",  hcd_path);
      if (!WriteHCD(hcd_path, hc)) {
        Rprintf( "ERROR: writing %s failed. Aborted.\n", hcd_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing compressed item sequence to %s\n", huf_path);

      if (!BFopen(huf_path, "w", &bfd)) {
        Rprintf( "ERROR: can't create file %s\n", huf_path);
        perror(huf_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing sync (every %d tokens) to %s\n", SYNCHRONIZATION, sync_path);

      if ((sync = fopen(sync_path, "w")) == NULL) {
        Rprintf( "ERROR: can't create file %s\n", sync_path);
        perror(sync_path);
        rcqp_receive_error(1);
      }

      for (i = 0; i < hc->length; i++) {

        /* SYNCHRONIZE */

        if ((i % SYNCHRONIZATION) == 0) {
          if (i > 0)
            BFflush(&bfd);
          pos = BFposition(&bfd);
          NwriteInt(pos, sync);
        }

        id = cl_cpos2id(attr, i);
        if ((id < 0) || (cderrno != CDA_OK)) {
          cdperror("(aborting) cl_cpos2id() failed");
          rcqp_receive_error(1);
        }

        else {

          assert((id >= 0) && (id < hc->size) && "Internal Error");

          cl = codelength[id];
          code = heap[id];

          if (!BFwriteWord((unsigned int)code, cl, &bfd)) {
            Rprintf( "Error writing code for ID %d (%d, %d bits) at position %d. Aborted.\n",
                    id, code, cl, i);
            rcqp_receive_error(1);
          }

        }

      }

      fclose(sync);
      BFclose(&bfd);
    }
  }

  free(codelength);
  free(heap);
 
  return 1;
}
Exemplo n.º 28
0
/**
 * Entry point of cwb-s-encode.
 *
 * Parses the command line, optionally preloads the regions of an already
 * existing s-attribute (-a mode), then reads one region per line from the
 * input stream. Each region is validated and either written to the disk
 * files straight away or, in -M mode, collected in memory and written out
 * once the whole input has been read.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 */
int
main(int argc, char **argv)
{
  char line[CL_MAX_LINE_LENGTH];
  char *annot;
  Attribute *old_att;
  int start, end;
  int line_no = 0;              /* number of the input line being processed */
  int dropped = 0;              /* annotations ignored because of -S */
  int want_values, have_values;
  int n_regions, k;

  progname = argv[0];
  sencode_parse_options(argc, argv);

  /* -a mode: pull the regions that already exist into the in-memory list */
  if (add_to_existing) {
    if (corpus == NULL) {
      Rprintf( "Error: You have to specify source corpus (-C <corpus>) for -a switch.\n");
      rcqp_receive_error(1);
    }
    old_att = cl_new_attribute(corpus, new_satt.name, ATT_STRUC);
    if ((old_att == NULL) || (cl_max_struc(old_att) <= 0)) {
      if (!silent)
        Rprintf("[No <%s> regions defined (skipped)]\n", new_satt.name);
    }
    else {
      /* the existing attribute must match the requested -S / -V flavour */
      want_values = new_satt.store_values;
      have_values = cl_struc_values(old_att);
      if (want_values && (!have_values)) {
        Rprintf( "Error: Existing regions of -V attribute have no annotations.\n");
        rcqp_receive_error(1);
      }
      else if ((!want_values) && have_values) {
        Rprintf( "Error: Existing regions of -S attributes have annotations.\n");
        rcqp_receive_error(1);
      }
      if (!silent)
        Rprintf("[Loading previous <%s> regions]\n", new_satt.name);

      n_regions = cl_max_struc(old_att);
      k = 0;
      while (k < n_regions) {
        cl_struc2cpos(old_att, k, &start, &end);
        annot = cl_struc2str(old_att, k);
        SL_insert(start, end, annot);
        k++;
      }
    }
  }

  /* main loop over the input (stdin or -f <file>) */
  if (in_memory && (!silent))
    Rprintf("[Reading input data]\n");

  while (fgets(line, CL_MAX_LINE_LENGTH, text_fd) != NULL) {
    line_no++;

    /* a completely filled buffer means the line did not fit */
    if (strlen(line) >= (CL_MAX_LINE_LENGTH - 1)) {
      Rprintf( "BUFFER OVERFLOW, input line #%d is too long:\n>> %s", line_no, line);
      rcqp_receive_error(1);
    }

    if (! sencode_parse_line(line, &start, &end, &annot)) {
      Rprintf( "FORMAT ERROR on line #%d:\n>> %s", line_no, line);
      rcqp_receive_error(1);
    }

    if (new_satt.store_values) {
      if (annot == NULL) {
        Rprintf( "MISSING ANNOTATION on line #%d:\n>> %s", line_no, line);
        rcqp_receive_error(1);
      }
    }
    else if (annot != NULL) {
      /* -S attribute: annotation is ignored; warn on first occurrence only */
      if (dropped == 0)
        Rprintf( "WARNING: Annotation for -S attribute ignored on line #%d (warning issued only once):\n>> %s", line_no, line);
      dropped++;
    }

    /* regions must be well-formed and start after the previous region */
    if ((end < start) || (start <= new_satt.last_cpos)) {
      Rprintf( "RANGE INCONSISTENCY on line #%d:\n>> %s(end of previous region was %d)\n", line_no, line, new_satt.last_cpos);
      rcqp_receive_error(1);
    }

    if ((annot != NULL) && (set_att != set_none)) {
      /* bring the set annotation into standard syntax */
      annot = sencode_check_set(annot);
      if (annot == NULL) {
        Rprintf( "SET ANNOTATION SYNTAX ERROR on line #%d:\n>> %s", line_no, line);
        rcqp_receive_error(1);
      }
    }

    /* debugging output */
    if (debug) {
      Rprintf( "[%d, %d]", start, end);
      if (annot != NULL)
        Rprintf( " <%s>", annot);
      Rprintf( "\n");
    }

    /* -M mode keeps the region in memory; otherwise it goes to disk right now */
    if (in_memory)
      SL_insert(start, end, annot);
    else
      sencode_write_region(start, end, annot);

    cl_free(annot);
  }

  /* -M mode: all input has been read, so flush the in-memory list to disk */
  if (in_memory) {
    SL item;

    if (!silent)
      Rprintf("[Creating encoded disk file(s)]\n");
    SL_rewind();
    for (item = SL_next(); item != NULL; item = SL_next())
      sencode_write_region(item->start, item->end, item->annot);
  }

  /* close files */
  sencode_close_files();

  if (dropped > 0)
    Rprintf( "Warning: %d annotation values dropped for -S attribute '%s'.\n", dropped, new_satt.name);

  rcqp_receive_error(0);
}
Exemplo n.º 29
0
/**
 * Parse the cwb-s-encode command line and set the corresponding globals.
 *
 * Resets text_fd and new_satt.name, walks over all options with getopt(),
 * then applies defaults (stdin as input), checks that exactly the required
 * information was given and, if -C was used, opens the source corpus.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 */
void
sencode_parse_options(int argc, char **argv)
{
  extern char *optarg;
  extern int optind;

  int opt;
  char *out_dir = ".";              /* data files go to the current directory by default */
  char *registry_path = NULL;       /* only needed if a source corpus is given */
  char *source_corpus_name = NULL;  /* set by -C, optional */

  /* NULL means "not set yet": stdin is substituted after option parsing */
  text_fd = NULL;
  /* reset so that we can tell afterwards whether -S or -V was given */
  new_satt.name = NULL;

  while ((opt = getopt(argc, argv, "+qBd:f:msDS:V:r:C:Mah")) != EOF) {
    switch (opt) {

    /* ---- simple flags ---- */
    case 'q':                   /* be silent (quiet) */
      silent++;
      break;

    case 'B':                   /* strip blanks */
      strip_blanks_in_values++;
      break;

    case 'D':                   /* debug mode */
      debug++;
      break;

    case 's':                   /* strict syntax checks on set attribute */
      set_syntax_strict++;
      break;

    case 'm':                   /* set ('multi-value') attribute */
      /* not known yet whether it is '|'-delimited or "split on whitespace" */
      set_att = set_any;
      break;

    case 'M':                   /* compile list in memory, then write to disk */
      in_memory++;
      break;

    case 'a':                   /* add to existing attribute (implies -M) */
      add_to_existing++;
      in_memory++;
      break;

    /* ---- options taking an argument ---- */
    case 'd':                   /* directory for generated data files */
      out_dir = optarg;
      break;

    case 'r':                   /* registry directory */
      registry_path = optarg;
      break;

    case 'C':                   /* source corpus */
      source_corpus_name = optarg;
      break;

    case 'f':                   /* read input from file */
      if (text_fd) {
        Rprintf( "Error: -f option used twice\n\n");
        rcqp_receive_error(1);
      }
      text_fd = fopen(optarg, "r");
      if (text_fd == NULL) {
        perror("Can't open input file");
        rcqp_receive_error(1);
      }
      break;

    /* ---- attribute declaration: -S (no annotations) / -V (with annotations) ---- */
    case 'S':
    case 'V':
      sencode_declare_new_satt(optarg, out_dir, (opt == 'V') ? 1 : 0);
      if (optind < argc) {
        if (opt == 'S')
          Rprintf( "Error: -S <att> must be last flag on command line.\n\n");
        else
          Rprintf( "Error: -V <att> must be last flag on command line.\n\n");
        rcqp_receive_error(1);
      }
      break;

    /* -h or anything unknown: show usage */
    case 'h':
    default:
      sencode_usage();
      break;
    }
  }

  /* defaults and obligatory values */
  if (!text_fd)
    text_fd = stdin;

  if (new_satt.name == NULL) {
    Rprintf( "Error: either -S or -V flag must be specified.\n\n");
    rcqp_receive_error(1);
  }

  if (optind < argc) {
    Rprintf( "Error: extra arguments.\n\n");
    rcqp_receive_error(1);
  }

  /* nothing more to do unless -C <corpus> was specified */
  if (source_corpus_name == NULL)
    return;

  corpus = cl_new_corpus(registry_path, source_corpus_name);
  if (corpus == NULL) {
    Rprintf( "Error: Can't find corpus <%s>!\n", source_corpus_name);
    rcqp_receive_error(1);
  }
}
Exemplo n.º 30
0
/**
 * Checks a huffcoded attribute for errors by decompressing it.
 *
 * The code descriptor block, the compressed item sequence and the sync file
 * are read back. For every corpus position the decoded item is compared with
 * the ID returned by cl_cpos2id(), and at every synchronisation point the
 * offset stored in the sync file is compared with the actual position in the
 * compressed file.
 *
 * This function assumes that compute_code_lengths() has been called
 * beforehand and made sure that the _uncompressed_ token sequence is
 * used by CL access functions.
 *
 * Error handling: on ANY failure (unreadable file, size mismatch, wrong sync
 * offset, read error, wrong decoded token) a message is printed, the files
 * opened by this function are closed, rcqp_receive_error(1) is called and the
 * function returns. The hint that the uncompressed corpus file may be deleted
 * is printed only if the complete sequence was validated successfully.
 *
 * @param attr  The attribute to check.
 * @param fname Base filename to use for the three compressed-attribute files.
 *              Can be NULL, in which case the filenames in the attribute are used.
 */
void 
decode_check_huff(Attribute *attr, char *fname)
{
  BFile bfd;
  FILE *sync = NULL;            /* NULL until the sync file has been opened */
  HCD hc;

  int bfd_is_open = 0;          /* set once BFopen() has succeeded */

  int pos, size, sync_offset, offset;

  int l, v;
  int item, true_item;
  
  unsigned char bit;

  char hcd_path[CL_MAX_LINE_LENGTH];
  char huf_path[CL_MAX_LINE_LENGTH];
  char sync_path[CL_MAX_LINE_LENGTH];

  
  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  if (fname) {
    sprintf(hcd_path, "%s.hcd", fname);
    sprintf(huf_path, "%s.huf", fname);
    sprintf(sync_path, "%s.huf.syn", fname);
  }
  else {

    char *path;

    /* only the returned path is checked: component_full_name doesn't (re)set
     * cderrno, so a stale error code must not trip the assertion */
    path = component_full_name(attr, CompHuffSeq, NULL);
    assert(path);
    strcpy(huf_path, path);
    
    path = component_full_name(attr, CompHuffCodes, NULL);
    assert(path);
    strcpy(hcd_path, path);

    path = component_full_name(attr, CompHuffSync, NULL);
    assert(path);
    strcpy(sync_path, path);
    
  }

  Rprintf("- reading code descriptor block from %s\n", hcd_path);
  if (!ReadHCD(hcd_path, &hc)) {
    Rprintf( "ERROR: reading %s failed. Aborted.\n",  hcd_path);
    goto fail;
  }

  Rprintf("- reading compressed item sequence from %s\n", huf_path);
  if (!BFopen(huf_path, "r", &bfd)) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", huf_path);
    perror(huf_path);
    goto fail;
  }
  bfd_is_open = 1;

  Rprintf("- reading sync (mod %d) from %s\n", SYNCHRONIZATION, sync_path);
  if ((sync = fopen(sync_path, "r")) == NULL) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", sync_path);
    perror(sync_path);
    goto fail;
  }

  size = cl_max_cpos(attr);
  if (size != hc.length) {
    Rprintf( "ERROR: wrong corpus size (%d tokens) in %s (correct size: %d)\n",
            hc.length, hcd_path, size);
    goto fail;
  }

  for (pos = 0; pos < hc.length; pos++) {

    if ((pos % SYNCHRONIZATION) == 0) {
      offset = BFposition(&bfd); /* need to get offset before flushing (because flushing fills the bit buffer and advances offset to the following byte!) */
      if (pos > 0)
        BFflush(&bfd);
      sync_offset = -1;                /* make sure we get an error if read below fails */
      NreadInt(&sync_offset, sync);
      if (offset != sync_offset) {
        Rprintf( "ERROR: wrong sync offset %d (true offset %d) at cpos %d. Aborted.\n",
                sync_offset, offset, pos);
        goto fail;
      }
    }

    if (!BFread(&bit, 1, &bfd)) {
      Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
      goto fail;
    }

    /* read bit by bit until the value accumulated so far is a valid code
     * of the current length l, i.e. until it reaches min_code[l] */
    v = (bit ? 1 : 0);
    l = 1;
    while (v < hc.min_code[l]) {
      if (!BFread(&bit, 1, &bfd)) {
        Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
        goto fail;
      }
      v <<= 1;
      if (bit)
        v++;
      l++;
    }
    item = hc.symbols[hc.symindex[l] + v - hc.min_code[l]];

    true_item = cl_cpos2id(attr, pos);
    if (item != true_item) {
      /* a wrong token means the compressed data is unusable: stop here so
       * that the "you can delete" hint below is never printed for it */
      Rprintf( "ERROR: wrong token (id=%d) at cpos %d (correct id=%d). Aborted.\n",
              item, pos, true_item);
      goto fail;
    }

  }
  fclose(sync);
  BFclose(&bfd);

  /* tell the user it's safe to delete the CORPUS component now */
  Rprintf("!! You can delete the file <%s> now.\n",
         component_full_name(attr, CompCorpus, NULL));
  
  return;

 fail:
  /* common error exit: release whatever was opened, then signal the error.
   * rcqp_receive_error() is defined elsewhere; the explicit return makes sure
   * validation stops here even if that call returns to us. */
  if (sync != NULL)
    fclose(sync);
  if (bfd_is_open)
    BFclose(&bfd);
  rcqp_receive_error(1);
  return;
}