Example #1
0
/**
 * Sets up a component for the given attribute.
 *
 * If the component of the specified ComponentID does not already exist,
 * a new Component object is created, set up, and assigned to the attribute's
 * component array. Finally, the component path is initialised using the
 * path argument.
 *
 * @see component_full_name
 * @param attribute The Attribute for which to create this component.
 * @param cid       The ID of the component to create.
 * @param path      Path to be passed to component_full_name. Can be NULL.
 * @return          The new component if all is OK. If a component with the
 *                  specified ID already exists, it is returned and no new
 *                  component is created (and a warning message is printed
 *                  to STDERR). If the attribute is NULL, return is NULL
 *                  (and a warning is printed).
 */
Component *
declare_component(Attribute *attribute, ComponentID cid, char *path)
{
  Component *component;

  if (attribute == NULL) {
    fprintf(stderr, "attributes:declare_component(): \n"
            "  NULL attribute passed in declaration of %s component\n",
            cid_name(cid));
    return NULL;
  }
  else if ((component = attribute->any.components[cid]) == NULL) {

    component = new(Component);

    component->id = cid;
    component->corpus = attribute->any.mother;
    component->attribute = attribute;
    component->path = NULL;

    init_mblob(&(component->data));

    /* important to do this before the call to component_full_name */
    attribute->any.components[cid] = component;
    
    /* initialize component path */
    (void) component_full_name(attribute, cid, path);

    return component;
  }
  else {

    fprintf(stderr, "attributes:declare_component(): Warning:\n"
            "  Component %s of %s declared twice\n",
            cid_name(cid), attribute->any.name);
    return component;
  }

  /* notreached */
  assert("Notreached point reached ..." && 0);
  return NULL;
}
Example #2
0
/**
 * Checks a huffcoded attribute for errors by decompressing it.
 *
 * This function assumes that compute_code_lengths() has been called
 * beforehand and made sure that the _uncompressed_ token sequence is
 * used by CL access functions.
 *
 * @param attr  The attribute to check.
 * @param fname Base filename to use for the three compressed-attribute files.
 *              Can be NULL, in which case the filenames in the attribute are used.
 */
void 
decode_check_huff(Attribute *attr, char *fname)
{
  BFile bfd;
  FILE *sync;
  HCD hc;

  int pos, size, sync_offset, offset;

  int l, v;
  int item, true_item;
  
  unsigned char bit;

  char hcd_path[CL_MAX_LINE_LENGTH];
  char huf_path[CL_MAX_LINE_LENGTH];
  char sync_path[CL_MAX_LINE_LENGTH];

  
  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  if (fname) {
    sprintf(hcd_path, "%s.hcd", fname);
    sprintf(huf_path, "%s.huf", fname);
    sprintf(sync_path, "%s.huf.syn", fname);
  }
  else {

    char *path;

    path = component_full_name(attr, CompHuffSeq, NULL);
    assert(path && (cderrno == CDA_OK));
    strcpy(huf_path, path);
    
    path = component_full_name(attr, CompHuffCodes, NULL);
    assert(path && (cderrno == CDA_OK));
    strcpy(hcd_path, path);

    path = component_full_name(attr, CompHuffSync, NULL);
    assert(path && (cderrno == CDA_OK));
    strcpy(sync_path, path);
    
  }

  Rprintf("- reading code descriptor block from %s\n", hcd_path);
  if (!ReadHCD(hcd_path, &hc)) {
    Rprintf( "ERROR: reading %s failed. Aborted.\n",  hcd_path);
    rcqp_receive_error(1);
  }

  Rprintf("- reading compressed item sequence from %s\n", huf_path);
  if (!BFopen(huf_path, "r", &bfd)) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", huf_path);
    perror(huf_path);
    rcqp_receive_error(1);
  }

  Rprintf("- reading sync (mod %d) from %s\n", SYNCHRONIZATION, sync_path);
  if ((sync = fopen(sync_path, "r")) == NULL) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", sync_path);
    perror(sync_path);
    rcqp_receive_error(1);
  }

  size = cl_max_cpos(attr);
  if (size != hc.length) {
    Rprintf( "ERROR: wrong corpus size (%d tokens) in %s (correct size: %d)\n",
            hc.length, hcd_path, size);
    rcqp_receive_error(1);
  }

  for (pos = 0; pos < hc.length; pos++) {

    if ((pos % SYNCHRONIZATION) == 0) {
      offset = BFposition(&bfd); /* need to get offset before flushing (because flushing fills the bit buffer and advances offset to the following byte!) */
      if (pos > 0)
        BFflush(&bfd);
      sync_offset = -1;                /* make sure we get an error if read below fails */
      NreadInt(&sync_offset, sync);
      if (offset != sync_offset) {
        Rprintf( "ERROR: wrong sync offset %d (true offset %d) at cpos %d. Aborted.\n",
                sync_offset, offset, pos);
        rcqp_receive_error(1);
      }
    }

    if (!BFread(&bit, 1, &bfd)) {
      Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
      rcqp_receive_error(1);
    }

    v = (bit ? 1 : 0);
    l = 1;
    while (v < hc.min_code[l]) {
      if (!BFread(&bit, 1, &bfd)) {
        Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
        return;
      }
      v <<= 1;
      if (bit)
        v++;
      l++;
    }
    item = hc.symbols[hc.symindex[l] + v - hc.min_code[l]];

    true_item = cl_cpos2id(attr, pos);
    if (item != true_item) {
      Rprintf( "ERROR: wrong token (id=%d) at cpos %d (correct id=%d). Aborted.\n",
              item, pos, true_item);
    }

  }
  fclose(sync);
  BFclose(&bfd);

  /* tell the user it's safe to delete the CORPUS component now */
  Rprintf("!! You can delete the file <%s> now.\n",
         component_full_name(attr, CompCorpus, NULL));
  
  return;                        /* exits on error, so there's no return value */
}
Example #3
0
/**
 * Compresses the token stream of a p-attribute.
 *
 * Three files are created: the compressed token stream, the descriptor block,
 * and a sync file.
 *
 * @param attr  The attribute to compress.
 * @param hc    Location for the resulting Huffmann code descriptor block.
 * @param fname Base filename for the resulting files.
 */
int 
compute_code_lengths(Attribute *attr, HCD *hc, char *fname)
{
  int id, i, h;

  int nr_codes = 0;

  int *heap = NULL;
  unsigned *codelength = NULL;        /* was char[], probably to save space; but that's unnecessary and makes gcc complain */

  int issued_codes[MAXCODELEN];
  int next_code[MAXCODELEN];

  long sum_bits;


  Rprintf("COMPRESSING TOKEN STREAM of %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* I need the following components:
   * - CompCorpus
   * - CompCorpusFreqs
   * - CompLexicon
   * - CompLexiconIdx
   * and want to force the CL to use them rather than compressed data. 
   */

  {
    Component *comp;

    if ((comp = ensure_component(attr, CompCorpus, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the CORPUS component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexicon, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXION component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexiconIdx, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXIDX component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompCorpusFreqs, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the FREQS component.\n"
              "Run 'makeall -r %s -c FREQS %s %s' in order to create it.\n",
              corpus->registry_dir, corpus->registry_name, attr->any.name);
      rcqp_receive_error(1);
    }

  }

  /*
   * strongly follows Witten/Moffat/Bell: ``Managing Gigabytes'', 
   * pp. 335ff.
   */

  hc->size = cl_max_id(attr);                /* the size of the attribute (nr of items) */
  if ((hc->size <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_id() failed");
    rcqp_receive_error(1);
  }

  hc->length = cl_max_cpos(attr); /* the length of the attribute (nr of tokens) */
  if ((hc->length <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_cpos() failed");
    rcqp_receive_error(1);
  }

  hc->symbols = NULL;
  hc->min_codelen = 100;
  hc->max_codelen = 0;

  memset((char *)hc->lcount, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->min_code, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->symindex, '\0', MAXCODELEN * sizeof(int));

  memset((char *)issued_codes, '\0', MAXCODELEN * sizeof(int));

  codelength = (unsigned *)cl_calloc(hc->size, sizeof(unsigned));


  /* =========================================== make & initialize the heap */

  heap = (int *)cl_malloc(hc->size * 2 * sizeof(int));

  for (i = 0; i < hc->size; i++) {
    heap[i] = hc->size + i;
    heap[hc->size+i] = get_id_frequency(attr, i) + 1;
    /* add-one trick needed to avoid unsupported Huffman codes > 31 bits for very large corpora of ca. 2 billion words:
       theoretical optimal code length for hapax legomena in such corpora is ca. 31 bits, and the Huffman algorithm 
       sometimes generates 32-bit codes; with add-one trick, the theoretical optimal code length is always <= 30 bits */    
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0)
    fprintf(protocol, "Allocated heap with %d cells for %d items\n\n",
            hc->size * 2, hc->size);
  if (do_protocol > 2)
    print_heap(heap, hc->size, "After Initialization");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 1 */


  h = hc->size;

  /*
   * we address the heap in the following manner: when we start array
   * indices at 1, the left child is at 2i, and the right child is at
   * 2i+1. So we maintain this scheme and decrement just before
   * adressing the array. 
   */

  /*
   * construct the initial min-heap
   */

  for (i = hc->size/2; i > 0; i--) {

    /* do:
     * bottom up, left to right,
     * for each root of each subtree, sift if necessary
     */

    sift(heap, h, i);
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 2) {
    print_heap(heap, hc->size, "Initial Min-Heap");
    fprintf(protocol, "\n");
  }
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 2 */

  /* smallest item at top of heap now, remove the two smallest items
   * and sift, find second smallest by removing top and sifting, as
   * long as we have more than one root */



  while (h > 1) {
    
    int pos[2];

    for (i = 0; i < 2; i++) {

      /* remove topmost (i.e. smallest) item */

      pos[i] = heap[0];

      /* remove and sift, to reobtain heap integrity: move ``last''
       * item to top of heap and sift */

      heap[0] = heap[--h];
      
      sift(heap, h, 1);
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 3) {
      fprintf(protocol, "Removed     smallest item %d with freq %d\n",
              pos[0], heap[pos[0]]);
      fprintf(protocol, "Removed 2nd smallest item %d with freq %d\n",
              pos[1], heap[pos[1]]);
    }
    /* ============================== PROTOCOL ============================== */

    /*
     * pos[0] and pos[1] contain pointers to the two smallest items
     * now. since h was decremented twice, h and h+1 are now empty and
     * become the accumulated freq of pos[i]. The individual
     * frequencies are not needed any more, so pointers to h+1 (the
     * acc freq) are stored there instead (tricky, since freq cell
     * becomes pointer cell). So, what happens here, is to include a
     * new element in the heap. */

    heap[h] = h+1;
    heap[h+1] = heap[pos[0]] + heap[pos[1]]; /* accumulated freq */
    heap[pos[0]] = heap[pos[1]] = h+1; /* pointers! */
    h++;                        /* we put a new element into heap */

    /*
     * now, swap it up until we reobtain heap integrity
     */

    {
      register int parent, current;
      
      current = h;
      
      parent = current >> 1;

      while ((parent > 0) &&
             (heap[heap[parent-1]] > heap[heap[current-1]])) {

        int tmp;

        tmp = heap[parent-1];
        heap[parent-1] = heap[current-1];
        heap[current-1] = tmp;

        current = parent;
        parent = current >> 1;
      }
    }
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 3)
    fprintf(protocol, "\n");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 3 */

  /* compute the code lengths. We don't have any freqs in heap any
   * more, only pointers to parents */

  heap[0] = -1U;

  /* root has a depth of 0 */

  heap[1] = 0;

  /* we trust in what they say on p. 345 */

  for (i = 2; i < hc->size * 2; i++)
    heap[i] = heap[heap[i]]+1;


  /* collect the lengths */

  sum_bits = 0L;

  for (i = 0; i < hc->size; i++) {

    int cl = heap[i+hc->size];

    sum_bits += cl * get_id_frequency(attr, i);

    codelength[i] = cl;
    if (cl == 0)
      continue;

    if (cl > hc->max_codelen)
      hc->max_codelen = cl;

    if (cl < hc->min_codelen)
      hc->min_codelen = cl;

    hc->lcount[cl]++;
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0) {

    fprintf(protocol, "Minimal code length: %3d\n", hc->min_codelen);
    fprintf(protocol, "Maximal code length: %3d\n", hc->max_codelen);
    fprintf(protocol, "Compressed code len: %10ld bits, %10ld (+1) bytes\n\n\n",
            sum_bits, sum_bits/8);

  }
  /* ============================== PROTOCOL ============================== */

  if (hc->max_codelen >= MAXCODELEN) {
    Rprintf( "Error: Huffman codes too long (%d bits, current maximum is %d bits).\n", hc->max_codelen, MAXCODELEN-1);
    Rprintf( "       Please contact the CWB development team for assistance.\n");
    rcqp_receive_error(1);
  }

  if ((hc->max_codelen == 0) && (hc->min_codelen == 100)) {

    Rprintf( "Problem: No output generated -- no items?\n");
    nr_codes = 0;
  }
  else {

    hc->min_code[hc->max_codelen] = 0;
    
    for (i = hc->max_codelen-1; i > 0; i--)
      hc->min_code[i] = (hc->min_code[i+1] + hc->lcount[i+1]) >> 1;

    hc->symindex[hc->min_codelen] = 0;
    for (i = hc->min_codelen+1; i <= hc->max_codelen; i++)
      hc->symindex[i] = hc->symindex[i-1] + hc->lcount[i-1];


    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 0) {

      int sum_codes = 0;

      fprintf(protocol, " CL  #codes  MinCode   SymIdx\n");
      fprintf(protocol, "----------------------------------------\n");

      for (i = hc->min_codelen; i <= hc->max_codelen; i++) {
        sum_codes += hc->lcount[i];
        fprintf(protocol, "%3d %7d  %7d  %7d\n", 
                i, hc->lcount[i], hc->min_code[i], hc->symindex[i]);
      }

      fprintf(protocol, "----------------------------------------\n");
      fprintf(protocol, "    %7d\n", sum_codes);
    }
    /* ============================== PROTOCOL ============================== */


    for (i = 0; i < MAXCODELEN; i++)
      next_code[i] = hc->min_code[i];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "\n");
      fprintf(protocol, "   Item   f(item)  CL      Bits     Code, String\n");
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* compute and issue codes */
    
    hc->symbols = heap + hc->size;

    for (i = 0; i < hc->size; i++) {

      /* we store the code for item i in heap[i] */
      heap[i] = next_code[codelength[i]];
      next_code[codelength[i]]++;

      /* ============================== PROTOCOL ============================== */
      if (do_protocol > 1) {
        fprintf(protocol, "%7d  %7d  %3d  %10d ",
                i,
                get_id_frequency(attr, i),
                codelength[i],
                codelength[i] * get_id_frequency(attr, i));

        bprintf(heap[i], codelength[i], protocol);

        fprintf(protocol, "  %7d  %s\n",
                heap[i], get_string_of_id(attr, i));
      }
      /* ============================== PROTOCOL ============================== */

      /* and put the item itself in the second half of the table */
      heap[hc->size+hc->symindex[codelength[i]]+issued_codes[codelength[i]]] = i;
      issued_codes[codelength[i]]++;
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */


    /* The work itself -- encode the attribute data */

    {
      char *path;

      char hcd_path[CL_MAX_LINE_LENGTH];
      char huf_path[CL_MAX_LINE_LENGTH];
      char sync_path[CL_MAX_LINE_LENGTH];

      Component *corp;

      BFile bfd;
      FILE *sync;

      int cl, code, pos;

      corp = ensure_component(attr, CompCorpus, 0);
      assert(corp);

      if (fname) {
        path = fname;

        sprintf(hcd_path, "%s.hcd", path);
        sprintf(huf_path, "%s.huf", path);
        sprintf(sync_path, "%s.huf.syn", path);
      }
      else {
        path = component_full_name(attr, CompHuffSeq, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(huf_path, path);

        path = component_full_name(attr, CompHuffCodes, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(hcd_path, path);

        path = component_full_name(attr, CompHuffSync, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(sync_path, path);

      }

      Rprintf("- writing code descriptor block to %s\n",  hcd_path);
      if (!WriteHCD(hcd_path, hc)) {
        Rprintf( "ERROR: writing %s failed. Aborted.\n", hcd_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing compressed item sequence to %s\n", huf_path);

      if (!BFopen(huf_path, "w", &bfd)) {
        Rprintf( "ERROR: can't create file %s\n", huf_path);
        perror(huf_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing sync (every %d tokens) to %s\n", SYNCHRONIZATION, sync_path);

      if ((sync = fopen(sync_path, "w")) == NULL) {
        Rprintf( "ERROR: can't create file %s\n", sync_path);
        perror(sync_path);
        rcqp_receive_error(1);
      }

      for (i = 0; i < hc->length; i++) {

        /* SYNCHRONIZE */

        if ((i % SYNCHRONIZATION) == 0) {
          if (i > 0)
            BFflush(&bfd);
          pos = BFposition(&bfd);
          NwriteInt(pos, sync);
        }

        id = cl_cpos2id(attr, i);
        if ((id < 0) || (cderrno != CDA_OK)) {
          cdperror("(aborting) cl_cpos2id() failed");
          rcqp_receive_error(1);
        }

        else {

          assert((id >= 0) && (id < hc->size) && "Internal Error");

          cl = codelength[id];
          code = heap[id];

          if (!BFwriteWord((unsigned int)code, cl, &bfd)) {
            Rprintf( "Error writing code for ID %d (%d, %d bits) at position %d. Aborted.\n",
                    id, code, cl, i);
            rcqp_receive_error(1);
          }

        }

      }

      fclose(sync);
      BFclose(&bfd);
    }
  }

  free(codelength);
  free(heap);
 
  return 1;
}
Example #4
0
/**
 * Checks a compressed reversed index for errors by decompressing it.
 *
 * This function this assumes that compress_reversed_index() has been called
 * beforehand and made sure that the _uncompressed_ index is used by CL
 * access functions.
 *
 * @param attr      The attribute to check the index of.
 * @param output_fn Base name for the compressed RDX files to be read
 *                  (if this is null, filename swill be taken from the
 *                  attribute).
 */
void 
decompress_check_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];
  
  int nr_elements;
  int element_freq;
  int corpus_size;
  int pos, gap;

  int b;
  int i, k;

  BFile data_file;
  FILE *index_file;

  PositionStream PStream;
  int true_pos;


  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  if (output_fn) {
    sprintf(data_fname, "%s.crc", output_fn);
    sprintf(index_fname, "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    strcpy(data_fname, s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    strcpy(index_fname, s);
  }
  
  if (! BFopen(data_fname, "r", &data_file)) {
    Rprintf( "ERROR: can't open file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index from %s\n", data_fname);
  
  if ((index_file = fopen(index_fname, "r")) == NULL) {
    Rprintf( "ERROR: can't open file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index offsets from %s\n", index_fname);


  for (i = 0; i < nr_elements; i++) {

    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    b = compute_ba(element_freq, corpus_size);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);

    pos = 0;
    for (k = 0; k < element_freq; k++) {

      gap = read_golomb_code_bf(b, &data_file);
      pos += gap;

      if (1 != cl_read_stream(PStream, &true_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }
      if (pos != true_pos) {
        Rprintf( "ERROR: wrong occurrence of token #%d at cpos %d (correct cpos: %d). Aborted.\n",
              i, pos, true_pos);
        compressrdx_cleanup(1);
      }

    }
    
    cl_delete_stream(&PStream);
    BFflush(&data_file);
  }

  fclose(index_file);
  BFclose(&data_file);

  /* tell the user it's safe to delete the REVCORP and REVCIDX components now */
  Rprintf("!! You can delete the file <%s> now.\n",
         component_full_name(attr, CompRevCorpus, NULL));
  Rprintf("!! You can delete the file <%s> now.\n",
         component_full_name(attr, CompRevCorpusIdx, NULL));
  
  return;
}
Example #5
0
/**
 * Compresses the reversed index of a p-attribute.
 *
 * @param attr      The attribute to compress the index of.
 * @param output_fn Base name for the compressed RDX files to be written
 *                  (if this is null, filenames will be taken from the
 *                  attribute).
 */
void 
compress_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];
  
  int nr_elements;
  int element_freq;
  int corpus_size;
  int last_pos, gap, fpos;

  int b;

  int i, k;

  BFile data_file;
  FILE *index_file = NULL;

  PositionStream PStream;
  int new_pos;


  Rprintf("COMPRESSING INDEX of %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  /* ensure that we do NOT use the compressed index while building the
   * compressed index (yeah, a nasty thing that). That is, load the
   * .corpus.rev and .corpus.rdx components in order to force
   * subsequent CL calls to use the uncompressed data.
   */

  {
    Component *comp;

    if ((comp = ensure_component(attr, CompRevCorpus, 0)) == NULL) {
      Rprintf( "Index compression requires the REVCORP component\n");
      compressrdx_cleanup(1);
    }

    if ((comp = ensure_component(attr, CompRevCorpusIdx, 0)) == NULL) {
      Rprintf( "Index compression requires the REVCIDX component\n");
      compressrdx_cleanup(1);
    }

  }

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  if (output_fn) {
    sprintf(data_fname, "%s.crc", output_fn);
    sprintf(index_fname, "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    strcpy(data_fname, s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    strcpy(index_fname, s);
  }
  
  if (! BFopen(data_fname, "w", &data_file)) {
    Rprintf( "ERROR: can't create file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index to %s\n", data_fname);
  
  if ((index_file = fopen(index_fname, "wb")) == NULL) {
    Rprintf( "ERROR: can't create file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index offsets to %s\n", index_fname);

  for (i = 0; i < nr_elements; i++) {
    
    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }
    
    b = compute_ba(element_freq, corpus_size);
    
    fpos = BFposition(&data_file);
    NwriteInt(fpos, index_file);
    
    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);
    
    last_pos = 0;
    for (k = 0; k < element_freq; k++) {
      if (1 != cl_read_stream(PStream, &new_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }
      
      gap = new_pos - last_pos;
      last_pos = new_pos;
      
      if (debug_cwb_compress_rdx)
        fprintf(debug_output, "%8d:  gap=%4d, b=%4d\n", codepos, gap, b);
      
      write_golomb_code(gap, b, &data_file);
      codepos++;
    }
    
    cl_delete_stream(&PStream);
    BFflush(&data_file);
  }
    
  fclose(index_file);
  BFclose(&data_file);

  return;
}
Example #6
0
/**
 * Main function for cwb-align-encode.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 */
int
main(int argc, char *argv[])
{
  int argindex;                         /* index of first argument in argv[] */

  char *align_name = NULL;              /* name of the .align file */
  FILE *af = NULL;                      /* alignment file handle */
  int af_is_pipe;                       /* need to know whether to call fclose() or pclose() */
  char alx_name[CL_MAX_LINE_LENGTH];    /* full pathname of .alx file */
  char alg_name[CL_MAX_LINE_LENGTH];    /* full pathname of optional .alg file */
  FILE *alx=NULL, *alg=NULL;            /* file handles for .alx and optional .alg file */

  char line[CL_MAX_LINE_LENGTH];        /* one line of input from <infile> */

  char corpus1_name[CL_MAX_FILENAME_LENGTH];
  char corpus2_name[CL_MAX_FILENAME_LENGTH];
  char s1_name[CL_MAX_FILENAME_LENGTH];
  char s2_name[CL_MAX_FILENAME_LENGTH];
  Corpus *corpus1, *corpus2;            /* corpus handles */
  Attribute *w1, *w2;                   /* attribute handles for 'word' attributes; used to determine corpus size */
  int size1, size2;                     /* size of source & target corpus */

  Corpus *source_corpus;                /* encode alignment in this corpus (depends on -R flag, important for -D option) */
  char *source_corpus_name;             /* just for error messages */
  char *attribute_name;                 /* name of alignment attribute (depends on -R flag, must be lowercase) */

  int f1,l1,f2,l2;                      /* alignment regions */
  int current1, current2;
  int mark, n_0_1, n_1_0;

  int l;

  progname = argv[0];

  /* parse command line and read arguments */
  argindex = alignencode_parse_args(argc, argv, 1);
  align_name = argv[argindex];

  /* open alignment file and parse header; .gz files are automatically decompressed */
  af_is_pipe = 0;
  l = strlen(align_name);
  if ((l > 3) && (strncasecmp(align_name + l - 3, ".gz", 3) == 0)) {
    char *pipe_cmd = (char *) cl_malloc(l+10);
    sprintf(pipe_cmd, "gzip -cd %s", align_name); /* write .gz file through gzip pipe */
    af = popen(pipe_cmd, "r");
    if (af == NULL) {
      perror(pipe_cmd);
     Rprintf( "%s: can't read compressed file %s\n", progname, align_name);
      rcqp_receive_error(1);
    }
    af_is_pipe = 1;
    cl_free(pipe_cmd);
  }
  else {
    af = fopen(align_name, "r");
    if (af == NULL) {
      perror(align_name);
     Rprintf( "%s: can't read file %s\n", progname, align_name);
      rcqp_receive_error(1);
    }
  }

  /* read header = first line */
  fgets(line, CL_MAX_LINE_LENGTH, af);
  if (4 != sscanf(line, "%s %s %s %s", corpus1_name, s1_name, corpus2_name, s2_name)) {
   Rprintf( "%s: %s not in .align format\n", progname, align_name);
   Rprintf( "wrong header: %s", line);
    rcqp_receive_error(1);
  }
  if (verbose) {
    if (reverse)
     Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus2_name, corpus1_name, align_name);
    else
     Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus1_name, corpus2_name, align_name);
  }

  /* open corpora and determine their sizes (for validity checks and compatibility mode) */
  if (NULL == (corpus1 = cl_new_corpus(registry_dir, corpus1_name))) {
   Rprintf( "%s: can't open corpus %s\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  if (NULL == (corpus2 = cl_new_corpus(registry_dir, corpus2_name))) {
   Rprintf( "%s: can't open corpus %s\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }
  if (NULL == (w1 = cl_new_attribute(corpus1, "word", ATT_POS))) {
   Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  if (NULL == (w2 = cl_new_attribute(corpus2, "word", ATT_POS))) {
   Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }

  size1 = cl_max_cpos(w1);
  if (size1 <= 0) {
   Rprintf( "%s: data access error (%s.word)\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  size2 = cl_max_cpos(w2);
  if (size2 <= 0) {
   Rprintf( "%s: data access error (%s.word)\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }

  /* now work out the actual source corpus and the alignment attribute name (depending on -R flag) */
  source_corpus = (reverse) ? corpus2 : corpus1;
  source_corpus_name = (reverse) ? corpus2_name : corpus1_name;
  attribute_name = cl_strdup((reverse) ? corpus1_name : corpus2_name);
  cl_id_tolower(attribute_name); /* fold attribute name to lowercase */

  /* with -D option, determine data file name(s) from actual source corpus;
     otherwise use directory specified with -d and the usual naming conventions */
  if (data_dir_from_corpus) {
    Attribute *alignment = cl_new_attribute(source_corpus, attribute_name, ATT_ALIGN);
    char *comp_pathname;

    if (alignment == NULL) {
     Rprintf( "%s: alignment attribute %s.%s not declared in registry file\n",
              progname, source_corpus_name, attribute_name);
      rcqp_receive_error(1);
    }
    comp_pathname = component_full_name(alignment, CompXAlignData, NULL);
    if (comp_pathname == NULL) {
     Rprintf( "%s: can't determine pathname for .alx file (internal error)\n", progname);
      rcqp_receive_error(1);
    }
    strcpy(alx_name, comp_pathname); /* need to strcpy because component_full_name() returns pointer to internal buffer */
    if (compatibility) {
      comp_pathname = component_full_name(alignment, CompAlignData, NULL);
      if (comp_pathname == NULL) {
       Rprintf( "%s: can't determine pathname for .alg file (internal error)\n", progname);
        rcqp_receive_error(1);
      }
      strcpy(alg_name, comp_pathname);
    }
  }
  else {
    sprintf(alx_name, "%s" SUBDIR_SEP_STRING "%s.alx", data_dir, attribute_name);
    if (compatibility)
      sprintf(alg_name, "%s" SUBDIR_SEP_STRING "%s.alg", data_dir, attribute_name);
  }

  /* now open output file(s) */
  alx = fopen(alx_name, "wb");
  if (alx == NULL) {
    perror(alx_name);
   Rprintf( "%s: can't write file %s\n", progname, alx_name);
    rcqp_receive_error(1);
  }
  if (verbose)
   Rprintf("Writing file %s ...\n", alx_name);

  if (compatibility) {
    alg = fopen(alg_name, "wb");
    if (alg == NULL) {
      perror(alg_name);
     Rprintf( "%s: can't write file %s\n", progname, alg_name);
      rcqp_receive_error(1);
    }

    if (verbose)
     Rprintf("Writing file %s ...\n", alg_name);
  }

  /* main encoding loop */
  f1 = f2 = l1 = l2 = 0;
  mark = -1;                        /* check that regions occur in ascending order */
  current1 = current2 = -1;         /* for compatibility mode */
  n_0_1 = n_1_0 = 0;                /* number of 0:1 and 1:0 alignments, which are skipped */
  while (! feof(af)) {
    if (NULL == fgets(line, CL_MAX_LINE_LENGTH, af))
      break;                        /* end of file (or read error, which we choose to ignore) */
    if (4 != sscanf(line, "%d %d %d %d", &f1, &l1, &f2, &l2)) {
     Rprintf( "%s: input format error: %s", progname, line);
      rcqp_receive_error(1);
    }

    /* skip 0:1 and 1:0 alignments */
    if (l1 < f1) {
      n_0_1++; continue;
    }
    if (l2 < f2) {
      n_1_0++; continue;
    }

    /* check that source regions are non-overlapping and in ascending order */
    if (((reverse) ? f2 : f1) <= mark) {
     Rprintf( "%s: source regions of alignment must be in ascending order\n", progname);
     Rprintf( "Last region was [*, %d]; current is [%d, %d].\n", mark, f1, l1);
     Rprintf( "Aborted.\n");
      rcqp_receive_error(1);
    }
    mark = (reverse) ? l2 : l1;

    /* write alignment region to .alx file */
    if (reverse) {
      NwriteInt(f2, alx); NwriteInt(l2, alx);
      NwriteInt(f1, alx); NwriteInt(l1, alx);
    }
    else {
      NwriteInt(f1, alx); NwriteInt(l1, alx);
      NwriteInt(f2, alx); NwriteInt(l2, alx);
    }

    if (compatibility) {
      /* source and target regions of .alg file must be contiguous; store start points only; */
      /* hence we must collapse crossing alignments into one larger region (I know that's bullshit) */
      if ((f1 > current1) && (f2 > current2)) {
        if (reverse) {
          NwriteInt(f2, alg); NwriteInt(f1, alg);
        }
        else {
          NwriteInt(f1, alg); NwriteInt(f2, alg);
        }
        current1 = f1;
        current2 = f2;
      }
    }
  }
  if (compatibility) {
    if (reverse) {
      NwriteInt(size2, alg); NwriteInt(size1, alg); /* end of corpus alignment point*/
    }
    else {
      NwriteInt(size1, alg); NwriteInt(size2, alg); /* end of corpus alignment point*/
    }
  }

  if (verbose) {
   Rprintf("I skipped %d 0:1 alignments and %d 1:0 alignments.\n", n_0_1, n_1_0);
  }

  /* that's it; close file handles */
  fclose(alx);
  if (compatibility)
    fclose(alg);

  if (af_is_pipe)
    pclose(af);
  else
    fclose(af);

  return 0;
}
Example #7
0
/**
 * Initializes the path of an attribute Component.
 *
 * This function starts with the path it is passed, and then evaluates variables
 * in the form $UPPERCASE. The resulting path is assigned to the specified
 * entry in the component array for the given Attribute.
 *
 * Note that if it is called for a Component that does not yet exist, this function
 * creates the component by calling declare_component().
 *
 * @see declare_component
 * @see Component_Field_Specs
 * @param attribute            The Attribute object to work with.
 * @param cid                  The identifier of the Component to which the path is to
 *                             be added.
 * @param path                 The path to assign to the component. Can be NULL,
 *                             in which case, the default path from Component_Field_Specs
 *                             is used.
 * @return                     Pointer to this function's static buffer for creating the
 *                             path (NB: NOT to the path in the actual component! which
 *                             is a copy). If a path already exists, a pointer to that
 *                             path. NULL in case of error in Component_Field_Specs.
 */
char *
component_full_name(Attribute *attribute, ComponentID cid, char *path)
{
  component_field_spec *compspec;
  Component *component;
  
  static char buf[CL_MAX_LINE_LENGTH];
  char rname[CL_MAX_LINE_LENGTH];
  char *reference;
  char c;

  int ppos, bpos, dollar, rpos;


  /*  did we do the job before? */
  
  if ((component = attribute->any.components[cid]) != NULL &&
      (component->path != NULL))
    return component->path;

  /*  yet undeclared. So try to guess the name: */
  
  compspec = NULL;

  if (path == NULL) {
    if ((compspec = find_cid_id(cid)) == NULL) {
      fprintf(stderr, "attributes:component_full_name(): Warning:\n"
              "  can't find component table entry for Component #%d\n", cid);
      return NULL;
    }
    path = compspec->default_path;
  }

  /* index in string "path" */
  ppos = 0;
  /* index in string "buf" */
  bpos = 0;
  dollar = 0;
  rpos = 0;
  buf[bpos] = '\0';

  while ((c = path[ppos]) != '\0') {
  
    if (c == '$') {

      /*  reference to the name of another component. */

      dollar = ppos;            /* memorize the position of the $ */

      rpos = 0;
      c = path[++ppos];         /* first skip the '$' */
      while (isupper(c)) {
        rname[rpos++] = c;
        c = path[++ppos];
      }
      rname[rpos] = '\0';

      /* ppos now points to the first character after the reference 
       * rname holds the UPPERCASE name of the referenced component
       */

      reference = NULL;

      if (STREQ(rname, "HOME"))
        reference = getenv(rname);
      else if (STREQ(rname, "APATH"))
        reference = (attribute->any.path ? attribute->any.path 
                     : attribute->any.mother->path);
      else if (STREQ(rname, "ANAME"))
        reference = attribute->any.name;
      else if ((compspec = find_cid_name(rname)) != NULL)
        reference = component_full_name(attribute, compspec->id, NULL);
      
      if (reference == NULL) {
        fprintf(stderr, "attributes:component_full_name(): Warning:\n"
                "  Can't reference to the value of %s -- copying\n",
                rname);
        reference = rname;
      }

      for (rpos = 0; reference[rpos] != '\0'; rpos++) {
        buf[bpos] = reference[rpos];
        bpos++;
      }
    }
    else {
      /* just copy the character and scroll */
      buf[bpos] = c;
      bpos++;
      ppos++;
    }
  }
  buf[bpos] = '\0';

  if (component != NULL)
    component->path = (char *)cl_strdup(buf);
  else
    (void) declare_component(attribute, cid, buf);

  /*  and return it */
  return &buf[0];
  /* ?? why is buf returned instead of component->path, as earlier in the function? -- AH 16/9/09 */
}