示例#1
0
文件: storage.c 项目: cran/rcqp
/**
 * Allocates memory for a blob of the requested size.
 *
 * A block of memory holding nr_items of size item_size is created
 * in the specified MemBlob.
 *
 * @param blob        The MemBlob in which to place the memory.
 * @param nr_items    The number of items the MemBlob is to hold as data.
 * @param item_size   The size of one item.
 * @param clear_blob  boolean: if true, all bytes in the data space will be initialised to 0
 * @return            boolean: true 1 if OK, false on error
 */
int
alloc_mblob(MemBlob *blob, int nr_items, int item_size, int clear_blob)
{
  int size;
  
  assert((blob != NULL) && "alloc_mblob(): You can't pass a NULL blob!");
  assert((item_size >= 0) && "alloc_mblob(): item_size must be >= 0!");
  assert((nr_items > 0) && "alloc_mblob(): nr_items must be > 0!");

  blob->item_size = item_size;
  blob->nr_items = nr_items;
  if (item_size == SIZE_BIT) {
    size = nr_items / 8;
    if (size * 8 < nr_items) {
      size++;                        /* make room for 'extra' bits */
    }
  }
  else {
    size = nr_items * item_size;
  }
  blob->size = size;
  if (clear_blob) {
    blob->data = (int *) cl_calloc(size, 1);
  }
  else {
    blob->data = (int *) cl_malloc(size);
  }
  blob->allocation_method = MALLOCED;
  blob->writeable = 1;
  blob->changed = 0;
  blob->fname = NULL;
  blob->fsize = 0; 
  blob->offset = 0;

  return 1;
}
示例#2
0
文件: targets.c 项目: rforge/rcwb
int evaluate_target(CorpusList *corp,          /* the corpus */
                    FieldType t_id,            /* the field to set */
                    FieldType base,            /* where to start the search */
                    int inclusive,             /* including or excluding the base */
                    SearchStrategy strategy,   /* disambiguation rule: which item */
                    Constrainttree constr,     /* the constraint */
                    enum ctxtdir direction,    /* context direction */
                    int units,                       /* number of units */
                    char *attr_name)           /* name of unit */
{
  Attribute *attr;
  int *table;
  Context context;
  int i, line, lbound, rbound;
  int excl_start, excl_end;
  int nr_evals;
  int percentage, new_percentage; /* for ProgressBar */

  /* ------------------------------------------------------------ */

  assert(corp);

  /* consistency check */
  assert(t_id == TargetField || t_id == KeywordField || t_id == MatchField || t_id == MatchEndField);

  if (!constr) {
    cqpmessage(Error, "Constraing pattern missing in 'set target' command.");
    return 0;
  }

  if (corp->size <= 0) {
    cqpmessage(Error, "Corpus is empty.");
    return 0;
  }

  /*
   * check whether the base field specification is ok
   */
  switch(base) {
  case MatchField:
  case MatchEndField:
    if (corp->range == NULL) {
      cqpmessage(Error, "No ranges for start of search");
      return 0;
    }
    break;
  case TargetField:
    if (corp->targets == NULL) {
      cqpmessage(Error, "Can't start from base TARGET, none defined");
      return 0;
    }
    break;
  case KeywordField:
    if (corp->keywords == NULL) {
      cqpmessage(Error, "Can't start from base KEYWORD, none defined");
      return 0;
    }
    break;
  default:
    cqpmessage(Error, "Illegal base field (#%d) in 'set target' command.",
               base);
    return 0;
  }

  if (units <= 0) {
    cqpmessage(Error, "Invalid search space (%d units) in 'set target' command.", 
               units);
    return 0;
  }

  /* THIS SHOULD BE UNNECESSARY, BECAUSE THE GRAMMAR MAKES SURE THE SUBCORPUS EXISTS & IS LOADED */
  /*   if (!access_corpus(corp)) { */
  /*     cqpmessage(Error, "Can't access named query %s.", corp->name); */
  /*     return 0; */
  /*   } */

  context.size = units;
  context.direction = direction;

  if ((strcasecmp(attr_name, "word") == 0) ||
      (strcasecmp(attr_name, "words") == 0)) {
    attr = find_attribute(corp->corpus, DEFAULT_ATT_NAME, ATT_POS, NULL);
    context.type = word;
    context.attrib = NULL;
  }
  else {
    attr = find_attribute(corp->corpus, attr_name, ATT_STRUC, NULL);
    context.type = structure;
    context.attrib = attr;
  }

  if (attr == NULL) {
    cqpmessage(Error, "Can't find attribute %s.%s",
               corp->mother_name, attr_name);
    return 0;
  }

  if (progress_bar) {
    progress_bar_clear_line();
    progress_bar_message(1, 1, "    preparing");
  }


  table = (int *)cl_calloc(corp->size, sizeof(int));

  EvaluationIsRunning = 1;
  nr_evals = 0;
  percentage = -1;

  for (line = 0; line < corp->size && EvaluationIsRunning; line++) {

    if (progress_bar) {
      new_percentage = floor(0.5 + (100.0 * line) / corp->size);
      if (new_percentage > percentage) {
        percentage = new_percentage;
        progress_bar_percentage(0, 0, percentage);
      }
    }

    table[line] = -1;

    switch(base) {
    case MatchField:

      excl_start = corp->range[line].start;
      excl_end   = corp->range[line].end;

      if ((corp->range[line].start == corp->range[line].end) || inclusive) {

        if (calculate_ranges(corp,
                             corp->range[line].start, context,
                             &lbound, &rbound) == False) {

          Rprintf( "Can't compute boundaries for range #%d", line);
          lbound = rbound = -1;
        }
      }
      else {

        int dummy;

        if (calculate_ranges(corp,
                             corp->range[line].start, context,
                             &lbound, &dummy) == False) {

          Rprintf( "Can't compute left search space boundary match #%d", line);
          lbound = rbound = -1;
        }
        else if (calculate_ranges(corp,
                                  corp->range[line].end, context,
                                  &dummy, &rbound) == False) {

          Rprintf( "Can't compute right search space boundary match #%d", line);
          lbound = rbound = -1;
        }
      }
      break;

    case MatchEndField:
      excl_start = excl_end = corp->range[line].end;

      if (excl_start >= 0) {
        if (calculate_ranges(corp,
                             corp->range[line].end, context,
                             &lbound, &rbound) == False) {

          Rprintf( "Can't compute search space boundaries for match #%d", line);
          lbound = rbound = -1;
        }
      }
      else 
        lbound = rbound = -1;

      break;

    case TargetField:
      excl_start = excl_end = corp->targets[line];

      if (excl_start >= 0) {
        if (calculate_ranges(corp,
                             corp->targets[line], context,
                                  &lbound, &rbound) == False) {

          Rprintf( "Can't compute search space boundaries for match #%d", line);
          lbound = rbound = -1;
        }
      }
      else 
        lbound = rbound = -1;

      break;

    case KeywordField:
      excl_start = excl_end = corp->keywords[line];

      if (excl_start >= 0) {
        if (calculate_ranges(corp,
                             corp->keywords[line], context,
                             &lbound, &rbound) == False) {

          Rprintf( "Can't compute search space boundaries for match #%d", line);
          lbound = rbound = -1;
        }
      }
      else 
        lbound = rbound = -1;

      break;
    default:
      assert(0 && "Can't be");
      return 0;
    }

    if ((lbound >= 0) && (rbound >= 0)) {
      
      int dist, maxdist;

      if (direction == left) {
        rbound = excl_start;
        if (strategy == SearchNearest)
          strategy = SearchRightmost;
        else if (strategy == SearchFarthest)
          strategy = SearchLeftmost;
      }
      else if (direction == right) {
        lbound = excl_start;
        if (strategy == SearchNearest)
          strategy = SearchLeftmost;
        else if (strategy == SearchFarthest)
          strategy = SearchRightmost;
      }

      switch (strategy) {
      case SearchFarthest:

        maxdist = MAX(excl_start - lbound, rbound - excl_start);

        assert(maxdist >= 0);

        for (dist = maxdist; dist >= 0; dist--) {

          i = excl_start - dist;

          if (i >= lbound &&
              (inclusive || (i < excl_start)))
            if (eval_bool(constr, NULL, i)) {
              table[line] = i;
              break;
            }

          i = excl_start + dist;

          if (i <= rbound &&
              (inclusive || (i > excl_end)))
            if (eval_bool(constr, NULL, i)) {
              table[line] = i;
              break;
            }

          nr_evals++;
          if (nr_evals == 1000) {
            CheckForInterrupts();
            nr_evals = 0;
          }

        }
        break;

      case SearchNearest:

        maxdist = MAX(excl_start - lbound, rbound - excl_start);
        assert(maxdist >= 0);

        for (dist = 0; dist <= maxdist; dist++) {

          i = excl_start - dist;

          if (i >= lbound &&
              (inclusive || (i < excl_start)))
            if (eval_bool(constr, NULL, i)) {
              table[line] = i;
              break;
            }

          i = excl_start + dist;

          if (i <= rbound &&
              (inclusive || (i > excl_end)))
            if (eval_bool(constr, NULL, i)) {
              table[line] = i;
              break;
            }

          nr_evals++;
          if (nr_evals == 1000) {
            CheckForInterrupts();
            nr_evals = 0;
          }

        }
        break;

      case SearchLeftmost:
        for (i = lbound; i <= rbound; i++)
          if (inclusive || (i < excl_start) || (i > excl_end)) {
            if (eval_bool(constr, NULL, i)) {
              table[line] = i;
              break;
            }

            nr_evals++;
            if (nr_evals == 1000) {
              CheckForInterrupts();
              nr_evals = 0;
            }
          }
        break;

      case SearchRightmost:
        for (i = rbound; i >= lbound; i--)
          if (inclusive || (i < excl_start) || (i > excl_end)) {
            if (eval_bool(constr, NULL, i)) {
              table[line] = i;
              break;
            }

            nr_evals++;
            if (nr_evals == 1000) {
              CheckForInterrupts();
              nr_evals = 0;
            }
          }
        break;
      default:
        break;
      }
    }
  }

  if (progress_bar) 
    progress_bar_message(1, 1, "  cleaning up");

  switch (t_id) {
  case MatchField:
    for (i = 0; i < corp->size; i++) {
      if (table[i] >= 0) 
        corp->range[i].start = table[i];
      if (corp->range[i].start > corp->range[i].end)
        corp->range[i].start = corp->range[i].end;
    }
    cl_free(table);
    break;

  case MatchEndField:
    for (i = 0; i < corp->size; i++) {
      if (table[i] >= 0) 
        corp->range[i].end = table[i];
      if (corp->range[i].end < corp->range[i].start)
        corp->range[i].end = corp->range[i].start;
    }
    cl_free(table);
    break;

  case TargetField:
    cl_free(corp->targets);
    corp->targets = table;
    break;

  case KeywordField:
    cl_free(corp->keywords);
    corp->keywords = table;
    break;

  default:
    assert(0 && "Can't be");
    break;
  }

  if (progress_bar)
    progress_bar_clear_line();

  if ((t_id == MatchField) || (t_id == MatchEndField))
    RangeSort(corp, 0);                /* re-sort corpus if match regions were modified */

  touch_corpus(corp);  
  if (!EvaluationIsRunning) {
    cqpmessage(Warning, "Evaluation interruted: results may be incomplete.");
    if (which_app == cqp) install_signal_handler();
  }
  EvaluationIsRunning = 0;

  return 1;
}
示例#3
0
文件: cwb-makeall.c 项目: rforge/rcwb
/**
 * Validates the REVCORP component of the given attribute.
 *
 * This function validates a REVCORP (i.e. an uncompressed index).
 * It assumes that a lexicon, frequencies and (compressed or
 * uncompressed) token stream are available for CL access for the
 * given attribute.
 *
 * @param attr  The attribute whose REVCORP should be checked.
 * @return      True for all OK, false for a problem.
 */
int
validate_revcorp(Attribute *attr)
{

  Component *revcorp = ensure_component(attr, CompRevCorpus, 0);
  int *ptab;                        /* table of index offsets for each lexicon entry */
  int lexsize, corpsize;
  int i, offset, cpos, id;

  printf(" ? validating %s ... ", cid_name(CompRevCorpus));
  fflush(stdout);

  if (revcorp == NULL) {
    printf("FAILED (no data)\n");
    return 0;
  }
  lexsize = cl_max_id(attr);
  corpsize = cl_max_cpos(attr);
  if ((lexsize <= 0) || (corpsize <= 0)) {
    printf("FAILED (corpus access error)\n");
    return 0;
  }
  if (revcorp->size != corpsize) {
    printf("FAILED (wrong size)\n");
    return 0;
  }

  /* init offsets by calculating REVIDX component from token frequencies */
  ptab = (int *) cl_calloc(lexsize, sizeof(int));
  offset = 0;
  for (i = 0; i < lexsize; i++) {
    ptab[i] = offset;
    offset += cl_id2freq(attr, i);
  }

  /* now read token stream, check each token id against REVCORP, and increment its pointer */
  for (cpos = 0; cpos < corpsize; cpos++) {
    id = cl_cpos2id(attr, cpos);
    if ((id < 0) || (id >= lexsize)) {
      printf("FAILED (inconsistency in token stream)\n");
      cl_free(ptab);
      return 0;
    }
    if (ntohl(revcorp->data.data[ptab[id]]) != cpos) {
      printf("FAILED\n");
      cl_free(ptab);
      return 0;
    }
    ptab[id]++;
  }

  /* validate frequencies by comparing final offsets against those calculated from token frequencies */
  offset = 0;
  for (i = 0; i < lexsize; i++) {
    offset += cl_id2freq(attr, i);
    if (ptab[i] != offset) {
      printf("FAILED (token frequencies incorrect)\n");
      cl_free(ptab);
      return 0;
    }
  }

  cl_free(ptab);

  printf("OK\n");
  return 1;
}
示例#4
0
/**
 * Compresses the token stream of a p-attribute.
 *
 * Three files are created: the compressed token stream, the descriptor block,
 * and a sync file.
 *
 * @param attr  The attribute to compress.
 * @param hc    Location for the resulting Huffmann code descriptor block.
 * @param fname Base filename for the resulting files.
 */
int 
compute_code_lengths(Attribute *attr, HCD *hc, char *fname)
{
  int id, i, h;

  int nr_codes = 0;

  int *heap = NULL;
  unsigned *codelength = NULL;        /* was char[], probably to save space; but that's unnecessary and makes gcc complain */

  int issued_codes[MAXCODELEN];
  int next_code[MAXCODELEN];

  long sum_bits;


  Rprintf("COMPRESSING TOKEN STREAM of %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* I need the following components:
   * - CompCorpus
   * - CompCorpusFreqs
   * - CompLexicon
   * - CompLexiconIdx
   * and want to force the CL to use them rather than compressed data. 
   */

  {
    Component *comp;

    if ((comp = ensure_component(attr, CompCorpus, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the CORPUS component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexicon, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXION component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexiconIdx, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXIDX component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompCorpusFreqs, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the FREQS component.\n"
              "Run 'makeall -r %s -c FREQS %s %s' in order to create it.\n",
              corpus->registry_dir, corpus->registry_name, attr->any.name);
      rcqp_receive_error(1);
    }

  }

  /*
   * strongly follows Witten/Moffat/Bell: ``Managing Gigabytes'', 
   * pp. 335ff.
   */

  hc->size = cl_max_id(attr);                /* the size of the attribute (nr of items) */
  if ((hc->size <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_id() failed");
    rcqp_receive_error(1);
  }

  hc->length = cl_max_cpos(attr); /* the length of the attribute (nr of tokens) */
  if ((hc->length <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_cpos() failed");
    rcqp_receive_error(1);
  }

  hc->symbols = NULL;
  hc->min_codelen = 100;
  hc->max_codelen = 0;

  memset((char *)hc->lcount, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->min_code, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->symindex, '\0', MAXCODELEN * sizeof(int));

  memset((char *)issued_codes, '\0', MAXCODELEN * sizeof(int));

  codelength = (unsigned *)cl_calloc(hc->size, sizeof(unsigned));


  /* =========================================== make & initialize the heap */

  heap = (int *)cl_malloc(hc->size * 2 * sizeof(int));

  for (i = 0; i < hc->size; i++) {
    heap[i] = hc->size + i;
    heap[hc->size+i] = get_id_frequency(attr, i) + 1;
    /* add-one trick needed to avoid unsupported Huffman codes > 31 bits for very large corpora of ca. 2 billion words:
       theoretical optimal code length for hapax legomena in such corpora is ca. 31 bits, and the Huffman algorithm 
       sometimes generates 32-bit codes; with add-one trick, the theoretical optimal code length is always <= 30 bits */    
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0)
    fprintf(protocol, "Allocated heap with %d cells for %d items\n\n",
            hc->size * 2, hc->size);
  if (do_protocol > 2)
    print_heap(heap, hc->size, "After Initialization");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 1 */


  h = hc->size;

  /*
   * we address the heap in the following manner: when we start array
   * indices at 1, the left child is at 2i, and the right child is at
   * 2i+1. So we maintain this scheme and decrement just before
   * adressing the array. 
   */

  /*
   * construct the initial min-heap
   */

  for (i = hc->size/2; i > 0; i--) {

    /* do:
     * bottom up, left to right,
     * for each root of each subtree, sift if necessary
     */

    sift(heap, h, i);
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 2) {
    print_heap(heap, hc->size, "Initial Min-Heap");
    fprintf(protocol, "\n");
  }
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 2 */

  /* smallest item at top of heap now, remove the two smallest items
   * and sift, find second smallest by removing top and sifting, as
   * long as we have more than one root */



  while (h > 1) {
    
    int pos[2];

    for (i = 0; i < 2; i++) {

      /* remove topmost (i.e. smallest) item */

      pos[i] = heap[0];

      /* remove and sift, to reobtain heap integrity: move ``last''
       * item to top of heap and sift */

      heap[0] = heap[--h];
      
      sift(heap, h, 1);
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 3) {
      fprintf(protocol, "Removed     smallest item %d with freq %d\n",
              pos[0], heap[pos[0]]);
      fprintf(protocol, "Removed 2nd smallest item %d with freq %d\n",
              pos[1], heap[pos[1]]);
    }
    /* ============================== PROTOCOL ============================== */

    /*
     * pos[0] and pos[1] contain pointers to the two smallest items
     * now. since h was decremented twice, h and h+1 are now empty and
     * become the accumulated freq of pos[i]. The individual
     * frequencies are not needed any more, so pointers to h+1 (the
     * acc freq) are stored there instead (tricky, since freq cell
     * becomes pointer cell). So, what happens here, is to include a
     * new element in the heap. */

    heap[h] = h+1;
    heap[h+1] = heap[pos[0]] + heap[pos[1]]; /* accumulated freq */
    heap[pos[0]] = heap[pos[1]] = h+1; /* pointers! */
    h++;                        /* we put a new element into heap */

    /*
     * now, swap it up until we reobtain heap integrity
     */

    {
      register int parent, current;
      
      current = h;
      
      parent = current >> 1;

      while ((parent > 0) &&
             (heap[heap[parent-1]] > heap[heap[current-1]])) {

        int tmp;

        tmp = heap[parent-1];
        heap[parent-1] = heap[current-1];
        heap[current-1] = tmp;

        current = parent;
        parent = current >> 1;
      }
    }
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 3)
    fprintf(protocol, "\n");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 3 */

  /* compute the code lengths. We don't have any freqs in heap any
   * more, only pointers to parents */

  heap[0] = -1U;

  /* root has a depth of 0 */

  heap[1] = 0;

  /* we trust in what they say on p. 345 */

  for (i = 2; i < hc->size * 2; i++)
    heap[i] = heap[heap[i]]+1;


  /* collect the lengths */

  sum_bits = 0L;

  for (i = 0; i < hc->size; i++) {

    int cl = heap[i+hc->size];

    sum_bits += cl * get_id_frequency(attr, i);

    codelength[i] = cl;
    if (cl == 0)
      continue;

    if (cl > hc->max_codelen)
      hc->max_codelen = cl;

    if (cl < hc->min_codelen)
      hc->min_codelen = cl;

    hc->lcount[cl]++;
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0) {

    fprintf(protocol, "Minimal code length: %3d\n", hc->min_codelen);
    fprintf(protocol, "Maximal code length: %3d\n", hc->max_codelen);
    fprintf(protocol, "Compressed code len: %10ld bits, %10ld (+1) bytes\n\n\n",
            sum_bits, sum_bits/8);

  }
  /* ============================== PROTOCOL ============================== */

  if (hc->max_codelen >= MAXCODELEN) {
    Rprintf( "Error: Huffman codes too long (%d bits, current maximum is %d bits).\n", hc->max_codelen, MAXCODELEN-1);
    Rprintf( "       Please contact the CWB development team for assistance.\n");
    rcqp_receive_error(1);
  }

  if ((hc->max_codelen == 0) && (hc->min_codelen == 100)) {

    Rprintf( "Problem: No output generated -- no items?\n");
    nr_codes = 0;
  }
  else {

    hc->min_code[hc->max_codelen] = 0;
    
    for (i = hc->max_codelen-1; i > 0; i--)
      hc->min_code[i] = (hc->min_code[i+1] + hc->lcount[i+1]) >> 1;

    hc->symindex[hc->min_codelen] = 0;
    for (i = hc->min_codelen+1; i <= hc->max_codelen; i++)
      hc->symindex[i] = hc->symindex[i-1] + hc->lcount[i-1];


    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 0) {

      int sum_codes = 0;

      fprintf(protocol, " CL  #codes  MinCode   SymIdx\n");
      fprintf(protocol, "----------------------------------------\n");

      for (i = hc->min_codelen; i <= hc->max_codelen; i++) {
        sum_codes += hc->lcount[i];
        fprintf(protocol, "%3d %7d  %7d  %7d\n", 
                i, hc->lcount[i], hc->min_code[i], hc->symindex[i]);
      }

      fprintf(protocol, "----------------------------------------\n");
      fprintf(protocol, "    %7d\n", sum_codes);
    }
    /* ============================== PROTOCOL ============================== */


    for (i = 0; i < MAXCODELEN; i++)
      next_code[i] = hc->min_code[i];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "\n");
      fprintf(protocol, "   Item   f(item)  CL      Bits     Code, String\n");
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* compute and issue codes */
    
    hc->symbols = heap + hc->size;

    for (i = 0; i < hc->size; i++) {

      /* we store the code for item i in heap[i] */
      heap[i] = next_code[codelength[i]];
      next_code[codelength[i]]++;

      /* ============================== PROTOCOL ============================== */
      if (do_protocol > 1) {
        fprintf(protocol, "%7d  %7d  %3d  %10d ",
                i,
                get_id_frequency(attr, i),
                codelength[i],
                codelength[i] * get_id_frequency(attr, i));

        bprintf(heap[i], codelength[i], protocol);

        fprintf(protocol, "  %7d  %s\n",
                heap[i], get_string_of_id(attr, i));
      }
      /* ============================== PROTOCOL ============================== */

      /* and put the item itself in the second half of the table */
      heap[hc->size+hc->symindex[codelength[i]]+issued_codes[codelength[i]]] = i;
      issued_codes[codelength[i]]++;
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */


    /* The work itself -- encode the attribute data */

    {
      char *path;

      char hcd_path[CL_MAX_LINE_LENGTH];
      char huf_path[CL_MAX_LINE_LENGTH];
      char sync_path[CL_MAX_LINE_LENGTH];

      Component *corp;

      BFile bfd;
      FILE *sync;

      int cl, code, pos;

      corp = ensure_component(attr, CompCorpus, 0);
      assert(corp);

      if (fname) {
        path = fname;

        sprintf(hcd_path, "%s.hcd", path);
        sprintf(huf_path, "%s.huf", path);
        sprintf(sync_path, "%s.huf.syn", path);
      }
      else {
        path = component_full_name(attr, CompHuffSeq, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(huf_path, path);

        path = component_full_name(attr, CompHuffCodes, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(hcd_path, path);

        path = component_full_name(attr, CompHuffSync, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(sync_path, path);

      }

      Rprintf("- writing code descriptor block to %s\n",  hcd_path);
      if (!WriteHCD(hcd_path, hc)) {
        Rprintf( "ERROR: writing %s failed. Aborted.\n", hcd_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing compressed item sequence to %s\n", huf_path);

      if (!BFopen(huf_path, "w", &bfd)) {
        Rprintf( "ERROR: can't create file %s\n", huf_path);
        perror(huf_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing sync (every %d tokens) to %s\n", SYNCHRONIZATION, sync_path);

      if ((sync = fopen(sync_path, "w")) == NULL) {
        Rprintf( "ERROR: can't create file %s\n", sync_path);
        perror(sync_path);
        rcqp_receive_error(1);
      }

      for (i = 0; i < hc->length; i++) {

        /* SYNCHRONIZE */

        if ((i % SYNCHRONIZATION) == 0) {
          if (i > 0)
            BFflush(&bfd);
          pos = BFposition(&bfd);
          NwriteInt(pos, sync);
        }

        id = cl_cpos2id(attr, i);
        if ((id < 0) || (cderrno != CDA_OK)) {
          cdperror("(aborting) cl_cpos2id() failed");
          rcqp_receive_error(1);
        }

        else {

          assert((id >= 0) && (id < hc->size) && "Internal Error");

          cl = codelength[id];
          code = heap[id];

          if (!BFwriteWord((unsigned int)code, cl, &bfd)) {
            Rprintf( "Error writing code for ID %d (%d, %d bits) at position %d. Aborted.\n",
                    id, code, cl, i);
            rcqp_receive_error(1);
          }

        }

      }

      fclose(sync);
      BFclose(&bfd);
    }
  }

  free(codelength);
  free(heap);
 
  return 1;
}
示例#5
0
/****************************************************************************************
* Function name - heap_init
*
* Description - Performs initialization of an allocated heap.
*
* Input -       *h - pointer to an allocated heap
*               initial_heap_size -  initial size to start
*               increase_step -  size to increase heap, when deciding to do so
*               comparator -  user-function, that compares user-objects, kept by the heap
*               dumper -  user-function, that dumps user-objects, kept by the heap
*               nodes_prealloc -  number of hnodes to be pre-allocated at initialization
*
* Return Code/Output - On success - 0, on error -1
****************************************************************************************/
int heap_init (heap*const h,
               size_t initial_heap_size,
               size_t increase_step,
               heap_cmp_func comparator,
               node_dump_func dumper,
               size_t nodes_prealloc)
{
  size_t i = 0;
	
  if (!h)
    {
      fprintf(stderr, "%s -error: wrong input\n", __func__);
      return -1;
    }
	
  memset ((void*)h, 0, sizeof (*h));
  
  if (! (h->heap = calloc (initial_heap_size, sizeof (hnode*))) )
    {
      fprintf(stderr, "%s - error: alloc heap failed\n", __func__);
      return -1;
    }
	
  /* Alloc array of node-ids */
  if (! (h->ids_arr = calloc (initial_heap_size, sizeof (long))) )
    {
      fprintf(stderr, "%s - error: alloc of nodes-ids array failed\n", __func__);
      return -1;
    }
	
  /* Invalidate node-ids array indexes */
  for (i = 0; i < initial_heap_size; i++)
    {
      h->ids_arr[i] = -1; /* non-valid id is -1 */
    }
	
  h->max_heap_size = initial_heap_size;
  h->heap_increase_step = increase_step;
	
  if (0 == comparator)
    {
      fprintf(stderr, "%s - error: comparator function should be provided.\n", __func__);
      return -1;      
    }
  else
    {
      h->fcomp = comparator;
    }

  h->fndump = dumper; /* If zero, we do not dump nodes. */

  if (!(h->nodes_mpool = cl_calloc (1, sizeof (mpool))))
    {
      fprintf(stderr, "%s - error: mpool allocation failed\n", __func__);
      return -1;
    }
  else
    {
      if (mpool_init (h->nodes_mpool, sizeof (hnode), nodes_prealloc) == -1)
        {
          fprintf(stderr, "%s - error: mpool_init () -  failed\n",  __func__);
          return -1;
        }
    }
  return 0;
}