Esempio n. 1
0
/* 
 * ------------------------------------------------------------------------
 * 
 * "rcqpCmd_lexicon_size(SEXP inAttribute)" --
 * 
 * Returns the lexicon size of a positional attribute, i.e. the value
 * reported by cl_max_id() for the attribute named by inAttribute.
 * 
 * inAttribute must be a character vector of length 1; otherwise an R
 * error is raised. The name is resolved with cqi_lookup_attribute()
 * using ATT_POS.
 * 
 * Returns a one-element integer vector holding the size, or R_NilValue
 * when the attribute lookup fails. A negative size is reported through
 * rcqp_send_error().
 * 
 * Every exit path releases exactly the objects it protected, so the R
 * protection stack stays balanced even if rcqp_send_error() returns.
 * 
 * ------------------------------------------------------------------------
 */
SEXP rcqpCmd_lexicon_size(SEXP inAttribute)
{
	SEXP			result = R_NilValue;
	char *			a;
	Attribute *		attribute;
	int 			size;
	
	if (!isString(inAttribute) || length(inAttribute) != 1) error("argument 'attribute' must be a string");
	PROTECT(inAttribute);

	a = (char*)CHAR(STRING_ELT(inAttribute,0));
	attribute = cqi_lookup_attribute(a, ATT_POS);

	if (attribute == NULL) {
		/* unknown attribute: only inAttribute is protected at this point */
		UNPROTECT(1);
		return R_NilValue;
	}

	size = cl_max_id(attribute);
	if (size < 0) {
		/* release inAttribute before signalling the error */
		UNPROTECT(1);
		Rprintf("negative size");
		rcqp_send_error();
		/* only reached if rcqp_send_error() returns; nothing is left to unprotect */
		return R_NilValue;
	}

	result = PROTECT(allocVector(INTSXP, 1));
	INTEGER(result)[0] = size;

	/* inAttribute + result */
	UNPROTECT(2);
	
	return result;
}
Esempio n. 2
0
/*
 * Handles the CQI_CL_LEXICON_SIZE request: reads an attribute name with
 * cqi_read_string(), looks it up as a positional attribute and replies
 * with the result of cl_max_id(). A failed lookup is answered with
 * cqi_command(cqi_errno); a negative size goes through send_cl_error().
 * The name string is released with free() before returning.
 */
void
do_cqi_cl_lexicon_size(void)
{
  char *attr_name = cqi_read_string();
  Attribute *p_attr;

  if (server_debug)
    Rprintf( "CQi: CQI_CL_LEXICON_SIZE('%s')\n", attr_name);

  p_attr = cqi_lookup_attribute(attr_name, ATT_POS);
  if (p_attr == NULL) {
    /* cqi_errno has been set by the failed lookup */
    cqi_command(cqi_errno);
  }
  else {
    int n_types = cl_max_id(p_attr);

    if (n_types >= 0)
      cqi_data_int(n_types);
    else
      send_cl_error();
  }

  free(attr_name);
}
Esempio n. 3
0
/**
 * Writes a statistics summary for every attribute of a corpus.
 *
 * Output goes through Rprintf(), one line per attribute, followed by a
 * final empty line:
 *  - P-attribute: token count (cl_max_cpos) and type count (cl_max_id);
 *  - S-attribute: region count (cl_max_struc), flagged when
 *    cl_struc_values() reports annotations;
 *  - A-attribute: alignment block count (cl_max_alg), flagged when
 *    cl_has_extended_alignment() is true.
 * "NO DATA" is printed when the relevant count is unavailable.
 *
 * @param corpus  The corpus whose attribute list is walked.
 */
void 
describecorpus_show_statistics (Corpus *corpus)
{
  Attribute *att = corpus->attributes;

  while (att != NULL) {
    if (att->any.type == ATT_POS) {
      int n_tokens, n_types;

      Rprintf("p-ATT %-16s ", att->any.name);
      n_tokens = cl_max_cpos(att);
      n_types = cl_max_id(att);
      /* both counts must be strictly positive to be reported */
      if ((n_tokens <= 0) || (n_types <= 0))
        Rprintf("           NO DATA");
      else
        Rprintf("%10d tokens, %8d types", n_tokens, n_types);
    }
    else if (att->any.type == ATT_STRUC) {
      int n_regions;

      Rprintf("s-ATT %-16s ", att->any.name); 
      n_regions = cl_max_struc(att);
      if (n_regions < 0)
        Rprintf("           NO DATA");
      else {
        Rprintf("%10d regions", n_regions);
        if (cl_struc_values(att))
          Rprintf(" (with annotations)");
      }
    }
    else if (att->any.type == ATT_ALIGN) {
      int n_blocks;

      Rprintf("a-ATT %-16s ", att->any.name); 
      n_blocks = cl_max_alg(att);
      if (n_blocks < 0)
        Rprintf("           NO DATA");
      else {
        Rprintf("%10d alignment blocks", n_blocks);
        if (cl_has_extended_alignment(att))
          Rprintf(" (extended)");
      }
    }
    else {
      Rprintf("???   %-16s (unknown attribute type)", att->any.name); 
    }

    Rprintf("\n");
    att = att->any.next;
  }

  Rprintf("\n");
}
Esempio n. 4
0
/**
 * Validates the REVCORP component of the given attribute.
 *
 * This function validates a REVCORP (i.e. an uncompressed index).
 * It assumes that a lexicon, frequencies and (compressed or
 * uncompressed) token stream are available for CL access for the
 * given attribute.
 *
 * The check rebuilds the per-type start offsets from the token
 * frequencies, then walks the token stream and verifies that the next
 * unread REVCORP entry of each token's type equals the current corpus
 * position. Finally the advanced offsets are compared against the
 * cumulative frequencies. Progress and the verdict are printed to stdout.
 *
 * Offsets derived from the frequency data are range-checked before they
 * are used to index the REVCORP data, so inconsistent frequencies are
 * reported as a failure instead of causing an out-of-bounds read.
 *
 * @param attr  The attribute whose REVCORP should be checked.
 * @return      True for all OK, false for a problem.
 */
int
validate_revcorp(Attribute *attr)
{

  Component *revcorp = ensure_component(attr, CompRevCorpus, 0);
  int *ptab;                        /* table of index offsets for each lexicon entry */
  int lexsize, corpsize;
  int i, offset, cpos, id;

  printf(" ? validating %s ... ", cid_name(CompRevCorpus));
  fflush(stdout);

  if (revcorp == NULL) {
    printf("FAILED (no data)\n");
    return 0;
  }
  lexsize = cl_max_id(attr);
  corpsize = cl_max_cpos(attr);
  if ((lexsize <= 0) || (corpsize <= 0)) {
    printf("FAILED (corpus access error)\n");
    return 0;
  }
  if (revcorp->size != corpsize) {
    printf("FAILED (wrong size)\n");
    return 0;
  }

  /* init offsets by calculating REVIDX component from token frequencies */
  ptab = (int *) cl_calloc(lexsize, sizeof(int));
  offset = 0;
  for (i = 0; i < lexsize; i++) {
    ptab[i] = offset;
    offset += cl_id2freq(attr, i);
  }

  /* now read token stream, check each token id against REVCORP, and increment its pointer */
  for (cpos = 0; cpos < corpsize; cpos++) {
    id = cl_cpos2id(attr, cpos);
    if ((id < 0) || (id >= lexsize)) {
      printf("FAILED (inconsistency in token stream)\n");
      cl_free(ptab);
      return 0;
    }
    /* revcorp->size == corpsize was verified above, so valid indices are
     * 0 .. corpsize-1; frequencies that disagree with the token stream
     * could push the offset outside that range */
    if ((ptab[id] < 0) || (ptab[id] >= corpsize)) {
      printf("FAILED (index offset out of range)\n");
      cl_free(ptab);
      return 0;
    }
    /* REVCORP entries are stored in network byte order */
    if (ntohl(revcorp->data.data[ptab[id]]) != cpos) {
      printf("FAILED\n");
      cl_free(ptab);
      return 0;
    }
    ptab[id]++;
  }

  /* validate frequencies by comparing final offsets against those calculated from token frequencies */
  offset = 0;
  for (i = 0; i < lexsize; i++) {
    offset += cl_id2freq(attr, i);
    if (ptab[i] != offset) {
      printf("FAILED (token frequencies incorrect)\n");
      cl_free(ptab);
      return 0;
    }
  }

  cl_free(ptab);

  printf("OK\n");
  return 1;
}
Esempio n. 5
0
/**
 * Compresses the token stream of a p-attribute.
 *
 * Computes Huffman code lengths and codes for all lexicon entries of the
 * attribute (heap-based construction following Witten/Moffat/Bell,
 * "Managing Gigabytes") and, when at least one code was assigned, writes
 * three files: the code descriptor block (WriteHCD), the compressed
 * token stream (BFile) and a sync file holding the bit-file position at
 * every SYNCHRONIZATION-th token.
 *
 * Errors are reported via Rprintf()/cdperror() followed by
 * rcqp_receive_error(1).
 * NOTE(review): the code after each rcqp_receive_error(1) call is written
 * as if that call does not return -- confirm.
 *
 * @param attr  The attribute to compress.
 * @param hc    Location for the resulting Huffman code descriptor block.
 *              NOTE(review): hc->symbols is set to point into the local
 *              heap array, which is freed before returning, so the
 *              pointer looks dangling for the caller -- confirm that no
 *              caller reads hc->symbols afterwards.
 * @param fname Base filename for the resulting files; ".hcd", ".huf" and
 *              ".huf.syn" are appended. If NULL, the file names are taken
 *              from component_full_name() for the attribute.
 * @return      Always 1 when the end of the function is reached.
 */
int 
compute_code_lengths(Attribute *attr, HCD *hc, char *fname)
{
  int id, i, h;

  int nr_codes = 0;

  int *heap = NULL;
  unsigned *codelength = NULL;        /* was char[], probably to save space; but that's unnecessary and makes gcc complain */

  /* per code length: number of codes issued so far / next code value to issue */
  int issued_codes[MAXCODELEN];
  int next_code[MAXCODELEN];

  long sum_bits;


  Rprintf("COMPRESSING TOKEN STREAM of %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* I need the following components:
   * - CompCorpus
   * - CompCorpusFreqs
   * - CompLexicon
   * - CompLexiconIdx
   * and want to force the CL to use them rather than compressed data.
   */

  {
    /* comp is only used for the NULL checks below */
    Component *comp;

    if ((comp = ensure_component(attr, CompCorpus, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the CORPUS component\n");
      rcqp_receive_error(1);
    }

    /* NOTE(review): the message below reads "LEXION", presumably a typo for LEXICON */
    if ((comp = ensure_component(attr, CompLexicon, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXION component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexiconIdx, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXIDX component\n");
      rcqp_receive_error(1);
    }

    /* NOTE(review): `corpus` is not declared in this function; presumably a file-level global -- confirm */
    if ((comp = ensure_component(attr, CompCorpusFreqs, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the FREQS component.\n"
              "Run 'makeall -r %s -c FREQS %s %s' in order to create it.\n",
              corpus->registry_dir, corpus->registry_name, attr->any.name);
      rcqp_receive_error(1);
    }

  }

  /*
   * strongly follows Witten/Moffat/Bell: ``Managing Gigabytes'',
   * pp. 335ff.
   */

  hc->size = cl_max_id(attr);                /* the size of the attribute (nr of items) */
  if ((hc->size <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_id() failed");
    rcqp_receive_error(1);
  }

  hc->length = cl_max_cpos(attr); /* the length of the attribute (nr of tokens) */
  if ((hc->length <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_cpos() failed");
    rcqp_receive_error(1);
  }

  /* min_codelen == 100 together with max_codelen == 0 is the "no code assigned" marker tested further down */
  hc->symbols = NULL;
  hc->min_codelen = 100;
  hc->max_codelen = 0;

  memset((char *)hc->lcount, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->min_code, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->symindex, '\0', MAXCODELEN * sizeof(int));

  memset((char *)issued_codes, '\0', MAXCODELEN * sizeof(int));

  codelength = (unsigned *)cl_calloc(hc->size, sizeof(unsigned));


  /* =========================================== make & initialize the heap */

  /* layout: cells [0, size) hold indices of the frequency cells, cells [size, 2*size) hold the frequencies */
  heap = (int *)cl_malloc(hc->size * 2 * sizeof(int));

  for (i = 0; i < hc->size; i++) {
    heap[i] = hc->size + i;
    heap[hc->size+i] = get_id_frequency(attr, i) + 1;
    /* add-one trick needed to avoid unsupported Huffman codes > 31 bits for very large corpora of ca. 2 billion words:
       theoretical optimal code length for hapax legomena in such corpora is ca. 31 bits, and the Huffman algorithm 
       sometimes generates 32-bit codes; with add-one trick, the theoretical optimal code length is always <= 30 bits */    
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0)
    fprintf(protocol, "Allocated heap with %d cells for %d items\n\n",
            hc->size * 2, hc->size);
  if (do_protocol > 2)
    print_heap(heap, hc->size, "After Initialization");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 1 */


  /* h is the current number of entries in the heap part of the array */
  h = hc->size;

  /*
   * we address the heap in the following manner: when we start array
   * indices at 1, the left child is at 2i, and the right child is at
   * 2i+1. So we maintain this scheme and decrement just before
   * addressing the array.
   */

  /*
   * construct the initial min-heap
   */

  for (i = hc->size/2; i > 0; i--) {

    /* do:
     * bottom up, left to right,
     * for each root of each subtree, sift if necessary
     */

    sift(heap, h, i);
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 2) {
    print_heap(heap, hc->size, "Initial Min-Heap");
    fprintf(protocol, "\n");
  }
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 2 */

  /* smallest item at top of heap now, remove the two smallest items
   * and sift, find second smallest by removing top and sifting, as
   * long as we have more than one root */



  while (h > 1) {
    
    /* indices of the cells belonging to the two smallest items */
    int pos[2];

    for (i = 0; i < 2; i++) {

      /* remove topmost (i.e. smallest) item */

      pos[i] = heap[0];

      /* remove and sift, to reobtain heap integrity: move ``last''
       * item to top of heap and sift */

      heap[0] = heap[--h];
      
      sift(heap, h, 1);
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 3) {
      fprintf(protocol, "Removed     smallest item %d with freq %d\n",
              pos[0], heap[pos[0]]);
      fprintf(protocol, "Removed 2nd smallest item %d with freq %d\n",
              pos[1], heap[pos[1]]);
    }
    /* ============================== PROTOCOL ============================== */

    /*
     * pos[0] and pos[1] contain pointers to the two smallest items
     * now. since h was decremented twice, h and h+1 are now empty and
     * become the accumulated freq of pos[i]. The individual
     * frequencies are not needed any more, so pointers to h+1 (the
     * acc freq) are stored there instead (tricky, since freq cell
     * becomes pointer cell). So, what happens here, is to include a
     * new element in the heap. */

    heap[h] = h+1;
    heap[h+1] = heap[pos[0]] + heap[pos[1]]; /* accumulated freq */
    heap[pos[0]] = heap[pos[1]] = h+1; /* pointers! */
    h++;                        /* we put a new element into heap */

    /*
     * now, swap it up until we reobtain heap integrity
     */

    {
      /* 1-based heap positions, hence the -1 when indexing the array */
      register int parent, current;
      
      current = h;
      
      parent = current >> 1;

      while ((parent > 0) &&
             (heap[heap[parent-1]] > heap[heap[current-1]])) {

        int tmp;

        tmp = heap[parent-1];
        heap[parent-1] = heap[current-1];
        heap[current-1] = tmp;

        current = parent;
        parent = current >> 1;
      }
    }
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 3)
    fprintf(protocol, "\n");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 3 */

  /* compute the code lengths. We don't have any freqs in heap any
   * more, only pointers to parents */

  heap[0] = -1U;

  /* root has a depth of 0 */

  heap[1] = 0;

  /* we trust in what they say on p. 345 */

  /* each cell becomes the depth of its parent cell plus one */
  for (i = 2; i < hc->size * 2; i++)
    heap[i] = heap[heap[i]]+1;


  /* collect the lengths */

  sum_bits = 0L;

  for (i = 0; i < hc->size; i++) {

    int cl = heap[i+hc->size];

    /* NOTE(review): the product is computed in int before being added to the long sum -- possible overflow for large corpora */
    sum_bits += cl * get_id_frequency(attr, i);

    codelength[i] = cl;
    if (cl == 0)
      continue;

    if (cl > hc->max_codelen)
      hc->max_codelen = cl;

    if (cl < hc->min_codelen)
      hc->min_codelen = cl;

    /* NOTE(review): lcount is indexed here before max_codelen is checked against MAXCODELEN below;
     * lcount is cleared with MAXCODELEN ints above, so cl >= MAXCODELEN would presumably write out of bounds -- confirm */
    hc->lcount[cl]++;
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0) {

    fprintf(protocol, "Minimal code length: %3d\n", hc->min_codelen);
    fprintf(protocol, "Maximal code length: %3d\n", hc->max_codelen);
    fprintf(protocol, "Compressed code len: %10ld bits, %10ld (+1) bytes\n\n\n",
            sum_bits, sum_bits/8);

  }
  /* ============================== PROTOCOL ============================== */

  if (hc->max_codelen >= MAXCODELEN) {
    Rprintf( "Error: Huffman codes too long (%d bits, current maximum is %d bits).\n", hc->max_codelen, MAXCODELEN-1);
    Rprintf( "       Please contact the CWB development team for assistance.\n");
    rcqp_receive_error(1);
  }

  /* both markers still at their initial values: no item received a non-zero code length */
  if ((hc->max_codelen == 0) && (hc->min_codelen == 100)) {

    Rprintf( "Problem: No output generated -- no items?\n");
    nr_codes = 0;
  }
  else {

    /* smallest code value for each code length, computed from the longest length downwards */
    hc->min_code[hc->max_codelen] = 0;
    
    for (i = hc->max_codelen-1; i > 0; i--)
      hc->min_code[i] = (hc->min_code[i+1] + hc->lcount[i+1]) >> 1;

    /* symindex[l]: offset of the first symbol with code length l in the symbol table */
    hc->symindex[hc->min_codelen] = 0;
    for (i = hc->min_codelen+1; i <= hc->max_codelen; i++)
      hc->symindex[i] = hc->symindex[i-1] + hc->lcount[i-1];


    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 0) {

      int sum_codes = 0;

      fprintf(protocol, " CL  #codes  MinCode   SymIdx\n");
      fprintf(protocol, "----------------------------------------\n");

      for (i = hc->min_codelen; i <= hc->max_codelen; i++) {
        sum_codes += hc->lcount[i];
        fprintf(protocol, "%3d %7d  %7d  %7d\n", 
                i, hc->lcount[i], hc->min_code[i], hc->symindex[i]);
      }

      fprintf(protocol, "----------------------------------------\n");
      fprintf(protocol, "    %7d\n", sum_codes);
    }
    /* ============================== PROTOCOL ============================== */


    for (i = 0; i < MAXCODELEN; i++)
      next_code[i] = hc->min_code[i];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "\n");
      fprintf(protocol, "   Item   f(item)  CL      Bits     Code, String\n");
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* compute and issue codes */
    
    /* the symbol table reuses the second half of the heap array */
    hc->symbols = heap + hc->size;

    for (i = 0; i < hc->size; i++) {

      /* we store the code for item i in heap[i] */
      heap[i] = next_code[codelength[i]];
      next_code[codelength[i]]++;

      /* ============================== PROTOCOL ============================== */
      if (do_protocol > 1) {
        fprintf(protocol, "%7d  %7d  %3d  %10d ",
                i,
                get_id_frequency(attr, i),
                codelength[i],
                codelength[i] * get_id_frequency(attr, i));

        bprintf(heap[i], codelength[i], protocol);

        fprintf(protocol, "  %7d  %s\n",
                heap[i], get_string_of_id(attr, i));
      }
      /* ============================== PROTOCOL ============================== */

      /* and put the item itself in the second half of the table */
      heap[hc->size+hc->symindex[codelength[i]]+issued_codes[codelength[i]]] = i;
      issued_codes[codelength[i]]++;
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */


    /* The work itself -- encode the attribute data */

    {
      char *path;

      char hcd_path[CL_MAX_LINE_LENGTH];
      char huf_path[CL_MAX_LINE_LENGTH];
      char sync_path[CL_MAX_LINE_LENGTH];

      Component *corp;

      BFile bfd;
      FILE *sync;

      int cl, code, pos;

      corp = ensure_component(attr, CompCorpus, 0);
      assert(corp);

      /* NOTE(review): sprintf()/strcpy() below write into fixed-size buffers without a length check */
      if (fname) {
        path = fname;

        sprintf(hcd_path, "%s.hcd", path);
        sprintf(huf_path, "%s.huf", path);
        sprintf(sync_path, "%s.huf.syn", path);
      }
      else {
        path = component_full_name(attr, CompHuffSeq, NULL);
        assert(path); /* additional condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(huf_path, path);

        path = component_full_name(attr, CompHuffCodes, NULL);
        assert(path); /* additional condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(hcd_path, path);

        path = component_full_name(attr, CompHuffSync, NULL);
        assert(path); /* additional condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(sync_path, path);

      }

      Rprintf("- writing code descriptor block to %s\n",  hcd_path);
      if (!WriteHCD(hcd_path, hc)) {
        Rprintf( "ERROR: writing %s failed. Aborted.\n", hcd_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing compressed item sequence to %s\n", huf_path);

      if (!BFopen(huf_path, "w", &bfd)) {
        Rprintf( "ERROR: can't create file %s\n", huf_path);
        perror(huf_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing sync (every %d tokens) to %s\n", SYNCHRONIZATION, sync_path);

      if ((sync = fopen(sync_path, "w")) == NULL) {
        Rprintf( "ERROR: can't create file %s\n", sync_path);
        perror(sync_path);
        rcqp_receive_error(1);
      }

      /* encode the token stream position by position */
      for (i = 0; i < hc->length; i++) {

        /* SYNCHRONIZE */

        /* every SYNCHRONIZATION tokens: flush the bit file (except at position 0) and record its position in the sync file */
        if ((i % SYNCHRONIZATION) == 0) {
          if (i > 0)
            BFflush(&bfd);
          pos = BFposition(&bfd);
          NwriteInt(pos, sync);
        }

        id = cl_cpos2id(attr, i);
        if ((id < 0) || (cderrno != CDA_OK)) {
          cdperror("(aborting) cl_cpos2id() failed");
          rcqp_receive_error(1);
        }

        else {

          assert((id >= 0) && (id < hc->size) && "Internal Error");

          /* code length from codelength[], code value from the first half of heap[] */
          cl = codelength[id];
          code = heap[id];

          if (!BFwriteWord((unsigned int)code, cl, &bfd)) {
            Rprintf( "Error writing code for ID %d (%d, %d bits) at position %d. Aborted.\n",
                    id, code, cl, i);
            rcqp_receive_error(1);
          }

        }

      }

      fclose(sync);
      BFclose(&bfd);
    }
  }

  /* NOTE(review): both buffers were obtained from cl_calloc()/cl_malloc() but are released with plain free();
   * hc->symbols may still point into heap at this point (see above) */
  free(codelength);
  free(heap);
 
  return 1;
}
Esempio n. 6
0
/**
 * Checks a compressed reversed index for errors by decompressing it.
 *
 * This function assumes that compress_reversed_index() has been called
 * beforehand and made sure that the _uncompressed_ index is used by CL
 * access functions.
 *
 * For every lexicon entry the Golomb-coded gaps are read back from the
 * compressed data file, accumulated into corpus positions and compared
 * against the positions delivered by a CL position stream. Any mismatch
 * or access error is reported and followed by compressrdx_cleanup(1).
 * The index (offsets) file is only opened and closed here; its contents
 * are not read.
 *
 * File names are built with bounded writes; a name that does not fit
 * into CL_MAX_FILENAME_LENGTH is reported as an error instead of
 * overflowing the buffer.
 *
 * @param attr      The attribute to check the index of.
 * @param output_fn Base name for the compressed RDX files to be read
 *                  (if this is null, filenames will be taken from the
 *                  attribute).
 */
void 
decompress_check_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];
  int n_data, n_index;          /* snprintf() results for the two file names */
  
  int nr_elements;
  int element_freq;
  int corpus_size;
  int pos, gap;

  int b;
  int i, k;

  BFile data_file;
  FILE *index_file;

  PositionStream PStream;
  int true_pos;


  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  if (output_fn) {
    n_data = snprintf(data_fname, sizeof(data_fname), "%s.crc", output_fn);
    n_index = snprintf(index_fname, sizeof(index_fname), "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    n_data = snprintf(data_fname, sizeof(data_fname), "%s", s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    n_index = snprintf(index_fname, sizeof(index_fname), "%s", s);
  }

  /* snprintf() returns the untruncated length; anything >= buffer size was cut off */
  if ((n_data < 0) || ((size_t) n_data >= sizeof(data_fname)) ||
      (n_index < 0) || ((size_t) n_index >= sizeof(index_fname))) {
    Rprintf( "ERROR: filename too long for compressed index files\n");
    compressrdx_cleanup(1);
  }
  
  if (! BFopen(data_fname, "r", &data_file)) {
    Rprintf( "ERROR: can't open file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index from %s\n", data_fname);
  
  /* binary mode, matching the "wb" used when the file is written */
  if ((index_file = fopen(index_fname, "rb")) == NULL) {
    Rprintf( "ERROR: can't open file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index offsets from %s\n", index_fname);


  for (i = 0; i < nr_elements; i++) {

    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    /* Golomb parameter for this entry, derived from its frequency and the corpus size */
    b = compute_ba(element_freq, corpus_size);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);

    /* positions are stored as gaps; accumulate them starting from 0 */
    pos = 0;
    for (k = 0; k < element_freq; k++) {

      gap = read_golomb_code_bf(b, &data_file);
      pos += gap;

      if (1 != cl_read_stream(PStream, &true_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }
      if (pos != true_pos) {
        Rprintf( "ERROR: wrong occurrence of token #%d at cpos %d (correct cpos: %d). Aborted.\n",
              i, pos, true_pos);
        compressrdx_cleanup(1);
      }

    }
    
    cl_delete_stream(&PStream);
    BFflush(&data_file);
  }

  fclose(index_file);
  BFclose(&data_file);

  /* tell the user it's safe to delete the REVCORP and REVCIDX components now */
  Rprintf("!! You can delete the file <%s> now.\n",
         component_full_name(attr, CompRevCorpus, NULL));
  Rprintf("!! You can delete the file <%s> now.\n",
         component_full_name(attr, CompRevCorpusIdx, NULL));
  
  return;
}
Esempio n. 7
0
/**
 * Compresses the reversed index of a p-attribute.
 *
 * For every lexicon entry the corpus positions are read from a CL
 * position stream and written to the data file as Golomb-coded gaps
 * between consecutive positions. Before each entry, the current position
 * of the data file (BFposition) is written to the index file, and the
 * data file is flushed after each entry.
 *
 * The REVCORP and REVCIDX components are loaded first so that the CL
 * reads the uncompressed index while the compressed one is being built.
 * Errors are reported and followed by compressrdx_cleanup(1).
 *
 * File names are built with bounded writes; a name that does not fit
 * into CL_MAX_FILENAME_LENGTH is reported as an error instead of
 * overflowing the buffer.
 *
 * @param attr      The attribute to compress the index of.
 * @param output_fn Base name for the compressed RDX files to be written
 *                  (if this is null, filenames will be taken from the
 *                  attribute).
 */
void 
compress_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];
  int n_data, n_index;          /* snprintf() results for the two file names */
  
  int nr_elements;
  int element_freq;
  int corpus_size;
  int last_pos, gap, fpos;

  int b;

  int i, k;

  BFile data_file;
  FILE *index_file = NULL;

  PositionStream PStream;
  int new_pos;


  Rprintf("COMPRESSING INDEX of %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  /* ensure that we do NOT use the compressed index while building the
   * compressed index (yeah, a nasty thing that). That is, load the
   * .corpus.rev and .corpus.rdx components in order to force
   * subsequent CL calls to use the uncompressed data.
   */

  if (ensure_component(attr, CompRevCorpus, 0) == NULL) {
    Rprintf( "Index compression requires the REVCORP component\n");
    compressrdx_cleanup(1);
  }

  if (ensure_component(attr, CompRevCorpusIdx, 0) == NULL) {
    Rprintf( "Index compression requires the REVCIDX component\n");
    compressrdx_cleanup(1);
  }

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  if (output_fn) {
    n_data = snprintf(data_fname, sizeof(data_fname), "%s.crc", output_fn);
    n_index = snprintf(index_fname, sizeof(index_fname), "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    n_data = snprintf(data_fname, sizeof(data_fname), "%s", s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    n_index = snprintf(index_fname, sizeof(index_fname), "%s", s);
  }

  /* snprintf() returns the untruncated length; anything >= buffer size was cut off */
  if ((n_data < 0) || ((size_t) n_data >= sizeof(data_fname)) ||
      (n_index < 0) || ((size_t) n_index >= sizeof(index_fname))) {
    Rprintf( "ERROR: filename too long for compressed index files\n");
    compressrdx_cleanup(1);
  }
  
  if (! BFopen(data_fname, "w", &data_file)) {
    Rprintf( "ERROR: can't create file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index to %s\n", data_fname);
  
  if ((index_file = fopen(index_fname, "wb")) == NULL) {
    Rprintf( "ERROR: can't create file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index offsets to %s\n", index_fname);

  for (i = 0; i < nr_elements; i++) {
    
    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }
    
    /* Golomb parameter for this entry, derived from its frequency and the corpus size */
    b = compute_ba(element_freq, corpus_size);
    
    /* record where this entry's codes start in the data file */
    fpos = BFposition(&data_file);
    NwriteInt(fpos, index_file);
    
    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);
    
    /* positions are stored as gaps relative to the previous one, starting from 0 */
    last_pos = 0;
    for (k = 0; k < element_freq; k++) {
      if (1 != cl_read_stream(PStream, &new_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }
      
      gap = new_pos - last_pos;
      last_pos = new_pos;
      
      if (debug_cwb_compress_rdx)
        fprintf(debug_output, "%8d:  gap=%4d, b=%4d\n", codepos, gap, b);
      
      write_golomb_code(gap, b, &data_file);
      /* codepos is declared outside this function; it counts the codes written */
      codepos++;
    }
    
    cl_delete_stream(&PStream);
    BFflush(&data_file);
  }
    
  fclose(index_file);
  BFclose(&data_file);

  return;
}
Esempio n. 8
0
/**
 * Creates feature maps for a source/target corpus pair.
 *
 * Example usage:
 *
 * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s);
 *
 * @param config              pointer to a list of strings representing the feature map configuration.
 * @param config_lines        the number of configuration items stored in config_data.
 * @param w_attr1             The p-attribute in the first corpus to link.
 * @param w_attr2             The p-attribute in the second corpus to link.
 * @param s_attr1             The s-attribute in the first corpus to link.
 * @param s_attr2             The s-attribute in the second corpus to link.
 * @return                    the new FMS object.
 */
FMS
create_feature_maps(char **config,
                    int config_lines,
                    Attribute *w_attr1,
                    Attribute *w_attr2,
                    Attribute *s_attr1,
                    Attribute *s_attr2
                    ) 
{
  FMS r;
  unsigned int *fcount1, *fcount2;
  int config_pointer;
  char *b, command[200], dummy[200];
  int current_feature,
      weight,
    need_to_abort;
  int *fs1, *fs2; 
  int i,nw1,nw2;


  r = (FMS) malloc(sizeof(feature_maps_t));
  assert(r);

  r->att1 = w_attr1;
  r->att2 = w_attr2;
  r->s1 = s_attr1;
  r->s2 = s_attr2;

  init_char_map();
  
  nw1= cl_max_id(w_attr1);
  if (nw1 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of source corpus\n");
    exit(1);
  }
  nw2= cl_max_id(w_attr2);
  if (nw2 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of target corpus\n");
    exit(1);
  }
  
  printf("LEXICON SIZE: %d / %d\n", nw1, nw2);

  fcount1 = (unsigned int*) calloc(nw1+1,sizeof(unsigned int));
  fcount2 = (unsigned int*) calloc(nw2+1,sizeof(unsigned int));

  r->n_features=1;

  /* process feature map configuration: first pass */
  for (config_pointer = 0; config_pointer < config_lines; config_pointer++) {

    if ( (b = strpbrk(config[config_pointer],"\n#")) )  /* strip newline and comments */
      *b=0;
    if (sscanf(config[config_pointer],"%s",command)>0) {
      if(command[0]=='-') {
        switch(command[1]) {
        case 'S': {
          int i1, i2, f1, f2;
          float threshold;
          int n_shared=0;
            
          if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy)!=3) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -S:<weight>:<threshold>\n");
            fprintf(stderr,"  Shared words with freq. ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n");
            exit(1);
          }
          else {
            printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight);
            fflush(stdout);
            for (i1=0; i1 < nw1; i1++) {
              f1 = cl_id2freq(w_attr1, i1);
              i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1));
              if (i2 >= 0){
                f2 = cl_id2freq(w_attr2, i2);
                if (f1 / (0.0+f1+f2) >=threshold && f2 / (0.0+f1+f2) >= threshold){
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_shared++;
                  r->n_features++; 
                }
              }
            }
            printf("[%d]\n",n_shared);
          }
          break;
        }
        case '1': 
        case '2':
        case '3':
        case '4': { 
          int n;
          
          if (sscanf(config[config_pointer],"%1s%d:%d %s",command,&n,&weight,dummy)!=3) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -<n>:<weight>  (n = 1..4)\n");
            fprintf(stderr,"  Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n");
            exit(1);
          }
          else if(n <= 0 || n>4) {
            /* this shouldn't happen anyway */
            fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n",n,config[config_pointer]);
            exit(1);
          }
          else {
            int i,f,l;

            printf("FEATURE: %d-grams, weight=%d ... ", n, weight);
            fflush(stdout);

            for(i=0; i<nw1; i++) {
              l = cl_id2strlen(w_attr1, i);
              fcount1[i] += (l >= n) ? l - n + 1 : 0;
            }
            for(i=0; i<nw2; i++) {
              l = cl_id2strlen(w_attr2, i);
              fcount2[i] += (l >= n) ? l - n + 1 : 0;
            }
            f=1;
            for(i=0;i<n;i++)
              f*=char_map_range;
            r->n_features+=f;           
            printf("[%d]\n", f);
          }
          break;
        }
        case 'W': {
          char filename[200],
            word1[200],
            word2[200];
          FILE *wordlist;
          int nw,nl=0,i1,i2,n_matched=0;

          if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -W:<weight>:<filename>\n");
            fprintf(stderr, "  Word list (read from file <filename>).\n");
            exit(1);
          }
          else if(!(wordlist=fopen(filename,"r"))) {
            fprintf(stderr,"ERROR: Cannot read word list file %s.\n",
                    filename);
            exit(-1);
          }
          else {
            printf("FEATURE: word list %s, weight=%d ... ", filename, weight);
            fflush(stdout);
            while((nw=fscanf(wordlist,"%s %s",word1,word2))>0) {
              nl++;
              if (nw!=2) fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n",
                                 nl,filename,nw);
              else {
                if((i1=cl_str2id(w_attr1,word1))>=0
                   && (i2=cl_str2id(w_attr2,word2)) >=0) {
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_matched++;
                  r->n_features++;
                }
              }
            }
            fclose(wordlist);
            printf("[%d]\n", n_matched);
          }         
          break;
        }
        case 'C': 
          if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -C:<weight>\n");
            fprintf(stderr, "  Character count [primary feature].\n");
            exit(1);
          }
          else {
            /* primary feature -> don't create additional features */
            /* first entry in a token's feature list is character count */ 
            for (i=0; i<nw1; i++) fcount1[i]++;
            for (i=0; i<nw2; i++) fcount2[i]++;
            printf("FEATURE: character count, weight=%d ... [1]\n", weight);
          }
          break;
        default: fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]);
          exit(1);
          break;
        }
      }
      else {
        fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]);
        exit(1);
      }
    }
  }
Esempio n. 9
0
/**
 * Creates feature maps for a source/target corpus pair.
 *
 * This is the constructor function for the FMS class.
 *
 * Example usage:
 *
 * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s);
 *
 * @param config              array of strings representing the feature map configuration.
 * @param config_lines        the number of configuration items stored in config.
 * @param w_attr1             The p-attribute in the first corpus to link.
 * @param w_attr2             The p-attribute in the second corpus to link.
 * @param s_attr1             The s-attribute in the first corpus to link.
 * @param s_attr2             The s-attribute in the second corpus to link.
 * @return                    the new FMS object.
 */
FMS
create_feature_maps(char **config,
                    int config_lines,
                    Attribute *w_attr1,
                    Attribute *w_attr2,
                    Attribute *s_attr1,
                    Attribute *s_attr2
                    ) 
{
  FMS r;

  unsigned int *fcount1, *fcount2;    /* arrays for types in the lexicons of the source
                                       * & target corpora, respectively, counting how often each is used
                                       * in a feature */

  int config_pointer;

  char *b, command[CL_MAX_LINE_LENGTH], dummy[CL_MAX_LINE_LENGTH];

  int current_feature;
  int weight;                         /* holds the weight assigned to the feature(s) we're working on */
  int need_to_abort;                  /* boolean used during pointer check */

  /* after we have counted up features, these will become arrays of ints, with one entry per feature */
  int *fs1, *fs2; 

  int i;
  int nw1;  /* number of types on the word-attribute of the source corpus */
  int nw2;  /* number of types on the word-attribute of the target corpus */

  /* one last variable: we need to know the character set of the two corpora for assorted purposes */
  CorpusCharset charset;
  charset = cl_corpus_charset(cl_attribute_mother_corpus(w_attr1));

  /* first, create the FMS object. */
  r = (FMS) malloc(sizeof(feature_maps_t));
  assert(r);

  /* copy in the attribute pointers */
  r->att1 = w_attr1;
  r->att2 = w_attr2;
  r->s1 = s_attr1;
  r->s2 = s_attr2;

  init_char_map();
  
  /* find out how many different word-types occur on each of the p-attributes */
  nw1 = cl_max_id(w_attr1);
  if (nw1 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of source corpus\n");
    exit(1);
  }
  nw2 = cl_max_id(w_attr2);
  if (nw2 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of target corpus\n");
    exit(1);
  }
  
  printf("LEXICON SIZE: %d / %d\n", nw1, nw2);

  fcount1 = (unsigned int*) calloc(nw1 + 1, sizeof(unsigned int));
  fcount2 = (unsigned int*) calloc(nw2 + 1, sizeof(unsigned int));

  r->n_features = 1;


  /* NOTE there are two passes through the creation of feature maps - two sets of nearly identical code!
   * First pass to see how many things we need to count, second pass to count them. */

  /* process feature map configuration: first pass */
  for (config_pointer = 0; config_pointer < config_lines; config_pointer++) {

    /* strip newline and comments */
    if ( (b = strpbrk(config[config_pointer],"\n#")) )
      *b = 0;

    if (sscanf(config[config_pointer], "%s", command) > 0) {
      if(command[0] == '-') {
        /*
         * These are the FIRST PASS options for the different config lines.
         *
         * Possible config commands: -S -W -C -1 -2 -3 -4
         */
        switch(command[1]) {
        /* -S : the "shared words" type of feature */
        case 'S': {
          int i1, i2; /* i1 and i2 are temporary indexes into the lexicons of the two corpora */
          int f1, f2; /* f1 and f2 are temporary storage for frequencies from the corpus lexicons */
          float threshold;
          int n_shared = 0; /* numebr fo shared words - only calculated for the purpose of printing it */

          if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy) != 3) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -S:<weight>:<threshold>\n");
            fprintf(stderr,"  Shared words with freq. ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n");
            exit(1);
          }
          else {
            printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight);
            fflush(stdout);

            /* for each type in source corpus, get its frequency, and the corresponding id and frequency
             * from the target corpus, then test whether it meets the criteria for use as a feature. */
            for (i1 = 0; i1 < nw1; i1++) {
              f1 = cl_id2freq(w_attr1, i1);
              i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1));
              if (i2 >= 0){
                f2 = cl_id2freq(w_attr2, i2);
                /* if it will be used as a feature, increment counts of features in various places */
                if ( (f1 / (0.0+f1+f2)) >= threshold && (f2 / (0.0+f1+f2)) >= threshold){
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_shared++;
                  r->n_features++; 
                }
              }
            }
            printf("[%d]\n", n_shared);
          }
          break;
        }
        /* -1 to -4 : shared character sequences (of 1 letter to 4 letters in length) as features */
        case '1': 
        case '2':
        case '3':
        case '4': { 
          int n; /* length of the n-gram, obviously */
          
          if (sscanf(config[config_pointer], "%1s%d:%d %s", command, &n, &weight, dummy) !=3 ) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -<n>:<weight>  (n = 1..4)\n");
            fprintf(stderr,"  Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n");
            exit(1);
          }
          else if(n <= 0 || n > 4) {
            /* this shouldn't happen anyway */
            fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n", n, config[config_pointer]);
            exit(1);
          }
          else {
            int i,f,l; /* temp storage for lexicon index, n of possible features, && word length */
            char *s;

            printf("FEATURE: %d-grams, weight=%d ... ", n, weight);
            fflush(stdout);

            /* for each entry in source-corpus lexicon, add the number of n-grams it contains (l - n + 1)
             * to that entry's feature count, IFF its canonicalised form has length (strlen) >= n */
            for(i = 0; i < nw1; i++) {
              /* l = cl_id2strlen(w_attr1, i); */
              s = (unsigned char *) cl_strdup(cl_id2str(w_attr1, i));
              cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC);
              l = strlen(s);
              cl_free(s);
              fcount1[i] += (l >= n) ? l - n + 1 : 0;
            }
            /* same for target corpus */
            for(i = 0; i < nw2; i++) {
              /* l = cl_id2strlen(w_attr2, i); */
              s = (unsigned char *) cl_strdup(cl_id2str(w_attr2, i));
              cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC);
              l = strlen(s);
              cl_free(s);
              fcount2[i] += (l >= n) ? l - n + 1 : 0;
            }
            /* set f to number of possible features (= number of possible characters to the power of n) */
            f = 1;
            for(i = 0 ; i < n; i++)
              f *= char_map_range;
            /* and add that to our total number of features! */
            r->n_features += f;
            printf("[%d]\n", f);
          }
          break;
        }
        /* -W: the word-translation-equivalence type of feature */
        case 'W': {
          char filename[CL_MAX_LINE_LENGTH],
            word1[CL_MAX_LINE_LENGTH],
            word2[CL_MAX_LINE_LENGTH];
          FILE *wordlist;
          int nw;      /* number of words scanned from an input line */
          int nl = 0;  /* counter for the number of lines in the wordlist file we have gone through */
          int i1,i2;   /* lexicon ids in source and target corpora */
          int n_matched = 0;  /* counter for n of lines in input file that can be used as a feature. */

          if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -W:<weight>:<filename>\n");
            fprintf(stderr, "  Word list (read from file <filename>).\n");
            exit(1);
          }
          else if(!(wordlist = fopen(filename,"r"))) {
            fprintf(stderr,"ERROR: Cannot read word list file %s.\n",
                    filename);
            exit(-1);
          }
          else {
            printf("FEATURE: word list %s, weight=%d ... ", filename, weight);
            fflush(stdout);
            while((nw = fscanf(wordlist,"%s %s",word1,word2))>0) {
              /* on first line of file, skip UTF8 byte-order-mark if present */
              if (nl == 0 && charset == utf8 && strlen(word1) > 3)
                if (word1[0] == (char)0xEF && word1[1] == (char)0xBB && word1[2] == (char)0xBF)
                   cl_strcpy(word1, (word1 + 3));
              nl++;
              /* check that both word 1 and word 2 are valid for the encoding of the corpora */
              if (! (cl_string_validate_encoding(word1, charset, 0)
                  && cl_string_validate_encoding(word2, charset, 0)) ) {
                fprintf(stderr, "ERROR: character encoding error in the word-list input file with the input word list.\n");
                fprintf(stderr, "       (The error occurs on line %d.)\n", nl);
                exit(1);
              }
              if (nw != 2)
                fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n",nl,filename,nw);
              else {
                /* if word1 and word2 both occur in their respective corpora, this is a feature. */
                if(   (i1 = cl_str2id(w_attr1, word1)) >= 0
                   && (i2 = cl_str2id(w_attr2, word2)) >= 0 ) {
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_matched++;
                  r->n_features++;
                }
              }
            }
            fclose(wordlist);
            printf("[%d]\n", n_matched);
          }         
          break;
        }
        /* -C: the character count type of feature.
         * This feature exists for EVERY word type. */
        case 'C': 
          if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -C:<weight>\n");
            fprintf(stderr, "  Character count [primary feature].\n");
            exit(1);
          }
          else {
            /* primary feature -> don't create additional features */
            /* first entry in a token's feature list is character count */ 
            for (i=0; i<nw1; i++)
              fcount1[i]++;
            for (i=0; i<nw2; i++)
              fcount2[i]++;
            printf("FEATURE: character count, weight=%d ... [1]\n", weight);
          }
          break;
        default:
          fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]);
          exit(1);
          break;
        }
      }
      else {
        fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]);
        exit(1);
      }
    }
  }