C++ (Cpp) cl_max_cpos Exemples

Exemple #1

0

Afficher le fichier

Fichier : rcqpCommands.c Projet : rforge/rcwb

/* 
 * ------------------------------------------------------------------------
 * 
 * "rcqpCmd_attribute_size(SEXP inAttribute)" --
 * 
 * Return the size of a corpus attribute as a length-one integer vector.
 * 
 * The attribute name is looked up successively as a positional attribute
 * (size from cl_max_cpos()), a structural attribute (size from
 * cl_max_struc()) and an alignment attribute (size from cl_max_alg()).
 * The first lookup that succeeds determines the result.
 * 
 * A structural attribute for which cl_max_struc() reports a negative value
 * is reported as having size 0 rather than as an error, matching what the
 * CQi server handler do_cqi_cl_attribute_size() does for the same case.
 * 
 * inAttribute must be a character vector of length 1, otherwise an R error
 * is raised. If no lookup succeeds, or a size query fails, the error is
 * handed to rcqp_error_code() / rcqp_send_error(); should those helpers
 * return, R_NilValue is returned.
 * 
 * ------------------------------------------------------------------------
 */
SEXP rcqpCmd_attribute_size(SEXP inAttribute)
{
	SEXP			result = R_NilValue;
	char *			a;
	Attribute *		attribute;
	int				size = 0;
	int				found = 0;
	int				nprotect = 0;	/* objects we currently hold on R's protect stack */
	
	if (!isString(inAttribute) || length(inAttribute) != 1) error("argument 'attribute' must be a string");
	PROTECT(inAttribute);
	nprotect++;

	a = (char*)CHAR(STRING_ELT(inAttribute,0));
	
	/* Need to try all possible attribute types */
	if ((attribute = cqi_lookup_attribute(a, ATT_POS)) != NULL) {
		size = cl_max_cpos(attribute);
		if (size < 0) {
			/* release our protections before signalling; the counter is reset
			 * so nothing is popped twice if the helper returns */
			UNPROTECT(nprotect);
			nprotect = 0;
			rcqp_send_error();
		} else {
			found = 1;
		}
	} else if ((attribute = cqi_lookup_attribute(a, ATT_STRUC)) != NULL) {
		size = cl_max_struc(attribute);
		/* a negative value here is treated as "0 regions", not as an error */
		if (size < 0) {
			size = 0;
		}
		found = 1;
	} else if ((attribute = cqi_lookup_attribute(a, ATT_ALIGN)) != NULL) {
		size = cl_max_alg(attribute);
		if (size < 0) {
			UNPROTECT(nprotect);
			nprotect = 0;
			rcqp_send_error();
		} else {
			found = 1;
		}
	} else {
		/* none of the three lookups succeeded: report the last lookup's errno */
		UNPROTECT(nprotect);
		nprotect = 0;
		rcqp_error_code(cqi_errno);
	}

	if (found) {
		PROTECT(result = allocVector(INTSXP, 1));
		nprotect++;
		INTEGER(result)[0] = size;
	} 
	
	/* pop exactly what is still protected (1 or 2 objects, or none after an error) */
	if (nprotect > 0) {
		UNPROTECT(nprotect);
	}

	return result;
}

Exemple #2

0

Afficher le fichier

Fichier : cqpserver.c Projet : cran/rcqp

/*
 * CQi handler for CQI_CL_ATTRIBUTE_SIZE.
 *
 * Reads an attribute name with cqi_read_string() and replies with the size
 * of that attribute. The name is tried as a positional, then a structural,
 * then an alignment attribute; the first successful lookup is used. If all
 * three lookups fail, cqi_errno from the last lookup is sent back.
 */
void
do_cqi_cl_attribute_size(void)
{
  char *name = cqi_read_string();
  Attribute *att;
  int n;

  if (server_debug)
   Rprintf( "CQi: CQI_CL_ATTRIBUTE_SIZE('%s')\n", name);

  /* need to try all possible attribute types */
  if ((att = cqi_lookup_attribute(name, ATT_POS)) != NULL) {
    n = cl_max_cpos(att);
    if (n >= 0)
      cqi_data_int(n);
    else
      send_cl_error();
  }
  else if ((att = cqi_lookup_attribute(name, ATT_STRUC)) != NULL) {
    n = cl_max_struc(att);
    /* current version of CL considers 0 regions a data access error condition,
     * but we want to allow that: reply with 0 instead of an error */
    if (n >= 0)
      cqi_data_int(n);
    else
      cqi_data_int(0);
  }
  else if ((att = cqi_lookup_attribute(name, ATT_ALIGN)) != NULL) {
    n = cl_max_alg(att);
    if (n >= 0)
      cqi_data_int(n);
    else
      send_cl_error();
  }
  else {
    /* return errno from the last lookup */
    cqi_command(cqi_errno);
  }

  free(name);
}

Exemple #3

0

Afficher le fichier

Fichier : cwb-describe-corpus.c Projet : rforge/rcwb

/**
 * Prints one line of statistics per attribute of a corpus via Rprintf().
 *
 * What is reported depends on the attribute type:
 *  - P-attribute: number of tokens and number of types;
 *  - S-attribute: number of regions (and whether it has annotations);
 *  - A-attribute: number of alignment blocks (and whether it is extended).
 * When the size queries indicate failure, "NO DATA" is printed instead.
 * Attributes of any other type are listed as unknown.
 *
 * @param corpus  The corpus to analyse.
 */
void
describecorpus_show_statistics (Corpus *corpus)
{
  Attribute *att;

  for (att = corpus->attributes; att != NULL; att = att->any.next) {

    if (att->any.type == ATT_POS) {
      int n_tokens, n_types;

      Rprintf("p-ATT %-16s ", att->any.name);
      n_tokens = cl_max_cpos(att);
      n_types = cl_max_id(att);
      /* both counts must be strictly positive for the attribute to count as usable */
      if ((n_tokens <= 0) || (n_types <= 0))
        Rprintf("           NO DATA");
      else
        Rprintf("%10d tokens, %8d types", n_tokens, n_types);
    }
    else if (att->any.type == ATT_STRUC) {
      int n_regions;

      Rprintf("s-ATT %-16s ", att->any.name);
      n_regions = cl_max_struc(att);
      if (n_regions < 0) {
        Rprintf("           NO DATA");
      }
      else {
        Rprintf("%10d regions", n_regions);
        if (cl_struc_values(att))
          Rprintf(" (with annotations)");
      }
    }
    else if (att->any.type == ATT_ALIGN) {
      int n_blocks;

      Rprintf("a-ATT %-16s ", att->any.name);
      n_blocks = cl_max_alg(att);
      if (n_blocks < 0) {
        Rprintf("           NO DATA");
      }
      else {
        Rprintf("%10d alignment blocks", n_blocks);
        if (cl_has_extended_alignment(att))
          Rprintf(" (extended)");
      }
    }
    else {
      Rprintf("???   %-16s (unknown attribute type)", att->any.name);
    }

    /* terminate this attribute's line */
    Rprintf("\n");
  }

  Rprintf("\n");
}

Exemple #4

0

Afficher le fichier

Fichier : cwb-describe-corpus.c Projet : rforge/rcwb

/**
 * Prints basic information about a corpus via Rprintf().
 *
 * The report contains the corpus description, registry file, home directory
 * and info file, the corpus size in tokens (taken from the "word"
 * p-attribute), and the number of positional, structural and alignment
 * attributes.
 *
 * @param corpus                The corpus to report on.
 * @param with_attribute_names  Boolean: iff true, each attribute count is followed
 *                              by a colon and the list of attribute names of that type.
 */
void
describecorpus_show_basic_info (Corpus *corpus, int with_attribute_names)
{
  Attribute *word_att, *att;
  int n_pos = 0, n_struc = 0, n_align = 0;
  int n_tokens;
  char *suffix;
  char *info;

  /* counts are followed by ":" only when a name list comes after them */
  if (with_attribute_names)
    suffix = ":";
  else
    suffix = "";

  if (corpus->info_file)
    info = corpus->info_file;
  else
    info = "(none)";

  Rprintf("description:    %s\n", corpus->name);
  Rprintf("registry file:  %s/%s\n", corpus->registry_dir, corpus->registry_name);
  Rprintf("home directory: %s/\n", corpus->path);
  Rprintf("info file:      %s\n", info);

  word_att = cl_new_attribute(corpus, "word", ATT_POS);
  if (word_att == NULL) {
    Rprintf( "ERROR: 'word' attribute is missing. Aborted.\n");
    rcqp_receive_error(1);
  }

  n_tokens = cl_max_cpos(word_att);
  Rprintf("size (tokens):  ");
  if (n_tokens < 0)
    Rprintf("ERROR\n");
  else
    Rprintf("%d\n", n_tokens);
  Rprintf("\n");

  /* tally the attributes by type; other types are ignored */
  for (att = corpus->attributes; att != NULL; att = att->any.next) {
    if (att->any.type == ATT_POS)
      n_pos++;
    else if (att->any.type == ATT_STRUC)
      n_struc++;
    else if (att->any.type == ATT_ALIGN)
      n_align++;
  }

  Rprintf("%3d positional attributes%s\n", n_pos, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_POS);

  Rprintf("%3d structural attributes%s\n", n_struc, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_STRUC);

  Rprintf("%3d alignment  attributes%s\n", n_align, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_ALIGN);

  Rprintf("\n");
}

Exemple #5

0

Afficher le fichier

Fichier : cwb-makeall.c Projet : rforge/rcwb

/**
 * Validates the REVCORP component of the given attribute.
 *
 * This function validates a REVCORP (i.e. an uncompressed index).
 * It assumes that a lexicon, frequencies and (compressed or
 * uncompressed) token stream are available for CL access for the
 * given attribute.
 *
 * The check proceeds in three steps:
 *  1. compute, from the token frequencies, the offset into REVCORP at which
 *     the occurrences of each lexicon entry should start;
 *  2. walk the token stream and verify that each corpus position is stored
 *     at the next expected REVCORP slot of its token id;
 *  3. verify that after the walk every per-id pointer has advanced by exactly
 *     that id's frequency.
 *
 * Progress and the verdict are written to stdout.
 *
 * @param attr  The attribute whose REVCORP should be checked.
 * @return      True for all OK, false for a problem.
 */
int
validate_revcorp(Attribute *attr)
{

  Component *revcorp = ensure_component(attr, CompRevCorpus, 0);
  int *ptab;                        /* table of index offsets for each lexicon entry */
  int lexsize, corpsize;
  int i, offset, cpos, id;
  int slot;                         /* REVCORP index expected to hold the current cpos */

  printf(" ? validating %s ... ", cid_name(CompRevCorpus));
  fflush(stdout);

  if (revcorp == NULL) {
    printf("FAILED (no data)\n");
    return 0;
  }
  lexsize = cl_max_id(attr);
  corpsize = cl_max_cpos(attr);
  if ((lexsize <= 0) || (corpsize <= 0)) {
    printf("FAILED (corpus access error)\n");
    return 0;
  }
  if (revcorp->size != corpsize) {
    printf("FAILED (wrong size)\n");
    return 0;
  }

  /* init offsets by calculating REVIDX component from token frequencies */
  ptab = (int *) cl_calloc(lexsize, sizeof(int));
  offset = 0;
  for (i = 0; i < lexsize; i++) {
    ptab[i] = offset;
    offset += cl_id2freq(attr, i);
  }

  /* now read token stream, check each token id against REVCORP, and increment its pointer */
  for (cpos = 0; cpos < corpsize; cpos++) {
    id = cl_cpos2id(attr, cpos);
    if ((id < 0) || (id >= lexsize)) {
      printf("FAILED (inconsistency in token stream)\n");
      cl_free(ptab);
      return 0;
    }
    /* The offsets are derived from the frequency table, which is itself under
     * test: if it disagrees with the token stream the slot can fall outside
     * REVCORP (whose size equals corpsize, checked above). Such a slot is a
     * validation failure and must not be dereferenced. */
    slot = ptab[id];
    if ((slot < 0) || (slot >= corpsize) ||
        (ntohl(revcorp->data.data[slot]) != cpos)) {
      printf("FAILED\n");
      cl_free(ptab);
      return 0;
    }
    ptab[id]++;
  }

  /* validate frequencies by comparing final offsets against those calculated from token frequencies */
  offset = 0;
  for (i = 0; i < lexsize; i++) {
    offset += cl_id2freq(attr, i);
    if (ptab[i] != offset) {
      printf("FAILED (token frequencies incorrect)\n");
      cl_free(ptab);
      return 0;
    }
  }

  cl_free(ptab);

  printf("OK\n");
  return 1;
}

Exemple #6

0

Afficher le fichier

Fichier : cwb-decode.c Projet : rforge/rcwb

/**
 * Main function for cwb-decode.
 *
 * Processing happens in three stages:
 *  1. command-line options are parsed with getopt() (everything before the
 *     corpus name);
 *  2. the corpus is opened and the attribute declarations that follow the
 *     corpus name (-P, -S, -V, -A, -ALL, -c) are processed;
 *  3. the corpus is decoded, either over a range of corpus positions
 *     ("normal" mode) or for a list of positions / position pairs read from
 *     stdin or a file ("matchlist" mode, -p / -f).
 *
 * Fatal conditions are reported on stderr and handed to decode_cleanup()
 * (or exit()); the code relies on decode_cleanup() not returning, as the
 * final "return 0" below notes.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 */
int
main(int argc, char **argv)
{
  Attribute *attr;
  Attribute *context = NULL;

  int sp;  /* start position of a match */
  int ep;  /* end position of a match */

  int w, cnt, read_pos_frm_stdin;

  char s[CL_MAX_LINE_LENGTH];      /* buffer for strings read from file */
  char *token;

  char *input_filename = NULL;
  FILE *input_file = stdin;

  /* ------------------------------------------------- PARSE ARGUMENTS */

  int c;
  extern char *optarg;
  extern int optind;

  progname = argv[0];

  first_token = -1;
  last = -1;
  maxlast = -1;

  read_pos_frm_stdin = 0;

  /* use getopt() to parse command-line options */
  while((c = getopt(argc, argv, "+s:e:r:nLHCxXf:ph")) != EOF)
    switch(c) {

      /* s: start corpus position */
    case 's':
      first_token = atoi(optarg);
      break;

      /* e: end corpus position */
    case 'e':
      last = atoi(optarg);
      break;

      /* r: registry directory */
    case 'r':
      if (registry_directory == NULL)
        registry_directory = optarg;
      else {
        fprintf(stderr, "%s: -r option used twice\n", progname);
        exit(2);
      }
      break;

      /* n: show cpos in -H mode */
    case 'n':
      printnum++;
      break;

      /* x: XML-compatible output in -C mode (-Cx) */
    case 'x':
      xml_compatible++;
      break;

      /* L,H,C,X: Lisp, Horizontal, Compact, and XML modes */
    case 'L':
      mode = LispMode;
      break;
    case 'H':
      mode = ConclineMode;
      break;
    case 'C':
      mode = EncodeMode;
      break;
    case 'X':
      mode = XMLMode;
      break;

      /* f: matchlist mode / read corpus positions from file */
    case 'f':
      input_filename = optarg;
      break;

      /* p: matchlist mode / read corpus positions from stdin */
    case 'p':
      read_pos_frm_stdin++;
      break;

      /* h: help page */
    case 'h':
      decode_usage(2);
      break;

    default:
      fprintf(stderr, "Illegal option. Try \"%s -h\" for more information.\n", progname);
      fprintf(stderr, "[remember that options go before the corpus name, and attribute declarations after it!]\n");
      decode_cleanup(2);
    }

  /* required argument: corpus id */
  if (optind < argc) {
    corpus_id = argv[optind++];

    if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) {
      fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n",
              corpus_id,
              (registry_directory ? registry_directory : cl_standard_registry() ) );
      decode_cleanup(1);
    }
  }
  else {
    fprintf(stderr, "Missing argument. Try \"%s -h\" for more information.\n", progname);
    decode_cleanup(2);
  }


  /* now parse output flags (-P, -S, ...) [cnt is our own argument counter] */
  for (cnt = optind; cnt < argc; cnt++) {

    /* Flags that consume the following argument must actually be followed by
     * one; otherwise argv[++cnt] below would be argv[argc], i.e. NULL, and
     * would be passed on as an attribute name and to "%s". */
    if ((cnt + 1 >= argc) &&
        ((strcmp(argv[cnt], "-c") == 0) || (strcmp(argv[cnt], "-P") == 0) ||
         (strcmp(argv[cnt], "-A") == 0) || (strcmp(argv[cnt], "-S") == 0) ||
         (strcmp(argv[cnt], "-V") == 0))) {
      fprintf(stderr, "Missing argument for flag %s. Try \"%s -h\" for more information.\n",
              argv[cnt], progname);
      decode_cleanup(2);
    }

    if (strcmp(argv[cnt], "-c") == 0) {         /* -c: context */

      if ((context = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) {
        fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n",
                corpus_id, argv[cnt]);
        decode_cleanup(1);
      }

    }
    else if (strcmp(argv[cnt], "-P") == 0) {    /* -P: positional attribute */

      if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_POS)) == NULL) {
        fprintf(stderr, "Can't open p-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else {
        if (cl_max_cpos(attr) > 0) {
          decode_add_attribute(attr);
          if (maxlast < 0)
            maxlast = cl_max_cpos(attr); /* determines corpus size */
        }
        else {
          fprintf(stderr, "Attribute %s.%s is declared, but not accessible (missing data?). Aborted.\n",
                  corpus_id, argv[cnt]);
          decode_cleanup(1);
        }
      }

    }
    else if (strcmp(argv[cnt], "-ALL") == 0) {  /* -ALL: all p-attributes and s-attributes */

      for (attr = corpus->attributes; attr; attr = attr->any.next)
        if (attr->any.type == ATT_POS) {
          decode_add_attribute(attr);
          if (maxlast < 0)
            maxlast = cl_max_cpos(attr);
        }
        else if (attr->any.type == ATT_STRUC) {
          decode_add_attribute(attr);
        }

    }
    else if (strcmp(argv[cnt], "-D") == 0) {    /* -D: dynamic attribute (not implemented) */

      fprintf(stderr, "Sorry, dynamic attributes are not implemented. Aborting.\n");
      decode_cleanup(2);

    }
    else if (strcmp(argv[cnt], "-A") == 0) {    /* -A: alignment attribute */

      if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_ALIGN)) == NULL) {
        fprintf(stderr, "Can't open a-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else
        decode_add_attribute(attr);
    }
    else if (strcmp(argv[cnt], "-S") == 0) {    /* -S: structural attribute (as tags) */

      if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) {
        fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n",
                corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else
        decode_add_attribute(attr);
    }
    else if (strcmp(argv[cnt], "-V") == 0) {    /* -V: show structural attribute values (with -p or -f) */

      if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) {
        fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n",
                corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else if (!cl_struc_values(attr)) {
        fprintf(stderr, "S-attribute %s.%s does not have annotations. Aborted.\n",
                corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else if (printValuesIndex >= MAX_PRINT_VALUES) {
        fprintf(stderr, "Too many -V attributes, sorry. Aborted.\n");
        decode_cleanup(1);
      }
      else
        printValues[printValuesIndex++] = attr;
    }
    else {

      fprintf(stderr, "Unknown flag: %s\n", argv[cnt]);
      decode_cleanup(2);

    }
  }
  /* ---- end of parse attribute declarations ---- */

  if (input_filename != NULL) {
    /* "-" as file name means: read the match list from stdin */
    if (strcmp(input_filename, "-") == 0)
      input_file = stdin;
    else if ((input_file = fopen(input_filename, "r")) == NULL) {
      perror(input_filename);
      exit(1);
    }
    /* -f implies matchlist mode, just like -p */
    read_pos_frm_stdin++;
  }

  decode_verify_print_value_list();

  /* ------------------------------------------------------------ DECODE CORPUS */

  if (read_pos_frm_stdin == 0) {
    /*
     * normal mode: decode entire corpus or specified range
     */

    if (maxlast < 0) {
      fprintf(stderr, "Need at least one p-attribute (-P flag). Aborted.\n");
      decode_cleanup(2);
    }

    /* out-of-range start / end positions fall back to the corpus boundaries */
    if (first_token < 0 || first_token >= maxlast)
      first_token = 0;

    if (last < 0 || last >= maxlast)
      last = maxlast - 1;

    if (last < first_token) {
      fprintf(stderr, "Warning: output range #%d..#%d is empty. No output.\n", first_token, last);
      decode_cleanup(2);
    }

    if ( (mode == XMLMode) ||  ((mode == EncodeMode) && xml_compatible) ) {
      decode_print_xml_declaration();
      printf("<corpus name=\"%s\" start=\"%d\" end=\"%d\">\n",
             corpus_id, first_token, last);
    }

    /* decode_print_surrounding_s_att_values() is deliberately not called in "normal" mode, where it doesn't make sense */

    for (w = first_token; w <= last; w++)
      decode_print_token_sequence(w, -1, context);

    if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) {
      printf("</corpus>\n");
    }
  }
  else {
    /*
     * matchlist mode: read (pairs of) corpus positions from stdin or file
     */

    if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) {
      decode_print_xml_declaration();
      printf("<matchlist corpus=\"%s\">\n", corpus_id);
    }

    cnt = 0;
    while (fgets(s, CL_MAX_LINE_LENGTH, input_file) != NULL) {

      token = strtok(s, " \t\n");

      if ((token != NULL) && is_num(token)) {
        sp = atoi(token);

        /* the end position is optional; -1 marks "not given" */
        ep = -1;
        if ((token = strtok(NULL, " \t\n")) != NULL) {
          if (!is_num(token)) {
            fprintf(stderr, "Invalid corpus position #%s . Aborted.\n", token);
            decode_cleanup(1);
          }
          else
            ep = atoi(token);
        }

        cnt++;                  /* count matches in matchlist  */
        if (mode == XMLMode) {
          printf("<match nr=\"%d\"", cnt);
          if (printnum)
            printf(" start=\"%d\" end=\"%d\"", sp, (ep >= 0) ? ep : sp);
          printf(">\n");
        }
        else {
          /* nothing shown before range */
        }

        decode_print_surrounding_s_att_values(sp);

        decode_print_token_sequence(sp, ep, context);

        if (mode == XMLMode) {
          printf("</match>\n");
        }
        else if (mode != ConclineMode) {
          printf("\n");         /* blank line, unless in -H mode */
        }
      }
      else {
        fprintf(stderr, "Invalid corpus position #%s . Aborted.\n", s);
        decode_cleanup(1);
      }
    }

    if (input_file != stdin)
      fclose(input_file);

    if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) {
      printf("</matchlist>\n");
    }
  }

  decode_cleanup(0);
  return 0;                     /* just to keep gcc from complaining */
}

Exemple #7

0

Afficher le fichier

Fichier : cwb-huffcode.c Projet : rforge/rcwb

/**
 * Checks a huffcoded attribute for errors by decompressing it.
 *
 * This function assumes that compute_code_lengths() has been called
 * beforehand and made sure that the _uncompressed_ token sequence is
 * used by CL access functions.
 *
 * The code descriptor block, the compressed item sequence and the sync file
 * are read back; every decoded item is compared with the id obtained from
 * cl_cpos2id(), and every SYNCHRONIZATION-th position is checked against
 * the offset recorded in the sync file.
 *
 * On any failure (including the first token mismatch) the files opened here
 * are closed and the error is reported through rcqp_receive_error(1); the
 * message that the CORPUS component may be deleted is printed only after a
 * complete, successful validation.
 *
 * @param attr  The attribute to check.
 * @param fname Base filename to use for the three compressed-attribute files.
 *              Can be NULL, in which case the filenames in the attribute are used.
 */
void 
decode_check_huff(Attribute *attr, char *fname)
{
  BFile bfd;
  FILE *sync = NULL;
  HCD hc;

  int bfd_open = 0;             /* true once BFopen() has succeeded, so cleanup knows whether to BFclose() */

  int pos, size, sync_offset, offset;

  int l, v;
  int item, true_item;
  
  unsigned char bit;

  char hcd_path[CL_MAX_LINE_LENGTH];
  char huf_path[CL_MAX_LINE_LENGTH];
  char sync_path[CL_MAX_LINE_LENGTH];

  
  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* all path buffers are filled with bounded writes (snprintf always NUL-terminates) */
  if (fname) {
    snprintf(hcd_path, sizeof(hcd_path), "%s.hcd", fname);
    snprintf(huf_path, sizeof(huf_path), "%s.huf", fname);
    snprintf(sync_path, sizeof(sync_path), "%s.huf.syn", fname);
  }
  else {

    char *path;

    path = component_full_name(attr, CompHuffSeq, NULL);
    assert(path && (cderrno == CDA_OK));
    snprintf(huf_path, sizeof(huf_path), "%s", path);
    
    path = component_full_name(attr, CompHuffCodes, NULL);
    assert(path && (cderrno == CDA_OK));
    snprintf(hcd_path, sizeof(hcd_path), "%s", path);

    path = component_full_name(attr, CompHuffSync, NULL);
    assert(path && (cderrno == CDA_OK));
    snprintf(sync_path, sizeof(sync_path), "%s", path);
    
  }

  Rprintf("- reading code descriptor block from %s\n", hcd_path);
  if (!ReadHCD(hcd_path, &hc)) {
    Rprintf( "ERROR: reading %s failed. Aborted.\n",  hcd_path);
    goto failed;
  }

  Rprintf("- reading compressed item sequence from %s\n", huf_path);
  if (!BFopen(huf_path, "r", &bfd)) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", huf_path);
    perror(huf_path);
    goto failed;
  }
  bfd_open = 1;

  Rprintf("- reading sync (mod %d) from %s\n", SYNCHRONIZATION, sync_path);
  if ((sync = fopen(sync_path, "r")) == NULL) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", sync_path);
    perror(sync_path);
    goto failed;
  }

  size = cl_max_cpos(attr);
  if (size != hc.length) {
    Rprintf( "ERROR: wrong corpus size (%d tokens) in %s (correct size: %d)\n",
            hc.length, hcd_path, size);
    goto failed;
  }

  for (pos = 0; pos < hc.length; pos++) {

    if ((pos % SYNCHRONIZATION) == 0) {
      offset = BFposition(&bfd); /* need to get offset before flushing (because flushing fills the bit buffer and advances offset to the following byte!) */
      if (pos > 0)
        BFflush(&bfd);
      sync_offset = -1;                /* make sure we get an error if read below fails */
      NreadInt(&sync_offset, sync);
      if (offset != sync_offset) {
        Rprintf( "ERROR: wrong sync offset %d (true offset %d) at cpos %d. Aborted.\n",
                sync_offset, offset, pos);
        goto failed;
      }
    }

    if (!BFread(&bit, 1, &bfd)) {
      Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
      goto failed;
    }

    /* read one bit at a time until the accumulated value reaches the
     * smallest code of the current length l */
    v = (bit ? 1 : 0);
    l = 1;
    while (v < hc.min_code[l]) {
      if (!BFread(&bit, 1, &bfd)) {
        Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
        goto failed;
      }
      v <<= 1;
      if (bit)
        v++;
      l++;
    }
    item = hc.symbols[hc.symindex[l] + v - hc.min_code[l]];

    true_item = cl_cpos2id(attr, pos);
    if (item != true_item) {
      /* a mismatch means the compressed data is wrong: stop here so that the
       * "safe to delete" message below is never printed for a bad file */
      Rprintf( "ERROR: wrong token (id=%d) at cpos %d (correct id=%d). Aborted.\n",
              item, pos, true_item);
      goto failed;
    }

  }
  fclose(sync);
  BFclose(&bfd);

  /* tell the user it's safe to delete the CORPUS component now */
  Rprintf("!! You can delete the file <%s> now.\n",
         component_full_name(attr, CompCorpus, NULL));
  
  return;

 failed:
  /* release whatever was opened before signalling the error, since the
   * error handler may not return control to this function */
  if (sync != NULL)
    fclose(sync);
  if (bfd_open)
    BFclose(&bfd);
  rcqp_receive_error(1);
}

Exemple #8

0

Afficher le fichier

Fichier : cwb-huffcode.c Projet : rforge/rcwb

/**
 * Compresses the token stream of a p-attribute.
 *
 * Three files are created: the compressed token stream, the descriptor block,
 * and a sync file.
 *
 * @param attr  The attribute to compress.
 * @param hc    Location for the resulting Huffmann code descriptor block.
 * @param fname Base filename for the resulting files.
 */
int 
compute_code_lengths(Attribute *attr, HCD *hc, char *fname)
{
  int id, i, h;

  int nr_codes = 0;

  int *heap = NULL;
  unsigned *codelength = NULL;        /* was char[], probably to save space; but that's unnecessary and makes gcc complain */

  int issued_codes[MAXCODELEN];
  int next_code[MAXCODELEN];

  long sum_bits;


  Rprintf("COMPRESSING TOKEN STREAM of %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* I need the following components:
   * - CompCorpus
   * - CompCorpusFreqs
   * - CompLexicon
   * - CompLexiconIdx
   * and want to force the CL to use them rather than compressed data. 
   */

  {
    Component *comp;

    if ((comp = ensure_component(attr, CompCorpus, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the CORPUS component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexicon, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXION component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexiconIdx, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXIDX component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompCorpusFreqs, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the FREQS component.\n"
              "Run 'makeall -r %s -c FREQS %s %s' in order to create it.\n",
              corpus->registry_dir, corpus->registry_name, attr->any.name);
      rcqp_receive_error(1);
    }

  }

  /*
   * strongly follows Witten/Moffat/Bell: ``Managing Gigabytes'', 
   * pp. 335ff.
   */

  hc->size = cl_max_id(attr);                /* the size of the attribute (nr of items) */
  if ((hc->size <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_id() failed");
    rcqp_receive_error(1);
  }

  hc->length = cl_max_cpos(attr); /* the length of the attribute (nr of tokens) */
  if ((hc->length <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_cpos() failed");
    rcqp_receive_error(1);
  }

  hc->symbols = NULL;
  hc->min_codelen = 100;
  hc->max_codelen = 0;

  memset((char *)hc->lcount, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->min_code, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->symindex, '\0', MAXCODELEN * sizeof(int));

  memset((char *)issued_codes, '\0', MAXCODELEN * sizeof(int));

  codelength = (unsigned *)cl_calloc(hc->size, sizeof(unsigned));


  /* =========================================== make & initialize the heap */

  heap = (int *)cl_malloc(hc->size * 2 * sizeof(int));

  for (i = 0; i < hc->size; i++) {
    heap[i] = hc->size + i;
    heap[hc->size+i] = get_id_frequency(attr, i) + 1;
    /* add-one trick needed to avoid unsupported Huffman codes > 31 bits for very large corpora of ca. 2 billion words:
       theoretical optimal code length for hapax legomena in such corpora is ca. 31 bits, and the Huffman algorithm 
       sometimes generates 32-bit codes; with add-one trick, the theoretical optimal code length is always <= 30 bits */    
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0)
    fprintf(protocol, "Allocated heap with %d cells for %d items\n\n",
            hc->size * 2, hc->size);
  if (do_protocol > 2)
    print_heap(heap, hc->size, "After Initialization");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 1 */


  h = hc->size;

  /*
   * we address the heap in the following manner: when we start array
   * indices at 1, the left child is at 2i, and the right child is at
   * 2i+1. So we maintain this scheme and decrement just before
   * adressing the array. 
   */

  /*
   * construct the initial min-heap
   */

  for (i = hc->size/2; i > 0; i--) {

    /* do:
     * bottom up, left to right,
     * for each root of each subtree, sift if necessary
     */

    sift(heap, h, i);
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 2) {
    print_heap(heap, hc->size, "Initial Min-Heap");
    fprintf(protocol, "\n");
  }
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 2 */

  /* smallest item at top of heap now, remove the two smallest items
   * and sift, find second smallest by removing top and sifting, as
   * long as we have more than one root */



  while (h > 1) {
    
    int pos[2];

    for (i = 0; i < 2; i++) {

      /* remove topmost (i.e. smallest) item */

      pos[i] = heap[0];

      /* remove and sift, to reobtain heap integrity: move ``last''
       * item to top of heap and sift */

      heap[0] = heap[--h];
      
      sift(heap, h, 1);
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 3) {
      fprintf(protocol, "Removed     smallest item %d with freq %d\n",
              pos[0], heap[pos[0]]);
      fprintf(protocol, "Removed 2nd smallest item %d with freq %d\n",
              pos[1], heap[pos[1]]);
    }
    /* ============================== PROTOCOL ============================== */

    /*
     * pos[0] and pos[1] contain pointers to the two smallest items
     * now. since h was decremented twice, h and h+1 are now empty and
     * become the accumulated freq of pos[i]. The individual
     * frequencies are not needed any more, so pointers to h+1 (the
     * acc freq) are stored there instead (tricky, since freq cell
     * becomes pointer cell). So, what happens here, is to include a
     * new element in the heap. */

    heap[h] = h+1;
    heap[h+1] = heap[pos[0]] + heap[pos[1]]; /* accumulated freq */
    heap[pos[0]] = heap[pos[1]] = h+1; /* pointers! */
    h++;                        /* we put a new element into heap */

    /*
     * now, swap it up until we reobtain heap integrity
     */

    {
      register int parent, current;
      
      current = h;
      
      parent = current >> 1;

      while ((parent > 0) &&
             (heap[heap[parent-1]] > heap[heap[current-1]])) {

        int tmp;

        tmp = heap[parent-1];
        heap[parent-1] = heap[current-1];
        heap[current-1] = tmp;

        current = parent;
        parent = current >> 1;
      }
    }
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 3)
    fprintf(protocol, "\n");
  /* ============================== PROTOCOL ============================== */



  /* ================================================== Phase 3 */

  /* compute the code lengths. We don't have any freqs in heap any
   * more, only pointers to parents */

  heap[0] = -1U;

  /* root has a depth of 0 */

  heap[1] = 0;

  /* we trust in what they say on p. 345 */

  for (i = 2; i < hc->size * 2; i++)
    heap[i] = heap[heap[i]]+1;


  /* collect the lengths */

  sum_bits = 0L;

  for (i = 0; i < hc->size; i++) {

    int cl = heap[i+hc->size];

    sum_bits += cl * get_id_frequency(attr, i);

    codelength[i] = cl;
    if (cl == 0)
      continue;

    if (cl > hc->max_codelen)
      hc->max_codelen = cl;

    if (cl < hc->min_codelen)
      hc->min_codelen = cl;

    hc->lcount[cl]++;
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0) {

    fprintf(protocol, "Minimal code length: %3d\n", hc->min_codelen);
    fprintf(protocol, "Maximal code length: %3d\n", hc->max_codelen);
    fprintf(protocol, "Compressed code len: %10ld bits, %10ld (+1) bytes\n\n\n",
            sum_bits, sum_bits/8);

  }
  /* ============================== PROTOCOL ============================== */

  if (hc->max_codelen >= MAXCODELEN) {
    Rprintf( "Error: Huffman codes too long (%d bits, current maximum is %d bits).\n", hc->max_codelen, MAXCODELEN-1);
    Rprintf( "       Please contact the CWB development team for assistance.\n");
    rcqp_receive_error(1);
  }

  if ((hc->max_codelen == 0) && (hc->min_codelen == 100)) {

    Rprintf( "Problem: No output generated -- no items?\n");
    nr_codes = 0;
  }
  else {

    hc->min_code[hc->max_codelen] = 0;
    
    for (i = hc->max_codelen-1; i > 0; i--)
      hc->min_code[i] = (hc->min_code[i+1] + hc->lcount[i+1]) >> 1;

    hc->symindex[hc->min_codelen] = 0;
    for (i = hc->min_codelen+1; i <= hc->max_codelen; i++)
      hc->symindex[i] = hc->symindex[i-1] + hc->lcount[i-1];


    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 0) {

      int sum_codes = 0;

      fprintf(protocol, " CL  #codes  MinCode   SymIdx\n");
      fprintf(protocol, "----------------------------------------\n");

      for (i = hc->min_codelen; i <= hc->max_codelen; i++) {
        sum_codes += hc->lcount[i];
        fprintf(protocol, "%3d %7d  %7d  %7d\n", 
                i, hc->lcount[i], hc->min_code[i], hc->symindex[i]);
      }

      fprintf(protocol, "----------------------------------------\n");
      fprintf(protocol, "    %7d\n", sum_codes);
    }
    /* ============================== PROTOCOL ============================== */


    for (i = 0; i < MAXCODELEN; i++)
      next_code[i] = hc->min_code[i];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "\n");
      fprintf(protocol, "   Item   f(item)  CL      Bits     Code, String\n");
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* compute and issue codes */
    
    hc->symbols = heap + hc->size;

    for (i = 0; i < hc->size; i++) {

      /* we store the code for item i in heap[i] */
      heap[i] = next_code[codelength[i]];
      next_code[codelength[i]]++;

      /* ============================== PROTOCOL ============================== */
      if (do_protocol > 1) {
        fprintf(protocol, "%7d  %7d  %3d  %10d ",
                i,
                get_id_frequency(attr, i),
                codelength[i],
                codelength[i] * get_id_frequency(attr, i));

        bprintf(heap[i], codelength[i], protocol);

        fprintf(protocol, "  %7d  %s\n",
                heap[i], get_string_of_id(attr, i));
      }
      /* ============================== PROTOCOL ============================== */

      /* and put the item itself in the second half of the table */
      heap[hc->size+hc->symindex[codelength[i]]+issued_codes[codelength[i]]] = i;
      issued_codes[codelength[i]]++;
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */


    /* The work itself -- encode the attribute data */

    {
      char *path;

      char hcd_path[CL_MAX_LINE_LENGTH];
      char huf_path[CL_MAX_LINE_LENGTH];
      char sync_path[CL_MAX_LINE_LENGTH];

      Component *corp;

      BFile bfd;
      FILE *sync;

      int cl, code, pos;

      corp = ensure_component(attr, CompCorpus, 0);
      assert(corp);

      if (fname) {
        path = fname;

        sprintf(hcd_path, "%s.hcd", path);
        sprintf(huf_path, "%s.huf", path);
        sprintf(sync_path, "%s.huf.syn", path);
      }
      else {
        path = component_full_name(attr, CompHuffSeq, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(huf_path, path);

        path = component_full_name(attr, CompHuffCodes, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(hcd_path, path);

        path = component_full_name(attr, CompHuffSync, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(sync_path, path);

      }

      Rprintf("- writing code descriptor block to %s\n",  hcd_path);
      if (!WriteHCD(hcd_path, hc)) {
        Rprintf( "ERROR: writing %s failed. Aborted.\n", hcd_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing compressed item sequence to %s\n", huf_path);

      if (!BFopen(huf_path, "w", &bfd)) {
        Rprintf( "ERROR: can't create file %s\n", huf_path);
        perror(huf_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing sync (every %d tokens) to %s\n", SYNCHRONIZATION, sync_path);

      if ((sync = fopen(sync_path, "w")) == NULL) {
        Rprintf( "ERROR: can't create file %s\n", sync_path);
        perror(sync_path);
        rcqp_receive_error(1);
      }

      for (i = 0; i < hc->length; i++) {

        /* SYNCHRONIZE */

        if ((i % SYNCHRONIZATION) == 0) {
          if (i > 0)
            BFflush(&bfd);
          pos = BFposition(&bfd);
          NwriteInt(pos, sync);
        }

        id = cl_cpos2id(attr, i);
        if ((id < 0) || (cderrno != CDA_OK)) {
          cdperror("(aborting) cl_cpos2id() failed");
          rcqp_receive_error(1);
        }

        else {

          assert((id >= 0) && (id < hc->size) && "Internal Error");

          cl = codelength[id];
          code = heap[id];

          if (!BFwriteWord((unsigned int)code, cl, &bfd)) {
            Rprintf( "Error writing code for ID %d (%d, %d bits) at position %d. Aborted.\n",
                    id, code, cl, i);
            rcqp_receive_error(1);
          }

        }

      }

      fclose(sync);
      BFclose(&bfd);
    }
  }

  free(codelength);
  free(heap);
 
  return 1;
}

Exemple #9

0

Afficher le fichier

Fichier : cwb-compress-rdx.c Projet : rforge/rcwb

/**
 * Checks a compressed reversed index for errors by decompressing it.
 *
 * For every lexicon ID of the attribute, the Golomb-coded gaps are read back
 * from the compressed data file, accumulated into corpus positions, and
 * compared with the positions delivered by a CL position stream for the
 * same ID. Any mismatch or read failure is reported and followed by a call
 * to compressrdx_cleanup(1).
 *
 * This function assumes that compress_reversed_index() has been called
 * beforehand and made sure that the _uncompressed_ index is used by CL
 * access functions.
 *
 * @param attr      The attribute to check the index of.
 * @param output_fn Base name for the compressed RDX files to be read
 *                  (if this is null, filenames will be taken from the
 *                  attribute).
 */
void
decompress_check_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];
  int data_len, index_len;      /* snprintf() results, used to detect truncation */

  int nr_elements;
  int element_freq;
  int corpus_size;
  int pos, gap;

  int b;
  int i, k;

  BFile data_file;
  FILE *index_file;

  PositionStream PStream;
  int true_pos;


  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  /* build both file names with bounded writes; a name that does not fit
   * is reported instead of overflowing the fixed-size buffers */
  if (output_fn) {
    data_len  = snprintf(data_fname, sizeof(data_fname), "%s.crc", output_fn);
    index_len = snprintf(index_fname, sizeof(index_fname), "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    data_len = snprintf(data_fname, sizeof(data_fname), "%s", s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    index_len = snprintf(index_fname, sizeof(index_fname), "%s", s);
  }

  if ((data_len < 0) || ((size_t)data_len >= sizeof(data_fname)) ||
      (index_len < 0) || ((size_t)index_len >= sizeof(index_fname))) {
    Rprintf( "ERROR: file name for compressed index of %s.%s is too long\n",
            corpus_id_cwb_compress_rdx, attr->any.name);
    compressrdx_cleanup(1);
  }

  if (! BFopen(data_fname, "r", &data_file)) {
    Rprintf( "ERROR: can't open file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index from %s\n", data_fname);

  /* the offsets file holds binary integers (the compressor writes it in
   * "wb" mode), so open it in binary mode as well; note that its contents
   * are not read by this function */
  if ((index_file = fopen(index_fname, "rb")) == NULL) {
    Rprintf( "ERROR: can't open file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index offsets from %s\n", index_fname);


  for (i = 0; i < nr_elements; i++) {

    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    /* same coding parameter as used by compress_reversed_index() */
    b = compute_ba(element_freq, corpus_size);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);

    /* positions are stored as gaps relative to the previous occurrence,
     * starting from 0 for each ID */
    pos = 0;
    for (k = 0; k < element_freq; k++) {

      gap = read_golomb_code_bf(b, &data_file);
      pos += gap;

      if (1 != cl_read_stream(PStream, &true_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }
      if (pos != true_pos) {
        Rprintf( "ERROR: wrong occurrence of token #%d at cpos %d (correct cpos: %d). Aborted.\n",
              i, pos, true_pos);
        compressrdx_cleanup(1);
      }

    }

    cl_delete_stream(&PStream);
    /* mirrors the BFflush() issued after each ID by the compressor;
     * presumably realigns the bit stream -- TODO confirm against BFile docs */
    BFflush(&data_file);
  }

  fclose(index_file);
  BFclose(&data_file);

  /* tell the user it's safe to delete the REVCORP and REVCIDX components now */
  Rprintf("!! You can delete the file <%s> now.\n",
         component_full_name(attr, CompRevCorpus, NULL));
  Rprintf("!! You can delete the file <%s> now.\n",
         component_full_name(attr, CompRevCorpusIdx, NULL));

  return;
}

Exemple #10

0

Afficher le fichier

Fichier : cwb-compress-rdx.c Projet : rforge/rcwb

/**
 * Compresses the reversed index of a p-attribute.
 *
 * For every lexicon ID, the corpus positions delivered by a CL position
 * stream are converted to gaps (difference to the previous position,
 * starting from 0) and written as Golomb codes to the data file. Before
 * each ID's codes, the current position in the data file is written to the
 * offsets (index) file. Errors are reported and followed by a call to
 * compressrdx_cleanup(1).
 *
 * @param attr      The attribute to compress the index of.
 * @param output_fn Base name for the compressed RDX files to be written
 *                  (if this is null, filenames will be taken from the
 *                  attribute).
 */
void
compress_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];
  int data_len, index_len;      /* snprintf() results, used to detect truncation */

  int nr_elements;
  int element_freq;
  int corpus_size;
  int last_pos, gap, fpos;

  int b;

  int i, k;

  BFile data_file;
  FILE *index_file = NULL;
  int index_write_failed;

  PositionStream PStream;
  int new_pos;


  Rprintf("COMPRESSING INDEX of %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  /* ensure that we do NOT use the compressed index while building the
   * compressed index (yeah, a nasty thing that). That is, load the
   * .corpus.rev and .corpus.rdx components in order to force
   * subsequent CL calls to use the uncompressed data.
   */
  if (ensure_component(attr, CompRevCorpus, 0) == NULL) {
    Rprintf( "Index compression requires the REVCORP component\n");
    compressrdx_cleanup(1);
  }

  if (ensure_component(attr, CompRevCorpusIdx, 0) == NULL) {
    Rprintf( "Index compression requires the REVCIDX component\n");
    compressrdx_cleanup(1);
  }

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  /* build both file names with bounded writes; a name that does not fit
   * is reported instead of overflowing the fixed-size buffers */
  if (output_fn) {
    data_len  = snprintf(data_fname, sizeof(data_fname), "%s.crc", output_fn);
    index_len = snprintf(index_fname, sizeof(index_fname), "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    data_len = snprintf(data_fname, sizeof(data_fname), "%s", s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    index_len = snprintf(index_fname, sizeof(index_fname), "%s", s);
  }

  if ((data_len < 0) || ((size_t)data_len >= sizeof(data_fname)) ||
      (index_len < 0) || ((size_t)index_len >= sizeof(index_fname))) {
    Rprintf( "ERROR: file name for compressed index of %s.%s is too long\n",
            corpus_id_cwb_compress_rdx, attr->any.name);
    compressrdx_cleanup(1);
  }

  if (! BFopen(data_fname, "w", &data_file)) {
    Rprintf( "ERROR: can't create file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index to %s\n", data_fname);

  if ((index_file = fopen(index_fname, "wb")) == NULL) {
    Rprintf( "ERROR: can't create file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index offsets to %s\n", index_fname);

  for (i = 0; i < nr_elements; i++) {

    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    b = compute_ba(element_freq, corpus_size);

    /* record where this ID's codes start in the data file */
    fpos = BFposition(&data_file);
    NwriteInt(fpos, index_file);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);

    /* positions are encoded as gaps to the previous occurrence of the ID */
    last_pos = 0;
    for (k = 0; k < element_freq; k++) {
      if (1 != cl_read_stream(PStream, &new_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }

      gap = new_pos - last_pos;
      last_pos = new_pos;

      if (debug_cwb_compress_rdx)
        fprintf(debug_output, "%8d:  gap=%4d, b=%4d\n", codepos, gap, b);

      write_golomb_code(gap, b, &data_file);
      codepos++;            /* running code counter defined outside this function */
    }

    cl_delete_stream(&PStream);
    BFflush(&data_file);
  }

  /* a failed write to the offsets file would yield an unusable index, so
   * check the stream's error flag and the result of the final flush */
  index_write_failed = (ferror(index_file) != 0);
  if (fclose(index_file) != 0)
    index_write_failed = 1;
  BFclose(&data_file);

  if (index_write_failed) {
    Rprintf( "ERROR: failed to write file %s\n", index_fname);
    compressrdx_cleanup(1);
  }

  return;
}

Exemple #11

0

Afficher le fichier

Fichier : cwb-align-encode.c Projet : cran/rcqp

/**
 * Reports an error and calls rcqp_receive_error(1) if an snprintf() result
 * shows that a pathname did not fit into its buffer.
 *
 * @param len      Return value of the snprintf() call.
 * @param bufsize  Size of the destination buffer in bytes.
 * @param what     Short description of the file (used in the message).
 */
static void
alignencode_require_fit(int len, size_t bufsize, const char *what)
{
  if ((len < 0) || ((size_t)len >= bufsize)) {
    Rprintf( "%s: pathname of %s file is too long\n", progname, what);
    rcqp_receive_error(1);
  }
}

/**
 * Main function for cwb-align-encode.
 *
 * Reads an alignment file in .align format (files ending in .gz are read
 * through a gzip pipe), checks the header and the ordering of the source
 * regions, and writes the alignment regions to an .alx file (plus an .alg
 * file when compatibility mode is enabled).
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       0 on success; error paths print a message and call
 *               rcqp_receive_error(1).
 */
int
main(int argc, char *argv[])
{
  int argindex;                         /* index of first argument in argv[] */

  char *align_name = NULL;              /* name of the .align file */
  FILE *af = NULL;                      /* alignment file handle */
  int af_is_pipe;                       /* need to know whether to call fclose() or pclose() */
  char alx_name[CL_MAX_LINE_LENGTH];    /* full pathname of .alx file */
  char alg_name[CL_MAX_LINE_LENGTH];    /* full pathname of optional .alg file */
  FILE *alx=NULL, *alg=NULL;            /* file handles for .alx and optional .alg file */

  char line[CL_MAX_LINE_LENGTH];        /* one line of input from <infile> */

  /* The header tokens are scanned from `line` with an unbounded %s, so each
   * buffer is sized like `line`: no token of the line can overflow it. The
   * CL_MAX_FILENAME_LENGTH limit is then enforced explicitly below. */
  char corpus1_name[CL_MAX_LINE_LENGTH];
  char corpus2_name[CL_MAX_LINE_LENGTH];
  char s1_name[CL_MAX_LINE_LENGTH];
  char s2_name[CL_MAX_LINE_LENGTH];
  Corpus *corpus1, *corpus2;            /* corpus handles */
  Attribute *w1, *w2;                   /* attribute handles for 'word' attributes; used to determine corpus size */
  int size1, size2;                     /* size of source & target corpus */

  Corpus *source_corpus;                /* encode alignment in this corpus (depends on -R flag, important for -D option) */
  char *source_corpus_name;             /* just for error messages */
  char *attribute_name;                 /* name of alignment attribute (depends on -R flag, must be lowercase) */

  int f1,l1,f2,l2;                      /* alignment regions */
  int current1, current2;
  int mark, n_0_1, n_1_0;

  int l;
  int len;                              /* snprintf() result */

  progname = argv[0];

  /* parse command line and read arguments */
  argindex = alignencode_parse_args(argc, argv, 1);
  align_name = argv[argindex];

  /* open alignment file and parse header; .gz files are automatically decompressed */
  af_is_pipe = 0;
  l = strlen(align_name);
  if ((l > 3) && (strncasecmp(align_name + l - 3, ".gz", 3) == 0)) {
    /* "gzip -cd " is 9 characters, plus the file name, plus the terminating NUL */
    char *pipe_cmd = (char *) cl_malloc(l+10);
    /* NOTE(review): align_name is inserted unquoted into a shell command, so
     * file names containing spaces or shell metacharacters are not handled */
    sprintf(pipe_cmd, "gzip -cd %s", align_name); /* write .gz file through gzip pipe */
    af = popen(pipe_cmd, "r");
    if (af == NULL) {
      perror(pipe_cmd);
      Rprintf( "%s: can't read compressed file %s\n", progname, align_name);
      rcqp_receive_error(1);
    }
    af_is_pipe = 1;
    cl_free(pipe_cmd);
  }
  else {
    af = fopen(align_name, "r");
    if (af == NULL) {
      perror(align_name);
      Rprintf( "%s: can't read file %s\n", progname, align_name);
      rcqp_receive_error(1);
    }
  }

  /* read header = first line; an empty file or a read error must not be
   * passed on to sscanf(), since `line` would then be uninitialised */
  if (NULL == fgets(line, CL_MAX_LINE_LENGTH, af)) {
    Rprintf( "%s: %s not in .align format\n", progname, align_name);
    Rprintf( "missing header line\n");
    rcqp_receive_error(1);
  }
  if (4 != sscanf(line, "%s %s %s %s", corpus1_name, s1_name, corpus2_name, s2_name)) {
    Rprintf( "%s: %s not in .align format\n", progname, align_name);
    Rprintf( "wrong header: %s", line);
    rcqp_receive_error(1);
  }
  if ((strlen(corpus1_name) >= CL_MAX_FILENAME_LENGTH) ||
      (strlen(s1_name)      >= CL_MAX_FILENAME_LENGTH) ||
      (strlen(corpus2_name) >= CL_MAX_FILENAME_LENGTH) ||
      (strlen(s2_name)      >= CL_MAX_FILENAME_LENGTH)) {
    Rprintf( "%s: %s not in .align format\n", progname, align_name);
    Rprintf( "name in header is too long: %s", line);
    rcqp_receive_error(1);
  }
  if (verbose) {
    if (reverse)
      Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus2_name, corpus1_name, align_name);
    else
      Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus1_name, corpus2_name, align_name);
  }

  /* open corpora and determine their sizes (for validity checks and compatibility mode) */
  if (NULL == (corpus1 = cl_new_corpus(registry_dir, corpus1_name))) {
    Rprintf( "%s: can't open corpus %s\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  if (NULL == (corpus2 = cl_new_corpus(registry_dir, corpus2_name))) {
    Rprintf( "%s: can't open corpus %s\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }
  if (NULL == (w1 = cl_new_attribute(corpus1, "word", ATT_POS))) {
    Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  if (NULL == (w2 = cl_new_attribute(corpus2, "word", ATT_POS))) {
    Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }

  size1 = cl_max_cpos(w1);
  if (size1 <= 0) {
    Rprintf( "%s: data access error (%s.word)\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  size2 = cl_max_cpos(w2);
  if (size2 <= 0) {
    Rprintf( "%s: data access error (%s.word)\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }

  /* now work out the actual source corpus and the alignment attribute name (depending on -R flag) */
  source_corpus = (reverse) ? corpus2 : corpus1;
  source_corpus_name = (reverse) ? corpus2_name : corpus1_name;
  attribute_name = cl_strdup((reverse) ? corpus1_name : corpus2_name);
  cl_id_tolower(attribute_name); /* fold attribute name to lowercase */

  /* with -D option, determine data file name(s) from actual source corpus;
     otherwise use directory specified with -d and the usual naming conventions */
  if (data_dir_from_corpus) {
    Attribute *alignment = cl_new_attribute(source_corpus, attribute_name, ATT_ALIGN);
    char *comp_pathname;

    if (alignment == NULL) {
      Rprintf( "%s: alignment attribute %s.%s not declared in registry file\n",
              progname, source_corpus_name, attribute_name);
      rcqp_receive_error(1);
    }
    comp_pathname = component_full_name(alignment, CompXAlignData, NULL);
    if (comp_pathname == NULL) {
      Rprintf( "%s: can't determine pathname for .alx file (internal error)\n", progname);
      rcqp_receive_error(1);
    }
    /* need to copy because component_full_name() returns pointer to internal buffer */
    len = snprintf(alx_name, sizeof(alx_name), "%s", comp_pathname);
    alignencode_require_fit(len, sizeof(alx_name), ".alx");
    if (compatibility) {
      comp_pathname = component_full_name(alignment, CompAlignData, NULL);
      if (comp_pathname == NULL) {
        Rprintf( "%s: can't determine pathname for .alg file (internal error)\n", progname);
        rcqp_receive_error(1);
      }
      len = snprintf(alg_name, sizeof(alg_name), "%s", comp_pathname);
      alignencode_require_fit(len, sizeof(alg_name), ".alg");
    }
  }
  else {
    len = snprintf(alx_name, sizeof(alx_name), "%s" SUBDIR_SEP_STRING "%s.alx", data_dir, attribute_name);
    alignencode_require_fit(len, sizeof(alx_name), ".alx");
    if (compatibility) {
      len = snprintf(alg_name, sizeof(alg_name), "%s" SUBDIR_SEP_STRING "%s.alg", data_dir, attribute_name);
      alignencode_require_fit(len, sizeof(alg_name), ".alg");
    }
  }

  /* now open output file(s) */
  alx = fopen(alx_name, "wb");
  if (alx == NULL) {
    perror(alx_name);
    Rprintf( "%s: can't write file %s\n", progname, alx_name);
    rcqp_receive_error(1);
  }
  if (verbose)
    Rprintf("Writing file %s ...\n", alx_name);

  if (compatibility) {
    alg = fopen(alg_name, "wb");
    if (alg == NULL) {
      perror(alg_name);
      Rprintf( "%s: can't write file %s\n", progname, alg_name);
      rcqp_receive_error(1);
    }

    if (verbose)
      Rprintf("Writing file %s ...\n", alg_name);
  }

  /* main encoding loop */
  f1 = f2 = l1 = l2 = 0;
  mark = -1;                        /* check that regions occur in ascending order */
  current1 = current2 = -1;         /* for compatibility mode */
  n_0_1 = n_1_0 = 0;                /* number of 0:1 and 1:0 alignments, which are skipped */

  /* loop ends at end of file (or on a read error, which we choose to ignore) */
  while (NULL != fgets(line, CL_MAX_LINE_LENGTH, af)) {
    if (4 != sscanf(line, "%d %d %d %d", &f1, &l1, &f2, &l2)) {
      Rprintf( "%s: input format error: %s", progname, line);
      rcqp_receive_error(1);
    }

    /* skip 0:1 and 1:0 alignments */
    if (l1 < f1) {
      n_0_1++; continue;
    }
    if (l2 < f2) {
      n_1_0++; continue;
    }

    /* check that source regions are non-overlapping and in ascending order;
     * with -R the source region is [f2, l2], so report that one */
    if (((reverse) ? f2 : f1) <= mark) {
      Rprintf( "%s: source regions of alignment must be in ascending order\n", progname);
      Rprintf( "Last region was [*, %d]; current is [%d, %d].\n",
              mark, (reverse) ? f2 : f1, (reverse) ? l2 : l1);
      Rprintf( "Aborted.\n");
      rcqp_receive_error(1);
    }
    mark = (reverse) ? l2 : l1;

    /* write alignment region to .alx file */
    if (reverse) {
      NwriteInt(f2, alx); NwriteInt(l2, alx);
      NwriteInt(f1, alx); NwriteInt(l1, alx);
    }
    else {
      NwriteInt(f1, alx); NwriteInt(l1, alx);
      NwriteInt(f2, alx); NwriteInt(l2, alx);
    }

    if (compatibility) {
      /* source and target regions of .alg file must be contiguous; store start points only; */
      /* hence we must collapse crossing alignments into one larger region */
      if ((f1 > current1) && (f2 > current2)) {
        if (reverse) {
          NwriteInt(f2, alg); NwriteInt(f1, alg);
        }
        else {
          NwriteInt(f1, alg); NwriteInt(f2, alg);
        }
        current1 = f1;
        current2 = f2;
      }
    }
  }
  if (compatibility) {
    if (reverse) {
      NwriteInt(size2, alg); NwriteInt(size1, alg); /* end of corpus alignment point*/
    }
    else {
      NwriteInt(size1, alg); NwriteInt(size2, alg); /* end of corpus alignment point*/
    }
  }

  if (verbose) {
    Rprintf("I skipped %d 0:1 alignments and %d 1:0 alignments.\n", n_0_1, n_1_0);
  }

  /* that's it; close file handles. For the output files a failing fclose()
   * means buffered data could not be written, so report it. */
  if (fclose(alx) != 0) {
    perror(alx_name);
    Rprintf( "%s: error writing file %s\n", progname, alx_name);
    rcqp_receive_error(1);
  }
  if (compatibility) {
    if (fclose(alg) != 0) {
      perror(alg_name);
      Rprintf( "%s: error writing file %s\n", progname, alg_name);
      rcqp_receive_error(1);
    }
  }

  if (af_is_pipe)
    pclose(af);
  else
    fclose(af);

  return 0;
}

Exemple #12

0

Afficher le fichier

Fichier : matchlist.c Projet : rforge/rcwb

/**
 * Perform "operation" on the two match lists (can be initial).
 *
 * The result is assigned to list1.
 *
 *
 * this whole code is WRONG when one of the matchlists is inverted
 * TODO!
 *
 * Also TODO: give it a better name.
 *
 * This contains, by far, most of the code in the Matchlist module.
 */
int
Setop(Matchlist *list1, MLSetOp operation, Matchlist *list2)
{
  int i, j, k, t, ins;
  Matchlist tmp;
  Attribute *attr;

  switch (operation) {

  case Union:

    /*
     * -------------------- UNION
     */

    /*
     * TODO:
     * optimize in case 
     *   (list1->matches_whole_corpus && list2->matches_whole_corpus)
     */
    
    if (list2->start == NULL)

      if (list2->is_inverted) {
        /* l2 is empty, but inverted, so the result is the whole corpus,
         * as in l2. */
        return Setop(list1, Identity, list2);
      }
      else 
        /* the result is list1, so just return */
        return 1;

    else if (list1->start == NULL)

      if (list1->is_inverted)
        /* empty, but inverted --> whole corpus, l1 */
        return 1;
      else 
        /* the result is in list2, so return a copy */
        return Setop(list1, Identity, list2);

    else if (list1->is_inverted && list2->is_inverted) {

      /* union of 2 inverted lists is the inverted intersection */

      list1->is_inverted = 0; list2->is_inverted = 0;
      Setop(list1, Intersection, list2);
      list1->is_inverted = 1;
      
    }
    else {

      if (list1->is_inverted) {
        list1->is_inverted = 0;
        Setop(list1, Complement, NULL);
      }
      if (list2->is_inverted) {
        list2->is_inverted = 0;
        Setop(list2, Complement, NULL);
      }

      tmp.tabsize = list1->tabsize + list2->tabsize;

      tmp.start = (int *)cl_malloc(sizeof(int) * tmp.tabsize);

      if (list1->end && list2->end)
        tmp.end   = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      else
        tmp.end = NULL;

      if (list1->target_positions && list2->target_positions)
        tmp.target_positions = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      else
        tmp.target_positions = NULL;





      i = 0;                        /* the position in list1 */
      j = 0;                        /* the position in list2 */
      k = 0;                        /* the insertion point in the result list `tmp' */


      while ((i < list1->tabsize) || (j < list2->tabsize))

        if ((i < list1->tabsize) && (list1->start[i] == -1))
          i++;
        else if ((j < list2->tabsize) && (list2->start[j] == -1))
          j++;
        else if ((j >= list2->tabsize) ||
            ((i < list1->tabsize) && (list1->start[i] < list2->start[j]))) {

          /* copy (remaining) item from list1 */

          tmp.start[k] = list1->start[i];

          if (tmp.end)
            tmp.end[k] = list1->end[i];

          if (tmp.target_positions)
            tmp.target_positions[k] = list1->target_positions[i];

          k++;
          i++;

        }
        else if ((i >= list1->tabsize) ||
                 ((j < list2->tabsize) && (list1->start[i] > list2->start[j]))) {

          /* copy (remaining) item from list2 */
          tmp.start[k] = list2->start[j];

          if (tmp.end)
            tmp.end[k] = list2->end[j];

          if (tmp.target_positions)
            tmp.target_positions[k] = list2->target_positions[j];

          k++;
          j++;

        }
        else {

          /* both start positions are identical. Now check whether the end
           * positions are also the same => the ranges are identical and
           * the duplicate is to be eliminated.
           */

          tmp.start[k] = list1->start[i];

          if ((tmp.end == NULL) || (list1->end[i] == list2->end[j])) {

            /* real duplicate, copy once */

            if (tmp.end)
              tmp.end[k]   = list1->end[i];

            if (tmp.target_positions)
              tmp.target_positions[k]   = list1->target_positions[i];

            i++;
            j++;

          }
          else {

            /*
             * we have existing, non-equal end positions. copy the smaller one.
             */

            if (list1->end[i] < list2->end[j]) {
              tmp.end[k]   = list1->end[i];

              if (tmp.target_positions)
                tmp.target_positions[k] = list1->target_positions[i];

              i++;
            }
            else {
              tmp.end[k]   = list2->end[j];

              if (tmp.target_positions)
                tmp.target_positions[k] = list2->target_positions[j];

              j++;
            }

          }
          k++;
        }

      assert(k <= tmp.tabsize);

      /* we did not eliminate any duplicates if k==tmp.tabsize. 
       * So, in that case, we do not have to bother with reallocs.
       */
      
      if (k < tmp.tabsize) {
        tmp.start = (int *)cl_realloc((char *)tmp.start, sizeof(int) * k);
        if (tmp.end)
          tmp.end = (int *)cl_realloc((char *)tmp.end, sizeof(int) * k);
        if (tmp.target_positions)
          tmp.target_positions = (int *)cl_realloc((char *)tmp.target_positions, sizeof(int) * k);
      }

      cl_free(list1->start);
      cl_free(list1->end);
      cl_free(list1->target_positions);

      list1->start = tmp.start; tmp.start = NULL;
      list1->end   = tmp.end;   tmp.end = NULL;
      list1->target_positions = tmp.target_positions;   tmp.target_positions = NULL;
      list1->tabsize = k;
      list1->matches_whole_corpus = 0;
      list1->is_inverted = 0;
    }

    break;

  case Intersection:

    /*
     * -------------------- INTERSECTION
     */

    if (list1->tabsize == 0 && list1->is_inverted)

      /* l1 matches whole corpus, so intersection is equal to l2 */
      return Setop(list1, Identity, list2);

    else if (list2->tabsize == 0 && list2->is_inverted)
      /* l2 matches whole corpus, so intersection is equal to l1 */
      return 1;

    else if ((list1->tabsize == 0) || (list2->tabsize == 0)) {

      /*
       * Bingo. one of the two is empty AND NOT INVERTED. So
       * the intersection is also empty.
       */
      
      cl_free(list1->start);
      cl_free(list1->end);
      cl_free(list1->target_positions);
      list1->tabsize = 0;
      list1->matches_whole_corpus = 0;
      list1->is_inverted = 0;
      
    }
    else if (list1->is_inverted && list2->is_inverted) {
      
      /* intersection of 2 inverted lists is the inverted union */

      list1->is_inverted = 0; list2->is_inverted = 0;
      Setop(list1, Union, list2);
      list1->is_inverted = 1;
      
    }
    else {

      /*
       * Two non-empty lists. ONE of both may be inverted.
       * We have to do some work then
       */

      if (list1->is_inverted)
        tmp.tabsize = list2->tabsize;
      else if (list2->is_inverted)
        tmp.tabsize = list1->tabsize;
      else
        tmp.tabsize = MIN(list1->tabsize, list2->tabsize);

      tmp.start = (int *)cl_malloc(sizeof(int) * tmp.tabsize);

      if (list1->end && list2->end)
        tmp.end   = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      else
        tmp.end = NULL;

      if (list1->target_positions && list2->target_positions)
        tmp.target_positions = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      else
        tmp.target_positions = NULL;


      i = 0;                        /* the position in list1 */
      j = 0;                        /* the position in list2 */
      k = 0;                        /* the insertion point in the result list */

      while ((i < list1->tabsize) && (j < list2->tabsize))

        if (list1->start[i] < list2->start[j])
          i++;
        else if (list1->start[i] > list2->start[j])
          j++;
        else {

          /* both start positions are identical. Now check whether the end
           * positions are also the same => the ranges are identical and
           * one version is to be copied.
           */

          if ((tmp.end == NULL) || (list1->end[i] == list2->end[j])) {

            /* real duplicate, copy once */

            tmp.start[k] = list1->start[i];

            if (tmp.end)
              tmp.end[k]   = list1->end[i];

            if (tmp.target_positions)
              tmp.target_positions[k]   = list1->target_positions[i];

            i++;
            j++;
            k++;
          }
          else {

            /*
             * we have existing, non-equal end positions. Advance on
             * list with the smaller element.
             */

            if (list1->end[i] < list2->end[j])
              i++;
            else
              j++;
          }
        }

      assert(k <= tmp.tabsize);

      if (k == 0) {
        /* we did not copy anything. result is empty. */
        cl_free(tmp.start); tmp.start = NULL;
        cl_free(tmp.end);   tmp.end   = NULL;
        cl_free(tmp.target_positions); tmp.target_positions = NULL;
      }
      else if (k < tmp.tabsize) {

        /* we did not eliminate any duplicates if k==tmp.tabsize.
         * So, in that case, we do not have to bother with reallocs.
         */
      
        tmp.start = (int *)cl_realloc((char *)tmp.start, sizeof(int) * k);
        if (tmp.end)
          tmp.end = (int *)cl_realloc((char *)tmp.end, sizeof(int) * k);
        if (tmp.target_positions)
          tmp.target_positions = (int *)cl_realloc((char *)tmp.target_positions, sizeof(int) * k);
      }

      cl_free(list1->start);
      cl_free(list1->end);
      cl_free(list1->target_positions);

      list1->start = tmp.start; tmp.start = NULL;
      list1->end   = tmp.end;   tmp.end = NULL;

      list1->target_positions   = tmp.target_positions;
      tmp.target_positions = NULL;

      list1->tabsize = k;
      list1->matches_whole_corpus = 0;
      list1->is_inverted = 0;
    }

    break;

  case Complement:

    /*
     * -------------------- COMPLEMENT 
     * in that case. ML2 should be empty. We suppose it is.
     */

    /* 
     * what the hell is the complement of a non-initial matchlist?
     * I simply do not know. so do it only for initial ones.
     */
    
    if (list1->end) {
      fprintf(stderr, "Can't calculate complement for non-initial matchlist.\n");
      return 0;
    }

    /* we could always make the complement by toggling the inversion flag,
     * but we only do that in case the list is inverted, otherwise we would
     * need another function to physically make the complement
     */

    if (list1->is_inverted) {
      list1->is_inverted = 0; 
      return 1;
    }

    if (!evalenv) {
      fprintf(stderr, "Can't calculate complement with NULL eval env\n");
      return 0;
    }

    if (!evalenv->query_corpus) {
      fprintf(stderr, "Can't calculate complement with NULL query_corpus.\n");
      return 0;
    }

    if (!access_corpus(evalenv->query_corpus)) {
      fprintf(stderr, "Complement: can't access current corpus.\n");
      return 0;
    }

    /* 
     * OK. The tests went by. Now, the size of the new ML is the 
     * size of the corpus MINUS the size of the current matchlist.
     */

    if ((attr = find_attribute(evalenv->query_corpus->corpus,
                               DEFAULT_ATT_NAME, ATT_POS, NULL)) == NULL) {
      fprintf(stderr, "Complement: can't find %s attribute of current corpus\n",
              DEFAULT_ATT_NAME);
      return 0;
    }

    i = cl_max_cpos(attr);
    if (cl_errno != CDA_OK) {
      fprintf(stderr, "Complement: can't get attribute size\n");
      return 0;
    }

    tmp.tabsize = i - list1->tabsize;

    if (tmp.tabsize == 0) {

      /*
       * Best case. Result is empty.
       */

      cl_free(list1->start);
      cl_free(list1->end);
      cl_free(list1->target_positions);
      list1->matches_whole_corpus = 0;
      list1->tabsize = 0;
      list1->is_inverted = 0;
    }
    else if (tmp.tabsize == i) {

      /*
       * Worst case. 
       * result is a copy of the corpus.
       *
       * TODO: This is not true if we have -1 elements in the source list.
       *
       */

      cl_free(list1->start);
      cl_free(list1->end);
      cl_free(list1->target_positions);

      list1->start = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      list1->tabsize = tmp.tabsize;
      list1->matches_whole_corpus = 1;
      list1->is_inverted = 0;

      for (i = 0; i < tmp.tabsize; i++)
        list1->start[i] = i;
    }
    else {

      /*
       * in between.
       */
      
      tmp.start = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      tmp.end = NULL;
      tmp.target_positions = NULL;
      tmp.matches_whole_corpus = 0;

      j = 0;                        /* index in source list */
      t = 0;                        /* index in target list */
      for (k = 0; k < i; k++) {
        if ((j >= list1->tabsize) || (k < list1->start[j])) {
          tmp.start[t] = k;
          t++;
        }
        else if (k == list1->start[j]) {
          j++;
        }
        else /* (k > list1->start[j]) */ {
          assert("Error in Complement calculation routine" && 0);
        }
      }
      assert(t == tmp.tabsize);

      cl_free(list1->start);
      cl_free(list1->end);
      cl_free(list1->target_positions);

      list1->start = tmp.start; tmp.start = NULL;
      list1->end   = tmp.end;   tmp.end = NULL;
      list1->tabsize = tmp.tabsize;
      list1->matches_whole_corpus = 0;
      list1->is_inverted = 0;
    }


    break;

  case Identity:

    /* 
     * -------------------- IDENTITY
     * create a copy of ML2 into ML1
     */

    free_matchlist(list1);

    list1->tabsize = list2->tabsize;
    list1->matches_whole_corpus = list2->matches_whole_corpus;
    list1->is_inverted = list2->is_inverted;

    if (list2->start) {
      list1->start = (int *)cl_malloc(sizeof(int) * list2->tabsize);
      memcpy((char *)list1->start, (char *)list2->start, sizeof(int) * list2->tabsize);
    }

    if (list2->end) {
      list1->end = (int *)cl_malloc(sizeof(int) * list2->tabsize);
      memcpy((char *)list1->end, (char *)list2->end, sizeof(int) * list2->tabsize);
    }

    if (list2->target_positions) {
      list1->target_positions = (int *)cl_malloc(sizeof(int) * list2->tabsize);
      memcpy((char *)list1->target_positions,
             (char *)list2->target_positions, sizeof(int) * list2->tabsize);
    }

    break;

  case Uniq:

    /* 
     * -------------------- UNIQ
     * create a unique version of ML1
     * working destructively on list1
     */

    if (list1->start && (list1->tabsize > 0)) {

      ins = 0;                        /* the insertion point */

      if (list1->end)

        for (i = 0; i < list1->tabsize; i++) {

          if ((ins == 0) ||
              ((list1->start[i] != list1->start[ins-1]) ||
               (list1->end[i] != list1->end[ins-1]))) {

            /* copy the data from the current position
             * down to the insertion point.
             */

            list1->start[ins] = list1->start[i];
            list1->end[ins]   = list1->end[i];
            if (list1->target_positions)
              list1->target_positions[ins]   = list1->target_positions[i];
            ins++;
          }
        }
      else
        for (i = 0; i < list1->tabsize; i++) {
          if ((ins == 0) || (list1->start[i] != list1->start[ins-1])) {

            /* copy the data from the current position
             * down to the insertion point.
             */

            list1->start[ins] = list1->start[i];
            if (list1->target_positions)
              list1->target_positions[ins]   = list1->target_positions[i];
            ins++;
          }
        }
    
      if (ins != list1->tabsize) {

        /*
         * If ins == tabsize, no elements were deleted from the list and
         * nothing has to be done. Otherwise (this branch), the list was
         * compacted in place, so shrink the arrays to free the unused space.
         */

        list1->start = (int *)cl_realloc(list1->start, sizeof(int) * ins);
        if (list1->end)
          list1->end = (int *)cl_realloc(list1->end,   sizeof(int) * ins);
        if (list1->target_positions)
          list1->target_positions = (int *)cl_realloc(list1->target_positions,   sizeof(int) * ins);
        list1->tabsize = ins;
        list1->matches_whole_corpus = 0;
        list1->is_inverted = 0;
      }
    }

    break;

  case Reduce:

    if ((list1->start) && (list1->tabsize > 0)) {
    
      ins = 0;

      /* For the sake of efficiency, we distinguish here between
       * initial matchlists and non-initial matchlists. Two almost
       * identical loops follow; the test for an initial matchlist is
       * done once here, outside the loops, instead of inside them. */
      
      if (list1->end)

        for (i = 0; i < list1->tabsize; i++) {

          if (list1->start[i] != -1) {

            /* copy the data from the current position
             * down to the insertion point.
             */

            if (i != ins) {
              list1->start[ins] = list1->start[i];
              list1->end[ins]   = list1->end[i];
              if (list1->target_positions)
                list1->target_positions[ins]   = list1->target_positions[i];
            }
            ins++;
          }
        }
      else
        for (i = 0; i < list1->tabsize; i++) {

          if (list1->start[i] != -1) {

            /* copy the data from the current position
             * down to the insertion point.
             */

            if (i != ins)
              list1->start[ins] = list1->start[i];
            if (list1->target_positions)
              list1->target_positions[ins]   = list1->target_positions[i];
            ins++;
          }
        }
      
      if (ins == 0) {

        /*
         * all elements have been deleted. So free the used space.
         */

        cl_free(list1->start);
        cl_free(list1->end);
        cl_free(list1->target_positions);
        list1->tabsize = 0;
        list1->matches_whole_corpus = 0;
        list1->is_inverted = 0;
      }
      else if (ins != list1->tabsize) {

        /*
         * If ins == tabsize, no elements were deleted from the list and
         * nothing has to be done. Otherwise (this branch), the list was
         * compacted in place, so shrink the arrays to free the unused space.
         */

        list1->start = (int *)cl_realloc(list1->start, sizeof(int) * ins);
        if (list1->end)
          list1->end = (int *)cl_realloc(list1->end,   sizeof(int) * ins);
        if (list1->target_positions)
          list1->target_positions = (int *)cl_realloc(list1->target_positions, sizeof(int) * ins);
        list1->tabsize = ins;
        list1->matches_whole_corpus = 0;
        list1->is_inverted = 0;
      }
    }
    break;
    
  default:
    assert("Illegal operator in Setop" && 0);
    return 0;
    break;
  }

  return 1;
}