/*
 * ------------------------------------------------------------------------
 *
 * "rcqpCmd_attribute_size(SEXP inAttribute)" --
 *
 * Looks up the attribute named by inAttribute, trying positional,
 * structural and alignment attributes in that order, and returns the size
 * reported by cl_max_cpos(), cl_max_struc() or cl_max_alg() respectively
 * as an integer vector of length 1.
 *
 * inAttribute must be a character vector of length 1, otherwise an R error
 * is raised.
 *
 * A negative size is reported through rcqp_send_error() for positional and
 * alignment attributes.  For structural attributes a negative size is
 * returned as 0, matching do_cqi_cl_attribute_size() which deliberately
 * allows attributes with 0 regions.  If no attribute of any type is found,
 * rcqp_error_code(cqi_errno) is called.
 *
 * NOTE(review): rcqp_send_error() and rcqp_error_code() presumably raise an
 * R error and do not return -- confirm.  Should they return, R_NilValue is
 * handed back and the protection stack is still balanced.
 *
 * ------------------------------------------------------------------------
 */
SEXP rcqpCmd_attribute_size(SEXP inAttribute)
{
  SEXP result = R_NilValue;
  char *a;
  Attribute *attribute;
  int size = 0;
  int found = 0;
  int nprotected = 0;   /* number of objects currently on the protect stack */

  if (!isString(inAttribute) || length(inAttribute) != 1)
    error("argument 'attribute' must be a string");

  PROTECT(inAttribute);
  nprotected++;

  a = (char *)CHAR(STRING_ELT(inAttribute, 0));

  /* Need to try all possible attribute types */
  if ((attribute = cqi_lookup_attribute(a, ATT_POS)) != NULL) {
    size = cl_max_cpos(attribute);
    if (size < 0) {
      UNPROTECT(nprotected);
      nprotected = 0;
      rcqp_send_error();
    }
    else {
      found = 1;
    }
  }
  else if ((attribute = cqi_lookup_attribute(a, ATT_STRUC)) != NULL) {
    size = cl_max_struc(attribute);
    /* a negative value is treated as "0 regions", not as an error */
    if (size < 0)
      size = 0;
    found = 1;
  }
  else if ((attribute = cqi_lookup_attribute(a, ATT_ALIGN)) != NULL) {
    size = cl_max_alg(attribute);
    if (size < 0) {
      UNPROTECT(nprotected);
      nprotected = 0;
      rcqp_send_error();
    }
    else {
      found = 1;
    }
  }
  else {
    UNPROTECT(nprotected);
    nprotected = 0;
    rcqp_error_code(cqi_errno);   /* errno from the last lookup */
  }

  if (found) {
    result = PROTECT(allocVector(INTSXP, 1));
    nprotected++;
    INTEGER(result)[0] = size;
  }

  /* unprotect exactly what is still protected on this path */
  if (nprotected > 0)
    UNPROTECT(nprotected);

  return result;
}
void do_cqi_cl_attribute_size(void) { char *a; Attribute *attribute; int size; a = cqi_read_string(); /* need to try all possible attribute types */ if (server_debug) Rprintf( "CQi: CQI_CL_ATTRIBUTE_SIZE('%s')\n", a); attribute = cqi_lookup_attribute(a, ATT_POS); if (attribute != NULL) { size = cl_max_cpos(attribute); if (size < 0) { send_cl_error(); } else { cqi_data_int(size); } } else { attribute = cqi_lookup_attribute(a, ATT_STRUC); if (attribute != NULL) { size = cl_max_struc(attribute); if (size < 0) { /* send_cl_error(); */ /* current version of CL considers 0 regions a data access error condition, but we want to allow that */ cqi_data_int(0); } else { cqi_data_int(size); } } else { attribute = cqi_lookup_attribute(a, ATT_ALIGN); if (attribute != NULL) { size = cl_max_alg(attribute); if (size < 0) { send_cl_error(); } else { cqi_data_int(size); } } else { cqi_command(cqi_errno); /* return errno from the last lookup */ } } } free(a); }
/**
 * Prints per-attribute statistics for a corpus (output goes through Rprintf).
 *
 * One line is written for every attribute in the corpus' attribute list:
 * token and type counts for a P-attribute, the number of regions for an
 * S-attribute (flagged if it carries annotations), and the number of
 * alignment blocks for an A-attribute (flagged if extended).  Attributes
 * without accessible data are reported as "NO DATA"; attributes of any other
 * type are reported as unknown.  A blank line terminates the listing.
 *
 * @param corpus  The corpus to analyse.
 */
void
describecorpus_show_statistics (Corpus *corpus)
{
  Attribute *att = corpus->attributes;

  while (att != NULL) {
    if (att->any.type == ATT_POS) {
      int n_tokens, n_types;

      Rprintf("p-ATT %-16s ", att->any.name);
      n_tokens = cl_max_cpos(att);
      n_types = cl_max_id(att);
      if ((n_tokens <= 0) || (n_types <= 0))
        Rprintf(" NO DATA");
      else
        Rprintf("%10d tokens, %8d types", n_tokens, n_types);
    }
    else if (att->any.type == ATT_STRUC) {
      int n_regions;

      Rprintf("s-ATT %-16s ", att->any.name);
      n_regions = cl_max_struc(att);
      if (n_regions < 0) {
        Rprintf(" NO DATA");
      }
      else {
        Rprintf("%10d regions", n_regions);
        if (cl_struc_values(att))
          Rprintf(" (with annotations)");
      }
    }
    else if (att->any.type == ATT_ALIGN) {
      int n_blocks;

      Rprintf("a-ATT %-16s ", att->any.name);
      n_blocks = cl_max_alg(att);
      if (n_blocks < 0) {
        Rprintf(" NO DATA");
      }
      else {
        Rprintf("%10d alignment blocks", n_blocks);
        if (cl_has_extended_alignment(att))
          Rprintf(" (extended)");
      }
    }
    else {
      Rprintf("??? %-16s (unknown attribute type)", att->any.name);
    }

    Rprintf("\n");
    att = att->any.next;
  }

  Rprintf("\n");
}
/**
 * Prints a summary of a corpus (output goes through Rprintf).
 *
 * The summary lists the corpus description, registry file, home directory
 * and info file, the corpus size in tokens (taken from the 'word'
 * attribute), and the number of positional, structural and alignment
 * attributes.
 *
 * @param corpus                The corpus to report on.
 * @param with_attribute_names  Boolean: iff true, each attribute count is
 *                              followed by the names of those attributes.
 */
void
describecorpus_show_basic_info (Corpus *corpus, int with_attribute_names)
{
  const char *suffix = with_attribute_names ? ":" : "";
  Attribute *word_att;
  Attribute *att;
  int n_tokens;
  int n_pos = 0;
  int n_struc = 0;
  int n_align = 0;

  Rprintf("description: %s\n", corpus->name);
  Rprintf("registry file: %s/%s\n", corpus->registry_dir, corpus->registry_name);
  Rprintf("home directory: %s/\n", corpus->path);
  Rprintf("info file: %s\n", (corpus->info_file) ? corpus->info_file : "(none)");

  /* the corpus size is defined by the 'word' attribute */
  word_att = cl_new_attribute(corpus, "word", ATT_POS);
  if (word_att == NULL) {
    Rprintf( "ERROR: 'word' attribute is missing. Aborted.\n");
    rcqp_receive_error(1);
  }
  n_tokens = cl_max_cpos(word_att);
  Rprintf("size (tokens): ");
  if (n_tokens < 0)
    Rprintf("ERROR\n");
  else
    Rprintf("%d\n", n_tokens);
  Rprintf("\n");

  /* tally the attributes by type; unknown types are not counted */
  for (att = corpus->attributes; att != NULL; att = att->any.next) {
    if (att->any.type == ATT_POS)
      n_pos++;
    else if (att->any.type == ATT_STRUC)
      n_struc++;
    else if (att->any.type == ATT_ALIGN)
      n_align++;
  }

  Rprintf("%3d positional attributes%s\n", n_pos, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_POS);

  Rprintf("%3d structural attributes%s\n", n_struc, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_STRUC);

  Rprintf("%3d alignment attributes%s\n", n_align, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_ALIGN);

  Rprintf("\n");
}
/**
 * Validates the REVCORP component of the given attribute.
 *
 * This function validates a REVCORP (i.e. an uncompressed index).
 * It assumes that a lexicon, frequencies and (compressed or
 * uncompressed) token stream are available for CL access for the
 * given attribute.
 *
 * The check works in three steps:
 *  1. the start offset of every lexicon entry in the index is computed by
 *     accumulating the token frequencies;
 *  2. the token stream is read in corpus order; for every token the next
 *     unread index slot of its lexicon entry must contain the current cpos;
 *  3. after the pass, every entry's slot pointer must have advanced exactly
 *     to the end of its range.
 *
 * Progress and the result ("OK" or "FAILED ...") are written to stdout.
 *
 * @param attr  The attribute whose REVCORP should be checked.
 * @return      True for all OK, false for a problem.
 */
int
validate_revcorp(Attribute *attr)
{
  Component *revcorp = ensure_component(attr, CompRevCorpus, 0);
  int *ptab;                    /* table of index offsets for each lexicon entry */
  int lexsize, corpsize;
  int i, offset, cpos, id;

  printf(" ? validating %s ... ", cid_name(CompRevCorpus));
  fflush(stdout);

  if (revcorp == NULL) {
    printf("FAILED (no data)\n");
    return 0;
  }

  lexsize = cl_max_id(attr);
  corpsize = cl_max_cpos(attr);
  if ((lexsize <= 0) || (corpsize <= 0)) {
    printf("FAILED (corpus access error)\n");
    return 0;
  }
  if (revcorp->size != corpsize) {
    printf("FAILED (wrong size)\n");
    return 0;
  }

  /* init offsets by calculating REVIDX component from token frequencies */
  ptab = (int *) cl_calloc(lexsize, sizeof(int));
  offset = 0;
  for (i = 0; i < lexsize; i++) {
    ptab[i] = offset;
    offset += cl_id2freq(attr, i);
  }

  /* now read token stream, check each token id against REVCORP, and increment its pointer */
  for (cpos = 0; cpos < corpsize; cpos++) {
    id = cl_cpos2id(attr, cpos);
    if ((id < 0) || (id >= lexsize)) {
      printf("FAILED (inconsistency in token stream)\n");
      cl_free(ptab);
      return 0;
    }
    /* If the frequencies do not match the token stream, the slot pointer can
       leave the index (size corpsize, verified above).  Report that instead
       of reading outside the component data. */
    if ((ptab[id] < 0) || (ptab[id] >= corpsize)) {
      printf("FAILED (token frequencies incorrect)\n");
      cl_free(ptab);
      return 0;
    }
    if (ntohl(revcorp->data.data[ptab[id]]) != cpos) {
      printf("FAILED\n");
      cl_free(ptab);
      return 0;
    }
    ptab[id]++;
  }

  /* validate frequencies by comparing final offsets against those calculated from token frequencies */
  offset = 0;
  for (i = 0; i < lexsize; i++) {
    offset += cl_id2freq(attr, i);
    if (ptab[i] != offset) {
      printf("FAILED (token frequencies incorrect)\n");
      cl_free(ptab);
      return 0;
    }
  }

  cl_free(ptab);

  printf("OK\n");
  return 1;
}
/** * Main function for cwb-decode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int main(int argc, char **argv) { Attribute *attr; Attribute *context = NULL; int sp; /* start position of a match */ int ep; /* end position of a match */ int w, cnt, read_pos_frm_stdin; char s[CL_MAX_LINE_LENGTH]; /* buffer for strings read from file */ char *token; char *input_filename = NULL; FILE *input_file = stdin; /* ------------------------------------------------- PARSE ARGUMENTS */ int c; extern char *optarg; extern int optind; progname = argv[0]; first_token = -1; last = -1; maxlast = -1; read_pos_frm_stdin = 0; /* use getopt() to parse command-line options */ while((c = getopt(argc, argv, "+s:e:r:nLHCxXf:ph")) != EOF) switch(c) { /* s: start corpus position */ case 's': first_token = atoi(optarg); break; /* e: end corpus position */ case 'e': last = atoi(optarg); break; /* r: registry directory */ case 'r': if (registry_directory == NULL) registry_directory = optarg; else { fprintf(stderr, "%s: -r option used twice\n", progname); exit(2); } break; /* n: show cpos in -H mode */ case 'n': printnum++; break; /* x: XML-compatible output in -C mode (-Cx) */ case 'x': xml_compatible++; break; /* L,H,C,X: Lisp, Horizontal, Compact, and XML modes */ case 'L': mode = LispMode; break; case 'H': mode = ConclineMode; break; case 'C': mode = EncodeMode; break; case 'X': mode = XMLMode; break; /* f: matchlist mode / read corpus positions from file */ case 'f': input_filename = optarg; break; /* p: matchlist mode / read corpus positions from stdin */ case 'p': read_pos_frm_stdin++; break; /* h: help page */ case 'h': decode_usage(2); break; default: fprintf(stderr, "Illegal option. 
Try \"%s -h\" for more information.\n", progname); fprintf(stderr, "[remember that options go before the corpus name, and attribute declarations after it!]\n"); decode_cleanup(2); } /* required argument: corpus id */ if (optind < argc) { corpus_id = argv[optind++]; if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) { fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n", corpus_id, (registry_directory ? registry_directory : cl_standard_registry() ) ); decode_cleanup(1); } } else { fprintf(stderr, "Missing argument. Try \"%s -h\" for more information.\n", progname); decode_cleanup(2); } /* now parse output flags (-P, -S, ...) [cnt is our own argument counter] */ for (cnt = optind; cnt < argc; cnt++) { if (strcmp(argv[cnt], "-c") == 0) { /* -c: context */ if ((context = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) { fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } } else if (strcmp(argv[cnt], "-P") == 0) { /* -P: positional attribute */ if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_POS)) == NULL) { fprintf(stderr, "Can't open p-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else { if (cl_max_cpos(attr) > 0) { decode_add_attribute(attr); if (maxlast < 0) maxlast = cl_max_cpos(attr); /* determines corpus size */ } else { fprintf(stderr, "Attribute %s.%s is declared, but not accessible (missing data?). Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } } } else if (strcmp(argv[cnt], "-ALL") == 0) { /* -ALL: all p-attributes and s-attributes */ for (attr = corpus->attributes; attr; attr = attr->any.next) if (attr->any.type == ATT_POS) { decode_add_attribute(attr); if (maxlast < 0) maxlast = cl_max_cpos(attr); } else if (attr->any.type == ATT_STRUC) { decode_add_attribute(attr); } } else if (strcmp(argv[cnt], "-D") == 0) { /* -D: dynamic attribute (not implemented) */ fprintf(stderr, "Sorry, dynamic attributes are not implemented. 
Aborting.\n"); decode_cleanup(2); } else if (strcmp(argv[cnt], "-A") == 0) { /* -A: alignment attribute */ if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_ALIGN)) == NULL) { fprintf(stderr, "Can't open a-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else decode_add_attribute(attr); } else if (strcmp(argv[cnt], "-S") == 0) { /* -S: structural attribute (as tags) */ if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) { fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else decode_add_attribute(attr); } else if (strcmp(argv[cnt], "-V") == 0) { /* -V: show structural attribute values (with -p or -f) */ if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) { fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else if (!cl_struc_values(attr)) { fprintf(stderr, "S-attribute %s.%s does not have annotations. Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else if (printValuesIndex >= MAX_PRINT_VALUES) { fprintf(stderr, "Too many -V attributes, sorry. Aborted.\n"); decode_cleanup(1); } else printValues[printValuesIndex++] = attr; } else { fprintf(stderr, "Unknown flag: %s\n", argv[cnt]); decode_cleanup(2); } } /* ---- end of parse attribute declarations ---- */ if (input_filename != NULL) { if (strcmp(input_filename, "-") == 0) input_file = stdin; else if ((input_file = fopen(input_filename, "r")) == NULL) { perror(input_filename); exit(1); } read_pos_frm_stdin++; } decode_verify_print_value_list(); /* ------------------------------------------------------------ DECODE CORPUS */ if (read_pos_frm_stdin == 0) { /* * normal mode: decode entire corpus or specified range */ if (maxlast < 0) { fprintf(stderr, "Need at least one p-attribute (-P flag). 
Aborted.\n"); decode_cleanup(2); } if (first_token < 0 || first_token >= maxlast) first_token = 0; if (last < 0 || last >= maxlast) last = maxlast - 1; if (last < first_token) { fprintf(stderr, "Warning: output range #%d..#%d is empty. No output.\n", first_token, last); decode_cleanup(2); } if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) { decode_print_xml_declaration(); printf("<corpus name=\"%s\" start=\"%d\" end=\"%d\">\n", corpus_id, first_token, last); } /* decode_print_surrounding_s_att_values(first_token); */ /* don't do that in "normal" mode, coz it doesn't make sense */ for (w = first_token; w <= last; w++) decode_print_token_sequence(w, -1, context); if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) { printf("</corpus>\n"); } } else { /* * matchlist mode: read (pairs of) corpus positions from stdin or file */ if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) { decode_print_xml_declaration(); printf("<matchlist corpus=\"%s\">\n", corpus_id); } cnt = 0; while (fgets(s, CL_MAX_LINE_LENGTH, input_file) != NULL) { token = strtok(s, " \t\n"); if ((token != NULL) && is_num(token)) { sp = atoi(token); ep = -1; if ((token = strtok(NULL, " \t\n")) != NULL) { if (!is_num(token)) { fprintf(stderr, "Invalid corpus position #%s . Aborted.\n", token); decode_cleanup(1); } else ep = atoi(token); } cnt++; /* count matches in matchlist */ if (mode == XMLMode) { printf("<match nr=\"%d\"", cnt); if (printnum) printf(" start=\"%d\" end=\"%d\"", sp, (ep >= 0) ? ep : sp); printf(">\n"); } else { /* nothing shown before range */ } decode_print_surrounding_s_att_values(sp); decode_print_token_sequence(sp, ep, context); if (mode == XMLMode) { printf("</match>\n"); } else if (mode != ConclineMode) { printf("\n"); /* blank line, unless in -H mode */ } } else { fprintf(stderr, "Invalid corpus position #%s . 
Aborted.\n", s); decode_cleanup(1); } } if (input_file != stdin) fclose(input_file); if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) { printf("</matchlist>\n"); } } decode_cleanup(0); return 0; /* just to keep gcc from complaining */ }
/**
 * Checks a huffcoded attribute for errors by decompressing it.
 *
 * This function assumes that compute_code_lengths() has been called
 * beforehand and made sure that the _uncompressed_ token sequence is
 * used by CL access functions.
 *
 * The code descriptor block, the compressed item sequence and the sync file
 * are read back; every decoded item is compared with the id returned by
 * cl_cpos2id(), and every sync entry with the actual bit-file offset.
 *
 * Any failure (including the first mismatching token) closes the open files
 * and calls rcqp_receive_error(1); the "you can delete the file" message is
 * only printed when the complete sequence has been verified.
 * NOTE(review): rcqp_receive_error() presumably does not return -- confirm.
 * The function returns right after it in any case.
 *
 * @param attr   The attribute to check.
 * @param fname  Base filename to use for the three compressed-attribute files.
 *               Can be NULL, in which case the filenames in the attribute are used.
 */
void
decode_check_huff(Attribute *attr, char *fname)
{
  BFile bfd;
  FILE *sync;
  HCD hc;

  int pos, size, sync_offset, offset;
  int l, v;
  int item, true_item;
  unsigned char bit;
  int path_too_long = 0;

  char hcd_path[CL_MAX_LINE_LENGTH];
  char huf_path[CL_MAX_LINE_LENGTH];
  char sync_path[CL_MAX_LINE_LENGTH];

  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  if (fname) {
    /* ".huf.syn" is the longest suffix; all three buffers have the same size */
    if (strlen(fname) + sizeof(".huf.syn") > sizeof(sync_path))
      path_too_long = 1;
    else {
      snprintf(hcd_path, sizeof(hcd_path), "%s.hcd", fname);
      snprintf(huf_path, sizeof(huf_path), "%s.huf", fname);
      snprintf(sync_path, sizeof(sync_path), "%s.huf.syn", fname);
    }
  }
  else {
    char *path;

    /* each name is copied immediately after it has been obtained */
    path = component_full_name(attr, CompHuffSeq, NULL);
    assert(path && (cderrno == CDA_OK));
    if (strlen(path) >= sizeof(huf_path))
      path_too_long = 1;
    snprintf(huf_path, sizeof(huf_path), "%s", path);

    path = component_full_name(attr, CompHuffCodes, NULL);
    assert(path && (cderrno == CDA_OK));
    if (strlen(path) >= sizeof(hcd_path))
      path_too_long = 1;
    snprintf(hcd_path, sizeof(hcd_path), "%s", path);

    path = component_full_name(attr, CompHuffSync, NULL);
    assert(path && (cderrno == CDA_OK));
    if (strlen(path) >= sizeof(sync_path))
      path_too_long = 1;
    snprintf(sync_path, sizeof(sync_path), "%s", path);
  }

  if (path_too_long) {
    Rprintf( "ERROR: filename too long (limit: %d characters). Aborted.\n",
             (int)(sizeof(hcd_path) - 1));
    rcqp_receive_error(1);
    return;
  }

  Rprintf("- reading code descriptor block from %s\n", hcd_path);
  if (!ReadHCD(hcd_path, &hc)) {
    Rprintf( "ERROR: reading %s failed. Aborted.\n", hcd_path);
    rcqp_receive_error(1);
    return;
  }

  Rprintf("- reading compressed item sequence from %s\n", huf_path);
  if (!BFopen(huf_path, "r", &bfd)) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", huf_path);
    perror(huf_path);
    rcqp_receive_error(1);
    return;
  }

  Rprintf("- reading sync (mod %d) from %s\n", SYNCHRONIZATION, sync_path);
  if ((sync = fopen(sync_path, "r")) == NULL) {
    Rprintf( "ERROR: can't open file %s. Aborted.\n", sync_path);
    perror(sync_path);
    BFclose(&bfd);
    rcqp_receive_error(1);
    return;
  }

  size = cl_max_cpos(attr);
  if (size != hc.length) {
    Rprintf( "ERROR: wrong corpus size (%d tokens) in %s (correct size: %d)\n",
             hc.length, hcd_path, size);
    goto abort_validation;
  }

  for (pos = 0; pos < hc.length; pos++) {

    if ((pos % SYNCHRONIZATION) == 0) {
      offset = BFposition(&bfd); /* need to get offset before flushing (because flushing fills the bit buffer and advances offset to the following byte!) */
      if (pos > 0)
        BFflush(&bfd);
      sync_offset = -1;         /* make sure we get an error if read below fails */
      NreadInt(&sync_offset, sync);
      if (offset != sync_offset) {
        Rprintf( "ERROR: wrong sync offset %d (true offset %d) at cpos %d. Aborted.\n",
                 sync_offset, offset, pos);
        goto abort_validation;
      }
    }

    if (!BFread(&bit, 1, &bfd)) {
      Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
      goto abort_validation;
    }

    /* read bits until the accumulated value reaches the smallest code of
       the current length */
    v = (bit ? 1 : 0);
    l = 1;

    while (v < hc.min_code[l]) {
      if (!BFread(&bit, 1, &bfd)) {
        Rprintf( "ERROR reading file %s. Aborted.\n", huf_path);
        goto abort_validation;
      }
      v <<= 1;
      if (bit)
        v++;
      l++;
    }

    item = hc.symbols[hc.symindex[l] + v - hc.min_code[l]];

    true_item = cl_cpos2id(attr, pos);
    if (item != true_item) {
      /* a mismatch must stop the validation, otherwise the user would be
         told below that the uncompressed data can be deleted */
      Rprintf( "ERROR: wrong token (id=%d) at cpos %d (correct id=%d). Aborted.\n",
               item, pos, true_item);
      goto abort_validation;
    }
  }

  fclose(sync);
  BFclose(&bfd);

  /* tell the user it's safe to delete the CORPUS component now */
  Rprintf("!! You can delete the file <%s> now.\n",
          component_full_name(attr, CompCorpus, NULL));

  return;

abort_validation:
  /* release both input files before signalling the error */
  fclose(sync);
  BFclose(&bfd);
  rcqp_receive_error(1);
  return;
}
/**
 * Compresses the token stream of a p-attribute.
 *
 * Three files are created: the compressed token stream, the descriptor block,
 * and a sync file.
 *
 * The Huffman code lengths are computed in place on a single heap array
 * (following Witten/Moffat/Bell, "Managing Gigabytes", pp. 335ff.); canonical
 * codes are then assigned per code length and the token stream is written
 * with one sync entry every SYNCHRONIZATION tokens.
 *
 * Errors are reported through rcqp_receive_error(1).
 * NOTE(review): that function presumably does not return -- confirm.
 *
 * @param attr   The attribute to compress.
 * @param hc     Location for the resulting Huffmann code descriptor block.
 *               hc->symbols refers to working memory that is released before
 *               returning, so it is reset to NULL on exit.
 * @param fname  Base filename for the resulting files; if NULL, the filenames
 *               are taken from the attribute.
 * @return       1 when the function completes; 0 only if a filename was too
 *               long and rcqp_receive_error() returned.
 */
int
compute_code_lengths(Attribute *attr, HCD *hc, char *fname)
{
  int id, i, h;

  int nr_codes = 0;

  int *heap = NULL;
  unsigned *codelength = NULL;  /* was char[], probably to save space; but that's unnecessary and makes gcc complain */

  int issued_codes[MAXCODELEN];
  int next_code[MAXCODELEN];

  long sum_bits;

  Rprintf("COMPRESSING TOKEN STREAM of %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* I need the following components:
   * - CompCorpus
   * - CompCorpusFreqs
   * - CompLexicon
   * - CompLexiconIdx
   * and want to force the CL to use them rather than compressed data.
   */
  {
    Component *comp;

    if ((comp = ensure_component(attr, CompCorpus, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the CORPUS component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexicon, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXICON component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompLexiconIdx, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXIDX component\n");
      rcqp_receive_error(1);
    }

    if ((comp = ensure_component(attr, CompCorpusFreqs, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the FREQS component.\n"
               "Run 'makeall -r %s -c FREQS %s %s' in order to create it.\n",
               corpus->registry_dir, corpus->registry_name, attr->any.name);
      rcqp_receive_error(1);
    }
  }

  /*
   * strongly follows Witten/Moffat/Bell: ``Managing Gigabytes'',
   * pp. 335ff.
   */

  hc->size = cl_max_id(attr);           /* the size of the attribute (nr of items) */
  if ((hc->size <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_id() failed");
    rcqp_receive_error(1);
  }

  hc->length = cl_max_cpos(attr);       /* the length of the attribute (nr of tokens) */
  if ((hc->length <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_cpos() failed");
    rcqp_receive_error(1);
  }

  hc->symbols = NULL;

  hc->min_codelen = 100;
  hc->max_codelen = 0;

  memset((char *)hc->lcount, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->min_code, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->symindex, '\0', MAXCODELEN * sizeof(int));

  memset((char *)issued_codes, '\0', MAXCODELEN * sizeof(int));

  codelength = (unsigned *)cl_calloc(hc->size, sizeof(unsigned));

  /* =========================================== make & initialize the heap */

  heap = (int *)cl_malloc(hc->size * 2 * sizeof(int));

  for (i = 0; i < hc->size; i++) {
    heap[i] = hc->size + i;
    heap[hc->size+i] = get_id_frequency(attr, i) + 1;
    /* add-one trick needed to avoid unsupported Huffman codes > 31 bits for very large corpora of ca. 2 billion words:
       theoretical optimal code length for hapax legomena in such corpora is ca. 31 bits, and the Huffman algorithm
       sometimes generates 32-bit codes; with add-one trick, the theoretical optimal code length is always <= 30 bits */
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0)
    fprintf(protocol, "Allocated heap with %d cells for %d items\n\n",
            hc->size * 2, hc->size);
  if (do_protocol > 2)
    print_heap(heap, hc->size, "After Initialization");
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 1 */

  h = hc->size;

  /*
   * we address the heap in the following manner: when we start array
   * indices at 1, the left child is at 2i, and the right child is at
   * 2i+1. So we maintain this scheme and decrement just before
   * adressing the array.
   */

  /*
   * construct the initial min-heap
   */

  for (i = hc->size/2; i > 0; i--) {
    /* do:
     * bottom up, left to right,
     * for each root of each subtree, sift if necessary
     */
    sift(heap, h, i);
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 2) {
    print_heap(heap, hc->size, "Initial Min-Heap");
    fprintf(protocol, "\n");
  }
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 2 */

  /* smallest item at top of heap now, remove the two smallest items
   * and sift, find second smallest by removing top and sifting, as
   * long as we have more than one root */

  while (h > 1) {
    int pos[2];

    for (i = 0; i < 2; i++) {
      /* remove topmost (i.e. smallest) item */
      pos[i] = heap[0];

      /* remove and sift, to reobtain heap integrity: move ``last''
       * item to top of heap and sift */
      heap[0] = heap[--h];
      sift(heap, h, 1);
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 3) {
      fprintf(protocol, "Removed smallest item %d with freq %d\n",
              pos[0], heap[pos[0]]);
      fprintf(protocol, "Removed 2nd smallest item %d with freq %d\n",
              pos[1], heap[pos[1]]);
    }
    /* ============================== PROTOCOL ============================== */

    /*
     * pos[0] and pos[1] contain pointers to the two smallest items
     * now. since h was decremented twice, h and h+1 are now empty and
     * become the accumulated freq of pos[i]. The individual
     * frequencies are not needed any more, so pointers to h+1 (the
     * acc freq) are stored there instead (tricky, since freq cell
     * becomes pointer cell). So, what happens here, is to include a
     * new element in the heap.
     */

    heap[h] = h+1;
    heap[h+1] = heap[pos[0]] + heap[pos[1]];    /* accumulated freq */
    heap[pos[0]] = heap[pos[1]] = h+1;          /* pointers! */
    h++;                                        /* we put a new element into heap */

    /*
     * now, swap it up until we reobtain heap integrity
     */
    {
      register int parent, current;

      current = h;
      parent = current >> 1;

      while ((parent > 0) &&
             (heap[heap[parent-1]] > heap[heap[current-1]])) {
        int tmp;

        tmp = heap[parent-1];
        heap[parent-1] = heap[current-1];
        heap[current-1] = tmp;

        current = parent;
        parent = current >> 1;
      }
    }
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 3)
    fprintf(protocol, "\n");
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 3 */

  /* compute the code lengths. We don't have any freqs in heap any
   * more, only pointers to parents */

  heap[0] = -1;                 /* never followed as a parent pointer */

  /* root has a depth of 0 */
  heap[1] = 0;

  /* we trust in what they say on p. 345 */
  for (i = 2; i < hc->size * 2; i++)
    heap[i] = heap[heap[i]]+1;

  /* collect the lengths */
  sum_bits = 0L;

  for (i = 0; i < hc->size; i++) {
    int cl = heap[i+hc->size];

    /* widen before multiplying: length * frequency can exceed INT_MAX */
    sum_bits += (long)cl * get_id_frequency(attr, i);

    codelength[i] = cl;
    if (cl == 0)
      continue;

    if (cl > hc->max_codelen)
      hc->max_codelen = cl;

    if (cl < hc->min_codelen)
      hc->min_codelen = cl;

    /* lcount has MAXCODELEN cells; over-long codes are rejected further
       down via max_codelen, so just avoid writing outside the array here */
    if (cl < MAXCODELEN)
      hc->lcount[cl]++;
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0) {
    fprintf(protocol, "Minimal code length: %3d\n", hc->min_codelen);
    fprintf(protocol, "Maximal code length: %3d\n", hc->max_codelen);
    fprintf(protocol, "Compressed code len: %10ld bits, %10ld (+1) bytes\n\n\n",
            sum_bits, sum_bits/8);
  }
  /* ============================== PROTOCOL ============================== */

  if (hc->max_codelen >= MAXCODELEN) {
    Rprintf( "Error: Huffman codes too long (%d bits, current maximum is %d bits).\n", hc->max_codelen, MAXCODELEN-1);
    Rprintf( " Please contact the CWB development team for assistance.\n");
    rcqp_receive_error(1);
  }

  if ((hc->max_codelen == 0) && (hc->min_codelen == 100)) {
    Rprintf( "Problem: No output generated -- no items?\n");
    nr_codes = 0;
  }
  else {

    hc->min_code[hc->max_codelen] = 0;

    for (i = hc->max_codelen-1; i > 0; i--)
      hc->min_code[i] = (hc->min_code[i+1] + hc->lcount[i+1]) >> 1;

    hc->symindex[hc->min_codelen] = 0;
    for (i = hc->min_codelen+1; i <= hc->max_codelen; i++)
      hc->symindex[i] = hc->symindex[i-1] + hc->lcount[i-1];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 0) {
      int sum_codes = 0;

      fprintf(protocol, " CL #codes MinCode SymIdx\n");
      fprintf(protocol, "----------------------------------------\n");

      for (i = hc->min_codelen; i <= hc->max_codelen; i++) {
        sum_codes += hc->lcount[i];
        fprintf(protocol, "%3d %7d %7d %7d\n",
                i, hc->lcount[i], hc->min_code[i], hc->symindex[i]);
      }

      fprintf(protocol, "----------------------------------------\n");
      fprintf(protocol, " %7d\n", sum_codes);
    }
    /* ============================== PROTOCOL ============================== */

    for (i = 0; i < MAXCODELEN; i++)
      next_code[i] = hc->min_code[i];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "\n");
      fprintf(protocol, " Item f(item) CL Bits Code, String\n");
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* compute and issue codes */

    hc->symbols = heap + hc->size;

    for (i = 0; i < hc->size; i++) {

      /* we store the code for item i in heap[i] */
      heap[i] = next_code[codelength[i]];
      next_code[codelength[i]]++;

      /* ============================== PROTOCOL ============================== */
      if (do_protocol > 1) {
        fprintf(protocol, "%7d %7d %3d %10d ",
                i,
                get_id_frequency(attr, i),
                codelength[i],
                codelength[i] * get_id_frequency(attr, i));

        bprintf(heap[i], codelength[i], protocol);

        fprintf(protocol, " %7d %s\n",
                heap[i], get_string_of_id(attr, i));
      }
      /* ============================== PROTOCOL ============================== */

      /* and put the item itself in the second half of the table */
      heap[hc->size+hc->symindex[codelength[i]]+issued_codes[codelength[i]]] = i;
      issued_codes[codelength[i]]++;
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* The work itself -- encode the attribute data */
    {
      char *path;

      char hcd_path[CL_MAX_LINE_LENGTH];
      char huf_path[CL_MAX_LINE_LENGTH];
      char sync_path[CL_MAX_LINE_LENGTH];

      Component *corp;

      BFile bfd;
      FILE *sync;

      int cl, code, pos;
      int path_too_long = 0;

      corp = ensure_component(attr, CompCorpus, 0);
      assert(corp);

      if (fname) {
        path = fname;

        /* ".huf.syn" is the longest suffix; all three buffers have the same size */
        if (strlen(path) + sizeof(".huf.syn") > sizeof(sync_path))
          path_too_long = 1;
        else {
          snprintf(hcd_path, sizeof(hcd_path), "%s.hcd", path);
          snprintf(huf_path, sizeof(huf_path), "%s.huf", path);
          snprintf(sync_path, sizeof(sync_path), "%s.huf.syn", path);
        }
      }
      else {
        path = component_full_name(attr, CompHuffSeq, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        if (strlen(path) >= sizeof(huf_path))
          path_too_long = 1;
        snprintf(huf_path, sizeof(huf_path), "%s", path);

        path = component_full_name(attr, CompHuffCodes, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        if (strlen(path) >= sizeof(hcd_path))
          path_too_long = 1;
        snprintf(hcd_path, sizeof(hcd_path), "%s", path);

        path = component_full_name(attr, CompHuffSync, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        if (strlen(path) >= sizeof(sync_path))
          path_too_long = 1;
        snprintf(sync_path, sizeof(sync_path), "%s", path);
      }

      if (path_too_long) {
        /* never write to a truncated (i.e. different) file name */
        Rprintf( "ERROR: filename too long (limit: %d characters). Aborted.\n",
                 (int)(sizeof(hcd_path) - 1));
        hc->symbols = NULL;
        free(codelength);
        free(heap);
        rcqp_receive_error(1);
        return 0;
      }

      Rprintf("- writing code descriptor block to %s\n", hcd_path);
      if (!WriteHCD(hcd_path, hc)) {
        Rprintf( "ERROR: writing %s failed. Aborted.\n", hcd_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing compressed item sequence to %s\n", huf_path);

      if (!BFopen(huf_path, "w", &bfd)) {
        Rprintf( "ERROR: can't create file %s\n", huf_path);
        perror(huf_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing sync (every %d tokens) to %s\n", SYNCHRONIZATION, sync_path);

      if ((sync = fopen(sync_path, "w")) == NULL) {
        Rprintf( "ERROR: can't create file %s\n", sync_path);
        perror(sync_path);
        rcqp_receive_error(1);
      }

      for (i = 0; i < hc->length; i++) {

        /* SYNCHRONIZE */

        if ((i % SYNCHRONIZATION) == 0) {
          if (i > 0)
            BFflush(&bfd);
          pos = BFposition(&bfd);
          NwriteInt(pos, sync);
        }

        id = cl_cpos2id(attr, i);
        if ((id < 0) || (cderrno != CDA_OK)) {
          cdperror("(aborting) cl_cpos2id() failed");
          rcqp_receive_error(1);
        }
        else {

          assert((id >= 0) && (id < hc->size) && "Internal Error");

          cl = codelength[id];
          code = heap[id];

          if (!BFwriteWord((unsigned int)code, cl, &bfd)) {
            Rprintf( "Error writing code for ID %d (%d, %d bits) at position %d. Aborted.\n",
                     id, code, cl, i);
            rcqp_receive_error(1);
          }
        }
      }

      fclose(sync);
      BFclose(&bfd);
    }
  }

  /* hc->symbols points into heap, so it must not outlive the buffer */
  hc->symbols = NULL;

  free(codelength);
  free(heap);

  return 1;
}
/**
 * Checks a compressed reversed index for errors by decompressing it.
 *
 * This function assumes that compress_reversed_index() has been called
 * beforehand and made sure that the _uncompressed_ index is used by CL
 * access functions.
 *
 * For every lexicon entry the Golomb-coded gaps are read back from the
 * compressed data file, accumulated into corpus positions and compared with
 * the positions delivered by a CL position stream.  The index (offsets) file
 * is opened and closed but its contents are not read here.
 *
 * All errors end in compressrdx_cleanup(1).
 * NOTE(review): that function presumably terminates the program -- confirm.
 *
 * @param attr       The attribute to check the index of.
 * @param output_fn  Base name for the compressed RDX files to be read
 *                   (if this is null, filenames will be taken from the
 *                   attribute).
 */
void
decompress_check_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];

  int nr_elements;
  int element_freq;
  int corpus_size;
  int pos, gap;

  int b;
  int i, k;

  BFile data_file;
  FILE *index_file;

  PositionStream PStream;
  int true_pos;

  int path_too_long = 0;

  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  if (output_fn) {
    /* both suffixes have the same length and both buffers the same size */
    if (strlen(output_fn) + sizeof(".crc") > sizeof(data_fname))
      path_too_long = 1;
    else {
      snprintf(data_fname, sizeof(data_fname), "%s.crc", output_fn);
      snprintf(index_fname, sizeof(index_fname), "%s.crx", output_fn);
    }
  }
  else {
    /* each name is copied immediately after it has been obtained */
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    if (strlen(s) >= sizeof(data_fname))
      path_too_long = 1;
    snprintf(data_fname, sizeof(data_fname), "%s", s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    if (strlen(s) >= sizeof(index_fname))
      path_too_long = 1;
    snprintf(index_fname, sizeof(index_fname), "%s", s);
  }

  if (path_too_long) {
    Rprintf( "ERROR: filename too long (limit: %d characters)\n",
             (int)(sizeof(data_fname) - 1));
    compressrdx_cleanup(1);
    return;
  }

  if (! BFopen(data_fname, "r", &data_file)) {
    Rprintf( "ERROR: can't open file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index from %s\n", data_fname);

  if ((index_file = fopen(index_fname, "r")) == NULL) {
    Rprintf( "ERROR: can't open file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index offsets from %s\n", index_fname);

  for (i = 0; i < nr_elements; i++) {

    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    b = compute_ba(element_freq, corpus_size);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);

    /* positions are stored as gaps; accumulate them to recover each cpos */
    pos = 0;
    for (k = 0; k < element_freq; k++) {

      gap = read_golomb_code_bf(b, &data_file);
      pos += gap;

      if (1 != cl_read_stream(PStream, &true_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }
      if (pos != true_pos) {
        Rprintf( "ERROR: wrong occurrence of token #%d at cpos %d (correct cpos: %d). Aborted.\n",
                 i, pos, true_pos);
        compressrdx_cleanup(1);
      }
    }

    cl_delete_stream(&PStream);
    BFflush(&data_file);
  }

  fclose(index_file);
  BFclose(&data_file);

  /* tell the user it's safe to delete the REVCORP and REVCIDX components now */
  Rprintf("!! You can delete the file <%s> now.\n",
          component_full_name(attr, CompRevCorpus, NULL));
  Rprintf("!! You can delete the file <%s> now.\n",
          component_full_name(attr, CompRevCorpusIdx, NULL));

  return;
}
/**
 * Compresses the reversed index of a p-attribute.
 *
 * For every lexicon ID the function writes the current position of the
 * compressed data file (BFposition()) to the index file and then encodes
 * the gaps between successive corpus positions of that ID with
 * write_golomb_code(). The data file is flushed with BFflush() after
 * each ID.
 *
 * Any failure is reported and followed by compressrdx_cleanup(1).
 * NOTE(review): the code relies on compressrdx_cleanup() not returning
 * -- confirm.
 *
 * @param attr      The attribute to compress the index of.
 * @param output_fn Base name for the compressed RDX files to be written
 *                  (if this is null, filenames will be taken from the
 *                  attribute).
 */
void
compress_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];
  int data_len, index_len;      /* snprintf() results, used to detect truncation */

  int nr_elements;
  int element_freq;
  int corpus_size;
  int last_pos, gap, fpos;
  int b;
  int i, k;

  BFile data_file;
  FILE *index_file = NULL;

  PositionStream PStream;
  int new_pos;

  Rprintf("COMPRESSING INDEX of %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  /* ensure that we do NOT use the compressed index while building the
   * compressed index (yeah, a nasty thing that). That is, load the
   * .corpus.rev and .corpus.rdx components in order to force
   * subsequent CL calls to use the uncompressed data.
   */
  if (ensure_component(attr, CompRevCorpus, 0) == NULL) {
    Rprintf("Index compression requires the REVCORP component\n");
    compressrdx_cleanup(1);
  }
  if (ensure_component(attr, CompRevCorpusIdx, 0) == NULL) {
    Rprintf("Index compression requires the REVCIDX component\n");
    compressrdx_cleanup(1);
  }

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  /* work out the two file names; all copies are bounded by the buffer size */
  if (output_fn) {
    data_len = snprintf(data_fname, sizeof data_fname, "%s.crc", output_fn);
    index_len = snprintf(index_fname, sizeof index_fname, "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    data_len = snprintf(data_fname, sizeof data_fname, "%s", s);

    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    index_len = snprintf(index_fname, sizeof index_fname, "%s", s);
  }
  if (data_len < 0 || (size_t)data_len >= sizeof data_fname ||
      index_len < 0 || (size_t)index_len >= sizeof index_fname) {
    Rprintf("ERROR: file name too long for compressed index of %s.%s\n",
            corpus_id_cwb_compress_rdx, attr->any.name);
    compressrdx_cleanup(1);
  }

  if (!BFopen(data_fname, "w", &data_file)) {
    Rprintf("ERROR: can't create file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index to %s\n", data_fname);

  if ((index_file = fopen(index_fname, "wb")) == NULL) {
    Rprintf("ERROR: can't create file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index offsets to %s\n", index_fname);

  for (i = 0; i < nr_elements; i++) {
    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    b = compute_ba(element_freq, corpus_size);

    /* remember where the code for this ID starts in the data file */
    fpos = BFposition(&data_file);
    NwriteInt(fpos, index_file);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n",
              i, element_freq, b);

    /* encode each position as its distance from the previous one */
    last_pos = 0;
    for (k = 0; k < element_freq; k++) {
      if (1 != cl_read_stream(PStream, &new_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }

      gap = new_pos - last_pos;
      last_pos = new_pos;

      if (debug_cwb_compress_rdx)
        fprintf(debug_output, "%8d: gap=%4d, b=%4d\n", codepos, gap, b);

      write_golomb_code(gap, b, &data_file);
      codepos++;          /* file-level counter, only shown in the debug output here */
    }

    cl_delete_stream(&PStream);
    BFflush(&data_file);
  }

  /* fclose() flushes buffered offsets: a failure here means the .crx file is
   * incomplete, which must not go unnoticed. */
  if (fclose(index_file) != 0) {
    Rprintf("ERROR: can't write file %s\n", index_fname);
    perror(index_fname);
    BFclose(&data_file);
    compressrdx_cleanup(1);
  }
  BFclose(&data_file);

  return;
}
/** * Main function for cwb-align-encode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int main(int argc, char *argv[]) { int argindex; /* index of first argument in argv[] */ char *align_name = NULL; /* name of the .align file */ FILE *af = NULL; /* alignment file handle */ int af_is_pipe; /* need to know whether to call fclose() or pclose() */ char alx_name[CL_MAX_LINE_LENGTH]; /* full pathname of .alx file */ char alg_name[CL_MAX_LINE_LENGTH]; /* full pathname of optional .alg file */ FILE *alx=NULL, *alg=NULL; /* file handles for .alx and optional .alg file */ char line[CL_MAX_LINE_LENGTH]; /* one line of input from <infile> */ char corpus1_name[CL_MAX_FILENAME_LENGTH]; char corpus2_name[CL_MAX_FILENAME_LENGTH]; char s1_name[CL_MAX_FILENAME_LENGTH]; char s2_name[CL_MAX_FILENAME_LENGTH]; Corpus *corpus1, *corpus2; /* corpus handles */ Attribute *w1, *w2; /* attribute handles for 'word' attributes; used to determine corpus size */ int size1, size2; /* size of source & target corpus */ Corpus *source_corpus; /* encode alignment in this corpus (depends on -R flag, important for -D option) */ char *source_corpus_name; /* just for error messages */ char *attribute_name; /* name of alignment attribute (depends on -R flag, must be lowercase) */ int f1,l1,f2,l2; /* alignment regions */ int current1, current2; int mark, n_0_1, n_1_0; int l; progname = argv[0]; /* parse command line and read arguments */ argindex = alignencode_parse_args(argc, argv, 1); align_name = argv[argindex]; /* open alignment file and parse header; .gz files are automatically decompressed */ af_is_pipe = 0; l = strlen(align_name); if ((l > 3) && (strncasecmp(align_name + l - 3, ".gz", 3) == 0)) { char *pipe_cmd = (char *) cl_malloc(l+10); sprintf(pipe_cmd, "gzip -cd %s", align_name); /* write .gz file through gzip pipe */ af = popen(pipe_cmd, "r"); if (af == NULL) { perror(pipe_cmd); Rprintf( "%s: can't read compressed file %s\n", progname, align_name); 
rcqp_receive_error(1); } af_is_pipe = 1; cl_free(pipe_cmd); } else { af = fopen(align_name, "r"); if (af == NULL) { perror(align_name); Rprintf( "%s: can't read file %s\n", progname, align_name); rcqp_receive_error(1); } } /* read header = first line */ fgets(line, CL_MAX_LINE_LENGTH, af); if (4 != sscanf(line, "%s %s %s %s", corpus1_name, s1_name, corpus2_name, s2_name)) { Rprintf( "%s: %s not in .align format\n", progname, align_name); Rprintf( "wrong header: %s", line); rcqp_receive_error(1); } if (verbose) { if (reverse) Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus2_name, corpus1_name, align_name); else Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus1_name, corpus2_name, align_name); } /* open corpora and determine their sizes (for validity checks and compatibility mode) */ if (NULL == (corpus1 = cl_new_corpus(registry_dir, corpus1_name))) { Rprintf( "%s: can't open corpus %s\n", progname, corpus1_name); rcqp_receive_error(1); } if (NULL == (corpus2 = cl_new_corpus(registry_dir, corpus2_name))) { Rprintf( "%s: can't open corpus %s\n", progname, corpus2_name); rcqp_receive_error(1); } if (NULL == (w1 = cl_new_attribute(corpus1, "word", ATT_POS))) { Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus1_name); rcqp_receive_error(1); } if (NULL == (w2 = cl_new_attribute(corpus2, "word", ATT_POS))) { Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus2_name); rcqp_receive_error(1); } size1 = cl_max_cpos(w1); if (size1 <= 0) { Rprintf( "%s: data access error (%s.word)\n", progname, corpus1_name); rcqp_receive_error(1); } size2 = cl_max_cpos(w2); if (size2 <= 0) { Rprintf( "%s: data access error (%s.word)\n", progname, corpus2_name); rcqp_receive_error(1); } /* now work out the actual source corpus and the alignment attribute name (depending on -R flag) */ source_corpus = (reverse) ? corpus2 : corpus1; source_corpus_name = (reverse) ? corpus2_name : corpus1_name; attribute_name = cl_strdup((reverse) ? 
corpus1_name : corpus2_name); cl_id_tolower(attribute_name); /* fold attribute name to lowercase */ /* with -D option, determine data file name(s) from actual source corpus; otherwise use directory specified with -d and the usual naming conventions */ if (data_dir_from_corpus) { Attribute *alignment = cl_new_attribute(source_corpus, attribute_name, ATT_ALIGN); char *comp_pathname; if (alignment == NULL) { Rprintf( "%s: alignment attribute %s.%s not declared in registry file\n", progname, source_corpus_name, attribute_name); rcqp_receive_error(1); } comp_pathname = component_full_name(alignment, CompXAlignData, NULL); if (comp_pathname == NULL) { Rprintf( "%s: can't determine pathname for .alx file (internal error)\n", progname); rcqp_receive_error(1); } strcpy(alx_name, comp_pathname); /* need to strcpy because component_full_name() returns pointer to internal buffer */ if (compatibility) { comp_pathname = component_full_name(alignment, CompAlignData, NULL); if (comp_pathname == NULL) { Rprintf( "%s: can't determine pathname for .alg file (internal error)\n", progname); rcqp_receive_error(1); } strcpy(alg_name, comp_pathname); } } else { sprintf(alx_name, "%s" SUBDIR_SEP_STRING "%s.alx", data_dir, attribute_name); if (compatibility) sprintf(alg_name, "%s" SUBDIR_SEP_STRING "%s.alg", data_dir, attribute_name); } /* now open output file(s) */ alx = fopen(alx_name, "wb"); if (alx == NULL) { perror(alx_name); Rprintf( "%s: can't write file %s\n", progname, alx_name); rcqp_receive_error(1); } if (verbose) Rprintf("Writing file %s ...\n", alx_name); if (compatibility) { alg = fopen(alg_name, "wb"); if (alg == NULL) { perror(alg_name); Rprintf( "%s: can't write file %s\n", progname, alg_name); rcqp_receive_error(1); } if (verbose) Rprintf("Writing file %s ...\n", alg_name); } /* main encoding loop */ f1 = f2 = l1 = l2 = 0; mark = -1; /* check that regions occur in ascending order */ current1 = current2 = -1; /* for compatibility mode */ n_0_1 = n_1_0 = 0; /* number of 0:1 
and 1:0 alignments, which are skipped */ while (! feof(af)) { if (NULL == fgets(line, CL_MAX_LINE_LENGTH, af)) break; /* end of file (or read error, which we choose to ignore) */ if (4 != sscanf(line, "%d %d %d %d", &f1, &l1, &f2, &l2)) { Rprintf( "%s: input format error: %s", progname, line); rcqp_receive_error(1); } /* skip 0:1 and 1:0 alignments */ if (l1 < f1) { n_0_1++; continue; } if (l2 < f2) { n_1_0++; continue; } /* check that source regions are non-overlapping and in ascending order */ if (((reverse) ? f2 : f1) <= mark) { Rprintf( "%s: source regions of alignment must be in ascending order\n", progname); Rprintf( "Last region was [*, %d]; current is [%d, %d].\n", mark, f1, l1); Rprintf( "Aborted.\n"); rcqp_receive_error(1); } mark = (reverse) ? l2 : l1; /* write alignment region to .alx file */ if (reverse) { NwriteInt(f2, alx); NwriteInt(l2, alx); NwriteInt(f1, alx); NwriteInt(l1, alx); } else { NwriteInt(f1, alx); NwriteInt(l1, alx); NwriteInt(f2, alx); NwriteInt(l2, alx); } if (compatibility) { /* source and target regions of .alg file must be contiguous; store start points only; */ /* hence we must collapse crossing alignments into one larger region (I know that's bullshit) */ if ((f1 > current1) && (f2 > current2)) { if (reverse) { NwriteInt(f2, alg); NwriteInt(f1, alg); } else { NwriteInt(f1, alg); NwriteInt(f2, alg); } current1 = f1; current2 = f2; } } } if (compatibility) { if (reverse) { NwriteInt(size2, alg); NwriteInt(size1, alg); /* end of corpus alignment point*/ } else { NwriteInt(size1, alg); NwriteInt(size2, alg); /* end of corpus alignment point*/ } } if (verbose) { Rprintf("I skipped %d 0:1 alignments and %d 1:0 alignments.\n", n_0_1, n_1_0); } /* that's it; close file handles */ fclose(alx); if (compatibility) fclose(alg); if (af_is_pipe) pclose(af); else fclose(af); return 0; }
/**
 * Perform "operation" on the two match lists (can be initial).
 *
 * The result is assigned to list1.
 *
 * Supported operations:
 *  - Union:        merge of list1 and list2; entries whose start is -1 are
 *                  skipped and identical entries are stored once.
 *  - Intersection: entries present in both lists (same start and, if both
 *                  lists have end arrays, same end).
 *  - Complement:   all corpus positions not in list1 (list2 is not used);
 *                  only possible for lists without an end array.
 *  - Identity:     list1 becomes a copy of list2.
 *  - Uniq:         removes entries equal to the previously kept entry.
 *  - Reduce:       removes entries whose start is -1.
 *
 * The merge loops of Union and Intersection walk both lists front to back
 * comparing start values.
 * NOTE(review): this presumably requires both lists to be sorted by
 * start position -- confirm with callers.
 *
 * this whole code is WRONG when one of the matchlists is inverted
 * TODO!
 *
 * Also TODO: give it a better name.
 *
 * This contains, by far, most of the code in the Matchlist module.
 *
 * @param list1     First operand; receives the result.
 * @param operation The set operation to perform.
 * @param list2     Second operand. If it is inverted, Union may replace it
 *                  by an equivalent non-inverted (physically complemented) list.
 * @return          1 on success, 0 on failure (errors are printed to stderr).
 */
int
Setop(Matchlist *list1, MLSetOp operation, Matchlist *list2)
{
  int i, j, k, t, ins;
  Matchlist tmp;
  Attribute *attr;

  switch (operation) {

  case Union:
    /*
     * -------------------- UNION
     */

    /*
     * TODO:
     * optimize in case
     *   (list1->matches_whole_corpus && list2->matches_whole_corpus)
     */
    if (list2->start == NULL) {
      if (list2->is_inverted)
        /* l2 is empty, but inverted, so the result is the whole corpus, as in l2. */
        return Setop(list1, Identity, list2);
      else
        /* the result is list1, so just return */
        return 1;
    }
    else if (list1->start == NULL) {
      if (list1->is_inverted)
        /* empty, but inverted --> whole corpus, l1 */
        return 1;
      else
        /* the result is in list2, so return a copy */
        return Setop(list1, Identity, list2);
    }
    else if (list1->is_inverted && list2->is_inverted) {
      /* union of 2 inverted lists is the inverted intersection */
      list1->is_inverted = 0;
      list2->is_inverted = 0;
      Setop(list1, Intersection, list2);
      list1->is_inverted = 1;
      /* list2 is only an operand: give it back with its original flag */
      list2->is_inverted = 1;
    }
    else {
      /* exactly one list may be inverted here: turn it into a plain list first.
       * If that fails, the flag is restored and the failure is passed on
       * instead of merging a list with the wrong meaning. */
      if (list1->is_inverted) {
        list1->is_inverted = 0;
        if (!Setop(list1, Complement, NULL)) {
          list1->is_inverted = 1;
          return 0;
        }
      }
      if (list2->is_inverted) {
        list2->is_inverted = 0;
        if (!Setop(list2, Complement, NULL)) {
          list2->is_inverted = 1;
          return 0;
        }
      }

      tmp.tabsize = list1->tabsize + list2->tabsize;
      tmp.start = (int *)cl_malloc(sizeof(int) * tmp.tabsize);

      /* end / target arrays are only kept if BOTH lists have them */
      if (list1->end && list2->end)
        tmp.end = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      else
        tmp.end = NULL;

      if (list1->target_positions && list2->target_positions)
        tmp.target_positions = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      else
        tmp.target_positions = NULL;

      i = 0;                    /* the position in list1 */
      j = 0;                    /* the position in list2 */
      k = 0;                    /* the insertion point in the result list `tmp' */

      while ((i < list1->tabsize) || (j < list2->tabsize)) {
        if ((i < list1->tabsize) && (list1->start[i] == -1)) {
          i++;                  /* skip deleted entry of list1 */
        }
        else if ((j < list2->tabsize) && (list2->start[j] == -1)) {
          j++;                  /* skip deleted entry of list2 */
        }
        else if ((j >= list2->tabsize) ||
                 ((i < list1->tabsize) && (list1->start[i] < list2->start[j]))) {
          /* copy (remaining) item from list1 */
          tmp.start[k] = list1->start[i];
          if (tmp.end)
            tmp.end[k] = list1->end[i];
          if (tmp.target_positions)
            tmp.target_positions[k] = list1->target_positions[i];
          k++;
          i++;
        }
        else if ((i >= list1->tabsize) ||
                 ((j < list2->tabsize) && (list1->start[i] > list2->start[j]))) {
          /* copy (remaining) item from list2 */
          tmp.start[k] = list2->start[j];
          if (tmp.end)
            tmp.end[k] = list2->end[j];
          if (tmp.target_positions)
            tmp.target_positions[k] = list2->target_positions[j];
          k++;
          j++;
        }
        else {
          /* both start positions are identical. Now check whether the end
           * positions are also the same => the ranges are identical and
           * the duplicate is to be eliminated.
           */
          tmp.start[k] = list1->start[i];

          if ((tmp.end == NULL) || (list1->end[i] == list2->end[j])) {
            /* real duplicate, copy once */
            if (tmp.end)
              tmp.end[k] = list1->end[i];
            if (tmp.target_positions)
              tmp.target_positions[k] = list1->target_positions[i];
            i++;
            j++;
          }
          else {
            /* we have existing, non-equal end positions. copy the smaller one. */
            if (list1->end[i] < list2->end[j]) {
              tmp.end[k] = list1->end[i];
              if (tmp.target_positions)
                tmp.target_positions[k] = list1->target_positions[i];
              i++;
            }
            else {
              tmp.end[k] = list2->end[j];
              if (tmp.target_positions)
                tmp.target_positions[k] = list2->target_positions[j];
              j++;
            }
          }
          k++;
        }
      }

      assert(k <= tmp.tabsize);

      if (k == 0) {
        /* nothing was copied (all entries were deleted ones): release the
         * buffers instead of reallocating them to size 0 */
        cl_free(tmp.start);
        tmp.start = NULL;
        cl_free(tmp.end);
        tmp.end = NULL;
        cl_free(tmp.target_positions);
        tmp.target_positions = NULL;
      }
      else if (k < tmp.tabsize) {
        /* we did not eliminate any duplicates if k==tmp.tabsize.
         * So, in that case, we do not have to bother with reallocs.
         */
        tmp.start = (int *)cl_realloc((char *)tmp.start, sizeof(int) * k);
        if (tmp.end)
          tmp.end = (int *)cl_realloc((char *)tmp.end, sizeof(int) * k);
        if (tmp.target_positions)
          tmp.target_positions = (int *)cl_realloc((char *)tmp.target_positions, sizeof(int) * k);
      }

      cl_free(list1->start);
      cl_free(list1->end);
      cl_free(list1->target_positions);

      list1->start = tmp.start;
      tmp.start = NULL;
      list1->end = tmp.end;
      tmp.end = NULL;
      list1->target_positions = tmp.target_positions;
      tmp.target_positions = NULL;
      list1->tabsize = k;
      list1->matches_whole_corpus = 0;
      list1->is_inverted = 0;
    }
    break;

  case Intersection:
    /*
     * -------------------- INTERSECTION
     */
    if (list1->tabsize == 0 && list1->is_inverted) {
      /* l1 matches whole corpus, so intersection is equal to l2 */
      return Setop(list1, Identity, list2);
    }
    else if (list2->tabsize == 0 && list2->is_inverted) {
      /* l2 matches whole corpus, so intersection is equal to l1 */
      return 1;
    }
    else if ((list1->tabsize == 0) || (list2->tabsize == 0)) {
      /*
       * Bingo. one of the two is empty AND NOT INVERTED. So
       * the intersection is also empty.
       */
      cl_free(list1->start);
      list1->start = NULL;
      cl_free(list1->end);
      list1->end = NULL;
      cl_free(list1->target_positions);
      list1->target_positions = NULL;
      list1->tabsize = 0;
      list1->matches_whole_corpus = 0;
      list1->is_inverted = 0;
    }
    else if (list1->is_inverted && list2->is_inverted) {
      /* intersection of 2 inverted lists is the inverted union */
      list1->is_inverted = 0;
      list2->is_inverted = 0;
      Setop(list1, Union, list2);
      list1->is_inverted = 1;
      /* list2 is only an operand: give it back with its original flag */
      list2->is_inverted = 1;
    }
    else {
      /*
       * Two non-empty lists. ONE of both may be inverted.
       * We have to do some work then
       */
      if (list1->is_inverted)
        tmp.tabsize = list2->tabsize;
      else if (list2->is_inverted)
        tmp.tabsize = list1->tabsize;
      else
        tmp.tabsize = MIN(list1->tabsize, list2->tabsize);

      tmp.start = (int *)cl_malloc(sizeof(int) * tmp.tabsize);

      if (list1->end && list2->end)
        tmp.end = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      else
        tmp.end = NULL;

      if (list1->target_positions && list2->target_positions)
        tmp.target_positions = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      else
        tmp.target_positions = NULL;

      i = 0;                    /* the position in list1 */
      j = 0;                    /* the position in list2 */
      k = 0;                    /* the insertion point in the result list */

      while ((i < list1->tabsize) && (j < list2->tabsize)) {
        if (list1->start[i] < list2->start[j]) {
          i++;
        }
        else if (list1->start[i] > list2->start[j]) {
          j++;
        }
        else {
          /* both start positions are identical. Now check whether the end
           * positions are also the same => the ranges are identical and
           * one version is to be copied.
           */
          if ((tmp.end == NULL) || (list1->end[i] == list2->end[j])) {
            /* real duplicate, copy once */
            tmp.start[k] = list1->start[i];
            if (tmp.end)
              tmp.end[k] = list1->end[i];
            if (tmp.target_positions)
              tmp.target_positions[k] = list1->target_positions[i];
            i++;
            j++;
            k++;
          }
          else {
            /*
             * we have existing, non-equal end positions. Advance on
             * list with the smaller element.
             */
            if (list1->end[i] < list2->end[j])
              i++;
            else
              j++;
          }
        }
      }

      assert(k <= tmp.tabsize);

      if (k == 0) {
        /* we did not copy anything. result is empty. */
        cl_free(tmp.start);
        tmp.start = NULL;
        cl_free(tmp.end);
        tmp.end = NULL;
        cl_free(tmp.target_positions);
        tmp.target_positions = NULL;
      }
      else if (k < tmp.tabsize) {
        /* the buffers are larger than needed only if k < tmp.tabsize.
         * Otherwise we do not have to bother with reallocs.
         */
        tmp.start = (int *)cl_realloc((char *)tmp.start, sizeof(int) * k);
        if (tmp.end)
          tmp.end = (int *)cl_realloc((char *)tmp.end, sizeof(int) * k);
        if (tmp.target_positions)
          tmp.target_positions = (int *)cl_realloc((char *)tmp.target_positions, sizeof(int) * k);
      }

      cl_free(list1->start);
      cl_free(list1->end);
      cl_free(list1->target_positions);

      list1->start = tmp.start;
      tmp.start = NULL;
      list1->end = tmp.end;
      tmp.end = NULL;
      list1->target_positions = tmp.target_positions;
      tmp.target_positions = NULL;
      list1->tabsize = k;
      list1->matches_whole_corpus = 0;
      list1->is_inverted = 0;
    }
    break;

  case Complement:
    /*
     * -------------------- COMPLEMENT
     * in that case. ML2 should be empty. We suppose it is.
     */

    /*
     * The complement of a non-initial matchlist (one with end positions)
     * is not defined here, so do it only for initial ones.
     */
    if (list1->end) {
      fprintf(stderr, "Can't calculate complement for non-initial matchlist.\n");
      return 0;
    }

    /* we could always make the complement by toggling the inversion flag,
     * but we only do that in case the list is inverted, otherwise we would
     * need another function to physically make the complement
     */
    if (list1->is_inverted) {
      list1->is_inverted = 0;
      return 1;
    }

    if (!evalenv) {
      fprintf(stderr, "Can't calculate complement with NULL eval env\n");
      return 0;
    }
    if (!evalenv->query_corpus) {
      fprintf(stderr, "Can't calculate complement with NULL query_corpus.\n");
      return 0;
    }
    if (!access_corpus(evalenv->query_corpus)) {
      fprintf(stderr, "Complement: can't access current corpus.\n");
      return 0;
    }

    /*
     * OK. The tests went by. Now, the size of the new ML is the
     * size of the corpus MINUS the size of the current matchlist.
     */
    if ((attr = find_attribute(evalenv->query_corpus->corpus, DEFAULT_ATT_NAME, ATT_POS, NULL)) == NULL) {
      fprintf(stderr, "Complement: can't find %s attribute of current corpus\n", DEFAULT_ATT_NAME);
      return 0;
    }

    i = cl_max_cpos(attr);
    if (cl_errno != CDA_OK) {
      fprintf(stderr, "Complement: can't get attribute size\n");
      return 0;
    }

    tmp.tabsize = i - list1->tabsize;

    if (tmp.tabsize < 0) {
      /* must not be used as an allocation size; list1 is left untouched */
      fprintf(stderr, "Complement: matchlist has more entries than the corpus has positions\n");
      return 0;
    }

    if (tmp.tabsize == 0) {
      /*
       * Best case. Result is empty.
       */
      cl_free(list1->start);
      list1->start = NULL;
      cl_free(list1->end);
      list1->end = NULL;
      cl_free(list1->target_positions);
      list1->target_positions = NULL;
      list1->matches_whole_corpus = 0;
      list1->tabsize = 0;
      list1->is_inverted = 0;
    }
    else if (tmp.tabsize == i) {
      /*
       * Worst case.
       * result is a copy of the corpus.
       *
       * TODO: This is not true if we have -1 elements in the source list.
       */
      cl_free(list1->start);
      cl_free(list1->end);
      list1->end = NULL;
      cl_free(list1->target_positions);
      list1->target_positions = NULL;

      list1->start = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      list1->tabsize = tmp.tabsize;
      list1->matches_whole_corpus = 1;
      list1->is_inverted = 0;

      for (i = 0; i < tmp.tabsize; i++)
        list1->start[i] = i;
    }
    else {
      /*
       * in between.
       */
      tmp.start = (int *)cl_malloc(sizeof(int) * tmp.tabsize);
      tmp.end = NULL;
      tmp.target_positions = NULL;
      tmp.matches_whole_corpus = 0;

      j = 0;                    /* index in source list */
      t = 0;                    /* index in target list */

      /* every corpus position k that is not the next entry of list1 goes to the result */
      for (k = 0; k < i; k++) {
        if ((j >= list1->tabsize) || (k < list1->start[j])) {
          tmp.start[t] = k;
          t++;
        }
        else if (k == list1->start[j]) {
          j++;
        }
        else /* (k > list1->start[j]) */ {
          assert("Error in Complement calculation routine" && 0);
        }
      }
      assert(t == tmp.tabsize);

      cl_free(list1->start);
      cl_free(list1->end);
      cl_free(list1->target_positions);

      list1->start = tmp.start;
      tmp.start = NULL;
      list1->end = tmp.end;
      tmp.end = NULL;
      list1->target_positions = tmp.target_positions;
      tmp.target_positions = NULL;
      list1->tabsize = tmp.tabsize;
      list1->matches_whole_corpus = 0;
      list1->is_inverted = 0;
    }
    break;

  case Identity:
    /*
     * -------------------- IDENTITY
     * create a copy of ML2 into ML1
     */

    /* copying a list onto itself must not free the data that is about to be copied */
    if (list1 == list2)
      break;

    free_matchlist(list1);

    list1->tabsize = list2->tabsize;
    list1->matches_whole_corpus = list2->matches_whole_corpus;
    list1->is_inverted = list2->is_inverted;

    if (list2->start) {
      list1->start = (int *)cl_malloc(sizeof(int) * list2->tabsize);
      memcpy((char *)list1->start, (char *)list2->start, sizeof(int) * list2->tabsize);
    }
    if (list2->end) {
      list1->end = (int *)cl_malloc(sizeof(int) * list2->tabsize);
      memcpy((char *)list1->end, (char *)list2->end, sizeof(int) * list2->tabsize);
    }
    if (list2->target_positions) {
      list1->target_positions = (int *)cl_malloc(sizeof(int) * list2->tabsize);
      memcpy((char *)list1->target_positions, (char *)list2->target_positions, sizeof(int) * list2->tabsize);
    }
    break;

  case Uniq:
    /*
     * -------------------- UNIQ
     * create a unique version of ML1
     * working destructively on list1
     */
    if (list1->start && (list1->tabsize > 0)) {
      ins = 0;                  /* the insertion point */

      if (list1->end) {
        for (i = 0; i < list1->tabsize; i++) {
          /* keep the entry unless it equals the last entry that was kept */
          if ((ins == 0) ||
              ((list1->start[i] != list1->start[ins-1]) || (list1->end[i] != list1->end[ins-1]))) {
            /* copy the data from the current position down to the insertion point. */
            list1->start[ins] = list1->start[i];
            list1->end[ins] = list1->end[i];
            if (list1->target_positions)
              list1->target_positions[ins] = list1->target_positions[i];
            ins++;
          }
        }
      }
      else {
        for (i = 0; i < list1->tabsize; i++) {
          if ((ins == 0) || (list1->start[i] != list1->start[ins-1])) {
            /* copy the data from the current position down to the insertion point. */
            list1->start[ins] = list1->start[i];
            if (list1->target_positions)
              list1->target_positions[ins] = list1->target_positions[i];
            ins++;
          }
        }
      }

      if (ins != list1->tabsize) {
        /*
         * no elements were deleted from the list when ins==tabsize. So
         * we do not have to do anything then.
         * Otherwise, the list was used destructively. Free up used space.
         */
        list1->start = (int *)cl_realloc(list1->start, sizeof(int) * ins);
        if (list1->end)
          list1->end = (int *)cl_realloc(list1->end, sizeof(int) * ins);
        if (list1->target_positions)
          list1->target_positions = (int *)cl_realloc(list1->target_positions, sizeof(int) * ins);
        list1->tabsize = ins;
        list1->matches_whole_corpus = 0;
        list1->is_inverted = 0;
      }
    }
    break;

  case Reduce:
    /*
     * -------------------- REDUCE
     * remove all entries whose start is -1,
     * working destructively on list1
     */
    if ((list1->start) && (list1->tabsize > 0)) {
      ins = 0;                  /* the insertion point */

      /* for the sake of efficiency, we distinguish here between
       * initial matchlists and non-initial matchlists. Two almost
       * identical loops are performed, so that the test for initial
       * mls is done once instead of inside the loop
       */
      if (list1->end) {
        for (i = 0; i < list1->tabsize; i++) {
          if (list1->start[i] != -1) {
            /* copy the data from the current position down to the insertion point. */
            if (i != ins) {
              list1->start[ins] = list1->start[i];
              list1->end[ins] = list1->end[i];
              if (list1->target_positions)
                list1->target_positions[ins] = list1->target_positions[i];
            }
            ins++;
          }
        }
      }
      else {
        for (i = 0; i < list1->tabsize; i++) {
          if (list1->start[i] != -1) {
            /* copy the data from the current position down to the insertion point. */
            if (i != ins) {
              list1->start[ins] = list1->start[i];
              if (list1->target_positions)
                list1->target_positions[ins] = list1->target_positions[i];
            }
            ins++;
          }
        }
      }

      if (ins == 0) {
        /*
         * all elements have been deleted. So free the used space.
         */
        cl_free(list1->start);
        list1->start = NULL;
        cl_free(list1->end);
        list1->end = NULL;
        cl_free(list1->target_positions);
        list1->target_positions = NULL;
        list1->tabsize = 0;
        list1->matches_whole_corpus = 0;
        list1->is_inverted = 0;
      }
      else if (ins != list1->tabsize) {
        /*
         * no elements were deleted from the list when ins==tabsize. So
         * we do not have to do anything then.
         * Otherwise, the list was used destructively. Free up used space.
         */
        list1->start = (int *)cl_realloc(list1->start, sizeof(int) * ins);
        if (list1->end)
          list1->end = (int *)cl_realloc(list1->end, sizeof(int) * ins);
        if (list1->target_positions)
          list1->target_positions = (int *)cl_realloc(list1->target_positions, sizeof(int) * ins);
        list1->tabsize = ins;
        list1->matches_whole_corpus = 0;
        list1->is_inverted = 0;
      }
    }
    break;

  default:
    assert("Illegal operator in Setop" && 0);
    return 0;
  }

  return 1;
}