/*
 * "rcqpCmd_lexicon_size(SEXP inAttribute)" --
 *
 * R interface: returns the lexicon size (number of distinct ids) of a
 * positional attribute as an integer vector of length 1, or R_NilValue
 * if the attribute cannot be looked up.
 *
 * FIX: the original unbalanced the R protection stack on the
 * "negative size" path (UNPROTECT(1) followed by falling through to
 * the trailing UNPROTECT(2)).  Each exit path now pops exactly the
 * number of objects it protected.
 */
SEXP rcqpCmd_lexicon_size(SEXP inAttribute)
{
    SEXP result = R_NilValue;
    char *a;
    Attribute *attribute;
    int size;

    if (!isString(inAttribute) || length(inAttribute) != 1)
        error("argument 'attribute' must be a string");

    PROTECT(inAttribute);
    a = (char*)CHAR(STRING_ELT(inAttribute,0));
    attribute = cqi_lookup_attribute(a, ATT_POS);

    if (attribute == NULL) {
        /* attribute not found: nothing was allocated, pop inAttribute only */
        UNPROTECT(1);
        return R_NilValue;
    }

    size = cl_max_id(attribute);
    if (size < 0) {
        /* CL error: balance the stack before signalling */
        UNPROTECT(1);
        Rprintf("negative size");
        rcqp_send_error();
        return R_NilValue;  /* reached only if rcqp_send_error() returns */
    }

    result = PROTECT(allocVector(INTSXP, 1));
    INTEGER(result)[0] = size;
    UNPROTECT(2);           /* inAttribute + result */
    return result;
}
void do_cqi_cl_lexicon_size(void) { char *a; Attribute *attribute; int size; a = cqi_read_string(); if (server_debug) Rprintf( "CQi: CQI_CL_LEXICON_SIZE('%s')\n", a); attribute = cqi_lookup_attribute(a, ATT_POS); if (attribute != NULL) { size = cl_max_id(attribute); if (size < 0) { send_cl_error(); } else { cqi_data_int(size); } } else { cqi_command(cqi_errno); /* cqi_errno set by lookup() */ } free(a); }
/**
 * Prints statistical information about a corpus to STDOUT.
 *
 * One line is printed per attribute: token and type counts for a
 * P-attribute, region count for an S-attribute (noting whether it
 * carries annotations), and alignment-block count for an A-attribute
 * (noting extended alignment).
 *
 * @param corpus  The corpus to analyse.
 */
void describecorpus_show_statistics (Corpus *corpus)
{
  Attribute *att;

  for (att = corpus->attributes; att != NULL; att = att->any.next) {
    switch (att->any.type) {

    case ATT_POS: {
      int n_tokens, n_types;
      Rprintf("p-ATT %-16s ", att->any.name);
      n_tokens = cl_max_cpos(att);
      n_types  = cl_max_id(att);
      if ((n_tokens > 0) && (n_types > 0))
        Rprintf("%10d tokens, %8d types", n_tokens, n_types);
      else
        Rprintf(" NO DATA");
      break;
    }

    case ATT_STRUC: {
      int n_regions;
      Rprintf("s-ATT %-16s ", att->any.name);
      n_regions = cl_max_struc(att);
      if (n_regions < 0)
        Rprintf(" NO DATA");
      else {
        Rprintf("%10d regions", n_regions);
        if (cl_struc_values(att))
          Rprintf(" (with annotations)");
      }
      break;
    }

    case ATT_ALIGN: {
      int n_blocks;
      Rprintf("a-ATT %-16s ", att->any.name);
      n_blocks = cl_max_alg(att);
      if (n_blocks < 0)
        Rprintf(" NO DATA");
      else {
        Rprintf("%10d alignment blocks", n_blocks);
        if (cl_has_extended_alignment(att))
          Rprintf(" (extended)");
      }
      break;
    }

    default:
      Rprintf("??? %-16s (unknown attribute type)", att->any.name);
      break;
    }

    Rprintf("\n");
  }
  Rprintf("\n");
}
/**
 * Validates the REVCORP component of the given attribute.
 *
 * This function validates a REVCORP (i.e. an uncompressed index).
 * It assumes that a lexicon, frequencies and (compressed or
 * uncompressed) token stream are available for CL access for the
 * given attribute.
 *
 * Strategy: rebuild the per-id index offsets from the token
 * frequencies, then walk the token stream once, checking that each
 * corpus position appears in the index exactly where expected.
 *
 * @param attr  The attribute whose REVCORP should be checked.
 * @return      True for all OK, false for a problem.
 */
int validate_revcorp(Attribute *attr)
{
  Component *revcorp = ensure_component(attr, CompRevCorpus, 0);
  int *next_offset;             /* per-lexicon-id write pointer into the index */
  int n_types, n_tokens;
  int id, cpos, expected, k;

  printf(" ? validating %s ... ", cid_name(CompRevCorpus));
  fflush(stdout);

  if (revcorp == NULL) {
    printf("FAILED (no data)\n");
    return 0;
  }

  n_types  = cl_max_id(attr);
  n_tokens = cl_max_cpos(attr);
  if ((n_types <= 0) || (n_tokens <= 0)) {
    printf("FAILED (corpus access error)\n");
    return 0;
  }
  if (revcorp->size != n_tokens) {
    printf("FAILED (wrong size)\n");
    return 0;
  }

  /* seed per-id offsets by recomputing the REVIDX component from token frequencies */
  next_offset = (int *) cl_calloc(n_types, sizeof(int));
  expected = 0;
  for (id = 0; id < n_types; id++) {
    next_offset[id] = expected;
    expected += cl_id2freq(attr, id);
  }

  /* scan the token stream: each token's cpos must sit at its id's next index slot */
  for (cpos = 0; cpos < n_tokens; cpos++) {
    id = cl_cpos2id(attr, cpos);
    if ((id < 0) || (id >= n_types)) {
      printf("FAILED (inconsistency in token stream)\n");
      cl_free(next_offset);
      return 0;
    }
    if (ntohl(revcorp->data.data[next_offset[id]]) != cpos) {
      printf("FAILED\n");
      cl_free(next_offset);
      return 0;
    }
    next_offset[id]++;
  }

  /* final pointers must equal the cumulative token frequencies */
  expected = 0;
  for (k = 0; k < n_types; k++) {
    expected += cl_id2freq(attr, k);
    if (next_offset[k] != expected) {
      printf("FAILED (token frequencies incorrect)\n");
      cl_free(next_offset);
      return 0;
    }
  }

  cl_free(next_offset);
  printf("OK\n");
  return 1;
}
/**
 * Compresses the token stream of a p-attribute.
 *
 * Three files are created: the compressed token stream, the descriptor block,
 * and a sync file.
 *
 * The canonical Huffman code construction strongly follows
 * Witten/Moffat/Bell, ``Managing Gigabytes'', pp. 335ff.
 *
 * FIX: corrected the FREQS-prerequisite error message which referred to the
 * "LEXION" component instead of "LEXICON".
 *
 * @param attr   The attribute to compress.
 * @param hc     Location for the resulting Huffmann code descriptor block.
 * @param fname  Base filename for the resulting files.
 * @return       Always 1; all failure paths abort via rcqp_receive_error().
 */
int compute_code_lengths(Attribute *attr, HCD *hc, char *fname)
{
  int id, i, h;

  int nr_codes = 0;

  /* heap workspace: cells [0..size-1] hold tree pointers/codes,
   * cells [size..2*size-1] hold (initially) frequencies */
  int *heap = NULL;
  unsigned *codelength = NULL; /* was char[], probably to save space; but that's unnecessary and makes gcc complain */

  int issued_codes[MAXCODELEN];
  int next_code[MAXCODELEN];

  long sum_bits;

  Rprintf("COMPRESSING TOKEN STREAM of %s.%s\n", corpus_id_cwb_huffcode, attr->any.name);

  /* I need the following components:
   * - CompCorpus
   * - CompCorpusFreqs
   * - CompLexicon
   * - CompLexiconIdx
   * and want to force the CL to use them rather than compressed data.
   */
  {
    Component *comp;

    if ((comp = ensure_component(attr, CompCorpus, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the CORPUS component\n");
      rcqp_receive_error(1);
    }
    if ((comp = ensure_component(attr, CompLexicon, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXICON component\n");
      rcqp_receive_error(1);
    }
    if ((comp = ensure_component(attr, CompLexiconIdx, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the LEXIDX component\n");
      rcqp_receive_error(1);
    }
    if ((comp = ensure_component(attr, CompCorpusFreqs, 0)) == NULL) {
      Rprintf( "Computation of huffman codes needs the FREQS component.\n"
               "Run 'makeall -r %s -c FREQS %s %s' in order to create it.\n",
               corpus->registry_dir, corpus->registry_name, attr->any.name);
      rcqp_receive_error(1);
    }
  }

  /*
   * strongly follows Witten/Moffat/Bell: ``Managing Gigabytes'',
   * pp. 335ff.
   */

  hc->size = cl_max_id(attr);     /* the size of the attribute (nr of items) */
  if ((hc->size <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_id() failed");
    rcqp_receive_error(1);
  }

  hc->length = cl_max_cpos(attr); /* the length of the attribute (nr of tokens) */
  if ((hc->length <= 0) || (cderrno != CDA_OK)) {
    cdperror("(aborting) cl_max_cpos() failed");
    rcqp_receive_error(1);
  }

  hc->symbols = NULL;
  hc->min_codelen = 100;          /* sentinel: any real code length is smaller */
  hc->max_codelen = 0;

  memset((char *)hc->lcount, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->min_code, '\0', MAXCODELEN * sizeof(int));
  memset((char *)hc->symindex, '\0', MAXCODELEN * sizeof(int));
  memset((char *)issued_codes, '\0', MAXCODELEN * sizeof(int));

  codelength = (unsigned *)cl_calloc(hc->size, sizeof(unsigned));

  /* =========================================== make & initialize the heap */

  heap = (int *)cl_malloc(hc->size * 2 * sizeof(int));

  for (i = 0; i < hc->size; i++) {
    heap[i] = hc->size + i;
    heap[hc->size+i] = get_id_frequency(attr, i) + 1;
    /* add-one trick needed to avoid unsupported Huffman codes > 31 bits for very large corpora
       of ca. 2 billion words: theoretical optimal code length for hapax legomena in such
       corpora is ca. 31 bits, and the Huffman algorithm sometimes generates 32-bit codes;
       with add-one trick, the theoretical optimal code length is always <= 30 bits */
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0)
    fprintf(protocol, "Allocated heap with %d cells for %d items\n\n", hc->size * 2, hc->size);
  if (do_protocol > 2)
    print_heap(heap, hc->size, "After Initialization");
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 1 */

  h = hc->size;

  /*
   * we address the heap in the following manner: when we start array
   * indices at 1, the left child is at 2i, and the right child is at
   * 2i+1. So we maintain this scheme and decrement just before
   * adressing the array.
   */

  /*
   * construct the initial min-heap
   */
  for (i = hc->size/2; i > 0; i--) {
    /* do:
     *   bottom up, left to right,
     *   for each root of each subtree, sift if necessary
     */
    sift(heap, h, i);
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 2) {
    print_heap(heap, hc->size, "Initial Min-Heap");
    fprintf(protocol, "\n");
  }
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 2 */

  /* smallest item at top of heap now, remove the two smallest items
   * and sift, find second smallest by removing top and sifting, as
   * long as we have more than one root */
  while (h > 1) {
    int pos[2];

    for (i = 0; i < 2; i++) {
      /* remove topmost (i.e. smallest) item */
      pos[i] = heap[0];
      /* remove and sift, to reobtain heap integrity: move ``last''
       * item to top of heap and sift */
      heap[0] = heap[--h];
      sift(heap, h, 1);
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 3) {
      fprintf(protocol, "Removed smallest item %d with freq %d\n", pos[0], heap[pos[0]]);
      fprintf(protocol, "Removed 2nd smallest item %d with freq %d\n", pos[1], heap[pos[1]]);
    }
    /* ============================== PROTOCOL ============================== */

    /*
     * pos[0] and pos[1] contain pointers to the two smallest items
     * now. since h was decremented twice, h and h+1 are now empty and
     * become the accumulated freq of pos[i]. The individual
     * frequencies are not needed any more, so pointers to h+1 (the
     * acc freq) are stored there instead (tricky, since freq cell
     * becomes pointer cell). So, what happens here, is to include a
     * new element in the heap.
     */
    heap[h] = h+1;
    heap[h+1] = heap[pos[0]] + heap[pos[1]]; /* accumulated freq */
    heap[pos[0]] = heap[pos[1]] = h+1;       /* pointers! */
    h++;                                     /* we put a new element into heap */

    /*
     * now, swap it up until we reobtain heap integrity
     */
    {
      register int parent, current;

      current = h;
      parent = current >> 1;

      while ((parent > 0) &&
             (heap[heap[parent-1]] > heap[heap[current-1]])) {
        int tmp;
        tmp = heap[parent-1];
        heap[parent-1] = heap[current-1];
        heap[current-1] = tmp;
        current = parent;
        parent = current >> 1;
      }
    }
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 3)
    fprintf(protocol, "\n");
  /* ============================== PROTOCOL ============================== */

  /* ================================================== Phase 3 */

  /* compute the code lengths. We don't have any freqs in heap any
   * more, only pointers to parents */

  heap[0] = -1U;  /* root has a depth of 0 */
  heap[1] = 0;    /* we trust in what they say on p. 345 */
  for (i = 2; i < hc->size * 2; i++)
    heap[i] = heap[heap[i]]+1;  /* depth = parent's depth + 1 */

  /* collect the lengths */
  sum_bits = 0L;
  for (i = 0; i < hc->size; i++) {
    int cl = heap[i+hc->size];

    sum_bits += cl * get_id_frequency(attr, i);
    codelength[i] = cl;
    if (cl == 0)
      continue;
    if (cl > hc->max_codelen)
      hc->max_codelen = cl;
    if (cl < hc->min_codelen)
      hc->min_codelen = cl;
    hc->lcount[cl]++;
  }

  /* ============================== PROTOCOL ============================== */
  if (do_protocol > 0) {
    fprintf(protocol, "Minimal code length: %3d\n", hc->min_codelen);
    fprintf(protocol, "Maximal code length: %3d\n", hc->max_codelen);
    fprintf(protocol, "Compressed code len: %10ld bits, %10ld (+1) bytes\n\n\n",
            sum_bits, sum_bits/8);
  }
  /* ============================== PROTOCOL ============================== */

  if (hc->max_codelen >= MAXCODELEN) {
    Rprintf( "Error: Huffman codes too long (%d bits, current maximum is %d bits).\n", hc->max_codelen, MAXCODELEN-1);
    Rprintf( " Please contact the CWB development team for assistance.\n");
    rcqp_receive_error(1);
  }

  if ((hc->max_codelen == 0) && (hc->min_codelen == 100)) {
    /* empty attribute: nothing to encode */
    Rprintf( "Problem: No output generated -- no items?\n");
    nr_codes = 0;
  }
  else {
    /* derive the canonical-code tables: smallest code of each length ... */
    hc->min_code[hc->max_codelen] = 0;
    for (i = hc->max_codelen-1; i > 0; i--)
      hc->min_code[i] = (hc->min_code[i+1] + hc->lcount[i+1]) >> 1;

    /* ... and, per length, the first index into the symbol table */
    hc->symindex[hc->min_codelen] = 0;
    for (i = hc->min_codelen+1; i <= hc->max_codelen; i++)
      hc->symindex[i] = hc->symindex[i-1] + hc->lcount[i-1];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 0) {
      int sum_codes = 0;

      fprintf(protocol, " CL #codes MinCode SymIdx\n");
      fprintf(protocol, "----------------------------------------\n");
      for (i = hc->min_codelen; i <= hc->max_codelen; i++) {
        sum_codes += hc->lcount[i];
        fprintf(protocol, "%3d %7d %7d %7d\n", i, hc->lcount[i], hc->min_code[i], hc->symindex[i]);
      }
      fprintf(protocol, "----------------------------------------\n");
      fprintf(protocol, " %7d\n", sum_codes);
    }
    /* ============================== PROTOCOL ============================== */

    for (i = 0; i < MAXCODELEN; i++)
      next_code[i] = hc->min_code[i];

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "\n");
      fprintf(protocol, " Item f(item) CL Bits Code, String\n");
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* compute and issue codes */
    hc->symbols = heap + hc->size;
    for (i = 0; i < hc->size; i++) {
      /* we store the code for item i in heap[i] */
      heap[i] = next_code[codelength[i]];
      next_code[codelength[i]]++;

      /* ============================== PROTOCOL ============================== */
      if (do_protocol > 1) {
        fprintf(protocol, "%7d %7d %3d %10d ",
                i,
                get_id_frequency(attr, i),
                codelength[i],
                codelength[i] * get_id_frequency(attr, i));
        bprintf(heap[i], codelength[i], protocol);
        fprintf(protocol, " %7d %s\n", heap[i], get_string_of_id(attr, i));
      }
      /* ============================== PROTOCOL ============================== */

      /* and put the item itself in the second half of the table */
      heap[hc->size+hc->symindex[codelength[i]]+issued_codes[codelength[i]]] = i;
      issued_codes[codelength[i]]++;
    }

    /* ============================== PROTOCOL ============================== */
    if (do_protocol > 1) {
      fprintf(protocol, "------------------------------------"
              "------------------------------------\n");
    }
    /* ============================== PROTOCOL ============================== */

    /* The work itself -- encode the attribute data */
    {
      char *path;

      char hcd_path[CL_MAX_LINE_LENGTH];
      char huf_path[CL_MAX_LINE_LENGTH];
      char sync_path[CL_MAX_LINE_LENGTH];

      Component *corp;
      BFile bfd;
      FILE *sync;

      int cl, code, pos;

      corp = ensure_component(attr, CompCorpus, 0);
      assert(corp);

      if (fname) {
        path = fname;
        sprintf(hcd_path, "%s.hcd", path);
        sprintf(huf_path, "%s.huf", path);
        sprintf(sync_path, "%s.huf.syn", path);
      }
      else {
        path = component_full_name(attr, CompHuffSeq, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(huf_path, path);

        path = component_full_name(attr, CompHuffCodes, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(hcd_path, path);

        path = component_full_name(attr, CompHuffSync, NULL);
        assert(path); /* additonal condition (cderrno == CDA_OK) removed, since component_full_name doesn't (re)set cderrno */
        strcpy(sync_path, path);
      }

      Rprintf("- writing code descriptor block to %s\n", hcd_path);
      if (!WriteHCD(hcd_path, hc)) {
        Rprintf( "ERROR: writing %s failed. Aborted.\n", hcd_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing compressed item sequence to %s\n", huf_path);
      if (!BFopen(huf_path, "w", &bfd)) {
        Rprintf( "ERROR: can't create file %s\n", huf_path);
        perror(huf_path);
        rcqp_receive_error(1);
      }

      Rprintf("- writing sync (every %d tokens) to %s\n", SYNCHRONIZATION, sync_path);
      if ((sync = fopen(sync_path, "w")) == NULL) {
        Rprintf( "ERROR: can't create file %s\n", sync_path);
        perror(sync_path);
        rcqp_receive_error(1);
      }

      for (i = 0; i < hc->length; i++) {
        /* SYNCHRONIZE: record the bit-file position every SYNCHRONIZATION tokens */
        if ((i % SYNCHRONIZATION) == 0) {
          if (i > 0)
            BFflush(&bfd);
          pos = BFposition(&bfd);
          NwriteInt(pos, sync);
        }

        id = cl_cpos2id(attr, i);
        if ((id < 0) || (cderrno != CDA_OK)) {
          cdperror("(aborting) cl_cpos2id() failed");
          rcqp_receive_error(1);
        }
        else {
          assert((id >= 0) && (id < hc->size) && "Internal Error");
          cl = codelength[id];
          code = heap[id];
          if (!BFwriteWord((unsigned int)code, cl, &bfd)) {
            Rprintf( "Error writing code for ID %d (%d, %d bits) at position %d. Aborted.\n", id, code, cl, i);
            rcqp_receive_error(1);
          }
        }
      }

      fclose(sync);
      BFclose(&bfd);
    }
  }

  free(codelength);
  free(heap);

  return 1;
}
/**
 * Checks a compressed reversed index for errors by decompressing it.
 *
 * This function this assumes that compress_reversed_index() has been called
 * beforehand and made sure that the _uncompressed_ index is used by CL
 * access functions.
 *
 * For each lexicon id, the Golomb-coded gap sequence is read back from the
 * .crc file and the reconstructed corpus positions are compared one by one
 * against a position stream obtained through the (uncompressed) CL index.
 *
 * @param attr       The attribute to check the index of.
 * @param output_fn  Base name for the compressed RDX files to be read
 *                   (if this is null, filenames will be taken from the
 *                   attribute).
 */
void decompress_check_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];
  int nr_elements;     /* number of lexicon ids */
  int element_freq;    /* frequency of the current id */
  int corpus_size;
  int pos, gap;        /* cpos reconstructed from the gap sequence */
  int b;               /* Golomb parameter for the current id */
  int i, k;

  BFile data_file;
  FILE *index_file;

  PositionStream PStream;
  int true_pos;        /* cpos as reported by the uncompressed index */

  Rprintf("VALIDATING %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  /* either explicit file names (.crc = data, .crx = offsets) or names derived from the attribute */
  if (output_fn) {
    sprintf(data_fname, "%s.crc", output_fn);
    sprintf(index_fname, "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    strcpy(data_fname, s);
    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    strcpy(index_fname, s);
  }

  if (!BFopen(data_fname, "r", &data_file)) {
    Rprintf( "ERROR: can't open file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index from %s\n", data_fname);

  if ((index_file = fopen(index_fname, "r")) == NULL) {
    Rprintf( "ERROR: can't open file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- reading compressed index offsets from %s\n", index_fname);

  for (i = 0; i < nr_elements; i++) {
    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    b = compute_ba(element_freq, corpus_size);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n", i, element_freq, b);

    /* positions are stored as gaps; accumulate and compare against the true stream */
    pos = 0;
    for (k = 0; k < element_freq; k++) {
      gap = read_golomb_code_bf(b, &data_file);
      pos += gap;

      if (1 != cl_read_stream(PStream, &true_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }
      if (pos != true_pos) {
        Rprintf( "ERROR: wrong occurrence of token #%d at cpos %d (correct cpos: %d). Aborted.\n", i, pos, true_pos);
        compressrdx_cleanup(1);
      }
    }

    cl_delete_stream(&PStream);
    BFflush(&data_file);   /* re-align the bit file at a byte boundary for the next id */
  }

  fclose(index_file);
  BFclose(&data_file);

  /* tell the user it's safe to delete the REVCORP and REVCIDX components now */
  Rprintf("!! You can delete the file <%s> now.\n", component_full_name(attr, CompRevCorpus, NULL));
  Rprintf("!! You can delete the file <%s> now.\n", component_full_name(attr, CompRevCorpusIdx, NULL));

  return;
}
/**
 * Compresses the reversed index of a p-attribute.
 *
 * For each lexicon id, the list of corpus positions is converted into a
 * sequence of gaps and written Golomb-coded to the .crc data file; the
 * starting bit offset of each id's sequence is written to the .crx file.
 *
 * @param attr       The attribute to compress the index of.
 * @param output_fn  Base name for the compressed RDX files to be written
 *                   (if this is null, filenames will be taken from the
 *                   attribute).
 */
void compress_reversed_index(Attribute *attr, char *output_fn)
{
  char *s;
  char data_fname[CL_MAX_FILENAME_LENGTH];
  char index_fname[CL_MAX_FILENAME_LENGTH];
  int nr_elements;       /* number of lexicon ids */
  int element_freq;      /* frequency of the current id */
  int corpus_size;
  int last_pos, gap, fpos;
  int b;                 /* Golomb parameter for the current id */
  int i, k;

  BFile data_file;
  FILE *index_file = NULL;

  PositionStream PStream;
  int new_pos;

  Rprintf("COMPRESSING INDEX of %s.%s\n", corpus_id_cwb_compress_rdx, attr->any.name);

  /* ensure that we do NOT use the compressed index while building the
   * compressed index (yeah, a nasty thing that). That is, load the
   * .corpus.rev and .corpus.rdx components in order to force
   * subsequent CL calls to use the uncompressed data.
   */
  {
    Component *comp;

    if ((comp = ensure_component(attr, CompRevCorpus, 0)) == NULL) {
      Rprintf( "Index compression requires the REVCORP component\n");
      compressrdx_cleanup(1);
    }
    if ((comp = ensure_component(attr, CompRevCorpusIdx, 0)) == NULL) {
      Rprintf( "Index compression requires the REVCIDX component\n");
      compressrdx_cleanup(1);
    }
  }

  nr_elements = cl_max_id(attr);
  if ((nr_elements <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_id() failed");
    compressrdx_cleanup(1);
  }

  corpus_size = cl_max_cpos(attr);
  if ((corpus_size <= 0) || (cl_errno != CDA_OK)) {
    cl_error("(aborting) cl_max_cpos() failed");
    compressrdx_cleanup(1);
  }

  /* either explicit file names (.crc = data, .crx = offsets) or names derived from the attribute */
  if (output_fn) {
    sprintf(data_fname, "%s.crc", output_fn);
    sprintf(index_fname, "%s.crx", output_fn);
  }
  else {
    s = component_full_name(attr, CompCompRF, NULL);
    assert(s && (cl_errno == CDA_OK));
    strcpy(data_fname, s);
    s = component_full_name(attr, CompCompRFX, NULL);
    assert(s && (cl_errno == CDA_OK));
    strcpy(index_fname, s);
  }

  if (!BFopen(data_fname, "w", &data_file)) {
    Rprintf( "ERROR: can't create file %s\n", data_fname);
    perror(data_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index to %s\n", data_fname);

  if ((index_file = fopen(index_fname, "wb")) == NULL) {
    Rprintf( "ERROR: can't create file %s\n", index_fname);
    perror(index_fname);
    compressrdx_cleanup(1);
  }
  Rprintf("- writing compressed index offsets to %s\n", index_fname);

  for (i = 0; i < nr_elements; i++) {
    element_freq = cl_id2freq(attr, i);
    if ((element_freq == 0) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) token frequency == 0\n");
      compressrdx_cleanup(1);
    }

    PStream = cl_new_stream(attr, i);
    if ((PStream == NULL) || (cl_errno != CDA_OK)) {
      cl_error("(aborting) index read error");
      compressrdx_cleanup(1);
    }

    b = compute_ba(element_freq, corpus_size);

    /* record where this id's gap sequence starts in the data file */
    fpos = BFposition(&data_file);
    NwriteInt(fpos, index_file);

    if (debug_cwb_compress_rdx)
      fprintf(debug_output, "------------------------------ ID %d (f: %d, b: %d)\n", i, element_freq, b);

    /* write positions as Golomb-coded gaps between consecutive occurrences */
    last_pos = 0;
    for (k = 0; k < element_freq; k++) {
      if (1 != cl_read_stream(PStream, &new_pos, 1)) {
        cl_error("(aborting) index read error\n");
        compressrdx_cleanup(1);
      }

      gap = new_pos - last_pos;
      last_pos = new_pos;

      if (debug_cwb_compress_rdx)
        fprintf(debug_output, "%8d: gap=%4d, b=%4d\n", codepos, gap, b);

      write_golomb_code(gap, b, &data_file);
      codepos++;   /* NOTE(review): codepos is not declared locally — presumably a file-scope running counter of written codes; verify */
    }

    cl_delete_stream(&PStream);
    BFflush(&data_file);   /* re-align the bit file at a byte boundary for the next id */
  }

  fclose(index_file);
  BFclose(&data_file);

  return;
}
/** * Creates feature maps for a source/target corpus pair. * * Example usage: * * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s); * * @param config pointer to a list of strings representing the feature map configuration. * @param config_lines the number of configuration items stored in config_data. * @param w_attr1 The p-attribute in the first corpus to link. * @param w_attr2 The p-attribute in the second corpus to link. * @param s_attr1 The s-attribute in the first corpus to link. * @param s_attr2 The s-attribute in the second corpus to link. * @return the new FMS object. */ FMS create_feature_maps(char **config, int config_lines, Attribute *w_attr1, Attribute *w_attr2, Attribute *s_attr1, Attribute *s_attr2 ) { FMS r; unsigned int *fcount1, *fcount2; int config_pointer; char *b, command[200], dummy[200]; int current_feature, weight, need_to_abort; int *fs1, *fs2; int i,nw1,nw2; r = (FMS) malloc(sizeof(feature_maps_t)); assert(r); r->att1 = w_attr1; r->att2 = w_attr2; r->s1 = s_attr1; r->s2 = s_attr2; init_char_map(); nw1= cl_max_id(w_attr1); if (nw1 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of source corpus\n"); exit(1); } nw2= cl_max_id(w_attr2); if (nw2 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of target corpus\n"); exit(1); } printf("LEXICON SIZE: %d / %d\n", nw1, nw2); fcount1 = (unsigned int*) calloc(nw1+1,sizeof(unsigned int)); fcount2 = (unsigned int*) calloc(nw2+1,sizeof(unsigned int)); r->n_features=1; /* process feature map configuration: first pass */ for (config_pointer = 0; config_pointer < config_lines; config_pointer++) { if ( (b = strpbrk(config[config_pointer],"\n#")) ) /* strip newline and comments */ *b=0; if (sscanf(config[config_pointer],"%s",command)>0) { if(command[0]=='-') { switch(command[1]) { case 'S': { int i1, i2, f1, f2; float threshold; int n_shared=0; if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy)!=3) { 
fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -S:<weight>:<threshold>\n"); fprintf(stderr," Shared words with freq. ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n"); exit(1); } else { printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight); fflush(stdout); for (i1=0; i1 < nw1; i1++) { f1 = cl_id2freq(w_attr1, i1); i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1)); if (i2 >= 0){ f2 = cl_id2freq(w_attr2, i2); if (f1 / (0.0+f1+f2) >=threshold && f2 / (0.0+f1+f2) >= threshold){ fcount1[i1]++; fcount2[i2]++; n_shared++; r->n_features++; } } } printf("[%d]\n",n_shared); } break; } case '1': case '2': case '3': case '4': { int n; if (sscanf(config[config_pointer],"%1s%d:%d %s",command,&n,&weight,dummy)!=3) { fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -<n>:<weight> (n = 1..4)\n"); fprintf(stderr," Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n"); exit(1); } else if(n <= 0 || n>4) { /* this shouldn't happen anyway */ fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n",n,config[config_pointer]); exit(1); } else { int i,f,l; printf("FEATURE: %d-grams, weight=%d ... ", n, weight); fflush(stdout); for(i=0; i<nw1; i++) { l = cl_id2strlen(w_attr1, i); fcount1[i] += (l >= n) ? l - n + 1 : 0; } for(i=0; i<nw2; i++) { l = cl_id2strlen(w_attr2, i); fcount2[i] += (l >= n) ? 
l - n + 1 : 0; } f=1; for(i=0;i<n;i++) f*=char_map_range; r->n_features+=f; printf("[%d]\n", f); } break; } case 'W': { char filename[200], word1[200], word2[200]; FILE *wordlist; int nw,nl=0,i1,i2,n_matched=0; if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -W:<weight>:<filename>\n"); fprintf(stderr, " Word list (read from file <filename>).\n"); exit(1); } else if(!(wordlist=fopen(filename,"r"))) { fprintf(stderr,"ERROR: Cannot read word list file %s.\n", filename); exit(-1); } else { printf("FEATURE: word list %s, weight=%d ... ", filename, weight); fflush(stdout); while((nw=fscanf(wordlist,"%s %s",word1,word2))>0) { nl++; if (nw!=2) fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n", nl,filename,nw); else { if((i1=cl_str2id(w_attr1,word1))>=0 && (i2=cl_str2id(w_attr2,word2)) >=0) { fcount1[i1]++; fcount2[i2]++; n_matched++; r->n_features++; } } } fclose(wordlist); printf("[%d]\n", n_matched); } break; } case 'C': if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -C:<weight>\n"); fprintf(stderr, " Character count [primary feature].\n"); exit(1); } else { /* primary feature -> don't create additional features */ /* first entry in a token's feature list is character count */ for (i=0; i<nw1; i++) fcount1[i]++; for (i=0; i<nw2; i++) fcount2[i]++; printf("FEATURE: character count, weight=%d ... [1]\n", weight); } break; default: fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]); exit(1); break; } } else { fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]); exit(1); } } }
/** * Creates feature maps for a source/target corpus pair. * * This is the constructor function for the FMS class. * * Example usage: * * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s); * * @param config array of strings representing the feature map configuration. * @param config_lines the number of configuration items stored in config_data. * @param w_attr1 The p-attribute in the first corpus to link. * @param w_attr2 The p-attribute in the second corpus to link. * @param s_attr1 The s-attribute in the first corpus to link. * @param s_attr2 The s-attribute in the second corpus to link. * @return the new FMS object. */ FMS create_feature_maps(char **config, int config_lines, Attribute *w_attr1, Attribute *w_attr2, Attribute *s_attr1, Attribute *s_attr2 ) { FMS r; unsigned int *fcount1, *fcount2; /* arrays for types in the lexicons of the source * & target corpora, respectively, counting how often each is used * in a feature */ int config_pointer; char *b, command[CL_MAX_LINE_LENGTH], dummy[CL_MAX_LINE_LENGTH]; int current_feature; int weight; /* holds the weight assigned to the feature(s) we're working on */ int need_to_abort; /* boolean used during pointer check */ /* after we have counted up features, these will become arrays of ints, with one entry per feature */ int *fs1, *fs2; int i; int nw1; /* number of types on the word-attribute of the source corpus */ int nw2; /* number of types on the word-attribute of the target corpus */ /* one last variable: we need to know the character set of the two corpora for assorted purposes */ CorpusCharset charset; charset = cl_corpus_charset(cl_attribute_mother_corpus(w_attr1)); /* first, create the FMS object. 
*/ r = (FMS) malloc(sizeof(feature_maps_t)); assert(r); /* copy in the attribute pointers */ r->att1 = w_attr1; r->att2 = w_attr2; r->s1 = s_attr1; r->s2 = s_attr2; init_char_map(); /* find out how many different word-types occur on each of the p-attributes */ nw1 = cl_max_id(w_attr1); if (nw1 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of source corpus\n"); exit(1); } nw2 = cl_max_id(w_attr2); if (nw2 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of target corpus\n"); exit(1); } printf("LEXICON SIZE: %d / %d\n", nw1, nw2); fcount1 = (unsigned int*) calloc(nw1 + 1, sizeof(unsigned int)); fcount2 = (unsigned int*) calloc(nw2 + 1, sizeof(unsigned int)); r->n_features = 1; /* NOTE there are two passes through the creation of feature maps - two sets of nearly identical code! * First pass to see how many things we need ot count, second pass to count them. */ /* process feature map configuration: first pass */ for (config_pointer = 0; config_pointer < config_lines; config_pointer++) { /* strip newline and comments */ if ( (b = strpbrk(config[config_pointer],"\n#")) ) *b = 0; if (sscanf(config[config_pointer], "%s", command) > 0) { if(command[0] == '-') { /* * These are the FIRST PASS options for the different config lines. * * Possible config commands: -S -W -C -1 -2 -3 -4 */ switch(command[1]) { /* -S : the "shared words" type of feature */ case 'S': { int i1, i2; /* i1 and i2 are temporary indexes into the lexicons of the two corpora */ int f1, f2; /* f1 and f2 are temporary storage for frequencies from the corpus lexicons */ float threshold; int n_shared = 0; /* numebr fo shared words - only calculated for the purpose of printing it */ if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy) != 3) { fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -S:<weight>:<threshold>\n"); fprintf(stderr," Shared words with freq. 
ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n"); exit(1); } else { printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight); fflush(stdout); /* for each type in target corpus, get its frequency, and the corresponding id and frequency * from the target corpus, then test whether it meets the criteria for use as a feature. */ for (i1 = 0; i1 < nw1; i1++) { f1 = cl_id2freq(w_attr1, i1); i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1)); if (i2 >= 0){ f2 = cl_id2freq(w_attr2, i2); /* if it will be used as a feature, increment counts of features in various places */ if ( (f1 / (0.0+f1+f2)) >= threshold && (f2 / (0.0+f1+f2)) >= threshold){ fcount1[i1]++; fcount2[i2]++; n_shared++; r->n_features++; } } } printf("[%d]\n", n_shared); } break; } /* -1 to -4 : shared character sequences (of 1 letter to 4 letters in length) as features */ case '1': case '2': case '3': case '4': { int n; /* length of the n-gram, obviously */ if (sscanf(config[config_pointer], "%1s%d:%d %s", command, &n, &weight, dummy) !=3 ) { fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -<n>:<weight> (n = 1..4)\n"); fprintf(stderr," Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n"); exit(1); } else if(n <= 0 || n > 4) { /* this shouldn't happen anyway */ fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n", n, config[config_pointer]); exit(1); } else { int i,f,l; /* temp storage for lexicon index, n of possible features, && word length */ char *s; printf("FEATURE: %d-grams, weight=%d ... ", n, weight); fflush(stdout); /* for each entry in source-corpus lexicon, add to the number of features IFF * that lexicon entry is longer than 4 characters */ for(i = 0; i < nw1; i++) { /* l = cl_id2strlen(w_attr1, i); */ s = (unsigned char *) cl_strdup(cl_id2str(w_attr1, i)); cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC); l = strlen(s); cl_free(s); fcount1[i] += (l >= n) ? 
l - n + 1 : 0; } /* same for target corpus */ for(i = 0; i < nw2; i++) { /* l = cl_id2strlen(w_attr2, i); */ s = (unsigned char *) cl_strdup(cl_id2str(w_attr2, i)); cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC); l = strlen(s); cl_free(s); fcount2[i] += (l >= n) ? l - n + 1 : 0; } /* set f to number of possible features (= number of possible characters to the power of n) */ f = 1; for(i = 0 ; i < n; i++) f *= char_map_range; /* anmd add that to our total number of features! */ r->n_features += f; printf("[%d]\n", f); } break; } /* -W: the word-translation-equivalence type of feature */ case 'W': { char filename[CL_MAX_LINE_LENGTH], word1[CL_MAX_LINE_LENGTH], word2[CL_MAX_LINE_LENGTH]; FILE *wordlist; int nw; /* number of words scanned from an input line */ int nl = 0; /* counter for the number of lines in the wordlist file we have gone through */ int i1,i2; /* lexicon ids in source and target corpora */ int n_matched = 0; /* counter for n of lines in input file that can be used as a feature. */ if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -W:<weight>:<filename>\n"); fprintf(stderr, " Word list (read from file <filename>).\n"); exit(1); } else if(!(wordlist = fopen(filename,"r"))) { fprintf(stderr,"ERROR: Cannot read word list file %s.\n", filename); exit(-1); } else { printf("FEATURE: word list %s, weight=%d ... ", filename, weight); fflush(stdout); while((nw = fscanf(wordlist,"%s %s",word1,word2))>0) { /* on first line of file, skip UTF8 byte-order-mark if present */ if (nl == 0 && charset == utf8 && strlen(word1) > 3) if (word1[0] == (char)0xEF && word1[1] == (char)0xBB && word1[2] == (char)0xBF) cl_strcpy(word1, (word1 + 3)); nl++; /* check that both word 1 and word 2 are valid for the encoding of the corpora */ if (! 
(cl_string_validate_encoding(word1, charset, 0) && cl_string_validate_encoding(word2, charset, 0)) ) { fprintf(stderr, "ERROR: character encoding error in the word-list input file with the input word list.\n"); fprintf(stderr, " (The error occurs on line %d.)\n", nl); exit(1); } if (nw != 2) fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n",nl,filename,nw); else { /* if word1 and word2 both occur in their respective corpora, this is a feature. */ if( (i1 = cl_str2id(w_attr1, word1)) >= 0 && (i2 = cl_str2id(w_attr2, word2)) >= 0 ) { fcount1[i1]++; fcount2[i2]++; n_matched++; r->n_features++; } } } fclose(wordlist); printf("[%d]\n", n_matched); } break; } /* -C: the character count type of feature. * This feature exists for EVERY word type. */ case 'C': if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -C:<weight>\n"); fprintf(stderr, " Character count [primary feature].\n"); exit(1); } else { /* primary feature -> don't create additional features */ /* first entry in a token's feature list is character count */ for (i=0; i<nw1; i++) fcount1[i]++; for (i=0; i<nw2; i++) fcount2[i]++; printf("FEATURE: character count, weight=%d ... [1]\n", weight); } break; default: fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]); exit(1); break; } } else { fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]); exit(1); } } }