/** * Main function for cwb-huffcode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int R_cwb_huffcode (char *corpus_name, char *registry_dir) { char *registry_directory = registry_dir; char *output_fn = NULL; char *attr_name = DEFAULT_ATT_NAME; Attribute *attr; HCD hc; Rprintf("Corpus: %s\n", corpus_name); int i_want_to_believe = 0; /* skip error checks? */ int all_attributes = 0; protocol = NULL; /* 'delayed' init (see top of file) */ /* ------------------------------------------------- PARSE ARGUMENTS */ /* parse arguments */ all_attributes++; corpus_id_cwb_huffcode = corpus_name; Rprintf("Corpus 1: %s\n", corpus_name); if ((corpus = cl_new_corpus(registry_directory, corpus_id_cwb_huffcode)) == NULL) { Rprintf( "Corpus %s not found in registry %s . Aborted.\n", corpus_id_cwb_huffcode, (registry_directory ? registry_directory : central_corpus_directory())); rcqp_receive_error(1); } Rprintf("Corpus 2: %s\n", corpus_name); if (all_attributes) { for (attr = corpus->attributes; attr; attr = attr->any.next) if (attr->any.type == ATT_POS) { compute_code_lengths(attr, &hc, output_fn); if (! i_want_to_believe) decode_check_huff(attr, output_fn); } } else { if ((attr = cl_new_attribute(corpus, attr_name, ATT_POS)) == NULL) { Rprintf( "Attribute %s.%s doesn't exist. Aborted.\n", corpus_id_cwb_huffcode, attr_name); rcqp_receive_error(1); } compute_code_lengths(attr, &hc, output_fn); if (! i_want_to_believe) decode_check_huff(attr, output_fn); } Rprintf("Corpus 3: %s\n", corpus_name); cl_delete_corpus(corpus); return(0); }
/**
 * Reports basic facts about a corpus through Rprintf().
 *
 * Output consists of the corpus name, registry file, home directory and info
 * file, the token count taken from the 'word' attribute, and the number of
 * positional, structural and alignment attributes.
 *
 * @param corpus                The corpus to report on.
 * @param with_attribute_names  Boolean: iff true, each attribute count is
 *                              followed by a colon and the list printed by
 *                              describecorpus_show_attribute_names().
 */
void
describecorpus_show_basic_info (Corpus *corpus, int with_attribute_names)
{
  Attribute *word_att;
  Attribute *att;
  int n_pos = 0;
  int n_struc = 0;
  int n_align = 0;
  int n_tokens;
  char *suffix;

  /* counts end in a colon only when a name list follows */
  if (with_attribute_names)
    suffix = ":";
  else
    suffix = "";

  Rprintf("description:    %s\n", corpus->name);
  Rprintf("registry file:  %s/%s\n", corpus->registry_dir, corpus->registry_name);
  Rprintf("home directory: %s/\n", corpus->path);
  if (corpus->info_file)
    Rprintf("info file:      %s\n", corpus->info_file);
  else
    Rprintf("info file:      %s\n", "(none)");

  word_att = cl_new_attribute(corpus, "word", ATT_POS);
  if (word_att == NULL) {
    Rprintf( "ERROR: 'word' attribute is missing. Aborted.\n");
    rcqp_receive_error(1);
  }

  n_tokens = cl_max_cpos(word_att);
  Rprintf("size (tokens):  ");
  if (n_tokens < 0)
    Rprintf("ERROR\n");
  else
    Rprintf("%d\n", n_tokens);
  Rprintf("\n");

  /* tally the attributes by type; other types are ignored */
  for (att = corpus->attributes; att != NULL; att = att->any.next) {
    if (att->any.type == ATT_POS)
      p_count_done: n_pos++;
    else if (att->any.type == ATT_STRUC)
      n_struc++;
    else if (att->any.type == ATT_ALIGN)
      n_align++;
  }

  Rprintf("%3d positional attributes%s\n", n_pos, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_POS);

  Rprintf("%3d structural attributes%s\n", n_struc, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_STRUC);

  Rprintf("%3d alignment  attributes%s\n", n_align, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_ALIGN);

  Rprintf("\n");
}
/** * Main function for cwb-makeall. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int main(int argc, char **argv) { char *attr_name = NULL; Attribute *attribute; char *registry_directory = NULL; char *corpus_id = NULL; extern int optind; extern char *optarg; int c; int validate = 0; char *component = NULL; ComponentID cid; int i = 0; /* ------------------------------------------------- PARSE ARGUMENTS */ progname = argv[0]; /* parse arguments */ while ((c = getopt(argc, argv, "+r:c:P:hDM:V")) != EOF) { switch (c) { /* r: registry directory */ case 'r': if (registry_directory == NULL) registry_directory = optarg; else { fprintf(stderr, "%s: -r option used twice\n", progname); exit(2); } break; case 'P': if (attr_name == NULL) attr_name = optarg; else { fprintf(stderr, "%s: -P option used twice\n", progname); exit(2); } break; case 'c': if (component == NULL) component = optarg; else { fprintf(stderr, "%s: -c option used twice\n", progname); exit(2); } break; case 'D': cl_set_debug_level(1); break; case 'M': i = atoi(optarg); cl_set_memory_limit(i); break; case 'V': validate++; break; case 'h': default: makeall_usage(); } } if (optind >= argc) { fprintf(stderr, "Missing argument, try \"%s -h\" for more information.\n", progname); exit(1); } /* first argument: corpus id */ corpus_id = argv[optind++]; if (component != NULL) { cid = component_id(component); if (cid == CompLast) { fprintf(stderr, "Illegal component name: ``%s''\n", component); exit(1); } } else { cid = CompLast; } if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) { fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n", corpus_id, (registry_directory ? 
registry_directory : central_corpus_directory())); exit(1); } printf("=== Makeall: processing corpus %s ===\n", corpus_id); printf("Registry directory: %s\n", corpus->registry_dir); if (optind < argc) { for (i = optind; i < argc; i++) { if ((attribute = cl_new_attribute(corpus, argv[i], ATT_POS)) != NULL) { makeall_do_attribute(attribute, cid, validate); } else { fprintf(stderr, "p-attribute %s.%s not defined. Aborted.\n", corpus_id, attr_name); exit(1); } } } else if (attr_name != NULL) { if ((attribute = cl_new_attribute(corpus, attr_name, ATT_POS)) != NULL) { makeall_do_attribute(attribute, cid, validate); } else { fprintf(stderr, "p-attribute %s.%s not defined. Aborted.\n", corpus_id, attr_name); exit(1); } } else { /* process each p-attribute of the corpus in turn */ for (attribute = corpus->attributes; attribute; attribute = attribute->any.next) if (attribute->type == ATT_POS) { ComponentID my_cid; makeall_do_attribute(attribute, cid, validate); /* now destoy all components; this makes the attribute unusable, but it is currently the only way to free allocated and memory-mapped data */ for (my_cid = CompDirectory; my_cid < CompLast; my_cid++) { /* ordering gleaned from attributes.h */ drop_component(attribute, my_cid); } } } printf("========================================\n"); exit(0); }
/** * Main function for cwb-decode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int main(int argc, char **argv) { Attribute *attr; Attribute *context = NULL; int sp; /* start position of a match */ int ep; /* end position of a match */ int w, cnt, read_pos_frm_stdin; char s[CL_MAX_LINE_LENGTH]; /* buffer for strings read from file */ char *token; char *input_filename = NULL; FILE *input_file = stdin; /* ------------------------------------------------- PARSE ARGUMENTS */ int c; extern char *optarg; extern int optind; progname = argv[0]; first_token = -1; last = -1; maxlast = -1; read_pos_frm_stdin = 0; /* use getopt() to parse command-line options */ while((c = getopt(argc, argv, "+s:e:r:nLHCxXf:ph")) != EOF) switch(c) { /* s: start corpus position */ case 's': first_token = atoi(optarg); break; /* e: end corpus position */ case 'e': last = atoi(optarg); break; /* r: registry directory */ case 'r': if (registry_directory == NULL) registry_directory = optarg; else { fprintf(stderr, "%s: -r option used twice\n", progname); exit(2); } break; /* n: show cpos in -H mode */ case 'n': printnum++; break; /* x: XML-compatible output in -C mode (-Cx) */ case 'x': xml_compatible++; break; /* L,H,C,X: Lisp, Horizontal, Compact, and XML modes */ case 'L': mode = LispMode; break; case 'H': mode = ConclineMode; break; case 'C': mode = EncodeMode; break; case 'X': mode = XMLMode; break; /* f: matchlist mode / read corpus positions from file */ case 'f': input_filename = optarg; break; /* p: matchlist mode / read corpus positions from stdin */ case 'p': read_pos_frm_stdin++; break; /* h: help page */ case 'h': decode_usage(2); break; default: fprintf(stderr, "Illegal option. 
Try \"%s -h\" for more information.\n", progname); fprintf(stderr, "[remember that options go before the corpus name, and attribute declarations after it!]\n"); decode_cleanup(2); } /* required argument: corpus id */ if (optind < argc) { corpus_id = argv[optind++]; if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) { fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n", corpus_id, (registry_directory ? registry_directory : cl_standard_registry() ) ); decode_cleanup(1); } } else { fprintf(stderr, "Missing argument. Try \"%s -h\" for more information.\n", progname); decode_cleanup(2); } /* now parse output flags (-P, -S, ...) [cnt is our own argument counter] */ for (cnt = optind; cnt < argc; cnt++) { if (strcmp(argv[cnt], "-c") == 0) { /* -c: context */ if ((context = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) { fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } } else if (strcmp(argv[cnt], "-P") == 0) { /* -P: positional attribute */ if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_POS)) == NULL) { fprintf(stderr, "Can't open p-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else { if (cl_max_cpos(attr) > 0) { decode_add_attribute(attr); if (maxlast < 0) maxlast = cl_max_cpos(attr); /* determines corpus size */ } else { fprintf(stderr, "Attribute %s.%s is declared, but not accessible (missing data?). Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } } } else if (strcmp(argv[cnt], "-ALL") == 0) { /* -ALL: all p-attributes and s-attributes */ for (attr = corpus->attributes; attr; attr = attr->any.next) if (attr->any.type == ATT_POS) { decode_add_attribute(attr); if (maxlast < 0) maxlast = cl_max_cpos(attr); } else if (attr->any.type == ATT_STRUC) { decode_add_attribute(attr); } } else if (strcmp(argv[cnt], "-D") == 0) { /* -D: dynamic attribute (not implemented) */ fprintf(stderr, "Sorry, dynamic attributes are not implemented. 
Aborting.\n"); decode_cleanup(2); } else if (strcmp(argv[cnt], "-A") == 0) { /* -A: alignment attribute */ if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_ALIGN)) == NULL) { fprintf(stderr, "Can't open a-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else decode_add_attribute(attr); } else if (strcmp(argv[cnt], "-S") == 0) { /* -S: structural attribute (as tags) */ if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) { fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else decode_add_attribute(attr); } else if (strcmp(argv[cnt], "-V") == 0) { /* -V: show structural attribute values (with -p or -f) */ if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) { fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else if (!cl_struc_values(attr)) { fprintf(stderr, "S-attribute %s.%s does not have annotations. Aborted.\n", corpus_id, argv[cnt]); decode_cleanup(1); } else if (printValuesIndex >= MAX_PRINT_VALUES) { fprintf(stderr, "Too many -V attributes, sorry. Aborted.\n"); decode_cleanup(1); } else printValues[printValuesIndex++] = attr; } else { fprintf(stderr, "Unknown flag: %s\n", argv[cnt]); decode_cleanup(2); } } /* ---- end of parse attribute declarations ---- */ if (input_filename != NULL) { if (strcmp(input_filename, "-") == 0) input_file = stdin; else if ((input_file = fopen(input_filename, "r")) == NULL) { perror(input_filename); exit(1); } read_pos_frm_stdin++; } decode_verify_print_value_list(); /* ------------------------------------------------------------ DECODE CORPUS */ if (read_pos_frm_stdin == 0) { /* * normal mode: decode entire corpus or specified range */ if (maxlast < 0) { fprintf(stderr, "Need at least one p-attribute (-P flag). 
Aborted.\n"); decode_cleanup(2); } if (first_token < 0 || first_token >= maxlast) first_token = 0; if (last < 0 || last >= maxlast) last = maxlast - 1; if (last < first_token) { fprintf(stderr, "Warning: output range #%d..#%d is empty. No output.\n", first_token, last); decode_cleanup(2); } if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) { decode_print_xml_declaration(); printf("<corpus name=\"%s\" start=\"%d\" end=\"%d\">\n", corpus_id, first_token, last); } /* decode_print_surrounding_s_att_values(first_token); */ /* don't do that in "normal" mode, coz it doesn't make sense */ for (w = first_token; w <= last; w++) decode_print_token_sequence(w, -1, context); if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) { printf("</corpus>\n"); } } else { /* * matchlist mode: read (pairs of) corpus positions from stdin or file */ if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) { decode_print_xml_declaration(); printf("<matchlist corpus=\"%s\">\n", corpus_id); } cnt = 0; while (fgets(s, CL_MAX_LINE_LENGTH, input_file) != NULL) { token = strtok(s, " \t\n"); if ((token != NULL) && is_num(token)) { sp = atoi(token); ep = -1; if ((token = strtok(NULL, " \t\n")) != NULL) { if (!is_num(token)) { fprintf(stderr, "Invalid corpus position #%s . Aborted.\n", token); decode_cleanup(1); } else ep = atoi(token); } cnt++; /* count matches in matchlist */ if (mode == XMLMode) { printf("<match nr=\"%d\"", cnt); if (printnum) printf(" start=\"%d\" end=\"%d\"", sp, (ep >= 0) ? ep : sp); printf(">\n"); } else { /* nothing shown before range */ } decode_print_surrounding_s_att_values(sp); decode_print_token_sequence(sp, ep, context); if (mode == XMLMode) { printf("</match>\n"); } else if (mode != ConclineMode) { printf("\n"); /* blank line, unless in -H mode */ } } else { fprintf(stderr, "Invalid corpus position #%s . 
Aborted.\n", s); decode_cleanup(1); } } if (input_file != stdin) fclose(input_file); if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) { printf("</matchlist>\n"); } } decode_cleanup(0); return 0; /* just to keep gcc from complaining */ }
/** * Main function for cwb-s-encode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int main(int argc, char **argv) { int input_line; int start, end; char *annot; char buf[CL_MAX_LINE_LENGTH]; Attribute *att; int V_switch, values, S_annotations_dropped; int i, N; progname = argv[0]; sencode_parse_options(argc, argv); /* -a mode: read existing regions into memory */ if (add_to_existing) { if (corpus == NULL) { Rprintf( "Error: You have to specify source corpus (-C <corpus>) for -a switch.\n"); rcqp_receive_error(1); } att = cl_new_attribute(corpus, new_satt.name, ATT_STRUC); if ((att != NULL) && (cl_max_struc(att) > 0)) { V_switch = new_satt.store_values; values = cl_struc_values(att); if (V_switch && (!values)) { Rprintf( "Error: Existing regions of -V attribute have no annotations.\n"); rcqp_receive_error(1); } else if ((!V_switch) && values) { Rprintf( "Error: Existing regions of -S attributes have annotations.\n"); rcqp_receive_error(1); } if (!silent) Rprintf("[Loading previous <%s> regions]\n", new_satt.name); N = cl_max_struc(att); for (i = 0; i < N; i++) { cl_struc2cpos(att, i, &start, &end); annot = cl_struc2str(att, i); SL_insert(start, end, annot); } } else { if (!silent) Rprintf("[No <%s> regions defined (skipped)]\n", new_satt.name); } } /* loop reading input (stdin or -f <file>) */ if (in_memory && (!silent)) Rprintf("[Reading input data]\n"); input_line = 0; S_annotations_dropped = 0; while (fgets(buf, CL_MAX_LINE_LENGTH, text_fd)) { input_line++; /* check for buffer overflow */ if (strlen(buf) >= (CL_MAX_LINE_LENGTH - 1)) { Rprintf( "BUFFER OVERFLOW, input line #%d is too long:\n>> %s", input_line, buf); rcqp_receive_error(1); } if (! 
sencode_parse_line(buf, &start, &end, &annot)) { Rprintf( "FORMAT ERROR on line #%d:\n>> %s", input_line, buf); rcqp_receive_error(1); } if (new_satt.store_values && (annot == NULL)) { Rprintf( "MISSING ANNOTATION on line #%d:\n>> %s", input_line, buf); rcqp_receive_error(1); } if ((!new_satt.store_values) && (annot != NULL)) { if (! S_annotations_dropped) Rprintf( "WARNING: Annotation for -S attribute ignored on line #%d (warning issued only once):\n>> %s", input_line, buf); S_annotations_dropped++; } if ((start <= new_satt.last_cpos) || (end < start)) { Rprintf( "RANGE INCONSISTENCY on line #%d:\n>> %s(end of previous region was %d)\n", input_line, buf, new_satt.last_cpos); rcqp_receive_error(1); } if (annot != NULL && set_att != set_none) { /* convert set annotation into standard syntax */ annot = sencode_check_set(annot); if (annot == NULL) { Rprintf( "SET ANNOTATION SYNTAX ERROR on line #%d:\n>> %s", input_line, buf); rcqp_receive_error(1); } } /* debugging output */ if (debug) { Rprintf( "[%d, %d]", start, end); if (annot != NULL) Rprintf( " <%s>", annot); Rprintf( "\n"); } /* in -M mode, store this region in memory; otherwise write it to the disk files */ if (in_memory) SL_insert(start, end, annot); else sencode_write_region(start, end, annot); cl_free(annot); } /* in -M mode, write data to disk now that we have finished looping across input data */ if (in_memory) { SL item; if (!silent) Rprintf("[Creating encoded disk file(s)]\n"); SL_rewind(); while ((item = SL_next()) != NULL) sencode_write_region(item->start, item->end, item->annot); } /* close files */ sencode_close_files(); if (S_annotations_dropped > 0) Rprintf( "Warning: %d annotation values dropped for -S attribute '%s'.\n", S_annotations_dropped, new_satt.name); rcqp_receive_error(0); }
/** * Main function for cwb-align-encode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int main(int argc, char *argv[]) { int argindex; /* index of first argument in argv[] */ char *align_name = NULL; /* name of the .align file */ FILE *af = NULL; /* alignment file handle */ int af_is_pipe; /* need to know whether to call fclose() or pclose() */ char alx_name[CL_MAX_LINE_LENGTH]; /* full pathname of .alx file */ char alg_name[CL_MAX_LINE_LENGTH]; /* full pathname of optional .alg file */ FILE *alx=NULL, *alg=NULL; /* file handles for .alx and optional .alg file */ char line[CL_MAX_LINE_LENGTH]; /* one line of input from <infile> */ char corpus1_name[CL_MAX_FILENAME_LENGTH]; char corpus2_name[CL_MAX_FILENAME_LENGTH]; char s1_name[CL_MAX_FILENAME_LENGTH]; char s2_name[CL_MAX_FILENAME_LENGTH]; Corpus *corpus1, *corpus2; /* corpus handles */ Attribute *w1, *w2; /* attribute handles for 'word' attributes; used to determine corpus size */ int size1, size2; /* size of source & target corpus */ Corpus *source_corpus; /* encode alignment in this corpus (depends on -R flag, important for -D option) */ char *source_corpus_name; /* just for error messages */ char *attribute_name; /* name of alignment attribute (depends on -R flag, must be lowercase) */ int f1,l1,f2,l2; /* alignment regions */ int current1, current2; int mark, n_0_1, n_1_0; int l; progname = argv[0]; /* parse command line and read arguments */ argindex = alignencode_parse_args(argc, argv, 1); align_name = argv[argindex]; /* open alignment file and parse header; .gz files are automatically decompressed */ af_is_pipe = 0; l = strlen(align_name); if ((l > 3) && (strncasecmp(align_name + l - 3, ".gz", 3) == 0)) { char *pipe_cmd = (char *) cl_malloc(l+10); sprintf(pipe_cmd, "gzip -cd %s", align_name); /* write .gz file through gzip pipe */ af = popen(pipe_cmd, "r"); if (af == NULL) { perror(pipe_cmd); Rprintf( "%s: can't read compressed file %s\n", progname, align_name); 
rcqp_receive_error(1); } af_is_pipe = 1; cl_free(pipe_cmd); } else { af = fopen(align_name, "r"); if (af == NULL) { perror(align_name); Rprintf( "%s: can't read file %s\n", progname, align_name); rcqp_receive_error(1); } } /* read header = first line */ fgets(line, CL_MAX_LINE_LENGTH, af); if (4 != sscanf(line, "%s %s %s %s", corpus1_name, s1_name, corpus2_name, s2_name)) { Rprintf( "%s: %s not in .align format\n", progname, align_name); Rprintf( "wrong header: %s", line); rcqp_receive_error(1); } if (verbose) { if (reverse) Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus2_name, corpus1_name, align_name); else Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus1_name, corpus2_name, align_name); } /* open corpora and determine their sizes (for validity checks and compatibility mode) */ if (NULL == (corpus1 = cl_new_corpus(registry_dir, corpus1_name))) { Rprintf( "%s: can't open corpus %s\n", progname, corpus1_name); rcqp_receive_error(1); } if (NULL == (corpus2 = cl_new_corpus(registry_dir, corpus2_name))) { Rprintf( "%s: can't open corpus %s\n", progname, corpus2_name); rcqp_receive_error(1); } if (NULL == (w1 = cl_new_attribute(corpus1, "word", ATT_POS))) { Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus1_name); rcqp_receive_error(1); } if (NULL == (w2 = cl_new_attribute(corpus2, "word", ATT_POS))) { Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus2_name); rcqp_receive_error(1); } size1 = cl_max_cpos(w1); if (size1 <= 0) { Rprintf( "%s: data access error (%s.word)\n", progname, corpus1_name); rcqp_receive_error(1); } size2 = cl_max_cpos(w2); if (size2 <= 0) { Rprintf( "%s: data access error (%s.word)\n", progname, corpus2_name); rcqp_receive_error(1); } /* now work out the actual source corpus and the alignment attribute name (depending on -R flag) */ source_corpus = (reverse) ? corpus2 : corpus1; source_corpus_name = (reverse) ? corpus2_name : corpus1_name; attribute_name = cl_strdup((reverse) ? 
corpus1_name : corpus2_name); cl_id_tolower(attribute_name); /* fold attribute name to lowercase */ /* with -D option, determine data file name(s) from actual source corpus; otherwise use directory specified with -d and the usual naming conventions */ if (data_dir_from_corpus) { Attribute *alignment = cl_new_attribute(source_corpus, attribute_name, ATT_ALIGN); char *comp_pathname; if (alignment == NULL) { Rprintf( "%s: alignment attribute %s.%s not declared in registry file\n", progname, source_corpus_name, attribute_name); rcqp_receive_error(1); } comp_pathname = component_full_name(alignment, CompXAlignData, NULL); if (comp_pathname == NULL) { Rprintf( "%s: can't determine pathname for .alx file (internal error)\n", progname); rcqp_receive_error(1); } strcpy(alx_name, comp_pathname); /* need to strcpy because component_full_name() returns pointer to internal buffer */ if (compatibility) { comp_pathname = component_full_name(alignment, CompAlignData, NULL); if (comp_pathname == NULL) { Rprintf( "%s: can't determine pathname for .alg file (internal error)\n", progname); rcqp_receive_error(1); } strcpy(alg_name, comp_pathname); } } else { sprintf(alx_name, "%s" SUBDIR_SEP_STRING "%s.alx", data_dir, attribute_name); if (compatibility) sprintf(alg_name, "%s" SUBDIR_SEP_STRING "%s.alg", data_dir, attribute_name); } /* now open output file(s) */ alx = fopen(alx_name, "wb"); if (alx == NULL) { perror(alx_name); Rprintf( "%s: can't write file %s\n", progname, alx_name); rcqp_receive_error(1); } if (verbose) Rprintf("Writing file %s ...\n", alx_name); if (compatibility) { alg = fopen(alg_name, "wb"); if (alg == NULL) { perror(alg_name); Rprintf( "%s: can't write file %s\n", progname, alg_name); rcqp_receive_error(1); } if (verbose) Rprintf("Writing file %s ...\n", alg_name); } /* main encoding loop */ f1 = f2 = l1 = l2 = 0; mark = -1; /* check that regions occur in ascending order */ current1 = current2 = -1; /* for compatibility mode */ n_0_1 = n_1_0 = 0; /* number of 0:1 
and 1:0 alignments, which are skipped */ while (! feof(af)) { if (NULL == fgets(line, CL_MAX_LINE_LENGTH, af)) break; /* end of file (or read error, which we choose to ignore) */ if (4 != sscanf(line, "%d %d %d %d", &f1, &l1, &f2, &l2)) { Rprintf( "%s: input format error: %s", progname, line); rcqp_receive_error(1); } /* skip 0:1 and 1:0 alignments */ if (l1 < f1) { n_0_1++; continue; } if (l2 < f2) { n_1_0++; continue; } /* check that source regions are non-overlapping and in ascending order */ if (((reverse) ? f2 : f1) <= mark) { Rprintf( "%s: source regions of alignment must be in ascending order\n", progname); Rprintf( "Last region was [*, %d]; current is [%d, %d].\n", mark, f1, l1); Rprintf( "Aborted.\n"); rcqp_receive_error(1); } mark = (reverse) ? l2 : l1; /* write alignment region to .alx file */ if (reverse) { NwriteInt(f2, alx); NwriteInt(l2, alx); NwriteInt(f1, alx); NwriteInt(l1, alx); } else { NwriteInt(f1, alx); NwriteInt(l1, alx); NwriteInt(f2, alx); NwriteInt(l2, alx); } if (compatibility) { /* source and target regions of .alg file must be contiguous; store start points only; */ /* hence we must collapse crossing alignments into one larger region (I know that's bullshit) */ if ((f1 > current1) && (f2 > current2)) { if (reverse) { NwriteInt(f2, alg); NwriteInt(f1, alg); } else { NwriteInt(f1, alg); NwriteInt(f2, alg); } current1 = f1; current2 = f2; } } } if (compatibility) { if (reverse) { NwriteInt(size2, alg); NwriteInt(size1, alg); /* end of corpus alignment point*/ } else { NwriteInt(size1, alg); NwriteInt(size2, alg); /* end of corpus alignment point*/ } } if (verbose) { Rprintf("I skipped %d 0:1 alignments and %d 1:0 alignments.\n", n_0_1, n_1_0); } /* that's it; close file handles */ fclose(alx); if (compatibility) fclose(alg); if (af_is_pipe) pclose(af); else fclose(af); return 0; }
/**
 * Sets up a corpus attribute.
 *
 * NEVER CALL THIS!! ONLY USED WHILE PARSING A REGISTRY ENTRY!!!!
 *
 * A new Attribute is allocated, initialised and appended to the end of the
 * corpus's attribute list. If an attribute with this name and type can
 * already be found, a warning is printed to stderr and nothing is created.
 *
 * @param corpus          The corpus this attribute belongs to.
 * @param attribute_name  The name of the attribute (i.e. the handle it has in
 *                        the registry file). The pointer is stored as-is,
 *                        not copied.
 * @param type            Type of attribute to be created.
 * @param data            Unused. It can just be NULL.
 * @return                The new attribute, or NULL if it was already defined.
 */
Attribute *
setup_attribute(Corpus *corpus,
                char *attribute_name,
                int type,
                char *data)
{
  Attribute *attr;
  Attribute *prev;

  /* count of attributes that the corpus possesses already, including the default
   * used to calculate this attribute's attr_number value. */
  int a_num;

  attr = NULL;

  if (cl_new_attribute(corpus, attribute_name, type) != NULL)
    fprintf(stderr, "attributes:setup_attribute(): Warning: \n"
            "  Attribute %s of type %s already defined in corpus %s\n",
            attribute_name, aid_name(type), corpus->id);
  else {
    ComponentID cid;

    attr = new(Attribute);
    attr->type = type;
    attr->any.mother = corpus;
    attr->any.name = attribute_name;

    for (cid = CompDirectory; cid < CompLast; cid++)
      attr->any.components[cid] = NULL;

    /* numbering starts at 0 for the default p-attribute and at 1 otherwise
     * (the type constant is ATT_POS, as in the switch further down) */
    if (strcmp(attribute_name, DEFAULT_ATT_NAME) == 0 && type == ATT_POS)
      a_num = 0;
    else
      a_num = 1;

    /* insert at end of attribute list */
    attr->any.next = NULL;
    if (corpus->attributes == NULL)
      corpus->attributes = attr;
    else {
      /* one increment for every link followed on the way to the last element */
      for (prev = corpus->attributes; prev->any.next; prev = prev->any.next)
        a_num++;
      assert(prev);
      assert(prev->any.next == NULL);
      prev->any.next = attr;
    }

    attr->any.attr_number = a_num;
    attr->any.path = NULL;

    /* ======================================== type specific initializations */

    switch (attr->type) {
    case ATT_POS:
      attr->pos.hc = NULL;
      attr->pos.this_block_nr = -1;
      break;
    case ATT_STRUC:
      attr->struc.has_attribute_values = -1; /* not yet known */
      break;
    default:
      break;
    }
  }

  return attr;
}
/**
 * Tabulates a query result according to the global list of tabulation items.
 *
 * For every match in the range first..last one output line is written: the
 * items are separated by TABs, and the values an item yields for a range of
 * corpus positions are separated by blanks. Items without an attribute print
 * the corpus position itself.
 *
 * @param cl     The query result to tabulate; NULL makes the function fail.
 * @param first  Number of the first match to print (negative values become 0).
 * @param last   Number of the last match to print (limited to cl->size - 1).
 * @param rd     Output redirection opened with open_stream().
 * @return       1 if the tabulation was written; 0 on failure (an error
 *               message is issued for attribute and redirection problems).
 *               The tabulation list is freed on the success path only.
 */
int
print_tabulation(CorpusList *cl, int first, int last, struct Redir *rd)
{
  TabulationItem item;
  int match;

  if (cl == NULL)
    return 0;

  /* make sure that first and last match to tabulate are in range */
  if (first < 0)
    first = 0;
  if (last >= cl->size)
    last = cl->size - 1;

  /* pass 1: obtain attribute handles for tabulation items and check the anchors */
  for (item = TabulationList; item != NULL; item = item->next) {
    if (!item->attribute_name) {
      /* no attribute -> print corpus position */
      item->attribute_type = ATT_NONE;
    }
    else {
      item->attribute = cl_new_attribute(cl->corpus, item->attribute_name, ATT_POS);
      if (item->attribute != NULL) {
        item->attribute_type = ATT_POS;
      }
      else {
        item->attribute = cl_new_attribute(cl->corpus, item->attribute_name, ATT_STRUC);
        if (item->attribute == NULL) {
          cqpmessage(Error, "Can't find attribute ``%s'' for named query %s",
                     item->attribute_name, cl->name);
          return 0;
        }
        item->attribute_type = ATT_STRUC;
        if (! cl_struc_values(item->attribute)) {
          cqpmessage(Error, "No annotated values for s-attribute ``%s'' in named query %s",
                     item->attribute_name, cl->name);
          return 0;
        }
      }
    }

    /* work around bug: anchor validation will fail for empty query result
       (but then the output loop below is void anyway) */
    if (cl->size > 0
        && (!pt_validate_anchor(cl, item->anchor1)
            || !pt_validate_anchor(cl, item->anchor2)))
      return 0;
  }

  if (! open_stream(rd, cl->corpus->charset)) {
    cqpmessage(Error, "Can't redirect output to file or pipe\n");
    return 0;
  }

  /* pass 2: tabulate selected attribute values for matches <first> .. <last> */
  for (match = first; match <= last; match++) {
    TabulationItem col;

    for (col = TabulationList; col != NULL; col = col->next) {
      int from = pt_get_anchor_cpos(cl, match, col->anchor1, col->offset1);
      int to = pt_get_anchor_cpos(cl, match, col->anchor2, col->offset2);
      int cpos;

      /* one of the anchors is undefined -> print single undefined value for entire range */
      if (from < 0 || to < 0) {
        from = -1;
        to = -1;
      }

      for (cpos = from; cpos <= to; cpos++) {
        if (col->attribute_type == ATT_NONE) {
          fprintf(rd->stream, "%d", cpos);
        }
        else if (cpos >= 0) {   /* undefined anchors print empty string */
          char *value;

          value = (col->attribute_type == ATT_POS)
                    ? cl_cpos2str(col->attribute, cpos)
                    : cl_cpos2struc2str(col->attribute, cpos);

          if (value != NULL) {
            if (!col->flags) {
              fprintf(rd->stream, "%s", value);
            }
            else {
              /* normalise a private copy so the corpus data stays untouched */
              char *normalised = cl_strdup(value);
              cl_string_canonical(normalised, cl->corpus->charset, col->flags);
              fprintf(rd->stream, "%s", normalised);
              cl_free(normalised);
            }
          }
        }

        /* multiple values for tabulation item are separated by blanks */
        if (cpos < to)
          fprintf(rd->stream, " ");
      }

      /* multiple tabulation items are separated by TABs */
      if (col->next)
        fprintf(rd->stream, "\t");
    }

    fprintf(rd->stream, "\n");
  }

  close_stream(rd);
  free_tabulation_list();
  return 1;
}
/** * Main function for cwb-huffcode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int main(int argc, char **argv) { char *registry_directory = NULL; char *output_fn = NULL; char *attr_name = DEFAULT_ATT_NAME; Attribute *attr; HCD hc; extern int optind; extern char *optarg; int c; int i_want_to_believe = 0; /* skip error checks? */ int all_attributes = 0; protocol = stdout; /* 'delayed' init (see top of file) */ /* ------------------------------------------------- PARSE ARGUMENTS */ progname = argv[0]; /* parse arguments */ while ((c = getopt(argc, argv, "+TvP:r:f:dAh")) != EOF) { switch (c) { /* T: skip decompression / error checking pass ("I trust you") */ case 'T': i_want_to_believe++; break; /* v: verbose -> displays protocol of compression process on stdout */ case 'v': do_protocol++; break; /* P: attribute to compress */ case 'P': attr_name = optarg; break; /* r: registry directory */ case 'r': if (registry_directory == NULL) registry_directory = optarg; else { fprintf(stderr, "%s: -r option used twice\n", progname); exit(2); } break; /* f: filename prefix for compressed data files */ case 'f': output_fn = optarg; break; /* d: debug mode --- unused */ case 'd': debug++; break; /* A: compress all attributes */ case 'A': all_attributes++; break; /* h: help page */ case 'h': huffcode_usage(NULL, 2); break; default: huffcode_usage("illegal option.", 2); break; } } /* single argument: corpus id */ if (optind < argc) { corpus_id = argv[optind++]; } else { huffcode_usage("corpus not specified (missing argument)", 1); } if (optind < argc) { huffcode_usage("Too many arguments", 1); } if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) { fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n", corpus_id, (registry_directory ? 
registry_directory : central_corpus_directory())); exit(1); } if (all_attributes) { for (attr = corpus->attributes; attr; attr = attr->any.next) if (attr->any.type == ATT_POS) { compute_code_lengths(attr, &hc, output_fn); if (! i_want_to_believe) decode_check_huff(attr, output_fn); } } else { if ((attr = cl_new_attribute(corpus, attr_name, ATT_POS)) == NULL) { fprintf(stderr, "Attribute %s.%s doesn't exist. Aborted.\n", corpus_id, attr_name); exit(1); } compute_code_lengths(attr, &hc, output_fn); if (! i_want_to_believe) decode_check_huff(attr, output_fn); } cl_delete_corpus(corpus); exit(0); }