Esempio n. 1
0
/**
 * Function-call variant of the cwb-huffcode main routine.
 *
 * Opens the named corpus, then for every positional attribute of that corpus
 * calls compute_code_lengths() followed by decode_check_huff() (the check is
 * never skipped here); finally the corpus handle is deleted again.
 * Progress and error messages are emitted with Rprintf(); errors are reported
 * through rcqp_receive_error().
 *
 * @param corpus_name   ID of the corpus to process.
 * @param registry_dir  Registry directory passed to cl_new_corpus(); if NULL, the
 *                      error message names central_corpus_directory() instead.
 * @return              Always 0.
 */
int
R_cwb_huffcode (char *corpus_name, char *registry_dir)
{
  char *out_name = NULL;                        /* handed to the helpers as output file name argument */
  char *single_attribute = DEFAULT_ATT_NAME;    /* only used when process_all is off */
  Attribute *a;
  HCD code;

  int trust_result = 0;                         /* skip error checks? */
  int process_all = 1;                          /* this entry point always handles all p-attributes */

  Rprintf("Corpus: %s\n", corpus_name);

  protocol = NULL;                              /* 'delayed' init (see top of file) */

  corpus_id_cwb_huffcode = corpus_name;

  Rprintf("Corpus 1: %s\n", corpus_name);

  corpus = cl_new_corpus(registry_dir, corpus_id_cwb_huffcode);
  if (corpus == NULL) {
    Rprintf( "Corpus %s not found in registry %s . Aborted.\n",
             corpus_id_cwb_huffcode,
             (registry_dir != NULL) ? registry_dir : central_corpus_directory());
    rcqp_receive_error(1);
  }

  Rprintf("Corpus 2: %s\n", corpus_name);

  if (!process_all) {
    /* single-attribute mode */
    a = cl_new_attribute(corpus, single_attribute, ATT_POS);
    if (a == NULL) {
      Rprintf( "Attribute %s.%s doesn't exist. Aborted.\n",
               corpus_id_cwb_huffcode, single_attribute);
      rcqp_receive_error(1);
    }
    compute_code_lengths(a, &code, out_name);
    if (!trust_result)
      decode_check_huff(a, out_name);
  }
  else {
    /* walk the attribute list and handle each positional attribute in turn */
    for (a = corpus->attributes; a != NULL; a = a->any.next) {
      if (a->any.type != ATT_POS)
        continue;
      compute_code_lengths(a, &code, out_name);
      if (!trust_result)
        decode_check_huff(a, out_name);
    }
  }

  Rprintf("Corpus 3: %s\n", corpus_name);

  cl_delete_corpus(corpus);

  return 0;
}
Esempio n. 2
0
/**
 * Prints basic information about a corpus (via Rprintf()).
 *
 * The report consists of the corpus name, registry file, home directory and
 * info file, the corpus size in tokens as reported by cl_max_cpos() for the
 * 'word' attribute, and the number of positional, structural and alignment
 * attributes.
 *
 * @param corpus                The corpus to report on.
 * @param with_attribute_names  Boolean: iff true, each attribute count is followed
 *                              by a colon and the list of attribute names of that type.
 */
void
describecorpus_show_basic_info (Corpus *corpus, int with_attribute_names)
{
  const char *suffix = with_attribute_names ? ":" : "";
  Attribute *word_att;
  Attribute *att;
  int n_pos = 0;
  int n_struc = 0;
  int n_align = 0;
  int n_tokens;

  Rprintf("description:    %s\n", corpus->name);
  Rprintf("registry file:  %s/%s\n", corpus->registry_dir, corpus->registry_name);
  Rprintf("home directory: %s/\n", corpus->path);
  Rprintf("info file:      %s\n", (corpus->info_file) ? corpus->info_file : "(none)");

  /* the corpus size is taken from the 'word' attribute */
  word_att = cl_new_attribute(corpus, "word", ATT_POS);
  if (word_att == NULL) {
    Rprintf( "ERROR: 'word' attribute is missing. Aborted.\n");
    rcqp_receive_error(1);
  }
  n_tokens = cl_max_cpos(word_att);

  Rprintf("size (tokens):  ");
  if (n_tokens < 0)
    Rprintf("ERROR\n");           /* negative value signals an error */
  else
    Rprintf("%d\n", n_tokens);
  Rprintf("\n");

  /* count the attributes of each type; other types are ignored */
  for (att = corpus->attributes; att != NULL; att = att->any.next) {
    if (att->any.type == ATT_POS)
      n_pos++;
    else if (att->any.type == ATT_STRUC)
      n_struc++;
    else if (att->any.type == ATT_ALIGN)
      n_align++;
  }

  Rprintf("%3d positional attributes%s\n", n_pos, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_POS);

  Rprintf("%3d structural attributes%s\n", n_struc, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_STRUC);

  Rprintf("%3d alignment  attributes%s\n", n_align, suffix);
  if (with_attribute_names)
    describecorpus_show_attribute_names(corpus, ATT_ALIGN);

  Rprintf("\n");
}
Esempio n. 3
0
/**
 * Main function for cwb-makeall.
 *
 * Parses the command-line options, opens the requested corpus and calls
 * makeall_do_attribute() for the selected positional attributes: those named
 * after the corpus ID on the command line, otherwise the one given with -P,
 * otherwise every p-attribute of the corpus.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       Does not return normally: the program ends through exit()
 *               (0 on success; 1 or 2 on the error paths of this function).
 */
int
main(int argc, char **argv)
{
  char *attr_name = NULL;             /* value of the -P option, if any */
  Attribute *attribute;

  char *registry_directory = NULL;    /* value of the -r option, if any */
  char *corpus_id = NULL;

  extern int optind;
  extern char *optarg;
  int c;

  int validate = 0;                   /* set by -V, passed on to makeall_do_attribute() */

  char *component = NULL;             /* value of the -c option, if any */

  ComponentID cid;
  int i = 0;

  /* ------------------------------------------------- PARSE ARGUMENTS */

  progname = argv[0];

  /* parse arguments */
  while ((c = getopt(argc, argv, "+r:c:P:hDM:V")) != EOF) {
    switch (c) {

    /* r: registry directory */
    case 'r':
      if (registry_directory == NULL)
        registry_directory = optarg;
      else {
        fprintf(stderr, "%s: -r option used twice\n", progname);
        exit(2);
      }
      break;

    /* P: name of a single p-attribute to process */
    case 'P':
      if (attr_name == NULL)
        attr_name = optarg;
      else {
        fprintf(stderr, "%s: -P option used twice\n", progname);
        exit(2);
      }
      break;

    /* c: name of a component (translated with component_id() below) */
    case 'c':
      if (component == NULL)
        component = optarg;
      else {
        fprintf(stderr, "%s: -c option used twice\n", progname);
        exit(2);
      }
      break;

    /* D: switch on debugging output */
    case 'D':
      cl_set_debug_level(1);
      break;

    /* M: memory limit, passed to cl_set_memory_limit() */
    case 'M':
      i = atoi(optarg);
      cl_set_memory_limit(i);
      break;

    /* V: validate */
    case 'V':
      validate++;
      break;

    /* h (and any unknown option): usage message */
    case 'h':
    default:
      makeall_usage();
    }
  }

  if (optind >= argc) {
    fprintf(stderr, "Missing argument, try \"%s -h\" for more information.\n", progname);
    exit(1);
  }

  /* first argument: corpus id */
  corpus_id = argv[optind++];

  /* CompLast doubles as "no particular component selected" and as the
     error value of component_id() */
  if (component != NULL) {
    cid = component_id(component);
    if (cid == CompLast) {
      fprintf(stderr, "Illegal component name: ``%s''\n", component);
      exit(1);
    }
  }
  else {
    cid = CompLast;
  }


  if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) {
    fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n",
            corpus_id,
            (registry_directory ? registry_directory
             : central_corpus_directory()));
    exit(1);
  }

  printf("=== Makeall: processing corpus %s ===\n", corpus_id);
  printf("Registry directory: %s\n", corpus->registry_dir);

  if (optind < argc) {
    /* remaining arguments are names of p-attributes to process */
    for (i = optind; i < argc; i++) {
      if ((attribute = cl_new_attribute(corpus, argv[i], ATT_POS)) != NULL) {
        makeall_do_attribute(attribute, cid, validate);
      }
      else {
        /* report the name that was actually looked up (argv[i]); attr_name
           belongs to the -P option and is normally NULL in this branch */
        fprintf(stderr, "p-attribute %s.%s not defined. Aborted.\n",
                corpus_id, argv[i]);
        exit(1);
      }
    }
  }
  else if (attr_name != NULL) {
    /* single attribute selected with -P */
    if ((attribute = cl_new_attribute(corpus, attr_name, ATT_POS)) != NULL) {
      makeall_do_attribute(attribute, cid, validate);
    }
    else {
      fprintf(stderr, "p-attribute %s.%s not defined. Aborted.\n",
              corpus_id, attr_name);
      exit(1);
    }
  }
  else {
    /* process each p-attribute of the corpus in turn */
    for (attribute = corpus->attributes; attribute; attribute = attribute->any.next)
      if (attribute->type == ATT_POS) {
        ComponentID my_cid;

        makeall_do_attribute(attribute, cid, validate);
        /* now destroy all components; this makes the attribute unusable,
           but it is currently the only way to free allocated and memory-mapped data */
        for (my_cid = CompDirectory; my_cid < CompLast; my_cid++) { /* ordering gleaned from attributes.h */
          drop_component(attribute, my_cid);
        }
      }
  }

  printf("========================================\n");
  exit(0);
}
Esempio n. 4
0
/**
 * Main function for cwb-decode.
 *
 * Parses the options (before the corpus ID) and the attribute declarations
 * (after the corpus ID), then prints either a range of corpus positions
 * ("normal" mode) or the matches listed on stdin / in a file (matchlist mode,
 * -p / -f) in the selected output mode.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       0; in practice the program is expected to terminate inside
 *               decode_cleanup() before the return statement is reached.
 */
int
main(int argc, char **argv)
{
  Attribute *attr;
  Attribute *context = NULL;   /* s-attribute given with -c, passed to decode_print_token_sequence() */

  int sp;  /* start position of a match */
  int ep;  /* end position of a match */

  int w, cnt, read_pos_frm_stdin;

  char s[CL_MAX_LINE_LENGTH];      /* buffer for strings read from file */
  char *token;

  char *input_filename = NULL;
  FILE *input_file = stdin;

  /* ------------------------------------------------- PARSE ARGUMENTS */

  int c;
  extern char *optarg;
  extern int optind;

  progname = argv[0];

  first_token = -1;
  last = -1;
  maxlast = -1;

  read_pos_frm_stdin = 0;

  /* use getopt() to parse command-line options */
  while((c = getopt(argc, argv, "+s:e:r:nLHCxXf:ph")) != EOF)
    switch(c) {

      /* s: start corpus position */
    case 's':
      first_token = atoi(optarg);
      break;

      /* e: end corpus position */
    case 'e':
      last = atoi(optarg);
      break;

      /* r: registry directory */
    case 'r':
      if (registry_directory == NULL)
        registry_directory = optarg;
      else {
        fprintf(stderr, "%s: -r option used twice\n", progname);
        exit(2);
      }
      break;

      /* n: show cpos in -H mode */
    case 'n':
      printnum++;
      break;

      /* x: XML-compatible output in -C mode (-Cx) */
    case 'x':
      xml_compatible++;
      break;

      /* L,H,C,X: Lisp, Horizontal, Compact, and XML modes */
    case 'L':
      mode = LispMode;
      break;
    case 'H':
      mode = ConclineMode;
      break;
    case 'C':
      mode = EncodeMode;
      break;
    case 'X':
      mode = XMLMode;
      break;

      /* f: matchlist mode / read corpus positions from file */
    case 'f':
      input_filename = optarg;
      break;

      /* p: matchlist mode / read corpus positions from stdin */
    case 'p':
      read_pos_frm_stdin++;
      break;

      /* h: help page */
    case 'h':
      decode_usage(2);
      break;

    default:
      fprintf(stderr, "Illegal option. Try \"%s -h\" for more information.\n", progname);
      fprintf(stderr, "[remember that options go before the corpus name, and attribute declarations after it!]\n");
      decode_cleanup(2);
    }

  /* required argument: corpus id */
  if (optind < argc) {
    corpus_id = argv[optind++];

    if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) {
      fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n",
              corpus_id,
              (registry_directory ? registry_directory : cl_standard_registry() ) );
      decode_cleanup(1);
    }
  }
  else {
    fprintf(stderr, "Missing argument. Try \"%s -h\" for more information.\n", progname);
    decode_cleanup(2);
  }


  /* now parse output flags (-P, -S, ...) [cnt is our own argument counter] */
  for (cnt = optind; cnt < argc; cnt++) {
    /* flags that consume the following command-line argument as attribute name */
    int needs_argument = (strcmp(argv[cnt], "-c") == 0) ||
                         (strcmp(argv[cnt], "-P") == 0) ||
                         (strcmp(argv[cnt], "-A") == 0) ||
                         (strcmp(argv[cnt], "-S") == 0) ||
                         (strcmp(argv[cnt], "-V") == 0);

    /* argv[argc] is a null pointer, so a trailing flag must be rejected here
       instead of handing NULL to cl_new_attribute() and to the error messages */
    if (needs_argument && (cnt + 1 >= argc)) {
      fprintf(stderr, "Missing attribute name after flag %s . Aborted.\n", argv[cnt]);
      decode_cleanup(2);
    }

    if (strcmp(argv[cnt], "-c") == 0) {         /* -c: context */

      if ((context = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) {
        fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n",
                corpus_id, argv[cnt]);
        decode_cleanup(1);
      }

    }
    else if (strcmp(argv[cnt], "-P") == 0) {    /* -P: positional attribute */

      if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_POS)) == NULL) {
        fprintf(stderr, "Can't open p-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else {
        if (cl_max_cpos(attr) > 0) {
          decode_add_attribute(attr);
          if (maxlast < 0)
            maxlast = cl_max_cpos(attr); /* determines corpus size */
        }
        else {
          fprintf(stderr, "Attribute %s.%s is declared, but not accessible (missing data?). Aborted.\n",
                  corpus_id, argv[cnt]);
          decode_cleanup(1);
        }
      }

    }
    else if (strcmp(argv[cnt], "-ALL") == 0) {  /* -ALL: all p-attributes and s-attributes */

      for (attr = corpus->attributes; attr; attr = attr->any.next)
        if (attr->any.type == ATT_POS) {
          decode_add_attribute(attr);
          if (maxlast < 0)
            maxlast = cl_max_cpos(attr);
        }
        else if (attr->any.type == ATT_STRUC) {
          decode_add_attribute(attr);
        }

    }
    else if (strcmp(argv[cnt], "-D") == 0) {    /* -D: dynamic attribute (not implemented) */

      fprintf(stderr, "Sorry, dynamic attributes are not implemented. Aborting.\n");
      decode_cleanup(2);

    }
    else if (strcmp(argv[cnt], "-A") == 0) {    /* -A: alignment attribute */

      if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_ALIGN)) == NULL) {
        fprintf(stderr, "Can't open a-attribute %s.%s . Aborted.\n", corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else
        decode_add_attribute(attr);
    }
    else if (strcmp(argv[cnt], "-S") == 0) {    /* -S: structural attribute (as tags) */

      if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) {
        fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n",
                corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else
        decode_add_attribute(attr);
    }
    else if (strcmp(argv[cnt], "-V") == 0) {    /* -V: show structural attribute values (with -p or -f) */

      if ((attr = cl_new_attribute(corpus, argv[++cnt], ATT_STRUC)) == NULL) {
        fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n",
                corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else if (!cl_struc_values(attr)) {
        fprintf(stderr, "S-attribute %s.%s does not have annotations. Aborted.\n",
                corpus_id, argv[cnt]);
        decode_cleanup(1);
      }
      else if (printValuesIndex >= MAX_PRINT_VALUES) {
        fprintf(stderr, "Too many -V attributes, sorry. Aborted.\n");
        decode_cleanup(1);
      }
      else
        printValues[printValuesIndex++] = attr;
    }
    else {

      fprintf(stderr, "Unknown flag: %s\n", argv[cnt]);
      decode_cleanup(2);

    }
  }
  /* ---- end of parse attribute declarations ---- */

  /* -f <file> implies matchlist mode; "-" stands for stdin */
  if (input_filename != NULL) {
    if (strcmp(input_filename, "-") == 0)
      input_file = stdin;
    else if ((input_file = fopen(input_filename, "r")) == NULL) {
      perror(input_filename);
      exit(1);
    }
    read_pos_frm_stdin++;
  }

  decode_verify_print_value_list();

  /* ------------------------------------------------------------ DECODE CORPUS */

  if (read_pos_frm_stdin == 0) {
    /*
     * normal mode: decode entire corpus or specified range
     */

    if (maxlast < 0) {
      fprintf(stderr, "Need at least one p-attribute (-P flag). Aborted.\n");
      decode_cleanup(2);
    }

    /* clamp the requested range to the corpus size; out-of-range values
       fall back to the first / last corpus position */
    if (first_token < 0 || first_token >= maxlast)
      first_token = 0;

    if (last < 0 || last >= maxlast)
      last = maxlast - 1;

    if (last < first_token) {
      fprintf(stderr, "Warning: output range #%d..#%d is empty. No output.\n", first_token, last);
      decode_cleanup(2);
    }

    if ( (mode == XMLMode) ||  ((mode == EncodeMode) && xml_compatible) ) {
      decode_print_xml_declaration();
      printf("<corpus name=\"%s\" start=\"%d\" end=\"%d\">\n",
             corpus_id, first_token, last);
    }

    /* decode_print_surrounding_s_att_values(first_token); */ /* don't do that in "normal" mode, coz it doesn't make sense */

    for (w = first_token; w <= last; w++)
      decode_print_token_sequence(w, -1, context);

    if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) {
      printf("</corpus>\n");
    }
  }
  else {
    /*
     * matchlist mode: read (pairs of) corpus positions from stdin or file
     */

    if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) {
      decode_print_xml_declaration();
      printf("<matchlist corpus=\"%s\">\n", corpus_id);
    }

    cnt = 0;
    while (fgets(s, CL_MAX_LINE_LENGTH, input_file) != NULL) {

      /* each line holds a start position, optionally followed by an end position */
      token = strtok(s, " \t\n");

      if ((token != NULL) && is_num(token)) {
        sp = atoi(token);

        ep = -1;                /* -1 = no end position given */
        if ((token = strtok(NULL, " \t\n")) != NULL) {
          if (!is_num(token)) {
            fprintf(stderr, "Invalid corpus position #%s . Aborted.\n", token);
            decode_cleanup(1);
          }
          else
            ep = atoi(token);
        }

        cnt++;                  /* count matches in matchlist  */
        if (mode == XMLMode) {
          printf("<match nr=\"%d\"", cnt);
          if (printnum)
            printf(" start=\"%d\" end=\"%d\"", sp, (ep >= 0) ? ep : sp);
          printf(">\n");
        }
        else {
          /* nothing shown before range */
        }

        decode_print_surrounding_s_att_values(sp);

        decode_print_token_sequence(sp, ep, context);

        if (mode == XMLMode) {
          printf("</match>\n");
        }
        else if (mode != ConclineMode) {
          printf("\n");         /* blank line, unless in -H mode */
        }
      }
      else {
        fprintf(stderr, "Invalid corpus position #%s . Aborted.\n", s);
        decode_cleanup(1);
      }
    }

    if (input_file != stdin)
      fclose(input_file);

    if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) {
      printf("</matchlist>\n");
    }
  }

  decode_cleanup(0);
  return 0;                     /* just to keep gcc from complaining */
}
Esempio n. 5
0
/**
 * Main function for cwb-s-encode.
 *
 * After option parsing (sencode_parse_options()), the function optionally
 * loads the existing regions of the attribute (-a mode), then reads region
 * lines from text_fd, validates each one and either stores it in memory
 * (SL_insert()) or writes it out directly (sencode_write_region()).
 * Errors are reported with Rprintf() followed by rcqp_receive_error(1).
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 */
int
main(int argc, char **argv)
{
  int input_line;                 /* number of the input line currently processed (for messages) */
  int start, end;                 /* boundaries of the current region */
  char *annot;                    /* annotation string of the current region, or NULL */
  char buf[CL_MAX_LINE_LENGTH];   /* one line of input */
  Attribute *att;
  int V_switch, values, S_annotations_dropped;
  int i, N;

  progname = argv[0];
  sencode_parse_options(argc, argv);

  /* -a mode: read existing regions into memory */
  if (add_to_existing) {
    if (corpus == NULL) {
      Rprintf( "Error: You have to specify source corpus (-C <corpus>) for -a switch.\n");
      rcqp_receive_error(1);
    }
    att = cl_new_attribute(corpus, new_satt.name, ATT_STRUC);
    if ((att != NULL) && (cl_max_struc(att) > 0)) {
      /* the existing attribute must agree with the new one on whether regions carry annotations */
      V_switch = new_satt.store_values;
      values = cl_struc_values(att);
      if (V_switch && (!values)) {
        Rprintf( "Error: Existing regions of -V attribute have no annotations.\n");
        rcqp_receive_error(1);
      }
      else if ((!V_switch) && values) {
        Rprintf( "Error: Existing regions of -S attributes have annotations.\n");
        rcqp_receive_error(1);
      }
      if (!silent)
        Rprintf("[Loading previous <%s> regions]\n", new_satt.name);

      /* copy every existing region (boundaries + annotation) into the in-memory list */
      N = cl_max_struc(att);
      for (i = 0; i < N; i++) {
        cl_struc2cpos(att, i, &start, &end);
        annot = cl_struc2str(att, i);
        SL_insert(start, end, annot);
      }
    }
    else {
      if (!silent)
        Rprintf("[No <%s> regions defined (skipped)]\n", new_satt.name);
    }
  }

  /* loop reading input (stdin or -f <file>) */
  if (in_memory && (!silent))
    Rprintf("[Reading input data]\n");
  input_line = 0;
  S_annotations_dropped = 0;
  while (fgets(buf, CL_MAX_LINE_LENGTH, text_fd)) {
    input_line++;

    /* check for buffer overflow */
    /* (a completely filled buffer means the line did not fit and would be split) */
    if (strlen(buf) >= (CL_MAX_LINE_LENGTH - 1)) {
      Rprintf( "BUFFER OVERFLOW, input line #%d is too long:\n>> %s", input_line, buf);
      rcqp_receive_error(1);
    }

    /* split the line into region boundaries and optional annotation */
    if (! sencode_parse_line(buf, &start, &end, &annot)) {
      Rprintf( "FORMAT ERROR on line #%d:\n>> %s", input_line, buf);
      rcqp_receive_error(1);
    }
    /* attributes that store values require an annotation on every line */
    if (new_satt.store_values && (annot == NULL)) {
      Rprintf( "MISSING ANNOTATION on line #%d:\n>> %s", input_line, buf);
      rcqp_receive_error(1);
    }
    /* attributes without values: annotations are counted as dropped; only the first one triggers a warning */
    if ((!new_satt.store_values) && (annot != NULL)) {
      if (! S_annotations_dropped)
        Rprintf( "WARNING: Annotation for -S attribute ignored on line #%d (warning issued only once):\n>> %s", input_line, buf);
      S_annotations_dropped++;
    }
    /* a region must start after new_satt.last_cpos and must not end before it starts */
    if ((start <= new_satt.last_cpos) || (end < start)) {
      Rprintf( "RANGE INCONSISTENCY on line #%d:\n>> %s(end of previous region was %d)\n", input_line, buf, new_satt.last_cpos);
      rcqp_receive_error(1);
    }
    if (annot != NULL && set_att != set_none) {
      /* convert set annotation into standard syntax */
      annot = sencode_check_set(annot);
      if (annot == NULL) {
        Rprintf( "SET ANNOTATION SYNTAX ERROR on line #%d:\n>> %s", input_line, buf);
        rcqp_receive_error(1);
      }
    }

    /* debugging output */
    if (debug) {
      Rprintf( "[%d, %d]", start, end);
      if (annot != NULL)
        Rprintf( " <%s>", annot);
      Rprintf( "\n");
    }

    /* in -M mode, store this region in memory; otherwise write it to the disk files */
    if (in_memory)
      SL_insert(start, end, annot);
    else
      sencode_write_region(start, end, annot);

    /* NOTE(review): freeing annot here presumes SL_insert() keeps its own copy
       of the string -- confirm against its definition */
    cl_free(annot);
  }

  /* in -M mode, write data to disk now that we have finished looping across input data */
  if (in_memory) {
    SL item;

    if (!silent)
      Rprintf("[Creating encoded disk file(s)]\n");
    SL_rewind();
    while ((item = SL_next()) != NULL)
      sencode_write_region(item->start, item->end, item->annot);
  }

  /* close files */
  sencode_close_files();

  if (S_annotations_dropped > 0)
    Rprintf( "Warning: %d annotation values dropped for -S attribute '%s'.\n", S_annotations_dropped, new_satt.name);

  /* no return statement follows; the function ends with this call */
  rcqp_receive_error(0);
}
Esempio n. 6
0
/**
 * Main function for cwb-align-encode.
 *
 * Reads an alignment file in .align format (optionally gzip-compressed),
 * checks its header and regions, and writes the alignment regions to an .alx
 * file and, in compatibility mode, the region start points to an .alg file.
 * Errors are reported with Rprintf() followed by rcqp_receive_error(1).
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       0 when the end of the function is reached.
 */
int
main(int argc, char *argv[])
{
  int argindex;                         /* index of first argument in argv[] */

  char *align_name = NULL;              /* name of the .align file */
  FILE *af = NULL;                      /* alignment file handle */
  int af_is_pipe;                       /* need to know whether to call fclose() or pclose() */
  char alx_name[CL_MAX_LINE_LENGTH];    /* full pathname of .alx file */
  char alg_name[CL_MAX_LINE_LENGTH];    /* full pathname of optional .alg file */
  FILE *alx=NULL, *alg=NULL;            /* file handles for .alx and optional .alg file */

  char line[CL_MAX_LINE_LENGTH];        /* one line of input from <infile> */

  char corpus1_name[CL_MAX_FILENAME_LENGTH];
  char corpus2_name[CL_MAX_FILENAME_LENGTH];
  char s1_name[CL_MAX_FILENAME_LENGTH];
  char s2_name[CL_MAX_FILENAME_LENGTH];
  Corpus *corpus1, *corpus2;            /* corpus handles */
  Attribute *w1, *w2;                   /* attribute handles for 'word' attributes; used to determine corpus size */
  int size1, size2;                     /* size of source & target corpus */

  Corpus *source_corpus;                /* encode alignment in this corpus (depends on -R flag, important for -D option) */
  char *source_corpus_name;             /* just for error messages */
  char *attribute_name;                 /* name of alignment attribute (depends on -R flag, must be lowercase) */

  int f1,l1,f2,l2;                      /* alignment regions */
  int current1, current2;
  int mark, n_0_1, n_1_0;

  int l;

  progname = argv[0];

  /* parse command line and read arguments */
  argindex = alignencode_parse_args(argc, argv, 1);
  align_name = argv[argindex];

  /* open alignment file and parse header; .gz files are automatically decompressed */
  af_is_pipe = 0;
  l = strlen(align_name);
  if ((l > 3) && (strncasecmp(align_name + l - 3, ".gz", 3) == 0)) {
    /* "gzip -cd " has 9 characters, plus the file name and the terminating NUL */
    char *pipe_cmd = (char *) cl_malloc(l+10);
    sprintf(pipe_cmd, "gzip -cd %s", align_name); /* write .gz file through gzip pipe */
    af = popen(pipe_cmd, "r");
    if (af == NULL) {
      perror(pipe_cmd);
      Rprintf( "%s: can't read compressed file %s\n", progname, align_name);
      rcqp_receive_error(1);
    }
    af_is_pipe = 1;
    cl_free(pipe_cmd);
  }
  else {
    af = fopen(align_name, "r");
    if (af == NULL) {
      perror(align_name);
      Rprintf( "%s: can't read file %s\n", progname, align_name);
      rcqp_receive_error(1);
    }
  }

  /* read header = first line;
     an empty or unreadable file must be rejected here: without the fgets()
     check, sscanf() would parse an uninitialised buffer */
  line[0] = '\0';
  if ((NULL == fgets(line, CL_MAX_LINE_LENGTH, af)) ||
      (4 != sscanf(line, "%s %s %s %s", corpus1_name, s1_name, corpus2_name, s2_name))) {
    Rprintf( "%s: %s not in .align format\n", progname, align_name);
    Rprintf( "wrong header: %s", line);
    rcqp_receive_error(1);
  }
  if (verbose) {
    if (reverse)
      Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus2_name, corpus1_name, align_name);
    else
      Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus1_name, corpus2_name, align_name);
  }

  /* open corpora and determine their sizes (for validity checks and compatibility mode) */
  if (NULL == (corpus1 = cl_new_corpus(registry_dir, corpus1_name))) {
    Rprintf( "%s: can't open corpus %s\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  if (NULL == (corpus2 = cl_new_corpus(registry_dir, corpus2_name))) {
    Rprintf( "%s: can't open corpus %s\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }
  if (NULL == (w1 = cl_new_attribute(corpus1, "word", ATT_POS))) {
    Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  if (NULL == (w2 = cl_new_attribute(corpus2, "word", ATT_POS))) {
    Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }

  size1 = cl_max_cpos(w1);
  if (size1 <= 0) {
    Rprintf( "%s: data access error (%s.word)\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  size2 = cl_max_cpos(w2);
  if (size2 <= 0) {
    Rprintf( "%s: data access error (%s.word)\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }

  /* now work out the actual source corpus and the alignment attribute name (depending on -R flag) */
  source_corpus = (reverse) ? corpus2 : corpus1;
  source_corpus_name = (reverse) ? corpus2_name : corpus1_name;
  attribute_name = cl_strdup((reverse) ? corpus1_name : corpus2_name);
  cl_id_tolower(attribute_name); /* fold attribute name to lowercase */

  /* with -D option, determine data file name(s) from actual source corpus;
     otherwise use directory specified with -d and the usual naming conventions */
  if (data_dir_from_corpus) {
    Attribute *alignment = cl_new_attribute(source_corpus, attribute_name, ATT_ALIGN);
    char *comp_pathname;

    if (alignment == NULL) {
      Rprintf( "%s: alignment attribute %s.%s not declared in registry file\n",
               progname, source_corpus_name, attribute_name);
      rcqp_receive_error(1);
    }
    comp_pathname = component_full_name(alignment, CompXAlignData, NULL);
    if (comp_pathname == NULL) {
      Rprintf( "%s: can't determine pathname for .alx file (internal error)\n", progname);
      rcqp_receive_error(1);
    }
    strcpy(alx_name, comp_pathname); /* need to strcpy because component_full_name() returns pointer to internal buffer */
    if (compatibility) {
      comp_pathname = component_full_name(alignment, CompAlignData, NULL);
      if (comp_pathname == NULL) {
        Rprintf( "%s: can't determine pathname for .alg file (internal error)\n", progname);
        rcqp_receive_error(1);
      }
      strcpy(alg_name, comp_pathname);
    }
  }
  else {
    sprintf(alx_name, "%s" SUBDIR_SEP_STRING "%s.alx", data_dir, attribute_name);
    if (compatibility)
      sprintf(alg_name, "%s" SUBDIR_SEP_STRING "%s.alg", data_dir, attribute_name);
  }

  /* now open output file(s) */
  alx = fopen(alx_name, "wb");
  if (alx == NULL) {
    perror(alx_name);
    Rprintf( "%s: can't write file %s\n", progname, alx_name);
    rcqp_receive_error(1);
  }
  if (verbose)
    Rprintf("Writing file %s ...\n", alx_name);

  if (compatibility) {
    alg = fopen(alg_name, "wb");
    if (alg == NULL) {
      perror(alg_name);
      Rprintf( "%s: can't write file %s\n", progname, alg_name);
      rcqp_receive_error(1);
    }

    if (verbose)
      Rprintf("Writing file %s ...\n", alg_name);
  }

  /* main encoding loop */
  f1 = f2 = l1 = l2 = 0;
  mark = -1;                        /* check that regions occur in ascending order */
  current1 = current2 = -1;         /* for compatibility mode */
  n_0_1 = n_1_0 = 0;                /* number of 0:1 and 1:0 alignments, which are skipped */
  while (! feof(af)) {
    if (NULL == fgets(line, CL_MAX_LINE_LENGTH, af))
      break;                        /* end of file (or read error, which we choose to ignore) */
    if (4 != sscanf(line, "%d %d %d %d", &f1, &l1, &f2, &l2)) {
      Rprintf( "%s: input format error: %s", progname, line);
      rcqp_receive_error(1);
    }

    /* skip 0:1 and 1:0 alignments */
    if (l1 < f1) {
      n_0_1++; continue;
    }
    if (l2 < f2) {
      n_1_0++; continue;
    }

    /* check that source regions are non-overlapping and in ascending order */
    if (((reverse) ? f2 : f1) <= mark) {
      Rprintf( "%s: source regions of alignment must be in ascending order\n", progname);
      Rprintf( "Last region was [*, %d]; current is [%d, %d].\n", mark, f1, l1);
      Rprintf( "Aborted.\n");
      rcqp_receive_error(1);
    }
    mark = (reverse) ? l2 : l1;     /* end of the source region just accepted */

    /* write alignment region to .alx file */
    if (reverse) {
      NwriteInt(f2, alx); NwriteInt(l2, alx);
      NwriteInt(f1, alx); NwriteInt(l1, alx);
    }
    else {
      NwriteInt(f1, alx); NwriteInt(l1, alx);
      NwriteInt(f2, alx); NwriteInt(l2, alx);
    }

    if (compatibility) {
      /* source and target regions of .alg file must be contiguous; store start points only; */
      /* hence we must collapse crossing alignments into one larger region (I know that's bullshit) */
      if ((f1 > current1) && (f2 > current2)) {
        if (reverse) {
          NwriteInt(f2, alg); NwriteInt(f1, alg);
        }
        else {
          NwriteInt(f1, alg); NwriteInt(f2, alg);
        }
        current1 = f1;
        current2 = f2;
      }
    }
  }
  if (compatibility) {
    if (reverse) {
      NwriteInt(size2, alg); NwriteInt(size1, alg); /* end of corpus alignment point*/
    }
    else {
      NwriteInt(size1, alg); NwriteInt(size2, alg); /* end of corpus alignment point*/
    }
  }

  if (verbose) {
    Rprintf("I skipped %d 0:1 alignments and %d 1:0 alignments.\n", n_0_1, n_1_0);
  }

  /* that's it; close file handles */
  fclose(alx);
  if (compatibility)
    fclose(alg);

  if (af_is_pipe)
    pclose(af);
  else
    fclose(af);

  return 0;
}
Esempio n. 7
0
/**
 * Sets up a corpus attribute.
 *
 * NEVER CALL THIS!! ONLY USED WHILE PARSING A REGISTRY ENTRY!!!!
 *
 * A new Attribute object is allocated, initialised and appended to the end of
 * the corpus's attribute list. If an attribute with this name and type is
 * already found by cl_new_attribute(), a warning is printed on stderr and
 * nothing is created.
 *
 * @param corpus          The corpus this attribute belongs to.
 * @param attribute_name  The name of the attribute (i.e. the handle it has in the registry file).
 *                        The pointer itself is stored in the new object (no copy is made).
 * @param type            Type of attribute to be created.
 * @param data            Unused. It can just be NULL.
 * @return                The new attribute, or NULL if it was already defined.
 */
Attribute *
setup_attribute(Corpus *corpus,
                char *attribute_name,
                int type,
                char *data)
{
  Attribute *attr;
  Attribute *prev;

  /* value for this attribute's attr_number: starts at 0 for the default
   * positional attribute and at 1 otherwise, then grows by one for every
   * link followed while walking to the end of the existing attribute list. */
  int a_num;

  attr = NULL;

  if (cl_new_attribute(corpus, attribute_name, type) != NULL)
    fprintf(stderr, "attributes:setup_attribute(): Warning: \n"
            "  Attribute %s of type %s already defined in corpus %s\n",
            attribute_name, aid_name(type), corpus->id);
  else {

    ComponentID cid;

    attr = new(Attribute);
    attr->type = type;
    attr->any.mother = corpus;
    attr->any.name = attribute_name;

    /* no component has been loaded or created yet */
    for (cid = CompDirectory; cid < CompLast; cid++)
      attr->any.components[cid] = NULL;

    /* `type` holds an ATT_* attribute type (it is stored in attr->type and
       switched on below), so it is compared against ATT_POS here as well */
    if (strcmp(attribute_name, DEFAULT_ATT_NAME) == 0 && type == ATT_POS)
      a_num = 0;
    else
      a_num = 1;

    /* insert at end of attribute list */

    attr->any.next = NULL;
    if (corpus->attributes == NULL)
      corpus->attributes = attr;
    else {
      for (prev = corpus->attributes; prev->any.next; prev = prev->any.next)
        a_num++;
      assert(prev);
      assert(prev->any.next == NULL);
      prev->any.next = attr;
    }
    attr->any.attr_number = a_num;

    attr->any.path = NULL;

    /* ======================================== type specific initializations */

    switch (attr->type) {

    case ATT_POS:
      attr->pos.hc = NULL;
      attr->pos.this_block_nr = -1;
      break;

    case ATT_STRUC:
      attr->struc.has_attribute_values = -1; /* not yet known */
      break;

    default:
      break;
    }
  }

  return attr;
}
Esempio n. 8
0
File: output.c Progetto: rforge/rcwb
/* Tabulate the specified query result, using the settings from the global list
   of tabulation items (TabulationList).
   Matches <first> .. <last> of <cl> are printed to the stream opened for <rd>,
   one line per match, with one TAB-separated column per tabulation item.
   Returns 1 if tabulation was successful and 0 otherwise (in most failure
   cases an error message is generated with cqpmessage()). */
int
print_tabulation(CorpusList *cl, int first, int last, struct Redir *rd)
{
  TabulationItem item;
  TabulationItem column;
  int match;

  if (cl == NULL)
    return 0;

  /* make sure that first and last match to tabulate are in range */
  if (first <= 0)
    first = 0;
  if (last >= cl->size)
    last = cl->size - 1;

  /* pass 1: obtain attribute handles for the tabulation items and validate anchors */
  for (item = TabulationList; item; item = item->next) {
    if (! item->attribute_name) {
      /* no attribute -> print corpus position */
      item->attribute_type = ATT_NONE;
    }
    else {
      /* try a positional attribute first, then a structural one */
      item->attribute = cl_new_attribute(cl->corpus, item->attribute_name, ATT_POS);
      if (item->attribute != NULL) {
        item->attribute_type = ATT_POS;
      }
      else {
        item->attribute = cl_new_attribute(cl->corpus, item->attribute_name, ATT_STRUC);
        if (item->attribute == NULL) {
          cqpmessage(Error, "Can't find attribute ``%s'' for named query %s", item->attribute_name, cl->name);
          return 0;
        }
        item->attribute_type = ATT_STRUC;
        if (! cl_struc_values(item->attribute)) {
          cqpmessage(Error, "No annotated values for s-attribute ``%s'' in named query %s", item->attribute_name, cl->name);
          return 0;
        }
      }
    }

    /* work around bug: anchor validation will fail for empty query result
       (but then the output loop below is void anyway) */
    if (cl->size > 0) {
      if (! pt_validate_anchor(cl, item->anchor1))
        return 0;
      if (! pt_validate_anchor(cl, item->anchor2))
        return 0;
    }
  }

  if (! open_stream(rd, cl->corpus->charset)) {
    cqpmessage(Error, "Can't redirect output to file or pipe\n");
    return 0;
  }

  /* pass 2: tabulate selected attribute values for matches <first> .. <last> */
  for (match = first; match <= last; match++) {
    for (column = TabulationList; column; column = column->next) {
      int from = pt_get_anchor_cpos(cl, match, column->anchor1, column->offset1);
      int to   = pt_get_anchor_cpos(cl, match, column->anchor2, column->offset2);
      int cpos;

      /* one of the anchors is undefined -> print single undefined value for entire range */
      if (from < 0 || to < 0) {
        from = -1;
        to = -1;
      }

      for (cpos = from; cpos <= to; cpos++) {
        if (column->attribute_type == ATT_NONE) {
          fprintf(rd->stream, "%d", cpos);
        }
        else if (cpos >= 0) {     /* undefined anchors print empty string */
          char *value;

          if (column->attribute_type == ATT_POS)
            value = cl_cpos2str(column->attribute, cpos);
          else
            value = cl_cpos2struc2str(column->attribute, cpos);

          if (value != NULL) {
            if (! column->flags) {
              fprintf(rd->stream, "%s", value);
            }
            else {
              /* normalise a private copy according to the item's flags */
              char *normalised = cl_strdup(value);
              cl_string_canonical(normalised, cl->corpus->charset, column->flags);
              fprintf(rd->stream, "%s", normalised);
              cl_free(normalised);
            }
          }
        }

        /* multiple values for tabulation item are separated by blanks */
        if (cpos < to)
          fputc(' ', rd->stream);
      }

      /* multiple tabulation items are separated by TABs */
      if (column->next)
        fputc('\t', rd->stream);
    }
    fputc('\n', rd->stream);
  }

  close_stream(rd);
  free_tabulation_list();
  return 1;
}
Esempio n. 9
0
/**
 * Main function for cwb-huffcode.
 *
 * Parses the command line, opens the requested corpus and runs
 * compute_code_lengths() on either one positional attribute (option -P,
 * defaulting to DEFAULT_ATT_NAME) or on every positional attribute of the
 * corpus (option -A). Unless -T is given, each compressed attribute is
 * subsequently passed to decode_check_huff().
 *
 * Recognised options:
 *   -T        skip the decompression / error checking pass
 *   -v        verbose: increments do_protocol
 *   -P attr   attribute to compress
 *   -r dir    registry directory (may be given only once)
 *   -f prefix filename prefix for compressed data files
 *   -d        debug mode (increments debug)
 *   -A        compress all positional attributes
 *   -h        help page
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       Does not return normally; the process terminates via exit():
 *               status 0 on success, 1 if the corpus or attribute cannot be
 *               found, 2 if -r is used twice.
 */
int
main(int argc, char **argv)
{
  char *registry_directory = NULL;
  char *output_fn = NULL;
  char *attr_name = DEFAULT_ATT_NAME;
  Attribute *attr;

  HCD hc;

  extern int optind;
  extern char *optarg;
  int c;

  int i_want_to_believe = 0;        /* skip error checks? */
  int all_attributes = 0;

  protocol = stdout;                /* 'delayed' init (see top of file) */

  /* ------------------------------------------------- PARSE ARGUMENTS */

  progname = argv[0];

  /* getopt() signals the end of the options with -1; EOF is only guaranteed
   * to be *some* negative value, so compare against -1 explicitly. */
  while ((c = getopt(argc, argv, "+TvP:r:f:dAh")) != -1) {
    switch (c) {

      /* T: skip decompression / error checking pass ("I trust you")  */
    case 'T':
      i_want_to_believe++;
      break;

      /* v: verbose -> displays protocol of compression process on stdout */
    case 'v':
      do_protocol++;
      break;

      /* P: attribute to compress */
    case 'P':
      attr_name = optarg;
      break;

      /* r: registry directory (a second -r is rejected as an error) */
    case 'r':
      if (registry_directory != NULL) {
        fprintf(stderr, "%s: -r option used twice\n", progname);
        exit(2);
      }
      registry_directory = optarg;
      break;

      /* f: filename prefix for compressed data files */
    case 'f':
      output_fn = optarg;
      break;

      /* d: debug mode  --- unused */
    case 'd':
      debug++;
      break;

      /* A: compress all attributes */
    case 'A':
      all_attributes++;
      break;

      /* h: help page */
    case 'h':
      huffcode_usage(NULL, 2);
      break;

    default:
      huffcode_usage("illegal option.", 2);
      break;
    }
  }

  /* exactly one non-option argument is expected: the corpus id */
  if (optind < argc) {
    corpus_id = argv[optind++];
  }
  else {
    /* NOTE(review): presumably huffcode_usage() terminates the program;
     * otherwise corpus_id would be used below without having been set here. */
    huffcode_usage("corpus not specified (missing argument)", 1);
  }

  if (optind < argc) {
    huffcode_usage("Too many arguments", 1);
  }

  corpus = cl_new_corpus(registry_directory, corpus_id);
  if (corpus == NULL) {
    fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n",
            corpus_id,
            (registry_directory ? registry_directory
               : central_corpus_directory()));
    exit(1);
  }

  if (all_attributes) {
    /* walk the corpus' attribute list and handle every positional attribute */
    for (attr = corpus->attributes; attr; attr = attr->any.next) {
      if (attr->any.type == ATT_POS) {
        compute_code_lengths(attr, &hc, output_fn);
        if (!i_want_to_believe) {
          decode_check_huff(attr, output_fn);
        }
      }
    }
  }
  else {
    /* single attribute mode: the named positional attribute must exist */
    attr = cl_new_attribute(corpus, attr_name, ATT_POS);
    if (attr == NULL) {
      fprintf(stderr, "Attribute %s.%s doesn't exist. Aborted.\n",
              corpus_id, attr_name);
      exit(1);
    }
    compute_code_lengths(attr, &hc, output_fn);
    if (!i_want_to_believe) {
      decode_check_huff(attr, output_fn);
    }
  }

  cl_delete_corpus(corpus);

  exit(0);
}