Ejemplo n.º 1
0
/**
 * R-callable variant of the cwb-huffcode main function.
 *
 * Opens the corpus and runs compute_code_lengths() followed by
 * decode_check_huff() on every positional attribute declared for it.
 * Progress lines ("Corpus: ...", "Corpus 1: ..." etc.) are printed via Rprintf().
 *
 * @param corpus_name    ID of the corpus to process.
 * @param registry_dir   Registry directory passed to cl_new_corpus(); if NULL,
 *                       the error message names central_corpus_directory() instead.
 * @return               0 on success; 1 if the corpus (or attribute) cannot be
 *                       opened and rcqp_receive_error() hands control back.
 */
int
R_cwb_huffcode (char *corpus_name, char *registry_dir)
{
  char *registry_directory = registry_dir;
  char *output_fn = NULL;
  char *attr_name = DEFAULT_ATT_NAME;
  Attribute *attr;

  HCD hc;

  Rprintf("Corpus: %s\n", corpus_name);

  int i_want_to_believe = 0;        /* skip error checks? */
  int all_attributes = 0;

  protocol = NULL;                /* 'delayed' init (see top of file) */

  /* ------------------------------------------------- PARSE ARGUMENTS */
  /* there is no command line in this variant: all attributes are always processed */

  all_attributes++;
  corpus_id_cwb_huffcode = corpus_name;

  Rprintf("Corpus 1: %s\n", corpus_name);

  if ((corpus = cl_new_corpus(registry_directory, corpus_id_cwb_huffcode)) == NULL) {
    Rprintf( "Corpus %s not found in registry %s . Aborted.\n",
             corpus_id_cwb_huffcode,
             (registry_directory ? registry_directory
                : central_corpus_directory()));
    rcqp_receive_error(1);
    /* NOTE(review): rcqp_receive_error() is not visible here; if it returns,
       we must not go on to dereference the NULL corpus handle below */
    return 1;
  }

  Rprintf("Corpus 2: %s\n", corpus_name);

  if (all_attributes) {
    for (attr = corpus->attributes; attr; attr = attr->any.next)
      if (attr->any.type == ATT_POS) {
        compute_code_lengths(attr, &hc, output_fn);
        if (! i_want_to_believe)
          decode_check_huff(attr, output_fn);
      }
  }
  else {
    /* single-attribute mode; unreachable at present because all_attributes is always set */
    if ((attr = cl_new_attribute(corpus, attr_name, ATT_POS)) == NULL) {
      Rprintf( "Attribute %s.%s doesn't exist. Aborted.\n",
               corpus_id_cwb_huffcode, attr_name);
      rcqp_receive_error(1);
      /* same precaution as above: never pass a NULL attribute on */
      cl_delete_corpus(corpus);
      return 1;
    }
    compute_code_lengths(attr, &hc, output_fn);
    if (! i_want_to_believe)
      decode_check_huff(attr, output_fn);
  }

  Rprintf("Corpus 3: %s\n", corpus_name);

  cl_delete_corpus(corpus);

  return(0);
}
Ejemplo n.º 2
0
/**
 * R-callable variant of the cwb-compress-rdx main function.
 *
 * Opens the named corpus and, for each of its positional attributes, runs
 * compress_reversed_index() followed by decompress_check_reversed_index().
 * Every exit path goes through compressrdx_cleanup().
 *
 * @param corpus_name    ID of the corpus to process.
 * @param registry_dir   Registry directory passed to cl_new_corpus(); if NULL,
 *                       the error message names cl_standard_registry() instead.
 * @return               0 (reached only if compressrdx_cleanup() returns).
 */
int
R_cwb_compress_rdx(char *corpus_name, char * registry_dir)
{
  Attribute *att;
  char *p_att_name = DEFAULT_ATT_NAME;  /* only used in single-attribute mode */
  char *data_fn = NULL;                 /* no output filename override */
  char *debug_path = NULL;              /* no debug file is requested in this variant */
  int skip_checks = 0;                  /* non-zero would skip the decompression check */
  int process_all = 1;                  /* this variant always handles all p-attributes */

  corpus_id_cwb_compress_rdx = corpus_name;

  /* optional debug output file ("-" switches the debug stream off) */
  if (debug_path != NULL) {
    if (0 == strcmp(debug_path, "-")) {
      debug_output = NULL;
    }
    else {
      debug_output = fopen(debug_path, "w");
      if (debug_output == NULL) {
        Rprintf( "Can't write debug_cwb_compress_rdx output to file %s. Aborted.", debug_path);
        perror(debug_path);
        compressrdx_cleanup(1);
      }
    }
  }

  corpus = cl_new_corpus(registry_dir, corpus_id_cwb_compress_rdx);
  if (corpus == NULL) {
    Rprintf( "Corpus %s not found in registry %s . Aborted.\n",
             corpus_id_cwb_compress_rdx,
             (registry_dir ? registry_dir : cl_standard_registry()));
    compressrdx_cleanup(1);
  }

  if (!process_all) {
    /* single-attribute mode */
    att = cl_new_attribute_oldstyle(corpus, p_att_name, ATT_POS, NULL);
    if (att == NULL) {
      Rprintf( "Attribute %s.%s doesn't exist. Aborted.\n", corpus_id_cwb_compress_rdx, p_att_name);
      compressrdx_cleanup(1);
    }
    compress_reversed_index(att, data_fn);
    if (!skip_checks) {
      decompress_check_reversed_index(att, data_fn);
    }
  }
  else {
    /* walk the corpus' attribute list and pick out the positional ones */
    for (att = corpus->attributes; att != NULL; att = att->any.next) {
      if (att->any.type != ATT_POS) {
        continue;
      }
      compress_reversed_index(att, data_fn);
      if (!skip_checks) {
        decompress_check_reversed_index(att, data_fn);
      }
    }
  }

  compressrdx_cleanup(0);
  return 0;                         /* keeps the compiler happy */
}
Ejemplo n.º 3
0
/**
 * Main function for cwb-makeall.
 *
 * Parses the command line, opens the corpus and calls makeall_do_attribute()
 * for the selected p-attributes: those listed after the corpus ID if there are
 * any, otherwise the one given with -P, otherwise every p-attribute of the corpus.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       Does not return normally; the function ends by calling exit().
 */
int
main(int argc, char **argv)
{
  char *attr_name = NULL;
  Attribute *attribute;

  char *registry_directory = NULL;
  char *corpus_id = NULL;

  extern int optind;
  extern char *optarg;
  int c;

  int validate = 0;

  char *component = NULL;

  ComponentID cid;
  int i = 0;

  /* ------------------------------------------------- PARSE ARGUMENTS */

  progname = argv[0];

  /* parse arguments */
  while ((c = getopt(argc, argv, "+r:c:P:hDM:V")) != EOF) {
    switch (c) {

    /* r: registry directory */
    case 'r':
      if (registry_directory == NULL)
        registry_directory = optarg;
      else {
        fprintf(stderr, "%s: -r option used twice\n", progname);
        exit(2);
      }
      break;

    /* P: name of a single p-attribute to process */
    case 'P':
      if (attr_name == NULL)
        attr_name = optarg;
      else {
        fprintf(stderr, "%s: -P option used twice\n", progname);
        exit(2);
      }
      break;

    /* c: name of a single component to process */
    case 'c':
      if (component == NULL)
        component = optarg;
      else {
        fprintf(stderr, "%s: -c option used twice\n", progname);
        exit(2);
      }
      break;

    /* D: debug mode */
    case 'D':
      cl_set_debug_level(1);
      break;

    /* M: memory limit */
    case 'M':
      i = atoi(optarg);
      cl_set_memory_limit(i);
      break;

    /* V: validate */
    case 'V':
      validate++;
      break;

    /* h, or anything unknown: usage message */
    case 'h':
    default:
      makeall_usage();
      break;
    }
  }

  if (optind >= argc) {
    fprintf(stderr, "Missing argument, try \"%s -h\" for more information.\n", progname);
    exit(1);
  }

  /* first argument: corpus id */
  corpus_id = argv[optind++];

  /* CompLast doubles as "no particular component requested" and as the
     failure value of component_id() */
  if (component != NULL) {
    cid = component_id(component);
    if (cid == CompLast) {
      fprintf(stderr, "Illegal component name: ``%s''\n", component);
      exit(1);
    }
  }
  else {
    cid = CompLast;
  }


  if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) {
    fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n",
            corpus_id,
            (registry_directory ? registry_directory
             : central_corpus_directory()));
    exit(1);
  }

  printf("=== Makeall: processing corpus %s ===\n", corpus_id);
  printf("Registry directory: %s\n", corpus->registry_dir);

  if (optind < argc) {
    /* remaining arguments are p-attribute names */
    for (i = optind; i < argc; i++) {
      if ((attribute = cl_new_attribute(corpus, argv[i], ATT_POS)) != NULL) {
        makeall_do_attribute(attribute, cid, validate);
      }
      else {
        /* report the name that actually failed (argv[i]); attr_name belongs
           to the -P option and is normally NULL on this path */
        fprintf(stderr, "p-attribute %s.%s not defined. Aborted.\n",
                corpus_id, argv[i]);
        exit(1);
      }
    }
  }
  else if (attr_name != NULL) {
    if ((attribute = cl_new_attribute(corpus, attr_name, ATT_POS)) != NULL) {
      makeall_do_attribute(attribute, cid, validate);
    }
    else {
      fprintf(stderr, "p-attribute %s.%s not defined. Aborted.\n",
              corpus_id, attr_name);
      exit(1);
    }
  }
  else {
    /* process each p-attribute of the corpus in turn */
    for (attribute = corpus->attributes; attribute; attribute = attribute->any.next)
      if (attribute->type == ATT_POS) {
        ComponentID my_cid;

        makeall_do_attribute(attribute, cid, validate);
        /* now destroy all components; this makes the attribute unusable,
           but it is currently the only way to free allocated and memory-mapped data */
        for (my_cid = CompDirectory; my_cid < CompLast; my_cid++) { /* ordering gleaned from attributes.h */
          drop_component(attribute, my_cid);
        }
      }
  }

  printf("========================================\n");
  exit(0);
}
Ejemplo n.º 4
0
/**
 * Fetches the value that has to follow the flag at argv[*cnt].
 *
 * If a value is present, *cnt is advanced to its index and the value is
 * returned. If the flag is the last word on the command line, an error is
 * reported and decode_cleanup(2) is called.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @param cnt    In/out: index of the flag; moved on to the index of its value.
 * @return       The value string (NULL only if decode_cleanup() comes back
 *               after a missing value has been reported).
 */
static char *
decode_flag_argument(int argc, char **argv, int *cnt)
{
  if (*cnt + 1 >= argc) {
    fprintf(stderr, "Missing argument after flag %s . Aborted.\n", argv[*cnt]);
    decode_cleanup(2);
  }
  return argv[++(*cnt)];
}

/**
 * Main function for cwb-decode.
 *
 * Parses the options (before the corpus ID) and the attribute declarations
 * (after it), then prints either a range of corpus positions or, in matchlist
 * mode (-p / -f), the positions read from stdin or a file.
 *
 * NOTE(review): the error paths rely on decode_cleanup() not returning; the
 * final return statement is only there for the compiler.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       0 (see note above).
 */
int
main(int argc, char **argv)
{
  Attribute *attr;
  Attribute *context = NULL;

  int sp;  /* start position of a match */
  int ep;  /* end position of a match */

  int w, cnt, read_pos_frm_stdin;

  char s[CL_MAX_LINE_LENGTH];      /* buffer for strings read from file */
  char *token;
  char *name;                      /* attribute name following an output flag */

  char *input_filename = NULL;
  FILE *input_file = stdin;

  /* ------------------------------------------------- PARSE ARGUMENTS */

  int c;
  extern char *optarg;
  extern int optind;

  progname = argv[0];

  first_token = -1;
  last = -1;
  maxlast = -1;

  read_pos_frm_stdin = 0;

  /* use getopt() to parse command-line options */
  while((c = getopt(argc, argv, "+s:e:r:nLHCxXf:ph")) != EOF)
    switch(c) {

      /* s: start corpus position */
    case 's':
      first_token = atoi(optarg);
      break;

      /* e: end corpus position */
    case 'e':
      last = atoi(optarg);
      break;

      /* r: registry directory */
    case 'r':
      if (registry_directory == NULL)
        registry_directory = optarg;
      else {
        fprintf(stderr, "%s: -r option used twice\n", progname);
        exit(2);
      }
      break;

      /* n: show cpos in -H mode */
    case 'n':
      printnum++;
      break;

      /* x: XML-compatible output in -C mode (-Cx) */
    case 'x':
      xml_compatible++;
      break;

      /* L,H,C,X: Lisp, Horizontal, Compact, and XML modes */
    case 'L':
      mode = LispMode;
      break;
    case 'H':
      mode = ConclineMode;
      break;
    case 'C':
      mode = EncodeMode;
      break;
    case 'X':
      mode = XMLMode;
      break;

      /* f: matchlist mode / read corpus positions from file */
    case 'f':
      input_filename = optarg;
      break;

      /* p: matchlist mode / read corpus positions from stdin */
    case 'p':
      read_pos_frm_stdin++;
      break;

      /* h: help page */
    case 'h':
      decode_usage(2);
      break;

    default:
      fprintf(stderr, "Illegal option. Try \"%s -h\" for more information.\n", progname);
      fprintf(stderr, "[remember that options go before the corpus name, and attribute declarations after it!]\n");
      decode_cleanup(2);
    }

  /* required argument: corpus id */
  if (optind < argc) {
    corpus_id = argv[optind++];

    if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) {
      fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n",
              corpus_id,
              (registry_directory ? registry_directory : cl_standard_registry() ) );
      decode_cleanup(1);
    }
  }
  else {
    fprintf(stderr, "Missing argument. Try \"%s -h\" for more information.\n", progname);
    decode_cleanup(2);
  }


  /* now parse output flags (-P, -S, ...) [cnt is our own argument counter];
     flags that take a value fetch it through decode_flag_argument(), which
     complains if the command line ends right after the flag */
  for (cnt = optind; cnt < argc; cnt++) {
    if (strcmp(argv[cnt], "-c") == 0) {         /* -c: context */

      name = decode_flag_argument(argc, argv, &cnt);
      if ((context = cl_new_attribute(corpus, name, ATT_STRUC)) == NULL) {
        fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n",
                corpus_id, name);
        decode_cleanup(1);
      }

    }
    else if (strcmp(argv[cnt], "-P") == 0) {    /* -P: positional attribute */

      name = decode_flag_argument(argc, argv, &cnt);
      if ((attr = cl_new_attribute(corpus, name, ATT_POS)) == NULL) {
        fprintf(stderr, "Can't open p-attribute %s.%s . Aborted.\n", corpus_id, name);
        decode_cleanup(1);
      }
      else {
        if (cl_max_cpos(attr) > 0) {
          decode_add_attribute(attr);
          if (maxlast < 0)
            maxlast = cl_max_cpos(attr); /* determines corpus size */
        }
        else {
          fprintf(stderr, "Attribute %s.%s is declared, but not accessible (missing data?). Aborted.\n",
                  corpus_id, name);
          decode_cleanup(1);
        }
      }

    }
    else if (strcmp(argv[cnt], "-ALL") == 0) {  /* -ALL: all p-attributes and s-attributes */

      for (attr = corpus->attributes; attr; attr = attr->any.next)
        if (attr->any.type == ATT_POS) {
          decode_add_attribute(attr);
          if (maxlast < 0)
            maxlast = cl_max_cpos(attr);
        }
        else if (attr->any.type == ATT_STRUC) {
          decode_add_attribute(attr);
        }

    }
    else if (strcmp(argv[cnt], "-D") == 0) {    /* -D: dynamic attribute (not implemented) */

      fprintf(stderr, "Sorry, dynamic attributes are not implemented. Aborting.\n");
      decode_cleanup(2);

    }
    else if (strcmp(argv[cnt], "-A") == 0) {    /* -A: alignment attribute */

      name = decode_flag_argument(argc, argv, &cnt);
      if ((attr = cl_new_attribute(corpus, name, ATT_ALIGN)) == NULL) {
        fprintf(stderr, "Can't open a-attribute %s.%s . Aborted.\n", corpus_id, name);
        decode_cleanup(1);
      }
      else
        decode_add_attribute(attr);
    }
    else if (strcmp(argv[cnt], "-S") == 0) {    /* -S: structural attribute (as tags) */

      name = decode_flag_argument(argc, argv, &cnt);
      if ((attr = cl_new_attribute(corpus, name, ATT_STRUC)) == NULL) {
        fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n",
                corpus_id, name);
        decode_cleanup(1);
      }
      else
        decode_add_attribute(attr);
    }
    else if (strcmp(argv[cnt], "-V") == 0) {    /* -V: show structural attribute values (with -p or -f) */

      name = decode_flag_argument(argc, argv, &cnt);
      if ((attr = cl_new_attribute(corpus, name, ATT_STRUC)) == NULL) {
        fprintf(stderr, "Can't open s-attribute %s.%s . Aborted.\n",
                corpus_id, name);
        decode_cleanup(1);
      }
      else if (!cl_struc_values(attr)) {
        fprintf(stderr, "S-attribute %s.%s does not have annotations. Aborted.\n",
                corpus_id, name);
        decode_cleanup(1);
      }
      else if (printValuesIndex >= MAX_PRINT_VALUES) {
        fprintf(stderr, "Too many -V attributes, sorry. Aborted.\n");
        decode_cleanup(1);
      }
      else
        printValues[printValuesIndex++] = attr;
    }
    else {

      fprintf(stderr, "Unknown flag: %s\n", argv[cnt]);
      decode_cleanup(2);

    }
  }
  /* ---- end of parse attribute declarations ---- */

  /* -f <file> implies matchlist mode; "-" stands for stdin */
  if (input_filename != NULL) {
    if (strcmp(input_filename, "-") == 0)
      input_file = stdin;
    else if ((input_file = fopen(input_filename, "r")) == NULL) {
      perror(input_filename);
      exit(1);
    }
    read_pos_frm_stdin++;
  }

  decode_verify_print_value_list();

  /* ------------------------------------------------------------ DECODE CORPUS */

  if (read_pos_frm_stdin == 0) {
    /*
     * normal mode: decode entire corpus or specified range
     */

    if (maxlast < 0) {
      fprintf(stderr, "Need at least one p-attribute (-P flag). Aborted.\n");
      decode_cleanup(2);
    }

    /* out-of-range start / end positions fall back to the corpus boundaries */
    if (first_token < 0 || first_token >= maxlast)
      first_token = 0;

    if (last < 0 || last >= maxlast)
      last = maxlast - 1;

    if (last < first_token) {
      fprintf(stderr, "Warning: output range #%d..#%d is empty. No output.\n", first_token, last);
      decode_cleanup(2);
    }

    if ( (mode == XMLMode) ||  ((mode == EncodeMode) && xml_compatible) ) {
      decode_print_xml_declaration();
      printf("<corpus name=\"%s\" start=\"%d\" end=\"%d\">\n",
             corpus_id, first_token, last);
    }

    /* decode_print_surrounding_s_att_values(first_token); */ /* don't do that in "normal" mode, coz it doesn't make sense */

    for (w = first_token; w <= last; w++)
      decode_print_token_sequence(w, -1, context);

    if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) {
      printf("</corpus>\n");
    }
  }
  else {
    /*
     * matchlist mode: read (pairs of) corpus positions from stdin or file
     */

    if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) {
      decode_print_xml_declaration();
      printf("<matchlist corpus=\"%s\">\n", corpus_id);
    }

    cnt = 0;
    while (fgets(s, CL_MAX_LINE_LENGTH, input_file) != NULL) {

      token = strtok(s, " \t\n");

      if ((token != NULL) && is_num(token)) {
        sp = atoi(token);

        /* the end position is optional; -1 means "not given" */
        ep = -1;
        if ((token = strtok(NULL, " \t\n")) != NULL) {
          if (!is_num(token)) {
            fprintf(stderr, "Invalid corpus position #%s . Aborted.\n", token);
            decode_cleanup(1);
          }
          else
            ep = atoi(token);
        }

        cnt++;                  /* count matches in matchlist  */
        if (mode == XMLMode) {
          printf("<match nr=\"%d\"", cnt);
          if (printnum)
            printf(" start=\"%d\" end=\"%d\"", sp, (ep >= 0) ? ep : sp);
          printf(">\n");
        }
        else {
          /* nothing shown before range */
        }

        decode_print_surrounding_s_att_values(sp);

        decode_print_token_sequence(sp, ep, context);

        if (mode == XMLMode) {
          printf("</match>\n");
        }
        else if (mode != ConclineMode) {
          printf("\n");         /* blank line, unless in -H mode */
        }
      }
      else {
        fprintf(stderr, "Invalid corpus position #%s . Aborted.\n", s);
        decode_cleanup(1);
      }
    }

    if (input_file != stdin)
      fclose(input_file);

    if ( (mode == XMLMode) || ((mode == EncodeMode) && xml_compatible) ) {
      printf("</matchlist>\n");
    }
  }

  decode_cleanup(0);
  return 0;                     /* just to keep gcc from complaining */
}
Ejemplo n.º 5
0
/**
 * Reads the command-line switches with getopt() and stores the results in
 * the program's global settings.
 *
 * After the switches have been consumed the function makes sure that an
 * s-attribute was declared with -S or -V, that no extra arguments remain,
 * and opens the source corpus if one was named with -C. Problems are
 * reported through Rprintf() / perror() followed by rcqp_receive_error(1).
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 */
void
sencode_parse_options(int argc, char **argv)
{
  extern char *optarg;
  extern int optind;
  int opt;

  char *output_dir = ".";         /* data files go to the current directory unless -d says otherwise */
  char *registry_path = NULL;     /* only relevant when a source corpus is given */
  char *source_corpus = NULL;     /* optional, set with the -C switch */

  text_fd = NULL;                 /* stays NULL until -f is seen; stdin is substituted later */
  new_satt.name = NULL;           /* reset so we can tell afterwards whether -S / -V was used */

  for (;;) {
    opt = getopt(argc, argv, "+qBd:f:msDS:V:r:C:Mah");
    if (opt == EOF)
      break;

    switch (opt) {

    /* ---- simple counters / flags ---- */

    case 'q':                     /* be silent (quiet) */
      silent++;
      break;

    case 'B':                     /* strip blanks */
      strip_blanks_in_values++;
      break;

    case 'M':                     /* compile list in memory, then write to disk */
      in_memory++;
      break;

    case 'a':                     /* add to existing attribute (implies -M) */
      add_to_existing++;
      in_memory++;
      break;

    case 'm':                     /* set ('multi-value') attribute; the exact flavour is decided later */
      set_att = set_any;
      break;

    case 's':                     /* strict syntax checks on set attribute */
      set_syntax_strict++;
      break;

    case 'D':                     /* debug mode */
      debug++;
      break;

    /* ---- switches that carry a value ---- */

    case 'd':                     /* directory for generated data files */
      output_dir = optarg;
      break;

    case 'r':                     /* registry directory */
      registry_path = optarg;
      break;

    case 'C':                     /* source corpus */
      source_corpus = optarg;
      break;

    case 'f':                     /* read input from file */
      if (text_fd != NULL) {
        Rprintf( "Error: -f option used twice\n\n");
        rcqp_receive_error(1);
      }
      text_fd = fopen(optarg, "r");
      if (text_fd == NULL) {
        perror("Can't open input file");
        rcqp_receive_error(1);
      }
      break;

    /* ---- attribute declarations (must come last) ---- */

    case 'S':                     /* s-attribute without annotations */
      sencode_declare_new_satt(optarg, output_dir, 0);
      if (optind < argc) {
        Rprintf( "Error: -S <att> must be last flag on command line.\n\n");
        rcqp_receive_error(1);
      }
      break;

    case 'V':                     /* s-attribute with annotations */
      sencode_declare_new_satt(optarg, output_dir, 1);
      if (optind < argc) {
        Rprintf( "Error: -V <att> must be last flag on command line.\n\n");
        rcqp_receive_error(1);
      }
      break;

    /* ---- -h or anything unknown: usage message ---- */

    case 'h':
    default:
      sencode_usage();
      break;
    }
  }

  /* defaults and obligatory values */
  if (text_fd == NULL)
    text_fd = stdin;

  if (NULL == new_satt.name) {
    Rprintf( "Error: either -S or -V flag must be specified.\n\n");
    rcqp_receive_error(1);
  }

  if (argc > optind) {
    Rprintf( "Error: extra arguments.\n\n");
    rcqp_receive_error(1);
  }

  /* open the source corpus, but only if -C <corpus> was specified */
  if (source_corpus != NULL) {
    corpus = cl_new_corpus(registry_path, source_corpus);
    if (NULL == corpus) {
      Rprintf( "Error: Can't find corpus <%s>!\n", source_corpus);
      rcqp_receive_error(1);
    }
  }

}
Ejemplo n.º 6
0
/**
 * Main function for cwb-align-encode.
 *
 * Reads a .align file (optionally gzip-compressed), checks its header against
 * the two corpora it names, and writes the alignment regions to an .alx file
 * and, in compatibility mode, additionally to an .alg file.
 *
 * NOTE(review): every error path calls rcqp_receive_error(1) and then carries
 * on; this code assumes that call does not return — confirm.
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       0 once all files have been written and closed.
 */
int
main(int argc, char *argv[])
{
  int argindex;                         /* index of first argument in argv[] */

  char *align_name = NULL;              /* name of the .align file */
  FILE *af = NULL;                      /* alignment file handle */
  int af_is_pipe;                       /* need to know whether to call fclose() or pclose() */
  char alx_name[CL_MAX_LINE_LENGTH];    /* full pathname of .alx file */
  char alg_name[CL_MAX_LINE_LENGTH];    /* full pathname of optional .alg file */
  int n_alx = 0, n_alg = 0;             /* snprintf() results for the two pathnames */
  FILE *alx=NULL, *alg=NULL;            /* file handles for .alx and optional .alg file */

  char line[CL_MAX_LINE_LENGTH];        /* one line of input from <infile> */

  /* the header fields are scanned out of line[] with %s, so a single field can
     be as long as the line itself: size the buffers accordingly */
  char corpus1_name[CL_MAX_LINE_LENGTH];
  char corpus2_name[CL_MAX_LINE_LENGTH];
  char s1_name[CL_MAX_LINE_LENGTH];
  char s2_name[CL_MAX_LINE_LENGTH];
  Corpus *corpus1, *corpus2;            /* corpus handles */
  Attribute *w1, *w2;                   /* attribute handles for 'word' attributes; used to determine corpus size */
  int size1, size2;                     /* size of source & target corpus */

  Corpus *source_corpus;                /* encode alignment in this corpus (depends on -R flag, important for -D option) */
  char *source_corpus_name;             /* just for error messages */
  char *attribute_name;                 /* name of alignment attribute (depends on -R flag, must be lowercase) */

  int f1,l1,f2,l2;                      /* alignment regions */
  int src_first, src_last;              /* source region of the current line (depends on -R flag) */
  int current1, current2;
  int mark, n_0_1, n_1_0;

  int l;

  progname = argv[0];

  /* parse command line and read arguments */
  argindex = alignencode_parse_args(argc, argv, 1);
  align_name = argv[argindex];

  /* open alignment file and parse header; .gz files are automatically decompressed */
  af_is_pipe = 0;
  l = strlen(align_name);
  if ((l > 3) && (strncasecmp(align_name + l - 3, ".gz", 3) == 0)) {
    char *pipe_cmd = (char *) cl_malloc(l+10);    /* "gzip -cd " is 9 characters, plus the terminating NUL */
    /* NOTE(review): the file name is pasted into a shell command unquoted */
    sprintf(pipe_cmd, "gzip -cd %s", align_name); /* write .gz file through gzip pipe */
    af = popen(pipe_cmd, "r");
    if (af == NULL) {
      perror(pipe_cmd);
      Rprintf( "%s: can't read compressed file %s\n", progname, align_name);
      rcqp_receive_error(1);
    }
    af_is_pipe = 1;
    cl_free(pipe_cmd);
  }
  else {
    af = fopen(align_name, "r");
    if (af == NULL) {
      perror(align_name);
      Rprintf( "%s: can't read file %s\n", progname, align_name);
      rcqp_receive_error(1);
    }
  }

  /* read header = first line; an empty file leaves line[] as the empty string
     so that it is never scanned or printed uninitialised */
  line[0] = '\0';
  if ((NULL == fgets(line, CL_MAX_LINE_LENGTH, af)) ||
      (4 != sscanf(line, "%s %s %s %s", corpus1_name, s1_name, corpus2_name, s2_name))) {
    Rprintf( "%s: %s not in .align format\n", progname, align_name);
    Rprintf( "wrong header: %s", line);
    rcqp_receive_error(1);
  }
  if (verbose) {
    if (reverse)
      Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus2_name, corpus1_name, align_name);
    else
      Rprintf("Encoding alignment for [%s, %s] from file %s\n", corpus1_name, corpus2_name, align_name);
  }

  /* open corpora and determine their sizes (for validity checks and compatibility mode) */
  if (NULL == (corpus1 = cl_new_corpus(registry_dir, corpus1_name))) {
    Rprintf( "%s: can't open corpus %s\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  if (NULL == (corpus2 = cl_new_corpus(registry_dir, corpus2_name))) {
    Rprintf( "%s: can't open corpus %s\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }
  if (NULL == (w1 = cl_new_attribute(corpus1, "word", ATT_POS))) {
    Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  if (NULL == (w2 = cl_new_attribute(corpus2, "word", ATT_POS))) {
    Rprintf( "%s: can't open p-attribute %s.word\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }

  size1 = cl_max_cpos(w1);
  if (size1 <= 0) {
    Rprintf( "%s: data access error (%s.word)\n", progname, corpus1_name);
    rcqp_receive_error(1);
  }
  size2 = cl_max_cpos(w2);
  if (size2 <= 0) {
    Rprintf( "%s: data access error (%s.word)\n", progname, corpus2_name);
    rcqp_receive_error(1);
  }

  /* now work out the actual source corpus and the alignment attribute name (depending on -R flag) */
  source_corpus = (reverse) ? corpus2 : corpus1;
  source_corpus_name = (reverse) ? corpus2_name : corpus1_name;
  attribute_name = cl_strdup((reverse) ? corpus1_name : corpus2_name);
  cl_id_tolower(attribute_name); /* fold attribute name to lowercase */

  /* with -D option, determine data file name(s) from actual source corpus;
     otherwise use directory specified with -d and the usual naming conventions */
  if (data_dir_from_corpus) {
    Attribute *alignment = cl_new_attribute(source_corpus, attribute_name, ATT_ALIGN);
    char *comp_pathname;

    if (alignment == NULL) {
      Rprintf( "%s: alignment attribute %s.%s not declared in registry file\n",
               progname, source_corpus_name, attribute_name);
      rcqp_receive_error(1);
    }
    comp_pathname = component_full_name(alignment, CompXAlignData, NULL);
    if (comp_pathname == NULL) {
      Rprintf( "%s: can't determine pathname for .alx file (internal error)\n", progname);
      rcqp_receive_error(1);
    }
    /* need to copy because component_full_name() returns pointer to internal buffer */
    n_alx = snprintf(alx_name, sizeof(alx_name), "%s", comp_pathname);
    if (compatibility) {
      comp_pathname = component_full_name(alignment, CompAlignData, NULL);
      if (comp_pathname == NULL) {
        Rprintf( "%s: can't determine pathname for .alg file (internal error)\n", progname);
        rcqp_receive_error(1);
      }
      n_alg = snprintf(alg_name, sizeof(alg_name), "%s", comp_pathname);
    }
  }
  else {
    n_alx = snprintf(alx_name, sizeof(alx_name), "%s" SUBDIR_SEP_STRING "%s.alx", data_dir, attribute_name);
    if (compatibility)
      n_alg = snprintf(alg_name, sizeof(alg_name), "%s" SUBDIR_SEP_STRING "%s.alg", data_dir, attribute_name);
  }

  /* refuse to go on with a truncated pathname rather than write to the wrong file */
  if ((n_alx < 0) || (n_alx >= (int) sizeof(alx_name)) ||
      (n_alg < 0) || (n_alg >= (int) sizeof(alg_name))) {
    Rprintf( "%s: pathname of data file is too long\n", progname);
    rcqp_receive_error(1);
  }

  /* now open output file(s) */
  alx = fopen(alx_name, "wb");
  if (alx == NULL) {
    perror(alx_name);
    Rprintf( "%s: can't write file %s\n", progname, alx_name);
    rcqp_receive_error(1);
  }
  if (verbose)
    Rprintf("Writing file %s ...\n", alx_name);

  if (compatibility) {
    alg = fopen(alg_name, "wb");
    if (alg == NULL) {
      perror(alg_name);
      Rprintf( "%s: can't write file %s\n", progname, alg_name);
      rcqp_receive_error(1);
    }

    if (verbose)
      Rprintf("Writing file %s ...\n", alg_name);
  }

  /* main encoding loop */
  f1 = f2 = l1 = l2 = 0;
  mark = -1;                        /* check that regions occur in ascending order */
  current1 = current2 = -1;         /* for compatibility mode */
  n_0_1 = n_1_0 = 0;                /* number of 0:1 and 1:0 alignments, which are skipped */
  while (! feof(af)) {
    if (NULL == fgets(line, CL_MAX_LINE_LENGTH, af))
      break;                        /* end of file (or read error, which we choose to ignore) */
    if (4 != sscanf(line, "%d %d %d %d", &f1, &l1, &f2, &l2)) {
      Rprintf( "%s: input format error: %s", progname, line);
      rcqp_receive_error(1);
    }

    /* skip 0:1 and 1:0 alignments */
    if (l1 < f1) {
      n_0_1++; continue;
    }
    if (l2 < f2) {
      n_1_0++; continue;
    }

    /* check that source regions are non-overlapping and in ascending order;
       with -R the source side is the second region of the line */
    src_first = (reverse) ? f2 : f1;
    src_last  = (reverse) ? l2 : l1;
    if (src_first <= mark) {
      Rprintf( "%s: source regions of alignment must be in ascending order\n", progname);
      Rprintf( "Last region was [*, %d]; current is [%d, %d].\n", mark, src_first, src_last);
      Rprintf( "Aborted.\n");
      rcqp_receive_error(1);
    }
    mark = src_last;

    /* write alignment region to .alx file */
    if (reverse) {
      NwriteInt(f2, alx); NwriteInt(l2, alx);
      NwriteInt(f1, alx); NwriteInt(l1, alx);
    }
    else {
      NwriteInt(f1, alx); NwriteInt(l1, alx);
      NwriteInt(f2, alx); NwriteInt(l2, alx);
    }

    if (compatibility) {
      /* source and target regions of .alg file must be contiguous; store start points only; */
      /* hence crossing alignments have to be collapsed into one larger region */
      if ((f1 > current1) && (f2 > current2)) {
        if (reverse) {
          NwriteInt(f2, alg); NwriteInt(f1, alg);
        }
        else {
          NwriteInt(f1, alg); NwriteInt(f2, alg);
        }
        current1 = f1;
        current2 = f2;
      }
    }
  }
  if (compatibility) {
    if (reverse) {
      NwriteInt(size2, alg); NwriteInt(size1, alg); /* end of corpus alignment point*/
    }
    else {
      NwriteInt(size1, alg); NwriteInt(size2, alg); /* end of corpus alignment point*/
    }
  }

  if (verbose) {
    Rprintf("I skipped %d 0:1 alignments and %d 1:0 alignments.\n", n_0_1, n_1_0);
  }

  /* that's it; close file handles */
  fclose(alx);
  if (compatibility)
    fclose(alg);

  if (af_is_pipe)
    pclose(af);
  else
    fclose(af);

  cl_free(attribute_name);          /* allocated by cl_strdup() above */

  return 0;
}
Ejemplo n.º 7
0
/**
 * Main function for cwb-describe-corpus.
 *
 * For every corpus ID given on the command line, opens the corpus and prints
 * a header followed by the basic information, plus statistics (-s) and/or
 * details (-d) when requested. Output goes through Rprintf().
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       0 after all corpora have been described.
 */
int
main(int argc, char **argv)
{
  extern char *optarg;
  extern int optind;

  char *registry_path = NULL;     /* -r <dir>; NULL means none was given */
  int want_stats = 0;             /* -s */
  int want_details = 0;           /* -d */
  Corpus *handle;
  int opt;
  int arg;

  progname = argv[0];

  for (;;) {
    opt = getopt(argc, argv, "+r:sdh");
    if (opt == EOF)
      break;

    switch (opt) {

    case 's':                     /* show statistics */
      want_stats++;
      break;

    case 'd':                     /* show details */
      want_details++;
      break;

    case 'r':                     /* change registry directory (allowed once only) */
      if (registry_path != NULL) {
        Rprintf( "%s: -r option used twice\n", progname);
        rcqp_receive_error(2);
      }
      else {
        registry_path = optarg;
      }
      break;

    case 'h':                     /* help page; also shown for unknown options */
    default:
      describecorpus_usage();
      break;
    }
  }

  /* at least one corpus ID is required */
  if (argc <= optind) {
    Rprintf( "Missing argument, try \"%s -h\" for more information.\n", progname);
    rcqp_receive_error(1);
  }

  for (arg = optind; arg < argc; arg++) {
    handle = cl_new_corpus(registry_path, argv[arg]);
    if (handle == NULL) {
      Rprintf( "ERROR. Can't access corpus %s !\n", argv[arg]);
      rcqp_receive_error(1);
    }

    Rprintf("\n============================================================\n");
    Rprintf("Corpus: %s\n", argv[arg]);
    Rprintf("============================================================\n\n");

    /* attribute names are listed only when neither -s nor -d was selected */
    describecorpus_show_basic_info(handle, (!want_stats && !want_details));

    if (want_stats)
      describecorpus_show_statistics(handle);

    if (want_details)
      describe_corpus(handle);

    cl_delete_corpus(handle);
  }

  return 0;
}
Ejemplo n.º 8
0
/**
 * Main function for cwb-huffcode.
 *
 * Parses the command line, opens the corpus named by the single non-option
 * argument and runs compute_code_lengths() on either all positional
 * attributes (-A) or the one selected with -P (DEFAULT_ATT_NAME otherwise);
 * unless -T is given, each is followed by decode_check_huff().
 *
 * @param argc   Number of command-line arguments.
 * @param argv   Command-line arguments.
 * @return       Does not return normally; the function ends by calling exit().
 */
int
main(int argc, char **argv)
{
  extern int optind;
  extern char *optarg;

  Attribute *att;
  HCD huff;
  char *registry_path = NULL;             /* -r */
  char *file_prefix = NULL;               /* -f */
  char *p_att_name = DEFAULT_ATT_NAME;    /* -P */
  int skip_verification = 0;              /* -T: skip error checks? */
  int process_all = 0;                    /* -A */
  int opt;

  protocol = stdout;                /* 'delayed' init (see top of file) */

  /* ------------------------------------------------- PARSE ARGUMENTS */

  progname = argv[0];

  for (;;) {
    opt = getopt(argc, argv, "+TvP:r:f:dAh");
    if (opt == EOF)
      break;

    switch (opt) {

    case 'T':                       /* skip decompression / error checking pass ("I trust you") */
      skip_verification++;
      break;

    case 'v':                       /* verbose -> displays protocol of compression process on stdout */
      do_protocol++;
      break;

    case 'd':                       /* debug mode  --- unused */
      debug++;
      break;

    case 'A':                       /* compress all attributes */
      process_all++;
      break;

    case 'P':                       /* attribute to compress */
      p_att_name = optarg;
      break;

    case 'f':                       /* filename prefix for compressed data files */
      file_prefix = optarg;
      break;

    case 'r':                       /* registry directory (allowed once only) */
      if (registry_path != NULL) {
        fprintf(stderr, "%s: -r option used twice\n", progname);
        exit(2);
      }
      registry_path = optarg;
      break;

    case 'h':                       /* help page */
      huffcode_usage(NULL, 2);
      break;

    default:
      huffcode_usage("illegal option.", 2);
      break;
    }
  }

  /* exactly one argument is expected: the corpus id */
  if (optind >= argc) {
    huffcode_usage("corpus not specified (missing argument)", 1);
  }
  else {
    corpus_id = argv[optind++];
  }

  if (optind < argc) {
    huffcode_usage("Too many arguments", 1);
  }

  corpus = cl_new_corpus(registry_path, corpus_id);
  if (corpus == NULL) {
    fprintf(stderr, "Corpus %s not found in registry %s . Aborted.\n",
            corpus_id,
            (registry_path ? registry_path
               : central_corpus_directory()));
    exit(1);
  }

  if (!process_all) {
    /* just the one attribute that was asked for */
    att = cl_new_attribute(corpus, p_att_name, ATT_POS);
    if (att == NULL) {
      fprintf(stderr, "Attribute %s.%s doesn't exist. Aborted.\n",
              corpus_id, p_att_name);
      exit(1);
    }
    compute_code_lengths(att, &huff, file_prefix);
    if (!skip_verification) {
      decode_check_huff(att, file_prefix);
    }
  }
  else {
    /* every positional attribute in the corpus' attribute list */
    for (att = corpus->attributes; att != NULL; att = att->any.next) {
      if (att->any.type != ATT_POS) {
        continue;
      }
      compute_code_lengths(att, &huff, file_prefix);
      if (!skip_verification) {
        decode_check_huff(att, file_prefix);
      }
    }
  }

  cl_delete_corpus(corpus);

  exit(0);
}