Example #1
0
File: regopt.c Project: cran/rcqp
/**
 * Matches a regular expression against a string.
 *
 * The regular expression contained in the CL_Regex is compared to the string.
 * No settings or flags are passed to this function; rather, the
 * settings that rx was created with are used.
 *
 * If rx->flags is non-zero, the string is copied into rx->haystack_buf and
 * normalised there with cl_string_canonical() before matching; otherwise
 * the caller's string is matched directly.
 * NOTE(review): the copy is an unbounded strcpy(), so str is assumed to fit
 * into rx->haystack_buf -- confirm that callers guarantee this.
 *
 * If the regex has optimiser grains (rx->grains > 0), the string is first
 * scanned for an occurrence of any grain, using rx->jumptable to skip ahead.
 * The outcome of that scan is recorded in grain_match. With the preprocessor
 * switch below in its current state, pcre_exec() is always called, and a
 * diagnostic is printed if PCRE matches although no grain was found.
 *
 * @param rx   The regular expression to match.
 * @param str  The string to compare the regex to.
 * @return     Boolean: true if the regex matched, otherwise false.
 */
int
cl_regex_match(CL_Regex rx, char *str)
{
  char *haystack; /* either the original string to match against, or a pointer to rx->haystack_buf */
  int optimised = (rx->grains > 0);
  int i, di, k, max_i, len, jump;
  int grain_match, result;
  int ovector[30]; /* memory for pcre to use for back-references in pattern matches */

  if (rx->flags) { /* normalise input string if necessary */
    haystack = rx->haystack_buf;
    strcpy(haystack, str);
    cl_string_canonical(haystack, rx->charset, rx->flags);
  }
  else
    haystack = str;
  len = strlen(haystack);

  if (optimised) {
    /* this 'optimised' matcher may look fairly bloated, but it's still way ahead of POSIX regexen */
    grain_match = 0;
    /* i is the string offset where the first character of each grain would be */
    max_i = len - rx->grain_len; /* stop trying to match when i > max_i */
    if (rx->anchor_end)
      i = (max_i >= 0) ? max_i : 0; /* if anchored at end, align grains with end of string */
    else
      i = 0;

    /* stop scanning as soon as one grain has been found: further iterations
     * could only set grain_match to 1 again, so the result is unaffected */
    while (i <= max_i && !grain_match) {
      /* look up the skip distance for the character under the last grain position */
      jump = rx->jumptable[(unsigned char) haystack[i + rx->grain_len - 1]];
      if (jump > 0) {
        i += jump; /* Boyer-Moore search */
      }
      else {
        /* for each grain: compare it character by character at offset i */
        for (k = 0; k < rx->grains; k++) {
          di = 0;
          while ((di < rx->grain_len) && (rx->grain[k][di] == haystack[i + di]))
            di++;
          if (di >= rx->grain_len) {
            grain_match = 1;
            break; /* we have found a grain match and can quit the loop */
          }
        }
        i++;
      }
      if (rx->anchor_start)
        break; /* if anchored at start, only the first iteration can match */
    }
  } /* endif optimised */
  else
    /* if the regex is not optimised, always behave as if a grain was matched */
    grain_match = 1;

  /* if there was a grain-match, we call pcre_exec, which might match or might not find a match in the end;
   * but if there wasn't a grain-match, we know that pcre won't match; so we wouldn't need to call it. */

  if (!grain_match) { /* enabled since version 2.2.b94 (14 Feb 2006) -- before: && cl_optimize */
    cl_regopt_successes++;
    result = PCRE_ERROR_NOMATCH;  /* the return code from PCRE when there is, um, no match */
  }
#if 0
  /* this variant calls pcre_exec only when a grain matched (or the regex is not optimised) */
  else {
#else
  /* currently active variant, for debug purposes: always call pcre regardless of whether the grains matched. */
  /* this allows the code in the below #if 1 to check whether or not grains are behaving as they should. */
  if (1) {
#endif
    result = pcre_exec(rx->needle, rx->extra, haystack,
                       len, 0, PCRE_NO_UTF8_CHECK,
                       ovector, 30);
    if (result < PCRE_ERROR_NOMATCH && cl_debug)
      /* note, "no match" is a PCRE "error", but all actual errors are lower numbers */
     Rprintf( "CL: Regex Execute Error no. %d (see `man pcreapi` for error codes)\n", result);
  }


#if 1
  /* debugging code used before version 2.2.b94, modified to pcre return values & re-enabled in 3.2.b3 */
  /* check for critical error: optimiser didn't accept candidate, but regex matched */
  if ((result > 0) && !grain_match)
   Rprintf( "CL ERROR: regex optimiser did not accept '%s' although it should have!\n", haystack);
#endif

  return (result > 0); /* return true if regular expression matched */
}

/**
 * Deletes a CL_Regex object.
 *
 * The compiled pattern (rx->needle) and the study data (rx->extra) are
 * released with pcre_free(); the haystack buffer, the grain strings and
 * the object itself are released with cl_free().
 *
 * Passing NULL is allowed and does nothing.
 *
 * @param rx  The CL_Regex to delete.
 */
void
cl_delete_regex(CL_Regex rx)
{
  /* DON'T use cl_free() for PCRE opaque objects, just in case; use PCRE built-in
   * pcre_free(). Note this will probably just be set to = free(). But it might not.
   * We can let PCRE worry about that. That does mean, however, we should test the
   * pointers for non-nullity before calling pcre_free. Normally we would also set the
   * pointers to NULL after freeing the target. However, in this case, we know the
   * structure they belong to will be freed by the end of the function, so no worries.
   */
  int i;

  if (!rx)
    return;                        /* nothing to delete; avoids dereferencing a NULL object */

  if (rx->needle)
    pcre_free(rx->needle);         /* free PCRE regex buffer */
  if (rx->extra)
    pcre_free(rx->extra);          /* and "extra" buffer */
  cl_free(rx->haystack_buf);       /* free string buffer if it was allocated */
  for (i = 0; i < rx->grains; i++) {
    cl_free(rx->grain[i]);         /* free grain strings if regex was optimised */
  }

  cl_free(rx);
}
Example #2
0
File: output.c Project: rforge/rcwb
/* Tabulate the matches <first> .. <last> of a named query result, one output line per match,
   according to the global list of tabulation items (TabulationList).
   Returns 1 on success; returns 0 (after issuing an error message where one is generated here)
   if <cl> is NULL, an attribute cannot be resolved, an anchor fails validation, or the
   output stream cannot be opened. */
int
print_tabulation(CorpusList *cl, int first, int last, struct Redir *rd)
{
  TabulationItem spec;
  int match_no;

  if (cl == NULL)
    return 0;

  /* clamp the requested range of matches to the query result */
  if (first <= 0)
    first = 0;
  if (last >= cl->size)
    last = cl->size - 1;

  /* first pass over the tabulation items: obtain attribute handles and validate anchors */
  for (spec = TabulationList; spec != NULL; spec = spec->next) {
    if (!spec->attribute_name) {
      /* no attribute -> print corpus position */
      spec->attribute_type = ATT_NONE;
    }
    else {
      /* try a positional attribute first, then fall back to a structural one */
      spec->attribute = cl_new_attribute(cl->corpus, spec->attribute_name, ATT_POS);
      if (spec->attribute != NULL) {
        spec->attribute_type = ATT_POS;
      }
      else {
        spec->attribute = cl_new_attribute(cl->corpus, spec->attribute_name, ATT_STRUC);
        if (spec->attribute == NULL) {
          cqpmessage(Error, "Can't find attribute ``%s'' for named query %s", spec->attribute_name, cl->name);
          return 0;
        }
        spec->attribute_type = ATT_STRUC;
        if (!cl_struc_values(spec->attribute)) {
          cqpmessage(Error, "No annotated values for s-attribute ``%s'' in named query %s", spec->attribute_name, cl->name);
          return 0;
        }
      }
    }

    /* work around bug: anchor validation will fail for empty query result
       (but then the output loop below is void anyway) */
    if (cl->size > 0
        && (!pt_validate_anchor(cl, spec->anchor1) || !pt_validate_anchor(cl, spec->anchor2)))
      return 0;
  }

  if (!open_stream(rd, cl->corpus->charset)) {
    cqpmessage(Error, "Can't redirect output to file or pipe\n");
    return 0;
  }

  /* second pass: write one line per match, one TAB-separated column per tabulation item */
  for (match_no = first; match_no <= last; match_no++) {
    TabulationItem col;

    for (col = TabulationList; col != NULL; col = col->next) {
      int from = pt_get_anchor_cpos(cl, match_no, col->anchor1, col->offset1);
      int to   = pt_get_anchor_cpos(cl, match_no, col->anchor2, col->offset2);
      int pos;

      /* one of the anchors is undefined -> print single undefined value for entire range */
      if (from < 0 || to < 0) {
        from = -1;
        to = -1;
      }

      for (pos = from; pos <= to; pos++) {
        if (col->attribute_type == ATT_NONE) {
          fprintf(rd->stream, "%d", pos);
        }
        else if (pos >= 0) {
          /* undefined anchors (pos < 0) print the empty string */
          char *value = NULL;

          if (col->attribute_type == ATT_POS)
            value = cl_cpos2str(col->attribute, pos);
          else
            value = cl_cpos2struc2str(col->attribute, pos);

          if (value != NULL) {
            if (!col->flags) {
              fprintf(rd->stream, "%s", value);
            }
            else {
              /* normalise a private copy so the attribute's own string is left untouched */
              char *folded = cl_strdup(value);
              cl_string_canonical(folded, cl->corpus->charset, col->flags);
              fprintf(rd->stream, "%s", folded);
              cl_free(folded);
            }
          }
        }

        /* multiple values for a tabulation item are separated by blanks */
        if (pos < to)
          fprintf(rd->stream, " ");
      }

      /* multiple tabulation items are separated by TABs */
      if (col->next)
        fprintf(rd->stream, "\t");
    }
    fprintf(rd->stream, "\n");
  }

  close_stream(rd);
  free_tabulation_list();
  return 1;
}
Example #3
0
File: regopt.c Project: cran/rcqp
/**
 * Create a new CL_regex object (ie a regular expression buffer).
 *
 * The regular expression is preprocessed according to the flags, and
 * anchored to the start and end of the string. (That is, ^ is added to
 * the start, $ to the end.)
 *
 * Then the resulting regex is compiled (using PCRE), studied with
 * pcre_study(), and passed to the CL regex optimiser.
 *
 * On success cl_errno is set to CDA_OK. On a compile error the PCRE error
 * message is copied to cl_regex_error and printed, cl_errno is set to
 * CDA_EBADREGEX, and NULL is returned.
 *
 * @param regex    String containing the regular expression
 * @param flags    IGNORE_CASE, or IGNORE_DIAC, or both, or 0.
 *                 Any other bits are masked out and ignored.
 * @param charset  The character set of the regex.
 * @return         The new CL_Regex object, or NULL in case of error.
 */
CL_Regex
cl_new_regex(char *regex, int flags, CorpusCharset charset)
{
  char *preprocessed_regex; /* allocate dynamically to support very long regexps (from RE() operator) */
  char *anchored_regex;
  CL_Regex rx;
  int optimised, l;

  int options_for_pcre = 0;
  const char *errstring_for_pcre = NULL;
  int erroffset_for_pcre = 0;

  /* allocate temporary strings */
  l = strlen(regex);
  preprocessed_regex = (char *) cl_malloc(l + 1);
  anchored_regex = (char *) cl_malloc(l + 5); /* room for the added "^(" and ")$" plus the terminator */

  /* allocate and initialise CL_Regex object */
  rx = (CL_Regex) cl_malloc(sizeof(struct _CL_Regex));
  rx->needle = NULL;
  rx->extra = NULL;
  rx->haystack_buf = NULL;
  rx->charset = charset;
  rx->flags = flags & (IGNORE_CASE | IGNORE_DIAC); /* mask unsupported flags */
  rx->grains = 0; /* indicates no optimisation -> other opt. fields are invalid */

  /* pre-process regular expression (translate latex escapes and normalise) */
  cl_string_latex2iso(regex, preprocessed_regex, l);
  cl_string_canonical(preprocessed_regex, charset, rx->flags);

  /* add start and end anchors to improve performance of regex matcher for expressions such as ".*ung" */
  sprintf(anchored_regex, "^(%s)$", preprocessed_regex);

  /* compile regular expression with PCRE library function */
  if (charset == utf8) {
    if (cl_debug)
     Rprintf( "CL: enabling PCRE's UTF8 mode for regex %s\n", anchored_regex);
    /* note we assume all strings have been checked upon input (i.e. indexing or by the parser) */
    options_for_pcre = PCRE_UTF8|PCRE_NO_UTF8_CHECK;
    /* we do our own case folding, so we don't need the PCRE_CASELESS flag */
  }
  rx->needle = pcre_compile(anchored_regex, options_for_pcre, &errstring_for_pcre, &erroffset_for_pcre, NULL);
  if (rx->needle == NULL) {
    /* NOTE(review): unbounded copy; assumes cl_regex_error is large enough for any PCRE message */
    strcpy(cl_regex_error, errstring_for_pcre);
   Rprintf( "CL: Regex Compile Error: %s\n", cl_regex_error);
    cl_free(rx);
    cl_free(preprocessed_regex);
    cl_free(anchored_regex);
    cl_errno = CDA_EBADREGEX;
    return NULL;
  }
  else if (cl_debug)
   Rprintf( "CL: Regex compiled successfully using PCRE library\n");

  /* always use pcre_study because nearly all our regexes are going to be used lots of times;
   * note that according to man pcre, the optimisation methods are different to those used by
   * the CL's regex optimiser. So it is all good. */
  rx->extra = pcre_study(rx->needle, 0, &errstring_for_pcre);
  if (errstring_for_pcre != NULL) {
    rx->extra = NULL;
    if (cl_debug)
     Rprintf( "CL: calling pcre_study failed with message...\n   %s\n", errstring_for_pcre);
    /* note that failure of pcre_study is not a critical error, we can just continue without
       the extra info */
  }
  if (cl_debug && rx->extra)
   Rprintf( "CL: calling pcre_study produced useful information...\n");

  /* allocate string buffer for cl_regex_match() function if (supported) flags are present;
   * test the masked rx->flags, because that is what the matcher checks before using the buffer */
  if (rx->flags)
    rx->haystack_buf = (char *) cl_malloc(CL_MAX_LINE_LENGTH); /* this is for the string being matched, not the regex! */

  /* attempt to optimise regular expression */
  optimised = cl_regopt_analyse(preprocessed_regex);
  if (optimised) {
    /* copy optimiser data to CL_Regex object */
    regopt_data_copy_to_regex_object(rx);
  }

  cl_free(preprocessed_regex);
  cl_free(anchored_regex);
  cl_errno = CDA_OK;
  return rx;
}
Example #4
0
/**
 * Creates feature maps for a source/target corpus pair.
 *
 * This is the constructor function for the FMS class.
 *
 * Example usage:
 *
 * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s);
 *
 * @param config              array of strings representing the feature map configuration.
 * @param config_lines        the number of configuration items stored in config_data.
 * @param w_attr1             The p-attribute in the first corpus to link.
 * @param w_attr2             The p-attribute in the second corpus to link.
 * @param s_attr1             The s-attribute in the first corpus to link.
 * @param s_attr2             The s-attribute in the second corpus to link.
 * @return                    the new FMS object.
 */
FMS
create_feature_maps(char **config,
                    int config_lines,
                    Attribute *w_attr1,
                    Attribute *w_attr2,
                    Attribute *s_attr1,
                    Attribute *s_attr2
                    ) 
{
  FMS r;

  unsigned int *fcount1, *fcount2;    /* arrays for types in the lexicons of the source
                                       * & target corpora, respectively, counting how often each is used
                                       * in a feature */

  int config_pointer;

  char *b, command[CL_MAX_LINE_LENGTH], dummy[CL_MAX_LINE_LENGTH];

  int current_feature;
  int weight;                         /* holds the weight assigned to the feature(s) we're working on */
  int need_to_abort;                  /* boolean used during pointer check */

  /* after we have counted up features, these will become arrays of ints, with one entry per feature */
  int *fs1, *fs2; 

  int i;
  int nw1;  /* number of types on the word-attribute of the source corpus */
  int nw2;  /* number of types on the word-attribute of the target corpus */

  /* one last variable: we need to know the character set of the two corpora for assorted purposes */
  CorpusCharset charset;
  charset = cl_corpus_charset(cl_attribute_mother_corpus(w_attr1));

  /* first, create the FMS object. */
  r = (FMS) malloc(sizeof(feature_maps_t));
  assert(r);

  /* copy in the attribute pointers */
  r->att1 = w_attr1;
  r->att2 = w_attr2;
  r->s1 = s_attr1;
  r->s2 = s_attr2;

  init_char_map();
  
  /* find out how many different word-types occur on each of the p-attributes */
  nw1 = cl_max_id(w_attr1);
  if (nw1 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of source corpus\n");
    exit(1);
  }
  nw2 = cl_max_id(w_attr2);
  if (nw2 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of target corpus\n");
    exit(1);
  }
  
  printf("LEXICON SIZE: %d / %d\n", nw1, nw2);

  fcount1 = (unsigned int*) calloc(nw1 + 1, sizeof(unsigned int));
  fcount2 = (unsigned int*) calloc(nw2 + 1, sizeof(unsigned int));

  r->n_features = 1;


  /* NOTE there are two passes through the creation of feature maps - two sets of nearly identical code!
   * First pass to see how many things we need ot count, second pass to count them. */

  /* process feature map configuration: first pass */
  for (config_pointer = 0; config_pointer < config_lines; config_pointer++) {

    /* strip newline and comments */
    if ( (b = strpbrk(config[config_pointer],"\n#")) )
      *b = 0;

    if (sscanf(config[config_pointer], "%s", command) > 0) {
      if(command[0] == '-') {
        /*
         * These are the FIRST PASS options for the different config lines.
         *
         * Possible config commands: -S -W -C -1 -2 -3 -4
         */
        switch(command[1]) {
        /* -S : the "shared words" type of feature */
        case 'S': {
          int i1, i2; /* i1 and i2 are temporary indexes into the lexicons of the two corpora */
          int f1, f2; /* f1 and f2 are temporary storage for frequencies from the corpus lexicons */
          float threshold;
          int n_shared = 0; /* numebr fo shared words - only calculated for the purpose of printing it */

          if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy) != 3) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -S:<weight>:<threshold>\n");
            fprintf(stderr,"  Shared words with freq. ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n");
            exit(1);
          }
          else {
            printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight);
            fflush(stdout);

            /* for each type in target corpus, get its frequency, and the corresponding id and frequency
             * from the target corpus, then test whether it meets the criteria for use as a feature. */
            for (i1 = 0; i1 < nw1; i1++) {
              f1 = cl_id2freq(w_attr1, i1);
              i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1));
              if (i2 >= 0){
                f2 = cl_id2freq(w_attr2, i2);
                /* if it will be used as a feature, increment counts of features in various places */
                if ( (f1 / (0.0+f1+f2)) >= threshold && (f2 / (0.0+f1+f2)) >= threshold){
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_shared++;
                  r->n_features++; 
                }
              }
            }
            printf("[%d]\n", n_shared);
          }
          break;
        }
        /* -1 to -4 : shared character sequences (of 1 letter to 4 letters in length) as features */
        case '1': 
        case '2':
        case '3':
        case '4': { 
          int n; /* length of the n-gram, obviously */
          
          if (sscanf(config[config_pointer], "%1s%d:%d %s", command, &n, &weight, dummy) !=3 ) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -<n>:<weight>  (n = 1..4)\n");
            fprintf(stderr,"  Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n");
            exit(1);
          }
          else if(n <= 0 || n > 4) {
            /* this shouldn't happen anyway */
            fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n", n, config[config_pointer]);
            exit(1);
          }
          else {
            int i,f,l; /* temp storage for lexicon index, n of possible features, && word length */
            char *s;

            printf("FEATURE: %d-grams, weight=%d ... ", n, weight);
            fflush(stdout);

            /* for each entry in source-corpus lexicon, add to the number of features IFF
             * that lexicon entry is longer than 4 characters */
            for(i = 0; i < nw1; i++) {
              /* l = cl_id2strlen(w_attr1, i); */
              s = (unsigned char *) cl_strdup(cl_id2str(w_attr1, i));
              cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC);
              l = strlen(s);
              cl_free(s);
              fcount1[i] += (l >= n) ? l - n + 1 : 0;
            }
            /* same for target corpus */
            for(i = 0; i < nw2; i++) {
              /* l = cl_id2strlen(w_attr2, i); */
              s = (unsigned char *) cl_strdup(cl_id2str(w_attr2, i));
              cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC);
              l = strlen(s);
              cl_free(s);
              fcount2[i] += (l >= n) ? l - n + 1 : 0;
            }
            /* set f to number of possible features (= number of possible characters to the power of n) */
            f = 1;
            for(i = 0 ; i < n; i++)
              f *= char_map_range;
            /* anmd add that to our total number of features! */
            r->n_features += f;
            printf("[%d]\n", f);
          }
          break;
        }
        /* -W: the word-translation-equivalence type of feature */
        case 'W': {
          char filename[CL_MAX_LINE_LENGTH],
            word1[CL_MAX_LINE_LENGTH],
            word2[CL_MAX_LINE_LENGTH];
          FILE *wordlist;
          int nw;      /* number of words scanned from an input line */
          int nl = 0;  /* counter for the number of lines in the wordlist file we have gone through */
          int i1,i2;   /* lexicon ids in source and target corpora */
          int n_matched = 0;  /* counter for n of lines in input file that can be used as a feature. */

          if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -W:<weight>:<filename>\n");
            fprintf(stderr, "  Word list (read from file <filename>).\n");
            exit(1);
          }
          else if(!(wordlist = fopen(filename,"r"))) {
            fprintf(stderr,"ERROR: Cannot read word list file %s.\n",
                    filename);
            exit(-1);
          }
          else {
            printf("FEATURE: word list %s, weight=%d ... ", filename, weight);
            fflush(stdout);
            while((nw = fscanf(wordlist,"%s %s",word1,word2))>0) {
              /* on first line of file, skip UTF8 byte-order-mark if present */
              if (nl == 0 && charset == utf8 && strlen(word1) > 3)
                if (word1[0] == (char)0xEF && word1[1] == (char)0xBB && word1[2] == (char)0xBF)
                   cl_strcpy(word1, (word1 + 3));
              nl++;
              /* check that both word 1 and word 2 are valid for the encoding of the corpora */
              if (! (cl_string_validate_encoding(word1, charset, 0)
                  && cl_string_validate_encoding(word2, charset, 0)) ) {
                fprintf(stderr, "ERROR: character encoding error in the word-list input file with the input word list.\n");
                fprintf(stderr, "       (The error occurs on line %d.)\n", nl);
                exit(1);
              }
              if (nw != 2)
                fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n",nl,filename,nw);
              else {
                /* if word1 and word2 both occur in their respective corpora, this is a feature. */
                if(   (i1 = cl_str2id(w_attr1, word1)) >= 0
                   && (i2 = cl_str2id(w_attr2, word2)) >= 0 ) {
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_matched++;
                  r->n_features++;
                }
              }
            }
            fclose(wordlist);
            printf("[%d]\n", n_matched);
          }         
          break;
        }
        /* -C: the character count type of feature.
         * This feature exists for EVERY word type. */
        case 'C': 
          if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -C:<weight>\n");
            fprintf(stderr, "  Character count [primary feature].\n");
            exit(1);
          }
          else {
            /* primary feature -> don't create additional features */
            /* first entry in a token's feature list is character count */ 
            for (i=0; i<nw1; i++)
              fcount1[i]++;
            for (i=0; i<nw2; i++)
              fcount2[i]++;
            printf("FEATURE: character count, weight=%d ... [1]\n", weight);
          }
          break;
        default:
          fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]);
          exit(1);
          break;
        }
      }
      else {
        fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]);
        exit(1);
      }
    }
  }