/**
 * Matches a regular expression against a string.
 *
 * The regular expression contained in the CL_Regex is compared to the string.
 * No settings or flags are passed to this function; rather, the
 * settings that rx was created with are used.
 *
 * If rx was created with case/diacritic folding flags, the string is first
 * copied into rx->haystack_buf and normalised there. That buffer holds
 * CL_MAX_LINE_LENGTH bytes (see cl_new_regex()), so longer strings are
 * truncated to fit (on a character boundary for UTF-8 data) instead of
 * overflowing the buffer.
 *
 * @param rx   The regular expression to match.
 * @param str  The string to compare the regex to.
 * @return     Boolean: true if the regex matched, otherwise false.
 */
int
cl_regex_match(CL_Regex rx, char *str)
{
  char *haystack;               /* either the original string to match against, or a pointer to rx->haystack_buf */
  int optimised = (rx->grains > 0);
  int i, di, k, max_i, len, jump;
  int grain_match, result;
  int ovector[30];              /* memory for pcre to use for back-references in pattern matches */

  if (rx->flags) {
    /* normalise input string if necessary; work on a bounded private copy */
    size_t n = strlen(str);

    haystack = rx->haystack_buf;
    if (n >= (size_t) CL_MAX_LINE_LENGTH) {
      n = (size_t) CL_MAX_LINE_LENGTH - 1;
      /* pcre_exec() below runs with PCRE_NO_UTF8_CHECK, so never hand it a
       * string cut in the middle of a multi-byte sequence: back off while the
       * first dropped byte is a UTF-8 continuation byte (10xxxxxx) */
      if (rx->charset == utf8)
        while (n > 0 && (((unsigned char) str[n]) & 0xC0) == 0x80)
          n--;
    }
    memcpy(haystack, str, n);
    haystack[n] = '\0';
    cl_string_canonical(haystack, rx->charset, rx->flags);
  }
  else
    haystack = str;

  len = strlen(haystack);

  if (optimised) {
    /* this 'optimised' matcher may look fairly bloated, but it's still way ahead of POSIX regexen */
    grain_match = 0;
    max_i = len - rx->grain_len;        /* stop trying to match when i > max_i */
    if (rx->anchor_end)
      i = (max_i >= 0) ? max_i : 0;     /* if anchored at end, align grains with end of string */
    else
      i = 0;

    /* i is the string offset where the first character of each grain would be */
    while (i <= max_i) {
      /* look up the skip distance for the character aligned with the last grain position */
      jump = rx->jumptable[(unsigned char) haystack[i + rx->grain_len - 1]];
      if (jump > 0) {
        i += jump;                      /* Boyer-Moore search */
      }
      else {
        /* for each grain: compare it character by character at offset i */
        for (k = 0; k < rx->grains; k++) {
          di = 0;
          while ((di < rx->grain_len) && (rx->grain[k][di] == haystack[i + di]))
            di++;
          if (di >= rx->grain_len) {
            grain_match = 1;
            break;                      /* we have found a grain match and can quit the loop */
          }
        }
        /* NOTE: the break above only leaves the for-loop; scanning continues at i + 1 */
        i++;
      }
      if (rx->anchor_start)
        break;                          /* if anchored at start, only the first iteration can match */
    }
  } /* endif optimised */
  else
    /* if the regex is not optimised, always behave as if a grain was matched */
    grain_match = 1;

  /* If there was a grain match, PCRE might or might not find a match in the end;
   * if there wasn't, we know that PCRE won't match. */
  if (!grain_match) {
    /* enabled since version 2.2.b94 (14 Feb 2006) -- before: && cl_optimize */
    cl_regopt_successes++;
    result = PCRE_ERROR_NOMATCH;        /* the return code from PCRE when there is, um, no match */
  }
  /* As written (#if 0), the "if (1)" variant below is compiled, i.e. pcre_exec()
   * is ALWAYS called regardless of whether the grains matched; this is what lets
   * the sanity check further down verify that the grains behave as they should.
   * Switching to "#if 1" activates the "else", so that PCRE is skipped whenever
   * the optimiser has already ruled out a match. */
#if 0
  else {
#else
  if (1) {
#endif
    result = pcre_exec(rx->needle, rx->extra, haystack, len, 0, PCRE_NO_UTF8_CHECK, ovector, 30);
    /* note, "no match" is a PCRE "error", but all actual errors are lower numbers */
    if (result < PCRE_ERROR_NOMATCH && cl_debug)
      Rprintf( "CL: Regex Execute Error no. %d (see `man pcreapi` for error codes)\n", result);
  }

#if 1
  /* debugging code used before version 2.2.b94, modified to pcre return values & re-enabled in 3.2.b3 */
  /* check for critical error: optimiser didn't accept candidate, but regex matched */
  if ((result > 0) && !grain_match)
    Rprintf( "CL ERROR: regex optimiser did not accept '%s' although it should have!\n", haystack);
#endif

  return (result > 0);                  /* return true if regular expression matched */
}

/**
 * Deletes a CL_Regex object.
 *
 * The PCRE objects (compiled pattern and "extra" study data) are released
 * with pcre_free(), NOT with cl_free(): pcre_free is a function pointer that
 * will normally just be free(), but it might not be, so PCRE is left to deal
 * with its own opaque objects. Everything allocated by the CL itself (the
 * haystack buffer, the grain strings and the object itself) is released with
 * cl_free().
 *
 * @param rx  The CL_Regex to delete. Passing NULL is a no-op.
 */
void
cl_delete_regex(CL_Regex rx)
{
  int i;

  if (!rx)
    return;

  /* Test the PCRE pointers for non-nullity before calling pcre_free().
   * Normally we would also set the pointers to NULL after freeing the target;
   * here the structure they belong to is freed by the end of the function,
   * so there is no need. */
  if (rx->needle)
    pcre_free(rx->needle);              /* free PCRE regex buffer */
  if (rx->extra)
    pcre_free(rx->extra);               /* and "extra" buffer */

  cl_free(rx->haystack_buf);            /* free string buffer if it was allocated */
  for (i = 0; i < rx->grains; i++)
    cl_free(rx->grain[i]);              /* free grain strings if regex was optimised */
  cl_free(rx);
}
/*
 * Tabulate a range of matches of a named query result, using the settings
 * held in the global list of tabulation items (TabulationList).
 *
 * For every match in first .. last (clamped to the size of the query result)
 * one output line is written to the redirected stream: tabulation items are
 * separated by TABs, multiple values within one item by blanks.
 *
 * Returns 1 if tabulation was successful and 0 otherwise. Attribute lookup
 * and redirection failures are reported through cqpmessage(). The global
 * tabulation list is released only on the successful path.
 */
int
print_tabulation(CorpusList *cl, int first, int last, struct Redir *rd)
{
  TabulationItem col;
  int match_no;

  if (!cl)
    return 0;

  /* make sure that first and last match to tabulate are in range */
  if (first <= 0)
    first = 0;
  if (last >= cl->size)
    last = cl->size - 1;

  /* step 1: obtain attribute handles for all tabulation items and check their anchors */
  for (col = TabulationList; col; col = col->next) {
    if (!col->attribute_name) {
      /* no attribute -> print corpus position */
      col->attribute_type = ATT_NONE;
    }
    else {
      /* try a positional attribute first, then fall back to a structural one */
      col->attribute = cl_new_attribute(cl->corpus, col->attribute_name, ATT_POS);
      if (col->attribute) {
        col->attribute_type = ATT_POS;
      }
      else {
        col->attribute = cl_new_attribute(cl->corpus, col->attribute_name, ATT_STRUC);
        if (!col->attribute) {
          cqpmessage(Error, "Can't find attribute ``%s'' for named query %s", col->attribute_name, cl->name);
          return 0;
        }
        col->attribute_type = ATT_STRUC;
        if (!cl_struc_values(col->attribute)) {
          cqpmessage(Error, "No annotated values for s-attribute ``%s'' in named query %s", col->attribute_name, cl->name);
          return 0;
        }
      }
    }

    /* work around bug: anchor validation will fail for empty query result
     * (but then the output loop below is void anyway) */
    if (cl->size > 0
        && (!pt_validate_anchor(cl, col->anchor1) || !pt_validate_anchor(cl, col->anchor2)))
      return 0;
  }

  if (!open_stream(rd, cl->corpus->charset)) {
    cqpmessage(Error, "Can't redirect output to file or pipe\n");
    return 0;
  }

  /* step 2: tabulate selected attribute values for matches first .. last */
  for (match_no = first; match_no <= last; match_no++) {
    for (col = TabulationList; col; col = col->next) {
      int from = pt_get_anchor_cpos(cl, match_no, col->anchor1, col->offset1);
      int to = pt_get_anchor_cpos(cl, match_no, col->anchor2, col->offset2);
      int cpos;

      /* one of the anchors is undefined -> print single undefined value for entire range */
      if (from < 0 || to < 0)
        from = to = -1;

      for (cpos = from; cpos <= to; cpos++) {
        if (col->attribute_type == ATT_NONE) {
          fprintf(rd->stream, "%d", cpos);
        }
        else if (cpos >= 0) {
          /* (undefined anchors print the empty string, i.e. nothing at all) */
          char *value = (col->attribute_type == ATT_POS)
                          ? cl_cpos2str(col->attribute, cpos)
                          : cl_cpos2struc2str(col->attribute, cpos);

          if (value) {
            if (!col->flags) {
              fprintf(rd->stream, "%s", value);
            }
            else {
              /* normalise a private copy so the string handed back by the CL is left untouched */
              char *folded = cl_strdup(value);
              cl_string_canonical(folded, cl->corpus->charset, col->flags);
              fprintf(rd->stream, "%s", folded);
              cl_free(folded);
            }
          }
        }

        /* multiple values for tabulation item are separated by blanks */
        if (cpos < to)
          fprintf(rd->stream, " ");
      }

      /* multiple tabulation items are separated by TABs */
      if (col->next)
        fprintf(rd->stream, "\t");
    }
    fprintf(rd->stream, "\n");
  }

  close_stream(rd);
  free_tabulation_list();

  return 1;
}
/**
 * Create a new CL_regex object (ie a regular expression buffer).
 *
 * The regular expression is preprocessed according to the flags, and
 * anchored to the start and end of the string. (That is, the expression
 * is wrapped as ^(...)$ before compilation.)
 *
 * Then the resulting regex is compiled (using PCRE), studied with
 * pcre_study(), and passed to the CL's own regex optimiser.
 *
 * On a compile error the PCRE message is copied to cl_regex_error and printed,
 * cl_errno is set to CDA_EBADREGEX and NULL is returned; on success cl_errno
 * is set to CDA_OK. A failure of pcre_study() is not treated as an error.
 *
 * @param regex    String containing the regular expression
 * @param flags    IGNORE_CASE, or IGNORE_DIAC, or both, or 0
 *                 (any other bits are masked out and ignored).
 * @param charset  The character set of the regex.
 * @return         The new CL_Regex object, or NULL in case of error.
 */
CL_Regex
cl_new_regex(char *regex, int flags, CorpusCharset charset)
{
  char *preprocessed_regex;     /* allocate dynamically to support very long regexps (from RE() operator) */
  char *anchored_regex;
  CL_Regex rx;
  int optimised, l;
  int options_for_pcre = 0;
  const char *errstring_for_pcre = NULL;
  int erroffset_for_pcre = 0;

  /* allocate temporary strings; the anchored copy needs 4 extra bytes for "^(" and ")$" plus the NUL.
   * NOTE(review): this sizing assumes that preprocessing never makes the string longer than the input -- confirm */
  l = strlen(regex);
  preprocessed_regex = (char *) cl_malloc(l + 1);
  anchored_regex = (char *) cl_malloc(l + 5);

  /* allocate and initialise CL_Regex object */
  rx = (CL_Regex) cl_malloc(sizeof(struct _CL_Regex));
  rx->haystack_buf = NULL;
  rx->charset = charset;
  rx->flags = flags & (IGNORE_CASE | IGNORE_DIAC);      /* mask unsupported flags */
  rx->grains = 0;               /* indicates no optimisation -> other opt. fields are invalid */

  /* pre-process regular expression (translate latex escapes and normalise) */
  cl_string_latex2iso(regex, preprocessed_regex, l);
  cl_string_canonical(preprocessed_regex, charset, rx->flags);

  /* add start and end anchors to improve performance of regex matcher for expressions such as ".*ung" */
  sprintf(anchored_regex, "^(%s)$", preprocessed_regex);

  /* compile regular expression with PCRE library function */
  if (charset == utf8) {
    if (cl_debug)
      Rprintf( "CL: enabling PCRE's UTF8 mode for regex %s\n", anchored_regex);
    /* note we assume all strings have been checked upon input (i.e. indexing or by the parser) */
    options_for_pcre = PCRE_UTF8|PCRE_NO_UTF8_CHECK;
    /* we do our own case folding, so we don't need the PCRE_CASELESS flag */
  }
  rx->needle = pcre_compile(anchored_regex, options_for_pcre, &errstring_for_pcre, &erroffset_for_pcre, NULL);
  if (rx->needle == NULL) {
    /* NOTE(review): unbounded copy; the size of cl_regex_error is not visible from here */
    strcpy(cl_regex_error, errstring_for_pcre);
    Rprintf( "CL: Regex Compile Error: %s\n", cl_regex_error);
    cl_free(rx);
    cl_free(preprocessed_regex);
    cl_free(anchored_regex);
    cl_errno = CDA_EBADREGEX;
    return NULL;
  }
  else if (cl_debug)
    Rprintf( "CL: Regex compiled successfully using PCRE library\n");

  /* always use pcre_study because nearly all our regexes are going to be used lots of times;
   * note that according to man pcre, the optimisation methods are different to those used by
   * the CL's regex optimiser. So it is all good. */
  rx->extra = pcre_study(rx->needle, 0, &errstring_for_pcre);
  if (errstring_for_pcre != NULL) {
    /* note that failure of pcre_study is not a critical error, we can just continue without the extra info */
    rx->extra = NULL;
    if (cl_debug)
      Rprintf( "CL: calling pcre_study failed with message...\n %s\n", errstring_for_pcre);
  }
  if (cl_debug && rx->extra)
    Rprintf( "CL: calling pcre_study produced useful information...\n");

  /* allocate string buffer for cl_regex_match() function if (supported) flags are present;
   * cl_regex_match() only touches the buffer when rx->flags is non-zero, so test the masked value
   * rather than the raw parameter. This is for the string being matched, not the regex! */
  if (rx->flags)
    rx->haystack_buf = (char *) cl_malloc(CL_MAX_LINE_LENGTH);

  /* attempt to optimise regular expression */
  optimised = cl_regopt_analyse(preprocessed_regex);
  if (optimised) {
    /* copy optimiser data to CL_Regex object */
    regopt_data_copy_to_regex_object(rx);
  }

  cl_free(preprocessed_regex);
  cl_free(anchored_regex);
  cl_errno = CDA_OK;
  return rx;
}
/** * Creates feature maps for a source/target corpus pair. * * This is the constructor function for the FMS class. * * Example usage: * * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s); * * @param config array of strings representing the feature map configuration. * @param config_lines the number of configuration items stored in config_data. * @param w_attr1 The p-attribute in the first corpus to link. * @param w_attr2 The p-attribute in the second corpus to link. * @param s_attr1 The s-attribute in the first corpus to link. * @param s_attr2 The s-attribute in the second corpus to link. * @return the new FMS object. */ FMS create_feature_maps(char **config, int config_lines, Attribute *w_attr1, Attribute *w_attr2, Attribute *s_attr1, Attribute *s_attr2 ) { FMS r; unsigned int *fcount1, *fcount2; /* arrays for types in the lexicons of the source * & target corpora, respectively, counting how often each is used * in a feature */ int config_pointer; char *b, command[CL_MAX_LINE_LENGTH], dummy[CL_MAX_LINE_LENGTH]; int current_feature; int weight; /* holds the weight assigned to the feature(s) we're working on */ int need_to_abort; /* boolean used during pointer check */ /* after we have counted up features, these will become arrays of ints, with one entry per feature */ int *fs1, *fs2; int i; int nw1; /* number of types on the word-attribute of the source corpus */ int nw2; /* number of types on the word-attribute of the target corpus */ /* one last variable: we need to know the character set of the two corpora for assorted purposes */ CorpusCharset charset; charset = cl_corpus_charset(cl_attribute_mother_corpus(w_attr1)); /* first, create the FMS object. 
*/ r = (FMS) malloc(sizeof(feature_maps_t)); assert(r); /* copy in the attribute pointers */ r->att1 = w_attr1; r->att2 = w_attr2; r->s1 = s_attr1; r->s2 = s_attr2; init_char_map(); /* find out how many different word-types occur on each of the p-attributes */ nw1 = cl_max_id(w_attr1); if (nw1 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of source corpus\n"); exit(1); } nw2 = cl_max_id(w_attr2); if (nw2 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of target corpus\n"); exit(1); } printf("LEXICON SIZE: %d / %d\n", nw1, nw2); fcount1 = (unsigned int*) calloc(nw1 + 1, sizeof(unsigned int)); fcount2 = (unsigned int*) calloc(nw2 + 1, sizeof(unsigned int)); r->n_features = 1; /* NOTE there are two passes through the creation of feature maps - two sets of nearly identical code! * First pass to see how many things we need ot count, second pass to count them. */ /* process feature map configuration: first pass */ for (config_pointer = 0; config_pointer < config_lines; config_pointer++) { /* strip newline and comments */ if ( (b = strpbrk(config[config_pointer],"\n#")) ) *b = 0; if (sscanf(config[config_pointer], "%s", command) > 0) { if(command[0] == '-') { /* * These are the FIRST PASS options for the different config lines. * * Possible config commands: -S -W -C -1 -2 -3 -4 */ switch(command[1]) { /* -S : the "shared words" type of feature */ case 'S': { int i1, i2; /* i1 and i2 are temporary indexes into the lexicons of the two corpora */ int f1, f2; /* f1 and f2 are temporary storage for frequencies from the corpus lexicons */ float threshold; int n_shared = 0; /* numebr fo shared words - only calculated for the purpose of printing it */ if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy) != 3) { fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -S:<weight>:<threshold>\n"); fprintf(stderr," Shared words with freq. 
ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n"); exit(1); } else { printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight); fflush(stdout); /* for each type in target corpus, get its frequency, and the corresponding id and frequency * from the target corpus, then test whether it meets the criteria for use as a feature. */ for (i1 = 0; i1 < nw1; i1++) { f1 = cl_id2freq(w_attr1, i1); i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1)); if (i2 >= 0){ f2 = cl_id2freq(w_attr2, i2); /* if it will be used as a feature, increment counts of features in various places */ if ( (f1 / (0.0+f1+f2)) >= threshold && (f2 / (0.0+f1+f2)) >= threshold){ fcount1[i1]++; fcount2[i2]++; n_shared++; r->n_features++; } } } printf("[%d]\n", n_shared); } break; } /* -1 to -4 : shared character sequences (of 1 letter to 4 letters in length) as features */ case '1': case '2': case '3': case '4': { int n; /* length of the n-gram, obviously */ if (sscanf(config[config_pointer], "%1s%d:%d %s", command, &n, &weight, dummy) !=3 ) { fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -<n>:<weight> (n = 1..4)\n"); fprintf(stderr," Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n"); exit(1); } else if(n <= 0 || n > 4) { /* this shouldn't happen anyway */ fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n", n, config[config_pointer]); exit(1); } else { int i,f,l; /* temp storage for lexicon index, n of possible features, && word length */ char *s; printf("FEATURE: %d-grams, weight=%d ... ", n, weight); fflush(stdout); /* for each entry in source-corpus lexicon, add to the number of features IFF * that lexicon entry is longer than 4 characters */ for(i = 0; i < nw1; i++) { /* l = cl_id2strlen(w_attr1, i); */ s = (unsigned char *) cl_strdup(cl_id2str(w_attr1, i)); cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC); l = strlen(s); cl_free(s); fcount1[i] += (l >= n) ? 
l - n + 1 : 0; } /* same for target corpus */ for(i = 0; i < nw2; i++) { /* l = cl_id2strlen(w_attr2, i); */ s = (unsigned char *) cl_strdup(cl_id2str(w_attr2, i)); cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC); l = strlen(s); cl_free(s); fcount2[i] += (l >= n) ? l - n + 1 : 0; } /* set f to number of possible features (= number of possible characters to the power of n) */ f = 1; for(i = 0 ; i < n; i++) f *= char_map_range; /* anmd add that to our total number of features! */ r->n_features += f; printf("[%d]\n", f); } break; } /* -W: the word-translation-equivalence type of feature */ case 'W': { char filename[CL_MAX_LINE_LENGTH], word1[CL_MAX_LINE_LENGTH], word2[CL_MAX_LINE_LENGTH]; FILE *wordlist; int nw; /* number of words scanned from an input line */ int nl = 0; /* counter for the number of lines in the wordlist file we have gone through */ int i1,i2; /* lexicon ids in source and target corpora */ int n_matched = 0; /* counter for n of lines in input file that can be used as a feature. */ if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -W:<weight>:<filename>\n"); fprintf(stderr, " Word list (read from file <filename>).\n"); exit(1); } else if(!(wordlist = fopen(filename,"r"))) { fprintf(stderr,"ERROR: Cannot read word list file %s.\n", filename); exit(-1); } else { printf("FEATURE: word list %s, weight=%d ... ", filename, weight); fflush(stdout); while((nw = fscanf(wordlist,"%s %s",word1,word2))>0) { /* on first line of file, skip UTF8 byte-order-mark if present */ if (nl == 0 && charset == utf8 && strlen(word1) > 3) if (word1[0] == (char)0xEF && word1[1] == (char)0xBB && word1[2] == (char)0xBF) cl_strcpy(word1, (word1 + 3)); nl++; /* check that both word 1 and word 2 are valid for the encoding of the corpora */ if (! 
(cl_string_validate_encoding(word1, charset, 0) && cl_string_validate_encoding(word2, charset, 0)) ) { fprintf(stderr, "ERROR: character encoding error in the word-list input file with the input word list.\n"); fprintf(stderr, " (The error occurs on line %d.)\n", nl); exit(1); } if (nw != 2) fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n",nl,filename,nw); else { /* if word1 and word2 both occur in their respective corpora, this is a feature. */ if( (i1 = cl_str2id(w_attr1, word1)) >= 0 && (i2 = cl_str2id(w_attr2, word2)) >= 0 ) { fcount1[i1]++; fcount2[i2]++; n_matched++; r->n_features++; } } } fclose(wordlist); printf("[%d]\n", n_matched); } break; } /* -C: the character count type of feature. * This feature exists for EVERY word type. */ case 'C': if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -C:<weight>\n"); fprintf(stderr, " Character count [primary feature].\n"); exit(1); } else { /* primary feature -> don't create additional features */ /* first entry in a token's feature list is character count */ for (i=0; i<nw1; i++) fcount1[i]++; for (i=0; i<nw2; i++) fcount2[i]++; printf("FEATURE: character count, weight=%d ... [1]\n", weight); } break; default: fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]); exit(1); break; } } else { fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]); exit(1); } } }