/* one might wish to add extensive error checking to all the CL functions, but that will need a LOT of code! */ void do_cqi_cl_str2id(void) { char **strlist; int len, i, id; char *a; Attribute *attribute; a = cqi_read_string(); len = cqi_read_string_list(&strlist); if (server_debug) { Rprintf( "CQi: CQI_CL_STR2ID('%s', [", a); for (i=0; i<len; i++) Rprintf( "'%s' ", strlist[i]); Rprintf( "])\n"); } attribute = cqi_lookup_attribute(a, ATT_POS); if (attribute == NULL) { cqi_command(cqi_errno); } else { /* we assemble the CQI_DATA_INT_LIST() return command by hand, so we don't have to allocate a temporary list */ cqi_send_word(CQI_DATA_INT_LIST); cqi_send_int(len); /* list size */ for (i=0; i<len; i++) { id = cl_str2id(attribute, strlist[i]); if (id < 0) id = -1; /* -1 => string not found in lexicon */ cqi_send_int(id); } } cqi_flush(); if (strlist != NULL) free(strlist); /* don't forget to free allocated memory */ free(a); }
/*
 * ------------------------------------------------------------------------
 *
 * "rcqpCmd_str2id(SEXP inAttribute, SEXP inStrs)" --
 *
 * Looks up the lexicon ID of each string in inStrs on the positional
 * attribute (ATT_POS) named by inAttribute.
 *
 * inAttribute  character vector of length 1: the attribute name.
 * inStrs       character vector: the strings to look up.
 *
 * Returns an integer vector with one element per entry of inStrs, holding
 * the value returned by cl_str2id(); every negative result is stored as -1.
 *
 * Raises an R error if either argument has the wrong type/length. If the
 * attribute lookup fails, rcqp_error_code(cqi_errno) is called; should that
 * call return to us (presumably it raises an R error -- confirm), the
 * function returns R_NilValue.
 *
 * ------------------------------------------------------------------------
 */
SEXP rcqpCmd_str2id(SEXP inAttribute, SEXP inStrs)
{
	SEXP result;
	int *ids;
	int idx;
	int len, i;
	char *a, *str;
	Attribute *attribute;

	if (!isString(inAttribute) || length(inAttribute) != 1)
		error("argument 'attribute' must be a string");
	PROTECT(inAttribute);
	if (!isString(inStrs))
		error("argument 'strs' must be a vector of strings");
	PROTECT(inStrs);

	a = (char*)CHAR(STRING_ELT(inAttribute,0));
	len = length(inStrs);

	attribute = cqi_lookup_attribute(a, ATT_POS);
	if (attribute == NULL) {
		/* Balance the two PROTECTs above and leave right here. Falling
		 * through to the common UNPROTECT(3) below would pop three more
		 * entries than this function pushed if rcqp_error_code() returns. */
		UNPROTECT(2);
		rcqp_error_code(cqi_errno);
		return R_NilValue;
	}

	result = PROTECT(allocVector(INTSXP, len));
	ids = INTEGER(result);	/* fetch the data pointer once, not per element */

	for (i = 0; i < len; i++) {
		str = (char*)CHAR(STRING_ELT(inStrs,i));
		idx = cl_str2id(attribute, str);
		if (idx < 0) {
			idx = -1;	/* normalise every failure code to -1 */
		}
		ids[i] = idx;
	}

	UNPROTECT(3);	/* inAttribute, inStrs, result */

	return result;
}
/** * Creates feature maps for a source/target corpus pair. * * Example usage: * * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s); * * @param config pointer to a list of strings representing the feature map configuration. * @param config_lines the number of configuration items stored in config_data. * @param w_attr1 The p-attribute in the first corpus to link. * @param w_attr2 The p-attribute in the second corpus to link. * @param s_attr1 The s-attribute in the first corpus to link. * @param s_attr2 The s-attribute in the second corpus to link. * @return the new FMS object. */ FMS create_feature_maps(char **config, int config_lines, Attribute *w_attr1, Attribute *w_attr2, Attribute *s_attr1, Attribute *s_attr2 ) { FMS r; unsigned int *fcount1, *fcount2; int config_pointer; char *b, command[200], dummy[200]; int current_feature, weight, need_to_abort; int *fs1, *fs2; int i,nw1,nw2; r = (FMS) malloc(sizeof(feature_maps_t)); assert(r); r->att1 = w_attr1; r->att2 = w_attr2; r->s1 = s_attr1; r->s2 = s_attr2; init_char_map(); nw1= cl_max_id(w_attr1); if (nw1 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of source corpus\n"); exit(1); } nw2= cl_max_id(w_attr2); if (nw2 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of target corpus\n"); exit(1); } printf("LEXICON SIZE: %d / %d\n", nw1, nw2); fcount1 = (unsigned int*) calloc(nw1+1,sizeof(unsigned int)); fcount2 = (unsigned int*) calloc(nw2+1,sizeof(unsigned int)); r->n_features=1; /* process feature map configuration: first pass */ for (config_pointer = 0; config_pointer < config_lines; config_pointer++) { if ( (b = strpbrk(config[config_pointer],"\n#")) ) /* strip newline and comments */ *b=0; if (sscanf(config[config_pointer],"%s",command)>0) { if(command[0]=='-') { switch(command[1]) { case 'S': { int i1, i2, f1, f2; float threshold; int n_shared=0; if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy)!=3) { 
fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -S:<weight>:<threshold>\n"); fprintf(stderr," Shared words with freq. ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n"); exit(1); } else { printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight); fflush(stdout); for (i1=0; i1 < nw1; i1++) { f1 = cl_id2freq(w_attr1, i1); i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1)); if (i2 >= 0){ f2 = cl_id2freq(w_attr2, i2); if (f1 / (0.0+f1+f2) >=threshold && f2 / (0.0+f1+f2) >= threshold){ fcount1[i1]++; fcount2[i2]++; n_shared++; r->n_features++; } } } printf("[%d]\n",n_shared); } break; } case '1': case '2': case '3': case '4': { int n; if (sscanf(config[config_pointer],"%1s%d:%d %s",command,&n,&weight,dummy)!=3) { fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -<n>:<weight> (n = 1..4)\n"); fprintf(stderr," Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n"); exit(1); } else if(n <= 0 || n>4) { /* this shouldn't happen anyway */ fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n",n,config[config_pointer]); exit(1); } else { int i,f,l; printf("FEATURE: %d-grams, weight=%d ... ", n, weight); fflush(stdout); for(i=0; i<nw1; i++) { l = cl_id2strlen(w_attr1, i); fcount1[i] += (l >= n) ? l - n + 1 : 0; } for(i=0; i<nw2; i++) { l = cl_id2strlen(w_attr2, i); fcount2[i] += (l >= n) ? 
l - n + 1 : 0; } f=1; for(i=0;i<n;i++) f*=char_map_range; r->n_features+=f; printf("[%d]\n", f); } break; } case 'W': { char filename[200], word1[200], word2[200]; FILE *wordlist; int nw,nl=0,i1,i2,n_matched=0; if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -W:<weight>:<filename>\n"); fprintf(stderr, " Word list (read from file <filename>).\n"); exit(1); } else if(!(wordlist=fopen(filename,"r"))) { fprintf(stderr,"ERROR: Cannot read word list file %s.\n", filename); exit(-1); } else { printf("FEATURE: word list %s, weight=%d ... ", filename, weight); fflush(stdout); while((nw=fscanf(wordlist,"%s %s",word1,word2))>0) { nl++; if (nw!=2) fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n", nl,filename,nw); else { if((i1=cl_str2id(w_attr1,word1))>=0 && (i2=cl_str2id(w_attr2,word2)) >=0) { fcount1[i1]++; fcount2[i2]++; n_matched++; r->n_features++; } } } fclose(wordlist); printf("[%d]\n", n_matched); } break; } case 'C': if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -C:<weight>\n"); fprintf(stderr, " Character count [primary feature].\n"); exit(1); } else { /* primary feature -> don't create additional features */ /* first entry in a token's feature list is character count */ for (i=0; i<nw1; i++) fcount1[i]++; for (i=0; i<nw2; i++) fcount2[i]++; printf("FEATURE: character count, weight=%d ... [1]\n", weight); } break; default: fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]); exit(1); break; } } else { fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]); exit(1); } } }
/** * Creates feature maps for a source/target corpus pair. * * This is the constructor function for the FMS class. * * Example usage: * * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s); * * @param config array of strings representing the feature map configuration. * @param config_lines the number of configuration items stored in config_data. * @param w_attr1 The p-attribute in the first corpus to link. * @param w_attr2 The p-attribute in the second corpus to link. * @param s_attr1 The s-attribute in the first corpus to link. * @param s_attr2 The s-attribute in the second corpus to link. * @return the new FMS object. */ FMS create_feature_maps(char **config, int config_lines, Attribute *w_attr1, Attribute *w_attr2, Attribute *s_attr1, Attribute *s_attr2 ) { FMS r; unsigned int *fcount1, *fcount2; /* arrays for types in the lexicons of the source * & target corpora, respectively, counting how often each is used * in a feature */ int config_pointer; char *b, command[CL_MAX_LINE_LENGTH], dummy[CL_MAX_LINE_LENGTH]; int current_feature; int weight; /* holds the weight assigned to the feature(s) we're working on */ int need_to_abort; /* boolean used during pointer check */ /* after we have counted up features, these will become arrays of ints, with one entry per feature */ int *fs1, *fs2; int i; int nw1; /* number of types on the word-attribute of the source corpus */ int nw2; /* number of types on the word-attribute of the target corpus */ /* one last variable: we need to know the character set of the two corpora for assorted purposes */ CorpusCharset charset; charset = cl_corpus_charset(cl_attribute_mother_corpus(w_attr1)); /* first, create the FMS object. 
*/ r = (FMS) malloc(sizeof(feature_maps_t)); assert(r); /* copy in the attribute pointers */ r->att1 = w_attr1; r->att2 = w_attr2; r->s1 = s_attr1; r->s2 = s_attr2; init_char_map(); /* find out how many different word-types occur on each of the p-attributes */ nw1 = cl_max_id(w_attr1); if (nw1 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of source corpus\n"); exit(1); } nw2 = cl_max_id(w_attr2); if (nw2 <= 0) { fprintf(stderr, "ERROR: can't access lexicon of target corpus\n"); exit(1); } printf("LEXICON SIZE: %d / %d\n", nw1, nw2); fcount1 = (unsigned int*) calloc(nw1 + 1, sizeof(unsigned int)); fcount2 = (unsigned int*) calloc(nw2 + 1, sizeof(unsigned int)); r->n_features = 1; /* NOTE there are two passes through the creation of feature maps - two sets of nearly identical code! * First pass to see how many things we need ot count, second pass to count them. */ /* process feature map configuration: first pass */ for (config_pointer = 0; config_pointer < config_lines; config_pointer++) { /* strip newline and comments */ if ( (b = strpbrk(config[config_pointer],"\n#")) ) *b = 0; if (sscanf(config[config_pointer], "%s", command) > 0) { if(command[0] == '-') { /* * These are the FIRST PASS options for the different config lines. * * Possible config commands: -S -W -C -1 -2 -3 -4 */ switch(command[1]) { /* -S : the "shared words" type of feature */ case 'S': { int i1, i2; /* i1 and i2 are temporary indexes into the lexicons of the two corpora */ int f1, f2; /* f1 and f2 are temporary storage for frequencies from the corpus lexicons */ float threshold; int n_shared = 0; /* numebr fo shared words - only calculated for the purpose of printing it */ if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy) != 3) { fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -S:<weight>:<threshold>\n"); fprintf(stderr," Shared words with freq. 
ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n"); exit(1); } else { printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight); fflush(stdout); /* for each type in target corpus, get its frequency, and the corresponding id and frequency * from the target corpus, then test whether it meets the criteria for use as a feature. */ for (i1 = 0; i1 < nw1; i1++) { f1 = cl_id2freq(w_attr1, i1); i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1)); if (i2 >= 0){ f2 = cl_id2freq(w_attr2, i2); /* if it will be used as a feature, increment counts of features in various places */ if ( (f1 / (0.0+f1+f2)) >= threshold && (f2 / (0.0+f1+f2)) >= threshold){ fcount1[i1]++; fcount2[i2]++; n_shared++; r->n_features++; } } } printf("[%d]\n", n_shared); } break; } /* -1 to -4 : shared character sequences (of 1 letter to 4 letters in length) as features */ case '1': case '2': case '3': case '4': { int n; /* length of the n-gram, obviously */ if (sscanf(config[config_pointer], "%1s%d:%d %s", command, &n, &weight, dummy) !=3 ) { fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr,"Usage: -<n>:<weight> (n = 1..4)\n"); fprintf(stderr," Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n"); exit(1); } else if(n <= 0 || n > 4) { /* this shouldn't happen anyway */ fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n", n, config[config_pointer]); exit(1); } else { int i,f,l; /* temp storage for lexicon index, n of possible features, && word length */ char *s; printf("FEATURE: %d-grams, weight=%d ... ", n, weight); fflush(stdout); /* for each entry in source-corpus lexicon, add to the number of features IFF * that lexicon entry is longer than 4 characters */ for(i = 0; i < nw1; i++) { /* l = cl_id2strlen(w_attr1, i); */ s = (unsigned char *) cl_strdup(cl_id2str(w_attr1, i)); cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC); l = strlen(s); cl_free(s); fcount1[i] += (l >= n) ? 
l - n + 1 : 0; } /* same for target corpus */ for(i = 0; i < nw2; i++) { /* l = cl_id2strlen(w_attr2, i); */ s = (unsigned char *) cl_strdup(cl_id2str(w_attr2, i)); cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC); l = strlen(s); cl_free(s); fcount2[i] += (l >= n) ? l - n + 1 : 0; } /* set f to number of possible features (= number of possible characters to the power of n) */ f = 1; for(i = 0 ; i < n; i++) f *= char_map_range; /* anmd add that to our total number of features! */ r->n_features += f; printf("[%d]\n", f); } break; } /* -W: the word-translation-equivalence type of feature */ case 'W': { char filename[CL_MAX_LINE_LENGTH], word1[CL_MAX_LINE_LENGTH], word2[CL_MAX_LINE_LENGTH]; FILE *wordlist; int nw; /* number of words scanned from an input line */ int nl = 0; /* counter for the number of lines in the wordlist file we have gone through */ int i1,i2; /* lexicon ids in source and target corpora */ int n_matched = 0; /* counter for n of lines in input file that can be used as a feature. */ if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -W:<weight>:<filename>\n"); fprintf(stderr, " Word list (read from file <filename>).\n"); exit(1); } else if(!(wordlist = fopen(filename,"r"))) { fprintf(stderr,"ERROR: Cannot read word list file %s.\n", filename); exit(-1); } else { printf("FEATURE: word list %s, weight=%d ... ", filename, weight); fflush(stdout); while((nw = fscanf(wordlist,"%s %s",word1,word2))>0) { /* on first line of file, skip UTF8 byte-order-mark if present */ if (nl == 0 && charset == utf8 && strlen(word1) > 3) if (word1[0] == (char)0xEF && word1[1] == (char)0xBB && word1[2] == (char)0xBF) cl_strcpy(word1, (word1 + 3)); nl++; /* check that both word 1 and word 2 are valid for the encoding of the corpora */ if (! 
(cl_string_validate_encoding(word1, charset, 0) && cl_string_validate_encoding(word2, charset, 0)) ) { fprintf(stderr, "ERROR: character encoding error in the word-list input file with the input word list.\n"); fprintf(stderr, " (The error occurs on line %d.)\n", nl); exit(1); } if (nw != 2) fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n",nl,filename,nw); else { /* if word1 and word2 both occur in their respective corpora, this is a feature. */ if( (i1 = cl_str2id(w_attr1, word1)) >= 0 && (i2 = cl_str2id(w_attr2, word2)) >= 0 ) { fcount1[i1]++; fcount2[i2]++; n_matched++; r->n_features++; } } } fclose(wordlist); printf("[%d]\n", n_matched); } break; } /* -C: the character count type of feature. * This feature exists for EVERY word type. */ case 'C': if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) { fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]); fprintf(stderr, "Usage: -C:<weight>\n"); fprintf(stderr, " Character count [primary feature].\n"); exit(1); } else { /* primary feature -> don't create additional features */ /* first entry in a token's feature list is character count */ for (i=0; i<nw1; i++) fcount1[i]++; for (i=0; i<nw2; i++) fcount2[i]++; printf("FEATURE: character count, weight=%d ... [1]\n", weight); } break; default: fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]); exit(1); break; } } else { fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]); exit(1); } } }