Esempio n. 1
0
/**
 * Handles the CQI_CL_ID2STR command.
 *
 * Reads an attribute name and a list of integer IDs via cqi_read_string()
 * and cqi_read_int_list(), looks the name up as a positional attribute
 * (ATT_POS) and replies with a CQI_DATA_STRING_LIST holding one string per
 * requested ID. If the attribute cannot be found, the current cqi_errno is
 * sent back instead. The reply is flushed in either case.
 *
 * Both the attribute name and the ID list are released with free() before
 * returning.
 */
void
do_cqi_cl_id2str(void)
{
  int *idlist = NULL;           /* stays NULL unless cqi_read_int_list() fills it in */
  int len, i;
  char *a, *str;
  Attribute *attribute;

  a = cqi_read_string();
  len = cqi_read_int_list(&idlist);
  if (server_debug) {
   Rprintf( "CQi: CQI_CL_ID2STR('%s', [", a);
    for (i=0; i<len; i++)
     Rprintf( "%d ", idlist[i]);
   Rprintf( "])\n");
  }
  attribute = cqi_lookup_attribute(a, ATT_POS);
  if (attribute == NULL) {
    cqi_command(cqi_errno);
  }
  else {
    /* we assemble the CQI_DATA_STRING_LIST() return command by hand,
       so we don't have to allocate a temporary list */
    cqi_send_word(CQI_DATA_STRING_LIST);
    cqi_send_int(len);          /* list size */
    for (i=0; i<len; i++) {
      str = cl_id2str(attribute, idlist[i]);
      cqi_send_string(str);     /* sends "" if str == NULL (ID out of range) */
    }
  }
  cqi_flush();
  free(idlist);                 /* free(NULL) is a no-op, so no guard is needed */
  free(a);
}
Esempio n. 2
0
File: groups.c Progetto: cran/rcqp
/**
 * Maps an ID of a group's source or target column to a printable string.
 *
 * @param group   the group whose source/target settings are consulted.
 * @param id      the ID to translate.
 * @param target  non-zero selects the target column, zero the source column.
 * @return        "(all)" for ANY_ID, "(none)" for any other negative ID,
 *                base + id when the selected column is flagged as structural,
 *                and otherwise the result of cl_id2str() on the selected
 *                attribute.
 */
char *
Group_id2str(Group *group, int id, int target) {
  Attribute *attr;
  int is_struc;
  char *base;

  /* pick the source or target settings up front, whatever the ID turns out to be */
  if (target) {
    attr = group->target_attribute;
    is_struc = group->target_is_struc;
    base = group->target_base;
  }
  else {
    attr = group->source_attribute;
    is_struc = group->source_is_struc;
    base = group->source_base;
  }

  if (id == ANY_ID)
    return "(all)";
  if (id < 0)
    return "(none)";
  if (is_struc)
    return base + id;           /* no bounds check: id is used as an offset into base */
  return cl_id2str(attr, id);
}
Esempio n. 3
0
/* 
 * ------------------------------------------------------------------------
 * 
 * "rcqpCmd_id2str(SEXP inAttribute, SEXP inIds)" --
 * 
 * Looks up 'inAttribute' as a positional attribute (ATT_POS) and returns a
 * character vector with one element per entry of 'inIds', each holding the
 * string that cl_id2str() reports for that ID. Elements for which
 * cl_id2str() returns NULL are left as allocVector() created them.
 * 
 * 'inAttribute' must be a character vector of length 1. 'inIds' must be a
 * vector; it is coerced to integer before use. If the attribute cannot be
 * found, rcqp_error_code() is called with the current cqi_errno.
 * 
 * ------------------------------------------------------------------------
 */
SEXP rcqpCmd_id2str(SEXP inAttribute, SEXP inIds)
{
	SEXP			result = R_NilValue;
	int				nprotected = 0;	/* number of objects currently on the protect stack */
	int				len, i;
	int *			ids;
	char 			*a, *str;
	Attribute *		attribute;
	
	if (!isString(inAttribute) || length(inAttribute) != 1) error("argument 'attribute' must be a string");
	PROTECT(inAttribute);
	nprotected++;
	if (!isVector(inIds)) error("argument 'ids' must be a vector of integers");
	/* coerce so that INTEGER() below is always applied to an integer vector */
	PROTECT(inIds = coerceVector(inIds, INTSXP));
	nprotected++;
	
	a = (char*)CHAR(STRING_ELT(inAttribute,0));
	len = length(inIds);
	
	attribute = cqi_lookup_attribute(a, ATT_POS);
	if (attribute == NULL) {
		/* release before reporting the error, and reset the counter so the
		   final UNPROTECT stays balanced should rcqp_error_code() return */
		UNPROTECT(nprotected);
		nprotected = 0;
		rcqp_error_code(cqi_errno);
	} else {
		result = PROTECT(allocVector(STRSXP, len));	
		nprotected++;
		ids = INTEGER(inIds);
		
		for (i=0; i<len; i++) {
			str = cl_id2str(attribute, ids[i]);
			/* element is left untouched if str == NULL (ID out of range) */
			if (str != NULL) {
				SET_STRING_ELT(result, i, mkChar(str));
			} 
		}
	}
	
	UNPROTECT(nprotected);

	return result;
}
Esempio n. 4
0
/**
 * Creates feature maps for a source/target corpus pair.
 *
 * Example usage:
 *
 * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s);
 *
 * @param config              pointer to a list of strings representing the feature map configuration.
 * @param config_lines        the number of configuration items stored in config_data.
 * @param w_attr1             The p-attribute in the first corpus to link.
 * @param w_attr2             The p-attribute in the second corpus to link.
 * @param s_attr1             The s-attribute in the first corpus to link.
 * @param s_attr2             The s-attribute in the second corpus to link.
 * @return                    the new FMS object.
 */
FMS
create_feature_maps(char **config,
                    int config_lines,
                    Attribute *w_attr1,
                    Attribute *w_attr2,
                    Attribute *s_attr1,
                    Attribute *s_attr2
                    ) 
{
  FMS r;
  unsigned int *fcount1, *fcount2;
  int config_pointer;
  char *b, command[200], dummy[200];
  int current_feature,
      weight,
    need_to_abort;
  int *fs1, *fs2; 
  int i,nw1,nw2;


  r = (FMS) malloc(sizeof(feature_maps_t));
  assert(r);

  r->att1 = w_attr1;
  r->att2 = w_attr2;
  r->s1 = s_attr1;
  r->s2 = s_attr2;

  init_char_map();
  
  nw1= cl_max_id(w_attr1);
  if (nw1 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of source corpus\n");
    exit(1);
  }
  nw2= cl_max_id(w_attr2);
  if (nw2 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of target corpus\n");
    exit(1);
  }
  
  printf("LEXICON SIZE: %d / %d\n", nw1, nw2);

  fcount1 = (unsigned int*) calloc(nw1+1,sizeof(unsigned int));
  fcount2 = (unsigned int*) calloc(nw2+1,sizeof(unsigned int));

  r->n_features=1;

  /* process feature map configuration: first pass */
  for (config_pointer = 0; config_pointer < config_lines; config_pointer++) {

    if ( (b = strpbrk(config[config_pointer],"\n#")) )  /* strip newline and comments */
      *b=0;
    if (sscanf(config[config_pointer],"%s",command)>0) {
      if(command[0]=='-') {
        switch(command[1]) {
        case 'S': {
          int i1, i2, f1, f2;
          float threshold;
          int n_shared=0;
            
          if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy)!=3) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -S:<weight>:<threshold>\n");
            fprintf(stderr,"  Shared words with freq. ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n");
            exit(1);
          }
          else {
            printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight);
            fflush(stdout);
            for (i1=0; i1 < nw1; i1++) {
              f1 = cl_id2freq(w_attr1, i1);
              i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1));
              if (i2 >= 0){
                f2 = cl_id2freq(w_attr2, i2);
                if (f1 / (0.0+f1+f2) >=threshold && f2 / (0.0+f1+f2) >= threshold){
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_shared++;
                  r->n_features++; 
                }
              }
            }
            printf("[%d]\n",n_shared);
          }
          break;
        }
        case '1': 
        case '2':
        case '3':
        case '4': { 
          int n;
          
          if (sscanf(config[config_pointer],"%1s%d:%d %s",command,&n,&weight,dummy)!=3) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -<n>:<weight>  (n = 1..4)\n");
            fprintf(stderr,"  Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n");
            exit(1);
          }
          else if(n <= 0 || n>4) {
            /* this shouldn't happen anyway */
            fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n",n,config[config_pointer]);
            exit(1);
          }
          else {
            int i,f,l;

            printf("FEATURE: %d-grams, weight=%d ... ", n, weight);
            fflush(stdout);

            for(i=0; i<nw1; i++) {
              l = cl_id2strlen(w_attr1, i);
              fcount1[i] += (l >= n) ? l - n + 1 : 0;
            }
            for(i=0; i<nw2; i++) {
              l = cl_id2strlen(w_attr2, i);
              fcount2[i] += (l >= n) ? l - n + 1 : 0;
            }
            f=1;
            for(i=0;i<n;i++)
              f*=char_map_range;
            r->n_features+=f;           
            printf("[%d]\n", f);
          }
          break;
        }
        case 'W': {
          char filename[200],
            word1[200],
            word2[200];
          FILE *wordlist;
          int nw,nl=0,i1,i2,n_matched=0;

          if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -W:<weight>:<filename>\n");
            fprintf(stderr, "  Word list (read from file <filename>).\n");
            exit(1);
          }
          else if(!(wordlist=fopen(filename,"r"))) {
            fprintf(stderr,"ERROR: Cannot read word list file %s.\n",
                    filename);
            exit(-1);
          }
          else {
            printf("FEATURE: word list %s, weight=%d ... ", filename, weight);
            fflush(stdout);
            while((nw=fscanf(wordlist,"%s %s",word1,word2))>0) {
              nl++;
              if (nw!=2) fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n",
                                 nl,filename,nw);
              else {
                if((i1=cl_str2id(w_attr1,word1))>=0
                   && (i2=cl_str2id(w_attr2,word2)) >=0) {
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_matched++;
                  r->n_features++;
                }
              }
            }
            fclose(wordlist);
            printf("[%d]\n", n_matched);
          }         
          break;
        }
        case 'C': 
          if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -C:<weight>\n");
            fprintf(stderr, "  Character count [primary feature].\n");
            exit(1);
          }
          else {
            /* primary feature -> don't create additional features */
            /* first entry in a token's feature list is character count */ 
            for (i=0; i<nw1; i++) fcount1[i]++;
            for (i=0; i<nw2; i++) fcount2[i]++;
            printf("FEATURE: character count, weight=%d ... [1]\n", weight);
          }
          break;
        default: fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]);
          exit(1);
          break;
        }
      }
      else {
        fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]);
        exit(1);
      }
    }
  }
Esempio n. 5
0
/**
 * Creates feature maps for a source/target corpus pair.
 *
 * This is the constructor function for the FMS class.
 *
 * Example usage:
 *
 * FMS = create_feature_maps(config_data, nr_of_config_lines, source_word, target_word, source_s, target_s);
 *
 * @param config              array of strings representing the feature map configuration.
 * @param config_lines        the number of configuration items stored in config_data.
 * @param w_attr1             The p-attribute in the first corpus to link.
 * @param w_attr2             The p-attribute in the second corpus to link.
 * @param s_attr1             The s-attribute in the first corpus to link.
 * @param s_attr2             The s-attribute in the second corpus to link.
 * @return                    the new FMS object.
 */
FMS
create_feature_maps(char **config,
                    int config_lines,
                    Attribute *w_attr1,
                    Attribute *w_attr2,
                    Attribute *s_attr1,
                    Attribute *s_attr2
                    ) 
{
  FMS r;

  unsigned int *fcount1, *fcount2;    /* arrays for types in the lexicons of the source
                                       * & target corpora, respectively, counting how often each is used
                                       * in a feature */

  int config_pointer;

  char *b, command[CL_MAX_LINE_LENGTH], dummy[CL_MAX_LINE_LENGTH];

  int current_feature;
  int weight;                         /* holds the weight assigned to the feature(s) we're working on */
  int need_to_abort;                  /* boolean used during pointer check */

  /* after we have counted up features, these will become arrays of ints, with one entry per feature */
  int *fs1, *fs2; 

  int i;
  int nw1;  /* number of types on the word-attribute of the source corpus */
  int nw2;  /* number of types on the word-attribute of the target corpus */

  /* one last variable: we need to know the character set of the two corpora for assorted purposes */
  CorpusCharset charset;
  charset = cl_corpus_charset(cl_attribute_mother_corpus(w_attr1));

  /* first, create the FMS object. */
  r = (FMS) malloc(sizeof(feature_maps_t));
  assert(r);

  /* copy in the attribute pointers */
  r->att1 = w_attr1;
  r->att2 = w_attr2;
  r->s1 = s_attr1;
  r->s2 = s_attr2;

  init_char_map();
  
  /* find out how many different word-types occur on each of the p-attributes */
  nw1 = cl_max_id(w_attr1);
  if (nw1 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of source corpus\n");
    exit(1);
  }
  nw2 = cl_max_id(w_attr2);
  if (nw2 <= 0) {
    fprintf(stderr, "ERROR: can't access lexicon of target corpus\n");
    exit(1);
  }
  
  printf("LEXICON SIZE: %d / %d\n", nw1, nw2);

  fcount1 = (unsigned int*) calloc(nw1 + 1, sizeof(unsigned int));
  fcount2 = (unsigned int*) calloc(nw2 + 1, sizeof(unsigned int));

  r->n_features = 1;


  /* NOTE there are two passes through the creation of feature maps - two sets of nearly identical code!
   * First pass to see how many things we need to count, second pass to count them. */

  /* process feature map configuration: first pass */
  for (config_pointer = 0; config_pointer < config_lines; config_pointer++) {

    /* strip newline and comments */
    if ( (b = strpbrk(config[config_pointer],"\n#")) )
      *b = 0;

    if (sscanf(config[config_pointer], "%s", command) > 0) {
      if(command[0] == '-') {
        /*
         * These are the FIRST PASS options for the different config lines.
         *
         * Possible config commands: -S -W -C -1 -2 -3 -4
         */
        switch(command[1]) {
        /* -S : the "shared words" type of feature */
        case 'S': {
          int i1, i2; /* i1 and i2 are temporary indexes into the lexicons of the two corpora */
          int f1, f2; /* f1 and f2 are temporary storage for frequencies from the corpus lexicons */
          float threshold;
          int n_shared = 0; /* numebr fo shared words - only calculated for the purpose of printing it */

          if(sscanf(config[config_pointer],"%2s:%d:%f %s",command,&weight,&threshold,dummy) != 3) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -S:<weight>:<threshold>\n");
            fprintf(stderr,"  Shared words with freq. ratios f1/(f1+f2) and f2/(f1+f2) >= <threshold>.\n");
            exit(1);
          }
          else {
            printf("FEATURE: Shared words, threshold=%4.1f%c, weight=%d ... ",threshold * 100, '\%', weight);
            fflush(stdout);

            /* for each type in source corpus, get its frequency, and the corresponding id and frequency
             * from the target corpus, then test whether it meets the criteria for use as a feature. */
            for (i1 = 0; i1 < nw1; i1++) {
              f1 = cl_id2freq(w_attr1, i1);
              i2 = cl_str2id(w_attr2, cl_id2str(w_attr1, i1));
              if (i2 >= 0){
                f2 = cl_id2freq(w_attr2, i2);
                /* if it will be used as a feature, increment counts of features in various places */
                if ( (f1 / (0.0+f1+f2)) >= threshold && (f2 / (0.0+f1+f2)) >= threshold){
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_shared++;
                  r->n_features++; 
                }
              }
            }
            printf("[%d]\n", n_shared);
          }
          break;
        }
        /* -1 to -4 : shared character sequences (of 1 letter to 4 letters in length) as features */
        case '1': 
        case '2':
        case '3':
        case '4': { 
          int n; /* length of the n-gram, obviously */
          
          if (sscanf(config[config_pointer], "%1s%d:%d %s", command, &n, &weight, dummy) !=3 ) {
            fprintf(stderr,"ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr,"Usage: -<n>:<weight>  (n = 1..4)\n");
            fprintf(stderr,"  Shared <n>-grams (single characters, bigrams, trigrams, 4-grams).\n");
            exit(1);
          }
          else if(n <= 0 || n > 4) {
            /* this shouldn't happen anyway */
            fprintf(stderr,"ERROR: cannot handle %d-grams: %s\n", n, config[config_pointer]);
            exit(1);
          }
          else {
            int i,f,l; /* temp storage for lexicon index, n of possible features, && word length */
            char *s;

            printf("FEATURE: %d-grams, weight=%d ... ", n, weight);
            fflush(stdout);

            /* for each entry in source-corpus lexicon, add the number of n-grams it contains to its
             * feature count IFF that lexicon entry (after case/diacritic folding) is at least n characters long */
            for(i = 0; i < nw1; i++) {
              /* l = cl_id2strlen(w_attr1, i); */
              s = (unsigned char *) cl_strdup(cl_id2str(w_attr1, i));
              cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC);
              l = strlen(s);
              cl_free(s);
              fcount1[i] += (l >= n) ? l - n + 1 : 0;
            }
            /* same for target corpus */
            for(i = 0; i < nw2; i++) {
              /* l = cl_id2strlen(w_attr2, i); */
              s = (unsigned char *) cl_strdup(cl_id2str(w_attr2, i));
              cl_string_canonical( (char *)s, charset, IGNORE_CASE | IGNORE_DIAC);
              l = strlen(s);
              cl_free(s);
              fcount2[i] += (l >= n) ? l - n + 1 : 0;
            }
            /* set f to number of possible features (= number of possible characters to the power of n) */
            f = 1;
            for(i = 0 ; i < n; i++)
              f *= char_map_range;
            /* and add that to our total number of features! */
            r->n_features += f;
            printf("[%d]\n", f);
          }
          break;
        }
        /* -W: the word-translation-equivalence type of feature */
        case 'W': {
          char filename[CL_MAX_LINE_LENGTH],
            word1[CL_MAX_LINE_LENGTH],
            word2[CL_MAX_LINE_LENGTH];
          FILE *wordlist;
          int nw;      /* number of words scanned from an input line */
          int nl = 0;  /* counter for the number of lines in the wordlist file we have gone through */
          int i1,i2;   /* lexicon ids in source and target corpora */
          int n_matched = 0;  /* counter for n of lines in input file that can be used as a feature. */

          if(sscanf(config[config_pointer],"%2s:%d:%s %s",command,&weight,filename,dummy)!=3) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -W:<weight>:<filename>\n");
            fprintf(stderr, "  Word list (read from file <filename>).\n");
            exit(1);
          }
          else if(!(wordlist = fopen(filename,"r"))) {
            fprintf(stderr,"ERROR: Cannot read word list file %s.\n",
                    filename);
            exit(-1);
          }
          else {
            printf("FEATURE: word list %s, weight=%d ... ", filename, weight);
            fflush(stdout);
            while((nw = fscanf(wordlist,"%s %s",word1,word2))>0) {
              /* on first line of file, skip UTF8 byte-order-mark if present */
              if (nl == 0 && charset == utf8 && strlen(word1) > 3)
                if (word1[0] == (char)0xEF && word1[1] == (char)0xBB && word1[2] == (char)0xBF)
                   cl_strcpy(word1, (word1 + 3));
              nl++;
              /* check that both word 1 and word 2 are valid for the encoding of the corpora */
              if (! (cl_string_validate_encoding(word1, charset, 0)
                  && cl_string_validate_encoding(word2, charset, 0)) ) {
                fprintf(stderr, "ERROR: character encoding error in the word-list input file with the input word list.\n");
                fprintf(stderr, "       (The error occurs on line %d.)\n", nl);
                exit(1);
              }
              if (nw != 2)
                fprintf(stderr,"WARNING: Line %d in word list '%s' contains %d words, ignored.\n",nl,filename,nw);
              else {
                /* if word1 and word2 both occur in their respective corpora, this is a feature. */
                if(   (i1 = cl_str2id(w_attr1, word1)) >= 0
                   && (i2 = cl_str2id(w_attr2, word2)) >= 0 ) {
                  fcount1[i1]++;
                  fcount2[i2]++;
                  n_matched++;
                  r->n_features++;
                }
              }
            }
            fclose(wordlist);
            printf("[%d]\n", n_matched);
          }         
          break;
        }
        /* -C: the character count type of feature.
         * This feature exists for EVERY word type. */
        case 'C': 
          if(sscanf(config[config_pointer],"%2s:%d %s",command,&weight,dummy)!=2) {
            fprintf(stderr, "ERROR: wrong # of args: %s\n",config[config_pointer]);
            fprintf(stderr, "Usage: -C:<weight>\n");
            fprintf(stderr, "  Character count [primary feature].\n");
            exit(1);
          }
          else {
            /* primary feature -> don't create additional features */
            /* first entry in a token's feature list is character count */ 
            for (i=0; i<nw1; i++)
              fcount1[i]++;
            for (i=0; i<nw2; i++)
              fcount2[i]++;
            printf("FEATURE: character count, weight=%d ... [1]\n", weight);
          }
          break;
        default:
          fprintf(stderr,"ERROR: unknown feature: %s\n",config[config_pointer]);
          exit(1);
          break;
        }
      }
      else {
        fprintf(stderr,"ERROR: feature parse error: %s\n", config[config_pointer]);
        exit(1);
      }
    }
  }