예제 #1
0
/**
 * This function analyses the inputs of all the tags of the given .fst2 in
 * order to determine their kind. 'tokens' contains all the text tokens.
 * After the execution of the function, 'number_of_patterns' will contain
 * the number of patterns found in the grammar, and 'is_DIC'/'is_CDIC'/'is_SDIC'/'is_TDIC'
 * will contain 1 if the tag 'DIC'/'CDIC'/'SDIC'/'TDIC' has been found. See
 * the comment below about the special case for '<!DIC>'.
 */
void process_tags(int *number_of_patterns,
                  struct string_hash* semantic_codes,
                  int *is_DIC,int *is_CDIC,
                  int *is_SDIC,struct locate_parameters* parameters,
                  Abstract_allocator prv_alloc) {
(*number_of_patterns)=0;
(*is_DIC)=0;
(*is_CDIC)=0;
(*is_SDIC)=0;
Fst2* fst2=parameters->fst2;
struct string_hash* tokens=parameters->tokens;
Fst2Tag* tag=fst2->tags;
/* We get the number of the SPACE token */
unichar t[2];
t[0]=' ';
t[1]='\0';
parameters->SPACE=get_value_index(t,tokens,DONT_INSERT);
/* Then, we test all the tags */
for (int i=0;i<fst2->number_of_tags;i++) {
   if (tag[i]->type!=UNDEFINED_TAG) {
      /* We don't need to process again things like variables and contexts
       * that have already been processed at the time of loading the fst2 */
      continue;
   }
   int length=u_strlen(tag[i]->input);
   if (!u_strcmp(tag[i]->input,"#")) {
      /* If we have a #, we must check if it is the meta one that
       * forbids space or the "#" token */
      if (is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK)) {
         /* If the case respect bit is set to 1, then we have the "#" token */
         tag[i]->type=PATTERN_TAG;
         tag[i]->pattern=build_token_pattern(tag[i]->input,prv_alloc);
      }
      else {
         /* If we have the meta # */
         tag[i]->type=META_TAG;
         tag[i]->meta=META_SHARP;
      }
   }
   else if (!u_strcmp(tag[i]->input,"<E>")) {
      /* If we have a transition tagged by the empty word epsilon */
      tag[i]->type=META_TAG;
      tag[i]->meta=META_EPSILON;
   }
   else {
      int token_number=get_value_index(tag[i]->input,tokens,DONT_INSERT);
      if (token_number!=-1) {
         /* If the input is an existing token */
         if (token_number==parameters->SPACE) {
            /* If it is a space */
            tag[i]->type=META_TAG;
            tag[i]->meta=META_SPACE;
         } else {
            /* If it is a normal token */
            tag[i]->type=PATTERN_TAG;
            tag[i]->pattern=build_token_pattern(tag[i]->input,prv_alloc);
         }
      }
      else {
         /* This input is not an existing token. Two cases can happen:
          * 1) metas like <!MOT> or patterns like <V:K>
          * 2) a word that is not in the text tokens */
         if (tag[i]->input[0]!='<' || tag[i]->input[length-1]!='>') {
            /* If we are in case 2, it may not be an error. For instance,
             * if the tag contains "foo" and if it is a tag that allows
             * case variations, we could match "FOO" if this token is in the
             * text. */
            tag[i]->type=PATTERN_TAG;
            tag[i]->pattern=build_token_pattern(tag[i]->input,prv_alloc);
         } else {
            /* If we have something of the form <...>, we must test first if it is
             * or not a negative tag like <!XXX> */
            int negative_tag=(tag[i]->input[1]=='!')?1:0;
            if (negative_tag) {
               set_bit_mask(&(tag[i]->control),NEGATION_TAG_BIT_MASK);
            }
            /* Then, we must test if we have or not a meta. To do that, we
             * extract the content without < > and ! if any.*/
            unichar* content=u_strdup(&(tag[i]->input[1+negative_tag]),length-2-negative_tag,prv_alloc);
            /* And we test all the possible metas */
            if (!u_strcmp(content,"MOT")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_MOT;
            }
            else if (!u_strcmp(content,"DIC")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_DIC;
               if (!negative_tag) {
                  /* We mark that the DIC tag has been found, but only
                   * if it is not the negative one (<!DIC>). We do this
                   * because things matched by <DIC> will be taken from
                   * the 'dlf' and 'dlc' files, whereas things matched by <!DIC>
                   * will be taken from the 'err' file. Such a trick is necessary
                   * if we don't want 'priori' to be taken as an unknown word since
                   * it is  part of the compound word 'a priori' */
                  (*is_DIC)=1;
               }
            }
            else if (!u_strcmp(content,"CDIC")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_CDIC;
               (*is_CDIC)=1;
            }
            else if (!u_strcmp(content,"SDIC")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_SDIC;
               (*is_SDIC)=1;
            }
            else if (!u_strcmp(content,"TDIC")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_TDIC;
            }
            else if (!u_strcmp(content,"MAJ")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_MAJ;
            }
            else if (!u_strcmp(content,"MIN")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_MIN;
            }
            else if (!u_strcmp(content,"PRE")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_PRE;
            }
            else if (!u_strcmp(content,"NB")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_NB;
               if (negative_tag) {
                  error("Negative mark will be ignored in <!NB>\n");
               }
            }
            else if (!u_strcmp(content,"TOKEN")) {
               tag[i]->type=META_TAG;
               tag[i]->meta=META_TOKEN;
            }
            else {
               /* If we arrive here, we have not a meta but a pattern like
                * <be>, <be.V>, <V:K>, ... */
               tag[i]->type=PATTERN_TAG;
               tag[i]->pattern=build_pattern(content,semantic_codes,parameters->tilde_negation_operator,prv_alloc);
               if (tag[i]->pattern->type==CODE_PATTERN ||
                   tag[i]->pattern->type==LEMMA_AND_CODE_PATTERN ||
                   tag[i]->pattern->type==FULL_PATTERN || 
                   tag[i]->pattern->type==INFLECTED_AND_LEMMA_PATTERN) {
                  /* If the pattern we obtain contains grammatical/semantic
                   * codes, then we put it in the pattern tree and we note its number. */
                  tag[i]->pattern_number=add_pattern(number_of_patterns,tag[i]->pattern,parameters->pattern_tree_root,prv_alloc);
                  if (tag[i]->pattern->type==CODE_PATTERN) {
                     /* If we have a code pattern, then the tag will just need to contain
                      * the pattern number, BUT, WE DO NOT FREE THE PATTERN,
                      * since this pattern still could be used in morphological mode */
                     tag[i]->type=PATTERN_NUMBER_TAG;
                  }
               }
            }
            /* We don't forget to free the content */
            free_cb(content,prv_alloc);
         }
      }
   }
}
}
예제 #2
0
파일: search.c 프로젝트: megabajt/poldek
/*
 * argp parser callback.
 *
 * Option keys OR the matching OPT_* bit into cmdctx->_flags ('l' and 'f'
 * both select OPT_SEARCH_FL). For a positional argument (ARGP_KEY_ARG):
 *  - a NULL argument is ignored;
 *  - if cmdctx->_data is still NULL, the argument is turned into a pattern
 *    with build_pattern(), stored in cmdctx->_data, and CMDCTX_GOTARGS is
 *    set in cmdctx->rtflags; when build_pattern() fails, argp_usage() is
 *    called and EINVAL is returned;
 *  - otherwise the argument is passed to poldek_ts_add_pkgmask().
 *
 * Returns 0 when the key was handled, ARGP_ERR_UNKNOWN for any other key.
 */
static
error_t parse_opt(int key, char *arg, struct argp_state *state)
{
    struct cmdctx *cmdctx = state->input;
    unsigned flag;

    if (key == ARGP_KEY_ARG) {
        struct pattern *pt;

        if (arg == NULL)
            return 0;
        DBGF("search.arg (%s)\n", arg);

        if (cmdctx->_data != NULL) {
            /* the pattern was already given */
            poldek_ts_add_pkgmask(cmdctx->ts, arg);
            return 0;
        }

        pt = build_pattern(cmdctx, arg);
        if (pt == NULL) {
            argp_usage(state);
            return EINVAL;
        }
        cmdctx->_data = pt;
        cmdctx->rtflags |= CMDCTX_GOTARGS;
        return 0;
    }

    /* Every remaining recognized key maps to exactly one flag bit */
    switch (key) {
        case 'a': flag = OPT_SEARCH_ALL;       break;
        case 'c': flag = OPT_SEARCH_CNFL;      break;
        case 'l':
        case 'f': flag = OPT_SEARCH_FL;        break;
        case 'g': flag = OPT_SEARCH_GROUP;     break;
        case 'o': flag = OPT_SEARCH_OBSL;      break;
        case 'p': flag = OPT_SEARCH_CAP;       break;
        case 'r': flag = OPT_SEARCH_REQ;       break;
        case 'S': flag = OPT_SEARCH_SUGS;      break;
        case 's': flag = OPT_SEARCH_SUMM;      break;
        case 'd': flag = OPT_SEARCH_DESC;      break;
        case 'L': flag = OPT_SEARCH_CHANGELOG; break;
        case OPT_PATTERN_PCRE:
            flag = OPT_PATTERN_PCRE;
            break;
        default:
            return ARGP_ERR_UNKNOWN;
    }

    cmdctx->_flags |= flag;
    return 0;
}