/** * This function analyses the inputs of all the tags of the given .fst2 in * order to determine their kind. 'tokens' contains all the text tokens. * After the execution of the function, 'number_of_patterns' will contain * the number of patterns found in the grammar, and 'is_DIC'/'is_CDIC'/'is_SDIC'/'is_TDIC' * will contain 1 if the tag 'DIC'/'CDIC'/'SDIC'/'TDIC' has been found. See * the comment below about the special case for '<!DIC>'. */ void process_tags(int *number_of_patterns, struct string_hash* semantic_codes, int *is_DIC,int *is_CDIC, int *is_SDIC,struct locate_parameters* parameters, Abstract_allocator prv_alloc) { (*number_of_patterns)=0; (*is_DIC)=0; (*is_CDIC)=0; (*is_SDIC)=0; Fst2* fst2=parameters->fst2; struct string_hash* tokens=parameters->tokens; Fst2Tag* tag=fst2->tags; /* We get the number of the SPACE token */ unichar t[2]; t[0]=' '; t[1]='\0'; parameters->SPACE=get_value_index(t,tokens,DONT_INSERT); /* Then, we test all the tags */ for (int i=0;i<fst2->number_of_tags;i++) { if (tag[i]->type!=UNDEFINED_TAG) { /* We don't need to process again things like variables and contexts * that have already been processed at the time of loading the fst2 */ continue; } int length=u_strlen(tag[i]->input); if (!u_strcmp(tag[i]->input,"#")) { /* If we have a #, we must check if it is the meta one that * forbids space or the "#" token */ if (is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK)) { /* If the case respect bit is set to 1, then we have the "#" token */ tag[i]->type=PATTERN_TAG; tag[i]->pattern=build_token_pattern(tag[i]->input,prv_alloc); } else { /* If we have the meta # */ tag[i]->type=META_TAG; tag[i]->meta=META_SHARP; } } else if (!u_strcmp(tag[i]->input,"<E>")) { /* If we have a transition tagged by the empty word epsilon */ tag[i]->type=META_TAG; tag[i]->meta=META_EPSILON; } else { int token_number=get_value_index(tag[i]->input,tokens,DONT_INSERT); if (token_number!=-1) { /* If the input is an existing token */ if (token_number==parameters->SPACE) { /* If it is a space */ tag[i]->type=META_TAG; tag[i]->meta=META_SPACE; } else { /* If it is a normal token */ tag[i]->type=PATTERN_TAG; tag[i]->pattern=build_token_pattern(tag[i]->input,prv_alloc); } } else { /* This input is not an existing token. Two cases can happen: * 1) metas like <!MOT> or patterns like <V:K> * 2) a word that is not in the text tokens */ if (tag[i]->input[0]!='<' || tag[i]->input[length-1]!='>') { /* If we are in case 2, it may not be an error. For instance, * if the tag contains "foo" and if it is a tag that allows * case variations, we could match "FOO" if this token is in the * text. */ tag[i]->type=PATTERN_TAG; tag[i]->pattern=build_token_pattern(tag[i]->input,prv_alloc); } else { /* If we have something of the form <...>, we must test first if it is * or not a negative tag like <!XXX> */ int negative_tag=(tag[i]->input[1]=='!')?1:0; if (negative_tag) { set_bit_mask(&(tag[i]->control),NEGATION_TAG_BIT_MASK); } /* Then, we must test if we have or not a meta. To do that, we * extract the content without < > and ! if any.*/ unichar* content=u_strdup(&(tag[i]->input[1+negative_tag]),length-2-negative_tag,prv_alloc); /* And we test all the possible metas */ if (!u_strcmp(content,"MOT")) { tag[i]->type=META_TAG; tag[i]->meta=META_MOT; } else if (!u_strcmp(content,"DIC")) { tag[i]->type=META_TAG; tag[i]->meta=META_DIC; if (!negative_tag) { /* We mark that the DIC tag has been found, but only * if it is not the negative one (<!DIC>). We do this * because things matched by <DIC> will be taken from * the 'dlf' and 'dlc' files, whereas things matched by <!DIC> * will be taken from the 'err' file. Such a trick is necessary * if we don't want 'priori' to be taken as an unknown word since * it is part of the compound word 'a priori' */ (*is_DIC)=1; } } else if (!u_strcmp(content,"CDIC")) { tag[i]->type=META_TAG; tag[i]->meta=META_CDIC; (*is_CDIC)=1; } else if (!u_strcmp(content,"SDIC")) { tag[i]->type=META_TAG; tag[i]->meta=META_SDIC; (*is_SDIC)=1; } else if (!u_strcmp(content,"TDIC")) { tag[i]->type=META_TAG; tag[i]->meta=META_TDIC; } else if (!u_strcmp(content,"MAJ")) { tag[i]->type=META_TAG; tag[i]->meta=META_MAJ; } else if (!u_strcmp(content,"MIN")) { tag[i]->type=META_TAG; tag[i]->meta=META_MIN; } else if (!u_strcmp(content,"PRE")) { tag[i]->type=META_TAG; tag[i]->meta=META_PRE; } else if (!u_strcmp(content,"NB")) { tag[i]->type=META_TAG; tag[i]->meta=META_NB; if (negative_tag) { error("Negative mark will be ignored in <!NB>\n"); } } else if (!u_strcmp(content,"TOKEN")) { tag[i]->type=META_TAG; tag[i]->meta=META_TOKEN; } else { /* If we arrive here, we have not a meta but a pattern like * <be>, <be.V>, <V:K>, ... */ tag[i]->type=PATTERN_TAG; tag[i]->pattern=build_pattern(content,semantic_codes,parameters->tilde_negation_operator,prv_alloc); if (tag[i]->pattern->type==CODE_PATTERN || tag[i]->pattern->type==LEMMA_AND_CODE_PATTERN || tag[i]->pattern->type==FULL_PATTERN || tag[i]->pattern->type==INFLECTED_AND_LEMMA_PATTERN) { /* If the pattern we obtain contains grammatical/semantic * codes, then we put it in the pattern tree and we note its number. */ tag[i]->pattern_number=add_pattern(number_of_patterns,tag[i]->pattern,parameters->pattern_tree_root,prv_alloc); if (tag[i]->pattern->type==CODE_PATTERN) { /* If we have a code pattern, then the tag will just need to contain * the pattern number, BUT, WE DO NOT FREE THE PATTERN, * since this pattern still could be used in morphological mode */ tag[i]->type=PATTERN_NUMBER_TAG; } } } /* We don't forget to free the content */ free_cb(content,prv_alloc); } } } } }
static error_t parse_opt(int key, char *arg, struct argp_state *state) { struct cmdctx *cmdctx = state->input; switch (key) { case 'a': cmdctx->_flags |= OPT_SEARCH_ALL; break; case 'c': cmdctx->_flags |= OPT_SEARCH_CNFL; break; case 'l': case 'f': cmdctx->_flags |= OPT_SEARCH_FL; break; case 'g': cmdctx->_flags |= OPT_SEARCH_GROUP; break; case 'o': cmdctx->_flags |= OPT_SEARCH_OBSL; break; case 'p': cmdctx->_flags |= OPT_SEARCH_CAP; break; case 'r': cmdctx->_flags |= OPT_SEARCH_REQ; break; case 'S': cmdctx->_flags |= OPT_SEARCH_SUGS; break; case 's': cmdctx->_flags |= OPT_SEARCH_SUMM; break; case 'd': cmdctx->_flags |= OPT_SEARCH_DESC; break; case 'L': cmdctx->_flags |= OPT_SEARCH_CHANGELOG; break; case OPT_PATTERN_PCRE: cmdctx->_flags |= OPT_PATTERN_PCRE; break; case ARGP_KEY_ARG: if (arg == NULL) break; DBGF("search.arg (%s)\n", arg); if (cmdctx->_data != NULL) { /* already got pattern */ poldek_ts_add_pkgmask(cmdctx->ts, arg); } else { struct pattern *pt; if ((pt = build_pattern(cmdctx, arg)) == NULL) { argp_usage(state); return EINVAL; } cmdctx->_data = pt; cmdctx->rtflags |= CMDCTX_GOTARGS; } break; default: return ARGP_ERR_UNKNOWN; } return 0; }