/* fp->sp->owner is scp (assert this is non-NULL) fp->sp is sp sigs[0] is sig */ void sigs_cache_add(struct ilem_form *ifp, struct sig const *const *sigs) { if (!ifp || !ifp->sp || BIT_ISSET(ifp->f2.flags, F2_FLAGS_FROM_CACHE)) return; if (!ifp->sp->cache) sigs_cache_init(ifp->sp); if (verbose) fprintf(stderr, "sigs_cache: adding %s to cache\n", ifp->f2.form); sigs_load_one_sig(ifp->sp->owner, ifp->sp->cache, sigs[0]->sig, 0, ifp); #if 0 if (BIT_ISSET(fp->f2.flags, F2_FLAGS_LEM_BY_NORM)) hash_add(fp->sp->cache, npool_copy(ifp->f2.norm, sigs_cache_pool), (void*)sigs); else hash_add(fp->sp->cache, npool_copy(ifp->f2.form, sigs_cache_pool), (void*)sigs); #endif }
/*
 * Allocate and initialise an est object for PROJECT/INDEX.
 *
 * The object gets a fresh hash table (hash_create(1000)) and a fresh
 * string pool; PROJECT, INDEX and the path returned by
 * se_file(project, index, "keys.est") are copied into that pool.
 *
 * Returns the new object, or NULL if the object itself cannot be
 * allocated.
 */
struct est *
est_init(const char *project, const char *index)
{
  struct est *estp = malloc(sizeof *estp);

  if (!estp)
    return NULL;

  estp->h = hash_create(1000);
  estp->p = npool_init();
  estp->project = (const char *)npool_copy((const unsigned char *)project,
                                           estp->p);
  estp->index = (const char *)npool_copy((const unsigned char *)index,
                                         estp->p);
  estp->filename = (const char *)npool_copy((const unsigned char *)
                                            se_file(project, index, "keys.est"),
                                            estp->p);
  return estp;
}
/*
 * Return a pool copy of the next '|'-separated alternative from the
 * string tracked by lem_next_alt, or NULL when there is none left.
 *
 * A '|' only counts as a separator when it is not the first character,
 * is not preceded by one of  \ . - } (  and is followed by a character
 * other than NUL . - { ) .  The separator is overwritten with NUL and
 * lem_next_alt is left on the character after it; when the end of the
 * string is reached instead, lem_next_alt becomes NULL.
 */
static unsigned char *
alt_next(struct xcl_context *xc)
{
  unsigned char *start = lem_next_alt;
  unsigned char *cur;

  if (!start)
    return NULL;

  for (cur = start; *cur; ++cur) {
    if (*cur != '|' || cur == start)
      continue;

    /* characters that disqualify the bar when they come right before it */
    switch (cur[-1]) {
    case '\\': case '.': case '-': case '}': case '(':
      continue;
    default:
      break;
    }

    /* characters (or end of string) that disqualify it when they follow */
    switch (cur[1]) {
    case '\0': case '.': case '-': case '{': case ')':
      continue;
    default:
      break;
    }

    *cur++ = '\0';
    break;
  }

  lem_next_alt = *cur ? cur : NULL;
  return npool_copy(start, xc->pool);
}
/*
 * End-element handler for the XMD reader.
 *
 * Outside catalogue data the only thing of interest is the close of
 * "images", which switches in_cat_data on.  Inside catalogue data the
 * close of "cat" switches it off again; every other element except
 * "subfield" has its name and accumulated character data copied into
 * xmd_pool and stored in xmd_vals.
 *
 * Note: this breaks if a key can occur more than once.
 */
static void
xmd_eH(void *userData, const char *name)
{
  if (!in_cat_data) {
    if (strcmp(name, "images") == 0)
      in_cat_data = 1;
    return;
  }

  if (strcmp(name, "cat") == 0) {
    in_cat_data = 0;
    return;
  }

  /* FIXME: should do something with subfields */
  if (strcmp(name, "subfield") == 0)
    return;

  hash_add(xmd_vals,
           (unsigned char*)npool_copy(name, xmd_pool),
           (unsigned char *)npool_copy(charData_retrieve(), xmd_pool));
}
struct nsa_token * create_unit(struct nsa_parser *p,const char *s,struct nsa_token *t) { const char *n = nsa_trim_morph(p->context, s); struct nsa_hash_data *d = hash_find(p->context->step_index, (unsigned char *)n); if (d) { struct nsa_token *tu = new_token(); struct nsa_unit *u = new_unit(); List *l = list_create(LIST_SINGLE); list_add(l,t); if (d->continuations) d = check_continuations(d,p,&n,l); u->name = (char *)npool_copy((unsigned char *)n,p->pool); u->cands = d->cands; tu->type = NSA_T_UNIT; if (t) { struct nsa_token *lt; int i; tu->children = new_children(list_len(l)); for (i = 0, lt = list_first(l); lt; lt = list_next(l),++i) tu->children[i] = lt; } tu->d.u = u; list_free(l,NULL); return tu; } else return t; }
/*
 * Return a pool copy of the next '&'-separated lemma from the string
 * tracked by lem_next_lem, or NULL when nothing is left.
 *
 * An '&' is a separator unless it is preceded by one of  \ + - .  An
 * '&' in the very first position has no preceding character inside the
 * buffer; it is treated as a separator without looking at memory in
 * front of the string.
 *
 * When the character before the separator is '.', the character before
 * that is saved in post_lem_sentence and overwritten with NUL -- but
 * only if it actually lies inside the current lemma, so the buffer is
 * never written before its start.
 *
 * On return lem_next_lem points at the rest of the string, or is NULL
 * when the end was reached.
 */
static unsigned char *
lem_next(struct xcl_context *xc)
{
  unsigned char *this_lem = lem_next_lem;

  if (!this_lem)
    return NULL;

  while (*lem_next_lem) {
    if ('&' == *lem_next_lem) {
      /* no in-buffer predecessor at the start: use a neutral value */
      unsigned char prev = (lem_next_lem > this_lem) ? lem_next_lem[-1] : '\0';

      if (prev != '\\' && prev != '+' && prev != '-') {
        *lem_next_lem++ = '\0';
        /* lem_next_lem[-3] is only in bounds when at least two
           characters precede the '&' */
        if (lem_next_lem - this_lem >= 3 && lem_next_lem[-2] == '.') {
          post_lem_sentence = lem_next_lem[-3];
          lem_next_lem[-3] = '\0';
        }
        break;
      }
    }
    ++lem_next_lem;
  }

  if (*lem_next_lem == '\0')
    lem_next_lem = NULL;

  return npool_copy(this_lem, xc->pool);
}
/* ref_fp is the f2 we use as the CFGWPOS source; fp is the f2 that is being processed by the lemmer; if ref_fp is NULL fp is used as ref_fp as well */ int f2_alias(struct sig_context *scp, struct f2 *fp, struct f2 *ref_fp) { int ret = 0; if (!ref_fp) ref_fp = fp; if (scp->aliases) { unsigned char *aform = NULL; aform = sas_alias_form(scp->aliases, fp->form, ref_fp->cf, ref_fp->gw, ref_fp->pos); if (strcmp((char*)fp->form,(char*)aform)) { fp->oform = fp->form; fp->form = npool_copy(aform,scp->pool); ret = 1; if (verbose > 1) fprintf(stderr,"aliased form %s => fp->form %s\n",fp->oform,fp->form); } free(aform); } return ret; }
void lem_save_lemma(struct node *wp, const char *lemma) { struct ilem_form *form = hash_find(word_form_index, getAttr(wp,"xml:id")); if (form) form->literal = (char*)npool_copy((unsigned char *)lemma,lemline_xcp->pool); else vwarning("internal error: word_form_index lookup failed"); }
/*
 * Append a new saved line to the end of the `next' chain that starts at
 * curr_lsp.  The new node becomes last_lsp and its text is a lemline
 * pool copy of LP.
 */
void
lem_append_line(unsigned char *lp)
{
  struct lem_save *tail = curr_lsp;

  while (tail->next)
    tail = tail->next;

  tail->next = new_lsp();
  last_lsp = tail->next;
  tail->next->line = npool_copy(lp, lemline_xcp->pool);
}
/*
 * Build the instance string "%LANG:FORM=" followed by the sublem (when
 * there is one) for IFP and return a copy of it from xc->pool.
 *
 * The string is formatted into a fixed stack buffer when it fits; longer
 * results are formatted into a temporary heap buffer instead of
 * overrunning the stack buffer.  If that temporary allocation fails the
 * truncated stack version is returned.
 */
static char *
make_inst(struct xcl_context *xc, struct ilem_form *ifp)
{
  char buf[1024];
  const char *sublem = ifp->sublem ? ifp->sublem : "";
  int need = snprintf(buf, sizeof buf, "%%%s:%s=%s",
                      ifp->f2.lang, ifp->f2.form, sublem);

  if (need < 0)
    buf[0] = '\0';              /* formatting error: contents undefined */
  else if ((size_t)need >= sizeof buf) {
    char *big = malloc((size_t)need + 1);

    if (big) {
      char *ret;

      snprintf(big, (size_t)need + 1, "%%%s:%s=%s",
               ifp->f2.lang, ifp->f2.form, sublem);
      ret = (char*)npool_copy((unsigned char*)big, xc->pool);
      free(big);
      return ret;
    }
    /* out of memory: fall through and use the truncated text in buf */
  }

  return (char*)npool_copy((unsigned char*)buf, xc->pool);
}
/*
 * Append a continuation line to the end of the `cont' chain that starts
 * at curr_lsp; the text is a lemline pool copy of LP.
 */
void
lem_save_cont(unsigned char *lp)
{
  struct lem_save *tail = curr_lsp;

  while (tail->cont)
    tail = tail->cont;

  tail->cont = new_lsp();
  tail->cont->line = npool_copy(lp, lemline_xcp->pool);
}
/*
 * Replace the FORM of the word registered under REF in word_form_index
 * with a lemline pool copy of FORM.  An unknown REF is silently ignored.
 */
void
lem_reset_form(const char *ref, const char *form)
{
  struct ilem_form *fp = hash_find(word_form_index, (unsigned char *)ref);

  if (fp)
    fp->f2.form = npool_copy((unsigned char *)form, lemline_xcp->pool);
}
/*
 * Build the signature string for FP and return a copy of it from POOL.
 *
 * For a simple form this is the result of sig_one(fp, 0).  For a form
 * with parts, sig_one(part, 1) is computed for every part, stored in
 * that part's tail_sig, and the head and tail signatures are joined
 * with "&&".
 *
 * Returns NULL when FP is NULL or when any sig_one() call fails.
 *
 * Ownership: the head signature and the joined string are temporary and
 * are freed here; the tail signatures stay owned by the parts through
 * tail_sig.  The working list is released with list_free(list, NULL),
 * which leaves the stored strings alone.
 */
unsigned char *
f2_sig(struct f2 *fp, struct npool*pool)
{
  unsigned char *ret = NULL;
  unsigned char *head;

  if (!fp)
    return NULL;

  head = sig_one(fp, 0);
  if (!head)
    return NULL;

  if (fp->parts) {
    List *parts = list_create(LIST_SINGLE);
    int complete = 1;
    int i;

    list_add(parts, head);
    for (i = 0; fp->parts[i]; ++i) {
      unsigned char *tail = sig_one(fp->parts[i], 1);

      fp->parts[i]->tail_sig = tail;
      if (!tail) {
        complete = 0;
        break;
      }
      list_add(parts, tail);
    }

    if (complete) {
      unsigned char *joined = list_to_str2(parts, "&&");

      ret = npool_copy(joined, pool);
      free(joined);
    }
    list_free(parts, NULL);
  } else {
    ret = npool_copy(head, pool);
  }

  free(head);
  return ret;
}
/*
 * Return the count base of token M as a decimal string copied into
 * p->pool.  Tokens that are not NSA_T_COUNT yield the empty string.
 */
static const char *
countbase_str(struct nsa_token *m, struct nsa_parser *p)
{
  char digits[128];

  if (m->type != NSA_T_COUNT)
    return "";

  sprintf(digits, "%d", count_base(m));
  return (const char *)npool_copy((unsigned char *)digits, p->pool);
}
/*
 * Append at most N bytes of S to the NUL-terminated string held in BUF,
 * whose total capacity is CAP bytes.  The text is truncated rather than
 * written past the end of BUF; a NULL S appends nothing.
 */
static void
f2_psu_sig_append(char *buf, size_t cap, const char *s, size_t n)
{
  size_t used = strlen(buf);
  size_t room;

  if (!s || used + 1 >= cap)
    return;

  room = cap - used - 1;
  if (n > room)
    n = room;
  memcpy(buf + used, s, n);
  buf[used + n] = '\0';
}

/*
 * Build the PSU signature for FP and return a copy of it from POOL.
 *
 * The result is "{NGRAM}::" followed by one entry per part, with the
 * entries separated by "++".  A part contributes its tail_sig when it
 * has one; otherwise its sig is used (computed with f2_sig() on demand):
 * the text before the first "&&" if there is one, else the whole sig
 * passed through tabless().
 *
 * The signature is assembled in a 1024-byte buffer; text that does not
 * fit is dropped instead of overflowing the buffer.  A part whose sig
 * cannot be computed contributes nothing.
 */
unsigned char *
f2_psu_sig(struct f2 *fp, struct npool *pool)
{
  char buf[1024];

  snprintf(buf, sizeof buf, "{%s}::", fp->psu_ngram);

  if (fp->parts) {
    int i;

    for (i = 0; fp->parts[i]; ++i) {
      struct f2 *part = fp->parts[i];
      const char *sig;
      const char *amp;

      if (i)
        f2_psu_sig_append(buf, sizeof buf, "++", 2);

      if (part->tail_sig) {
        f2_psu_sig_append(buf, sizeof buf, (const char *)part->tail_sig,
                          strlen((const char *)part->tail_sig));
        continue;
      }

      if (!part->sig)
        part->sig = f2_sig(part, pool);
      if (!part->sig)
        continue;               /* f2_sig() can fail and return NULL */

      sig = (const char *)part->sig;
      amp = strstr(sig, "&&");
      if (amp) {
        /* only the head of a compound signature is wanted here */
        f2_psu_sig_append(buf, sizeof buf, sig, (size_t)(amp - sig));
      } else {
        const char *flat = tabless(part->sig);

        if (flat)
          f2_psu_sig_append(buf, sizeof buf, flat, strlen(flat));
      }
    }
  }

  return npool_copy((unsigned char *)buf, pool);
}
/*
 * Increment the integer counter stored under key V in hash H, creating
 * it with the value 1 on first use.  New keys are copied into sig_pool.
 * If the counter cannot be allocated the event is reported on stderr
 * and the count is dropped.
 */
static void
incr_val(Hash_table *h, const unsigned char *v)
{
  int *counter = hash_find(h, v);

  if (counter) {
    *counter += 1;
    return;
  }

  /* the counter is an int, so size the allocation from the pointee */
  counter = malloc(sizeof *counter);
  if (!counter) {
    fprintf(stderr, "incr_val: out of memory\n");
    return;
  }

  *counter = 1;
  hash_add(h, npool_copy(v, sig_pool), counter);
}
void proj_init(struct run_context *runp, const char *project) { struct proj_context *p = hash_find(runp->known_projects, (unsigned char *)project); if (!p) { const char *o = NULL; p = calloc(1,sizeof(struct proj_context)); p->name = (char *)npool_copy((unsigned char *)project, runp->pool); hash_add(runp->known_projects, npool_copy((unsigned char *)project,runp->pool), p); p->xpd = xpd_init(project,runp->pool); p->owner = runp; if (xpd_option(p->xpd,"atf-saa-mode")) saa_mode = xpd_option_int(p->xpd,"atf-saa-mode"); o = xpd_option(p->xpd,"render-serial"); if (o && !strcmp(o, "yes")) odt_serial = 1; /*set_project(p, project);*/ } runp->proj = p; }
/*
 * If f2_form_signs() accepts the pair (fp->form, ref_fp->form), replace
 * fp->form with a pool copy of ref_fp->form, keeping the previous value
 * in fp->oform.
 *
 * Returns 1 when the form was replaced, 0 otherwise.
 */
int
f2_extreme_alias(struct sig_context *scp, struct f2 *fp, struct f2 *ref_fp)
{
  if (f2_form_signs(fp->form, ref_fp->form)) {
    fp->oform = fp->form;
    fp->form = npool_copy(ref_fp->form,scp->pool);
    if (verbose > 1)
      fprintf(stderr,"extreme aliased form %s => fp->form %s\n",fp->oform,fp->form);
    return 1;
  }

  return 0;
}
/*
 * Run STR through natf2utf() using the language of L's form and return
 * the converted text.
 *
 * A NULL STR is returned as is.  When the conversion changes nothing
 * the original pointer is returned; otherwise the result is a copy from
 * the xcl context's pool.  chartrie error reporting is suppressed for
 * the duration of the call and restored afterwards; curr_lang is set to
 * the form's language and left that way.
 */
static const unsigned char *
ilem_conv(struct xcl_l *l, const unsigned char *str)
{
  const unsigned char *converted = NULL;
  int saved_suppress;

  if (!str)
    return str;

  saved_suppress = chartrie_suppress_errors;
  curr_lang = l->f->lang;
  chartrie_suppress_errors = 1;

  converted = natf2utf((char*)str, (char*)str+strlen((char*)str), 0,
                       l->xc->file, l->lnum);
  if (strcmp((char*)converted, (char*)str) != 0)
    str = npool_copy(converted, l->xc->pool);

  chartrie_suppress_errors = saved_suppress;
  return str;
}
/* This routine should not set anything but FORM at the f2 level; that is the job of ilem_parse */ void lem_save_form(const char *ref, const char *lang, const char *formstr, struct lang_context *langcon) { struct ilem_form *form = mb_new(lemline_xcp->sigs->mb_ilem_forms); extern int curr_cell; form->ref = (char*)ref; if (lang) { form->f2.lang = (unsigned char*)lang; form->f2.core = langcore_of(lang); if (strstr(lang,"949")) BIT_SET(form->f2.flags,F2_FLAGS_LEM_BY_NORM); } if (BIT_ISSET(form->f2.flags,F2_FLAGS_LEM_BY_NORM)) { form->f2.norm = (unsigned char *)formstr; form->f2.form = (const unsigned char *)"*"; } else form->f2.form = (unsigned char *)formstr; form->file = (char*)file; form->lnum = lnum; form->lang = langcon; if (!ref[0]) return; if (!curr_lsp->forms_alloced || curr_lsp->forms_used == curr_lsp->forms_alloced) { curr_lsp->forms_alloced += 16; curr_lsp->forms = realloc(curr_lsp->forms, curr_lsp->forms_alloced*sizeof(struct ilem_form*)); curr_lsp->cells = realloc(curr_lsp->cells, curr_lsp->forms_alloced*sizeof(int)); if (curr_lsp->forms_used < 0) curr_lsp->forms_used = 0; } /* when curr_cell = 0 we are in a line with no cells; by definition, all content in such a line is in cell 2 (because cell 1 is the line number) */ curr_lsp->cells[curr_lsp->forms_used] = (curr_cell ? curr_cell : 2); curr_lsp->forms[curr_lsp->forms_used++] = form; hash_add(word_form_index,npool_copy((unsigned char*)ref,lemline_xcp->pool),form); }
void nsa_token(struct nsa_parser *p, enum nsa_ptypes type, void *ref, const char *s) { struct nsa_token *t = new_token(); if (type == NSA_P_STOP) { t->type = NSA_T_STOP; } else { unsigned char *s2 = npool_copy((const unsigned char *)s,p->pool), *brack; t->type = NSA_T_GRAPHEME; grapheme(t) = new_grapheme(); grapheme_overt(t) = 1; grapheme_text_ref(t) = new_text_ref(); grapheme_text_ref(t)->ptype = type; switch (type) { case NSA_P_LEMM: grapheme_text_ref(t)->t.lemmptr = ref; break; case NSA_P_LITERAL: grapheme_text_ref(t)->t.literal = ref; break; case NSA_P_LINK: grapheme_text_ref(t)->t.linkptr = ref; break; default: break; } if ((brack = (unsigned char *)strchr((const char *)s2,'(')) && (isdigit(*s2) || ((brack-s2)==1 && (*s2 == 'n' || *s2 == 'N')))) { grapheme_num(t) = (char *)s2; *brack++ = '\0'; grapheme_unit(t) = (char *)brack; while (*brack && ')' != *brack) ++brack; *brack = '\0'; } } list_add(p->toks,t); }
struct xcl_context * xcl_process(struct run_context *run, struct node *text) { struct xcl_context *xc = xcl_create(); char *langs = texttag_langs(); /* xc->system = xcl_get_global_context()->system; */ /*xc->cache = xcl_cache();*/ xc->run = run; xc->curr = xc->root = NULL; xc->langs = (char*)npool_copy((unsigned char*)langs,xc->pool); free(langs); xc->project = project; xc->textid = textid; xc->file = file; xc->sigs = sig_context_init(); process(xc,text); return xc; }
const unsigned char * note_register_tag(const unsigned char *tag, struct node *parent) { if (!tag) { if (notes_in_line) { struct note *last_np = list_last(notes_in_line); if (last_np) { int m = atoi((char*)last_np->tag); if (m > 0) { static char buf[10]; sprintf(buf, "%d", m+1); return note_register_tag((const unsigned char *)buf, parent); } else /* this is a stop-gap; it means that alpha notes can be done explicitly, but they'll get mixed with numeric marks if no mark is used in a #note: */ return note_register_tag((const unsigned char *)"1", parent); } else return note_register_tag((const unsigned char *)"1", parent); } else { return note_register_tag((const unsigned char *)"1", parent); } } if (note_find_in_line(tag)) { vwarning("note tag %s is used more than once in this line", tag); return NULL; } else { struct note *np = mb_new(mb); unsigned char *note_mark_text = NULL; struct node *note_mark_node = parent; if (note_index < 1000000) { unsigned char markbuf[8]; sprintf((char*)markbuf,"%d",note_index++); note_mark_text = npool_copy(markbuf, note_pool); } /* If there was a ^1^ tag in the line we need to replace the text content of the parent element here; otherwise, we have a fresh parent element and just need to append the text node */ if (note_mark_node->children.lastused) ((struct node*)(note_mark_node->children.nodes[0]))->data = note_mark_text; else appendChild(note_mark_node, textNode(note_mark_text)); np->tag = tag; np->mark = note_mark_text; np->node = note_mark_node; np->status = NOTE_REGISTERED; if (notes_in_line) list_add(notes_in_line, np); /* list_add(notes_in_text, np); */ return tag; } }
/* parent node is "current" node in block.c */ int note_parse_tlit(struct node *parent, int current_level, unsigned char **lines) { int nlines; struct node *n; char tagbuf[8], *m = tagbuf; unsigned char *notelabel = NULL, *notetext = NULL; const unsigned char *tag = NULL, *mark = NULL; *tagbuf = '\0'; lines[0] += 6; while (isspace(lines[0][0])) ++lines[0]; if ('^' == lines[0][0]) { struct note *np; /* the note should already be registered at the tag-point in the line */ ++lines[0]; while (lines[0][0] && '^' != lines[0][0]) { *m++ = lines[0][0]; ++lines[0]; } *m = '\0'; ++lines[0]; tag = (const unsigned char *)tagbuf; np = note_find_in_line(tag); if (np) { mark = np->mark; } else { warning("tag in note does not have corresponding tag in preceding line"); return 1; } } else { if (list_len(notes_in_line)) { warning("tagged notes cannot be mixed with untagged ones"); return 1; } else { struct node *lastC = note_attach_point(parent); /* If there is no note tag we have to do two things: fix the attach point and set the tag to "1" */ if (lastC) { struct node *xmark = NULL; enum e_type e; enum block_levels l; switch (lastC->etype) { case e_l: { struct node *lastCchild = lastChild(lastC); if (lastCchild && lastCchild->etype == e_c) { /* the attach point is either the cell or its chield field if there is one */ struct node *cField = lastChild(lastCchild); if (cField->etype == e_f) lastC = cField; else lastC = lastCchild; l = WORD; } else if (lastCchild && lastCchild->etype == e_f) { /* the attach point is the field */ lastC = lastCchild; l = WORD; } else l = LINE; e = e_g_nonw; } break; case e_composite: case e_score: case e_transliteration: l = TEXT; e = e_note_link; break; case e_object: l = OBJECT; e = e_note_link; break; case e_surface: l = SURFACE; e = e_note_link; break; case e_column: l = COLUMN; e = e_note_link; break; /* FIXME: THIS CAN'T BE RIGHT */ case e_variant: l = LINE; e = e_note_link; break; default: vwarning("unhandled note parent %s", 
lastC->names[0].pname); break; } xmark = elem(e,NULL,lnum,l); if (e == e_g_nonw) appendAttr(xmark, attr(a_type, (unsigned char *)"notelink")); appendChild(lastC, xmark); tag = "1"; mark = note_register_tag(tag, xmark); } else { warning("nowhere to attach note mark to; please provide context and mark"); tag = NULL; } } } if (tag) { while (isspace(lines[0][0])) ++lines[0]; if (!strncmp((char*)lines[0],"@notelabel{", 11)) { lines[0] += 11; notelabel = lines[0]; while (lines[0][0] != '}') ++lines[0]; lines[0][0] = '\0'; ++lines[0]; while (isspace(lines[0][0])) ++lines[0]; } n = elem(e_note_text,NULL,lnum,current_level); appendAttr(n, attr(a_note_mark, mark)); note_register_note(tag, n); if (notelabel) set_or_append_attr(n,a_note_label,"notelabel",notelabel); /* This is a bit weird, but the last character before the content is either a space after #note:, or a space or the closer character after a note mark or label, so we are safe to play this trick with the scan_comment routine */ --lines[0]; lines[0][0] = '#'; notetext = npool_copy(scan_comment_sub(lines,&nlines,0), note_pool); (void)trans_inline(n,notetext,NULL,0); appendChild(parent,n); } return nlines; }
/*
 * Parse the signature / lemmatization string LP into the fields of F2P.
 *
 * LP is parsed destructively: field separators are overwritten with NUL
 * and most f2 members end up pointing into LP itself.  A few values are
 * copied into f2_pool instead: the GW when it contains a backslash
 * designator, the "V/i" / "V/t" EPOS derived from that designator, and
 * the FORM after an augment or disambiguator has been appended to it.
 *
 * Stages, in order:
 *  - a leading '`' (optionally followed by '?') is warned about and
 *    skipped;
 *  - a string containing "&&" is handed to f2_parse_cof() and parsing
 *    jumps straight to the clean-up at `ret';
 *  - a leading '@' introduces the admin fields @PROJ%LANG:FORM=;
 *  - a string with no '[' gets GW "X", an optional one-letter POS
 *    ('n' or 'u') and continues at `pos_parse';
 *  - otherwise CF[GW//SENSE]POS'EPOS is parsed, including a quoted CF,
 *    a trailing "(restrictor)" on the CF and a "+=" PSU sense, which is
 *    reported through *PSU_SENSE when PSU_SENSE is non-NULL;
 *  - the remaining instance fields are dispatched on their leading
 *    character: @ % : $ / + # ## * and backslash;
 *  - POS, EPOS and BASE are validated, NORM handling is applied, and a
 *    backslash designator in the GW (only 'i' or 't', only with verb
 *    POS/EPOS) is turned into an EPOS;
 *  - at `ret' an empty GW is defaulted to the SENSE or to "1", CF, GW
 *    and SENSE are cleaned, and F2_FLAGS_IS_PSU is set when the CF
 *    contains a space.
 *
 * While it runs the function sets the globals phase = "f2" and
 * with_textid = 0; both are restored before the normal return.  A
 * strdup() copy of LP (err_lp) is kept for diagnostics and freed at
 * `ret'.
 *
 * Returns -1 when an error set ret negative; otherwise the number of
 * bytes of LP consumed (lp - orig_lp).  A NULL LP returns 1 before any
 * state is touched.
 *
 * NOTE(review): several strchr() results are used without a NULL check
 * (the '=' after @PROJ%LANG:FORM, and lp itself after a missing '%' or
 * ':'), so a malformed admin prefix looks able to reach a NULL
 * dereference -- confirm.
 * NOTE(review): the default case of the field switch does ++ret rather
 * than ret = -1, so that parse error appears to be reported as a length
 * instead of -1 -- confirm.
 * NOTE(review): the augment/disambiguator code formats into fixed
 * 1024-byte buffers with sprintf -- presumably FORM is always short
 * enough; verify.
 */
int f2_parse(const Uchar *file, size_t line, Uchar *lp, struct f2 *f2p, Uchar **psu_sense, struct sig_context *scp) { Uchar *tmp = NULL, *err_lp = NULL, *disambig = NULL, *ampamp = NULL, *orig_lp = lp, field = '\0', *psu_tmp = NULL; int ret = 0; const char *saved_phase = phase; int square, saved_with_textid = with_textid; if (!lp) return 1; /* err_lp = npool_copy(lp, scp->pool); */ err_lp = (Uchar*)strdup((char*)lp); phase = "f2"; with_textid = 0; /* skip the old shadow lem codes */ if (*lp == '`') { vwarning2((char*)file,line,"%s: please remove deprecated shadow lem sequence '`' or '`?'",err_lp); lp += 1 + (lp[1] == '?'); } if ((ampamp = (unsigned char*)strstr((char*)lp, "&&"))) { f2_parse_cof(file, line, lp, f2p, psu_sense, ampamp, scp); goto ret; } /* if the sig starts with @ parse the admin fields @PROJ%LANG:FORM= * first. * * N.B.: % and : are not recognized by field_end() as this causes * problems parsing morphology. */ if ('@' == *lp) { f2p->project = lp+1; lp = (Uchar*)strchr((char*)lp,'%'); if (lp) { *lp++ = '\0'; f2p->lang = lp; f2p->core = langcore_of((const char*)lp); lp = (Uchar*)strchr((char*)lp,':'); if (lp) { *lp++ = '\0'; f2p->form = lp; lp = (Uchar*)strchr((char*)lp,'='); *lp++ = '\0'; } } } if ('[' == *lp) { /* FIXME: this needs to be more rigorous and check for CF-legal char in initial position */ vwarning2((char*)file,line,"%s: lemmatization cannot begin with '['",err_lp); goto ret; } if (!strchr((const char *)lp,'[')) { /* f2p->cf = "X"; */ f2p->gw = (unsigned char *)"X"; if (*lp == 'n') { f2p->pos = "n"; ++lp; } else if (*lp == 'u') { f2p->pos = "u"; ++lp; } goto pos_parse; } /* parse the CF[GW/SENSE]POS'EPOS which are constant: */ f2p->cf = lp; if (*lp == '"') { ++lp; f2p->cf = lp; /* don't include quotes in the CF; WATCHME: what happens in post-cache retrieval parse? 
*/ BIT_SET(f2p->flags,F2_FLAGS_CF_QUOTED); /* fp->explicit |= NEW_CF; */ /* have to do something here, i.e., suppress charset translation */ } while (*lp && (*lp != '[' || lp[-1] == '\\')) ++lp; if (BIT_ISSET(f2p->flags,F2_FLAGS_CF_QUOTED)) { if (lp[-1] == '"') lp[-1] = '\0'; else { vwarning2((char*)file,line,"%s: '\"' missing on quoted CF",err_lp); ret = -1; } } if (lp) *lp = '\0'; else goto ret; if (lp[-1] == ')' && lp[-2] != '\\') { char *oparen = strchr((char*)f2p->cf,'('); if (oparen && oparen[-1] != '\\') { *oparen++ = '\0'; f2p->restrictor = (unsigned char*)oparen; lp[-1] = '\0'; } else { vwarning2((char*)file,line,"%s: '(' missing on restrictor",err_lp); ret = -1; goto ret; } } else if ((tmp = (unsigned char*)strchr((char*)f2p->cf,'(')) && tmp[-1] != '\\') { vwarning2((char*)file,line,"%s: ')' missing on restrictor",err_lp); ret = -1; goto ret; } f2p->gw = ++lp; if ((psu_tmp = (Uchar *)strstr(cc(lp),"+="))) { *psu_tmp = '\0'; psu_tmp += 2; if (psu_sense) *psu_sense = psu_tmp; psu_tmp = (Uchar *)strchr((const char *)psu_tmp,']'); } /* make SENSE optional here to support inline lem parsing */ square = 0; while (*lp && (*lp != '/' || lp[1] != '/')) { if (*lp == '[' && lp[-1] != '\\') ++square; else if (*lp == ']' && lp[-1] != '\\') { if (square) --square; else break; } ++lp; } /* If we didn't find ] but had a psu_sense with +=, reset lp to the closing square bracket after the psu_sense */ if (!*lp && psu_tmp) lp = psu_tmp; if (*lp) { if ('/' == *lp) { *lp++ = '\0'; ++lp; f2p->sense = lp; square = 0; while (*lp) { if (*lp == '[' && lp[-1] != '\\') ++square; else if (*lp == ']' && lp[-1] != '\\') { if (square) --square; else break; } ++lp; } } if (*lp) /* lp is at closing square bracket of CF[GW] */ { *lp++ = '\0'; /* This is either a POS or something that starts with a field char */ pos_parse: if (isupper(*lp)) { Uchar *end = NULL; Uchar *epos = NULL; for (end = lp; *end && !isspace(*end); ++end) ; epos = (Uchar*)strchr((const char *)lp,'\''); f2p->pos = lp; if 
(epos && epos < end) lp = epos; else lp = field_end(lp); if (*f2p->pos == 'V' && '/' == *lp && (lp[1] == 't' || lp[1] == 'i')) { ++lp; epos = (Uchar*)strchr((const char *)lp,'\''); if (epos && epos < end) lp = epos; else lp = field_end(lp); } field = *lp; if (field == '\'') { *lp++ = '\0'; f2p->epos = lp; lp = field_end(lp); if (*f2p->epos == 'V' && '/' == *lp && (lp[1] == 't' || lp[1] == 'i')) { ++lp; lp = field_end(lp); } field = *lp; *lp++ = '\0'; } else *lp++ = '\0'; } else if (*lp == '\'') { *lp++ = '\0'; f2p->epos = lp; lp = field_end(lp); field = *lp; *lp++ = '\0'; } else if (*lp) { field = *lp; *lp++ = '\0'; } /* Now we are at a variable set of instance fields; parse as though they can be in any order, though in principle the order should always be fixed. */ while (*lp) { switch (field) { #if 0 /* this must follow POS and ' is no longer a field * ender because of conflict with ' in M1 */ case '\'': f2p->epos = lp; break; #endif case '@': f2p->project = lp; break; case '%': f2p->lang = lp; break; case ':': f2p->form = lp; break; case '$': if (!BIT_ISSET(f2p->flags, F2_FLAGS_LEM_BY_NORM)) f2p->norm = lp; /* else ignore normalization because we got it from the "FORM" */ break; case '/': f2p->base = lp; break; case '+': if (*lp == '-') f2p->cont = lp; else if (*lp == '.') f2p->augment = lp; else vwarning2((char*)file,line,"%s: '+' in signature should be followed by '-' or '.'", err_lp); ++lp; break; case '#': if (*lp == '#') { ++lp; f2p->morph2 = lp; } else f2p->morph = lp; break; case '*': f2p->stem = lp; break; case '\\': disambig = lp; while (isalnum(*lp) || '\\' == *lp) ++lp; break; case '<': case ' ': case '\t': case 0: goto break_switch_loop; default: vwarning2((char*)file,line,"%s: parse error at '%c'", err_lp, field); ++ret; goto ret; } lp = field_end(lp); if (*lp) { field = *lp; *lp++ = '\0'; } } } } break_switch_loop: validate_pos((const char *)file, line, f2p->pos); validate_pos((const char *)file, line, f2p->epos); if (f2p->base) 
validate_base((const char *)file, line, f2p->base); #if 0 /* If lp is non-zero we didn't manage to parse the entire form: */ if (*lp) { vwarning2((char*)file,line,"%s: bad tense designator: only allowed with verb POS",err_lp); ret = -1; goto ret; } #endif /* field == '$' occurs when $ is the end of the lem, e.g., ana[to]PRP$ */ /* FIXME: THIS IS A POOR TEST BECAUSE IT FAILS ON ]N$#M1 */ if (field == '$') { if (!f2p->norm || !*f2p->norm) f2p->norm = f2p->cf; } if (BIT_ISSET(f2p->flags, F2_FLAGS_LEM_BY_NORM)) { if (f2p->norm && f2p->cf && !strcmp((char*)f2p->cf,(char*)f2p->norm)) { BIT_SET(f2p->flags, F2_FLAGS_NORM_IS_CF); f2p->cf = NULL; } } if (f2p->gw) { char *bs = strchr((char*)f2p->gw, '\\'); if (bs) { unsigned char *gwtmp = npool_copy(f2p->gw, f2_pool); bs = (char *)(gwtmp + (bs - (char*)f2p->gw)); *bs++ = '\0'; f2p->gw = gwtmp; if (*bs == 'i' || *bs == 't') { if ((!f2p->pos || *f2p->pos == 'V') && (!f2p->epos || *f2p->epos == 'V')) { if (*bs == 'i') f2p->epos = npool_copy((unsigned char *)"V/i",f2_pool); else f2p->epos = npool_copy((unsigned char *)"V/t",f2_pool); } else { vwarning2((char*)file,line,"%s: bad designator: only allowed with verb POS",err_lp); ret = -1; goto ret; } } else { vwarning2((char*)file,line,"%s: bad designator: only 'i' or 't' allowed",err_lp); ret = -1; goto ret; } } } ret: free(err_lp); /* '<' == *lp must be dealt with by caller */ if (isspace(*lp)) *lp++ = '\0'; if (!f2p->gw || !*f2p->gw) { if (f2p->sense && *f2p->sense) f2p->gw = f2p->sense; else f2p->gw = (unsigned char *)"1"; } clean_cf((char*)file, line, (unsigned char *)f2p->cf); clean_gw_sense((char*)file, line, (unsigned char *)f2p->gw); if (f2p->sense) clean_gw_sense((char*)file, line, (unsigned char *)f2p->sense); if (f2p->augment) { char buf[1024]; sprintf(buf,"%s%s%s",f2p->form,AUGMENT_STR,f2p->augment); f2p->form = npool_copy((unsigned char *)buf,f2_pool); } if (disambig) { char buf[1024]; if (*disambig == *(DISAMBIG_STR)) sprintf(buf,"%s%s",f2p->form,disambig); else 
sprintf(buf,"%s%s%s",f2p->form,DISAMBIG_STR,disambig); f2p->form = npool_copy((unsigned char *)buf,f2_pool); } if (f2p->cf && strchr((char*)f2p->cf,' ')) BIT_SET(f2p->flags, F2_FLAGS_IS_PSU); phase = saved_phase; with_textid = saved_with_textid; return (ret < 0) ? -1 : (lp - orig_lp); }
/*
 * Record KEY in the est hash: the index key is the result of running
 * KEY through keymangler() with the estmangle rules, the stored value
 * is KEY itself.  Both strings are copied into the est's own pool.
 */
void
est_add(const unsigned char *key, struct est *estp)
{
  const unsigned char *mangled = keymangler(key, estmangle, NULL, 0,
                                            NULL,NULL);

  hash_add(estp->h,
           npool_copy(mangled, estp->p),
           npool_copy(key, estp->p));
}
/* caller should now resolve word_id against word_form_index before calling and pass the result as form arg if non-NULL; NULL arg means form is embedded in lemma */ void ilem_parse(struct xcl_context *xc, struct ilem_form *master_formp) { unsigned char *lem; int newflag = 0; extern const char *phase; unsigned char *lemma = NULL; #define LANGBUF_LEN 32 char langbuf[LANGBUF_LEN+1]; #if 0 #define FORMBUF_LEN 128 char formbuf[FORMBUF_LEN+1]; #endif struct xcl_l *master_lp = NULL; if (!xc) { vwarning("internal error: ilem_parse called with NULL args"); return; } if (!master_formp) { /* this can happen after ATF parse errors */ return; } phase = "lem"; /*#define lemma (master_formp->literal)*/ if (master_formp->literal) { lemma = npool_copy((unsigned char *)master_formp->literal, xc->pool); } else { struct xcl_l*lp = xcl_lemma(xc,NULL,master_formp->ref,NULL,NULL,0); lp->lnum = master_formp->lnum; lp->f = master_formp; lp->inst = make_inst(xc,lp->f); phase = NULL; return; } if (NULL == master_formp->f2.lang) { if ('%' == *lemma) { char *langbufp = langbuf; for (++lemma; *lemma != ':' && *lemma != '-'; ) { if (langbufp - langbuf == LANGBUF_LEN) { langbuf[LANGBUF_LEN] = '\0'; vwarning2(file,lnum,"[91]: lang starting with '%s' is too long (MAX %d)",langbuf,LANGBUF_LEN); phase = NULL; return; } else *langbufp++ = *lemma++; } if ('-' == *lemma) { while (*lemma && ':' != *lemma) ++lemma; if (!*lemma) { vwarning2(file,lnum,"[92]: lang starting with '%s' has no ':'",langbuf); phase = NULL; return; } } } else { vwarning2(file,lnum,"[96]: no lang set for form"); phase = NULL; return; } master_formp->f2.lang = npool_copy((unsigned char *)langbuf,xc->pool); master_formp->f2.core = langcore_of(langbuf); } else if ('%' == *lemma && '%' != lemma[1]) { while (*lemma && ':' != *lemma) ++lemma; if (':' != *lemma) { vwarning2(file,lnum,"lang has no ':'"); return; } ++lemma; } #if 0 /* In L1 this routine had to handle lems with a form prepended and separated by * (not = , because that 
conflicts with = in ASCII macron). This is no longer the case in L2 */ if (NULL == master_formp->f2.form) { char *formbufp = formbuf; while (*lemma != '*') { if (formbufp - formbuf == FORMBUF_LEN) { formbuf[10] = '\0'; vwarning2(file,lnum,"[94]: form starting '%s' is too long (MAX %d)",formbuf,FORMBUF_LEN); phase = NULL; return; } *formbufp++ = *lemma++; } if ('*' != *lemma) { formbuf[10] = '\0'; vwarning2(file,lnum,"[95]: form starting '%s' has no '*'",formbuf,FORMBUF_LEN); phase = NULL; return; } ++lemma; } #endif /* Now we know that lemma points to the start of the lemmatization */ lem_init((const unsigned char *)lemma); /* This outer loop splits on '&' */ while (1) { struct xcl_l*lp; int alt_count = 0; int iflags = 0; struct ilem_form *curr_f = NULL; lem = lem_next(xc); if (!lem) break; lp = xcl_lemma(xc,NULL,master_formp->ref,NULL,NULL,0); lp->inst = master_formp->literal; lp->lnum = lnum; lp->ante_para = ilem_para_parse(xc, lem,&lem,master_formp->lnum, ilem_para_pos_ante); if (lem) { unsigned char *post = NULL; while (isspace(*lem)) ++lem; post = lem_end(lem); lp->post_para = ilem_para_parse(xc, post,NULL,master_formp->lnum, ilem_para_pos_post); if (isspace(*post)) { while (post > lem && isspace(post[-1])) --post; *post = '\0'; } ilem_para_boundaries(lp,xc); } else { vwarning2(file,master_formp->lnum,"[96]: lem `%s' failed syntax stripping",lem); break; } alt_init(lem); if (master_formp->mcount) { struct ilem_form *mrover = NULL; /*lp->f = NULL;*/ /* NEW ILEM_FORM form_allocator();*/ lp->f = mb_new(xc->sigs->mb_ilem_forms); lp->f->newflag = newflag; lp->f->f2.lang = master_formp->f2.lang; lp->f->f2.core = master_formp->f2.core; lp->f->mcount = -1; if (master_formp->mcount == 1) { master_formp->type = "cof-head"; master_lp->cof_tails = list_create(LIST_SINGLE); } lp->f->type = "cof-tail"; lp->cof_head = master_lp; list_add(lp->cof_head->cof_tails, lp); ++master_formp->mcount; /* efficiency doesn't matter here as we will have relatively few of these */ for 
(mrover = master_formp; mrover->multi; mrover = mrover->multi) ; mrover->multi = lp->f; /*lp->f->master = master_formp;*/ lp->f->file = master_formp->file; lp->f->lnum = master_formp->lnum; lp->ref = lp->f->ref = master_formp->ref; lp->f->f2.form = master_formp->f2.form; lp->f->literal = NULL; } else { lp->f = master_formp; lp->f->mcount = 1; lp->f->newflag = newflag; lp->ref = lp->f->ref; lp->f->type = NULL; master_lp = lp; } lp->f->instance_flags = iflags; /* This inner loop splits on '|'; it is where each lemma is actually handled */ while (1) { lem = alt_next(xc); if (!lem) break; iflags = 0; while (lem_iflags[*lem]) { switch (*lem) { case '+': ++lem; /*newflag = !ignore_plus; */ BIT_SET(iflags, F2_FLAGS_LEM_NEW); break; case '!': ++lem; BIT_SET(iflags, F2_FLAGS_PSU_STOP); break; case '-': ++lem; BIT_SET(iflags, F2_FLAGS_PSU_SKIP); break; case '`': lem = (unsigned char *)"X"; break; } } if (bootstrap_mode && !BIT_ISSET(iflags, F2_FLAGS_LEM_NEW)) BIT_SET(iflags, F2_FLAGS_LEM_NEW); if (BIT_ISSET(iflags,F2_FLAGS_LEM_NEW)) { char *tmp = malloc(strlen(lem) + 2); sprintf(tmp, "+%s", lem); lem = npool_copy(tmp, xc->pool); free(tmp); } if (alt_count++) { struct ilem_form *last_alt = NULL, *f = NULL; if (!lem) break; /*f->f2 = NULL form_allocator();*/ f = mb_new(xc->sigs->mb_ilem_forms); /* f->newflag = newflag; */ lp->f->ref = master_formp->ref; f->f2.lang = master_formp->f2.lang; f->f2.core = master_formp->f2.core; f->f2.form = master_formp->f2.form; if (BIT_ISSET(iflags, F2_FLAGS_LEM_NEW)) { BIT_SET(f->f2.flags, F2_FLAGS_LEM_NEW); if ('+' == *lem) /* should always be true */ ++lem; } f->lnum = master_formp->lnum; f->file = master_formp->file; f->instance_flags = iflags; f->sublem = (char*)npool_copy(lem,xc->pool); /* link this into the master_formp */ for (last_alt = master_formp; last_alt->ambig; last_alt = last_alt->ambig) ; curr_f = last_alt->ambig = f; } else { lp->f->sublem = (char*)npool_copy(lem,xc->pool); curr_f = lp->f; if (BIT_ISSET(iflags, 
F2_FLAGS_LEM_NEW)) { BIT_SET(curr_f->f2.flags, F2_FLAGS_LEM_NEW); if ('+' == *lem) /* should always be true */ ++lem; } } /* Instance parsing cannot result in a form with && being processed using f2_parse_cof, so we can just pass a NULL final argument */ f2_parse((Uchar*)lp->f->file, lp->f->lnum, lem, &curr_f->f2, (Uchar**)&curr_f->psu_sense, NULL); if (check_cf((char*)lp->f->file, lp->f->lnum, (char*)curr_f->f2.cf)) BIT_SET(curr_f->f2.flags, F2_FLAGS_INVALID); if (curr_f->lang) { curr_lang = curr_f->lang; if (!BIT_ISSET(curr_f->f2.flags,F2_FLAGS_CF_QUOTED)) curr_f->f2.cf = ilem_conv(lp,curr_f->f2.cf); curr_f->f2.norm = ilem_conv(lp,curr_f->f2.norm); curr_f->f2.base = ilem_conv(lp,curr_f->f2.base); curr_f->f2.cont = ilem_conv(lp,curr_f->f2.cont); } curr_f->sublem = make_inst(xc,curr_f); } } }
static void eH(void *userData, const char *name) { static int defined = 1; if (!strcmp(name,"key")) { const unsigned char *k = (const unsigned char *)charData_retrieve(); if (!hash_find(context->syskeys,k)) hash_add(context->syskeys,npool_copy(k,context->cpool),curr_system); } else if (!strcmp(name,"det")) { const unsigned char *d = (const unsigned char *)charData_retrieve(); if (!hash_find(context->sysdets,d)) hash_add(context->sysdets,npool_copy(d,context->cpool),curr_system); } else if (!strcmp(name,"och")) { const unsigned char *d = (const unsigned char *)charData_retrieve(); List *l; if (!(l = hash_find(context->comheads,d))) { l = list_create(LIST_SINGLE); list_add(l,(void*)curr_comhead_sys); hash_add(context->comheads,npool_copy(d,context->cpool),(void*)l); } else { list_add(l,(void*)curr_comhead_sys); } } else if (!strcmp(name,"gal2")) { const unsigned char *d = (const unsigned char *)charData_retrieve(); if (!hash_find(context->gal2_tokens,d)) hash_add(context->gal2_tokens,npool_copy(d,context->cpool),&defined); } else if (!strcmp(name,"igi")) { const unsigned char *d = (const unsigned char *)charData_retrieve(); if (!hash_find(context->igigal_keys,d)) hash_add(context->igigal_keys,npool_copy(d,context->cpool),last_u); } else if (!strcmp(name,"la2")) { const unsigned char *d = (const unsigned char *)charData_retrieve(); if (!hash_find(context->la2_tokens,d)) hash_add(context->la2_tokens,npool_copy(d,context->cpool),&defined); } else if (!strcmp(name,"suffix")) { const unsigned char *d = (const unsigned char *)charData_retrieve(); if (!hash_find(context->morph_suffixes,d)) hash_add(context->morph_suffixes,npool_copy(d,context->cpool),&defined); } else if (!strcmp(name,"sexfrac")) { const unsigned char *d = (const unsigned char *)charData_retrieve(); if (!hash_find(context->sexfracs,d)) hash_add(context->sexfracs,npool_copy(d,context->cpool),&defined); } }
static void sH(void *userData, const char *name, const char **atts) { if (name[22] == 'f' && (!strcmp(name, "http://oracc.org/ns/xtf/1.0:transliteration") || !strcmp(name, "http://oracc.org/ns/xtf/1.0:composite"))) { strcpy(curr_project, findAttr(atts,"project")); strcpy(curr_text_id, get_xml_id(atts)); } else { const char *utf8 = findAttr(atts,"http://oracc.org/ns/gdl/1.0:utf8"); if (*utf8) { static wchar_t wbuf[128]; static size_t n, i; char sbuf[512], xbuf[1024], *hex; unsigned char *sn = NULL; const char *form = NULL; if (*(const unsigned char *)utf8 > 127) { n = mbstowcs(wbuf,utf8,128); hex = malloc(n * 8); *hex = '\0'; for (i = 0; i < n; ++i) { if (i) strcat(hex, "."); sprintf(hex+strlen(hex),"x%05X",wbuf[i]); } } else { hex = strdup(utf8); } if (!(sn = psl_hex_to_sign(hex))) sn = (unsigned char *)hex; sprintf(sbuf,"%s:%s",hex,sn); sprintf(xbuf,"%s:%s:%s:%s",curr_project,curr_text_id,hex,sn); free(hex); if (!(curr_sig_hash = hash_find(signiary,(unsigned char*)sbuf))) { curr_sig_hash = hash_create(1); hash_add(signiary,npool_copy((unsigned char*)sbuf,sig_pool),curr_sig_hash); } incr_val(curr_sig_hash, (const unsigned char *)"#count"); ++total_sign_instances; if (!(curr_hash = hash_find(pertext,(unsigned char*)xbuf))) { curr_hash = hash_create(1); hash_add(pertext,npool_copy((unsigned char*)xbuf,sig_pool),curr_hash); } incr_val(curr_hash, (const unsigned char *)"#count"); form = findAttr(atts, "form"); if (form && *form) { incr_val(curr_sig_hash, (const unsigned char *)form); incr_val(curr_hash, (const unsigned char *)form); curr_sig_hash = curr_hash = NULL; } } } }
void lem_save_line(unsigned char *lp) { last_lsp = curr_lsp = new_lsp(); curr_lsp->line = npool_copy(lp,lemline_xcp->pool); }