/*
 * Register the n-gram of a PSU line with the current language's run-time
 * signature set (currlang->runsigs), creating that set on first use.
 *
 * psuline is modified in place: the n-gram is NUL-terminated at the start
 * of any whitespace that precedes the separator.  Lines that contain no
 * separator are ignored.
 */
void
lem_psu(char *psuline)
{
  char *psu_ngram, *psu_sig;

  if (!currlang->runsigs) {
    currlang->runsigs = mb_new(lemline_xcp->sigs->mb_sigsets);
    /* other runsigs initialization */
  }

  /* Skip the first whitespace-delimited token and the blanks after it;
     what follows is the n-gram */
  psu_ngram = psuline;
  while (*psu_ngram && !isspace((unsigned char)*psu_ngram))
    ++psu_ngram;
  while (isspace((unsigned char)*psu_ngram))
    ++psu_ngram;

  /* psu_sig must be located before it is tested; reading it unset is
     undefined behaviour.
     NOTE(review): the two-character separator is assumed to be "=>";
     confirm against the PSU line syntax. */
  psu_sig = strstr(psu_ngram, "=>");
  if (psu_sig) {
    char *psu_ngram_end = psu_sig;

    /* Trim trailing whitespace from the n-gram and terminate it */
    while (psu_ngram_end > psu_ngram
           && isspace((unsigned char)psu_ngram_end[-1]))
      --psu_ngram_end;
    *psu_ngram_end = '\0';

    /* Step over the separator to the signature text.  The signature is
       located but not handed on: psu_register receives NULL as its last
       argument. */
    psu_sig += 2;
    while (isspace((unsigned char)*psu_sig))
      ++psu_sig;

    psu_register(file,lnum,currlang->runsigs,currlang,psu_ngram,NULL);
  }
}
/* This routine should not set anything but FORM at the f2 level; that is the job of ilem_parse */ void lem_save_form(const char *ref, const char *lang, const char *formstr, struct lang_context *langcon) { struct ilem_form *form = mb_new(lemline_xcp->sigs->mb_ilem_forms); extern int curr_cell; form->ref = (char*)ref; if (lang) { form->f2.lang = (unsigned char*)lang; form->f2.core = langcore_of(lang); if (strstr(lang,"949")) BIT_SET(form->f2.flags,F2_FLAGS_LEM_BY_NORM); } if (BIT_ISSET(form->f2.flags,F2_FLAGS_LEM_BY_NORM)) { form->f2.norm = (unsigned char *)formstr; form->f2.form = (const unsigned char *)"*"; } else form->f2.form = (unsigned char *)formstr; form->file = (char*)file; form->lnum = lnum; form->lang = langcon; if (!ref[0]) return; if (!curr_lsp->forms_alloced || curr_lsp->forms_used == curr_lsp->forms_alloced) { curr_lsp->forms_alloced += 16; curr_lsp->forms = realloc(curr_lsp->forms, curr_lsp->forms_alloced*sizeof(struct ilem_form*)); curr_lsp->cells = realloc(curr_lsp->cells, curr_lsp->forms_alloced*sizeof(int)); if (curr_lsp->forms_used < 0) curr_lsp->forms_used = 0; } /* when curr_cell = 0 we are in a line with no cells; by definition, all content in such a line is in cell 2 (because cell 1 is the line number) */ curr_lsp->cells[curr_lsp->forms_used] = (curr_cell ? curr_cell : 2); curr_lsp->forms[curr_lsp->forms_used++] = form; hash_add(word_form_index,npool_copy((unsigned char*)ref,lemline_xcp->pool),form); }
static void set_instance_fields(struct xcl_context *xc, struct ML *mlp) { const char *lastw = ""; int i; static char formbuf[128], normbuf[128]; List *parts = list_create(LIST_SINGLE); *formbuf = *normbuf = '\0'; for (i = 0; i < mlp->matches_used; ++i) { /* Should we be discriminating about which match of matches[i].matches[] we are using for this? */ struct f2 *lform = mlp->matches[i].matching_f2s[0]; struct f2 *clone = mb_new(xc->sigs->mb_f2s); /* This is a shallow clone; we only need it so we can set the flags locally */ *clone = *lform; list_add(parts, lform); if (strcmp(lastw,mlp->matches[i].lp->ref)) { if (*formbuf) strcat(formbuf, " "); strcat(formbuf,(char*)lform->form); } else BIT_SET(lform->flags,F2_FLAGS_SAME_REF); lastw = mlp->matches[i].lp->ref; if (lform->norm) { if (*normbuf) strcat(normbuf, " "); strcat(normbuf,(char*)lform->norm); } } mlp->matches[0].psu_form->form = (unsigned char*)formbuf; mlp->matches[0].psu_form->norm = (unsigned char*)normbuf; mlp->matches[0].psu_form->file = (unsigned char*)mlp->matches[0].lp->f->file; mlp->matches[0].psu_form->lnum = mlp->matches[0].lp->f->lnum; mlp->matches[0].psu_form->parts = (struct f2**)list2array(parts); list_free(parts, NULL); }
/*
 * Parse a compound orthographic form (COF) written as "head&&tail&&...".
 *
 * The head is parsed into the top-level f2 (f2p); each tail is parsed into
 * a freshly allocated f2 stored in f2p->parts, and the parts array is
 * NULL-terminated.  Head and tails all get cof_id set to the address of
 * the head f2, and are flagged F2_FLAGS_COF_HEAD / F2_FLAGS_COF_TAIL.
 *
 *   file, line  passed through to f2_parse
 *   lp          start of the COF text; modified in place (each "&&" has
 *               its first byte overwritten with NUL)
 *   f2p         receives the head and the parts array
 *   psu_sense   accepted but not used by this routine
 *   ampamp      position of the first "&&" in lp, or NULL if there is none
 *   scp         supplies the memory blocks for the parts array and tails
 */
void
f2_parse_cof(const Uchar *file, size_t line, Uchar *lp, struct f2 *f2p,
             Uchar **psu_sense, Uchar *ampamp, struct sig_context *scp)
{
  List *cofs = list_create(LIST_SINGLE);
  Uchar *form = NULL;
  int i = 0;

  /* Split lp on "&&", collecting the start of every component */
  while (1) {
    list_add(cofs, lp);
    if (!ampamp)
      break;
    *ampamp = '\0';
    ampamp += 2;
    lp = ampamp;
    ampamp = (unsigned char *)strstr((char*)lp,"&&");
  }

  /* One slot per component: the head does not occupy a slot, so this is
     room for every tail plus the NULL terminator */
  f2p->parts = mb_new_array(scp->mb_f2ps, list_len(cofs));

  /* Parse the head in the top-level f2 structure */
  form = list_first(cofs);
  f2_parse(file,line,form,f2p,NULL,NULL);
  f2p->cof_id = (uintptr_t)f2p;
  BIT_SET(f2p->flags, F2_FLAGS_COF_HEAD);

  /* Now parse the tails into the parts array */
  for (i = 0, form = list_next(cofs); form; form = list_next(cofs), ++i) {
    f2p->parts[i] = mb_new(scp->mb_f2s);
    f2_parse(file,line,form,f2p->parts[i],NULL,NULL);
    f2p->parts[i]->cof_id = (uintptr_t)f2p;
    BIT_SET(f2p->parts[i]->flags, F2_FLAGS_COF_TAIL);
  }
  f2p->parts[i] = NULL;

  /* The list only held pointers into lp; release the list itself so it
     is not leaked on every call */
  list_free(cofs, NULL);
}
void sigs_cache_init(struct sigset *sp) { #if 0 /* This is old cache handling; these days we autoload missing languages and we have per-lang cache rather than using sp->file == "cache" */ if (!sp->file) return; if (!strcmp((const char *)sp->file, "cache")) { fprintf(stderr, "sigs_cache: can't cache a cache\n"); return; } #endif if (!sigs_cache_pool) { sigs_cache_pool = npool_init(); sigs_cache_pool_refs = 0; } ++sigs_cache_pool_refs; sp->cache = mb_new(sp->owner->mb_sigsets); sp->cache->project = sp->project; sp->cache->lang = sp->lang; sp->cache->core = sp->core; sp->cache->file = (const unsigned char *)"cache"; sp->cache->fmem = NULL; sp->cache->lines = NULL; sp->cache->forms = hash_create(1); sp->cache->norms = hash_create(1); sp->cache->bigrams_hash = NULL; sp->cache->mdsets_hash = NULL; sp->cache->cache = NULL; sp->cache->bigrams = NULL; sp->cache->psus = NULL; sp->cache->owner = sp->owner; sp->cache->loaded = 1; }
const unsigned char * note_register_tag(const unsigned char *tag, struct node *parent) { if (!tag) { if (notes_in_line) { struct note *last_np = list_last(notes_in_line); if (last_np) { int m = atoi((char*)last_np->tag); if (m > 0) { static char buf[10]; sprintf(buf, "%d", m+1); return note_register_tag((const unsigned char *)buf, parent); } else /* this is a stop-gap; it means that alpha notes can be done explicitly, but they'll get mixed with numeric marks if no mark is used in a #note: */ return note_register_tag((const unsigned char *)"1", parent); } else return note_register_tag((const unsigned char *)"1", parent); } else { return note_register_tag((const unsigned char *)"1", parent); } } if (note_find_in_line(tag)) { vwarning("note tag %s is used more than once in this line", tag); return NULL; } else { struct note *np = mb_new(mb); unsigned char *note_mark_text = NULL; struct node *note_mark_node = parent; if (note_index < 1000000) { unsigned char markbuf[8]; sprintf((char*)markbuf,"%d",note_index++); note_mark_text = npool_copy(markbuf, note_pool); } /* If there was a ^1^ tag in the line we need to replace the text content of the parent element here; otherwise, we have a fresh parent element and just need to append the text node */ if (note_mark_node->children.lastused) ((struct node*)(note_mark_node->children.nodes[0]))->data = note_mark_text; else appendChild(note_mark_node, textNode(note_mark_text)); np->tag = tag; np->mark = note_mark_text; np->node = note_mark_node; np->status = NOTE_REGISTERED; if (notes_in_line) list_add(notes_in_line, np); /* list_add(notes_in_text, np); */ return tag; } }
/* caller should now resolve word_id against word_form_index before calling and pass the result as form arg if non-NULL; NULL arg means form is embedded in lemma */ void ilem_parse(struct xcl_context *xc, struct ilem_form *master_formp) { unsigned char *lem; int newflag = 0; extern const char *phase; unsigned char *lemma = NULL; #define LANGBUF_LEN 32 char langbuf[LANGBUF_LEN+1]; #if 0 #define FORMBUF_LEN 128 char formbuf[FORMBUF_LEN+1]; #endif struct xcl_l *master_lp = NULL; if (!xc) { vwarning("internal error: ilem_parse called with NULL args"); return; } if (!master_formp) { /* this can happen after ATF parse errors */ return; } phase = "lem"; /*#define lemma (master_formp->literal)*/ if (master_formp->literal) { lemma = npool_copy((unsigned char *)master_formp->literal, xc->pool); } else { struct xcl_l*lp = xcl_lemma(xc,NULL,master_formp->ref,NULL,NULL,0); lp->lnum = master_formp->lnum; lp->f = master_formp; lp->inst = make_inst(xc,lp->f); phase = NULL; return; } if (NULL == master_formp->f2.lang) { if ('%' == *lemma) { char *langbufp = langbuf; for (++lemma; *lemma != ':' && *lemma != '-'; ) { if (langbufp - langbuf == LANGBUF_LEN) { langbuf[LANGBUF_LEN] = '\0'; vwarning2(file,lnum,"[91]: lang starting with '%s' is too long (MAX %d)",langbuf,LANGBUF_LEN); phase = NULL; return; } else *langbufp++ = *lemma++; } if ('-' == *lemma) { while (*lemma && ':' != *lemma) ++lemma; if (!*lemma) { vwarning2(file,lnum,"[92]: lang starting with '%s' has no ':'",langbuf); phase = NULL; return; } } } else { vwarning2(file,lnum,"[96]: no lang set for form"); phase = NULL; return; } master_formp->f2.lang = npool_copy((unsigned char *)langbuf,xc->pool); master_formp->f2.core = langcore_of(langbuf); } else if ('%' == *lemma && '%' != lemma[1]) { while (*lemma && ':' != *lemma) ++lemma; if (':' != *lemma) { vwarning2(file,lnum,"lang has no ':'"); return; } ++lemma; } #if 0 /* In L1 this routine had to handle lems with a form prepended and separated by * (not = , because that 
conflicts with = in ASCII macron). This is no longer the case in L2 */ if (NULL == master_formp->f2.form) { char *formbufp = formbuf; while (*lemma != '*') { if (formbufp - formbuf == FORMBUF_LEN) { formbuf[10] = '\0'; vwarning2(file,lnum,"[94]: form starting '%s' is too long (MAX %d)",formbuf,FORMBUF_LEN); phase = NULL; return; } *formbufp++ = *lemma++; } if ('*' != *lemma) { formbuf[10] = '\0'; vwarning2(file,lnum,"[95]: form starting '%s' has no '*'",formbuf,FORMBUF_LEN); phase = NULL; return; } ++lemma; } #endif /* Now we know that lemma points to the start of the lemmatization */ lem_init((const unsigned char *)lemma); /* This outer loop splits on '&' */ while (1) { struct xcl_l*lp; int alt_count = 0; int iflags = 0; struct ilem_form *curr_f = NULL; lem = lem_next(xc); if (!lem) break; lp = xcl_lemma(xc,NULL,master_formp->ref,NULL,NULL,0); lp->inst = master_formp->literal; lp->lnum = lnum; lp->ante_para = ilem_para_parse(xc, lem,&lem,master_formp->lnum, ilem_para_pos_ante); if (lem) { unsigned char *post = NULL; while (isspace(*lem)) ++lem; post = lem_end(lem); lp->post_para = ilem_para_parse(xc, post,NULL,master_formp->lnum, ilem_para_pos_post); if (isspace(*post)) { while (post > lem && isspace(post[-1])) --post; *post = '\0'; } ilem_para_boundaries(lp,xc); } else { vwarning2(file,master_formp->lnum,"[96]: lem `%s' failed syntax stripping",lem); break; } alt_init(lem); if (master_formp->mcount) { struct ilem_form *mrover = NULL; /*lp->f = NULL;*/ /* NEW ILEM_FORM form_allocator();*/ lp->f = mb_new(xc->sigs->mb_ilem_forms); lp->f->newflag = newflag; lp->f->f2.lang = master_formp->f2.lang; lp->f->f2.core = master_formp->f2.core; lp->f->mcount = -1; if (master_formp->mcount == 1) { master_formp->type = "cof-head"; master_lp->cof_tails = list_create(LIST_SINGLE); } lp->f->type = "cof-tail"; lp->cof_head = master_lp; list_add(lp->cof_head->cof_tails, lp); ++master_formp->mcount; /* efficiency doesn't matter here as we will have relatively few of these */ for 
(mrover = master_formp; mrover->multi; mrover = mrover->multi) ; mrover->multi = lp->f; /*lp->f->master = master_formp;*/ lp->f->file = master_formp->file; lp->f->lnum = master_formp->lnum; lp->ref = lp->f->ref = master_formp->ref; lp->f->f2.form = master_formp->f2.form; lp->f->literal = NULL; } else { lp->f = master_formp; lp->f->mcount = 1; lp->f->newflag = newflag; lp->ref = lp->f->ref; lp->f->type = NULL; master_lp = lp; } lp->f->instance_flags = iflags; /* This inner loop splits on '|'; it is where each lemma is actually handled */ while (1) { lem = alt_next(xc); if (!lem) break; iflags = 0; while (lem_iflags[*lem]) { switch (*lem) { case '+': ++lem; /*newflag = !ignore_plus; */ BIT_SET(iflags, F2_FLAGS_LEM_NEW); break; case '!': ++lem; BIT_SET(iflags, F2_FLAGS_PSU_STOP); break; case '-': ++lem; BIT_SET(iflags, F2_FLAGS_PSU_SKIP); break; case '`': lem = (unsigned char *)"X"; break; } } if (bootstrap_mode && !BIT_ISSET(iflags, F2_FLAGS_LEM_NEW)) BIT_SET(iflags, F2_FLAGS_LEM_NEW); if (BIT_ISSET(iflags,F2_FLAGS_LEM_NEW)) { char *tmp = malloc(strlen(lem) + 2); sprintf(tmp, "+%s", lem); lem = npool_copy(tmp, xc->pool); free(tmp); } if (alt_count++) { struct ilem_form *last_alt = NULL, *f = NULL; if (!lem) break; /*f->f2 = NULL form_allocator();*/ f = mb_new(xc->sigs->mb_ilem_forms); /* f->newflag = newflag; */ lp->f->ref = master_formp->ref; f->f2.lang = master_formp->f2.lang; f->f2.core = master_formp->f2.core; f->f2.form = master_formp->f2.form; if (BIT_ISSET(iflags, F2_FLAGS_LEM_NEW)) { BIT_SET(f->f2.flags, F2_FLAGS_LEM_NEW); if ('+' == *lem) /* should always be true */ ++lem; } f->lnum = master_formp->lnum; f->file = master_formp->file; f->instance_flags = iflags; f->sublem = (char*)npool_copy(lem,xc->pool); /* link this into the master_formp */ for (last_alt = master_formp; last_alt->ambig; last_alt = last_alt->ambig) ; curr_f = last_alt->ambig = f; } else { lp->f->sublem = (char*)npool_copy(lem,xc->pool); curr_f = lp->f; if (BIT_ISSET(iflags, 
F2_FLAGS_LEM_NEW)) { BIT_SET(curr_f->f2.flags, F2_FLAGS_LEM_NEW); if ('+' == *lem) /* should always be true */ ++lem; } } /* Instance parsing cannot result in a form with && being processed using f2_parse_cof, so we can just pass a NULL final argument */ f2_parse((Uchar*)lp->f->file, lp->f->lnum, lem, &curr_f->f2, (Uchar**)&curr_f->psu_sense, NULL); if (check_cf((char*)lp->f->file, lp->f->lnum, (char*)curr_f->f2.cf)) BIT_SET(curr_f->f2.flags, F2_FLAGS_INVALID); if (curr_f->lang) { curr_lang = curr_f->lang; if (!BIT_ISSET(curr_f->f2.flags,F2_FLAGS_CF_QUOTED)) curr_f->f2.cf = ilem_conv(lp,curr_f->f2.cf); curr_f->f2.norm = ilem_conv(lp,curr_f->f2.norm); curr_f->f2.base = ilem_conv(lp,curr_f->f2.base); curr_f->f2.cont = ilem_conv(lp,curr_f->f2.cont); } curr_f->sublem = make_inst(xc,curr_f); } } }
void links_psu(struct xcl_context *xc, struct ML *mlp) { struct linkset *lsp; int i; lsp = new_linkset(xc->linkbase,"psu",mlp->matches[0].psu); if (mlp->matches[0].psu_form) { struct xcl_l*lp = calloc(1,sizeof(struct xcl_l)); struct f2 *parsed_psu = mb_new(xc->sigs->mb_f2s); set_instance_fields(xc,mlp); /* PSU's don't use the && COF notation, so NULL final arg is safe here */ f2_parse((unsigned char*)mlp->matches[0].lp->xc->file, mlp->matches[0].lp->f->lnum, npool_copy((unsigned char*)mlp->matches[0].psu,xc->pool), parsed_psu, NULL, NULL); mlp->matches[0].psu_form->file = (unsigned char *)mlp->matches[0].lp->xc->file; mlp->matches[0].psu_form->lnum = mlp->matches[0].lp->f->lnum; mlp->matches[0].psu_form->cf = parsed_psu->cf; mlp->matches[0].psu_form->gw = parsed_psu->gw; if (mlp->matches[0].lp->f->psu_sense) mlp->matches[0].psu_form->sense = (unsigned char*)mlp->matches[0].lp->f->psu_sense; else if (parsed_psu->sense) mlp->matches[0].psu_form->sense = parsed_psu->sense; if (parsed_psu->pos) mlp->matches[0].psu_form->pos = parsed_psu->pos; if (parsed_psu->epos) mlp->matches[0].psu_form->epos = parsed_psu->epos; lp->parent = xc->root; /* fake this */ lsp->form = mlp->matches[0].psu_form; lsp->form->file = (unsigned char*)xc->file; lsp->form->lnum = mlp->matches[0].lp->lnum; lsp->form->sig = f2_psu_sig(mlp->matches[0].psu_form, xc->pool); lp->inst = psu_inst((char*)lsp->form->sig); lp->f = calloc(1,sizeof(struct ilem_form)); lp->f->file = (char*)mlp->matches[0].psu_form->file; lp->f->lnum = mlp->matches[0].psu_form->lnum; lp->f->f2 = *mlp->matches[0].psu_form; if (psus_sig_check) sigs_l_check(xc, lp); mlp->matches[0].psu_nfinds = lp->f->fcount; /* WATCHME: should I be using psu_finds and reporting ambig here? * For now, just use the first sig. 
*/ if (lp->f->fcount > 0) lsp->form->sig = lp->f->finds[0]->f2.sig; /* can't free this now because it may be referenced via the cache */ /* free(lp->f); */ free(lp); /* clear the newflag so it doesn't carry over to further occurrences of this psu_form */ /*mlp->matches[0].psu_form->newflag = 0;*/ if (psus_sig_check) { if (!mlp->matches[0].psu_nfinds) /*NB: NO AMBIGUITY YET*/ { struct f2 *e = mlp->matches[0].psu_form; vwarning2((const char *)e->file, e->lnum, "psu: %s[%s]%s: compound not found", e->cf,e->gw,e->pos); } else if (verbose) { struct f2 *e = mlp->matches[0].psu_form; vwarning2((const char *)e->file, e->lnum, "psu: %s[%s]%s found OK", e->cf,e->gw,e->pos); } } } preallocate_links(lsp,mlp->matches_used); lsp->used = mlp->matches_used; for (i = 0; i < mlp->matches_used; ++i) { lsp->links[i].role = "elt"; lsp->links[i].title = (const char *)mlp->matches[i].lp->f->f2.cf; lsp->links[i].lp = mlp->matches[i].lp; lsp->links[i].lref = mlp->matches[i].lp->xml_id; /* WATCHME: this is a bit lazy; but at initial implementation it is not possible for an lp to be part of more than one PSU */ mlp->matches[i].lp->psurefs = lsp->xml_id; mlp->matches[i].lp->f->is_part = 1; /* Delete finds which are not PSU matches */ if (mlp->matches[i].nmatches < mlp->matches[i].lp->f->fcount) { memcpy(mlp->matches[i].lp->f->finds, mlp->matches[i].matching_f2s, mlp->matches[i].nmatches * sizeof(struct f2*)); mlp->matches[i].lp->f->finds[mlp->matches[i].nmatches] = NULL; mlp->matches[i].lp->f->fcount = mlp->matches[i].nmatches; } } }
void xcl_eH(void *userData, const char *name) { struct xcl_context *xcp = userData; const char *vbar = strchr(name,EXPAT_NS_CHAR); if (!strncmp("http://oracc.org/ns/xcl/1.0",name,vbar-name)) { ++vbar; if (next_k && !strcmp(vbar,"m")) { hash_add(curr_meta, npool_copy((unsigned char *)next_k,xcp->pool), npool_copy((unsigned char *)charData_retrieve(),xcp->pool)); next_k = NULL; } else if (!strcmp(vbar,"c")) xcl_chunk_end(xcp); else if (!strcmp(vbar,"l")) { if (!in_ll) { struct xcl_l *lp = xcl_lemma(xcp, curr_xml_id, curr_ref, curr_form, NULL, ll_type); lp->inst = curr_inst; lp->sig = npool_copy(curr_sig,xcp->pool); lp->lnum = curr_lnum; lp->f = mb_new(xcp->sigs->mb_ilem_forms); lp->f->ref = (char*)npool_copy((unsigned char *)curr_ref, xcp->pool); /* FIXME: this is not good enough for COF and PSU */ lp->f->f2.sig = lp->sig; f2_parse((unsigned char *)xcp->file, lp->lnum, npool_copy((unsigned char*)curr_sig,xcp->pool), &lp->f->f2, NULL, xcp->sigs); } #if 0 { form->f2.lang = (unsigned char*)lang; form->f2.core = langcore_of(lang); if (strstr(lang,"949")) BIT_SET(form->f2.flags,F2_FLAGS_LEM_BY_NORM); } if (BIT_ISSET(form->f2.flags,F2_FLAGS_LEM_BY_NORM)) { form->f2.norm = (unsigned char *)formstr; form->f2.form = (const unsigned char *)"*"; } else form->f2.form = (unsigned char *)formstr; form->file = (char*)file; form->lnum = lnum; form->lang = langcon; #endif } else if (!strcmp(vbar,"ll")) { ll_type = ll_none; in_ll = 0; } else if (!strcmp(vbar,"psu")) { unsigned char *tmp = (unsigned char*)charData_retrieve(), *etmp; List *lp; while (isspace(*tmp)) ++tmp; etmp = tmp+strlen((char*)tmp); while (isspace(etmp[-1])) --etmp; *etmp = '\0'; if (!(lp = hash_find(xcp->psus,(unsigned char*)psu_lang))) { lp = list_create(LIST_SINGLE); hash_add(xcp->psus,npool_copy((unsigned char*)psu_lang,xcp->pool),lp); } list_add(lp,npool_copy(tmp,xcp->pool)); } } else charData_discard(); }