/* Copy the leading run of characters accepted by LexEntryNonwhite from <in>
 * into <out> and terminate it with TERM.
 *
 * out:        receives the word; at most maxlen-1 characters plus TERM.
 * in:         input string.
 * maxlen:     size of <out>, including room for TERM.
 * nextp:      on success, receives the position at which scanning should
 *             resume (result of the whitespace reader/skipper, or the end
 *             of <in> when the word runs to the end of the string).
 * trail_punc: if non-NULL, receives the separator characters following the
 *             word (up to PUNCLEN, via StringReadWhitespace_LeNonwhite);
 *             if NULL the separators are merely skipped.
 *
 * Returns 1 on success. Returns 0 if the word does not fit in <out>; in that
 * case <out> is left unterminated and *nextp is not written.
 */
Bool StringGetWord_LeNonwhite(/* RESULTS */ char *out,
                              /* INPUT */ char *in, int maxlen,
                              /* RESULTS */ char **nextp, char *trail_punc)
{
  char *start = out;
  int limit = maxlen - 1;   /* keep one slot for TERM */

  for (; *in; in++, out++) {
    if (!LexEntryNonwhite(*((uc *)in))) {
      /* First separator after the word: work out where to resume. */
      *nextp = trail_punc
                 ? StringReadWhitespace_LeNonwhite(trail_punc, in, PUNCLEN)
                 : StringSkipWhitespace_LeNonwhite(in + 1);
      *out = TERM;
      return(1);
    }
    if ((out - start) >= limit) {
      return(0);
    }
    *out = *in;
  }

  /* Input ended while still inside the word. */
  *nextp = in;
  *out = TERM;
  return(1);
}
/* Write the characters ch->buf[lowerb..upperb] (inclusive) to <stream> as one
 * line of space-separated tokens.
 *
 * - A token is a maximal run of characters accepted by LexEntryNonwhite.
 *   The buffer offset and first character of every token are recorded with
 *   TaggerWordsAdd.
 * - If the character immediately before a token is an apostrophe, the
 *   apostrophe is written in front of the token.
 * - A comma found among the non-token characters is written as " ,".
 * - After the range, " " + <eoschar> + NEWLINE is written and <eoschar> is
 *   recorded with TaggerWordsAdd at the offset just past the scanned range.
 */
void TA_TaggerWriteSentence(FILE *stream, TaggerWords *tw, Channel *ch,
                            size_t lowerb, size_t upperb, int eoschar)
{
  const char *text = (const char *)ch->buf;
  int cur;
  int last = TERM;        /* character seen on the previous iteration */
  int in_token = 0;       /* currently inside a token? */
  int wrote_token = 0;    /* has any token been started yet? */
  size_t pos = lowerb;

  while (pos <= upperb) {
    cur = text[pos];
    if (!LexEntryNonwhite(cur)) {
      in_token = 0;
      if (cur == ',') {
        fputs(" ,", stream);
      }
    } else {
      if (!in_token) {
        /* Separate this token from the previous one, if any. */
        if (wrote_token) {
          fputc(' ', stream);
        } else {
          wrote_token = 1;
        }
        if (last == '\'') {
          fputc(last, stream);
        }
        in_token = 1;
        TaggerWordsAdd(tw, pos, cur);
      }
      fputc(cur, stream);
    }
    last = cur;
    pos++;
  }

  /* End-of-sentence marker is emitted as its own token. */
  fputc(' ', stream);
  fputc(eoschar, stream);
  TaggerWordsAdd(tw, pos, eoschar);
  fputc(NEWLINE, stream);
}
/* Scan ch->buf and add a PNTYPE_END_OF_SENT node to <ch> for every sentence
 * boundary found.
 *
 * - ';' and ':' each produce a one-character node immediately.
 * - '.', '!' and '?' start a candidate terminator run; <begin> remembers the
 *   first such character. The run is confirmed, and a node [begin, p) is
 *   added, when a later LexEntryNonwhite character satisfies
 *   CharSentenceStart, provided the run is in the "confirmed" state below.
 * - A candidate still pending at the end of the buffer also produces a node.
 *
 * eos states:
 *   0  no pending terminator
 *   1  pending run started by '.', nothing else seen yet; a word character
 *      directly after it cancels the run (e.g. a dot inside a token)
 *   2  pending run started by '!' or '?', or any pending run that has since
 *      been followed by a non-word, non-terminator character
 *
 * <dc> is not used by this function.
 */
void TA_EndOfSentence(Channel *ch, Discourse *dc)
{
  int c, eos;
  char *p, *begin;

  eos = 0;
  begin = NULL;
  for (p = (char *)ch->buf; (c = *((uc *)p)) != 0; p++) {
    if (c == ';' || c == ':') {
      ChannelAddPNode(ch, PNTYPE_END_OF_SENT, 1.0, NULL, NULL, p, p+1);
    } else if (c == '.' || c == '!' || c == '?') {
      /* Only the first terminator of a run sets the state and <begin>. */
      if (eos == 0) {
        eos = (c == '.') ? 1 : 2;
        begin = p;
      }
    } else if (LexEntryNonwhite(c)) {
      if (eos == 2 && CharSentenceStart(c)) {
        ChannelAddPNode(ch, PNTYPE_END_OF_SENT, 1.0, NULL, NULL, begin, p);
      }
      /* Any word character ends the pending run, confirmed or not. */
      eos = 0;
      begin = NULL;
    } else {
      /* Separator after a terminator: the run may now be confirmed. */
      if (eos) eos = 2;
    }
  }
  if (eos) {
    /* Terminator run reaches the end of the buffer; p is at the end. */
    ChannelAddPNode(ch, PNTYPE_END_OF_SENT, 1.0, NULL, NULL, begin, p);
  }
}
/* Copy the leading run of separator characters (those rejected by
 * LexEntryNonwhite) from <in> into <out>, terminated with TERM.
 *
 * At most maxlen-1 characters are stored; any further separators are
 * consumed but silently dropped (this is common, e.g. for long runs of
 * underscores, so no diagnostic is issued).
 *
 * Returns a pointer to the first character after the separator run, i.e.
 * the next LexEntryNonwhite character or the end of the string.
 */
char *StringReadWhitespace_LeNonwhite(/* RESULTS */ char *out,
                                      /* INPUT */ char *in, int maxlen)
{
  int room = maxlen - 1;   /* characters that may still be stored */

  while (*in != '\0' && !LexEntryNonwhite(*((uc *)in))) {
    if (room > 0) {
      *out++ = *in++;
      room--;
      continue;
    }
    /* Output is full: terminate it and skip the rest of the run. */
    *out = TERM;
    for (; *in != '\0' && !LexEntryNonwhite(*((uc *)in)); in++)
      ;
    return(in);
  }
  *out = TERM;
  return(in);
}
/* Walk ch->buf and, at every position where a word begins (a character
 * accepted by LexEntryNonwhite preceded by one that is not), try phrases of
 * 1, 2, ... words starting there and hand each to TA_LexEntry1.
 *
 * The maximum phrase length is 1 when DC_MODE_COMPOUND_NOUN is set in
 * dc->mode, otherwise MaxWordsInPhrase+1.
 */
void TA_LexEntry(Channel *ch, Discourse *dc)
{
  char phrase[PHRASELEN], postpunc[PUNCLEN];
  char *cursor, *after;
  int nwords, limit, here, before, changed;
  HashTable *ht;

  ht = DC(dc).ht;
  limit = (dc->mode & DC_MODE_COMPOUND_NOUN) ? 1 : MaxWordsInPhrase+1;
  before = TERM;

  for (cursor = (char *)ch->buf; *cursor; before = here, cursor++) {
    here = *((uc *)cursor);
    if (!LexEntryNonwhite(here) || LexEntryNonwhite(before)) {
      continue;   /* not at the start of a word/phrase */
    }
    for (nwords = 1; nwords <= limit; nwords++) {
      if (!StringGetNWords_LeNonwhite(phrase, postpunc, cursor, PHRASELEN,
                                      nwords, &after)) {
        continue;
      }
      if (TA_LexEntry1(phrase, postpunc, cursor, after, before, ht, ch, dc,
                       nwords == 1)) {
        continue;
      }
      /* todo: Inelegant. The retry below is needed to cope with "." at the
       * end of a sentence, but it may also strip dots that are needed.
       * A better solution is to eliminate dots specifically where an end
       * of sentence has been detected.
       */
      StringElims(phrase, LE_NONWHITESPACE, &changed);
      if (changed) {
        TA_LexEntry1(phrase, postpunc, cursor, after, before, ht, ch, dc,
                     nwords == 1);
      }
    }
  }
}
/* Backward counterpart of StringGetNWords_LeNonwhite: starting at <in> and
 * moving toward <in_base>, collect <n> words (runs of characters accepted by
 * LexEntryNonwhite), joined by single SPACE characters.
 *
 * output:  receives the phrase. Characters are gathered back-to-front in a
 *          local buffer and then passed through StringReverse, which is
 *          presumed to store the reversed text in <output> (TODO confirm).
 * in:      position to start scanning from (moving toward lower addresses).
 * in_base: lowest address that may be read.
 * maxlen:  size of <output> including TERM; values larger than PHRASELEN are
 *          clamped because the scratch buffer holds only PHRASELEN bytes.
 * n:       number of words wanted.
 * nextp:   on success, set to one position before the separator that
 *          precedes the collected words.
 *
 * Returns 1 on success. Returns 0 if the phrase does not fit, or if
 * <in_base> (or a zero byte) is reached before a separator preceding the
 * n-th word is seen; <output> and *nextp are then left untouched.
 */
Bool StringGetNWords_LeN_Back(/* RESULTS */ char *output,
                              /* INPUT */ char *in, char *in_base,
                              int maxlen, int n,
                              /* RESULTS */ char **nextp)
{
  char *orig_out, buf[PHRASELEN], *out;
  int inword;

  out = buf;
  orig_out = out;
  inword = 0;
  /* Never let the caller's limit exceed the local scratch buffer. */
  if (maxlen > PHRASELEN) {
    maxlen = PHRASELEN;
  }
  maxlen--;   /* keep one slot for TERM */

  /* The bounds test must come first so that nothing below <in_base> is
   * ever dereferenced.
   */
  while (in >= in_base && *in) {
    if (LexEntryNonwhite(*((uc *)in))) {
      if (!inword) {
        /* Entering a new word (from its last character). */
        inword = 1;
        n--;
        if (orig_out != out) {
          if ((out - orig_out) >= maxlen) {
            return(0);
          }
          *out = SPACE;
          out++;
        }
      }
      if ((out - orig_out) >= maxlen) {
        return(0);
      }
      *out = *in;
      out++;
      in--;
    } else {
      if (n == 0) {
        *nextp = in-1; /* todo? */
        *out = TERM;
        StringReverse(orig_out, output);
        return(1);
      }
      inword = 0;
      in--;
    }
  }
  return(0);
}
Bool StringGetNWords_LeNonwhite(/* RESULTS */ char *out, char *trail_punc, /* INPUT */ char *in, int maxlen, int n, /* RESULTS */ char **nextp) { char *orig_out; int inword; orig_out = out; inword = 0; maxlen--; while (*in) { if (LexEntryNonwhite(*((uc *)in))) { if (!inword) { inword = 1; n--; if (orig_out != out) { if ((out - orig_out) >= maxlen) { return(0); } *out = SPACE; out++; } } if ((out - orig_out) >= maxlen) { return(0); } *out = *in; out++; in++; } else { if (n <= 0) { *out = TERM; *nextp = StringReadWhitespace_LeNonwhite(trail_punc, in, PUNCLEN); return(1); } inword = 0; in++; } } return(0); }
/* Convert string so it can be looked up by LexEntryFindPhrase.
 * "day-dream day" => "day dream day"
 * (similar to LexEntryDbFileToPhrase?)
 *
 * Every run of characters rejected by LexEntryNonwhite between two words is
 * replaced by a single SPACE; leading and trailing runs are dropped.
 *
 * out:    receives the normalized phrase, terminated with TERM.
 * in:     input string.
 * maxlen: size of <out>, including room for TERM.
 *
 * Returns 1 on success, 0 if the result (with its terminator) does not fit
 * in <maxlen> bytes. On failure <out> is left unterminated.
 */
Bool StringGetLeNonwhite(/* RESULTS */ char *out,
                         /* INPUT */ char *in, int maxlen)
{
  int inword;
  char *orig_out;

  inword = 0;
  orig_out = out;
  maxlen--;   /* from here on: max number of characters before TERM */
  while (*in) {
    if (LexEntryNonwhite(*((uc *)in))) {
      if (!inword) {
        inword = 1;
        if (orig_out != out) {
          if ((out - orig_out) >= maxlen) {
            return(0);
          }
          *out = SPACE;
          out++;
        }
      }
      if ((out - orig_out) >= maxlen) {
        return(0);
      }
      *out = *in;
      out++;
      in++;
    } else {
      inword = 0;
      in++;
    }
  }
  /* The slot at index <maxlen> was reserved for TERM above, so a result of
   * exactly <maxlen> characters still fits. Only a non-positive buffer size
   * can fail here.
   */
  if ((out - orig_out) > maxlen) {
    return(0);
  }
  *out = TERM;
  return(1);
}
/* Advance past the leading run of separator characters (those rejected by
 * LexEntryNonwhite) and return a pointer to the first character accepted by
 * LexEntryNonwhite, or to the end of the string if there is none.
 */
char *StringSkipWhitespace_LeNonwhite(char *in)
{
  for (;;) {
    if (*in == '\0') {
      break;
    }
    if (LexEntryNonwhite(*((uc *)in))) {
      break;
    }
    in++;
  }
  return(in);
}