Пример #1
0
/* Copy the leading run of LexEntryNonwhite characters of <in> into <out>.
 *
 * At most maxlen-1 characters are copied. If the run is longer than that,
 * 0 is returned; in that case <out> is NOT terminated and <*nextp> is not
 * written.
 *
 * On success 1 is returned, <out> is terminated with TERM and <*nextp> is:
 *   - <in> advanced to the end of the string, if the run reached it;
 *   - otherwise, when <trail_punc> is non-NULL, the value returned by
 *     StringReadWhitespace_LeNonwhite (which also stores the separator run
 *     into <trail_punc>, limited by PUNCLEN);
 *   - otherwise the value returned by StringSkipWhitespace_LeNonwhite for
 *     the text following the first separator character.
 */
Bool StringGetWord_LeNonwhite(/* RESULTS */ char *out, /* INPUT */ char *in,
                              int maxlen, /* RESULTS */ char **nextp,
                              char *trail_punc)
{
  char	*dst;
  int	room;

  room = maxlen - 1;	/* keep one slot for TERM */
  for (dst = out; *in && LexEntryNonwhite(*((uc *)in)); ) {
    if ((dst - out) >= room) {
      return(0);
    }
    *dst++ = *in++;
  }

  if (*in) {
    /* Stopped on a separator character, not on the end of the string. */
    *nextp = trail_punc
             ? StringReadWhitespace_LeNonwhite(trail_punc, in, PUNCLEN)
             : StringSkipWhitespace_LeNonwhite(in + 1);
  } else {
    *nextp = in;
  }
  *dst = TERM;
  return(1);
}
Пример #2
0
/* Write the bytes ch->buf[lowerb..upperb] (inclusive) to <stream> as one
 * line of space-separated words, followed by " <eoschar>" and NEWLINE.
 *
 * - A word is a maximal run of LexEntryNonwhite characters. The position and
 *   first character of every word are recorded with TaggerWordsAdd.
 * - If the character just before a word is an apostrophe, the apostrophe is
 *   written at the front of that word.
 * - A comma that is not part of a word is written as the separate token " ,".
 * - After the loop, <eoschar> is recorded with TaggerWordsAdd at the
 *   position one past the last byte examined.
 *
 * NOTE(review): the byte is read through plain char here, whereas the string
 * helpers in this file read through uc *; confirm which form
 * LexEntryNonwhite and TaggerWordsAdd expect for high-bit characters.
 */
void TA_TaggerWriteSentence(FILE *stream, TaggerWords *tw, Channel *ch,
                            size_t lowerb, size_t upperb, int eoschar)
{
  int		cur, last, in_word, wrote_word;
  size_t	pos;

  last = TERM;
  in_word = 0;
  wrote_word = 0;
  pos = lowerb;
  while (pos <= upperb) {
    cur = ((char *)ch->buf)[pos];
    if (!LexEntryNonwhite(cur)) {
      in_word = 0;
      if (cur == ',') fputs(" ,", stream);
    } else {
      if (!in_word) {
        /* Separate from the previous word, except before the first one. */
        if (wrote_word) {
          fputc(' ', stream);
        } else {
          wrote_word = 1;
        }
        if (last == '\'') fputc(last, stream);
        in_word = 1;
        TaggerWordsAdd(tw, pos, cur);
      }
      fputc(cur, stream);
    }
    last = cur;
    pos++;
  }

  fputc(' ', stream);
  fputc(eoschar, stream);
  TaggerWordsAdd(tw, pos, eoschar);
  fputc(NEWLINE, stream);
}
Пример #3
0
/* Scan ch->buf up to its terminating zero byte and add a
 * PNTYPE_END_OF_SENT node (via ChannelAddPNode) for each end of sentence
 * found.
 *
 * - ';' and ':' each produce a node covering just that character.
 * - '.', '!' and '?' start a candidate whose first character is remembered
 *   in <begin>. <eos> tracks the candidate:
 *     0  no candidate;
 *     1  candidate started by '.', nothing but '.', '!' or '?' seen since;
 *     2  candidate started by '!' or '?', or a character that is not
 *        LexEntryNonwhite has been seen since the candidate started.
 *   When a LexEntryNonwhite character arrives, a node spanning <begin>..<p>
 *   is added only if eos is 2 and CharSentenceStart accepts the character;
 *   either way the candidate is discarded.
 * - A candidate still pending at the end of the buffer always yields a node.
 *
 * <dc> is not used by this function.
 */
void TA_EndOfSentence(Channel *ch, Discourse *dc)
{
  int	c, eos;
  char	*p, *begin;
  eos = 0;
  p = (char *)ch->buf;
  begin = NULL;
  while ((c = *((uc *)p))) {
    if (c == ';' || c == ':') {
      ChannelAddPNode(ch, PNTYPE_END_OF_SENT, 1.0, NULL, NULL, p, p+1);
    } else if (c == '.' || c == '!' || c == '?') {
      if (eos == 0) {
        /* First terminator of a run: remember where the run begins. */
        if (c == '.') eos = 1;
        else eos = 2;
        begin = p;
      }
    } else if (LexEntryNonwhite(c)) {
      if (eos == 2 && CharSentenceStart(c)) {
        ChannelAddPNode(ch, PNTYPE_END_OF_SENT, 1.0, NULL, NULL, begin, p);
      }
      eos = 0;
      begin = NULL;
    } else {
      /* Separator after a terminator promotes a '.' candidate to state 2. */
      if (eos) eos = 2;
    }
    p++;
  }
  if (eos) {
    ChannelAddPNode(ch, PNTYPE_END_OF_SENT, 1.0, NULL, NULL, begin, p);
  }
}
Пример #4
0
/* Copy the leading run of characters of <in> that are NOT LexEntryNonwhite
 * into <out>, storing at most maxlen-1 of them, and terminate <out> with
 * TERM. Characters of the run that do not fit are skipped silently.
 * Returns a pointer to the first character after the run (either a
 * LexEntryNonwhite character or the end of the string).
 */
char *StringReadWhitespace_LeNonwhite(/* RESULTS */ char *out,
                                      /* INPUT */ char *in, int maxlen)
{
  int	room;

  /* Copy while there is room; one slot is kept for TERM. */
  for (room = maxlen - 1; room > 0; room--) {
    if (!*in || LexEntryNonwhite(*((uc *)in))) break;
    *out++ = *in++;
  }
  *out = TERM;

  /* No Dbg on truncation: it is frequent when the punct is _____ etc. */
  while (*in && (!LexEntryNonwhite(*((uc *)in)))) {
    in++;
  }
  return(in);
}
Пример #5
0
/* Walk ch->buf and, at every position where a LexEntryNonwhite character
 * follows one that is not (the scan starts with TERM as the previous
 * character), try phrases of 1, 2, ... word_limit words starting there.
 *
 * word_limit is 1 when DC_MODE_COMPOUND_NOUN is set in dc->mode, otherwise
 * MaxWordsInPhrase+1.
 *
 * Each phrase that StringGetNWords_LeNonwhite can extract is handed to
 * TA_LexEntry1. If that returns 0, the phrase is passed through StringElims
 * with LE_NONWHITESPACE and, when StringElims reports modifications,
 * TA_LexEntry1 is tried once more with the modified phrase.
 */
void TA_LexEntry(Channel *ch, Discourse *dc)
{
  char		phrase[PHRASELEN], postpunc[PUNCLEN];
  char		*cursor, *rest;
  int		word_limit, nwords, cur_c, prev_c, changed;
  HashTable	*ht;

  ht = DC(dc).ht;
  word_limit = (dc->mode & DC_MODE_COMPOUND_NOUN) ? 1 : MaxWordsInPhrase+1;

  prev_c = TERM;
  for (cursor = (char *)ch->buf; *cursor; prev_c = cur_c, cursor++) {
    cur_c = *((uc *)cursor);
    if (!LexEntryNonwhite(cur_c) || LexEntryNonwhite(prev_c)) {
      continue;	/* Not at the start of a word/phrase. */
    }
    for (nwords = 1; nwords <= word_limit; nwords++) {
      if (!StringGetNWords_LeNonwhite(phrase, postpunc, cursor, PHRASELEN,
                                      nwords, &rest)) {
        continue;
      }
      if (TA_LexEntry1(phrase, postpunc, cursor, rest, prev_c, ht, ch, dc,
                       nwords == 1)) {
        continue;
      }
      /* todo: Inelegant. This is required to deal with "." at end of
       * sentence. This may eliminate needed dots. A better solution
       * is to eliminate dots specifically where EOS is detected.
       */
      StringElims(phrase, LE_NONWHITESPACE, &changed);
      if (changed) {
        TA_LexEntry1(phrase, postpunc, cursor, rest, prev_c, ht, ch, dc,
                     nwords == 1);
      }
    }
  }
}
Пример #6
0
/* Collect <n> words scanning BACKWARDS from <in> towards <in_base>.
 *
 * Characters are gathered in reverse order into a local scratch buffer,
 * with a single SPACE between words, and then reversed into <output> with
 * StringReverse. A word is a maximal run of LexEntryNonwhite characters.
 *
 * Returns 1 when a character that is not LexEntryNonwhite is reached after
 * <n> word starts have been counted; <*nextp> is then set to the position
 * just before that character. Returns 0, leaving <output> and <*nextp>
 * untouched, when the scan passes <in_base> or hits a zero byte first, or
 * when the phrase needs more than maxlen-1 characters.
 *
 * The scratch buffer holds PHRASELEN bytes, so <maxlen> is clamped to
 * PHRASELEN regardless of what the caller passes.
 */
Bool StringGetNWords_LeN_Back(/* RESULTS */ char *output, /* INPUT */ char *in,
                              char *in_base, int maxlen, int n,
                              /* RESULTS */ char **nextp)
{
  char	*orig_out, buf[PHRASELEN], *out;
  int	inword;
  out = buf;
  orig_out = out;
  inword = 0;
  /* Never let the caller's limit exceed the local scratch buffer. */
  if (maxlen > PHRASELEN) maxlen = PHRASELEN;
  maxlen--;	/* keep one slot for TERM */
  /* Check the bound BEFORE dereferencing, so that no byte in front of
   * in_base is ever read.
   */
  while (in >= in_base && *in) {
    if (LexEntryNonwhite(*((uc *)in))) {
      if (!inword) {
        inword = 1;
        n--;
        if (orig_out != out) {
          /* Separate this word from the one gathered before it. */
          if ((out - orig_out) >= maxlen) {
            return(0);
          }
          *out = SPACE;
          out++;
        }
      }
      if ((out - orig_out) >= maxlen) {
        return(0);
      }
      *out = *in;
      out++;
      in--;
    } else {
      if (n == 0) {
        *nextp = in-1;	/* todo? */
        *out = TERM;
        StringReverse(orig_out, output);
        return(1);
      }
      inword = 0;
      in--;
    }
  }
  return(0);
}
Пример #7
0
/* Extract the first <n> words of <in> into <out>, joined by single SPACEs.
 *
 * A word is a maximal run of LexEntryNonwhite characters. Extraction
 * succeeds when a character that is not LexEntryNonwhite is met after <n>
 * word starts have been counted: <out> is terminated, the separator run at
 * that point is read into <trail_punc> by StringReadWhitespace_LeNonwhite
 * (limit PUNCLEN), its return value is stored in <*nextp>, and 1 is
 * returned.
 *
 * Returns 0 if the end of <in> is reached first (including when the last
 * word runs up to the end of the string), or if more than maxlen-1
 * characters would be needed. On failure <out> is not terminated and
 * <*nextp> is not written.
 */
Bool StringGetNWords_LeNonwhite(/* RESULTS */ char *out, char *trail_punc,
                                /* INPUT */ char *in, int maxlen, int n,
                                /* RESULTS */ char **nextp)
{
  char	*dst;
  int	in_word, room;

  dst = out;
  in_word = 0;
  room = maxlen - 1;	/* keep one slot for TERM */
  for (; *in; in++) {
    if (!LexEntryNonwhite(*((uc *)in))) {
      if (n <= 0) {
        *dst = TERM;
        *nextp = StringReadWhitespace_LeNonwhite(trail_punc, in, PUNCLEN);
        return(1);
      }
      in_word = 0;
      continue;
    }
    if (!in_word) {
      /* First character of a new word. */
      in_word = 1;
      n--;
      if (dst != out) {
        if ((dst - out) >= room) {
          return(0);
        }
        *dst++ = SPACE;
      }
    }
    if ((dst - out) >= room) {
      return(0);
    }
    *dst++ = *in;
  }
  return(0);
}
Пример #8
0
/* Convert string so it can be looked up by LexEntryFindPhrase.
 * "day-dream   day" => "day dream day"
 * (similar to LexEntryDbFileToPhrase?)
 *
 * Every maximal run of LexEntryNonwhite characters of <in> is copied to
 * <out>; each run of other characters between two words is replaced by a
 * single SPACE, and leading/trailing separator runs are dropped.
 *
 * <maxlen> is the size of <out> including the terminator. Returns 1 with
 * <out> terminated by TERM when the result fits, i.e. needs at most
 * maxlen-1 characters. Returns 0 otherwise, in which case <out> is not
 * terminated. A result of exactly maxlen-1 characters is accepted, as in
 * StringGetWord_LeNonwhite and StringGetNWords_LeNonwhite.
 */
Bool StringGetLeNonwhite(/* RESULTS */ char *out,
                         /* INPUT */ char *in, int maxlen)
{
  int	inword;
  char	*orig_out;
  inword = 0;
  orig_out = out;
  maxlen--;	/* keep one slot for TERM */
  if (maxlen < 0) {
    /* No room even for the terminator. */
    return(0);
  }
  while (*in) {
    if (LexEntryNonwhite(*((uc *)in))) {
      if (!inword) {
        inword = 1;
        if (orig_out != out) {
          /* Collapse the preceding separator run into one SPACE. */
          if ((out - orig_out) >= maxlen) {
            return(0);
          }
          *out = SPACE;
          out++;
        }
      }
      if ((out - orig_out) >= maxlen) {
        return(0);
      }
      *out = *in;
      out++;
      in++;
    } else {
      inword = 0;
      in++;
    }
  }
  /* The loop never advances out past orig_out + maxlen, and that slot was
   * reserved above, so the terminator always fits here.
   */
  *out = TERM;
  return(1);
}
Пример #9
0
/* Return a pointer to the first LexEntryNonwhite character at or after
 * <in>, or to the end of the string if there is none.
 */
char *StringSkipWhitespace_LeNonwhite(char *in)
{
  for (; *in; in++) {
    if (LexEntryNonwhite(*((uc *)in))) {
      break;
    }
  }
  return(in);
}