/*
 * Default utterance-break predicate; languages may override it.  The
 * heuristics are adequate for some Latin-script languages.
 *
 * ts      token stream; only ts->whitespace is inspected here.
 * token   the candidate first word of the next utterance.
 * tokens  relation of tokens collected so far; only its tail item is
 *         inspected (its "punc" feature and its name).
 *
 * Returns TRUE if an utterance break should be placed before `token`,
 * FALSE otherwise.  The rules, in order:
 *   1. ts->whitespace contains at least two newlines        -> break
 *   2. the tail's "punc" contains ':', '?' or '!'           -> break
 *   3. the tail's "punc" contains '.', `token` starts with a
 *      capital, and either ts->whitespace is longer than one
 *      character or the tail token does not look like an
 *      abbreviation                                         -> break
 *   4. anything else                                        -> no break
 *
 * NOTE(review): relation_tail(tokens) is passed straight to
 * item_feat_string()/item_name(); this presumably relies on the caller
 * guaranteeing a non-empty relation -- confirm against callers.
 */
int default_utt_break(cst_tokenstream *ts,
                      const char *token,
                      cst_relation *tokens)
{
    const char *const capitals = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const char *postpunct = item_feat_string(relation_tail(tokens), "punc");
    const char *ltoken = item_name(relation_tail(tokens));

    /* First and last '\n' differ only when there are two or more. */
    if (cst_strchr(ts->whitespace,'\n') != cst_strrchr(ts->whitespace,'\n'))
        return TRUE;

    if (strchr(postpunct,':') || strchr(postpunct,'?') || strchr(postpunct,'!'))
        return TRUE;

    /* Every remaining rule needs a full stop after the last token ... */
    if (!strchr(postpunct,'.'))
        return FALSE;

    /*
     * ... and a capitalised next word.  strchr() also matches the
     * terminating NUL of the search set, so an empty `token` passes this
     * test; that long-standing behaviour is kept as is.
     */
    if (!strchr(capitals, token[0]))
        return FALSE;

    if (cst_strlen(ts->whitespace) > 1)
        return TRUE;

    /*
     * Single whitespace character: break only when the last word is not
     * an abbreviation.  An empty last token has no final character to
     * look at; treat it as abbreviation-like (no break) instead of
     * indexing one byte before the start of the string.
     */
    if (ltoken[0] == '\0')
        return FALSE;

    /* Ends in a capital, e.g. an acronym or an initial. */
    if (strchr(capitals, ltoken[cst_strlen(ltoken)-1]))
        return FALSE;

    /* Short word (fewer than 4 characters) starting with a capital. */
    if ((cst_strlen(ltoken) < 4) && strchr(capitals, ltoken[0]))
        return FALSE;

    return TRUE;
}
/*
 * Parse a usage/description string into the feature set `f`.
 *
 * The description is tokenised on " \t\r\n" with "{}[]|" as
 * single-character symbols and no pre/post punctuation.  A token is
 * treated as an option when it starts with '-' and ts->whitespace
 * (presumably the whitespace skipped before that token -- confirm in
 * the tokenstream implementation) contains a newline.  For each option
 * the following token is read: if it starts with '<' it is stored as
 * the option's value, otherwise the option is stored as "<binary>".
 *
 * NOTE(review): the token after an option is consumed in both cases,
 * so it is never itself examined as an option.
 */
static void parse_description(const char *description, cst_features *f)
{
    cst_tokenstream *stream;

    stream = ts_open_string(description,
                            " \t\r\n",  /* whitespace */
                            "{}[]|",    /* singlecharsymbols */
                            "",         /* prepunctuation */
                            "");        /* postpunctuation */

    while (!ts_eof(stream))
    {
        /* Copy the token: the next ts_get() may reuse the stream's buffer. */
        char *option = cst_strdup(ts_get(stream));
        int is_option = (option[0] == '-') &&
                        (cst_strchr(stream->whitespace,'\n') != 0);

        if (is_option)
        {
            /* NOTE(review): feat_own_string presumably returns a copy of
               the name owned by `f` -- confirm. */
            const char *key = feat_own_string(f, option);
            const char *value = ts_get(stream);

            feat_set_string(f, key, (value[0] == '<') ? value : "<binary>");
        }

        cst_free(option);
    }

    ts_close(stream);
}