/*
 * Smartypants callback for a digit: recognise the textual fractions
 * "1/2", "1/4" and "3/4" and emit the matching single-glyph fraction.
 *
 * The fraction must start right after a word boundary and must be followed
 * by the end of the input or by another word boundary.  The quarter forms
 * additionally accept an ordinal suffix ("1/4th", "3/4ths"), compared
 * case-insensitively.
 *
 * Returns 2 when a fraction was written to `ob`; otherwise the first byte of
 * `text` is copied to `ob` unchanged and 0 is returned.
 * NOTE(review): the return value is presumably the number of extra input
 * bytes the caller should skip -- confirm against the dispatcher.
 */
static size_t
smartypants_cb__number(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size)
{
	/* Every supported pattern is three bytes long with '/' in the middle. */
	if (word_boundary(previous_char) && size >= 3 && text[1] == '/') {
		const uint8_t numer = text[0];
		const uint8_t denom = text[2];

		if (numer == '1' && denom == '2') {
			if (size == 3 || word_boundary(text[3])) {
				BUFPUTSL(ob, "½");
				return 2;
			}
		} else if (numer == '1' && denom == '4') {
			/* Plain "1/4", or the ordinal spelling "1/4th". */
			if (size == 3 || word_boundary(text[3]) ||
			    (size >= 5 && tolower(text[3]) == 't' && tolower(text[4]) == 'h')) {
				BUFPUTSL(ob, "¼");
				return 2;
			}
		} else if (numer == '3' && denom == '4') {
			/* Plain "3/4", or the plural ordinal spelling "3/4ths". */
			if (size == 3 || word_boundary(text[3]) ||
			    (size >= 6 && tolower(text[3]) == 't' && tolower(text[4]) == 'h' &&
			     tolower(text[5]) == 's')) {
				BUFPUTSL(ob, "¾");
				return 2;
			}
		}
	}

	/* Not a recognised fraction: pass the digit through untouched. */
	bufputc(ob, text[0]);
	return 0;
}
/*
 * Test whether the contents of `buf`, starting at offset `start`, match the
 * pattern `prefix`.  Buffer bytes are lower-cased before comparison, so the
 * pattern itself is expected to be lower case.
 *
 * Pattern anchors:
 *   - a leading '<' requires the match to begin at a word boundary: either
 *     the very start of the buffer or right after a byte for which
 *     word_boundary() is true;
 *   - a '>' requires the buffer byte at that position to be a word boundary;
 *     running out of buffer at that position also counts as a boundary.
 *
 * Returns non-zero on a match, 0 otherwise.
 */
static int
smartypants_cmpsub(const struct buf *buf, size_t start, const char *prefix)
{
	size_t i;

	if (prefix[0] == '<') {
		/* The start of the buffer is a boundary, just like its end is. */
		if (start > 0 && !word_boundary(buf->data[start - 1]))
			return 0;

		prefix++;
	}

	for (i = start; i < buf->size; ++i) {
		char c, p;

		/* <ctype.h> functions require a value representable as unsigned char. */
		c = tolower((unsigned char)buf->data[i]);
		p = *prefix++;

		if (p == 0)
			return 1;	/* whole pattern consumed */
		if (p == '>')
			return word_boundary(c);
		if (c != p)
			return 0;
	}

	/*
	 * Buffer exhausted.  It is a match if the pattern is exhausted too, or
	 * if all that remains is the trailing word-boundary anchor.
	 */
	return (*prefix == '>' || *prefix == '\0');
}
/*
 * Try to turn a straight quote into a curly-quote HTML entity.
 *
 * `quote` selects the entity family ('s' -> "&?squo;", 'd' -> "&?dquo;") and
 * `*is_open` tracks whether a quote of that family is currently open.
 *
 *   - When a quote is open, a closing ("&r?quo;") entity is produced, but
 *     only if `next_char` is a word boundary.
 *   - When no quote is open, an opening ("&l?quo;") entity is produced, but
 *     only if `previous_char` is a word boundary.
 *
 * On success the entity is appended to `ob`, `*is_open` is toggled and 1 is
 * returned.  Otherwise nothing is written, the state is untouched and 0 is
 * returned.
 */
static int
smartypants_quotes(struct buf *ob, uint8_t previous_char, uint8_t next_char, uint8_t quote, int *is_open)
{
	char ent[8];	/* "&xxquo;" is 7 bytes plus the terminator */
	const int closing = (*is_open != 0);

	if (closing) {
		if (!word_boundary(next_char))
			return 0;
	} else {
		if (!word_boundary(previous_char))
			return 0;
	}

	snprintf(ent, sizeof(ent), "&%c%cquo;", closing ? 'r' : 'l', quote);
	*is_open = !closing;
	bufputs(ob, ent);
	return 1;
}
/*
 * Try to turn the straight quote at text->data[i] into a curly-quote HTML
 * entity appended to `ob`.
 *
 * `is_open` tells whether a quote is currently open:
 *   - open:     emit a right ("&r?quo;") entity, unless a following byte
 *               exists and is not a word boundary;
 *   - not open: emit a left ("&l?quo;") entity, unless a preceding byte
 *               exists and is not a word boundary.
 *
 * A single quote character yields the "squo" family, anything else "dquo".
 * Returns 1 when an entity was written, 0 when the quote was rejected.
 */
static int
smartypants_quotes(struct buf *ob, struct buf *text, size_t i, int is_open)
{
	char ent[8];	/* "&xxquo;" is 7 bytes plus the terminator */
	char side;
	char family;

	if (is_open) {
		if (i + 1 < text->size && !word_boundary(text->data[i + 1]))
			return 0;
		side = 'r';
	} else {
		if (i > 0 && !word_boundary(text->data[i - 1]))
			return 0;
		side = 'l';
	}

	family = (text->data[i] == '\'') ? 's' : 'd';

	snprintf(ent, sizeof(ent), "&%c%cquo;", side, family);
	bufputs(ob, ent);
	return 1;
}
/*
 * Smartypants callback for a single quote character (text[0]).
 *
 * Handled in this order:
 *   1. Two consecutive single quotes are offered to smartypants_quotes() as
 *      a double quote; on success 1 is returned.
 *   2. English contractions -- 's 't 'm 'd and 're 'll 've -- followed by the
 *      end of the input or a word boundary: the apostrophe is emitted as a
 *      right single quote and 0 is returned (the letters are left for the
 *      caller to process).
 *   3. Otherwise the quote is offered to smartypants_quotes() as a single
 *      quote; if that declines, the raw byte is copied to `ob`.
 *
 * `text` holds exactly `size` bytes, so the byte following an N-byte pattern
 * may only be inspected when size > N; size == N means the pattern ends the
 * input.
 *
 * NOTE(review): the return value is presumably the number of extra input
 * bytes the caller should skip -- confirm against the dispatcher.
 * NOTE(review): the apostrophe is written as a raw character literal while
 * smartypants_quotes() writes "&..quo;" entities -- confirm this is intended.
 */
static size_t
smartypants_cb__squote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size)
{
	if (size >= 2) {
		uint8_t t1 = tolower(text[1]);

		if (t1 == '\'') {
			if (smartypants_quotes(ob, previous_char, size >= 3 ? text[2] : 0, 'd', &smrt->in_dquote))
				return 1;
		}

		/* Two-byte contractions: text[2] exists only when size > 2. */
		if ((t1 == 's' || t1 == 't' || t1 == 'm' || t1 == 'd') &&
		    (size == 2 || word_boundary(text[2]))) {
			BUFPUTSL(ob, "’");
			return 0;
		}

		if (size >= 3) {
			uint8_t t2 = tolower(text[2]);

			/* Three-byte contractions: text[3] exists only when size > 3. */
			if (((t1 == 'r' && t2 == 'e') ||
			     (t1 == 'l' && t2 == 'l') ||
			     (t1 == 'v' && t2 == 'e')) &&
			    (size == 3 || word_boundary(text[3]))) {
				BUFPUTSL(ob, "’");
				return 0;
			}
		}
	}

	/* text[1] exists only when at least two bytes are available. */
	if (smartypants_quotes(ob, previous_char, size >= 2 ? text[1] : 0, 's', &smrt->in_squote))
		return 0;

	bufputc(ob, text[0]);
	return 0;
}
int main() { try { std::cout << "Testing word boundary" << std::endl; word_boundary(); std::cout << "Testing character boundary" << std::endl; test_boundaries(character,nones,0,lb::character); std::cout << "Testing sentence boundary" << std::endl; test_boundaries(sentence1,sentence1a,sentence1b,lb::sentence); std::cout << "Testing line boundary" << std::endl; test_boundaries(line1,line1a,line1b,lb::line); } catch(std::exception const &e) { std::cerr << "Failed " << e.what() << std::endl; return EXIT_FAILURE; } FINALIZE(); }
/**
 * Check that the words of linkage `lkg` trace a complete path through the
 * sentence's Wordgraph and, when possible, that the morpheme types along
 * that path form a valid combination.
 *
 * The function walks the linkage word by word while keeping a queue of
 * candidate Wordgraph positions (wp_old = positions for the current word,
 * wp_new = positions being collected for the next word).  A null word
 * (no chosen disjunct) advances every candidate; a regular word advances
 * only the candidates whose Wordgraph word appears in the disjunct's word
 * list.  After the last word, a candidate positioned on a word of type
 * MT_INFRASTRUCTURE must exist, otherwise words are missing at the end.
 *
 * If the sentence has no null words and the affix dictionary provides a
 * regex, a string with one affix-type character per non-empty path word is
 * built and matched against that regex.
 *
 * @param sent  Sentence owning the Wordgraph and the affix dictionary.
 * @param lkg   Linkage to validate; lkg->wg_path is set by this function.
 * @param opts  Only opts->verbosity is read (controls the stdout warning).
 * @return true on success, with lkg->wg_path set to the matched path.
 *         false on failure: lkg->wg_path is set to NULL, the linkage is
 *         marked discarded with a violation message, and
 *         sent->num_valid_linkages is decremented.
 */
bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts)
{
	Wordgraph_pathpos *wp_new = NULL;
	Wordgraph_pathpos *wp_old = NULL;
	Wordgraph_pathpos *wpp;
	Gword **next; /* next Wordgraph words of the current word */
	size_t i;
	Linkage_info * const lifo = &lkg->lifo;

	bool match_found = true; /* if all the words are null - it's still a match */
	/* Assigned and read only when match_found is true (see the end of the
	 * function), so it is never read uninitialized. */
	Gword **lwg_path;

	Dictionary afdict = sent->dict->affix_table; /* for SANEMORPHISM */
	/* One character per path word plus the terminator, on the stack.
	 * NOTE(review): assumes the path never holds more than
	 * sent->length*2 non-empty words -- confirm. */
	char *const affix_types = alloca(sent->length*2 + 1); /* affix types */

	affix_types[0] = '\0';

	/* Populate the path word queue, initializing the path to NULL. */
	for (next = sent->wordgraph->next; *next; next++)
	{
		wordgraph_path_append(&wp_new, /*path*/NULL, /*add_word*/NULL, *next);
	}
	assert(NULL != wp_new, "Path word queue is empty");

	for (i = 0; i < lkg->num_words; i++)
	{
		Disjunct *cdj; /* chosen disjunct */

		lgdebug(D_SLM, "%p Word %zu: ", lkg, i);

		/* The previous word produced no continuation in the Wordgraph. */
		if (NULL == wp_new)
		{
			lgdebug(+D_SLM, "- No more words in the wordgraph\n");
			match_found = false;
			break;
		}

		/* Rotate the queues: the positions collected for this word become
		 * the current ones, and a fresh queue is started for the next word. */
		if (wp_old != wp_new)
		{
			wordgraph_path_free(wp_old, true);
			wp_old = wp_new;
		}
		wp_new = NULL;
		/* Debug aid, disabled: wordgraph_pathpos_print(wp_old); */

		cdj = lkg->chosen_disjuncts[i];
		/* Handle null words */
		if (NULL == cdj)
		{
			lgdebug(D_SLM, "- Null word\n");
			/* A null word matches any word in the Wordgraph -
			 * so, unconditionally proceed in all paths in parallel. */
			match_found = false;
			for (wpp = wp_old; NULL != wpp->word; wpp++)
			{
				if (NULL == wpp->word->next)
					continue; /* This path encountered the Wordgraph end */

				/* The null words cannot be marked here because wpp->path consists
				 * of pointers to the Wordgraph words, and these words are common to
				 * all the linkages, with potentially different null words in each
				 * of them. However, the position of the null words can be inferred
				 * from the null words in the word array of the Linkage structure.
				 */
				for (next = wpp->word->next; NULL != *next; next++)
				{
					match_found = true;
					wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
				}
			}
			continue;
		}

		/* NOTE(review): the message ends with '\n' and is then printed with
		 * "%s.", so the period lands on the line after the message. */
		if (!match_found)
		{
			const char *e = "Internal error: Too many words in the linkage\n";
			lgdebug(D_SLM, "- %s", e);
			prt_error("Error: %s.", e);
			break;
		}

		assert(MT_EMPTY != cdj->word[0]->morpheme_type); /* already discarded */

		if (debug_level(D_SLM)) print_with_subscript_dot(cdj->string);

		match_found = false;
		/* Proceed in all the paths in which the word is found. */
		for (wpp = wp_old; NULL != wpp->word; wpp++)
		{
			const Gword **wlp; /* disjunct word list */

			for (wlp = cdj->word; *wlp; wlp++)
			{
				if (*wlp == wpp->word)
				{
					match_found = true;
					for (next = wpp->word->next; NULL != *next; next++)
					{
						wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
					}
					break;
				}
			}
		}

		if (!match_found)
		{
			/* FIXME? A message can be added here if there are too many words
			 * in the linkage (can happen only if there is an internal error). */
			lgdebug(D_SLM, "- No Wordgraph match\n");
			break;
		}
		lgdebug(D_SLM, "\n");
	}

	if (match_found)
	{
		match_found = false;
		/* Validate that there are no missing words in the linkage. It is so if
		 * the dummy termination word is found in the new pathpos queue. */
		if (NULL != wp_new)
		{
			for (wpp = wp_new; NULL != wpp->word; wpp++)
			{
				if (MT_INFRASTRUCTURE == wpp->word->morpheme_type) {
					match_found = true;
					/* Exit the loop with wpp pointing at the termination word;
					 * the code below relies on wpp->path being its path. */
					break;
				}
			}
		}
		if (!match_found)
			lgdebug(D_SLM, "%p Missing word(s) at the end of the linkage.\n", lkg);
	}

#define DEBUG_morpheme_type 0
	/* Check the morpheme type combination.
	 * If null_count > 0, the morpheme type combination may be invalid
	 * due to null subwords, so skip this check. */
	if (match_found && (0 == sent->null_count) && (NULL != afdict) && (NULL != afdict->regex_root))
	{
		const Gword **w;
		char *affix_types_p = affix_types;

		/* Construct the affix_types string. */
#if DEBUG_morpheme_type
		print_lwg_path(wpp->path);
#endif
		/* i counts path words; it is only reported by the debug print below. */
		i = 0;
		for (w = wpp->path; *w; w++)
		{
			i++;
			if (MT_EMPTY == (*w)->morpheme_type) continue; /* really a null word */

			/* The switch deliberately does not list every enum value, so the
			 * corresponding GCC warning is silenced around its header. */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-enum"
			switch ((*w)->morpheme_type)
			{
#pragma GCC diagnostic pop
				default:
					/* What to do with the rest? */
					/* Currently they fall through and are treated as MT_WORD. */
				case MT_WORD:
					*affix_types_p = AFFIXTYPE_WORD;
					break;
				case MT_PREFIX:
					*affix_types_p = AFFIXTYPE_PREFIX;
					break;
				case MT_STEM:
					*affix_types_p = AFFIXTYPE_STEM;
					break;
				case MT_MIDDLE:
					*affix_types_p = AFFIXTYPE_MIDDLE;
					break;
				case MT_SUFFIX:
					*affix_types_p = AFFIXTYPE_SUFFIX;
					break;
			}
#if DEBUG_morpheme_type
			lgdebug(D_SLM, "Word %zu: %s affixtype=%c\n", i, (*w)->subword, *affix_types_p);
#endif

			affix_types_p++;
		}
		*affix_types_p = '\0';

#ifdef WORD_BOUNDARIES /* not yet implemented */
		{
			const Gword *uw;

			/* If w is an "end subword", return its unsplit word, else NULL. */
			uw = word_boundary(w); /* word_boundary() unimplemented */

			if (NULL != uw)
			{
				*affix_types_p++ = AFFIXTYPE_END;
				lgdebug(D_SLM, "%p End of Gword %s\n", lkg, uw->subword);
			}
		}
#endif

		/* Check if affix_types is valid according to SANEMORPHISM. */
		if (('\0' != affix_types[0]) && (NULL == match_regex(afdict->regex_root, affix_types)))
		{
			/* Morpheme type combination is invalid */
			match_found = false;
			/* Notify to stdout, so it will be shown along with the result.
			 * XXX We should have a better way to notify. */
			if (0 < opts->verbosity)
				printf("Warning: Invalid morpheme type combination '%s', " "run with !bad and !verbosity=4 to debug\n", affix_types);
		}
	}

	/* Grab the winning path before the queues are released.
	 * NOTE(review): the second argument of wordgraph_path_free() presumably
	 * selects whether the stored paths are freed too; it is false for wp_new
	 * on success so that lwg_path stays valid -- confirm in its definition. */
	if (match_found) lwg_path = (Gword **)wpp->path; /* OK to modify */
	wordgraph_path_free(wp_old, true);
	wordgraph_path_free(wp_new, !match_found);

	if (match_found)
	{
		if ('\0' != affix_types[0])
		{
			lgdebug(D_SLM, "%p Morpheme type combination '%s'\n", lkg, affix_types);
		}
		lgdebug(+D_SLM, "%p SUCCEEDED\n", lkg);
		lkg->wg_path = lwg_path;
		return true;
	}

	/* Oh no ... invalid morpheme combination! */
	/* Every failure above (no Wordgraph match, missing words, invalid
	 * morpheme types) ends here and is recorded as a violation. */
	sent->num_valid_linkages --;
	lifo->N_violations++;
	lifo->pp_violation_msg = "Invalid morphism construction.";
	lkg->wg_path = NULL;
	lifo->discarded = true;
	lgdebug(D_SLM, "%p FAILED\n", lkg);
	return false;
}