/**
 * Classifies a single wide character for tokenization purposes.
 *
 * The checks are made in this order: letter ranges, whitespace, punctuation,
 * Finnish quotation marks (reported as punctuation) and ASCII digits.
 *
 * @param c character to classify
 * @return CHAR_LETTER, CHAR_WHITESPACE, CHAR_PUNCTUATION or CHAR_DIGIT for
 *         recognised characters, CHAR_UNKNOWN for everything else
 *         (including the null character)
 */
char_type get_char_type(wchar_t c) {
	if ((c >= 0x41 && c <= 0x5A) || /* A-Z */
	    (c >= 0x61 && c <= 0x7A) || /* a-z */
	    (c >= 0xC0 && c <= 0xD6) || /* À-Ö (0xD7 is the multiplication sign) */
	    (c >= 0xD8 && c <= 0xF6) || /* Ø-ö (0xF7 is the division sign) */
	    (c >= 0x00F8 && c <= 0x02AF) || /* ø-ʯ */
	    (c >= 0x0400 && c <= 0x0481) || /* Ѐ-ҁ - Cyrillic */
	    (c >= 0x048A && c <= 0x0527) || /* Ҋ-ԧ - Cyrillic + Cyrillic extended */
	    (c >= 0x1400 && c <= 0x15C3) || /* ᐀-ᗃ - Canadian syllabics */
	    (c >= 0xFB00 && c <= 0xFB04)) { /* ﬀ-ﬄ - Latin ligatures */
		return CHAR_LETTER;
	}
	if (SimpleChar::isWhitespace(c)) {
		return CHAR_WHITESPACE;
	}
	// wcschr() treats the terminating null of the searched string as part of
	// the string, so L'\0' must be excluded explicitly or it would be
	// reported as punctuation.
	if (c != L'\0' &&
	    wcschr(L".,;-!?:'()[]{}/&"
	           L"\u00AD"  /* SOFT HYPHEN */
	           L"\u2019"  /* RIGHT SINGLE QUOTATION MARK */
	           L"\u2010"  /* HYPHEN */
	           L"\u2011"  /* NON-BREAKING HYPHEN */
	           L"\u2013"  /* EN DASH */
	           L"\u2014"  /* EM DASH */
	           L"\u201C"  /* LEFT DOUBLE QUOTATION MARK */
	           L"\u2026"  /* HORIZONTAL ELLIPSIS */
	           , c)) {
		return CHAR_PUNCTUATION;
	}
	if (isFinnishQuotationMark(c)) {
		return CHAR_PUNCTUATION;
	}
	// A range check is used instead of wcschr() so that L'\0' is not
	// mistaken for a digit.
	if (c >= L'0' && c <= L'9') {
		return CHAR_DIGIT;
	}
	return CHAR_UNKNOWN;
}
/**
 * Classifies a single wide character for tokenization purposes.
 *
 * The checks are made in this order: punctuation, Finnish quotation marks
 * (reported as punctuation), whitespace, letter ranges and ASCII digits.
 *
 * @param c character to classify
 * @return CHAR_PUNCTUATION, CHAR_WHITESPACE, CHAR_LETTER or CHAR_DIGIT for
 *         recognised characters, CHAR_UNKNOWN for everything else
 *         (including the null character)
 */
char_type get_char_type(wchar_t c) {
	// wcschr() treats the terminating null of the searched string as part of
	// the string, so L'\0' must be excluded explicitly or it would be
	// reported as punctuation.
	if (c != L'\0' &&
	    wcschr(L".,;-!?:'()[]{}/&"
	           L"\u00AD"  /* SOFT HYPHEN */
	           L"\u2019"  /* RIGHT SINGLE QUOTATION MARK */
	           L"\u2010"  /* HYPHEN */
	           L"\u2011"  /* NON-BREAKING HYPHEN */
	           L"\u2013"  /* EN DASH */
	           L"\u2014"  /* EM DASH */
	           L"\u201C"  /* LEFT DOUBLE QUOTATION MARK */
	           L"\u2026"  /* HORIZONTAL ELLIPSIS */
	           , c)) {
		return CHAR_PUNCTUATION;
	}
	if (isFinnishQuotationMark(c)) {
		return CHAR_PUNCTUATION;
	}
	if (SimpleChar::isWhitespace(c)) {
		return CHAR_WHITESPACE;
	}
	if ((c >= 0x41 && c <= 0x5A) || /* A-Z */
	    (c >= 0x61 && c <= 0x7A) || /* a-z */
	    (c >= 0xC0 && c <= 0xD6) || /* À-Ö (0xD7 is the multiplication sign) */
	    (c >= 0xD8 && c <= 0xF6) || /* Ø-ö (0xF7 is the division sign) */
	    (c >= 0x00F8 && c <= 0x024F) || /* ø-ɏ */
	    (c >= 0x0400 && c <= 0x0481) || /* Ѐ-ҁ - Cyrillic */
	    (c >= 0x048A && c <= 0x0523) || /* Ҋ-ԣ - Cyrillic */
	    (c >= 0xFB00 && c <= 0xFB04)) { /* ﬀ-ﬄ - Latin ligatures */
		return CHAR_LETTER;
	}
	// A range check is used instead of wcschr() so that L'\0' is not
	// mistaken for a digit.
	if (c >= L'0' && c <= L'9') {
		return CHAR_DIGIT;
	}
	return CHAR_UNKNOWN;
}
/**
 * Locates the start of the next sentence in a text.
 *
 * The text is read token by token. Once a possible sentence terminator has
 * been seen outside a quotation, the first following token that is not
 * whitespace marks the start of the next sentence.
 *
 * @param options     tokenizer options; the ignore_dot field is set to 0 for
 *                    the duration of each tokenizer call and restored
 *                    immediately afterwards
 * @param text        text to examine
 * @param textlen     number of characters in text to examine
 * @param sentencelen receives the offset of the start of the next sentence,
 *                    or textlen when no sentence boundary was found
 * @return SENTENCE_PROBABLE when the boundary follows a plain '.', '!' or '?',
 *         SENTENCE_POSSIBLE when the terminator was ambiguous (ellipsis,
 *         colon, a dot that dot_part_of_word() attributes to the preceding
 *         word, or '!'/'?' inside a quotation), SENTENCE_NONE when the text
 *         ran out before a boundary was confirmed
 */
voikko_sentence_type Sentence::next(voikko_options_t * options,
                                    const wchar_t * text, size_t textlen, size_t * sentencelen) {
	voikko_token_type token = TOKEN_WORD;
	size_t slen = 0;
	// Initialised so that "slen += tokenlen" is well defined even if the
	// tokenizer leaves it untouched.
	size_t tokenlen = 0;
	size_t previous_token_start = 0;
	voikko_token_type previous_token_type = TOKEN_NONE;
	bool end_found = false;
	bool in_quotation = false;
	bool end_dotword = false;
	bool possible_end_punctuation = false;
	while (token != TOKEN_NONE && textlen > slen) {
		// Tokenize with ignore_dot switched off, then restore the caller's setting.
		const int ignore_dot_saved = options->ignore_dot;
		options->ignore_dot = 0;
		token = tokenizer::Tokenizer::nextToken(options, text + slen,
		                                        textlen - slen, &tokenlen);
		options->ignore_dot = ignore_dot_saved;
		if (end_found && !in_quotation) {
			// A terminator has been seen: whitespace is skipped, anything
			// else starts the next sentence.
			if (token != TOKEN_WHITESPACE) {
				*sentencelen = slen;
				if (end_dotword || possible_end_punctuation) {
					return SENTENCE_POSSIBLE;
				}
				return SENTENCE_PROBABLE;
			}
		}
		else if (token == TOKEN_PUNCTUATION) {
			const wchar_t punct = text[slen];
			// Compared explicitly rather than with wcschr(L"!?", punct):
			// wcschr() also matches the terminating null, which would turn
			// a NUL character into a sentence terminator.
			if (punct == L'!' || punct == L'?') {
				end_found = true;
				if (in_quotation) {
					possible_end_punctuation = true;
				}
			}
			else if ((punct == L'.' && tokenlen == 3) || punct == L'\u2026') {
				// ellipsis
				end_found = true;
				possible_end_punctuation = true;
			}
			else if (punct == L'.') {
				end_found = true;
				// The dot may belong to the preceding word instead of
				// ending the sentence; the word is passed together with the dot.
				if (slen != 0 &&
				    previous_token_type == TOKEN_WORD &&
				    dot_part_of_word(options, text + previous_token_start,
				                     slen - previous_token_start + 1)) {
					end_dotword = true;
				}
			}
			else if (punct == L':') {
				end_found = true;
				possible_end_punctuation = true;
			}
			else if (isFinnishQuotationMark(punct) || punct == L'\u201C') {
				in_quotation = !in_quotation;
				if (!in_quotation && slen + 1 < textlen && text[slen + 1] == L',') {
					// Comma immediately after ending quote suggests that
					// the sentence most likely did not end here.
					end_found = false;
					possible_end_punctuation = false;
				}
			}
		}
		previous_token_start = slen;
		previous_token_type = token;
		slen += tokenlen;
	}
	*sentencelen = textlen;
	return SENTENCE_NONE;
}
static size_t word_length(const wchar_t * text, size_t textlen, voikko_options_t * options) { size_t wlen = 0; bool processing_number = false; const size_t urlLength = findUrlOrEmail(text, textlen); if (urlLength != 0) { return urlLength; } size_t adot; if (options->ignore_dot) { adot = 1; } else adot = 0; while (wlen < textlen) { switch (get_char_type(text[wlen])) { case CHAR_LETTER: processing_number = false; wlen++; break; case CHAR_DIGIT: processing_number = true; wlen++; break; case CHAR_WHITESPACE: case CHAR_UNKNOWN: return wlen; case CHAR_PUNCTUATION: switch (text[wlen]) { case L'\'': case L'\u2019': /* RIGHT SINGLE QUOTATION MARK */ case L':': if (wlen + 1 == textlen) return wlen; if (get_char_type(text[wlen+1]) == CHAR_LETTER) break; return wlen; case L'-': case L'\u00AD': /* SOFT HYPHEN */ case L'\u2010': /* HYPHEN */ case L'\u2011': /* NON-BREAKING HYPHEN */ if (wlen + 1 == textlen) { return wlen + 1; } if (isFinnishQuotationMark(text[wlen+1])) { return wlen + 1; } switch (get_char_type(text[wlen+1])) { case CHAR_LETTER: case CHAR_DIGIT: break; case CHAR_WHITESPACE: case CHAR_UNKNOWN: return wlen + 1; case CHAR_PUNCTUATION: if (text[wlen+1] == L',') { return wlen + 1; } return wlen; } break; case L'.': if (wlen + 1 == textlen) return wlen + adot; switch (get_char_type(text[wlen+1])) { case CHAR_LETTER: case CHAR_DIGIT: break; case CHAR_WHITESPACE: case CHAR_UNKNOWN: case CHAR_PUNCTUATION: return wlen + adot; } break; case L',': if (!processing_number) return wlen; if (wlen + 1 == textlen) return wlen; if (get_char_type(text[wlen+1]) == CHAR_DIGIT) break; return wlen; default: return wlen; } wlen++; } } return textlen; }