Exemple #1
0
/**
 * Classifies a single wide character.
 *
 * The checks run in this order: letter ranges, whitespace (as decided by
 * SimpleChar::isWhitespace), the fixed punctuation set, Finnish quotation
 * marks (as decided by isFinnishQuotationMark) and ASCII digits.
 *
 * @param c character to classify
 * @return CHAR_LETTER, CHAR_WHITESPACE, CHAR_PUNCTUATION or CHAR_DIGIT for
 *         the first check that matches, CHAR_UNKNOWN if none does
 */
char_type get_char_type(wchar_t c) {
	if ((c >= 0x41 && c <= 0x5A) || /* A-Z */
	    (c >= 0x61 && c <= 0x7A) || /* a-z */
	    (c >= 0xC0 && c <= 0xD6) || /* À-Ö (À is U+00C0; U+00D7 is the multiplication sign) */
	    (c >= 0xD8 && c <= 0xF6) || /* Ø-ö (U+00F7 is the division sign) */
	    (c >= 0x00F8 && c <= 0x02AF) || /* ø-ʯ - up to the end of IPA extensions */
	    (c >= 0x0400 && c <= 0x0481) || /* Ѐ-ҁ - Cyrillic */
	    (c >= 0x048A && c <= 0x0527) || /* Ҋ-ԧ - Cyrillic + Cyrillic extended */
	    (c >= 0x1400 && c <= 0x15C3) || /* ᐀-ᗃ - Canadian syllabics */
	    (c >= 0xFB00 && c <= 0xFB04)) { /* ﬀ-ﬄ - Latin ligatures */
		return CHAR_LETTER;
	}
	if (SimpleChar::isWhitespace(c)) {
		return CHAR_WHITESPACE;
	}
	// wcschr() considers the terminating null character part of the searched
	// string, so L'\0' has to be excluded explicitly or it would be reported
	// as punctuation.
	if (c != L'\0' &&
	    wcschr(L".,;-!?:'()[]{}/&"
	           L"\u00AD"  /* SOFT HYPHEN */
	           L"\u2019"  /* RIGHT SINGLE QUOTATION MARK */
	           L"\u2010"  /* HYPHEN */
	           L"\u2011"  /* NON-BREAKING HYPHEN */
	           L"\u2013"  /* EN DASH */
	           L"\u2014"  /* EM DASH */
	           L"\u201C"  /* LEFT DOUBLE QUOTATION MARK */
	           L"\u2026"  /* HORIZONTAL ELLIPSIS */
	           , c)) {
		return CHAR_PUNCTUATION;
	}
	if (isFinnishQuotationMark(c)) {
		return CHAR_PUNCTUATION;
	}
	// ASCII digits only; a range test cannot be fooled by L'\0' the way
	// wcschr() can.
	if (c >= L'0' && c <= L'9') {
		return CHAR_DIGIT;
	}
	return CHAR_UNKNOWN;
}
Exemple #2
0
/**
 * Classifies a single wide character.
 *
 * The checks run in this order: the fixed punctuation set, Finnish quotation
 * marks (as decided by isFinnishQuotationMark), whitespace (as decided by
 * SimpleChar::isWhitespace), letter ranges and ASCII digits.
 *
 * @param c character to classify
 * @return CHAR_PUNCTUATION, CHAR_WHITESPACE, CHAR_LETTER or CHAR_DIGIT for
 *         the first check that matches, CHAR_UNKNOWN if none does
 */
char_type get_char_type(wchar_t c) {
	// wcschr() considers the terminating null character part of the searched
	// string, so L'\0' has to be excluded explicitly or it would be reported
	// as punctuation.
	if (c != L'\0' &&
	    wcschr(L".,;-!?:'()[]{}/&"
	           L"\u00AD"  /* SOFT HYPHEN */
	           L"\u2019"  /* RIGHT SINGLE QUOTATION MARK */
	           L"\u2010"  /* HYPHEN */
	           L"\u2011"  /* NON-BREAKING HYPHEN */
	           L"\u2013"  /* EN DASH */
	           L"\u2014"  /* EM DASH */
	           L"\u201C"  /* LEFT DOUBLE QUOTATION MARK */
	           L"\u2026"  /* HORIZONTAL ELLIPSIS */
	           , c)) {
		return CHAR_PUNCTUATION;
	}
	if (isFinnishQuotationMark(c)) {
		return CHAR_PUNCTUATION;
	}
	if (SimpleChar::isWhitespace(c)) {
		return CHAR_WHITESPACE;
	}
	if ((c >= 0x41 && c <= 0x5A) || /* A-Z */
	    (c >= 0x61 && c <= 0x7A) || /* a-z */
	    (c >= 0xC0 && c <= 0xD6) || /* À-Ö (À is U+00C0; U+00D7 is the multiplication sign) */
	    (c >= 0xD8 && c <= 0xF6) || /* Ø-ö (U+00F7 is the division sign) */
	    (c >= 0x00F8 && c <= 0x024F) || /* ø-ɏ */
	    (c >= 0x0400 && c <= 0x0481) || /* Ѐ-ҁ - Cyrillic */
	    (c >= 0x048A && c <= 0x0523) || /* Ҋ-ԣ - Cyrillic */
	    (c >= 0xFB00 && c <= 0xFB04)) { /* ﬀ-ﬄ - Latin ligatures */
		return CHAR_LETTER;
	}
	// ASCII digits only; a range test cannot be fooled by L'\0' the way
	// wcschr() can.
	if (c >= L'0' && c <= L'9') {
		return CHAR_DIGIT;
	}
	return CHAR_UNKNOWN;
}
Exemple #3
0
/**
 * Walks through the text one token at a time looking for the end of the
 * first sentence.
 *
 * @param options     tokenizer options; ignore_dot is cleared for the duration
 *                    of every tokenizer call and restored right afterwards
 * @param text        text to scan
 * @param textlen     number of characters available in text
 * @param sentencelen receives the offset of the first non-whitespace token
 *                    that follows the sentence end, or textlen when no end
 *                    is found
 * @return SENTENCE_POSSIBLE when the end is uncertain (final dot accepted by
 *         dot_part_of_word as part of the preceding word, ellipsis, colon,
 *         or '!' / '?' seen inside a quotation), SENTENCE_PROBABLE for any
 *         other detected end, SENTENCE_NONE when the text runs out first
 */
voikko_sentence_type Sentence::next(voikko_options_t * options,
		const wchar_t * text, size_t textlen, size_t * sentencelen) {
	voikko_token_type token = TOKEN_WORD;
	voikko_token_type prevType = TOKEN_NONE;
	size_t pos = 0;
	size_t prevStart = 0;
	size_t tokenlen;
	bool endSeen = false;
	bool quoted = false;
	bool dotWordEnd = false;
	bool uncertainEnd = false;

	while (token != TOKEN_NONE && pos < textlen) {
		// Tokenize with ignore_dot switched off, then put the caller's
		// setting back.
		const int savedIgnoreDot = options->ignore_dot;
		options->ignore_dot = 0;
		token = tokenizer::Tokenizer::nextToken(options, text + pos,
		                               textlen - pos, &tokenlen);
		options->ignore_dot = savedIgnoreDot;

		if (endSeen && !quoted) {
			// Whitespace after the terminator still belongs to the
			// sentence; anything else starts the next one.
			if (token != TOKEN_WHITESPACE) {
				*sentencelen = pos;
				if (dotWordEnd || uncertainEnd) {
					return SENTENCE_POSSIBLE;
				}
				return SENTENCE_PROBABLE;
			}
		}
		else if (token == TOKEN_PUNCTUATION) {
			const wchar_t punct = text[pos];
			const bool isEllipsis =
				(punct == L'.' && tokenlen == 3) || punct == L'\u2026';
			if (wcschr(L"!?", punct)) {
				endSeen = true;
				if (quoted) {
					uncertainEnd = true;
				}
			}
			else if (isEllipsis || punct == L':') {
				// Ellipsis and colon only suggest a sentence end.
				endSeen = true;
				uncertainEnd = true;
			}
			else if (punct == L'.') {
				endSeen = true;
				// The dot may belong to the word in front of it.
				if (pos != 0 &&
				    prevType == TOKEN_WORD &&
				    dot_part_of_word(options, text + prevStart,
				      pos - prevStart + 1)) {
					dotWordEnd = true;
				}
			}
			else if (isFinnishQuotationMark(punct) || punct == L'\u201C') {
				quoted = !quoted;
				if (!quoted && pos + 1 < textlen && text[pos + 1] == L',') {
					// A comma right after the closing quote suggests
					// that the sentence most likely continues.
					endSeen = false;
					uncertainEnd = false;
				}
			}
		}

		prevStart = pos;
		prevType = token;
		pos += tokenlen;
	}

	*sentencelen = textlen;
	return SENTENCE_NONE;
}
Exemple #4
0
static size_t word_length(const wchar_t * text, size_t textlen, voikko_options_t * options) {
	size_t wlen = 0;
	bool processing_number = false;
	
	const size_t urlLength = findUrlOrEmail(text, textlen);
	if (urlLength != 0) {
		return urlLength;
	}
	
	size_t adot;
	if (options->ignore_dot) {
		adot = 1;
	}
	else adot = 0;
	
	while (wlen < textlen) {
		switch (get_char_type(text[wlen])) {
			case CHAR_LETTER:
				processing_number = false;
				wlen++;
				break;
			case CHAR_DIGIT:
				processing_number = true;
				wlen++;
				break;
			case CHAR_WHITESPACE:
			case CHAR_UNKNOWN:
				return wlen;
			case CHAR_PUNCTUATION:
				switch (text[wlen]) {
					case L'\'':
					case L'\u2019': /* RIGHT SINGLE QUOTATION MARK */
					case L':':
						if (wlen + 1 == textlen) return wlen;
						if (get_char_type(text[wlen+1]) ==
						    CHAR_LETTER) break;
						return wlen;
					case L'-':
					case L'\u00AD': /* SOFT HYPHEN */
					case L'\u2010': /* HYPHEN */
					case L'\u2011': /* NON-BREAKING HYPHEN */
						if (wlen + 1 == textlen) {
							return wlen + 1;
						}
						if (isFinnishQuotationMark(text[wlen+1])) {
							return wlen + 1;
						}
						switch (get_char_type(text[wlen+1])) {
							case CHAR_LETTER:
							case CHAR_DIGIT:
								break;
							case CHAR_WHITESPACE:
							case CHAR_UNKNOWN:
								return wlen + 1;
							case CHAR_PUNCTUATION:
								if (text[wlen+1] == L',') {
									return wlen + 1;
								}
								return wlen;
						}
						break;
					case L'.':
						if (wlen + 1 == textlen) return wlen + adot;
						switch (get_char_type(text[wlen+1])) {
							case CHAR_LETTER:
							case CHAR_DIGIT:
								break;
							case CHAR_WHITESPACE:
							case CHAR_UNKNOWN:
							case CHAR_PUNCTUATION:
								return wlen + adot;
						}
						break;
					case L',':
						if (!processing_number) return wlen;
						if (wlen + 1 == textlen) return wlen;
						if (get_char_type(text[wlen+1]) ==
						    CHAR_DIGIT) break;
						return wlen;
						
					default:
						return wlen;
				}
				wlen++;
		}
	}
	return textlen;
}