Beispiel #1
0
// Apply just the OCR-specific normalizations and return the normalized char.
char32 OCRNormalize(char32 ch) {
  if (is_hyphen_punc(ch))
    return '-';
  else if (is_single_quote(ch))
    return '\'';
  else if (is_double_quote(ch))
    return '"';
  return ch;
}
void fix_file (const std::wstring& in, const std::wstring& out)
{
    std::wcout << L"in:" << in << L"\nout:" << out << L'\n';
    rsc::istream fsin(in);
    rsc::ostream fsout(out);

    int count = 0;
    bool quote = false;
    char16_t before = L'\0';
    char16_t current, after;
    fsin.get(current);
    while (!fsin.eof())
    {
        fsin.get(after);

        if (is_double_quote(current))
            quote = !quote;

        if (!quote)
        {
            // not in quote, do nothing
        }
        else if (is_floating_vowel(current))
        {
            if (is_long_tail(before))
            {
                count++;
                current = move_floating_left(current);
            }
        }
        else if (is_floating_vowel(current))
        {
            if (is_long_tail(before))
            {
                count++;
                current = move_floating_left(current);
            }
        }
        else if (is_tone_marker(current))
        {
            if (is_floating_vowel(before))
            {
                if (is_long_tail(before))
                {
                    count++;
                    current = move_tone_marker_left_height(current);
                }
            }
            else if (is_long_tail(before))
            {
                if (is_backward_floating_vowel(after))
                {
                    count++;
                    current = move_tone_marker_left_height(current);
                }
                else
                {
                    count++;
                    current = move_tone_marker_left_low(current);
                }
            }
            else if (!is_backward_floating_vowel(after))
            {
                count++;
                current = move_tone_marker_low(current);
            }
        }
        else if (is_lower_foot(current))
        {
            if (is_lower_vowel(after))
            {
                count++;
                current = strip_lower_foot(current);
            }
        }
        else if (is_lower_vowel(current))
        {
            if (is_lower_tail(before))
            {
                count++;
                current = move_lower_vowel_left(current);
            }
        }

        fsout.put(current);

        if (!quote)
        {
        }
        else if (is_digit(current))
        {
            if (is_argument(before) && is_argument(after))
            {
                count++;
                fsout.put(argument_prefix());
            }
        }

        before = current;
        current = after;
    } // while

    if (count > 0)
        std::wcout << L'\n';
    std::wcout << L"fixed:" << count << std::endl;
}