/* Copy a UCS-2 stream to a UCS-2 stream.  Each input code is looked up in
   the step table; when an entry is found, the codes stored after its first
   element are written in place of the input code, up to (and excluding) the
   first DONE or ELSE marker.  Codes without an entry are copied unchanged.
   A byte order mark is written first when the task asks for one, but only
   if the input holds at least one code.  */

bool
explode_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  Hash_table *explosions = (Hash_table *) subtask->step->step_table;
  unsigned code;

  /* Empty input yields empty output, without any byte order mark.  */
  if (!get_ucs2 (&code, subtask))
    SUBTASK_RETURN (subtask);

  if (subtask->task->byte_order_mark)
    put_ucs2 (BYTE_ORDER_MARK, subtask);

  do
    {
      /* The table is probed with a pointer to a 16-bit copy of the code.  */
      unsigned short key = code;
      unsigned short *entry
        = (unsigned short *) hash_lookup (explosions, &key);

      if (!entry)
        put_ucs2 (code, subtask);
      else
        /* Step over the first element, then emit the replacement codes.  */
        for (entry++; *entry != DONE && *entry != ELSE;)
          put_ucs2 (*entry++, subtask);
    }
  while (get_ucs2 (&code, subtask));

  SUBTASK_RETURN (subtask);
}
/* Copy a UCS-2 stream to a byte stream.  Each input code is looked up in
   the step table; when an entry is found, the values stored after its first
   element are written as bytes, up to (and excluding) the first DONE or
   ELSE marker.  Codes without an entry are written directly as a byte.  */

bool
explode_ucs2_byte (RECODE_SUBTASK subtask)
{
  Hash_table *explosions = (Hash_table *) subtask->step->step_table;
  unsigned code;

  while (get_ucs2 (&code, subtask))
    {
      /* The table is probed with a pointer to a 16-bit copy of the code.  */
      unsigned short key = code;
      unsigned short *entry
        = (unsigned short *) hash_lookup (explosions, &key);

      if (!entry)
        {
          put_byte (code, subtask);
          continue;
        }

      /* Step over the first element, then emit the replacement values.  The
         increment is kept out of the put_byte argument.  */
      for (entry++; *entry != DONE && *entry != ELSE; entry++)
        put_byte (*entry, subtask);
    }

  SUBTASK_RETURN (subtask);
}
/* Convert IBM PC text to the Icon QNX convention.  A DOS_CR DOS_LF pair
   becomes ENDLINE, a fixed set of bytes in the 128..151 range is handed to
   TRANSLATE_AND_BREAK with a pair of ASCII characters, and every other byte
   is copied as is.  ENDLINE and ESCAPE in the input are reported as
   RECODE_AMBIGUOUS_OUTPUT, and DOS_EOF as RECODE_NOT_CANONICAL before
   processing stops.

   NOTE(review): TRANSLATE_AND_BREAK is defined outside this function.  It
   presumably writes ESCAPE and the two characters, reads the next byte into
   `input_char' and leaves the switch -- confirm.  The variable name
   `input_char' is kept for that reason.  */

static bool
transform_ibmpc_iconqnx (RECODE_SUBTASK subtask)
{
  int input_char = get_byte (subtask);

  for (;;)
    switch (input_char)
      {
      case DOS_EOF:
        RETURN_IF_NOGO (RECODE_NOT_CANONICAL, subtask);
        /* Fall through.  */

      case EOF:
        SUBTASK_RETURN (subtask);

      case 133: TRANSLATE_AND_BREAK ('A', 'a');
      case 138: TRANSLATE_AND_BREAK ('A', 'e');
      case 151: TRANSLATE_AND_BREAK ('A', 'u');
      case 130: TRANSLATE_AND_BREAK ('B', 'e');
      case 144: TRANSLATE_AND_BREAK ('B', 'E');
      case 131: TRANSLATE_AND_BREAK ('C', 'a');
      case 136: TRANSLATE_AND_BREAK ('C', 'e');
      case 140: TRANSLATE_AND_BREAK ('C', 'i');
      case 147: TRANSLATE_AND_BREAK ('C', 'o');
      case 150: TRANSLATE_AND_BREAK ('C', 'u');
      case 137: TRANSLATE_AND_BREAK ('H', 'e');
      case 139: TRANSLATE_AND_BREAK ('H', 'i');
      case 129: TRANSLATE_AND_BREAK ('H', 'u');
      case 135: TRANSLATE_AND_BREAK ('K', 'c');
      case 128: TRANSLATE_AND_BREAK ('K', 'C');

      case DOS_CR:
        /* Look one byte ahead to recognise the DOS_CR DOS_LF pair.  */
        input_char = get_byte (subtask);
        if (input_char != DOS_LF)
          {
            /* Lone DOS_CR: copy it, and let the byte just read be handled
               by the next iteration.  */
            put_byte (DOS_CR, subtask);
            break;
          }
        put_byte (ENDLINE, subtask);
        input_char = get_byte (subtask);
        break;

      case ENDLINE:
      case ESCAPE:
        RETURN_IF_NOGO (RECODE_AMBIGUOUS_OUTPUT, subtask);
        /* Fall through.  */

      default:
        put_byte (input_char, subtask);
        input_char = get_byte (subtask);
      }
}
/* Copy a UCS-2 stream to a UCS-2 stream while combining sequences of codes.
   The current matching state is advanced with find_shifted_state for each
   input code.  When no further shift is possible from a non-NULL state,
   that state's result is written (or backtrack_ucs2 is called when the
   result is NOT_A_CHARACTER) and the same input code is examined again from
   the NULL state.  A code that cannot be shifted from the NULL state is
   copied unchanged.  A byte order mark is written first when the task asks
   for one, but only if the input holds at least one code.  */

bool
combine_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  struct state *current = NULL;
  unsigned value;
  bool more = true;

  /* Empty input yields empty output, without any byte order mark.  */
  if (!get_ucs2 (&value, subtask))
    SUBTASK_RETURN (subtask);

  if (subtask->task->byte_order_mark)
    put_ucs2 (BYTE_ORDER_MARK, subtask);

  while (more)
    {
      struct state *next
        = find_shifted_state (current, value, subtask->step);

      if (!next && current)
        {
          /* Dead end in the middle of a sequence: flush what was matched so
             far and retry the same code from the NULL state, without
             reading more input.  */
          if (current->result == NOT_A_CHARACTER)
            backtrack_ucs2 (current, subtask);
          else
            put_ucs2 (current->result, subtask);
          current = NULL;
          continue;
        }

      if (next)
        current = next;
      else
        put_ucs2 (value, subtask);

      more = get_ucs2 (&value, subtask);
    }

  /* Input ended in the middle of a sequence: flush the pending state.  */
  if (current)
    {
      if (current->result == NOT_A_CHARACTER)
        backtrack_ucs2 (current, subtask);
      else
        put_ucs2 (current->result, subtask);
    }

  SUBTASK_RETURN (subtask);
}
/* Convert a UCS-2 stream to HTML bytes.  A code found in the step table is
   written as `&' followed by the entry's string and `;'.  Any other code
   that is 127 or above, or below 32 except newline and tab, is written as
   a decimal reference `&#NNN;'.  Remaining codes are written as one byte.  */

static bool
transform_ucs2_html (RECODE_SUBTASK subtask)
{
  Hash_table *entities = subtask->step->step_table;
  unsigned code;

  while (get_ucs2 (&code, subtask))
    {
      struct ucs2_to_string key;
      struct ucs2_to_string *match;

      /* Only the code member of the probe is set before the lookup.  */
      key.code = code;
      match = hash_lookup (entities, &key);

      if (match)
        {
          const char *name;

          put_byte ('&', subtask);
          for (name = match->string; *name; name++)
            put_byte (*name, subtask);
          put_byte (';', subtask);
          continue;
        }

      if (code >= 127 || (code < 32 && code != '\n' && code != '\t'))
        {
          /* Largest power of ten handled, that is, up to five digits.  */
          unsigned scale = 10000;

          put_byte ('&', subtask);
          put_byte ('#', subtask);

          /* Drop leading zeroes by shrinking the scale to the magnitude of
             the code.  */
          while (scale > code)
            scale /= 10;

          /* Write all digits but the last, most significant first.  */
          for (; scale > 1; scale /= 10)
            {
              put_byte ('0' + code / scale, subtask);
              code %= scale;
            }

          /* What remains is the units digit.  */
          put_byte ('0' + code, subtask);
          put_byte (';', subtask);
        }
      else
        put_byte (code, subtask);
    }

  SUBTASK_RETURN (subtask);
}
/* Copy a byte stream to a byte stream while combining sequences of bytes.
   The current matching state is advanced with find_shifted_state for each
   input byte.  When no further shift is possible from a non-NULL state,
   that state's result is written (or backtrack_byte is called when the
   result is NOT_A_CHARACTER) and the same input byte is examined again from
   the NULL state.  A byte that cannot be shifted from the NULL state is
   copied unchanged.  */

bool
combine_byte_byte (RECODE_SUBTASK subtask)
{
  struct state *current = NULL;
  unsigned value;

  /* The byte is kept in an unsigned variable, so end of input is detected
     by comparing against EOF converted to unsigned.  */
  value = get_byte (subtask);
  if (value == (unsigned) EOF)
    SUBTASK_RETURN (subtask);

  for (;;)
    {
      struct state *next
        = find_shifted_state (current, value, subtask->step);

      if (!next && current)
        {
          /* Dead end in the middle of a sequence: flush what was matched so
             far and retry the same byte from the NULL state, without
             reading more input.  */
          if (current->result == NOT_A_CHARACTER)
            backtrack_byte (current, subtask);
          else
            put_byte (current->result, subtask);
          current = NULL;
          continue;
        }

      if (next)
        current = next;
      else
        put_byte (value, subtask);

      value = get_byte (subtask);
      if (value == (unsigned) EOF)
        break;
    }

  /* Input ended in the middle of a sequence: flush the pending state.  */
  if (current)
    {
      if (current->result == NOT_A_CHARACTER)
        backtrack_byte (current, subtask);
      else
        put_byte (current->result, subtask);
    }

  SUBTASK_RETURN (subtask);
}
/* Run one recoding step through iconv.  A conversion descriptor is opened
   from the step's `before' charset to its `after' charset, using their
   iconv names, then handed to wrapped_transform and closed afterwards.
   When the descriptor cannot be opened, RECODE_SYSTEM_ERROR is recorded on
   the subtask and nothing is converted.  */

bool
transform_with_iconv (RECODE_SUBTASK subtask)
{
  RECODE_CONST_STEP step = subtask->step;
  /* iconv_open takes the destination charset first, then the source.  */
  iconv_t descriptor
    = iconv_open (step->after->iconv_name, step->before->iconv_name);
  bool success;

  if (descriptor != (iconv_t) -1)
    {
      success = wrapped_transform (descriptor, subtask);
      iconv_close (descriptor);
      return success;
    }

  SET_SUBTASK_ERROR (RECODE_SYSTEM_ERROR, subtask);
  SUBTASK_RETURN (subtask);
}
/* Convert HTML bytes to a UCS-2 stream.  Bytes other than `&' are copied as
   UCS-2 codes.  After `&', three notations are recognised:

     &#[0-9]+;            decimal character reference
     &#[xX][0-9a-fA-F]+;  hexadecimal character reference
     &[A-Za-z][A-Za-z0-9]*;  named entity, looked up in the step table

   The terminating `;' is consumed when present but is not required.  When
   the request has `diacritics_only' set, well-formed numeric references are
   copied verbatim instead of being decoded.  Anything after `&' that is not
   a well-formed, known reference is copied verbatim, `&' included: this
   covers references whose value reaches 65535, references too long for the
   scratch buffer, unknown entity names, numeric references without any
   digit, and a `&' followed by neither `#' nor a letter.

   A byte order mark is written first whenever the input is not empty.  */

static bool
transform_html_ucs2 (RECODE_SUBTASK subtask)
{
  RECODE_CONST_REQUEST request = subtask->task->request;
  int input_char;

  input_char = get_byte (subtask);
  if (input_char != EOF)
    put_ucs2 (BYTE_ORDER_MARK, subtask);	/* FIXME: experimental */

  while (input_char != EOF)
    if (input_char == '&')
      {
        /* Text seen after the `&', kept so it can be echoed verbatim.  */
        char buffer[ENTITY_BUFFER_LENGTH];
        char *cursor = buffer;
        bool valid = true;	/* reference is well-formed so far */
        bool echo = false;	/* well-formed, yet to be copied verbatim */

        input_char = get_byte (subtask);

        if (input_char == '#')
          {
            unsigned value = 0;
            unsigned base = 10;
            bool seen_digit = false;

            /* Scan &#[0-9]+; or &#[xX][0-9a-fA-F]+; notation.  */

            *cursor++ = '#';
            input_char = get_byte (subtask);
            if (input_char == 'x' || input_char == 'X')
              {
                base = 16;
                *cursor++ = input_char;
                input_char = get_byte (subtask);
              }

            while (valid)
              {
                unsigned digit;

                if (input_char >= '0' && input_char <= '9')
                  digit = input_char - '0';
                else if (base == 16 && input_char >= 'A' && input_char <= 'F')
                  digit = input_char - 'A' + 10;
                else if (base == 16 && input_char >= 'a' && input_char <= 'f')
                  digit = input_char - 'a' + 10;
                else
                  break;

                seen_digit = true;
                value = base * value + digit;

                /* The offending digit is neither stored nor consumed, so
                   it gets copied by the main loop after the echo below.  */
                if (value >= 65535)
                  valid = false;
                else if (cursor == buffer + ENTITY_BUFFER_LENGTH - 2)
                  valid = false;
                else
                  {
                    *cursor++ = input_char;
                    input_char = get_byte (subtask);
                  }
              }

            /* Both notations require at least one digit.  Without this
               check, `&#;' or `&#x;' would decode to a spurious code 0.  */
            if (!seen_digit)
              valid = false;

            *cursor = '\0';

            if (valid)
              {
                if (request->diacritics_only)
                  echo = true;
                else
                  {
                    put_ucs2 (value, subtask);
                    if (input_char == ';')
                      input_char = get_byte (subtask);
                  }
              }
          }
        else if ((input_char >= 'A' && input_char <= 'Z')
                 || (input_char >= 'a' && input_char <= 'z'))
          {
            /* Scan &[A-Za-z][A-Za-z0-9]*; notation.  */

            *cursor++ = input_char;
            input_char = get_byte (subtask);

            while (valid
                   && input_char != EOF
                   && ((input_char >= 'A' && input_char <= 'Z')
                       || (input_char >= 'a' && input_char <= 'z')
                       || (input_char >= '0' && input_char <= '9')))
              if (cursor == buffer + ENTITY_BUFFER_LENGTH - 2)
                valid = false;
              else
                {
                  *cursor++ = input_char;
                  input_char = get_byte (subtask);
                }
            *cursor = '\0';

            if (valid)
              {
                struct ucs2_to_string lookup;
                struct ucs2_to_string *entry;

                /* Only the string member of the probe is set.  */
                lookup.string = buffer;
                entry = hash_lookup (subtask->step->step_table, &lookup);
                if (entry)
                  {
                    put_ucs2 (entry->code, subtask);
                    if (input_char == ';')
                      input_char = get_byte (subtask);
                  }
                else
                  valid = false;
              }
          }
        else
          {
            /* A `&' which starts no reference at all.  It must be echoed
               rather than dropped; the buffer is empty in this case.  */
            *cursor = '\0';
            valid = false;
          }

        if (echo || !valid)
          {
            put_ucs2 ('&', subtask);
            for (cursor = buffer; *cursor; cursor++)
              put_ucs2 (*cursor, subtask);
          }
      }
    else
      {
        put_ucs2 (input_char, subtask);
        input_char = get_byte (subtask);
      }

  SUBTASK_RETURN (subtask);
}
/* Convert the whole input of SUBTASK through the already opened iconv
   descriptor CONVERSION.  Input bytes are accumulated in a buffer of
   BUFFER_SIZE bytes, converted into an output buffer of the same size, and
   the converted bytes are written out after each iconv call.  Unconverted
   input is moved back to the start of the input buffer for the next round.

   Error handling, by errno value of the failing iconv call:
     E2BIG   output buffer full; it has been flushed, just go on.
     EILSEQ  invalid input, reported as RECODE_INVALID_INPUT; one input byte
             is skipped and the conversion state is drained.
     EINVAL  incomplete multibyte sequence; when this happens at end of
             input, it is reported as RECODE_INVALID_INPUT and the dangling
             bytes are discarded.
     other   reported through recode_perror and RECODE_SYSTEM_ERROR.

   Before returning, iconv is called once with a NULL input so that any
   pending shift state gets written.  */

static bool
wrapped_transform (iconv_t conversion, RECODE_SUBTASK subtask)
{
  char output_buffer[BUFFER_SIZE];
  char input_buffer[BUFFER_SIZE];
  int input_char = get_byte (subtask);
  /* Next free position in input_buffer; also reused below as a scratch
     pointer while flushing output_buffer.  */
  char *cursor = input_buffer;
  bool drain_first = false;

  while (true)
    {
      /* The output buffer is fully available at this point.  */

      char *input = input_buffer;
      char *output = output_buffer;
      size_t input_left = 0;
      size_t output_left = BUFFER_SIZE;
      int saved_errno = 0;
      size_t converted;

      if (drain_first)
        {
          /* Drain all accumulated partial state and emit output to return
             to the initial shift state.  */
          converted = iconv (conversion, NULL, NULL, &output, &output_left);
          if (converted == (size_t) -1)
            saved_errno = errno;
        }

      if (saved_errno == 0)
        {
          /* Continue filling the input buffer.  */
          while (input_char != EOF && cursor < input_buffer + BUFFER_SIZE)
            {
              *cursor++ = input_char;
              input_char = get_byte (subtask);
            }

          if (cursor == input_buffer)
            {
              if (output == output_buffer)
                {
                  /* All work has been done, just make sure we drained.  */
                  if (drain_first)
                    break;
                  drain_first = true;
                  continue;
                }
            }
          else
            {
              /* Convert accumulated input and add it to the output
                 buffer.  */
              input = input_buffer;
              input_left = cursor - input_buffer;
              converted = iconv (conversion, &input, &input_left,
                                 &output, &output_left);
              if (converted == (size_t) -1)
                saved_errno = errno;
            }
        }

      /* Send the converted result, so freeing the output buffer.  */
      for (cursor = output_buffer; cursor < output; cursor++)
        put_byte (*cursor, subtask);

      /* Act according to the outcome of the iconv call.  */
      drain_first = false;
      if (saved_errno != 0 && saved_errno != E2BIG)
        {
          if (saved_errno == EILSEQ)
            {
              /* Invalid input.  Skip one byte.  */
              RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
              assert (input_left > 0);
              input++;
              input_left--;
              /* Why is draining required?  */
              drain_first = true;
            }
          else if (saved_errno == EINVAL)
            {
              if (input + input_left < input_buffer + BUFFER_SIZE
                  && input_char == EOF)
                {
                  /* Incomplete multibyte sequence at end of input.  */
                  RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);

                  /* Getting here means the error is tolerated.  No more
                     input will ever complete the sequence, so discard the
                     dangling bytes.  Otherwise they would be moved back
                     and resubmitted to iconv forever, each round failing
                     with EINVAL again.  */
                  input += input_left;
                  input_left = 0;
                }
            }
          else
            {
              recode_perror (subtask->task->request->outer, "iconv ()");
              RETURN_IF_NOGO (RECODE_SYSTEM_ERROR, subtask);
            }
        }

      /* Move back any unprocessed part of the input buffer.  This also
         resets CURSOR to the next free input position.  */
      for (cursor = input_buffer; input_left != 0; input_left--)
        *cursor++ = *input++;
    }

  SUBTASK_RETURN (subtask);
}
/* Return the IBM PC byte value for the Icon QNX pair LEAD TRAIL, these
   being the two characters which follow ESCAPE, or -1 when the pair is not
   one of the known ones.  */

static int
iconqnx_pair_to_ibmpc (int lead, int trail)
{
  static const struct
  {
    char lead;
    char trail;
    unsigned char code;
  }
  pairs[] =
  {
    {'A', 'a', 133}, {'A', 'e', 138}, {'A', 'u', 151},
    {'B', 'e', 130}, {'B', 'E', 144},
    {'C', 'a', 131}, {'C', 'e', 136}, {'C', 'i', 140},
    {'C', 'o', 147}, {'C', 'u', 150},
    {'H', 'e', 137}, {'H', 'i', 139}, {'H', 'u', 129},
    {'K', 'c', 135}, {'K', 'C', 128}
  };
  unsigned index;

  for (index = 0; index < sizeof pairs / sizeof pairs[0]; index++)
    if (pairs[index].lead == lead && pairs[index].trail == trail)
      return pairs[index].code;

  return -1;
}

/* Convert Icon QNX text to IBM PC.  ENDLINE becomes DOS_CR DOS_LF.  ESCAPE
   followed by a known pair of characters becomes a single byte in the
   128..151 range.  ESCAPE followed by anything else is reported as
   RECODE_INVALID_INPUT and the bytes read are copied verbatim.  DOS_CR is
   copied, with DOS_CR DOS_LF being reported as RECODE_AMBIGUOUS_OUTPUT.
   All other bytes are copied as is.  */

static bool
transform_iconqnx_ibmpc (RECODE_SUBTASK subtask)
{
  int input_char = get_byte (subtask);	/* current character */

  for (;;)
    switch (input_char)
      {
      case EOF:
        SUBTASK_RETURN (subtask);

      case ENDLINE:
        put_byte (DOS_CR, subtask);
        put_byte (DOS_LF, subtask);
        input_char = get_byte (subtask);
        break;

      case DOS_CR:
        /* The byte after DOS_CR is left in input_char for the next
           iteration, whatever it is.  */
        input_char = get_byte (subtask);
        if (input_char == DOS_LF)
          RETURN_IF_NOGO (RECODE_AMBIGUOUS_OUTPUT, subtask);
        put_byte (DOS_CR, subtask);
        break;

      case ESCAPE:
        {
          int lead = get_byte (subtask);

          switch (lead)
            {
            case 'A':
            case 'B':
            case 'C':
            case 'H':
            case 'K':
              {
                int code;

                /* Only a known lead character causes a second byte to be
                   read.  */
                input_char = get_byte (subtask);
                code = iconqnx_pair_to_ibmpc (lead, input_char);
                if (code >= 0)
                  input_char = code;
                else
                  {
                    /* Unknown second character: copy ESCAPE and the lead
                       now, the second character through the default case
                       below.  */
                    RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
                    put_byte (ESCAPE, subtask);
                    put_byte (lead, subtask);
                    if (input_char == EOF)
                      SUBTASK_RETURN (subtask);
                  }
              }
              break;

            default:
              /* Unknown lead character: copy ESCAPE now, the lead through
                 the default case below.  */
              input_char = lead;
              RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
              put_byte (ESCAPE, subtask);
              if (input_char == EOF)
                SUBTASK_RETURN (subtask);
            }
        }
        /* Fall through.  */

      default:
        put_byte (input_char, subtask);
        input_char = get_byte (subtask);
      }
}