/* Copy UCS-2 input to UCS-2 output, replacing every code that has an entry
   in the step's hash table by the sequence stored in that entry.

   A table entry is an array of unsigned short: its first element is skipped
   and the following ones are written out until DONE or ELSE is met.  Codes
   without an entry are copied unchanged.  A byte order mark is written first
   when the task asks for one and the input is not empty.  */
bool
explode_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  Hash_table *explosions = (Hash_table *) subtask->step->step_table;
  unsigned code;

  if (get_ucs2 (&code, subtask))
    {
      if (subtask->task->byte_order_mark)
        put_ucs2 (BYTE_ORDER_MARK, subtask);

      do
        {
          /* The table is keyed on a 16-bit value.  */
          unsigned short key = code;
          unsigned short *expansion
            = (unsigned short *) hash_lookup (explosions, &key);

          if (expansion == NULL)
            put_ucs2 (code, subtask);
          else
            for (expansion++;
                 *expansion != DONE && *expansion != ELSE;
                 expansion++)
              put_ucs2 (*expansion, subtask);
        }
      while (get_ucs2 (&code, subtask));
    }

  SUBTASK_RETURN (subtask);
}
/* Copy UCS-2 input to UCS-2 output, replacing every code that has an entry
   in STEP's hash table by the sequence stored in that entry.

   A table entry is an array of unsigned short: its first element is skipped
   and the following ones are written out until DONE or ELSE is met.  Codes
   without an entry are copied unchanged.  A byte order mark is written first
   when TASK asks for one and the input is not empty.  */
bool
explode_ucs2_ucs2 (RECODE_CONST_STEP step, RECODE_TASK task)
{
  Hash_table *explosions = step->step_table;
  unsigned code;

  if (get_ucs2 (&code, step, task))
    {
      if (task->byte_order_mark)
        put_ucs2 (BYTE_ORDER_MARK, task);

      do
        {
          /* The table is keyed on a 16-bit value.  */
          unsigned short key = code;
          unsigned short *expansion = hash_lookup (explosions, &key);

          if (expansion == NULL)
            put_ucs2 (code, task);
          else
            for (expansion++;
                 *expansion != DONE && *expansion != ELSE;
                 expansion++)
              put_ucs2 (*expansion, task);
        }
      while (get_ucs2 (&code, step, task));
    }

  TASK_RETURN (task);
}
/* Write out the input characters that led to STATE when the combination
   they started did not complete.

   If STATE carries a result, that result is written and nothing else.
   Otherwise the states reached before it (following the `unshift' links)
   are written first, then the character stored in STATE itself, so the
   characters come out in the order they were read.

   The recursion stops at a null `unshift' link.  Callers look up the first
   state of a chain starting from a null state, so a first state presumably
   has no predecessor (to be confirmed against the state construction code);
   without this guard such a chain would dereference a null pointer.  */
static void
backtrack_ucs2 (struct state *state, RECODE_SUBTASK subtask)
{
  if (state->result == NOT_A_CHARACTER)
    {
      if (state->unshift != NULL)
        backtrack_ucs2 (state->unshift, subtask);
      put_ucs2 (state->character, subtask);
    }
  else
    put_ucs2 (state->result, subtask);
}
/* Copy UCS-2 input to UCS-2 output while merging character sequences.

   Each input code is offered to find_shifted_state together with the state
   reached so far.  While a follow-up state exists, input is consumed and
   nothing is written.  When the sequence cannot be extended, the pending
   state is flushed (its result, or the backtracked characters when it has
   no result) and the same input code is tried again from scratch.  Codes
   that start no sequence are copied unchanged.  A byte order mark is written
   first when the task asks for one and the input is not empty.  */
bool
combine_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  unsigned code;

  if (get_ucs2 (&code, subtask))
    {
      struct state *pending = NULL;
      bool more = true;

      if (subtask->task->byte_order_mark)
        put_ucs2 (BYTE_ORDER_MARK, subtask);

      while (more)
        {
          struct state *next
            = find_shifted_state (pending, code, subtask->step);

          if (next != NULL)
            {
              /* The sequence goes on: remember where we are, read ahead.  */
              pending = next;
              more = get_ucs2 (&code, subtask);
            }
          else if (pending != NULL)
            {
              /* Dead end: flush what was accumulated.  CODE is kept so it
                 gets another chance as the start of a new sequence.  */
              if (pending->result != NOT_A_CHARACTER)
                put_ucs2 (pending->result, subtask);
              else
                backtrack_ucs2 (pending, subtask);
              pending = NULL;
            }
          else
            {
              put_ucs2 (code, subtask);
              more = get_ucs2 (&code, subtask);
            }
        }

      /* Input ended in the middle of a sequence.  */
      if (pending != NULL)
        {
          if (pending->result != NOT_A_CHARACTER)
            put_ucs2 (pending->result, subtask);
          else
            backtrack_ucs2 (pending, subtask);
        }
    }

  SUBTASK_RETURN (subtask);
}
/* Read single bytes and write UCS-2 while merging byte sequences.

   Each input byte is offered to find_shifted_state together with the state
   reached so far.  While a follow-up state exists, input is consumed and
   nothing is written.  When the sequence cannot be extended, the pending
   state is flushed (its result, or the backtracked characters when it has
   no result) and the same byte is tried again from scratch.  Bytes that
   start no sequence are written unchanged.  A byte order mark is written
   first when TASK asks for one and the input is not empty.  */
bool
combine_byte_ucs2 (RECODE_CONST_STEP step, RECODE_TASK task)
{
  /* Kept unsigned as the byte is handed to find_shifted_state and put_ucs2;
     the comparisons with EOF rely on the usual arithmetic conversions.  */
  unsigned byte;

  byte = get_byte (task);
  if (byte != EOF)
    {
      struct state *pending = NULL;
      bool more = true;

      if (task->byte_order_mark)
        put_ucs2 (BYTE_ORDER_MARK, task);

      while (more)
        {
          struct state *next = find_shifted_state (pending, byte, step);

          if (next != NULL)
            {
              /* The sequence goes on: remember where we are, read ahead.  */
              pending = next;
              byte = get_byte (task);
              more = byte != EOF;
            }
          else if (pending != NULL)
            {
              /* Dead end: flush what was accumulated.  BYTE is kept so it
                 gets another chance as the start of a new sequence.  */
              if (pending->result != NOT_A_CHARACTER)
                put_ucs2 (pending->result, task);
              else
                backtrack_ucs2 (pending, task);
              pending = NULL;
            }
          else
            {
              put_ucs2 (byte, task);
              byte = get_byte (task);
              more = byte != EOF;
            }
        }

      /* Input ended in the middle of a sequence.  */
      if (pending != NULL)
        {
          if (pending->result != NOT_A_CHARACTER)
            put_ucs2 (pending->result, task);
          else
            backtrack_ucs2 (pending, task);
        }
    }

  TASK_RETURN (task);
}
/* Decode UTF-7 input bytes into 16-bit units written with put_ucs2.

   NOTE(review): the definition is cut off in this view -- the branch for
   bytes other than '+', the final return and the closing brace are not
   visible here, so only the '+' branch is described.

   A byte order mark is written first when TASK asks for one and the input
   is not empty.  After a '+', base64 characters are consumed in groups of
   up to eight, each full group yielding three 16-bit units:
     unit 1 = c1 << 10 | c2 << 4 | c3 >> 2             (2 bits of c3 left)
     unit 2 = left << 14 | c4 << 8 | c5 << 2 | c6 >> 4  (4 bits of c6 left)
     unit 3 = left << 12 | c7 << 6 | c8
   A group that stops in the middle of a unit, or right after a unit with
   non-zero leftover bits, is reported as RECODE_INVALID_INPUT.  A decoded
   unit for which IS_BODY_DIRECT holds is reported as RECODE_NOT_CANONICAL
   but still written.  After the base64 run, a '-' is consumed; when the
   byte following that '-' is not a base64 character, RECODE_NOT_CANONICAL
   is reported.  RETURN_IF_NOGO presumably returns from the function when
   the error is severe enough -- confirm against the macro definition.  */
static bool transform_utf7_utf16 (RECODE_CONST_STEP step, RECODE_TASK task) { int character; unsigned value; unsigned split; character = get_byte (task); if (character != EOF && task->byte_order_mark) put_ucs2 (BYTE_ORDER_MARK, task); while (character != EOF) if (character == '+') { character = get_byte (task); while (IS_BASE64 (character)) { /* Process first byte of first quadruplet. */ value = base64_char_to_value[character] << 10; character = get_byte (task); /* Process second byte of first quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task); break; } value |= base64_char_to_value[character] << 4; character = get_byte (task); /* Process third byte of first quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task); break; } split = base64_char_to_value[character]; value |= split >> 2; if (IS_BODY_DIRECT (value)) RETURN_IF_NOGO (RECODE_NOT_CANONICAL, step, task); put_ucs2 (value, task); character = get_byte (task); /* Process fourth byte of first quadruplet. */ if (!IS_BASE64 (character)) { if (MASK (2) & split) RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task); break; } value = ((MASK (2) & split) << 14 | base64_char_to_value[character] << 8); character = get_byte (task); /* Process first byte of second quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task); break; } value |= base64_char_to_value[character] << 2; character = get_byte (task); /* Process second byte of second quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task); break; } split = base64_char_to_value[character]; value |= split >> 4; if (IS_BODY_DIRECT (value)) RETURN_IF_NOGO (RECODE_NOT_CANONICAL, step, task); put_ucs2 (value, task); character = get_byte (task); /* Process third byte of second quadruplet. 
*/ if (!IS_BASE64 (character)) { if (MASK (4) & split) RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task); break; } value = ((MASK (4) & split) << 12 | base64_char_to_value[character] << 6); character = get_byte (task); /* Process fourth byte of second quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task); break; } value |= base64_char_to_value[character]; if (IS_BODY_DIRECT (value)) RETURN_IF_NOGO (RECODE_NOT_CANONICAL, step, task); put_ucs2 (value, task); character = get_byte (task); } if (character == '-') { character = get_byte (task); if (!IS_BASE64 (character)) RETURN_IF_NOGO (RECODE_NOT_CANONICAL, step, task); } }
/* Return the value of INPUT_CHAR taken as a digit in BASE (10 or 16), or -1
   when it is not such a digit (this includes EOF).  */
static int
html_digit_value (int input_char, unsigned base)
{
  if (input_char >= '0' && input_char <= '9')
    return input_char - '0';
  if (base == 16)
    {
      if (input_char >= 'A' && input_char <= 'F')
        return input_char - 'A' + 10;
      if (input_char >= 'a' && input_char <= 'f')
        return input_char - 'a' + 10;
    }
  return -1;
}

/* Read HTML text byte by byte and write it as UCS-2, replacing character
   references by the character they stand for.

   Recognised after an '&':
     &#[xX][0-9a-fA-F]+;   hexadecimal character reference
     &#[0-9]+;             decimal character reference
     &[A-Za-z][A-Za-z0-9]*; named entity, looked up in the step table
   The closing ';' is consumed when present but is not required.

   Whatever does not resolve is echoed verbatim: a numeric reference without
   any digit, a numeric value of 65535 or more, a reference too long for the
   scan buffer, an unknown entity name, and an '&' which starts no reference
   at all.  When the request has `diacritics_only' set, numeric references
   are echoed rather than converted.  A byte order mark is written first
   whenever the input is not empty.  */
static bool
transform_html_ucs2 (RECODE_SUBTASK subtask)
{
  RECODE_CONST_REQUEST request = subtask->task->request;
  int input_char;

  input_char = get_byte (subtask);
  if (input_char != EOF)
    put_ucs2 (BYTE_ORDER_MARK, subtask);	/* FIXME: experimental */

  while (input_char != EOF)
    if (input_char == '&')
      {
        /* BUFFER collects the text following '&' so it can be echoed, or
           used as the lookup key for a named entity.  */
        char buffer[ENTITY_BUFFER_LENGTH];
        char *cursor = buffer;
        bool valid = true;
        bool echo = false;

        input_char = get_byte (subtask);
        if (input_char == '#')
          {
            unsigned base = 10;
            unsigned value = 0;
            unsigned digits = 0;

            *cursor++ = '#';
            input_char = get_byte (subtask);
            if (input_char == 'x' || input_char == 'X')
              {
                base = 16;
                *cursor++ = input_char;
                input_char = get_byte (subtask);
              }

            while (valid)
              {
                int digit = html_digit_value (input_char, base);

                if (digit < 0)
                  break;
                value = base * value + (unsigned) digit;

                /* The offending character is neither stored nor consumed,
                   so it is processed as ordinary text after the echo.  */
                if (value >= 65535)
                  valid = false;
                else if (cursor == buffer + ENTITY_BUFFER_LENGTH - 2)
                  valid = false;
                else
                  {
                    *cursor++ = input_char;
                    digits++;
                    input_char = get_byte (subtask);
                  }
              }
            *cursor = '\0';

            /* "&#" or "&#x" with no digit at all is not a reference: echo it
               instead of emitting U+0000.  */
            if (digits == 0)
              valid = false;

            if (valid)
              {
                if (request->diacritics_only)
                  echo = true;
                else
                  {
                    put_ucs2 (value, subtask);
                    if (input_char == ';')
                      input_char = get_byte (subtask);
                  }
              }
          }
        else if ((input_char >= 'A' && input_char <= 'Z')
                 || (input_char >= 'a' && input_char <= 'z'))
          {
            /* Scan &[A-Za-z][A-Za-z0-9]*; notation.  */

            *cursor++ = input_char;
            input_char = get_byte (subtask);

            while (valid
                   && input_char != EOF
                   && ((input_char >= 'A' && input_char <= 'Z')
                       || (input_char >= 'a' && input_char <= 'z')
                       || (input_char >= '0' && input_char <= '9')))
              if (cursor == buffer + ENTITY_BUFFER_LENGTH - 2)
                valid = false;
              else
                {
                  *cursor++ = input_char;
                  input_char = get_byte (subtask);
                }
            *cursor = '\0';

            if (valid)
              {
                struct ucs2_to_string lookup;
                struct ucs2_to_string *entry;

                lookup.string = buffer;
                entry = hash_lookup (subtask->step->step_table, &lookup);
                if (entry)
                  {
                    put_ucs2 (entry->code, subtask);
                    if (input_char == ';')
                      input_char = get_byte (subtask);
                  }
                else
                  valid = false;
              }
          }
        else
          {
            /* A lone '&' (followed by neither '#' nor a letter, possibly by
               EOF) must not vanish from the output: echo it alone.  The
               character that follows is still pending in INPUT_CHAR.  */
            *cursor = '\0';
            valid = false;
          }

        if (echo || !valid)
          {
            put_ucs2 ('&', subtask);
            for (cursor = buffer; *cursor; cursor++)
              put_ucs2 (*cursor, subtask);
          }
      }
    else
      {
        put_ucs2 (input_char, subtask);
        input_char = get_byte (subtask);
      }

  SUBTASK_RETURN (subtask);
}
/*
 * Filter standard input to standard output, converting each byte from the
 * charset named by argv[1] to the one named by argv[2].
 *
 * The target may be "utf-8" or "ucs-2" (case-insensitive), in which case each
 * input byte is mapped through a 256-entry table built by make_utbl() from
 * the source charset and written with put_utf8() / put_ucs2().  Any other
 * target is loaded as a charset and each byte goes through the 256-entry
 * table filled by make_translation().
 *
 * The CLIPROOT environment variable, when set and non-empty, overrides the
 * global CLIPROOT.
 *
 * Exit status: 0 on success, 1 on bad usage, 2 when the source charset
 * cannot be loaded, 3 when the target charset cannot be loaded.
 */
int main(int argc, char **argv)
{
	char *p1, *p2;
	cons_CharsetEntry *cs1 = 0, *cs2 = 0;
	int len1 = 0, len2 = 0;
	unsigned char tbl[256];
	int ch;
	char *s;

	s = getenv("CLIPROOT");
	if (s && *s)
		CLIPROOT = s;

	if (argc < 3) {
		fprintf(stderr, "usage: %s source_charset target_charset\n", argv[0]);
		return 1;
	}

	p1 = argv[1];
	p2 = argv[2];

	if (load_charset_name(p1, &cs1, &len1)) {
		fprintf(stderr, "cannot load charset file '%s': %s\n", p1, strerror(errno));
		return 2;
	}

	if (!strcasecmp(p2, "utf-8")) {
		/* Fixed-size table: automatic storage, so there is no allocation
		   that can fail or leak. */
		unsigned short utbl[256];

		make_utbl(utbl, cs1, len1);
		while ((ch = getchar()) != EOF)
			put_utf8(utbl[ch & 0xff]);
		return 0;
	}

	if (!strcasecmp(p2, "ucs-2")) {
		unsigned short utbl[256];

		make_utbl(utbl, cs1, len1);
		while ((ch = getchar()) != EOF)
			put_ucs2(utbl[ch & 0xff]);
		return 0;
	}

	if (load_charset_name(p2, &cs2, &len2)) {
		fprintf(stderr, "cannot load charset file '%s': %s\n", p2, strerror(errno));
		return 3;
	}

	make_translation(cs1, len1, cs2, len2, tbl);

	while ((ch = getchar()) != EOF)
		putchar(tbl[ch & 0xff]);

	return 0;
}