Beispiel #1
0
/* Copy UCS-2 characters from SUBTASK's input to its output, replacing
   those found in the step table by their stored expansion.

   Every input value is looked up in the hash table kept in
   subtask->step->step_table.  On a hit, the 16-bit values that follow
   the first element of the entry are written until a DONE or ELSE
   marker is met; on a miss, the input value is written unchanged.  If
   the task has byte_order_mark set, a BYTE_ORDER_MARK is written first,
   but only when the input yields at least one character.

   The result is whatever SUBTASK_RETURN produces for SUBTASK.  */
bool
explode_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  Hash_table *expansions = (Hash_table *) subtask->step->step_table;
  unsigned value;

  if (get_ucs2 (&value, subtask))
    {
      if (subtask->task->byte_order_mark)
        put_ucs2 (BYTE_ORDER_MARK, subtask);

      do
        {
          /* The lookup key is handed over as a 16-bit copy of VALUE.  */
          unsigned short key = value;
          unsigned short *entry
            = (unsigned short *) hash_lookup (expansions, &key);

          if (!entry)
            {
              /* No expansion recorded: pass the character through.  */
              put_ucs2 (value, subtask);
            }
          else
            {
              /* Step over the first element, then emit up to the marker.  */
              for (entry++; *entry != DONE && *entry != ELSE; entry++)
                put_ucs2 (*entry, subtask);
            }
        }
      while (get_ucs2 (&value, subtask));
    }

  SUBTASK_RETURN (subtask);
}
Beispiel #2
0
/* Copy UCS-2 characters from TASK's input to its output, replacing those
   found in STEP's table by their stored expansion.

   Each value read with get_ucs2 is looked up in step->step_table.  When
   an entry exists, the 16-bit values after its first element are
   written until DONE or ELSE is reached; otherwise the value itself is
   written.  A BYTE_ORDER_MARK is written first when task->byte_order_mark
   is set and the input is not empty.

   The result is whatever TASK_RETURN produces for TASK.  */
bool
explode_ucs2_ucs2 (RECODE_CONST_STEP step, RECODE_TASK task)
{
  Hash_table *expansions = step->step_table;
  unsigned value;
  bool more = get_ucs2 (&value, step, task);

  if (more && task->byte_order_mark)
    put_ucs2 (BYTE_ORDER_MARK, task);

  while (more)
    {
      /* The lookup key is handed over as a 16-bit copy of VALUE.  */
      unsigned short key = value;
      unsigned short *entry = hash_lookup (expansions, &key);

      if (entry)
        {
          /* Step over the first element, then emit up to the marker.  */
          for (entry++; *entry != DONE && *entry != ELSE; entry++)
            put_ucs2 (*entry, task);
        }
      else
        {
          /* No expansion recorded: pass the character through.  */
          put_ucs2 (value, task);
        }

      more = get_ucs2 (&value, step, task);
    }

  TASK_RETURN (task);
}
Beispiel #3
0
/* Write to SUBTASK the UCS-2 output pending in STATE.

   When STATE carries a result other than NOT_A_CHARACTER, that result is
   written.  Otherwise the function first recurses on state->unshift and
   then writes state->character, so the characters of the chain come out
   starting from the far end of the unshift links.  */
static void
backtrack_ucs2 (struct state *state,  RECODE_SUBTASK subtask)
{
  if (state->result != NOT_A_CHARACTER)
    {
      put_ucs2 (state->result, subtask);
      return;
    }

  /* No result here: replay the earlier states, then this character.  */
  backtrack_ucs2 (state->unshift, subtask);
  put_ucs2 (state->character, subtask);
}
Beispiel #4
0
/* Read UCS-2 characters from SUBTASK and write UCS-2 back, driving the
   state machine exposed through find_shifted_state.

   For each input value, find_shifted_state is asked for a successor of
   the current state (NULL at the start of a sequence):
     - if one exists, it becomes the current state and the next value is
       read;
     - if none exists but a state is pending, that state is flushed (its
       result is written, or backtrack_ucs2 is called when the result is
       NOT_A_CHARACTER), the state is reset, and the same value is tried
       again from scratch;
     - if none exists and nothing is pending, the value is written as is
       and the next value is read.
   A state still pending when input ends is flushed the same way.  A
   BYTE_ORDER_MARK is written first when the task requests it and the
   input is not empty.

   The result is whatever SUBTASK_RETURN produces for SUBTASK.  */
bool
combine_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  unsigned value;

  if (get_ucs2 (&value, subtask))
    {
      struct state *current = NULL;

      if (subtask->task->byte_order_mark)
        put_ucs2 (BYTE_ORDER_MARK, subtask);

      for (;;)
        {
          struct state *next
            = find_shifted_state (current, value, subtask->step);

          if (!next && current)
            {
              /* Dead end: flush what was accumulated, retry VALUE.  */
              if (current->result != NOT_A_CHARACTER)
                put_ucs2 (current->result, subtask);
              else
                backtrack_ucs2 (current, subtask);
              current = NULL;
              continue;
            }

          if (next)
            current = next;
          else
            put_ucs2 (value, subtask);

          if (!get_ucs2 (&value, subtask))
            break;
        }

      /* Input ended in the middle of a sequence.  */
      if (current)
        {
          if (current->result != NOT_A_CHARACTER)
            put_ucs2 (current->result, subtask);
          else
            backtrack_ucs2 (current, subtask);
        }
    }

  SUBTASK_RETURN (subtask);
}
Beispiel #5
0
/* Read bytes from TASK and write UCS-2, driving the state machine
   exposed through find_shifted_state for STEP.

   For each input byte, find_shifted_state is asked for a successor of
   the current state (NULL at the start of a sequence):
     - if one exists, it becomes the current state and the next byte is
       read;
     - if none exists but a state is pending, that state is flushed (its
       result is written, or backtrack_ucs2 is called when the result is
       NOT_A_CHARACTER), the state is reset, and the same byte is tried
       again from scratch;
     - if none exists and nothing is pending, the byte is written as a
       UCS-2 value and the next byte is read.
   A state still pending at end of input is flushed the same way.  A
   BYTE_ORDER_MARK is written first when task->byte_order_mark is set
   and the input is not empty.

   The result is whatever TASK_RETURN produces for TASK.  */
bool
combine_byte_ucs2 (RECODE_CONST_STEP step, RECODE_TASK task)
{
  /* Kept unsigned as in the callers' protocol; the comparisons with EOF
     below rely on the usual arithmetic conversions.  */
  unsigned value = get_byte (task);

  if (value != EOF)
    {
      struct state *current = NULL;

      if (task->byte_order_mark)
        put_ucs2 (BYTE_ORDER_MARK, task);

      for (;;)
        {
          struct state *next = find_shifted_state (current, value, step);

          if (!next && current)
            {
              /* Dead end: flush what was accumulated, retry VALUE.  */
              if (current->result != NOT_A_CHARACTER)
                put_ucs2 (current->result, task);
              else
                backtrack_ucs2 (current, task);
              current = NULL;
              continue;
            }

          if (next)
            current = next;
          else
            put_ucs2 (value, task);

          value = get_byte (task);
          if (value == EOF)
            break;
        }

      /* Input ended in the middle of a sequence.  */
      if (current)
        {
          if (current->result != NOT_A_CHARACTER)
            put_ucs2 (current->result, task);
          else
            backtrack_ucs2 (current, task);
        }
    }

  TASK_RETURN (task);
}
Beispiel #6
0
static bool
transform_utf7_utf16 (RECODE_CONST_STEP step, RECODE_TASK task)
{
  int character;
  unsigned value;
  unsigned split;

  character = get_byte (task);

  if (character != EOF && task->byte_order_mark)
    put_ucs2 (BYTE_ORDER_MARK, task);

  while (character != EOF)
    if (character == '+')
      {
	character = get_byte (task);
	while (IS_BASE64 (character))
	  {
	    /* Process first byte of first quadruplet.  */

	    value = base64_char_to_value[character] << 10;
	    character = get_byte (task);

	    /* Process second byte of first quadruplet.  */

	    if (!IS_BASE64 (character))
	      {
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task);
		break;
	      }
	    value |= base64_char_to_value[character] << 4;
	    character = get_byte (task);

	    /* Process third byte of first quadruplet.  */

	    if (!IS_BASE64 (character))
	      {
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task);
		break;
	      }
	    split = base64_char_to_value[character];
	    value |= split >> 2;
	    if (IS_BODY_DIRECT (value))
	      RETURN_IF_NOGO (RECODE_NOT_CANONICAL, step, task);
	    put_ucs2 (value, task);
	    character = get_byte (task);

	    /* Process fourth byte of first quadruplet.  */

	    if (!IS_BASE64 (character))
	      {
		if (MASK (2) & split)
		  RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task);
		break;
	      }
	    value = ((MASK (2) & split) << 14
		     | base64_char_to_value[character] << 8);
	    character = get_byte (task);

	    /* Process first byte of second quadruplet.  */

	    if (!IS_BASE64 (character))
	      {
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task);
		break;
	      }
	    value |= base64_char_to_value[character] << 2;
	    character = get_byte (task);

	    /* Process second byte of second quadruplet.  */

	    if (!IS_BASE64 (character))
	      {
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task);
		break;
	      }
	    split = base64_char_to_value[character];
	    value |= split >> 4;
	    if (IS_BODY_DIRECT (value))
	      RETURN_IF_NOGO (RECODE_NOT_CANONICAL, step, task);
	    put_ucs2 (value, task);
	    character = get_byte (task);

	    /* Process third byte of second quadruplet.  */

	    if (!IS_BASE64 (character))
	      {
		if (MASK (4) & split)
		  RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task);
		break;
	      }
	    value = ((MASK (4) & split) << 12
		     | base64_char_to_value[character] << 6);
	    character = get_byte (task);

	    /* Process fourth byte of second quadruplet.  */

	    if (!IS_BASE64 (character))
	      {
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, step, task);
		break;
	      }
	    value |= base64_char_to_value[character];
	    if (IS_BODY_DIRECT (value))
	      RETURN_IF_NOGO (RECODE_NOT_CANONICAL, step, task);
	    put_ucs2 (value, task);
	    character = get_byte (task);
	  }
	if (character == '-')
	  {
	    character = get_byte (task);
	    if (!IS_BASE64 (character))
	      RETURN_IF_NOGO (RECODE_NOT_CANONICAL, step, task);
	  }
      }
Beispiel #7
0
/* Read HTML text as bytes from SUBTASK and write it as UCS-2, decoding
   character references on the way.

   A BYTE_ORDER_MARK is written first whenever the input is not empty.
   Bytes other than '&' are written unchanged.  After an '&':

     - "#" followed by decimal digits, or "#x"/"#X" followed by
       hexadecimal digits, is a numeric reference.  It is accepted when
       at least one digit is present, the value stays below 65535 and
       the text fits the entity buffer.  An accepted reference is written
       as its value, and a directly following ';' is consumed; when
       request->diacritics_only is set, the reference text is echoed
       instead and the ';' is left for normal processing.

     - a letter followed by letters or digits is a named reference.  It
       is looked up in subtask->step->step_table; on a hit the entry's
       code is written and a directly following ';' is consumed.

     - anything else is not a reference: the '&' is written as is.

   Rejected references are echoed as '&' plus the text scanned so far;
   the character that stopped the scan is then processed normally.

   The result is whatever SUBTASK_RETURN produces for SUBTASK.  */
static bool
transform_html_ucs2 (RECODE_SUBTASK subtask)
{
  RECODE_CONST_REQUEST request = subtask->task->request;
  int input_char;

  input_char = get_byte (subtask);
  if (input_char != EOF)
    put_ucs2 (BYTE_ORDER_MARK, subtask);	/* FIXME: experimental */

  while (input_char != EOF)
    {
      char buffer[ENTITY_BUFFER_LENGTH];
      char *cursor = buffer;
      bool valid = true;
      bool echo = false;

      if (input_char != '&')
        {
          put_ucs2 (input_char, subtask);
          input_char = get_byte (subtask);
          continue;
        }

      input_char = get_byte (subtask);
      if (input_char == '#')
        {
          /* Scan &#[0-9]+; or &#[xX][0-9a-fA-F]+; notation.  */

          unsigned value = 0;
          unsigned base = 10;
          bool seen_digit = false;

          *cursor++ = '#';
          input_char = get_byte (subtask);
          if (input_char == 'x' || input_char == 'X')
            {
              base = 16;
              *cursor++ = input_char;
              input_char = get_byte (subtask);
            }

          while (valid)
            {
              unsigned digit;

              if (input_char >= '0' && input_char <= '9')
                digit = input_char - '0';
              else if (base == 16 && input_char >= 'A' && input_char <= 'F')
                digit = input_char - 'A' + 10;
              else if (base == 16 && input_char >= 'a' && input_char <= 'f')
                digit = input_char - 'a' + 10;
              else
                break;

              value = base * value + digit;

              /* Reject values outside the accepted range and texts that
                 would not leave room for the terminating NUL.  */
              if (value >= 65535
                  || cursor == buffer + ENTITY_BUFFER_LENGTH - 2)
                valid = false;
              else
                {
                  *cursor++ = input_char;
                  input_char = get_byte (subtask);
                  seen_digit = true;
                }
            }

          /* "&#" or "&#x" without any digit is not a reference; without
             this check it would be decoded as U+0000.  */
          if (!seen_digit)
            valid = false;

          *cursor = '\0';

          if (valid)
            {
              if (request->diacritics_only)
                echo = true;
              else
                {
                  put_ucs2 (value, subtask);
                  if (input_char == ';')
                    input_char = get_byte (subtask);
                }
            }
        }
      else if ((input_char >= 'A' && input_char <= 'Z')
               || (input_char >= 'a' && input_char <= 'z'))
        {
          /* Scan &[A-Za-z][A-Za-z0-9]*; notation.  */

          *cursor++ = input_char;
          input_char = get_byte (subtask);

          while (valid
                 && input_char != EOF
                 && ((input_char >= 'A' && input_char <= 'Z')
                     || (input_char >= 'a' && input_char <= 'z')
                     || (input_char >= '0' && input_char <= '9')))
            {
              if (cursor == buffer + ENTITY_BUFFER_LENGTH - 2)
                valid = false;
              else
                {
                  *cursor++ = input_char;
                  input_char = get_byte (subtask);
                }
            }
          *cursor = '\0';

          if (valid)
            {
              struct ucs2_to_string lookup;
              struct ucs2_to_string *entry;

              lookup.string = buffer;
              entry = hash_lookup (subtask->step->step_table, &lookup);
              if (entry)
                {
                  put_ucs2 (entry->code, subtask);
                  if (input_char == ';')
                    input_char = get_byte (subtask);
                }
              else
                valid = false;
            }
        }
      else
        {
          /* A bare '&': nothing was scanned, so echo just the ampersand
             instead of silently dropping it.  The character after it is
             still in INPUT_CHAR and is handled by the next iteration.  */
          *cursor = '\0';
          valid = false;
        }

      if (echo || !valid)
        {
          put_ucs2 ('&', subtask);
          for (cursor = buffer; *cursor; cursor++)
            put_ucs2 (*cursor, subtask);
        }
    }

  SUBTASK_RETURN (subtask);
}
Beispiel #8
0
/* Filter standard input to standard output, converting each byte from
   the charset named by argv[1] to the target named by argv[2].

   If the CLIPROOT environment variable is set and not empty, it replaces
   the CLIPROOT global before any charset is loaded.  The target may be
   "utf-8" or "ucs-2" (compared case-insensitively), in which case each
   input byte is mapped through a 256-entry table built by make_utbl and
   written with put_utf8 or put_ucs2; otherwise the target is loaded as a
   second charset and bytes are mapped through the table built by
   make_translation.

   Exit status: 0 on success, 1 on bad usage, 2 when the source charset
   cannot be loaded, 3 when the target charset cannot be loaded.  */
int
main(int argc, char **argv)
{
   char *p1, *p2;

   cons_CharsetEntry *cs1 = 0, *cs2 = 0;

   int len1 = 0, len2 = 0;

   unsigned char tbl[256];

   int ch;

   int to_utf8, to_ucs2;

   char *s;

   s = getenv("CLIPROOT");
   if (s && *s)
      CLIPROOT = s;

   if (argc < 3)
   {
      fprintf(stderr, "usage: %s source_charset target_charset\n", argv[0]);
      return 1;
   }

   p1 = argv[1];
   p2 = argv[2];

   if (load_charset_name(p1, &cs1, &len1))
   {
      fprintf(stderr, "cannot load charset file '%s': %s\n", p1, strerror(errno));
      return 2;
   }

   to_utf8 = !strcasecmp(p2, "utf-8");
   to_ucs2 = !to_utf8 && !strcasecmp(p2, "ucs-2");

   if (to_utf8 || to_ucs2)
   {
      /* One entry per possible input byte; an automatic array avoids the
         unchecked (and never freed) heap allocation.  */
      unsigned short utbl[256];

      make_utbl(utbl, cs1, len1);
      while ((ch = getchar()) != EOF)
      {
         if (to_utf8)
            put_utf8(utbl[ch & 0xff]);
         else
            put_ucs2(utbl[ch & 0xff]);
      }

      return 0;
   }

   if (load_charset_name(p2, &cs2, &len2))
   {
      fprintf(stderr, "cannot load charset file '%s': %s\n", p2, strerror(errno));
      return 3;
   }

   make_translation(cs1, len1, cs2, len2, tbl);

   while ((ch = getchar()) != EOF)
   {
      putchar(tbl[ch & 0xff]);
   }

   return 0;
}