Beispiel #1
0
/* Copy a UCS-2 stream to a UCS-2 stream, expanding through the step table.
   Each input code is looked up in the hash table of the current step.  When
   an entry is found, the codes stored after its first cell are written, up
   to the first DONE or ELSE marker; otherwise the input code is copied as
   is.  A byte order mark is written first when the task requests one, but
   only if the input holds at least one code.  */

bool
explode_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  Hash_table *table = (Hash_table *) subtask->step->step_table;
  unsigned value;

  /* Empty input produces no output at all, not even the mark.  */
  if (!get_ucs2 (&value, subtask))
    {
      SUBTASK_RETURN (subtask);
    }

  if (subtask->task->byte_order_mark)
    put_ucs2 (BYTE_ORDER_MARK, subtask);

  do
    {
      unsigned short key = value;
      unsigned short *expansion
        = (unsigned short *) hash_lookup (table, &key);

      if (!expansion)
        put_ucs2 (value, subtask);
      else
        {
          /* Skip the first cell (presumably the key itself), then write
             everything up to the terminating marker.  */
          for (expansion++;
               *expansion != DONE && *expansion != ELSE;
               expansion++)
            put_ucs2 (*expansion, subtask);
        }
    }
  while (get_ucs2 (&value, subtask));

  SUBTASK_RETURN (subtask);
}
Beispiel #2
0
/* Copy a UCS-2 stream to a byte stream, expanding through the step table.
   Each input code is looked up in the hash table of the current step.  When
   an entry is found, the values stored after its first cell are written as
   bytes, up to the first DONE or ELSE marker; otherwise the input code
   itself is written as a byte.  */

bool
explode_ucs2_byte (RECODE_SUBTASK subtask)
{
  Hash_table *table = (Hash_table *) subtask->step->step_table;
  unsigned value;

  while (get_ucs2 (&value, subtask))
    {
      unsigned short key = value;
      unsigned short *expansion
        = (unsigned short *) hash_lookup (table, &key);

      if (!expansion)
        {
          /* No entry: the code goes through unchanged.  */
          put_byte (value, subtask);
          continue;
        }

      /* Skip the first cell (presumably the key itself).  The pointer is
         advanced outside of the put_byte argument on purpose.  */
      for (expansion++;
           *expansion != DONE && *expansion != ELSE;
           expansion++)
        {
          put_byte (*expansion, subtask);
        }
    }

  SUBTASK_RETURN (subtask);
}
Beispiel #3
0
/* Filter a byte stream, going by the function name from IBM-PC to Icon/QNX.

   Bytes are read one at a time and dispatched on their value:
   - fifteen specific codes in the range 128..151 are handed to
     TRANSLATE_AND_BREAK together with two ASCII letters;
   - a DOS_CR immediately followed by DOS_LF becomes a single ENDLINE,
     while a DOS_CR followed by anything else is copied as is;
   - ENDLINE and ESCAPE found in the input are reported as
     RECODE_AMBIGUOUS_OUTPUT and then copied;
   - DOS_EOF is reported as RECODE_NOT_CANONICAL and ends the processing;
   - every other byte is copied unchanged.

   The function returns through SUBTASK_RETURN at end of input, or earlier
   if one of the RETURN_IF_NOGO checks decides to stop.  */

static bool
transform_ibmpc_iconqnx (RECODE_SUBTASK subtask)
{
  int input_char;

  input_char = get_byte (subtask);
  /* Each switch case is responsible for leaving the next unprocessed byte
     in input_char before the loop comes around again.  */
  while (true)
    switch (input_char)
      {
      case DOS_EOF:
	RETURN_IF_NOGO (RECODE_NOT_CANONICAL, subtask);
	/* Fall through.  */

      case EOF:
	SUBTASK_RETURN (subtask);

      /* NOTE(review): TRANSLATE_AND_BREAK is defined outside this block.
	 It presumably writes ESCAPE followed by the two letters, reads the
	 next byte into input_char and breaks out of the switch -- confirm
	 against the macro definition.  */
      case 133: TRANSLATE_AND_BREAK ('A', 'a');
      case 138: TRANSLATE_AND_BREAK ('A', 'e');
      case 151: TRANSLATE_AND_BREAK ('A', 'u');
      case 130: TRANSLATE_AND_BREAK ('B', 'e');
      case 144: TRANSLATE_AND_BREAK ('B', 'E');
      case 131: TRANSLATE_AND_BREAK ('C', 'a');
      case 136: TRANSLATE_AND_BREAK ('C', 'e');
      case 140: TRANSLATE_AND_BREAK ('C', 'i');
      case 147: TRANSLATE_AND_BREAK ('C', 'o');
      case 150: TRANSLATE_AND_BREAK ('C', 'u');
      case 137: TRANSLATE_AND_BREAK ('H', 'e');
      case 139: TRANSLATE_AND_BREAK ('H', 'i');
      case 129: TRANSLATE_AND_BREAK ('H', 'u');
      case 135: TRANSLATE_AND_BREAK ('K', 'c');
      case 128: TRANSLATE_AND_BREAK ('K', 'C');

      case DOS_CR:
	/* Look one byte ahead to tell a CR LF pair from a lone CR.  */
	input_char = get_byte (subtask);
	if (input_char == DOS_LF)
	  {
	    put_byte (ENDLINE, subtask);
	    input_char = get_byte (subtask);
	  }
	else
	  /* Lone CR: copy it, the look-ahead byte is handled next turn.  */
	  put_byte (DOS_CR, subtask);
	break;

      case ENDLINE:
      case ESCAPE:
	/* These bytes are also produced by the translations above, so the
	   output can no longer be told apart from translated input.  */
	RETURN_IF_NOGO (RECODE_AMBIGUOUS_OUTPUT, subtask);
	/* Fall through.  */

      default:
	put_byte (input_char, subtask);
	input_char = get_byte (subtask);
      }
}
Beispiel #4
0
/* Copy a UCS-2 stream to a UCS-2 stream, combining sequences of codes.
   The function walks the states reachable through find_shifted_state for
   as long as successive input codes allow.  When no further shift is
   possible from a pending state, either the result of that state is
   written, or, if that result is NOT_A_CHARACTER, backtrack_ucs2 is called
   for it; the code which could not be shifted is then examined again from
   the initial state.  Codes which start no sequence are copied unchanged.
   A byte order mark is written first when the task requests one, but only
   if the input holds at least one code.  */

bool
combine_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  struct state *pending = NULL;
  unsigned value;
  bool more = true;

  /* Empty input produces no output at all, not even the mark.  */
  if (!get_ucs2 (&value, subtask))
    {
      SUBTASK_RETURN (subtask);
    }

  if (subtask->task->byte_order_mark)
    put_ucs2 (BYTE_ORDER_MARK, subtask);

  while (more)
    {
      struct state *next
        = find_shifted_state (pending, value, subtask->step);

      if (next)
        {
          /* The code extends the current sequence: consume it.  */
          pending = next;
          more = get_ucs2 (&value, subtask);
          continue;
        }

      if (!pending)
        {
          /* The code does not start any sequence: copy and consume it.  */
          put_ucs2 (value, subtask);
          more = get_ucs2 (&value, subtask);
          continue;
        }

      /* The sequence stops here.  Flush it, and retry the same code from
         the initial state without reading anything new.  */
      if (pending->result == NOT_A_CHARACTER)
        backtrack_ucs2 (pending, subtask);
      else
        put_ucs2 (pending->result, subtask);
      pending = NULL;
    }

  /* Input ended in the middle of a sequence: flush what is pending.  */
  if (pending)
    {
      if (pending->result == NOT_A_CHARACTER)
        backtrack_ucs2 (pending, subtask);
      else
        put_ucs2 (pending->result, subtask);
    }

  SUBTASK_RETURN (subtask);
}
Beispiel #5
0
/* Convert a UCS-2 stream into HTML text written as bytes.
   Each code is looked up in the hash table of the current step.  A code
   having an entry is written as `&', the string of that entry, and `;'.
   Otherwise, a code below 32 (newline and tab excepted) or not below 127 is
   written as a decimal reference `&#NNN;'.  Any other code is written as a
   single byte.  */

static bool
transform_ucs2_html (RECODE_SUBTASK subtask)
{
  Hash_table *table = subtask->step->step_table;
  unsigned value;

  while (get_ucs2 (&value, subtask))
    {
      struct ucs2_to_string key;
      struct ucs2_to_string *found;
      unsigned scale;

      key.code = value;
      found = hash_lookup (table, &key);

      if (found)
        {
          /* Named entity.  */
          const char *name;

          put_byte ('&', subtask);
          for (name = found->string; *name; name++)
            {
              put_byte (*name, subtask);
            }
          put_byte (';', subtask);
          continue;
        }

      if (value < 127 && (value >= 32 || value == '\n' || value == '\t'))
        {
          /* Plain character, copied as is.  */
          put_byte (value, subtask);
          continue;
        }

      /* Decimal reference.  Start from the largest power of ten not
         exceeding the value, so no leading zero gets printed, then write
         one digit per power of ten.  */
      put_byte ('&', subtask);
      put_byte ('#', subtask);

      scale = 10000;
      while (scale > value)
        scale /= 10;

      for (; scale > 1; scale /= 10)
        {
          put_byte ('0' + value / scale, subtask);
          value %= scale;
        }

      put_byte ('0' + value, subtask);
      put_byte (';', subtask);
    }

  SUBTASK_RETURN (subtask);
}
Beispiel #6
0
/* Copy a byte stream to a byte stream, combining sequences of bytes.
   The function walks the states reachable through find_shifted_state for
   as long as successive input bytes allow.  When no further shift is
   possible from a pending state, either the result of that state is
   written, or, if that result is NOT_A_CHARACTER, backtrack_byte is called
   for it; the byte which could not be shifted is then examined again from
   the initial state.  Bytes which start no sequence are copied
   unchanged.  */

bool
combine_byte_byte (RECODE_SUBTASK subtask)
{
  struct state *pending = NULL;
  unsigned value;
  bool more;

  value = get_byte (subtask);
  more = value != EOF;

  while (more)
    {
      struct state *next
        = find_shifted_state (pending, value, subtask->step);

      if (next)
        {
          /* The byte extends the current sequence: consume it.  */
          pending = next;
          value = get_byte (subtask);
          more = value != EOF;
          continue;
        }

      if (!pending)
        {
          /* The byte does not start any sequence: copy and consume it.  */
          put_byte (value, subtask);
          value = get_byte (subtask);
          more = value != EOF;
          continue;
        }

      /* The sequence stops here.  Flush it, and retry the same byte from
         the initial state without reading anything new.  */
      if (pending->result == NOT_A_CHARACTER)
        backtrack_byte (pending, subtask);
      else
        {
          put_byte (pending->result, subtask);
        }
      pending = NULL;
    }

  /* Input ended in the middle of a sequence: flush what is pending.  */
  if (pending)
    {
      if (pending->result == NOT_A_CHARACTER)
        backtrack_byte (pending, subtask);
      else
        {
          put_byte (pending->result, subtask);
        }
    }

  SUBTASK_RETURN (subtask);
}
Beispiel #7
0
/* Run one recoding step through the iconv library.
   A conversion descriptor is opened from the iconv name of the charset
   before the step to the iconv name of the charset after it, the actual
   work is delegated to wrapped_transform, and the descriptor is closed
   afterwards.  If the descriptor cannot be opened, RECODE_SYSTEM_ERROR is
   recorded on the subtask and nothing is converted.  */

bool
transform_with_iconv (RECODE_SUBTASK subtask)
{
  RECODE_CONST_STEP step = subtask->step;
  iconv_t conversion;
  bool status;

  /* iconv_open takes the destination charset first.  */
  conversion = iconv_open (step->after->iconv_name,
                           step->before->iconv_name);

  if (conversion != (iconv_t) -1)
    {
      status = wrapped_transform (conversion, subtask);
      iconv_close (conversion);
      return status;
    }

  SET_SUBTASK_ERROR (RECODE_SYSTEM_ERROR, subtask);
  SUBTASK_RETURN (subtask);
}
Beispiel #8
0
/* Convert HTML text read as bytes into a UCS-2 stream.

   A byte order mark is written first whenever the input is not empty.
   Ordinary bytes are copied as UCS-2 codes.  After an `&', three notations
   are recognized:

     &#[xX][0-9a-fA-F]+;   hexadecimal character reference
     &#[0-9]+;             decimal character reference
     &[A-Za-z][A-Za-z0-9]*;  named entity, looked up in the step table

   A recognized reference is replaced by its code, and a `;' immediately
   following it is swallowed.  When the request has diacritics_only set,
   numeric references are left untouched.  Whenever a reference is not
   acceptable (value not below 65535, text too long for the scan buffer,
   unknown name, no digit at all, or `&' followed by something which starts
   none of the notations above), the `&' and the text consumed so far are
   written literally, and scanning resumes with the byte which stopped the
   scan.  */

static bool
transform_html_ucs2 (RECODE_SUBTASK subtask)
{
  RECODE_CONST_REQUEST request = subtask->task->request;
  int input_char;

  input_char = get_byte (subtask);
  if (input_char != EOF)
    put_ucs2 (BYTE_ORDER_MARK, subtask);	/* FIXME: experimental */

  while (input_char != EOF)
    {
      if (input_char == '&')
        {
          /* BUFFER accumulates the text following the `&', so it may be
             echoed back if the reference turns out not to be usable.  */
          char buffer[ENTITY_BUFFER_LENGTH];
          char *cursor = buffer;
          bool valid = true;
          bool echo = false;

          input_char = get_byte (subtask);
          if (input_char == '#')
            {
              input_char = get_byte (subtask);
              if (input_char == 'x' || input_char == 'X')
                {
                  unsigned value = 0;

                  /* Scan &#[xX][0-9a-fA-F]+; notation.  */

                  *cursor++ = '#';
                  *cursor++ = input_char;
                  input_char = get_byte (subtask);

                  while (valid)
                    {
                      if (input_char >= '0' && input_char <= '9')
                        value = 16 * value + input_char - '0';
                      else if (input_char >= 'A' && input_char <= 'F')
                        value = 16 * value + input_char - 'A' + 10;
                      else if (input_char >= 'a' && input_char <= 'f')
                        value = 16 * value + input_char - 'a' + 10;
                      else
                        break;

                      if (value >= 65535)
                        valid = false;
                      else if (cursor == buffer + ENTITY_BUFFER_LENGTH - 2)
                        valid = false;
                      else
                        {
                          *cursor++ = input_char;
                          input_char = get_byte (subtask);
                        }
                    }

                  /* Only `#' and the `x' were stored: there was no digit,
                     so this is not a reference.  Echo it instead of
                     producing a spurious zero code.  */
                  if (cursor == buffer + 2)
                    valid = false;

                  if (!valid)
                    *cursor = '\0';
                  else if (request->diacritics_only)
                    {
                      echo = true;
                      *cursor = '\0';
                    }
                  else
                    {
                      put_ucs2 (value, subtask);
                      if (input_char == ';')
                        input_char = get_byte (subtask);
                    }
                }
              else
                {
                  unsigned value = 0;

                  /* Scan &#[0-9]+; notation.  */

                  *cursor++ = '#';

                  while (valid)
                    {
                      if (input_char >= '0' && input_char <= '9')
                        value = 10 * value + input_char - '0';
                      else
                        break;

                      if (value >= 65535)
                        valid = false;
                      else if (cursor == buffer + ENTITY_BUFFER_LENGTH - 2)
                        valid = false;
                      else
                        {
                          *cursor++ = input_char;
                          input_char = get_byte (subtask);
                        }
                    }

                  /* Only `#' was stored: there was no digit, so this is
                     not a reference.  Echo it instead of producing a
                     spurious zero code.  */
                  if (cursor == buffer + 1)
                    valid = false;

                  if (!valid)
                    *cursor = '\0';
                  else if (request->diacritics_only)
                    {
                      echo = true;
                      *cursor = '\0';
                    }
                  else
                    {
                      put_ucs2 (value, subtask);
                      if (input_char == ';')
                        input_char = get_byte (subtask);
                    }
                }
            }
          else if ((input_char >= 'A' && input_char <= 'Z')
                   || (input_char >= 'a' && input_char <= 'z'))
            {
              /* Scan &[A-Za-z][A-Za-z0-9]*; notation.  */

              *cursor++ = input_char;
              input_char = get_byte (subtask);

              while (valid
                     && input_char != EOF
                     && ((input_char >= 'A' && input_char <= 'Z')
                         || (input_char >= 'a' && input_char <= 'z')
                         || (input_char >= '0' && input_char <= '9')))
                {
                  if (cursor == buffer + ENTITY_BUFFER_LENGTH - 2)
                    valid = false;
                  else
                    {
                      *cursor++ = input_char;
                      input_char = get_byte (subtask);
                    }
                }
              *cursor = '\0';

              if (valid)
                {
                  struct ucs2_to_string lookup;
                  struct ucs2_to_string *entry;

                  lookup.string = buffer;
                  entry = hash_lookup (subtask->step->step_table, &lookup);
                  if (entry)
                    {
                      put_ucs2 (entry->code, subtask);
                      if (input_char == ';')
                        input_char = get_byte (subtask);
                    }
                  else
                    valid = false;
                }
            }
          else
            {
              /* A bare `&', followed by something which cannot start a
                 reference, or by the end of input.  Nothing was stored in
                 the buffer; mark the scan invalid so the `&' itself gets
                 echoed below rather than being silently lost.  */
              *cursor = '\0';
              valid = false;
            }

          if (echo || !valid)
            {
              put_ucs2 ('&', subtask);
              for (cursor = buffer; *cursor; cursor++)
                put_ucs2 (*cursor, subtask);
            }
        }
      else
        {
          put_ucs2 (input_char, subtask);
          input_char = get_byte (subtask);
        }
    }

  SUBTASK_RETURN (subtask);
}
Beispiel #9
0
static bool
wrapped_transform (iconv_t conversion, RECODE_SUBTASK subtask)
{
  char output_buffer[BUFFER_SIZE];
  char input_buffer[BUFFER_SIZE];
  int input_char = get_byte (subtask);
  char *cursor = input_buffer;
  bool drain_first = false;

  while (true)
    {
      /* The output buffer is fully avaible at this point.  */

      char *input = input_buffer;
      char *output = output_buffer;
      size_t input_left = 0;
      size_t output_left = BUFFER_SIZE;
      int saved_errno = 0;
      size_t converted;

      if (drain_first)
        {
          /* Drain all accumulated partial state and emit output
             to return to the initial shift state.  */
          converted = iconv (conversion, NULL, NULL, &output, &output_left);
          if (converted == (size_t) -1)
            saved_errno = errno;
        }

      if (saved_errno == 0)
        {
          /* Continue filling the input buffer.  */
          while (input_char != EOF && cursor < input_buffer + BUFFER_SIZE)
            {
              *cursor++ = input_char;
              input_char = get_byte (subtask);
            }

          if (cursor == input_buffer)
            {
              if (output == output_buffer)
                {
                  /* All work has been done, just make sure we drained.  */
                  if (drain_first)
                    break;
                  drain_first = true;
                  continue;
                }
            }
          else
            {
              /* Convert accumulated input and add it to the output buffer.  */
              input = input_buffer;
              input_left = cursor - input_buffer;
              converted = iconv (conversion,
                                 &input, &input_left,
                                 &output, &output_left);
              if (converted == (size_t) -1)
                saved_errno = errno;
            }
        }

      /* Send the converted result, so freeing the output buffer.  */
      for (cursor = output_buffer; cursor < output; cursor++)
        put_byte (*cursor, subtask);

      /* Act according to the outcome of the iconv call.  */

      drain_first = false;
      if (saved_errno != 0 && saved_errno != E2BIG)
	{
	  if (saved_errno == EILSEQ)
	    {
	      /* Invalid input.  Skip one byte.  */
	      RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
	      assert (input_left > 0);
	      input++;
	      input_left--;
	      /* Why is draining required?  */
	      drain_first = true;
	    }
	  else if (saved_errno == EINVAL)
	    {
	      if (input + input_left < input_buffer + BUFFER_SIZE
		  && input_char == EOF)
		/* Incomplete multibyte sequence at end of input.  */
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
	    }
	  else
	    {
	      recode_perror (subtask->task->request->outer, "iconv ()");
	      RETURN_IF_NOGO (RECODE_SYSTEM_ERROR, subtask);
	    }
	}

      /* Move back any unprocessed part of the input buffer.  */
      for (cursor = input_buffer; input_left != 0; input_left--)
        *cursor++ = *input++;
    }

  SUBTASK_RETURN (subtask);
}
Beispiel #10
0
static bool
transform_iconqnx_ibmpc (RECODE_SUBTASK subtask)
{
  int input_char;		/* current character */

  input_char = get_byte (subtask);
  while (true)
    switch (input_char)
      {
      case EOF:
	SUBTASK_RETURN (subtask);

      case ENDLINE:
	put_byte (DOS_CR, subtask);
	put_byte (DOS_LF, subtask);
	input_char = get_byte (subtask);
	break;

      case DOS_CR:
	input_char = get_byte (subtask);
	if (input_char == DOS_LF)
	  RETURN_IF_NOGO (RECODE_AMBIGUOUS_OUTPUT, subtask);
	put_byte (DOS_CR, subtask);
	break;

      case ESCAPE:
	input_char = get_byte (subtask);
	switch (input_char)
	  {
	  case 'A':
	    input_char = get_byte (subtask);
	    switch (input_char)
	      {
	      case 'a': input_char = 133; break;
	      case 'e': input_char = 138; break;
	      case 'u': input_char = 151; break;

	      default:
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
		put_byte (ESCAPE, subtask);
		put_byte ('A', subtask);
		if (input_char == EOF)
		  SUBTASK_RETURN (subtask);
	      }
	    break;

	  case 'B':
	    input_char = get_byte (subtask);
	    switch (input_char)
	      {
	      case 'e': input_char = 130; break;
	      case 'E': input_char = 144; break;

	      default:
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
		put_byte (ESCAPE, subtask);
		put_byte ('B', subtask);
		if (input_char == EOF)
		  SUBTASK_RETURN (subtask);
	      }
	    break;

	  case 'C':
	    input_char = get_byte (subtask);
	    switch (input_char)
	      {
	      case 'a': input_char = 131; break;
	      case 'e': input_char = 136; break;
	      case 'i': input_char = 140; break;
	      case 'o': input_char = 147; break;
	      case 'u': input_char = 150; break;

	      default:
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
		put_byte (ESCAPE, subtask);
		put_byte ('C', subtask);
		if (input_char == EOF)
		  SUBTASK_RETURN (subtask);
	      }
	    break;

	  case 'H':
	    input_char = get_byte (subtask);
	    switch (input_char)
	      {
	      case 'e': input_char = 137; break;
	      case 'i': input_char = 139; break;
	      case 'u': input_char = 129; break;

	      default:
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
		put_byte (ESCAPE, subtask);
		put_byte ('H', subtask);
		if (input_char == EOF)
		  SUBTASK_RETURN (subtask);
	      }
	    break;

	  case 'K':
	    input_char = get_byte (subtask);
	    switch (input_char)
	      {
	      case 'c': input_char = 135; break;
	      case 'C': input_char = 128; break;

	      default:
		RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
		put_byte (ESCAPE, subtask);
		put_byte ('K', subtask);
		if (input_char == EOF)
		  SUBTASK_RETURN (subtask);
	      }
	    break;

	  default:
	    RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
	    put_byte (ESCAPE, subtask);
	    if (input_char == EOF)
	      SUBTASK_RETURN (subtask);
	  }
	/* Fall through.  */

      default:
	put_byte (input_char, subtask);
	input_char = get_byte (subtask);
      }
}