Beispiel #1
0
/* Read the next awk token from the input and store it in *TP.

   Whitespace, newlines and stray backslashes are skipped.  On return:
     - tp->line_number holds the value line_number had just before the
       token's first character was read;
     - tp->type is the token class;
     - tp->string is an xstrdup'ed copy of the text for symbol, string and
       i18n-string tokens (it is not assigned for any other token type).

   As a side effect the function maintains prefer_division_over_regexp,
   which records whether a '/' that follows the token just returned should
   be read as the division operator (true) or as the start of a regular
   expression (false).  It also updates last_non_comment_line and may call
   savable_comment_reset () when a newline is seen.  */
static void
x_awk_lex (token_ty *tp)
{
  /* Scratch buffer shared by all calls; it only ever grows and is not
     released by this function.  */
  static char *buffer;
  static int bufmax;
  int bufpos;
  int c;

  for (;;)
    {
      tp->line_number = line_number;
      c = phase2_getc ();

      /* First switch: end of input and everything that is skipped.  */
      switch (c)
        {
        case EOF:
          tp->type = token_type_eof;
          return;

        case '\n':
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          /* Newline is not allowed inside expressions.  It usually
             introduces a fresh statement.
             FIXME: Newlines after any of ',' '{' '?' ':' '||' '&&' 'do' 'else'
             does *not* introduce a fresh statement.  */
          prefer_division_over_regexp = false;
          /* FALLTHROUGH */
        case '\t':
        case ' ':
          /* Ignore whitespace and comments.  */
          continue;

        case '\\':
          /* Backslash ought to be immediately followed by a newline.  */
          /* The backslash itself is simply dropped; whatever follows it is
             lexed by the next loop iteration.  */
          continue;
        }

      last_non_comment_line = tp->line_number;

      /* Second switch: real tokens.  Every path below returns.  */
      switch (c)
        {
        case '.':
          {
            /* Peek at the next character: a '.' starts a number only when
               a digit follows; otherwise it is an ordinary operator.  */
            int c2 = phase2_getc ();
            phase2_ungetc (c2);
            if (!(c2 >= '0' && c2 <= '9'))
              {

                tp->type = token_type_other;
                prefer_division_over_regexp = false;
                return;
              }
          }
          /* FALLTHROUGH */
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
        case 'Y': case 'Z':
        case '_':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
        case 's': case 't': case 'u': case 'v': case 'w': case 'x':
        case 'y': case 'z':
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          /* Symbol, or part of a number.  */
          bufpos = 0;
          for (;;)
            {
              /* Grow the scratch buffer before storing the character.  */
              if (bufpos >= bufmax)
                {
                  bufmax = 2 * bufmax + 10;
                  buffer = xrealloc (buffer, bufmax);
                }
              buffer[bufpos++] = c;
              c = phase2_getc ();
              switch (c)
                {
                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
                case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
                case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
                case 'Y': case 'Z':
                case '_':
                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
                case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
                case 's': case 't': case 'u': case 'v': case 'w': case 'x':
                case 'y': case 'z':
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                  continue;
                default:
                  /* A lone '_' directly followed by a double quote marks
                     an i18n string; its contents are read by the string
                     code below.  */
                  if (bufpos == 1 && buffer[0] == '_' && c == '"')
                    {
                      tp->type = token_type_i18nstring;
                      goto case_string;
                    }
                  phase2_ungetc (c);
                  break;
                }
              /* Reached only through the 'default' branch above: the
                 symbol is complete, leave the scanning loop.  */
              break;
            }
          /* Make room for the terminating NUL.  */
          if (bufpos >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[bufpos] = '\0';
          tp->string = xstrdup (buffer);
          tp->type = token_type_symbol;
          /* Most identifiers can be variable names; after them we must
             interpret '/' as division operator.  But for awk's builtin
             keywords we have three cases:
             (a) Must interpret '/' as division operator. "length".
             (b) Must interpret '/' as start of a regular expression.
                 "do", "exit", "print", "printf", "return".
             (c) '/' after this keyword in invalid anyway. All others.
             I used the following script for the distinction.
                for k in $awk_keywords; do
                  echo; echo $k; awk "function foo () { $k / 10 }" < /dev/null
                done
           */
          if (strcmp (buffer, "do") == 0
              || strcmp (buffer, "exit") == 0
              || strcmp (buffer, "print") == 0
              || strcmp (buffer, "printf") == 0
              || strcmp (buffer, "return") == 0)
            prefer_division_over_regexp = false;
          else
            prefer_division_over_regexp = true;
          return;

        case '"':
          tp->type = token_type_string;
          /* Also entered via goto from the symbol case for _"...", with
             tp->type already set to token_type_i18nstring.  */
        case_string:
          bufpos = 0;
          for (;;)
            {
              /* Collect characters until phase7_getc reports EOF or
                 P7_QUOTES (presumably the unescaped closing quote; it is
                 defined outside this function).  */
              c = phase7_getc ();
              if (c == EOF || c == P7_QUOTES)
                break;
              if (bufpos >= bufmax)
                {
                  bufmax = 2 * bufmax + 10;
                  buffer = xrealloc (buffer, bufmax);
                }
              buffer[bufpos++] = c;
            }
          if (bufpos >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[bufpos] = '\0';
          tp->string = xstrdup (buffer);
          prefer_division_over_regexp = true;
          return;

        case '(':
          tp->type = token_type_lparen;
          prefer_division_over_regexp = false;
          return;

        case ')':
          tp->type = token_type_rparen;
          prefer_division_over_regexp = true;
          return;

        case ',':
          tp->type = token_type_comma;
          prefer_division_over_regexp = false;
          return;

        case ';':
          tp->type = token_type_semicolon;
          prefer_division_over_regexp = false;
          return;

        case ']':
          /* A closing subscript ends an operand, so a following '/' is a
             division.  */
          tp->type = token_type_other;
          prefer_division_over_regexp = true;
          return;

        case '/':
          if (!prefer_division_over_regexp)
            {
              /* Regular expression.
                 Counting brackets is non-trivial. [[] is balanced, and so is
                 [\]]. Also, /[/]/ is balanced and ends at the third slash.
                 Do not count [ or ] if either one is preceded by a \.
                 A '[' should be counted if
                  a) it is the first one so far (brackets == 0), or
                  b) it is the '[' in '[:'.
                 A ']' should be counted if not preceded by a \.
                 According to POSIX, []] is how you put a ] into a set.
                 Try to handle that too.
               */
              int brackets = 0;
              bool pos0 = true;         /* true at start of regexp */
              bool pos1_open = false;   /* true after [ at start of regexp */
              bool pos2_open_not = false; /* true after [^ at start of regexp */

              /* The regexp body is read with phase1_getc rather than
                 phase2_getc, and is consumed without being stored.  */
              for (;;)
                {
                  c = phase1_getc ();

                  if (c == EOF || c == '\n')
                    {
                      /* Push the terminator back so that the next call
                         sees it, warn, and give up on this regexp.  */
                      phase1_ungetc (c);
                      error_with_progname = false;
                      error (0, 0, _("%s:%d: warning: unterminated regular expression"),
                             logical_file_name, line_number);
                      error_with_progname = true;
                      break;
                    }
                  else if (c == '[')
                    {
                      if (brackets == 0)
                        brackets++;
                      else
                        {
                          /* Inside a bracket expression only "[:" opens a
                             nested level; peek at the next character and
                             push it back.  */
                          c = phase1_getc ();
                          if (c == ':')
                            brackets++;
                          phase1_ungetc (c);
                        }
                      if (pos0)
                        {
                          /* '[' as the very first character: remember it
                             and skip the flag reset at the loop's end.  */
                          pos0 = false;
                          pos1_open = true;
                          continue;
                        }
                    }
                  else if (c == ']')
                    {
                      /* A ']' right after a leading "[" or "[^" is a
                         literal member of the set and closes nothing.  */
                      if (!(pos1_open || pos2_open_not))
                        brackets--;
                    }
                  else if (c == '^')
                    {
                      if (pos1_open)
                        {
                          pos1_open = false;
                          pos2_open_not = true;
                          continue;
                        }
                    }
                  else if (c == '\\')
                    {
                      /* Swallow the escaped character without examining
                         it, so an escaped '/', '[' or ']' is not counted.  */
                      c = phase1_getc ();
                      /* Backslash-newline is valid and ignored.  */
                    }
                  else if (c == '/')
                    {
                      /* A slash outside any bracket expression ends the
                         regexp.  */
                      if (brackets <= 0)
                        break;
                    }

                  /* Any character handled without 'continue' means we are
                     no longer at the start of the regexp.  */
                  pos0 = false;
                  pos1_open = false;
                  pos2_open_not = false;
                }

              tp->type = token_type_other;
              prefer_division_over_regexp = false;
              return;
            }
          /* FALLTHROUGH */

        default:
          /* We could carefully recognize each of the 2 and 3 character
             operators, but it is not necessary, as we only need to recognize
             gettext invocations.  Don't bother.  */
          tp->type = token_type_other;
          prefer_division_over_regexp = false;
          return;
        }
    }
}
Beispiel #2
0
/* Tell whether CH may be part of a symbol or number: one of the letters
   A-Z or a-z, a decimal digit, or an underscore.  Returns 1 if so and 0
   otherwise (in particular for EOF).  */
static int
phase5_symbol_char (int ch)
{
  switch (ch)
    {
    case '_':
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
    case 'V': case 'W': case 'X': case 'Y': case 'Z':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
    case 'v': case 'w': case 'x': case 'y': case 'z':
      return 1;

    default:
      return 0;
    }
}

/* Return BUF, enlarged with xrealloc when index POS does not fit into its
   current capacity *CAP.  On growth the capacity becomes 2 * *CAP + 10.  */
static char *
phase5_make_room (char *buf, int *cap, int pos)
{
  if (pos >= *cap)
    {
      *cap = 2 * *cap + 10;
      buf = xrealloc (buf, *cap);
    }
  return buf;
}

/* Fetch the next token into *TP.

   A token waiting in phase5_pushback is handed out first.  Otherwise
   whitespace is skipped and the token is classified by its first
   character:
     - end of input            -> token_type_eof
     - letters/digits/'_' run  -> token_type_symbol, tp->string set to an
                                  xstrdup'ed copy of the text
     - a lone '_' before '('   -> token_type_i18n (the '(' is consumed and
                                  tp->string is not assigned)
     - '"'                     -> token_type_string_literal, with
                                  tp->string and tp->comment set
     - '(' ')' ','             -> token_type_lparen / _rparen / _comma
     - anything else           -> token_type_other
   tp->line_number receives the value of line_number sampled just before
   the token's first character was read.  */
static void
phase5_get (token_ty *tp)
{
  /* Scratch space reused from call to call; it only grows.  */
  static char *buffer;
  static int bufmax;
  int len;
  int ch;

  /* Pushed-back tokens come out again in last-in, first-out order.  */
  if (phase5_pushback_length != 0)
    {
      phase5_pushback_length--;
      *tp = phase5_pushback[phase5_pushback_length];
      return;
    }

  /* Skip whitespace (comments are already gone at this level).  The line
     number is re-sampled before every character so that it matches the
     first character of the token eventually returned.  */
  do
    {
      tp->line_number = line_number;
      ch = phase2_getc ();
      if (ch == EOF)
        {
          tp->type = token_type_eof;
          return;
        }
      if (ch == '\n' && last_non_comment_line > last_comment_line)
        savable_comment_reset ();
    }
  while (ch == '\n' || ch == '\r' || ch == '\t' || ch == ' ');

  last_non_comment_line = tp->line_number;

  /* Symbol, or part of a number.  */
  if (phase5_symbol_char (ch))
    {
      len = 0;
      do
        {
          buffer = phase5_make_room (buffer, &bufmax, len);
          buffer[len++] = ch;
          ch = phase2_getc ();
        }
      while (phase5_symbol_char (ch));

      /* "_(" introduces a translatable string; the parenthesis stays
         consumed in that case.  */
      if (len == 1 && buffer[0] == '_' && ch == '(')
        {
          tp->type = token_type_i18n;
          return;
        }
      phase2_ungetc (ch);

      buffer = phase5_make_room (buffer, &bufmax, len);
      buffer[len] = '\0';
      tp->string = xstrdup (buffer);
      tp->type = token_type_symbol;
      return;
    }

  /* String literal: gather everything phase7_getc delivers up to EOF or
     the P7_QUOTES marker.  */
  if (ch == '"')
    {
      len = 0;
      while ((ch = phase7_getc ()) != EOF && ch != P7_QUOTES)
        {
          buffer = phase5_make_room (buffer, &bufmax, len);
          buffer[len++] = ch;
        }
      buffer = phase5_make_room (buffer, &bufmax, len);
      buffer[len] = '\0';
      tp->string = xstrdup (buffer);
      tp->type = token_type_string_literal;
      tp->comment = add_reference (savable_comment);
      return;
    }

  /* Single-character tokens.  Multi-character operators are deliberately
     not recognized: only gettext invocations matter here.  */
  if (ch == '(')
    tp->type = token_type_lparen;
  else if (ch == ')')
    tp->type = token_type_rparen;
  else if (ch == ',')
    tp->type = token_type_comma;
  else
    tp->type = token_type_other;
}
Beispiel #3
0
/* Read the next C token from the phase 4 character stream into *TP.

   A token waiting in phase5_pushback is handed out first.  Otherwise
   tp->string and tp->number are cleared, tp->line_number is set to the
   current line_number, and the token is classified by its first
   character:
     - end of input             -> token_type_eof
     - newline                  -> token_type_eoln
     - run of ' ' '\f' '\t'     -> token_type_white_space
     - identifier               -> token_type_name; tp->string is an
                                   xstrdup'ed copy of the text
     - digit, or '.' + digit    -> token_type_number; tp->number is
                                   atol () of the text
     - '...'                    -> token_type_character_constant (the
                                   contents are discarded)
     - "..."                    -> token_type_string_literal; tp->string
                                   and tp->comment are set
     - ( ) , # :                -> token_type_lparen, _rparen, _comma,
                                   _hash, _colon
     - '@' with objc_extensions -> token_type_objc_special; tp->comment set
     - anything else            -> token_type_symbol  */
static void
phase5_get (token_ty *tp)
{
  /* Scratch space reused from call to call; it only grows.  */
  static char *buffer;
  static int bufmax;
  int bufpos;
  int c;

  if (phase5_pushback_length)
    {
      *tp = phase5_pushback[--phase5_pushback_length];
      return;
    }
  tp->string = NULL;
  tp->number = 0;
  tp->line_number = line_number;
  c = phase4_getc ();
  switch (c)
    {
    case EOF:
      tp->type = token_type_eof;
      return;

    case '\n':
      tp->type = token_type_eoln;
      return;

    case ' ':
    case '\f':
    case '\t':
      /* Collapse a whole run of horizontal whitespace into one token.  */
      for (;;)
        {
          c = phase4_getc ();
          switch (c)
            {
            case ' ':
            case '\f':
            case '\t':
              continue;

            default:
              phase4_ungetc (c);
              break;
            }
          break;
        }
      tp->type = token_type_white_space;
      return;

    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
    case 'V': case 'W': case 'X': case 'Y': case 'Z':
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
    case 'v': case 'w': case 'x': case 'y': case 'z':
      /* Identifier: letters, digits and underscores.  */
      bufpos = 0;
      for (;;)
        {
          if (bufpos >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[bufpos++] = c;
          c = phase4_getc ();
          switch (c)
            {
            case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
            case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
            case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
            case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
            case 'Y': case 'Z':
            case '_':
            case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
            case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
            case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
            case 's': case 't': case 'u': case 'v': case 'w': case 'x':
            case 'y': case 'z':
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
              continue;

            default:
              phase4_ungetc (c);
              break;
            }
          break;
        }
      /* Make room for the terminating NUL.  */
      if (bufpos >= bufmax)
        {
          bufmax = 2 * bufmax + 10;
          buffer = xrealloc (buffer, bufmax);
        }
      buffer[bufpos] = 0;
      tp->string = xstrdup (buffer);
      tp->type = token_type_name;
      return;

    case '.':
      /* Peek at the next character: '.' starts a number only when a digit
         follows; otherwise it is an ordinary symbol.  */
      c = phase4_getc ();
      phase4_ungetc (c);
      switch (c)
        {
        default:
          tp->type = token_type_symbol;
          return;

        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          c = '.';
          break;
        }
      /* FALLTHROUGH */

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      /* The preprocessing number token is more "generous" than the C
         number tokens.  This is mostly due to token pasting (another
         thing we can ignore here).  */
      bufpos = 0;
      for (;;)
        {
          if (bufpos >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[bufpos++] = c;
          c = phase4_getc ();
          switch (c)
            {
            case 'e':
            case 'E':
              {
                /* An exponent letter may be followed by a sign that still
                   belongs to the number ("1e+5", "2E-3").  The previous
                   test here, c != '+' || c != '-', was always true, so
                   the sign was never accepted and the number was cut off
                   right after the exponent letter.  */
                int sign = phase4_getc ();

                if (sign == '+' || sign == '-')
                  {
                    /* Store the exponent letter now; the sign is stored
                       at the top of the loop.  */
                    if (bufpos >= bufmax)
                      {
                        bufmax = 2 * bufmax + 10;
                        buffer = xrealloc (buffer, bufmax);
                      }
                    buffer[bufpos++] = c;
                    c = sign;
                  }
                else
                  {
                    /* No sign: the exponent letter is an ordinary part of
                       the number.  Put the peeked character back so the
                       next iteration classifies it normally.  */
                    phase4_ungetc (sign);
                  }
              }
              continue;

            case 'A': case 'B': case 'C': case 'D':           case 'F':
            case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
            case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
            case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
            case 'Y': case 'Z':
            case 'a': case 'b': case 'c': case 'd':           case 'f':
            case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
            case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
            case 's': case 't': case 'u': case 'v': case 'w': case 'x':
            case 'y': case 'z':
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
            case '.':
              continue;

            default:
              phase4_ungetc (c);
              break;
            }
          break;
        }
      if (bufpos >= bufmax)
        {
          bufmax = 2 * bufmax + 10;
          buffer = xrealloc (buffer, bufmax);
        }
      buffer[bufpos] = 0;
      tp->type = token_type_number;
      /* atol only converts the leading decimal digits of the text.  */
      tp->number = atol (buffer);
      return;

    case '\'':
      /* We could worry about the 'L' before wide character constants,
         but ignoring it has no effect unless one of the keywords is
         "L".  Just pretend it won't happen.  Also, we don't need to
         remember the character constant.  */
      for (;;)
        {
          c = phase7_getc ();
          if (c == P7_NEWLINE)
            {
              /* The constant ran into the end of the line: warn, and give
                 the newline back so that it still yields an eoln token.
                 NOTE(review): line_number - 1 presumably compensates for
                 the newline already having been counted; confirm against
                 the character-level readers.  */
              error_with_progname = false;
              error (0, 0, _("%s:%d: warning: unterminated character constant"),
                     logical_file_name, line_number - 1);
              error_with_progname = true;
              phase7_ungetc ('\n');
              break;
            }
          if (c == EOF || c == P7_QUOTE)
            break;
        }
      tp->type = token_type_character_constant;
      return;

    case '"':
      /* We could worry about the 'L' before wide string constants,
         but since gettext's argument is not a wide character string,
         let the compiler complain about the argument not matching the
         prototype.  Just pretend it won't happen.  */
      bufpos = 0;
      for (;;)
        {
          c = phase7_getc ();
          if (c == P7_NEWLINE)
            {
              /* Same recovery as for character constants above.  */
              error_with_progname = false;
              error (0, 0, _("%s:%d: warning: unterminated string literal"),
                     logical_file_name, line_number - 1);
              error_with_progname = true;
              phase7_ungetc ('\n');
              break;
            }
          if (c == EOF || c == P7_QUOTES)
            break;
          /* Inside a string the P7_QUOTE marker stands for a plain
             apostrophe.  */
          if (c == P7_QUOTE)
            c = '\'';
          if (bufpos >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[bufpos++] = c;
        }
      if (bufpos >= bufmax)
        {
          bufmax = 2 * bufmax + 10;
          buffer = xrealloc (buffer, bufmax);
        }
      buffer[bufpos] = 0;
      tp->type = token_type_string_literal;
      tp->string = xstrdup (buffer);
      tp->comment = add_reference (savable_comment);
      return;

    case '(':
      tp->type = token_type_lparen;
      return;

    case ')':
      tp->type = token_type_rparen;
      return;

    case ',':
      tp->type = token_type_comma;
      return;

    case '#':
      tp->type = token_type_hash;
      return;

    case ':':
      tp->type = token_type_colon;
      return;

    case '@':
      /* Only special when Objective-C extensions are enabled; otherwise
         it is an ordinary symbol.  */
      if (objc_extensions)
        {
          tp->type = token_type_objc_special;
          tp->comment = add_reference (savable_comment);
          return;
        }
      /* FALLTHROUGH */

    default:
      /* We could carefully recognize each of the 2 and 3 character
         operators, but it is not necessary, as we only need to recognize
         gettext invocations.  Don't bother.  */
      tp->type = token_type_symbol;
      return;
    }
}