Beispiel #1
0
/*
 * Decode the next rune of the pattern at g.source into g.yychar and advance
 * g.source past it, translating backslash escapes.
 *
 * Returns 1 when g.yychar is the character that followed a backslash and is
 * listed in ESCAPES (or '0', which is substituted for \x00 and \u0000);
 * returns 0 when g.yychar is to be taken as a literal character.
 * Malformed escapes are reported through die(), which is assumed not to
 * return.
 */
static int nextrune(void)
{
	g.source += chartorune(&g.yychar, g.source);
	if (g.yychar != '\\')
		return 0;

	g.source += chartorune(&g.yychar, g.source);
	switch (g.yychar) {
	case 0: die("unterminated escape sequence");
	case 'f': g.yychar = '\f'; return 0;
	case 'n': g.yychar = '\n'; return 0;
	case 'r': g.yychar = '\r'; return 0;
	case 't': g.yychar = '\t'; return 0;
	case 'v': g.yychar = '\v'; return 0;
	case 'c':
		/* \cX needs one more byte; do not step over the terminating NUL */
		if (!g.source[0])
			die("unterminated escape sequence");
		g.yychar = (*g.source++) & 31;
		return 0;
	case 'x':
		/* \xHH needs two more bytes (checked left to right, so no over-read) */
		if (!g.source[0] || !g.source[1])
			die("unterminated escape sequence");
		g.yychar = hex(*g.source++) << 4;
		g.yychar += hex(*g.source++);
		if (g.yychar == 0) {
			g.yychar = '0';
			return 1;
		}
		return 0;
	case 'u':
		/* \uHHHH needs four more bytes */
		if (!g.source[0] || !g.source[1] || !g.source[2] || !g.source[3])
			die("unterminated escape sequence");
		g.yychar = hex(*g.source++) << 12;
		g.yychar += hex(*g.source++) << 8;
		g.yychar += hex(*g.source++) << 4;
		g.yychar += hex(*g.source++);
		if (g.yychar == 0) {
			g.yychar = '0';
			return 1;
		}
		return 0;
	}
	/*
	 * strchr() converts its second argument to char, so a rune above 0xFF
	 * could alias an ASCII entry of ESCAPES. Only look up 7-bit values.
	 * NOTE(review): assumes ESCAPES contains ASCII characters only.
	 */
	if (g.yychar < 128 && strchr(ESCAPES, g.yychar))
		return 1;
	if (isalpharune(g.yychar) || g.yychar == '_') /* check identity escape */
		die("invalid escape character");
	return 0;
}
Beispiel #2
0
/*
 * Register the word s in trie t under qid (via trieput) if it looks like a
 * usable tag: between 3 and 40 bytes long and made only of letters
 * (isalpharune) and ASCII digits. Words that fail the check are ignored.
 */
void
tag(Trie* t, char* s, uvlong qid)
{
	int	l;
	char*	q;
	Rune	r;
	int	nc;

	l = strlen(s);
	// don't use one and two char words as tags
	// don't use utterly long words, probably not tags.
	// (l is a byte count, so a multi-byte rune counts more than once.)
	if(l < 3 || l > 40)
		return;

	// don't use as tags things other than alphanumeric.
	for(q = s; *q != 0; q += nc){
		nc = chartorune(&r, q);
		// r can be any rune; isdigit() is only defined for unsigned char
		// values, so test the ASCII digit range directly.
		if(!isalpharune(r) && !(r >= '0' && r <= '9'))
			return;
	}
	trieput(t, s, qid);
}
Beispiel #3
0
/*
 * Move `*s` forward to the first character that cannot be part of an
 * identifier. Accepted pieces are: `$`, `_`, ASCII alphanumerics, a
 * `\uXXXX` escape with four hex digits, and multi-byte runes for which
 * isalpharune() is true.
 */
static void ident(const char **s) {
  const unsigned char *cur = (unsigned char *) *s;
  Rune rune;
  int len;

  for (;;) {
    unsigned char c = cur[0];

    if (c == '\0') {
      break;
    }

    /* $, _, or any alphanumeric are valid identifier characters */
    if (c == '$' || c == '_' || isalnum(c)) {
      cur++;
      continue;
    }

    /* Unicode escape, \uXXXX . Could be used like "var \u0078 = 1;" */
    if (c == '\\' && cur[1] == 'u' && isxdigit(cur[2]) && isxdigit(cur[3]) &&
        isxdigit(cur[4]) && isxdigit(cur[5])) {
      cur += 6;
      continue;
    }

    /* Multi-byte Unicode letter */
    len = chartorune(&rune, (char *) cur);
    if (len > 1 && isalpharune(rune)) {
      cur += len;
      continue;
    }

    break;
  }

  *s = (char *) cur;
}
Beispiel #4
0
/*
 * Return non-zero if c may appear inside an identifier: an ASCII digit or
 * letter, '$', '_', or a rune accepted by isalpharune().
 *
 * c may be any rune value, so the ASCII classes are tested with explicit
 * ranges: the <ctype.h> functions are only defined for unsigned char
 * values and EOF, and their result depends on the current locale.
 */
static int jsY_isidentifierpart(int c)
{
	return (c >= '0' && c <= '9') ||
		(c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
		c == '$' || c == '_' || isalpharune(c);
}
Beispiel #5
0
/*
 * This function is the heart of the tokenizer.
 * Organized as a giant switch statement.
 *
 * The switch is on the first character of the input stream. If that
 * character is a letter, the token is either a keyword or an identifier.
 * get_tok() calls ident() which shifts `s` pointer to the end of the word.
 * Now, tokenizer knows that the word begins at `p` and ends at `s`.
 * It calls function kw() to scan over the keywords that start with `p[0]`
 * letter. Therefore, keyword tokens and keyword strings must be in the
 * same order, to let kw() function work properly.
 * If kw() finds a keyword match, it returns keyword token.
 * Otherwise, it returns TOK_IDENTIFIER.
 *
 * `s` points at the current position in the source and is advanced past
 * the recognised token. `n` is handed to parse_number() for numeric
 * literals. The return value is the token type; TOK_END_OF_INPUT is
 * returned for any leading character that starts no known token.
 *
 * NOTE(lsm): `prev_tok` is a previously parsed token. It is needed for
 * correctly parsing regex literals.
 */
V7_PRIVATE enum v7_tok get_tok(const char **s, double *n,
                               enum v7_tok prev_tok) {
  const char *p = *s;

  switch (*p) {
    /* Letters */
    case 'a':
      ident(s);
      return TOK_IDENTIFIER;
    case 'b':
      ident(s);
      return kw(p, *s - p, 1, TOK_BREAK);
    case 'c':
      ident(s);
      return kw(p, *s - p, 3, TOK_CASE);
    case 'd':
      ident(s);
      return kw(p, *s - p, 4, TOK_DEBUGGER);
    case 'e':
      ident(s);
      return kw(p, *s - p, 1, TOK_ELSE);
    case 'f':
      ident(s);
      return kw(p, *s - p, 4, TOK_FALSE);
    case 'g':
    case 'h':
      ident(s);
      return TOK_IDENTIFIER;
    case 'i':
      ident(s);
      return kw(p, *s - p, 3, TOK_IF);
    case 'j':
    case 'k':
    case 'l':
    case 'm':
      ident(s);
      return TOK_IDENTIFIER;
    case 'n':
      ident(s);
      return kw(p, *s - p, 2, TOK_NEW);
    case 'o':
    case 'p':
    case 'q':
      ident(s);
      return TOK_IDENTIFIER;
    case 'r':
      ident(s);
      return kw(p, *s - p, 1, TOK_RETURN);
    case 's':
      ident(s);
      return kw(p, *s - p, 1, TOK_SWITCH);
    case 't':
      ident(s);
      return kw(p, *s - p, 5, TOK_THIS);
    case 'u':
      ident(s);
      return TOK_IDENTIFIER;
    case 'v':
      ident(s);
      return kw(p, *s - p, 2, TOK_VAR);
    case 'w':
      ident(s);
      return kw(p, *s - p, 2, TOK_WHILE);
    case 'x':
    case 'y':
    case 'z':
      ident(s);
      return TOK_IDENTIFIER;

    case '_':
    case '$':
    case 'A':
    case 'B':
    case 'C':
    case 'D':
    case 'E':
    case 'F':
    case 'G':
    case 'H':
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'O':
    case 'P':
    case 'Q':
    case 'R':
    case 'S':
    case 'T':
    case 'U':
    case 'V':
    case 'W':
    case 'X':
    case 'Y':
    case 'Z':
    case '\\': /* Identifier may start with unicode escape sequence */
      ident(s);
      return TOK_IDENTIFIER;

    /* Numbers */
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      parse_number(p, s, n);
      return TOK_NUMBER;

    /* String literals */
    case '\'':
    case '"':
      return parse_str_literal(s);

    /* Punctuators */
    case '=':
      return punct2(s, '=', TOK_EQ, '=', TOK_EQ_EQ, TOK_ASSIGN);
    case '!':
      return punct2(s, '=', TOK_NE, '=', TOK_NE_NE, TOK_NOT);

    case '%':
      return punct1(s, '=', TOK_REM_ASSIGN, TOK_REM);
    case '*':
      return punct1(s, '=', TOK_MUL_ASSIGN, TOK_MUL);
    case '/':
      /*
       * TOK_DIV, TOK_DIV_ASSIGN, and TOK_REGEX_LITERAL start with `/` char.
       * Division can happen after an expression.
       * In expressions like this:
       *            a /= b; c /= d;
       * the text between the slashes is NOT a regex literal.
       * The switch below catches all cases where division happens.
       */
      switch (prev_tok) {
        case TOK_CLOSE_CURLY:
        case TOK_CLOSE_PAREN:
        case TOK_CLOSE_BRACKET:
        case TOK_IDENTIFIER:
        case TOK_NUMBER:
          return punct1(s, '=', TOK_DIV_ASSIGN, TOK_DIV);
        default:
          /* Not a division - this is a regex. Scan until closing slash */
          for (p++; *p != '\0' && *p != '\n'; p++) {
            if (*p == '\\') {
              /*
               * Skip the escaped character. If the backslash is the last
               * byte of the input, stop here: stepping over it would move
               * `p` past the terminating NUL and read out of bounds.
               */
              if (p[1] == '\0') {
                break;
              }
              p++;
            } else if (*p == '/') {
              /* This is a closing slash */
              p++;
              /* Skip regex flags */
              while (*p == 'g' || *p == 'i' || *p == 'm') {
                p++;
              }
              *s = p;
              return TOK_REGEX_LITERAL;
            }
          }
          break;
      }
      /* No closing slash on this line: treat the `/` as a division */
      return punct1(s, '=', TOK_DIV_ASSIGN, TOK_DIV);
    case '^':
      return punct1(s, '=', TOK_XOR_ASSIGN, TOK_XOR);

    case '+':
      return punct3(s, '+', TOK_PLUS_PLUS, '=', TOK_PLUS_ASSIGN, TOK_PLUS);
    case '-':
      return punct3(s, '-', TOK_MINUS_MINUS, '=', TOK_MINUS_ASSIGN, TOK_MINUS);
    case '&':
      return punct3(s, '&', TOK_LOGICAL_AND, '=', TOK_AND_ASSIGN, TOK_AND);
    case '|':
      return punct3(s, '|', TOK_LOGICAL_OR, '=', TOK_OR_ASSIGN, TOK_OR);

    case '<':
      if (s[0][1] == '=') {
        (*s) += 2;
        return TOK_LE;
      }
      return punct2(s, '<', TOK_LSHIFT, '=', TOK_LSHIFT_ASSIGN, TOK_LT);
    case '>':
      if (s[0][1] == '=') {
        (*s) += 2;
        return TOK_GE;
      }
      /* Longest match first: `>>>=` before `>>>` */
      if (s[0][1] == '>' && s[0][2] == '>' && s[0][3] == '=') {
        (*s) += 4;
        return TOK_URSHIFT_ASSIGN;
      }
      if (s[0][1] == '>' && s[0][2] == '>') {
        (*s) += 3;
        return TOK_URSHIFT;
      }
      return punct2(s, '>', TOK_RSHIFT, '=', TOK_RSHIFT_ASSIGN, TOK_GT);

    case '{':
      (*s)++;
      return TOK_OPEN_CURLY;
    case '}':
      (*s)++;
      return TOK_CLOSE_CURLY;
    case '(':
      (*s)++;
      return TOK_OPEN_PAREN;
    case ')':
      (*s)++;
      return TOK_CLOSE_PAREN;
    case '[':
      (*s)++;
      return TOK_OPEN_BRACKET;
    case ']':
      (*s)++;
      return TOK_CLOSE_BRACKET;
    case '.':
      /* A dot followed by a digit starts a number such as `.5` */
      switch (*(*s + 1)) {
        /* Numbers */
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          parse_number(p, s, n);
          return TOK_NUMBER;
      }
      (*s)++;
      return TOK_DOT;
    case ';':
      (*s)++;
      return TOK_SEMICOLON;
    case ':':
      (*s)++;
      return TOK_COLON;
    case '?':
      (*s)++;
      return TOK_QUESTION;
    case '~':
      (*s)++;
      return TOK_TILDA;
    case ',':
      (*s)++;
      return TOK_COMMA;

    default: {
      /* Handle unicode variables */
      Rune r;
      int len; /* named `len` so it does not shadow the `n` parameter */

      if ((len = chartorune(&r, *s)) > 1 && isalpharune(r)) {
        ident(s);
        return TOK_IDENTIFIER;
      }
      return TOK_END_OF_INPUT;
    }
  }
}
Beispiel #6
0
/*
 * Return 1 if c is an ASCII letter or a rune for which isalpharune()
 * reports true, 0 otherwise.
 */
static int isunicodeletter(int c)
{
	if ('a' <= c && c <= 'z')
		return 1;
	if ('A' <= c && c <= 'Z')
		return 1;
	return isalpharune(c) != 0;
}
Beispiel #7
0
/*
 * Emit the output for one input character c and return the font (ITAL or
 * ROM) chosen for it. The second parameter is unnamed and unused.
 *
 * Reads the globals ft, rf, pclass and psp, and assigns the global nclass
 * to the class picked for c (ILET, OLET, DIG, LPAR, RPAR, VBAR, PLUS,
 * SLASH, ILETF, ILETJ or OTHER). For multi-character operators it looks
 * ahead at *psp and advances psp past whatever it consumes.
 *
 * NOTE(review): shim(), cadd(), sadd() and roman() are defined elsewhere;
 * presumably shim() inserts spacing between the previous and new class and
 * the others append a character / string / roman character to the output.
 * Confirm against their definitions.
 */
int
trans(int c, char *)
{
    int f;

    /* Letters are handled first; 'f' and 'j' in italic are left for the switch below */
    if (isalpharune(c) && ft == ITAL && c != 'f' && c != 'j') {	/* italic letter */
        shim(pclass, nclass = ILET);
        cadd(c);
        return ITAL;
    }
    if (isalpharune(c) && ft != ITAL) {		/* other letter */
        shim(pclass, nclass = OLET);
        cadd(c);
        return ROM;
    }
    if (isdigitrune(c)) {
        shim(pclass, nclass = DIG);
        roman(c);
        return ROM;	/* this is the right side font of this object */
    }
    /* Everything else defaults to roman font and class OTHER unless a case overrides it */
    f = ROM;
    nclass = OTHER;
    switch (c) {
    case ':':
    case ';':
    case '!':
    case '%':
    case '?':
        shim(pclass, nclass);
        roman(c);
        return f;
    case '(':
    case '[':
        shim(pclass, nclass = LPAR);
        roman(c);
        return f;
    case ')':
    case ']':
        shim(pclass, nclass = RPAR);
        roman(c);
        return f;
    case ',':
        shim(pclass, nclass = OTHER);
        roman(c);
        return f;
    case '.':
        /* no shim() here; the emitting routine depends on the global rf */
        if (rf == ROM)
            roman(c);
        else
            cadd(c);
        return f;
    case '|':		/* postscript needs help with default width! */
        shim(pclass, nclass = VBAR);
        sadd("\\v'.17m'\\z|\\v'-.17m'\\|");	/* and height */
        return f;
    case '=':
        shim(pclass, nclass = PLUS);
        sadd("\\(eq");
        return f;
    case '+':
        shim(pclass, nclass = PLUS);
        sadd("\\(pl");
        return f;
    case '>':
    case '<':		/* >, >=, >>, <, <-, <=, << */
        shim(pclass, nclass = PLUS);
        /* *psp is the next input character; it is consumed when it completes an operator */
        if (*psp == '=') {
            sadd(c == '<' ? "\\(<=" : "\\(>=");
            psp++;
        } else if (c == '<' && *psp == '-') {	/* <- only */
            sadd("\\(<-");
            psp++;
        } else if (*psp == c) {		/* << or >> */
            cadd(c);
            cadd(c);
            psp++;
        } else {
            cadd(c);
        }
        return f;
    case '-':
        shim(pclass, nclass = PLUS);	/* probably too big for ->'s */
        if (*psp == '>') {
            sadd("\\(->");
            psp++;
        } else {
            sadd("\\(mi");
        }
        return f;
    case '/':
        shim(pclass, nclass = SLASH);
        cadd('/');
        return f;
    case '~':
    case ' ':
        /* no shim() and nclass stays OTHER */
        sadd("\\|\\|");
        return f;
    case '^':
        sadd("\\|");
        return f;
    case '\\':	/* troff - pass only \(xx without comment */
        shim(pclass, nclass);
        cadd('\\');
        /*
         * Copy the character after the backslash. If it is '(' and two more
         * characters follow, copy those as well; anything else is still
         * copied but triggers the warning below.
         * NOTE(review): psp is advanced here without checking for the end
         * of the input; confirm callers guarantee a following character.
         */
        cadd(c = *psp++);
        if (c == '(' && *psp && *(psp+1)) {
            cadd(*psp++);
            cadd(*psp++);
        } else
            fprintf(stderr, "eqn warning: unquoted troff command \\%c, file %s:%d\n",
                    c, curfile->fname, curfile->lineno);
        return f;
    case '\'':
        shim(pclass, nclass);
        sadd("\\(fm");
        return f;

    /*
     * 'f' and 'j' reach the switch only when neither letter check above
     * returned. Assuming isalpharune() is true for them, that means
     * ft == ITAL here and the else branches are not taken - TODO confirm.
     */
    case 'f':
        if (ft == ITAL) {
            shim(pclass, nclass = ILETF);
            cadd('f');
            f = ITAL;
        } else
            cadd('f');
        return f;
    case 'j':
        if (ft == ITAL) {
            shim(pclass, nclass = ILETJ);
            cadd('j');
            f = ITAL;
        } else
            cadd('j');
        return f;
    default:
        /* any other character: class OTHER, font follows the current ft */
        shim(pclass, nclass);
        cadd(c);
        return ft==ITAL ? ITAL : ROM;
    }
}