/* * Given a string "str", separate the numeric part into * str, and the modifier into mod. */ static bool get_modifier(char *str, char *num, int num_len, char *mod, int mod_len) { int i, len, num_begin, num_end, mod_begin, mod_end; strip_trailing_junk(str); len = strlen(str); for (i=0; i<len; i++) { if (!B_ISSPACE(str[i])) { break; } } num_begin = i; /* Walk through integer part */ for ( ; i<len; i++) { if (!B_ISDIGIT(str[i]) && str[i] != '.') { break; } } num_end = i; if (num_len > (num_end - num_begin + 1)) { num_len = num_end - num_begin + 1; } if (num_len == 0) { return false; } /* Eat any spaces in front of modifier */ for ( ; i<len; i++) { if (!B_ISSPACE(str[i])) { break; } } mod_begin = i; for ( ; i<len; i++) { if (!B_ISALPHA(str[i])) { break; } } mod_end = i; if (mod_len > (mod_end - mod_begin + 1)) { mod_len = mod_end - mod_begin + 1; } Dmsg5(900, "str=%s: num_beg=%d num_end=%d mod_beg=%d mod_end=%d\n", str, num_begin, num_end, mod_begin, mod_end); bstrncpy(num, &str[num_begin], num_len); bstrncpy(mod, &str[mod_begin], mod_len); if (!is_a_number(num)) { return false; } bstrncpy(str, &str[mod_end], len); Dmsg2(900, "num=%s mod=%s\n", num, mod); return true; }
/*
 * Convert the leading decimal digits of "str" to an unsigned 64 bit value.
 *
 * Leading white space and a single optional '+' are skipped; conversion
 * stops at the first character that is not a digit.  A NULL pointer, or a
 * string without digits, yields 0.
 *
 * We assume ASCII input and don't worry about overflow.
 */
uint64_t str_to_uint64(const char *str)
{
   uint64_t result = 0;
   const char *cp;

   if (!str) {
      return 0;
   }

   /* Step over leading white space */
   for (cp = str; B_ISSPACE(*cp); cp++) {
   }

   /* An explicit plus sign is accepted and ignored */
   if (*cp == '+') {
      cp++;
   }

   /* Accumulate one decimal digit per iteration */
   for ( ; B_ISDIGIT(*cp); cp++) {
      result = B_TIMES10(result) + *cp - '0';
   }

   return result;
}
/*
 * Convert a decimal string to a signed 64 bit value.
 *
 * Leading white space is skipped and one optional sign ('+' or '-') is
 * accepted; the digits themselves are converted by str_to_uint64().
 * A NULL pointer yields 0.  As in str_to_uint64(), overflow is not
 * detected.
 */
int64_t str_to_int64(const char *str)
{
   const char *p = str;
   uint64_t magnitude;
   bool negative = false;

   if (!p) {
      return 0;
   }
   while (B_ISSPACE(*p)) {
      p++;
   }
   if (*p == '+') {
      p++;
   } else if (*p == '-') {
      negative = true;
      p++;
   }
   magnitude = str_to_uint64(p);
   if (negative) {
      /*
       * Negate while still unsigned: unsigned arithmetic wraps, whereas
       * negating the most negative int64_t ("-9223372036854775808") as a
       * signed value would be undefined behaviour.
       */
      magnitude = 0 - magnitude;
   }
   return (int64_t)magnitude;
}
/*
 * Get the next token from the input.
 *
 * Characters are read one at a time with lex_get_char() and fed to the
 * state machine selected by lf->state until a token has been recognised.
 * Afterwards, if the caller supplied an expectation in "expect", the token
 * text in lf->str is checked and/or converted accordingly.
 *
 *  lf      lexer context.  Note that the local pointer is replaced by the
 *          return value of lex_open_file() when an include ("@file") is
 *          processed.
 *  expect  what the caller expects to get (T_PINT16, T_INT32, T_NAME, ...).
 *          T_SKIP_EOL suppresses the T_EOL token for ';', L_EOL and the
 *          end of a comment.  Any value not handled by the final switch
 *          means "no expectation".
 *
 * Returns: the token code, which is also stored in lf->token.  T_ERROR is
 *          returned for unexpected EOF, a bad BOM, an include file that
 *          cannot be opened, or a token that does not match "expect".
 */
int lex_get_token(LEX *lf, int expect)
{
   int ch;
   int token = T_NONE;
   bool esc_next = false;         /* previous character was a backslash */
   /* Unicode files, especially on Win32, may begin with a "Byte Order Mark"
      to indicate which transmission format the file is in. The codepoint for
      this mark is U+FEFF and is represented as the octets EF-BB-BF in UTF-8
      and as FF-FE in UTF-16le(little endian) and FE-FF in UTF-16(big endian).
      We use a distinct state for UTF-8 and UTF-16le, and use bom_bytes_seen
      to tell which byte we are expecting. */
   int bom_bytes_seen = 0;

   Dmsg0(dbglvl, "enter lex_get_token\n");
   /* One character is consumed per iteration until some state sets token */
   while (token == T_NONE) {
      ch = lex_get_char(lf);
      switch (lf->state) {
      case lex_none:
         /* Between tokens: the first character decides which state to enter */
         Dmsg2(dbglvl, "Lex state lex_none ch=%d,%x\n", ch, ch);
         if (B_ISSPACE(ch))
            break;
         if (B_ISALPHA(ch)) {
            /* A letter starts an identifier unless the options force a string */
            if (lf->options & LOPT_NO_IDENT || lf->options & LOPT_STRING) {
               lf->state = lex_string;
            } else {
               lf->state = lex_identifier;
            }
            begin_str(lf, ch);
            break;
         }
         if (B_ISDIGIT(ch)) {
            /* A digit starts a number unless LOPT_STRING forces a string */
            if (lf->options & LOPT_STRING) {
               lf->state = lex_string;
            } else {
               lf->state = lex_number;
            }
            begin_str(lf, ch);
            break;
         }
         Dmsg0(dbglvl, "Enter lex_none switch\n");
         switch (ch) {
         case L_EOF:
            token = T_EOF;
            Dmsg0(dbglvl, "got L_EOF set token=T_EOF\n");
            break;
         case '#':
            lf->state = lex_comment;
            break;
         case '{':
            token = T_BOB;
            begin_str(lf, ch);
            break;
         case '}':
            token = T_EOB;
            begin_str(lf, ch);
            break;
         case '"':
            /* The opening quote itself is not stored */
            lf->state = lex_quoted_string;
            begin_str(lf, 0);
            break;
         case '=':
            token = T_EQUALS;
            begin_str(lf, ch);
            break;
         case ',':
            token = T_COMMA;
            begin_str(lf, ch);
            break;
         case ';':
            if (expect != T_SKIP_EOL) {
               token = T_EOL;      /* treat ; like EOL */
            }
            break;
         case L_EOL:
            Dmsg0(dbglvl, "got L_EOL set token=T_EOL\n");
            if (expect != T_SKIP_EOL) {
               token = T_EOL;
            }
            break;
         case '@':
            /* In NO_EXTERN mode, @ is part of a string */
            if (lf->options & LOPT_NO_EXTERN) {
               lf->state = lex_string;
               begin_str(lf, ch);
            } else {
               /* Otherwise it introduces an include; the '@' is not stored */
               lf->state = lex_include;
               begin_str(lf, 0);
            }
            break;
         case 0xEF: /* probably a UTF-8 BOM */
         case 0xFF: /* probably a UTF-16le BOM */
         case 0xFE: /* probably a UTF-16be BOM (error)*/
            /* Only treated as a BOM at line 1, column 1; elsewhere it is string data */
            if (lf->line_no != 1 || lf->col_no != 1) {
               lf->state = lex_string;
               begin_str(lf, ch);
            } else {
               bom_bytes_seen = 1;
               if (ch == 0xEF) {
                  lf->state = lex_utf8_bom;
               } else if (ch == 0xFF) {
                  lf->state = lex_utf16_le_bom;
               } else {
                  scan_err0(lf, _("This config file appears to be in an "
                     "unsupported Unicode format (UTF-16be). Please resave as UTF-8\n"));
                  return T_ERROR;
               }
            }
            break;
         default:
            /* Anything else starts an unquoted string */
            lf->state = lex_string;
            begin_str(lf, ch);
            break;
         }
         break;
      case lex_comment:
         /* Discard everything up to end of line; EOF inside a comment is an error */
         Dmsg1(dbglvl, "Lex state lex_comment ch=%x\n", ch);
         if (ch == L_EOL) {
            lf->state = lex_none;
            if (expect != T_SKIP_EOL) {
               token = T_EOL;
            }
         } else if (ch == L_EOF) {
            token = T_ERROR;
         }
         break;
      case lex_number:
         Dmsg2(dbglvl, "Lex state lex_number ch=%x %c\n", ch, ch);
         if (ch == L_EOF) {
            token = T_ERROR;
            break;
         }
         /* Might want to allow trailing specifications here */
         if (B_ISDIGIT(ch)) {
            add_str(lf, ch);
            break;
         }

         /* A valid number can be terminated by the following */
         if (B_ISSPACE(ch) || ch == L_EOL || ch == ',' || ch == ';') {
            token = T_NUMBER;
            lf->state = lex_none;
         } else {
            /* Any other character turns what was collected so far into a string */
            lf->state = lex_string;
         }
         /* In both cases the character is pushed back to be read again */
         lex_unget_char(lf);
         break;
      case lex_ip_addr:
         /*
          * Only EOF is handled here; other characters are read and dropped.
          * NOTE(review): no transition into this state is visible in this
          * function.
          */
         if (ch == L_EOF) {
            token = T_ERROR;
            break;
         }
         Dmsg1(dbglvl, "Lex state lex_ip_addr ch=%x\n", ch);
         break;
      case lex_string:
         Dmsg1(dbglvl, "Lex state lex_string ch=%x\n", ch);
         if (ch == L_EOF) {
            token = T_ERROR;
            break;
         }
         /* A delimiter ends the string and is pushed back for the next call */
         if (ch == '\n' || ch == L_EOL || ch == '=' || ch == '}' || ch == '{' ||
             ch == '\r' || ch == ';' || ch == ',' || ch == '#' || (B_ISSPACE(ch)) ) {
            lex_unget_char(lf);
            token = T_UNQUOTED_STRING;
            lf->state = lex_none;
            break;
         }
         add_str(lf, ch);
         break;
      case lex_identifier:
         Dmsg2(dbglvl, "Lex state lex_identifier ch=%x %c\n", ch, ch);
         if (B_ISALPHA(ch)) {
            add_str(lf, ch);
            break;
         } else if (B_ISSPACE(ch)) {
            /* White space inside an identifier is dropped, not a terminator */
            break;
         } else if (ch == '\n' || ch == L_EOL || ch == '=' || ch == '}' || ch == '{' ||
                    ch == '\r' || ch == ';' || ch == ','   || ch == '"' || ch == '#') {
            /* A delimiter ends the identifier and is pushed back */
            lex_unget_char(lf);
            token = T_IDENTIFIER;
            lf->state = lex_none;
            break;
         } else if (ch == L_EOF) {
            token = T_ERROR;
            lf->state = lex_none;
            begin_str(lf, ch);
            break;
         }
         /* Some non-alpha character => string */
         lf->state = lex_string;
         add_str(lf, ch);
         break;
      case lex_quoted_string:
         Dmsg2(dbglvl, "Lex state lex_quoted_string ch=%x %c\n", ch, ch);
         if (ch == L_EOF) {
            token = T_ERROR;
            break;
         }
         /* An end of line inside the quotes is not stored and cancels a pending escape */
         if (ch == L_EOL) {
            esc_next = false;
            break;
         }
         /* The character after a backslash is stored as is */
         if (esc_next) {
            add_str(lf, ch);
            esc_next = false;
            break;
         }
         if (ch == '\\') {
            esc_next = true;
            break;
         }
         if (ch == '"') {
            token = T_QUOTED_STRING;
            /*
             * Since we may be scanning a quoted list of names,
             *  we get the next character (a comma indicates another
             *  one), then we put it back for rescanning.
             */
            lex_get_char(lf);
            lex_unget_char(lf);
            lf->state = lex_none;
            break;
         }
         add_str(lf, ch);
         break;
      case lex_include_quoted_string:
         /* Quoted file name after '@'; same backslash escaping as a quoted string */
         if (ch == L_EOF) {
            token = T_ERROR;
            break;
         }
         if (esc_next) {
            add_str(lf, ch);
            esc_next = false;
            break;
         }
         if (ch == '\\') {
            esc_next = true;
            break;
         }
         if (ch == '"') {
            /* Keep the original LEX so we can print an error if the included file can't be opened. */
            LEX* lfori = lf;
            /* Skip the double quote when restarting parsing */
            lex_get_char(lf);

            lf->state = lex_none;
            /* From here on the local lf is whatever lex_open_file() returned */
            lf = lex_open_file(lf, lf->str, lf->scan_error, lf->scan_warning);
            if (lf == NULL) {
               berrno be;
               scan_err2(lfori, _("Cannot open included config file %s: %s\n"),
                  lfori->str, be.bstrerror());
               return T_ERROR;
            }
            break;
         }
         add_str(lf, ch);
         break;
      case lex_include:            /* scanning a filename */
         if (ch == L_EOF) {
            token = T_ERROR;
            break;
         }
         /* A double quote switches to the quoted form of the file name */
         if (ch == '"') {
            lf->state = lex_include_quoted_string;
            break;
         }

         /* Any of these delimiters ends the file name and opens the file */
         if (B_ISSPACE(ch) || ch == '\n' || ch == L_EOL || ch == '}' || ch == '{' ||
             ch == ';' || ch == ','   || ch == '"' || ch == '#') {
            /* Keep the original LEX so we can print an error if the included file can't be opened. */
            LEX* lfori = lf;

            lf->state = lex_none;
            /* From here on the local lf is whatever lex_open_file() returned */
            lf = lex_open_file(lf, lf->str, lf->scan_error, lf->scan_warning);
            if (lf == NULL) {
               berrno be;
               scan_err2(lfori, _("Cannot open included config file %s: %s\n"),
                  lfori->str, be.bstrerror());
               return T_ERROR;
            }
            break;
         }
         add_str(lf, ch);
         break;
      case lex_utf8_bom:
         /* we only end up in this state if we have read an 0xEF
            as the first byte of the file, indicating we are probably
            reading a UTF-8 file */
         if (ch == 0xBB && bom_bytes_seen == 1) {
            bom_bytes_seen++;
         } else if (ch == 0xBF && bom_bytes_seen == 2) {
            token = T_UTF8_BOM;
            lf->state = lex_none;
         } else {
            /* Bytes out of sequence: not a valid UTF-8 BOM */
            token = T_ERROR;
         }
         break;
      case lex_utf16_le_bom:
         /* we only end up in this state if we have read an 0xFF
            as the first byte of the file -- indicating that we are
            probably dealing with an Intel based (little endian) UTF-16 file*/
         if (ch == 0xFE) {
            token = T_UTF16_BOM;
            lf->state = lex_none;
         } else {
            token = T_ERROR;
         }
         break;
      }
      Dmsg4(dbglvl, "ch=%d state=%s token=%s %c\n", ch, lex_state_to_str(lf->state),
        lex_tok_to_str(token), ch);
   }
   Dmsg2(dbglvl, "lex returning: line %d token: %s\n", lf->line_no, lex_tok_to_str(token));
   lf->token = token;

   /*
    * Here is where we check to see if the user has set certain
    *  expectations (e.g. 32 bit integer). If so, we do type checking
    *  and possible additional scanning (e.g. for range).
    */
   switch (expect) {
   case T_PINT16:
      /* The token text goes to scan_pint() without a token type check; only the low 16 bits are kept */
      lf->u.pint16_val = (scan_pint(lf, lf->str) & 0xffff);
      lf->u2.pint16_val = lf->u.pint16_val;
      token = T_PINT16;
      break;

   case T_PINT32:
      lf->u.pint32_val = scan_pint(lf, lf->str);
      lf->u2.pint32_val = lf->u.pint32_val;
      token = T_PINT32;
      break;

   case T_PINT32_RANGE:
      if (token == T_NUMBER) {
         /* A single number: both ends of the range get the same value */
         lf->u.pint32_val = scan_pint(lf, lf->str);
         lf->u2.pint32_val = lf->u.pint32_val;
         token = T_PINT32;
      } else {
         /* Otherwise expect "low-high": split lf->str in place at the first '-' */
         char *p = strchr(lf->str, '-');
         if (!p) {
            scan_err2(lf, _("expected an integer or a range, got %s: %s"),
               lex_tok_to_str(token), lf->str);
            token = T_ERROR;
            break;
         }
         *p++ = 0;                       /* terminate first half of range */
         lf->u.pint32_val  = scan_pint(lf, lf->str);
         lf->u2.pint32_val = scan_pint(lf, p);
         token = T_PINT32_RANGE;
      }
      break;

   case T_INT16:
      if (token != T_NUMBER || !is_a_number(lf->str)) {
         scan_err2(lf, _("expected an integer number, got %s: %s"),
               lex_tok_to_str(token), lf->str);
         token = T_ERROR;
         break;
      }
      /*
       * NOTE(review): the errno test below only detects a failure if
       * str_to_int64() sets errno -- verify that it does; otherwise an
       * out-of-range value is silently truncated by the cast.
       */
      errno = 0;
      lf->u.int16_val = (int16_t)str_to_int64(lf->str);
      if (errno != 0) {
         scan_err2(lf, _("expected an integer number, got %s: %s"),
               lex_tok_to_str(token), lf->str);
         token = T_ERROR;
      } else {
         token = T_INT16;
      }
      break;

   case T_INT32:
      if (token != T_NUMBER || !is_a_number(lf->str)) {
         scan_err2(lf, _("expected an integer number, got %s: %s"),
               lex_tok_to_str(token), lf->str);
         token = T_ERROR;
         break;
      }
      /* NOTE(review): same errno caveat as for T_INT16 -- confirm str_to_int64() sets errno */
      errno = 0;
      lf->u.int32_val = (int32_t)str_to_int64(lf->str);
      if (errno != 0) {
         scan_err2(lf, _("expected an integer number, got %s: %s"),
               lex_tok_to_str(token), lf->str);
         token = T_ERROR;
      } else {
         token = T_INT32;
      }
      break;

   case T_INT64:
      Dmsg2(dbglvl, "int64=:%s: %f\n", lf->str, strtod(lf->str, NULL));
      if (token != T_NUMBER || !is_a_number(lf->str)) {
         scan_err2(lf, _("expected an integer number, got %s: %s"),
               lex_tok_to_str(token), lf->str);
         token = T_ERROR;
         break;
      }
      /* NOTE(review): same errno caveat as for T_INT16 -- confirm str_to_int64() sets errno */
      errno = 0;
      lf->u.int64_val = str_to_int64(lf->str);
      if (errno != 0) {
         scan_err2(lf, _("expected an integer number, got %s: %s"),
               lex_tok_to_str(token), lf->str);
         token = T_ERROR;
      } else {
         token = T_INT64;
      }
      break;

   case T_PINT64_RANGE:
      if (token == T_NUMBER) {
         /* A single number: both ends of the range get the same value */
         lf->u.pint64_val = scan_pint64(lf, lf->str);
         lf->u2.pint64_val = lf->u.pint64_val;
         token = T_PINT64;
      } else {
         /* Otherwise expect "low-high": split lf->str in place at the first '-' */
         char *p = strchr(lf->str, '-');
         if (!p) {
            scan_err2(lf, _("expected an integer or a range, got %s: %s"),
               lex_tok_to_str(token), lf->str);
            token = T_ERROR;
            break;
         }
         *p++ = 0;                       /* terminate first half of range */
         lf->u.pint64_val  = scan_pint64(lf, lf->str);
         lf->u2.pint64_val = scan_pint64(lf, p);
         token = T_PINT64_RANGE;
      }
      break;

   case T_NAME:
      /* Any of the three string-like tokens is accepted; the token code is left unchanged */
      if (token != T_IDENTIFIER && token != T_UNQUOTED_STRING && token != T_QUOTED_STRING) {
         scan_err2(lf, _("expected a name, got %s: %s"),
               lex_tok_to_str(token), lf->str);
         token = T_ERROR;
      } else if (lf->str_len > MAX_RES_NAME_LENGTH) {
         scan_err3(lf, _("name %s length %d too long, max is %d\n"), lf->str,
            lf->str_len, MAX_RES_NAME_LENGTH);
         token = T_ERROR;
      }
      break;

   case T_STRING:
      /* Any of the three string-like tokens is accepted and reported as T_STRING */
      if (token != T_IDENTIFIER && token != T_UNQUOTED_STRING && token != T_QUOTED_STRING) {
         scan_err2(lf, _("expected a string, got %s: %s"),
               lex_tok_to_str(token), lf->str);
         token = T_ERROR;
      } else {
         token = T_STRING;
      }
      break;


   default:
      break;                          /* no expectation given */
   }
   lf->token = token;                 /* set possible new token */
   return token;
}