Beispiel #1
0
// Index the whole of FILENAME as one reference.  Each run of
// alphanumeric characters is passed to store_key() (only the first
// truncate_len bytes of a run are kept).  Scanning stops at end of file
// or once store_key() has returned nonzero max_keys_per_item times.
// Returns 1 on success, 0 if the file cannot be opened.
static int do_whole_file(const char *filename)
{
  errno = 0;
  FILE *in = fopen(filename, "r");
  if (in == 0) {
    error("can't open `%1': %2", filename, strerror(errno));
    return 0;
  }
  int accepted = 0;
  for (int ch = getc(in); ch != EOF; ch = getc(in)) {
    if (!csalnum(ch))
      continue;
    // First character of a key; gather the rest of the run.
    int len = 0;
    key_buffer[len++] = ch;
    for (ch = getc(in); ch != EOF && csalnum(ch); ch = getc(in))
      if (len < truncate_len)
        key_buffer[len++] = ch;
    if (store_key(key_buffer, len) && ++accepted >= max_keys_per_item)
      break;
    // The run may have been ended by end of file rather than by a
    // separator; don't try to read past it.
    if (ch == EOF)
      break;
  }
  store_reference(filenames.length(), 0, 0);
  store_filename(filename);
  fclose(in);
  return 1;
}
Beispiel #2
0
void index_search_item::read_common_words_file()
{
  if (header.common <= 0)
    return;
  const char *common_words_file = munge_filename(strchr(pool, '\0') + 1);
  errno = 0;
  FILE *fp = fopen(common_words_file, "r");
  if (!fp) {
    error("can't open `%1': %2", common_words_file, strerror(errno));
    return;
  }
  common_words_table_size = 2*header.common + 1;
  while (!is_prime(common_words_table_size))
    common_words_table_size++;
  common_words_table = new char *[common_words_table_size];
  for (int i = 0; i < common_words_table_size; i++)
    common_words_table[i] = 0;
  int count = 0;
  int key_len = 0;
  for (;;) {
    int c = getc(fp);
    while (c != EOF && !csalnum(c))
      c = getc(fp);
    if (c == EOF)
      break;
    do {
      if (key_len < header.truncate)
	key_buffer[key_len++] = cmlower(c);
      c = getc(fp);
    } while (c != EOF && csalnum(c));
    if (key_len >= header.shortest) {
      int h = hash(key_buffer, key_len) % common_words_table_size;
      while (common_words_table[h]) {
	if (h == 0)
	  h = common_words_table_size;
	--h;
      }
      common_words_table[h] = new char[key_len + 1];
      memcpy(common_words_table[h], key_buffer, key_len);
      common_words_table[h][key_len] = '\0';
    }
    if (++count >= header.common)
      break;
    key_len = 0;
    if (c == EOF)
      break;
  }
  fclose(fp);
}
Beispiel #3
0
// Read the next argument from the input: either the name of a defined
// macro, in which case a copy of its definition is returned, or a
// delimited body read by get_delimited().  The result comes from
// strsave(); 0 is returned when get_delimited() fails.
char *get_thru_arg()
{
  // Discard leading spaces.
  int ch;
  while ((ch = input_stack::peek_char()) == ' ')
    input_stack::get_char();
  if (ch != EOF && csalpha(ch)) {
    // An identifier: it may name a macro.
    input_stack::get_char();
    token_buffer = ch;
    while ((ch = input_stack::peek_char()) != EOF
           && (csalnum(ch) || ch == '_')) {
      input_stack::get_char();
      token_buffer += char(ch);
    }
    context_buffer = token_buffer;
    token_buffer += '\0';
    char *body = macro_table.lookup(token_buffer.contents());
    if (body != 0)
      return strsave(body);
    // Not a macro: return the identifier to the input, last character
    // first.  Start one below the final index to skip the '\0' that
    // was appended for the lookup.
    int i = token_buffer.length() - 1;
    while (--i >= 0)
      input_stack::push_back(token_buffer[i]);
  }
  if (!get_delimited())
    return 0;
  token_buffer += '\0';
  return strsave(token_buffer.contents());
}
Beispiel #4
0
// Take the next alphanumeric word from the range [*pp, end), advance
// *pp past it, and look it up in the index.
// Returns 0 when no word is left, or when the word is too short, is a
// number other than a four-digit one starting with "19", or is in the
// common words table.  Otherwise returns the list selected by the
// word's hash: &minus_one when the table entry is negative, else
// lists + entry.
const int *index_search_item::search1(const char **pp, const char *end)
{
  const char *cursor = *pp;
  // Skip separators in front of the word.
  while (cursor < end && !csalnum(*cursor))
    cursor++;
  *pp = cursor;
  if (cursor >= end)
    return 0;
  const char *word = cursor;
  while (cursor < end && csalnum(*cursor))
    cursor++;
  *pp = cursor;
  int len = cursor - word;
  if (len < header.shortest)
    return 0;
  if (len > header.truncate)
    len = header.truncate;
  // Build the lookup key: digits copied as is, everything else
  // lowercased.
  int all_digits = 1;
  for (int i = 0; i < len; i++) {
    const char ch = word[i];
    if (csdigit(ch))
      key_buffer[i] = ch;
    else {
      key_buffer[i] = cmlower(ch);
      all_digits = 0;
    }
  }
  if (all_digits) {
    const int is_19xx = len == 4 && word[0] == '1' && word[1] == '9';
    if (!is_19xx)
      return 0;
  }
  unsigned hc = hash(key_buffer, len);
  if (common_words_table) {
    // Walk the probe sequence (downwards with wrap-around) until an
    // empty slot shows the word is not a common word.
    int h = hc % common_words_table_size;
    while (common_words_table[h]) {
      const char *common = common_words_table[h];
      if (strlen(common) == (size_t)len
          && memcmp(common, key_buffer, len) == 0)
        return 0;
      h = (h == 0 ? common_words_table_size : h) - 1;
    }
  }
  int li = table[int(hc % header.table_size)];
  if (li < 0)
    return &minus_one;
  return lists + li;
}
Beispiel #5
0
// Read up to n_ignore_words words from common_words_file into
// common_words_table, a chained hash table with hash_table_size
// buckets.  Words are lowercased and cut off at truncate_len bytes;
// words shorter than shortest_len are counted but not stored.  On
// return n_ignore_words holds the number of words actually read.
// Calls fatal() if the file cannot be opened.
static void read_common_words_file()
{
  if (n_ignore_words <= 0)
    return;
  errno = 0;
  FILE *in = fopen(common_words_file, "r");
  if (in == 0)
    fatal("can't open `%1': %2", common_words_file, strerror(errno));
  common_words_table = new word_list * [hash_table_size];
  for (int bucket = 0; bucket < hash_table_size; bucket++)
    common_words_table[bucket] = 0;
  int words_read = 0;
  for (;;) {
    // Skip to the start of the next word.
    int ch;
    do {
      ch = getc(in);
    } while (ch != EOF && !csalnum(ch));
    if (ch == EOF)
      break;
    // Collect the word.
    int len = 0;
    do {
      if (len < truncate_len)
        key_buffer[len++] = cmlower(ch);
      ch = getc(in);
    } while (ch != EOF && csalnum(ch));
    if (len >= shortest_len) {
      // Link the new word in at the head of its bucket's chain.
      int bucket = hash(key_buffer, len) % hash_table_size;
      common_words_table[bucket] = new word_list(key_buffer, len,
                                                 common_words_table[bucket]);
    }
    if (++words_read >= n_ignore_words || ch == EOF)
      break;
  }
  n_ignore_words = words_read;
  fclose(in);
}
Beispiel #6
0
// Fill in the character translation tables.
// map[c] is the lowercase form of c for alphanumeric c, '\0' otherwise.
// inv_map[c] is a '\0'-terminated list of the characters associated
// with c: the letter and its uppercase form for a lowercase letter, the
// digit itself for a digit, and the empty string for anything else.
map_init::map_init()
{
  for (int ch = 0; ch < 256; ch++) {
    if (csalnum(ch))
      map[ch] = cmlower(ch);
    else
      map[ch] = '\0';

    if (cslower(ch)) {
      inv_map[ch][0] = ch;
      inv_map[ch][1] = cmupper(ch);
      inv_map[ch][2] = '\0';
    }
    else if (csdigit(ch)) {
      inv_map[ch][0] = ch;
      inv_map[ch][1] = 0;
    }
    else
      inv_map[ch][0] = '\0';
  }
}
Beispiel #7
0
// Index the references contained in FILENAME.
//
// The file is scanned one byte at a time by a small state machine.  A
// reference starts at the first non-blank character after the start of
// the file or after a blank line, and ends at the next line that holds
// only spaces and tabs (or at end of file).  Within a reference, every
// run of alphanumeric characters outside an ignored field is passed to
// possibly_store_key(), cut off at truncate_len bytes.  A field is a
// line starting with `%' in column one; it is ignored when the
// character after the `%' occurs in ignore_fields.  For each reference,
// store_reference() is given the file index, the byte offset of the
// reference, and its length in bytes.
//
// Returns 1 on success, 0 if the file cannot be opened.
static int do_file(const char *filename)
{
  errno = 0;
  // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
  // byte counts to be consistent with fseek.
  FILE *fp = fopen(filename, FOPEN_RB);
  if (fp == 0) {
    error("can't open `%1': %2", filename, strerror(errno));
    return 0;
  }
  int filename_index = filenames.length();
  store_filename(filename);

  enum {
    START,	// at the start of the file; also in between references
    BOL,	// in the middle of a reference, at the beginning of the line
    PERCENT,	// seen a percent at the beginning of the line
    IGNORE,	// ignoring a field
    IGNORE_BOL,	// at the beginning of a line ignoring a field
    KEY,	// in the middle of a key
    DISCARD,	// after truncate_len bytes of a key
    MIDDLE	// in between keys
  } state = START;
  
  // In states START, BOL, IGNORE_BOL, space_count how many spaces at
  // the beginning have been seen.  In states PERCENT, IGNORE, KEY,
  // MIDDLE space_count must be 0.
  int space_count = 0;
  int byte_count = 0;		// bytes read
  int key_len = 0;
  int ref_start = -1;		// position of start of current reference
  for (;;) {
    int c = getc(fp);
    if (c == EOF)
      break;
    // We opened the file in binary mode, so we need to skip
    // every CR character before a Newline.
    if (c == '\r') {
      int peek = getc(fp);
      if (peek == '\n') {
	// Count the CR here; the LF is counted below like any other byte.
	byte_count++;
	c = peek;
      }
      else
	ungetc(peek, fp);
    }
#if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
    else if (c == 0x1a)	// ^Z means EOF in text files
      break;
#endif
    byte_count++;
    switch (state) {
    case START:
      if (c == ' ' || c == '\t') {
	space_count++;
	break;
      }
      if (c == '\n') {
	space_count = 0;
	break;
      }
      // First non-blank character: the reference starts at the
      // beginning of its leading white space.
      ref_start = byte_count - space_count - 1;
      space_count = 0;
      if (c == '%')
	state = PERCENT;
      else if (csalnum(c)) {
	state = KEY;
	key_buffer[0] = c;
	key_len = 1;
      }
      else
	state = MIDDLE;
      break;
    case BOL:
      switch (c) {
      case '%':
	// A percent only introduces a field in column one.
	if (space_count > 0) {
	  space_count = 0;
	  state = MIDDLE;
	}
	else
	  state = PERCENT;
	break;
      case ' ':
      case '\t':
	space_count++;
	break;
      case '\n':
	// Blank line: the reference ended just before this line.
	store_reference(filename_index, ref_start,
			byte_count - 1 - space_count - ref_start);
	state = START;
	space_count = 0;
	break;
      default:
	space_count = 0;
	if (csalnum(c)) {
	  state = KEY;
	  key_buffer[0] = c;
	  key_len = 1;
	}
	else
	  state = MIDDLE;
      }
      break;
    case PERCENT:
      // strchr() also matches the terminating null of ignore_fields, so
      // a NUL byte (possible, since the file is read in binary mode)
      // must be excluded explicitly; it names no field.
      if (c != '\0' && strchr(ignore_fields, c) != 0)
	state = IGNORE;
      else if (c == '\n')
	state = BOL;
      else
	state = MIDDLE;
      break;
    case IGNORE:
      if (c == '\n')
	state = IGNORE_BOL;
      break;
    case IGNORE_BOL:
      switch (c) {
      case '%':
	// An indented percent is still part of the ignored field.
	if (space_count > 0) {
	  state = IGNORE;
	  space_count = 0;
	}
	else
	  state = PERCENT;
	break;
      case ' ':
      case '\t':
	space_count++;
	break;
      case '\n':
	// Blank line: the reference ended just before this line.
	store_reference(filename_index, ref_start,
			byte_count - 1 - space_count - ref_start);
	state = START;
	space_count = 0;
	break;
      default:
	space_count = 0;
	state = IGNORE;
      }
      break;
    case KEY:
      if (csalnum(c)) {
	if (key_len < truncate_len)
	  key_buffer[key_len++] = c;
	else
	  state = DISCARD;
      }
      else {
	possibly_store_key(key_buffer, key_len);
	key_len = 0;
	if (c == '\n')
	  state = BOL;
	else
	  state = MIDDLE;
      }
      break;
    case DISCARD:
      // The key is already full; drop characters until it ends.
      if (!csalnum(c)) {
	possibly_store_key(key_buffer, key_len);
	key_len = 0;
	if (c == '\n')
	  state = BOL;
	else
	  state = MIDDLE;
      }
      break;
    case MIDDLE:
      if (csalnum(c)) {
	state = KEY;
	key_buffer[0] = c;
	key_len = 1;
      }
      else if (c == '\n')
	state = BOL;
      break;
    default:
      assert(0);
    }
  }
  // End of input: finish whatever was in progress.
  switch (state) {
  case START:
    break;
  case DISCARD:
  case KEY:
    possibly_store_key(key_buffer, key_len);
    // fall through
  case BOL:
  case PERCENT:
  case IGNORE_BOL:
  case IGNORE:
  case MIDDLE:
    store_reference(filename_index, ref_start,
		    byte_count - ref_start - space_count);
    break;
  default:
    assert(0);
  }
  fclose(fp);
  return 1;
}
Beispiel #8
0
// Read the next token from input_stack and return its token code.
//
// Spaces, tabs and backslash-newline pairs are skipped.  The source
// text of the token is left in context_buffer; string and identifier
// text goes to token_buffer, numeric values to token_double and
// ordinal values to token_int.  Single characters that start no longer
// token are returned as themselves.  Returns EOF at end of input.
//
// If LOOKUP_FLAG is nonzero, an identifier that is not a keyword is
// looked up in macro_table; when it is defined, its definition is
// pushed onto the input (with arguments if a `(' follows) and scanning
// continues with the expansion.
int get_token(int lookup_flag)
{
  context_buffer.clear();
  for (;;) {
    int n = 0;
    int bol = input_stack::bol();
    int c = input_stack::get_char();
    // A line starting with command_char is returned in one piece.
    if (bol && c == command_char) {
      token_buffer.clear();
      token_buffer += c;
      // the newline is not part of the token
      for (;;) {
	c = input_stack::peek_char();
	if (c == EOF || c == '\n')
	  break;
	input_stack::get_char();
	token_buffer += char(c);
      }
      context_buffer = token_buffer;
      return COMMAND_LINE;
    }
    switch (c) {
    case EOF:
      return EOF;
    case ' ':
    case '\t':
      break;
    case '\\':
      {
	// Backslash-newline is swallowed; any other backslash is a token.
	int d = input_stack::peek_char();
	if (d != '\n') {
	  context_buffer = '\\';
	  return '\\';
	}
	input_stack::get_char();
	break;
      }
    case '#':
      // Comment: discard up to and including the newline, then return
      // the newline (or EOF) as the token.
      do {
	c = input_stack::get_char();
      } while (c != '\n' && c != EOF);
      if (c == '\n')
	context_buffer = '\n';
      return c;
    case '"':
      // Quoted string.  `\"' stands for a double quote; a backslash
      // before anything else is kept as is.
      context_buffer = '"';
      token_buffer.clear();
      for (;;) {
	c = input_stack::get_char();
	if (c == '\\') {
	  context_buffer += '\\';
	  c = input_stack::peek_char();
	  if (c == '"') {
	    input_stack::get_char();
	    token_buffer += '"';
	    context_buffer += '"';
	  }
	  else
	    token_buffer += '\\';
	}
	else if (c == '\n') {
	  error("newline in string");
	  break;
	}
	else if (c == EOF) {
	  error("missing `\"'");
	  break;
	}
	else if (c == '"') {
	  context_buffer += '"';
	  break;
	}
	else {
	  context_buffer += char(c);
	  token_buffer += char(c);
	}
      }
      return TEXT;
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      {   
	// Accumulate the integer part in n for as long as another digit
	// cannot overflow an int; after that carry on in token_double.
	int overflow = 0;
	n = 0;
	for (;;) {
	  if (n > (INT_MAX - 9)/10) {
	    overflow = 1;
	    break;
	  }
	  n *= 10;
	  n += c - '0';
	  context_buffer += char(c);
	  c = input_stack::peek_char();
	  if (c == EOF || !csdigit(c))
	    break;
	  c = input_stack::get_char();
	}
	token_double = n;
	if (overflow) {
	  for (;;) {
	    token_double *= 10.0;
	    token_double += c - '0';
	    context_buffer += char(c);
	    c = input_stack::peek_char();
	    if (c == EOF || !csdigit(c))
	      break;
	    c = input_stack::get_char();
	  }
	  // if somebody asks for 1000000000000th, we will silently
	  // give them INT_MAXth
	  double temp = token_double; // work around gas 1.34/sparc bug
	  if (token_double > INT_MAX)
	    n = INT_MAX;
	  else
	    n = int(temp);
	}
      }
      // c is now the (still unread) character after the digits.
      switch (c) {
      case 'i':
      case 'I':
	// A trailing `i' or `I' is consumed as part of the number.
	context_buffer += char(c);
	input_stack::get_char();
	return NUMBER;
      case '.':
	{
	  context_buffer += '.';
	  input_stack::get_char();
	// Also entered by goto from the leading-dot case further down.
	got_dot:
	  // Add the fractional digits to token_double.
	  double factor = 1.0;
	  for (;;) {
	    c = input_stack::peek_char();
	    if (c == EOF || !csdigit(c))
	      break;
	    input_stack::get_char();
	    context_buffer += char(c);
	    factor /= 10.0;
	    if (c != '0')
	      token_double += factor*(c - '0');
	  }
	  if (c != 'e' && c != 'E') {
	    if (c == 'i' || c == 'I') {
	      context_buffer += char(c);
	      input_stack::get_char();
	    }
	    return NUMBER;
	  }
	}
	// fall through
      case 'e':
      case 'E':
	{
	  // Possible exponent.  If no digit follows the `e' (and
	  // optional sign), what was read is pushed back and the number
	  // ends before the `e'.
	  int echar = c;
	  input_stack::get_char();
	  c = input_stack::peek_char();
	  int sign = '+';
	  if (c == '+' || c == '-') {
	    sign = c;
	    input_stack::get_char();
	    c = input_stack::peek_char();
	    if (c == EOF || !csdigit(c)) {
	      input_stack::push_back(sign);
	      input_stack::push_back(echar);
	      return NUMBER;
	    }
	    context_buffer += char(echar);
	    context_buffer += char(sign);
	  }
	  else {
	    if (c == EOF || !csdigit(c)) {
	      input_stack::push_back(echar);
	      return NUMBER;
	    }
	    context_buffer += char(echar);
	  }
	  input_stack::get_char();
	  context_buffer += char(c);
	  // From here on n holds the exponent.
	  n = c - '0';
	  for (;;) {
	    c = input_stack::peek_char();
	    if (c == EOF || !csdigit(c))
	      break;
	    input_stack::get_char();
	    context_buffer += char(c);
	    n = n*10 + (c - '0');
	  }
	  if (sign == '-')
	    n = -n;
	  if (c == 'i' || c == 'I') {
	    context_buffer += char(c);
	    input_stack::get_char();
	  }
	  token_double *= pow(10.0, n);
	  return NUMBER;
	}
      // Ordinal suffixes `nd', `rd', `th' and `st'.  If the second
      // letter does not follow, the first is pushed back and a plain
      // NUMBER is returned.
      case 'n':
	input_stack::get_char();
	c = input_stack::peek_char();
	if (c == 'd') {
	  input_stack::get_char();
	  token_int = n;
	  context_buffer += "nd";
	  return ORDINAL;
	}
	input_stack::push_back('n');
	return NUMBER;
      case 'r':
	input_stack::get_char();
	c = input_stack::peek_char();
	if (c == 'd') {
	  input_stack::get_char();
	  token_int = n;
	  context_buffer += "rd";
	  return ORDINAL;
	}
	input_stack::push_back('r');
	return NUMBER;
      case 't':
	input_stack::get_char();
	c = input_stack::peek_char();
	if (c == 'h') {
	  input_stack::get_char();
	  token_int = n;
	  context_buffer += "th";
	  return ORDINAL;
	}
	input_stack::push_back('t');
	return NUMBER;
      case 's':
	input_stack::get_char();
	c = input_stack::peek_char();
	if (c == 't') {
	  input_stack::get_char();
	  token_int = n;
	  context_buffer += "st";
	  return ORDINAL;
	}
	input_stack::push_back('s');
	return NUMBER;
      default:
	return NUMBER;
      }
      break;
    case '\'':
      {
	// `'th' is a single token; any other quote stands alone.
	c = input_stack::peek_char();
	if (c == 't') {
	  input_stack::get_char();
	  c = input_stack::peek_char();
	  if (c == 'h') {
	    input_stack::get_char();
	    context_buffer = "'th";
	    return TH;
	  }
	  else
	    input_stack::push_back('t');
	}
	context_buffer = "'";
	return '\'';
      }
    case '.':
      {
	// A dot followed by a digit starts a number with no integer
	// part; otherwise get_token_after_dot() decides.
	c = input_stack::peek_char();
	if (c != EOF && csdigit(c)) {
	  n = 0;
	  token_double = 0.0;
	  context_buffer = '.';
	  goto got_dot;
	}
	return get_token_after_dot(c);
      }
    // Operators of one to three characters.
    case '<':
      c = input_stack::peek_char();
      if (c == '-') {
	input_stack::get_char();
	c = input_stack::peek_char();
	if (c == '>') {
	  input_stack::get_char();
	  context_buffer = "<->";
	  return DOUBLE_ARROW_HEAD;
	}
	context_buffer = "<-";
	return LEFT_ARROW_HEAD;
      }
      else if (c == '=') {
	input_stack::get_char();
	context_buffer = "<=";
	return LESSEQUAL;
      }
      context_buffer = "<";
      return '<';
    case '-':
      c = input_stack::peek_char();
      if (c == '>') {
	input_stack::get_char();
	context_buffer = "->";
	return RIGHT_ARROW_HEAD;
      }
      context_buffer = "-";
      return '-';
    case '!':
      c = input_stack::peek_char();
      if (c == '=') {
	input_stack::get_char();
	context_buffer = "!=";
	return NOTEQUAL;
      }
      context_buffer = "!";
      return '!';
    case '>':
      c = input_stack::peek_char();
      if (c == '=') {
	input_stack::get_char();
	context_buffer = ">=";
	return GREATEREQUAL;
      }
      context_buffer = ">";
      return '>';
    case '=':
      c = input_stack::peek_char();
      if (c == '=') {
	input_stack::get_char();
	context_buffer = "==";
	return EQUALEQUAL;
      }
      context_buffer = "=";
      return '=';
    case '&':
      c = input_stack::peek_char();
      if (c == '&') {
	input_stack::get_char();
	context_buffer = "&&";
	return ANDAND;
      }
      context_buffer = "&";
      return '&';
    case '|':
      c = input_stack::peek_char();
      if (c == '|') {
	input_stack::get_char();
	context_buffer = "||";
	return OROR;
      }
      context_buffer = "|";
      return '|';
    default:
      if (c != EOF && csalpha(c)) {
	// Identifier: a letter followed by letters, digits and
	// underscores.  Keywords are recognized first.
	token_buffer.clear();
	token_buffer = c;
	for (;;) {
	  c = input_stack::peek_char();
	  if (c == EOF || (!csalnum(c) && c != '_'))
	    break;
	  input_stack::get_char();
	  token_buffer += char(c);
	}
	int tok = lookup_keyword(token_buffer.contents(),
				 token_buffer.length());
	if (tok != 0) {
	  context_buffer = token_buffer;
	  return tok;
	}
	char *def = 0;
	if (lookup_flag) {
	  // A '\0' is appended just for the lookup and removed again
	  // straight afterwards.
	  token_buffer += '\0';
	  def = macro_table.lookup(token_buffer.contents());
	  token_buffer.set_length(token_buffer.length() - 1);
	  if (def) {
	    if (c == '(') {
	      input_stack::get_char();
	      interpolate_macro_with_args(def);
	    }
	    else
	      input_stack::push(new macro_input(def));
	  }
	}
	// Not a macro: the case of the first letter decides between
	// LABEL and VARIABLE.  After a macro expansion control instead
	// falls out of the switch and the loop reads from the expansion.
	if (!def) {
	  context_buffer = token_buffer;
	  if (csupper(token_buffer[0]))
	    return LABEL;
	  else
	    return VARIABLE;
	}
      }
      else {
	context_buffer = char(c);
	return (unsigned char)c;
      }
      break;
    }
  }
}