static int do_whole_file(const char *filename) { errno = 0; FILE *fp = fopen(filename, "r"); if (!fp) { error("can't open `%1': %2", filename, strerror(errno)); return 0; } int count = 0; int key_len = 0; int c; while ((c = getc(fp)) != EOF) { if (csalnum(c)) { key_len = 1; key_buffer[0] = c; while ((c = getc(fp)) != EOF) { if (!csalnum(c)) break; if (key_len < truncate_len) key_buffer[key_len++] = c; } if (store_key(key_buffer, key_len)) { if (++count >= max_keys_per_item) break; } if (c == EOF) break; } } store_reference(filenames.length(), 0, 0); store_filename(filename); fclose(fp); return 1; }
void index_search_item::read_common_words_file() { if (header.common <= 0) return; const char *common_words_file = munge_filename(strchr(pool, '\0') + 1); errno = 0; FILE *fp = fopen(common_words_file, "r"); if (!fp) { error("can't open `%1': %2", common_words_file, strerror(errno)); return; } common_words_table_size = 2*header.common + 1; while (!is_prime(common_words_table_size)) common_words_table_size++; common_words_table = new char *[common_words_table_size]; for (int i = 0; i < common_words_table_size; i++) common_words_table[i] = 0; int count = 0; int key_len = 0; for (;;) { int c = getc(fp); while (c != EOF && !csalnum(c)) c = getc(fp); if (c == EOF) break; do { if (key_len < header.truncate) key_buffer[key_len++] = cmlower(c); c = getc(fp); } while (c != EOF && csalnum(c)); if (key_len >= header.shortest) { int h = hash(key_buffer, key_len) % common_words_table_size; while (common_words_table[h]) { if (h == 0) h = common_words_table_size; --h; } common_words_table[h] = new char[key_len + 1]; memcpy(common_words_table[h], key_buffer, key_len); common_words_table[h][key_len] = '\0'; } if (++count >= header.common) break; key_len = 0; if (c == EOF) break; } fclose(fp); }
char *get_thru_arg() { int c = input_stack::peek_char(); while (c == ' ') { input_stack::get_char(); c = input_stack::peek_char(); } if (c != EOF && csalpha(c)) { // looks like a macro input_stack::get_char(); token_buffer = c; for (;;) { c = input_stack::peek_char(); if (c == EOF || (!csalnum(c) && c != '_')) break; input_stack::get_char(); token_buffer += char(c); } context_buffer = token_buffer; token_buffer += '\0'; char *def = macro_table.lookup(token_buffer.contents()); if (def) return strsave(def); // I guess it wasn't a macro after all; so push the macro name back. // -2 because we added a '\0' for (int i = token_buffer.length() - 2; i >= 0; i--) input_stack::push_back(token_buffer[i]); } if (get_delimited()) { token_buffer += '\0'; return strsave(token_buffer.contents()); } else return 0; }
const int *index_search_item::search1(const char **pp, const char *end) { while (*pp < end && !csalnum(**pp)) *pp += 1; if (*pp >= end) return 0; const char *start = *pp; while (*pp < end && csalnum(**pp)) *pp += 1; int len = *pp - start; if (len < header.shortest) return 0; if (len > header.truncate) len = header.truncate; int is_number = 1; for (int i = 0; i < len; i++) if (csdigit(start[i])) key_buffer[i] = start[i]; else { key_buffer[i] = cmlower(start[i]); is_number = 0; } if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9')) return 0; unsigned hc = hash(key_buffer, len); if (common_words_table) { for (int h = hc % common_words_table_size; common_words_table[h]; --h) { if (strlen(common_words_table[h]) == (size_t)len && memcmp(common_words_table[h], key_buffer, len) == 0) return 0; if (h == 0) h = common_words_table_size; } } int li = table[int(hc % header.table_size)]; return li < 0 ? &minus_one : lists + li; }
static void read_common_words_file() { if (n_ignore_words <= 0) return; errno = 0; FILE *fp = fopen(common_words_file, "r"); if (!fp) fatal("can't open `%1': %2", common_words_file, strerror(errno)); common_words_table = new word_list * [hash_table_size]; for (int i = 0; i < hash_table_size; i++) common_words_table[i] = 0; int count = 0; int key_len = 0; for (;;) { int c = getc(fp); while (c != EOF && !csalnum(c)) c = getc(fp); if (c == EOF) break; do { if (key_len < truncate_len) key_buffer[key_len++] = cmlower(c); c = getc(fp); } while (c != EOF && csalnum(c)); if (key_len >= shortest_len) { int h = hash(key_buffer, key_len) % hash_table_size; common_words_table[h] = new word_list(key_buffer, key_len, common_words_table[h]); } if (++count >= n_ignore_words) break; key_len = 0; if (c == EOF) break; } n_ignore_words = count; fclose(fp); }
// Fill the two character tables for all 256 character codes.
//
// map[c] is the lower-case form of c when c is alphanumeric, '\0'
// otherwise.  inv_map[c] is a NUL-terminated string: for a lower-case
// letter it holds the letter followed by its upper-case form, for a digit
// just the digit, and for anything else it is empty.
map_init::map_init()
{
  for (int ch = 0; ch < 256; ch++) {
    if (csalnum(ch))
      map[ch] = cmlower(ch);
    else
      map[ch] = '\0';

    int n = 0;
    if (cslower(ch)) {
      inv_map[ch][n++] = ch;
      inv_map[ch][n++] = cmupper(ch);
    }
    else if (csdigit(ch))
      inv_map[ch][n++] = ch;
    inv_map[ch][n] = '\0';
  }
}
static int do_file(const char *filename) { errno = 0; // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on // byte counts to be consistent with fseek. FILE *fp = fopen(filename, FOPEN_RB); if (fp == 0) { error("can't open `%1': %2", filename, strerror(errno)); return 0; } int filename_index = filenames.length(); store_filename(filename); enum { START, // at the start of the file; also in between references BOL, // in the middle of a reference, at the beginning of the line PERCENT, // seen a percent at the beginning of the line IGNORE, // ignoring a field IGNORE_BOL, // at the beginning of a line ignoring a field KEY, // in the middle of a key DISCARD, // after truncate_len bytes of a key MIDDLE // in between keys } state = START; // In states START, BOL, IGNORE_BOL, space_count how many spaces at // the beginning have been seen. In states PERCENT, IGNORE, KEY, // MIDDLE space_count must be 0. int space_count = 0; int byte_count = 0; // bytes read int key_len = 0; int ref_start = -1; // position of start of current reference for (;;) { int c = getc(fp); if (c == EOF) break; // We opened the file in binary mode, so we need to skip // every CR character before a Newline. 
if (c == '\r') { int peek = getc(fp); if (peek == '\n') { byte_count++; c = peek; } else ungetc(peek, fp); } #if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__) else if (c == 0x1a) // ^Z means EOF in text files break; #endif byte_count++; switch (state) { case START: if (c == ' ' || c == '\t') { space_count++; break; } if (c == '\n') { space_count = 0; break; } ref_start = byte_count - space_count - 1; space_count = 0; if (c == '%') state = PERCENT; else if (csalnum(c)) { state = KEY; key_buffer[0] = c; key_len = 1; } else state = MIDDLE; break; case BOL: switch (c) { case '%': if (space_count > 0) { space_count = 0; state = MIDDLE; } else state = PERCENT; break; case ' ': case '\t': space_count++; break; case '\n': store_reference(filename_index, ref_start, byte_count - 1 - space_count - ref_start); state = START; space_count = 0; break; default: space_count = 0; if (csalnum(c)) { state = KEY; key_buffer[0] = c; key_len = 1; } else state = MIDDLE; } break; case PERCENT: if (strchr(ignore_fields, c) != 0) state = IGNORE; else if (c == '\n') state = BOL; else state = MIDDLE; break; case IGNORE: if (c == '\n') state = IGNORE_BOL; break; case IGNORE_BOL: switch (c) { case '%': if (space_count > 0) { state = IGNORE; space_count = 0; } else state = PERCENT; break; case ' ': case '\t': space_count++; break; case '\n': store_reference(filename_index, ref_start, byte_count - 1 - space_count - ref_start); state = START; space_count = 0; break; default: space_count = 0; state = IGNORE; } break; case KEY: if (csalnum(c)) { if (key_len < truncate_len) key_buffer[key_len++] = c; else state = DISCARD; } else { possibly_store_key(key_buffer, key_len); key_len = 0; if (c == '\n') state = BOL; else state = MIDDLE; } break; case DISCARD: if (!csalnum(c)) { possibly_store_key(key_buffer, key_len); key_len = 0; if (c == '\n') state = BOL; else state = MIDDLE; } break; case MIDDLE: if (csalnum(c)) { state = KEY; key_buffer[0] = c; key_len = 1; } else if (c == '\n') state = 
BOL; break; default: assert(0); } } switch (state) { case START: break; case DISCARD: case KEY: possibly_store_key(key_buffer, key_len); // fall through case BOL: case PERCENT: case IGNORE_BOL: case IGNORE: case MIDDLE: store_reference(filename_index, ref_start, byte_count - ref_start - space_count); break; default: assert(0); } fclose(fp); return 1; }
int get_token(int lookup_flag) { context_buffer.clear(); for (;;) { int n = 0; int bol = input_stack::bol(); int c = input_stack::get_char(); if (bol && c == command_char) { token_buffer.clear(); token_buffer += c; // the newline is not part of the token for (;;) { c = input_stack::peek_char(); if (c == EOF || c == '\n') break; input_stack::get_char(); token_buffer += char(c); } context_buffer = token_buffer; return COMMAND_LINE; } switch (c) { case EOF: return EOF; case ' ': case '\t': break; case '\\': { int d = input_stack::peek_char(); if (d != '\n') { context_buffer = '\\'; return '\\'; } input_stack::get_char(); break; } case '#': do { c = input_stack::get_char(); } while (c != '\n' && c != EOF); if (c == '\n') context_buffer = '\n'; return c; case '"': context_buffer = '"'; token_buffer.clear(); for (;;) { c = input_stack::get_char(); if (c == '\\') { context_buffer += '\\'; c = input_stack::peek_char(); if (c == '"') { input_stack::get_char(); token_buffer += '"'; context_buffer += '"'; } else token_buffer += '\\'; } else if (c == '\n') { error("newline in string"); break; } else if (c == EOF) { error("missing `\"'"); break; } else if (c == '"') { context_buffer += '"'; break; } else { context_buffer += char(c); token_buffer += char(c); } } return TEXT; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { int overflow = 0; n = 0; for (;;) { if (n > (INT_MAX - 9)/10) { overflow = 1; break; } n *= 10; n += c - '0'; context_buffer += char(c); c = input_stack::peek_char(); if (c == EOF || !csdigit(c)) break; c = input_stack::get_char(); } token_double = n; if (overflow) { for (;;) { token_double *= 10.0; token_double += c - '0'; context_buffer += char(c); c = input_stack::peek_char(); if (c == EOF || !csdigit(c)) break; c = input_stack::get_char(); } // if somebody asks for 1000000000000th, we will silently // give them INT_MAXth double temp = token_double; // work around gas 1.34/sparc bug if (token_double > 
INT_MAX) n = INT_MAX; else n = int(temp); } } switch (c) { case 'i': case 'I': context_buffer += char(c); input_stack::get_char(); return NUMBER; case '.': { context_buffer += '.'; input_stack::get_char(); got_dot: double factor = 1.0; for (;;) { c = input_stack::peek_char(); if (c == EOF || !csdigit(c)) break; input_stack::get_char(); context_buffer += char(c); factor /= 10.0; if (c != '0') token_double += factor*(c - '0'); } if (c != 'e' && c != 'E') { if (c == 'i' || c == 'I') { context_buffer += char(c); input_stack::get_char(); } return NUMBER; } } // fall through case 'e': case 'E': { int echar = c; input_stack::get_char(); c = input_stack::peek_char(); int sign = '+'; if (c == '+' || c == '-') { sign = c; input_stack::get_char(); c = input_stack::peek_char(); if (c == EOF || !csdigit(c)) { input_stack::push_back(sign); input_stack::push_back(echar); return NUMBER; } context_buffer += char(echar); context_buffer += char(sign); } else { if (c == EOF || !csdigit(c)) { input_stack::push_back(echar); return NUMBER; } context_buffer += char(echar); } input_stack::get_char(); context_buffer += char(c); n = c - '0'; for (;;) { c = input_stack::peek_char(); if (c == EOF || !csdigit(c)) break; input_stack::get_char(); context_buffer += char(c); n = n*10 + (c - '0'); } if (sign == '-') n = -n; if (c == 'i' || c == 'I') { context_buffer += char(c); input_stack::get_char(); } token_double *= pow(10.0, n); return NUMBER; } case 'n': input_stack::get_char(); c = input_stack::peek_char(); if (c == 'd') { input_stack::get_char(); token_int = n; context_buffer += "nd"; return ORDINAL; } input_stack::push_back('n'); return NUMBER; case 'r': input_stack::get_char(); c = input_stack::peek_char(); if (c == 'd') { input_stack::get_char(); token_int = n; context_buffer += "rd"; return ORDINAL; } input_stack::push_back('r'); return NUMBER; case 't': input_stack::get_char(); c = input_stack::peek_char(); if (c == 'h') { input_stack::get_char(); token_int = n; context_buffer += "th"; 
return ORDINAL; } input_stack::push_back('t'); return NUMBER; case 's': input_stack::get_char(); c = input_stack::peek_char(); if (c == 't') { input_stack::get_char(); token_int = n; context_buffer += "st"; return ORDINAL; } input_stack::push_back('s'); return NUMBER; default: return NUMBER; } break; case '\'': { c = input_stack::peek_char(); if (c == 't') { input_stack::get_char(); c = input_stack::peek_char(); if (c == 'h') { input_stack::get_char(); context_buffer = "'th"; return TH; } else input_stack::push_back('t'); } context_buffer = "'"; return '\''; } case '.': { c = input_stack::peek_char(); if (c != EOF && csdigit(c)) { n = 0; token_double = 0.0; context_buffer = '.'; goto got_dot; } return get_token_after_dot(c); } case '<': c = input_stack::peek_char(); if (c == '-') { input_stack::get_char(); c = input_stack::peek_char(); if (c == '>') { input_stack::get_char(); context_buffer = "<->"; return DOUBLE_ARROW_HEAD; } context_buffer = "<-"; return LEFT_ARROW_HEAD; } else if (c == '=') { input_stack::get_char(); context_buffer = "<="; return LESSEQUAL; } context_buffer = "<"; return '<'; case '-': c = input_stack::peek_char(); if (c == '>') { input_stack::get_char(); context_buffer = "->"; return RIGHT_ARROW_HEAD; } context_buffer = "-"; return '-'; case '!': c = input_stack::peek_char(); if (c == '=') { input_stack::get_char(); context_buffer = "!="; return NOTEQUAL; } context_buffer = "!"; return '!'; case '>': c = input_stack::peek_char(); if (c == '=') { input_stack::get_char(); context_buffer = ">="; return GREATEREQUAL; } context_buffer = ">"; return '>'; case '=': c = input_stack::peek_char(); if (c == '=') { input_stack::get_char(); context_buffer = "=="; return EQUALEQUAL; } context_buffer = "="; return '='; case '&': c = input_stack::peek_char(); if (c == '&') { input_stack::get_char(); context_buffer = "&&"; return ANDAND; } context_buffer = "&"; return '&'; case '|': c = input_stack::peek_char(); if (c == '|') { input_stack::get_char(); 
context_buffer = "||"; return OROR; } context_buffer = "|"; return '|'; default: if (c != EOF && csalpha(c)) { token_buffer.clear(); token_buffer = c; for (;;) { c = input_stack::peek_char(); if (c == EOF || (!csalnum(c) && c != '_')) break; input_stack::get_char(); token_buffer += char(c); } int tok = lookup_keyword(token_buffer.contents(), token_buffer.length()); if (tok != 0) { context_buffer = token_buffer; return tok; } char *def = 0; if (lookup_flag) { token_buffer += '\0'; def = macro_table.lookup(token_buffer.contents()); token_buffer.set_length(token_buffer.length() - 1); if (def) { if (c == '(') { input_stack::get_char(); interpolate_macro_with_args(def); } else input_stack::push(new macro_input(def)); } } if (!def) { context_buffer = token_buffer; if (csupper(token_buffer[0])) return LABEL; else return VARIABLE; } } else { context_buffer = char(c); return (unsigned char)c; } break; } } }