/*
 * Debug aid: writes one line to Debug_File for every position (from 1 up)
 * whose forward reference is non-zero.
 *
 * The first time a chain is met it is printed in full, and every position
 * on it is tagged in a scratch array with the position whose line shows it.
 * A later position whose successor is already tagged only prints a
 * "see <position>" pointer to that earlier line.
 */
static void db_print_forward_references(void)
{
    size_t pos;
    /* shown_at[p] != 0: p was already printed on the line of shown_at[p] */
    size_t *shown_at = (size_t *)Calloc(Text_Length(), sizeof (size_t));

    for (pos = 1; pos < Text_Length(); pos++) {
        size_t link = forward_reference[pos];

        if (link != 0) {
            fprintf(Debug_File, "FWR[%s]:", any_uint2string(pos, 0));

            if (!shown_at[link]) {
                /* new chain: list all members, tagging each with pos */
                for (; link != 0; link = forward_reference[link]) {
                    fprintf(Debug_File, " %s", any_uint2string(link, 0));
                    shown_at[link] = pos;
                }
            } else {
                /* rest of the chain is already on an earlier line */
                fprintf(Debug_File, " see %s",
                    any_uint2string(shown_at[link], 0));
            }

            fprintf(Debug_File, "\n");
        }
    }

    Free((void *)shown_at);
}
/*
 * Second pass over the forward references: weeds out spurious matches by
 * means of the secondary hash code hash2().
 *
 * For every position that can still start a run, its chain is followed
 * until a position with the same hash2() value is found; the forward
 * reference of the starting position is then short-circuited to that
 * position, or to zero when the chain runs out.  This is quadratic in the
 * chain length.  Overlapping sequences are deliberately not removed here,
 * since the wrong copy might be removed.
 */
static void make_forward_references_hash2(void)
{
    size_t start;

    for (start = 0; start + Min_Run_Size < Text_Length(); start++) {
        size_t wanted = hash2(&Token_Array[start]);
        size_t cand = forward_reference[start];

        /* skip chain members whose secondary hash code differs */
        while (cand != 0 && hash2(&Token_Array[cand]) != wanted) {
            cand = forward_reference[cand];
        }

        /* cand is the first agreeing position, or zero if there is none */
        forward_reference[start] = cand;
    }

#ifdef DB_FORW_REF
    db_forward_references("second hashing");
#endif /* DB_FORW_REF */
}
static void init_hash_table(void) { int n; /* find the ideal hash table size */ n = 0; while (prime[n] < Text_Length()) { n++; /* this will always terminate, if prime[] is large enough */ } /* see if we can allocate that much space, and if not, step down */ last_index = 0; while (!last_index && n >= 0) { hash_table_size = prime[n]; last_index = (size_t *) TryCalloc(hash_table_size, sizeof (size_t)); n--; } if (!last_index) { fatal("out of memory"); } /* find sample positions */ for (n = 0; n < N_SAMPLES; n++) { /* straigh-line approximation; uninituitive as usual */ sample_pos[n] = ( (2 * n * (Min_Run_Size - 1) + (N_SAMPLES - 1)) / (2 * (N_SAMPLES - 1)) ); } }
static void db_forward_references(const char *msg) { size_t n; size_t n_frw_chains = 0; /* number of forward ref. chains */ size_t tot_frwc_len = 0; char *crossed_out; fprintf(Debug_File, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg); fprintf(Debug_File, "hash_table_size = %s\n", any_uint2string(hash_table_size, 0)); fprintf(Debug_File, "N_SAMPLES = %d\n", N_SAMPLES); crossed_out = (char *)Calloc(Text_Length(), sizeof (char)); /* Each forward_reference[n] starts in principle a new chain, and these chains never touch each other. We check this property by marking the positions in each chain in an array; if we meet a marked entry while following a chain, it must have been on an earlier chain and we have an error. We also determine the lengths of the chains, for statistics. */ if (forward_reference[0]) { fprintf(Debug_File, ">>>> forward_reference[0] is not zero <<<<\n" ); } for (n = 1; n < Text_Length(); n++) { if (forward_reference[n] && !crossed_out[n]) { /* start of a new chain */ n_frw_chains++; tot_frwc_len += db_frw_chain(n, crossed_out); } } db_print_forward_references(); Free((char *)crossed_out); fprintf(Debug_File, "text length = %s, # forward chains = %s, total frw chain length = %s\n\n", any_uint2string(Text_Length(), 0), any_uint2string(n_frw_chains, 0), any_uint2string(tot_frwc_len, 0) ); }
/*
 * Constructs the forward references table.
 *
 * The table gets one size_t entry per position of the text and is then
 * filled and refined by the successive hashing passes.  The third pass,
 * which checks up on the first two, is only run when DB_FORW_REF is
 * defined.
 */
void Make_Forward_References(void)
{
    /* one entry for each position in the text */
    n_forward_references = Text_Length();
    forward_reference = (size_t *)Calloc(
        n_forward_references, sizeof (size_t)
    );

    make_forward_references_hash1();
    make_forward_references_hash2();
#if defined(DB_FORW_REF)
    make_forward_references_hash3();
#endif
}
/*
 * Third pass over the forward references, meant to check up on the
 * previous two; instead of a hash code it uses hash3(), a genuine compare
 * of the two token sequences.
 *
 * For every position that can still start a run, its chain is followed
 * until hash3() accepts a position; the forward reference of the starting
 * position is then short-circuited to that position, or to zero when the
 * chain runs out.  The resulting table is dumped through
 * db_forward_references().
 */
static void make_forward_references_hash3(void)
{
    size_t start;

    for (start = 0; start + Min_Run_Size < Text_Length(); start++) {
        size_t cand = forward_reference[start];

        /* skip chain members that hash3() does not accept */
        while (cand != 0
        &&     !hash3(&Token_Array[start], &Token_Array[cand])
        ) {
            cand = forward_reference[cand];
        }

        /* cand is the first accepted position, or zero if there is none */
        forward_reference[start] = cand;
    }

    db_forward_references("third hashing");
}
void Read_Input_Files(int argc, const char *argv[], int round) { int n; Init_Text(argc); Init_Token_Array(); /* Assume all texts to be new */ Number_Of_New_Texts = Number_Of_Texts; /* Read the files */ for (n = 0; n < Number_Of_Texts; n++) { const char *fname = argv[n]; struct text *txt = &Text[n]; if (round == 1 && !is_set_option('T')) { fprintf(Output_File, "File %s: ", fname); } txt->tx_fname = fname; txt->tx_pos = 0; txt->tx_start = txt->tx_limit = Text_Length(); if (is_new_old_separator(fname)) { if (round == 1 && !is_set_option('T')) { fprintf(Output_File, "separator\n"); } Number_Of_New_Texts = n; } else { if (!Open_Text(First, txt)) { if (round == 1 && !is_set_option('T')) { fprintf(Output_File, ">>>> cannot open <<<< "); } /* the file has still been opened with a null file for uniformity */ } while (Next_Text_Token_Obtained(First)) { if (!Token_EQ(lex_token, End_Of_Line)) { Store_Token(lex_token); } } Close_Text(First, txt); txt->tx_limit = Text_Length(); /* report */ if (round == 1 && !is_set_option('T')) { fprint_count(Output_File, txt->tx_limit - txt->tx_start, token_name ); fprintf(Output_File, ", "); fprint_count(Output_File, lex_nl_cnt-1, "line"); if (lex_non_ascii_cnt) { fprintf(Output_File, ", "); fprint_count(Output_File, lex_non_ascii_cnt, "non-ASCII character" ); } fprintf(Output_File, "\n"); } #ifdef DB_TEXT db_print_text(txt); #endif /* DB_TEXT */ } fflush(Output_File); } /* report total */ if (round == 1 && !is_set_option('T')) { fprintf(Output_File, "Total: "); fprint_count(Output_File, Text_Length() - 1, token_name); fprintf(Output_File, "\n\n"); fflush(Output_File); } }