void fprint_token(FILE *ofile, const Token tk) { /* Prints a regular token in two characters: normal char meta (bit 9 set) ^A cntl $A meta-cntl A printable #A meta and hashed tokens in hexadecimal. */ int tki = Token2int(tk); int ch = tki & 0x7F; int bit8 = tki & 0x80; if (Token_EQ(tk, No_Token)) {fprintf(ofile, "--"); return;} if (Token_EQ(tk, IDF)) {fprintf(ofile, "IDF"); return;} if (Token_EQ(tk, End_Of_Line)) {fprintf(ofile, "EOL"); return;} if (is_simple_token(tk)) { if ('!' <= ch && ch <= '~') { fprintf(ofile, "%s%c", (bit8 ? "8" : ""), ch); return; } if (0 < ch && ch <= ' ') { fprintf(ofile, "%s%c", (bit8 ? "$" : "^"), ch + '@'); return; } if (ch == 0x7F) { fprintf(ofile, "%s%c", (bit8 ? "$" : "^"), '?'); return; } } if (is_CTRL_token(tk)) { if (check_and_print(ofile, "CTRL", ch, 'A', '~', '@')) return; } if (is_NORM_token(tk)) { if (check_and_print(ofile, "NORM", ch, '!', '~', '\0')) return; } if (is_MTCT_token(tk)) { if (check_and_print(ofile, "MTCT", ch, 'A', '~', '@')) return; } if (is_META_token(tk)) { if (check_and_print(ofile, "META", ch, '!', '~', '\0')) return; } if (is_hashed_token(tk)) { fprintf(ofile, "0x%04x", tki); return; } /* gap token! */ fprintf(ofile, "!0x%04x!", tki); }
int Next_Text_Token_Obtained(void) {
	/* Fetches the next token from the stream layer into lex_token.
	   On an End_Of_Line token the newline is recorded and the
	   token count noted.  Returns 1 on success, 0 at end of input. */
	if (Next_Stream_Token_Obtained()) {
		if (Token_EQ(lex_token, End_Of_Line)) {
			store_newline();
			last_tk_cnt = lex_tk_cnt;
		}
		return 1;
	}
	return 0;
}
static int hash3(const Token *p, const Token *q) {
	/* Full element-wise comparison for the tertiary sweep:
	   returns 1 iff the first Min_Run_Size tokens at p and q
	   are pairwise equal, 0 otherwise. */
	size_t i = 0;
	while (i < Min_Run_Size) {
		if (!Token_EQ(p[i], q[i]))
			return 0;
		i++;
	}
	return 1;
}
int Next_Text_Token_Obtained(enum Pass pass) { int ok = 0; /* gcc does not understand enum Pass */ switch (pass) { case First: ok = Next_Stream_Token_Obtained(); if (Token_EQ(lex_token, End_Of_Line)) { store_newline(); last_tk_cnt = lex_tk_cnt; } break; case Second: /* get newline info from the buffer or from the file itself */ if (nl_buff) { if (nl_next == nl_limit) { ok = 0; } else { struct newline *nl = &nl_buff[nl_next++]; lex_nl_cnt = ++last_nl_cnt; lex_tk_cnt = (last_tk_cnt += nl->nl_tk_diff); lex_token = End_Of_Line; ok = 1; } } else { while ( (ok = Next_Stream_Token_Obtained()) && !Token_EQ(lex_token, End_Of_Line) ) { /* skip */ } } break; } return ok; }
int Next_Text_EOL_Obtained(void) {
	/* Advances to the next end-of-line, taking the newline info
	   either from the nl_buff replay buffer or from the file
	   itself.  Returns 1 on success, 0 when input is exhausted. */
	if (!nl_buff) {
		/* no buffer: read stream tokens until an EOL shows up */
		int got;
		do {
			got = Next_Stream_Token_Obtained();
		} while (got && !Token_EQ(lex_token, End_Of_Line));
		return got;
	}

	if (nl_next == nl_limit)
		return 0;	/* replay buffer exhausted */

	{
		struct newline *nl = &nl_buff[nl_next++];
		lex_nl_cnt = ++last_nl_cnt;
		lex_tk_cnt = (last_tk_cnt += nl->nl_tk_diff);
		lex_token = End_Of_Line;
	}
	return 1;
}
void Read_Input_Files(int argc, const char *argv[], int round) {
	/* Reads all argc input files named in argv[] into the global
	   token array, filling in the Text[] table (start/limit
	   positions per file) and detecting the new/old separator.
	   Per-file and total statistics are reported to Output_File,
	   but only on round 1 and only when option 'T' is not set. */
	int n;

	Init_Text(argc);
	Init_Token_Array();

	/* Assume all texts to be new */
	Number_Of_New_Texts = Number_Of_Texts;

	/* Read the files */
	for (n = 0; n < Number_Of_Texts; n++) {
		const char *fname = argv[n];
		struct text *txt = &Text[n];

		if (round == 1 && !is_set_option('T')) {
			fprintf(Output_File, "File %s: ", fname);
		}
		txt->tx_fname = fname;
		txt->tx_pos = 0;
		/* limit is provisional; it is updated after reading */
		txt->tx_start = txt->tx_limit = Text_Length();

		if (is_new_old_separator(fname)) {
			if (round == 1 && !is_set_option('T')) {
				fprintf(Output_File, "separator\n");
			}
			/* everything before this argument counts as new */
			Number_Of_New_Texts = n;
		}
		else {
			if (!Open_Text(First, txt)) {
				if (round == 1 && !is_set_option('T')) {
					fprintf(Output_File, ">>>> cannot open <<<< ");
				}
				/* the file has still been opened with a null
				   file for uniformity */
			}
			/* store all non-EOL tokens of this file */
			while (Next_Text_Token_Obtained(First)) {
				if (!Token_EQ(lex_token, End_Of_Line)) {
					Store_Token(lex_token);
				}
			}
			Close_Text(First, txt);
			txt->tx_limit = Text_Length();

			/* report */
			if (round == 1 && !is_set_option('T')) {
				fprint_count(Output_File,
					txt->tx_limit - txt->tx_start, token_name
				);
				fprintf(Output_File, ", ");
				/* lex_nl_cnt counts from 1, hence the -1 */
				fprint_count(Output_File, lex_nl_cnt-1, "line");
				if (lex_non_ascii_cnt) {
					fprintf(Output_File, ", ");
					fprint_count(Output_File,
						lex_non_ascii_cnt, "non-ASCII character"
					);
				}
				fprintf(Output_File, "\n");
			}
#ifdef	DB_TEXT
			db_print_text(txt);
#endif	/* DB_TEXT */
		}
		fflush(Output_File);
	}

	/* report total */
	if (round == 1 && !is_set_option('T')) {
		fprintf(Output_File, "Total: ");
		/* Text_Length() includes one sentinel position, hence -1
		   — NOTE(review): presumed from the parallel -1 on the
		   per-file line count; confirm against Init_Token_Array */
		fprint_count(Output_File, Text_Length() - 1, token_name);
		fprintf(Output_File, "\n\n");
		fflush(Output_File);
	}
}
static size_t lcs(
	struct text *txt0,		/* input: starting position */
	size_t i0,
	struct text **tbp,		/* output: position of best run */
	size_t *ibp,
	size_t i_first,			/* no comparison before this pos. */
	size_t i_limit			/* no comparison after this pos. */
) {
	/* Finds the longest common substring (not subsequence) in:
	   txt0, starting precisely at i0 and
	   the text from i_first to i_limit-1.
	   Writes the position in tbp and ibp and returns the size.
	   Returns 0 if no common substring is found.

	   Candidate positions come from the Forward_Reference() hash
	   chain; each candidate is first checked backwards over
	   min_size tokens (cheap rejection) and only then extended
	   forwards token by token. */
	struct text *txt1 = txt0;
	size_t i1 = i0;
	size_t size_best = 0;

	while (	/* there is a next opportunity */
		(i1 = Forward_Reference(i1)) &&
		/* it is still in range */
		i1 < i_limit
	) {
		/* only a run strictly longer than the best so far is
		   worth examining; before any match, Min_Run_Size is
		   the threshold */
		size_t min_size= (size_best ? size_best+1 : Min_Run_Size);

		if (i1 < i_first) {
			/* not in range */
			continue;
		}

		/* bump txt1; we may have to skip a text or two */
		/* NOTE(review): assumes i1 always lies inside some
		   Text[] entry so this walk terminates — confirm that
		   the last tx_limit bounds all forward references */
		while (i1 >= txt1->tx_limit) {
			txt1++;
		}

		/* are we looking at something better than we have got? */
		{	/* comparing backwards */
			size_t j0 = i0 + min_size - 1;
			size_t j1 = i1 + min_size - 1;

			if (	/* j0 still inside txt0 */
				j0 < txt0->tx_limit &&
				/* j1 still inside txt1 */
				j1 < txt1->tx_limit &&
				/* j0 and j1 don't overlap */
				j0 + min_size <= j1
			) {	/* there is room enough for a match */
				size_t cnt = min_size;

				/* text matches for at least min_size tokens? */
				while (	cnt &&
					Token_EQ(Token_Array[j0], Token_Array[j1])
				) {
					cnt--, j0--, j1--;
				}
				if (cnt) continue;	/* forget it */
			}
			else continue;			/* forget it */
		}

		/* yes, we are; how long can we make it? */
		size_t new_size = min_size;
		{	/* extending forwards */
			size_t j0 = i0 + min_size;
			size_t j1 = i1 + min_size;

			while (	/* j0 still inside txt0 */
				j0 < txt0->tx_limit &&
				/* j1 still inside txt1 */
				j1 < txt1->tx_limit &&
				/* j0 and j1 don't overlap */
				j0 + new_size < j1 &&
				/* tokens are the same */
				Token_EQ(Token_Array[j0], Token_Array[j1])
			) {
				j0++, j1++, new_size++;
			}
		}

		/* offer the run to the Language Department which may
		   reject it or may cut its tail */
		new_size = (	May_Be_Start_Of_Run(Token_Array[i0]) ?
			Best_Run_Size(&Token_Array[i0], new_size) : 0
		);

		if (	/* we still have something acceptable */
			new_size >= Min_Run_Size &&
			/* it is better still than what we had */
			new_size > size_best
		) {
			/* record it */
			*tbp = txt1;
			*ibp = i1;
			size_best = new_size;
		}
	}
	return size_best;
}
void Read_Input_Files(int argc, const char *argv[]) {
	/* Reads all argc input files named in argv[] into the global
	   token array, filling in the Text[] table (start/limit,
	   EOL-termination flag) and locating the single new/old
	   separator (more than one is fatal).  Per-file statistics
	   are printed unless option 'T' is set; the grand total is
	   always printed. */
	int n;

	Init_Text(argc);
	Init_Token_Array();

	/* Initially assume all texts to be new */
	Number_of_New_Texts = Number_of_Texts;

	/* Read the files */
	for (n = 0; n < Number_of_Texts; n++) {
		const char *fname = argv[n];
		struct text *txt = &Text[n];

		if (!is_set_option('T')) {
			fprintf(Output_File, "File %s: ", fname);
		}
		txt->tx_fname = fname;
		txt->tx_pos = 0;
		/* limit is provisional; it is updated after reading */
		txt->tx_start = Token_Array_Length();
		txt->tx_limit = Token_Array_Length();

		if (is_new_old_separator(fname)) {
			if (!is_set_option('T')) {
				fprintf(Output_File, "new/old separator\n");
			}
			/* only one separator is allowed on the command line */
			if (Number_of_New_Texts == Number_of_Texts) {
				Number_of_New_Texts = n;
			}
			else fatal("more than one new/old separator");
		}
		else {
			int file_opened = 0;

			if (Open_Text(First_Pass, txt)) {
				file_opened = 1;
			}
			else {	/* print a warning */
				if (is_set_option('T')) {
					/* the file name has not yet been
					   printed; print it now */
					fprintf(Output_File, "File %s: ", fname);
				}
				fprintf(Output_File, ">>>> cannot open <<<<\n");
				/* the file has still been opened with a null
				   file for uniformity */
			}
			/* store all non-EOL tokens of this file */
			while (Next_Text_Token_Obtained()) {
				if (!Token_EQ(lex_token, End_Of_Line)) {
					Store_Token(lex_token);
				}
			}
			Close_Text(First_Pass, txt);
			txt->tx_limit = Token_Array_Length();
			/* remember whether the last token was an EOL, for
			   accurate line counting below */
			txt->tx_EOL_terminated = Token_EQ(lex_token, End_Of_Line);

			/* report */
			if (file_opened && !is_set_option('T')) {
				fprint_count(Output_File,
					txt->tx_limit - txt->tx_start, Token_Name
				);
				fprintf(Output_File, ", ");
				/* lex_nl_cnt counts from 1, hence the -1; an
				   unterminated last line adds one more */
				fprint_count(Output_File,
					lex_nl_cnt - 1 + (!txt->tx_EOL_terminated ? 1 : 0),
					"line"
				);
				if (!txt->tx_EOL_terminated) {
					fprintf(Output_File, " (not NL-terminated)");
				}
				if (lex_non_ascii_cnt) {
					fprintf(Output_File, ", ");
					fprint_count(Output_File,
						lex_non_ascii_cnt, "non-ASCII character"
					);
				}
				fprintf(Output_File, "\n");
			}
#ifdef	DB_TEXT
			db_print_text(txt);
#endif	/* DB_TEXT */
		}
		fflush(Output_File);
	}

	/* report total */
	int sep_present = (Number_of_Texts != Number_of_New_Texts);
	fprintf(Output_File, "Total input: ");
	/* the separator argument, if present, is not itself a file */
	fprint_count(Output_File,
		(!sep_present ? Number_of_Texts : Number_of_Texts - 1),
		"file"
	);
	fprintf(Output_File, " (%d new, %d old), ",
		Number_of_New_Texts,
		(!sep_present ? 0 : Number_of_Texts - Number_of_New_Texts - 1)
	);
	/* Token_Array_Length() includes one sentinel position, hence
	   -1 — NOTE(review): presumed from the parallel -1 on the
	   per-file line count; confirm against Init_Token_Array */
	fprint_count(Output_File, Token_Array_Length() - 1, Token_Name);
	fprintf(Output_File, "\n\n");
	fflush(Output_File);
}