Пример #1
0
void
fprint_token(FILE *ofile, const Token tk) {
	/*	Writes a short printable rendering of tk to ofile.

		Special tokens:
			No_Token	"--"
			IDF		"IDF"
			End_Of_Line	"EOL"
		Simple tokens, selected on their low 7 bits; the prefix
		depends on whether bit 0x80 is set:
			'!' .. '~'	the character, prefixed "8" if 0x80 set
			0x01 .. ' '	"^" ("$" if 0x80 set) + (char + '@')
			0x7F		"^?" ("$?" if 0x80 set)
		CTRL, NORM, MTCT and META tokens are handed to
		check_and_print(); if that returns 0 the token falls
		through to the cases below.
		Hashed tokens are written as "0x%04x"; whatever is left
		over is written as "!0x%04x!".
	*/
	const int code = Token2int(tk);
	const int low7 = code & 0x7F;
	const int high_bit = code & 0x80;

	/* the three special tokens have fixed spellings */
	if (Token_EQ(tk, No_Token)) {
		fprintf(ofile, "--");
		return;
	}
	if (Token_EQ(tk, IDF)) {
		fprintf(ofile, "IDF");
		return;
	}
	if (Token_EQ(tk, End_Of_Line)) {
		fprintf(ofile, "EOL");
		return;
	}

	if (is_simple_token(tk)) {
		/* prefix stays NULL when no range applies (e.g. low7 == 0) */
		const char *prefix = NULL;
		int shown = 0;

		if (low7 >= '!' && low7 <= '~') {
			prefix = (high_bit ? "8" : "");
			shown = low7;
		}
		else if (low7 > 0 && low7 <= ' ') {
			prefix = (high_bit ? "$" : "^");
			shown = low7 + '@';
		}
		else if (low7 == 0x7F) {
			prefix = (high_bit ? "$" : "^");
			shown = '?';
		}

		if (prefix != NULL) {
			fprintf(ofile, "%s%c", prefix, shown);
			return;
		}
	}

	/* class-specific renderings; each may decline by returning 0 */
	if (is_CTRL_token(tk) && check_and_print(ofile, "CTRL", low7, 'A', '~', '@')) {
		return;
	}
	if (is_NORM_token(tk) && check_and_print(ofile, "NORM", low7, '!', '~', '\0')) {
		return;
	}
	if (is_MTCT_token(tk) && check_and_print(ofile, "MTCT", low7, 'A', '~', '@')) {
		return;
	}
	if (is_META_token(tk) && check_and_print(ofile, "META", low7, '!', '~', '\0')) {
		return;
	}

	if (is_hashed_token(tk)) {
		fprintf(ofile, "0x%04x", code);
	}
	else {
		/* not covered by any of the above: a gap token */
		fprintf(ofile, "!0x%04x!", code);
	}
}
Пример #2
0
int
Next_Text_Token_Obtained(void) {
	/*	Fetches the next token from the stream.
		Returns 0 if the stream delivered no token, 1 otherwise.
		When the token obtained is End_Of_Line, the newline is
		recorded through store_newline() and last_tk_cnt is set
		to the current lex_tk_cnt.
	*/
	if (Next_Stream_Token_Obtained()) {
		if (Token_EQ(lex_token, End_Of_Line)) {
			store_newline();
			last_tk_cnt = lex_tk_cnt;
		}
		return 1;
	}
	return 0;
}
Пример #3
0
static int
hash3(const Token *p, const Token *q) {
	/*	Full token-by-token comparison, used for the tertiary sweep.
		Returns 1 if the first Min_Run_Size tokens at p and q are
		all equal, 0 as soon as a differing pair is found.
	*/
	size_t idx = 0;

	while (idx < Min_Run_Size) {
		if (!Token_EQ(p[idx], q[idx])) {
			return 0;
		}
		idx++;
	}
	return 1;
}
Пример #4
0
int
Next_Text_Token_Obtained(enum Pass pass) {
	/*	Delivers the next item of the current text.
		First pass:  the next stream token; an End_Of_Line in
			lex_token is recorded through store_newline().
		Second pass: the next end-of-line only, taken from nl_buff
			if that is present, otherwise found by skipping
			stream tokens up to the next End_Of_Line.
		Returns non-zero if an item was obtained, 0 otherwise
		(also for any other value of pass).
	*/
	int got;

	if (pass == First) {
		got = Next_Stream_Token_Obtained();
		/* NOTE(review): lex_token is tested even when got == 0;
		   whether it can still hold a stale End_Of_Line at that
		   point cannot be seen from here -- confirm
		*/
		if (Token_EQ(lex_token, End_Of_Line)) {
			store_newline();
			last_tk_cnt = lex_tk_cnt;
		}
		return got;
	}

	if (pass == Second) {
		if (nl_buff) {
			struct newline *entry;

			/* buffered newline info exhausted? */
			if (nl_next == nl_limit) {
				return 0;
			}

			/* replay the next recorded newline */
			entry = &nl_buff[nl_next++];
			last_nl_cnt++;
			lex_nl_cnt = last_nl_cnt;
			last_tk_cnt += entry->nl_tk_diff;
			lex_tk_cnt = last_tk_cnt;
			lex_token = End_Of_Line;
			return 1;
		}

		/* no buffer: read the file itself, skipping to the next EOL */
		do {
			got = Next_Stream_Token_Obtained();
		} while (got && !Token_EQ(lex_token, End_Of_Line));
		return got;
	}

	return 0;
}
Пример #5
0
int
Next_Text_EOL_Obtained(void) {
	/*	Moves on to the next end-of-line of the current text.
		With nl_buff present the newline info is replayed from it:
		the line and token counters are updated and lex_token is
		set to End_Of_Line; 0 is returned once nl_next has reached
		nl_limit.
		Without nl_buff, stream tokens are skipped until an
		End_Of_Line is seen or the stream delivers nothing more;
		the result of the last stream call is returned.
	*/
	int got;
	struct newline *entry;

	if (!nl_buff) {
		/* read the file itself */
		do {
			got = Next_Stream_Token_Obtained();
		} while (got && !Token_EQ(lex_token, End_Of_Line));
		return got;
	}

	if (nl_next == nl_limit) {
		return 0;
	}

	entry = &nl_buff[nl_next++];
	last_nl_cnt++;
	lex_nl_cnt = last_nl_cnt;
	last_tk_cnt += entry->nl_tk_diff;
	lex_tk_cnt = last_tk_cnt;
	lex_token = End_Of_Line;
	return 1;
}
Пример #6
0
void
Read_Input_Files(int argc, const char *argv[], int round) {
	/*	Reads all input texts named in argv[0 .. Number_Of_Texts-1]
		and stores their tokens, filling in the corresponding
		Text[] entries (file name, position, start and limit).
		A file name recognized by is_new_old_separator() is not
		read; its index becomes Number_Of_New_Texts.
		Progress and totals are written to Output_File only when
		round == 1 and option 'T' is not set.
	*/
	int idx = 0;

	Init_Text(argc);
	Init_Token_Array();

	/* Until a separator shows up, every text counts as new */
	Number_Of_New_Texts = Number_Of_Texts;

	while (idx < Number_Of_Texts) {
		const char *name = argv[idx];
		struct text *cur = &Text[idx];

		if (round == 1 && !is_set_option('T')) {
			fprintf(Output_File, "File %s: ", name);
		}

		/* the text starts out empty, at the present text length */
		cur->tx_fname = name;
		cur->tx_pos = 0;
		cur->tx_limit = Text_Length();
		cur->tx_start = cur->tx_limit;

		if (!is_new_old_separator(name)) {
			if (!Open_Text(First, cur)) {
				if (round == 1 && !is_set_option('T')) {
					fprintf(Output_File,
						">>>> cannot open <<<< ");
				}
				/*	reading goes on regardless: the text
					has still been opened, with a null
					file, for uniformity
				*/
			}

			/* store all tokens except the end-of-line markers */
			while (Next_Text_Token_Obtained(First)) {
				if (Token_EQ(lex_token, End_Of_Line)) {
					continue;
				}
				Store_Token(lex_token);
			}
			Close_Text(First, cur);
			cur->tx_limit = Text_Length();

			/* per-file report: tokens, lines, non-ASCII chars */
			if (round == 1 && !is_set_option('T')) {
				fprint_count(Output_File,
					     cur->tx_limit - cur->tx_start,
					     token_name
				);
				fprintf(Output_File, ", ");
				fprint_count(Output_File, lex_nl_cnt-1, "line");
				if (lex_non_ascii_cnt) {
					fprintf(Output_File, ", ");
					fprint_count(Output_File,
						     lex_non_ascii_cnt,
						     "non-ASCII character"
					);
				}
				fprintf(Output_File, "\n");
			}

#ifdef	DB_TEXT
			db_print_text(cur);
#endif	/* DB_TEXT */
		}
		else {
			/* the separator itself contributes no tokens */
			if (round == 1 && !is_set_option('T')) {
				fprintf(Output_File, "separator\n");
			}
			Number_Of_New_Texts = idx;
		}

		fflush(Output_File);
		idx++;
	}

	/* grand total over all texts */
	if (round == 1 && !is_set_option('T')) {
		fprintf(Output_File, "Total: ");
		fprint_count(Output_File, Text_Length() - 1, token_name);
		fprintf(Output_File, "\n\n");
		fflush(Output_File);
	}
}
Пример #7
0
static size_t
lcs(	struct text *txt0,		/* input: starting position */
	size_t i0,
	struct text **tbp,		/* output: position of best run */
	size_t *ibp,
	size_t i_first,		/* no comparison before this pos. */
	size_t i_limit		/* no comparison after this pos. */
) {
	/*	Finds the longest common substring (not subsequence) in:
			txt0, starting precisely at i0 and
			the text from i_first to i_limit-1.
		Writes the position in tbp and ibp and returns the size.
		Returns 0 if no common substring is found.

		Parameters:
			txt0, i0	text and Token_Array index at which
					the run to be matched starts
			tbp, ibp	receive the text and the Token_Array
					index of the best run; they are
					written only when a run is recorded,
					so they are left untouched when 0
					is returned
			i_first		candidates below this index are skipped
			i_limit		the search stops at the first candidate
					at or beyond this index

		The candidates are the positions delivered by successive
		calls of Forward_Reference(), starting from i0; the search
		also stops when that call yields 0.
		A run is never extended past the tx_limit of its text, and
		the run at i0 and the candidate run may not overlap.
	*/
	struct text *txt1 = txt0;
	size_t i1 = i0;
	size_t size_best = 0;

	while (	/* there is a next opportunity */
		(i1 = Forward_Reference(i1))
	&&	/* it is still in range */
		i1 < i_limit
	) {
		/*	only a run strictly longer than the best so far is of
			interest; with no run yet, Min_Run_Size is the minimum
		*/
		size_t min_size= (size_best ? size_best+1 : Min_Run_Size);

		if (i1 < i_first) {	/* not in range */
			continue;
		}

		/* bump txt1; we may have to skip a text or two */
		/*	NOTE(review): txt1 only ever moves forward, which
			presumably relies on the candidates arriving in
			increasing order -- confirm against Forward_Reference()
		*/
		while (i1 >= txt1->tx_limit) {
			txt1++;
		}

		/* are we looking at something better than we have got? */
		{	/* comparing backwards */
			/* j0 and j1 start at the last token of each window */
			size_t j0 = i0 + min_size - 1;
			size_t j1 = i1 + min_size - 1;
			if (	/* j0 still inside txt0 */
				j0 < txt0->tx_limit
			&&	/* j1 still inside txt1 */
				j1 < txt1->tx_limit
			&&	/* j0 and j1 don't overlap */
				j0 + min_size <= j1
			) {
				/* there is room enough for a match */
				size_t cnt = min_size;

				/* text matches for at least min_size tokens? */
				/* cnt reaches 0 only if all min_size pairs match */
				while (	cnt
				&&	Token_EQ(Token_Array[j0],
						 Token_Array[j1])
				) {
					cnt--, j0--, j1--;
				}
				if (cnt) continue;	/* forget it */
			}
			else continue;			/* forget it */
		}

		/* yes, we are; how long can we make it? */
		size_t new_size = min_size;
		{	/* extending forwards */
			/* j0 and j1 are the first tokens beyond each window */
			size_t j0 = i0 + min_size;
			size_t j1 = i1 + min_size;

			while (	/* j0 still inside txt0 */
				j0 < txt0->tx_limit
			&&	/* j1 still inside txt1 */
				j1 < txt1->tx_limit
			&&	/* j0 and j1 don't overlap */
				j0 + new_size < j1
			&&	/* tokens are the same */
				Token_EQ(Token_Array[j0], Token_Array[j1])
			) {
				j0++, j1++, new_size++;
			}
		}

		/*	offer the run to the Language Department which may
			reject it or may cut its tail
		*/
		/*	a rejected run gets size 0; both outcomes are caught
			by the acceptance test below
		*/
		new_size = (	May_Be_Start_Of_Run(Token_Array[i0])
			   ?	Best_Run_Size(&Token_Array[i0], new_size)
			   :	0
			   );

		if (	/* we still have something acceptable */
			new_size >= Min_Run_Size
		&&	/* it is better still than what we had */
			new_size > size_best
		) {
			/* record it */
			*tbp = txt1;
			*ibp = i1;
			size_best = new_size;
		}
	}

	return size_best;
}
Пример #8
0
void
Read_Input_Files(int argc, const char *argv[]) {
	/*	Reads all input texts named in argv[0 .. Number_of_Texts-1]
		and stores their tokens, filling in the corresponding
		Text[] entries (file name, position, start, limit and
		whether the last token seen was End_Of_Line).
		A file name recognized by is_new_old_separator() is not
		read; its index becomes Number_of_New_Texts, and a second
		such name is reported through fatal().
		Per-file lines are written to Output_File unless option 'T'
		is set; the "cannot open" warning and the final total are
		written regardless.
	*/
	int idx;

	Init_Text(argc);
	Init_Token_Array();

	/* No separator seen yet, so all texts count as new */
	Number_of_New_Texts = Number_of_Texts;

	for (idx = 0; idx < Number_of_Texts; idx++) {
		const char *name = argv[idx];
		struct text *cur = &Text[idx];

		if (!is_set_option('T')) {
			fprintf(Output_File, "File %s: ", name);
		}

		/* the text starts out empty, at the present array length */
		cur->tx_fname = name;
		cur->tx_pos = 0;
		cur->tx_start = Token_Array_Length();
		cur->tx_limit = Token_Array_Length();

		if (!is_new_old_separator(name)) {
			const int file_opened =
				(Open_Text(First_Pass, cur) ? 1 : 0);

			if (!file_opened) {
				/* warn, even under option 'T' */
				if (is_set_option('T')) {
					/* the file name was suppressed above;
					   supply it now
					*/
					fprintf(Output_File, "File %s: ",
						name);
				}
				fprintf(Output_File,
					">>>> cannot open <<<<\n");
				/*	reading goes on regardless: the text
					has still been opened, with a null
					file, for uniformity
				*/
			}

			/* store all tokens except the end-of-line markers */
			while (Next_Text_Token_Obtained()) {
				if (Token_EQ(lex_token, End_Of_Line)) {
					continue;
				}
				Store_Token(lex_token);
			}
			Close_Text(First_Pass, cur);
			cur->tx_limit = Token_Array_Length();
			cur->tx_EOL_terminated =
				Token_EQ(lex_token, End_Of_Line);

			/* per-file report: tokens, lines, non-ASCII chars */
			if (file_opened && !is_set_option('T')) {
				fprint_count(Output_File,
					     cur->tx_limit - cur->tx_start,
					     Token_Name
				);
				fprintf(Output_File, ", ");
				/* an unterminated last line counts as a line */
				fprint_count(Output_File,
					lex_nl_cnt - 1 +
					     (cur->tx_EOL_terminated ? 0 : 1),
					"line"
				);
				if (!cur->tx_EOL_terminated) {
					fprintf(Output_File,
						" (not NL-terminated)");
				}
				if (lex_non_ascii_cnt) {
					fprintf(Output_File, ", ");
					fprint_count(Output_File,
						     lex_non_ascii_cnt,
						     "non-ASCII character"
					);
				}
				fprintf(Output_File, "\n");
			}

#ifdef	DB_TEXT
			db_print_text(cur);
#endif	/* DB_TEXT */
		}
		else {
			if (!is_set_option('T')) {
				fprintf(Output_File, "new/old separator\n");
			}
			/* only the first separator may move the boundary */
			if (Number_of_New_Texts != Number_of_Texts) {
				fatal("more than one new/old separator");
			}
			else {
				Number_of_New_Texts = idx;
			}
		}
		fflush(Output_File);
	}

	/* grand total; the separator, if any, is not counted as a file */
	const int has_separator = (Number_of_Texts != Number_of_New_Texts);

	fprintf(Output_File, "Total input: ");
	fprint_count(Output_File,
		     (has_separator ? Number_of_Texts - 1 : Number_of_Texts),
		     "file"
	);
	fprintf(Output_File, " (%d new, %d old), ",
		Number_of_New_Texts,
		(has_separator ? Number_of_Texts - Number_of_New_Texts - 1 : 0)
	);
	fprint_count(Output_File, Token_Array_Length() - 1, Token_Name);
	fprintf(Output_File, "\n\n");
	fflush(Output_File);
}