Ejemplo n.º 1
0
/*	Builds the forward-reference chains using the first hash function.
	Every position that may start a run is hashed; if the same hash value
	was seen at an earlier position, that earlier position is made to
	refer forward to the current one. The last Min_Run_Size-1 positions
	of each text are not considered. Afterwards last_index is handed
	to Free().
*/
static void
make_forward_references_hash1(void) {
	int text_no = 0;

	/* presumably prepares last_index; its definition is not visible here */
	init_hash_table();

	/* walk all texts, chaining positions with equal hash values */
	while (text_no < Number_Of_Texts) {
		struct text *cur = &Text[text_no];
		size_t pos = cur->tx_start;		/* >= 1 */

		/* stop where fewer than Min_Run_Size tokens remain in cur */
		for (; pos + Min_Run_Size - 1 < cur->tx_limit; pos++) {
			size_t bucket;

			if (!May_Be_Start_Of_Run(Token_Array[pos])) {
				continue;
			}

			bucket = hash1(&Token_Array[pos]);

			/* a zero entry is treated as "no earlier position" */
			if (last_index[bucket] != 0) {
				forward_reference[last_index[bucket]] = pos;
			}
			/* pos is now the most recent position for this hash */
			last_index[bucket] = pos;
		}

		text_no++;
	}
	Free((char *)last_index);

#ifdef	DB_FORW_REF
	db_forward_references("first hashing");
#endif	/* DB_FORW_REF */
}
Ejemplo n.º 2
0
static size_t
lcs(	struct text *txt0,		/* input: starting position */
	size_t i0,
	struct text **tbp,		/* output: position of best run */
	size_t *ibp,
	size_t i_first,		/* no comparison before this pos. */
	size_t i_limit		/* no comparison after this pos. */
) {
	/*	Searches for the longest common substring (not subsequence)
		shared by
			txt0, starting exactly at i0, and
			the tokens in positions i_first .. i_limit-1.
		Candidates are found by following the forward references
		from i0. On success the text and position of the best run
		are stored through tbp and ibp and its size is returned.
		If nothing acceptable is found, 0 is returned and *tbp and
		*ibp are left untouched.
	*/
	struct text *cand_txt = txt0;	/* text containing cand */
	size_t cand = i0;		/* current candidate position */
	size_t best = 0;		/* size of best run so far */

	for (;;) {
		size_t want;	/* size needed to beat the best so far */
		size_t len;	/* size of the run at cand */
		size_t p0;	/* scanning position in txt0 */
		size_t p1;	/* scanning position in cand_txt */
		size_t left;	/* tokens still to be compared */

		/* step to the next candidate; stop at end of chain or range */
		cand = Forward_Reference(cand);
		if (!cand || cand >= i_limit) {
			break;
		}

		/* candidates before the range are skipped, not fatal */
		if (cand < i_first) {
			continue;
		}

		/* only a run longer than the present best is of interest */
		want = (best ? best + 1 : Min_Run_Size);

		/* advance cand_txt to the text that holds cand */
		for (; cand_txt->tx_limit <= cand; cand_txt++) {
			/* skipping a text */
		}

		/*	First test: do the two stretches of want tokens fit
			in their texts without overlapping each other?
		*/
		p0 = i0 + want - 1;
		p1 = cand + want - 1;
		if (p0 >= txt0->tx_limit) {
			continue;		/* runs off txt0 */
		}
		if (p1 >= cand_txt->tx_limit) {
			continue;		/* runs off cand_txt */
		}
		if (p0 + want > p1) {
			continue;		/* stretches would overlap */
		}

		/*	Second test: compare the want tokens from back to
			front; the tokens beyond the old best come first.
		*/
		for (left = want; left != 0; left--, p0--, p1--) {
			if (!Token_EQ(Token_Array[p0], Token_Array[p1])) {
				break;
			}
		}
		if (left != 0) {
			continue;		/* mismatch: no improvement */
		}

		/* we have an improvement; stretch it forwards as far as we can */
		len = want;
		p0 = i0 + want;
		p1 = cand + want;
		for (;;) {
			if (p0 >= txt0->tx_limit) break;	/* end of txt0 */
			if (p1 >= cand_txt->tx_limit) break;	/* end of cand_txt */
			if (p0 + len >= p1) break;		/* would overlap */
			if (!Token_EQ(Token_Array[p0], Token_Array[p1])) break;
			p0++;
			p1++;
			len++;
		}

		/*	the language-dependent part has the last word: it may
			refuse the run or shorten it
		*/
		if (May_Be_Start_Of_Run(Token_Array[i0])) {
			len = Best_Run_Size(&Token_Array[i0], len);
		}
		else {
			len = 0;
		}

		/* too short after all, or no longer an improvement? */
		if (len < Min_Run_Size || len <= best) {
			continue;
		}

		/* remember this one as the best so far */
		*tbp = cand_txt;
		*ibp = cand;
		best = len;
	}

	return best;
}