コード例 #1
0
ファイル: stext-device.c プロジェクト: ziel/mupdf
/*
 * Append one character to the span the text device is currently building,
 * opening a fresh span first when the character cannot continue it.
 *
 *   ctx   - context, passed through to the span helpers.
 *   dev   - text device; cur_span, spans and lastchar are used.
 *   style - style handed to add_char_to_span() for every char added here.
 *   c     - the character to add.
 *   trm   - matrix for this glyph; e/f are taken as the glyph position.
 *   adv   - advance, applied along the transformed writing direction.
 *   wmode - 0 selects horizontal motion, any other value vertical motion.
 *
 * A ' ' is inserted in front of c when the gap since the end of the
 * current span is between one and SPACE_MAX_DIST/SPACE_DIST space units
 * (horizontal mode only, and only if dev->lastchar is not already ' ').
 */
static void
fz_add_text_char_imp(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_matrix *trm, float adv, int wmode)
{
	fz_point dir, ndir, p, q;
	float size;
	float spacing = 0;
	float base_offset = 0;
	int same_frame;
	int can_append = 1;
	int add_space = 0;

	/* dir is the direction of motion once transformed; ndir is the
	 * same vector normalised. */
	dir.x = (wmode == 0) ? 1 : 0;
	dir.y = (wmode == 0) ? 0 : -1;
	fz_transform_vector(&dir, trm);
	ndir = dir;
	fz_normalize_vector(&ndir);

	size = fz_matrix_expansion(trm);

	/* Work out where the glyph starts (p) and stops (q). A glyph stores
	 * its own start; its end is stored by the following glyph, or by
	 * span->max when it is the last one.
	 *
	 * trm->{e,f} is the bottom left corner of the glyph in both modes:
	 *   horizontal: p = bottom left,  q = bottom right
	 *   vertical:   p = top left (where it advanced from), q = bottom left
	 */
	p.x = trm->e;
	p.y = trm->f;
	q = p;
	if (wmode == 0)
	{
		q.x = trm->e + adv * dir.x;
		q.y = trm->f + adv * dir.y;
	}
	else
	{
		p.x = trm->e - adv * dir.x;
		p.y = trm->f - adv * dir.y;
	}

	/* Appending requires an existing span with the same a/b/c/d matrix
	 * coefficients and the same writing mode. */
	same_frame = dev->cur_span != NULL &&
		trm->a == dev->cur_span->transform.a && trm->b == dev->cur_span->transform.b &&
		trm->c == dev->cur_span->transform.c && trm->d == dev->cur_span->transform.d &&
		dev->cur_span->wmode == wmode;

	if (!same_frame)
	{
#ifdef DEBUG_SPANS
		printf("Transform/WMode changed\n");
#endif
		can_append = 0;
	}
	else
	{
		fz_point delta;

		/* Movement since the end of the current span. */
		delta.x = p.x - dev->cur_span->max.x;
		delta.y = p.y - dev->cur_span->max.y;

		/* Same transform means same direction, so split the movement
		 * into a part along the baseline (spacing) and a part
		 * perpendicular to it (base_offset). */
		spacing = ndir.x * delta.x + ndir.y * delta.y;
		base_offset = -ndir.y * delta.x + ndir.x * delta.y;

		/* Express the along-baseline part in space units, unsigned. */
		spacing /= size * SPACE_DIST;
		spacing = fabsf(spacing);

		if (!(fabsf(base_offset) < size * 0.1))
		{
			/* Too far off the baseline to belong to this span. */
			can_append = 0;
#ifdef DEBUG_SPANS
			spacing = 0;
#endif
		}
		else if (spacing < 1.0)
		{
			/* In line and close: plain append. */
		}
		else if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST))
		{
			/* In line with a gap wide enough to count as a space. */
			add_space = (dev->lastchar != ' ' && wmode == 0);
		}
		else
		{
			/* In line, but the gap is too wide: start a new span. */
			can_append = 0;
		}
	}

#ifdef DEBUG_SPANS
	printf("%c%c append=%d space=%d size=%g spacing=%g base_offset=%g\n", dev->lastchar, c, can_append, add_space, size, spacing, base_offset);
#endif

	if (!can_append)
	{
		/* Hand the finished span over and begin a new one at p.
		 * cur_span is cleared before the allocation; presumably so the
		 * device does not keep a pointer to the handed-over span if
		 * fz_new_text_span() fails - TODO confirm. */
		add_span_to_soup(ctx, dev->spans, dev->cur_span);
		dev->cur_span = NULL;
		dev->cur_span = fz_new_text_span(ctx, &p, wmode, trm);
		dev->cur_span->spacing = 0;
	}

	if (add_space)
	{
		/* The synthetic space starts at p; its end point is (-0.2, 0)
		 * taken through trm. */
		fz_point r;

		r.x = - 0.2f;
		r.y = 0;
		fz_transform_point(&r, trm);
		add_char_to_span(ctx, dev->cur_span, ' ', &p, &r, style);
	}

	add_char_to_span(ctx, dev->cur_span, c, &p, &q, style);
}
コード例 #2
0
/*
 * Analyse the layout of the text blocks on a page, in place.
 *
 * Only blocks whose type is FZ_PAGE_BLOCK_TEXT are visited. The passes are:
 *   - paragraph detection: record line->distance per character style,
 *     reduce that with cull_line_heights(), then call split_block() at
 *     the first line (with a non-zero distance) of a block whose distance
 *     is not within 5% of the height returned for any style used on it,
 *     or which has a span that is_list_entry() accepts;
 *   - region analysis: build a region mask per line, sort, merge and
 *     align the masks, then match every line against the merged set and
 *     record column/align/indent/column_width on its spans;
 *   - collation: merge a line into the previously retained line of the
 *     same block when both share a region mask and the later line stays
 *     within one column, drop the emptied lines, and finally reduce the
 *     per-span indents so that only the first span of a line can carry
 *     a non-zero indent.
 *
 * ctx is handed to the allocation/splitting helpers. sheet is not
 * referenced anywhere in this function body.
 *
 * NOTE(review): lh and rms are released only on the straight-line path;
 * whether the helpers called in between can exit non-locally is not
 * visible here - confirm.
 */
void
fz_analyze_text(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
{
	fz_text_line *line;
	fz_text_span *span;
	line_heights *lh;
	region_masks *rms;
	int block_num;

	/* Simple paragraph analysis; look for the most common 'inter line'
	 * spacing. This will be assumed to be our line spacing. Anything
	 * more than 25% wider than this will be assumed to be a paragraph
	 * space. */

	/* Step 1: Gather the line height information */
	lh = new_line_heights(ctx);
	for (block_num = 0; block_num < page->len; block_num++)
	{
		fz_text_block *block;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		for (line = block->lines; line < block->lines + block->len; line++)
		{
			/* For every style in the line, add lineheight to the
			 * record for that style. FIXME: This is a nasty n^2
			 * algorithm at the moment. */
			fz_text_style *style = NULL;

			/* A zero distance carries no line height information. */
			if (line->distance == 0)
				continue;

			for (span = line->first_span; span; span = span->next)
			{
				/* char_num is not initialised here; the loop below
				 * relies on is_list_entry() having stored a start
				 * index through the pointer - TODO confirm. */
				int char_num;

				if (is_list_entry(line, span, &char_num))
					goto list_entry;

				for (; char_num < span->len; char_num++)
				{
					fz_text_char *chr = &span->text[char_num];

					/* Ignore any whitespace chars */
					if (is_unicode_wspace(chr->c))
						continue;

					if (chr->style != style)
					{
						/* Have we had this style before? */
						int match = 0;
						fz_text_span *span2;
						/* Look through every earlier span of this line.
						 * The break below only leaves the character
						 * loop, so later spans are still scanned after
						 * a match has been found. */
						for (span2 = line->first_span; span2 != span; span2 = span2->next)
						{
							int char_num2;
							for (char_num2 = 0; char_num2 < span2->len; char_num2++)
							{
								fz_text_char *chr2 = &span2->text[char_num2];
								if (chr2->style == chr->style)
								{
									match = 1;
									break;
								}
							}
						}
						/* Then look at the characters of the current span
						 * that precede this one. */
						if (char_num > 0 && match == 0)
						{
							fz_text_span *span2 = span;
							int char_num2;
							for (char_num2 = 0; char_num2 < char_num; char_num2++)
							{
								fz_text_char *chr2 = &span2->text[char_num2];
								if (chr2->style == chr->style)
								{
									match = 1;
									break;
								}
							}
						}
						/* First sighting of this style on the line:
						 * record the line's distance against it. */
						if (match == 0)
							insert_line_height(lh, chr->style, line->distance);
						style = chr->style;
					}
				}
				/* Target for spans accepted by is_list_entry(): their
				 * characters are skipped, the next span is still visited. */
list_entry:
				{}
			}
		}
	}

	/* Step 2: Find the most popular line height for each style */
	cull_line_heights(lh);

	/* Step 3: Run through the blocks, breaking each block into two if
	 * the line height isn't right. */
	for (block_num = 0; block_num < page->len; block_num++)
	{
		int line_num;
		fz_text_block *block;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		for (line_num = 0; line_num < block->len; line_num++)
		{
			/* For every style in the line, check to see if lineheight
			 * is correct for that style. FIXME: We check each style
			 * more than once, currently. */
			int ok = 0; /* -1 = early exit, split now. 0 = split. 1 = don't split. */
			fz_text_style *style = NULL;
			line = &block->lines[line_num];

			if (line->distance == 0)
				continue;

#ifdef DEBUG_LINE_HEIGHTS
			printf("line height=%g\n", line->distance);
#endif
			for (span = line->first_span; span; span = span->next)
			{
				int char_num;

				/* A list entry always forces a split at this line. */
				if (is_list_entry(line, span, &char_num))
					goto force_paragraph;

				/* Now we do the rest of the line */
				for (; char_num < span->len; char_num++)
				{
					fz_text_char *chr = &span->text[char_num];

					/* Ignore any whitespace chars */
					if (is_unicode_wspace(chr->c))
						continue;

					if (chr->style != style)
					{
						/* One style whose expected step is within 5% of
						 * this line's distance is enough to keep the
						 * line in the block. */
						float proper_step = line_height_for_style(lh, chr->style);
						if (proper_step * 0.95 <= line->distance && line->distance <= proper_step * 1.05)
						{
							ok = 1;
							break;
						}
						style = chr->style;
					}
				}
				if (ok)
					break;
			}
			if (!ok)
			{
force_paragraph:
				/* Split at this line and stop scanning this block.
				 * Presumably the split-off lines form a later block
				 * that the outer loop reaches - TODO confirm in
				 * split_block(). */
				split_block(ctx, page, block_num, line_num);
				break;
			}
		}
	}
	free_line_heights(lh);

	/* Simple line region analysis:
	 * For each line:
	 *	form a list of 'start/stop' points (henceforth a 'region mask')
	 *	find the normalised baseline vector for the line.
	 *	Store the region mask and baseline vector.
	 * Collate lines that have compatible region masks and identical
	 * baseline vectors.
	 * If the collated masks are column-like, then split into columns.
	 * Otherwise split into tables.
	 */
	rms = new_region_masks(ctx);

	/* Step 1: Form the region masks and store them into a list with the
	 * normalised baseline vectors. */
	for (block_num = 0; block_num < page->len; block_num++)
	{
		fz_text_block *block;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		for (line = block->lines; line < block->lines + block->len; line++)
		{
			fz_point blv;
			region_mask *rm;

#ifdef DEBUG_MASKS
			printf("Line: ");
			dump_line(line);
#endif
			/* Baseline vector: normalised (max - min) of the line's
			 * first span. first_span is dereferenced unchecked. */
			blv = line->first_span->max;
			blv.x -= line->first_span->min.x;
			blv.y -= line->first_span->min.y;
			fz_normalize_vector(&blv);

			rm = new_region_mask(ctx, &blv);
			for (span = line->first_span; span; span = span->next)
			{
				fz_point *region_min = &span->min;
				fz_point *region_max = &span->max;

				/* Treat adjacent spans as one big region */
				while (span->next && span->next->spacing < 1.5)
				{
					span = span->next;
					region_max = &span->max;
				}

				region_mask_add(rm, region_min, region_max);
			}
#ifdef DEBUG_MASKS
			dump_region_mask(rm);
#endif
			region_masks_add(rms, rm);
		}
	}

	/* Step 2: Sort the region_masks by size of masked region */
	region_masks_sort(rms);

#ifdef DEBUG_MASKS
	printf("Sorted list of regions:\n");
	dump_region_masks(rms);
#endif
	/* Step 3: Merge the region masks where possible (large ones first) */
	{
		int i;
		region_masks *rms2;
		rms2 = new_region_masks(ctx);
		for (i=0; i < rms->len; i++)
		{
			/* Each mask is taken out of rms (its slot is cleared)
			 * before being passed to region_masks_merge(). */
			region_mask *rm = rms->mask[i];
			rms->mask[i] = NULL;
			region_masks_merge(rms2, rm);
		}
		free_region_masks(rms);
		rms = rms2;
	}

#ifdef DEBUG_MASKS
	printf("Merged list of regions:\n");
	dump_region_masks(rms);
#endif

	/* Step 4: Figure out alignment */
	region_masks_alignment(rms);

	/* Step 5: At this point, we should probably look at the region masks
	 * to try to guess which ones represent columns on the page. With our
	 * current code, we could only get blocks of lines that span 2 or more
	 * columns if the PDF producer wrote text out horizontally across 2
	 * or more columns, and we've never seen that (yet!). So we skip this
	 * step for now. */

	/* Step 6: Run through the lines again, deciding which ones fit into
	 * which region mask. */
	{
	region_mask *prev_match = NULL;
	for (block_num = 0; block_num < page->len; block_num++)
	{
		fz_text_block *block;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		for (line = block->lines; line < block->lines + block->len; line++)
		{
			fz_point blv;
			region_mask *rm;
			region_mask *match;

			blv = line->first_span->max;
			blv.x -= line->first_span->min.x;
			blv.y -= line->first_span->min.y;
			fz_normalize_vector(&blv);

#ifdef DEBUG_MASKS
			dump_line(line);
#endif
			/* Rebuild this line's own mask, the same way as in step 1,
			 * to use as the key for the lookup below. */
			rm = new_region_mask(ctx, &blv);
			for (span = line->first_span; span; span = span->next)
			{
				fz_point *region_min = &span->min;
				fz_point *region_max = &span->max;

				/* Treat adjacent spans as one big region */
				while (span->next && span->next->spacing < 1.5)
				{
					span = span->next;
					region_max = &span->max;
				}

				region_mask_add(rm, region_min, region_max);
			}
#ifdef DEBUG_MASKS
			printf("Mask: ");
			dump_region_mask(rm);
#endif
			/* The previous line's match is passed along as well. */
			match = region_masks_match(rms, rm, line, prev_match);
			prev_match = match;
#ifdef DEBUG_MASKS
			printf("Matches: ");
			dump_region_mask(match);
#endif
			free_region_mask(rm);
			span = line->first_span;
			while (span)
			{
				fz_point *region_min = &span->min;
				fz_point *region_max = &span->max;
				fz_text_span *sn;
				int col, align;
				float colw, left;

				/* Treat adjacent spans as one big region */
#ifdef DEBUG_ALIGN
				dump_span(span);
#endif
				/* sn ends up on the first span that does not belong
				 * to this region (or NULL at the end of the line). */
				for (sn = span->next; sn && sn->spacing < 1.5; sn = sn->next)
				{
					region_max = &sn->max;
#ifdef DEBUG_ALIGN
					dump_span(sn);
#endif
				}
				col = region_mask_column(match, region_min, region_max, &align, &colw, &left);
#ifdef DEBUG_ALIGN
				printf(" = col%d colw=%g align=%d\n", col, colw, align);
#endif
				/* Stamp the result on every span of the region. */
				do
				{
					span->column = col;
					span->align = align;
					span->indent = left;
					span->column_width = colw;
					span = span->next;
				}
				while (span != sn);

				/* NOTE(review): at this point span == sn, i.e. the
				 * first span of the next region. The advance below
				 * steps past it, so that span is not assigned
				 * column/align/indent/column_width by this loop and
				 * the next region starts one span late - confirm
				 * whether this is intended. */
				if (span)
					span = span->next;
			}
			line->region = match;
		}
	}
	free_region_masks(rms);
	}

	/* Step 7: Collate lines within a block that share the same region
	 * mask. */
	for (block_num = 0; block_num < page->len; block_num++)
	{
		int line_num;
		int prev_line_num;

		fz_text_block *block;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		/* First merge lines. This may leave empty lines behind. */
		/* prev_line_num is the index of the line that later lines are
		 * merged into; it only moves forward when a line is kept. */
		for (prev_line_num = 0, line_num = 1; line_num < block->len; line_num++)
		{
			fz_text_line *prev_line;
			line = &block->lines[line_num];
			if (!line->first_span)
				continue;
			prev_line = &block->lines[prev_line_num];
			if (prev_line->region == line->region)
			{
				/* We only merge lines if the second line
				 * only uses 1 of the columns. */
				int col = line->first_span->column;
				/* Copy the left value for the first span
				 * in the first column in this line forward
				 * for all the rest of the spans in the same
				 * column. */
				float indent = line->first_span->indent;
				for (span = line->first_span->next; span; span = span->next)
				{
					if (col != span->column)
						break;
					span->indent = indent;
				}
				/* A non-NULL span here means a second column was
				 * found: keep this line as it is. */
				if (span)
				{
					prev_line_num = line_num;
					continue;
				}

				/* Merge line into prev_line */
				{
					/* prev_line_span points at the link (first_span or
					 * some span's next field) where the next insertion
					 * into prev_line would happen. */
					fz_text_span **prev_line_span = &prev_line->first_span;
					int try_dehyphen = -1;
					fz_text_span *prev_span = NULL;
					span = line->first_span;
					while (span && *prev_line_span)
					{
						/* Skip forwards through the original
						 * line, until we find a place where
						 * span should go. */
						if ((*prev_line_span)->column <= span->column)
						{
							/* The current span we are considering
							 * in prev_line is earlier than span.
							 * Just skip forwards in prev_line. */
							prev_span = (*prev_line_span);
							prev_line_span = &prev_span->next;
							try_dehyphen = span->column;
						}
						else
						{
							/* We want to copy span into prev_line. */
							/* NOTE(review): 'next' is the successor of
							 * the span currently at this link, and span
							 * is stored over that link below; what
							 * happens to the span previously linked
							 * here is not obvious from this code -
							 * confirm. */
							fz_text_span *next = (*prev_line_span)->next;

							if (prev_line_span == &prev_line->first_span)
								prev_line->first_span = span;
							if (next == NULL)
								prev_line->last_span = span;
							if (try_dehyphen == span->column)
								dehyphenate(prev_span, span);
							try_dehyphen = -1;
							prev_span = *prev_line_span = span;
							span = span->next;
							(*prev_line_span)->next = next;
							prev_line_span = &(*prev_line_span)->next;
						}
					}
					/* prev_line ran out first: append what is left of
					 * the merged line to its tail. */
					if (span)
					{
						*prev_line_span = span;
						prev_line->last_span = line->last_span;
					}

					/* Mark the merged line as empty so that the
					 * compaction pass below drops it. */
					line->first_span = NULL;
					line->last_span = NULL;
				}
			}
			else
				prev_line_num = line_num;
		}

		/* Now get rid of the empty lines */
		/* prev_line_num is reused here as the write index. */
		for (prev_line_num = 0, line_num = 0; line_num < block->len; line_num++)
		{
			line = &block->lines[line_num];
			if (line->first_span)
				block->lines[prev_line_num++] = *line;
		}
		block->len = prev_line_num;

		/* Now try to spot indents */
		for (line_num = 0; line_num < block->len; line_num++)
		{
			fz_text_span *span_num, *sn;
			int col, count;
			line = &block->lines[line_num];

			/* Run through the spans... */
			span_num = line->first_span;
			/* This brace block is not a loop: only the leading group of
			 * spans sharing the first span's column is examined. */
			{
				float indent = 0;
				/* For each set of spans that share the same
				 * column... */
				col = span_num->column;
#ifdef DEBUG_INDENTS
				printf("Indent %g: ", span_num->indent);
				dump_span(span_num);
				printf("\n");
#endif

				/* find the average indent of all but the first.. */
				for (sn = span_num->next, count = 0; sn && sn->column == col; sn = sn->next, count++)
				{
#ifdef DEBUG_INDENTS
					printf("Indent %g: ", sn->indent);
					dump_span(sn);
				printf("\n");
#endif
					indent += sn->indent;
					sn->indent = 0;
				}
				/* sn moved at least once, so count is non-zero here. */
				if (sn != span_num->next)
					indent /= count;

				/* And compare this indent with the first one... */
#ifdef DEBUG_INDENTS
				printf("Average indent %g ", indent);
#endif
				indent -= span_num->indent;
#ifdef DEBUG_INDENTS
				printf("delta %g ", indent);
#endif
				if (fabsf(indent) < 1)
				{
					/* No indent worth speaking of */
					indent = 0;
				}
#ifdef DEBUG_INDENTS
				printf("recorded %g\n", indent);
#endif
				span_num->indent = indent;
				span_num = sn;
			}
			/* Every remaining span on the line gets a zero indent. */
			for (; span_num; span_num = span_num->next)
			{
				span_num->indent = 0;
			}
		}
	}
}
コード例 #3
0
ファイル: stext-device.c プロジェクト: ziel/mupdf
/*
 * Take every span out of the device's span soup, in order, and hand it to
 * push_span() together with two decisions made against the previous span:
 * whether it starts a new line, and the perpendicular distance from the
 * previous line's baseline. span->spacing is filled in on the way.
 *
 * Does nothing when tdev->spans is NULL. Each soup slot is set to NULL as
 * its span is taken.
 */
static void
strain_soup(fz_context *ctx, fz_text_device *tdev)
{
	span_soup *soup = tdev->spans;
	fz_text_line *prev_line = NULL;
	fz_text_span *prev_span = NULL;
	int i;

	if (!soup)
		return;

	/* Really dumb implementation to match what we had before */
	for (i = 0; i < soup->len; i++)
	{
		fz_text_span *cur = soup->spans[i];
		float baseline_gap = 0;
		float gap = 0;
		int starts_line = 1;

		soup->spans[i] = NULL;

		/* The very first span always starts a line. After that there is
		 * always a previous line to compare against. */
		if (prev_span != NULL)
		{
			fz_text_span *head = prev_line->first_span;
			fz_point line_dir, span_dir, perp;
			float dot;
			float size = fz_matrix_expansion(&cur->transform);

#ifdef DEBUG_SPANS
			{
				printf("Comparing: \"");
				dump_span(prev_span);
				printf("\" and \"");
				dump_span(cur);
				printf("\"\n");
			}
#endif

			/* Unit directions of the previous line (taken from its
			 * first span) and of the incoming span. */
			line_dir.x = head->max.x - head->min.x;
			line_dir.y = head->max.y - head->min.y;
			fz_normalize_vector(&line_dir);
			span_dir.x = cur->max.x - cur->min.x;
			span_dir.y = cur->max.y - cur->min.y;
			fz_normalize_vector(&span_dir);
#ifdef DEBUG_SPANS
			printf("last_span=%g %g -> %g %g = %g %g\n", prev_span->min.x, prev_span->min.y, prev_span->max.x, prev_span->max.y, line_dir.x, line_dir.y);
			printf("span     =%g %g -> %g %g = %g %g\n", cur->min.x, cur->min.y, cur->max.x, cur->max.y, span_dir.x, span_dir.y);
#endif
			/* Vector between the two start points, turned through a
			 * right angle. */
			perp.y = head->min.x - cur->min.x;
			perp.x = -(head->min.y - cur->min.y);

			/* Near-parallel directions are required before the span
			 * can share the line. Otherwise starts_line and
			 * baseline_gap keep their initial values (1 and 0). */
			dot = line_dir.x * span_dir.x + line_dir.y * span_dir.y;
			if (fabsf(dot) > 0.9995)
			{
				/* Dotting the unit line direction with the rotated
				 * offset gives the perpendicular distance between
				 * the two (parallel) baselines. Small distances are
				 * tolerated to cope with super/subscript. FIXME: We
				 * should gather subscript/superscript information
				 * here. */
				baseline_gap = line_dir.x * perp.x + line_dir.y * perp.y;
				starts_line = (fabsf(baseline_gap) > size * LINE_DIST);
			}

			if (!starts_line)
			{
				fz_point delta;

				/* Distance along the line from the end of the
				 * previous span to the start of this one. */
				delta.x = cur->min.x - prev_span->max.x;
				delta.y = cur->min.y - prev_span->max.y;
				gap = fabsf(line_dir.x * delta.x + line_dir.y * delta.y);

				/* A baseline shift is only accepted when the gap
				 * along the line is small. */
				if (!(gap * fabsf(baseline_gap) > size * LINE_DIST && fabsf(baseline_gap) > size * 0.1f))
				{
					gap /= size * SPACE_DIST;
					/* Same rule as when chars are added to build
					 * spans: a word-sized gap counts as exactly one
					 * space. */
					if (gap >= 1 && gap < (SPACE_MAX_DIST/SPACE_DIST))
						gap = 1;
				}
				else
				{
					starts_line = 1;
					baseline_gap = 0;
					gap = 0;
				}
			}
#ifdef DEBUG_SPANS
			printf("dot=%g new_line=%d distance=%g size=%g spacing=%g\n", dot, starts_line, baseline_gap, size, gap);
#endif
		}

		cur->spacing = gap;
		prev_line = push_span(ctx, tdev, cur, starts_line, baseline_gap);
		prev_span = cur;
	}
}