/*
 * Add one character to the span the device is currently building,
 * starting a fresh span first when the glyph cannot be joined to it.
 *
 * style, c: forwarded to add_char_to_span for the new character.
 * trm:      glyph matrix; a..d are compared against the current span's
 *           transform, and e/f are used as the glyph origin.
 * adv:      advance of the glyph, scaled by the transformed direction
 *           vector to locate the far end of the glyph.
 * wmode:    0 for horizontal motion, anything else for vertical.
 *
 * A synthetic ' ' is inserted before the character when the gap since
 * the end of the current span is at least one "space" wide but still
 * short enough to stay in the span (horizontal mode only, and only if
 * the previous character was not already a space).
 */
static void
fz_add_text_char_imp(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_matrix *trm, float adv, int wmode)
{
	int horizontal = (wmode == 0);
	int joinable = 1;
	int want_space = 0;
	fz_point motion, unit, start, end;
	float scale;
	float gap = 0;
	float drift = 0;

	/* motion = direction of text advance; unit = normalised(motion) */
	motion.x = horizontal ? 1 : 0;
	motion.y = horizontal ? 0 : -1;
	fz_transform_vector(&motion, trm);
	unit = motion;
	fz_normalize_vector(&unit);

	scale = fz_matrix_expansion(trm);

	/* Work out where the glyph 'starts' and 'stops'. Each glyph holds
	 * its 'start' position, and the next glyph in the span (or
	 * span->max if there is no next glyph) holds its 'end' position.
	 *
	 * In both modes trm->{e,f} is the bottom left corner of the glyph.
	 *
	 * Vertical:   start is top left (where it advanced from),
	 *             end is bottom left.
	 * Horizontal: start is bottom left, end is bottom right.
	 */
	if (!horizontal)
	{
		start.x = trm->e - adv * motion.x;
		start.y = trm->f - adv * motion.y;
		end.x = trm->e;
		end.y = trm->f;
	}
	else
	{
		start.x = trm->e;
		start.y = trm->f;
		end.x = trm->e + adv * motion.x;
		end.y = trm->f + adv * motion.y;
	}

	if (dev->cur_span == NULL ||
		dev->cur_span->transform.a != trm->a ||
		dev->cur_span->transform.b != trm->b ||
		dev->cur_span->transform.c != trm->c ||
		dev->cur_span->transform.d != trm->d ||
		dev->cur_span->wmode != wmode)
	{
		/* No span yet, or the matrix/writing mode differs from the
		 * span's: joining is impossible. */
#ifdef DEBUG_SPANS
		printf("Transform/WMode changed\n");
#endif
		joinable = 0;
	}
	else
	{
		/* Displacement from the end of the current span to the
		 * start of this glyph. */
		fz_point jump;

		jump.x = start.x - dev->cur_span->max.x;
		jump.y = start.y - dev->cur_span->max.y;

		/* Same transform means same direction, so split the jump
		 * into a component along the baseline (gap) and one
		 * perpendicular to it (drift). The gap is then expressed as
		 * an unsigned multiple of a space width. */
		gap = unit.x * jump.x + unit.y * jump.y;
		drift = -unit.y * jump.x + unit.x * jump.y;

		gap /= scale * SPACE_DIST;
		gap = fabsf(gap);

		if (!(fabsf(drift) < scale * 0.1))
		{
			/* Too far off the baseline. */
			joinable = 0;
#ifdef DEBUG_SPANS
			gap = 0;
#endif
		}
		else if (gap >= 1 && gap < (SPACE_MAX_DIST/SPACE_DIST))
		{
			/* On the baseline, far enough along to justify a
			 * space but not so far that the span must end. */
			if (horizontal && dev->lastchar != ' ')
				want_space = 1;
		}
		else if (!(gap < 1.0))
		{
			/* On the baseline but too far along: new span. */
			joinable = 0;
		}
		/* Otherwise the motion is in line and small: plain append. */
	}

#ifdef DEBUG_SPANS
	printf("%c%c append=%d space=%d size=%g spacing=%g base_offset=%g\n", dev->lastchar, c, joinable, want_space, scale, gap, drift);
#endif

	if (!joinable)
	{
		/* Hand the finished span to the soup and begin a new one.
		 * cur_span is cleared before the allocation so it does not
		 * keep referring to the span just handed over while
		 * fz_new_text_span runs. */
		add_span_to_soup(ctx, dev->spans, dev->cur_span);
		dev->cur_span = NULL;
		dev->cur_span = fz_new_text_span(ctx, &start, wmode, trm);
		dev->cur_span->spacing = 0;
	}

	if (want_space)
	{
		/* End point recorded for the synthetic space. */
		fz_point space_end;

		space_end.x = - 0.2f;
		space_end.y = 0;
		fz_transform_point(&space_end, trm);
		add_char_to_span(ctx, dev->cur_span, ' ', &start, &space_end, style);
	}

	add_char_to_span(ctx, dev->cur_span, c, &start, &end, style);
}
/*
 * Post-process the text blocks of a page, in place, in three phases:
 *
 *  1. Paragraph analysis: record a line height per style, then split a
 *     block at the first line whose distance does not fit any style on
 *     that line (or which is_list_entry() reports as a list entry).
 *  2. Region analysis: build a region mask per line, merge the masks,
 *     match every line to a merged mask and store column, alignment,
 *     indent and column width on the line's spans.
 *  3. Collation: within each block, merge consecutive lines that share
 *     a region, drop the lines emptied by merging, and reduce the span
 *     indents on each line to a single value on the first span.
 *
 * ctx:   passed through to the helpers that need a context.
 * sheet: not referenced anywhere in this function.
 * page:  only blocks of type FZ_PAGE_BLOCK_TEXT are examined; all other
 *        blocks are skipped in every loop.
 */
void fz_analyze_text(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
{
	fz_text_line *line;
	fz_text_span *span;
	line_heights *lh;
	region_masks *rms;
	int block_num;

	/* Simple paragraph analysis; look for the most common 'inter line'
	 * spacing. This will be assumed to be our line spacing. Anything
	 * that differs noticeably from it will be assumed to be a paragraph
	 * space.
	 * NOTE(review): this comment used to say "more than 25% wider", but
	 * the test in Step 3 below accepts a line only when its distance is
	 * within 5% (either way) of the style's line height. */

	/* Step 1: Gather the line height information */
	lh = new_line_heights(ctx);
	for (block_num = 0; block_num < page->len; block_num++)
	{
		fz_text_block *block;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		for (line = block->lines; line < block->lines + block->len; line++)
		{
			/* For every style in the line, add lineheight to the
			 * record for that style. FIXME: This is a nasty n^2
			 * algorithm at the moment. */
			fz_text_style *style = NULL;

			/* Lines with a zero distance contribute nothing. */
			if (line->distance == 0)
				continue;

			for (span = line->first_span; span; span = span->next)
			{
				int char_num;

				/* is_list_entry also sets char_num, which is where
				 * the character scan below starts. A list entry
				 * skips the character scan for this span. */
				if (is_list_entry(line, span, &char_num))
					goto list_entry;

				for (; char_num < span->len; char_num++)
				{
					fz_text_char *chr = &span->text[char_num];

					/* Ignore any whitespace chars */
					if (is_unicode_wspace(chr->c))
						continue;

					if (chr->style != style)
					{
						/* Have we had this style before? First
						 * look through all earlier spans on the
						 * line... */
						int match = 0;
						fz_text_span *span2;
						for (span2 = line->first_span; span2 != span; span2 = span2->next)
						{
							int char_num2;
							for (char_num2 = 0; char_num2 < span2->len; char_num2++)
							{
								fz_text_char *chr2 = &span2->text[char_num2];
								if (chr2->style == chr->style)
								{
									match = 1;
									break;
								}
							}
						}
						/* ...then through the characters of this
						 * span that precede the current one. */
						if (char_num > 0 && match == 0)
						{
							fz_text_span *span2 = span;
							int char_num2;
							for (char_num2 = 0; char_num2 < char_num; char_num2++)
							{
								fz_text_char *chr2 = &span2->text[char_num2];
								if (chr2->style == chr->style)
								{
									match = 1;
									break;
								}
							}
						}
						/* Record the line distance once per style
						 * per line. */
						if (match == 0)
							insert_line_height(lh, chr->style, line->distance);
						style = chr->style;
					}
				}
			/* Empty statement so the label has something to attach to;
			 * the span loop then moves on to the next span. */
			list_entry:
				{}
			}
		}
	}

	/* Step 2: Find the most popular line height for each style */
	cull_line_heights(lh);

	/* Step 3: Run through the blocks, breaking each block into two if
	 * the line height isn't right. */
	for (block_num = 0; block_num < page->len; block_num++)
	{
		int line_num;
		fz_text_block *block;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		for (line_num = 0; line_num < block->len; line_num++)
		{
			/* For every style in the line, check to see if lineheight
			 * is correct for that style. FIXME: We check each style
			 * more than once, currently. */
			int ok = 0; /* 0 = split. 1 = don't split. (The value -1,
				     * "early exit, split now", mentioned by the
				     * original comment is never assigned here;
				     * the list-entry case jumps straight to
				     * force_paragraph instead.) */
			fz_text_style *style = NULL;
			line = &block->lines[line_num];

			if (line->distance == 0)
				continue;

#ifdef DEBUG_LINE_HEIGHTS
			printf("line height=%g\n", line->distance);
#endif
			for (span = line->first_span; span; span = span->next)
			{
				int char_num;

				if (is_list_entry(line, span, &char_num))
					goto force_paragraph;

				/* Now we do the rest of the line */
				for (; char_num < span->len; char_num++)
				{
					fz_text_char *chr = &span->text[char_num];

					/* Ignore any whitespace chars */
					if (is_unicode_wspace(chr->c))
						continue;

					if (chr->style != style)
					{
						/* One style whose line height is within
						 * 5% of this line's distance is enough
						 * to keep the line in the block. */
						float proper_step = line_height_for_style(lh, chr->style);
						if (proper_step * 0.95 <= line->distance && line->distance <= proper_step * 1.05)
						{
							ok = 1;
							break;
						}
						style = chr->style;
					}
				}
				if (ok)
					break;
			}
			if (!ok)
			{
			force_paragraph:
				/* Split here and stop scanning this block; the outer
				 * loop carries on with the next block index. */
				split_block(ctx, page, block_num, line_num);
				break;
			}
		}
	}
	free_line_heights(lh);

	/* Simple line region analysis:
	 * For each line:
	 *	form a list of 'start/stop' points (henceforth a 'region mask')
	 *	find the normalised baseline vector for the line.
	 *	Store the region mask and baseline vector.
	 * Collate lines that have compatible region masks and identical
	 * baseline vectors.
	 * If the collated masks are column-like, then split into columns.
	 * Otherwise split into tables.
	 */
	rms = new_region_masks(ctx);

	/* Step 1: Form the region masks and store them into a list with the
	 * normalised baseline vectors. */
	for (block_num = 0; block_num < page->len; block_num++)
	{
		fz_text_block *block;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		for (line = block->lines; line < block->lines + block->len; line++)
		{
			fz_point blv;
			region_mask *rm;

#ifdef DEBUG_MASKS
			printf("Line: ");
			dump_line(line);
#endif
			/* Baseline vector: from min to max of the line's first
			 * span, normalised. first_span is dereferenced without
			 * a NULL check here. */
			blv = line->first_span->max;
			blv.x -= line->first_span->min.x;
			blv.y -= line->first_span->min.y;
			fz_normalize_vector(&blv);

			rm = new_region_mask(ctx, &blv);
			for (span = line->first_span; span; span = span->next)
			{
				fz_point *region_min = &span->min;
				fz_point *region_max = &span->max;

				/* Treat adjacent spans as one big region: keep
				 * absorbing following spans whose spacing is
				 * below 1.5. span ends up on the last span of
				 * the region, so the loop increment moves to
				 * the first span of the next region. */
				while (span->next && span->next->spacing < 1.5)
				{
					span = span->next;
					region_max = &span->max;
				}

				region_mask_add(rm, region_min, region_max);
			}
#ifdef DEBUG_MASKS
			dump_region_mask(rm);
#endif
			region_masks_add(rms, rm);
		}
	}

	/* Step 2: Sort the region_masks by size of masked region */
	region_masks_sort(rms);

#ifdef DEBUG_MASKS
	printf("Sorted list of regions:\n");
	dump_region_masks(rms);
#endif

	/* Step 3: Merge the region masks where possible (large ones first) */
	{
		int i;
		region_masks *rms2;
		rms2 = new_region_masks(ctx);
		for (i=0; i < rms->len; i++)
		{
			/* Take the mask out of the old list (clearing its slot)
			 * before offering it to the new list. */
			region_mask *rm = rms->mask[i];
			rms->mask[i] = NULL;
			region_masks_merge(rms2, rm);
		}
		free_region_masks(rms);
		rms = rms2;
	}

#ifdef DEBUG_MASKS
	printf("Merged list of regions:\n");
	dump_region_masks(rms);
#endif

	/* Step 4: Figure out alignment */
	region_masks_alignment(rms);

	/* Step 5: At this point, we should probably look at the region masks
	 * to try to guess which ones represent columns on the page. With our
	 * current code, we could only get blocks of lines that span 2 or more
	 * columns if the PDF producer wrote text out horizontally across 2
	 * or more columns, and we've never seen that (yet!). So we skip this
	 * step for now. */

	/* Step 6: Run through the lines again, deciding which ones fit into
	 * which region mask. */
	{
		/* The previous line's match is passed to region_masks_match
		 * for the next line; it carries over across blocks. */
		region_mask *prev_match = NULL;
		for (block_num = 0; block_num < page->len; block_num++)
		{
			fz_text_block *block;

			if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
				continue;
			block = page->blocks[block_num].u.text;

			for (line = block->lines; line < block->lines + block->len; line++)
			{
				fz_point blv;
				region_mask *rm;
				region_mask *match;

				/* Rebuild the same temporary mask for this line
				 * that Step 1 built. */
				blv = line->first_span->max;
				blv.x -= line->first_span->min.x;
				blv.y -= line->first_span->min.y;
				fz_normalize_vector(&blv);

#ifdef DEBUG_MASKS
				dump_line(line);
#endif
				rm = new_region_mask(ctx, &blv);
				for (span = line->first_span; span; span = span->next)
				{
					fz_point *region_min = &span->min;
					fz_point *region_max = &span->max;

					/* Treat adjacent spans as one big region */
					while (span->next && span->next->spacing < 1.5)
					{
						span = span->next;
						region_max = &span->max;
					}

					region_mask_add(rm, region_min, region_max);
				}
#ifdef DEBUG_MASKS
				printf("Mask: ");
				dump_region_mask(rm);
#endif
				match = region_masks_match(rms, rm, line, prev_match);
				prev_match = match;
#ifdef DEBUG_MASKS
				printf("Matches: ");
				dump_region_mask(match);
#endif
				/* The temporary mask is only needed for matching. */
				free_region_mask(rm);

				/* Walk the line region by region and stamp the
				 * column data onto every span of the region. */
				span = line->first_span;
				while (span)
				{
					fz_point *region_min = &span->min;
					fz_point *region_max = &span->max;
					fz_text_span *sn;
					int col, align;
					float colw, left;

					/* Treat adjacent spans as one big region.
					 * Unlike the loops above, span stays on the
					 * first span of the region and sn stops on
					 * the first span NOT in it (or NULL). */
#ifdef DEBUG_ALIGN
					dump_span(span);
#endif
					for (sn = span->next; sn && sn->spacing < 1.5; sn = sn->next)
					{
						region_max = &sn->max;
#ifdef DEBUG_ALIGN
						dump_span(sn);
#endif
					}
					col = region_mask_column(match, region_min, region_max, &align, &colw, &left);
#ifdef DEBUG_ALIGN
					printf(" = col%d colw=%g align=%d\n", col, colw, align);
#endif
					do
					{
						span->column = col;
						span->align = align;
						span->indent = left;
						span->column_width = colw;
						span = span->next;
					}
					while (span != sn);

					/* NOTE(review): at this point span == sn, the
					 * first span of the next region. Stepping to
					 * span->next means sn itself is never given
					 * column/align/indent/column_width, and the
					 * next region is taken to start one span
					 * late. This looks unintended (compare the
					 * mask-building loops above, where the extra
					 * step is needed) - confirm before changing,
					 * since the merge code in Step 7 depends on
					 * the column values stored here. */
					if (span)
						span = span->next;
				}
				line->region = match;
			}
		}
		free_region_masks(rms);
	}

	/* Step 7: Collate lines within a block that share the same region
	 * mask. */
	for (block_num = 0; block_num < page->len; block_num++)
	{
		int line_num;
		int prev_line_num;

		fz_text_block *block;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		/* First merge lines. This may leave empty lines behind.
		 * prev_line_num is the line currently being merged into; it
		 * only advances when a line is not merged. */
		for (prev_line_num = 0, line_num = 1; line_num < block->len; line_num++)
		{
			fz_text_line *prev_line;
			line = &block->lines[line_num];
			if (!line->first_span)
				continue;
			prev_line = &block->lines[prev_line_num];
			if (prev_line->region == line->region)
			{
				/* We only merge lines if the second line
				 * only uses 1 of the columns. */
				int col = line->first_span->column;
				/* Copy the left value for the first span
				 * in the first column in this line forward
				 * for all the rest of the spans in the same
				 * column. */
				float indent = line->first_span->indent;
				for (span = line->first_span->next; span; span = span->next)
				{
					if (col != span->column)
						break;
					span->indent = indent;
				}
				/* A non-NULL span means a second column was
				 * found: don't merge, and make this line the new
				 * merge target. */
				if (span)
				{
					prev_line_num = line_num;
					continue;
				}

				/* Merge line into prev_line */
				{
					/* prev_line_span points at the link (either
					 * prev_line->first_span or some span's
					 * next field) currently being considered. */
					fz_text_span **prev_line_span = &prev_line->first_span;
					int try_dehyphen = -1;
					fz_text_span *prev_span = NULL;
					span = line->first_span;
					while (span && *prev_line_span)
					{
						/* Skip forwards through the original
						 * line, until we find a place where
						 * span should go. */
						if ((*prev_line_span)->column <= span->column)
						{
							/* The current span we are considering
							 * in prev_line is earlier than span.
							 * Just skip forwards in prev_line.
							 * Remember the column so that a span
							 * placed right after it can be
							 * dehyphenated against it. */
							prev_span = (*prev_line_span);
							prev_line_span = &prev_span->next;
							try_dehyphen = span->column;
						}
						else
						{
							/* We want to copy span into prev_line.
							 * NOTE(review): next is the node AFTER
							 * the one at *prev_line_span, and span
							 * is then stored into *prev_line_span
							 * with its next set to that node. The
							 * span previously at *prev_line_span
							 * is therefore unlinked from prev_line
							 * rather than kept after span. Confirm
							 * whether replacing (not inserting) is
							 * intended. */
							fz_text_span *next = (*prev_line_span)->next;

							if (prev_line_span == &prev_line->first_span)
								prev_line->first_span = span;
							if (next == NULL)
								prev_line->last_span = span;
							if (try_dehyphen == span->column)
								dehyphenate(prev_span, span);
							try_dehyphen = -1;
							prev_span = *prev_line_span = span;
							span = span->next;
							(*prev_line_span)->next = next;
							prev_line_span = &(*prev_line_span)->next;
						}
					}
					/* prev_line ran out first: hang whatever is
					 * left of line on its end. */
					if (span)
					{
						*prev_line_span = span;
						prev_line->last_span = line->last_span;
					}

					/* Mark the source line as empty so it is
					 * removed below. */
					line->first_span = NULL;
					line->last_span = NULL;
				}
			}
			else
				prev_line_num = line_num;
		}

		/* Now get rid of the empty lines, compacting the array in
		 * place. */
		for (prev_line_num = 0, line_num = 0; line_num < block->len; line_num++)
		{
			line = &block->lines[line_num];
			if (line->first_span)
				block->lines[prev_line_num++] = *line;
		}
		block->len = prev_line_num;

		/* Now try to spot indents */
		for (line_num = 0; line_num < block->len; line_num++)
		{
			fz_text_span *span_num, *sn;
			int col, count;
			line = &block->lines[line_num];

			/* Run through the spans... */
			span_num = line->first_span;
			{
				/* This brace block is not a loop: it runs once,
				 * for the leading run of spans that share the
				 * first span's column. */
				float indent = 0;
				/* For each set of spans that share the same
				 * column... */
				col = span_num->column;
#ifdef DEBUG_INDENTS
				printf("Indent %g: ", span_num->indent);
				dump_span(span_num);
				printf("\n");
#endif
				/* find the average indent of all but the first..
				 * (clearing each one as it is summed) */
				for (sn = span_num->next, count = 0; sn && sn->column == col; sn = sn->next, count++)
				{
#ifdef DEBUG_INDENTS
					printf("Indent %g: ", sn->indent);
					dump_span(sn);
					printf("\n");
#endif
					indent += sn->indent;
					sn->indent = 0;
				}
				/* Only divide when the loop ran at least once,
				 * i.e. count is non-zero. */
				if (sn != span_num->next)
					indent /= count;

				/* And compare this indent with the first one... */
#ifdef DEBUG_INDENTS
				printf("Average indent %g ", indent);
#endif
				indent -= span_num->indent;
#ifdef DEBUG_INDENTS
				printf("delta %g ", indent);
#endif
				if (fabsf(indent) < 1)
				{
					/* No indent worth speaking of */
					indent = 0;
				}
#ifdef DEBUG_INDENTS
				printf("recorded %g\n", indent);
#endif
				/* The first span carries the result for the run. */
				span_num->indent = indent;
				span_num = sn;
			}
			/* Every span from the end of that first run onwards has
			 * its indent cleared. */
			for (; span_num; span_num = span_num->next)
			{
				span_num->indent = 0;
			}
		}
	}
}
/*
 * Walk the device's span soup in order and pass every span to
 * push_span(), telling it whether the span begins a new line and how
 * far its baseline lies from the previous line's.
 *
 * For each span after the first, the span is compared with the first
 * span of the line most recently returned by push_span(): if the two
 * directions are (nearly) parallel and the perpendicular offset is
 * small, the span continues that line and span->spacing records the
 * gap to the preceding span in units of a space; otherwise spacing
 * stays 0 and a new line is requested.
 *
 * Does nothing when the device has no soup.
 */
static void
strain_soup(fz_context *ctx, fz_text_device *tdev)
{
	span_soup *soup = tdev->spans;
	fz_text_line *cur_line = NULL;
	fz_text_span *prev = NULL;
	int i;

	if (!soup)
		return;

	/* Really dumb implementation to match what we had before */
	for (i = 0; i < soup->len; i++)
	{
		fz_text_span *span = soup->spans[i];
		/* Defaults describe "start a new line". */
		int starts_line = 1;
		float distance = 0;
		float spacing = 0;

		/* Clear the soup's slot for this span. */
		soup->spans[i] = NULL;

		if (prev != NULL)
		{
			/* Having a previous span implies push_span has already
			 * returned a line; its first span is the reference
			 * for the line's baseline. */
			fz_point line_dir, span_dir, perp;
			float cosine;
			float size = fz_matrix_expansion(&span->transform);
			fz_text_span *anchor;

#ifdef DEBUG_SPANS
			{
				printf("Comparing: \"");
				dump_span(prev);
				printf("\" and \"");
				dump_span(span);
				printf("\"\n");
			}
#endif

			anchor = cur_line->first_span;

			/* Unit direction of the line and of this span. */
			line_dir.x = anchor->max.x - anchor->min.x;
			line_dir.y = anchor->max.y - anchor->min.y;
			fz_normalize_vector(&line_dir);
			span_dir.x = span->max.x - span->min.x;
			span_dir.y = span->max.y - span->min.y;
			fz_normalize_vector(&span_dir);
#ifdef DEBUG_SPANS
			printf("last_span=%g %g -> %g %g = %g %g\n", prev->min.x, prev->min.y, prev->max.x, prev->max.y, line_dir.x, line_dir.y);
			printf("span =%g %g -> %g %g = %g %g\n", span->min.x, span->min.y, span->max.x, span->max.y, span_dir.x, span_dir.y);
#endif
			/* Vector between the two start points, rotated by a
			 * quarter turn. */
			perp.y = anchor->min.x - span->min.x;
			perp.x = -(anchor->min.y - span->min.y);

			/* The dot product of two unit vectors is close to +/-1
			 * exactly when they are parallel. */
			cosine = line_dir.x * span_dir.x + line_dir.y * span_dir.y;
			if (fabsf(cosine) > 0.9995)
			{
				/* Projecting the rotated offset onto the line
				 * direction gives the perpendicular distance
				 * between the two (parallel) baselines. */
				distance = line_dir.x * perp.x + line_dir.y * perp.y;
				/* 'Small' baseline changes are tolerated to cope
				 * with super/subscript. FIXME: We should gather
				 * subscript/superscript information here. */
				starts_line = (fabsf(distance) > size * LINE_DIST);
			}
			/* Not parallel: the defaults (new line, distance 0)
			 * already apply. */

			if (!starts_line)
			{
				fz_point delta;
				float offset = fabsf(distance);

				/* Gap between the end of the previous span and
				 * the start of this one, measured along the
				 * line direction. */
				delta.x = span->min.x - prev->max.x;
				delta.y = span->min.y - prev->max.y;

				spacing = (line_dir.x * delta.x + line_dir.y * delta.y);
				spacing = fabsf(spacing);

				/* Only allow changes in baseline
				 * (subscript/superscript etc) when the spacing
				 * is small. */
				if (offset > size * 0.1f && spacing * offset > size * LINE_DIST)
				{
					starts_line = 1;
					distance = 0;
					spacing = 0;
				}
				else
				{
					spacing /= size * SPACE_DIST;
					/* Apply the same logic here as when we're
					 * adding chars to build spans. */
					if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST))
						spacing = 1;
				}
			}
#ifdef DEBUG_SPANS
			printf("dot=%g new_line=%d distance=%g size=%g spacing=%g\n", cosine, starts_line, distance, size, spacing);
#endif
		}

		span->spacing = spacing;
		cur_line = push_span(ctx, tdev, span, starts_line, distance);
		prev = span;
	}
}