void textord_page( //make rows & words ICOORD page_tr, //top right BLOCK_LIST *blocks, //block list TO_BLOCK_LIST *land_blocks, //rotated for landscape TO_BLOCK_LIST *port_blocks //output list ) { float gradient; //global skew set_global_loc_code(LOC_TEXT_ORD_ROWS); gradient = make_rows (page_tr, blocks, land_blocks, port_blocks); if (global_monitor != NULL) { global_monitor->ocr_alive = TRUE; global_monitor->progress = 20; } set_global_loc_code(LOC_TEXT_ORD_WORDS); make_words(page_tr, gradient, blocks, land_blocks, port_blocks); if (global_monitor != NULL) { global_monitor->ocr_alive = TRUE; global_monitor->progress = 30; } cleanup_blocks(blocks); //remove empties #ifndef GRAPHICS_DISABLED close_to_win(); #endif if (textord_exit_after && !interactive_mode) exit (0); }
// Make the textlines and words inside each block. void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) { page_tr_.set_x(width); page_tr_.set_y(height); if (to_blocks->empty()) { // AutoPageSeg was not used, so we need to find_components first. find_components(binary_pix, blocks, to_blocks); TO_BLOCK_IT it(to_blocks); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { TO_BLOCK *to_block = it.data(); // Compute the edge offsets whether or not there is a grey_pix. // We have by-passed auto page seg, so we have to run it here. // By page segmentation mode there is no non-text to avoid running on. to_block->ComputeEdgeOffsets(thresholds_pix, grey_pix); } } else if (!PSM_SPARSE(pageseg_mode)) { // AutoPageSeg does not need to find_components as it did that already. // Filter_blobs sets up the TO_BLOCKs the same as find_components does. filter_blobs(page_tr_, to_blocks, true); } ASSERT_HOST(!to_blocks->empty()); if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) { const FCOORD anticlockwise90(0.0f, 1.0f); const FCOORD clockwise90(0.0f, -1.0f); TO_BLOCK_IT it(to_blocks); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { TO_BLOCK *to_block = it.data(); BLOCK *block = to_block->block; // Create a fake poly_block in block from its bounding box. block->set_poly_block(new POLY_BLOCK(block->bounding_box(), PT_VERTICAL_TEXT)); // Rotate the to_block along with its contained block and blobnbox lists. to_block->rotate(anticlockwise90); // Set the block's rotation values to obey the convention followed in // layout analysis for vertical text. block->set_re_rotation(clockwise90); block->set_classify_rotation(clockwise90); } } TO_BLOCK_IT to_block_it(to_blocks); TO_BLOCK *to_block = to_block_it.data(); // Make the rows in the block. float gradient = 0; // Do it the old fashioned way. if (PSM_LINE_FIND_ENABLED(pageseg_mode)) { gradient = make_rows(page_tr_, to_blocks); } else if (!PSM_SPARSE(pageseg_mode)) { // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row. gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE, to_block, to_blocks); } BaselineDetect baseline_detector(textord_baseline_debug, reskew, to_blocks); baseline_detector.ComputeStraightBaselines(use_box_bottoms); baseline_detector.ComputeBaselineSplinesAndXheights(page_tr_, true, textord_heavy_nr, textord_show_final_rows, this); // Now make the words in the lines. if (PSM_WORD_FIND_ENABLED(pageseg_mode)) { // SINGLE_LINE uses the old word maker on the single line. make_words(this, page_tr_, gradient, blocks, to_blocks); } else { // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a // single word, and in SINGLE_CHAR mode, all the outlines // go in a single blob. TO_BLOCK *to_block = to_block_it.data(); make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(), to_block->block->row_list()); } cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); // Remove empties. // Compute the margins for each row in the block, to be used later for // paragraph detection. BLOCK_IT b_it(blocks); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { b_it.data()->compute_row_margins(); } #ifndef GRAPHICS_DISABLED close_to_win(); #endif }
// Segment the page according to the current value of tessedit_pageseg_mode. // If the pix_binary_ member is not NULL, it is used as the source image, // and copied to image, otherwise it just uses image as the input. // On return the blocks list owns all the constructed page layout. int Tesseract::SegmentPage(const STRING* input_file, IMAGE* image, BLOCK_LIST* blocks) { int width = image->get_xsize(); int height = image->get_ysize(); int resolution = image->get_res(); #ifdef HAVE_LIBLEPT if (pix_binary_ != NULL) { width = pixGetWidth(pix_binary_); height = pixGetHeight(pix_binary_); resolution = pixGetXRes(pix_binary_); } #endif // Zero resolution messes up the algorithms, so make sure it is credible. if (resolution < kMinCredibleResolution) resolution = kDefaultResolution; // Get page segmentation mode. PageSegMode pageseg_mode = static_cast<PageSegMode>( static_cast<int>(tessedit_pageseg_mode)); // If a UNLV zone file can be found, use that instead of segmentation. if (pageseg_mode != tesseract::PSM_AUTO && input_file != NULL && input_file->length() > 0) { STRING name = *input_file; const char* lastdot = strrchr(name.string(), '.'); if (lastdot != NULL) name[lastdot - name.string()] = '\0'; read_unlv_file(name, width, height, blocks); } bool single_column = pageseg_mode > PSM_AUTO; if (blocks->empty()) { // No UNLV file present. Work according to the PageSegMode. // First make a single block covering the whole image. BLOCK_IT block_it(blocks); BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height); block_it.add_to_end(block); } else { // UNLV file present. Use PSM_SINGLE_COLUMN. pageseg_mode = PSM_SINGLE_COLUMN; } TO_BLOCK_LIST land_blocks, port_blocks; TBOX page_box; if (pageseg_mode <= PSM_SINGLE_COLUMN) { if (AutoPageSeg(width, height, resolution, single_column, image, blocks, &port_blocks) < 0) { return -1; } // To create blobs from the image region bounds uncomment this line: // port_blocks.clear(); // Uncomment to go back to the old mode. } else { #if HAVE_LIBLEPT image->FromPix(pix_binary_); #endif deskew_ = FCOORD(1.0f, 0.0f); reskew_ = FCOORD(1.0f, 0.0f); } if (blocks->empty()) { tprintf("Empty page\n"); return 0; // AutoPageSeg found an empty page. } if (port_blocks.empty()) { // AutoPageSeg was not used, so we need to find_components first. find_components(blocks, &land_blocks, &port_blocks, &page_box); } else { // AutoPageSeg does not need to find_components as it did that already. page_box.set_left(0); page_box.set_bottom(0); page_box.set_right(width); page_box.set_top(height); // Filter_blobs sets up the TO_BLOCKs the same as find_components does. filter_blobs(page_box.topright(), &port_blocks, true); } TO_BLOCK_IT to_block_it(&port_blocks); ASSERT_HOST(!port_blocks.empty()); TO_BLOCK* to_block = to_block_it.data(); if (pageseg_mode <= PSM_SINGLE_BLOCK || to_block->line_size < 2) { // For now, AUTO, SINGLE_COLUMN and SINGLE_BLOCK all map to the old // textord. The difference is the number of blocks and how the are made. textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks, this); } else { // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row. float gradient = make_single_row(page_box.topright(), to_block, &port_blocks, this); if (pageseg_mode == PSM_SINGLE_LINE) { // SINGLE_LINE uses the old word maker on the single line. make_words(page_box.topright(), gradient, blocks, &land_blocks, &port_blocks, this); } else { // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a // single word, and in SINGLE_CHAR mode, all the outlines // go in a single blob. make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(), to_block->block->row_list()); } } return 0; }
//-------------------------------------------------------------------------- int idaapi accept_file(linput_t *li, char fileformatname[MAX_FILE_FORMAT_NAME], int n) { char line[MAXSTR]; char *words[MAXSTR]; if ( n ) return 0; // We try to interpret the input file as a text // file with a dump format, i.e. all lines should look like //00000020: 59 69 74 54-55 B6 3E F7-D6 B9 C9 B9-45 E6 A4 52 YitTU¦>?O?E?E??R //0020: 59 69 74 54 55 B6 3E F7 D6 B9 C9 B9 45 E6 A4 52 "YitTU¦>?O?E?E??R" //1000: 12 23 34 56 78 //0100: 31 C7 1D AF 32 04 1E 32 05 1E 3C 32 07 1E 21 D9 //12 23 34 56 78 // and similar lines // We allow non-ascii characters at the end of the line // We skip empty lines ssize_t p0len = -1; // length of the first word's hex part char w0sep[10]; // separator after the first word w0sep[0] = '\0'; int nl = 0; int nontrivial_line_count = 0; bool no_more_lines = false; uint32 adr, oldadr=0; while ( qlgets(line, sizeof(line), li) ) { nl++; strrpl(line, '-', ' '); int nw = make_words(line, words, qnumber(words)); if ( line[0] == ';' || line[0] == '#' ) continue; if ( nw == 0 ) continue; nontrivial_line_count++; if ( no_more_lines ) FAILED; // od -x format may contain '*' lines which mean repetition if ( strcmp(words[0], "*") == 0 && nw == 1 ) continue; // the first word must be a number (more than one digit) char *ptr = words[0]; adr = hex(ptr); ssize_t p0 = ptr - words[0]; if ( p0 <= 1 ) FAILED; if ( nontrivial_line_count > 1 && p0 < p0len ) FAILED; p0len = p0; // take the separator from the first line if ( nontrivial_line_count == 1 ) { qstrncpy(w0sep, ptr, sizeof(w0sep)); while ( *ptr ) if ( strchr(":>-.", *ptr++) == NULL ) FAILED; } else { if ( strcmp(w0sep, ptr) != 0 ) FAILED; } bool haspref = p0len >= 4 || w0sep[0] != '\0'; if ( haspref ) { // if the line contains only the address, then don't accept lines anymore if ( nw == 1 ) { if ( nontrivial_line_count == 1 ) FAILED; no_more_lines = true; if ( adr <= oldadr ) FAILED; } else { // the remaining words should be numbers with at least 1 position // (at least the second word should be so) ptr = words[1]; hex(ptr); if ( ptr == words[1] ) FAILED; } } oldadr = adr; } if ( nontrivial_line_count == 0 ) FAILED; qstrncpy(fileformatname, "Dump file", MAX_FILE_FORMAT_NAME); return 1; }
//-------------------------------------------------------------------------- void idaapi load_file(linput_t *li, ushort _neflag, const char * /*fileformatname*/) { char line[MAXSTR]; char *words[MAXSTR]; neflag = _neflag; iscode = (neflag & NEF_CODE) != 0; sel = BADSEL; sea = BADADDR; ea_t ea = 0; ea_t top= 0; bool use32 = false; bool octpref = false; bool octnum = false; size_t fill = 0; // Since we made all the checks in accept_file, // here we don't repeat them ssize_t p0len = -1; // length of the first word's hex part char w0sep[10]; // separator after the first word w0sep[0] = '\0'; int nontrivial_line_count = 0; while ( qlgets(line, sizeof(line), li) ) { strrpl(line, '-', ' '); if ( line[0] == ';' || line[0] == '#' ) continue; int n = make_words(line, words, qnumber(words)); if ( n == 0 ) continue; nontrivial_line_count++; ssize_t bi; // od -x format may contain '*' lines which mean repetition if ( strcmp(words[0], "*") == 0 && n == 1 ) { fill = size_t(top - ea); octpref = true; // od -x have octal prefixes continue; } // the first word must be a number (more than one digit) char *ptr = words[0]; uint32 w0 = octpref ? oct(ptr) : hex(ptr); p0len = ptr - words[0]; // take the separator from the first line if ( nontrivial_line_count == 1 ) qstrncpy(w0sep, ptr, sizeof(w0sep)); // process '*' and fill the gap if ( fill > 0 ) { while ( top < w0 ) { ea = top; top = ea + fill; copy(ea, top); } } int idx = 0; if ( w0sep[0] != '\0' || p0len >= 4 ) { if ( nontrivial_line_count > 1 && !octpref && top != w0 ) { // strange, the sequence is not contiguous // check if the prefixes are octal (od -x) ptr = words[0]; if ( oct(ptr) == top ) { octpref = true; ptr = words[0]; w0 = oct(ptr); } } ea = w0; idx = 1; } else { ea = top; } for ( bi=0; idx < n; idx++ ) //lint !e443 { ptr = words[idx]; if ( nontrivial_line_count == 1 && !octnum && strlen(ptr) == 6 ) { oct(ptr); if ( ptr-words[idx] == 6 ) octnum = true; ptr = words[idx]; // msg("ptr=%s octnum=%d\n", ptr, octnum); } uint32 b = octnum ? oct(ptr) : hex(ptr); ssize_t nc = ptr - words[idx]; if ( nc < 2 ) { // we tolerate one-letter separators between numbers if ( words[idx][1] == '\0' && strchr("\xA6|-:", words[idx][0]) != NULL ) continue; break; } nc /= octnum ? 3 : 2; // number of bytes *(uint32 *)&bytes[bi] = b; bi += nc; } top = ea + bi; copy(ea, top); } if ( eea >= 0x10000 || p0len > 4 ) use32 = true; if ( neflag & NEF_SEGS ) { if ( use32 ) { set_segm_addressing(getseg(sea), 1); if ( ph.id == PLFM_386 ) inf.lflags |= LFLG_PC_FLAT; } set_default_dataseg(sel); } if ( (neflag & NEF_RELOAD) == 0 ) create_filename_cmt(); }