// Make rows & words for a page.
//
//   page_tr     - top right of the page
//   blocks      - block list
//   land_blocks - blocks rotated for landscape
//   port_blocks - output list
//
// Runs make_rows() and then make_words(), passing the global skew gradient
// returned by the former into the latter, and finally removes empty blocks.
// When global_monitor is set, it is marked alive and its progress is moved
// to 20 after the rows stage and 30 after the words stage.
void textord_page(ICOORD page_tr, BLOCK_LIST *blocks,
                  TO_BLOCK_LIST *land_blocks, TO_BLOCK_LIST *port_blocks) {
  // Rows stage: yields the global skew.
  set_global_loc_code(LOC_TEXT_ORD_ROWS);
  float page_skew = make_rows(page_tr, blocks, land_blocks, port_blocks);
  if (global_monitor != NULL) {
    global_monitor->ocr_alive = TRUE;
    global_monitor->progress = 20;
  }

  // Words stage: consumes the skew found above.
  set_global_loc_code(LOC_TEXT_ORD_WORDS);
  make_words(page_tr, page_skew, blocks, land_blocks, port_blocks);
  if (global_monitor != NULL) {
    global_monitor->ocr_alive = TRUE;
    global_monitor->progress = 30;
  }

  // Remove empties.
  cleanup_blocks(blocks);

#ifndef GRAPHICS_DISABLED
  close_to_win();
#endif

  // Terminate the whole process here when asked to, unless running
  // interactively.
  if (textord_exit_after) {
    if (!interactive_mode) {
      exit(0);
    }
  }
}
Beispiel #2
0
// Make the textlines and words inside each block.
//
// The page size (width, height) is stored in page_tr_. If to_blocks arrives
// empty, the components are found here from binary_pix and edge offsets are
// computed for every TO_BLOCK; otherwise (for non-sparse modes) the supplied
// blobs are filtered. Rows are then made according to pageseg_mode,
// baselines are fitted, words are made, empty blocks are cleaned up and the
// row margins of every block are computed.
void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew,
                          int width, int height, Pix *binary_pix,
                          Pix *thresholds_pix, Pix *grey_pix,
                          bool use_box_bottoms,
                          BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {
  page_tr_.set_x(width);
  page_tr_.set_y(height);

  if (to_blocks->empty()) {
    // AutoPageSeg was not used, so find_components has to run first.
    find_components(binary_pix, blocks, to_blocks);
    // Auto page seg was by-passed, so the edge offsets are computed here,
    // whether or not there is a grey_pix. By page segmentation mode there
    // is no non-text to avoid running on.
    TO_BLOCK_IT edge_it(to_blocks);
    for (edge_it.mark_cycle_pt(); !edge_it.cycled_list(); edge_it.forward()) {
      edge_it.data()->ComputeEdgeOffsets(thresholds_pix, grey_pix);
    }
  } else if (!PSM_SPARSE(pageseg_mode)) {
    // AutoPageSeg already did the work of find_components; filter_blobs
    // sets up the TO_BLOCKs the same way find_components does.
    filter_blobs(page_tr_, to_blocks, true);
  }

  ASSERT_HOST(!to_blocks->empty());

  if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) {
    const FCOORD anticlockwise90(0.0f, 1.0f);
    const FCOORD clockwise90(0.0f, -1.0f);
    TO_BLOCK_IT rotate_it(to_blocks);
    for (rotate_it.mark_cycle_pt(); !rotate_it.cycled_list();
         rotate_it.forward()) {
      TO_BLOCK *vert_to_block = rotate_it.data();
      BLOCK *vert_block = vert_to_block->block;
      // Give the block a fake poly_block made from its bounding box.
      vert_block->set_poly_block(
          new POLY_BLOCK(vert_block->bounding_box(), PT_VERTICAL_TEXT));
      // Rotate the to_block together with its block and blobnbox lists.
      vert_to_block->rotate(anticlockwise90);
      // Rotation values follow the convention used in layout analysis for
      // vertical text.
      vert_block->set_re_rotation(clockwise90);
      vert_block->set_classify_rotation(clockwise90);
    }
  }

  // Fetch the first TO_BLOCK before any rows are made.
  TO_BLOCK_IT to_block_it(to_blocks);
  TO_BLOCK *first_to_block = to_block_it.data();

  // Make the rows in the block.
  float gradient = 0;
  if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
    // Do it the old fashioned way.
    gradient = make_rows(page_tr_, to_blocks);
  } else if (!PSM_SPARSE(pageseg_mode)) {
    // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single
    // row.
    const bool not_raw_line = pageseg_mode != PSM_RAW_LINE;
    gradient = make_single_row(page_tr_, not_raw_line,
                               first_to_block, to_blocks);
  }

  BaselineDetect baseline_detector(textord_baseline_debug,
                                   reskew, to_blocks);
  baseline_detector.ComputeStraightBaselines(use_box_bottoms);
  baseline_detector.ComputeBaselineSplinesAndXheights(page_tr_, true,
                                                      textord_heavy_nr,
                                                      textord_show_final_rows,
                                                      this);

  // Now make the words in the lines.
  if (!PSM_WORD_FIND_ENABLED(pageseg_mode)) {
    // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a single word,
    // and in SINGLE_CHAR mode, all the outlines go in a single blob.
    TO_BLOCK *word_to_block = to_block_it.data();
    make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                     word_to_block->get_rows(),
                     word_to_block->block->row_list());
  } else {
    // SINGLE_LINE uses the old word maker on the single line.
    make_words(this, page_tr_, gradient, blocks, to_blocks);
  }

  // Remove empties.
  cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);

  // Compute the margins for each row in the block, to be used later for
  // paragraph detection.
  BLOCK_IT margin_it(blocks);
  margin_it.mark_cycle_pt();
  while (!margin_it.cycled_list()) {
    margin_it.data()->compute_row_margins();
    margin_it.forward();
  }

#ifndef GRAPHICS_DISABLED
  close_to_win();
#endif
}
Beispiel #3
0
// Segment the page according to the current value of tessedit_pageseg_mode.
// If the pix_binary_ member is not NULL, it is used as the source image,
// and copied to image, otherwise it just uses image as the input.
// On return the blocks list owns all the constructed page layout.
//
// Parameters:
//   input_file - may be NULL or empty. Otherwise, unless the mode is
//                PSM_AUTO, its name with everything from the last '.'
//                removed is handed to read_unlv_file.
//   image      - source image; provides the width, height and resolution
//                unless pix_binary_ overrides them (HAVE_LIBLEPT builds).
//   blocks     - block list that is filled in / consumed by the layout steps.
// Returns -1 when AutoPageSeg returns a negative value, and 0 in every other
// case, including the "Empty page" early exit.
int Tesseract::SegmentPage(const STRING* input_file,
                           IMAGE* image, BLOCK_LIST* blocks) {
  int width = image->get_xsize();
  int height = image->get_ysize();
  int resolution = image->get_res();
#ifdef HAVE_LIBLEPT
  // A binary pix, when present, takes precedence over image for the
  // page dimensions and resolution.
  if (pix_binary_ != NULL) {
    width = pixGetWidth(pix_binary_);
    height = pixGetHeight(pix_binary_);
    resolution = pixGetXRes(pix_binary_);
  }
#endif
  // Zero resolution messes up the algorithms, so make sure it is credible.
  if (resolution < kMinCredibleResolution)
    resolution = kDefaultResolution;
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  // This is only attempted when the mode is not PSM_AUTO and a non-empty
  // input file name was supplied.
  if (pageseg_mode != tesseract::PSM_AUTO &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    // Cut the name at its last '.' before passing it on.
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
  // Derived from the configured mode, i.e. before pageseg_mode may be
  // overridden to PSM_SINGLE_COLUMN below.
  bool single_column = pageseg_mode > PSM_AUTO;
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_COLUMN.
    pageseg_mode = PSM_SINGLE_COLUMN;
  }

  TO_BLOCK_LIST land_blocks, port_blocks;
  TBOX page_box;
  if (pageseg_mode <= PSM_SINGLE_COLUMN) {
    // A negative result from AutoPageSeg is reported to the caller as -1.
    if (AutoPageSeg(width, height, resolution, single_column,
                    image, blocks, &port_blocks) < 0) {
      return -1;
    }
    // To create blobs from the image region bounds uncomment this line:
    //  port_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
    // NOTE(review): this guard uses "#if" while the one near the top of the
    // function uses "#ifdef"; also FromPix is called here without the
    // pix_binary_ != NULL check used above - confirm both are intended.
#if HAVE_LIBLEPT
    image->FromPix(pix_binary_);
#endif
    // No automatic page segmentation: both skew vectors are set to (1, 0).
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
  }
  if (blocks->empty()) {
    tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }

  if (port_blocks.empty()) {
    // AutoPageSeg was not used, so we need to find_components first.
    find_components(blocks, &land_blocks, &port_blocks, &page_box);
  } else {
    // AutoPageSeg does not need to find_components as it did that already.
    // The page box is set to span (0, 0) to (width, height).
    page_box.set_left(0);
    page_box.set_bottom(0);
    page_box.set_right(width);
    page_box.set_top(height);
    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
    filter_blobs(page_box.topright(), &port_blocks, true);
  }

  TO_BLOCK_IT to_block_it(&port_blocks);
  ASSERT_HOST(!port_blocks.empty());
  // Only the TO_BLOCK the iterator starts on is inspected below.
  TO_BLOCK* to_block = to_block_it.data();
  if (pageseg_mode <= PSM_SINGLE_BLOCK ||
      to_block->line_size < 2) {
    // For now, AUTO, SINGLE_COLUMN and SINGLE_BLOCK all map to the old
    // textord. The difference is the number of blocks and how they are made.
    // The same path is taken when the block's line_size is below 2.
    textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks,
                 this);
  } else {
    // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
    float gradient = make_single_row(page_box.topright(),
                                     to_block, &port_blocks, this);
    if (pageseg_mode == PSM_SINGLE_LINE) {
      // SINGLE_LINE uses the old word maker on the single line.
      make_words(page_box.topright(), gradient, blocks,
                 &land_blocks, &port_blocks, this);
    } else {
      // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
      // single word, and in SINGLE_CHAR mode, all the outlines
      // go in a single blob.
      make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                       to_block->get_rows(), to_block->block->row_list());
    }
  }
  return 0;
}
Beispiel #4
0
//--------------------------------------------------------------------------
// Decide whether the input looks like a textual hex dump.
//
//   li             - input to read lines from
//   fileformatname - receives "Dump file" when the input is accepted
//   n              - any non-zero value makes the function return 0 at once
//
// Returns 1 when every non-empty, non-comment line passes the checks below
// and at least one such line exists. Rejection goes through the FAILED
// macro, which is defined outside this function (presumably it returns 0 -
// confirm against its definition).
int idaapi accept_file(linput_t *li, char fileformatname[MAX_FILE_FORMAT_NAME], int n)
{
  char line[MAXSTR];
  char *words[MAXSTR];

  if ( n )
    return 0;

  // We try to interpret the input file as a text
  // file with a dump format, i.e. all lines should look like

//00000020:  59 69 74 54-55 B6 3E F7-D6 B9 C9 B9-45 E6 A4 52  YitTU¦>?O?E?E??R
//0020: 59 69 74 54 55 B6 3E F7 D6 B9 C9 B9 45 E6 A4 52  "YitTU¦>?O?E?E??R"
//1000: 12 23 34 56 78
//0100: 31 C7 1D AF 32 04 1E 32 05 1E 3C 32 07 1E 21 D9
//12 23 34 56 78

  // and similar lines
  // We allow non-ascii characters at the end of the line
  // We skip empty lines

  ssize_t p0len = -1;    // length of the first word's hex part
  char w0sep[10];        // separator after the first word
  w0sep[0] = '\0';
  // Count of lines read so far. It is not read anywhere in this function;
  // presumably the FAILED macro reports it - confirm before removing.
  int nl = 0;
  int nontrivial_line_count = 0;
  // Set once an address-only line has been seen; any further non-trivial
  // line is then a failure.
  bool no_more_lines = false;
  uint32 adr, oldadr=0;
  while ( qlgets(line, sizeof(line), li) )
  {
    nl++;
    // '-' may separate byte groups (see the first sample line above);
    // presumably strrpl turns each '-' into ' ' so the groups split into words
    strrpl(line, '-', ' ');
    int nw = make_words(line, words, qnumber(words));
    // lines starting with ';' or '#' are skipped
    if ( line[0] == ';' || line[0] == '#' )
      continue;
    if ( nw == 0 )
      continue;
    nontrivial_line_count++;
    if ( no_more_lines )
      FAILED;
    // od -x format may contain '*' lines which mean repetition
    if ( strcmp(words[0], "*") == 0 && nw == 1 )
      continue;
    // the first word must be a number (more than one digit)
    // hex() is given ptr and the distance ptr has moved afterwards is used
    // as the number of digits consumed
    char *ptr = words[0];
    adr = hex(ptr);
    ssize_t p0 = ptr - words[0];
    if ( p0 <= 1 )
      FAILED;
    // the numeric part of the first word must not get shorter than it was
    // on the previous line
    if ( nontrivial_line_count > 1 && p0 < p0len )
      FAILED;
    p0len = p0;
    // take the separator from the first line
    if ( nontrivial_line_count == 1 )
    {
      qstrncpy(w0sep, ptr, sizeof(w0sep));
      // whatever follows the number in the first word may consist only of
      // the characters ':', '>', '-' and '.'
      while ( *ptr )
        if ( strchr(":>-.", *ptr++) == NULL )
          FAILED;
    }
    else
    {
      // every later line must use exactly the same separator
      if ( strcmp(w0sep, ptr) != 0 )
        FAILED;
    }
    // the first word is treated as an address prefix when it has a
    // separator or at least 4 digits
    bool haspref = p0len >= 4 || w0sep[0] != '\0';
    if ( haspref )
    {
      // if the line contains only the address, then don't accept lines anymore
      if ( nw == 1 )
      {
        // an address-only line cannot be the first line, and its address
        // must be greater than the previous one
        if ( nontrivial_line_count == 1 )
          FAILED;
        no_more_lines = true;
        if ( adr <= oldadr )
          FAILED;
      }
      else
      {
        // the remaining words should be numbers with at least 1 position
        // (at least the second word should be so)
        ptr = words[1];
        hex(ptr);
        if ( ptr == words[1] )
          FAILED;
      }
    }
    oldadr = adr;
  }
  // a file with nothing but empty lines and comments is not a dump
  if ( nontrivial_line_count == 0 )
    FAILED;

  qstrncpy(fileformatname, "Dump file", MAX_FILE_FORMAT_NAME);
  return 1;
}
Beispiel #5
0
//--------------------------------------------------------------------------
// Load a textual hex/octal dump.
//
//   li      - input to read lines from
//   _neflag - loader flags; stored in neflag, and NEF_CODE, NEF_SEGS and
//             NEF_RELOAD are tested here
//   (the third parameter, the file format name, is unused)
//
// Each data line is split into words, the words are parsed into the bytes
// buffer and copy(ea, top) is called for the resulting address range.
// The names neflag, iscode, sel, sea, eea, bytes and copy() are declared
// outside this function.
void idaapi load_file(linput_t *li, ushort _neflag, const char * /*fileformatname*/)
{
  char line[MAXSTR];
  char *words[MAXSTR];

  neflag = _neflag;
  iscode = (neflag & NEF_CODE) != 0;
  sel = BADSEL;
  sea = BADADDR;
  ea_t ea = 0;              // start address of the current line
  ea_t top= 0;              // address just past the last loaded byte
  bool use32   = false;     // 32-bit addressing is required
  bool octpref = false;     // address prefixes are octal (od -x)
  bool octnum  = false;     // data words are octal
  size_t fill = 0;          // size of the range to repeat after a '*' line

  // Since we made all the checks in accept_file,
  // here we don't repeat them

  ssize_t p0len = -1;    // length of the first word's hex part
  char w0sep[10];        // separator after the first word
  w0sep[0] = '\0';
  int nontrivial_line_count = 0;
  while ( qlgets(line, sizeof(line), li) )
  {
    // '-' may separate byte groups; presumably strrpl turns them into spaces
    strrpl(line, '-', ' ');
    // lines starting with ';' or '#' are skipped
    if ( line[0] == ';' || line[0] == '#' )
      continue;
    int n = make_words(line, words, qnumber(words));
    if ( n == 0 )
      continue;
    nontrivial_line_count++;
    ssize_t bi;
    // od -x format may contain '*' lines which mean repetition
    if ( strcmp(words[0], "*") == 0 && n == 1 )
    {
      // remember the size of the previous line; it is repeated below
      // once the next address is known
      fill  = size_t(top - ea);
      octpref = true;             // od -x have octal prefixes
      continue;
    }
    // the first word must be a number (more than one digit)
    // oct()/hex() are given ptr; the distance it has moved afterwards is
    // used as the number of digits consumed
    char *ptr = words[0];
    uint32 w0 = octpref ? oct(ptr) : hex(ptr);
    p0len = ptr - words[0];
    // take the separator from the first line
    if ( nontrivial_line_count == 1 )
      qstrncpy(w0sep, ptr, sizeof(w0sep));

    // process '*' and fill the gap
    if ( fill > 0 )
    {
      // bytes still holds the previous line, so each copy() call is made
      // with the same buffer contents for the next 'fill'-sized range
      while ( top < w0 )
      {
        ea = top;
        top = ea + fill;
        copy(ea, top);
      }
    }

    int idx = 0;
    // the first word is an address prefix when it has a separator or at
    // least 4 digits; otherwise the line continues at 'top'
    if ( w0sep[0] != '\0' || p0len >= 4 )
    {
      if ( nontrivial_line_count > 1 && !octpref && top != w0 )
      {
        // strange, the sequence is not contiguous
        // check if the prefixes are octal (od -x)
        ptr = words[0];
        if ( oct(ptr) == top )
        {
          octpref = true;
          ptr = words[0];
          w0 = oct(ptr);
        }
      }
      ea = w0;
      idx = 1;
    }
    else
    {
      ea = top;
    }
    for ( bi=0; idx < n; idx++ ) //lint !e443
    {
      ptr = words[idx];
      // on the first line, a 6-character word made only of octal digits
      // switches data parsing to octal
      if ( nontrivial_line_count == 1 && !octnum && strlen(ptr) == 6 )
      {
        oct(ptr);
        if ( ptr-words[idx] == 6 )
          octnum = true;
        ptr = words[idx];
//        msg("ptr=%s octnum=%d\n", ptr, octnum);
      }
      uint32 b = octnum ? oct(ptr) : hex(ptr);
      ssize_t nc = ptr - words[idx];
      if ( nc < 2 )
      {
        // we tolerate one-letter separators between numbers
        if ( words[idx][1] == '\0' && strchr("\xA6|-:", words[idx][0]) != NULL )
          continue;
        // anything else ends the data part of the line
        break;
      }
      nc /= octnum ? 3 : 2;             // number of bytes
      // Store the value at the current byte offset. memcpy is used rather
      // than a store through a uint32 pointer because bi is an arbitrary
      // byte offset: the destination need not be aligned for uint32, and
      // accessing the buffer through a cast pointer breaks the aliasing
      // rules. All sizeof(b) bytes are written in native byte order; only
      // the first nc of them are kept, since bi advances by nc.
      // NOTE(review): bi is not bounds-checked against the size of bytes,
      // which is declared outside this function - confirm it is large
      // enough for the longest possible line.
      memcpy(&bytes[bi], &b, sizeof(b));
      bi += nc;
    }
    top = ea + bi;
    copy(ea, top);
  }

  // switch to 32-bit addressing when eea is beyond 64K or the address
  // prefixes have more than 4 digits
  if ( eea >= 0x10000 || p0len > 4 )
    use32 = true;
  if ( neflag & NEF_SEGS )
  {
    if ( use32 )
    {
      set_segm_addressing(getseg(sea), 1);
      if ( ph.id == PLFM_386 ) inf.lflags |= LFLG_PC_FLAT;
    }
    set_default_dataseg(sel);
  }
  if ( (neflag & NEF_RELOAD) == 0 )
    create_filename_cmt();
}