Beispiel #1
0
/**
 * Sets up auto page segmentation, determines the orientation, and corrects it.
 * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
 * facilitate testing.
 * photo_mask_pix is a pointer to a NULL pointer that will be filled on return
 * with the leptonica photo mask, which must be pixDestroyed by the caller.
 * music_mask_pix is handed straight to LineFinder::FindAndRemoveLines.
 * to_blocks is an empty list that will be filled with (usually a single)
 * block that is used during layout analysis. This ugly API is required
 * because of the possibility of a unlv zone file.
 * TODO(rays) clean this up.
 * See AutoPageSeg for other arguments.
 * The returned ColumnFinder must be deleted after use.
 * NULL is returned (no ColumnFinder is created or kept) when the block's
 * line_size is below 2, or when OSD was run and pageseg_mode is PSM_OSD_ONLY.
 */
ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
    PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
    OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
    Pix** music_mask_pix) {
  // Initial vertical direction handed to the line finder, which may update it.
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
  TabVector_LIST h_lines;
  ICOORD bleft(0, 0);

  ASSERT_HOST(pix_binary_ != NULL);
  // The three optional dumps below record pix_binary_ before line removal,
  // after line removal, and after image finding respectively.
  if (tessedit_dump_pageseg_images) {
    pixWrite("tessinput.png", pix_binary_, IFF_PNG);
  }
  // Leptonica is used to find the rule/separator lines in the input.
  LineFinder::FindAndRemoveLines(source_resolution_,
                                 textord_tabfind_show_vlines, pix_binary_,
                                 &vertical_x, &vertical_y, music_mask_pix,
                                 &v_lines, &h_lines);
  if (tessedit_dump_pageseg_images)
    pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
  // Leptonica is used to find a mask of the photo regions in the input.
  *photo_mask_pix = ImageFind::FindImages(pix_binary_);
  if (tessedit_dump_pageseg_images)
    pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
  // Vertical lines are discarded when the mode does not enable column finding.
  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();

  // The rest of the algorithm uses the usual connected components.
  textord_.find_components(pix_binary_, blocks, to_blocks);

  TO_BLOCK_IT to_block_it(to_blocks);
  // There must be exactly one input block.
  // TODO(rays) handle new textline finding with a UNLV zone file.
  ASSERT_HOST(to_blocks->singleton());
  TO_BLOCK* to_block = to_block_it.data();
  TBOX blkbox = to_block->block->bounding_box();
  ColumnFinder* finder = NULL;

  // A ColumnFinder is only built when the estimated line size is at least 2;
  // otherwise finder stays NULL and is returned as such.
  if (to_block->line_size >= 2) {
    finder = new ColumnFinder(static_cast<int>(to_block->line_size),
                              blkbox.botleft(), blkbox.topright(),
                              source_resolution_, textord_use_cjk_fp_model,
                              textord_tabfind_aligned_gap_fraction,
                              &v_lines, &h_lines, vertical_x, vertical_y);

    finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);

    if (equ_detect_) {
      equ_detect_->LabelSpecialText(to_block);
    }

    BLOBNBOX_CLIST osd_blobs;
    // osd_orientation is the number of 90 degree rotations to make the
    // characters upright. (See osdetect.h for precise definition.)
    // We want the text lines horizontal, (vertical text indicates vertical
    // textlines) which may conflict (eg vertically written CJK).
    int osd_orientation = 0;
    // Vertical text can be forced by parameter or by page segmentation mode;
    // otherwise it is detected from blob alignment if the mode permits.
    bool vertical_text = textord_tabfind_force_vertical_text ||
                         pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
    if (!vertical_text && textord_tabfind_vertical_text &&
        PSM_ORIENTATION_ENABLED(pageseg_mode)) {
      vertical_text =
          finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
                                          to_block, &osd_blobs);
    }
    if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) {
      GenericVector<int> osd_scripts;
      if (osd_tess != this) {
        // We are running osd as part of layout analysis, so constrain the
        // scripts to those allowed by *this.
        AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
        for (int s = 0; s < sub_langs_.size(); ++s) {
          AddAllScriptsConverted(sub_langs_[s]->unicharset,
                                 osd_tess->unicharset, &osd_scripts);
        }
      }
      os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
      // In OSD-only mode the results are left in *osr and no finder is kept.
      if (pageseg_mode == PSM_OSD_ONLY) {
        delete finder;
        return NULL;
      }
      osd_orientation = osr->best_result.orientation_id;
      double osd_score = osr->orientations[osd_orientation];
      // osd_margin ends up as the smallest score gap between the best
      // orientation and any other, capped at twice min_orientation_margin.
      double osd_margin = min_orientation_margin * 2;
      for (int i = 0; i < 4; ++i) {
        if (i != osd_orientation &&
            osd_score - osr->orientations[i] < osd_margin) {
          osd_margin = osd_score - osr->orientations[i];
        }
      }
      int best_script_id = osr->best_result.script_id;
      const char* best_script_str =
          osd_tess->unicharset.get_script_from_script_id(best_script_id);
      // The script counts as CJK if its id matches one of the han/hiragana/
      // katakana ids or its name is one of the listed script names.
      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
          best_script_id == osd_tess->unicharset.hiragana_sid() ||
          best_script_id == osd_tess->unicharset.katakana_sid() ||
          strcmp("Japanese", best_script_str) == 0 ||
          strcmp("Korean", best_script_str) == 0 ||
          strcmp("Hangul", best_script_str) == 0;
      if (cjk) {
        finder->set_cjk_script(true);
      }
      if (osd_margin < min_orientation_margin) {
        // The margin is weak.
        if (!cjk && !vertical_text && osd_orientation == 2) {
          // upside down latin text is improbable with such a weak margin.
          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
                  "Don't rotate.\n", osd_margin);
          osd_orientation = 0;
        } else {
          tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
                  "but using orientation anyway: %d\n",
                  osd_margin, osd_blobs.length(), osd_orientation);
        }
      }
    }
    // Empty the OSD blob list before the orientation is corrected.
    osd_blobs.shallow_clear();
    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
  }

  return finder;
}
Beispiel #2
0
/*
 * Fetch the struct timex located at ADDR in the traced process and print its
 * fields in "{name=value, ...}" form.
 *
 * When more than one personality is supported and the current word size is 4,
 * the work is delegated to tprint_timex32().
 *
 * Returns 0 after printing, or -1 if the structure cannot be read with
 * umove() (nothing is printed in that case).
 */
static int
tprint_timex(struct tcb *tcp, long addr)
{
	struct timex tx;

#if SUPPORTED_PERSONALITIES > 1
	if (current_wordsize == 4)
		return tprint_timex32(tcp, addr);
#endif
	if (umove(tcp, addr, &tx) < 0)
		return -1;

#if LINUX_VERSION_CODE < 66332
	/* Legacy field names used by old kernel headers. */
	tprintf("{mode=%d, offset=%ld, frequency=%ld, ",
		tx.mode, tx.offset, tx.frequency);
	/* %lu needs an unsigned long argument; cast so the types agree. */
	tprintf("maxerror=%ld, esterror=%lu, status=%u, ",
		tx.maxerror, (unsigned long) tx.esterror, tx.status);
	tprintf("time_constant=%ld, precision=%lu, ",
		tx.time_constant, (unsigned long) tx.precision);
	tprintf("tolerance=%ld, time=", tx.tolerance);
	tprint_timeval(tcp, &tx.time);
#else
	tprints("{modes=");
	printflags(adjtimex_modes, tx.modes, "ADJ_???");
	tprintf(", offset=%ld, freq=%ld, maxerror=%ld, ",
		(long) tx.offset, (long) tx.freq, (long) tx.maxerror);
	/* %lu needs an unsigned long argument; cast so the types agree. */
	tprintf("esterror=%lu, status=", (unsigned long) tx.esterror);
	printflags(adjtimex_status, tx.status, "STA_???");
	tprintf(", constant=%ld, precision=%lu, ",
		(long) tx.constant, (unsigned long) tx.precision);
	tprintf("tolerance=%ld, time=", (long) tx.tolerance);
	tprint_timeval(tcp, &tx.time);
	tprintf(", tick=%ld, ppsfreq=%ld, jitter=%ld",
		(long) tx.tick, (long) tx.ppsfreq, (long) tx.jitter);
	tprintf(", shift=%d, stabil=%ld, jitcnt=%ld",
		tx.shift, (long) tx.stabil, (long) tx.jitcnt);
	tprintf(", calcnt=%ld, errcnt=%ld, stbcnt=%ld",
		(long) tx.calcnt, (long) tx.errcnt, (long) tx.stbcnt);
#endif
	tprints("}");
	return 0;
}
// Accumulates the errors from the classifier results on a single sample.
// Returns true only if debug is true and the sample ended up marked as an
// error (a reject, or an error of the type selected by boosting_mode); in
// that case the results are printed and the score histograms are NOT updated.
// Returns false in every other case.
// boosting_mode selects the type of error to be used for boosting and the
// is_error_ member of sample is set according to whether the required type
// of error occurred. The font_table provides access to font properties
// for error counting. results is indexed from 0 as the top choice.
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
                                    const FontInfoTable& font_table,
                                    const GenericVector<UnicharRating>& results,
                                    TrainingSample* sample) {
  int num_results = results.size();
  // Index in results of the first entry with the correct unichar, or -1.
  int answer_actual_rank = -1;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
  if (num_results == 0) {
    // Reject. We count rejects as a separate category, but still mark the
    // sample as an error in case any training module wants to use that to
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
  } else {
    // Find rank of correct unichar answer, using rating_epsilon_ to allow
    // different answers to score as equal. (Ignoring the font.)
    int epsilon_rank = 0;
    int answer_epsilon_rank = -1;
    int num_top_answers = 0;
    double prev_rating = results[0].rating;
    bool joined = false;
    bool broken = false;
    int res_index = 0;
    while (res_index < num_results) {
      // A new epsilon rank starts only when the rating drops by more than
      // rating_epsilon_ below the rating that opened the current rank.
      if (results[res_index].rating < prev_rating - rating_epsilon_) {
        ++epsilon_rank;
        prev_rating = results[res_index].rating;
      }
      // Only the first occurrence of the correct unichar is recorded.
      if (results[res_index].unichar_id == unichar_id &&
          answer_epsilon_rank < 0) {
        answer_epsilon_rank = epsilon_rank;
        answer_actual_rank = res_index;
      }
      // Joined/broken special codes are flagged and, because of the else-if
      // chain, are not counted among the top answers.
      if (results[res_index].unichar_id == UNICHAR_JOINED &&
          unicharset_.has_special_codes())
        joined = true;
      else if (results[res_index].unichar_id == UNICHAR_BROKEN &&
               unicharset_.has_special_codes())
        broken = true;
      else if (epsilon_rank == 0)
        ++num_top_answers;
      ++res_index;
    }
    if (answer_actual_rank != 0) {
      // Correct result is not absolute top.
      ++font_counts_[font_id].n[CT_UNICHAR_TOPTOP_ERR];
      if (boosting_mode == CT_UNICHAR_TOPTOP_ERR) sample->set_is_error(true);
    }
    if (answer_epsilon_rank == 0) {
      ++font_counts_[font_id].n[CT_UNICHAR_TOP_OK];
      // Unichar OK, but count if multiple unichars.
      if (num_top_answers > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
        ++multi_unichar_counts_[unichar_id];
      }
      // Check to see if any font in the top choice has attributes that match.
      // TODO(rays) It is easy to add counters for individual font attributes
      // here if we want them.
      if (font_table.SetContainsFontProperties(
          font_id, results[answer_actual_rank].fonts)) {
        // Font attributes were matched.
        // Check for multiple properties.
        if (font_table.SetContainsMultipleFontProperties(
            results[answer_actual_rank].fonts))
          ++font_counts_[font_id].n[CT_OK_MULTI_FONT];
      } else {
        // Font attributes weren't matched.
        ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
      }
    } else {
      // This is a top unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true);
      // Count maps from unichar id to wrong unichar id.
      ++unichar_counts_(unichar_id, results[0].unichar_id);
      if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true);
      }
      if (answer_epsilon_rank < 0) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true);
        // The answer was absent, so the last epsilon rank reached is used as
        // its rank for the CT_RANK sum below.
        answer_epsilon_rank = epsilon_rank;
      }
    }
    // Compute mean number of return values and mean rank of correct answer.
    font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
    font_counts_[font_id].n[CT_RANK] += answer_epsilon_rank;
    if (joined)
      ++font_counts_[font_id].n[CT_OK_JOINED];
    if (broken)
      ++font_counts_[font_id].n[CT_OK_BROKEN];
  }
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
    if (debug) {
      tprintf("%d results for char %s font %d :",
              num_results, unicharset_.id_to_unichar(unichar_id),
              font_id);
      for (int i = 0; i < num_results; ++i) {
        tprintf(" %.3f : %s\n",
                results[i].rating,
                unicharset_.id_to_unichar(results[i].unichar_id));
      }
      // Early exit: bad_score_hist_ is not updated on this path.
      return true;
    }
    // Histogram of the top rating (as a rounded percentage) for errors;
    // a reject contributes 0.
    int percent = 0;
    if (num_results > 0)
      percent = IntCastRounded(results[0].rating * 100);
    bad_score_hist_.add(percent, 1);
  } else {
    // Histogram of the correct answer's rating (as a rounded percentage);
    // 0 is used if the correct answer was not among the results.
    int percent = 0;
    if (answer_actual_rank >= 0)
      percent = IntCastRounded(results[answer_actual_rank].rating * 100);
    ok_score_hist_.add(percent, 1);
  }
  return false;
}
/*
 * Read one reply from the slave socket and apply it to matching descriptors.
 *
 * The reply is parsed as two lines:
 *   line 1: "<host> <token>"
 *   line 2: "<host> <remote_port> , <local_port> : <token> : <os> : <userid>"
 * Line 1 updates d->addr (when mudconf.use_hostname is set) for every
 * descriptor whose current addr equals <host>.  Line 2 sets d->username on
 * the first descriptor whose remote port matches, after which processing
 * stops.  A_LASTSITE is refreshed on connected players in both steps.
 *
 * Returns -1 if nothing could be read (on a read error other than
 * EAGAIN/EWOULDBLOCK the slave socket is closed and marked invalid), and 0
 * once a reply has been read, whether or not it parsed.
 */
static int get_slave_result()
{
	char *buf;
	char *token;
	char *os;
	char *userid;
	char *host;
	int local_port, remote_port;
	char *p;
	DESC *d;
	int len;

	buf = alloc_lbuf("slave_buf");

	len = read(slave_socket, buf, LBUF_SIZE - 1);
	if (len < 0) {
		if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
			free_lbuf(buf);
			return (-1);
		}
		close(slave_socket);
		slave_socket = -1;
		free_lbuf(buf);
		return (-1);
	} else if (len == 0) {
		free_lbuf(buf);
		return (-1);
	}
	buf[len] = '\0';

	token = alloc_lbuf("slave_token");
	os = alloc_lbuf("slave_os");
	userid = alloc_lbuf("slave_userid");
	host = alloc_lbuf("slave_host");

	if (sscanf(buf, "%s %s",
		   host, token) != 2)
		goto done;

	/* A reply without a line break has no second line; do not
	 * dereference the NULL that strchr() returns in that case. */
	p = strchr(buf, '\n');
	if (p == NULL)
		goto done;
	*p = '\0';
	for (d = descriptor_list; d; d = d->next) {
		if (strcmp(d->addr, host))
			continue;
		if (mudconf.use_hostname) {
			StringCopyTrunc(d->addr, token, 50);
			d->addr[50] = '\0';
			if (d->player != 0) {
				if (d->username[0])
					atr_add_raw(d->player, A_LASTSITE, tprintf("%s@%s",
						     d->username, d->addr));
				else
					atr_add_raw(d->player, A_LASTSITE, d->addr);
			}
		}
	}

	if (sscanf(p + 1, "%s %d , %d : %s : %s : %s",
		   host,
		   &remote_port, &local_port,
		   token, os, userid) != 6)
		goto done;

	for (d = descriptor_list; d; d = d->next) {
		if (ntohs((d->address).sin_port) != remote_port)
			continue;
		StringCopyTrunc(d->username, userid, 10);
		d->username[10] = '\0';
		if (d->player != 0) {
			atr_add_raw(d->player, A_LASTSITE, tprintf("%s@%s",
						     d->username, d->addr));
		}
		/* Only the first descriptor with this port is updated. */
		break;
	}

done:
	free_lbuf(buf);
	free_lbuf(token);
	free_lbuf(os);
	free_lbuf(userid);
	free_lbuf(host);
	return (0);
}
Beispiel #5
0
/*
 * Print the timeval32 pointed to by TV as "{<tv_sec>, <tv_usec>}", with both
 * fields formatted as unsigned decimals.  TV is dereferenced directly, so it
 * must point to a structure already available locally.  TCP is not used by
 * this function.
 */
static void
tprint_timeval32(struct tcb *tcp, const struct timeval32 *tv)
{
	tprintf("{%u, %u}", tv->tv_sec, tv->tv_usec);
}
Beispiel #6
0
// Returns a complexity score for the given outline: the number of outlines
// found inside it plus a weighted, recursively computed score of their own
// children. The search gives up early (returning a value above max_count)
// when the recursion gets deeper than edges_max_children_layers, when the
// number of children exceeds edges_max_children_per_outline, or as soon as
// the running total exceeds max_count.
inT32 OL_BUCKETS::outline_complexity(
                                     C_OUTLINE *outline,   // parent outline
                                     inT32 max_count,      // max output
                                     inT16 depth           // recursion depth
                                    ) {
  // Range of buckets overlapped by the parent's bounding box.
  const TBOX parent_box = outline->bounding_box();
  const inT16 first_col = (parent_box.left() - bl.x()) / BUCKETSIZE;
  const inT16 last_col = (parent_box.right() - bl.x()) / BUCKETSIZE;
  const inT16 first_row = (parent_box.bottom() - bl.y()) / BUCKETSIZE;
  const inT16 last_row = (parent_box.top() - bl.y()) / BUCKETSIZE;

  // Stop descending once the nesting is too deep.
  if (++depth > edges_max_children_layers)
    return max_count + depth;

  inT32 num_children = 0;        // direct children seen so far
  inT32 num_grandchildren = 0;   // weighted complexity of those children
  C_OUTLINE_IT bucket_it;        // iterator over the current bucket

  for (inT16 row = first_row; row <= last_row; row++) {
    for (inT16 col = first_col; col <= last_col; col++) {
      bucket_it.set_to_list(&buckets[row * bxdim + col]);
      if (!bucket_it.empty()) {
        for (bucket_it.mark_cycle_pt(); !bucket_it.cycled_list();
             bucket_it.forward()) {
          C_OUTLINE *candidate = bucket_it.data();
          // Only outlines other than the parent that compare as inside it
          // count as children.
          const bool is_child =
              candidate != outline && (*candidate < *outline);
          if (is_child) {
            num_children++;

            // Too fragmented: bail out immediately.
            if (num_children > edges_max_children_per_outline) {
              if (edges_debug)
                tprintf("Discard outline on child_count=%d > "
                        "max_children_per_outline=%d\n",
                        num_children,
                        static_cast<inT32>(edges_max_children_per_outline));
              return max_count + num_children;
            }

            // Recurse into the child with whatever budget is left.
            const inT32 budget_left =
                max_count - num_children - num_grandchildren;
            if (budget_left > 0)
              num_grandchildren += edges_children_per_grandchild *
                                   outline_complexity(candidate, budget_left,
                                                      depth);

            // Too complex: report the total reached so far.
            if (num_children + num_grandchildren > max_count) {
              if (edges_debug)
                tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
                        "> max_count=%d\n",
                        num_children, num_grandchildren, max_count);
              return num_children + num_grandchildren;
            }
          }
        }
      }
    }
  }
  return num_children + num_grandchildren;
}
Beispiel #7
0
int
sys_query_module(struct tcb *tcp)
{
	if (entering(tcp)) {
		printstr(tcp, tcp->u_arg[0], -1);
		tprintf(", ");
		printxval(which, tcp->u_arg[1], "QM_???");
		tprintf(", ");
	} else {
		size_t ret;

		if (!verbose(tcp) || syserror(tcp) ||
		    umove(tcp, tcp->u_arg[4], &ret) < 0) {
			tprintf("%#lx, %lu, %#lx", tcp->u_arg[2],
				tcp->u_arg[3], tcp->u_arg[4]);
		} else if (tcp->u_arg[1]==QM_INFO) {
			struct module_info	mi;
			if (umove(tcp, tcp->u_arg[2], &mi) < 0) {
				tprintf("%#lx, ", tcp->u_arg[2]);
			} else {
				tprintf("{address=%#lx, size=%lu, flags=",
					mi.addr, mi.size);
				printflags(modflags, mi.flags, "MOD_???");
				tprintf(", usecount=%lu}, ", mi.usecount);
			}
			tprintf("%Zu", ret);
		} else if ((tcp->u_arg[1]==QM_MODULES) ||
			   (tcp->u_arg[1]==QM_DEPS) ||
			   (tcp->u_arg[1]==QM_REFS)) {
			tprintf("{");
			if (!abbrev(tcp)) {
				char*	data	= malloc(tcp->u_arg[3]);
				char*	mod	= data;
				size_t	idx;

				if (!data) {
					fprintf(stderr, "out of memory\n");
					tprintf(" /* %Zu entries */ ", ret);
				} else {
					if (umoven(tcp, tcp->u_arg[2],
						tcp->u_arg[3], data) < 0) {
						tprintf(" /* %Zu entries */ ", ret);
					} else {
						for (idx=0; idx<ret; idx++) {
							tprintf("%s%s",
								(idx ? ", " : ""),
								mod);
							mod += strlen(mod)+1;
						}
					}
					free(data);
				}
			} else
				tprintf(" /* %Zu entries */ ", ret);
			tprintf("}, %Zu", ret);
		} else if (tcp->u_arg[1]==QM_SYMBOLS) {
			tprintf("{");
			if (!abbrev(tcp)) {
				char*			data	= malloc(tcp->u_arg[3]);
				struct module_symbol*	sym	= (struct module_symbol*)data;
				size_t			idx;

				if (!data) {
					fprintf(stderr, "out of memory\n");
					tprintf(" /* %Zu entries */ ", ret);
				} else {
					if (umoven(tcp, tcp->u_arg[2],
						tcp->u_arg[3], data) < 0) {
						tprintf(" /* %Zu entries */ ", ret);
					} else {
						for (idx=0; idx<ret; idx++) {
							tprintf("%s{name=%s, value=%lu}",
								(idx ? " " : ""),
								data+(long)sym->name,
								sym->value);
							sym++;
						}
					}
					free(data);
				}
			} else
				tprintf(" /* %Zu entries */ ", ret);
			tprintf("}, %Zd", ret);
		} else {
			printstr(tcp, tcp->u_arg[2], tcp->u_arg[3]);
			tprintf(", %#lx", tcp->u_arg[4]);
		}
	}
	return 0;
}
/*
 * Create a model input file from a template file by substituting parameter
 * keywords with current parameter values.
 *
 *   npar     number of parameters
 *   par_id   parameter keywords (npar strings)
 *   par      parameter values (npar doubles)
 *   fn_in_t  template file to read
 *   fn_out   model input file to (re)create
 *   debug    verbosity level (0 = quiet)
 *
 * The first template line selects the format: a first word containing "ptf"
 * or "template" means the second word supplies the keyword separator
 * character; otherwise the file is rewound and '#' is used.  On every
 * following line the text between separators alternates between literal
 * text and parameter keywords.  A keyword padded with more than two
 * blanks requests that the original field width be used as the precision.
 *
 * Returns 0 on success and -1 if a file cannot be opened or a keyword does
 * not match any entry of par_id.
 */
int par_tpl( int npar, char **par_id, double *par, char *fn_in_t, char *fn_out, int debug )
{
	FILE *in, *out;
	const char *sep = " \t\n";
	/* token defaults to "#" so it is never read uninitialized */
	char *word, token[2] = { '#', 0 }, number[80], buf[1000], *pnt_inst;
	int i, l, l2, c, start, space = 0, bad_data = 0, preserve;
	if( ( in = fopen( fn_in_t, "r" ) ) == NULL )
	{
		tprintf( "\n\nERROR: File %s cannot be opened to read template data!\n", fn_in_t );
		return( -1 );
	}
	if( debug ) tprintf( "Remove files for model inputs: %s\n", fn_out );
	remove( fn_out );
	if( ( out = fopen( fn_out, "w" ) ) == NULL )
	{
		tprintf( "\n\nERROR: File %s cannot be opened to write data!\n", fn_out );
		fclose( in ); // do not leak the template file handle
		return( -1 );
	}
	if( debug ) tprintf( "\nCreating model input file \'%s\' for external model execution using template file \'%s\'.\n", fn_out, fn_in_t );
	if( fgets( buf, 1000, in ) == NULL ) buf[0] = 0; // empty template: nothing to tokenize
	pnt_inst = &buf[0];
	for( c = 0, word = strtok_r( buf, sep, &pnt_inst ); word; c++, word = strtok_r( NULL, sep, &pnt_inst ) )
	{
		if( c == 0 ) // first entry
		{
			white_trim( word );
			if( strcasestr( word, "ptf" ) )
			{
				if( debug ) tprintf( "PEST Template file\n" );
			}
			else if( strcasestr( word, "template" ) )
			{
				if( debug ) tprintf( "MADS Template file; user-specified parameter token is expected\n" );
			}
			else
			{
				if( debug ) tprintf( "MADS Template file\n" );
				rewind( in );
				token[0] = '#';
				break; // quit the loop; done
			}
		}
		if( c == 1 ) // second entry in the case of PEST Template file
		{
			white_trim( word );
			// set the separator first so the warning reports the character actually assumed
			token[0] = word[0];
			if( token[0] == 0 ) token[0] = '#';
			if( strlen( word ) > 1 )
				tprintf( "WARNING: expecting a single character as parameter keyword separator on the first line of template file (\'%s\'; assumed \'%s\')\n", word, token );
			break;
		}
	}
	token[1] = 0;
	if( debug > 1 ) tprintf( "Parameter separator: %s\n", token );
	while( !feof( in ) )
	{
		if( fgets( buf, 1000, in ) == NULL ) { if( debug > 1 ) tprintf( "END of template file.\n" ); break; }
		l = strlen( buf );
		if( l > 0 && buf[l - 1] == '\n' ) buf[l - 1] = 0; // remove 'new line' character only if present
		if( buf[0] == token[0] ) start = 0; else start = 1; // if first character is a token it will be not considered a separator
		space = 0;
		pnt_inst = &buf[0];
		for( c = 0, word = strtok_r( buf, token, &pnt_inst ); word; c++, word = strtok_r( NULL, token, &pnt_inst ) ) // separation between the tokens is expected; e.g. "# a   # space # b  #"
		{
			if( c % 2 == start )
			{
				if( debug ) tprintf( "Parameter keyword \'%s\' ", word );
				l = strlen( word ); // field width including padding
				white_skip( &word );
				white_trim( word );
				l2 = strlen( word ); // keyword length without padding
				if( l > ( l2 + 2 ) ) preserve = 1;
				else preserve = 0;
				for( i = 0; i < npar; i++ )
				{
					if( strcmp( word, par_id[i] ) == 0 )
					{
						if( preserve == 1 )
						{
							// snprintf: a wide field must not overflow number[]
							if( par[i] > 0 ) snprintf( number, sizeof( number ), "%.*g", l - 1, par[i] );
							else snprintf( number, sizeof( number ), "%.*g", l - 2, par[i] );
							l2 = strlen( number );
							if( l2 > l ) tprintf( "WARNING: The parameter does not fit the requested field (%s length %d > %d)!\n", number, l2, l );
						}
						else
							snprintf( number, sizeof( number ), "%.15g", par[i] );
						if( space ) fprintf( out, " %s", number );
						else { space = 0; fprintf( out, "%s", number ); } // TODO originally was space = 1
						if( debug ) tprintf( "is replaced with \'%s\'\n", number );
						break;
					}
				}
				if( i == npar )
				{
					if( debug )	tprintf( "ERROR: does not match defined model parameters!!!\n" );
					else tprintf( "\nERROR: Parameter keyword \'%s\' in template file \'%s\' does not match defined model parameters!\n", word, fn_in_t );
					bad_data = 1;
				}
			}
			else
			{
				if( space ) fprintf( out, " %s", word );
				else { space = 0; fprintf( out, "%s", word ); }  // TODO originally was space = 1
			}
		}
		fprintf( out, "\n" );
	}
	fclose( in ); fclose( out );
	if( bad_data == 1 ) return( -1 );
	else return( 0 );
}
Beispiel #9
0
void AssociateUtils::ComputeStats(int col, int row,
                                  const AssociateStats *parent_stats,
                                  int parent_path_length,
                                  bool fixed_pitch,
                                  float max_char_wh_ratio,
                                  const DENORM *denorm,
                                  CHUNKS_RECORD *chunks_record,
                                  int debug_level,
                                  AssociateStats *stats) {
  stats->Clear();

  if (debug_level > 0) {
    tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n",
            col, row, fixed_pitch ? " (fixed pitch)" : "");
  }
  float normalizing_height = BASELINE_SCALE;
  // TODO(rays/daria) Can unicharset.script_has_xheight be useful here?
  if (fixed_pitch && denorm != NULL && denorm->row() != NULL) {
    // For fixed pitch language like CJK, we use the full text height
    // as the normalizing factor so we are not dependent on xheight
    // calculation.
    if (denorm->row()->body_size() > 0.0f) {
      normalizing_height = denorm->y_scale() * denorm->row()->body_size();
    } else {
      normalizing_height = denorm->y_scale() *
          (denorm->row()->x_height() + denorm->row()->ascenders());
    }
    if (debug_level > 0) {
      tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n",
              normalizing_height, denorm->y_scale(), denorm->row()->x_height(),
              denorm->row()->ascenders());
    }
  }
  float wh_ratio =
    GetChunksWidth(chunks_record->chunk_widths, col, row) / normalizing_height;
  if (debug_level) tprintf("wh_ratio %g\n", wh_ratio);
  if (wh_ratio > max_char_wh_ratio) stats->bad_shape = true;
  if (fixed_pitch) {
    bool end_row = (row == (chunks_record->ratings->dimension() - 1));

    // Ensure that the blob has gaps on the left and the right sides
    // (except for beginning and ending punctuation) and that there is
    // no cutting through ink at the blob boundaries.
    if (col > 0) {
      float left_gap =
        GetChunksGap(chunks_record->chunk_widths, col-1) / normalizing_height;
      SEAM *left_seam =
        static_cast<SEAM *>(array_value(chunks_record->splits, col-1));
      if (debug_level) {
        tprintf("left_gap %g, left_seam %g\n", left_gap, left_seam->priority);
      }
      if ((!end_row && left_gap < kMinGap) || left_seam->priority > 0.0f) {
        stats->bad_shape = true;
      }
    }
    float right_gap = 0.0f;
    if (!end_row) {
      right_gap =
        GetChunksGap(chunks_record->chunk_widths, row) / normalizing_height;
      SEAM *right_seam =
        static_cast<SEAM *>(array_value(chunks_record->splits, row));
      if (debug_level) {
        tprintf("right_gap %g right_seam %g\n",
                right_gap, right_seam->priority);
      }
      if (right_gap < kMinGap || right_seam->priority > 0.0f) {
        stats->bad_shape = true;
        if (right_gap < kMinGap) stats->bad_fixed_pitch_right_gap = true;
      }
    }

    // Impose additional segmentation penalties if blob widths or gaps
    // distribution don't fit a fixed-pitch model.
    // Since we only know the widths and gaps of the path explored so far,
    // the means and variances are computed for the path so far (not
    // considering characters to the right of the last character on the path).
    stats->full_wh_ratio = wh_ratio + right_gap;
    if (parent_stats != NULL) {
      stats->full_wh_ratio_total =
        (parent_stats->full_wh_ratio_total + stats->full_wh_ratio);
      float mean =
        stats->full_wh_ratio_total / static_cast<float>(parent_path_length+1);
      stats->full_wh_ratio_var =
        parent_stats->full_wh_ratio_var + pow(mean-stats->full_wh_ratio, 2);
    } else {
      stats->full_wh_ratio_total = stats->full_wh_ratio;
    }
    if (debug_level) {
      tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n",
              stats->full_wh_ratio, stats->full_wh_ratio_total,
              stats->full_wh_ratio_var);
    }

    stats->shape_cost =
      FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio);

    // For some reason Tesseract prefers to treat the whole CJ words
    // as one blob when the initial segmentation is particularly bad.
    // This hack is to avoid favoring such states.
    if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) {
      stats->shape_cost += 10;
    }
    stats->shape_cost += stats->full_wh_ratio_var;
    if (debug_level) tprintf("shape_cost %g\n", stats->shape_cost);
  }
}
Beispiel #10
0
/*
 * Load a PEST control file into the problem description held by op.
 *
 * Fills the parameter data (op->pd), observation data (op->od), external
 * model data (op->ed: command line, template and instruction file names)
 * and parts of the calculation data (op->cd).  Progress is reported with
 * tprintf().
 *
 * Returns  1 on success,
 *          0 if duplicate parameter or observation names are found,
 *         -1 if the file cannot be opened.
 *
 * Name fields read from the file are limited to the sizes of the buffers
 * they are stored in (49 characters for parameter and observation names,
 * 19 for the transformation code, 254 for file names).
 */
int load_pst( char *filename, struct opt_data *op )
{
	FILE *in;
	double d;
	char code[20] = "", buf[1000];
	/* group counts start at 0 so a malformed header cannot leave them undefined */
	int i, j, k, npar_groups = 0, nobs_groups = 0, bad_data = 0;
	size_t len;
	struct calc_data *cd;
	struct param_data *pd;
	struct obs_data *od;
	struct extrn_data *ed;
	cd = op->cd;
	pd = op->pd;
	od = op->od;
	ed = op->ed;
	pd->nParam = pd->nFlgParam = pd->nOptParam = 0;
	od->nTObs = od->nCObs = od->nObs = 0;
	ed->ntpl = ed->nins = 0;
	bad_data = 0;
	op->gd->min_t = op-> gd->time = 0;
	if( ( in = fopen( filename, "r" ) ) == NULL )
	{
		tprintf( "PEST control file %s cannot be opened to read problem data!\n", filename );
		return( -1 );
	}
	cd->opt_method = ( char * ) malloc( 50 * sizeof( char ) );
	cd->solution_id = ( char * ) malloc( 50 * sizeof( char ) );
	strcpy( cd->solution_id, "external" );
	cd->num_sources = 1;
	cd->solution_type = ( int * ) malloc( sizeof( int ) ); // allocated once only
	cd->solution_type[0] = EXTERNAL;
	for( i = 0; i < 4; i++ ) // skip 4 lines
		fgets( buf, 1000, in );
	sscanf( buf, "%d %d %d %*d %d", &pd->nParam, &od->nTObs, &npar_groups, &nobs_groups );
	tprintf( "Parameters = %d (groups %d)\n", pd->nParam, npar_groups );
	tprintf( "Observations = %d (groups %d)\n", od->nTObs, nobs_groups );
	od->nObs = od->nCObs = od->nTObs;
	fgets( buf, 1000, in );
	sscanf( buf, "%d %d", &ed->ntpl, &ed->nins );
	tprintf( "Number of template files = %d\nNumber of instruction files = %d\n", ed->ntpl, ed->nins );
	pd->var_name = char_matrix( pd->nParam, 50 );
	pd->var_id = char_matrix( pd->nParam, 50 );
	pd->var = ( double * ) malloc( pd->nParam * sizeof( double ) );
	pd->var_current = ( double * ) malloc( pd->nParam * sizeof( double ) );
	pd->var_best = ( double * ) malloc( pd->nParam * sizeof( double ) );
	cd->var = ( double * ) malloc( pd->nParam * sizeof( double ) );
	pd->var_opt = ( int * ) malloc( pd->nParam * sizeof( int ) );
	pd->var_log = ( int * ) malloc( pd->nParam * sizeof( int ) );
	pd->var_dx = ( double * ) malloc( pd->nParam * sizeof( double ) );
	pd->var_min = ( double * ) malloc( pd->nParam * sizeof( double ) );
	pd->var_max = ( double * ) malloc( pd->nParam * sizeof( double ) );
	pd->var_range = ( double * ) malloc( pd->nParam * sizeof( double ) );
	tprintf( "Parameters = %d:\n", pd->nParam );
	for( i = 0; i < 6; i++ ) // skip 6 lines
		fgets( buf, 1000, in );
	for( i = 0; i < npar_groups; i++ ) // skip parameter group definitions
		fgets( buf, 1000, in );
	fgets( buf, 1000, in );
	pd->nFlgParam = 0;
	pd->nOptParam = 0;
	for( i = 0; i < pd->nParam; i++ )
	{
		// field widths keep the name within its 50-char row and the code within code[20]
		fscanf( in, "%49s %19s %*s %lf %lf %lf %*s %*f %*f %*f\n", pd->var_id[i], code, &pd->var[i], &pd->var_min[i], &pd->var_max[i] );
		strcpy( pd->var_name[i], pd->var_id[i] );
		tprintf( "%-27s: init %15.12g min %12g max %12g\n", pd->var_name[i], pd->var[i], pd->var_min[i], pd->var_max[i] );
		if( strcmp( code, "fixed" ) == 0 ) pd->var_opt[i] = 0; else { pd->nOptParam++; pd->var_opt[i] = 1; }
		if( strcmp( code, "log" ) == 0 ) pd->var_log[i] = 1; else pd->var_log[i] = 0;
		if( pd->var_log[i] == 1 ) // log-transformed parameters are stored in log10 space
		{
			pd->var[i] = log10( pd->var[i] );
			pd->var_min[i] = log10( pd->var_min[i] );
			pd->var_max[i] = log10( pd->var_max[i] );
		}
		pd->var_range[i] = pd->var_max[i] - pd->var_min[i];
		pd->var_dx[i] = pd->var_range[i] / 10;
	}
	pd->var_index = ( int * ) malloc( pd->nOptParam * sizeof( int ) );
	tprintf( "Optimized parameters = %d\n", pd->nOptParam );
	for( k = i = 0; i < pd->nParam; i++ )
		if( pd->var_opt[i] == 1 )
		{
			// NOTE(review): var[i] was already log10-transformed above, so this applies log10 a second time for display — confirm intent
			if( pd->var_log[i] == 1 ) d = log10( pd->var[i] ); else d = pd->var[i];
			tprintf( "%-27s: init %15.12g min %12g max %12g\n", pd->var_name[i], d, pd->var_min[i], pd->var_max[i] );
			pd->var_index[k++] = i;
		}
	for( i = 0; i < pd->nParam; i++ )
		for( j = i + 1; j < pd->nParam; j++ )
			if( strcmp( pd->var_name[i], pd->var_name[j] ) == 0 )
			{
				tprintf( "ERROR: Parameter names #%i (%s) and #%i (%s) are identical!\n", i + 1, pd->var_name[i], j + 1, pd->var_name[j] );
				bad_data = 1;
			}
	if( bad_data ) { fclose( in ); return( 0 ); } // close the control file before bailing out
	fgets( buf, 1000, in ); // skip line
	for( i = 0; i < nobs_groups; i++ ) // skip observation group definitions
		fgets( buf, 1000, in );
	fgets( buf, 1000, in ); // skip line
	od->obs_id = char_matrix( od->nTObs, 50 );
	od->obs_target = ( double * ) malloc( od->nTObs * sizeof( double ) );
	od->obs_weight = ( double * ) malloc( od->nTObs * sizeof( double ) );
	od->obs_min = ( double * ) malloc( od->nTObs * sizeof( double ) );
	od->obs_max = ( double * ) malloc( od->nTObs * sizeof( double ) );
	od->obs_current = ( double * ) malloc( od->nTObs * sizeof( double ) );
	od->obs_best = ( double * ) malloc( od->nTObs * sizeof( double ) );
	od->res = ( double * ) malloc( od->nTObs * sizeof( double ) );
	od->obs_log = ( int * ) malloc( od->nTObs * sizeof( int ) );
	for( i = 0; i < od->nTObs; i++ )
		fscanf( in, "%49s %lf %lf %*s\n", od->obs_id[i], &od->obs_target[i], &od->obs_weight[i] );
	tprintf( "Calibration targets = %d\n", od->nTObs );
	for( i = 0; i < od->nTObs; i++ )
	{
		// long lists are abbreviated: only the head and the tail are printed
		if( od->nTObs < 50 || ( i < 20 || i > od->nTObs - 20 ) )
			tprintf( "%-13s: value %15.12g weight %g\n", od->obs_id[i], od->obs_target[i], od->obs_weight[i] );
		if( od->nTObs > 50 && i == 21 ) tprintf( "...\n" );
		od->obs_min[i] = 0; od->obs_max[i] = od->obs_target[i] * 2;
		od->obs_log[i] = 0;
	}
	if( od->nObs < 10000 || cd->problem_type == CHECK || cd->debug > 10 )
	{
		tprintf( "Checking for duplicate observations ... \n" );
		if( od->nObs >= 10000 ) tprintf( "WARNING: The number of observations is large (%d); this may take a long time ... \n", od->nObs );
		for( i = 0; i < od->nTObs; i++ )
			for( j = i + 1; j < od->nTObs; j++ )
				if( strcmp( od->obs_id[i], od->obs_id[j] ) == 0 )
				{
					tprintf( "ERROR: Observation names #%i (%s) and #%i (%s) are identical!\n", i + 1, od->obs_id[i], j + 1, od->obs_id[j] );
					bad_data = 1;
				}
	}
	if( bad_data ) { fclose( in ); return( 0 ); } // close the control file before bailing out
	fgets( buf, 1000, in ); // skip line
	ed->cmdline = ( char * ) malloc( 255 * sizeof( char ) );
	ed->cmdline[0] = 0; // stays empty if the line cannot be read
	fgets( ed->cmdline, 255, in );
	len = strlen( ed->cmdline );
	if( len > 0 && ed->cmdline[len - 1] == '\n' ) ed->cmdline[len - 1] = 0; // strip the line break only if present
	tprintf( "Execution command: %s\n", ed->cmdline );
	tprintf( "External files:\n" );
	ed->fn_ins = char_matrix( ed->nins, 255 );
	ed->fn_obs = char_matrix( ed->nins, 255 );
	ed->fn_tpl = char_matrix( ed->ntpl, 255 );
	ed->fn_out = char_matrix( ed->ntpl, 255 );
	fgets( buf, 1000, in ); // skip line
	for( i = 0; i < ed->ntpl; i++ )
		fscanf( in, "%254s %254s\n", ed->fn_tpl[i], ed->fn_out[i] );
	tprintf( "- to provide current model parameters:\n" );
	for( i = 0; i < ed->ntpl; i++ )
		tprintf( "%s -> %s\n", ed->fn_tpl[i], ed->fn_out[i] );
	for( i = 0; i < ed->nins; i++ )
		fscanf( in, "%254s %254s\n", ed->fn_ins[i], ed->fn_obs[i] );
	tprintf( "- to read current model predictions:\n" );
	for( i = 0; i < ed->nins; i++ )
		tprintf( "%s <- %s\n", ed->fn_ins[i], ed->fn_obs[i] );
	fclose( in );
	tprintf( "\n" );
	return( 1 );
}
Beispiel #11
0
/*
 * Check that every parameter keyword in a template file matches a defined model parameter.
 *
 * The first line selects the file flavour:
 *   - first word contains "ptf"                 : PEST template file; the second word supplies the keyword token
 *   - first word contains "template" (any case) : MADS template file; the second word supplies the keyword token
 *   - anything else                             : MADS template file without a header line; the file is
 *                                                 rewound and the default token '#' is used
 * Each following line is split on the token character and every second piece is taken as a
 * parameter keyword, passed through white_skip()/white_trim() and compared with par_id[].
 *
 * npar      - number of entries in par_id / par_count
 * par_id    - names of the model parameters
 * par_count - per-parameter usage counter: set to 1 on a match when negative, incremented otherwise
 * fn_in_t   - name of the template file
 * debug     - non-zero prints progress messages (values above 1 print more)
 *
 * Returns 0 when all keywords match, -1 when the file cannot be opened or a keyword is unknown.
 */
int check_par_tpl( int npar, char **par_id, int *par_count, char *fn_in_t, int debug )
{
	FILE *in;
	const char *sep = " \t\n"; // White spaces
	char *word, token[2] = "#", buf[1000], *pnt_inst; // token starts as the default "#" so it is always a valid string
	int i, l, c, start = 0, bad_data = 0;
	if( ( in = fopen( fn_in_t, "r" ) ) == NULL )
	{
		tprintf( "\n\nERROR: File %s cannot be opened to read template data!\n", fn_in_t );
		return( -1 );
	}
	if( debug ) tprintf( "\nChecking the template file \'%s\'.\n", fn_in_t );
	if( fgets( buf, 1000, in ) == NULL ) buf[0] = 0; // empty file: nothing to tokenize, the default token stays in place
	pnt_inst = &buf[0];
	for( c = 0, word = strtok_r( buf, sep, &pnt_inst ); word; c++, word = strtok_r( NULL, sep, &pnt_inst ) )
	{
		if( c == 0 ) // first entry
		{
			white_trim( word );
			if( strstr( word, "ptf" ) )
			{
				if( debug ) tprintf( "PEST Template file\n" );
			}
			else if( strcasestr( word, "template" ) )
			{
				if( debug ) tprintf( "MADS Template file; user-specified parameter token is expected\n" );
			}
			else
			{
				if( debug ) tprintf( "MADS Template file\n" );
				rewind( in ); // no header line: the first line is template content and must be read again
				token[0] = '#'; // default token
				break; // quit the loop; done
			}
		}
		if( c == 1 ) // second entry: the user-specified token
		{
			white_trim( word );
			// Assign the token before the warning so that the message reports the value actually assumed
			token[0] = word[0];
			if( token[0] == 0 ) token[0] = '#';
			if( strlen( word ) > 1 )
				tprintf( "WARNING: expecting a single character as parameter keyword separator on the first line of template file (\'%s\'; assumed \'%s\')\n", word, token );
			break;
		}
	}
	token[1] = 0;
	if( debug ) tprintf( "Parameter separator: %s\n", token );
	while( !feof( in ) )
	{
		if( fgets( buf, 1000, in ) == NULL ) { if( debug > 1 ) tprintf( "END of template file.\n" ); break; }
		l = strlen( buf );
		if( l > 0 && buf[l - 1] == '\n' ) buf[l - 1] = 0; // strip the line terminator only; a last line without one keeps all its characters
		// strtok_r skips a leading token, so the keyword pieces are the even ones when the line starts with the token and the odd ones otherwise
		if( buf[0] == token[0] ) start = 0; else start = 1;
		pnt_inst = &buf[0];
		for( c = 0, word = strtok_r( buf, token, &pnt_inst ); word; c++, word = strtok_r( NULL, token, &pnt_inst ) ) // separation between the tokens is expected; e.g. space # b  #"
		{
			if( c % 2 == start )
			{
				if( debug ) tprintf( "Parameter keyword \'%s\' ", word );
				white_skip( &word );
				white_trim( word );
				for( i = 0; i < npar; i++ )
				{
					if( strcmp( word, par_id[i] ) == 0 )
					{
						if( debug ) tprintf( "will be replaced with the value of model parameter \'%s\'\n", par_id[i] );
						if( par_count[i] < 0 ) par_count[i] = 1;
						else par_count[i] += 1;
						break;
					}
				}
				if( i == npar ) // the loop ran to completion: no parameter carries this name
				{
					if( debug ) tprintf( "ERROR: does not match defined model parameters!!!\n" );
					else tprintf( "\nERROR: Parameter keyword \'%s\' in template file \'%s\' does not match defined model parameters!\n", word, fn_in_t );
					bad_data = 1;
				}
			}
		}
	}
	fclose( in );
	if( bad_data == 1 ) return( -1 );
	else return( 0 );
}
Beispiel #12
0
/*
 * Read model predictions from an output ("data") file as directed by an instruction file.
 *
 * The first instruction line selects the flavour:
 *   - first word contains "pif" (any case)         : PEST instruction file; tokens '@' and '!', no comment character
 *   - first word contains "instruction" (any case) : MADS instruction file; the following words may override
 *                                                    the search, observation and comment tokens
 *   - anything else                                : MADS instruction file without a header line; the file is
 *                                                    rewound and the defaults '@', '!' and '#' are used
 * Each following instruction line is processed against the data file:
 *   - a line starting with 'l' followed by a number reads that many data lines; otherwise one data line is read
 *   - <search-token>text<search-token> looks for text on the current and following data lines
 *   - a word starting with 'w' advances over a data field / white space
 *   - <obs-token>dum<obs-token> skips a data field
 *   - <obs-token>name<obs-token> reads the next data field as a number for the observation called name
 *   - a word starting with the comment character ends the instruction line
 *
 * nobs      - number of entries in obs_id / obs / obs_count
 * obs_id    - observation names
 * obs       - receives the values read; repeated readings of one observation are summed
 * obs_count - number of readings per observation; incremented for every value read
 * fn_in_i   - instruction file name
 * fn_in_d   - model output file name
 * debug     - non-zero prints progress messages (values above 1 print more)
 *
 * Returns 0 on success and -1 when a file cannot be opened, a search keyword is not found or
 * any other inconsistency was reported.
 */
int ins_obs( int nobs, char **obs_id, double *obs, int *obs_count, char *fn_in_i, char *fn_in_d, int debug )
{
	FILE *infile_inst, *infile_data;
	const char *separator = " \t\n";
	// Tokens start at their defaults so that they are valid strings even if the first line holds no words
	char token_search[2] = "@", token_obs[2] = "!", comment[2] = "#";
	char *word_inst, *word_data, *word_search, dummy_var[6], buf_data[1000], buf_inst[1000], *pnt_inst, *pnt_data;
	int i, c, bad_data = 0, sl, found;
	double v;
	if( ( infile_inst = fopen( fn_in_i, "r" ) ) == NULL )
	{
		tprintf( "\nERROR: File %s cannot be opened to read template data!\n", fn_in_i );
		return( -1 );
	}
	if( ( infile_data = fopen( fn_in_d, "r" ) ) == NULL )
	{
		tprintf( "\nERROR: File %s cannot be opened to read the model-predicted observations!\n", fn_in_d );
		fclose( infile_inst ); // do not leak the instruction file handle
		return( -1 );
	}
	if( debug ) tprintf( "\nReading output file \'%s\' obtained from external model execution using instruction file \'%s\'.\n", fn_in_d, fn_in_i );
	if( fgets( buf_inst, 1000, infile_inst ) == NULL ) buf_inst[0] = 0; // empty instruction file: nothing to tokenize
	if( debug > 1 ) tprintf( "First instruction line: %s\n", buf_inst );
	pnt_inst = &buf_inst[0];
	for( c = 0, word_inst = strtok_r( buf_inst, separator, &pnt_inst ); word_inst; c++, word_inst = strtok_r( NULL, separator, &pnt_inst ) )
	{
		if( c == 0 ) // first entry
		{
			white_trim( word_inst );
			if( strcasestr( word_inst, "pif" ) )
			{
				if( debug > 1 ) tprintf( "PEST Instruction file\n" );
				token_search[0] = '@'; // just in case
				token_obs[0] = '!';
				comment[0] = 0;
			}
			else if( strcasestr( word_inst, "instruction" ) )
			{
				if( debug > 1 ) tprintf( "MADS Instruction file; user-specified search/variable tokens are expected\n" );
				token_search[0] = '@'; // just in case
				token_obs[0] = '!';
				comment[0] = '#';
			}
			else
			{
				if( debug > 1 ) tprintf( "MADS Instruction file\n" );
				rewind( infile_inst ); // no header line: the first line is an instruction and must be read again
				token_search[0] = '@';
				token_obs[0] = '!';
				comment[0] = '#';
				break;
			}
		}
		else if( c == 1 ) // second entry; "search" token
		{
			white_trim( word_inst );
			token_search[0] = word_inst[0];
			if( strlen( word_inst ) > 1 )
				tprintf( "WARNING: expecting a single character as search separator on the first line of instruction file (\'%s\'; assumed \'%s\')\n", word_inst, token_search );
			if( token_search[0] == 0 ) token_search[0] = '@';
		}
		else if( c == 2 ) // third entry; "variable" token
		{
			white_trim( word_inst );
			token_obs[0] = word_inst[0];
			if( strlen( word_inst ) > 1 )
				tprintf( "WARNING: expecting a single character as observation separator on the first line of instruction file (\'%s\'; assumed \'%s\')\n", word_inst, token_obs );
			if( token_obs[0] == 0 ) token_obs[0] = '!';
		}
		else if( c == 3 ) // fourth entry; "comment" token
		{
			white_trim( word_inst );
			comment[0] = word_inst[0];
			if( strlen( word_inst ) > 1 )
				tprintf( "WARNING: expecting a single character as comment token on the first line of instruction file (\'%s\'; assumed \'%s\')\n", word_inst, comment );
			if( comment[0] == 0 ) comment[0] = '#';
			break;
		}
	}
	token_obs[1] = token_search[1] = comment[1] = 0;
	// The dummy observation is "dum" enclosed in the observation token, e.g. "!dum!"
	dummy_var[0] = token_obs[0];
	dummy_var[1] = 'd';
	dummy_var[2] = 'u';
	dummy_var[3] = 'm';
	dummy_var[4] = token_obs[0];
	dummy_var[5] = 0;
	if( debug > 1 )
	{
		tprintf( "Search separator: %s\n", token_search );
		tprintf( "Observation separator: %s\n", token_obs );
		tprintf( "Dummy observation: %s\n", dummy_var );
		if( comment[0] ) tprintf( "Comment: %s\n", comment );
	}
	buf_data[0] = 0; word_data = NULL;
	while( !feof( infile_inst ) )
	{
		if( fgets( buf_inst, 1000, infile_inst ) == NULL ) { if( debug > 1 ) tprintf( "END of instruction file.\n" ); break; }
		pnt_inst = &buf_inst[0];
		word_inst = 0;
		white_trim( pnt_inst ); white_skip( &pnt_inst );
		if( debug > 1 ) tprintf( "\n\nCurrent instruction line: %s\n", pnt_inst );
		if( comment[0] && pnt_inst[0] == comment[0] ) { if( debug > 1 ) tprintf( "Comment; skip this line.\n" ); continue; } // Instruction line is a comment
		if( strlen( pnt_inst ) == 0 ) { if( debug ) tprintf( "Empty line; will be skipped.\n" ); continue; }
		pnt_data = NULL;
		if( pnt_inst[0] == 'l' ) // skip lines in the "data" file
		{
			if( sscanf( &pnt_inst[1], "%d", &c ) != 1 ) c = 0; // no line count given: do not reuse a stale value
			if( debug > 1 ) tprintf( "Skip %d lines\n", c );
			for( i = 0; i < c; i++ )
				if( fgets( buf_data, 1000, infile_data ) == NULL ) { tprintf( "\nERROR: Model output file \'%s\' is incomplete or instruction file \'%s\' is inaccurate!\n       Model output file \'%s\' ended before instruction file \'%s\' is completely processed!\n", fn_in_d, fn_in_i, fn_in_d, fn_in_i ); break; }
			if( i < c ) { bad_data = 1; break; } // the data file ended early: report failure to the caller
			word_inst = strtok_r( NULL, separator, &pnt_inst ); // skip l command
			white_trim( buf_data );
			pnt_data = &buf_data[0];
			word_data = NULL;
		}
		if( pnt_data == NULL ) // if there was no "l" (skip line) command, read the next "data" line
		{
			if( debug > 1 ) tprintf( "Read the next \'data\' line (there was no \'l\' (skip line) command)\n" );
			if( fgets( buf_data, 1000, infile_data ) == NULL ) buf_data[0] = 0; // no more data: do not parse the previous line again
			white_trim( buf_data );
			pnt_data = &buf_data[0];
			word_data = NULL;
		}
		if( debug > 1 ) tprintf( "Current location in model output file: => \'%s\' <= \'%s\'\n", word_data ? word_data : "(null)", pnt_data );
		c = 0; // from here on c counts the observations read on this instruction line
		while( 1 )
		{
			if( pnt_inst[0] == token_search[0] ) // search for keyword
			{
				if( debug > 1 ) tprintf( "KEYWORD search " );
				word_search = strtok_r( NULL, token_search, &pnt_inst ); // read search keyword
				if( word_search == NULL ) // search token without a keyword
				{
					tprintf( "\nERROR: Instruction file %s does not follow the expected format!\n", fn_in_i );
					tprintf( "White space (w), search (%s) or observation (%s) tokens are expected!\n", token_search, token_obs );
					bad_data = 1;
					break;
				}
				if( debug > 1 ) tprintf( "\'%s\' in the data file ...\n", word_search );
				found = 0; // kept separate from bad_data so that a successful search does not erase earlier errors
				while( 1 ) // look on the current data line first, then on the following ones
				{
					if( ( pnt_data = strstr( pnt_data, word_search ) ) != NULL )
					{
						pnt_data += strlen( word_search );
						if( debug > 1 ) tprintf( "Matching data file location \'=>%s<=%s\'\n", word_search, pnt_data );
						found = 1;
						break;
					}
					if( fgets( buf_data, 1000, infile_data ) == NULL ) { tprintf( "\nERROR: Model output file \'%s\' is incomplete or instruction file \'%s\' is inaccurate!\n       Model output file \'%s\' ended before instruction file \'%s\' is completely processed!\n", fn_in_d, fn_in_i, fn_in_d, fn_in_i ); break; }
					white_trim( buf_data );
					pnt_data = &buf_data[0];
					word_data = NULL; // Force reading
				}
				if( !found )
				{
					tprintf( "\nERROR: Search keyword \'%s\' cannot be found in the data file \'%s\'!\n", word_search, fn_in_d );
					fclose( infile_data ); fclose( infile_inst ); // release both files on the early exit
					return( -1 );
				}
			}
			else // no keyword search
			{
				word_inst = strtok_r( NULL, separator, &pnt_inst ); // read TEMPLETE word
				if( word_inst == NULL ) break; // nothing left on the instruction line (e.g. a line holding only an "l" command)
				if( debug > 1 ) tprintf( "Current location in instruction input file: => \'%s\' <= \'%s\'\n", word_inst, pnt_inst );
				white_trim( word_inst );
				if( debug > 1 ) tprintf( "INSTRUCTION word \'%s\' : ", word_inst );
				if( strncmp( word_inst, dummy_var, 5 ) == 0 ) // dummy variable
				{
					if( debug > 1 ) tprintf( "Skip dummy data!\n" );
					if( word_data == NULL ) word_data = strtok_r( NULL, separator, &pnt_data );
					word_data = strtok_r( NULL, separator, &pnt_data );
					if( debug > 1 ) tprintf( "Current location in model output file: => \'%s\' <= \'%s\'\n", word_data ? word_data : "(null)", pnt_data );
				}
				else if( word_inst[0] == 'w' ) // white space
				{
					if( debug > 1 ) tprintf( "Skip white space!\n" );
					if( !iswhite( pnt_data[0] ) )
					{
						if( word_data == NULL ) word_data = strtok_r( NULL, separator, &pnt_data );
						word_data = strtok_r( NULL, separator, &pnt_data );
					}
					else
					{
						word_data = strtok_r( NULL, separator, &pnt_data );
					}
					if( debug > 1 ) tprintf( "Current location in model output file: => \'%s\' <= \'%s\'\n", word_data ? word_data : "(null)", pnt_data );
				}
				else if( word_inst[0] == token_obs[0] ) // observation variable
				{
					if( debug ) tprintf( "Observation variable\n" );
					c++;
					if( word_data == NULL || c > 1 ) word_data = strtok_r( NULL, separator, &pnt_data );
					if( strlen( word_inst ) == 1 ) word_inst = strtok_r( NULL, separator, &pnt_inst ); // token and name are separated by white space
					else word_inst = &word_inst[1];
					if( word_inst == NULL ) // observation token without a name
					{
						tprintf( "\nERROR: Instruction file %s does not follow the expected format!\n", fn_in_i );
						tprintf( "White space (w), search (%s) or observation (%s) tokens are expected!\n", token_search, token_obs );
						bad_data = 1;
						break;
					}
					sl = strlen( word_inst );
					if( word_inst[sl - 1] == token_obs[0] ) word_inst[sl - 1] = 0; // closing token is attached to the name
					else strtok_r( NULL, separator, &pnt_inst ); // closing token is a separate word; consume it
					white_skip( &word_inst );
					white_trim( word_inst );
					if( debug ) tprintf( "Observation keyword \'%s\' & data field \'%s\' ... ", word_inst, word_data ? word_data : "(null)" );
					if( word_data == NULL || strlen( word_data ) == 0 )
					{
						tprintf( "ERROR: Mismatch between the instruction file \'%s\' and the data file \'%s\'!\n", fn_in_i, fn_in_d );
						tprintf( "INSTRUCTION word \'%s\'\n", word_inst );
						tprintf( "Current location in instruction input file: => \'%s\' <= \'%s\'\n", word_inst, pnt_inst );
						tprintf( "Current location in model output file: => \'%s\' <= \'%s\'\n", word_data ? word_data : "(null)", pnt_data );
						bad_data = 1;
						break;
					}
					for( i = 0; i < nobs; i++ )
					{
						if( strcmp( word_inst, obs_id[i] ) == 0 )
						{
							if( sscanf( word_data, "%lf", &v ) != 1 ) // never store a value that was not actually read
							{
								tprintf( "\nERROR: Data field \'%s\' cannot be read as a number for observation \'%s\'!\n", word_data, obs_id[i] );
								bad_data = 1;
								break;
							}
							if( obs_count[i] == 0 ) { obs[i] = v; obs_count[i] = 1; }
							else { obs[i] += v; obs_count[i]++; }
							if( debug ) tprintf( "\'%s\'=%g\n", obs_id[i], obs[i] ); // obs[i] is a double
							break;
						}
					}
					if( nobs == i ) // the loop ran to completion: no observation carries this name
					{
						tprintf( "\nERROR: Observation keyword \'%s\' does not match any of observation variables!\n", word_inst );
						bad_data = 1;
					}
				}
				else if( comment[0] && word_inst[0] == comment[0] ) // comment
				{
					if( debug > 1 ) tprintf( "Comment. Skip rest of the instruction line!\n" );
					break;
				}
				else
				{
					tprintf( "\nERROR: Instruction file %s does not follow the expected format!\n", fn_in_i );
					tprintf( "White space (w), search (%s) or observation (%s) tokens are expected!\n", token_search, token_obs );
					bad_data = 1;
					break;
				}
			}
			if( pnt_inst == NULL || strlen( pnt_inst ) == 0 ) break; // instruction line is exhausted
		}
	}
	fclose( infile_data ); fclose( infile_inst );
	if( bad_data ) return( -1 );
	else return( 0 );
}
Beispiel #13
0
/*
 * Check an instruction file against the defined observations without reading any model output.
 *
 * The first line selects the flavour:
 *   - first word contains "pif" (any case)         : PEST instruction file; tokens '@' and '!', no comment character
 *   - first word contains "instruction" (any case) : MADS instruction file; the following words may override
 *                                                    the search, observation and comment tokens
 *   - anything else                                : MADS instruction file without a header line; the file is
 *                                                    rewound and the defaults '@', '!' and '#' are used
 * Every following line is tokenized: line-skip ('l'), keyword-search, white-space ('w') and dummy
 * instructions are only recognized, while each observation name is looked up in obs_id[].
 *
 * nobs      - number of entries in obs_id / obs_count
 * obs_id    - observation names
 * obs_count - incremented each time the matching observation appears in the file
 * fn_in_i   - instruction file name
 * debug     - non-zero prints progress messages (values above 1 print more)
 *
 * Returns 0 when the file is consistent, -1 when it cannot be opened, contains an unknown
 * observation name or does not follow the expected format.
 */
int check_ins_obs( int nobs, char **obs_id, int *obs_count, char *fn_in_i, int debug )
{
	FILE *infile_inst;
	const char *separator = " \t\n";
	// Tokens start at their defaults so that they are valid strings even if the first line holds no words
	char token_obs[2] = "!", token_search[2] = "@", comment[2] = "#";
	char *word_inst, *word_search, dummy_var[6], buf_inst[1000], *pnt_inst;
	int i, c, bad_data = 0;
	if( debug ) tprintf( "\nChecking instruction file \'%s\'.\n", fn_in_i );
	if( ( infile_inst = fopen( fn_in_i, "r" ) ) == NULL )
	{
		tprintf( "\n\nERROR: File %s cannot be opened to read template data!\n", fn_in_i );
		return( -1 );
	}
	if( fgets( buf_inst, 1000, infile_inst ) == NULL ) buf_inst[0] = 0; // empty instruction file: nothing to tokenize
	if( debug ) tprintf( "\nFirst instruction line: %s\n", buf_inst );
	pnt_inst = &buf_inst[0];
	for( c = 0, word_inst = strtok_r( buf_inst, separator, &pnt_inst ); word_inst; c++, word_inst = strtok_r( NULL, separator, &pnt_inst ) )
	{
		if( c == 0 ) // first entry
		{
			white_trim( word_inst );
			if( strcasestr( word_inst, "pif" ) )
			{
				if( debug ) tprintf( "PEST Instruction file\n" );
				token_search[0] = '@'; // just in case
				token_obs[0] = '!';
				comment[0] = 0;
			}
			else if( strcasestr( word_inst, "instruction" ) )
			{
				if( debug ) tprintf( "MADS Instruction file; user-specified search/variable tokens are expected\n" );
				token_search[0] = '@'; // just in case
				token_obs[0] = '!';
				comment[0] = '#';
			}
			else
			{
				if( debug ) tprintf( "MADS Instruction file\n" );
				rewind( infile_inst ); // no header line: the first line is an instruction and must be read again
				token_search[0] = '@';
				token_obs[0] = '!';
				comment[0] = '#';
				break;
			}
		}
		else if( c == 1 ) // second entry; "search" token
		{
			white_trim( word_inst );
			if( debug > 1 ) tprintf( "Search token %s\n", word_inst );
			token_search[0] = word_inst[0];
			if( strlen( word_inst ) > 1 )
				tprintf( "WARNING: expecting a single character as search separator on the first line of instruction file (\'%s\'; assumed \'%s\')\n", word_inst, token_search );
			if( token_search[0] == 0 ) token_search[0] = '@';
		}
		else if( c == 2 ) // third entry; "variable" token
		{
			white_trim( word_inst );
			if( debug > 1 ) tprintf( "Variable token %s\n", word_inst );
			token_obs[0] = word_inst[0];
			if( strlen( word_inst ) > 1 )
				tprintf( "WARNING: expecting a single character as observation separator on the first line of instruction file (\'%s\'; assumed \'%s\')\n", word_inst, token_obs );
			if( token_obs[0] == 0 ) token_obs[0] = '!';
		}
		else if( c == 3 ) // fourth entry; "comment" token
		{
			white_trim( word_inst );
			if( debug > 1 ) tprintf( "Comment token %s\n", word_inst );
			comment[0] = word_inst[0];
			if( strlen( word_inst ) > 1 )
				tprintf( "WARNING: expecting a single character as comment token on the first line of instruction file (\'%s\'; assumed \'%s\')\n", word_inst, comment );
			if( comment[0] == 0 ) comment[0] = '#';
		}
	}
	token_obs[1] = token_search[1] = comment[1] = 0;
	// The dummy observation is "dum" enclosed in the observation token, e.g. "!dum!"
	dummy_var[0] = token_obs[0];
	dummy_var[1] = 'd';
	dummy_var[2] = 'u';
	dummy_var[3] = 'm';
	dummy_var[4] = token_obs[0];
	dummy_var[5] = 0;
	if( debug )
	{
		tprintf( "Search separator: %s\n", token_search );
		tprintf( "Observation separator: %s\n", token_obs );
		tprintf( "Dummy observation: %s\n", dummy_var );
		if( comment[0] ) tprintf( "Comment: %s\n", comment );
	}
	while( !feof( infile_inst ) ) // IMPORTANT: strtok below modifies buf_inst by adding '\0's; if needed strcpy buf_inst
	{
		if( fgets( buf_inst, 1000, infile_inst ) == NULL ) { if( debug > 1 ) tprintf( "END of instruction file.\n" ); break; }
		pnt_inst = &buf_inst[0];
		word_inst = 0;
		white_trim( pnt_inst ); white_skip( &pnt_inst );
		if( debug ) tprintf( "\nCurrent instruction line: %s\n", pnt_inst );
		if( comment[0] && pnt_inst[0] == comment[0] ) { if( debug > 1 ) tprintf( "Comment; skip this line.\n" ); continue; } // Instruction line is a comment
		if( strlen( pnt_inst ) == 0 ) { if( debug ) tprintf( "Empty line; will be skipped.\n" ); continue; } // Empty line
		if( pnt_inst[0] == 'l' ) // skip lines in the "data" file
		{
			if( sscanf( &pnt_inst[1], "%d", &c ) != 1 ) c = 0; // no line count given: do not report a stale value
			if( debug > 1 ) tprintf( "Skip %d lines\n", c );
			word_inst = strtok_r( NULL, separator, &pnt_inst ); // skip l command
		}
		while( 1 )
		{
			if( pnt_inst[0] == token_search[0] ) // search for keyword
			{
				if( debug ) tprintf( "KEYWORD search " );
				word_search = strtok_r( NULL, token_search, &pnt_inst ); // read search keyword
				if( word_search == NULL ) // search token without a keyword
				{
					tprintf( "\nERROR: Instruction file %s does not follow the expected format!\n", fn_in_i );
					tprintf( "White space (w), search (%s) or observation (%s) tokens are expected!\n", token_search, token_obs );
					bad_data = 1;
					break;
				}
				if( debug ) tprintf( "\'%s\' in the data file ...\n", word_search );
			}
			else
			{
				word_inst = strtok_r( NULL, separator, &pnt_inst ); // read TEMPLETE word
				if( word_inst == NULL ) break; // nothing left on the instruction line (e.g. a line holding only an "l" command)
				if( debug > 1 ) tprintf( "Current location in instruction input file: => \'%s\' <= \'%s\'\n", word_inst, pnt_inst );
				white_trim( word_inst );
				if( debug ) tprintf( "INSTRUCTION word \'%s\' : ", word_inst );
				if( strncmp( word_inst, dummy_var, 5 ) == 0 ) // dummy variable
				{
					if( debug ) tprintf( "Skip dummy data!\n" );
				}
				else if( word_inst[0] == 'w' ) // white space
				{
					if( debug ) tprintf( "Skip white space!\n" );
				}
				else if( word_inst[0] == token_obs[0] ) // observation variable
				{
					if( strlen( word_inst ) == 1 ) word_inst = strtok_r( NULL, separator, &pnt_inst ); // token and name are separated by white space
					else word_inst = &word_inst[1];
					if( word_inst == NULL ) // observation token without a name
					{
						tprintf( "\nERROR: Instruction file %s does not follow the expected format!\n", fn_in_i );
						tprintf( "White space (w), search (%s) or observation (%s) tokens are expected!\n", token_search, token_obs );
						bad_data = 1;
						break;
					}
					if( word_inst[strlen( word_inst ) - 1] == token_obs[0] ) word_inst[strlen( word_inst ) - 1] = 0; // closing token is attached to the name
					else strtok_r( NULL, separator, &pnt_inst ); // closing token is a separate word; consume it
					if( debug ) tprintf( "Observation keyword \'%s\' ... ", word_inst );
					white_skip( &word_inst );
					white_trim( word_inst );
					for( i = 0; i < nobs; i++ )
					{
						if( strcmp( word_inst, obs_id[i] ) == 0 )
						{
							obs_count[i]++;
							if( debug ) tprintf( "\'%s\' detected %d times\n", obs_id[i], obs_count[i] );
							break;
						}
					}
					if( nobs == i ) // the loop ran to completion: no observation carries this name
					{
						tprintf( "\nERROR: Observation keyword \'%s\' does not match any of observation variables!\n", word_inst );
						bad_data = 1;
					}
				}
				else if( comment[0] && word_inst[0] == comment[0] ) // comment
				{
					if( debug ) tprintf( "Comment. Skip rest of the instruction line!\n" );
					break;
				}
				else
				{
					tprintf( "\nERROR: Instruction file %s does not follow the expected format!\n", fn_in_i );
					tprintf( "White space (w), search (%s) or observation (%s) tokens are expected!\n", token_search, token_obs );
					bad_data = 1;
					break;
				}
			}
			if( pnt_inst == NULL || strlen( pnt_inst ) == 0 ) break; // instruction line is exhausted
		}
	}
	fclose( infile_inst );
	if( bad_data ) return( -1 );
	else return( 0 );
}
Beispiel #14
0
void BLOCK::print(            //print list of sides
                  FILE *,     //< file to print on
                  BOOL8 dump  //< print full detail
                 ) {
  ICOORDELT_IT it = &leftside;   //iterator

  box.print ();
  tprintf ("Proportional= %s\n", proportional ? "TRUE" : "FALSE");
  tprintf ("Kerning= %d\n", kerning);
  tprintf ("Spacing= %d\n", spacing);
  tprintf ("Fixed_pitch=%d\n", pitch);
  tprintf ("Filename= %s\n", filename.string ());

  if (dump) {
    tprintf ("Left side coords are:\n");
    for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ())
      tprintf ("(%d,%d) ", it.data ()->x (), it.data ()->y ());
    tprintf ("\n");
    tprintf ("Right side coords are:\n");
    it.set_to_list (&rightside);
    for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ())
      tprintf ("(%d,%d) ", it.data ()->x (), it.data ()->y ());
    tprintf ("\n");
  }
}
Beispiel #15
0
// Builds a new WERD out of the blobs in all_blobs whose bounding boxes are
// contained in, or majorly overlap, the bounding box of one of this word's
// blobs. Matching blobs are extracted from all_blobs. This word's own blobs
// are removed from it: those with a match are deleted, the others are
// collected as "unmatched". Unmatched blobs that majorly overlap an already
// chosen blob (with a y overlap fraction above 0.8) are deleted too.
// If orphan_blobs is non-NULL the remaining unmatched blobs are appended to
// it. Returns the new WERD, or NULL when nothing matched, in which case the
// unmatched list is handed back to this word's blob list.
WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
                                      C_BLOB_LIST* orphan_blobs) {
  // Scratch list that takes over the blobs currently owned by this word.
  C_BLOB_LIST own_blobs;
  C_BLOB_IT own_it(&own_blobs);
  own_it.add_list_after(cblob_list());

  // Blobs selected from all_blobs; they will make up the new word.
  C_BLOB_LIST chosen_blobs;
  C_BLOB_IT chosen_it(&chosen_blobs);

  // Blobs of this word for which no counterpart exists in all_blobs.
  C_BLOB_LIST unmatched_blobs;
  C_BLOB_IT unmatched_it(&unmatched_blobs);
  unmatched_it.move_to_last();

  own_it.move_to_first();
  own_it.mark_cycle_pt();
  while (!own_it.cycled_list()) {
    C_BLOB* own_blob = own_it.extract();
    TBOX own_box = own_blob->bounding_box();
    bool has_match = false;
    // Plain pairwise comparison against every candidate; bucketing the
    // candidates by row beforehand would be the more efficient approach.
    C_BLOB_IT candidate_it(all_blobs);
    candidate_it.mark_cycle_pt();
    while (!candidate_it.cycled_list()) {
      C_BLOB* candidate = candidate_it.data();
      TBOX candidate_box = candidate->bounding_box();
      if (candidate_box.null_box()) {
        tprintf("Bounding box couldn't be ascertained\n");
      }
      // The old blobs come from minimal splits and are expected to be the
      // bigger ones, so a new blob belongs to the word when the old box
      // contains it or overlaps it for the most part.
      const bool belongs_to_word = own_box.contains(candidate_box) ||
                                   own_box.major_overlap(candidate_box);
      if (belongs_to_word) {
        candidate_it.extract();
        chosen_it.add_after_then_move(candidate);
        has_match = true;
      }
      candidate_it.forward();
    }
    if (has_match) {
      delete own_blob;
    } else {
      unmatched_it.add_after_then_move(own_blob);
    }
    own_it.forward();
  }

  // Some unmatched blobs only result from under-segmentation; the blob that
  // corresponds to them is then already among the chosen ones, so they can
  // be dropped.
  unmatched_it.move_to_first();
  unmatched_it.mark_cycle_pt();
  while (!unmatched_it.cycled_list()) {
    C_BLOB* leftover = unmatched_it.data();
    TBOX leftover_box = leftover->bounding_box();
    C_BLOB_IT existing_it(chosen_it);
    existing_it.mark_cycle_pt();
    while (!existing_it.cycled_list()) {
      C_BLOB* chosen = existing_it.data();
      TBOX chosen_box = chosen->bounding_box();
      const bool overlaps_mostly = leftover_box.major_overlap(chosen_box) ||
                                   chosen_box.major_overlap(leftover_box);
      if (overlaps_mostly &&
          leftover_box.y_overlap_fraction(chosen_box) > 0.8) {
        // Covered by a chosen blob already.
        delete unmatched_it.extract();
        break;
      }
      existing_it.forward();
    }
    unmatched_it.forward();
  }

  if (orphan_blobs) {
    C_BLOB_IT orphan_it(orphan_blobs);
    orphan_it.move_to_last();
    orphan_it.add_list_after(&unmatched_blobs);
  }

  if (chosen_blobs.empty()) {
    // Nothing to build a word from: return the blobs to this word so that
    // it can be reused.
    C_BLOB_IT this_list_it(cblob_list());
    this_list_it.add_list_after(&unmatched_blobs);
    return NULL;
  }
  // The chosen blobs are ready; create the new word from them.
  return new WERD(&chosen_blobs, this);
}
int main(int argc, char **argv) {
  if ((argc == 2 && strcmp(argv[1], "-v") == 0) ||
      (argc == 2 && strcmp(argv[1], "--version") == 0)) {
    char *versionStrP;

    fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());

    versionStrP = getLeptonicaVersion();
    fprintf(stderr, " %s\n", versionStrP);
    lept_free(versionStrP);

    versionStrP = getImagelibVersions();
    fprintf(stderr, "  %s\n", versionStrP);
    lept_free(versionStrP);

#ifdef USE_OPENCL
    cl_platform_id platform;
    cl_uint num_platforms;
    cl_device_id devices[2];
    cl_uint num_devices;
    char info[256];
    int i;

    fprintf(stderr, " OpenCL info:\n");
    clGetPlatformIDs(1, &platform, &num_platforms);
    fprintf(stderr, "  Found %d platforms.\n", num_platforms);
    clGetPlatformInfo(platform, CL_PLATFORM_NAME, 256, info, 0);
    fprintf(stderr, "  Platform name: %s.\n", info);
    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 256, info, 0);
    fprintf(stderr, "  Version: %s.\n", info);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 2, devices, &num_devices);
    fprintf(stderr, "  Found %d devices.\n", num_devices);
    for (i = 0; i < num_devices; ++i) {
      clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 256, info, 0);
      fprintf(stderr, "    Device %d name: %s.\n", i+1, info);
    }
#endif
    exit(0);
  }

  // Make the order of args a bit more forgiving than it used to be.
  const char* lang = "eng";
  const char* image = NULL;
  const char* output = NULL;
  const char* datapath = NULL;
  bool noocr = false;
  bool list_langs = false;
  bool print_parameters = false;

  tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
  int arg = 1;
  while (arg < argc && (output == NULL || argv[arg][0] == '-')) {
    if (strcmp(argv[arg], "-l") == 0 && arg + 1 < argc) {
      lang = argv[arg + 1];
      ++arg;
    } else if (strcmp(argv[arg], "--tessdata-dir") == 0 && arg + 1 < argc) {
      datapath = argv[arg + 1];
      ++arg;
    } else if (strcmp(argv[arg], "--list-langs") == 0) {
      noocr = true;
      list_langs = true;
    } else if (strcmp(argv[arg], "-psm") == 0 && arg + 1 < argc) {
      pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[arg + 1]));
      ++arg;
    } else if (strcmp(argv[arg], "--print-parameters") == 0) {
      noocr = true;
      print_parameters = true;
    } else if (strcmp(argv[arg], "-c") == 0 && arg + 1 < argc) {
      // handled properly after api init
      ++arg;
    } else if (image == NULL) {
      image = argv[arg];
    } else if (output == NULL) {
      output = argv[arg];
    }
    ++arg;
  }

  if (argc == 2 && strcmp(argv[1], "--list-langs") == 0) {
    list_langs = true;
    noocr = true;
  }

  if (output == NULL && noocr == false) {
    fprintf(stderr, "Usage:\n  %s imagename|stdin outputbase|stdout "
            "[options...] [configfile...]\n\n", argv[0]);

    fprintf(stderr, "OCR options:\n");
    fprintf(stderr, "  --tessdata-dir /path\tspecify location of tessdata"
                      " path\n");
    fprintf(stderr, "  -l lang[+lang]\tspecify language(s) used for OCR\n");
    fprintf(stderr, "  -c configvar=value\tset value for control parameter.\n"
                      "\t\t\tMultiple -c arguments are allowed.\n");
    fprintf(stderr, "  -psm pagesegmode\tspecify page segmentation mode.\n");
    fprintf(stderr, "These options must occur before any configfile.\n\n");
    fprintf(stderr,
            "pagesegmode values are:\n"
            "  0 = Orientation and script detection (OSD) only.\n"
            "  1 = Automatic page segmentation with OSD.\n"
            "  2 = Automatic page segmentation, but no OSD, or OCR\n"
            "  3 = Fully automatic page segmentation, but no OSD. (Default)\n"
            "  4 = Assume a single column of text of variable sizes.\n"
            "  5 = Assume a single uniform block of vertically aligned text.\n"
            "  6 = Assume a single uniform block of text.\n"
            "  7 = Treat the image as a single text line.\n"
            "  8 = Treat the image as a single word.\n"
            "  9 = Treat the image as a single word in a circle.\n"
            "  10 = Treat the image as a single character.\n\n");
    fprintf(stderr, "Single options:\n");
    fprintf(stderr, "  -v --version: version info\n");
    fprintf(stderr, "  --list-langs: list available languages for tesseract "
                      "engine. Can be used with --tessdata-dir.\n");
    fprintf(stderr, "  --print-parameters: print tesseract parameters to the "
                      "stdout.\n");
    exit(1);
  }

  if (output != NULL && strcmp(output, "-") && strcmp(output, "stdout")) {
    tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n",
           tesseract::TessBaseAPI::Version());
  }
  PERF_COUNT_START("Tesseract:main")
  tesseract::TessBaseAPI api;

  api.SetOutputName(output);
  int rc = api.Init(datapath, lang, tesseract::OEM_DEFAULT,
                &(argv[arg]), argc - arg, NULL, NULL, false);

  if (rc) {
    fprintf(stderr, "Could not initialize tesseract.\n");
    exit(1);
  }

  char opt1[255], opt2[255];
  for (arg = 0; arg < argc; arg++) {
    if (strcmp(argv[arg], "-c") == 0 && arg + 1 < argc) {
      strncpy(opt1, argv[arg + 1], 255);
      *(strchr(opt1, '=')) = 0;
      strncpy(opt2, strchr(argv[arg + 1], '=') + 1, 255);
      opt2[254] = 0;
      ++arg;

      if (!api.SetVariable(opt1, opt2)) {
        fprintf(stderr, "Could not set option: %s=%s\n", opt1, opt2);
      }
    }
  }

  if (list_langs) {
     GenericVector<STRING> languages;
     api.GetAvailableLanguagesAsVector(&languages);
     fprintf(stderr, "List of available languages (%d):\n",
             languages.size());
     for (int index = 0; index < languages.size(); ++index) {
       STRING& string = languages[index];
       fprintf(stderr, "%s\n", string.string());
     }
     api.End();
     exit(0);
  }

  if (print_parameters) {
     FILE* fout = stdout;
     fprintf(stdout, "Tesseract parameters:\n");
     api.PrintVariables(fout);
     api.End();
     exit(0);
  }

  // We have 2 possible sources of pagesegmode: a config file and
  // the command line. For backwards compatability reasons, the
  // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
  // default for this program is tesseract::PSM_AUTO. We will let
  // the config file take priority, so the command-line default
  // can take priority over the tesseract default, so we use the
  // value from the command line only if the retrieved mode
  // is still tesseract::PSM_SINGLE_BLOCK, indicating no change
  // in any config file. Therefore the only way to force
  // tesseract::PSM_SINGLE_BLOCK is from the command line.
  // It would be simpler if we could set the value before Init,
  // but that doesn't work.
  if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
     api.SetPageSegMode(pagesegmode);

  bool stdInput = !strcmp(image, "stdin") || !strcmp(image, "-");
  Pix* pixs = NULL;
  if (stdInput) {
    char byt;
    GenericVector<l_uint8> ch_data;
    std::istream file(std::cin.rdbuf());

#ifdef WIN32
    if (_setmode(_fileno(stdin), _O_BINARY) == -1)
      tprintf("ERROR: cin to binary: %s", strerror(errno));
#endif  // WIN32

    while (file.get(byt)) {
      ch_data.push_back(byt);
    }
    std::cin.ignore(std::cin.rdbuf()->in_avail() + 1);

    pixs = pixReadMem(&ch_data[0], ch_data.size());
  }

  if (pagesegmode == tesseract::PSM_AUTO_ONLY ||
      pagesegmode == tesseract::PSM_OSD_ONLY) {
    int ret_val = 0;

    if (!pixs)
      pixs = pixRead(image);
    if (!pixs) {
      fprintf(stderr, "Cannot open input file: %s\n", image);
      exit(2);
    }
    api.SetImage(pixs);

    if (pagesegmode == tesseract::PSM_OSD_ONLY) {
       OSResults osr;
       if (api.DetectOS(&osr)) {
         int orient = osr.best_result.orientation_id;
         int script_id = osr.get_best_script(orient);
         float orient_oco = osr.best_result.oconfidence;
         float orient_sco = osr.best_result.sconfidence;
         tprintf("Orientation: %d\nOrientation in degrees: %d\n" \
                 "Orientation confidence: %.2f\n" \
                 "Script: %d\nScript confidence: %.2f\n",
                 orient, OrientationIdToValue(orient), orient_oco,
                 script_id, orient_sco);
       } else {
         ret_val = 1;
       }
    } else {
       tesseract::Orientation orientation;
       tesseract::WritingDirection direction;
       tesseract::TextlineOrder order;
       float deskew_angle;
       tesseract::PageIterator* it =  api.AnalyseLayout();
       if (it) {
         it->Orientation(&orientation, &direction, &order, &deskew_angle);
         tprintf("Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n" \
                 "Deskew angle: %.4f\n",
                  orientation, direction, order, deskew_angle);
       } else {
         ret_val = 1;
       }
       delete it;
    }
    pixDestroy(&pixs);
    exit(ret_val);
  }

  tesseract::TessResultRenderer* renderer = NULL;
  bool b;
  api.GetBoolVariable("tessedit_create_hocr", &b);
  if (b && renderer == NULL) renderer = new tesseract::TessHOcrRenderer();

  api.GetBoolVariable("tessedit_create_pdf", &b);
  if (b && renderer == NULL)
    renderer = new tesseract::TessPDFRenderer(api.GetDatapath());

  api.GetBoolVariable("tessedit_create_boxfile", &b);
  if (b && renderer == NULL) renderer = new tesseract::TessBoxTextRenderer();

  if (renderer == NULL) renderer = new tesseract::TessTextRenderer();

  if (pixs) {
    api.ProcessPage(pixs, 0, NULL, NULL, 0, renderer);
    pixDestroy(&pixs);
  } else {
    FILE* fin = fopen(image, "rb");
    if (fin == NULL) {
      fprintf(stderr, "Cannot open input file: %s\n", image);
      exit(2);
    }
    fclose(fin);
    if (!api.ProcessPages(image, NULL, 0, renderer)) {
      fprintf(stderr, "Error during processing.\n");
      exit(1);
    }
  }

  FILE* fout = stdout;
  if (strcmp(output, "-") && strcmp(output, "stdout")) {
    STRING outfile = STRING(output)
        + STRING(".")
        + STRING(renderer->file_extension());
    fout = fopen(outfile.string(), "wb");
    if (fout == NULL) {
      fprintf(stderr, "Cannot create output file %s\n", outfile.string());
      exit(1);
    }
  }

  const char* data;
  inT32 data_len;
  if (renderer->GetOutput(&data, &data_len)) {
    fwrite(data, 1, data_len, fout);
    if (fout != stdout)
      fclose(fout);
    else
      clearerr(fout);
  }
  PERF_COUNT_END
  return 0;                      // Normal exit
}
Beispiel #17
0
/*
 * Test driver that issues a fixed sequence of rt_sigprocmask calls through
 * k_sigprocmask() and prints, via tprintf(), one line per call describing the
 * call and its expected outcome.
 *
 * The calls build on each other: each one depends on the signal mask left
 * behind by the previous ones, so the statement order must not be changed.
 *
 * Returns 0 on success; setup failures abort through perror_msg_and_fail().
 *
 * NOTE(review): three of the syscalls below are made inside assert().  If this
 * file were ever built with NDEBUG those calls would not be made at all and
 * the "%m" in the following tprintf() would not reflect them - confirm the
 * test is always built with assertions enabled.
 */
int
main(void)
{
	/* Emits an empty string; presumably primes the tprintf output - TODO confirm. */
	tprintf("%s", "");

	/* Largest sigset size probed, in bytes (1024 bits). */
	const unsigned int big_size = 1024 / 8;
	unsigned int set_size;

	/*
	 * Probe for a sigset size the call accepts: start at big_size and
	 * halve until a call with NULL sets succeeds.  Every rejected size
	 * is logged as an EINVAL line.
	 */
	for (set_size = big_size; set_size; set_size >>= 1) {
		if (!k_sigprocmask(SIG_SETMASK, NULL, NULL, set_size))
			break;
		tprintf("rt_sigprocmask(SIG_SETMASK, NULL, NULL, %u)"
			" = -1 EINVAL (%m)\n", set_size);
	}
	/* set_size reached 0: no size was accepted. */
	if (!set_size)
		perror_msg_and_fail("rt_sigprocmask");
	tprintf("rt_sigprocmask(SIG_SETMASK, NULL, NULL, %u) = 0\n",
		set_size);

	/*
	 * Buffers for the raw mask passed in, the old mask returned, and a
	 * libc sigset_t used to build masks with the sigaddset() family.
	 * NOTE(review): the placement guarantees of tail_alloc() are not
	 * visible here; the EFAULT cases at the end rely on them - confirm.
	 */
	void *const k_set = tail_alloc(set_size);
	void *const old_set = tail_alloc(set_size);
	sigset_t *const libc_set = tail_alloc(sizeof(sigset_t));

	/* Install an empty mask. */
	memset(k_set, 0, set_size);
	if (k_sigprocmask(SIG_SETMASK, k_set, NULL, set_size))
		perror_msg_and_fail("rt_sigprocmask");
	tprintf("rt_sigprocmask(SIG_SETMASK, [], NULL, %u) = 0\n", set_size);

	/*
	 * Uses the set_size bytes located just before k_set as the new mask.
	 * NOTE(review): the expected output "~[]" implies those bytes are all
	 * ones; nothing in this function writes them - confirm where they are
	 * initialised.
	 */
	if (k_sigprocmask(SIG_UNBLOCK, k_set - set_size, old_set, set_size))
		perror_msg_and_fail("rt_sigprocmask");
	tprintf("rt_sigprocmask(SIG_UNBLOCK, ~[], [], %u) = 0\n", set_size);

	/* Twice the accepted size: the call is required to fail (logged as EINVAL). */
	assert(k_sigprocmask(SIG_SETMASK, k_set - set_size,
			     old_set, set_size << 1) == -1);
	tprintf("rt_sigprocmask(SIG_SETMASK, %p, %p, %u) = -1 EINVAL (%m)\n",
		k_set - set_size, old_set, set_size << 1);

	/* Hands half the accepted size to iterate(), which is defined elsewhere in the file. */
	iterate("~[]", k_set - set_size, old_set, set_size >> 1);

	/* Block SIGHUP only; the mask is built with libc and copied into the raw buffer. */
	sigemptyset(libc_set);
	sigaddset(libc_set, SIGHUP);
	memcpy(k_set, libc_set, set_size);

	if (k_sigprocmask(SIG_BLOCK, k_set, old_set, set_size))
		perror_msg_and_fail("rt_sigprocmask");
	tprintf("rt_sigprocmask(SIG_BLOCK, [HUP], [], %u) = 0\n", set_size);

	/* Unblock everything except SIGHUP (all bits set, then SIGHUP removed). */
	memset(libc_set, -1, sizeof(sigset_t));
	sigdelset(libc_set, SIGHUP);
	memcpy(k_set, libc_set, set_size);

	if (k_sigprocmask(SIG_UNBLOCK, k_set, old_set, set_size))
		perror_msg_and_fail("rt_sigprocmask");
	tprintf("rt_sigprocmask(SIG_UNBLOCK, ~[HUP], [HUP], %u) = 0\n",
		set_size);

	/* Same again with SIGKILL also removed from the set to unblock. */
	sigdelset(libc_set, SIGKILL);
	memcpy(k_set, libc_set, set_size);

	if (k_sigprocmask(SIG_UNBLOCK, k_set, old_set, set_size))
		perror_msg_and_fail("rt_sigprocmask");
	tprintf("rt_sigprocmask(SIG_UNBLOCK, ~[HUP KILL], [HUP], %u) = 0\n",
		set_size);

	/* Block a set of five signals on top of the SIGHUP already blocked. */
	sigemptyset(libc_set);
	sigaddset(libc_set, SIGHUP);
	sigaddset(libc_set, SIGINT);
	sigaddset(libc_set, SIGQUIT);
	sigaddset(libc_set, SIGALRM);
	sigaddset(libc_set, SIGTERM);
	memcpy(k_set, libc_set, set_size);

	if (k_sigprocmask(SIG_BLOCK, k_set, old_set, set_size))
		perror_msg_and_fail("rt_sigprocmask");
	tprintf("rt_sigprocmask(SIG_BLOCK, %s, [HUP], %u) = 0\n",
		"[HUP INT QUIT ALRM TERM]", set_size);

	/* Query only: NULL new set, the current mask is returned in old_set. */
	if (k_sigprocmask(SIG_SETMASK, NULL, old_set, set_size))
		perror_msg_and_fail("rt_sigprocmask");
	tprintf("rt_sigprocmask(SIG_SETMASK, NULL, %s, %u) = 0\n",
		"[HUP INT QUIT ALRM TERM]", set_size);

	/*
	 * Input pointer moved half a set past the start of k_set: the call is
	 * required to fail (logged as EFAULT).  Presumably the shifted buffer
	 * runs past the end of accessible memory - TODO confirm.
	 */
	assert(k_sigprocmask(SIG_SETMASK, k_set + (set_size >> 1), NULL,
			     set_size) == -1);
	tprintf("rt_sigprocmask(SIG_SETMASK, %p, NULL, %u) = -1 EFAULT (%m)\n",
		k_set + (set_size >> 1), set_size);

	/* Same check for the output pointer. */
	assert(k_sigprocmask(SIG_SETMASK, k_set, old_set + (set_size >> 1),
			     set_size) == -1);
	tprintf("rt_sigprocmask(SIG_SETMASK, %s, %p, %u) = -1 EFAULT (%m)\n",
		"[HUP INT QUIT ALRM TERM]",
		old_set + (set_size >> 1), set_size);

	tprintf("+++ exited with 0 +++\n");
	return 0;
}
Beispiel #18
0
// Puts the row into its "no evidence" state (both thresholds zero) and yields
// the 0 result that row_words reports whenever it cannot size the spaces.
static inT32 row_words_no_evidence(TO_ROW *row) {
  row->min_space = 0;
  row->max_nonspace = 0;
  return 0;
}

// Estimates the kern (non-space) and space gap sizes for one row.
//
// Gaps between consecutive blobs that are not joined to their predecessor are
// histogrammed, smoothed and clustered; the cluster medians are then used to
// pick a "lower" (kern) and an "upper" (space) size. On success the row's
// min_space, max_nonspace, space_threshold, space_size and kern_size are
// written and the total of cluster 2 is returned. When there is not enough
// evidence, min_space and max_nonspace are zeroed and 0 is returned.
//
//   block      - block the row belongs to; its xheight, min_space and
//                max_nonspace are read.
//   row        - row to measure and update.
//   maxwidth   - gaps of this size or more are ignored; also the upper bound
//                handed to the gap histogram.
//   rotation   - not used by this function.
//   testing_on - enables debug output together with
//                textord_show_initial_words.
inT32 row_words(TO_BLOCK *block, TO_ROW *row, inT32 maxwidth,
                FCOORD rotation, BOOL8 testing_on) {
  BLOBNBOX_IT it = row->blob_list ();
  STATS gap_hist (0, maxwidth);
  STATS clusters[4];             // medians are read from entries 1..3
  float medians[3];
  float lower, upper;

  const bool show_debug = testing_on && textord_show_initial_words;
  ICOORD test_point (textord_test_x, textord_test_y);
  inT32 smoothing =
    (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);

  // Pass 1: histogram the blob widths and note whether the debug test point
  // lies inside any blob of this row.
  BOOL8 row_has_test_point = FALSE;
  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
    TBOX box = it.data ()->bounding_box ();
    if (box.contains (test_point))
      row_has_test_point = TRUE;
    gap_hist.add (box.width (), 1);
  }
  // The width percentile is still evaluated, but the filter that used it is
  // disabled, so the value is not consulted afterwards.
  inT32 min_gap = (inT32) floor (gap_hist.ile (textord_words_width_ile));
  (void) min_gap;
  gap_hist.clear ();

  // Pass 2: histogram the gap from the right edge of the previous unjoined
  // blob to the left edge of the current one, skipping gaps >= maxwidth.
  BOOL8 have_prev = FALSE;
  inT32 prev_right = -MAX_INT32;
  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
    BLOBNBOX *blob = it.data ();
    if (blob->joined_to_prev ())
      continue;
    TBOX box = blob->bounding_box ();
    if (have_prev && box.left () - prev_right < maxwidth)
      gap_hist.add (box.left () - prev_right, 1);
    prev_right = box.right ();
    have_prev = TRUE;
  }
  if (gap_hist.get_total () == 0)
    return row_words_no_evidence (row);

  // Cluster the smoothed gaps, narrowing the [lower, upper] window until at
  // least two clusters appear or the window collapses.
  gap_hist.smooth (smoothing);
  lower = row->xheight * textord_words_initial_lower;
  upper = row->xheight * textord_words_initial_upper;
  inT32 cluster_count = gap_hist.cluster (lower, upper,
                                          textord_spacesize_ratioprop, 3,
                                          clusters);
  while (cluster_count < 2 && ceil (lower) < floor (upper)) {
    // upper is updated first, so the new lower is derived from the new upper.
    upper = (upper * 3 + lower) / 4;
    lower = (lower * 3 + upper) / 4;
    cluster_count = gap_hist.cluster (lower, upper,
                                      textord_spacesize_ratioprop, 3,
                                      clusters);
  }
  if (cluster_count < 2)
    return row_words_no_evidence (row);

  for (inT32 index = 0; index < cluster_count; index++)
    medians[index] = clusters[index + 1].ile (0.5);

  // Smallest size the "upper" cluster may have to be accepted as a space.
  const double min_space_limit = block->xheight * textord_words_min_minspace;

  if (cluster_count > 2) {
    if (show_debug) {
      tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
               row->intercept (),
               clusters[1].ile (0.5),
               clusters[2].ile (0.5), clusters[3].ile (0.5));
    }
    lower = medians[0];
    if (medians[1] > lower) {
      upper = medians[1];        // prefer the second cluster
      if (upper < min_space_limit && medians[2] > medians[1])
        upper = medians[2];
    }
    else if (medians[2] > lower && medians[2] >= min_space_limit) {
      upper = medians[2];
    }
    else if (lower >= min_space_limit) {
      // First cluster is the large one: swap roles with the second.
      upper = lower;
      lower = medians[1];
      if (show_debug) {
        tprintf ("Had to switch most common from lower to upper!!\n");
        gap_hist.print (stdout, TRUE);
      }
    }
    else {
      return row_words_no_evidence (row);
    }
  }
  else {
    // Exactly two clusters: the smaller median is the kern, the larger the
    // space.
    const bool reversed = medians[1] < medians[0];
    if (reversed && show_debug) {
      tprintf ("Had to switch most common from lower to upper!!\n");
      gap_hist.print (stdout, TRUE);
    }
    lower = reversed ? medians[1] : medians[0];
    upper = reversed ? medians[0] : medians[1];
  }
  if (upper < min_space_limit)
    return row_words_no_evidence (row);

  // Report (debug only) when the row estimate straddles the block estimate
  // the wrong way round.
  if (upper * 3 < block->min_space * 2 + block->max_nonspace
      || lower * 3 > block->min_space * 2 + block->max_nonspace) {
    if (show_debug) {
      tprintf ("Disagreement between block and row at %g!!\n",
               row->intercept ());
      tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
      gap_hist.print (stdout, TRUE);
    }
  }

  const double spread = (upper - lower) * textord_words_definite_spread;
  row->min_space = (inT32) ceil (upper - spread);
  row->max_nonspace = (inT32) floor (lower + spread);
  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
  row->space_size = upper;
  row->kern_size = lower;

  if (show_debug) {
    if (row_has_test_point) {
      tprintf ("GAP STATS\n");
      gap_hist.print (stdout, TRUE);
      tprintf ("SPACE stats\n");
      clusters[2].print (stdout, FALSE);
      tprintf ("NONSPACE stats\n");
      clusters[1].print (stdout, FALSE);
    }
    tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
             row->intercept (), row->min_space, upper,
             row->max_nonspace, lower);
  }
  return clusters[2].get_total ();
}
Beispiel #19
0
// TODO(rays) Merge with outline_complexity.
// Recursively counts the outlines, found in the buckets covered by the
// parent's bounding box, that satisfy *child < *outline (presumably "child
// lies inside outline" - confirm against C_OUTLINE::operator<), plus a count
// of their own children.
//
// Returns child_count + grandchild_count. The scan stops early, returning the
// running total, as soon as that total exceeds max_count. It returns
// max_count + 1 when the parent looks box-like and one of its children fails
// the area / path-length tests controlled by the edges_* parameters.
inT32 OL_BUCKETS::count_children(C_OUTLINE *outline, inT32 max_count) {
  // Range of buckets overlapped by the parent's bounding box.
  TBOX parent_bounds = outline->bounding_box();
  inT16 xmin = (parent_bounds.left() - bl.x()) / BUCKETSIZE;
  inT16 xmax = (parent_bounds.right() - bl.x()) / BUCKETSIZE;
  inT16 ymin = (parent_bounds.bottom() - bl.y()) / BUCKETSIZE;
  inT16 ymax = (parent_bounds.top() - bl.y()) / BUCKETSIZE;

  inT32 child_count = 0;
  inT32 grandchild_count = 0;
  inT32 parent_area = 0;           // filled in lazily at the first child
  FLOAT32 max_parent_area = 0;
  BOOL8 parent_box = TRUE;         // parent may still be box-like
  C_OUTLINE_IT child_it;

  for (inT16 yindex = ymin; yindex <= ymax; yindex++) {
    for (inT16 xindex = xmin; xindex <= xmax; xindex++) {
      child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
      if (child_it.empty())
        continue;
      for (child_it.mark_cycle_pt(); !child_it.cycled_list();
           child_it.forward()) {
        C_OUTLINE *child = child_it.data();
        if (child == outline || !(*child < *outline))
          continue;

        child_count++;
        if (child_count <= max_count) {
          // Share the remaining budget with the recursive count.
          int max_grand = (max_count - child_count) /
                          edges_children_per_grandchild;
          if (max_grand > 0) {
            grandchild_count += count_children(child, max_grand) *
                                edges_children_per_grandchild;
          } else {
            grandchild_count += count_children(child, 1);
          }
        }
        if (child_count + grandchild_count > max_count) {
          if (edges_debug)
            tprintf("Discarding parent with child count=%d, gc=%d\n",
                    child_count,grandchild_count);
          return child_count + grandchild_count;
        }

        if (parent_area == 0) {
          parent_area = outline->outer_area();
          if (parent_area < 0)
            parent_area = -parent_area;
          max_parent_area = outline->bounding_box().area() * edges_boxarea;
          if (parent_area < max_parent_area)
            parent_box = FALSE;
        }

        // The remaining tests only apply while the parent looks box-like
        // and, with edges_children_fix set, only to children tall enough.
        if (!parent_box)
          continue;
        if (edges_children_fix &&
            !(child->bounding_box().height() > edges_min_nonhole))
          continue;

        inT32 child_area = child->outer_area();
        if (child_area < 0)
          child_area = -child_area;

        if (edges_children_fix) {
          if (parent_area - child_area < max_parent_area) {
            parent_box = FALSE;
            continue;
          }
          if (grandchild_count > 0) {
            if (edges_debug)
              tprintf("Discarding parent of area %d, child area=%d, max%g "
                      "with gc=%d\n",
                      parent_area, child_area, max_parent_area,
                      grandchild_count);
            return max_count + 1;
          }
          inT32 child_length = child->pathlength();
          if (child_length * child_length >
              child_area * edges_patharea_ratio) {
            if (edges_debug)
              tprintf("Discarding parent of area %d, child area=%d, max%g "
                      "with child length=%d\n",
                      parent_area, child_area, max_parent_area,
                      child_length);
            return max_count + 1;
          }
        }
        if (child_area < child->bounding_box().area() * edges_childarea) {
          if (edges_debug)
            tprintf("Discarding parent of area %d, child area=%d, max%g "
                    "with child rect=%d\n",
                    parent_area, child_area, max_parent_area,
                    child->bounding_box().area());
          return max_count + 1;
        }
      }
    }
  }
  return child_count + grandchild_count;
}
Beispiel #20
0
// Estimates the kern (non-space) and space gap sizes for one row, using the
// block-level estimates as the dividing line and as fallbacks.
//
// Gaps between consecutive unjoined blobs are histogrammed, smoothed and
// clustered. The first cluster median at or below block->max_nonspace becomes
// "lower" (else block->pr_nonsp); the first median above it becomes "upper"
// (else block->pr_space). On success the row's min_space, max_nonspace,
// space_threshold, space_size and kern_size are written and 1 is returned.
// With no gaps or no clusters, min_space and max_nonspace are zeroed and 0 is
// returned.
//
//   block      - block the row belongs to; xheight, pr_space, pr_nonsp and
//                max_nonspace are read.
//   row        - row to measure and update.
//   maxwidth   - gaps of this size or more are ignored; also the upper bound
//                handed to the gap histogram.
//   rotation   - not used by this function.
//   testing_on - enables debug output.
inT32 row_words2(TO_BLOCK *block, TO_ROW *row, inT32 maxwidth,
                 FCOORD rotation, BOOL8 testing_on) {
  BLOBNBOX_IT it = row->blob_list ();
  STATS gap_hist (0, maxwidth);
  float medians[BLOCK_STATS_CLUSTERS];
  STATS clusters[BLOCK_STATS_CLUSTERS + 1];
  float lower, upper;
  inT32 index;

  // Retained from the original; the test point is not consulted here.
  ICOORD test_point;
  test_point = ICOORD (textord_test_x, textord_test_y);
  inT32 smoothing =
    (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
  // NOTE(review): nothing in this function ever sets this flag, so the
  // detailed stats dump at the end is currently unreachable.
  BOOL8 testing_row = FALSE;

  // Pass 1: gaps between "valid" neighbours, counting every unjoined blob.
  inT32 min_width = (inT32) block->pr_space;
  BOOL8 prev_valid = FALSE;
  inT32 prev_right = -MAX_INT16;
  inT32 total_count = 0;
  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
    BLOBNBOX *blob = it.data ();
    if (blob->joined_to_prev ())
      continue;
    TBOX box = blob->bounding_box ();
    // NOTE(review): the width test is immediately overridden, so every
    // unjoined blob currently counts as valid.
    BOOL8 this_valid = box.width () >= min_width;
    this_valid = TRUE;
    if (this_valid && prev_valid && box.left () - prev_right < maxwidth)
      gap_hist.add (box.left () - prev_right, 1);
    total_count++;
    prev_right = box.right ();
    prev_valid = this_valid;
  }

  // Too few usable gaps: start over and take every gap, including the one
  // measured from the initial -MAX_INT16 position if it is below maxwidth.
  inT32 valid_count = gap_hist.get_total ();
  if (valid_count < total_count * textord_words_minlarge) {
    gap_hist.clear ();
    prev_right = -MAX_INT16;
    for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
      BLOBNBOX *blob = it.data ();
      if (blob->joined_to_prev ())
        continue;
      TBOX box = blob->bounding_box ();
      if (box.left () - prev_right < maxwidth)
        gap_hist.add (box.left () - prev_right, 1);
      prev_right = box.right ();
    }
  }
  if (gap_hist.get_total () == 0) {
    row->min_space = 0;
    row->max_nonspace = 0;
    return 0;
  }

  // Re-run the clustering while each call yields more clusters than the
  // previous one and the cluster limit has not been reached.
  inT32 cluster_count = 0;
  inT32 prev_count;
  lower = block->xheight * words_initial_lower;
  upper = block->xheight * words_initial_upper;
  gap_hist.smooth (smoothing);
  do {
    prev_count = cluster_count;
    cluster_count = gap_hist.cluster (lower, upper,
                                      textord_spacesize_ratioprop,
                                      BLOCK_STATS_CLUSTERS, clusters);
  }
  while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
  if (cluster_count < 1) {
    row->min_space = 0;
    row->max_nonspace = 0;
    return 0;
  }

  for (index = 0; index < cluster_count; index++)
    medians[index] = clusters[index + 1].ile (0.5);
  if (testing_on) {
    tprintf ("cluster_count=%d:", cluster_count);
    for (index = 0; index < cluster_count; index++)
      tprintf (" %g(%d)", medians[index], clusters[index + 1].get_total ());
    tprintf ("\n");
  }

  // First cluster at or below the block's non-space limit gives the kern.
  index = 0;
  while (index < cluster_count && medians[index] > block->max_nonspace)
    index++;
  if (index < cluster_count) {
    lower = medians[index];
  }
  else {
    if (testing_on)
      tprintf ("No cluster below block threshold!, using default=%g\n",
               block->pr_nonsp);
    lower = block->pr_nonsp;
  }

  // First cluster above the block's non-space limit gives the space.
  index = 0;
  while (index < cluster_count && medians[index] <= block->max_nonspace)
    index++;
  if (index < cluster_count) {
    upper = medians[index];
  }
  else {
    if (testing_on)
      tprintf ("No cluster above block threshold!, using default=%g\n",
               block->pr_space);
    upper = block->pr_space;
  }

  row->min_space =
    (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
  row->max_nonspace =
    (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
  row->space_size = upper;
  row->kern_size = lower;

  if (testing_on) {
    if (testing_row) {
      tprintf ("GAP STATS\n");
      gap_hist.print (stdout, TRUE);
      tprintf ("SPACE stats\n");
      clusters[2].print (stdout, FALSE);
      tprintf ("NONSPACE stats\n");
      clusters[1].print (stdout, FALSE);
    }
    tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
             row->intercept (), row->min_space, upper,
             row->max_nonspace, lower);
  }
  return 1;
}
/*
 * Prints two values for the caller: flags as an unsigned decimal followed by
 * ", ", then addr through printaddr().
 */
static void decode_seccomp_set_mode_strict(unsigned int flags,
					   unsigned long addr)
{
	tprintf("%u, ", flags);
	printaddr(addr);
}
Beispiel #22
0
/**
 * Computes per-plane and whole-frame PSNR between two YV12 frames and,
 * optionally, runs a coarse artifact check on the Y plane.
 *
 * @param source             reference frame.
 * @param dest               frame compared against the reference; read with
 *                           its own strides but with source's dimensions.
 * @param ypsnr,upsnr,vpsnr  receive the Y, U and V plane PSNR, each produced by
 *                           vp8_mse_2_psnr_tester(sample count, 255.0, sum of
 *                           squared differences).
 * @param sq_error           receives the sum of squared differences over all
 *                           three planes.
 * @param print_out          when non-zero and an artifact is suspected, the
 *                           per-segment PSNR grid is written with
 *                           tprintf(print_out, ...).
 * @param possible_artifact  in/out. When it equals kRunArtifactDetection on
 *                           entry, the Y plane is split into a grid of up to
 *                           16x16 segments and the value is replaced by
 *                           kPossibleArtifactFound or kNoArtifactFound;
 *                           otherwise it is left untouched.
 * @return the PSNR over all planes, using y_height * y_width * 3 / 2 as the
 *         sample count.
 *
 * Fixes relative to the previous version:
 *  - frames narrower than 64 pixels produced a segment count of 0 and a
 *    division by zero; the count is now at least 1;
 *  - when a new best segment PSNR was found, the third-best slot received the
 *    old best instead of the old second-best;
 *  - the width of non-final segments was rounded with a hard-coded 15 instead
 *    of (width_segments - 1), disagreeing with the bucket indexing whenever
 *    fewer than 16 segments are used.
 */
double vp8_calcpsnr_tester(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                           double *ypsnr, double *upsnr, double *vpsnr,
                           double *sq_error, int print_out,
                           int& possible_artifact)
{
    int i, j;
    int diff;
    double frame_psnr;
    double total;
    double grand_total;
    unsigned char *src = source->y_buffer;
    unsigned char *dst = dest->y_buffer;

    double sub_frame_ypsnr[16][16] = {0}; // per-segment PSNR
    double sub_frame_total[16][16] = {0}; // per-segment squared-error sums

    // Try to keep segments at least 64 pixels wide, with between 1 and 16
    // segments per axis so the 16x16 tables above are never overrun and the
    // divisions below never see a zero.
    // NOTE(review): the vertical count is taken from the frame WIDTH, as in
    // the original code; confirm whether y_height / 64 was intended.
    int width_segments = source->y_width / 64;

    if (width_segments > 16)
        width_segments = 16;
    if (width_segments < 1)
        width_segments = 1;

    int height_segments = width_segments;

    // Rounded-up segment sizes, and the divisors used to map a pixel
    // coordinate to its segment (never smaller than 1).
    const int ceil_seg_height =
        ((height_segments - 1) + source->y_height) / height_segments;
    const int ceil_seg_width =
        ((width_segments - 1) + source->y_width) / width_segments;
    const int row_divisor =
        (source->y_height / height_segments == 0) ? 1 : ceil_seg_height;
    const int col_divisor =
        (source->y_width / width_segments == 0) ? 1 : ceil_seg_width;

    // The request is latched here because possible_artifact is overwritten
    // with the verdict further down.
    const bool run_detection = (possible_artifact == kRunArtifactDetection);

    total = 0.0;
    grand_total = 0.0;

    // Loop through the Y plane raw and reconstruction data summing
    // (square differences)
    for (i = 0; i < source->y_height; i++)
    {
        for (j = 0; j < source->y_width; j++)
        {
            diff        = (int)(src[j]) - (int)(dst[j]);
            total      += diff * diff;

            // gather totals for internal segments
            if (run_detection)
                sub_frame_total[i / row_divisor][j / col_divisor] +=
                    diff * diff;
        }

        src += source->y_stride;
        dst += dest->y_stride;
    }

    // Work out Y PSNR
    *ypsnr = vp8_mse_2_psnr_tester(source->y_height * source->y_width, 255.0,
        total);

    double max_psnr_1 = 0;
    double max_psnr_2 = 0;
    double max_psnr_3 = 0;
    double min_psnr = 61;

    if (run_detection)
    {
        // Work out Y PSNRs for internal segments and find min and max
        for (i = 0; i < height_segments; i++)
        {
            // The last row/column of segments takes whatever is left over.
            int sub_frame_height = ceil_seg_height;
            if (i == (height_segments - 1))
                sub_frame_height = source->y_height -
                    ((height_segments - 1) * ceil_seg_height);

            for (j = 0; j < width_segments; j++)
            {
                int sub_frame_width = ceil_seg_width;
                if (j == (width_segments - 1))
                    sub_frame_width = source->y_width -
                        ((width_segments - 1) * ceil_seg_width);

                const double psnr = vp8_mse_2_psnr_tester(sub_frame_height *
                    sub_frame_width, 255.0, sub_frame_total[i][j]);
                sub_frame_ypsnr[i][j] = psnr;

                // Track the top three segment PSNRs, ignoring the value 60
                // (presumably the helper's result for zero error - confirm),
                // shifting the lower slots down before overwriting them.
                if (psnr != 60 && psnr > max_psnr_1)
                {
                    max_psnr_3 = max_psnr_2;
                    max_psnr_2 = max_psnr_1;
                    max_psnr_1 = psnr;
                }
                else if (psnr != 60 && psnr > max_psnr_2)
                {
                    max_psnr_3 = max_psnr_2;
                    max_psnr_2 = psnr;
                }
                else if (psnr != 60 && psnr > max_psnr_3)
                {
                    max_psnr_3 = psnr;
                }

                if (psnr < min_psnr)
                    min_psnr = psnr;
            }
        }

        // Flag a potential artifact when the worst segment is at or below
        // one seventh of the sum of the three best segments.
        if ((max_psnr_1 + max_psnr_2 + max_psnr_3) / 7 >= min_psnr)
            possible_artifact = kPossibleArtifactFound;
        else
            possible_artifact = kNoArtifactFound;

        if (possible_artifact == kPossibleArtifactFound && print_out)
        {
            tprintf(print_out, "min: %.0f Max: %.0f %.0f %.0f", min_psnr,
                max_psnr_1, max_psnr_2, max_psnr_3);

            for (i = 0; i < height_segments; i++)
            {
                tprintf(print_out, "\n");
                for (int z = 0; z < (width_segments * 3) + 1; z++)
                {
                    tprintf(print_out, "-");
                }

                tprintf(print_out, "\n|");
                for (j = 0; j < width_segments; j++)
                {
                    tprintf(print_out, "%.0f|", sub_frame_ypsnr[i][j]);
                }
            }
            tprintf(print_out, "\n");
            for (int z = 0; z < (width_segments * 3) + 1; z++)
            {
                tprintf(print_out, "-");
            }
            tprintf(print_out, "\n");
        }
    }

    grand_total += total;
    total = 0;

    // Loop through the U plane
    src = source->u_buffer;
    dst = dest->u_buffer;

    for (i = 0; i < source->uv_height; i++)
    {
        for (j = 0; j < source->uv_width; j++)
        {
            diff        = (int)(src[j]) - (int)(dst[j]);
            total      += diff * diff;
        }

        src += source->uv_stride;
        dst += dest->uv_stride;
    }

    // Work out U PSNR
    *upsnr = vp8_mse_2_psnr_tester(source->uv_height * source->uv_width, 255.0,
        total);
    grand_total += total;
    total = 0;

    // Loop through the V plane
    src = source->v_buffer;
    dst = dest->v_buffer;

    for (i = 0; i < source->uv_height; i++)
    {
        for (j = 0; j < source->uv_width; j++)
        {
            diff        = (int)(src[j]) - (int)(dst[j]);
            total      += diff * diff;
        }

        src += source->uv_stride;
        dst += dest->uv_stride;
    }

    // Work out V PSNR
    *vpsnr = vp8_mse_2_psnr_tester(source->uv_height * source->uv_width, 255.0,
        total);
    grand_total += total;
    total = 0;

    // Work out total PSNR
    frame_psnr = vp8_mse_2_psnr_tester(source->y_height * source->y_width *
        3 / 2 , 255.0, grand_total);

    *sq_error = 1.0 * grand_total;

    return frame_psnr;
}
Beispiel #23
0
/**
 * @name cube_recognize
 *
 * Call cube on the current word, and write the result to word.
 * Sets up a fake result and returns false if something goes wrong:
 * cube returns no alternatives, its state cannot be extracted, or its
 * character boxes cannot be converted into a BoxWord.
 *
 * @param cube_obj cube recognizer already set up for this word.
 * @param block    not used by this function.
 * @param word     word result to fill in; also supplies the bounding box.
 * @return true if word now holds cube's best result, false if it holds a
 *         fake result.
 *
 * A failure of extract_cube_state() is handled regardless of
 * cube_debug_level (previously the failure path was only taken when debugging
 * was enabled, letting the code run on with NULL boxes/samples and an
 * uninitialised character count). The sample array is also released when the
 * BoxWord cannot be created.
 */
bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block,
                               WERD_RES *word) {
  // Run cube
  WordAltList *cube_alt_list = cube_obj->RecognizeWord();
  if (!cube_alt_list || cube_alt_list->AltCount() <= 0) {
    if (cube_debug_level > 0) {
      tprintf("Cube returned nothing for word at:");
      word->word->bounding_box().print();
    }
    word->SetupFake(unicharset);
    return false;
  }

  // Get cube's best result and its probability, mapped to tesseract's
  // certainty range
  char_32 *cube_best_32 = cube_alt_list->Alt(0);
  double cube_prob = CubeUtils::Cost2Prob(cube_alt_list->AltCost(0));
  float cube_certainty = convert_prob_to_tess_certainty(cube_prob);
  string cube_best_str;
  CubeUtils::UTF32ToUTF8(cube_best_32, &cube_best_str);

  // Retrieve Cube's character bounding boxes and CharSamples,
  // corresponding to the most recent call to RecognizeWord().
  Boxa *char_boxes = NULL;
  CharSamp **char_samples = NULL;
  int num_chars = 0;
  if (!extract_cube_state(cube_obj, &num_chars, &char_boxes, &char_samples)) {
    // The warning is optional; bailing out is not.
    if (cube_debug_level > 0) {
      tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract "
              "cube state.\n");
    }
    word->SetupFake(unicharset);
    return false;
  }

  // Convert cube's character bounding boxes to a BoxWord.
  BoxWord cube_box_word;
  TBOX tess_word_box = word->word->bounding_box();
  if (word->denorm.block() != NULL)
    tess_word_box.rotate(word->denorm.block()->re_rotation());
  bool box_word_success = create_cube_box_word(char_boxes, num_chars,
                                               tess_word_box,
                                               &cube_box_word);
  boxaDestroy(&char_boxes);
  if (!box_word_success) {
    if (cube_debug_level > 0) {
      tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
              "create cube BoxWord\n");
    }
    // Release the sample array exactly as the success path does below.
    delete [] char_samples;
    word->SetupFake(unicharset);
    return false;
  }

  // Fill tesseract result's fields with cube results
  fill_werd_res(cube_box_word, cube_best_str.c_str(), word);

  // Create cube's best choice: one BLOB_CHOICE per character, all carrying
  // the word-level certainty.
  BLOB_CHOICE** choices = new BLOB_CHOICE*[num_chars];
  for (int i = 0; i < num_chars; ++i) {
    UNICHAR_ID uch_id =
        cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel());
    choices[i] = new BLOB_CHOICE(uch_id, -cube_certainty, cube_certainty,
                                 -1, 0.0f, 0.0f, 0.0f, BCC_STATIC_CLASSIFIER);
  }
  word->FakeClassifyWord(num_chars, choices);
  // within a word, cube recognizes the word in reading order.
  word->best_choice->set_unichars_in_script_order(true);
  // Only the arrays are freed here, not the objects they point to.
  delete [] choices;
  delete [] char_samples;

  // Some sanity checks
  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());

  if (cube_debug_level || classify_debug_level) {
    tprintf("Cube result: %s r=%g, c=%g\n",
            word->best_choice->unichar_string().string(),
            word->best_choice->rating(),
            word->best_choice->certainty());
  }
  return true;
}
Beispiel #24
0
/*
 * Parse g->p.cpu_list_str and bind each task to the CPU(s) it names.
 *
 * The list is a comma separated sequence of tokens. Each token may contain:
 *   "N"      a single CPU,
 *   "A-B"    an inclusive CPU range,
 *   "#S"     a step S used when walking the range (default 1),
 *   "_L"     a mask length: the L CPUs starting at the bound CPU are allowed
 *            (default 1),
 *   "xM"     a multiplier: the binding is repeated for M consecutive tasks
 *            (default 1).
 *
 * Tasks are filled in order (g->threads[0], [1], ...); bindings beyond
 * g->p.nr_tasks are ignored with a note, and tasks left without a binding
 * are reported at the end.
 *
 * Returns 0 on success (or when no CPU list was given), and -1 when a token
 * names a CPU that is not below g->p.nr_cpus. Malformed values trip BUG_ON().
 */
static int parse_setup_cpu_list(void)
{
	struct thread_data *td;
	char *str0, *str;
	int t;

	if (!g->p.cpu_list_str)
		return 0;

	dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);

	/* strsep() advances 'str', so keep the original pointer for free(). */
	str0 = str = strdup(g->p.cpu_list_str);
	t = 0;

	BUG_ON(!str);

	tprintf("# binding tasks to CPUs:\n");
	tprintf("#  ");

	while (true) {
		int bind_cpu, bind_cpu_0, bind_cpu_1;
		char *tok, *tok_end, *tok_step, *tok_len, *tok_mul;
		int bind_len;
		int step;
		int mul;

		tok = strsep(&str, ",");
		if (!tok)
			break;

		tok_end = strstr(tok, "-");

		dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
		if (!tok_end) {
			/* Single CPU specified: */
			bind_cpu_0 = bind_cpu_1 = atol(tok);
		} else {
			/* CPU range specified (for example: "5-11"): */
			bind_cpu_0 = atol(tok);
			bind_cpu_1 = atol(tok_end + 1);
		}

		step = 1;
		tok_step = strstr(tok, "#");
		if (tok_step) {
			step = atol(tok_step + 1);
			BUG_ON(step <= 0 || step >= g->p.nr_cpus);
		}

		/*
		 * Mask length.
		 * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4',
		 * where the _4 means the next 4 CPUs are allowed.
		 */
		bind_len = 1;
		tok_len = strstr(tok, "_");
		if (tok_len) {
			bind_len = atol(tok_len + 1);
			BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus);
		}

		/* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
		mul = 1;
		tok_mul = strstr(tok, "x");
		if (tok_mul) {
			mul = atol(tok_mul + 1);
			BUG_ON(mul <= 0);
		}

		dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);

		if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) {
			printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus);
			/* Release the duplicated list; this exit path used to leak it. */
			free(str0);
			return -1;
		}

		BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
		BUG_ON(bind_cpu_0 > bind_cpu_1);

		for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
			int i;

			for (i = 0; i < mul; i++) {
				int cpu;

				if (t >= g->p.nr_tasks) {
					printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu);
					goto out;
				}
				td = g->threads + t;

				if (t)
					tprintf(",");
				if (bind_len > 1) {
					tprintf("%2d/%d", bind_cpu, bind_len);
				} else {
					tprintf("%2d", bind_cpu);
				}

				/* Allow CPUs [bind_cpu, bind_cpu + bind_len) for this task. */
				CPU_ZERO(&td->bind_cpumask);
				for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
					BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
					CPU_SET(cpu, &td->bind_cpumask);
				}
				t++;
			}
		}
	}
out:

	tprintf("\n");

	if (t < g->p.nr_tasks)
		printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);

	free(str0);
	return 0;
}
Beispiel #25
0
/*
 * Print a timeval as "{seconds, microseconds}" through tprintf().
 * Both fields are converted to unsigned long before printing.
 * The tcp argument is not used by this function.
 */
static void
tprint_timeval(struct tcb *tcp, const struct timeval *tv)
{
	const unsigned long seconds = (unsigned long) tv->tv_sec;
	const unsigned long microseconds = (unsigned long) tv->tv_usec;

	tprintf("{%lu, %lu}", seconds, microseconds);
}
// Loads the language-dependent data and parameters for this Tesseract
// instance from [lang].traineddata.
//
// Returns false if the data directory cannot be determined (DLL build only),
// if TessdataManager fails to initialize, if a unicharset for the specified
// language was not found or was invalid, or if the unicharset has more than
// MAX_NUM_CLASSES entries. Returns true otherwise.
// This function initializes TessdataManager. After TessdataManager is
// no longer needed, TessdataManager::End() should be called.
//
// Parameters are applied in this order, later sources overriding earlier
// ones: the language config stored in [lang].traineddata, the config files
// in configs[0..configs_size), then the vars_vec/vars_values pairs.
// NOTE: a failure to set one of the vars_vec params terminates the process
// via exit(1) rather than returning false.
//
// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
// it is OEM_DEFAULT, in which case the value of the variable will be obtained
// from the language-specific config file (stored in [lang].traineddata), from
// the config files specified on the command line or left as the default
// OEM_TESSERACT_ONLY if none of the configs specify this variable.
bool Tesseract::init_tesseract_lang_data(
    const char *arg0, const char *textbase, const char *language,
    OcrEngineMode oem, char **configs, int configs_size,
    const GenericVector<STRING> *vars_vec,
    const GenericVector<STRING> *vars_values,
    bool set_only_init_params) {
  // Set the basename, compute the data directory.
  // When built as a DLL the data directory is resolved with getpath();
  // otherwise main_setup() is called with arg0 and textbase.
	 #if _BUILDASDLL
		imagebasename = textbase;      /* name of image */
		STRING dll_module_name;
	#ifdef __MSW32__
		dll_module_name = tessedit_module_name;
	#endif
		if (getpath(arg0, dll_module_name, datadir) < 0)
			return false;
	#else
		main_setup(arg0, textbase);
	#endif

  // Set the language data path prefix: "<datadir><lang>.".
  // A NULL language falls back to "eng".
  lang = language != NULL ? language : "eng";
  language_data_path_prefix = datadir;
  language_data_path_prefix += lang;
  language_data_path_prefix += ".";

  // Initialize TessdataManager.
  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
  if (!tessdata_manager.Init(tessdata_path.string(),
                             tessdata_manager_debug_level)) {
    return false;
  }

  // If a language specific config file (lang.config) exists, load it in.
  if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
    ParamUtils::ReadParamsFromFp(
        tessdata_manager.GetDataFilePtr(),
        tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
        false, this->params());
    if (tessdata_manager_debug_level) {
      tprintf("Loaded language config file\n");
    }
  }

  // Load tesseract variables from config files. This is done after loading
  // language-specific variables from [lang].traineddata file, so that custom
  // config files can override values in [lang].traineddata file.
  for (int i = 0; i < configs_size; ++i) {
    read_config_file(configs[i], set_only_init_params);
  }

  // Set params specified in vars_vec (done after setting params from config
  // files, so that params in vars_vec can override those from files).
  // NOTE(review): vars_values is indexed up to vars_vec->size(); this assumes
  // both vectors have the same length - confirm against callers.
  if (vars_vec != NULL && vars_values != NULL) {
    for (int i = 0; i < vars_vec->size(); ++i) {
      if (!ParamUtils::SetParam((*vars_vec)[i].string(),
                                (*vars_values)[i].string(),
                                set_only_init_params, this->params())) {
        tprintf("Error setting param %s\n", (*vars_vec)[i].string());
        exit(1);
      }
    }
  }

  // If tessedit_write_params_to_file names a file, dump the current params
  // into it. Failure to open the file is reported but is not fatal.
  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
    FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
    if (params_file != NULL) {
      ParamUtils::PrintParams(params_file, this->params());
      fclose(params_file);
      if (tessdata_manager_debug_level > 0) {
        tprintf("Wrote parameters to %s\n",
                tessedit_write_params_to_file.string());
      }
    } else {
      tprintf("Failed to open %s for writing params.\n",
              tessedit_write_params_to_file.string());
    }
  }

  // Determine which ocr engine(s) should be loaded and used for recognition.
  // An explicit oem overrides whatever the config sources set above.
  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
  if (tessdata_manager_debug_level) {
    tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
            static_cast<int>(tessedit_ocr_engine_mode));
  }

  // Load the unicharset
  if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
      !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
    return false;
  }
  if (unicharset.size() > MAX_NUM_CLASSES) {
    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
    return false;
  }
  right_to_left_ = unicharset.any_right_to_left();
  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");

  // Load the unichar ambiguities, unless running in ambigs training mode or
  // the traineddata has no ambigs component.
  if (!tessedit_ambigs_training &&
      tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
    unichar_ambigs.LoadUnicharAmbigs(
        tessdata_manager.GetDataFilePtr(),
        tessdata_manager.GetEndOffset(TESSDATA_AMBIGS),
        ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
    if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
  }

  // Load Cube objects if necessary.
  // A failed init_cube_objects() trips ASSERT_HOST instead of returning false.
  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
    ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
    if (tessdata_manager_debug_level)
      tprintf("Loaded Cube w/out combiner\n");
  } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
    ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
    if (tessdata_manager_debug_level)
      tprintf("Loaded Cube with combiner\n");
  }

  return true;
}
Beispiel #27
0
/*
 * Print the enemy currently being edited (tag_enemy and its companion
 * globals) to stdout as C designated-initializer text.
 *
 * Sample of the emitted layout:
 *
 *		.scroll_line = 1,
 *		.weapon = EW_GUN,
 *		.x = 100,
 *		.y = 100,
 *		.route = {
 *			[3] = {
 *				.shape = ES_JEEP,
 *				.dir = DIR16_WNW,
 *				.start_step = 0,
 *				.vel = 2,
 *			},
 *			[4] = {
 *				.shape = ES_SOLDIER1_RIGHT,
 *				.dir = DIR16_NNW,
 *				.start_step = 128,
 *				.vel = 2,
 *			},
 *
 *		},
 *
 * Route entries are printed up to the first one whose shape is ES_INVALID
 * (or ENEMY_MAX_ROUTE entries); every shot slot is printed.
 */
static void dump_enemy() {
/* printf() preceded by 'tabs' tab characters. */
#define tprintf(tabs, args...) \
	do { \
		int tab_idx_; \
		for(tab_idx_ = 0; tab_idx_ < tabs; tab_idx_++) \
			printf("\t"); \
		printf(args); \
	} while(0)
	int hop;
	int shot;

	printf("XXX screen %d\n", map_spawn_screen_index);

	/* Scalar fields. */
	tprintf(2, ".scroll_line = %d,\n", tag_enemy_spawnline);
	tprintf(2, ".weapon = %s,\n", enemy_weapon_string_lut[tag_enemy.weapon]);
	tprintf(2, ".x = %d,\n", tag_enemy.x);
	tprintf(2, ".y = %d,\n", tag_enemy_y);

	/* Route table, stopping at the first unused entry. */
	tprintf(2, ".route = {\n");
	for(hop = 0; hop < ENEMY_MAX_ROUTE; hop++) {
		if(tag_enemy.route[hop].shape == ES_INVALID)
			break;
		tprintf(3, "[%d] = {\n", hop);
		tprintf(4, ".shape = %s,\n", enemy_shape_string_lut[tag_enemy.route[hop].shape]);
		tprintf(4, ".dir = %s,\n", dir16_string_lut[tag_enemy.route[hop].dir]);
		tprintf(4, ".start_step = %d,\n", tag_enemy.route[hop].start_step);
		tprintf(4, ".vel = %d,\n", tag_enemy.route[hop].vel);
		tprintf(3, "},\n");
	}
	tprintf(2, "},\n");

	/* Shot table, always printed in full. */
	tprintf(2, ".shots = {\n");
	for(shot = 0; shot < ENEMY_MAX_SHOT; shot++) {
		tprintf(3, "[%d] = %d,\n", shot, tag_enemy.shots[shot]);
	}
	tprintf(2, "},\n");
}
Beispiel #28
0
// Dumps this word's state via tprintf: blank count, bounding box, the raw
// flag value (decimal and octal), each individual flag bit, the "correct"
// text, the number of rejected cblobs and the script id.
void WERD::print() {
  // Text used for a set / cleared flag bit. The cleared text deliberately
  // keeps its trailing space so the output is unchanged.
  const char* const kBitSet = "TRUE";
  const char* const kBitClear = "FALSE ";

  tprintf("Blanks= %d\n", blanks);
  bounding_box().print();

  tprintf("Flags = %d = 0%o\n", flags.val, flags.val);
  tprintf("   W_SEGMENTED = %s\n",
          flags.bit(W_SEGMENTED) ? kBitSet : kBitClear);
  tprintf("   W_ITALIC = %s\n",
          flags.bit(W_ITALIC) ? kBitSet : kBitClear);
  tprintf("   W_BOL = %s\n",
          flags.bit(W_BOL) ? kBitSet : kBitClear);
  tprintf("   W_EOL = %s\n",
          flags.bit(W_EOL) ? kBitSet : kBitClear);
  tprintf("   W_NORMALIZED = %s\n",
          flags.bit(W_NORMALIZED) ? kBitSet : kBitClear);
  tprintf("   W_SCRIPT_HAS_XHEIGHT = %s\n",
          flags.bit(W_SCRIPT_HAS_XHEIGHT) ? kBitSet : kBitClear);
  tprintf("   W_SCRIPT_IS_LATIN = %s\n",
          flags.bit(W_SCRIPT_IS_LATIN) ? kBitSet : kBitClear);
  tprintf("   W_DONT_CHOP = %s\n",
          flags.bit(W_DONT_CHOP) ? kBitSet : kBitClear);
  tprintf("   W_REP_CHAR = %s\n",
          flags.bit(W_REP_CHAR) ? kBitSet : kBitClear);
  tprintf("   W_FUZZY_SP = %s\n",
          flags.bit(W_FUZZY_SP) ? kBitSet : kBitClear);
  tprintf("   W_FUZZY_NON = %s\n",
          flags.bit(W_FUZZY_NON) ? kBitSet : kBitClear);

  tprintf("Correct= %s\n", correct.string());
  tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
  tprintf("Script = %d\n", script_id_);
}
// Creates a report of the error rate. The report_level controls the detail
// that is reported to stderr via tprintf:
// 0   -> no output.
// >=1 -> bottom-line error rate, the worst substitution error, multi-unichar
//        shape use and the OK/ERROR score histograms.
// >=3 -> font-level error rate.
// boosting_mode determines the return value. It selects which (un-weighted)
// error rate to return.
// The fontinfo_table from MasterTrainer provides the names of fonts.
// The it determines the current subset of the training samples.
// (it is not read by this function.)
// If not NULL, the top-choice unichar error rate is saved in unichar_error.
// If not NULL, the report string is saved in fonts_report.
// (Ignoring report_level).
// Returns 0.0 if ComputeRates fails on the accumulated totals; in that case
// unichar_error is left untouched.
double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,
                                  const FontInfoTable& fontinfo_table,
                                  const SampleIterator& it,
                                  double* unichar_error,
                                  STRING* fonts_report) {
  // Compute totals over all the fonts and report individual font results
  // when required.
  Counts totals;
  int fontsize = font_counts_.size();
  for (int f = 0; f < fontsize; ++f) {
    // Accumulate counts over fonts.
    totals += font_counts_[f];
    STRING font_report;
    if (ReportString(false, font_counts_[f], &font_report)) {
      if (fonts_report != NULL) {
        *fonts_report += fontinfo_table.get(f).name;
        *fonts_report += ": ";
        *fonts_report += font_report;
        *fonts_report += "\n";
      }
      if (report_level > 2) {
        // Report individual font error rates.
        tprintf("%s: %s\n", fontinfo_table.get(f).name, font_report.string());
      }
    }
  }
  // Build the report string for the totals.
  STRING total_report;
  bool any_results = ReportString(true, totals, &total_report);
  if (fonts_report != NULL && fonts_report->length() == 0) {
    // Make sure we return something even if there were no samples.
    *fonts_report = "NoSamplesFound: ";
    *fonts_report += total_report;
    *fonts_report += "\n";
  }
  if (report_level > 0) {
    // Report the totals. This must print the total_report built above; a
    // second, empty STRING of the same name used to be declared here and
    // hid it, so the printed line never contained the totals.
    if (any_results) {
      tprintf("TOTAL Scaled Err=%.4g%%, %s\n",
              scaled_error_ * 100.0, total_report.string());
    }
    // Report the worst substitution error only for now.
    if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) {
      int charsetsize = unicharset_.size();
      int worst_uni_id = 0;
      int worst_result_id = 0;
      int worst_err = 0;
      // Find the (truth, result) unichar pair with the highest error count.
      for (int u = 0; u < charsetsize; ++u) {
        for (int v = 0; v < charsetsize; ++v) {
          if (unichar_counts_(u, v) > worst_err) {
            worst_err = unichar_counts_(u, v);
            worst_uni_id = u;
            worst_result_id = v;
          }
        }
      }
      if (worst_err > 0) {
        tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n",
                worst_uni_id, unicharset_.id_to_unichar(worst_uni_id),
                unicharset_.id_to_unichar(worst_result_id),
                worst_err, totals.n[CT_UNICHAR_TOP1_ERR],
                100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]);
      }
    }
    tprintf("Multi-unichar shape use:\n");
    for (int u = 0; u < multi_unichar_counts_.size(); ++u) {
      if (multi_unichar_counts_[u] > 0) {
        tprintf("%d multiple answers for unichar: %s\n",
                multi_unichar_counts_[u],
                unicharset_.id_to_unichar(u));
      }
    }
    tprintf("OK Score histogram:\n");
    ok_score_hist_.print();
    tprintf("ERROR Score histogram:\n");
    bad_score_hist_.print();
  }

  double rates[CT_SIZE];
  if (!ComputeRates(totals, rates))
    return 0.0;
  // Set output values if asked for.
  if (unichar_error != NULL)
    *unichar_error = rates[CT_UNICHAR_TOP1_ERR];
  return rates[boosting_mode];
}
Beispiel #30
0
/**
 * Segment the page according to the current value of tessedit_pageseg_mode.
 * pix_binary_ is used as the source image and should not be NULL.
 * On return the blocks list owns all the constructed page layout.
 */
int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
                           Tesseract* osd_tess, OSResults* osr) {
  ASSERT_HOST(pix_binary_ != NULL);
  int width = pixGetWidth(pix_binary_);
  int height = pixGetHeight(pix_binary_);
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_BLOCK.
    pageseg_mode = PSM_SINGLE_BLOCK;
  }
  // The diacritic_blobs holds noise blobs that may be diacritics. They
  // are separated out on areas of the image that seem noisy and short-circuit
  // the layout process, going straight from the initial partition creation
  // right through to after word segmentation, where they are added to the
  // rej_cblobs list of the most appropriate word. From there classification
  // will determine whether they are used.
  BLOBNBOX_LIST diacritic_blobs;
  int auto_page_seg_ret_val = 0;
  TO_BLOCK_LIST to_blocks;
  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
      PSM_SPARSE(pageseg_mode)) {
    auto_page_seg_ret_val = AutoPageSeg(
        pageseg_mode, blocks, &to_blocks,
        enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
    if (pageseg_mode == PSM_OSD_ONLY)
      return auto_page_seg_ret_val;
    // To create blobs from the image region bounds uncomment this line:
    //  to_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
    if (pageseg_mode == PSM_CIRCLE_WORD) {
      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
      if (pixcleaned != NULL) {
        pixDestroy(&pix_binary_);
        pix_binary_ = pixcleaned;
      }
    }
  }

  if (auto_page_seg_ret_val < 0) {
    return -1;
  }

  if (blocks->empty()) {
    if (textord_debug_tabfind)
      tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }
  bool splitting =
      pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
  bool cjk_mode = textord_use_cjk_fp_model;

  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
                       pix_thresholds_, pix_grey_, splitting || cjk_mode,
                       &diacritic_blobs, blocks, &to_blocks);
  return auto_page_seg_ret_val;
}