Beispiel #1
0
// Strips tiny outlines from every word in the row. The size limit handed to
// C_OUTLINE::RemoveSmallRecursive is textord_noise_hfract times the height of
// the word's bounding box, rounded to the nearest integer. Blobs that end up
// with no outlines are deleted, and so are words that end up with no blobs.
void Textord::clean_small_noise_from_words(ROW *row) {
  WERD_IT words(row->word_list());
  for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
    WERD* current = words.data();
    const int size_limit = static_cast<int>(
        textord_noise_hfract * current->bounding_box().height() + 0.5);

    C_BLOB_IT blobs(current->cblob_list());
    for (blobs.mark_cycle_pt(); !blobs.cycled_list(); blobs.forward()) {
      C_BLOB* cblob = blobs.data();
      C_OUTLINE_IT outlines(cblob->out_list());
      for (outlines.mark_cycle_pt(); !outlines.cycled_list();
           outlines.forward()) {
        // The iterator is passed along so the outline can be removed from
        // the list it lives in.
        outlines.data()->RemoveSmallRecursive(size_limit, &outlines);
      }
      // A blob with no outlines left is of no further use.
      if (cblob->out_list()->empty()) delete blobs.extract();
    }

    // Words that still have blobs are kept as they are.
    if (!current->cblob_list()->empty()) continue;

    if (!words.at_last()) {
      // The word in front of the next one is about to disappear, so the next
      // word can no longer be a fuzzy non-space.
      WERD* follower = words.data_relative(1);
      if (follower->flag(W_FUZZY_NON)) follower->set_flag(W_FUZZY_NON, false);
    }
    delete words.extract();
  }
}
/**
 * Plugin command: hashes a BLOB with CC_SHA1 and returns the digest as text.
 *
 * Plugin parameter 1 (C_BLOB):    the bytes to hash.
 * Plugin parameter 2 (C_LONGINT): output encoding; 1 selects toB64Text,
 *                                 every other value selects toHexText.
 *
 * @param pResult  slot that receives the text result (via setReturn).
 * @param pParams  packed plugin parameters.
 */
void SHA1(sLONG_PTR *pResult, PackagePtr pParams)
{
	C_BLOB Param1;
	C_LONGINT Param2;
	C_TEXT returnValue;
	
	Param1.fromParamAtIndex(pParams, 1);
	Param2.fromParamAtIndex(pParams, 2);
	
	// The digest size is fixed, so keep it on the stack: unlike the previous
	// unchecked calloc() this cannot hand a NULL output pointer to CC_SHA1.
	enum { kSha1DigestLength = 20 };
	uint8_t digest[kSha1DigestLength] = {0};
	
	CC_SHA1((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), digest);
	
	C_BLOB temp;
	temp.setBytes((const uint8_t *)digest, kSha1DigestLength);
	
	switch (Param2.getIntValue())
	{
		case 1:
			temp.toB64Text(&returnValue);
			break;
		default:
			temp.toHexText(&returnValue);
			break;
	}
	
	returnValue.setReturn(pResult);
}
Beispiel #3
0
// Walks the words of page_res looking for the first word whose bounding box
// overlaps selection_box and that has at least one blob overlapping it too.
// Deep copies of those blobs are collected into a new WERD, which is inserted
// through PAGE_RES_IT::InsertSimpleCloneWord. Returns a heap-allocated
// iterator positioned on the inserted word (the caller receives the raw
// pointer), or NULL when no blob overlaps the selection.
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
  PAGE_RES_IT page_it(page_res);
  C_BLOB_LIST gathered;                // copies of the overlapping blobs
  C_BLOB_IT gathered_it(&gathered);

  WERD_RES* current = page_it.word();
  while (current != NULL) {
    WERD* source_word = current->word;
    if (source_word->bounding_box().overlap(selection_box)) {
      C_BLOB_IT source_it(source_word->cblob_list());
      for (source_it.mark_cycle_pt(); !source_it.cycled_list();
           source_it.forward()) {
        C_BLOB* candidate = source_it.data();
        if (!candidate->bounding_box().overlap(selection_box)) continue;
        gathered_it.add_after_then_move(C_BLOB::deep_copy(candidate));
      }
      if (!gathered.empty()) {
        WERD* pseudo_word = new WERD(&gathered, 1, NULL);
        current = page_it.InsertSimpleCloneWord(*current, pseudo_word);
        // Hand back a fresh iterator advanced to the word just inserted.
        PAGE_RES_IT* result = new PAGE_RES_IT(page_res);
        for (; result->word() != current && result->word() != NULL;
             result->forward()) {
        }
        ASSERT_HOST(result->word() == current);
        return result;
      }
    }
    current = page_it.forward();
  }
  return NULL;
}
/**
 * Plugin command: computes HMAC-MD5 with OpenSSL's HMAC() and returns the
 * result as text.
 *
 * Plugin parameter 1 (C_BLOB):    passed to HMAC() as the key.
 * Plugin parameter 2 (C_BLOB):    passed to HMAC() as the message.
 * Plugin parameter 3 (C_LONGINT): output encoding; 1 selects toB64Text,
 *                                 every other value selects toHexText.
 *
 * If HMAC() reports failure the returned text is left untouched (empty)
 * instead of encoding a zero-filled buffer as if it were a valid MAC.
 *
 * @param pResult  slot that receives the text result (via setReturn).
 * @param pParams  packed plugin parameters.
 */
void HMACMD5(sLONG_PTR *pResult, PackagePtr pParams)
{
	C_BLOB Param1;
	C_BLOB Param2;
	C_LONGINT Param3;
	C_TEXT returnValue;
	
	Param1.fromParamAtIndex(pParams, 1);
	Param2.fromParamAtIndex(pParams, 2);
	Param3.fromParamAtIndex(pParams, 3);
	
	// MD5 output is 16 bytes; a stack buffer avoids the unchecked calloc().
	enum { kMd5DigestLength = 16 };
	uint8_t digest[kMd5DigestLength] = {0};
	uint32_t mdlen = kMd5DigestLength;
	
	// HMAC() returns NULL on error and stores the produced length in mdlen.
	if(HMAC(EVP_md5(), (const void *)Param1.getBytesPtr(), (int)Param1.getBytesLength(), (const unsigned char *)Param2.getBytesPtr(), (int)Param2.getBytesLength(), digest, &mdlen) != NULL)
	{
		C_BLOB temp;
		temp.setBytes((const uint8_t *)digest, mdlen);
		
		switch (Param3.getIntValue())
		{
			case 1:
				temp.toB64Text(&returnValue);
				break;
			default:
				temp.toHexText(&returnValue);
				break;
		}
	}
	
	returnValue.setReturn(pResult);
}
Beispiel #5
0
// Empties rej_cblobs: each rejected blob is taken off the list, the outline
// at the head of its outline list is extracted and appended to *outlines,
// and the blob itself is deleted. Afterwards the outlines vector owns the
// extracted pointers.
void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE*>* outlines) {
  C_BLOB_IT reject_it(&rej_cblobs);
  reject_it.mark_cycle_pt();
  while (!reject_it.empty()) {
    C_BLOB* rejected = reject_it.extract();
    C_OUTLINE_IT outline_it(rejected->out_list());
    C_OUTLINE* noise = outline_it.extract();
    outlines->push_back(noise);
    delete rejected;
    reject_it.forward();
  }
}
// TODO(mezhirov) delete this function and replace with word->bounding_box()
// Returns the union of the bounding boxes of all blobs in cblobs; for an
// empty list the default-constructed TBOX is returned unchanged.
static TBOX c_blob_list_get_bbox(C_BLOB_LIST *cblobs) {
  TBOX result;
  C_BLOB_IT c_it(cblobs);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    C_BLOB *blob = c_it.data();
    // NOTE(review): in Tesseract's rect.h, TBOX::bounding_union() is a const
    // member that returns the union and leaves its receiver alone, so the
    // value must be assigned back; previously it was discarded and the
    // accumulated box never grew.
    result = result.bounding_union(blob->bounding_box());
  }
  return result;
}
/**
 * Plugin command: hashes a BLOB with CC_SHA256, signs the digest with an RSA
 * private key (RSA_sign, NID_sha256) and returns the signature as text.
 *
 * Plugin parameter 1 (C_BLOB):    the bytes to sign.
 * Plugin parameter 2 (C_BLOB):    PEM data read with PEM_read_bio_RSAPrivateKey.
 * Plugin parameter 3 (C_LONGINT): output encoding; 1 selects toB64Text,
 *                                 every other value selects toHexText.
 *
 * The returned text stays empty when the key cannot be read, when memory for
 * the signature cannot be allocated, or when RSA_sign fails.
 *
 * @param pResult  slot that receives the text result (via setReturn).
 * @param pParams  packed plugin parameters.
 */
void RSASHA256(sLONG_PTR *pResult, PackagePtr pParams)
{
	C_BLOB Param1;
	C_BLOB Param2;
	C_LONGINT Param3;
	C_TEXT returnValue;
	
	Param1.fromParamAtIndex(pParams, 1);
	Param2.fromParamAtIndex(pParams, 2);
	Param3.fromParamAtIndex(pParams, 3);
	
	// SHA-256 output is 32 bytes; a stack buffer avoids the unchecked calloc().
	enum { kSha256DigestLength = 32 };
	uint8_t digest[kSha256DigestLength] = {0};
	
	CC_SHA256((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), digest);
	
	BIO *bio = BIO_new_mem_buf((void *)Param2.getBytesPtr(), Param2.getBytesLength());
	
	if(bio){
		
		RSA *key = PEM_read_bio_RSAPrivateKey(bio, NULL, NULL, NULL);
		
		if(key){
			
			unsigned int signatureLength = 0;
			// RSA_size() gives the modulus size, which is what RSA_sign needs.
			uint8_t *sgn = (uint8_t *)calloc(RSA_size(key), sizeof(uint8_t));
			
			if(sgn){
				
				if(RSA_sign(NID_sha256, digest, kSha256DigestLength, sgn, &signatureLength, key)){
					
					C_BLOB temp;
					temp.setBytes((const uint8_t *)sgn, signatureLength);
					
					switch (Param3.getIntValue())
					{
						case 1:
							temp.toB64Text(&returnValue);
							break;
						default:
							temp.toHexText(&returnValue);
							break;
					}
				}
				
				free(sgn);
			}
			
			// The key was allocated by PEM_read_bio_RSAPrivateKey and was
			// previously leaked on every call.
			RSA_free(key);
		}
		
		BIO_free(bio);
	}
	
	returnValue.setReturn(pResult);
}
Beispiel #8
0
// Adds the selected outlines to the indcated real blobs, and puts the rest
// back in rej_cblobs where they came from. Where the target_blobs entry is
// nullptr, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end.
bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted,
                               const GenericVector<C_BLOB*>& target_blobs,
                               const GenericVector<C_OUTLINE*>& outlines,
                               bool* make_next_word_fuzzy) {
  bool outline_added_to_start = false;
  if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = false;
  C_BLOB_IT rej_it(&rej_cblobs);
  for (int i = 0; i < outlines.size(); ++i) {
    C_OUTLINE* outline = outlines[i];
    if (outline == nullptr) continue;  // Already used it.
    if (wanted[i]) {
      C_BLOB* target_blob = target_blobs[i];
      TBOX noise_box = outline->bounding_box();
      if (target_blob == nullptr) {
        target_blob = new C_BLOB(outline);
        // Need to find the insertion point.
        C_BLOB_IT blob_it(&cblobs);
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          C_BLOB* blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (blob_box.left() > noise_box.left()) {
            if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
              // We might want to join this word to its predecessor.
              outline_added_to_start = true;
            }
            blob_it.add_before_stay_put(target_blob);
            break;
          }
        }
        if (blob_it.cycled_list()) {
          blob_it.add_to_end(target_blob);
          if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = true;
        }
        // Add all consecutive wanted, but null-blob outlines to same blob.
        C_OUTLINE_IT ol_it(target_blob->out_list());
        while (i + 1 < outlines.size() && wanted[i + 1] &&
               target_blobs[i + 1] == nullptr) {
          ++i;
          ol_it.add_to_end(outlines[i]);
        }
      } else {
        // Insert outline into this blob.
        C_OUTLINE_IT ol_it(target_blob->out_list());
        ol_it.add_to_end(outline);
      }
    } else {
      // Put back on noise list.
      rej_it.add_to_end(new C_BLOB(outline));
    }
  }
  return outline_added_to_start;
}
/**
 * Computes an HMAC with OpenSSL's HMAC() and stores it as text.
 *
 * @param hashlen      size of the digest buffer to allocate; HMAC() overwrites
 *                     it with the number of bytes produced.
 *                     NOTE(review): must be at least the digest size of the
 *                     algorithm returned by EVP() — nothing here checks that.
 * @param EVP          function returning the message digest to use.
 * @param Param1       passed to HMAC() as the key.
 * @param Param2       passed to HMAC() as the message.
 * @param Param3       output encoding: 1 -> toB64Text(&text),
 *                     2 -> toB64Text(&text, true), anything else -> toHexText.
 * @param returnValue  receives the encoded MAC; left untouched when the buffer
 *                     cannot be allocated or HMAC() reports failure.
 */
void CC_HMACHASH(uint32_t hashlen, const EVP_MD * (*EVP)(void),
								 C_BLOB &Param1,
								 C_BLOB &Param2,
								 C_LONGINT &Param3,
								 C_TEXT &returnValue)
{
	uint8_t *buf = (uint8_t *)calloc(hashlen, sizeof(uint8_t));
	
	// Without this check a failed allocation would make HMAC() write to NULL.
	if(!buf)
	{
		return;
	}
	
	// HMAC() returns NULL on error; do not encode a zero-filled buffer then.
	if(HMAC(EVP(), (const void *)Param1.getBytesPtr(), (int)Param1.getBytesLength(), (const unsigned char *)Param2.getBytesPtr(), (int)Param2.getBytesLength(), buf, &hashlen) != NULL)
	{
		C_BLOB temp;
		temp.setBytes((const uint8_t *)buf, hashlen);
		switch (Param3.getIntValue())
		{
			case 1:
				temp.toB64Text(&returnValue);
				break;
			case 2:
				temp.toB64Text(&returnValue, true);
				break;
			default:
				temp.toHexText(&returnValue);
				break;
		}
	}
	
	free(buf);
}
/**
 * Hashes a BLOB with the supplied digest function and stores the digest as
 * text.
 *
 * @param hashlen      number of digest bytes to allocate and encode.
 *                     NOTE(review): must match what CC writes — nothing here
 *                     checks that.
 * @param CC           digest function called as CC(data, length, output).
 * @param Param1       the bytes to hash.
 * @param Param2       output encoding: 1 -> toB64Text(&text),
 *                     2 -> toB64Text(&text, true), anything else -> toHexText.
 * @param returnValue  receives the encoded digest; left untouched when the
 *                     digest buffer cannot be allocated.
 */
void CC_HASH(unsigned int hashlen, void (*CC)(const void *data, uint32_t len, unsigned char *md),
						 C_BLOB &Param1,
						 C_LONGINT &Param2,
						 C_TEXT &returnValue)
{
	uint8_t *buf = (uint8_t *)calloc(hashlen, sizeof(uint8_t));
	
	// Without this check a failed allocation would hand CC a NULL output.
	if(!buf)
	{
		return;
	}
	
	CC((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), buf);
	
	C_BLOB temp;
	temp.setBytes((const uint8_t *)buf, hashlen);
	switch (Param2.getIntValue())
	{
		case 1:
			temp.toB64Text(&returnValue);
			break;
		case 2:
			temp.toB64Text(&returnValue, true);
			break;
		default:
			temp.toHexText(&returnValue);
			break;
	}
	
	free(buf);
}
Beispiel #11
0
// Removes noise from the word by moving small outlines to the rej_cblobs
// list, based on the size_threshold.
void WERD::CleanNoise(float size_threshold) {
  C_BLOB_IT blob_it(&cblobs);
  C_BLOB_IT rej_it(&rej_cblobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    C_BLOB* blob = blob_it.data();
    C_OUTLINE_IT ol_it(blob->out_list());
    for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
      C_OUTLINE* outline = ol_it.data();
      TBOX ol_box = outline->bounding_box();
      int ol_size =
          ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height();
      if (ol_size < size_threshold) {
        // This outline is too small. Move it to a separate blob in the
        // reject blobs list.
        C_BLOB* rej_blob = new C_BLOB(ol_it.extract());
        rej_it.add_after_then_move(rej_blob);
      }
    }
    if (blob->out_list()->empty()) delete blob_it.extract();
  }
}
Beispiel #12
0
// Moves a write cursor past the text produced by one snprintf() call.
// `written` is snprintf's return value. On an encoding error (negative) the
// cursor stays put; when the output was truncated the cursor is parked on the
// final byte of the buffer, where snprintf has already stored the terminator.
static char* advance_message_cursor(char* cursor, char* buffer_end,
                                    int written) {
  if (written < 0) return cursor;
  if (written >= buffer_end - cursor) return buffer_end - 1;
  return cursor + written;
}

// Builds a one-line description of the point (x, y) and shows it through
// image_win->AddMessage. The line contains the point itself, the baseline
// value at x for each row whose bounding box contains the point, and the
// bounding boxes of every word and blob that contain the point.
// All text is appended with snprintf, so a point covered by many words or
// blobs truncates the message instead of overrunning the buffer as the
// earlier unbounded sprintf calls could.
void show_point(PAGE_RES* page_res, float x, float y) {
  FCOORD pt(x, y);
  PAGE_RES_IT pr_it(page_res);

  const int kBufsize = 512;
  char msg[kBufsize];
  char* const msg_end = msg + kBufsize;
  char *msg_ptr = msg;

  msg_ptr = advance_message_cursor(
      msg_ptr, msg_end,
      snprintf(msg_ptr, static_cast<size_t>(msg_end - msg_ptr),
               "Pt:(%0.3f, %0.3f) ", x, y));

  for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
    // Report the baseline once per row, on the first word of that row.
    if (pr_it.row() != pr_it.prev_row() &&
        pr_it.row()->row->bounding_box().contains(pt)) {
      msg_ptr = advance_message_cursor(
          msg_ptr, msg_end,
          snprintf(msg_ptr, static_cast<size_t>(msg_end - msg_ptr),
                   "BL(x)=%0.3f ", pr_it.row()->row->base_line(x)));
    }
    if (word->word->bounding_box().contains(pt)) {
      TBOX box = word->word->bounding_box();
      msg_ptr = advance_message_cursor(
          msg_ptr, msg_end,
          snprintf(msg_ptr, static_cast<size_t>(msg_end - msg_ptr),
                   "Wd(%d, %d)/(%d, %d) ",
                   box.left(), box.bottom(),
                   box.right(), box.top()));
      C_BLOB_IT cblob_it(word->word->cblob_list());
      for (cblob_it.mark_cycle_pt();
           !cblob_it.cycled_list();
           cblob_it.forward()) {
        C_BLOB* cblob = cblob_it.data();
        box = cblob->bounding_box();
        if (box.contains(pt)) {
          msg_ptr = advance_message_cursor(
              msg_ptr, msg_end,
              snprintf(msg_ptr, static_cast<size_t>(msg_end - msg_ptr),
                       "CBlb(%d, %d)/(%d, %d) ",
                       box.left(), box.bottom(),
                       box.right(), box.top()));
        }
      }
    }
  }
  image_win->AddMessage(msg);
}
void CC_RSASHA(unsigned int hashlen, int nid, void (*CC)(const void *data, uint32_t len, unsigned char *md),
							 C_BLOB &Param1,
							 C_BLOB &Param2,
							 C_LONGINT &Param3,
							 C_TEXT &returnValue)
{
	uint8_t *buf = (uint8_t *)calloc(hashlen, sizeof(uint8_t));
	unsigned int signatureLength = 0;
	
	CC((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), buf);
	BIO *bio = BIO_new_mem_buf((void *)Param2.getBytesPtr(), Param2.getBytesLength());
	
	if(bio)
	{
		RSA *key = NULL;
		key = PEM_read_bio_RSAPrivateKey(bio, NULL, NULL, NULL);
		if(key)
		{
			uint8_t *sgn = (uint8_t *)calloc(RSA_size(key), sizeof(uint8_t));
			if(RSA_sign(nid, buf, hashlen, sgn, &signatureLength, key))
			{
				C_BLOB temp;
				temp.setBytes((const uint8_t *)sgn, signatureLength);
				switch (Param3.getIntValue())
				{
					case 1:
						temp.toB64Text(&returnValue);
						break;
					case 2:
						temp.toB64Text(&returnValue, true);
						break;
					default:
						temp.toHexText(&returnValue);
						break;
				}
			}
			free(sgn);
		}
		BIO_free(bio);
	}
	free(buf);
}
/**
 * Plugin command: hashes a BLOB with CC_RIPEMD160 and returns the digest as
 * text.
 *
 * Plugin parameter 1 (C_BLOB):    the bytes to hash.
 * Plugin parameter 2 (C_LONGINT): output encoding: 1 -> toB64Text(&text),
 *                                 2 -> toB64Text(&text, true),
 *                                 anything else -> toHexText.
 *
 * @param params  plugin call block; fResult receives the text result and
 *                fParameters holds the packed parameters.
 */
void RIPEMD160(PA_PluginParameters params)
{
	sLONG_PTR *pResult = (sLONG_PTR *)params->fResult;
	PackagePtr pParams = (PackagePtr)params->fParameters;
	
	C_BLOB Param1;
	C_LONGINT Param2;
	C_TEXT returnValue;
	
	Param1.fromParamAtIndex(pParams, 1);
	Param2.fromParamAtIndex(pParams, 2);
	
	// The digest size is fixed, so keep it on the stack: unlike the previous
	// unchecked calloc() this cannot hand a NULL output pointer to the hash.
	enum { kRipemd160DigestLength = 20 };
	uint8_t digest[kRipemd160DigestLength] = {0};
	
	CC_RIPEMD160((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), digest);
	
	C_BLOB temp;
	temp.setBytes((const uint8_t *)digest, kRipemd160DigestLength);
	
	switch (Param2.getIntValue())
	{
		case 1:
			temp.toB64Text(&returnValue);
			break;
		case 2:
			temp.toB64Text(&returnValue, true);
			break;
		default:
			temp.toHexText(&returnValue);
			break;
	}
	
	returnValue.setReturn(pResult);
}
void CC_RSASHAVERIFY(unsigned int hashlen, int nid, void (*CC)(const void *data, uint32_t len, unsigned char *md),
										 C_BLOB &Param1,
										 C_BLOB &Param2,
										 C_TEXT &Param3,
										 C_LONGINT &Param4,
										 C_LONGINT &returnValue)
{
	uint8_t *buf = (uint8_t *)calloc(hashlen, sizeof(uint8_t));
	
	CC((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), buf);
	BIO *bio = BIO_new_mem_buf((void *)Param2.getBytesPtr(), Param2.getBytesLength());
	
	if(bio)
	{
		RSA *key = NULL;
		key = PEM_read_bio_RSA_PUBKEY(bio, NULL, NULL, NULL);
		if(key)
		{
			C_BLOB temp;
			
			switch (Param4.getIntValue())
			{
				case 1:
					temp.fromB64Text(&Param3);
					break;
				default:
					temp.fromHexText(&Param3);
					break;
			}
			
			if(RSA_verify(nid, buf, hashlen, (unsigned char *)temp.getBytesPtr(), temp.getBytesLength(), key))
			{
				returnValue.setIntValue(1);
			}
			
		}
		BIO_free(bio);
	}
	free(buf);
}
Beispiel #16
0
// Decides whether a row looks like noise. Nothing is removed here; the row is
// only measured. Three counters are accumulated over all blobs of all words:
//  - dot_count: outlines (in words without W_DONT_CHOP) whose larger side is
//    below textord_noise_sizelimit * x_height, plus 2 for every blob taller
//    than twice the x-height that is not the first blob of the first word.
//  - norm_count: blobs whose larger side lies in
//    [textord_noise_sizelimit * x_height, 2 * x_height) and whose
//    count_transitions() result is below textord_noise_translimit.
//  - super_norm_count: outlines that have children and whose height and width
//    are within the textord_noise_syfract / textord_noise_sxfract tolerance of
//    the x-height, plus every blob of a W_DONT_CHOP word.
// Returns true when super_norm_count < textord_noise_sncount and
// dot_count > norm_count * textord_noise_rowratio and dot_count > 2.
// Note that blob_box and trans_count are function-scope on purpose: the debug
// output prints whatever value they last received.
bool Textord::clean_noise_from_row(          //remove empties
        ROW* row  //row to clean
) {
  bool testing_on;
  TBOX blob_box;                 //bounding box
  C_BLOB *blob;                  //current blob
  C_OUTLINE *outline;            //current outline
  WERD *word;                    //current word
  int32_t blob_size;             //biggest size
  int32_t trans_count = 0;       //no of transitions
  int32_t trans_threshold;       //noise tolerance
  int32_t dot_count;             //small objects
  int32_t norm_count;            //normal objects
  int32_t super_norm_count;      //real char-like
                                 //words of row
  WERD_IT word_it = row->word_list ();
  C_BLOB_IT blob_it;             //blob iterator
  C_OUTLINE_IT out_it;           //outline iterator

  // Per-blob tracing is enabled only when textord_show_blobs is set and the
  // test point lies between the baseline and the baseline plus x-height.
  testing_on = textord_test_y > row->base_line (textord_test_x)
               && textord_show_blobs
               && textord_test_y < row->base_line (textord_test_x) + row->x_height ();
  dot_count = 0;
  norm_count = 0;
  super_norm_count = 0;
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();      //current word
                                 //blobs in word
    blob_it.set_to_list (word->cblob_list ());
    for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
    blob_it.forward ()) {
      blob = blob_it.data ();
      if (!word->flag (W_DONT_CHOP)) {
                                 //get outlines
        out_it.set_to_list (blob->out_list ());
        for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
        out_it.forward ()) {
          outline = out_it.data ();
          blob_box = outline->bounding_box ();
          // Size of the outline = the larger side of its bounding box.
          blob_size =
            blob_box.width () >
            blob_box.height ()? blob_box.width () : blob_box.
            height();
          if (blob_size < textord_noise_sizelimit * row->x_height ())
            dot_count++;         //count small outlines
          if (!outline->child ()->empty ()
            && blob_box.height () <
            (1 + textord_noise_syfract) * row->x_height ()
            && blob_box.height () >
            (1 - textord_noise_syfract) * row->x_height ()
            && blob_box.width () <
            (1 + textord_noise_sxfract) * row->x_height ()
            && blob_box.width () >
            (1 - textord_noise_sxfract) * row->x_height ())
            super_norm_count++;  //count x-height sized outlines with children
        }
      }
      else
        super_norm_count++;      //every blob of a W_DONT_CHOP word counts
      // From here on blob_box describes the whole blob, not a single outline.
      blob_box = blob->bounding_box ();
      blob_size =
        blob_box.width () >
        blob_box.height ()? blob_box.width () : blob_box.height ();
      if (blob_size >= textord_noise_sizelimit * row->x_height ()
          && blob_size < row->x_height () * 2) {
        trans_threshold = blob_size / textord_noise_sizefraction;
        trans_count = blob->count_transitions (trans_threshold);
        if (trans_count < textord_noise_translimit)
          norm_count++;
      }
      else if (blob_box.height () > row->x_height () * 2
        && (!word_it.at_first () || !blob_it.at_first ()))
        dot_count += 2;          //over-tall blob that is not the row's first
      if (testing_on) {
        // trans_count is only refreshed for blobs in the size range above, so
        // for other blobs this prints the value left by an earlier blob.
        tprintf
          ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
          blob_box.left (), blob_box.bottom (), blob_box.right (),
          blob_box.top (), blob->out_list ()->length (), trans_count,
          blob_box.bottom () - row->base_line (blob_box.left ()));
      }
    }
  }
  if (textord_noise_debug) {
    // blob_box still holds the last blob visited (or a default TBOX when the
    // row had no blobs).
    tprintf ("Row ending at (%d,%g):",
      blob_box.right (), row->base_line (blob_box.right ()));
    // NOTE(review): the verdict printed here uses textord_noise_normratio,
    // while the value actually returned below uses textord_noise_rowratio and
    // also tests super_norm_count — the two can disagree; confirm intent.
    tprintf (" R=%g, dc=%d, nc=%d, %s\n",
      norm_count > 0 ? (float) dot_count / norm_count : 9999,
      dot_count, norm_count,
      dot_count > norm_count * textord_noise_normratio
      && dot_count > 2 ? "REJECTED" : "ACCEPTED");
  }
  return super_norm_count < textord_noise_sncount
    && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
}
void HTML_Convert_to_pdf(sLONG_PTR *pResult, PackagePtr pParams)
{
	C_TEXT ParamHtml;
	ARRAY_LONGINT ParamKeys;
	ARRAY_TEXT ParamValues;
	C_BLOB returnValue;

	ParamHtml.fromParamAtIndex(pParams, 1);
	ParamKeys.fromParamAtIndex(pParams, 2);
	ParamValues.fromParamAtIndex(pParams, 3);

	CUTF8String paramValue, path;
	
	wkhtmltopdf_global_settings *gs;
	wkhtmltopdf_object_settings *os;
	wkhtmltopdf_converter *c;
	
	gs = _wkhtmltopdf_create_global_settings();
	os = _wkhtmltopdf_create_object_settings();
	
	/*hard code these options
	 http://www.cs.au.dk/~jakobt/libwkhtmltox_0.10.0_doc/pagesettings.html#pagePdfGlobal
	 */
	
	_wkhtmltopdf_set_object_setting(os, "load.blockLocalFileAccess", "false");
	_wkhtmltopdf_set_object_setting(os, "load.stopSlowScript", "true");
	_wkhtmltopdf_set_object_setting(os, "load.debugJavascript", "false");
	_wkhtmltopdf_set_object_setting(os, "load.loadErrorHandling", "ignore");
	_wkhtmltopdf_set_object_setting(os, "includeInOutline", "true");
	
	_wkhtmltopdf_set_global_setting(gs, "outputFormat", "pdf");
	
	for(unsigned int i = 0; i < ParamKeys.getSize(); ++i){
		
		ParamValues.copyUTF8StringAtIndex(&paramValue, i);
		
		switch (ParamKeys.getIntValueAtIndex(i)){
				
			case HTML_USE_BACKGROUND:
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "web.background", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "web.background", "false");	
				}				
				break;						
			case HTML_USE_IMAGES:	
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "web.loadImages", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "web.loadImages", "false");	
				}				
				break;						
			case HTML_USE_JAVASCRIPT:
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "web.enableJavascript", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "web.enableJavascript", "false");	
				}				
				break;						
			case HTML_USE_SMART_RESIZE:	
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "web.enableIntelligentShrinking", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "web.enableIntelligentShrinking", "false");	
				}				
				break;										
			case HTML_USE_PRINT_MEDIA:
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "web.printMediaType", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "web.printMediaType", "false");	
				}				
				break;						
			case HTML_USE_PLUGINS:					
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "web.enablePlugins", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "web.enablePlugins", "false");	
				}				
				break;		
			case HTML_PDF_USE_COMPRESSION:	
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_global_setting(gs, "useCompression", "true");
				}else{
					_wkhtmltopdf_set_global_setting(gs, "useCompression", "false");	
				}				
				break;						
			case HTML_PDF_USE_OUTLINE:	
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_global_setting(gs, "outline", "true");
				}else{
					_wkhtmltopdf_set_global_setting(gs, "outline", "false");	
				}				
				break;						
			case HTML_PDF_HEADER_USE_LINE:
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "header.line", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "header.line", "false");	
				}				
				break;						
			case HTML_PDF_FOOTER_USE_LINE:
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "footer.line", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "footer.line", "false");	
				}				
				break;						
			case HTML_PDF_TOC_USE_DOTTED_LINES:	
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "toc.useDottedLines", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "toc.useDottedLines", "false");	
				}				
				break;					
			case HTML_PDF_TOC_USE_FORWARD_LINKS:
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "toc.forwardLinks", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "toc.forwardLinks", "false");	
				}				
				break;						
			case HTML_PDF_TOC_USE_BACK_LINKS:
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "toc.backLinks", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "toc.backLinks", "false");	
				}				
				break;					
			case HTML_PDF_USE_EXTERNAL_LINKS:	
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "useExternalLinks", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "useExternalLinks", "false");	
				}				
				break;						
			case HTML_PDF_USE_LOCAL_LINKS:	
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "useLocalLinks", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "useLocalLinks", "false");	
				}				
				break;						
			case HTML_PDF_USE_FORMS:
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "produceForms", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "produceForms", "false");	
				}				
				break;					
			case HTML_PDF_USE_PAGES_COUNT:		
				if(!paramValue.compare((const uint8_t *)"true")){
					_wkhtmltopdf_set_object_setting(os, "pagesCount", "true");
				}else{
					_wkhtmltopdf_set_object_setting(os, "pagesCount", "false");	
				}									
				break;
			case HTML_MINIMUM_FONT_SIZE:
				_wkhtmltopdf_set_object_setting(os, "web.minimumFontSize", (const char *)paramValue.c_str());
				break;
			case HTML_DEFAULT_ENCODING:
				_wkhtmltopdf_set_object_setting(os, "web.defaultEncoding", (const char *)paramValue.c_str());
				break;	
			case HTML_ZOOM_FACTOR:
				_wkhtmltopdf_set_object_setting(os, "load.zoomFactor",(const char *)paramValue.c_str());
				break;
			case HTML_PDF_PAPER_SIZE:
				_wkhtmltopdf_set_global_setting(gs, "size.paperSize",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_DOCUMENT_WIDTH:
				_wkhtmltopdf_set_global_setting(gs, "size.width",(const char *)paramValue.c_str());
				break;					
			case HTML_PDF_DOCUMENT_HEIGHT:
				_wkhtmltopdf_set_global_setting(gs, "size.height",(const char *)paramValue.c_str());
				break;						
			case HTML_PDF_DPI:
				_wkhtmltopdf_set_global_setting(gs, "dpi",(const char *)paramValue.c_str());
				break;
			case HTML_PDF_JPEG_QUALITY:
				_wkhtmltopdf_set_global_setting(gs, "imageQuality",(const char *)paramValue.c_str());
				break;					
			case HTML_PDF_IMAGE_DPI:
				_wkhtmltopdf_set_global_setting(gs, "imageDPI",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_ORIENTATION:
				if(   (!paramValue.compare((const uint8_t *)"Landscape")) 
				   || (!paramValue.compare((const uint8_t *)"Portrait"))){
					_wkhtmltopdf_set_global_setting(gs, "orientation", (const char *)paramValue.c_str());
				}
				break;
			case HTML_PDF_COLOR_MODE:
				if(   (!paramValue.compare((const uint8_t *)"Color")) 
				   || (!paramValue.compare((const uint8_t *)"Grayscale"))){
					_wkhtmltopdf_set_global_setting(gs, "colorMode", (const char *)paramValue.c_str());
				}
				break;	
			case HTML_PDF_OUTPUT_FORMAT:
				if(   (!paramValue.compare((const uint8_t *)"pdf")) 
				   || (!paramValue.compare((const uint8_t *)"ps"))){
					_wkhtmltopdf_set_global_setting(gs, "outputFormat", (const char *)paramValue.c_str());
				}
				break;
			case HTML_PDF_MARGIN_RIGHT:
				_wkhtmltopdf_set_global_setting(gs, "margin.right",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_MARGIN_LEFT:
				_wkhtmltopdf_set_global_setting(gs, "margin.left",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_MARGIN_BOTTOM:
				_wkhtmltopdf_set_global_setting(gs, "margin.bottom",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_MARGIN_TOP:
				_wkhtmltopdf_set_global_setting(gs, "margin.top",(const char *)paramValue.c_str());
				break;		
			case HTML_PDF_DOCUMENT_TITLE:
				_wkhtmltopdf_set_global_setting(gs, "documentTitle",(const char *)paramValue.c_str());
				break;
			case HTML_PDF_PAGE_NUMBER_OFFSET:
				_wkhtmltopdf_set_global_setting(gs, "pageOffset",(const char *)paramValue.c_str());
				break;					
			case HTML_PDF_NUMBER_OF_COPIES:
				_wkhtmltopdf_set_global_setting(gs, "copies",(const char *)paramValue.c_str());
				break;					
			case HTML_PDF_OUTLINE_DEPTH:
				_wkhtmltopdf_set_global_setting(gs, "outlineDepth",(const char *)paramValue.c_str());
				break;						
			case HTML_PDF_HEADER_FONT_SIZE:
				_wkhtmltopdf_set_object_setting(os, "header.fontSize",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_HEADER_FONT_NAME:
				_wkhtmltopdf_set_object_setting(os, "header.fontName",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_HEADER_LEFT:
				_wkhtmltopdf_set_object_setting(os, "header.left",(const char *)paramValue.c_str());
				break;		
			case HTML_PDF_HEADER_CENTER:
				_wkhtmltopdf_set_object_setting(os, "header.center",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_HEADER_RIGHT:
				_wkhtmltopdf_set_object_setting(os, "header.right",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_HEADER_SPACE:
				_wkhtmltopdf_set_object_setting(os, "header.space",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_FOOTER_FONT_SIZE:
				_wkhtmltopdf_set_object_setting(os, "footer.fontSize",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_FOOTER_FONT_NAME:
				_wkhtmltopdf_set_object_setting(os, "footer.fontName",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_FOOTER_LEFT:
				_wkhtmltopdf_set_object_setting(os, "footer.left",(const char *)paramValue.c_str());
				break;					
			case HTML_PDF_FOOTER_CENTER:
				_wkhtmltopdf_set_object_setting(os, "footer.center",(const char *)paramValue.c_str());
				break;			
			case HTML_PDF_FOOTER_RIGHT:
				_wkhtmltopdf_set_object_setting(os, "footer.right",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_FOOTER_SPACE:
				_wkhtmltopdf_set_object_setting(os, "footer.space",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_TOC_CAPTION_TEXT:
				_wkhtmltopdf_set_object_setting(os, "toc.captionText",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_TOC_INDENTATION:
				_wkhtmltopdf_set_object_setting(os, "toc.indentation",(const char *)paramValue.c_str());
				break;	
			case HTML_PDF_TOC_FONT_SCALE:
				_wkhtmltopdf_set_object_setting(os, "toc.fontScale",(const char *)paramValue.c_str());
				break;
			case HTML_WEB_USERNAME:
				_wkhtmltopdf_set_object_setting(os, "load.username",(const char *)paramValue.c_str());
				break;
			case HTML_WEB_PASSWORD:
				_wkhtmltopdf_set_object_setting(os, "load.password",(const char *)paramValue.c_str());
				break;					
			case HTML_WEB_PROXY:
				_wkhtmltopdf_set_object_setting(os, "load.proxy",(const char *)paramValue.c_str());
				break;						

			case HTML_PDF_OUTLINE_PATH:
				ParamValues.copyPathAtIndex(&path, i);
				_wkhtmltopdf_set_object_setting(os, "dumpOutline",(const char *)path.c_str());
				break;						
			case HTML_PDF_OUTLINE_XSL_PATH:
		//		ParamValues.copyUTF16StringAtIndex(&path, i);
		//		_wkhtmltopdf_set_object_setting(os, "tocXsl",(const char *)path.c_str());					
				break;			
			case HTML_CUSTOM_CSS_PATH:
				ParamValues.copyPathAtIndex(&path, i);
				_wkhtmltopdf_set_object_setting(os, "web.userStyleSheet",(const char *)path.c_str());					
				break;
			case HTML_COOKIE_JAR_PATH:
				ParamValues.copyPathAtIndex(&path, i);
				_wkhtmltopdf_set_object_setting(os, "load.cookieJar",(const char *)path.c_str());						
				break;
		}
		
	}
	
	c = _wkhtmltopdf_create_converter(gs);
	
	if(ParamHtml.getUTF16Length()){
	
		CUTF8String htmlPath, html, htmlUrl;
		ParamHtml.copyPath(&htmlPath);
		ParamHtml.copyUTF8String(&htmlUrl);
		
		if(   (htmlUrl.find((const uint8_t *)"http://") == 0) 
		   || (htmlUrl.find((const uint8_t *)"https://") == 0)
		   || (htmlUrl.find((const uint8_t *)"ftp://") == 0)
		   || (htmlUrl.find((const uint8_t *)"ftps://") == 0)){
			
			_wkhtmltopdf_set_object_setting(os, "page",(const char *)htmlUrl.c_str());	
			_wkhtmltopdf_add_object(c, os, NULL);
			
		}else{
			
			if(checkPath(&htmlPath)){
				_wkhtmltopdf_set_object_setting(os, "page",(const char *)htmlPath.c_str());	
				_wkhtmltopdf_add_object(c, os, NULL);
			}else{
				ParamHtml.copyUTF8String(&html);
				_wkhtmltopdf_add_object(c, os, (const char *)html.c_str());	
			}
			
		}
		
		if(_wkhtmltopdf_convert(c)){
			const unsigned char *bytes;
			long len = _wkhtmltopdf_get_output(c, &bytes);
			returnValue.setBytes((const uint8_t *)bytes, len);
		}
	
	}
	
	_wkhtmltopdf_destroy_converter(c);
	
	returnValue.setReturn(pResult);
}
Beispiel #18
0
// Rebuilds row->baseline so that it passes through the bottoms of blobs that
// already lie close to the current baseline, while keeping the existing
// quadratic segments everywhere else.
//
// row               - row whose baseline spline is replaced in place.
// blshift_maxshift  - a blob qualifies only if the distance between its
//                     bottom and the baseline at its centre, divided by the
//                     row x-height, is below this value.
// blshift_xfraction - a blob qualifies only if its height divided by the row
//                     x-height exceeds this value.
//
// Returns without touching the row when its words contain no blobs.
void tweak_row_baseline(ROW *row,
                        double blshift_maxshift,
                        double blshift_xfraction) {
  TBOX blob_box;                 //bounding box
  C_BLOB *blob;                  //current blob
  WERD *word;                    //current word
  inT32 blob_count;              //no of blobs
  inT32 src_index;               //source segment
  inT32 dest_index;              //destination segment
  inT32 *xstarts;                //spline segments
  double *coeffs;                //spline coeffs
  float ydiff;                   //baseline error
  float x_centre;                //centre of blob
                                 //words of row
  WERD_IT word_it = row->word_list ();
  C_BLOB_IT blob_it;             //blob iterator

  // First pass: count every blob in the row so the output arrays can be sized.
  blob_count = 0;
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();      //current word
                                 //get total blobs
    blob_count += word->cblob_list ()->length ();
  }
  if (blob_count == 0)
    return;
  // Room for one output segment per blob plus one per existing baseline
  // segment; xstarts needs one extra entry for the closing boundary.
  xstarts =
    (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
    sizeof (inT32));
  // Three coefficients (a, b, c) per output segment.
  coeffs =
    (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
    sizeof (double));

  src_index = 0;
  dest_index = 0;
  // The new spline starts where the old one does.
  xstarts[0] = row->baseline.xcoords[0];
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();      //current word
                                 //blobs in word
    blob_it.set_to_list (word->cblob_list ());
    for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
    blob_it.forward ()) {
      blob = blob_it.data ();
      blob_box = blob->bounding_box ();
      x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
      // Absolute distance of the blob bottom from the current baseline,
      // expressed as a fraction of the x-height.
      ydiff = blob_box.bottom () - row->base_line (x_centre);
      if (ydiff < 0)
        ydiff = -ydiff / row->x_height ();
      else
        ydiff = ydiff / row->x_height ();
      if (ydiff < blshift_maxshift
        && blob_box.height () / row->x_height () > blshift_xfraction) {
        // Qualifying blob: emit a flat segment at the blob's own bottom.
        // If the pending segment would start at or right of the blob centre,
        // pull its start back to the blob's left edge.
        if (xstarts[dest_index] >= x_centre)
          xstarts[dest_index] = blob_box.left ();
        // Constant quadratic: a = b = 0, c = blob bottom.
        coeffs[dest_index * 3] = 0;
        coeffs[dest_index * 3 + 1] = 0;
        coeffs[dest_index * 3 + 2] = blob_box.bottom ();
        //shift it
        dest_index++;
        // The next output segment begins just past this blob.
        xstarts[dest_index] = blob_box.right () + 1;
      }
      else {
        // Non-qualifying blob: carry over the original baseline up to the
        // blob centre, but only if the pending segment starts at or before it.
        if (xstarts[dest_index] <= x_centre) {
          // Copy every source segment that ends at or before the blob centre
          // (never advancing past the last source segment).
          while (row->baseline.xcoords[src_index + 1] <= x_centre
          && src_index < row->baseline.segments - 1) {
            // Only emit source segments that extend beyond the start of the
            // pending output segment; the others are already covered.
            if (row->baseline.xcoords[src_index + 1] >
            xstarts[dest_index]) {
              coeffs[dest_index * 3] =
                row->baseline.quadratics[src_index].a;
              coeffs[dest_index * 3 + 1] =
                row->baseline.quadratics[src_index].b;
              coeffs[dest_index * 3 + 2] =
                row->baseline.quadratics[src_index].c;
              dest_index++;
              xstarts[dest_index] =
                row->baseline.xcoords[src_index + 1];
            }
            src_index++;
          }
          // Emit the source segment that covers the blob centre; src_index is
          // not advanced, so the same source segment may be reused later.
          coeffs[dest_index * 3] =
            row->baseline.quadratics[src_index].a;
          coeffs[dest_index * 3 + 1] =
            row->baseline.quadratics[src_index].b;
          coeffs[dest_index * 3 + 2] =
            row->baseline.quadratics[src_index].c;
          dest_index++;
          xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
        }
      }
    }
  }
  // Skip source segments that end at or before the start of the pending
  // output segment.
  while (src_index < row->baseline.segments
    && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
    src_index++;
  // Append whatever remains of the original baseline.
  while (src_index < row->baseline.segments) {
    coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
    coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
    coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
    dest_index++;
    src_index++;
    xstarts[dest_index] = row->baseline.xcoords[src_index];
  }
                                 //turn to spline
  row->baseline = QSPLINE (dest_index, xstarts, coeffs);
  // The scratch arrays are released right after the spline is built.
  free_mem(xstarts);
  free_mem(coeffs);
}
// Renders HTML to an image through the wkhtmltoimage wrappers and returns the
// image bytes as a BLOB.
//
// Parameters read from pParams:
//   1 (text)          - the source: a URL starting with http://, https://,
//                       ftp:// or ftps://, a path accepted by checkPath(), or
//                       otherwise literal markup handed to the converter.
//   2 (longint array) - option keys (HTML_IMAGE_* constants).
//   3 (text array)    - option values; element i belongs to key i.
//
// The returned BLOB stays empty when the source text is empty, when no
// converter could be created, or when the conversion fails.
void HTML_Convert_to_image(sLONG_PTR *pResult, PackagePtr pParams)
{
	C_TEXT ParamHtml;
	ARRAY_LONGINT ParamKeys;
	ARRAY_TEXT ParamValues;
	C_BLOB returnValue;

	ParamHtml.fromParamAtIndex(pParams, 1);
	ParamKeys.fromParamAtIndex(pParams, 2);
	ParamValues.fromParamAtIndex(pParams, 3);

	CUTF8String paramValue;

	// NOTE(review): gs is not released on the paths where no converter is
	// created; no destroy wrapper for image global settings is visible in this
	// file — confirm whether one exists.
	wkhtmltoimage_global_settings *gs = _wkhtmltoimage_create_global_settings();

	// The converter must start out as NULL: it is only created when there is
	// something to convert, and it must never be destroyed otherwise.
	wkhtmltoimage_converter *c = NULL;

	for(unsigned int i = 0; i < ParamKeys.getSize(); ++i){

		ParamValues.copyUTF8StringAtIndex(&paramValue, i);

		switch (ParamKeys.getIntValueAtIndex(i)){

			case HTML_IMAGE_USE_SMART_WIDTH:
				// Anything other than the exact text "true" switches it off.
				_wkhtmltoimage_set_global_setting(gs, "smartWidth",
					!paramValue.compare((const uint8_t *)"true") ? "true" : "false");
				break;
			case HTML_IMAGE_USE_BACKGROUND:
				// The option is inverted: "use background = false" means a
				// transparent image.
				_wkhtmltoimage_set_global_setting(gs, "transparent",
					!paramValue.compare((const uint8_t *)"false") ? "true" : "false");
				break;
			case HTML_IMAGE_CROP_LEFT:
				_wkhtmltoimage_set_global_setting(gs, "crop.left", (const char *)paramValue.c_str());
				break;
			case HTML_IMAGE_CROP_TOP:
				_wkhtmltoimage_set_global_setting(gs, "crop.top", (const char *)paramValue.c_str());
				break;
			case HTML_IMAGE_CROP_WIDTH:
				_wkhtmltoimage_set_global_setting(gs, "crop.width", (const char *)paramValue.c_str());
				break;
			case HTML_IMAGE_CROP_HEIGHT:
				_wkhtmltoimage_set_global_setting(gs, "crop.height", (const char *)paramValue.c_str());
				break;
			case HTML_IMAGE_JPEG_QUALITY:
				_wkhtmltoimage_set_global_setting(gs, "quality", (const char *)paramValue.c_str());
				break;
			case HTML_IMAGE_SCREEN_WIDTH:
				_wkhtmltoimage_set_global_setting(gs, "screenWidth", (const char *)paramValue.c_str());
				break;
			case HTML_IMAGE_FORMAT:
				// Only the four known formats are forwarded; anything else is
				// ignored and the library default stays in effect.
				if(   (!paramValue.compare((const uint8_t *)"jpg"))
				   || (!paramValue.compare((const uint8_t *)"png"))
				   || (!paramValue.compare((const uint8_t *)"bmp"))
				   || (!paramValue.compare((const uint8_t *)"svg"))){
					_wkhtmltoimage_set_global_setting(gs, "fmt", (const char *)paramValue.c_str());
				}
				break;
		}

	}

	if(ParamHtml.getUTF16Length()){

		CUTF8String htmlPath, html;
		ParamHtml.copyPath(&htmlPath);
		ParamHtml.copyUTF8String(&html);

		// A URL is recognised purely by its scheme prefix.
		const bool isUrl =
			   (html.find((const uint8_t *)"http://") == 0)
			|| (html.find((const uint8_t *)"https://") == 0)
			|| (html.find((const uint8_t *)"ftp://") == 0)
			|| (html.find((const uint8_t *)"ftps://") == 0);

		if(isUrl){
			_wkhtmltoimage_set_global_setting(gs, "in", (const char *)html.c_str());
			c = _wkhtmltoimage_create_converter(gs, NULL);
		}else if(checkPath(&htmlPath)){
			_wkhtmltoimage_set_global_setting(gs, "in", (const char *)htmlPath.c_str());
			c = _wkhtmltoimage_create_converter(gs, NULL);
		}else{
			// Neither URL nor usable path: treat the text itself as markup.
			c = _wkhtmltoimage_create_converter(gs, (const char *)html.c_str());
		}

		if(c){
			if(_wkhtmltoimage_convert(c)){
				const unsigned char *bytes = NULL;
				long len = _wkhtmltoimage_get_output(c, &bytes);
				// The output buffer is only read while the converter is alive.
				if(bytes && len > 0){
					returnValue.setBytes((const uint8_t *)bytes, len);
				}
			}
			_wkhtmltoimage_destroy_converter(c);
		}

	}

	returnValue.setReturn(pResult);
}
Beispiel #20
0
/// Gathers every blob that substantially overlaps box, taken from words that
/// carry no text yet, into one newly created word labelled correct_text.
/// A blob that could belong to either box or next_box goes to whichever of
/// the two gives the smaller miss metric; on a tie it stays with box.
/// @return true if at least one blob was claimed for the box; false means no
/// overlapping blob could be found, i.e. the box is in error.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
  // Created lazily on the first matching blob; stays NULL if nothing matches.
  WERD* merged_word = NULL;
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward()) {
    BLOCK* cur_block = block_it.data();
    if (!box.major_overlap(cur_block->bounding_box()))
      continue;
    ROW_IT row_it(cur_block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW* cur_row = row_it.data();
      if (!box.major_overlap(cur_row->bounding_box()))
        continue;
      WERD_IT word_it(cur_row->word_list());
      for (word_it.mark_cycle_pt(); !word_it.cycled_list();
           word_it.forward()) {
        WERD* src_word = word_it.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          src_word->bounding_box().print();
        }
        // Words that already have a label are finished; words that miss the
        // box are irrelevant.
        const bool already_labelled =
            src_word->text() != NULL && src_word->text()[0] != '\0';
        if (already_labelled || !box.major_overlap(src_word->bounding_box()))
          continue;
        C_BLOB_IT src_blob_it(src_word->cblob_list());
        for (src_blob_it.mark_cycle_pt(); !src_blob_it.cycled_list();
             src_blob_it.forward()) {
          C_BLOB* candidate = src_blob_it.data();
          TBOX candidate_box = candidate->bounding_box();
          if (!candidate_box.major_overlap(box))
            continue;
          double miss_for_box = BoxMissMetric(candidate_box, box);
          double miss_for_next = BoxMissMetric(candidate_box, next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            candidate_box.print();
            tprintf("Current miss metric = %g, next = %g\n",
                    miss_for_box, miss_for_next);
          }
          // Leave the blob alone when the following box fits it better.
          if (miss_for_next < miss_for_box)
            continue;
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
            candidate_box.print();
            tprintf("Matches box:");
            box.print();
            tprintf("With next box:");
            next_box.print();
          }
          if (merged_word == NULL) {
            // First hit: clone the source word's attributes, label the clone
            // and append it to the word list being scanned.
            merged_word = src_word->shallow_copy();
            merged_word->set_text(correct_text);
            word_it.add_to_end(merged_word);
          }
          // Move the blob out of its source word into the new word.
          C_BLOB_IT dest_blob_it(merged_word->cblob_list());
          dest_blob_it.add_to_end(src_blob_it.extract());
        }
      }
    }
  }
  const bool claimed_any = merged_word != NULL;
  if (!claimed_any && applybox_debug > 0) tprintf("FAIL!\n");
  return claimed_any;
}
// Encrypts or decrypts Param1 with the given OpenSSL cipher and stores the
// result in returnValue as text.
//
//   cipher      - cipher to use.
//   Param1      - input bytes.
//   Param2      - passphrase. Key and IV are derived from it with
//                 EVP_BytesToKey (MD5, no salt, 2048 rounds) unless it is empty
//                 AND a usable raw key/IV pair is supplied in Param7/Param8.
//   Param3      - 0 selects encryption, any other value decryption.
//   Param5      - output encoding: 1 = toB64Text(), 2 = toB64Text() with its
//                 second argument true, anything else = toHexText().
//   Param6      - non-zero disables block padding.
//   Param7      - raw key, 1..EVP_MAX_KEY_LENGTH bytes; shorter keys are
//                 zero-padded.
//   Param8      - raw IV, 1..EVP_MAX_IV_LENGTH bytes; shorter IVs are
//                 zero-padded.
//   returnValue - receives the encoded output; left untouched on any failure.
void CC_AES(const EVP_CIPHER *cipher,
						C_BLOB &Param1,
						C_BLOB &Param2,
						C_LONGINT &Param3,
						C_LONGINT &Param5,
						C_LONGINT &Param6,
						C_BLOB &Param7,
						C_BLOB &Param8,
						C_TEXT &returnValue)
{
	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
	if(!ctx)
	{
		return;
	}
	
	unsigned char key[EVP_MAX_KEY_LENGTH], iv[EVP_MAX_IV_LENGTH];
	
	const unsigned char *source = (const unsigned char *)Param1.getBytesPtr();
	int source_len = Param1.getBytesLength();
	int crypted_len = 0, tail_len = 0;
	
	bool key_and_iv_is_valid = false;
	
	if(  !Param2.getBytesLength()
		 && Param7.getBytesLength()
		 && Param8.getBytesLength()
		 && Param7.getBytesLength() <= EVP_MAX_KEY_LENGTH
		 && Param8.getBytesLength() <= EVP_MAX_IV_LENGTH)
	{
		// raw key, iv: zero-fill first so short inputs are padded with zeros
		memset(key, 0, EVP_MAX_KEY_LENGTH);
		memset( iv, 0, EVP_MAX_IV_LENGTH );
		memcpy(key, Param7.getBytesPtr(), Param7.getBytesLength());
		memcpy( iv, Param8.getBytesPtr(), Param8.getBytesLength());
		key_and_iv_is_valid = true;
	}else
	{
		// passphrase -> key, iv
		key_and_iv_is_valid = (EVP_BytesToKey(cipher, EVP_md5(), NULL,
																					Param2.getBytesPtr(), Param2.getBytesLength(),
																					2048, key, iv) > 0);
	}
	
	if(key_and_iv_is_valid
		 && EVP_CipherInit(ctx, cipher, key, iv, 0 == Param3.getIntValue()))
	{
		if(Param6.getIntValue())
		{
			EVP_CIPHER_CTX_set_padding(ctx, 0);
		}
		// The output can exceed the input by up to one cipher block.
		size_t buf_size = static_cast<size_t>(source_len) + EVP_MAX_BLOCK_LENGTH;
		unsigned char *buf = (unsigned char *)calloc(buf_size, sizeof(unsigned char));
		if(buf
			 && EVP_CipherUpdate(ctx, buf, &crypted_len, source, source_len)
			 && EVP_CipherFinal(ctx, (buf + crypted_len), &tail_len))
		{
			crypted_len += tail_len;
			C_BLOB temp;
			temp.setBytes((const uint8_t *)buf, crypted_len);
			
			switch (Param5.getIntValue())
			{
				case 1:
					temp.toB64Text(&returnValue);
					break;
				case 2:
					temp.toB64Text(&returnValue, true);
					break;
				default:
					temp.toHexText(&returnValue);
					break;
			}
		}
		free(buf);
	}
	
	// Released on every path, including key derivation or init failure.
	EVP_CIPHER_CTX_free(ctx);
}
Beispiel #22
0
// Scores every word of the row for noise and moves all blobs of the words
// judged to be noise onto their reject lists.
//
// Each word collects two counters: "dots" (small outlines, plus a penalty for
// very tall blobs that are not the first blob of the row) and "normals"
// (outlines/blobs that look like ordinary characters). A word whose dots
// outweigh its normals gets verdict 2 (noise) or 1 (doubtful); doubtful words
// are only rejected when noise words outnumber the rest of the row.
// Does nothing for an empty row or when textord_no_rejects is set.
void Textord::clean_noise_from_words(ROW *row) {
  WERD_IT word_it(row->word_list());
  const inT32 total_words = word_it.length();
  if (total_words == 0 || textord_no_rejects)
    return;

  // Size thresholds, all relative to the row's x-height.
  const float x_height = row->x_height();
  const double small_limit = textord_noise_sizelimit * x_height;
  const float tall_limit = x_height * 2;
  const double max_norm_height = (1 + textord_noise_syfract) * x_height;
  const double min_norm_height = (1 - textord_noise_syfract) * x_height;
  const double max_norm_width = (1 + textord_noise_sxfract) * x_height;
  const double min_norm_width = (1 - textord_noise_sxfract) * x_height;

  // One verdict per word: 0 = keep, 1 = doubtful, 2 = noise.
  inT8 *verdicts = (inT8 *) alloc_mem(total_words * sizeof(inT8));
  inT32 noisy_words = 0;
  inT32 kept_words = 0;
  inT32 index = 0;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD *cur_word = word_it.data();
    inT32 dots = 0;
    inT32 normals = 0;

    C_BLOB_IT blob_it(cur_word->cblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      C_BLOB *cur_blob = blob_it.data();

      if (cur_word->flag(W_DONT_CHOP)) {
        // Outlines of such words are not inspected; the blob counts as normal.
        normals++;
      } else {
        C_OUTLINE_IT out_it(cur_blob->out_list());
        for (out_it.mark_cycle_pt(); !out_it.cycled_list();
             out_it.forward()) {
          C_OUTLINE *cur_outline = out_it.data();
          const TBOX out_box = cur_outline->bounding_box();
          inT32 out_extent = out_box.height();
          if (out_box.width() > out_box.height())
            out_extent = out_box.width();
          if (out_extent < small_limit)
            dots++;              // small outline
          // An outline with children whose box is close to x-height in both
          // directions counts as a normal character.
          if (!cur_outline->child()->empty()
              && out_box.height() < max_norm_height
              && out_box.height() > min_norm_height
              && out_box.width() < max_norm_width
              && out_box.width() > min_norm_width)
            normals++;
        }
      }

      const TBOX blob_box = cur_blob->bounding_box();
      inT32 blob_extent = blob_box.height();
      if (blob_box.width() > blob_box.height())
        blob_extent = blob_box.width();

      if (blob_extent >= small_limit && blob_extent < tall_limit) {
        // Mid-sized blob: normal only if it has few transitions.
        const inT32 threshold = blob_extent / textord_noise_sizefraction;
        const inT32 transitions = cur_blob->count_transitions(threshold);
        if (transitions < textord_noise_translimit)
          normals++;
      } else if (blob_box.height() > tall_limit
                 && !(word_it.at_first() && blob_it.at_first())) {
        // Very tall blob anywhere but at the very start of the row.
        dots += 2;
      }
    }

    inT8 verdict = 0;
    if (dots > 2) {
      if (dots > normals * textord_noise_normratio * 2)
        verdict = 2;
      else if (dots > normals * textord_noise_normratio)
        verdict = 1;
    }
    verdicts[index] = verdict;
    if (verdict == 2)
      noisy_words++;
    else
      kept_words++;
    index++;
  }

  // Second pass: move the blobs of rejected words to their reject lists.
  index = 0;
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    const bool reject = verdicts[index] == 2
        || (verdicts[index] == 1 && noisy_words > kept_words);
    if (reject) {
      WERD *cur_word = word_it.data();
      C_BLOB_IT rej_it(cur_word->rej_cblob_list());
      rej_it.add_list_after(cur_word->cblob_list());
    }
    index++;
  }
  free_mem(verdicts);
}
Beispiel #23
0
// Scores every word of the row for noise and, for the words judged noisy,
// asks the word to clean out its small blobs via WERD::CleanNoise.
//
// Each word collects two counters: "dots" (small outlines, plus a penalty for
// very tall blobs that are not the first blob of the row) and "normals"
// (outlines/blobs that look like ordinary characters). A word whose dots
// outweigh its normals gets verdict 2 (noise) or 1 (doubtful); words flagged
// W_REP_CHAR always get verdict 0. Doubtful words are only cleaned when noise
// words outnumber the rest of the row.
// Does nothing for an empty row or when textord_no_rejects is set.
void Textord::clean_noise_from_words(ROW *row) {
  WERD_IT word_it(row->word_list());
  const int32_t total_words = word_it.length();
  if (total_words == 0 || textord_no_rejects)
    return;

  // Size thresholds, all relative to the row's x-height.
  const float x_height = row->x_height();
  const double small_limit = textord_noise_sizelimit * x_height;
  const float tall_limit = x_height * 2;
  const double max_norm_height = (1 + textord_noise_syfract) * x_height;
  const double min_norm_height = (1 - textord_noise_syfract) * x_height;
  const double max_norm_width = (1 + textord_noise_sxfract) * x_height;
  const double min_norm_width = (1 - textord_noise_sxfract) * x_height;

  // One verdict per word: 0 = keep, 1 = doubtful, 2 = noise.
  std::vector<int8_t> verdicts(total_words);
  int32_t noisy_words = 0;
  int32_t kept_words = 0;
  int32_t index = 0;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD *cur_word = word_it.data();
    int32_t dots = 0;
    int32_t normals = 0;

    C_BLOB_IT blob_it(cur_word->cblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      C_BLOB *cur_blob = blob_it.data();

      if (cur_word->flag(W_DONT_CHOP)) {
        // Outlines of such words are not inspected; the blob counts as normal.
        normals++;
      } else {
        C_OUTLINE_IT out_it(cur_blob->out_list());
        for (out_it.mark_cycle_pt(); !out_it.cycled_list();
             out_it.forward()) {
          C_OUTLINE *cur_outline = out_it.data();
          const TBOX out_box = cur_outline->bounding_box();
          int32_t out_extent = out_box.height();
          if (out_box.width() > out_box.height())
            out_extent = out_box.width();
          if (out_extent < small_limit)
            dots++;              // small outline
          // An outline with children whose box is close to x-height in both
          // directions counts as a normal character.
          if (!cur_outline->child()->empty()
              && out_box.height() < max_norm_height
              && out_box.height() > min_norm_height
              && out_box.width() < max_norm_width
              && out_box.width() > min_norm_width)
            normals++;
        }
      }

      const TBOX blob_box = cur_blob->bounding_box();
      int32_t blob_extent = blob_box.height();
      if (blob_box.width() > blob_box.height())
        blob_extent = blob_box.width();

      if (blob_extent >= small_limit && blob_extent < tall_limit) {
        // Mid-sized blob: normal only if it has few transitions.
        const int32_t threshold = blob_extent / textord_noise_sizefraction;
        const int32_t transitions = cur_blob->count_transitions(threshold);
        if (transitions < textord_noise_translimit)
          normals++;
      } else if (blob_box.height() > tall_limit
                 && !(word_it.at_first() && blob_it.at_first())) {
        // Very tall blob anywhere but at the very start of the row.
        dots += 2;
      }
    }

    int8_t verdict = 0;
    if (dots > 2 && !cur_word->flag(W_REP_CHAR)) {
      if (dots > normals * textord_noise_normratio * 2)
        verdict = 2;
      else if (dots > normals * textord_noise_normratio)
        verdict = 1;
    }
    verdicts[index] = verdict;
    if (verdict == 2)
      noisy_words++;
    else
      kept_words++;
    index++;
  }

  // Second pass: rather than discarding a noisy word outright, hand its small
  // blobs to CleanNoise so the classifier can still decide whether they are
  // needed.
  index = 0;
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    const bool noisy = verdicts[index] == 2
        || (verdicts[index] == 1 && noisy_words > kept_words);
    if (noisy) {
      word_it.data()->CleanNoise(small_limit);
    }
    index++;
  }
}