void testtess() { cv::Mat img = imread("phototest.png"); if(img.empty()) { cerr << "noup." << endl; return; } imshow("blah",img); waitKey(1); cout << "photo " << img.size() << endl; int rc = api.Init(tessdata_dir.c_str(), "ara", tesseract::OEM_DEFAULT); if (rc) { cerr << "Could not initialize tesseract.\n"; exit(1); } api.SetPageSegMode(tesseract::PSM_AUTO); printf("Tesseract Open Source OCR Engine v%s with Leptonica\n", tesseract::TessBaseAPI::Version()); printf("Init languages %s\n",api.GetInitLanguagesAsString()); Mat tmp; cvtColor(img, tmp, CV_BGR2GRAY); adaptiveThreshold(tmp, tmp, 255, ADAPTIVE_THRESH_MEAN_C, THRESH_BINARY, 51, 35); imshow("eqhist", tmp); // cvtColor(tmp, out(Rect(0,0,r.width,r.height)), CV_GRAY2BGR); char* cstr = api.TesseractRect(tmp.data, tmp.channels(), tmp.cols*tmp.channels(), 0, 0, tmp.cols, tmp.rows); cout << cstr << endl; delete[] cstr; waitKey(0); }
//Default constructor OCRTesseractImpl(const char* datapath, const char* language, const char* char_whitelist, int oemode, int psmode) { #ifdef HAVE_TESSERACT const char *lang = "eng"; if (language != NULL) lang = language; if (tess.Init(datapath, lang, (tesseract::OcrEngineMode)oemode)) { cout << "OCRTesseract: Could not initialize tesseract." << endl; throw 1; } //cout << "OCRTesseract: tesseract version " << tess.Version() << endl; tesseract::PageSegMode pagesegmode = (tesseract::PageSegMode)psmode; tess.SetPageSegMode(pagesegmode); if(char_whitelist != NULL) tess.SetVariable("tessedit_char_whitelist", char_whitelist); else tess.SetVariable("tessedit_char_whitelist", "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"); tess.SetVariable("save_best_choices", "T"); #else cout << "OCRTesseract("<<oemode<<psmode<<"): Tesseract not found." << endl; if (datapath != NULL) cout << " " << datapath << endl; if (language != NULL) cout << " " << language << endl; if (char_whitelist != NULL) cout << " " << char_whitelist << endl; #endif }
/**
 * Runs Tesseract over an image and returns the recognised text as UTF-8.
 *
 * @param input    Image to recognise. It is handed to Tesseract with
 *                 1 byte per pixel, so it is presumably expected to be an
 *                 8-bit single-channel Mat -- TODO confirm against callers.
 * @param language Tesseract language code passed to Init().
 * @return The recognised text, or an empty string when the image is empty,
 *         the engine cannot be initialised, or Tesseract yields no result.
 */
std::string identifyText(cv::Mat input, std::string language)
{
    if (input.empty())
        return std::string();

    // Init() returns non-zero on failure; recognising on an uninitialised
    // engine is not meaningful, so bail out early.
    if (ocr.Init(NULL, language.c_str(), tesseract::OEM_TESSERACT_ONLY) != 0)
        return std::string();

    //std::string whitelist = "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,.:";
    //ocr.SetVariable("tessedit_char_whitelist", whitelist.c_str());

    ocr.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
    ocr.SetImage(input.data, input.cols, input.rows, 1, input.step);

    // GetUTF8Text() hands back a new[]-allocated buffer that the caller owns;
    // it may be null when recognition fails.
    char* raw = ocr.GetUTF8Text();
    if (raw == NULL)
        return std::string();

    std::string text(raw);
    delete[] raw;
    return text;
}
void TesseractBridge::init() { // int rc = api.Init(tessdata_dir.c_str(), NULL); // if (rc) { // cerr << "Could not initialize tesseract.\n"; // exit(1); // } // api.End(); int rc = api.Init(tessdata_dir.c_str(), "ara", tesseract::OEM_DEFAULT); if (rc) { cerr << "Could not initialize tesseract.\n"; exit(1); } api.SetPageSegMode(tesseract::PSM_AUTO); printf("Tesseract Open Source OCR Engine v%s with Leptonica\n", tesseract::TessBaseAPI::Version()); printf("Init languages %s\n",api.GetInitLanguagesAsString()); // GenericVector<STRING> v; api.GetAvailableLanguagesAsVector(&v); // for (int i = 0; i < v.length(); ++i) { // printf("lang %s\n",v[i].string()); // } }
/* ** ocr_type=0: OEM_DEFAULT ** ocr_type=1: OEM_TESSERACT_ONLY ** ocr_type=2: OEM_CUBE_ONLY ** ocr_type=3: OEM_TESSERACT_CUBE_COMBINED */ int tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out, char *initstr,int maxlen) { int status; #ifdef USE_NLS setlocale (LC_ALL, ""); bindtextdomain (PACKAGE, LOCALEDIR); textdomain (PACKAGE); #endif // fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version()); // Make the order of args a bit more forgiving than it used to be. const char* lang = "eng"; tesseract::PageSegMode pagesegmode = tesseract::PSM_SINGLE_BLOCK; if (language!=NULL && language[0]!='\0') lang = language; /* if (output == NULL) { fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] " "[-psm pagesegmode] [configfile...]\n"), argv[0]); fprintf(stderr, _("pagesegmode values are:\n" "0 = Orientation and script detection (OSD) only.\n" "1 = Automatic page segmentation with OSD.\n" "2 = Automatic page segmentation, but no OSD, or OCR\n" "3 = Fully automatic page segmentation, but no OSD. (Default)\n" "4 = Assume a single column of text of variable sizes.\n" "5 = Assume a single uniform block of vertically aligned text.\n" "6 = Assume a single uniform block of text.\n" "7 = Treat the image as a single text line.\n" "8 = Treat the image as a single word.\n" "9 = Treat the image as a single word in a circle.\n" "10 = Treat the image as a single character.\n")); fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any" "configfile.\n")); exit(1); } */ api.SetOutputName(NULL); status=api.Init(datapath,lang, ocr_type==0 ? tesseract::OEM_DEFAULT : (ocr_type==1 ? tesseract::OEM_TESSERACT_ONLY : (ocr_type==2 ? tesseract::OEM_CUBE_ONLY : (tesseract::OEM_TESSERACT_CUBE_COMBINED)))); if (status) return(status); /* api.Init("tesscapi",lang,tesseract::OEM_DEFAULT, &(argv[arg]), argc - arg, NULL, NULL, false); */ // We have 2 possible sources of pagesegmode: a config file and // the command line. 
For backwards compatability reasons, the // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the // default for this program is tesseract::PSM_AUTO. We will let // the config file take priority, so the command-line default // can take priority over the tesseract default, so we use the // value from the command line only if the retrieved mode // is still tesseract::PSM_SINGLE_BLOCK, indicating no change // in any config file. Therefore the only way to force // tesseract::PSM_SINGLE_BLOCK is from the command line. // It would be simpler if we could set the value before Init, // but that doesn't work. if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) api.SetPageSegMode(pagesegmode); /* ** Initialization message */ { char istr[256]; sprintf(istr,"Tesseract Open Source OCR Engine v%s ",tesseract::TessBaseAPI::Version()); if (ocr_type==0 || ocr_type==3) sprintf(&istr[strlen(istr)],"[CUBE+] (lang="); else if (ocr_type==2) sprintf(&istr[strlen(istr)],"[CUBE] (lang="); strncpy(&istr[strlen(istr)],language,253-strlen(istr)); istr[253]='\0'; strcat(istr,")"); if (out!=NULL) fprintf(out,"%s\n",istr); if (initstr!=NULL) { strncpy(initstr,istr,maxlen-1); initstr[maxlen-1]='\0'; } } /* Turn off CUBE debugging output */ api.SetVariable("cube_debug_level","0"); #if (WILLUSDEBUG & 1) api.SetVariable("cube_debug_level","9"); api.SetVariable("paragraph_debug_level","9"); api.SetVariable("tessdata_manager_debug_level","9"); api.SetVariable("tosp_debug_level","9"); api.SetVariable("wordrec_debug_level","9"); api.SetVariable("segsearch_debug_level","9"); #endif return(0); }