/* array = '[' [ value { ',' value } ] ']' */ static int parse_array(struct frozen *f) { int i = 0, current_path_len; char buf[20]; CALL_BACK(f, JSON_TYPE_ARRAY_START, NULL, 0); TRY(test_and_skip(f, '[')); { { SET_STATE(f, f->cur - 1, "", 0); while (cur(f) != ']') { snprintf(buf, sizeof(buf), "[%d]", i); i++; current_path_len = append_to_path(f, buf, strlen(buf)); f->cur_name = f->path + strlen(f->path) - strlen(buf) + 1 /*opening brace*/; f->cur_name_len = strlen(buf) - 2 /*braces*/; TRY(parse_value(f)); truncate_path(f, current_path_len); if (cur(f) == ',') f->cur++; } TRY(test_and_skip(f, ']')); truncate_path(f, fstate.path_len); CALL_BACK(f, JSON_TYPE_ARRAY_END, fstate.ptr, f->cur - fstate.ptr); } } return 0; }
/* number = [ '-' ] digit+ [ '.' digit+ ] [ ['e'|'E'] ['+'|'-'] digit+ ] */ static int parse_number(struct frozen *f) { int ch = cur(f); SET_STATE(f, f->cur, "", 0); if (ch == '-') f->cur++; EXPECT(f->cur < f->end, JSON_STRING_INCOMPLETE); EXPECT(is_digit(f->cur[0]), JSON_STRING_INVALID); while (f->cur < f->end && is_digit(f->cur[0])) f->cur++; if (f->cur < f->end && f->cur[0] == '.') { f->cur++; EXPECT(f->cur < f->end, JSON_STRING_INCOMPLETE); EXPECT(is_digit(f->cur[0]), JSON_STRING_INVALID); while (f->cur < f->end && is_digit(f->cur[0])) f->cur++; } if (f->cur < f->end && (f->cur[0] == 'e' || f->cur[0] == 'E')) { f->cur++; EXPECT(f->cur < f->end, JSON_STRING_INCOMPLETE); if ((f->cur[0] == '+' || f->cur[0] == '-')) f->cur++; EXPECT(f->cur < f->end, JSON_STRING_INCOMPLETE); EXPECT(is_digit(f->cur[0]), JSON_STRING_INVALID); while (f->cur < f->end && is_digit(f->cur[0])) f->cur++; } truncate_path(f, fstate.path_len); CALL_BACK(f, JSON_TYPE_NUMBER, fstate.ptr, f->cur - fstate.ptr); return 0; }
/* identifier = letter { letter | digit | '_' } */ static int parse_identifier(struct frozen *f) { EXPECT(is_alpha(cur(f)), JSON_STRING_INVALID); { SET_STATE(f, f->cur, "", 0); while (f->cur < f->end && (*f->cur == '_' || is_alpha(*f->cur) || is_digit(*f->cur))) { f->cur++; } truncate_path(f, fstate.path_len); CALL_BACK(f, JSON_TYPE_STRING, fstate.ptr, f->cur - fstate.ptr); } return 0; }
/*
 * pair = key ':' value
 *
 * Parses one "key: value" member. The key text is appended to the current
 * path (without the surrounding quotes when the key starts with '"'), the
 * value is parsed, and the path is then cut back to the length returned by
 * append_to_path(). Returns 0 on success.
 */
static int parse_pair(struct frozen *f) {
  int current_path_len;
  int quoted;
  const char *tok;

  skip_whitespaces(f);
  tok = f->cur; /* first character of the key */
  TRY(parse_key(f));

  /* Only inspect *tok once the key has been parsed successfully. */
  quoted = (*tok == '"');
  if (quoted) {
    /* Skip the opening quote and drop both quotes from the length. */
    current_path_len = append_to_path(f, tok + 1, f->cur - tok - 2);
  } else {
    current_path_len = append_to_path(f, tok, f->cur - tok);
  }

  TRY(test_and_skip(f, ':'));
  TRY(parse_value(f));
  truncate_path(f, current_path_len);
  return 0;
}
/* object = '{' pair { ',' pair } '}' */ static int parse_object(struct frozen *f) { CALL_BACK(f, JSON_TYPE_OBJECT_START, NULL, 0); TRY(test_and_skip(f, '{')); { SET_STATE(f, f->cur - 1, ".", 1); while (cur(f) != '}') { TRY(parse_pair(f)); if (cur(f) == ',') f->cur++; } TRY(test_and_skip(f, '}')); truncate_path(f, fstate.path_len); CALL_BACK(f, JSON_TYPE_OBJECT_END, fstate.ptr, f->cur - fstate.ptr); } return 0; }
/*
 * Matches the `len`-byte literal `s` at the current position.
 *
 * Returns JSON_STRING_INCOMPLETE when the remaining input (left(f)) ends
 * before the literal does, JSON_STRING_INVALID on the first differing byte,
 * and otherwise advances `f->cur` by `len`, reports a token of type
 * `tok_type` through CALL_BACK and returns 0.
 */
static int expect(struct frozen *f, const char *s, int len,
                  enum json_token_type tok_type) {
  int pos = 0;
  const int avail = left(f);
  SET_STATE(f, f->cur, "", 0);

  while (pos < len) {
    /* Running out of input is checked before comparing the byte. */
    if (pos >= avail) {
      return JSON_STRING_INCOMPLETE;
    }
    if (s[pos] != f->cur[pos]) {
      return JSON_STRING_INVALID;
    }
    pos++;
  }

  f->cur += len;
  truncate_path(f, fstate.path_len);
  CALL_BACK(f, tok_type, fstate.ptr, f->cur - fstate.ptr);
  return 0;
}
/*
 * array = '[' [ value { ',' value } ] ']'
 *
 * Parses one array. Each element is parsed with the path temporarily
 * extended by "[<index>]"; the path is cut back after every element.
 * CALL_BACK(f) is invoked once after the closing ']'. Returns 0 on success.
 */
static int parse_array(struct frozen *f) {
  int i, current_path_len;
  char buf[20];

  TRY(test_and_skip(f, '['));
  {
    SET_STATE(f, f->cur - 1, JSON_TYPE_ARRAY, "", 0);

    for (i = 0; cur(f) != ']'; i++) {
      /* Path component for this element, e.g. "[0]". */
      snprintf(buf, sizeof(buf), "[%d]", i);
      current_path_len = append_to_path(f, buf, strlen(buf));

      TRY(parse_value(f));
      truncate_path(f, current_path_len);

      /* A separator is consumed when present; it is not required here. */
      if (cur(f) == ',') {
        f->cur++;
      }
    }

    TRY(test_and_skip(f, ']'));
    CALL_BACK(f);
  }
  return 0;
}
/* string = '"' { quoted_printable_chars } '"' */ static int parse_string(struct frozen *f) { int n, ch = 0, len = 0; TRY(test_and_skip(f, '"')); { SET_STATE(f, f->cur, "", 0); for (; f->cur < f->end; f->cur += len) { ch = *(unsigned char *) f->cur; len = get_utf8_char_len((unsigned char) ch); EXPECT(ch >= 32 && len > 0, JSON_STRING_INVALID); /* No control chars */ EXPECT(len <= left(f), JSON_STRING_INCOMPLETE); if (ch == '\\') { EXPECT((n = get_escape_len(f->cur + 1, left(f))) > 0, n); len += n; } else if (ch == '"') { truncate_path(f, fstate.path_len); CALL_BACK(f, JSON_TYPE_STRING, fstate.ptr, f->cur - fstate.ptr); f->cur++; break; }; } } return ch == '"' ? 0 : JSON_STRING_INCOMPLETE; }
/*
 * Appends a display form of the path `file` to `out`, keeping it within
 * FILELIST_ENTRY_LENGTH characters where possible.
 *
 * - If the directory part equals the CSIDL_PERSONAL folder (compared case-
 *   insensitively), or the base name alone exceeds the limit, only the base
 *   name is appended.
 * - Otherwise the directory part is scanned for separators to choose two cut
 *   points; if both are found and differ, truncate_path() is called with
 *   them, else the whole path is appended.
 *
 * `file` is modified temporarily: the character before the base name is
 * overwritten with 0 for the comparison and then set to '\\'.
 * NOTE(review): this assumes file_basename() never returns `file` itself -
 * confirm, since pos_basename - 1 is written unconditionally.
 */
static void format_filelist_filename(LPWSTR file, LPWSTR out)
{
    LPWSTR pos_basename;
    LPWSTR truncpos1 = NULL;
    LPWSTR truncpos2 = NULL;
    WCHAR myDocs[MAX_PATH];
    BOOL in_my_docs;
    int basename_len;

    SHGetFolderPathW(NULL, CSIDL_PERSONAL, NULL, SHGFP_TYPE_CURRENT, myDocs);
    pos_basename = file_basename(file);
    basename_len = lstrlenW(pos_basename);

    /* Compare only the directory part against the documents folder. */
    *(pos_basename-1) = 0;
    in_my_docs = !lstrcmpiW(file, myDocs);
    *(pos_basename-1) = '\\';

    if(in_my_docs || basename_len > FILELIST_ENTRY_LENGTH)
    {
        truncpos1 = pos_basename;
    }
    else
    {
        LPWSTR pos;
        BOOL morespace = FALSE;

        /* Forward scan: remember up to two leading separators that still
         * leave room for the base name. */
        for(pos = file; pos < pos_basename; pos++)
        {
            if(*pos != '\\' && *pos != '/')
                continue;

            if((pos - file + basename_len) > FILELIST_ENTRY_LENGTH)
                break;

            if(truncpos1)
            {
                /* Second fitting separator: it becomes the first cut point. */
                truncpos1 = pos;
                morespace = TRUE;
                break;
            }

            truncpos1 = pos;
        }

        if(morespace)
        {
            /* Backward scan from the base name for the earliest separator
             * that still fits together with the kept prefix. */
            for(pos = pos_basename; pos >= truncpos1; pos--)
            {
                if(*pos != '\\' && *pos != '/')
                    continue;

                if((truncpos1 - file + basename_len + pos_basename - pos) > FILELIST_ENTRY_LENGTH)
                    break;

                truncpos2 = pos;
            }
        }
    }

    if(truncpos1 == pos_basename)
    {
        lstrcatW(out, pos_basename);
    }
    else if(truncpos1 == truncpos2 || !truncpos2)
    {
        lstrcatW(out, file);
    }
    else
    {
        truncate_path(file, out, truncpos1, truncpos2);
    }
}
int main(int argc, char **argv) { #ifdef USING_GETTEXT setlocale (LC_ALL, ""); bindtextdomain (PACKAGE, LOCALEDIR); textdomain (PACKAGE); #endif if ((argc == 2 && strcmp(argv[1], "-v") == 0) || (argc == 2 && strcmp(argv[1], "--version") == 0)) { char *versionStrP; fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version()); versionStrP = getLeptonicaVersion(); fprintf(stderr, " %s\n", versionStrP); lept_free(versionStrP); versionStrP = getImagelibVersions(); fprintf(stderr, " %s\n", versionStrP); lept_free(versionStrP); exit(0); } tesseract::TessBaseAPI api; int rc = api.Init(argv[0], NULL); if (rc) { fprintf(stderr, "Could not initialize tesseract.\n"); exit(1); } if (argc == 2 && strcmp(argv[1], "--list-langs") == 0) { GenericVector<STRING> languages; api.GetAvailableLanguagesAsVector(&languages); fprintf(stderr, "List of available languages (%d):\n", languages.size()); for (int index = 0; index < languages.size(); ++index) { STRING& string = languages[index]; fprintf(stderr, "%s\n", string.string()); } api.Clear(); exit(0); } api.End(); // Make the order of args a bit more forgiving than it used to be. 
const char* lang = "eng"; const char* image = NULL; const char* output = NULL; tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO; int arg = 1; while (arg < argc && (output == NULL || argv[arg][0] == '-')) { if (strcmp(argv[arg], "-l") == 0 && arg + 1 < argc) { lang = argv[arg + 1]; ++arg; } else if (strcmp(argv[arg], "-psm") == 0 && arg + 1 < argc) { pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[arg + 1])); ++arg; } else if (image == NULL) { image = argv[arg]; } else if (output == NULL) { output = argv[arg]; } ++arg; } if (output == NULL) { fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] " "[-psm pagesegmode] [configfile...]\n\n"), argv[0]); fprintf(stderr, _("pagesegmode values are:\n" "0 = Orientation and script detection (OSD) only.\n" "1 = Automatic page segmentation with OSD.\n" "2 = Automatic page segmentation, but no OSD, or OCR\n" "3 = Fully automatic page segmentation, but no OSD. (Default)\n" "4 = Assume a single column of text of variable sizes.\n" "5 = Assume a single uniform block of vertically aligned text.\n" "6 = Assume a single uniform block of text.\n" "7 = Treat the image as a single text line.\n" "8 = Treat the image as a single word.\n" "9 = Treat the image as a single word in a circle.\n" "10 = Treat the image as a single character.\n")); fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any" "configfile.\n\n")); fprintf(stderr, _("Single options:\n")); fprintf(stderr, _(" -v --version: version info\n")); fprintf(stderr, _(" --list-langs: list available languages for tesseract engine\n")); exit(1); } api.SetOutputName(output); STRING tessdata_dir; truncate_path(argv[0], &tessdata_dir); rc = api.Init(tessdata_dir.string(), lang, tesseract::OEM_DEFAULT, &(argv[arg]), argc - arg, NULL, NULL, false); if (rc) { fprintf(stderr, "Could not initialize tesseract.\n"); exit(1); } // We have 2 possible sources of pagesegmode: a config file and // the command line. 
For backwards compatability reasons, the // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the // default for this program is tesseract::PSM_AUTO. We will let // the config file take priority, so the command-line default // can take priority over the tesseract default, so we use the // value from the command line only if the retrieved mode // is still tesseract::PSM_SINGLE_BLOCK, indicating no change // in any config file. Therefore the only way to force // tesseract::PSM_SINGLE_BLOCK is from the command line. // It would be simpler if we could set the value before Init, // but that doesn't work. if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) api.SetPageSegMode(pagesegmode); tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n", tesseract::TessBaseAPI::Version()); FILE* fin = fopen(image, "rb"); if (fin == NULL) { printf("Cannot open input file: %s\n", image); exit(2); } fclose(fin); PIX *pixs; if ((pixs = pixRead(image)) == NULL) { printf("Unsupported image type.\n"); exit(3); } pixDestroy(&pixs); STRING text_out; if (!api.ProcessPages(image, NULL, 0, &text_out)) { printf("Error during processing.\n"); } bool output_hocr = false; api.GetBoolVariable("tessedit_create_hocr", &output_hocr); bool output_box = false; api.GetBoolVariable("tessedit_create_boxfile", &output_box); STRING outfile = output; outfile += output_hocr ? ".html" : output_box ? ".box" : ".txt"; FILE* fout = fopen(outfile.string(), "wb"); if (fout == NULL) { printf("Cannot create output file %s\n", outfile.string()); exit(1); } fwrite(text_out.string(), 1, text_out.length(), fout); fclose(fout); return 0; // Normal exit }
int main(int argc, char **argv) { if ((argc == 2 && strcmp(argv[1], "-v") == 0) || (argc == 2 && strcmp(argv[1], "--version") == 0)) { char *versionStrP; fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version()); versionStrP = getLeptonicaVersion(); fprintf(stderr, " %s\n", versionStrP); lept_free(versionStrP); versionStrP = getImagelibVersions(); fprintf(stderr, " %s\n", versionStrP); lept_free(versionStrP); exit(0); } // Make the order of args a bit more forgiving than it used to be. const char* lang = "eng"; const char* image = NULL; const char* output = NULL; bool noocr = false; bool list_langs = false; bool print_parameters = false; tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO; int arg = 1; while (arg < argc && (output == NULL || argv[arg][0] == '-')) { if (strcmp(argv[arg], "-l") == 0 && arg + 1 < argc) { lang = argv[arg + 1]; ++arg; } else if (strcmp(argv[arg], "-psm") == 0 && arg + 1 < argc) { pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[arg + 1])); ++arg; } else if (strcmp(argv[arg], "--print-parameters") == 0) { noocr = true; print_parameters = true; } else if (strcmp(argv[arg], "-c") == 0 && arg + 1 < argc) { // handled properly after api init ++arg; } else if (image == NULL) { image = argv[arg]; } else if (output == NULL) { output = argv[arg]; } ++arg; } if (argc == 2 && strcmp(argv[1], "--list-langs") == 0) { list_langs = true; noocr = true; } if (output == NULL && noocr == false) { fprintf(stderr, "Usage:%s imagename outputbase|stdout [-l lang] " "[-psm pagesegmode] [-c configvar=value] " "[configfile...]\n\n", argv[0]); fprintf(stderr, "pagesegmode values are:\n" "0 = Orientation and script detection (OSD) only.\n" "1 = Automatic page segmentation with OSD.\n" "2 = Automatic page segmentation, but no OSD, or OCR\n" "3 = Fully automatic page segmentation, but no OSD. 
(Default)\n" "4 = Assume a single column of text of variable sizes.\n" "5 = Assume a single uniform block of vertically aligned text.\n" "6 = Assume a single uniform block of text.\n" "7 = Treat the image as a single text line.\n" "8 = Treat the image as a single word.\n" "9 = Treat the image as a single word in a circle.\n" "10 = Treat the image as a single character.\n"); fprintf(stderr, "multiple -c arguments are allowed.\n"); fprintf(stderr, "-l lang, -psm pagesegmode and any -c options must occur" "before any configfile.\n\n"); fprintf(stderr, "Single options:\n"); fprintf(stderr, " -v --version: version info\n"); fprintf(stderr, " --list-langs: list available languages for tesseract " "engine\n"); fprintf(stderr, " --print-parameters: print tesseract parameters to the " "stdout\n"); exit(1); } tesseract::TessBaseAPI api; STRING tessdata_dir; truncate_path(argv[0], &tessdata_dir); api.SetOutputName(output); int rc = api.Init(tessdata_dir.string(), lang, tesseract::OEM_DEFAULT, &(argv[arg]), argc - arg, NULL, NULL, false); if (rc) { fprintf(stderr, "Could not initialize tesseract.\n"); exit(1); } char opt1[255], opt2[255]; for (arg = 0; arg < argc; arg++) { if (strcmp(argv[arg], "-c") == 0 && arg + 1 < argc) { strncpy(opt1, argv[arg + 1], 255); *(strchr(opt1, '=')) = 0; strncpy(opt2, strchr(argv[arg + 1], '=') + 1, 255); opt2[254] = 0; ++arg; if(!api.SetVariable(opt1, opt2)) { fprintf(stderr, "Could not set option: %s=%s\n", opt1, opt2); } } } if (list_langs) { GenericVector<STRING> languages; api.GetAvailableLanguagesAsVector(&languages); fprintf(stderr, "List of available languages (%d):\n", languages.size()); for (int index = 0; index < languages.size(); ++index) { STRING& string = languages[index]; fprintf(stderr, "%s\n", string.string()); } api.End(); exit(0); } if (print_parameters) { FILE* fout = stdout; fprintf(stdout, "Tesseract parameters:\n"); api.PrintVariables(fout); api.End(); exit(0); } // We have 2 possible sources of pagesegmode: a config file 
and // the command line. For backwards compatability reasons, the // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the // default for this program is tesseract::PSM_AUTO. We will let // the config file take priority, so the command-line default // can take priority over the tesseract default, so we use the // value from the command line only if the retrieved mode // is still tesseract::PSM_SINGLE_BLOCK, indicating no change // in any config file. Therefore the only way to force // tesseract::PSM_SINGLE_BLOCK is from the command line. // It would be simpler if we could set the value before Init, // but that doesn't work. if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) api.SetPageSegMode(pagesegmode); tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n", tesseract::TessBaseAPI::Version()); FILE* fin = fopen(image, "rb"); if (fin == NULL) { fprintf(stderr, "Cannot open input file: %s\n", image); exit(2); } fclose(fin); bool output_hocr = false; api.GetBoolVariable("tessedit_create_hocr", &output_hocr); bool output_box = false; api.GetBoolVariable("tessedit_create_boxfile", &output_box); FILE* fout = stdout; if (strcmp(output, "-") && strcmp(output, "stdout")) { STRING outfile = output; outfile += output_hocr ? ".html" : output_box ? ".box" : ".txt"; fout = fopen(outfile.string(), "wb"); if (fout == NULL) { fprintf(stderr, "Cannot create output file %s\n", outfile.string()); exit(1); } } STRING text_out; if (!api.ProcessPages(image, NULL, 0, &text_out)) { fprintf(stderr, "Error during processing.\n"); if (fout != stdout) fclose(fout); exit(1); } fwrite(text_out.string(), 1, text_out.length(), fout); if (fout != stdout) fclose(fout); else clearerr(fout); return 0; // Normal exit }