void *process_doc( void *in ) { struct process_phash *data = (struct process_phash *)in; int thread_id = data->thread_id; thread_active[thread_id] = 1; // belt & braces o_log(INFORMATION, "[%d] Calculating pHash for docid = %d", thread_id, data->docid ); // Generate the pHash for this image. o_log(DEBUGM, "Calculating pHash for %s", data->filename ); unsigned long long hash = getImagePhash_fn( data->filename ); // Save the result back to the DB savePhash( data->docid, hash ); // Cleanup and go home free(data->filename); free(data); thread_active[thread_id] = 0; return NULL; }
char *internalDoScanningOperation(char *uuid, char *lang) { int request_resolution = 0; int docid; int current_page = 0; int total_requested_pages; double totbytes = 0; SANE_Status status; SANE_Handle *openDeviceHandle; SANE_Byte *raw_image; SANE_Parameters pars; char *docid_s; char *total_requested_pages_s; char *devName; char *outFilename; char *raw_image_format; char *header; o_log(DEBUGM, "doScanningOperation: sane initialized uuid(%s)",(char *)uuid); updateScanProgress(uuid, SCAN_WAITING_ON_SCANNER, 0); // Open the device devName = getScanParam(uuid, SCAN_PARAM_DEVNAME); o_log(DEBUGM, "sane_open of \"%s\"",devName); status = sane_open ((SANE_String_Const) devName, (SANE_Handle)&openDeviceHandle); if(status != SANE_STATUS_GOOD) { handleSaneErrors("Cannot open device ", devName, status, 0); updateScanProgress(uuid, SCAN_ERRO_FROM_SCANNER, status); free(devName); return 0; } free(devName); /* ========================================================== */ if ( ! setOptions( (char *)uuid, openDeviceHandle, &request_resolution ) ) return 0; o_log(DEBUGM, "sane_start: setOptions returned request_resolution %d\n",request_resolution); int timeout = 5; while( 0 < timeout ) { status = sane_start (openDeviceHandle); if(status == SANE_STATUS_GOOD) { break; } else { if(status == SANE_STATUS_DEVICE_BUSY ) { // BUSY signal could be the scanner just having a // bit of lag - specially network connected devices timeout--; if ( timeout == 0 ) { handleSaneErrors("Cannot start scanning", "even after trying several time", status, 0); updateScanProgress(uuid, SCAN_ERRO_FROM_SCANNER, status); return 0; } else { o_log(WARNING, "Device reports not ready to 'start', waiting 500ms. Will try another %d times", timeout); usleep(500 * 1000); // 500ms or 0.5sec } } else { handleSaneErrors("Cannot start scanning", "", status, 0); updateScanProgress(uuid, SCAN_ERRO_FROM_SCANNER, status); return 0; } } } // Get scanning params (from the scanner) if( request_resolution == 0 ) { o_log(DEBUGM, "Resolution did not get set in scanner setup."); updateScanProgress(uuid, SCAN_INTERNAL_ERROR, 10004); return 0; } o_log(DEBUGM, "Get scanning params"); status = sane_get_parameters (openDeviceHandle, &pars); o_log(INFORMATION, "Scanner Parm : stat=%s form=%d,lf=%d,bpl=%d,pixpl=%d,lin=%d,dep=%d", sane_strstatus (status), pars.format, pars.last_frame, pars.bytes_per_line, pars.pixels_per_line, pars.lines, pars.depth); switch (pars.format) { case SANE_FRAME_GRAY: o_log(DEBUGM, "Expecting Gray data (1 channel only)."); raw_image_format = o_strdup( "P5" ); break; case SANE_FRAME_RGB: o_log(DEBUGM, "Expecting RGB data (3 channels)."); raw_image_format = o_strdup( "P6" ); break; default: o_log(DEBUGM, "backend returns three frames speratly. We do not currently support this."); updateScanProgress(uuid, SCAN_INTERNAL_ERROR, 10003); return 0; break; } header = o_printf ("%s\n# SANE data follows\n%d %d\n%d\n", raw_image_format, pars.pixels_per_line, pars.lines, (pars.depth <= 8) ? 255 : 65535); free( raw_image_format ); // Save Record // docid_s = getScanParam(uuid, SCAN_PARAM_DOCID); total_requested_pages_s = getScanParam(uuid, SCAN_PARAM_REQUESTED_PAGES); total_requested_pages = atoi(total_requested_pages_s); free(total_requested_pages_s); if( docid_s == NULL ) { o_log(DEBUGM, "Saving record"); updateScanProgress(uuid, SCAN_DB_WORKING, 0); docid_s = addNewScannedDoc(pars.lines, pars.pixels_per_line, request_resolution, total_requested_pages); setScanParam(uuid, SCAN_PARAM_DOCID, docid_s); setScanParam(uuid, SCAN_PARAM_ON_PAGE, "1"); current_page = 1; } else { char *current_page_s = getScanParam(uuid, SCAN_PARAM_ON_PAGE); current_page = atoi(current_page_s); free(current_page_s); current_page++; current_page_s = itoa(current_page, 10); setScanParam(uuid, SCAN_PARAM_ON_PAGE, current_page_s); free(current_page_s); } docid = atoi(docid_s); free(docid_s); totbytes = (double)((pars.bytes_per_line * pars.lines)); /* ========================================================== */ raw_image = collectData( (char *)uuid, openDeviceHandle, totbytes, pars.bytes_per_line, header ); o_log(INFORMATION, "Scanning done."); o_log(DEBUGM, "sane_cancel"); sane_cancel(openDeviceHandle); o_log(DEBUGM, "sane_close"); sane_close(openDeviceHandle); // Convert Raw into JPEG // updateScanProgress(uuid, SCAN_CONVERTING_FORMAT, 0); PIX *pix; if ( ( pix = pixReadMem( raw_image, (pars.bytes_per_line*pars.lines)+strlen(header) ) ) == NULL) { o_log(ERROR, "Could not load the image data into a PIX"); } updateScanProgress(uuid, SCAN_CONVERTING_FORMAT, 55); o_log(INFORMATION, "Convertion process: Loaded (depth: %d)", pixGetDepth(pix)); free(raw_image); free(header); outFilename = o_printf("%s/scans/%d_%d.jpg", BASE_DIR, docid, current_page); pixWrite(outFilename, pix, IFF_JFIF_JPEG); free(outFilename); updateScanProgress(uuid, SCAN_CONVERTING_FORMAT, 100); o_log(INFORMATION, "Conversion process: Complete"); // Do OCR - on this page // - OCR libs just wants the raw data and not the image header ocrImage( uuid, docid, current_page, request_resolution, pix, lang ); #ifdef CAN_PHASH // Calulate the pHash, so we can compare images later if( current_page == 1 ) { updateScanProgress(uuid, SCAN_CALULATING_PHASH, 0); unsigned long long hash = getImagePhash_px( pix ); savePhash( docid, hash ); } #endif /* CAN_PHASH */ pixDestroy( &pix ); // cleaup && What should we do next // o_log(DEBUGM, "mostly done."); if(current_page >= total_requested_pages) updateScanProgress(uuid, SCAN_FINISHED, docid); else updateScanProgress(uuid, SCAN_WAITING_ON_NEW_PAGE, ++current_page); o_log(DEBUGM, "Page scan done."); return o_strdup("OK"); }
char *uploadfile(char *filename, char *lookForSimilar, char *lang) { #ifndef CAN_MAGIC o_log(ERROR, "Unable to determin the file type, aborting."); return NULL; #else int width = 0, height = 0, itype = PLACE_HOLDER; char *final_filename, *ocrText = NULL, *tmp; #ifdef CAN_PDF char *thumbext = NULL; #else #ifdef CAN_READODF char *thumbext = NULL; #endif /* CAN_READODF */ #endif /* CAN_PDF */ char *docid; char *ftype; char *datafile; char *thumbfile = NULL; PIX *pix; datafile = o_printf("/tmp/%s.dat", filename); magic_t cookie = magic_open(MAGIC_MIME_TYPE); magic_load( cookie, NULL ); const char *t = magic_file( cookie, datafile ); ftype = o_strdup( t ); o_log( ERROR, "Uploaded file looks to be of type: %s", ftype ); magic_close( cookie ); // -------------------------------------- if( 0 == strcmp("application/pdf", ftype) ) { itype = PDF_FILETYPE; #ifdef CAN_PDF thumbfile = o_printf("/tmp/%s.thumb", filename); ocrText = parse_pdf( datafile, thumbfile ); // pdf_plug.cc [create thumbnail and return body text] thumbext = o_strdup("jpg"); #endif /* CAN_PDF */ o_log( INFORMATION, "Processed PDF"); } // -------------------------------------- else if( 0 == strcmp("application/vnd.oasis.opendocument.text", ftype) ) { itype = ODF_FILETYPE; #ifdef CAN_READODF thumbfile = o_printf("/tmp/%s.thumb", filename); get_odf_Thumb( datafile, thumbfile ); ocrText = get_odf_Text( datafile ); // odf_plug.c thumbext = o_strdup("png"); #endif /* CAN_READODF */ o_log( INFORMATION, "Processed ODF doc"); } // -------------------------------------- else if( 0 == strcmp("image/jpeg", ftype) ) { itype = JPG_FILETYPE; #ifdef CAN_OCR PIX *pix_l; if ( ( pix_l = pixRead( datafile ) ) == NULL) { o_log(ERROR, "Could not load the image data into a PIX"); return NULL; } int depth; pixGetDimensions( pix_l, &width, &height, &depth ); o_log(INFORMATION, "Convertion process: Loaded (depth: %d)", depth ); pix = pixScaleRGBToGrayFast( pix_l, 1, COLOR_GREEN ); pixDestroy( &pix_l ); if (pix == NULL ) { o_log(ERROR,"Conversion process failed pixScaleRGBToGrayFast! skip ocr"); } else { o_log(INFORMATION, "Convertion process: Reduced depth to %d", pixGetDepth(pix)); ocrText = getTextFromImage(pix, 0, "eng"); } #endif /* CAN_OCR */ o_log( INFORMATION, "Processed JPG doc"); } // -------------------------------------- else { free( ftype ); free( datafile ); o_log(ERROR, "unknown file type."); return NULL; } free( ftype ); // Set a default OCR text string if( ocrText == NULL ) { ocrText = o_strdup( getString("LOCAL_ocr_default_text", lang ) ); } // Save the record to the DB o_log(DEBUGM, "Saving doc import record"); docid = addNewFileDoc(itype, width, height, ocrText); // ocrText get freed in this method // Move the main datafile to the file store location final_filename = o_printf("%s/scans/%s", BASE_DIR, docid); // none image imported docs, are stored with no "_x" postfix. if( itype == JPG_FILETYPE ) { conCat(&final_filename, "_1"); } addFileExt(&final_filename, itype); fcopy(datafile, final_filename); o_log( DEBUGM, "Moved data file"); // The original file will be unlinked by the HTTPD process free(datafile); // Move any thumbnail image to the file store location if( thumbfile ) { free(final_filename); // This currently holds the main PDG or ODF file. final_filename = o_printf("%s/scans/%s_thumb.%s", BASE_DIR, docid, thumbext); // any thumbnails are postfixed with "_thumb" fcopy(thumbfile, final_filename); o_log( DEBUGM, "Moved thumbnail file"); unlink(thumbfile); free(thumbfile); free(thumbext); #ifdef CAN_PHASH o_log( DEBUGM, "About to perform pHash on file"); unsigned long long hash = getImagePhash_fn( final_filename ); savePhash( atoi(docid), hash ); #endif /* CAN_PHASH */ } else { #ifdef CAN_PHASH o_log( DEBUGM, "About to perform pHash on pix"); unsigned long long hash = getImagePhash_px( pix ); savePhash( atoi(docid), hash ); #endif /* CAN_PHASH */ pixDestroy( &pix ); } free(final_filename); // Should we look for a similar doc, on opening? char *findSim = ""; #ifdef CAN_PHASH if( lookForSimilar != (void *)NULL ) { findSim = "&findSimilar=1"; } #endif /* CAN_PHASH */ // Open the document for editing. tmp = o_printf("<html><HEAD><META HTTP-EQUIV=\"refresh\" CONTENT=\"0;URL=/opendias/docDetail.html?docid=%s%s\"></HEAD><body></body></html>", docid, findSim); free(docid); return tmp; #endif /* CAN_MAGIC */ }