Ejemplo n.º 1
0
void *process_doc( void *in ) {

  struct process_phash *data = (struct process_phash *)in;
  int thread_id = data->thread_id;
  thread_active[thread_id] = 1; // belt & braces
  o_log(INFORMATION, "[%d] Calculating pHash for docid = %d", thread_id, data->docid );

  // Generate the pHash for this image.
  o_log(DEBUGM, "Calculating pHash for %s", data->filename );
  unsigned long long hash = getImagePhash_fn( data->filename );

  // Save the result back to the DB
  savePhash( data->docid, hash );

  // Cleanup and go home
  free(data->filename);
  free(data);
  thread_active[thread_id] = 0;
  return NULL;
}
Ejemplo n.º 2
0
char *uploadfile(char *filename, char *lookForSimilar, char *lang) {

#ifndef CAN_MAGIC
  o_log(ERROR, "Unable to determin the file type, aborting.");
  return NULL;
#else

  int width = 0, height = 0, itype = PLACE_HOLDER;
  char *final_filename, *ocrText = NULL, *tmp;
#ifdef CAN_PDF
  char *thumbext = NULL;
#else
#ifdef CAN_READODF
  char *thumbext = NULL;
#endif /* CAN_READODF */
#endif /* CAN_PDF */
  char *docid;
  char *ftype;
  char *datafile;
  char *thumbfile = NULL;
  PIX *pix;

  datafile = o_printf("/tmp/%s.dat", filename);
  magic_t cookie = magic_open(MAGIC_MIME_TYPE);
  magic_load( cookie, NULL );
  const char *t = magic_file( cookie, datafile );
  ftype = o_strdup( t );
  o_log( ERROR, "Uploaded file looks to be of type: %s", ftype );
  magic_close( cookie );

  // --------------------------------------
  if( 0 == strcmp("application/pdf", ftype) ) {
    itype = PDF_FILETYPE;
#ifdef CAN_PDF
    thumbfile = o_printf("/tmp/%s.thumb", filename);
    ocrText = parse_pdf( datafile, thumbfile ); // pdf_plug.cc [create thumbnail and return body text] 
    thumbext = o_strdup("jpg");
#endif /* CAN_PDF */
    o_log( INFORMATION, "Processed PDF");
  }

  // --------------------------------------
  else if( 0 == strcmp("application/vnd.oasis.opendocument.text", ftype) ) {
    itype = ODF_FILETYPE;
#ifdef CAN_READODF
    thumbfile = o_printf("/tmp/%s.thumb", filename);
    get_odf_Thumb( datafile, thumbfile );
    ocrText = get_odf_Text( datafile ); // odf_plug.c 
    thumbext = o_strdup("png");
#endif /* CAN_READODF */
    o_log( INFORMATION, "Processed ODF doc");
  }

  // --------------------------------------
  else if( 0 == strcmp("image/jpeg", ftype) ) {
    itype = JPG_FILETYPE;
#ifdef CAN_OCR
    PIX *pix_l;
    if ( ( pix_l = pixRead( datafile ) ) == NULL) {
      o_log(ERROR, "Could not load the image data into a PIX");
      return NULL;
    }
    int depth;
    pixGetDimensions( pix_l, &width, &height, &depth );
    o_log(INFORMATION, "Convertion process: Loaded (depth: %d)", depth );
    pix = pixScaleRGBToGrayFast( pix_l, 1, COLOR_GREEN );
    pixDestroy( &pix_l );
    if (pix == NULL ) {
      o_log(ERROR,"Conversion process failed pixScaleRGBToGrayFast! skip ocr");
    }
    else {
      o_log(INFORMATION, "Convertion process: Reduced depth to %d", pixGetDepth(pix));
      ocrText = getTextFromImage(pix, 0, "eng");
    }
#endif /* CAN_OCR */
    o_log( INFORMATION, "Processed JPG doc");
  }

  // --------------------------------------
  else {
    free( ftype );
    free( datafile );
    o_log(ERROR, "unknown file type.");
    return NULL;
  }
  free( ftype );

  // Set a default OCR text string
  if( ocrText == NULL ) {
    ocrText = o_strdup( getString("LOCAL_ocr_default_text", lang ) );
  }

  // Save the record to the DB
  o_log(DEBUGM, "Saving doc import record");
  docid = addNewFileDoc(itype, width, height, ocrText); // ocrText get freed in this method

  // Move the main datafile to the file store location
  final_filename = o_printf("%s/scans/%s", BASE_DIR, docid); // none image imported docs, are stored with no "_x" postfix.
  if( itype == JPG_FILETYPE ) {
    conCat(&final_filename, "_1");
  }
  addFileExt(&final_filename, itype);

  fcopy(datafile, final_filename);
  o_log( DEBUGM, "Moved data file");
  // The original file will be unlinked by the HTTPD process
  free(datafile);

  // Move any thumbnail image to the file store location
  if( thumbfile ) {
    free(final_filename); // This currently holds the main PDG or ODF file.
    final_filename = o_printf("%s/scans/%s_thumb.%s", BASE_DIR, docid, thumbext); // any thumbnails are postfixed with "_thumb"
    fcopy(thumbfile, final_filename);
    o_log( DEBUGM, "Moved thumbnail file");
    unlink(thumbfile);
    free(thumbfile);
    free(thumbext);

#ifdef CAN_PHASH
    o_log( DEBUGM, "About to perform pHash on file");
    unsigned long long hash = getImagePhash_fn( final_filename );
    savePhash( atoi(docid), hash );
#endif /* CAN_PHASH */
  }
  else {
#ifdef CAN_PHASH
    o_log( DEBUGM, "About to perform pHash on pix");
    unsigned long long hash = getImagePhash_px( pix );
    savePhash( atoi(docid), hash );
#endif /* CAN_PHASH */
    pixDestroy( &pix );
  }
  free(final_filename);

  // Should we look for a similar doc, on opening?
  char *findSim = "";
#ifdef CAN_PHASH
  if( lookForSimilar != (void *)NULL ) {
    findSim = "&findSimilar=1";
  }
#endif /*  CAN_PHASH */

  // Open the document for editing.
  tmp = o_printf("<html><HEAD><META HTTP-EQUIV=\"refresh\" CONTENT=\"0;URL=/opendias/docDetail.html?docid=%s%s\"></HEAD><body></body></html>", docid, findSim);
  free(docid);

  return tmp;
#endif /* CAN_MAGIC */
}