C++ (Cpp) dt_alloc_align Examples

Example #1

0

Show file

File: pfm.c Project: AdamMajer/darktable

/**
 * Write an image to `filename` as a PFM file ("PF" header, 3 floats per pixel).
 *
 * data:     module parameters; only width and height are read here.
 * filename: path of the file to create (opened with "wb").
 * ivoid:    input pixels, read as floats with 4 values per pixel; only the
 *           first 3 values of every pixel are written.
 * exif, exif_len, imgid, num, total: not used by this writer.
 *
 * Returns 0 on success and 1 if the file could not be opened, the line buffer
 * could not be allocated, or any write (including the final flush) failed.
 */
int write_image(dt_imageio_module_data_t *data, const char *filename, const void *ivoid, void *exif,
                int exif_len, int imgid, int num, int total)
{
  const dt_imageio_module_data_t *const pfm = data;

  FILE *f = fopen(filename, "wb");
  if(!f) return 1; // nothing was written: report failure instead of success

  // one output row of packed RGB floats
  void *buf_line = dt_alloc_align(16, 3 * sizeof(float) * pfm->width);
  if(!buf_line)
  {
    fclose(f);
    return 1;
  }

  int status = 0;
  if(fprintf(f, "PF\n%d %d\n-1.0\n", pfm->width, pfm->height) < 0) status = 1;

  // INFO: per-line fwrite call seems to perform best. LebedevRI, 18.04.2014
  // stop at the first failed row so that a later successful write cannot
  // reset the error status.
  for(int j = 0; j < pfm->height && status == 0; j++)
  {
    // NOTE: pfm has rows in reverse order
    const int row_in = pfm->height - 1 - j;
    const float *in = (const float *)ivoid + 4 * (size_t)pfm->width * row_in;
    float *out = (float *)buf_line;
    for(int i = 0; i < pfm->width; i++, in += 4, out += 3)
    {
      // drop the 4th value of every input pixel
      memcpy(out, in, 3 * sizeof(float));
    }
    const size_t cnt = fwrite(buf_line, 3 * sizeof(float), pfm->width, f);
    if(cnt != (size_t)pfm->width) status = 1;
  }

  dt_free_align(buf_line);
  // fclose flushes buffered data, so its result is part of the write status
  if(fclose(f) != 0) status = 1;
  return status;
}

Example #2

0

Show file

File: pixelpipe_cache.c Project: CaptainSifff/darktable

/**
 * Initialise a pixelpipe cache with `entries` slots, each backed by a
 * 16-byte aligned buffer of `size` bytes.
 *
 * Every slot starts with hash -1 and a use counter of 0; the query and miss
 * counters are reset to 0.
 *
 * Returns 1 on success. Returns 0 if any allocation fails; in that case
 * everything allocated so far is released and the cache is left empty
 * (NULL arrays, 0 entries).
 */
int dt_dev_pixelpipe_cache_init(dt_dev_pixelpipe_cache_t *cache, int entries, size_t size)
{
  cache->entries = entries;
  cache->data = (void **)calloc(entries, sizeof(void *));
  cache->size = (size_t *)calloc(entries, sizeof(size_t));
  cache->hash = (uint64_t *)calloc(entries, sizeof(uint64_t));
  cache->used = (int32_t *)calloc(entries, sizeof(int32_t));
  // calloc(0, ..) may legitimately return NULL, so only treat NULL as an
  // error when there is something to allocate.
  if(entries > 0 && (!cache->data || !cache->size || !cache->hash || !cache->used))
    goto alloc_memory_fail;

  for(int k = 0; k < entries; k++)
  {
    cache->data[k] = (void *)dt_alloc_align(16, size);
    if(!cache->data[k]) goto alloc_memory_fail;
    cache->size[k] = size;
#ifdef _DEBUG
    memset(cache->data[k], 0x5d, size);
#endif
    cache->hash[k] = -1;
    cache->used[k] = 0;
  }
  cache->queries = cache->misses = 0;
  return 1;

alloc_memory_fail:
  if(cache->data)
  {
    // slots that were never allocated are still NULL thanks to calloc
    for(int k = 0; k < entries; k++)
    {
      if(cache->data[k]) dt_free_align(cache->data[k]);
    }
  }

  free(cache->data);
  free(cache->size);
  free(cache->hash);
  free(cache->used);

  // do not leave dangling pointers behind for a later cleanup call
  cache->data = NULL;
  cache->size = NULL;
  cache->hash = NULL;
  cache->used = NULL;
  cache->entries = 0;

  return 0;
}

Example #3

0

Show file

File: image_cache.c Project: amitkr/darktable

void
dt_image_cache_init(dt_image_cache_t *cache)
{
  // no serialization for the image cache: the data lives in db/xmp and the
  // image_t structs should be cheap enough to fetch from sql again.
  // TODO: make the budget an independent conf var? too large is dangerous
  //       and wasteful; maybe a fixed size is good enough.
  const uint32_t mem_budget = 50*1024*1024;
  uint32_t entries = (uint32_t)(1.5f*mem_budget/sizeof(dt_image_t));

  dt_cache_init(&cache->cache, entries, 16, 64, mem_budget);
  dt_cache_set_allocate_callback(&cache->cache, &dt_image_cache_allocate,   cache);
  dt_cache_set_cleanup_callback (&cache->cache, &dt_image_cache_deallocate, cache);

  // the backend reports the capacity it actually uses
  // (might have been rounded to power of two):
  entries = dt_cache_capacity(&cache->cache);
  cache->images = dt_alloc_align(64, sizeof(dt_image_t)*entries);
  dt_print(DT_DEBUG_CACHE, "[image_cache] has %d entries\n", entries);

  // set up slot 0 as empty image data, then clone it into every other slot
  // (cheaper than calling the init function each time, avoids accessing conf):
  dt_image_init(cache->images);
  uint32_t slot = 1;
  while(slot < entries)
  {
    memcpy(cache->images + slot, cache->images, sizeof(dt_image_t));
    slot++;
  }
}

Example #4

0

Show file

// cache backend callback: makes sure the payload pointer *buf refers to a
// buffer descriptor (allocating a minimal one if needed), flags it for
// generation and reports a flat cost of one.
int32_t
dt_mipmap_cache_allocate_dynamic(void *data, const uint32_t key, int32_t *cost, void **buf)
{
  // for full image buffers
  struct dt_mipmap_buffer_dsc* dsc = *buf;

  if(dsc == NULL)
  {
    // mere minimum: the header plus a broken image buffer
    const size_t min_size = sizeof(*dsc)+sizeof(float)*4*64;

    *buf = dt_alloc_align(16, min_size);
    if(*buf == NULL)
    {
      fprintf(stderr, "[mipmap cache] memory allocation failed!\n");
      exit(1);
    }

    dsc = *buf;
    dsc->width = 0;
    dsc->height = 0;
    dsc->size = min_size;
  }

  assert(dsc->size >= sizeof(*dsc));
  dsc->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE;

  // buffers may differ in size, so each one counts as a flat cost of one
  // to keep the quota meaningful.
  *cost = 1;

  // request write lock
  return 1;
}

Example #5

0

Show file

File: pixelpipe_gegl.c Project: AntonSh/darktable

/**
 * Process the region (x, y, width, height) at the given scale through the
 * gegl graph and blit the result as "RGBA u8" into pipe->backbuf.
 *
 * The back buffer is (re)allocated under pipe->backbuf_mutex when it is
 * smaller than width*height*4 bytes.
 *
 * Returns 0 when processing finished, 1 when it was aborted because the
 * pipe changed or the gui is leaving, or when the back buffer could not be
 * allocated. pipe->processing is 1 while this function runs and is reset to
 * 0 on every return path.
 */
int dt_dev_pixelpipe_process(dt_dev_pixelpipe_t *pipe, dt_develop_t *dev, int x, int y, int width, int height, float scale)
{
  pipe->processing = 1;
  printf("pixelpipe process start\n");

  // have backbuf in right size (computed in size_t to avoid int overflow):
  const size_t backbuf_needed = (size_t)width*height*4*sizeof(uint8_t);
  if(pipe->backbuf_size < backbuf_needed)
  {
    pthread_mutex_lock(&pipe->backbuf_mutex);
    // NOTE(review): the buffer comes from dt_alloc_align but is released
    // with free(); confirm this matches the allocator used by this tree.
    free(pipe->backbuf);
    pipe->backbuf = (uint8_t *)dt_alloc_align(16, backbuf_needed);
    const int alloc_failed = (pipe->backbuf == NULL);
    // never advertise a size for a buffer we do not have
    pipe->backbuf_size = alloc_failed ? 0 : backbuf_needed;
    pthread_mutex_unlock(&pipe->backbuf_mutex);
    if(alloc_failed)
    {
      pipe->processing = 0;
      return 1;
    }
  }

  // scale node (is slow):
  // scale *= 2;
  // FIXME: this seems to be a bug in gegl. need to manually adjust updated roi here.
  GeglRectangle roi  = (GeglRectangle)
  {
    x, y, width, height
  };
  GeglRectangle roio = (GeglRectangle)
  {
    roi.x/scale, roi.y/scale, roi.width/scale, roi.height/scale
  };
  roio.x      = MAX(0, roio.x);
  roio.y      = MAX(0, roio.y);
  roio.width  = MIN(pipe->iwidth -roio.x-1, roio.width);
  roio.height = MIN(pipe->iheight-roio.y-1, roio.height);
  GeglProcessor *processor = gegl_node_new_processor (pipe->output, &roio);
  // gegl_node_set(pipe->scale, "x", scale, "y", scale, NULL);
  // GeglProcessor *processor = gegl_node_new_processor (pipe->output, roi);
  double         progress;

  // TODO: insert constant scale node at beginning, maintain lo-res branch of pipeline (shadowed).
  // TODO: decide on scale param, which one to use.

  while (gegl_processor_work (processor, &progress))
  {
    // if history changed, abort processing?
    if(pipe->changed != DT_DEV_PIPE_UNCHANGED || dev->gui_leaving)
    {
      // release the processor and clear the busy flag before bailing out
      gegl_processor_destroy (processor);
      pipe->processing = 0;
      return 1;
    }
  }
  gegl_processor_destroy (processor);

  // gegl scale node turned out to be even slower :(
  gegl_node_blit (pipe->output, scale, &roi, babl_format("RGBA u8"), pipe->backbuf, GEGL_AUTO_ROWSTRIDE, GEGL_BLIT_CACHE);
  // gegl_node_blit (pipe->output, 1.0, roi, babl_format("RGBA u8"), output, GEGL_AUTO_ROWSTRIDE, GEGL_BLIT_CACHE);

  // TODO: update histograms here with this data?
  printf("pixelpipe process end\n");
  pipe->processing = 0;
  return 0;
}

Example #6

0

Show file

File: mipmap_cache.c Project: rharrison10/darktable

// compression stuff: hands out a 64-byte aligned scratch buffer of
// max_width * max_height * 4 bytes (DT_MIPMAP_3 dimensions) when a
// compression type is set, NULL otherwise.
uint8_t*
dt_mipmap_cache_alloc_scratchmem(
  const dt_mipmap_cache_t *cache)
{
  // no compression, no buffer:
  if(!cache->compression_type) return NULL;

  const size_t pixels = cache->mip[DT_MIPMAP_3].max_width *
                        cache->mip[DT_MIPMAP_3].max_height;

  return dt_alloc_align(64, pixels * 4 * sizeof(uint8_t));
}

Example #7

0

Show file

File: pixelpipe_cache.c Project: Acidburn0zzz/darktable

/**
 * Look up `hash` in the cache and hand back a buffer of at least `size`
 * bytes through *data.
 *
 * Every call ages all entries by one; the entry handed out gets its use
 * counter set to `weight`.
 *
 * Returns 0 on a hit (matching hash with a large enough buffer). Returns 1
 * on a miss: the entry with the highest use counter is recycled, grown if
 * needed, and tagged with `hash`. If growing the buffer fails, *data is NULL
 * and the slot's recorded size is 0 so that a later call retries the
 * allocation.
 */
int dt_dev_pixelpipe_cache_get_weighted(dt_dev_pixelpipe_cache_t *cache, const uint64_t hash,
                                        const size_t size, void **data, int weight)
{
  cache->queries++;
  *data = NULL;
  int max_used = -1, max = 0;
  size_t sz = 0;
  for(int k = 0; k < cache->entries; k++)
  {
    // remember the oldest entry as replacement candidate
    if(cache->used[k] > max_used)
    {
      max_used = cache->used[k];
      max = k;
    }
    cache->used[k]++; // age all entries
    // search for hash in cache
    if(cache->hash[k] == hash)
    {
      *data = cache->data[k];
      sz = cache->size[k];
      cache->used[k] = weight; // this is the MRU entry
    }
  }

  if(!*data || sz < size)
  {
    // kill LRU entry
    // printf("[pixelpipe_cache_get] hash not found, returning slot %d/%d age %d\n", max, cache->entries,
    // weight);
    if(cache->size[max] < size)
    {
      dt_free_align(cache->data[max]);
      cache->data[max] = (void *)dt_alloc_align(16, size);
      // only record the new size if the buffer really exists; otherwise the
      // slot would look large enough forever and keep handing out NULL.
      cache->size[max] = cache->data[max] ? size : 0;
    }
    *data = cache->data[max];
    cache->hash[max] = hash;
    cache->used[max] = weight;
    cache->misses++;
    return 1;
  }
  else
    return 0;
}

Example #8

0

Show file

File: gaussian.c Project: chubinou/darktable

/**
 * Allocate and fill a gaussian blur descriptor.
 *
 * The per-channel max/min clamping values are copied, so the caller keeps
 * ownership of the arrays passed in. A 64-byte aligned work buffer of
 * width*height*channels floats is allocated as well.
 *
 * Returns the new descriptor, or NULL if any allocation failed (nothing is
 * leaked in that case).
 */
dt_gaussian_t *
dt_gaussian_init(
    const int width,       // width of input image
    const int height,      // height of input image
    const int channels,    // channels per pixel
    const float *max,      // maximum allowed values per channel for clamping
    const float *min,      // minimum allowed values per channel for clamping
    const float sigma,     // gaussian sigma
    const int order)       // order of gaussian blur
{
  dt_gaussian_t *g = (dt_gaussian_t *)malloc(sizeof(dt_gaussian_t));
  if(!g) return NULL;

  g->width = width;
  g->height = height;
  g->channels = channels;
  g->sigma = sigma;
  g->order = order;
  g->buf = NULL;
  g->max = (float *)malloc(channels * sizeof(float));
  g->min = (float *)malloc(channels * sizeof(float));

  if(!g->min || !g->max) goto error;

  for(int k=0; k < channels; k++)
  {
    g->max[k] = max[k];
    g->min[k] = min[k];
  }

  // multiply in size_t: width*height*channels overflows int for large images
  g->buf = dt_alloc_align(64, (size_t)width*height*channels*sizeof(float));
  if(!g->buf) goto error;

  return g;

error:
  // g->buf is always NULL when we get here, so this is a no-op
  free(g->buf);
  free(g->max);
  free(g->min);
  free(g);
  return NULL;
}

Example #9

0

Show file

File: pfm.c Project: PkmX/darktable

/**
 * Write an image to `filename` as a PFM file ("PF" header, 3 floats per pixel).
 *
 * The header is padded with '0' characters so that, including its final
 * newline, its length is a multiple of 16 bytes.
 *
 * data:     module parameters; only width and height are read here.
 * filename: path of the file to create (opened with "wb").
 * ivoid:    input pixels, read as floats with 4 values per pixel; only the
 *           first 3 values of every pixel are written.
 * exif, exif_len, imgid, num, total: not used by this writer.
 *
 * Returns 0 on success and 1 if the file could not be opened, the line buffer
 * could not be allocated, or any write (including the final flush) failed.
 */
int write_image(dt_imageio_module_data_t *data, const char *filename, const void *ivoid, void *exif,
                int exif_len, int imgid, int num, int total)
{
  const dt_imageio_module_data_t *const pfm = data;

  FILE *f = fopen(filename, "wb");
  if(!f) return 1; // nothing was written: report failure instead of success

  // one output row of packed RGB floats
  void *buf_line = dt_alloc_align(16, 3 * sizeof(float) * pfm->width);
  if(!buf_line)
  {
    fclose(f);
    return 1;
  }

  int status = 0;

  // align pfm header to sse, assuming the file will
  // be mmapped to page boundaries.
  char header[1024];
  snprintf(header, sizeof(header), "PF\n%d %d\n-1.0", pfm->width, pfm->height);
  const size_t len = strlen(header);
  if(fputs(header, f) == EOF) status = 1;
  // number of '0' characters needed so that header + padding + '\n' is a
  // multiple of 16 bytes
  size_t pad = 0;
  while((len + 1 + pad) & 0xf) pad++;
  for(size_t p = 0; p < pad; p++)
  {
    if(fputc('0', f) == EOF) status = 1;
  }
  if(fputc('\n', f) == EOF) status = 1;

  // stop at the first failed row so that a later successful write cannot
  // reset the error status.
  for(int j = 0; j < pfm->height && status == 0; j++)
  {
    // NOTE: pfm has rows in reverse order
    const int row_in = pfm->height - 1 - j;
    const float *in = (const float *)ivoid + 4 * (size_t)pfm->width * row_in;
    float *out = (float *)buf_line;
    for(int i = 0; i < pfm->width; i++, in += 4, out += 3)
    {
      // drop the 4th value of every input pixel
      memcpy(out, in, 3 * sizeof(float));
    }
    // INFO: per-line fwrite call seems to perform best. LebedevRI, 18.04.2014
    const size_t cnt = fwrite(buf_line, 3 * sizeof(float), pfm->width, f);
    if(cnt != (size_t)pfm->width) status = 1;
  }

  dt_free_align(buf_line);
  // fclose flushes buffered data, so its result is part of the write status
  if(fclose(f) != 0) status = 1;
  return status;
}

Example #10

0

Show file

File: mipmap_cache.c Project: rharrison10/darktable

// callback for the imageio core to allocate memory.
// only needed for _F and _FULL buffers, as they change size
// with the input image. will allocate img->width*img->height*img->bpp bytes
// plus room for the buffer descriptor in front of the pixels.
//
// `a` is interpreted as a pointer to the descriptor pointer owned by the
// cache. returns a pointer to the payload right behind the descriptor, or
// NULL if the allocation failed (the descriptor pointer then refers to the
// static dead image).
void*
dt_mipmap_cache_alloc(dt_image_t *img, dt_mipmap_size_t size, dt_mipmap_cache_allocator_t a)
{
  assert(size == DT_MIPMAP_FULL);

  struct dt_mipmap_buffer_dsc** dsc = (struct dt_mipmap_buffer_dsc**)a;

  int32_t wd = img->width;
  int32_t ht = img->height;
  int32_t bpp = img->bpp;
  // multiply in size_t: wd*ht*bpp overflows 32-bit arithmetic for large images.
  // NOTE(review): the width of the descriptor's size field is not visible
  // here; confirm it can hold buffers of 4GiB and more if those are expected.
  const size_t buffer_size = (size_t)wd*ht*bpp + sizeof(**dsc);

  // buf might have been alloc'ed before,
  // so only check size and re-alloc if necessary:
  if(!(*dsc) || ((*dsc)->size < buffer_size) || ((void *)*dsc == (void *)dt_mipmap_cache_static_dead_image))
  {
    // the static dead image is not heap memory and must never be freed
    if((void *)*dsc != (void *)dt_mipmap_cache_static_dead_image)
      dt_free_align(*dsc);
    *dsc = dt_alloc_align(64, buffer_size);
    // fprintf(stderr, "[mipmap cache] alloc for key %u %p\n", get_key(img->id, size), *buf);
    if(!(*dsc))
    {
      // return fallback: at least alloc size for a dead image:
      *dsc = (struct dt_mipmap_buffer_dsc *)dt_mipmap_cache_static_dead_image;
      // allocator holds the pointer. but imageio client is tricked to believe allocation failed:
      return NULL;
    }
    // set buffer size only if we're making it larger.
    (*dsc)->size = buffer_size;
  }
  (*dsc)->width = wd;
  (*dsc)->height = ht;
  (*dsc)->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE;

  // fprintf(stderr, "full buffer allocating img %u %d x %d = %u bytes (%p)\n", img->id, img->width, img->height, buffer_size, *buf);

  // trick the user into using a pointer without the header:
  return (*dsc)+1;
}

Example #11

0

Show file

File: locallaplacian.c Project: rgo/darktable

// allocate output buffer with monochrome brightness channel from input, padded
// up by max_supp on all four sides, dimensions written to wd2 ht2.
//
// input is read with a stride of 4 floats per pixel and only the first value
// of each pixel is used, scaled by 0.01. the padding replicates the nearest
// border pixel: left/right columns first, then whole rows for top/bottom.
// the returned buffer comes from dt_alloc_align.
// NOTE(review): the allocation result is not checked before it is written to.
static inline float *ll_pad_input(
    const float *const input,
    const int wd,
    const int ht,
    const int max_supp,
    int *wd2,
    int *ht2)
{
  const int stride = 4;
  *wd2 = 2*max_supp + wd;
  *ht2 = 2*max_supp + ht;
  float *const out = dt_alloc_align(16, *wd2**ht2*sizeof(*out));

  // fill the rows that correspond to input rows (output rows max_supp..max_supp+ht-1)
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic) default(none) shared(wd2, ht2)
#endif
  for(int j=0;j<ht;j++)
  {
    // left padding: repeat the first pixel of the input row
    for(int i=0;i<max_supp;i++)
      out[(j+max_supp)**wd2+i] = input[stride*wd*j]* 0.01f; // L -> [0,1]
    // the image itself
    for(int i=0;i<wd;i++)
      out[(j+max_supp)**wd2+i+max_supp] = input[stride*(wd*j+i)] * 0.01f; // L -> [0,1]
    // right padding: repeat the last pixel of the input row
    for(int i=wd+max_supp;i<*wd2;i++)
      out[(j+max_supp)**wd2+i] = input[stride*(j*wd+wd-1)] * 0.01f; // L -> [0,1]
  }
  // top padding: copy the first filled row (output row max_supp) upwards
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic) default(none) shared(wd2, ht2)
#endif
  for(int j=0;j<max_supp;j++)
    memcpy(out + *wd2*j, out+max_supp**wd2, sizeof(float)**wd2);
  // bottom padding: copy the last filled row (output row max_supp+ht-1) downwards
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic) default(none) shared(wd2, ht2)
#endif
  for(int j=max_supp+ht;j<*ht2;j++)
    memcpy(out + *wd2*j, out + *wd2*(max_supp+ht-1), sizeof(float)**wd2);

  return out;
}

Example #12

0

Show file

File: mipmap_cache.c Project: pedrocr/darktable

// imageio core callback for allocating pixel memory.
// used for _F and _FULL buffers only, since their size follows the input
// image: img->width*img->height*img->bpp bytes plus the descriptor header.
// returns the start of the payload, or NULL if the allocation failed.
void *dt_mipmap_cache_alloc(dt_mipmap_buffer_t *buf, const dt_image_t *img)
{
  assert(buf->size == DT_MIPMAP_FULL);

  const int w = img->width;
  const int h = img->height;
  struct dt_mipmap_buffer_dsc *header = (struct dt_mipmap_buffer_dsc *)buf->cache_entry->data;
  const size_t needed = (size_t)w*h*img->bpp + sizeof(*header);
  const int is_dead_image = ((void *)header == (void *)dt_mipmap_cache_static_dead_image);

  // a previous allocation may still be around; keep it unless it is missing,
  // too small, or the static dead image placeholder:
  if(!buf->buf || (header->size < needed) || is_dead_image)
  {
    // the static dead image is never freed
    if(!is_dead_image) dt_free_align(buf->cache_entry->data);

    buf->cache_entry->data = dt_alloc_align(64, needed);
    if(buf->cache_entry->data == NULL)
    {
      // fall back to the dead image so the cache entry still points somewhere valid,
      buf->cache_entry->data = (void*)dt_mipmap_cache_static_dead_image;
      // but tell the imageio client that the allocation failed:
      return NULL;
    }

    // the size is only updated when a new (larger) buffer was made.
    header = (struct dt_mipmap_buffer_dsc *)buf->cache_entry->data;
    header->size = needed;
  }

  header->width = w;
  header->height = h;
  header->color_space = DT_COLORSPACE_NONE;
  header->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE;

  // the payload starts right behind the descriptor
  buf->buf = (uint8_t *)(header + 1);
  return header + 1;
}

Example #13

0

Show file

File: locallaplacian.c Project: rgo/darktable

// local laplacian filter on the first channel of a 4-floats-per-pixel buffer.
//
// the first channel of input is padded and scaled to [0,1] (ll_pad_input),
// a gaussian pyramid of it is built, num_gamma remapped copies of the padded
// input are turned into their own pyramids, and the output pyramid is then
// assembled coarse to fine by interpolating laplacian coefficients between
// the two remapped pyramids whose gamma brackets the local brightness.
//
// out receives 100 * result in channel 0 and the unchanged input channels
// 1 and 2; channel 3 of out is not written.
// NOTE(review): none of the dt_alloc_align results are checked, and wd/ht
// are assumed to be large enough for at least two pyramid levels.
void local_laplacian_internal(
    const float *const input,   // input buffer in some Labx or yuvx format
    float *const out,           // output buffer with colour
    const int wd,               // width and
    const int ht,               // height of the input buffer
    const float sigma,          // user param: separate shadows/midtones/highlights
    const float shadows,        // user param: lift shadows
    const float highlights,     // user param: compress highlights
    const float clarity,        // user param: increase clarity/local contrast
    const int use_sse2)         // flag whether to use SSE version
{
#define max_levels 30
#define num_gamma 6
  // don't divide by 2 more often than we can:
  const int num_levels = MIN(max_levels, 31-__builtin_clz(MIN(wd,ht)));
  const int max_supp = 1<<(num_levels-1);
  int w, h;
  float *padded[max_levels] = {0};
  padded[0] = ll_pad_input(input, wd, ht, max_supp, &w, &h);

  // allocate pyramid pointers for padded input
  for(int l=1;l<num_levels;l++)
    padded[l] = dt_alloc_align(16, sizeof(float)*dl(w,l)*dl(h,l));

  // allocate pyramid pointers for output
  float *output[max_levels] = {0};
  for(int l=0;l<num_levels;l++)
    output[l] = dt_alloc_align(16, sizeof(float)*dl(w,l)*dl(h,l));

  // create gauss pyramid of padded input, write coarse directly to output
#if defined(__SSE2__)
  if(use_sse2)
  {
    for(int l=1;l<num_levels-1;l++)
      gauss_reduce_sse2(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
    gauss_reduce_sse2(padded[num_levels-2], output[num_levels-1], dl(w,num_levels-2), dl(h,num_levels-2));
  }
  else
#endif
  {
    for(int l=1;l<num_levels-1;l++)
      gauss_reduce(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
    gauss_reduce(padded[num_levels-2], output[num_levels-1], dl(w,num_levels-2), dl(h,num_levels-2));
  }

  // evenly sample brightness [0,1]:
  float gamma[num_gamma] = {0.0f};
  for(int k=0;k<num_gamma;k++) gamma[k] = (k+.5f)/(float)num_gamma;
  // for(int k=0;k<num_gamma;k++) gamma[k] = k/(num_gamma-1.0f);

  // allocate memory for intermediate laplacian pyramids
  float *buf[num_gamma][max_levels] = {{0}};
  for(int k=0;k<num_gamma;k++) for(int l=0;l<num_levels;l++)
    buf[k][l] = dt_alloc_align(16, sizeof(float)*dl(w,l)*dl(h,l));

  // the paper says remapping only level 3 not 0 does the trick, too
  // (but i really like the additional octave of sharpness we get,
  // willing to pay the cost).
  for(int k=0;k<num_gamma;k++)
  { // process images
#if defined(__SSE2__)
    if(use_sse2)
      apply_curve_sse2(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);
    else // brackets in next line needed for silly gcc warning:
#endif
    {apply_curve(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);}

    // create gaussian pyramids
    for(int l=1;l<num_levels;l++)
#if defined(__SSE2__)
      if(use_sse2)
        gauss_reduce_sse2(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
      else
#endif
        gauss_reduce(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
  }

  // assemble output pyramid coarse to fine
  for(int l=num_levels-2;l >= 0; l--)
  {
    const int pw = dl(w,l), ph = dl(h,l);

    gauss_expand(output[l+1], output[l], pw, ph);
    // go through all coefficients in the upsampled gauss buffer:
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) collapse(2) shared(w,h,buf,output,l,gamma,padded)
#endif
    for(int j=0;j<ph;j++) for(int i=0;i<pw;i++)
    {
      const float v = padded[l][j*pw+i];
      // find the pair of gamma samples lo/hi that brackets v
      int hi = 1;
      for(;hi<num_gamma-1 && gamma[hi] <= v;hi++);
      int lo = hi-1;
      // interpolation weight between the two bracketing pyramids
      const float a = CLAMPS((v - gamma[lo])/(gamma[hi]-gamma[lo]), 0.0f, 1.0f);
      const float l0 = ll_laplacian(buf[lo][l+1], buf[lo][l], i, j, pw, ph);
      const float l1 = ll_laplacian(buf[hi][l+1], buf[hi][l], i, j, pw, ph);
      output[l][j*pw+i] += l0 * (1.0f-a) + l1 * a;
      // we could do this to save on memory (no need for finest buf[][]).
      // unfortunately it results in a quite noticable loss of sharpness, i think
      // the extra level is worth it.
      // else if(l == 0) // use finest scale from input to not amplify noise (and use less memory)
      //   output[l][j*pw+i] += ll_laplacian(padded[l+1], padded[l], i, j, pw, ph);
    }
  }
  // copy the unpadded centre of the finest output level back to the caller
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(dynamic) collapse(2) shared(w,output,buf)
#endif
  for(int j=0;j<ht;j++) for(int i=0;i<wd;i++)
  {
    out[4*(j*wd+i)+0] = 100.0f * output[0][(j+max_supp)*w+max_supp+i]; // [0,1] -> L
    out[4*(j*wd+i)+1] = input[4*(j*wd+i)+1]; // copy original colour channels
    out[4*(j*wd+i)+2] = input[4*(j*wd+i)+2];
  }
  // free all buffers! (unused levels are still NULL from the initialisers)
  for(int l=0;l<max_levels;l++)
  {
    dt_free_align(padded[l]);
    dt_free_align(output[l]);
    for(int k = 0; k < num_gamma; k++) dt_free_align(buf[k][l]);
  }
  // undefine the macros defined at the top of this function; num_levels is a
  // local variable, the macro that must not leak is max_levels.
#undef max_levels
#undef num_gamma
}

Example #14

0

Show file

File: locallaplacian.c Project: rgo/darktable

// SSE2 variant of the gaussian pyramid reduction: blurs the fine input with a
// separable 1 4 6 4 1 kernel and writes every second pixel to coarse, which
// has (wd-1)/2+1 by (ht-1)/2+1 pixels. only the interior of coarse is
// computed here; ll_fill_boundary1 is called on it at the end.
// NOTE(review): the ring buffer allocation result is not checked.
static inline void gauss_reduce_sse2(
    const float *const input, // fine input buffer
    float *const coarse,      // coarse scale, blurred input buf
    const int wd,             // fine res
    const int ht)
{
  // blur, store only coarse res
  const int cw = (wd-1)/2+1, ch = (ht-1)/2+1;

  // this version is inspired by opencv's pyrDown_ :
  // - allocate 5 rows of ring buffer (aligned)
  // - for coarse res y
  //   - fill 5 coarse-res row buffers with 1 4 6 4 1 weights (reuse some from last time)
  //   - do vertical convolution via sse and write to coarse output buf

  const int stride = ((cw+8)&~7); // assure sse alignment of rows
  float *ringbuf = dt_alloc_align(16, sizeof(*ringbuf)*stride*5);
  float *rows[5] = {0};
  int rowj = 0; // we initialised this many rows so far

  for(int j=1;j<ch-1;j++)
  {
    // horizontal pass, convolve with 1 4 6 4 1 kernel and decimate
    // (fine row rowj lands in ring buffer slot rowj % 5; rows up to 2*j+2 are needed)
    for(;rowj<=2*j+2;rowj++)
    {
      float *const row = ringbuf + (rowj % 5)*stride;
      const float *const in = input + rowj*wd;
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
      for(int i=1;i<cw-1;i++)
        row[i] = 6*in[2*i] + 4*(in[2*i-1]+in[2*i+1]) + in[2*i-2] + in[2*i+2];
    }

    // init row pointers
    // (rows[k] is the ring buffer slot holding fine row 2*j-2+k)
    for(int k=0;k<5;k++)
      rows[k] = ringbuf + ((2*j-2+k)%5)*stride;

    // vertical pass, convolve and decimate using SIMD:
    // note that we're ignoring the (1..cw-1) buffer limit, we'll pull in
    // garbage and fix it later by border filling.
    float *const out = coarse + j*cw;
    const float *const row0 = rows[0], *const row1 = rows[1],
                *const row2 = rows[2], *const row3 = rows[3], *const row4 = rows[4];
    // scale is 1/256 = (1/16)^2, the combined weight of both 1 4 6 4 1 passes
    const __m128 four = _mm_set1_ps(4.f), scale = _mm_set1_ps(1.f/256.f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
    for(int i=0;i<=cw-8;i+=8)
    {
      // two groups of 4 floats per iteration; each computes
      // row0 + row4 + 2*row2 + 4*(row1 + row2 + row3) = 1 4 6 4 1 weights
      __m128 r0, r1, r2, r3, r4, t0, t1;
      r0 = _mm_load_ps(row0 + i);
      r1 = _mm_load_ps(row1 + i);
      r2 = _mm_load_ps(row2 + i);
      r3 = _mm_load_ps(row3 + i);
      r4 = _mm_load_ps(row4 + i);
      r0 = _mm_add_ps(r0, r4);
      r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
      r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
      t0 = _mm_add_ps(r0, _mm_mul_ps(r1, four));

      r0 = _mm_load_ps(row0 + i + 4);
      r1 = _mm_load_ps(row1 + i + 4);
      r2 = _mm_load_ps(row2 + i + 4);
      r3 = _mm_load_ps(row3 + i + 4);
      r4 = _mm_load_ps(row4 + i + 4);
      r0 = _mm_add_ps(r0, r4);
      r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
      r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
      t1 = _mm_add_ps(r0, _mm_mul_ps(r1, four));

      t0 = _mm_mul_ps(t0, scale);
      t1 = _mm_mul_ps(t1, scale);

      // unaligned stores: rows of coarse start at j*cw
      _mm_storeu_ps(out + i, t0);
      _mm_storeu_ps(out + i + 4, t1);
    }
    // process the rest
    // (scalar tail for the columns not covered by the 8-wide loop above)
    for(int i=cw&~7;i<cw-1;i++)
      out[i] = (6*row2[i] + 4*(row1[i] + row3[i]) + row0[i] + row4[i])*(1.0f/256.0f);
  }
  dt_free_align(ringbuf);
  ll_fill_boundary1(coarse, cw, ch);
}

Example #15

0

Show file

File: imageio.c Project: guitorri/darktable

// internal function: to avoid exif blob reading + 8-bit byteorder flag + high-quality override
int dt_imageio_export_with_flags(
  const uint32_t              imgid,
  const char                 *filename,
  dt_imageio_module_format_t *format,
  dt_imageio_module_data_t   *format_params,
  const int32_t               ignore_exif,
  const int32_t               display_byteorder,
  const gboolean              high_quality,
  const int32_t               thumbnail_export,
  const char                 *filter,
  const gboolean              copy_metadata,
  dt_imageio_module_storage_t *storage,
  dt_imageio_module_data_t   *storage_params)
{
  dt_develop_t dev;
  dt_dev_init(&dev, 0);
  dt_mipmap_buffer_t buf;
  if(thumbnail_export && dt_conf_get_bool("plugins/lighttable/low_quality_thumbnails"))
    dt_mipmap_cache_read_get(darktable.mipmap_cache, &buf, imgid, DT_MIPMAP_F, DT_MIPMAP_BLOCKING);
  else
    dt_mipmap_cache_read_get(darktable.mipmap_cache, &buf, imgid, DT_MIPMAP_FULL, DT_MIPMAP_BLOCKING);
  dt_dev_load_image(&dev, imgid);
  const dt_image_t *img = &dev.image_storage;
  const int wd = img->width;
  const int ht = img->height;

  int res = 0;

  dt_times_t start;
  dt_get_times(&start);
  dt_dev_pixelpipe_t pipe;
  res = thumbnail_export ? dt_dev_pixelpipe_init_thumbnail(&pipe, wd, ht) : dt_dev_pixelpipe_init_export(&pipe, wd, ht, format->levels(format_params));
  if(!res)
  {
    dt_control_log(_("failed to allocate memory for %s, please lower the threads used for export or buy more memory."), thumbnail_export ? C_("noun", "thumbnail export") : C_("noun", "export"));
    dt_dev_cleanup(&dev);
    dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
    return 1;
  }

  if(!buf.buf)
  {
    dt_control_log(_("image `%s' is not available!"), img->filename);
    dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
    dt_dev_cleanup(&dev);
    return 1;
  }

  //  If a style is to be applied during export, add the iop params into the history
  if (!thumbnail_export && format_params->style[0] != '\0')
  {
    GList *stls;

    GList *modules = dev.iop;
    dt_iop_module_t *m = NULL;

    if ((stls=dt_styles_get_item_list(format_params->style, TRUE, -1)) == 0)
    {
      dt_control_log(_("cannot find the style '%s' to apply during export."), format_params->style);
      dt_dev_cleanup(&dev);
      dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
      return 1;
    }

    //  Add each params
    while (stls)
    {
      dt_style_item_t *s = (dt_style_item_t *) stls->data;

      modules = dev.iop;
      while (modules)
      {
        m = (dt_iop_module_t *)modules->data;

        //  since the name in the style is returned with a possible multi-name, just check the start of the name
        if (strncmp(m->op, s->name, strlen(m->op)) == 0)
        {
          dt_dev_history_item_t *h = malloc(sizeof(dt_dev_history_item_t));

          h->params = s->params;
          h->blend_params = s->blendop_params;
          h->enabled = s->enabled;
          h->module = m;
          h->multi_priority = 1;
          g_strlcpy(h->multi_name, "", sizeof(h->multi_name));

          if(m->legacy_params && (s->module_version != m->version()))
          {
            void *new_params = malloc(m->params_size);
            m->legacy_params (m, h->params, s->module_version, new_params, labs(m->version()));

            free (h->params);
            h->params = new_params;
          }

          dev.history_end++;
          dev.history = g_list_append(dev.history, h);
          break;
        }
        modules = g_list_next(modules);
      }
      stls = g_list_next(stls);
    }
  }

  dt_dev_pixelpipe_set_input(&pipe, &dev, (float *)buf.buf, buf.width, buf.height, 1.0);
  dt_dev_pixelpipe_create_nodes(&pipe, &dev);
  dt_dev_pixelpipe_synch_all(&pipe, &dev);
  dt_dev_pixelpipe_get_dimensions(&pipe, &dev, pipe.iwidth, pipe.iheight, &pipe.processed_width, &pipe.processed_height);
  if(filter)
  {
    if(!strncmp(filter, "pre:", 4))
      dt_dev_pixelpipe_disable_after(&pipe, filter+4);
    if(!strncmp(filter, "post:", 5))
      dt_dev_pixelpipe_disable_before(&pipe, filter+5);
  }
  dt_show_times(&start, "[export] creating pixelpipe", NULL);

  // find output color profile for this image:
  int sRGB = 1;
  gchar *overprofile = dt_conf_get_string("plugins/lighttable/export/iccprofile");
  if(overprofile && !strcmp(overprofile, "sRGB"))
  {
    sRGB = 1;
  }
  else if(!overprofile || !strcmp(overprofile, "image"))
  {
    GList *modules = dev.iop;
    dt_iop_module_t *colorout = NULL;
    while (modules)
    {
      colorout = (dt_iop_module_t *)modules->data;
      if(colorout->get_p && strcmp(colorout->op, "colorout") == 0)
      {
        const char *iccprofile = colorout->get_p(colorout->params, "iccprofile");
        if(!strcmp(iccprofile, "sRGB")) sRGB = 1;
        else sRGB = 0;
      }
      modules = g_list_next(modules);
    }
  }
  else
  {
    sRGB = 0;
  }
  g_free(overprofile);

  // get only once at the beginning, in case the user changes it on the way:
  const gboolean high_quality_processing = ((format_params->max_width  == 0 || format_params->max_width  >= pipe.processed_width ) &&
      (format_params->max_height == 0 || format_params->max_height >= pipe.processed_height)) ? FALSE :
      high_quality;
  const int width  = high_quality_processing ? 0 : format_params->max_width;
  const int height = high_quality_processing ? 0 : format_params->max_height;
  const double scalex = width  > 0 ? fminf(width /(double)pipe.processed_width,  1.0) : 1.0;
  const double scaley = height > 0 ? fminf(height/(double)pipe.processed_height, 1.0) : 1.0;
  const double scale = fminf(scalex, scaley);
  int processed_width  = scale*pipe.processed_width  + .5f;
  int processed_height = scale*pipe.processed_height + .5f;
  const int bpp = format->bpp(format_params);

  // downsampling done last, if high quality processing was requested:
  uint8_t *outbuf = pipe.backbuf;
  uint8_t *moutbuf = NULL; // keep track of alloc'ed memory
  dt_get_times(&start);
  if(high_quality_processing)
  {
    dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    const double scalex = format_params->max_width  > 0 ? fminf(format_params->max_width /(double)pipe.processed_width,  1.0) : 1.0;
    const double scaley = format_params->max_height > 0 ? fminf(format_params->max_height/(double)pipe.processed_height, 1.0) : 1.0;
    const double scale = fminf(scalex, scaley);
    processed_width  = scale*pipe.processed_width  + .5f;
    processed_height = scale*pipe.processed_height + .5f;
    moutbuf = (uint8_t *)dt_alloc_align(64, (size_t)sizeof(float)*processed_width*processed_height*4);
    outbuf = moutbuf;
    // now downscale into the new buffer:
    dt_iop_roi_t roi_in, roi_out;
    roi_in.x = roi_in.y = roi_out.x = roi_out.y = 0;
    roi_in.scale = 1.0;
    roi_out.scale = scale;
    roi_in.width = pipe.processed_width;
    roi_in.height = pipe.processed_height;
    roi_out.width = processed_width;
    roi_out.height = processed_height;
    dt_iop_clip_and_zoom((float *)outbuf, (float *)pipe.backbuf, &roi_out, &roi_in, processed_width, pipe.processed_width);
  }
  else
  {
    // do the processing (8-bit with special treatment, to make sure we can use openmp further down):
    if(bpp == 8)
      dt_dev_pixelpipe_process(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    else
      dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    outbuf = pipe.backbuf;
  }
  dt_show_times(&start, thumbnail_export ? "[dev_process_thumbnail] pixel pipeline processing" : "[dev_process_export] pixel pipeline processing", NULL);

  // downconversion to low-precision formats:
  if(bpp == 8)
  {
    if(display_byteorder)
    {
      if(high_quality_processing)
      {
        const float *const inbuf = (float *)outbuf;
        for(size_t k=0; k<(size_t)processed_width*processed_height; k++)
        {
          // convert in place, this is unfortunately very serial..
          const uint8_t r = CLAMP(inbuf[4*k+2]*0xff, 0, 0xff);
          const uint8_t g = CLAMP(inbuf[4*k+1]*0xff, 0, 0xff);
          const uint8_t b = CLAMP(inbuf[4*k+0]*0xff, 0, 0xff);
          outbuf[4*k+0] = r;
          outbuf[4*k+1] = g;
          outbuf[4*k+2] = b;
        }
      }
      // else processing output was 8-bit already, and no need to swap order
    }
    else // need to flip 
    {
      // ldr output: char
      if(high_quality_processing)
      {
        const float *const inbuf = (float *)outbuf;
        for(size_t k=0; k<(size_t)processed_width*processed_height; k++)
        {
          // convert in place, this is unfortunately very serial..
          const uint8_t r = CLAMP(inbuf[4*k+0]*0xff, 0, 0xff);
          const uint8_t g = CLAMP(inbuf[4*k+1]*0xff, 0, 0xff);
          const uint8_t b = CLAMP(inbuf[4*k+2]*0xff, 0, 0xff);
          outbuf[4*k+0] = r;
          outbuf[4*k+1] = g;
          outbuf[4*k+2] = b;
        }
      }
      else
      { // !display_byteorder, need to swap:
        uint8_t *const buf8 = pipe.backbuf;
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(processed_width, processed_height) schedule(static)
#endif
        // just flip byte order
        for(size_t k=0; k<(size_t)processed_width*processed_height; k++)
        {
          uint8_t tmp = buf8[4*k+0];
          buf8[4*k+0] = buf8[4*k+2];
          buf8[4*k+2] = tmp;
        }
      }
    }
  }
  else if(bpp == 16)
  {
    // uint16_t per color channel
    float    *buff  = (float *)   outbuf;
    uint16_t *buf16 = (uint16_t *)outbuf;
    for(int y=0; y<processed_height; y++) for(int x=0; x<processed_width ; x++)
      {
        // convert in place
        const size_t k = (size_t)processed_width*y + x;
        for(int i=0; i<3; i++) buf16[4*k+i] = CLAMP(buff[4*k+i]*0x10000, 0, 0xffff);
      }
  }
  // else output float, no further harm done to the pixels :)

  format_params->width  = processed_width;
  format_params->height = processed_height;

  if(!ignore_exif)
  {
    int length;
    uint8_t exif_profile[65535]; // C++ alloc'ed buffer is uncool, so we waste some bits here.
    char pathname[PATH_MAX];
    gboolean from_cache = TRUE;
    dt_image_full_path(imgid, pathname, sizeof(pathname), &from_cache);
    // last param is dng mode, it's false here
    length = dt_exif_read_blob(exif_profile, pathname, imgid, sRGB, processed_width, processed_height, 0);

    res = format->write_image (format_params, filename, outbuf, exif_profile, length, imgid);
  }
  else
  {
    res = format->write_image (format_params, filename, outbuf, NULL, 0, imgid);
  }

  dt_dev_pixelpipe_cleanup(&pipe);
  dt_dev_cleanup(&dev);
  dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
  dt_free_align(moutbuf);
  /* now write xmp into that container, if possible */
  if(copy_metadata && (format->flags(format_params) & FORMAT_FLAGS_SUPPORT_XMP)) {
    dt_exif_xmp_attach(imgid, filename);
    // no need to cancel the export if this fail
  }


  if(!thumbnail_export && strcmp(format->mime(format_params), "memory"))
  {
    dt_control_signal_raise(darktable.signals,DT_SIGNAL_IMAGE_EXPORT_TMPFILE,imgid,filename,format,format_params,storage,storage_params);
  }
  return res;
}

Example #16

0

Show file

File: group.c Project: AdamMajer/darktable

/*
 * Build the mask of a group form for the region `roi` by combining the masks
 * of all forms referenced by the group.
 *
 * For every entry in form->points the referenced form is looked up by id and
 * rendered into a temporary buffer via dt_masks_get_mask_roi(). The temporary
 * mask is optionally inverted (DT_MASKS_STATE_INVERSE) and then merged into
 * `buffer` according to the entry's state: union, intersection, difference,
 * exclusion or - when none of those is set - a plain copy scaled by opacity.
 *
 * `buffer` must hold at least roi->width * roi->height floats; it is cleared
 * before the first shape is combined (but only once the temporary buffer has
 * been allocated successfully).
 *
 * Returns 1 if at least one shape was rendered and combined, 0 otherwise
 * (empty group, allocation failure or no shape could be rendered).
 */
static int dt_group_get_mask_roi(dt_iop_module_t *module, dt_dev_pixelpipe_iop_t *piece,
                                 dt_masks_form_t *form, const dt_iop_roi_t *roi, float *buffer)
{
  double start2 = dt_get_wtime();
  const guint nb = g_list_length(form->points);
  if(nb == 0) return 0;
  int nb_ok = 0;

  const int width = roi->width;
  const int height = roi->height;

  // we need to allocate a temporary buffer for intermediate creation of individual shapes
  float *bufs = dt_alloc_align(64, (size_t)width * height * sizeof(float));
  if(bufs == NULL) return 0;

  // empty the output buffer
  memset(buffer, 0, (size_t)width * height * sizeof(float));

  // and we get all masks
  GList *fpts = g_list_first(form->points);

  while(fpts)
  {
    dt_masks_point_group_t *fpt = (dt_masks_point_group_t *)fpts->data;
    dt_masks_form_t *sel = dt_masks_get_from_id(module->dev, fpt->formid);

    if(sel)
    {
      // render the individual shape into the scratch buffer
      const int ok = dt_masks_get_mask_roi(module, piece, sel, roi, bufs);
      const float op = fpt->opacity;
      const int state = fpt->state;

      if(ok)
      {
        // first see if we need to invert this shape
        if(state & DT_MASKS_STATE_INVERSE)
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs)
#else
#pragma omp parallel for shared(bufs)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              bufs[index] = 1.0f - bufs[index];
            }
        }

        if(state & DT_MASKS_STATE_UNION)
        {
          // union: keep the larger of the accumulated mask and the opacity-scaled shape
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              buffer[index] = fmaxf(buffer[index], bufs[index] * op);
            }
        }
        else if(state & DT_MASKS_STATE_INTERSECTION)
        {
          // intersection: keep the smaller value where both masks are set, clear everything else
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              const float b1 = buffer[index];
              // raw shape value: the "is set" test below deliberately ignores opacity,
              // which is only applied when the two masks are actually merged
              const float b2 = bufs[index];
              if(b1 > 0.0f && b2 > 0.0f)
                buffer[index] = fminf(b1, b2 * op);
              else
                buffer[index] = 0.0f;
            }
        }
        else if(state & DT_MASKS_STATE_DIFFERENCE)
        {
          // difference: attenuate the accumulated mask where the shape is set
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              float b1 = buffer[index];
              float b2 = bufs[index] * op;
              if(b1 > 0.0f && b2 > 0.0f) buffer[index] = b1 * (1.0f - b2);
            }
        }
        else if(state & DT_MASKS_STATE_EXCLUSION)
        {
          // exclusion: where both are set keep the stronger "one without the other" term,
          // elsewhere keep whichever mask is set
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              float b1 = buffer[index];
              float b2 = bufs[index] * op;
              if(b1 > 0.0f && b2 > 0.0f)
                buffer[index] = fmaxf((1.0f - b1) * b2, b1 * (1.0f - b2));
              else
                buffer[index] = fmaxf(b1, b2);
            }
        }
        else // if we are here, this mean that we just have to copy the shape and null other parts
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              buffer[index] = bufs[index] * op;
            }
        }

        if(darktable.unmuted & DT_DEBUG_PERF)
          dt_print(DT_DEBUG_MASKS, "[masks %d] combine took %0.04f sec\n", nb_ok, dt_get_wtime() - start2);
        // restart the timer so the next report covers only the next shape
        start2 = dt_get_wtime();

        nb_ok++;
      }
    }
    fpts = g_list_next(fpts);
  }

  // and we free the intermediate buffer
  dt_free_align(bufs);

  return (nb_ok != 0);
}

Example #17

0

Show file

File: imageio_gm.c Project: hauva69/darktable

// Load `filename` through GraphicsMagick into the full-size mipmap buffer of
// `img` as 4 floats per pixel. The image is read one row at a time and every
// pixel is stored at the position computed by dt_imageio_write_pos() for the
// image's orientation.
//
// Returns DT_IMAGEIO_OK on success, DT_IMAGEIO_FILE_NOT_FOUND when ReadImage()
// yields no image, DT_IMAGEIO_CACHE_FULL when the mipmap buffer cannot be
// allocated and DT_IMAGEIO_FILE_CORRUPTED otherwise (unsupported file, failed
// row buffer allocation, failed row read).
dt_imageio_retval_t
dt_imageio_open_gm(
  dt_image_t *img,
  const char *filename,
  dt_mipmap_cache_allocator_t a)
{
  // stays at "corrupted" unless a more specific failure or success is recorded
  int status = DT_IMAGEIO_FILE_CORRUPTED;
  float *scanline = NULL;
  float *mipbuf = NULL;
  Image *image = NULL;
  ImageInfo *image_info = NULL;
  ExceptionInfo exception;
  uint32_t width, height, orientation;
  int wd2, ht2;

  if(!_supported_image(filename)) return DT_IMAGEIO_FILE_CORRUPTED;

  if(!img->exif_inited)
    (void) dt_exif_read(img, filename);

  GetExceptionInfo(&exception);
  image_info = CloneImageInfo((ImageInfo *) NULL);

  g_strlcpy(image_info->filename, filename, sizeof(image_info->filename));

  image = ReadImage(image_info, &exception);
  if(exception.severity != UndefinedException)
    CatchException(&exception);
  if(image == NULL)
  {
    fprintf(stderr, "[GraphicsMagick_open] image `%s' not found\n", img->filename);
    status = DT_IMAGEIO_FILE_NOT_FOUND;
    goto cleanup;
  }

  width = image->columns;
  height = image->rows;
  orientation = image->orientation;

  // orientation bit 4 set: the stored dimensions are swapped
  img->width = (orientation & 4) ? height : width;
  img->height = (orientation & 4) ? width : height;

  img->bpp = 4*sizeof(float);

  mipbuf = (float *)dt_mipmap_cache_alloc(img, DT_MIPMAP_FULL, a);
  if(mipbuf == NULL)
  {
    fprintf(stderr, "[GraphicsMagick_open] could not alloc full buffer for image `%s'\n", img->filename);
    status = DT_IMAGEIO_CACHE_FULL;
    goto cleanup;
  }

  // one row of pixels as delivered by GraphicsMagick
  scanline = (float *)dt_alloc_align(16, width*img->bpp);
  if(scanline == NULL) goto cleanup;

  // pretend unrotated, rotate in write_pos
  if(orientation & 4)
  {
    ht2 = img->width;
    wd2 = img->height;
  }
  else
  {
    ht2 = img->height;
    wd2 = img->width;
  }

  for(uint32_t y = 0; y < height; y++)
  {
    const int ret = DispatchImage(image, 0, y, width, 1, "RGBP", FloatPixel, (void *)scanline, &exception);
    if(exception.severity != UndefinedException)
      CatchException(&exception);
    if(ret != MagickPass)
    {
      fprintf(stderr, "[GraphicsMagick_open] error reading image `%s'\n", img->filename);
      status = DT_IMAGEIO_FILE_CORRUPTED;
      goto cleanup;
    }

    // scatter the row into the mipmap buffer, channel by channel
    for(uint32_t x = 0; x < width; x++)
    {
      for(int c = 0; c < 4; c++)
        mipbuf[4*dt_imageio_write_pos(x, y, wd2, ht2, wd2, ht2, orientation) + c] = scanline[4*x + c];
    }
  }

  // everything was read: flag the image as plain LDR data
  img->filters = 0;
  img->flags &= ~DT_IMAGE_RAW;
  img->flags &= ~DT_IMAGE_HDR;
  img->flags |= DT_IMAGE_LDR;
  status = DT_IMAGEIO_OK;

cleanup:
  // shared teardown for the success path and every failure path
  if(scanline) dt_free_align(scanline);
  if(image) DestroyImage(image);
  if(image_info) DestroyImageInfo(image_info);
  DestroyExceptionInfo(&exception);
  return status;
}

Example #18

0

Show file

File: mipmap_cache.c Project: pedrocr/darktable

// callback for the cache backend to initialize payload pointers
//
// `data` is the dt_mipmap_cache_t the entry belongs to, `entry` the cache
// entry to set up. If the entry has no payload yet, a buffer is allocated and
// its header (struct dt_mipmap_buffer_dsc) is initialized. For mip levels
// below DT_MIPMAP_F the function then tries to fill the buffer from a jpeg in
// the on-disk cache. Finally the header flags and the entry cost are set.
// The process is terminated via exit(1) if the payload cannot be allocated.
void dt_mipmap_cache_allocate_dynamic(void *data, dt_cache_entry_t *entry)
{
  dt_mipmap_cache_t *cache = (dt_mipmap_cache_t *)data;
  // for full image buffers
  struct dt_mipmap_buffer_dsc *dsc = entry->data;
  const dt_mipmap_size_t mip = get_size(entry->key);
  // alloc mere minimum for the header + broken image buffer:
  if(!dsc)
  {
    if(mip <= DT_MIPMAP_F)
    {
      // these are fixed-size:
      entry->data = dt_alloc_align(16, cache->buffer_size[mip]);
    }
    else
    {
      // header plus room for 64 pixels of 4 floats each
      entry->data = dt_alloc_align(16, sizeof(*dsc) + sizeof(float) * 4 * 64);
    }
    // fprintf(stderr, "[mipmap cache] alloc dynamic for key %u %p\n", key, *buf);
    if(!(entry->data))
    {
      fprintf(stderr, "[mipmap cache] memory allocation failed!\n");
      exit(1);
    }
    // the header lives at the start of the freshly allocated payload
    dsc = entry->data;
    if(mip <= DT_MIPMAP_F)
    {
      // fixed-size levels start out with the maximum dimensions of their level
      dsc->width = cache->max_width[mip];
      dsc->height = cache->max_height[mip];
      dsc->size = cache->buffer_size[mip];
      dsc->color_space = DT_COLORSPACE_NONE;
    }
    else
    {
      // larger levels start empty; size matches the minimal allocation above
      dsc->width = 0;
      dsc->height = 0;
      dsc->color_space = DT_COLORSPACE_NONE;
      dsc->size = sizeof(*dsc) + sizeof(float) * 4 * 64;
    }
  }
  assert(dsc->size >= sizeof(*dsc));

  int loaded_from_disk = 0;
  if(mip < DT_MIPMAP_F)
  {
    // only consult the disk cache when a cache directory is set and the backend is enabled
    if(cache->cachedir[0] && dt_conf_get_bool("cache_disk_backend"))
    {
      // try and load from disk, if successful set flag
      char filename[PATH_MAX] = {0};
      snprintf(filename, sizeof(filename), "%s.d/%d/%d.jpg", cache->cachedir, mip, get_imgid(entry->key));
      FILE *f = fopen(filename, "rb");
      if(f)
      {
        long len = 0;
        uint8_t *blob = 0;
        // determine the file size, then read the whole file into memory
        fseek(f, 0, SEEK_END);
        len = ftell(f);
        if(len <= 0) goto read_error; // coverity madness
        blob = (uint8_t *)malloc(len);
        if(!blob) goto read_error;
        fseek(f, 0, SEEK_SET);
        int rd = fread(blob, sizeof(uint8_t), len, f);
        if(rd != len) goto read_error;
        dt_colorspaces_color_profile_type_t color_space;
        dt_imageio_jpeg_t jpg;
        // reject the file if the header cannot be parsed, the image exceeds the
        // maximum dimensions of this mip level, or decompressing into the
        // payload (right behind the header) fails
        if(dt_imageio_jpeg_decompress_header(blob, len, &jpg)
           || (jpg.width > cache->max_width[mip] || jpg.height > cache->max_height[mip])
           || ((color_space = dt_imageio_jpeg_read_color_space(&jpg)) == DT_COLORSPACE_NONE) // pointless test to keep it in the if clause
           || dt_imageio_jpeg_decompress(&jpg, entry->data + sizeof(*dsc)))
        {
          fprintf(stderr, "[mipmap_cache] failed to decompress thumbnail for image %d from `%s'!\n", get_imgid(entry->key), filename);
          goto read_error;
        }
        dsc->width = jpg.width;
        dsc->height = jpg.height;
        dsc->color_space = color_space;
        loaded_from_disk = 1;
        // the block below is only ever entered through `goto read_error`;
        // the success path skips it and falls through to the common cleanup
        if(0)
        {
read_error:
          // remove the unusable cache file
          g_unlink(filename);
        }
        // common cleanup for success and failure (blob may still be 0 here)
        free(blob);
        fclose(f);
      }
    }
  }

  // anything not loaded from disk is flagged so that it gets generated
  if(!loaded_from_disk)
    dsc->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE;
  else dsc->flags = 0;

  // cost is just flat one for the buffer, as the buffers might have different sizes,
  // to make sure quota is meaningful.
  if(mip >= DT_MIPMAP_F) entry->cost = 1;
  else entry->cost = cache->buffer_size[mip];
}

Example #19

0

Show file

File: imageio.c Project: bgK/darktable

// internal function: to avoid exif blob reading + 8-bit byteorder flag + high-quality override
//
// Runs the export (or thumbnail) pixelpipe for image `imgid`, converts the
// result to the bit depth requested by `format` and hands it to
// format->write_image().
//
//  imgid             image to export
//  filename          passed through to format->write_image()
//  format            output format module (provides bpp() and write_image())
//  format_params     format parameters; max_width/max_height limit the output
//                    size (0 = unlimited), width/height are set to the final size
//  ignore_exif       if non-zero no exif blob is read and NULL/0 is passed on
//  display_byteorder if non-zero, 8-bit output is left as produced by the pipe
//  high_quality      request full-size processing followed by downscaling;
//                    only honoured when the output actually has to be scaled down
//  thumbnail_export  selects the thumbnail pixelpipe instead of the export one
//
// Returns 1 if the pixelpipe or the output buffer cannot be allocated or the
// image data is not available, otherwise the result of format->write_image().
int dt_imageio_export_with_flags(
    const uint32_t              imgid,
    const char                 *filename,
    dt_imageio_module_format_t *format,
    dt_imageio_module_data_t   *format_params,
    const int32_t               ignore_exif,
    const int32_t               display_byteorder,
    const int32_t               high_quality,
    const int32_t               thumbnail_export)
{
  dt_develop_t dev;
  dt_dev_init(&dev, 0);
  dt_mipmap_buffer_t buf;
  dt_mipmap_cache_read_get(darktable.mipmap_cache, &buf, imgid, DT_MIPMAP_FULL, DT_MIPMAP_BLOCKING);
  dt_dev_load_image(&dev, imgid);
  const dt_image_t *img = &dev.image_storage;
  const int wd = img->width;
  const int ht = img->height;

  int res = 0;

  dt_times_t start;
  dt_get_times(&start);
  dt_dev_pixelpipe_t pipe;
  res = thumbnail_export ? dt_dev_pixelpipe_init_thumbnail(&pipe, wd, ht) : dt_dev_pixelpipe_init_export(&pipe, wd, ht);
  if(!res)
  {
    dt_control_log(_("failed to allocate memory for export, please lower the threads used for export or buy more memory."));
    dt_dev_cleanup(&dev);
    if(buf.buf)
      dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
    return 1;
  }

  if(!buf.buf)
  {
    // NOTE(review): the pipe was initialised successfully above but is not
    // cleaned up on this path - confirm whether dt_dev_pixelpipe_cleanup() is needed here.
    dt_control_log(_("image `%s' is not available!"), img->filename);
    dt_dev_cleanup(&dev);
    return 1;
  }

  dt_dev_pixelpipe_set_input(&pipe, &dev, (float *)buf.buf, buf.width, buf.height, 1.0);
  dt_dev_pixelpipe_create_nodes(&pipe, &dev);
  dt_dev_pixelpipe_synch_all(&pipe, &dev);
  dt_dev_pixelpipe_get_dimensions(&pipe, &dev, pipe.iwidth, pipe.iheight, &pipe.processed_width, &pipe.processed_height);
  dt_show_times(&start, "[export] creating pixelpipe", NULL);

  // find output color profile for this image:
  int sRGB = 1;
  gchar *overprofile = dt_conf_get_string("plugins/lighttable/export/iccprofile");
  if(overprofile && !strcmp(overprofile, "sRGB"))
  {
    sRGB = 1;
  }
  else if(!overprofile || !strcmp(overprofile, "image"))
  {
    // no override: take the profile configured in the image's colorout module
    GList *modules = dev.iop;
    dt_iop_module_t *colorout = NULL;
    while (modules)
    {
      colorout = (dt_iop_module_t *)modules->data;
      if (strcmp(colorout->op, "colorout") == 0)
      {
        dt_iop_colorout_params_t *p = (dt_iop_colorout_params_t *)colorout->params;
        if(!strcmp(p->iccprofile, "sRGB")) sRGB = 1;
        else sRGB = 0;
      }
      modules = g_list_next(modules);
    }
  }
  else
  {
    sRGB = 0;
  }
  g_free(overprofile);

  // get only once at the beginning, in case the user changes it on the way:
  // (high quality is pointless if the output does not have to be scaled down)
  const int high_quality_processing = ((format_params->max_width  == 0 || format_params->max_width  >= pipe.processed_width ) &&
                                       (format_params->max_height == 0 || format_params->max_height >= pipe.processed_height)) ? 0 :
                                        high_quality;
  const int width  = high_quality_processing ? 0 : format_params->max_width;
  const int height = high_quality_processing ? 0 : format_params->max_height;
  const float scalex = width  > 0 ? fminf(width /(float)pipe.processed_width,  1.0) : 1.0;
  const float scaley = height > 0 ? fminf(height/(float)pipe.processed_height, 1.0) : 1.0;
  const float scale = fminf(scalex, scaley);
  int processed_width  = scale*pipe.processed_width;
  int processed_height = scale*pipe.processed_height;
  const int bpp = format->bpp(format_params);

  // downsampling done last, if high quality processing was requested:
  uint8_t *outbuf = pipe.backbuf;
  uint8_t *moutbuf = NULL; // keep track of alloc'ed memory
  if(high_quality_processing)
  {
    // process at the pipe scale computed above (width/height limits are disabled in this mode)
    dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    // now compute the scale that is actually needed to honour the size limits
    const float hq_scalex = format_params->max_width  > 0 ? fminf(format_params->max_width /(float)pipe.processed_width,  1.0) : 1.0;
    const float hq_scaley = format_params->max_height > 0 ? fminf(format_params->max_height/(float)pipe.processed_height, 1.0) : 1.0;
    const float hq_scale = fminf(hq_scalex, hq_scaley);
    processed_width  = hq_scale*pipe.processed_width  + .5f;
    processed_height = hq_scale*pipe.processed_height + .5f;
    moutbuf = (uint8_t *)dt_alloc_align(64, sizeof(float) * 4 * (size_t)processed_width * processed_height);
    if(!moutbuf)
    {
      // without the buffer we would downscale into a NULL pointer below
      dt_control_log(_("failed to allocate memory for export, please lower the threads used for export or buy more memory."));
      dt_dev_pixelpipe_cleanup(&pipe);
      dt_dev_cleanup(&dev);
      dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
      return 1;
    }
    outbuf = moutbuf;
    // now downscale into the new buffer:
    dt_iop_roi_t roi_in, roi_out;
    roi_in.x = roi_in.y = roi_out.x = roi_out.y = 0;
    roi_in.scale = 1.0;
    roi_out.scale = hq_scale;
    roi_in.width = pipe.processed_width;
    roi_in.height = pipe.processed_height;
    roi_out.width = processed_width;
    roi_out.height = processed_height;
    dt_iop_clip_and_zoom((float *)outbuf, (float *)pipe.backbuf, &roi_out, &roi_in, processed_width, pipe.processed_width);
  }
  else
  {
    // do the processing (8-bit with special treatment, to make sure we can use openmp further down):
    if(bpp == 8)
      dt_dev_pixelpipe_process(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    else
      dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    outbuf = pipe.backbuf;
  }

  // downconversion to low-precision formats:
  if(bpp == 8 && !display_byteorder)
  {
    // ldr output: char
    if(high_quality_processing)
    {
      const float *const inbuf = (float *)outbuf;
      // size_t index: 4*k must not overflow int for large images
      for(size_t k=0; k<(size_t)processed_width*processed_height; k++)
      {
        // convert in place, this is unfortunately very serial..
        // (all three channels are read before the overlapping bytes are written)
        const uint8_t r = CLAMP(inbuf[4*k+0]*0xff, 0, 0xff);
        const uint8_t g = CLAMP(inbuf[4*k+1]*0xff, 0, 0xff);
        const uint8_t b = CLAMP(inbuf[4*k+2]*0xff, 0, 0xff);
        outbuf[4*k+0] = r;
        outbuf[4*k+1] = g;
        outbuf[4*k+2] = b;
      }
    }
    else
    {
      uint8_t *const buf8 = pipe.backbuf;
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(processed_width, processed_height) schedule(static)
#endif
      // just flip byte order
      for(size_t k=0; k<(size_t)processed_width*processed_height; k++)
      {
        uint8_t tmp = buf8[4*k+0];
        buf8[4*k+0] = buf8[4*k+2];
        buf8[4*k+2] = tmp;
      }
    }
  }
  else if(bpp == 16)
  {
    // uint16_t per color channel
    float    *buff  = (float *)   outbuf;
    uint16_t *buf16 = (uint16_t *)outbuf;
    for(int y=0; y<processed_height; y++) for(int x=0; x<processed_width ; x++)
    {
      // convert in place
      const size_t k = (size_t)processed_width*y + x;
      for(int i=0; i<3; i++) buf16[4*k+i] = CLAMP(buff[4*k+i]*0x10000, 0, 0xffff);
    }
  }
  // else output float, no further harm done to the pixels :)

  format_params->width  = processed_width;
  format_params->height = processed_height;

  if(!ignore_exif)
  {
    int length;
    uint8_t exif_profile[65535]; // C++ alloc'ed buffer is uncool, so we waste some bits here.
    char pathname[1024];
    dt_image_full_path(imgid, pathname, 1024);
    length = dt_exif_read_blob(exif_profile, pathname, sRGB, imgid);

    res = format->write_image (format_params, filename, outbuf, exif_profile, length, imgid);
  }
  else
  {
    res = format->write_image (format_params, filename, outbuf, NULL, 0, imgid);
  }

  dt_dev_pixelpipe_cleanup(&pipe);
  dt_dev_cleanup(&dev);
  dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
  // moutbuf comes from dt_alloc_align() and has to be released with the matching call
  // (it is only non-NULL when the high quality path allocated it)
  if(moutbuf) dt_free_align(moutbuf);
  return res;
}

Example #20

0

Show file

File: main.c Project: SenorNaddy/darktable

// Pre-generate the on-disk thumbnail cache for every image in the database.
//
// Creates one directory per mip level (DT_MIPMAP_0 .. DT_MIPMAP_2) below
// "<cachedir>.d". For each image whose jpegs are not all present yet, the
// thumbnail at the largest of these levels is requested from the mipmap cache
// and the smaller levels are derived from it by rescaling and written out as
// jpegs. Progress is reported on stderr. Returns early (after reporting on
// stderr) if a directory cannot be created or the scratch buffer cannot be
// allocated.
static void generate_thumbnail_cache()
{
  const int max_mip = DT_MIPMAP_2;
  fprintf(stderr, _("creating cache directories\n"));
  char filename[PATH_MAX] = {0};
  for(int k=DT_MIPMAP_0;k<=max_mip;k++)
  {
    snprintf(filename, sizeof(filename), "%s.d/%d", darktable.mipmap_cache->cachedir, k);
    fprintf(stderr, _("creating cache directory '%s'\n"), filename);
    int mkd = g_mkdir_with_parents(filename, 0750);
    if(mkd)
    {
      fprintf(stderr, _("could not create directory '%s'!\n"), filename);
      return;
    }
  }
  // some progress counter
  sqlite3_stmt *stmt;
  uint64_t image_count = 0, counter = 0;
  DT_DEBUG_SQLITE3_PREPARE_V2(dt_database_get(darktable.db), "select count(id) from images", -1, &stmt, 0);
  if(sqlite3_step(stmt) == SQLITE_ROW)
    image_count = sqlite3_column_int(stmt, 0);
  sqlite3_finalize(stmt);

  // go through all images:
  DT_DEBUG_SQLITE3_PREPARE_V2(dt_database_get(darktable.db), "select id from images", -1, &stmt, 0);
  // could only alloc max_mip-1, but would need to detect the special case that max==0.
  const size_t bufsize = (size_t)4 * darktable.mipmap_cache->max_width[max_mip]
                         * darktable.mipmap_cache->max_height[max_mip];
  // scratch buffer receiving the rescaled 8-bit thumbnail
  uint8_t *tmp = (uint8_t *)dt_alloc_align(16, bufsize);
  if(!tmp)
  {
    fprintf(stderr, "couldn't allocate temporary memory!\n");
    sqlite3_finalize(stmt);
    return;
  }
  // jpeg quality from the configuration, limited to the range 10..100
  const int cache_quality = MIN(100, MAX(10, dt_conf_get_int("database_cache_quality")));
  while(sqlite3_step(stmt) == SQLITE_ROW)
  {
    const int32_t imgid = sqlite3_column_int(stmt, 0);
    // check whether all of these files are already there
    int all_exist = 1;
    for(int k=max_mip;k>=DT_MIPMAP_0;k--)
    {
      snprintf(filename, sizeof(filename), "%s.d/%d/%d.jpg", darktable.mipmap_cache->cachedir, k, imgid);
      all_exist &= !access(filename, R_OK);
    }
    if(all_exist) goto next;
    dt_mipmap_buffer_t buf;
    // get largest thumbnail for this image
    // this one will take care of itself, we'll just write out the lower thumbs manually:
    dt_mipmap_cache_get(darktable.mipmap_cache, &buf, imgid, max_mip, DT_MIPMAP_BLOCKING, 'r');
    if(buf.width > 8 && buf.height > 8) // don't create for skulls
    for(int k=max_mip-1;k>=DT_MIPMAP_0;k--)
    {
      uint32_t width, height;
      const int wd = darktable.mipmap_cache->max_width[k];
      const int ht = darktable.mipmap_cache->max_height[k];
      // use exactly the same mechanism as the cache internally to rescale the thumbnail:
      dt_iop_flip_and_zoom_8(buf.buf, buf.width, buf.height, tmp, wd, ht, 0, &width, &height);

      snprintf(filename, sizeof(filename), "%s.d/%d/%d.jpg", darktable.mipmap_cache->cachedir, k, imgid);
      FILE *f = fopen(filename, "wb");
      if(f)
      {
        // allocate temp memory:
        uint8_t *blob = (uint8_t *)malloc(bufsize);
        if(!blob) goto write_error;
        const int32_t length
          = dt_imageio_jpeg_compress(tmp, blob, width, height, cache_quality);
        assert(length <= bufsize);
        int written = fwrite(blob, sizeof(uint8_t), length, f);
        if(written != length)
        {
          // also reached via goto when the blob could not be allocated:
          // drop the incomplete file
write_error:
          unlink(filename);
        }
        free(blob);
        fclose(f);
      }
    }
    dt_mipmap_cache_release(darktable.mipmap_cache, &buf);
next:
    counter ++;
    // uint64_t is not unsigned long on every platform, so cast to match the %lu conversions
    fprintf(stderr, "\rimage %lu/%lu (%.02f%%)            ", (unsigned long)counter, (unsigned long)image_count, 100.0*counter/(float)image_count);
  }
  dt_free_align(tmp);
  sqlite3_finalize(stmt);
  fprintf(stderr, "done                     \n");
}

Example #21

0

Show file

File: colorout.c Project: dirkbr/darktable

/* Convert the Lab input buffer to the output colour space.
 *
 * Two code paths exist:
 *  - if the first entry of the colour matrix is NaN, every row is run through
 *    the lcms2 transform d->xform; in gamut check mode each row is transformed
 *    into a scratch buffer first and pixels with a negative colour component
 *    are replaced by a marker colour,
 *  - otherwise the pixels are converted Lab -> XYZ -> RGB with the 3x3 matrix
 *    and afterwards the per-channel curves (lut, or the extrapolation for
 *    values >= 1.0) are applied in a second pass.
 *
 * When the pipe displays masks, the alpha channel is copied from input to output.
 */
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid,
             const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->mode == DT_PROFILE_GAMUTCHECK);

  if(isnan(d->cmatrix[0]))
  {
    // fprintf(stderr,"Using xform codepath\n");
    const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(ivoid, ovoid, roi_out)
#endif
    for(int k = 0; k < roi_out->height; k++)
    {
      const float *in = ((float *)ivoid) + (size_t)ch * k * roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch * k * roi_out->width;

      if(gamutcheck)
      {
        // transform the row into a scratch buffer so that it can be inspected
        void *rgb = dt_alloc_align(16, 4 * sizeof(float) * roi_out->width);
        cmsDoTransform(d->xform, in, rgb, roi_out->width);
        const float *const scratch = (const float *)rgb;
        for(int col = 0; col < roi_out->width; col++)
        {
          const __m128 pixel = _mm_load_ps(scratch + 4 * (size_t)col);
          // all-ones in every lane whose colour component is negative
          __m128 ingamut = _mm_cmplt_ps(pixel, _mm_set_ps(-FLT_MAX, 0.0f, 0.0f, 0.0f));

          // OR the lanes together so the whole register carries the verdict
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));

          // select the marker colour or the transformed pixel
          const __m128 marker = _mm_and_ps(ingamut, outofgamutpixel);
          const __m128 kept = _mm_andnot_ps(ingamut, pixel);
          const __m128 result = _mm_or_ps(marker, kept);
          _mm_stream_ps(out + 4 * (size_t)col, result);
        }
        dt_free_align(rgb);
      }
      else
      {
        cmsDoTransform(d->xform, in, out, roi_out->width);
      }
    }
    _mm_sfence();
  }
  else
  {
// fprintf(stderr,"Using cmatrix codepath\n");
// convert to rgb using matrix
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in, roi_out, ivoid, ovoid)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      const float *const src_row = (float *)ivoid + (size_t)ch * roi_in->width * j;
      float *const dst_row = (float *)ovoid + (size_t)ch * roi_out->width * j;
      // columns of the 3x3 matrix, one per XYZ component
      const __m128 m0 = _mm_set_ps(0.0f, d->cmatrix[6], d->cmatrix[3], d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f, d->cmatrix[7], d->cmatrix[4], d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f, d->cmatrix[8], d->cmatrix[5], d->cmatrix[2]);

      for(int col = 0; col < roi_out->width; col++)
      {
        const size_t px = (size_t)col * ch;
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(src_row + px));
        const __m128 tx = _mm_mul_ps(m0, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(0, 0, 0, 0)));
        const __m128 ty = _mm_mul_ps(m1, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(1, 1, 1, 1)));
        const __m128 tz = _mm_mul_ps(m2, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(2, 2, 2, 2)));
        // same summation order as before: x + (y + z)
        const __m128 t = _mm_add_ps(tx, _mm_add_ps(ty, tz));

        _mm_stream_ps(dst_row + px, t);
      }
    }
    _mm_sfence();
// apply profile
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in, roi_out, ivoid, ovoid)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;

      for(int col = 0; col < roi_out->width; col++, out += ch)
      {
        for(int c = 0; c < 3; c++)
        {
          // a negative first lut entry means this channel has no curve
          if(!(d->lut[c][0] >= 0.0f)) continue;

          if(out[c] < 1.0f)
            out[c] = lerp_lut(d->lut[c], out[c]);
          else
            out[c] = dt_iop_eval_exp(d->unbounded_coeffs[c], out[c]);
        }
      }
    }
  }

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}

Example #22

0

Show file

File: interpolation.c Project: sk1p/darktable

/** Prepares a 1D resampling plan
 *
 * This consists of the following information:
 * <ul>
 * <li>A list of lengths that tell how many pixels are relevant for the
 *    next output</li>
 * <li>A list of required filter kernels</li>
 * <li>A list of sample indexes</li>
 * </ul>
 *
 * How to apply the resampling plan:
 * <ol>
 * <li>Pick a length from the length array</li>
 * <li>until length is reached
 *     <ol>
 *     <li>pick a kernel tap></li>
 *     <li>pick the relevant sample according to the picked index</li>
 *     <li>multiply them and accumulate</li>
 *     </ol>
 * </li>
 * <li>here goes a single output sample</li>
 * </ol>
 *
 * This until you reach the number of output pixels
 *
 * @param itor interpolator used to resample
 * @param in [in] Number of input samples
 * @param out [in] Number of output samples
 * @param plength [out] Array of lengths for each pixel filtering (number
 * of taps/indexes to use). This array must be freed with free() when you're
 * done with the plan.
 * @param pkernel [out] Array of filter kernel taps
 * @param pindex [out] Array of sample indexes to be used for applying each kernel tap
 * arrays of informations
 * @param pmeta [out] Array of int triplets (length, kernel, index) telling where to start for an arbitrary out position meta[3*out]
 * @return 0 for success, !0 for failure
 */
static int
prepare_resampling_plan(
  const struct dt_interpolation* itor,
  int in,
  const int in_x0,
  int out,
  const int out_x0,
  float scale,
  int** plength,
  float** pkernel,
  int** pindex,
  int** pmeta)
{
  // Safe return values
  *plength = NULL;
  *pkernel = NULL;
  *pindex = NULL;
  if (pmeta) {
	  *pmeta = NULL;
  }

  if (scale == 1.f) {
    // No resampling required
    return 0;
  }

  // Compute common upsampling/downsampling memory requirements
  int maxtapsapixel;
  if (scale > 1.f) {
    // Upscale... the easy one. The values are exact
    maxtapsapixel = 2*itor->width;
  } else {
    // Downscale... going for worst case values memory wise
    maxtapsapixel = ceil_fast((float)2*(float)itor->width/scale);
  }

  int nlengths = out;
  int nindex = maxtapsapixel*out;
  int nkernel = maxtapsapixel*out;
  size_t lengthreq = increase_for_alignment(nlengths*sizeof(int), SSE_ALIGNMENT);
  size_t indexreq = increase_for_alignment(nindex*sizeof(int), SSE_ALIGNMENT);
  size_t kernelreq = increase_for_alignment(nkernel*sizeof(float), SSE_ALIGNMENT);
  size_t scratchreq = maxtapsapixel*sizeof(float) + 4*sizeof(float);
  // NB: because sse versions compute four taps a time
  size_t metareq = pmeta ? 3*sizeof(int)*out : 0;

  void *blob = NULL;
  size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
  blob = dt_alloc_align(SSE_ALIGNMENT, totalreq);
  if (!blob) {
    return 1;
  }

  int* lengths = (int*)blob;
  blob = (char*)blob + lengthreq;
  int* index = (int*)blob;
  blob = (char*)blob + indexreq;
  float* kernel = (float*)blob;
  blob = (char*)blob + kernelreq;
  float* scratchpad = scratchreq ? (float*)blob : NULL;
  blob = (char*)blob + scratchreq;
  int* meta = metareq ? (int*)blob : NULL;
  blob = (char*)blob + metareq;

  /* setting this as a const should help the compilers trim all unecessary
   * codepaths */
  const enum border_mode bordermode = RESAMPLING_BORDER_MODE;

  /* Upscale and downscale differ in subtle points, getting rid of code
   * duplication might have been tricky and i prefer keeping the code
   * as straight as possible */
  if (scale > 1.f) {
    int kidx = 0;
    int iidx = 0;
    int lidx = 0;
    int midx = 0;
    for (int x=0; x<out; x++) {
      if (meta) {
        meta[midx++] = lidx;
        meta[midx++] = kidx;
        meta[midx++] = iidx;
      }

      // Projected position in input samples
      float fx = (float)(out_x0 + x)/scale;

      // Compute the filter kernel at that position
      int first;
      compute_upsampling_kernel_sse(itor, scratchpad, NULL, &first, fx);

      /* Check lower and higher bound pixel index and skip as many pixels as
       * necessary to fall into range */
      int tap_first;
      int tap_last;
      prepare_tap_boundaries(&tap_first, &tap_last, bordermode, 2*itor->width, first, in);

      // Track number of taps that will be used
      lengths[lidx++] = tap_last - tap_first;

      // Precompute the inverse of the norm
      float norm = 0.f;
      for (int tap=tap_first; tap<tap_last; tap++) {
        norm += scratchpad[tap];
      }
      norm = 1.f/norm;

      /* Unlike single pixel or single sample code, here it's interesting to
       * precompute the normalized filter kernel as this will avoid dividing
       * by the norm for all processed samples/pixels
       * NB: use the same loop to put in place the index list */
      first += tap_first;
      for (int tap=tap_first; tap<tap_last; tap++) {
        kernel[kidx++] = scratchpad[tap]*norm;
        index[iidx++] = clip(first++, 0, in-1, bordermode);
      }
    }
  } else {
    int kidx = 0;
    int iidx = 0;
    int lidx = 0;
    int midx = 0;
    for (int x=0; x<out; x++) {
      if (meta) {
        meta[midx++] = lidx;
        meta[midx++] = kidx;
        meta[midx++] = iidx;
      }

      // Compute downsampling kernel centered on output position
      int taps;
      int first;
      compute_downsampling_kernel_sse(itor, &taps, &first, scratchpad, NULL, scale, out_x0 + x);

      /* Check lower and higher bound pixel index and skip as many pixels as
       * necessary to fall into range */
      int tap_first;
      int tap_last;
      prepare_tap_boundaries(&tap_first, &tap_last, bordermode, taps, first, in);

      // Track number of taps that will be used
      lengths[lidx++] = tap_last - tap_first;

      // Precompute the inverse of the norm
      float norm = 0.f;
      for (int tap=tap_first; tap<tap_last; tap++) {
        norm += scratchpad[tap];
      }
      norm = 1.f/norm;

      /* Unlike single pixel or single sample code, here it's interesting to
       * precompute the normalized filter kernel as this will avoid dividing
       * by the norm for all processed samples/pixels
       * NB: use the same loop to put in place the index list */
      first += tap_first;
      for (int tap=tap_first; tap<tap_last; tap++) {
        kernel[kidx++] = scratchpad[tap]*norm;
        index[iidx++] = clip(first++, 0, in-1, bordermode);
      }
    }
  }

  // Validate plan wrt caller
  *plength = lengths;
  *pindex = index;
  *pkernel = kernel;
  if (pmeta) {
    *pmeta = meta;
  }
  return 0;
}

Example #23

0

Show file

File: tiling.c Project: joninvski/darktable-old

/* if a module does not implement process_tiling() by itself, this function is called instead.
   default_process_tiling() is able to handle standard cases where pixels change their values
   but not their places. */
void
default_process_tiling (struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out, const int in_bpp)
{
  void *input = NULL;
  void *output = NULL;

  /* we only care for the most simple cases ATM. else try to process the standard way, i.e. in one chunk. let's hope for the best... */
  if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)))
  {
    dt_print(DT_DEBUG_DEV, "[default_process_tiling] cannot handle requested roi's. fall back to standard method for module '%s'\n", self->op);
    goto fallback;
  }

  const int out_bpp = self->output_bpp(self, piece->pipe, piece);
  const int ipitch = roi_in->width * in_bpp;
  const int opitch = roi_out->width * out_bpp;

  /* get tiling requirements of module */
  dt_develop_tiling_t tiling = { 0 };
  self->tiling_callback(self, piece, roi_in, roi_out, &tiling);

  /* tiling really does not make sense in these cases. standard process() is not better or worse than we are */
  if(tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_out->width * roi_out->height * max(in_bpp, out_bpp))
  {
    dt_print(DT_DEBUG_DEV, "[default_process_tiling] don't use tiling for module '%s'. no real memory saving could be reached\n", self->op);
    goto fallback;
  }

  /* calculate optimal size of tiles */
  float available = dt_conf_get_int("host_memory_limit")*1024*1024;
  assert(available >= 500*1024*1024);
  /* correct for size of ivoid and ovoid which are needed on top of tiling */
  available = max(available - roi_out->width * roi_out->height * (in_bpp + out_bpp) - tiling.overhead, 0);

  /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor.
     this will mainly allow tiling for modules with high and "unpredictable" memory demand which is
     reflected in high values of tiling.factor (take bilateral noise reduction as an example). */
  float singlebuffer = dt_conf_get_int("singlebuffer_limit")*1024*1024;
  singlebuffer = max(singlebuffer, 1024*1024);
  assert(tiling.factor > 1.0f);
  singlebuffer = max(available / tiling.factor, singlebuffer);

  int width = roi_out->width;
  int height = roi_out->height;

  /* shrink tile size in case it would exceed singlebuffer size */
  if(width*height*max(in_bpp, out_bpp) > singlebuffer)
  {
    const float scale = singlebuffer/(width*height*max(in_bpp, out_bpp));

    /* TODO: can we make this more efficient to minimize total overlap between tiles? */
    if(width < height && scale >= 0.333f)
    { 
      height = floorf(height * scale);
    }
    else if(height <= width && scale >= 0.333f)
    {
      width = floorf(width * scale);
    }
    else
    {
      width = floorf(width * sqrt(scale));
      height = floorf(height * sqrt(scale));
    }
  }

  /* make sure we have a reasonably effective tile dimension. if not try square tiles */
  if(3*tiling.overlap > width || 3*tiling.overlap > height)
  {
    width = height = floorf(sqrtf((float)width*height));
  }

#if 0
  /* we might want to grow dimensions a bit */
  width = max(4*tiling.overlap, width);
  height = max(4*tiling.overlap, height);
#endif

  /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
     Modules will report alignment requirements via xalign and yalign within tiling_callback().
     Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
     direction.
     We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
     that is identical to image width/height no special alignment is needed. */

  const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);

  assert(xyalign != 0);

  /* properly align tile width and height by making them smaller if needed */
  if(width < roi_out->width) width = (width / xyalign) * xyalign;
  if(height < roi_out->height) height = (height / xyalign) * xyalign;

  /* also make sure that overlap follows alignment rules by making it wider when needed */
  const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign : tiling.overlap;

  /* calculate effective tile size */
  const int tile_wd = width - 2*overlap > 0 ? width - 2*overlap : 1;
  const int tile_ht = height - 2*overlap > 0 ? height - 2*overlap : 1;

  /* calculate number of tiles */
  const int tiles_x = width < roi_out->width ? ceilf(roi_out->width /(float)tile_wd) : 1;
  const int tiles_y = height < roi_out->height ? ceilf(roi_out->height/(float)tile_ht) : 1;

  /* sanity check: don't run wild on too many tiles */
  if(tiles_x * tiles_y > DT_TILING_MAXTILES)
  {
    dt_print(DT_DEBUG_DEV, "[default_process_tiling] gave up tiling for module '%s'. too many tiles: %d x %d\n", self->op, tiles_x, tiles_y);
    goto error;
  }


  dt_print(DT_DEBUG_DEV, "[default_process_tiling] use tiling on module '%s' for image with full size %d x %d\n", self->op, roi_out->width, roi_out->height);
  dt_print(DT_DEBUG_DEV, "[default_process_tiling] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n", tiles_x, tiles_y, width, height, overlap);

  /* reserve input and output buffers for tiles */
  input = dt_alloc_align(64, width*height*in_bpp);
  if(input == NULL)
  {
    dt_print(DT_DEBUG_DEV, "[default_process_tiling] could not alloc input buffer for module '%s'\n", self->op);
    goto error;
  }
  output = dt_alloc_align(64, width*height*out_bpp);
  if(output == NULL)
  {
    dt_print(DT_DEBUG_DEV, "[default_process_tiling] could not alloc output buffer for module '%s'\n", self->op);
    goto error;
  }

  /* store processed_maximum to be re-used and aggregated */
  float processed_maximum_saved[3];
  float processed_maximum_new[3] = { 1.0f };
  for(int k=0; k<3; k++)
    processed_maximum_saved[k] = piece->pipe->processed_maximum[k];


  /* iterate over tiles */
  for(int tx=0; tx<tiles_x; tx++)
    for(int ty=0; ty<tiles_y; ty++)  
  {
    size_t wd = tx * tile_wd + width > roi_out->width  ? roi_out->width - tx * tile_wd : width;
    size_t ht = ty * tile_ht + height > roi_out->height ? roi_out->height- ty * tile_ht : height;

    /* no need to process end-tiles that are smaller than overlap */
    if((wd <= overlap && tx > 0) || (ht <= overlap && ty > 0)) continue;

    /* origin and region of effective part of tile, which we want to store later */
    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { wd, ht, 1 };

    /* roi_in and roi_out for process_cl on subbuffer */
    dt_iop_roi_t iroi = { 0, 0, wd, ht, roi_in->scale };
    dt_iop_roi_t oroi = { 0, 0, wd, ht, roi_out->scale };

    /* offsets of tile into ivoid and ovoid */
    size_t ioffs = (ty * tile_ht)*ipitch + (tx * tile_wd)*in_bpp;
    size_t ooffs = (ty * tile_ht)*opitch + (tx * tile_wd)*out_bpp;

    dt_print(DT_DEBUG_DEV, "[default_process_tiling] tile (%d, %d) with %d x %d at origin [%d, %d]\n", tx, ty, wd, ht, tx*tile_wd, ty*tile_ht);

    /* prepare input tile buffer */
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(input,width,ivoid,ioffs,wd,ht) schedule(static)
#endif
    for(int j=0; j<ht; j++)
      memcpy((char *)input+j*wd*in_bpp, (char *)ivoid+ioffs+j*ipitch, wd*in_bpp);

    /* take original processed_maximum as starting point */
    for(int k=0; k<3; k++)
      piece->pipe->processed_maximum[k] = processed_maximum_saved[k];

    /* call process() of module */
    self->process(self, piece, input, output, &iroi, &oroi);

    /* aggregate resulting processed_maximum */
    /* TODO: check if there really can be differences between tiles and take
             appropriate action (calculate minimum, maximum, average, ...?) */
    for(int k=0; k<3; k++)
    {
      if(tx+ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->processed_maximum[k]) > 1.0e-6f)
        dt_print(DT_DEBUG_DEV, "[default_process_tiling] processed_maximum[%d] differs between tiles in module '%s'\n", k, self->op);
      processed_maximum_new[k] = piece->pipe->processed_maximum[k];
    }

    /* correct origin and region of tile for overlap.
       make sure that we only copy back the "good" part. */
    if(tx > 0)
    {
      origin[0] += overlap;
      region[0] -= overlap;
      ooffs += overlap*out_bpp;
    }
    if(ty > 0)
    {
      origin[1] += overlap;
      region[1] -= overlap;
      ooffs += overlap*opitch;
    }

    /* copy "good" part of tile to output buffer */
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(ovoid,ooffs,output,width,origin,region,wd) schedule(static)
#endif
    for(int j=0; j<region[1]; j++)
      memcpy((char *)ovoid+ooffs+j*opitch, (char *)output+((j+origin[1])*wd+origin[0])*out_bpp, region[0]*out_bpp);
  }

  /* copy back final processed_maximum */
  for(int k=0; k<3; k++)
    piece->pipe->processed_maximum[k] = processed_maximum_new[k];

  if(input != NULL) free(input);
  if(output != NULL) free(output);
  return;

error:
  if(input != NULL) free(input);
  if(output != NULL) free(output);
  dt_print(DT_DEBUG_DEV, "[default_process_tiling] tiling failed for module '%s'\n", self->op);
  /* TODO: give a warning message to user */
  return;

fallback:
  if(input != NULL) free(input);
  if(output != NULL) free(output);
  dt_print(DT_DEBUG_DEV, "[default_process_tiling] fall back to standard processing for module '%s'\n", self->op);
  self->process(self, piece, ivoid, ovoid, roi_in, roi_out);
  return;

}

Example #24

0

Show file

File: imageio_png.c Project: damienfir/darktable

/* Loads a PNG file into the float mipmap buffer of img.
 *
 * Only files whose last extension starts with ".png" or ".PNG" are accepted.
 * The decoded 8 or 16 bit RGB samples are scaled to [0,1] floats and written
 * to the first three of every four floats in the output buffer; the fourth
 * float of each pixel is not written here.
 *
 * Returns DT_IMAGEIO_OK on success, DT_IMAGEIO_FILE_CORRUPTED when the name,
 * header or image data is rejected, and DT_IMAGEIO_CACHE_FULL when a buffer
 * cannot be allocated. */
dt_imageio_retval_t dt_imageio_open_png(dt_image_t *img, const char *filename, dt_mipmap_buffer_t *mbuf)
{
  /* walk back from the terminating NUL to the last '.' (or the start of the name) */
  const char *suffix = filename + strlen(filename);
  while(*suffix != '.' && suffix > filename) suffix--;

  const int looks_like_png = !strncmp(suffix, ".png", 4) || !strncmp(suffix, ".PNG", 4);
  if(!looks_like_png) return DT_IMAGEIO_FILE_CORRUPTED;

  if(!img->exif_inited) (void)dt_exif_read(img, filename);

  dt_imageio_png_t image;
  if(read_header(filename, &image) != 0) return DT_IMAGEIO_FILE_CORRUPTED;

  /* publish the dimensions on the image and keep local copies */
  const uint32_t width = img->width = image.width;
  const uint32_t height = img->height = image.height;
  const uint16_t bpp = image.bit_depth;

  /* output is four floats per pixel */
  img->bpp = 4 * sizeof(float);

  float *mipbuf = (float *)dt_mipmap_cache_alloc(mbuf, img);
  if(!mipbuf)
  {
    fclose(image.f);
    png_destroy_read_struct(&image.png_ptr, &image.info_ptr, NULL);
    fprintf(stderr, "[png_open] could not alloc full buffer for image `%s'\n", img->filename);
    return DT_IMAGEIO_CACHE_FULL;
  }

  /* intermediate buffer: three samples per pixel, one or two bytes each */
  const size_t npixels = (size_t)width * height;
  uint8_t *buf = dt_alloc_align(16, npixels * 3 * (bpp < 16 ? 1 : 2));
  if(!buf)
  {
    fclose(image.f);
    png_destroy_read_struct(&image.png_ptr, &image.info_ptr, NULL);
    fprintf(stderr, "[png_open] could not alloc intermediate buffer for image `%s'\n", img->filename);
    return DT_IMAGEIO_CACHE_FULL;
  }

  if(read_image(&image, (void *)buf) != 0)
  {
    dt_free_align(buf);
    fprintf(stderr, "[png_open] could not read image `%s'\n", img->filename);
    return DT_IMAGEIO_FILE_CORRUPTED;
  }

  if(bpp < 16)
  {
    /* one byte per sample */
    for(size_t p = 0; p < npixels; p++)
      for(int k = 0; k < 3; k++)
        mipbuf[4 * p + k] = buf[3 * p + k] * (1.0f / 255.0f);
  }
  else
  {
    /* two bytes per sample, high byte first */
    for(size_t p = 0; p < npixels; p++)
      for(int k = 0; k < 3; k++)
      {
        const size_t s = 2 * (3 * p + k);
        mipbuf[4 * p + k] = (256.0f * buf[s] + buf[s + 1]) * (1.0f / 65535.0f);
      }
  }

  dt_free_align(buf);
  return DT_IMAGEIO_OK;
}

Example #25

0

Show file

File: mipmap_cache.c Project: rharrison10/darktable

/* Sets up all mip level caches of the given mipmap cache.
 *
 * Reads "cache_compression", "cache_memory", "worker_threads" and
 * "parallel_export" from the configuration, derives the maximum thumbnail
 * dimensions per mip level, statically allocates the buffers for levels
 * DT_MIPMAP_3 down to 0 (plus scratch memory when compression is enabled),
 * configures DT_MIPMAP_FULL and DT_MIPMAP_F for dynamic allocation and
 * finally calls dt_mipmap_cache_deserialize().
 *
 * NOTE(review): the results of dt_alloc_align() are not checked for NULL
 * before they are handed to dt_cache_static_allocation(). */
void dt_mipmap_cache_init(dt_mipmap_cache_t *cache)
{
  // make sure static memory is initialized
  struct dt_mipmap_buffer_dsc *dsc = (struct dt_mipmap_buffer_dsc *)dt_mipmap_cache_static_dead_image;
  dead_image_f((dt_mipmap_buffer_t *)(dsc+1));

  // compression_type: 0 = none, 1 = low quality (fast), 2 = high quality (slow)
  cache->compression_type = 0;
  gchar *compression = dt_conf_get_string("cache_compression");
  if(compression)
  {
    if(!strcmp(compression, "low quality (fast)"))
      cache->compression_type = 1;
    else if(!strcmp(compression, "high quality (slow)"))
      cache->compression_type = 2;
    g_free(compression);
  }

  dt_print(DT_DEBUG_CACHE, "[mipmap_cache_init] using %s\n", cache->compression_type == 0 ? "no compression" :
           (cache->compression_type == 1 ? "low quality compression" : "slow high quality compression"));

  // adjust numbers to be large enough to hold what mem limit suggests.
  // we want at least 100MB, and consider 8G just still reasonable.
  size_t max_mem = CLAMPS(dt_conf_get_int64("cache_memory"), 100u<<20, ((uint64_t)8)<<30);
  // degree of parallelism, limited to the range 1..8
  const uint32_t parallel = CLAMP(dt_conf_get_int ("worker_threads")*dt_conf_get_int("parallel_export"), 1, 8);
  const int32_t max_size = 2048, min_size = 32;
  int32_t wd = darktable.thumbnail_width;
  int32_t ht = darktable.thumbnail_height;
  wd = CLAMPS(wd, min_size, max_size);
  ht = CLAMPS(ht, min_size, max_size);
  // round up to a multiple of 16 (the mask is 0xf), so the dimensions stay
  // integral when they are repeatedly halved for the smaller mip levels below
  if(wd & 0xf) wd = (wd & ~0xf) + 0x10;
  if(ht & 0xf) ht = (ht & ~0xf) + 0x10;
  // cache these, can't change at runtime:
  cache->mip[DT_MIPMAP_F].max_width  = wd;
  cache->mip[DT_MIPMAP_F].max_height = ht;
  cache->mip[DT_MIPMAP_F-1].max_width  = wd;
  cache->mip[DT_MIPMAP_F-1].max_height = ht;
  // every lower level gets half the dimensions of the level above it
  for(int k=DT_MIPMAP_F-2; k>=DT_MIPMAP_0; k--)
  {
    cache->mip[k].max_width  = cache->mip[k+1].max_width  / 2;
    cache->mip[k].max_height = cache->mip[k+1].max_height / 2;
  }

  // initialize some per-thread cached scratchmem for uncompressed buffers during thumb creation:
  if(cache->compression_type)
  {
    cache->scratchmem.max_width = wd;
    cache->scratchmem.max_height = ht;
    cache->scratchmem.buffer_size = wd*ht*sizeof(uint32_t);
    cache->scratchmem.size = DT_MIPMAP_3; // at max.
    // TODO: use thread local storage instead (zero performance penalty on linux)
    dt_cache_init(&cache->scratchmem.cache, parallel, parallel, 64, 0.9f*parallel*wd*ht*sizeof(uint32_t));
    // might have been rounded to power of two:
    const int cnt = dt_cache_capacity(&cache->scratchmem.cache);
    cache->scratchmem.buf = dt_alloc_align(64, cnt * wd*ht*sizeof(uint32_t));
    dt_cache_static_allocation(&cache->scratchmem.cache, (uint8_t *)cache->scratchmem.buf, wd*ht*sizeof(uint32_t));
    dt_cache_set_allocate_callback(&cache->scratchmem.cache,
                                   scratchmem_allocate, &cache->scratchmem);
    dt_print(DT_DEBUG_CACHE,
             "[mipmap_cache_init] cache has % 5d entries for temporary compression buffers (% 4.02f MB).\n",
             cnt, cnt* wd*ht*sizeof(uint32_t)/(1024.0*1024.0));
  }

  // statically allocated thumbnail levels, largest first; max_mem is the
  // remaining budget and shrinks after each level
  for(int k=DT_MIPMAP_3; k>=0; k--)
  {
    // clear stats:
    cache->mip[k].stats_requests = 0;
    cache->mip[k].stats_near_match = 0;
    cache->mip[k].stats_misses = 0;
    cache->mip[k].stats_fetches = 0;
    cache->mip[k].stats_standin = 0;
    // buffer stores width and height + actual data
    const int width  = cache->mip[k].max_width;
    const int height = cache->mip[k].max_height;
    // header + adjusted for dxt compression:
    cache->mip[k].buffer_size = 4*sizeof(uint32_t) + compressed_buffer_size(cache->compression_type, width, height);
    cache->mip[k].size = k;
    // level of parallelism also gives minimum size (which is twice that)
    // is rounded to a power of two by the cache anyways, we might as well.
    // XXX this needs adjustment for video mode (more full-res thumbs for replay)
    // TODO: collect hit/miss stats and auto-adjust to user browsing behaviour
    // TODO: can #prefetches be collected this way, too?
    // share of the remaining budget for this level: everything that is left
    // for level 0, otherwise 1/(k+4) of it
    const size_t max_mem2 = MAX(0, (k == 0) ? (max_mem) : (max_mem/(k+4)));
    uint32_t thumbnails = MAX(2, nearest_power_of_two((uint32_t)((double)max_mem2/cache->mip[k].buffer_size)));
    // halve the entry count until it fits the share, but never go to or below 'parallel'
    while(thumbnails > parallel && (size_t)thumbnails * cache->mip[k].buffer_size > max_mem2) thumbnails /= 2;

    // try to utilize that memory well (use 90% quota), the hopscotch paper claims good scalability up to
    // even more than that.
    dt_cache_init(&cache->mip[k].cache, thumbnails,
                  parallel,
                  64, 0.9f*thumbnails*cache->mip[k].buffer_size);

    // might have been rounded to power of two:
    thumbnails = dt_cache_capacity(&cache->mip[k].cache);
    // NOTE(review): max_mem is unsigned; presumably the product never exceeds
    // what is left, otherwise this wraps around. confirm.
    max_mem -= thumbnails * cache->mip[k].buffer_size;
    // dt_print(DT_DEBUG_CACHE, "[mipmap mem] %4.02f left\n", max_mem/(1024.0*1024.0));
    cache->mip[k].buf = dt_alloc_align(64, thumbnails * cache->mip[k].buffer_size);
    dt_cache_static_allocation(&cache->mip[k].cache, (uint8_t *)cache->mip[k].buf, cache->mip[k].buffer_size);
    dt_cache_set_allocate_callback(&cache->mip[k].cache,
                                   dt_mipmap_cache_allocate, &cache->mip[k]);
    // dt_cache_set_cleanup_callback(&cache->mip[k].cache,
    // &dt_mipmap_cache_deallocate, &cache->mip[k]);

    dt_print(DT_DEBUG_CACHE,
             "[mipmap_cache_init] cache has % 5d entries for mip %d (% 4.02f MB).\n",
             thumbnails, k, thumbnails * cache->mip[k].buffer_size/(1024.0*1024.0));
  }

  // full buffer needs dynamic alloc:
  const int full_entries = MAX(2, parallel); // even with one thread you want two buffers. one for dr one for thumbs.
  int32_t max_mem_bufs = nearest_power_of_two(full_entries);

  // for this buffer, because it can be very busy during import, we want the minimum
  // number of entries in the hashtable to be 16, but leave the quota as is. the dynamic
  // alloc/free properties of this cache take care that no more memory is required.
  dt_cache_init(&cache->mip[DT_MIPMAP_FULL].cache, max_mem_bufs, parallel, 64, max_mem_bufs);
  dt_cache_set_allocate_callback(&cache->mip[DT_MIPMAP_FULL].cache,
                                 dt_mipmap_cache_allocate_dynamic, &cache->mip[DT_MIPMAP_FULL]);
  // dt_cache_set_cleanup_callback(&cache->mip[DT_MIPMAP_FULL].cache,
  // &dt_mipmap_cache_deallocate_dynamic, &cache->mip[DT_MIPMAP_FULL]);
  // no static buffer for the full level: size 0 and buf NULL
  cache->mip[DT_MIPMAP_FULL].buffer_size = 0;
  cache->mip[DT_MIPMAP_FULL].size = DT_MIPMAP_FULL;
  cache->mip[DT_MIPMAP_FULL].buf = NULL;

  // same for mipf:
  dt_cache_init(&cache->mip[DT_MIPMAP_F].cache, max_mem_bufs, parallel, 64, max_mem_bufs);
  dt_cache_set_allocate_callback(&cache->mip[DT_MIPMAP_F].cache,
                                 dt_mipmap_cache_allocate_dynamic, &cache->mip[DT_MIPMAP_F]);
  dt_cache_set_cleanup_callback(&cache->mip[DT_MIPMAP_F].cache,
                                dt_mipmap_cache_deallocate_dynamic, &cache->mip[DT_MIPMAP_F]);
  // 4 x uint32_t header followed by four floats per pixel at maximum dimensions
  cache->mip[DT_MIPMAP_F].buffer_size = 4*sizeof(uint32_t) +
                                        4*sizeof(float) * cache->mip[DT_MIPMAP_F].max_width * cache->mip[DT_MIPMAP_F].max_height;
  cache->mip[DT_MIPMAP_F].size = DT_MIPMAP_F;
  cache->mip[DT_MIPMAP_F].buf = NULL;

  dt_mipmap_cache_deserialize(cache);
}

Example #26

0

Show file

File: globaltonemap.c Project: a3novy/darktable

/* OpenCL implementation of the global tonemap module.
 *
 * Picks the kernel matching d->operator, and for the drago operator first
 * determines lwmax: either the value stored in the gui data (full pipe with
 * gui attached) or, if that is NaN, a two-stage maximum reduction on the
 * device whose partial results are read back and reduced on the host.
 * When d->detail is non-zero a bilateral grid is splatted from the untouched
 * input before the tonemap kernel runs and sliced onto the output afterwards.
 *
 * Returns TRUE on success. On any failure all device buffers, the bilateral
 * grid and the host reduction buffer are released and FALSE is returned. */
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out,
               const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  dt_iop_global_tonemap_data_t *d = (dt_iop_global_tonemap_data_t *)piece->data;
  dt_iop_global_tonemap_global_data_t *gd = (dt_iop_global_tonemap_global_data_t *)self->data;
  dt_iop_global_tonemap_gui_data_t *g = (dt_iop_global_tonemap_gui_data_t *)self->gui_data;
  dt_bilateral_cl_t *b = NULL;

  cl_int err = -999;
  cl_mem dev_m = NULL;
  cl_mem dev_r = NULL;
  float *maximum = NULL;
  const int devid = piece->pipe->devid;
  int gtkernel = -1;

  const int width = roi_out->width;
  const int height = roi_out->height;
  float parameters[4] = { 0.0f };

  /* NOTE(review): there is no default case, so gtkernel stays -1 if d->operator
     is none of the three values below -- confirm that cannot happen. */
  switch(d->operator)
  {
    case OPERATOR_REINHARD:
      gtkernel = gd->kernel_global_tonemap_reinhard;
      break;
    case OPERATOR_DRAGO:
      gtkernel = gd->kernel_global_tonemap_drago;
      break;
    case OPERATOR_FILMIC:
      gtkernel = gd->kernel_global_tonemap_filmic;
      break;
  }

  if(d->operator== OPERATOR_DRAGO)
  {
    const float eps = 0.0001f;
    float tmp_lwmax = NAN;

    // see comments in process() about lwmax value
    if(self->dev->gui_attached && g && piece->pipe->type == DT_DEV_PIXELPIPE_FULL)
    {
      dt_pthread_mutex_lock(&g->lock);
      const uint64_t hash = g->hash;
      dt_pthread_mutex_unlock(&g->lock);

      if(hash != 0 && !dt_dev_sync_pixelpipe_hash(self->dev, piece->pipe, 0, self->priority, &g->lock, &g->hash))
        dt_control_log(_("inconsistent output"));

      dt_pthread_mutex_lock(&g->lock);
      tmp_lwmax = g->lwmax;
      dt_pthread_mutex_unlock(&g->lock);
    }

    // no usable stored value: compute the maximum on the device
    if(isnan(tmp_lwmax))
    {
      dt_opencl_local_buffer_t flocopt
        = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1,
                                      .cellsize = sizeof(float), .overhead = 0,
                                      .sizex = 1 << 4, .sizey = 1 << 4 };

      if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_first, &flocopt))
        goto error;

      const size_t bwidth = ROUNDUP(width, flocopt.sizex);
      const size_t bheight = ROUNDUP(height, flocopt.sizey);

      // one partial maximum per work group of the first stage
      const int bufsize = (bwidth / flocopt.sizex) * (bheight / flocopt.sizey);

      dt_opencl_local_buffer_t slocopt
        = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1,
                                      .cellsize = sizeof(float), .overhead = 0,
                                      .sizex = 1 << 16, .sizey = 1 };

      if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_second, &slocopt))
        goto error;

      const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex);

      size_t sizes[3];
      size_t local[3];

      dev_m = dt_opencl_alloc_device_buffer(devid, (size_t)bufsize * sizeof(float));
      if(dev_m == NULL) goto error;

      dev_r = dt_opencl_alloc_device_buffer(devid, (size_t)reducesize * sizeof(float));
      if(dev_r == NULL) goto error;

      // first stage: image -> dev_m
      sizes[0] = bwidth;
      sizes[1] = bheight;
      sizes[2] = 1;
      local[0] = flocopt.sizex;
      local[1] = flocopt.sizey;
      local[2] = 1;
      dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 0, sizeof(cl_mem), &dev_in);
      dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 1, sizeof(int), &width);
      dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 2, sizeof(int), &height);
      dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 3, sizeof(cl_mem), &dev_m);
      dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 4, flocopt.sizex * flocopt.sizey * sizeof(float), NULL);
      err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_pixelmax_first, sizes, local);
      if(err != CL_SUCCESS) goto error;

      // second stage: dev_m -> dev_r
      sizes[0] = reducesize * slocopt.sizex;
      sizes[1] = 1;
      sizes[2] = 1;
      local[0] = slocopt.sizex;
      local[1] = 1;
      local[2] = 1;
      dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 0, sizeof(cl_mem), &dev_m);
      dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 1, sizeof(cl_mem), &dev_r);
      dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 2, sizeof(int), &bufsize);
      dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 3, slocopt.sizex * sizeof(float), NULL);
      err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_pixelmax_second, sizes, local);
      if(err != CL_SUCCESS) goto error;

      // host buffer for the partial maxima; bail out instead of reading into NULL
      maximum = dt_alloc_align(16, (size_t)reducesize * sizeof(float));
      if(maximum == NULL) goto error;

      err = dt_opencl_read_buffer_from_device(devid, (void *)maximum, dev_r, 0,
                                            (size_t)reducesize * sizeof(float), CL_TRUE);
      if(err != CL_SUCCESS) goto error;

      dt_opencl_release_mem_object(dev_r);
      dt_opencl_release_mem_object(dev_m);
      dev_r = dev_m = NULL;

      // final reduction on the host: fold everything into maximum[0]
      for(int k = 1; k < reducesize; k++)
      {
        float mine = maximum[0];
        float other = maximum[k];
        maximum[0] = (other > mine) ? other : mine;
      }

      tmp_lwmax = MAX(eps, (maximum[0] * 0.01f));

      dt_free_align(maximum);
      maximum = NULL;
    }

    const float lwmax = tmp_lwmax;
    const float ldc = d->drago.max_light * 0.01f / log10f(lwmax + 1.0f);
    const float bl = logf(MAX(eps, d->drago.bias)) / logf(0.5f);

    parameters[0] = eps;
    parameters[1] = ldc;
    parameters[2] = bl;
    parameters[3] = lwmax;

    // the preview pipe publishes its lwmax (and the matching hash) to the gui data
    if(self->dev->gui_attached && g && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW)
    {
      uint64_t hash = dt_dev_hash_plus(self->dev, piece->pipe, 0, self->priority);
      dt_pthread_mutex_lock(&g->lock);
      g->lwmax = lwmax;
      g->hash = hash;
      dt_pthread_mutex_unlock(&g->lock);
    }
  }

  const float scale = piece->iscale / roi_in->scale;
  const float sigma_r = 8.0f; // does not depend on scale
  const float iw = piece->buf_in.width / scale;
  const float ih = piece->buf_in.height / scale;
  const float sigma_s = fminf(iw, ih) * 0.03f;

  if(d->detail != 0.0f)
  {
    b = dt_bilateral_init_cl(devid, roi_in->width, roi_in->height, sigma_s, sigma_r);
    if(!b) goto error;
    // get detail from unchanged input buffer
    err = dt_bilateral_splat_cl(b, dev_in);
    if(err != CL_SUCCESS) goto error;
  }

  size_t sizes[2] = { ROUNDUPWD(width), ROUNDUPHT(height) };
  dt_opencl_set_kernel_arg(devid, gtkernel, 0, sizeof(cl_mem), &dev_in);
  dt_opencl_set_kernel_arg(devid, gtkernel, 1, sizeof(cl_mem), &dev_out);
  dt_opencl_set_kernel_arg(devid, gtkernel, 2, sizeof(int), &width);
  dt_opencl_set_kernel_arg(devid, gtkernel, 3, sizeof(int), &height);
  dt_opencl_set_kernel_arg(devid, gtkernel, 4, 4 * sizeof(float), &parameters);
  err = dt_opencl_enqueue_kernel_2d(devid, gtkernel, sizes);
  if(err != CL_SUCCESS) goto error;

  if(d->detail != 0.0f)
  {
    err = dt_bilateral_blur_cl(b);
    if(err != CL_SUCCESS) goto error;
    // and apply it to output buffer after logscale
    err = dt_bilateral_slice_to_output_cl(b, dev_in, dev_out, d->detail);
    if(err != CL_SUCCESS) goto error;
    dt_bilateral_free_cl(b);
  }

  return TRUE;

error:
  // shared cleanup; pointers that were never set are still NULL here
  if(b) dt_bilateral_free_cl(b);
  dt_opencl_release_mem_object(dev_m);
  dt_opencl_release_mem_object(dev_r);
  dt_free_align(maximum);
  dt_print(DT_DEBUG_OPENCL, "[opencl_global_tonemap] couldn't enqueue kernel! %d\n", err);
  return FALSE;
}
#endif


/**
 * Report the memory requirements of this module to the tiling code.
 *
 * All ratios are expressed relative to `basebuffer`, the byte size of one
 * roi_in-sized float buffer with piece->colors channels. When the detail
 * setting is non-zero the bilateral filter's memory use is added on top and an
 * overlap of ceil(4 * sigma_s) pixels is requested; otherwise no overlap is
 * needed.
 *
 * sigma_s / sigma_r are derived here with the same formula that is used when
 * the bilateral grid is created for processing, so both must stay in sync.
 *
 * @param self     module instance (not used here)
 * @param piece    pixelpipe piece; supplies data, iscale, buf_in and colors
 * @param roi_in   input region of interest (width, height, scale are read)
 * @param roi_out  output region of interest (not used here)
 * @param tiling   filled in with factor, maxbuf, overhead, overlap and alignment
 */
void tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
                     const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out,
                     struct dt_develop_tiling_t *tiling)
{
  dt_iop_global_tonemap_data_t *d = (dt_iop_global_tonemap_data_t *)piece->data;

  const float scale = piece->iscale / roi_in->scale;
  const float iw = piece->buf_in.width / scale;
  const float ih = piece->buf_in.height / scale;
  const float sigma_s = fminf(iw, ih) * 0.03f;
  const float sigma_r = 8.0f;
  const int detail = (d->detail != 0.0f);

  const int width = roi_in->width;
  const int height = roi_in->height;
  const int channels = piece->colors;

  // sizeof() comes first so the whole product is evaluated in size_t; with
  // three ints in front the multiplication could overflow int for large images.
  const size_t basebuffer = sizeof(float) * channels * width * height;

  // 2.0 covers the base buffers (presumably input + output -- confirm against
  // the tiling code); the bilateral grid is added only when detail is enabled.
  tiling->factor = 2.0f + (detail ? (float)dt_bilateral_memory_use2(width, height, sigma_s, sigma_r) / basebuffer : 0.0f);
  tiling->maxbuf
      = (detail ? MAX(1.0f, (float)dt_bilateral_singlebuffer_size2(width, height, sigma_s, sigma_r) / basebuffer) : 1.0f);
  tiling->overhead = 0;
  tiling->overlap = (detail ? ceilf(4 * sigma_s) : 0);
  tiling->xalign = 1;
  tiling->yalign = 1;
}

/**
 * Copy the user-facing parameters into the per-piece processing data and
 * adjust the piece's capability flags accordingly.
 *
 * @param self   module instance (not used here)
 * @param p1     parameter blob, interpreted as dt_iop_global_tonemap_params_t
 * @param pipe   owning pixelpipe (not used here)
 * @param piece  pixelpipe piece whose data and readiness flags are updated
 */
void commit_params(struct dt_iop_module_t *self, dt_iop_params_t *p1, dt_dev_pixelpipe_t *pipe,
                   dt_dev_pixelpipe_iop_t *piece)
{
  const dt_iop_global_tonemap_params_t *const params = (const dt_iop_global_tonemap_params_t *)p1;
  dt_iop_global_tonemap_data_t *const data = (dt_iop_global_tonemap_data_t *)piece->data;

  data->operator = params->operator;
  data->drago.bias = params->drago.bias;
  data->drago.max_light = params->drago.max_light;
  data->detail = params->detail;

  // the drago operator depends on the maximum L value of the complete image,
  // which a single tile cannot provide, so tiling is switched off for it
  if(data->operator == OPERATOR_DRAGO)
  {
    piece->process_tiling_ready = 0;
  }

#ifdef HAVE_OPENCL
  // with detail enabled the OpenCL path stays available only if it was
  // available before and the device is not flagged with avoid_atomics
  if(data->detail != 0.0f)
  {
    if(!piece->process_cl_ready || darktable.opencl->avoid_atomics)
      piece->process_cl_ready = 0;
    else
      piece->process_cl_ready = 1;
  }
#endif
}

Example #27

0

Show file

File: cache.c Project: CChiappa/darktable

// look up the entry stored under `key` and return it locked for the caller.
// if no such entry exists, a new hash table entry is created (and its data
// allocated), which can be found using the given key later on.
//
// locking: on a hit the entry is write locked if mode == 'w' and read locked
// for any other mode. on a miss the new entry is write locked if mode == 'w'
// or if the cache has an allocate callback, read locked otherwise.
// `file` and `line` are only forwarded to the *_with_caller lock functions.
//
// every visible path returns an entry pointer; there is no failure return.
// the entry's data region is ASAN-poisoned for write locks and for new
// entries, and the caller is responsible for unpoisoning it.
dt_cache_entry_t *dt_cache_get_with_caller(dt_cache_t *cache, const uint32_t key, char mode, const char *file, int line)
{
  gpointer orig_key, value;
  gboolean res;
  int result;
  // timestamp for the "wait time" report at the end of the miss path
  double start = dt_get_wtime();
restart:
  // cache->lock protects the hash table, the lru list and the cost counter
  dt_pthread_mutex_lock(&cache->lock);
  res = g_hash_table_lookup_extended(
      cache->hashtable, GINT_TO_POINTER(key), &orig_key, &value);
  if(res)
  { // yay, found. read lock and pass on.
    dt_cache_entry_t *entry = (dt_cache_entry_t *)value;
    // only *try* to lock: blocking here would stall everybody else on cache->lock
    if(mode == 'w') result = dt_pthread_rwlock_trywrlock_with_caller(&entry->lock, file, line);
    else            result = dt_pthread_rwlock_tryrdlock_with_caller(&entry->lock, file, line);
    // a non-zero result is treated as "lock not acquired"
    if(result)
    { // need to give up mutex so other threads have a chance to get in between and
      // free the lock we're trying to acquire:
      dt_pthread_mutex_unlock(&cache->lock);
      g_usleep(5);
      goto restart;
    }
    // bubble up in lru list: unlink the entry's node and re-attach it at the
    // tail (the tail is the most recently used end, see the miss path below)
    cache->lru = g_list_remove_link(cache->lru, entry->link);
    cache->lru = g_list_concat(cache->lru, entry->link);
    dt_pthread_mutex_unlock(&cache->lock);

#ifdef _DEBUG
    // debug builds only: a write lock must be owned by this thread, and with a
    // read lock this thread must not be registered as the writer
    const pthread_t writer = dt_pthread_rwlock_get_writer(&entry->lock);
    if(mode == 'w')
    {
      assert(pthread_equal(writer, pthread_self()));
    }
    else
    {
      assert(!pthread_equal(writer, pthread_self()));
    }
#endif

    if(mode == 'w')
    {
      assert(entry->data_size);
      ASAN_POISON_MEMORY_REGION(entry->data, entry->data_size);
    }

    // WARNING: do *NOT* unpoison here. it must be done by the caller!

    // note: the hit path returns here, so the wait time report below is
    // only ever printed for misses.
    return entry;
  }

  // else, not found, need to allocate.
  // cache->lock is still held from here until the unlock near the end.

  // first try to clean up.
  // also wait if we can't free more than the requested fill ratio.
  if(cache->cost > 0.8f * cache->cost_quota)
  {
    // need to roll back all the way to get a consistent lock state:
    // NOTE(review): dt_cache_gc() is called with cache->lock held -- confirm
    // that it does not try to take the same mutex again.
    dt_cache_gc(cache, 0.8f);
  }

  // here dies your 32-bit system:
  dt_cache_entry_t *entry = (dt_cache_entry_t *)g_slice_alloc(sizeof(dt_cache_entry_t));
  int ret = dt_pthread_rwlock_init(&entry->lock, 0);
  if(ret) fprintf(stderr, "rwlock init: %d\n", ret);
  entry->data = 0;
  entry->data_size = cache->entry_size;
  entry->cost = 1;
  // one-element list node owned by the entry; it gets spliced into cache->lru below
  entry->link = g_list_append(0, entry);
  entry->key = key;
  entry->_lock_demoting = 0;

  g_hash_table_insert(cache->hashtable, GINT_TO_POINTER(key), entry);

  // without an allocate callback the default entry size must be non-zero
  assert(cache->allocate || entry->data_size);

  if(cache->allocate)
    cache->allocate(cache->allocate_data, entry);
  else
    // NOTE(review): the result of dt_alloc_align() is not checked for NULL
    entry->data = dt_alloc_align(16, entry->data_size);

  // checked again after allocation, so an allocate callback is expected to
  // leave data_size non-zero as well
  assert(entry->data_size);
  ASAN_POISON_MEMORY_REGION(entry->data, entry->data_size);

  // if allocate callback is given, always return a write lock
  const int write = ((mode == 'w') || cache->allocate);

  // write lock in case the caller requests it:
  if(write) dt_pthread_rwlock_wrlock_with_caller(&entry->lock, file, line);
  else      dt_pthread_rwlock_rdlock_with_caller(&entry->lock, file, line);

  cache->cost += entry->cost;

  // put at end of lru list (most recently used):
  cache->lru = g_list_concat(cache->lru, entry->link);

  dt_pthread_mutex_unlock(&cache->lock);
  // report calls that took longer than 0.1s in total (retries included)
  double end = dt_get_wtime();
  if(end - start > 0.1)
    fprintf(stderr, "wait time %.06fs\n", end - start);

  // WARNING: do *NOT* unpoison here. it must be done by the caller!

  return entry;
}

Example #28

0

Show file

File: nlmeans.c Project: joninvski/darktable

/**
 * process, all real work is done here: non-local means denoising (CPU/SSE path).
 *
 * For every shift vector (ki, kj) with |ki|, |kj| <= K the squared, per-channel
 * weighted difference between the image and its shifted copy is summed over a
 * patch using sliding windows: S holds per-column sums over the patch rows,
 * `slide` is the running sum of S over the patch columns. gh(slide) is then
 * used as the weight with which the shifted pixel is accumulated into the
 * output. Channel 3 of the output accumulates the weights themselves and is
 * used to normalise the result in the final pass, which also blends the
 * denoised value with the input according to d->luma and d->chroma.
 *
 * If the zoom level makes the patch size P <= 1, or if the temporary buffer
 * cannot be allocated, the input is copied to the output unchanged.
 *
 * @param self     module instance (not used here)
 * @param piece    pixelpipe piece; piece->data holds luma/chroma, piece->iscale the scale
 * @param ivoid    input buffer, accessed as 4 floats per pixel with aligned SSE loads
 * @param ovoid    output buffer, 4 floats per pixel, accessed with aligned SSE loads/stores
 * @param roi_in   input region of interest (width and scale are read)
 * @param roi_out  output region of interest (width and height are read)
 */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  // this is called for preview and full pipe separately, each with its own pixelpipe piece.
  // get our data struct:
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;

  // adjust to zoom size:
  const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size
  const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood
  if(P <= 1)
  {
    // nothing to do from this distance:
    memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);
    return;
  }

  // adjust to Lab, make L more important
  // float max_L = 100.0f, max_C = 256.0f;
  // float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
  float max_L = 120.0f, max_C = 512.0f;
  float nL = 1.0f/max_L, nC = 1.0f/max_C;
  const float norm2[4] = { nL*nL, nC*nC, nC*nC, 1.0f };

  // one line of column sums per thread
  float *Sa = dt_alloc_align(64, sizeof(float)*roi_out->width*dt_get_num_threads());
  if(!Sa)
  {
    // out of memory: pass the image through unchanged instead of
    // dereferencing a null pointer in the loops below
    memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);
    return;
  }
  // we want to sum up weights in col[3], so need to init to 0:
  memset(ovoid, 0x0, sizeof(float)*roi_out->width*roi_out->height*4);

  // for each shift vector
  for(int kj=-K;kj<=K;kj++)
  {
    for(int ki=-K;ki<=K;ki++)
    {
      int inited_slide = 0;
      // don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else we will add up errors)
      // do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more memory
#ifdef _OPENMP
#  pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, ivoid, ovoid, Sa)
#endif
      for(int j=0; j<roi_out->height; j++)
      {
        // skip rows whose shifted counterpart lies outside the image
        if(j+kj < 0 || j+kj >= roi_out->height) continue;
        float *S = Sa + dt_get_thread_num() * roi_out->width;
        const float *ins = ((float *)ivoid) + 4*(roi_in->width *(j+kj) + ki);
        float *out = ((float *)ovoid) + 4*roi_out->width*j;

        // patch extent above (Pm) and below (PM) row j, clamped so that both
        // the row and its shifted counterpart stay inside the image
        const int Pm = MIN(MIN(P, j+kj), j);
        const int PM = MIN(MIN(P, roi_out->height-1-j-kj), roi_out->height-1-j);
        // first line of every thread
        // TODO: also every once in a while to assert numerical precision!
        if(!inited_slide)
        {
          // sum up a line
          memset(S, 0x0, sizeof(float)*roi_out->width);
          for(int jj=-Pm;jj<=PM;jj++)
          {
            int i = MAX(0, -ki);
            float *s = S + i;
            const float *inp  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+jj);
            const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+jj+kj) + ki);
            const int last = roi_out->width + MIN(0, -ki);
            for(; i<last; i++, inp+=4, inps+=4, s++)
            {
              for(int k=0;k<3;k++)
                s[0] += (inp[k] - inps[k])*(inp[k] - inps[k]) * norm2[k];
            }
          }
          // only reuse this if we had a full stripe
          if(Pm == P && PM == P) inited_slide = 1;
        }

        // sliding window for this line:
        float *s = S;
        float slide = 0.0f;
        // sum up the first -P..P
        for(int i=0;i<2*P+1;i++) slide += s[i];
        for(int i=0; i<roi_out->width; i++)
        {
          // advance the window by one column once it is fully inside the line
          if(i-P > 0 && i+P<roi_out->width)
            slide += s[P] - s[-P-1];
          if(i+ki >= 0 && i+ki < roi_out->width)
          {
            // accumulate weighted shifted pixel; channel 3 collects the weight itself
            const __m128 iv = { ins[0], ins[1], ins[2], 1.0f };
            _mm_store_ps(out, _mm_load_ps(out) + iv * _mm_set1_ps(gh(slide)));
          }
          s   ++;
          ins += 4;
          out += 4;
        }
        if(inited_slide && j+P+1+MAX(0,kj) < roi_out->height)
        {
          // sliding window in j direction:
          // add the row entering the patch (j+P+1), remove the one leaving it (j-P)
          int i = MAX(0, -ki);
          float *s = S + i;
          const float *inp  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+P+1);
          const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+P+1+kj) + ki);
          const float *inm  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j-P);
          const float *inms = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j-P+kj) + ki);
          const int last = roi_out->width + MIN(0, -ki);
          // scalar prologue until s is 16-byte aligned for the aligned SSE loads below
          for(; ((unsigned long)s & 0xf) != 0 && i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0;k<3;k++)
              stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
                    -  (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
          /* Process most of the line 4 pixels at a time */
          for(; i<last-4; i+=4, inp+=16, inps+=16, inm+=16, inms+=16, s+=4)
          {
            __m128 sv = _mm_load_ps(s);
            const __m128 inp1 = _mm_load_ps(inp)    - _mm_load_ps(inps);
            const __m128 inp2 = _mm_load_ps(inp+4)  - _mm_load_ps(inps+4);
            const __m128 inp3 = _mm_load_ps(inp+8)  - _mm_load_ps(inps+8);
            const __m128 inp4 = _mm_load_ps(inp+12) - _mm_load_ps(inps+12);

            // transpose the four pixel differences into per-channel vectors
            const __m128 inp12lo = _mm_unpacklo_ps(inp1,inp2);
            const __m128 inp34lo = _mm_unpacklo_ps(inp3,inp4);
            const __m128 inp12hi = _mm_unpackhi_ps(inp1,inp2);
            const __m128 inp34hi = _mm_unpackhi_ps(inp3,inp4);

            const __m128 inpv0 = _mm_movelh_ps(inp12lo,inp34lo);
            sv += inpv0*inpv0 * _mm_set1_ps(norm2[0]);

            const __m128 inpv1 = _mm_movehl_ps(inp34lo,inp12lo);
            sv += inpv1*inpv1 * _mm_set1_ps(norm2[1]);

            const __m128 inpv2 = _mm_movelh_ps(inp12hi,inp34hi);
            sv += inpv2*inpv2 * _mm_set1_ps(norm2[2]);

            const __m128 inm1 = _mm_load_ps(inm)    - _mm_load_ps(inms);
            const __m128 inm2 = _mm_load_ps(inm+4)  - _mm_load_ps(inms+4);
            const __m128 inm3 = _mm_load_ps(inm+8)  - _mm_load_ps(inms+8);
            const __m128 inm4 = _mm_load_ps(inm+12) - _mm_load_ps(inms+12);

            const __m128 inm12lo = _mm_unpacklo_ps(inm1,inm2);
            const __m128 inm34lo = _mm_unpacklo_ps(inm3,inm4);
            const __m128 inm12hi = _mm_unpackhi_ps(inm1,inm2);
            const __m128 inm34hi = _mm_unpackhi_ps(inm3,inm4);

            const __m128 inmv0 = _mm_movelh_ps(inm12lo,inm34lo);
            sv -= inmv0*inmv0 * _mm_set1_ps(norm2[0]);

            const __m128 inmv1 = _mm_movehl_ps(inm34lo,inm12lo);
            sv -= inmv1*inmv1 * _mm_set1_ps(norm2[1]);

            const __m128 inmv2 = _mm_movelh_ps(inm12hi,inm34hi);
            sv -= inmv2*inmv2 * _mm_set1_ps(norm2[2]);

            _mm_store_ps(s, sv);
          }
          // scalar epilogue for the remaining pixels of the line
          for(; i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0;k<3;k++)
              stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
                    -  (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
        }
        else inited_slide = 0;
      }
    }
  }
  // normalize and apply chroma/luma blending
  // bias a bit towards higher values for low input values:
  const __m128 weight = _mm_set_ps(1.0f, powf(d->chroma, 0.6), powf(d->chroma, 0.6), powf(d->luma, 0.6));
  const __m128 invert = _mm_sub_ps(_mm_set1_ps(1.0f), weight);
#ifdef _OPENMP
  #pragma omp parallel for default(none) schedule(static) shared(ovoid,ivoid,roi_out,d)
#endif
  for(int j=0; j<roi_out->height; j++)
  {
    float *out = ((float *)ovoid) + 4*roi_out->width*j;
    float *in  = ((float *)ivoid) + 4*roi_out->width*j;
    for(int i=0; i<roi_out->width; i++)
    {
      // out = in * (1 - weight) + (accumulated / weight sum) * weight
      _mm_store_ps(out, _mm_add_ps(
          _mm_mul_ps(_mm_load_ps(in),  invert),
          _mm_mul_ps(_mm_load_ps(out), _mm_div_ps(weight, _mm_set1_ps(out[3])))));
      out += 4;
      in  += 4;
    }
  }
  // free shared tmp memory; it came from dt_alloc_align(), so it has to be
  // released with the matching dt_free_align() rather than plain free():
  dt_free_align(Sa);
}