int write_image(dt_imageio_module_data_t *data, const char *filename, const void *ivoid, void *exif,
                int exif_len, int imgid, int num, int total)
{
  const dt_imageio_module_data_t *const pfm = data;
  int status = 0;
  FILE *f = fopen(filename, "wb");
  if(f)
  {
    // INFO: per-line fwrite call seems to perform best. LebedevRI, 18.04.2014
    (void)fprintf(f, "PF\n%d %d\n-1.0\n", pfm->width, pfm->height);
    void *buf_line = dt_alloc_align(16, 3 * sizeof(float) * pfm->width);
    if(buf_line)
    {
      for(int j = 0; j < pfm->height; j++)
      {
        // NOTE: pfm has rows in reverse order
        const int row_in = pfm->height - 1 - j;
        const float *in = (const float *)ivoid + 4 * (size_t)pfm->width * row_in;
        float *out = (float *)buf_line;
        for(int i = 0; i < pfm->width; i++, in += 4, out += 3)
        {
          memcpy(out, in, 3 * sizeof(float));
        }
        int cnt = fwrite(buf_line, 3 * sizeof(float), pfm->width, f);
        // only ever set the error flag: a single failed row must not be
        // masked by later rows that happen to write fine
        if(cnt != pfm->width) status = 1;
      }
    }
    else
      status = 1;
    dt_free_align(buf_line);
    buf_line = NULL;
    fclose(f);
  }
  else
    status = 1; // could not open the file: report failure instead of silent success
  return status;
}
int dt_dev_pixelpipe_cache_init(dt_dev_pixelpipe_cache_t *cache, int entries, size_t size)
{
  cache->entries = entries;
  cache->data = (void **)calloc(entries, sizeof(void *));
  cache->size = (size_t *)calloc(entries, sizeof(size_t));
  cache->hash = (uint64_t *)calloc(entries, sizeof(uint64_t));
  cache->used = (int32_t *)calloc(entries, sizeof(int32_t));
  for(int k = 0; k < entries; k++)
  {
    cache->data[k] = (void *)dt_alloc_align(16, size);
    if(!cache->data[k]) goto alloc_memory_fail;
    cache->size[k] = size;
#ifdef _DEBUG
    memset(cache->data[k], 0x5d, size);
#endif
    cache->hash[k] = -1;
    cache->used[k] = 0;
  }
  cache->queries = cache->misses = 0;
  return 1;

alloc_memory_fail:
  for(int k = 0; k < entries; k++)
  {
    if(cache->data[k]) dt_free_align(cache->data[k]);
  }
  free(cache->data);
  free(cache->size);
  free(cache->hash);
  free(cache->used);
  return 0;
}
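// A minimal usage sketch (not from the source) for the cache above: the
// entry count and buffer size are hypothetical, and the single check relies
// on init returning 0 when one of the aligned allocations failed (in which
// case init has already released everything it grabbed).
static int setup_cache_example(dt_dev_pixelpipe_cache_t *cache, int width, int height)
{
  // five entries, each large enough for one 4-channel float image:
  if(!dt_dev_pixelpipe_cache_init(cache, 5, (size_t)4 * sizeof(float) * width * height))
    return 1; // allocation failed
  return 0;
}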
void dt_image_cache_init(dt_image_cache_t *cache) { // the image cache does no serialization. // (unsafe. data should be in db/xmp, not in any other additional cache, // also, it should be relatively fast to get the image_t structs from sql.) // TODO: actually an independent conf var? // too large: dangerous and wasteful? // can we get away with a fixed size? const uint32_t max_mem = 50*1024*1024; uint32_t num = (uint32_t)(1.5f*max_mem/sizeof(dt_image_t)); dt_cache_init(&cache->cache, num, 16, 64, max_mem); dt_cache_set_allocate_callback(&cache->cache, &dt_image_cache_allocate, cache); dt_cache_set_cleanup_callback (&cache->cache, &dt_image_cache_deallocate, cache); // might have been rounded to power of two: num = dt_cache_capacity(&cache->cache); cache->images = dt_alloc_align(64, sizeof(dt_image_t)*num); dt_print(DT_DEBUG_CACHE, "[image_cache] has %d entries\n", num); // initialize first image as empty data: dt_image_init(cache->images); for(uint32_t k=1; k<num; k++) { // optimized initialization (avoid accessing conf): memcpy(cache->images + k, cache->images, sizeof(dt_image_t)); } }
// callback for the cache backend to initialize payload pointers int32_t dt_mipmap_cache_allocate_dynamic(void *data, const uint32_t key, int32_t *cost, void **buf) { // for full image buffers struct dt_mipmap_buffer_dsc* dsc = *buf; // alloc mere minimum for the header + broken image buffer: if(!dsc) { *buf = dt_alloc_align(16, sizeof(*dsc)+sizeof(float)*4*64); // fprintf(stderr, "[mipmap cache] alloc dynamic for key %u %lX\n", key, (uint64_t)*buf); if(!(*buf)) { fprintf(stderr, "[mipmap cache] memory allocation failed!\n"); exit(1); } dsc = *buf; dsc->width = 0; dsc->height = 0; dsc->size = sizeof(*dsc)+sizeof(float)*4*64; } assert(dsc->size >= sizeof(*dsc)); dsc->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE; // cost is just flat one for the buffer, as the buffers might have different sizes, // to make sure quota is meaningful. *cost = 1; // fprintf(stderr, "dummy allocing %lX\n", (uint64_t)*buf); return 1; // request write lock }
int dt_dev_pixelpipe_process(dt_dev_pixelpipe_t *pipe, dt_develop_t *dev, int x, int y, int width,
                             int height, float scale)
{
  pipe->processing = 1;
  printf("pixelpipe process start\n");

  // have backbuf in right size:
  if(pipe->backbuf_size < width*height*4*sizeof(uint8_t))
  {
    pthread_mutex_lock(&pipe->backbuf_mutex);
    pipe->backbuf_size = width*height*4*sizeof(uint8_t);
    free(pipe->backbuf);
    pipe->backbuf = (uint8_t *)dt_alloc_align(16, pipe->backbuf_size);
    pthread_mutex_unlock(&pipe->backbuf_mutex);
  }

  // scale node (is slow):
  // scale *= 2;
  // FIXME: this seems to be a bug in gegl. need to manually adjust updated roi here.
  GeglRectangle roi = (GeglRectangle){ x, y, width, height };
  GeglRectangle roio = (GeglRectangle){ roi.x/scale, roi.y/scale, roi.width/scale, roi.height/scale };
  roio.x = MAX(0, roio.x);
  roio.y = MAX(0, roio.y);
  roio.width = MIN(pipe->iwidth - roio.x - 1, roio.width);
  roio.height = MIN(pipe->iheight - roio.y - 1, roio.height);
  GeglProcessor *processor = gegl_node_new_processor(pipe->output, &roio);
  // gegl_node_set(pipe->scale, "x", scale, "y", scale, NULL);
  // GeglProcessor *processor = gegl_node_new_processor (pipe->output, roi);
  double progress;

  // TODO: insert constant scale node at beginning, maintain lo-res branch of pipeline (shadowed).
  // TODO: decide on scale param, which one to use.
  while(gegl_processor_work(processor, &progress))
  {
    // if history changed, abort processing
    if(pipe->changed != DT_DEV_PIPE_UNCHANGED || dev->gui_leaving)
    {
      // don't leak the processor or leave the busy flag set on the abort path:
      gegl_processor_destroy(processor);
      pipe->processing = 0;
      return 1;
    }
  }
  gegl_processor_destroy(processor);

  // gegl scale node turned out to be even slower :(
  gegl_node_blit(pipe->output, scale, &roi, babl_format("RGBA u8"), pipe->backbuf,
                 GEGL_AUTO_ROWSTRIDE, GEGL_BLIT_CACHE);
  // gegl_node_blit (pipe->output, 1.0, roi, babl_format("RGBA u8"), output, GEGL_AUTO_ROWSTRIDE, GEGL_BLIT_CACHE);
  // TODO: update histograms here with this data?

  printf("pixelpipe process end\n");
  pipe->processing = 0;
  return 0;
}
// compression stuff: alloc a buffer if needed
uint8_t *dt_mipmap_cache_alloc_scratchmem(const dt_mipmap_cache_t *cache)
{
  const size_t size = cache->mip[DT_MIPMAP_3].max_width * cache->mip[DT_MIPMAP_3].max_height;
  if(cache->compression_type)
    return dt_alloc_align(64, size * 4 * sizeof(uint8_t));
  else // no compression, no buffer:
    return NULL;
}
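// Implied calling convention, as a sketch (the consumer name is hypothetical):
// the scratch buffer only exists when thumbnails are stored compressed, and
// it is released with the matching aligned free.
uint8_t *scratch = dt_mipmap_cache_alloc_scratchmem(cache);
if(cache->compression_type && !scratch)
  fprintf(stderr, "[mipmap cache] scratch buffer allocation failed!\n");
else
  decompress_thumbnail_into(scratch); // hypothetical consumer
dt_free_align(scratch); // assumes free(NULL) semantics for the no-compression case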
int dt_dev_pixelpipe_cache_get_weighted(dt_dev_pixelpipe_cache_t *cache, const uint64_t hash,
                                        const size_t size, void **data, int weight)
{
  cache->queries++;
  *data = NULL;
  int max_used = -1, max = 0;
  size_t sz = 0;
  for(int k = 0; k < cache->entries; k++)
  {
    // search for hash in cache
    if(cache->used[k] > max_used)
    {
      max_used = cache->used[k];
      max = k;
    }
    cache->used[k]++; // age all entries
    if(cache->hash[k] == hash)
    {
      *data = cache->data[k];
      sz = cache->size[k];
      cache->used[k] = weight; // this is the MRU entry
    }
  }
  if(!*data || sz < size)
  {
    // kill LRU entry
    // printf("[pixelpipe_cache_get] hash not found, returning slot %d/%d age %d\n", max, cache->entries,
    // weight);
    if(cache->size[max] < size)
    {
      dt_free_align(cache->data[max]);
      cache->data[max] = (void *)dt_alloc_align(16, size);
      cache->size[max] = size;
    }
    *data = cache->data[max];
    cache->hash[max] = hash;
    cache->used[max] = weight;
    cache->misses++;
    return 1;
  }
  else
    return 0;
}
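// Calling pattern sketch (hash value and recompute step are hypothetical):
// a return value of 1 is a miss, meaning *data now points at the recycled
// least-recently-used slot and must be filled before use; on a hit (0) the
// slot already holds valid data for this hash.
void *slot = NULL;
const uint64_t hash = 0x1234abcdULL; // stand-in for a real pipe-state hash
if(dt_dev_pixelpipe_cache_get_weighted(&cache, hash, bufsize, &slot, 0))
  recompute_module_output(slot); // hypothetical: fill the missed slot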
dt_gaussian_t *dt_gaussian_init(const int width,    // width of input image
                                const int height,   // height of input image
                                const int channels, // channels per pixel
                                const float *max,   // maximum allowed values per channel for clamping
                                const float *min,   // minimum allowed values per channel for clamping
                                const float sigma,  // gaussian sigma
                                const int order)    // order of gaussian blur
{
  dt_gaussian_t *g = (dt_gaussian_t *)malloc(sizeof(dt_gaussian_t));
  if(!g) return NULL;

  g->width = width;
  g->height = height;
  g->channels = channels;
  g->sigma = sigma;
  g->order = order;
  g->buf = NULL;
  g->max = (float *)malloc(channels * sizeof(float));
  g->min = (float *)malloc(channels * sizeof(float));
  if(!g->min || !g->max) goto error;

  for(int k=0; k < channels; k++)
  {
    g->max[k] = max[k];
    g->min[k] = min[k];
  }

  g->buf = dt_alloc_align(64, width*height*channels*sizeof(float));
  if(!g->buf) goto error;

  return g;

error:
  free(g->buf);
  free(g->max);
  free(g->min);
  free(g);
  return NULL;
}
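// Usage sketch, assuming the companion dt_gaussian_blur()/dt_gaussian_free()
// entry points of this module exist with the obvious signatures: blur a
// 4-channel float buffer while clamping every channel to [0,1].
const float chanmax[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
const float chanmin[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
dt_gaussian_t *g = dt_gaussian_init(width, height, 4, chanmax, chanmin, 2.5f, 0 /* order */);
if(g)
{
  dt_gaussian_blur(g, in, out); // assumed companion call
  dt_gaussian_free(g);          // assumed companion call
}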
int write_image(dt_imageio_module_data_t *data, const char *filename, const void *ivoid, void *exif,
                int exif_len, int imgid, int num, int total)
{
  const dt_imageio_module_data_t *const pfm = data;
  int status = 0;
  FILE *f = fopen(filename, "wb");
  if(f)
  {
    // align pfm header to sse, assuming the file will
    // be mmapped to page boundaries.
    char header[1024];
    snprintf(header, 1024, "PF\n%d %d\n-1.0", pfm->width, pfm->height);
    size_t len = strlen(header);
    fprintf(f, "PF\n%d %d\n-1.0", pfm->width, pfm->height);
    ssize_t off = 0;
    while((len + 1 + off) & 0xf) off++;
    while(off-- > 0) fprintf(f, "0");
    fprintf(f, "\n");
    void *buf_line = dt_alloc_align(16, 3 * sizeof(float) * pfm->width);
    if(buf_line)
    {
      for(int j = 0; j < pfm->height; j++)
      {
        // NOTE: pfm has rows in reverse order
        const int row_in = pfm->height - 1 - j;
        const float *in = (const float *)ivoid + 4 * (size_t)pfm->width * row_in;
        float *out = (float *)buf_line;
        for(int i = 0; i < pfm->width; i++, in += 4, out += 3)
        {
          memcpy(out, in, 3 * sizeof(float));
        }
        // INFO: per-line fwrite call seems to perform best. LebedevRI, 18.04.2014
        int cnt = fwrite(buf_line, 3 * sizeof(float), pfm->width, f);
        // only ever set the error flag, so one failed row isn't masked later:
        if(cnt != pfm->width) status = 1;
      }
    }
    else
      status = 1;
    dt_free_align(buf_line);
    buf_line = NULL;
    fclose(f);
  }
  else
    status = 1; // could not open the file: report failure instead of silent success
  return status;
}
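// Worked example of the alignment padding above (not from the source):
// for a 640x480 image the header "PF\n640 480\n-1.0" is 15 bytes, so
// len + 1 (for the final '\n') is already 16 and off stays 0 -- the float
// payload starts at byte 16. For "PF\n1000 1000\n-1.0" (17 bytes) the loop
// raises off to 14, so fourteen '0' characters are emitted before the
// newline and the payload starts at byte 32, again 16-byte aligned.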
// callback for the imageio core to allocate memory. // only needed for _F and _FULL buffers, as they change size // with the input image. will allocate img->width*img->height*img->bpp bytes. void* dt_mipmap_cache_alloc(dt_image_t *img, dt_mipmap_size_t size, dt_mipmap_cache_allocator_t a) { assert(size == DT_MIPMAP_FULL); struct dt_mipmap_buffer_dsc** dsc = (struct dt_mipmap_buffer_dsc**)a; int32_t wd = img->width; int32_t ht = img->height; int32_t bpp = img->bpp; const uint32_t buffer_size = ((wd*ht*bpp) + sizeof(**dsc)); // buf might have been alloc'ed before, // so only check size and re-alloc if necessary: if(!(*dsc) || ((*dsc)->size < buffer_size) || ((void *)*dsc == (void *)dt_mipmap_cache_static_dead_image)) { if((void *)*dsc != (void *)dt_mipmap_cache_static_dead_image) dt_free_align(*dsc); *dsc = dt_alloc_align(64, buffer_size); // fprintf(stderr, "[mipmap cache] alloc for key %u %p\n", get_key(img->id, size), *buf); if(!(*dsc)) { // return fallback: at least alloc size for a dead image: *dsc = (struct dt_mipmap_buffer_dsc *)dt_mipmap_cache_static_dead_image; // allocator holds the pointer. but imageio client is tricked to believe allocation failed: return NULL; } // set buffer size only if we're making it larger. (*dsc)->size = buffer_size; } (*dsc)->width = wd; (*dsc)->height = ht; (*dsc)->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE; // fprintf(stderr, "full buffer allocating img %u %d x %d = %u bytes (%p)\n", img->id, img->width, img->height, buffer_size, *buf); // trick the user into using a pointer without the header: return (*dsc)+1; }
// allocate output buffer with monochrome brightness channel from input, padded // up by max_supp on all four sides, dimensions written to wd2 ht2 static inline float *ll_pad_input( const float *const input, const int wd, const int ht, const int max_supp, int *wd2, int *ht2) { const int stride = 4; *wd2 = 2*max_supp + wd; *ht2 = 2*max_supp + ht; float *const out = dt_alloc_align(16, *wd2**ht2*sizeof(*out)); #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) default(none) shared(wd2, ht2) #endif for(int j=0;j<ht;j++) { for(int i=0;i<max_supp;i++) out[(j+max_supp)**wd2+i] = input[stride*wd*j]* 0.01f; // L -> [0,1] for(int i=0;i<wd;i++) out[(j+max_supp)**wd2+i+max_supp] = input[stride*(wd*j+i)] * 0.01f; // L -> [0,1] for(int i=wd+max_supp;i<*wd2;i++) out[(j+max_supp)**wd2+i] = input[stride*(j*wd+wd-1)] * 0.01f; // L -> [0,1] } #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) default(none) shared(wd2, ht2) #endif for(int j=0;j<max_supp;j++) memcpy(out + *wd2*j, out+max_supp**wd2, sizeof(float)**wd2); #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) default(none) shared(wd2, ht2) #endif for(int j=max_supp+ht;j<*ht2;j++) memcpy(out + *wd2*j, out + *wd2*(max_supp+ht-1), sizeof(float)**wd2); return out; }
// callback for the imageio core to allocate memory. // only needed for _F and _FULL buffers, as they change size // with the input image. will allocate img->width*img->height*img->bpp bytes. void *dt_mipmap_cache_alloc(dt_mipmap_buffer_t *buf, const dt_image_t *img) { assert(buf->size == DT_MIPMAP_FULL); const int wd = img->width; const int ht = img->height; struct dt_mipmap_buffer_dsc *dsc = (struct dt_mipmap_buffer_dsc *)buf->cache_entry->data; const size_t buffer_size = (size_t)wd*ht*img->bpp + sizeof(*dsc); // buf might have been alloc'ed before, // so only check size and re-alloc if necessary: if(!buf->buf || (dsc->size < buffer_size) || ((void *)dsc == (void *)dt_mipmap_cache_static_dead_image)) { if((void *)dsc != (void *)dt_mipmap_cache_static_dead_image) dt_free_align(buf->cache_entry->data); buf->cache_entry->data = dt_alloc_align(64, buffer_size); if(!buf->cache_entry->data) { // return fallback: at least alloc size for a dead image: buf->cache_entry->data = (void*)dt_mipmap_cache_static_dead_image; // allocator holds the pointer. but let imageio client know that allocation failed: return NULL; } // set buffer size only if we're making it larger. dsc = (struct dt_mipmap_buffer_dsc *)buf->cache_entry->data; dsc->size = buffer_size; } dsc->width = wd; dsc->height = ht; dsc->color_space = DT_COLORSPACE_NONE; dsc->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE; buf->buf = (uint8_t *)(dsc + 1); // fprintf(stderr, "full buffer allocating img %u %d x %d = %u bytes (%p)\n", img->id, img->width, // img->height, buffer_size, *buf); // return pointer to start of payload return dsc + 1; }
void local_laplacian_internal(
    const float *const input,   // input buffer in some Labx or yuvx format
    float *const out,           // output buffer with colour
    const int wd,               // width and
    const int ht,               // height of the input buffer
    const float sigma,          // user param: separate shadows/midtones/highlights
    const float shadows,        // user param: lift shadows
    const float highlights,     // user param: compress highlights
    const float clarity,        // user param: increase clarity/local contrast
    const int use_sse2)         // flag whether to use SSE version
{
#define max_levels 30
#define num_gamma 6
  // don't divide by 2 more often than we can:
  const int num_levels = MIN(max_levels, 31-__builtin_clz(MIN(wd,ht)));
  const int max_supp = 1<<(num_levels-1);
  int w, h;
  float *padded[max_levels] = {0};
  padded[0] = ll_pad_input(input, wd, ht, max_supp, &w, &h);

  // allocate pyramid pointers for padded input
  for(int l=1;l<num_levels;l++)
    padded[l] = dt_alloc_align(16, sizeof(float)*dl(w,l)*dl(h,l));

  // allocate pyramid pointers for output
  float *output[max_levels] = {0};
  for(int l=0;l<num_levels;l++)
    output[l] = dt_alloc_align(16, sizeof(float)*dl(w,l)*dl(h,l));

  // create gauss pyramid of padded input, write coarse directly to output
#if defined(__SSE2__)
  if(use_sse2)
  {
    for(int l=1;l<num_levels-1;l++)
      gauss_reduce_sse2(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
    gauss_reduce_sse2(padded[num_levels-2], output[num_levels-1], dl(w,num_levels-2), dl(h,num_levels-2));
  }
  else
#endif
  {
    for(int l=1;l<num_levels-1;l++)
      gauss_reduce(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
    gauss_reduce(padded[num_levels-2], output[num_levels-1], dl(w,num_levels-2), dl(h,num_levels-2));
  }

  // evenly sample brightness [0,1]:
  float gamma[num_gamma] = {0.0f};
  for(int k=0;k<num_gamma;k++) gamma[k] = (k+.5f)/(float)num_gamma;
  // for(int k=0;k<num_gamma;k++) gamma[k] = k/(num_gamma-1.0f);

  // allocate memory for intermediate laplacian pyramids
  float *buf[num_gamma][max_levels] = {{0}};
  for(int k=0;k<num_gamma;k++) for(int l=0;l<num_levels;l++)
    buf[k][l] = dt_alloc_align(16, sizeof(float)*dl(w,l)*dl(h,l));

  // the paper says remapping only level 3 not 0 does the trick, too
  // (but i really like the additional octave of sharpness we get,
  // willing to pay the cost).
  for(int k=0;k<num_gamma;k++)
  { // process images
#if defined(__SSE2__)
    if(use_sse2)
      apply_curve_sse2(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);
    else // brackets in next line needed for silly gcc warning:
#endif
    {apply_curve(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);}

    // create gaussian pyramids
    for(int l=1;l<num_levels;l++)
#if defined(__SSE2__)
      if(use_sse2)
        gauss_reduce_sse2(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
      else
#endif
        gauss_reduce(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
  }

  // assemble output pyramid coarse to fine
  for(int l=num_levels-2;l >= 0; l--)
  {
    const int pw = dl(w,l), ph = dl(h,l);
    gauss_expand(output[l+1], output[l], pw, ph);
    // go through all coefficients in the upsampled gauss buffer:
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) collapse(2) shared(w,h,buf,output,l,gamma,padded)
#endif
    for(int j=0;j<ph;j++) for(int i=0;i<pw;i++)
    {
      const float v = padded[l][j*pw+i];
      int hi = 1;
      for(;hi<num_gamma-1 && gamma[hi] <= v;hi++);
      int lo = hi-1;
      const float a = CLAMPS((v - gamma[lo])/(gamma[hi]-gamma[lo]), 0.0f, 1.0f);
      const float l0 = ll_laplacian(buf[lo][l+1], buf[lo][l], i, j, pw, ph);
      const float l1 = ll_laplacian(buf[hi][l+1], buf[hi][l], i, j, pw, ph);
      output[l][j*pw+i] += l0 * (1.0f-a) + l1 * a;
      // we could do this to save on memory (no need for finest buf[][]).
      // unfortunately it results in a quite noticeable loss of sharpness, i think
      // the extra level is worth it.
      // else if(l == 0) // use finest scale from input to not amplify noise (and use less memory)
      //   output[l][j*pw+i] += ll_laplacian(padded[l+1], padded[l], i, j, pw, ph);
    }
  }
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(dynamic) collapse(2) shared(w,output,buf)
#endif
  for(int j=0;j<ht;j++) for(int i=0;i<wd;i++)
  {
    out[4*(j*wd+i)+0] = 100.0f * output[0][(j+max_supp)*w+max_supp+i]; // [0,1] -> L
    out[4*(j*wd+i)+1] = input[4*(j*wd+i)+1]; // copy original colour channels
    out[4*(j*wd+i)+2] = input[4*(j*wd+i)+2];
  }
  // free all buffers!
  for(int l=0;l<max_levels;l++)
  {
    dt_free_align(padded[l]);
    dt_free_align(output[l]);
    for(int k = 0; k < num_gamma; k++) dt_free_align(buf[k][l]);
  }
#undef max_levels
#undef num_gamma
}
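// The pyramid loops above compute per-level buffer sizes via dl(). A
// plausible definition (an assumption, it is not shown in this excerpt)
// that matches the coarse size cw = (wd-1)/2+1 used by gauss_reduce_sse2
// below is repeated halving with the same rounding:
static inline int dl(int size, const int level)
{
  for(int l=0;l<level;l++) size = (size-1)/2+1;
  return size;
}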
static inline void gauss_reduce_sse2( const float *const input, // fine input buffer float *const coarse, // coarse scale, blurred input buf const int wd, // fine res const int ht) { // blur, store only coarse res const int cw = (wd-1)/2+1, ch = (ht-1)/2+1; // this version is inspired by opencv's pyrDown_ : // - allocate 5 rows of ring buffer (aligned) // - for coarse res y // - fill 5 coarse-res row buffers with 1 4 6 4 1 weights (reuse some from last time) // - do vertical convolution via sse and write to coarse output buf const int stride = ((cw+8)&~7); // assure sse alignment of rows float *ringbuf = dt_alloc_align(16, sizeof(*ringbuf)*stride*5); float *rows[5] = {0}; int rowj = 0; // we initialised this many rows so far for(int j=1;j<ch-1;j++) { // horizontal pass, convolve with 1 4 6 4 1 kernel and decimate for(;rowj<=2*j+2;rowj++) { float *const row = ringbuf + (rowj % 5)*stride; const float *const in = input + rowj*wd; #ifdef _OPENMP #pragma omp parallel for schedule(static) default(none) #endif for(int i=1;i<cw-1;i++) row[i] = 6*in[2*i] + 4*(in[2*i-1]+in[2*i+1]) + in[2*i-2] + in[2*i+2]; } // init row pointers for(int k=0;k<5;k++) rows[k] = ringbuf + ((2*j-2+k)%5)*stride; // vertical pass, convolve and decimate using SIMD: // note that we're ignoring the (1..cw-1) buffer limit, we'll pull in // garbage and fix it later by border filling. float *const out = coarse + j*cw; const float *const row0 = rows[0], *const row1 = rows[1], *const row2 = rows[2], *const row3 = rows[3], *const row4 = rows[4]; const __m128 four = _mm_set1_ps(4.f), scale = _mm_set1_ps(1.f/256.f); #ifdef _OPENMP #pragma omp parallel for schedule(static) default(none) #endif for(int i=0;i<=cw-8;i+=8) { __m128 r0, r1, r2, r3, r4, t0, t1; r0 = _mm_load_ps(row0 + i); r1 = _mm_load_ps(row1 + i); r2 = _mm_load_ps(row2 + i); r3 = _mm_load_ps(row3 + i); r4 = _mm_load_ps(row4 + i); r0 = _mm_add_ps(r0, r4); r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2); r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2)); t0 = _mm_add_ps(r0, _mm_mul_ps(r1, four)); r0 = _mm_load_ps(row0 + i + 4); r1 = _mm_load_ps(row1 + i + 4); r2 = _mm_load_ps(row2 + i + 4); r3 = _mm_load_ps(row3 + i + 4); r4 = _mm_load_ps(row4 + i + 4); r0 = _mm_add_ps(r0, r4); r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2); r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2)); t1 = _mm_add_ps(r0, _mm_mul_ps(r1, four)); t0 = _mm_mul_ps(t0, scale); t1 = _mm_mul_ps(t1, scale); _mm_storeu_ps(out + i, t0); _mm_storeu_ps(out + i + 4, t1); } // process the rest for(int i=cw&~7;i<cw-1;i++) out[i] = (6*row2[i] + 4*(row1[i] + row3[i]) + row0[i] + row4[i])*(1.0f/256.0f); } dt_free_align(ringbuf); ll_fill_boundary1(coarse, cw, ch); }
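// For reference, a scalar sketch (an assumption -- the plain-C counterpart
// is not shown in this excerpt) of the same decimating 5x5 binomial blur
// the sse2 routine implements; it produces the same values up to float
// summation order:
static inline void gauss_reduce(
    const float *const input, // fine input buffer
    float *const coarse,      // coarse scale, blurred input buf
    const int wd,             // fine res
    const int ht)
{
  const int cw = (wd-1)/2+1, ch = (ht-1)/2+1;
  // separable 1 4 6 4 1 kernel, normalised per dimension by 16 so the full
  // 5x5 stencil sums to 256, evaluated at even fine-res positions:
  const float w[5] = { 1.f/16.f, 4.f/16.f, 6.f/16.f, 4.f/16.f, 1.f/16.f };
  memset(coarse, 0, sizeof(float)*cw*ch);
  for(int j=1;j<ch-1;j++) for(int i=1;i<cw-1;i++)
  {
    float sum = 0.0f;
    for(int jj=-2;jj<=2;jj++) for(int ii=-2;ii<=2;ii++)
      sum += input[(2*j+jj)*wd + 2*i+ii] * w[jj+2] * w[ii+2];
    coarse[j*cw+i] = sum;
  }
  ll_fill_boundary1(coarse, cw, ch); // same border fill as the sse2 version
}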
// internal function: to avoid exif blob reading + 8-bit byteorder flag + high-quality override int dt_imageio_export_with_flags( const uint32_t imgid, const char *filename, dt_imageio_module_format_t *format, dt_imageio_module_data_t *format_params, const int32_t ignore_exif, const int32_t display_byteorder, const gboolean high_quality, const int32_t thumbnail_export, const char *filter, const gboolean copy_metadata, dt_imageio_module_storage_t *storage, dt_imageio_module_data_t *storage_params) { dt_develop_t dev; dt_dev_init(&dev, 0); dt_mipmap_buffer_t buf; if(thumbnail_export && dt_conf_get_bool("plugins/lighttable/low_quality_thumbnails")) dt_mipmap_cache_read_get(darktable.mipmap_cache, &buf, imgid, DT_MIPMAP_F, DT_MIPMAP_BLOCKING); else dt_mipmap_cache_read_get(darktable.mipmap_cache, &buf, imgid, DT_MIPMAP_FULL, DT_MIPMAP_BLOCKING); dt_dev_load_image(&dev, imgid); const dt_image_t *img = &dev.image_storage; const int wd = img->width; const int ht = img->height; int res = 0; dt_times_t start; dt_get_times(&start); dt_dev_pixelpipe_t pipe; res = thumbnail_export ? dt_dev_pixelpipe_init_thumbnail(&pipe, wd, ht) : dt_dev_pixelpipe_init_export(&pipe, wd, ht, format->levels(format_params)); if(!res) { dt_control_log(_("failed to allocate memory for %s, please lower the threads used for export or buy more memory."), thumbnail_export ? C_("noun", "thumbnail export") : C_("noun", "export")); dt_dev_cleanup(&dev); dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf); return 1; } if(!buf.buf) { dt_control_log(_("image `%s' is not available!"), img->filename); dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf); dt_dev_cleanup(&dev); return 1; } // If a style is to be applied during export, add the iop params into the history if (!thumbnail_export && format_params->style[0] != '\0') { GList *stls; GList *modules = dev.iop; dt_iop_module_t *m = NULL; if ((stls=dt_styles_get_item_list(format_params->style, TRUE, -1)) == 0) { dt_control_log(_("cannot find the style '%s' to apply during export."), format_params->style); dt_dev_cleanup(&dev); dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf); return 1; } // Add each params while (stls) { dt_style_item_t *s = (dt_style_item_t *) stls->data; modules = dev.iop; while (modules) { m = (dt_iop_module_t *)modules->data; // since the name in the style is returned with a possible multi-name, just check the start of the name if (strncmp(m->op, s->name, strlen(m->op)) == 0) { dt_dev_history_item_t *h = malloc(sizeof(dt_dev_history_item_t)); h->params = s->params; h->blend_params = s->blendop_params; h->enabled = s->enabled; h->module = m; h->multi_priority = 1; g_strlcpy(h->multi_name, "", sizeof(h->multi_name)); if(m->legacy_params && (s->module_version != m->version())) { void *new_params = malloc(m->params_size); m->legacy_params (m, h->params, s->module_version, new_params, labs(m->version())); free (h->params); h->params = new_params; } dev.history_end++; dev.history = g_list_append(dev.history, h); break; } modules = g_list_next(modules); } stls = g_list_next(stls); } } dt_dev_pixelpipe_set_input(&pipe, &dev, (float *)buf.buf, buf.width, buf.height, 1.0); dt_dev_pixelpipe_create_nodes(&pipe, &dev); dt_dev_pixelpipe_synch_all(&pipe, &dev); dt_dev_pixelpipe_get_dimensions(&pipe, &dev, pipe.iwidth, pipe.iheight, &pipe.processed_width, &pipe.processed_height); if(filter) { if(!strncmp(filter, "pre:", 4)) dt_dev_pixelpipe_disable_after(&pipe, filter+4); if(!strncmp(filter, "post:", 5)) dt_dev_pixelpipe_disable_before(&pipe, filter+5); 
} dt_show_times(&start, "[export] creating pixelpipe", NULL); // find output color profile for this image: int sRGB = 1; gchar *overprofile = dt_conf_get_string("plugins/lighttable/export/iccprofile"); if(overprofile && !strcmp(overprofile, "sRGB")) { sRGB = 1; } else if(!overprofile || !strcmp(overprofile, "image")) { GList *modules = dev.iop; dt_iop_module_t *colorout = NULL; while (modules) { colorout = (dt_iop_module_t *)modules->data; if(colorout->get_p && strcmp(colorout->op, "colorout") == 0) { const char *iccprofile = colorout->get_p(colorout->params, "iccprofile"); if(!strcmp(iccprofile, "sRGB")) sRGB = 1; else sRGB = 0; } modules = g_list_next(modules); } } else { sRGB = 0; } g_free(overprofile); // get only once at the beginning, in case the user changes it on the way: const gboolean high_quality_processing = ((format_params->max_width == 0 || format_params->max_width >= pipe.processed_width ) && (format_params->max_height == 0 || format_params->max_height >= pipe.processed_height)) ? FALSE : high_quality; const int width = high_quality_processing ? 0 : format_params->max_width; const int height = high_quality_processing ? 0 : format_params->max_height; const double scalex = width > 0 ? fminf(width /(double)pipe.processed_width, 1.0) : 1.0; const double scaley = height > 0 ? fminf(height/(double)pipe.processed_height, 1.0) : 1.0; const double scale = fminf(scalex, scaley); int processed_width = scale*pipe.processed_width + .5f; int processed_height = scale*pipe.processed_height + .5f; const int bpp = format->bpp(format_params); // downsampling done last, if high quality processing was requested: uint8_t *outbuf = pipe.backbuf; uint8_t *moutbuf = NULL; // keep track of alloc'ed memory dt_get_times(&start); if(high_quality_processing) { dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale); const double scalex = format_params->max_width > 0 ? fminf(format_params->max_width /(double)pipe.processed_width, 1.0) : 1.0; const double scaley = format_params->max_height > 0 ? fminf(format_params->max_height/(double)pipe.processed_height, 1.0) : 1.0; const double scale = fminf(scalex, scaley); processed_width = scale*pipe.processed_width + .5f; processed_height = scale*pipe.processed_height + .5f; moutbuf = (uint8_t *)dt_alloc_align(64, (size_t)sizeof(float)*processed_width*processed_height*4); outbuf = moutbuf; // now downscale into the new buffer: dt_iop_roi_t roi_in, roi_out; roi_in.x = roi_in.y = roi_out.x = roi_out.y = 0; roi_in.scale = 1.0; roi_out.scale = scale; roi_in.width = pipe.processed_width; roi_in.height = pipe.processed_height; roi_out.width = processed_width; roi_out.height = processed_height; dt_iop_clip_and_zoom((float *)outbuf, (float *)pipe.backbuf, &roi_out, &roi_in, processed_width, pipe.processed_width); } else { // do the processing (8-bit with special treatment, to make sure we can use openmp further down): if(bpp == 8) dt_dev_pixelpipe_process(&pipe, &dev, 0, 0, processed_width, processed_height, scale); else dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale); outbuf = pipe.backbuf; } dt_show_times(&start, thumbnail_export ? 
"[dev_process_thumbnail] pixel pipeline processing" : "[dev_process_export] pixel pipeline processing", NULL); // downconversion to low-precision formats: if(bpp == 8) { if(display_byteorder) { if(high_quality_processing) { const float *const inbuf = (float *)outbuf; for(size_t k=0; k<(size_t)processed_width*processed_height; k++) { // convert in place, this is unfortunately very serial.. const uint8_t r = CLAMP(inbuf[4*k+2]*0xff, 0, 0xff); const uint8_t g = CLAMP(inbuf[4*k+1]*0xff, 0, 0xff); const uint8_t b = CLAMP(inbuf[4*k+0]*0xff, 0, 0xff); outbuf[4*k+0] = r; outbuf[4*k+1] = g; outbuf[4*k+2] = b; } } // else processing output was 8-bit already, and no need to swap order } else // need to flip { // ldr output: char if(high_quality_processing) { const float *const inbuf = (float *)outbuf; for(size_t k=0; k<(size_t)processed_width*processed_height; k++) { // convert in place, this is unfortunately very serial.. const uint8_t r = CLAMP(inbuf[4*k+0]*0xff, 0, 0xff); const uint8_t g = CLAMP(inbuf[4*k+1]*0xff, 0, 0xff); const uint8_t b = CLAMP(inbuf[4*k+2]*0xff, 0, 0xff); outbuf[4*k+0] = r; outbuf[4*k+1] = g; outbuf[4*k+2] = b; } } else { // !display_byteorder, need to swap: uint8_t *const buf8 = pipe.backbuf; #ifdef _OPENMP #pragma omp parallel for default(none) shared(processed_width, processed_height) schedule(static) #endif // just flip byte order for(size_t k=0; k<(size_t)processed_width*processed_height; k++) { uint8_t tmp = buf8[4*k+0]; buf8[4*k+0] = buf8[4*k+2]; buf8[4*k+2] = tmp; } } } } else if(bpp == 16) { // uint16_t per color channel float *buff = (float *) outbuf; uint16_t *buf16 = (uint16_t *)outbuf; for(int y=0; y<processed_height; y++) for(int x=0; x<processed_width ; x++) { // convert in place const size_t k = (size_t)processed_width*y + x; for(int i=0; i<3; i++) buf16[4*k+i] = CLAMP(buff[4*k+i]*0x10000, 0, 0xffff); } } // else output float, no further harm done to the pixels :) format_params->width = processed_width; format_params->height = processed_height; if(!ignore_exif) { int length; uint8_t exif_profile[65535]; // C++ alloc'ed buffer is uncool, so we waste some bits here. char pathname[PATH_MAX]; gboolean from_cache = TRUE; dt_image_full_path(imgid, pathname, sizeof(pathname), &from_cache); // last param is dng mode, it's false here length = dt_exif_read_blob(exif_profile, pathname, imgid, sRGB, processed_width, processed_height, 0); res = format->write_image (format_params, filename, outbuf, exif_profile, length, imgid); } else { res = format->write_image (format_params, filename, outbuf, NULL, 0, imgid); } dt_dev_pixelpipe_cleanup(&pipe); dt_dev_cleanup(&dev); dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf); dt_free_align(moutbuf); /* now write xmp into that container, if possible */ if(copy_metadata && (format->flags(format_params) & FORMAT_FLAGS_SUPPORT_XMP)) { dt_exif_xmp_attach(imgid, filename); // no need to cancel the export if this fail } if(!thumbnail_export && strcmp(format->mime(format_params), "memory")) { dt_control_signal_raise(darktable.signals,DT_SIGNAL_IMAGE_EXPORT_TMPFILE,imgid,filename,format,format_params,storage,storage_params); } return res; }
static int dt_group_get_mask_roi(dt_iop_module_t *module, dt_dev_pixelpipe_iop_t *piece,
                                 dt_masks_form_t *form, const dt_iop_roi_t *roi, float *buffer)
{
  double start2 = dt_get_wtime();
  const guint nb = g_list_length(form->points);
  if(nb == 0) return 0;
  int nb_ok = 0;

  const int width = roi->width;
  const int height = roi->height;

  // we need to allocate a temporary buffer for intermediate creation of individual shapes
  float *bufs = dt_alloc_align(64, (size_t)width * height * sizeof(float));
  if(bufs == NULL) return 0;

  // empty the output buffer
  memset(buffer, 0, (size_t)width * height * sizeof(float));

  // and we get all masks
  GList *fpts = g_list_first(form->points);
  while(fpts)
  {
    dt_masks_point_group_t *fpt = (dt_masks_point_group_t *)fpts->data;
    dt_masks_form_t *sel = dt_masks_get_from_id(module->dev, fpt->formid);
    if(sel)
    {
      const int ok = dt_masks_get_mask_roi(module, piece, sel, roi, bufs);
      const float op = fpt->opacity;
      const int state = fpt->state;

      if(ok)
      {
        // first see if we need to invert this shape
        if(state & DT_MASKS_STATE_INVERSE)
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs)
#else
#pragma omp parallel for shared(bufs)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              bufs[index] = 1.0f - bufs[index];
            }
        }

        if(state & DT_MASKS_STATE_UNION)
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              buffer[index] = fmaxf(buffer[index], bufs[index] * op);
            }
        }
        else if(state & DT_MASKS_STATE_INTERSECTION)
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              float b1 = buffer[index];
              float b2 = bufs[index]; // the shape's own value; opacity is applied in the combine below
              if(b1 > 0.0f && b2 > 0.0f) buffer[index] = fminf(b1, b2 * op);
              else buffer[index] = 0.0f;
            }
        }
        else if(state & DT_MASKS_STATE_DIFFERENCE)
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              float b1 = buffer[index];
              float b2 = bufs[index] * op;
              if(b1 > 0.0f && b2 > 0.0f) buffer[index] = b1 * (1.0f - b2);
            }
        }
        else if(state & DT_MASKS_STATE_EXCLUSION)
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              float b1 = buffer[index];
              float b2 = bufs[index] * op;
              if(b1 > 0.0f && b2 > 0.0f) buffer[index] = fmaxf((1.0f - b1) * b2, b1 * (1.0f - b2));
              else buffer[index] = fmaxf(b1, b2);
            }
        }
        else // if we are here, it means we just have to copy the shape and null other parts
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              buffer[index] = bufs[index] * op;
            }
        }

        if(darktable.unmuted & DT_DEBUG_PERF)
          dt_print(DT_DEBUG_MASKS, "[masks %d] combine took %0.04f sec\n", nb_ok, dt_get_wtime() - start2);
        start2 = dt_get_wtime();
        nb_ok++;
      }
    }
    fpts = g_list_next(fpts);
  }

  // and we free the intermediate buffer
  dt_free_align(bufs);

  return (nb_ok != 0);
}
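// Compact restatement (not from the source) of the per-pixel combine rules
// above, with b1 the mask accumulated so far and b2 the new shape's value
// already scaled by its opacity:
static inline float combine_masks(const int state, const float b1, const float b2)
{
  if(state & DT_MASKS_STATE_UNION)        return fmaxf(b1, b2);
  if(state & DT_MASKS_STATE_INTERSECTION) return (b1 > 0.0f && b2 > 0.0f) ? fminf(b1, b2) : 0.0f;
  if(state & DT_MASKS_STATE_DIFFERENCE)   return (b1 > 0.0f && b2 > 0.0f) ? b1 * (1.0f - b2) : b1;
  if(state & DT_MASKS_STATE_EXCLUSION)
    return (b1 > 0.0f && b2 > 0.0f) ? fmaxf((1.0f - b1) * b2, b1 * (1.0f - b2)) : fmaxf(b1, b2);
  return b2; // no combine flag set: plain copy of the shape
}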
dt_imageio_retval_t dt_imageio_open_gm( dt_image_t *img, const char *filename, dt_mipmap_cache_allocator_t a) { int err = DT_IMAGEIO_FILE_CORRUPTED; float *buf = NULL; ExceptionInfo exception; Image *image = NULL; ImageInfo *image_info = NULL; uint32_t width, height, orientation; if(!_supported_image(filename)) return DT_IMAGEIO_FILE_CORRUPTED; if(!img->exif_inited) (void) dt_exif_read(img, filename); GetExceptionInfo(&exception); image_info=CloneImageInfo((ImageInfo *) NULL); g_strlcpy(image_info->filename,filename,sizeof(image_info->filename)); image=ReadImage(image_info,&exception); if (exception.severity != UndefinedException) CatchException(&exception); if (!image) { fprintf(stderr, "[GraphicsMagick_open] image `%s' not found\n", img->filename); err = DT_IMAGEIO_FILE_NOT_FOUND; goto error; } width = image->columns; height = image->rows; orientation = image->orientation; if(orientation & 4) { img->width = height; img->height = width; } else { img->width = width; img->height = height; } img->bpp = 4*sizeof(float); float *mipbuf = (float *)dt_mipmap_cache_alloc(img, DT_MIPMAP_FULL, a); if(!mipbuf) { fprintf(stderr, "[GraphicsMagick_open] could not alloc full buffer for image `%s'\n", img->filename); err = DT_IMAGEIO_CACHE_FULL; goto error; } buf = (float *)dt_alloc_align(16, width*img->bpp); if(!buf) goto error; const int ht2 = orientation & 4 ? img->width : img->height; // pretend unrotated, rotate in write_pos const int wd2 = orientation & 4 ? img->height : img->width; for (uint32_t row = 0; row < height; row++) { int ret = DispatchImage(image, 0, row, width, 1, "RGBP", FloatPixel, (void *)buf, &exception); if (exception.severity != UndefinedException) CatchException(&exception); if(ret != MagickPass) { fprintf(stderr, "[GraphicsMagick_open] error reading image `%s'\n", img->filename); err = DT_IMAGEIO_FILE_CORRUPTED; goto error; } for(uint32_t i=0; i<width; i++) for(int k=0; k<4; k++) mipbuf[4*dt_imageio_write_pos(i, row, wd2, ht2, wd2, ht2, orientation) + k] = buf[4*i + k]; } if(buf) dt_free_align(buf); if(image) DestroyImage(image); if(image_info) DestroyImageInfo(image_info); DestroyExceptionInfo(&exception); img->filters = 0; img->flags &= ~DT_IMAGE_RAW; img->flags &= ~DT_IMAGE_HDR; img->flags |= DT_IMAGE_LDR; return DT_IMAGEIO_OK; error: if(buf) dt_free_align(buf); if(image) DestroyImage(image); if(image_info) DestroyImageInfo(image_info); DestroyExceptionInfo(&exception); return err; }
// callback for the cache backend to initialize payload pointers void dt_mipmap_cache_allocate_dynamic(void *data, dt_cache_entry_t *entry) { dt_mipmap_cache_t *cache = (dt_mipmap_cache_t *)data; // for full image buffers struct dt_mipmap_buffer_dsc *dsc = entry->data; const dt_mipmap_size_t mip = get_size(entry->key); // alloc mere minimum for the header + broken image buffer: if(!dsc) { if(mip <= DT_MIPMAP_F) { // these are fixed-size: entry->data = dt_alloc_align(16, cache->buffer_size[mip]); } else { entry->data = dt_alloc_align(16, sizeof(*dsc) + sizeof(float) * 4 * 64); } // fprintf(stderr, "[mipmap cache] alloc dynamic for key %u %p\n", key, *buf); if(!(entry->data)) { fprintf(stderr, "[mipmap cache] memory allocation failed!\n"); exit(1); } dsc = entry->data; if(mip <= DT_MIPMAP_F) { dsc->width = cache->max_width[mip]; dsc->height = cache->max_height[mip]; dsc->size = cache->buffer_size[mip]; dsc->color_space = DT_COLORSPACE_NONE; } else { dsc->width = 0; dsc->height = 0; dsc->color_space = DT_COLORSPACE_NONE; dsc->size = sizeof(*dsc) + sizeof(float) * 4 * 64; } } assert(dsc->size >= sizeof(*dsc)); int loaded_from_disk = 0; if(mip < DT_MIPMAP_F) { if(cache->cachedir[0] && dt_conf_get_bool("cache_disk_backend")) { // try and load from disk, if successful set flag char filename[PATH_MAX] = {0}; snprintf(filename, sizeof(filename), "%s.d/%d/%d.jpg", cache->cachedir, mip, get_imgid(entry->key)); FILE *f = fopen(filename, "rb"); if(f) { long len = 0; uint8_t *blob = 0; fseek(f, 0, SEEK_END); len = ftell(f); if(len <= 0) goto read_error; // coverity madness blob = (uint8_t *)malloc(len); if(!blob) goto read_error; fseek(f, 0, SEEK_SET); int rd = fread(blob, sizeof(uint8_t), len, f); if(rd != len) goto read_error; dt_colorspaces_color_profile_type_t color_space; dt_imageio_jpeg_t jpg; if(dt_imageio_jpeg_decompress_header(blob, len, &jpg) || (jpg.width > cache->max_width[mip] || jpg.height > cache->max_height[mip]) || ((color_space = dt_imageio_jpeg_read_color_space(&jpg)) == DT_COLORSPACE_NONE) // pointless test to keep it in the if clause || dt_imageio_jpeg_decompress(&jpg, entry->data + sizeof(*dsc))) { fprintf(stderr, "[mipmap_cache] failed to decompress thumbnail for image %d from `%s'!\n", get_imgid(entry->key), filename); goto read_error; } dsc->width = jpg.width; dsc->height = jpg.height; dsc->color_space = color_space; loaded_from_disk = 1; if(0) { read_error: g_unlink(filename); } free(blob); fclose(f); } } } if(!loaded_from_disk) dsc->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE; else dsc->flags = 0; // cost is just flat one for the buffer, as the buffers might have different sizes, // to make sure quota is meaningful. if(mip >= DT_MIPMAP_F) entry->cost = 1; else entry->cost = cache->buffer_size[mip]; }
// internal function: to avoid exif blob reading + 8-bit byteorder flag + high-quality override int dt_imageio_export_with_flags( const uint32_t imgid, const char *filename, dt_imageio_module_format_t *format, dt_imageio_module_data_t *format_params, const int32_t ignore_exif, const int32_t display_byteorder, const int32_t high_quality, const int32_t thumbnail_export) { dt_develop_t dev; dt_dev_init(&dev, 0); dt_mipmap_buffer_t buf; dt_mipmap_cache_read_get(darktable.mipmap_cache, &buf, imgid, DT_MIPMAP_FULL, DT_MIPMAP_BLOCKING); dt_dev_load_image(&dev, imgid); const dt_image_t *img = &dev.image_storage; const int wd = img->width; const int ht = img->height; int res = 0; dt_times_t start; dt_get_times(&start); dt_dev_pixelpipe_t pipe; res = thumbnail_export ? dt_dev_pixelpipe_init_thumbnail(&pipe, wd, ht) : dt_dev_pixelpipe_init_export(&pipe, wd, ht); if(!res) { dt_control_log(_("failed to allocate memory for export, please lower the threads used for export or buy more memory.")); dt_dev_cleanup(&dev); if(buf.buf) dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf); return 1; } if(!buf.buf) { dt_control_log(_("image `%s' is not available!"), img->filename); dt_dev_cleanup(&dev); return 1; } dt_dev_pixelpipe_set_input(&pipe, &dev, (float *)buf.buf, buf.width, buf.height, 1.0); dt_dev_pixelpipe_create_nodes(&pipe, &dev); dt_dev_pixelpipe_synch_all(&pipe, &dev); dt_dev_pixelpipe_get_dimensions(&pipe, &dev, pipe.iwidth, pipe.iheight, &pipe.processed_width, &pipe.processed_height); dt_show_times(&start, "[export] creating pixelpipe", NULL); // find output color profile for this image: int sRGB = 1; gchar *overprofile = dt_conf_get_string("plugins/lighttable/export/iccprofile"); if(overprofile && !strcmp(overprofile, "sRGB")) { sRGB = 1; } else if(!overprofile || !strcmp(overprofile, "image")) { GList *modules = dev.iop; dt_iop_module_t *colorout = NULL; while (modules) { colorout = (dt_iop_module_t *)modules->data; if (strcmp(colorout->op, "colorout") == 0) { dt_iop_colorout_params_t *p = (dt_iop_colorout_params_t *)colorout->params; if(!strcmp(p->iccprofile, "sRGB")) sRGB = 1; else sRGB = 0; } modules = g_list_next(modules); } } else { sRGB = 0; } g_free(overprofile); // get only once at the beginning, in case the user changes it on the way: const int high_quality_processing = ((format_params->max_width == 0 || format_params->max_width >= pipe.processed_width ) && (format_params->max_height == 0 || format_params->max_height >= pipe.processed_height)) ? 0 : high_quality; const int width = high_quality_processing ? 0 : format_params->max_width; const int height = high_quality_processing ? 0 : format_params->max_height; const float scalex = width > 0 ? fminf(width /(float)pipe.processed_width, 1.0) : 1.0; const float scaley = height > 0 ? fminf(height/(float)pipe.processed_height, 1.0) : 1.0; const float scale = fminf(scalex, scaley); int processed_width = scale*pipe.processed_width; int processed_height = scale*pipe.processed_height; const int bpp = format->bpp(format_params); // downsampling done last, if high quality processing was requested: uint8_t *outbuf = pipe.backbuf; uint8_t *moutbuf = NULL; // keep track of alloc'ed memory if(high_quality_processing) { dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale); const float scalex = format_params->max_width > 0 ? fminf(format_params->max_width /(float)pipe.processed_width, 1.0) : 1.0; const float scaley = format_params->max_height > 0 ? 
        fminf(format_params->max_height/(float)pipe.processed_height, 1.0) : 1.0;
    const float scale = fminf(scalex, scaley);
    processed_width = scale*pipe.processed_width + .5f;
    processed_height = scale*pipe.processed_height + .5f;
    moutbuf = (uint8_t *)dt_alloc_align(64, sizeof(float)*processed_width*processed_height*4);
    outbuf = moutbuf;
    // now downscale into the new buffer:
    dt_iop_roi_t roi_in, roi_out;
    roi_in.x = roi_in.y = roi_out.x = roi_out.y = 0;
    roi_in.scale = 1.0;
    roi_out.scale = scale;
    roi_in.width = pipe.processed_width;
    roi_in.height = pipe.processed_height;
    roi_out.width = processed_width;
    roi_out.height = processed_height;
    dt_iop_clip_and_zoom((float *)outbuf, (float *)pipe.backbuf, &roi_out, &roi_in, processed_width, pipe.processed_width);
  }
  else
  {
    // do the processing (8-bit with special treatment, to make sure we can use openmp further down):
    if(bpp == 8)
      dt_dev_pixelpipe_process(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    else
      dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    outbuf = pipe.backbuf;
  }

  // downconversion to low-precision formats:
  if(bpp == 8 && !display_byteorder)
  {
    // ldr output: char
    if(high_quality_processing)
    {
      const float *const inbuf = (float *)outbuf;
      for(int k=0; k<processed_width*processed_height; k++)
      {
        // convert in place, this is unfortunately very serial..
        const uint8_t r = CLAMP(inbuf[4*k+0]*0xff, 0, 0xff);
        const uint8_t g = CLAMP(inbuf[4*k+1]*0xff, 0, 0xff);
        const uint8_t b = CLAMP(inbuf[4*k+2]*0xff, 0, 0xff);
        outbuf[4*k+0] = r;
        outbuf[4*k+1] = g;
        outbuf[4*k+2] = b;
      }
    }
    else
    {
      uint8_t *const buf8 = pipe.backbuf;
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(processed_width, processed_height) schedule(static)
#endif
      // just flip byte order
      for(int k=0; k<processed_width*processed_height; k++)
      {
        uint8_t tmp = buf8[4*k+0];
        buf8[4*k+0] = buf8[4*k+2];
        buf8[4*k+2] = tmp;
      }
    }
  }
  else if(bpp == 16)
  {
    // uint16_t per color channel
    float *buff = (float *)outbuf;
    uint16_t *buf16 = (uint16_t *)outbuf;
    for(int y=0; y<processed_height; y++) for(int x=0; x<processed_width ; x++)
    {
      // convert in place
      const int k = x + processed_width*y;
      for(int i=0; i<3; i++) buf16[4*k+i] = CLAMP(buff[4*k+i]*0x10000, 0, 0xffff);
    }
  }
  // else output float, no further harm done to the pixels :)

  format_params->width = processed_width;
  format_params->height = processed_height;
  if(!ignore_exif)
  {
    int length;
    uint8_t exif_profile[65535]; // C++ alloc'ed buffer is uncool, so we waste some bits here.
    char pathname[1024];
    dt_image_full_path(imgid, pathname, 1024);
    length = dt_exif_read_blob(exif_profile, pathname, sRGB, imgid);
    res = format->write_image (format_params, filename, outbuf, exif_profile, length, imgid);
  }
  else
  {
    res = format->write_image (format_params, filename, outbuf, NULL, 0, imgid);
  }
  dt_dev_pixelpipe_cleanup(&pipe);
  dt_dev_cleanup(&dev);
  dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
  dt_free_align(moutbuf); // moutbuf came from dt_alloc_align(), so it needs the aligned free
  return res;
}
static void generate_thumbnail_cache() { const int max_mip = DT_MIPMAP_2; fprintf(stderr, _("creating cache directories\n")); char filename[PATH_MAX] = {0}; for(int k=DT_MIPMAP_0;k<=max_mip;k++) { snprintf(filename, sizeof(filename), "%s.d/%d", darktable.mipmap_cache->cachedir, k); fprintf(stderr, _("creating cache directory '%s'\n"), filename); int mkd = g_mkdir_with_parents(filename, 0750); if(mkd) { fprintf(stderr, _("could not create directory '%s'!\n"), filename); return; } } // some progress counter sqlite3_stmt *stmt; uint64_t image_count = 0, counter = 0; DT_DEBUG_SQLITE3_PREPARE_V2(dt_database_get(darktable.db), "select count(id) from images", -1, &stmt, 0); if(sqlite3_step(stmt) == SQLITE_ROW) image_count = sqlite3_column_int(stmt, 0); sqlite3_finalize(stmt); // go through all images: DT_DEBUG_SQLITE3_PREPARE_V2(dt_database_get(darktable.db), "select id from images", -1, &stmt, 0); // could only alloc max_mip-1, but would need to detect the special case that max==0. const size_t bufsize = (size_t)4 * darktable.mipmap_cache->max_width[max_mip] * darktable.mipmap_cache->max_height[max_mip]; uint8_t *tmp = (uint8_t *)dt_alloc_align(16, bufsize); if(!tmp) { fprintf(stderr, "couldn't allocate temporary memory!\n"); sqlite3_finalize(stmt); return; } const int cache_quality = MIN(100, MAX(10, dt_conf_get_int("database_cache_quality"))); while(sqlite3_step(stmt) == SQLITE_ROW) { const int32_t imgid = sqlite3_column_int(stmt, 0); // check whether all of these files are already there int all_exist = 1; for(int k=max_mip;k>=DT_MIPMAP_0;k--) { snprintf(filename, sizeof(filename), "%s.d/%d/%d.jpg", darktable.mipmap_cache->cachedir, k, imgid); all_exist &= !access(filename, R_OK); } if(all_exist) goto next; dt_mipmap_buffer_t buf; // get largest thumbnail for this image // this one will take care of itself, we'll just write out the lower thumbs manually: dt_mipmap_cache_get(darktable.mipmap_cache, &buf, imgid, max_mip, DT_MIPMAP_BLOCKING, 'r'); if(buf.width > 8 && buf.height > 8) // don't create for skulls for(int k=max_mip-1;k>=DT_MIPMAP_0;k--) { uint32_t width, height; const int wd = darktable.mipmap_cache->max_width[k]; const int ht = darktable.mipmap_cache->max_height[k]; // use exactly the same mechanism as the cache internally to rescale the thumbnail: dt_iop_flip_and_zoom_8(buf.buf, buf.width, buf.height, tmp, wd, ht, 0, &width, &height); snprintf(filename, sizeof(filename), "%s.d/%d/%d.jpg", darktable.mipmap_cache->cachedir, k, imgid); FILE *f = fopen(filename, "wb"); if(f) { // allocate temp memory: uint8_t *blob = (uint8_t *)malloc(bufsize); if(!blob) goto write_error; const int32_t length = dt_imageio_jpeg_compress(tmp, blob, width, height, cache_quality); assert(length <= bufsize); int written = fwrite(blob, sizeof(uint8_t), length, f); if(written != length) { write_error: unlink(filename); } free(blob); fclose(f); } } dt_mipmap_cache_release(darktable.mipmap_cache, &buf); next: counter ++; fprintf(stderr, "\rimage %lu/%lu (%.02f%%) ", counter, image_count, 100.0*counter/(float)image_count); } dt_free_align(tmp); sqlite3_finalize(stmt); fprintf(stderr, "done \n"); }
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid,
             const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->mode == DT_PROFILE_GAMUTCHECK);

  if(!isnan(d->cmatrix[0]))
  {
    // fprintf(stderr,"Using cmatrix codepath\n");
    // convert to rgb using matrix
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in, roi_out, ivoid, ovoid)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      const __m128 m0 = _mm_set_ps(0.0f, d->cmatrix[6], d->cmatrix[3], d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f, d->cmatrix[7], d->cmatrix[4], d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f, d->cmatrix[8], d->cmatrix[5], d->cmatrix[2]);
      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        const __m128 t = _mm_add_ps(_mm_mul_ps(m0, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(0, 0, 0, 0))),
                                    _mm_add_ps(_mm_mul_ps(m1, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(1, 1, 1, 1))),
                                               _mm_mul_ps(m2, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(2, 2, 2, 2)))));
        _mm_stream_ps(out, t);
      }
    }
    _mm_sfence();

    // apply profile
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in, roi_out, ivoid, ovoid)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        // per-channel curve; the channel counter must not shadow the pixel counter i
        for(int c = 0; c < 3; c++)
          if(d->lut[c][0] >= 0.0f)
          {
            out[c] = (out[c] < 1.0f) ? lerp_lut(d->lut[c], out[c])
                                     : dt_iop_eval_exp(d->unbounded_coeffs[c], out[c]);
          }
      }
    }
  }
  else
  {
    // fprintf(stderr,"Using xform codepath\n");
    const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(ivoid, ovoid, roi_out)
#endif
    for(int k = 0; k < roi_out->height; k++)
    {
      const float *in = ((float *)ivoid) + (size_t)ch * k * roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch * k * roi_out->width;

      if(!gamutcheck)
      {
        cmsDoTransform(d->xform, in, out, roi_out->width);
      }
      else
      {
        void *rgb = dt_alloc_align(16, 4 * sizeof(float) * roi_out->width);
        cmsDoTransform(d->xform, in, rgb, roi_out->width);
        float *rgbptr = (float *)rgb;
        for(int j = 0; j < roi_out->width; j++, rgbptr += 4, out += 4)
        {
          const __m128 pixel = _mm_load_ps(rgbptr);
          __m128 ingamut = _mm_cmplt_ps(pixel, _mm_set_ps(-FLT_MAX, 0.0f, 0.0f, 0.0f));
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));
          const __m128 result = _mm_or_ps(_mm_and_ps(ingamut, outofgamutpixel), _mm_andnot_ps(ingamut, pixel));
          _mm_stream_ps(out, result);
        }
        dt_free_align(rgb);
      }
    }
    _mm_sfence();
  }

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
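// Scalar restatement (not from the source) of the sse gamut test above: a
// pixel is painted with the out-of-gamut marker colour (0,1,1) as soon as
// any of its three colour channels came out negative from the lcms transform.
for(int j = 0; j < roi_out->width; j++, rgbptr += 4, out += 4)
{
  const int oog = (rgbptr[0] < 0.0f) || (rgbptr[1] < 0.0f) || (rgbptr[2] < 0.0f);
  out[0] = oog ? 0.0f : rgbptr[0];
  out[1] = oog ? 1.0f : rgbptr[1];
  out[2] = oog ? 1.0f : rgbptr[2];
  out[3] = oog ? 0.0f : rgbptr[3];
}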
/** Prepares a 1D resampling plan
 *
 * This consists of the following information:
 * <ul>
 * <li>A list of lengths that tell how many pixels are relevant for the
 * next output</li>
 * <li>A list of required filter kernels</li>
 * <li>A list of sample indexes</li>
 * </ul>
 *
 * How to apply the resampling plan:
 * <ol>
 * <li>Pick a length from the length array</li>
 * <li>until length is reached
 * <ol>
 * <li>pick a kernel tap</li>
 * <li>pick the relevant sample according to the picked index</li>
 * <li>multiply them and accumulate</li>
 * </ol>
 * </li>
 * <li>here goes a single output sample</li>
 * </ol>
 *
 * Repeat until you reach the number of output pixels.
 *
 * @param itor interpolator used to resample
 * @param in [in] Number of input samples
 * @param out [in] Number of output samples
 * @param plength [out] Array of lengths for each pixel filtering (number
 * of taps/indexes to use). This array must be freed with dt_free_align() when
 * you're done with the plan.
 * @param pkernel [out] Array of filter kernel taps
 * @param pindex [out] Array of sample indexes to be used for applying each kernel tap
 * @param pmeta [out] Array of int triplets (length, kernel, index) telling where to start for an arbitrary out position meta[3*out]
 * @return 0 for success, !0 for failure
 */
static int prepare_resampling_plan(const struct dt_interpolation* itor, int in, const int in_x0,
                                   int out, const int out_x0, float scale, int** plength,
                                   float** pkernel, int** pindex, int** pmeta)
{
  // Safe return values
  *plength = NULL;
  *pkernel = NULL;
  *pindex = NULL;
  if (pmeta) {
    *pmeta = NULL;
  }

  if (scale == 1.f) {
    // No resampling required
    return 0;
  }

  // Compute common upsampling/downsampling memory requirements
  int maxtapsapixel;
  if (scale > 1.f) {
    // Upscale... the easy one. The values are exact
    maxtapsapixel = 2*itor->width;
  } else {
    // Downscale... going for worst case values memory wise
    maxtapsapixel = ceil_fast((float)2*(float)itor->width/scale);
  }

  int nlengths = out;
  int nindex = maxtapsapixel*out;
  int nkernel = maxtapsapixel*out;
  size_t lengthreq = increase_for_alignment(nlengths*sizeof(int), SSE_ALIGNMENT);
  size_t indexreq = increase_for_alignment(nindex*sizeof(int), SSE_ALIGNMENT);
  size_t kernelreq = increase_for_alignment(nkernel*sizeof(float), SSE_ALIGNMENT);
  size_t scratchreq = maxtapsapixel*sizeof(float) + 4*sizeof(float);
  // NB: because sse versions compute four taps a time
  size_t metareq = pmeta ? 3*sizeof(int)*out : 0;

  void *blob = NULL;
  size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
  blob = dt_alloc_align(SSE_ALIGNMENT, totalreq);
  if (!blob) {
    return 1;
  }

  int* lengths = (int*)blob;
  blob = (char*)blob + lengthreq;
  int* index = (int*)blob;
  blob = (char*)blob + indexreq;
  float* kernel = (float*)blob;
  blob = (char*)blob + kernelreq;
  float* scratchpad = scratchreq ? (float*)blob : NULL;
  blob = (char*)blob + scratchreq;
  int* meta = metareq ?
(int*)blob : NULL; blob = (char*)blob + metareq; /* setting this as a const should help the compilers trim all unecessary * codepaths */ const enum border_mode bordermode = RESAMPLING_BORDER_MODE; /* Upscale and downscale differ in subtle points, getting rid of code * duplication might have been tricky and i prefer keeping the code * as straight as possible */ if (scale > 1.f) { int kidx = 0; int iidx = 0; int lidx = 0; int midx = 0; for (int x=0; x<out; x++) { if (meta) { meta[midx++] = lidx; meta[midx++] = kidx; meta[midx++] = iidx; } // Projected position in input samples float fx = (float)(out_x0 + x)/scale; // Compute the filter kernel at that position int first; compute_upsampling_kernel_sse(itor, scratchpad, NULL, &first, fx); /* Check lower and higher bound pixel index and skip as many pixels as * necessary to fall into range */ int tap_first; int tap_last; prepare_tap_boundaries(&tap_first, &tap_last, bordermode, 2*itor->width, first, in); // Track number of taps that will be used lengths[lidx++] = tap_last - tap_first; // Precompute the inverse of the norm float norm = 0.f; for (int tap=tap_first; tap<tap_last; tap++) { norm += scratchpad[tap]; } norm = 1.f/norm; /* Unlike single pixel or single sample code, here it's interesting to * precompute the normalized filter kernel as this will avoid dividing * by the norm for all processed samples/pixels * NB: use the same loop to put in place the index list */ first += tap_first; for (int tap=tap_first; tap<tap_last; tap++) { kernel[kidx++] = scratchpad[tap]*norm; index[iidx++] = clip(first++, 0, in-1, bordermode); } } } else { int kidx = 0; int iidx = 0; int lidx = 0; int midx = 0; for (int x=0; x<out; x++) { if (meta) { meta[midx++] = lidx; meta[midx++] = kidx; meta[midx++] = iidx; } // Compute downsampling kernel centered on output position int taps; int first; compute_downsampling_kernel_sse(itor, &taps, &first, scratchpad, NULL, scale, out_x0 + x); /* Check lower and higher bound pixel index and skip as many pixels as * necessary to fall into range */ int tap_first; int tap_last; prepare_tap_boundaries(&tap_first, &tap_last, bordermode, taps, first, in); // Track number of taps that will be used lengths[lidx++] = tap_last - tap_first; // Precompute the inverse of the norm float norm = 0.f; for (int tap=tap_first; tap<tap_last; tap++) { norm += scratchpad[tap]; } norm = 1.f/norm; /* Unlike single pixel or single sample code, here it's interesting to * precompute the normalized filter kernel as this will avoid dividing * by the norm for all processed samples/pixels * NB: use the same loop to put in place the index list */ first += tap_first; for (int tap=tap_first; tap<tap_last; tap++) { kernel[kidx++] = scratchpad[tap]*norm; index[iidx++] = clip(first++, 0, in-1, bordermode); } } } // Validate plan wrt caller *plength = lengths; *pindex = index; *pkernel = kernel; if (pmeta) { *pmeta = meta; } return 0; }
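// Sketch (not from the source) of how a consumer walks the plan built above,
// following the recipe in the doc comment; nout, src and dst are hypothetical
// names for the output sample count and the sample buffers of one 1D pass:
int lidx = 0, kidx = 0, iidx = 0;
for(int x = 0; x < nout; x++)
{
  const int taps = lengths[lidx++];
  float acc = 0.f;
  for(int t = 0; t < taps; t++)
    acc += kernel[kidx++] * src[index[iidx++]];
  dst[x] = acc; // kernels are pre-normalised, so no division by the norm here
}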
/* if a module does not implement process_tiling() by itself, this function is called instead. default_process_tiling() is able to handle standard cases where pixels change their values but not their places. */ void default_process_tiling (struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out, const int in_bpp) { void *input = NULL; void *output = NULL; /* we only care for the most simple cases ATM. else try to process the standard way, i.e. in one chunk. let's hope for the best... */ if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t))) { dt_print(DT_DEBUG_DEV, "[default_process_tiling] cannot handle requested roi's. fall back to standard method for module '%s'\n", self->op); goto fallback; } const int out_bpp = self->output_bpp(self, piece->pipe, piece); const int ipitch = roi_in->width * in_bpp; const int opitch = roi_out->width * out_bpp; /* get tiling requirements of module */ dt_develop_tiling_t tiling = { 0 }; self->tiling_callback(self, piece, roi_in, roi_out, &tiling); /* tiling really does not make sense in these cases. standard process() is not better or worse than we are */ if(tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_out->width * roi_out->height * max(in_bpp, out_bpp)) { dt_print(DT_DEBUG_DEV, "[default_process_tiling] don't use tiling for module '%s'. no real memory saving could be reached\n", self->op); goto fallback; } /* calculate optimal size of tiles */ float available = dt_conf_get_int("host_memory_limit")*1024*1024; assert(available >= 500*1024*1024); /* correct for size of ivoid and ovoid which are needed on top of tiling */ available = max(available - roi_out->width * roi_out->height * (in_bpp + out_bpp) - tiling.overhead, 0); /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor. this will mainly allow tiling for modules with high and "unpredictable" memory demand which is reflected in high values of tiling.factor (take bilateral noise reduction as an example). */ float singlebuffer = dt_conf_get_int("singlebuffer_limit")*1024*1024; singlebuffer = max(singlebuffer, 1024*1024); assert(tiling.factor > 1.0f); singlebuffer = max(available / tiling.factor, singlebuffer); int width = roi_out->width; int height = roi_out->height; /* shrink tile size in case it would exceed singlebuffer size */ if(width*height*max(in_bpp, out_bpp) > singlebuffer) { const float scale = singlebuffer/(width*height*max(in_bpp, out_bpp)); /* TODO: can we make this more efficient to minimize total overlap between tiles? */ if(width < height && scale >= 0.333f) { height = floorf(height * scale); } else if(height <= width && scale >= 0.333f) { width = floorf(width * scale); } else { width = floorf(width * sqrt(scale)); height = floorf(height * sqrt(scale)); } } /* make sure we have a reasonably effective tile dimension. if not try square tiles */ if(3*tiling.overlap > width || 3*tiling.overlap > height) { width = height = floorf(sqrtf((float)width*height)); } #if 0 /* we might want to grow dimensions a bit */ width = max(4*tiling.overlap, width); height = max(4*tiling.overlap, height); #endif /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled. Modules will report alignment requirements via xalign and yalign within tiling_callback(). Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y direction. 
We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height that is identical to image width/height no special alignment is needed. */ const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign); assert(xyalign != 0); /* properly align tile width and height by making them smaller if needed */ if(width < roi_out->width) width = (width / xyalign) * xyalign; if(height < roi_out->height) height = (height / xyalign) * xyalign; /* also make sure that overlap follows alignment rules by making it wider when needed */ const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign : tiling.overlap; /* calculate effective tile size */ const int tile_wd = width - 2*overlap > 0 ? width - 2*overlap : 1; const int tile_ht = height - 2*overlap > 0 ? height - 2*overlap : 1; /* calculate number of tiles */ const int tiles_x = width < roi_out->width ? ceilf(roi_out->width /(float)tile_wd) : 1; const int tiles_y = height < roi_out->height ? ceilf(roi_out->height/(float)tile_ht) : 1; /* sanity check: don't run wild on too many tiles */ if(tiles_x * tiles_y > DT_TILING_MAXTILES) { dt_print(DT_DEBUG_DEV, "[default_process_tiling] gave up tiling for module '%s'. too many tiles: %d x %d\n", self->op, tiles_x, tiles_y); goto error; } dt_print(DT_DEBUG_DEV, "[default_process_tiling] use tiling on module '%s' for image with full size %d x %d\n", self->op, roi_out->width, roi_out->height); dt_print(DT_DEBUG_DEV, "[default_process_tiling] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n", tiles_x, tiles_y, width, height, overlap); /* reserve input and output buffers for tiles */ input = dt_alloc_align(64, width*height*in_bpp); if(input == NULL) { dt_print(DT_DEBUG_DEV, "[default_process_tiling] could not alloc input buffer for module '%s'\n", self->op); goto error; } output = dt_alloc_align(64, width*height*out_bpp); if(output == NULL) { dt_print(DT_DEBUG_DEV, "[default_process_tiling] could not alloc output buffer for module '%s'\n", self->op); goto error; } /* store processed_maximum to be re-used and aggregated */ float processed_maximum_saved[3]; float processed_maximum_new[3] = { 1.0f }; for(int k=0; k<3; k++) processed_maximum_saved[k] = piece->pipe->processed_maximum[k]; /* iterate over tiles */ for(int tx=0; tx<tiles_x; tx++) for(int ty=0; ty<tiles_y; ty++) { size_t wd = tx * tile_wd + width > roi_out->width ? roi_out->width - tx * tile_wd : width; size_t ht = ty * tile_ht + height > roi_out->height ? 
roi_out->height - ty * tile_ht : height; /* no need to process end-tiles that are smaller than overlap */ if((wd <= overlap && tx > 0) || (ht <= overlap && ty > 0)) continue; /* origin and region of effective part of tile, which we want to store later */ size_t origin[] = { 0, 0, 0 }; size_t region[] = { wd, ht, 1 }; /* roi_in and roi_out for process_cl on subbuffer */ dt_iop_roi_t iroi = { 0, 0, wd, ht, roi_in->scale }; dt_iop_roi_t oroi = { 0, 0, wd, ht, roi_out->scale }; /* offsets of tile into ivoid and ovoid */ size_t ioffs = (ty * tile_ht)*ipitch + (tx * tile_wd)*in_bpp; size_t ooffs = (ty * tile_ht)*opitch + (tx * tile_wd)*out_bpp; dt_print(DT_DEBUG_DEV, "[default_process_tiling] tile (%d, %d) with %zu x %zu at origin [%d, %d]\n", tx, ty, wd, ht, tx*tile_wd, ty*tile_ht); /* prepare input tile buffer */ #ifdef _OPENMP #pragma omp parallel for default(none) shared(input,width,ivoid,ioffs,wd,ht) schedule(static) #endif for(int j=0; j<ht; j++) memcpy((char *)input+j*wd*in_bpp, (char *)ivoid+ioffs+j*ipitch, wd*in_bpp); /* take original processed_maximum as starting point */ for(int k=0; k<3; k++) piece->pipe->processed_maximum[k] = processed_maximum_saved[k]; /* call process() of module */ self->process(self, piece, input, output, &iroi, &oroi); /* aggregate resulting processed_maximum */ /* TODO: check if there really can be differences between tiles and take appropriate action (calculate minimum, maximum, average, ...?) */ for(int k=0; k<3; k++) { if(tx+ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->processed_maximum[k]) > 1.0e-6f) dt_print(DT_DEBUG_DEV, "[default_process_tiling] processed_maximum[%d] differs between tiles in module '%s'\n", k, self->op); processed_maximum_new[k] = piece->pipe->processed_maximum[k]; } /* correct origin and region of tile for overlap. make sure that we only copy back the "good" part. */ if(tx > 0) { origin[0] += overlap; region[0] -= overlap; ooffs += overlap*out_bpp; } if(ty > 0) { origin[1] += overlap; region[1] -= overlap; ooffs += overlap*opitch; } /* copy "good" part of tile to output buffer */ #ifdef _OPENMP #pragma omp parallel for default(none) shared(ovoid,ooffs,output,width,origin,region,wd) schedule(static) #endif for(int j=0; j<region[1]; j++) memcpy((char *)ovoid+ooffs+j*opitch, (char *)output+((j+origin[1])*wd+origin[0])*out_bpp, region[0]*out_bpp); } /* copy back final processed_maximum */ for(int k=0; k<3; k++) piece->pipe->processed_maximum[k] = processed_maximum_new[k]; /* buffers came from dt_alloc_align(), so release them with dt_free_align(), not free() */ if(input != NULL) dt_free_align(input); if(output != NULL) dt_free_align(output); return; error: if(input != NULL) dt_free_align(input); if(output != NULL) dt_free_align(output); dt_print(DT_DEBUG_DEV, "[default_process_tiling] tiling failed for module '%s'\n", self->op); /* TODO: give a warning message to user */ return; fallback: if(input != NULL) dt_free_align(input); if(output != NULL) dt_free_align(output); dt_print(DT_DEBUG_DEV, "[default_process_tiling] fall back to standard processing for module '%s'\n", self->op); self->process(self, piece, ivoid, ovoid, roi_in, roi_out); return; }
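/* A self-contained sketch of the tile-grid arithmetic used above, with
 * illustrative numbers: the effective step between tiles is the buffer size
 * minus the overlap on both sides, and the tile counts are the image
 * dimensions divided by that step, rounded up. Compile with -lm. */
#include <math.h>
#include <stdio.h>

int main(void)
{
  const int image_w = 5000, image_h = 3000; // full output roi (example values)
  const int buf_w = 1024, buf_h = 1024;     // tile buffer dimensions
  const int overlap = 64;                   // module-requested overlap

  // effective part of each tile; clamped to 1 so we never divide by zero
  const int tile_wd = buf_w - 2 * overlap > 0 ? buf_w - 2 * overlap : 1;
  const int tile_ht = buf_h - 2 * overlap > 0 ? buf_h - 2 * overlap : 1;
  const int tiles_x = (int)ceilf(image_w / (float)tile_wd);
  const int tiles_y = (int)ceilf(image_h / (float)tile_ht);

  printf("%d x %d tiles of %d x %d (effective step %d x %d)\n",
         tiles_x, tiles_y, buf_w, buf_h, tile_wd, tile_ht);
  return 0;
}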
dt_imageio_retval_t dt_imageio_open_png(dt_image_t *img, const char *filename, dt_mipmap_buffer_t *mbuf) { const char *ext = filename + strlen(filename); while(*ext != '.' && ext > filename) ext--; if(strncmp(ext, ".png", 4) && strncmp(ext, ".PNG", 4)) return DT_IMAGEIO_FILE_CORRUPTED; if(!img->exif_inited) (void)dt_exif_read(img, filename); dt_imageio_png_t image; uint8_t *buf = NULL; uint32_t width, height; uint16_t bpp; if(read_header(filename, &image) != 0) return DT_IMAGEIO_FILE_CORRUPTED; width = img->width = image.width; height = img->height = image.height; bpp = image.bit_depth; img->bpp = 4 * sizeof(float); float *mipbuf = (float *)dt_mipmap_cache_alloc(mbuf, img); if(!mipbuf) { fclose(image.f); png_destroy_read_struct(&image.png_ptr, &image.info_ptr, NULL); fprintf(stderr, "[png_open] could not alloc full buffer for image `%s'\n", img->filename); return DT_IMAGEIO_CACHE_FULL; } buf = dt_alloc_align(16, (size_t)width * height * 3 * (bpp < 16 ? 1 : 2)); if(!buf) { fclose(image.f); png_destroy_read_struct(&image.png_ptr, &image.info_ptr, NULL); fprintf(stderr, "[png_open] could not alloc intermediate buffer for image `%s'\n", img->filename); return DT_IMAGEIO_CACHE_FULL; } if(read_image(&image, (void *)buf) != 0) { dt_free_align(buf); fprintf(stderr, "[png_open] could not read image `%s'\n", img->filename); return DT_IMAGEIO_FILE_CORRUPTED; } for(size_t j = 0; j < height; j++) { if(bpp < 16) for(size_t i = 0; i < width; i++) for(int k = 0; k < 3; k++) mipbuf[4 * (j * width + i) + k] = buf[3 * (j * width + i) + k] * (1.0f / 255.0f); else for(size_t i = 0; i < width; i++) for(int k = 0; k < 3; k++) mipbuf[4 * (j * width + i) + k] = (256.0f * buf[2 * (3 * (j * width + i) + k)] + buf[2 * (3 * (j * width + i) + k) + 1]) * (1.0f / 65535.0f); } dt_free_align(buf); return DT_IMAGEIO_OK; }
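/* A minimal standalone sketch of the sample conversion done in the loop
 * above: 8-bit samples scale by 1/255, while 16-bit samples sit in the
 * intermediate buffer as big-endian byte pairs and are rebuilt as
 * high*256 + low before scaling by 1/65535. The helper names are
 * illustrative, not part of the loader. */
#include <stdint.h>

static inline float png8_to_float(uint8_t v)
{
  return v * (1.0f / 255.0f);
}

static inline float png16be_to_float(const uint8_t *p) // p points at one 2-byte big-endian sample
{
  return (256.0f * p[0] + p[1]) * (1.0f / 65535.0f);
}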
void dt_mipmap_cache_init(dt_mipmap_cache_t *cache) { // make sure static memory is initialized struct dt_mipmap_buffer_dsc *dsc = (struct dt_mipmap_buffer_dsc *)dt_mipmap_cache_static_dead_image; dead_image_f((dt_mipmap_buffer_t *)(dsc+1)); cache->compression_type = 0; gchar *compression = dt_conf_get_string("cache_compression"); if(compression) { if(!strcmp(compression, "low quality (fast)")) cache->compression_type = 1; else if(!strcmp(compression, "high quality (slow)")) cache->compression_type = 2; g_free(compression); } dt_print(DT_DEBUG_CACHE, "[mipmap_cache_init] using %s\n", cache->compression_type == 0 ? "no compression" : (cache->compression_type == 1 ? "low quality compression" : "slow high quality compression")); // adjust numbers to be large enough to hold what mem limit suggests. // we want at least 100MB, and consider 8G just still reasonable. size_t max_mem = CLAMPS(dt_conf_get_int64("cache_memory"), 100u<<20, ((uint64_t)8)<<30); const uint32_t parallel = CLAMP(dt_conf_get_int ("worker_threads")*dt_conf_get_int("parallel_export"), 1, 8); const int32_t max_size = 2048, min_size = 32; int32_t wd = darktable.thumbnail_width; int32_t ht = darktable.thumbnail_height; wd = CLAMPS(wd, min_size, max_size); ht = CLAMPS(ht, min_size, max_size); // round up to a multiple of 16, so we can divide by two four times: if(wd & 0xf) wd = (wd & ~0xf) + 0x10; if(ht & 0xf) ht = (ht & ~0xf) + 0x10; // cache these, can't change at runtime: cache->mip[DT_MIPMAP_F].max_width = wd; cache->mip[DT_MIPMAP_F].max_height = ht; cache->mip[DT_MIPMAP_F-1].max_width = wd; cache->mip[DT_MIPMAP_F-1].max_height = ht; for(int k=DT_MIPMAP_F-2; k>=DT_MIPMAP_0; k--) { cache->mip[k].max_width = cache->mip[k+1].max_width / 2; cache->mip[k].max_height = cache->mip[k+1].max_height / 2; } // initialize some per-thread cached scratchmem for uncompressed buffers during thumb creation: if(cache->compression_type) { cache->scratchmem.max_width = wd; cache->scratchmem.max_height = ht; cache->scratchmem.buffer_size = wd*ht*sizeof(uint32_t); cache->scratchmem.size = DT_MIPMAP_3; // at max. // TODO: use thread local storage instead (zero performance penalty on linux) dt_cache_init(&cache->scratchmem.cache, parallel, parallel, 64, 0.9f*parallel*wd*ht*sizeof(uint32_t)); // might have been rounded to power of two: const int cnt = dt_cache_capacity(&cache->scratchmem.cache); cache->scratchmem.buf = dt_alloc_align(64, cnt * wd*ht*sizeof(uint32_t)); dt_cache_static_allocation(&cache->scratchmem.cache, (uint8_t *)cache->scratchmem.buf, wd*ht*sizeof(uint32_t)); dt_cache_set_allocate_callback(&cache->scratchmem.cache, scratchmem_allocate, &cache->scratchmem); dt_print(DT_DEBUG_CACHE, "[mipmap_cache_init] cache has % 5d entries for temporary compression buffers (% 4.02f MB).\n", cnt, cnt* wd*ht*sizeof(uint32_t)/(1024.0*1024.0)); } for(int k=DT_MIPMAP_3; k>=0; k--) { // clear stats: cache->mip[k].stats_requests = 0; cache->mip[k].stats_near_match = 0; cache->mip[k].stats_misses = 0; cache->mip[k].stats_fetches = 0; cache->mip[k].stats_standin = 0; // buffer stores width and height + actual data const int width = cache->mip[k].max_width; const int height = cache->mip[k].max_height; // header + adjusted for dxt compression: cache->mip[k].buffer_size = 4*sizeof(uint32_t) + compressed_buffer_size(cache->compression_type, width, height); cache->mip[k].size = k; // the level of parallelism also gives the minimum size (which is twice that); // the size is rounded to a power of two by the cache anyway, so we might as well.
// XXX this needs adjustment for video mode (more full-res thumbs for replay) // TODO: collect hit/miss stats and auto-adjust to user browsing behaviour // TODO: can #prefetches be collected this way, too? const size_t max_mem2 = MAX(0, (k == 0) ? (max_mem) : (max_mem/(k+4))); uint32_t thumbnails = MAX(2, nearest_power_of_two((uint32_t)((double)max_mem2/cache->mip[k].buffer_size))); while(thumbnails > parallel && (size_t)thumbnails * cache->mip[k].buffer_size > max_mem2) thumbnails /= 2; // try to utilize that memory well (use 90% quota), the hopscotch paper claims good scalability up to // even more than that. dt_cache_init(&cache->mip[k].cache, thumbnails, parallel, 64, 0.9f*thumbnails*cache->mip[k].buffer_size); // might have been rounded to power of two: thumbnails = dt_cache_capacity(&cache->mip[k].cache); max_mem -= thumbnails * cache->mip[k].buffer_size; // dt_print(DT_DEBUG_CACHE, "[mipmap mem] %4.02f left\n", max_mem/(1024.0*1024.0)); cache->mip[k].buf = dt_alloc_align(64, thumbnails * cache->mip[k].buffer_size); dt_cache_static_allocation(&cache->mip[k].cache, (uint8_t *)cache->mip[k].buf, cache->mip[k].buffer_size); dt_cache_set_allocate_callback(&cache->mip[k].cache, dt_mipmap_cache_allocate, &cache->mip[k]); // dt_cache_set_cleanup_callback(&cache->mip[k].cache, // &dt_mipmap_cache_deallocate, &cache->mip[k]); dt_print(DT_DEBUG_CACHE, "[mipmap_cache_init] cache has % 5d entries for mip %d (% 4.02f MB).\n", thumbnails, k, thumbnails * cache->mip[k].buffer_size/(1024.0*1024.0)); } // full buffer needs dynamic alloc: const int full_entries = MAX(2, parallel); // even with one thread you want two buffers. one for dr one for thumbs. int32_t max_mem_bufs = nearest_power_of_two(full_entries); // for this buffer, because it can be very busy during import, we want the minimum // number of entries in the hashtable to be 16, but leave the quota as is. the dynamic // alloc/free properties of this cache take care that no more memory is required. dt_cache_init(&cache->mip[DT_MIPMAP_FULL].cache, max_mem_bufs, parallel, 64, max_mem_bufs); dt_cache_set_allocate_callback(&cache->mip[DT_MIPMAP_FULL].cache, dt_mipmap_cache_allocate_dynamic, &cache->mip[DT_MIPMAP_FULL]); // dt_cache_set_cleanup_callback(&cache->mip[DT_MIPMAP_FULL].cache, // &dt_mipmap_cache_deallocate_dynamic, &cache->mip[DT_MIPMAP_FULL]); cache->mip[DT_MIPMAP_FULL].buffer_size = 0; cache->mip[DT_MIPMAP_FULL].size = DT_MIPMAP_FULL; cache->mip[DT_MIPMAP_FULL].buf = NULL; // same for mipf: dt_cache_init(&cache->mip[DT_MIPMAP_F].cache, max_mem_bufs, parallel, 64, max_mem_bufs); dt_cache_set_allocate_callback(&cache->mip[DT_MIPMAP_F].cache, dt_mipmap_cache_allocate_dynamic, &cache->mip[DT_MIPMAP_F]); dt_cache_set_cleanup_callback(&cache->mip[DT_MIPMAP_F].cache, dt_mipmap_cache_deallocate_dynamic, &cache->mip[DT_MIPMAP_F]); cache->mip[DT_MIPMAP_F].buffer_size = 4*sizeof(uint32_t) + 4*sizeof(float) * cache->mip[DT_MIPMAP_F].max_width * cache->mip[DT_MIPMAP_F].max_height; cache->mip[DT_MIPMAP_F].size = DT_MIPMAP_F; cache->mip[DT_MIPMAP_F].buf = NULL; dt_mipmap_cache_deserialize(cache); }
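/* A standalone sketch of the per-level sizing above: the thumbnail
 * dimensions are first rounded up to a multiple of 16 (mirroring the
 * wd & 0xf logic in dt_mipmap_cache_init), then each lower mip level halves
 * both dimensions. The input values and the helper name round16 are
 * illustrative only. */
#include <stdio.h>

static int round16(int v) { return (v & 0xf) ? (v & ~0xf) + 0x10 : v; }

int main(void)
{
  int wd = round16(720), ht = round16(450); // -> 720 x 464
  for(int k = 3; k >= 0; k--) // from the largest fixed-size level down to mip 0
  {
    printf("mip %d: %d x %d\n", k, wd, ht);
    wd /= 2;
    ht /= 2;
  }
  return 0;
}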
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { dt_iop_global_tonemap_data_t *d = (dt_iop_global_tonemap_data_t *)piece->data; dt_iop_global_tonemap_global_data_t *gd = (dt_iop_global_tonemap_global_data_t *)self->data; dt_iop_global_tonemap_gui_data_t *g = (dt_iop_global_tonemap_gui_data_t *)self->gui_data; dt_bilateral_cl_t *b = NULL; cl_int err = -999; cl_mem dev_m = NULL; cl_mem dev_r = NULL; float *maximum = NULL; const int devid = piece->pipe->devid; int gtkernel = -1; const int width = roi_out->width; const int height = roi_out->height; float parameters[4] = { 0.0f }; switch(d->operator) { case OPERATOR_REINHARD: gtkernel = gd->kernel_global_tonemap_reinhard; break; case OPERATOR_DRAGO: gtkernel = gd->kernel_global_tonemap_drago; break; case OPERATOR_FILMIC: gtkernel = gd->kernel_global_tonemap_filmic; break; } if(d->operator== OPERATOR_DRAGO) { const float eps = 0.0001f; float tmp_lwmax = NAN; // see comments in process() about lwmax value if(self->dev->gui_attached && g && piece->pipe->type == DT_DEV_PIXELPIPE_FULL) { dt_pthread_mutex_lock(&g->lock); const uint64_t hash = g->hash; dt_pthread_mutex_unlock(&g->lock); if(hash != 0 && !dt_dev_sync_pixelpipe_hash(self->dev, piece->pipe, 0, self->priority, &g->lock, &g->hash)) dt_control_log(_("inconsistent output")); dt_pthread_mutex_lock(&g->lock); tmp_lwmax = g->lwmax; dt_pthread_mutex_unlock(&g->lock); } if(isnan(tmp_lwmax)) { dt_opencl_local_buffer_t flocopt = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1, .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 4, .sizey = 1 << 4 }; if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_first, &flocopt)) goto error; const size_t bwidth = ROUNDUP(width, flocopt.sizex); const size_t bheight = ROUNDUP(height, flocopt.sizey); const int bufsize = (bwidth / flocopt.sizex) * (bheight / flocopt.sizey); dt_opencl_local_buffer_t slocopt = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1, .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 16, .sizey = 1 }; if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_second, &slocopt)) goto error; const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex); size_t sizes[3]; size_t local[3]; dev_m = dt_opencl_alloc_device_buffer(devid, (size_t)bufsize * sizeof(float)); if(dev_m == NULL) goto error; dev_r = dt_opencl_alloc_device_buffer(devid, (size_t)reducesize * sizeof(float)); if(dev_r == NULL) goto error; sizes[0] = bwidth; sizes[1] = bheight; sizes[2] = 1; local[0] = flocopt.sizex; local[1] = flocopt.sizey; local[2] = 1; dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 0, sizeof(cl_mem), &dev_in); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 1, sizeof(int), &width); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 2, sizeof(int), &height); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 3, sizeof(cl_mem), &dev_m); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 4, flocopt.sizex * flocopt.sizey * sizeof(float), NULL); err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_pixelmax_first, sizes, local); if(err != CL_SUCCESS) goto error; sizes[0] = reducesize * slocopt.sizex; sizes[1] = 1; sizes[2] = 1; local[0] = slocopt.sizex; local[1] = 1; local[2] = 1; dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 0, sizeof(cl_mem), &dev_m); 
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 1, sizeof(cl_mem), &dev_r); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 2, sizeof(int), &bufsize); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 3, slocopt.sizex * sizeof(float), NULL); err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_pixelmax_second, sizes, local); if(err != CL_SUCCESS) goto error; maximum = dt_alloc_align(16, reducesize * sizeof(float)); if(maximum == NULL) goto error; err = dt_opencl_read_buffer_from_device(devid, (void *)maximum, dev_r, 0, (size_t)reducesize * sizeof(float), CL_TRUE); if(err != CL_SUCCESS) goto error; dt_opencl_release_mem_object(dev_r); dt_opencl_release_mem_object(dev_m); dev_r = dev_m = NULL; for(int k = 1; k < reducesize; k++) { float mine = maximum[0]; float other = maximum[k]; maximum[0] = (other > mine) ? other : mine; } tmp_lwmax = MAX(eps, (maximum[0] * 0.01f)); dt_free_align(maximum); maximum = NULL; } const float lwmax = tmp_lwmax; const float ldc = d->drago.max_light * 0.01f / log10f(lwmax + 1.0f); const float bl = logf(MAX(eps, d->drago.bias)) / logf(0.5f); parameters[0] = eps; parameters[1] = ldc; parameters[2] = bl; parameters[3] = lwmax; if(self->dev->gui_attached && g && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW) { uint64_t hash = dt_dev_hash_plus(self->dev, piece->pipe, 0, self->priority); dt_pthread_mutex_lock(&g->lock); g->lwmax = lwmax; g->hash = hash; dt_pthread_mutex_unlock(&g->lock); } } const float scale = piece->iscale / roi_in->scale; const float sigma_r = 8.0f; // does not depend on scale const float iw = piece->buf_in.width / scale; const float ih = piece->buf_in.height / scale; const float sigma_s = fminf(iw, ih) * 0.03f; if(d->detail != 0.0f) { b = dt_bilateral_init_cl(devid, roi_in->width, roi_in->height, sigma_s, sigma_r); if(!b) goto error; // get detail from unchanged input buffer err = dt_bilateral_splat_cl(b, dev_in); if(err != CL_SUCCESS) goto error; } size_t sizes[2] = { ROUNDUPWD(width), ROUNDUPHT(height) }; dt_opencl_set_kernel_arg(devid, gtkernel, 0, sizeof(cl_mem), &dev_in); dt_opencl_set_kernel_arg(devid, gtkernel, 1, sizeof(cl_mem), &dev_out); dt_opencl_set_kernel_arg(devid, gtkernel, 2, sizeof(int), &width); dt_opencl_set_kernel_arg(devid, gtkernel, 3, sizeof(int), &height); dt_opencl_set_kernel_arg(devid, gtkernel, 4, 4 * sizeof(float), &parameters); err = dt_opencl_enqueue_kernel_2d(devid, gtkernel, sizes); if(err != CL_SUCCESS) goto error; if(d->detail != 0.0f) { err = dt_bilateral_blur_cl(b); if(err != CL_SUCCESS) goto error; // and apply it to output buffer after logscale err = dt_bilateral_slice_to_output_cl(b, dev_in, dev_out, d->detail); if(err != CL_SUCCESS) goto error; dt_bilateral_free_cl(b); } return TRUE; error: if(b) dt_bilateral_free_cl(b); dt_opencl_release_mem_object(dev_m); dt_opencl_release_mem_object(dev_r); dt_free_align(maximum); dt_print(DT_DEBUG_OPENCL, "[opencl_global_tonemap] couldn't enqueue kernel! %d\n", err); return FALSE; }
%d\n", err); return FALSE; } #endif void tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out, struct dt_develop_tiling_t *tiling) { dt_iop_global_tonemap_data_t *d = (dt_iop_global_tonemap_data_t *)piece->data; const float scale = piece->iscale / roi_in->scale; const float iw = piece->buf_in.width / scale; const float ih = piece->buf_in.height / scale; const float sigma_s = fminf(iw, ih) * 0.03f; const float sigma_r = 8.0f; const int detail = (d->detail != 0.0f); const int width = roi_in->width; const int height = roi_in->height; const int channels = piece->colors; const size_t basebuffer = width * height * channels * sizeof(float); tiling->factor = 2.0f + (detail ? (float)dt_bilateral_memory_use2(width, height, sigma_s, sigma_r) / basebuffer : 0.0f); tiling->maxbuf = (detail ? MAX(1.0f, (float)dt_bilateral_singlebuffer_size2(width, height, sigma_s, sigma_r) / basebuffer) : 1.0f); tiling->overhead = 0; tiling->overlap = (detail ? ceilf(4 * sigma_s) : 0); tiling->xalign = 1; tiling->yalign = 1; return; } void commit_params(struct dt_iop_module_t *self, dt_iop_params_t *p1, dt_dev_pixelpipe_t *pipe, dt_dev_pixelpipe_iop_t *piece) { dt_iop_global_tonemap_params_t *p = (dt_iop_global_tonemap_params_t *)p1; dt_iop_global_tonemap_data_t *d = (dt_iop_global_tonemap_data_t *)piece->data; d->operator= p->operator; d->drago.bias = p->drago.bias; d->drago.max_light = p->drago.max_light; d->detail = p->detail; // drago needs the maximum L-value of the whole image so it must not use tiling if(d->operator == OPERATOR_DRAGO) piece->process_tiling_ready = 0; #ifdef HAVE_OPENCL if(d->detail != 0.0f) piece->process_cl_ready = (piece->process_cl_ready && !(darktable.opencl->avoid_atomics)); #endif }
// if an entry exists for the given key, it is returned locked for reading or // writing (according to mode). if not, a new entry is allocated (via the // allocate callback if one is set) and returned write-locked; it can be // found using the given key later on. dt_cache_entry_t *dt_cache_get_with_caller(dt_cache_t *cache, const uint32_t key, char mode, const char *file, int line) { gpointer orig_key, value; gboolean res; int result; double start = dt_get_wtime(); restart: dt_pthread_mutex_lock(&cache->lock); res = g_hash_table_lookup_extended( cache->hashtable, GINT_TO_POINTER(key), &orig_key, &value); if(res) { // yay, found. lock and pass on. dt_cache_entry_t *entry = (dt_cache_entry_t *)value; if(mode == 'w') result = dt_pthread_rwlock_trywrlock_with_caller(&entry->lock, file, line); else result = dt_pthread_rwlock_tryrdlock_with_caller(&entry->lock, file, line); if(result) { // need to give up mutex so other threads have a chance to get in between and // free the lock we're trying to acquire: dt_pthread_mutex_unlock(&cache->lock); g_usleep(5); goto restart; } // bubble up in lru list: cache->lru = g_list_remove_link(cache->lru, entry->link); cache->lru = g_list_concat(cache->lru, entry->link); dt_pthread_mutex_unlock(&cache->lock); #ifdef _DEBUG const pthread_t writer = dt_pthread_rwlock_get_writer(&entry->lock); if(mode == 'w') { assert(pthread_equal(writer, pthread_self())); } else { assert(!pthread_equal(writer, pthread_self())); } #endif if(mode == 'w') { assert(entry->data_size); ASAN_POISON_MEMORY_REGION(entry->data, entry->data_size); } // WARNING: do *NOT* unpoison here. it must be done by the caller! return entry; } // else, not found, need to allocate. // first try to clean up. // also wait if we can't free more than the requested fill ratio. if(cache->cost > 0.8f * cache->cost_quota) { // need to roll back all the way to get a consistent lock state: dt_cache_gc(cache, 0.8f); } // here dies your 32-bit system: dt_cache_entry_t *entry = (dt_cache_entry_t *)g_slice_alloc(sizeof(dt_cache_entry_t)); int ret = dt_pthread_rwlock_init(&entry->lock, 0); if(ret) fprintf(stderr, "rwlock init: %d\n", ret); entry->data = 0; entry->data_size = cache->entry_size; entry->cost = 1; entry->link = g_list_append(0, entry); entry->key = key; entry->_lock_demoting = 0; g_hash_table_insert(cache->hashtable, GINT_TO_POINTER(key), entry); assert(cache->allocate || entry->data_size); if(cache->allocate) cache->allocate(cache->allocate_data, entry); else entry->data = dt_alloc_align(16, entry->data_size); assert(entry->data_size); ASAN_POISON_MEMORY_REGION(entry->data, entry->data_size); // if allocate callback is given, always return a write lock const int write = ((mode == 'w') || cache->allocate); // write lock in case the caller requests it: if(write) dt_pthread_rwlock_wrlock_with_caller(&entry->lock, file, line); else dt_pthread_rwlock_rdlock_with_caller(&entry->lock, file, line); cache->cost += entry->cost; // put at end of lru list (most recently used): cache->lru = g_list_concat(cache->lru, entry->link); dt_pthread_mutex_unlock(&cache->lock); double end = dt_get_wtime(); if(end - start > 0.1) fprintf(stderr, "wait time %.06fs\n", end - start); // WARNING: do *NOT* unpoison here. it must be done by the caller! return entry; }
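/* A hedged usage sketch for the entry point above: look up (or allocate) an
 * entry, use the payload under the acquired lock, then hand it back. It
 * assumes the cache header is included, that dt_cache_get is the usual
 * convenience wrapper around dt_cache_get_with_caller, and that
 * dt_cache_release exists as the matching unlock call in this cache API. */
static void use_cached_payload(dt_cache_t *cache, uint32_t key)
{
  dt_cache_entry_t *entry = dt_cache_get(cache, key, 'r'); // shared read lock
  // read from entry->data here. a 'w' request returns a write lock instead,
  // and freshly allocated entries always come back write-locked so the
  // allocate callback can fill them.
  dt_cache_release(cache, entry); // drop the lock; the entry stays cached
}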
/** process, all real work is done here. */ void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { // this is called for preview and full pipe separately, each with its own pixelpipe piece. // get our data struct: dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data; // adjust to zoom size: const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood if(P <= 1) { // nothing to do from this distance: memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height); return; } // adjust to Lab, make L more important // float max_L = 100.0f, max_C = 256.0f; // float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C); float max_L = 120.0f, max_C = 512.0f; float nL = 1.0f/max_L, nC = 1.0f/max_C; const float norm2[4] = { nL*nL, nC*nC, nC*nC, 1.0f }; float *Sa = dt_alloc_align(64, sizeof(float)*roi_out->width*dt_get_num_threads()); // we want to sum up weights in col[3], so need to init to 0: memset(ovoid, 0x0, sizeof(float)*roi_out->width*roi_out->height*4); // for each shift vector for(int kj=-K;kj<=K;kj++) { for(int ki=-K;ki<=K;ki++) { int inited_slide = 0; // don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else we will add up errors) // do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more memory #ifdef _OPENMP # pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, ivoid, ovoid, Sa) #endif for(int j=0; j<roi_out->height; j++) { if(j+kj < 0 || j+kj >= roi_out->height) continue; float *S = Sa + dt_get_thread_num() * roi_out->width; const float *ins = ((float *)ivoid) + 4*(roi_in->width *(j+kj) + ki); float *out = ((float *)ovoid) + 4*roi_out->width*j; const int Pm = MIN(MIN(P, j+kj), j); const int PM = MIN(MIN(P, roi_out->height-1-j-kj), roi_out->height-1-j); // first line of every thread // TODO: also every once in a while to assert numerical precision! 
if(!inited_slide) { // sum up a line memset(S, 0x0, sizeof(float)*roi_out->width); for(int jj=-Pm;jj<=PM;jj++) { int i = MAX(0, -ki); float *s = S + i; const float *inp = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+jj); const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+jj+kj) + ki); const int last = roi_out->width + MIN(0, -ki); for(; i<last; i++, inp+=4, inps+=4, s++) { for(int k=0;k<3;k++) s[0] += (inp[k] - inps[k])*(inp[k] - inps[k]) * norm2[k]; } } // only reuse this if we had a full stripe if(Pm == P && PM == P) inited_slide = 1; } // sliding window for this line: float *s = S; float slide = 0.0f; // sum up the first -P..P for(int i=0;i<2*P+1;i++) slide += s[i]; for(int i=0; i<roi_out->width; i++) { if(i-P > 0 && i+P<roi_out->width) slide += s[P] - s[-P-1]; if(i+ki >= 0 && i+ki < roi_out->width) { const __m128 iv = { ins[0], ins[1], ins[2], 1.0f }; _mm_store_ps(out, _mm_load_ps(out) + iv * _mm_set1_ps(gh(slide))); } s ++; ins += 4; out += 4; } if(inited_slide && j+P+1+MAX(0,kj) < roi_out->height) { // sliding window in j direction: int i = MAX(0, -ki); float *s = S + i; const float *inp = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+P+1); const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+P+1+kj) + ki); const float *inm = ((float *)ivoid) + 4*i + 4* roi_in->width *(j-P); const float *inms = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j-P+kj) + ki); const int last = roi_out->width + MIN(0, -ki); for(; ((unsigned long)s & 0xf) != 0 && i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++) { float stmp = s[0]; for(int k=0;k<3;k++) stmp += ((inp[k] - inps[k])*(inp[k] - inps[k]) - (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k]; s[0] = stmp; } /* Process most of the line 4 pixels at a time */ for(; i<last-4; i+=4, inp+=16, inps+=16, inm+=16, inms+=16, s+=4) { __m128 sv = _mm_load_ps(s); const __m128 inp1 = _mm_load_ps(inp) - _mm_load_ps(inps); const __m128 inp2 = _mm_load_ps(inp+4) - _mm_load_ps(inps+4); const __m128 inp3 = _mm_load_ps(inp+8) - _mm_load_ps(inps+8); const __m128 inp4 = _mm_load_ps(inp+12) - _mm_load_ps(inps+12); const __m128 inp12lo = _mm_unpacklo_ps(inp1,inp2); const __m128 inp34lo = _mm_unpacklo_ps(inp3,inp4); const __m128 inp12hi = _mm_unpackhi_ps(inp1,inp2); const __m128 inp34hi = _mm_unpackhi_ps(inp3,inp4); const __m128 inpv0 = _mm_movelh_ps(inp12lo,inp34lo); sv += inpv0*inpv0 * _mm_set1_ps(norm2[0]); const __m128 inpv1 = _mm_movehl_ps(inp34lo,inp12lo); sv += inpv1*inpv1 * _mm_set1_ps(norm2[1]); const __m128 inpv2 = _mm_movelh_ps(inp12hi,inp34hi); sv += inpv2*inpv2 * _mm_set1_ps(norm2[2]); const __m128 inm1 = _mm_load_ps(inm) - _mm_load_ps(inms); const __m128 inm2 = _mm_load_ps(inm+4) - _mm_load_ps(inms+4); const __m128 inm3 = _mm_load_ps(inm+8) - _mm_load_ps(inms+8); const __m128 inm4 = _mm_load_ps(inm+12) - _mm_load_ps(inms+12); const __m128 inm12lo = _mm_unpacklo_ps(inm1,inm2); const __m128 inm34lo = _mm_unpacklo_ps(inm3,inm4); const __m128 inm12hi = _mm_unpackhi_ps(inm1,inm2); const __m128 inm34hi = _mm_unpackhi_ps(inm3,inm4); const __m128 inmv0 = _mm_movelh_ps(inm12lo,inm34lo); sv -= inmv0*inmv0 * _mm_set1_ps(norm2[0]); const __m128 inmv1 = _mm_movehl_ps(inm34lo,inm12lo); sv -= inmv1*inmv1 * _mm_set1_ps(norm2[1]); const __m128 inmv2 = _mm_movelh_ps(inm12hi,inm34hi); sv -= inmv2*inmv2 * _mm_set1_ps(norm2[2]); _mm_store_ps(s, sv); } for(; i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++) { float stmp = s[0]; for(int k=0;k<3;k++) stmp += ((inp[k] - inps[k])*(inp[k] - inps[k]) - (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k]; s[0] 
= stmp; } } else inited_slide = 0; } } } // normalize and apply chroma/luma blending // bias a bit towards higher values for low input values: const __m128 weight = _mm_set_ps(1.0f, powf(d->chroma, 0.6f), powf(d->chroma, 0.6f), powf(d->luma, 0.6f)); const __m128 invert = _mm_sub_ps(_mm_set1_ps(1.0f), weight); #ifdef _OPENMP #pragma omp parallel for default(none) schedule(static) shared(ovoid,ivoid,roi_out,d) #endif for(int j=0; j<roi_out->height; j++) { float *out = ((float *)ovoid) + 4*roi_out->width*j; float *in = ((float *)ivoid) + 4*roi_out->width*j; for(int i=0; i<roi_out->width; i++) { _mm_store_ps(out, _mm_add_ps( _mm_mul_ps(_mm_load_ps(in), invert), _mm_mul_ps(_mm_load_ps(out), _mm_div_ps(weight, _mm_set1_ps(out[3]))))); out += 4; in += 4; } } // free shared tmp memory (allocated with dt_alloc_align, so don't use plain free()): dt_free_align(Sa); }
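/* A scalar sketch of the sliding-window trick used in the loops above: once
 * the first window sum is known, each step adds the column entering the
 * window and subtracts the one leaving it, instead of re-summing all 2*P+1
 * values per pixel. Standalone illustration on a plain float array; the
 * function name is hypothetical. */
static void sliding_box_sum(const float *S, float *out, int width, int P)
{
  float slide = 0.f;
  // window around i = 0, clipped at the left edge
  for(int i = 0; i <= P && i < width; i++) slide += S[i];
  for(int i = 0; i < width; i++)
  {
    out[i] = slide;
    const int add = i + P + 1; // column entering the window at i+1
    const int sub = i - P;     // column leaving the window at i+1
    if(add < width) slide += S[add];
    if(sub >= 0) slide -= S[sub];
  }
}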