/* Initialize a pixelpipe cache with `entries` slots of `size` bytes each.
 * Returns 1 on success, 0 if any allocation failed; on failure the cache
 * is left zeroed so a later dt_dev_pixelpipe_cache_cleanup() is harmless. */
int dt_dev_pixelpipe_cache_init(dt_dev_pixelpipe_cache_t *cache, int entries, size_t size)
{
  cache->entries = entries;
  cache->data = (void **)calloc(entries, sizeof(void *));
  cache->size = (size_t *)calloc(entries, sizeof(size_t));
  cache->hash = (uint64_t *)calloc(entries, sizeof(uint64_t));
  cache->used = (int32_t *)calloc(entries, sizeof(int32_t));
  // the original dereferenced these unconditionally; bail out if any
  // bookkeeping array could not be allocated
  if(!cache->data || !cache->size || !cache->hash || !cache->used) goto alloc_memory_fail;
  for(int k = 0; k < entries; k++)
  {
    cache->data[k] = (void *)dt_alloc_align(16, size);
    if(!cache->data[k]) goto alloc_memory_fail;
    cache->size[k] = size;
#ifdef _DEBUG
    memset(cache->data[k], 0x5d, size); // poison fresh buffers in debug builds
#endif
    cache->hash[k] = -1; // sentinel: slot holds no valid result yet
    cache->used[k] = 0;
  }
  cache->queries = cache->misses = 0;
  return 1;

alloc_memory_fail:
  // free whatever was allocated so far; calloc zeroed the slot array, so
  // slots never reached are NULL and are skipped.
  if(cache->data)
    for(int k = 0; k < entries; k++)
      if(cache->data[k]) dt_free_align(cache->data[k]);
  free(cache->data);
  free(cache->size);
  free(cache->hash);
  free(cache->used);
  // leave no dangling pointers behind
  cache->data = NULL;
  cache->size = NULL;
  cache->hash = NULL;
  cache->used = NULL;
  cache->entries = 0;
  return 0;
}
/* Write the buffer `ivoid` (4 floats per pixel) as a binary PFM file,
 * dropping the alpha channel. Returns 0 on success, 1 on any error
 * (fopen failure, allocation failure, short write). exif/imgid/num/total
 * are part of the module interface and unused here. */
int write_image(dt_imageio_module_data_t *data, const char *filename, const void *ivoid, void *exif,
                int exif_len, int imgid, int num, int total)
{
  const dt_imageio_module_data_t *const pfm = data;
  int status = 1; // pessimistic: the old code returned 0 (success) when fopen failed
  FILE *f = fopen(filename, "wb");
  if(f)
  {
    status = 0;
    // INFO: per-line fwrite call seems to perform best. LebedevRI, 18.04.2014
    (void)fprintf(f, "PF\n%d %d\n-1.0\n", pfm->width, pfm->height);
    void *buf_line = dt_alloc_align(16, 3 * sizeof(float) * pfm->width);
    if(!buf_line)
    {
      // the old code dereferenced a NULL line buffer on OOM
      fclose(f);
      return 1;
    }
    for(int j = 0; j < pfm->height; j++)
    {
      // NOTE: pfm has rows in reverse order
      const int row_in = pfm->height - 1 - j;
      const float *in = (const float *)ivoid + 4 * (size_t)pfm->width * row_in;
      float *out = (float *)buf_line;
      // strip alpha: 4 floats per pixel in, 3 out
      for(int i = 0; i < pfm->width; i++, in += 4, out += 3)
      {
        memcpy(out, in, 3 * sizeof(float));
      }
      int cnt = fwrite(buf_line, 3 * sizeof(float), pfm->width, f);
      if(cnt != pfm->width)
      {
        // short write: report failure and stop; previously a later
        // successful row reset status back to 0
        status = 1;
        break;
      }
    }
    dt_free_align(buf_line);
    buf_line = NULL;
    fclose(f);
  }
  return status;
}
/* Tear down a cache: destroy the hash table, release every entry still on
 * the LRU list (via the user cleanup callback if one is set, otherwise by
 * aligned free), then drop the list itself and the cache mutex. */
void dt_cache_cleanup(dt_cache_t *cache)
{
  g_hash_table_destroy(cache->hashtable);
  for(GList *node = cache->lru; node; node = g_list_next(node))
  {
    dt_cache_entry_t *const e = (dt_cache_entry_t *)node->data;
    if(cache->cleanup)
    {
      assert(e->data_size);
      ASAN_UNPOISON_MEMORY_REGION(e->data, e->data_size);
      cache->cleanup(cache->cleanup_data, e);
    }
    else
      dt_free_align(e->data);
    dt_pthread_rwlock_destroy(&e->lock);
    g_slice_free1(sizeof(*e), e);
  }
  g_list_free(cache->lru);
  dt_pthread_mutex_destroy(&cache->lock);
}
/* Release all pixel buffers and bookkeeping arrays owned by the cache. */
void dt_dev_pixelpipe_cache_cleanup(dt_dev_pixelpipe_cache_t *cache)
{
  const int n = cache->entries;
  for(int slot = 0; slot < n; slot++)
    dt_free_align(cache->data[slot]);
  free(cache->data);
  free(cache->hash);
  free(cache->used);
  free(cache->size);
}
/* Free a gaussian blur context created by dt_gaussian_init(). NULL is a no-op. */
void dt_gaussian_free(dt_gaussian_t *g)
{
  if(g == NULL) return;
  dt_free_align(g->buf); // the pixel buffer was aligned-allocated
  free(g->min);
  free(g->max);
  free(g);
}
/* Remove the entry with `key` from the cache.
 * Returns 1 if the key was not present, 0 after successful removal.
 * Spins (with a short g_usleep back-off) until it can take the entry's
 * write lock while the entry is not mid-demotion in another thread. */
int dt_cache_remove(dt_cache_t *cache, const uint32_t key)
{
  gpointer orig_key, value;
  gboolean res;
  int result;
  dt_cache_entry_t *entry;
restart:
  dt_pthread_mutex_lock(&cache->lock);

  res = g_hash_table_lookup_extended(
      cache->hashtable, GINT_TO_POINTER(key), &orig_key, &value);
  entry = (dt_cache_entry_t *)value;
  if(!res)
  {
    // not found in cache, not deleting.
    dt_pthread_mutex_unlock(&cache->lock);
    return 1;
  }
  // need write lock to be able to delete:
  result = dt_pthread_rwlock_trywrlock(&entry->lock);
  if(result)
  {
    // someone else holds the entry; drop the cache lock, back off, retry
    dt_pthread_mutex_unlock(&cache->lock);
    g_usleep(5);
    goto restart;
  }

  if(entry->_lock_demoting)
  {
    // oops, we are currently demoting (rw -> r) lock to this entry in some thread. do not touch!
    dt_pthread_rwlock_unlock(&entry->lock);
    dt_pthread_mutex_unlock(&cache->lock);
    g_usleep(5);
    goto restart;
  }

  gboolean removed = g_hash_table_remove(cache->hashtable, GINT_TO_POINTER(key));
  (void)removed; // make non-assert compile happy
  assert(removed);

  cache->lru = g_list_delete_link(cache->lru, entry->link);

  // release the payload: user callback if provided, otherwise aligned free
  if(cache->cleanup)
  {
    assert(entry->data_size);
    ASAN_UNPOISON_MEMORY_REGION(entry->data, entry->data_size);
    cache->cleanup(cache->cleanup_data, entry);
  }
  else
    dt_free_align(entry->data);

  dt_pthread_rwlock_unlock(&entry->lock);
  dt_pthread_rwlock_destroy(&entry->lock);
  cache->cost -= entry->cost;
  g_slice_free1(sizeof(*entry), entry);
  dt_pthread_mutex_unlock(&cache->lock);
  return 0;
}
/* Tear down the mipmap cache: serialize thumbnails first, then clean up
 * each mip level's cache and free the backing buffers.
 * NOTE(review): only levels [0, DT_MIPMAP_F) free a `buf` here; the
 * DT_MIPMAP_FULL and DT_MIPMAP_F caches are cleaned below without one —
 * presumably their entries own their memory individually; confirm against
 * the allocator. */
void dt_mipmap_cache_cleanup(dt_mipmap_cache_t *cache)
{
  dt_mipmap_cache_serialize(cache);
  for(int k=0; k<DT_MIPMAP_F; k++)
  {
    dt_cache_cleanup(&cache->mip[k].cache);
    // now mem is actually freed, not during cache cleanup
    dt_free_align(cache->mip[k].buf);
  }
  dt_cache_cleanup(&cache->mip[DT_MIPMAP_FULL].cache);
  dt_cache_cleanup(&cache->mip[DT_MIPMAP_F].cache);
  // clean up temporary buffers for decompressed images, if any:
  if(cache->compression_type)
  {
    dt_cache_cleanup(&cache->scratchmem.cache);
    dt_free_align(cache->scratchmem.buf);
  }
}
/* Look up `hash` in the pixelpipe cache. On a hit with a sufficiently
 * large buffer, set *data to the cached buffer and return 0. Otherwise
 * recycle the least-recently-used slot, hand back its buffer in *data
 * and return 1 so the caller recomputes the contents.
 * `weight` is the age given to the MRU entry (lower = kept longer). */
int dt_dev_pixelpipe_cache_get_weighted(dt_dev_pixelpipe_cache_t *cache, const uint64_t hash,
                                        const size_t size, void **data, int weight)
{
  cache->queries++;
  *data = NULL;
  int max_used = -1, max = 0;
  size_t sz = 0;
  for(int k = 0; k < cache->entries; k++)
  {
    // search for hash in cache while tracking the oldest (LRU) slot
    if(cache->used[k] > max_used)
    {
      max_used = cache->used[k];
      max = k;
    }
    cache->used[k]++; // age all entries
    if(cache->hash[k] == hash)
    {
      *data = cache->data[k];
      sz = cache->size[k];
      cache->used[k] = weight; // this is the MRU entry
    }
  }
  if(!*data || sz < size)
  {
    // cache miss (or cached buffer too small): kill LRU entry
    if(cache->size[max] < size)
    {
      dt_free_align(cache->data[max]);
      cache->data[max] = (void *)dt_alloc_align(16, size);
      if(!cache->data[max])
      {
        // allocation failed: previously the slot kept the new hash with a
        // NULL buffer, so a later lookup could hand out NULL as a "hit".
        // invalidate the slot instead and report a miss with no buffer.
        cache->size[max] = 0;
        cache->hash[max] = -1; // same sentinel as cache init
        *data = NULL;
        cache->misses++;
        return 1;
      }
      cache->size[max] = size;
    }
    *data = cache->data[max];
    cache->hash[max] = hash;
    cache->used[max] = weight;
    cache->misses++;
    return 1;
  }
  else
    return 0;
}
/* Write `ivoid` (4 floats per pixel) as a binary PFM file whose header is
 * zero-padded so the pixel data starts 16-byte aligned (useful when the
 * file is later mmapped to page boundaries). Returns 0 on success, 1 on
 * error. exif/imgid/num/total are part of the module interface, unused. */
int write_image(dt_imageio_module_data_t *data, const char *filename, const void *ivoid, void *exif,
                int exif_len, int imgid, int num, int total)
{
  const dt_imageio_module_data_t *const pfm = data;
  int status = 1; // pessimistic: the old code returned 0 (success) when fopen failed
  FILE *f = fopen(filename, "wb");
  if(f)
  {
    status = 0;
    // align pfm header to sse, assuming the file will
    // be mmapped to page boundaries.
    char header[1024];
    snprintf(header, 1024, "PF\n%d %d\n-1.0", pfm->width, pfm->height);
    size_t len = strlen(header);
    fprintf(f, "PF\n%d %d\n-1.0", pfm->width, pfm->height);
    // pad with '0' characters until header length + trailing '\n' is a
    // multiple of 16 bytes:
    ssize_t off = 0;
    while((len + 1 + off) & 0xf) off++;
    while(off-- > 0) fprintf(f, "0");
    fprintf(f, "\n");
    void *buf_line = dt_alloc_align(16, 3 * sizeof(float) * pfm->width);
    if(!buf_line)
    {
      // the old code dereferenced a NULL line buffer on OOM
      fclose(f);
      return 1;
    }
    for(int j = 0; j < pfm->height; j++)
    {
      // NOTE: pfm has rows in reverse order
      const int row_in = pfm->height - 1 - j;
      const float *in = (const float *)ivoid + 4 * (size_t)pfm->width * row_in;
      float *out = (float *)buf_line;
      // strip alpha: 4 floats per pixel in, 3 out
      for(int i = 0; i < pfm->width; i++, in += 4, out += 3)
      {
        memcpy(out, in, 3 * sizeof(float));
      }
      // INFO: per-line fwrite call seems to perform best. LebedevRI, 18.04.2014
      int cnt = fwrite(buf_line, 3 * sizeof(float), pfm->width, f);
      if(cnt != pfm->width)
      {
        // short write: remember the failure; previously a later successful
        // row overwrote status back to 0
        status = 1;
        break;
      }
    }
    dt_free_align(buf_line);
    buf_line = NULL;
    fclose(f);
  }
  return status;
}
// best-effort garbage collection. never blocks, never fails. well, sometimes it just doesn't free anything. void dt_cache_gc(dt_cache_t *cache, const float fill_ratio) { GList *l = cache->lru; int cnt = 0; while(l) { cnt++; dt_cache_entry_t *entry = (dt_cache_entry_t *)l->data; assert(entry->link->data == entry); l = g_list_next(l); // we might remove this element, so walk to the next one while we still have the pointer.. if(cache->cost < cache->cost_quota * fill_ratio) break; // if still locked by anyone else give up: if(dt_pthread_rwlock_trywrlock(&entry->lock)) continue; if(entry->_lock_demoting) { // oops, we are currently demoting (rw -> r) lock to this entry in some thread. do not touch! dt_pthread_rwlock_unlock(&entry->lock); continue; } // delete! g_hash_table_remove(cache->hashtable, GINT_TO_POINTER(entry->key)); cache->lru = g_list_delete_link(cache->lru, entry->link); cache->cost -= entry->cost; if(cache->cleanup) { assert(entry->data_size); ASAN_UNPOISON_MEMORY_REGION(entry->data, entry->data_size); cache->cleanup(cache->cleanup_data, entry); } else dt_free_align(entry->data); dt_pthread_rwlock_unlock(&entry->lock); dt_pthread_rwlock_destroy(&entry->lock); g_slice_free1(sizeof(*entry), entry); } }
// callback for the imageio core to allocate memory. // only needed for _F and _FULL buffers, as they change size // with the input image. will allocate img->width*img->height*img->bpp bytes. void* dt_mipmap_cache_alloc(dt_image_t *img, dt_mipmap_size_t size, dt_mipmap_cache_allocator_t a) { assert(size == DT_MIPMAP_FULL); struct dt_mipmap_buffer_dsc** dsc = (struct dt_mipmap_buffer_dsc**)a; int32_t wd = img->width; int32_t ht = img->height; int32_t bpp = img->bpp; const uint32_t buffer_size = ((wd*ht*bpp) + sizeof(**dsc)); // buf might have been alloc'ed before, // so only check size and re-alloc if necessary: if(!(*dsc) || ((*dsc)->size < buffer_size) || ((void *)*dsc == (void *)dt_mipmap_cache_static_dead_image)) { if((void *)*dsc != (void *)dt_mipmap_cache_static_dead_image) dt_free_align(*dsc); *dsc = dt_alloc_align(64, buffer_size); // fprintf(stderr, "[mipmap cache] alloc for key %u %p\n", get_key(img->id, size), *buf); if(!(*dsc)) { // return fallback: at least alloc size for a dead image: *dsc = (struct dt_mipmap_buffer_dsc *)dt_mipmap_cache_static_dead_image; // allocator holds the pointer. but imageio client is tricked to believe allocation failed: return NULL; } // set buffer size only if we're making it larger. (*dsc)->size = buffer_size; } (*dsc)->width = wd; (*dsc)->height = ht; (*dsc)->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE; // fprintf(stderr, "full buffer allocating img %u %d x %d = %u bytes (%p)\n", img->id, img->width, img->height, buffer_size, *buf); // trick the user into using a pointer without the header: return (*dsc)+1; }
dt_gaussian_t *dt_gaussian_init(const int width, // width of input image const int height, // height of input image const int channels, // channels per pixel const float *max, // maximum allowed values per channel for clamping const float *min, // minimum allowed values per channel for clamping const float sigma, // gaussian sigma const int order) // order of gaussian blur { dt_gaussian_t *g = (dt_gaussian_t *)malloc(sizeof(dt_gaussian_t)); if(!g) return NULL; g->width = width; g->height = height; g->channels = channels; g->sigma = sigma; g->order = order; g->buf = NULL; g->max = (float *)calloc(channels, sizeof(float)); g->min = (float *)calloc(channels, sizeof(float)); if(!g->min || !g->max) goto error; for(int k = 0; k < channels; k++) { g->max[k] = max[k]; g->min[k] = min[k]; } g->buf = dt_alloc_align(64, (size_t)width * height * channels * sizeof(float)); if(!g->buf) goto error; return g; error: dt_free_align(g->buf); free(g->max); free(g->min); free(g); return NULL; }
// callback for the imageio core to allocate memory. // only needed for _F and _FULL buffers, as they change size // with the input image. will allocate img->width*img->height*img->bpp bytes. void *dt_mipmap_cache_alloc(dt_mipmap_buffer_t *buf, const dt_image_t *img) { assert(buf->size == DT_MIPMAP_FULL); const int wd = img->width; const int ht = img->height; struct dt_mipmap_buffer_dsc *dsc = (struct dt_mipmap_buffer_dsc *)buf->cache_entry->data; const size_t buffer_size = (size_t)wd*ht*img->bpp + sizeof(*dsc); // buf might have been alloc'ed before, // so only check size and re-alloc if necessary: if(!buf->buf || (dsc->size < buffer_size) || ((void *)dsc == (void *)dt_mipmap_cache_static_dead_image)) { if((void *)dsc != (void *)dt_mipmap_cache_static_dead_image) dt_free_align(buf->cache_entry->data); buf->cache_entry->data = dt_alloc_align(64, buffer_size); if(!buf->cache_entry->data) { // return fallback: at least alloc size for a dead image: buf->cache_entry->data = (void*)dt_mipmap_cache_static_dead_image; // allocator holds the pointer. but let imageio client know that allocation failed: return NULL; } // set buffer size only if we're making it larger. dsc = (struct dt_mipmap_buffer_dsc *)buf->cache_entry->data; dsc->size = buffer_size; } dsc->width = wd; dsc->height = ht; dsc->color_space = DT_COLORSPACE_NONE; dsc->flags = DT_MIPMAP_BUFFER_DSC_FLAG_GENERATE; buf->buf = (uint8_t *)(dsc + 1); // fprintf(stderr, "full buffer allocating img %u %d x %d = %u bytes (%p)\n", img->id, img->width, // img->height, buffer_size, *buf); // return pointer to start of payload return dsc + 1; }
static void generate_thumbnail_cache() { const int max_mip = DT_MIPMAP_2; fprintf(stderr, _("creating cache directories\n")); char filename[PATH_MAX] = {0}; for(int k=DT_MIPMAP_0;k<=max_mip;k++) { snprintf(filename, sizeof(filename), "%s.d/%d", darktable.mipmap_cache->cachedir, k); fprintf(stderr, _("creating cache directory '%s'\n"), filename); int mkd = g_mkdir_with_parents(filename, 0750); if(mkd) { fprintf(stderr, _("could not create directory '%s'!\n"), filename); return; } } // some progress counter sqlite3_stmt *stmt; uint64_t image_count = 0, counter = 0; DT_DEBUG_SQLITE3_PREPARE_V2(dt_database_get(darktable.db), "select count(id) from images", -1, &stmt, 0); if(sqlite3_step(stmt) == SQLITE_ROW) image_count = sqlite3_column_int(stmt, 0); sqlite3_finalize(stmt); // go through all images: DT_DEBUG_SQLITE3_PREPARE_V2(dt_database_get(darktable.db), "select id from images", -1, &stmt, 0); // could only alloc max_mip-1, but would need to detect the special case that max==0. const size_t bufsize = (size_t)4 * darktable.mipmap_cache->max_width[max_mip] * darktable.mipmap_cache->max_height[max_mip]; uint8_t *tmp = (uint8_t *)dt_alloc_align(16, bufsize); if(!tmp) { fprintf(stderr, "couldn't allocate temporary memory!\n"); sqlite3_finalize(stmt); return; } const int cache_quality = MIN(100, MAX(10, dt_conf_get_int("database_cache_quality"))); while(sqlite3_step(stmt) == SQLITE_ROW) { const int32_t imgid = sqlite3_column_int(stmt, 0); // check whether all of these files are already there int all_exist = 1; for(int k=max_mip;k>=DT_MIPMAP_0;k--) { snprintf(filename, sizeof(filename), "%s.d/%d/%d.jpg", darktable.mipmap_cache->cachedir, k, imgid); all_exist &= !access(filename, R_OK); } if(all_exist) goto next; dt_mipmap_buffer_t buf; // get largest thumbnail for this image // this one will take care of itself, we'll just write out the lower thumbs manually: dt_mipmap_cache_get(darktable.mipmap_cache, &buf, imgid, max_mip, DT_MIPMAP_BLOCKING, 'r'); if(buf.width > 8 && 
buf.height > 8) // don't create for skulls for(int k=max_mip-1;k>=DT_MIPMAP_0;k--) { uint32_t width, height; const int wd = darktable.mipmap_cache->max_width[k]; const int ht = darktable.mipmap_cache->max_height[k]; // use exactly the same mechanism as the cache internally to rescale the thumbnail: dt_iop_flip_and_zoom_8(buf.buf, buf.width, buf.height, tmp, wd, ht, 0, &width, &height); snprintf(filename, sizeof(filename), "%s.d/%d/%d.jpg", darktable.mipmap_cache->cachedir, k, imgid); FILE *f = fopen(filename, "wb"); if(f) { // allocate temp memory: uint8_t *blob = (uint8_t *)malloc(bufsize); if(!blob) goto write_error; const int32_t length = dt_imageio_jpeg_compress(tmp, blob, width, height, cache_quality); assert(length <= bufsize); int written = fwrite(blob, sizeof(uint8_t), length, f); if(written != length) { write_error: unlink(filename); } free(blob); fclose(f); } } dt_mipmap_cache_release(darktable.mipmap_cache, &buf); next: counter ++; fprintf(stderr, "\rimage %lu/%lu (%.02f%%) ", counter, image_count, 100.0*counter/(float)image_count); } dt_free_align(tmp); sqlite3_finalize(stmt); fprintf(stderr, "done \n"); }
/* Load a PNG file into the full mipmap buffer as 4-float-per-pixel data
 * (three colour channels filled, fourth left untouched), normalized to [0,1].
 * Returns DT_IMAGEIO_OK on success, DT_IMAGEIO_FILE_CORRUPTED on parse or
 * read errors, DT_IMAGEIO_CACHE_FULL when a buffer cannot be allocated. */
dt_imageio_retval_t dt_imageio_open_png(dt_image_t *img, const char *filename, dt_mipmap_buffer_t *mbuf)
{
  // cheap extension check before attempting to parse the file
  const char *ext = filename + strlen(filename);
  while(*ext != '.' && ext > filename) ext--;
  if(strncmp(ext, ".png", 4) && strncmp(ext, ".PNG", 4)) return DT_IMAGEIO_FILE_CORRUPTED;
  if(!img->exif_inited) (void)dt_exif_read(img, filename);
  dt_imageio_png_t image;
  uint8_t *buf = NULL;
  uint32_t width, height;
  uint16_t bpp;
  if(read_header(filename, &image) != 0) return DT_IMAGEIO_FILE_CORRUPTED;
  width = img->width = image.width;
  height = img->height = image.height;
  bpp = image.bit_depth;
  img->bpp = 4 * sizeof(float);
  float *mipbuf = (float *)dt_mipmap_cache_alloc(mbuf, img);
  if(!mipbuf)
  {
    fclose(image.f);
    png_destroy_read_struct(&image.png_ptr, &image.info_ptr, NULL);
    fprintf(stderr, "[png_open] could not alloc full buffer for image `%s'\n", img->filename);
    return DT_IMAGEIO_CACHE_FULL;
  }
  // intermediate 3-channel buffer: 1 byte per sample below 16 bit, else 2
  buf = dt_alloc_align(16, (size_t)width * height * 3 * (bpp < 16 ? 1 : 2));
  if(!buf)
  {
    fclose(image.f);
    png_destroy_read_struct(&image.png_ptr, &image.info_ptr, NULL);
    fprintf(stderr, "[png_open] could not alloc intermediate buffer for image `%s'\n", img->filename);
    return DT_IMAGEIO_CACHE_FULL;
  }
  if(read_image(&image, (void *)buf) != 0)
  {
    // NOTE(review): no fclose/png_destroy on this path — presumably
    // read_image() releases those resources on failure; confirm in its
    // definition.
    dt_free_align(buf);
    fprintf(stderr, "[png_open] could not read image `%s'\n", img->filename);
    return DT_IMAGEIO_FILE_CORRUPTED;
  }
  // expand 3-channel integer samples to float; 16-bit PNG samples are stored
  // big-endian, hence the 256*high + low reconstruction below
  for(size_t j = 0; j < height; j++)
  {
    if(bpp < 16)
      for(size_t i = 0; i < width; i++)
        for(int k = 0; k < 3; k++)
          mipbuf[4 * (j * width + i) + k] = buf[3 * (j * width + i) + k] * (1.0f / 255.0f);
    else
      for(size_t i = 0; i < width; i++)
        for(int k = 0; k < 3; k++)
          mipbuf[4 * (j * width + i) + k] = (256.0f * buf[2 * (3 * (j * width + i) + k)]
                                             + buf[2 * (3 * (j * width + i) + k) + 1])
                                            * (1.0f / 65535.0f);
  }
  dt_free_align(buf);
  return DT_IMAGEIO_OK;
}
/* Local laplacian filtering on the CPU: builds gaussian pyramids of the
 * padded input, remaps each of num_gamma brightness bands through a tone
 * curve, and reassembles the output pyramid coarse to fine by blending
 * the laplacian coefficients of the two nearest bands.
 * Only the first (luma) channel is processed; colour channels are copied. */
void local_laplacian_internal(
    const float *const input,  // input buffer in some Labx or yuvx format
    float *const out,          // output buffer with colour
    const int wd,              // width and
    const int ht,              // height of the input buffer
    const float sigma,         // user param: separate shadows/midtones/highlights
    const float shadows,       // user param: lift shadows
    const float highlights,    // user param: compress highlights
    const float clarity,       // user param: increase clarity/local contrast
    const int use_sse2)        // flag whether to use SSE version
{
#define max_levels 30
#define num_gamma 6
  // don't divide by 2 more often than we can:
  const int num_levels = MIN(max_levels, 31-__builtin_clz(MIN(wd,ht)));
  const int max_supp = 1<<(num_levels-1);
  int w, h;
  float *padded[max_levels] = {0};
  padded[0] = ll_pad_input(input, wd, ht, max_supp, &w, &h);

  // allocate pyramid pointers for padded input
  for(int l=1;l<num_levels;l++)
    padded[l] = dt_alloc_align(16, sizeof(float)*dl(w,l)*dl(h,l));

  // allocate pyramid pointers for output
  float *output[max_levels] = {0};
  for(int l=0;l<num_levels;l++)
    output[l] = dt_alloc_align(16, sizeof(float)*dl(w,l)*dl(h,l));

  // create gauss pyramid of padded input, write coarse directly to output
#if defined(__SSE2__)
  if(use_sse2)
  {
    for(int l=1;l<num_levels-1;l++)
      gauss_reduce_sse2(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
    gauss_reduce_sse2(padded[num_levels-2], output[num_levels-1], dl(w,num_levels-2), dl(h,num_levels-2));
  }
  else
#endif
  {
    for(int l=1;l<num_levels-1;l++)
      gauss_reduce(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
    gauss_reduce(padded[num_levels-2], output[num_levels-1], dl(w,num_levels-2), dl(h,num_levels-2));
  }

  // evenly sample brightness [0,1]:
  float gamma[num_gamma] = {0.0f};
  for(int k=0;k<num_gamma;k++) gamma[k] = (k+.5f)/(float)num_gamma;
  // for(int k=0;k<num_gamma;k++) gamma[k] = k/(num_gamma-1.0f);

  // allocate memory for intermediate laplacian pyramids
  float *buf[num_gamma][max_levels] = {{0}};
  for(int k=0;k<num_gamma;k++) for(int l=0;l<num_levels;l++)
    buf[k][l] = dt_alloc_align(16, sizeof(float)*dl(w,l)*dl(h,l));

  // the paper says remapping only level 3 not 0 does the trick, too
  // (but i really like the additional octave of sharpness we get,
  // willing to pay the cost).
  for(int k=0;k<num_gamma;k++)
  { // process images
#if defined(__SSE2__)
    if(use_sse2)
      apply_curve_sse2(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);
    else // brackets in next line needed for silly gcc warning:
#endif
    {apply_curve(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);}

    // create gaussian pyramids
    for(int l=1;l<num_levels;l++)
#if defined(__SSE2__)
      if(use_sse2)
        gauss_reduce_sse2(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
      else
#endif
        gauss_reduce(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
  }

  // assemble output pyramid coarse to fine
  for(int l=num_levels-2;l >= 0; l--)
  {
    const int pw = dl(w,l), ph = dl(h,l);

    gauss_expand(output[l+1], output[l], pw, ph);
    // go through all coefficients in the upsampled gauss buffer:
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) collapse(2) shared(w,h,buf,output,l,gamma,padded)
#endif
    for(int j=0;j<ph;j++) for(int i=0;i<pw;i++)
    {
      const float v = padded[l][j*pw+i];
      // find the two brightness bands bracketing v and blend linearly:
      int hi = 1;
      for(;hi<num_gamma-1 && gamma[hi] <= v;hi++);
      int lo = hi-1;
      const float a = CLAMPS((v - gamma[lo])/(gamma[hi]-gamma[lo]), 0.0f, 1.0f);
      const float l0 = ll_laplacian(buf[lo][l+1], buf[lo][l], i, j, pw, ph);
      const float l1 = ll_laplacian(buf[hi][l+1], buf[hi][l], i, j, pw, ph);
      output[l][j*pw+i] += l0 * (1.0f-a) + l1 * a;
      // we could do this to save on memory (no need for finest buf[][]).
      // unfortunately it results in a quite noticable loss of sharpness, i think
      // the extra level is worth it.
      // else if(l == 0) // use finest scale from input to not amplify noise (and use less memory)
      //   output[l][j*pw+i] += ll_laplacian(padded[l+1], padded[l], i, j, pw, ph);
    }
  }
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(dynamic) collapse(2) shared(w,output,buf)
#endif
  for(int j=0;j<ht;j++) for(int i=0;i<wd;i++)
  {
    out[4*(j*wd+i)+0] = 100.0f * output[0][(j+max_supp)*w+max_supp+i]; // [0,1] -> L
    out[4*(j*wd+i)+1] = input[4*(j*wd+i)+1]; // copy original colour channels
    out[4*(j*wd+i)+2] = input[4*(j*wd+i)+2];
  }
  // free all buffers! (NULL entries above num_levels are fine to free)
  for(int l=0;l<max_levels;l++)
  {
    dt_free_align(padded[l]);
    dt_free_align(output[l]);
    for(int k = 0; k < num_gamma; k++) dt_free_align(buf[k][l]);
  }
  // fix: the code previously did `#undef num_levels`, which is a no-op
  // (num_levels is a const variable, not a macro) and let the max_levels
  // macro leak past this function. undef the name actually #define'd above.
#undef max_levels
#undef num_gamma
}
/* Downsample `input` (wd x ht) by 2x into `coarse` with a 1 4 6 4 1
 * binomial blur, using SSE2 for the vertical pass. Interior rows only;
 * the border of `coarse` is fixed up afterwards by ll_fill_boundary1(). */
static inline void gauss_reduce_sse2(
    const float *const input, // fine input buffer
    float *const coarse,      // coarse scale, blurred input buf
    const int wd,             // fine res
    const int ht)
{
  // blur, store only coarse res
  const int cw = (wd-1)/2+1, ch = (ht-1)/2+1;

  // this version is inspired by opencv's pyrDown_ :
  // - allocate 5 rows of ring buffer (aligned)
  // - for coarse res y
  // - fill 5 coarse-res row buffers with 1 4 6 4 1 weights (reuse some from last time)
  // - do vertical convolution via sse and write to coarse output buf
  const int stride = ((cw+8)&~7); // assure sse alignment of rows
  float *ringbuf = dt_alloc_align(16, sizeof(*ringbuf)*stride*5);
  float *rows[5] = {0};
  int rowj = 0; // we initialised this many rows so far
  for(int j=1;j<ch-1;j++)
  {
    // horizontal pass, convolve with 1 4 6 4 1 kernel and decimate
    for(;rowj<=2*j+2;rowj++)
    {
      float *const row = ringbuf + (rowj % 5)*stride;
      const float *const in = input + rowj*wd;
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
      for(int i=1;i<cw-1;i++)
        row[i] = 6*in[2*i] + 4*(in[2*i-1]+in[2*i+1]) + in[2*i-2] + in[2*i+2];
    }

    // init row pointers: the 5 fine rows contributing to coarse row j
    for(int k=0;k<5;k++)
      rows[k] = ringbuf + ((2*j-2+k)%5)*stride;

    // vertical pass, convolve and decimate using SIMD:
    // note that we're ignoring the (1..cw-1) buffer limit, we'll pull in
    // garbage and fix it later by border filling.
    float *const out = coarse + j*cw;
    const float *const row0 = rows[0], *const row1 = rows[1], *const row2 = rows[2],
                *const row3 = rows[3], *const row4 = rows[4];
    const __m128 four = _mm_set1_ps(4.f), scale = _mm_set1_ps(1.f/256.f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
    for(int i=0;i<=cw-8;i+=8)
    {
      // two 4-wide lanes per iteration: (r0+r4) + 2*r2 + 4*(r1+r3+r2),
      // i.e. the 1 4 6 4 1 kernel, then scale by 1/256
      __m128 r0, r1, r2, r3, r4, t0, t1;
      r0 = _mm_load_ps(row0 + i);
      r1 = _mm_load_ps(row1 + i);
      r2 = _mm_load_ps(row2 + i);
      r3 = _mm_load_ps(row3 + i);
      r4 = _mm_load_ps(row4 + i);
      r0 = _mm_add_ps(r0, r4);
      r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
      r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
      t0 = _mm_add_ps(r0, _mm_mul_ps(r1, four));
      r0 = _mm_load_ps(row0 + i + 4);
      r1 = _mm_load_ps(row1 + i + 4);
      r2 = _mm_load_ps(row2 + i + 4);
      r3 = _mm_load_ps(row3 + i + 4);
      r4 = _mm_load_ps(row4 + i + 4);
      r0 = _mm_add_ps(r0, r4);
      r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
      r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
      t1 = _mm_add_ps(r0, _mm_mul_ps(r1, four));
      t0 = _mm_mul_ps(t0, scale);
      t1 = _mm_mul_ps(t1, scale);
      _mm_storeu_ps(out + i, t0);
      _mm_storeu_ps(out + i + 4, t1);
    }
    // process the rest (scalar tail past the last full 8-wide chunk)
    for(int i=cw&~7;i<cw-1;i++)
      out[i] = (6*row2[i] + 4*(row1[i] + row3[i]) + row0[i] + row4[i])*(1.0f/256.0f);
  }
  dt_free_align(ringbuf);
  ll_fill_boundary1(coarse, cw, ch);
}
// internal function: to avoid exif blob reading + 8-bit byteorder flag + high-quality override
/* Export one image through the full pixelpipe into `format`/`filename`.
 * Returns 0 on success, 1 on failure (pipe init, missing buffer, bad style).
 * Steps: fetch mipmap input -> optionally inject style items into history ->
 * run pipe (optionally high-quality, i.e. full-res then downscale) ->
 * convert to the format's bit depth -> write via the format module. */
int dt_imageio_export_with_flags(
    const uint32_t imgid, const char *filename, dt_imageio_module_format_t *format,
    dt_imageio_module_data_t *format_params, const int32_t ignore_exif, const int32_t display_byteorder,
    const gboolean high_quality, const int32_t thumbnail_export, const char *filter,
    const gboolean copy_metadata, dt_imageio_module_storage_t *storage,
    dt_imageio_module_data_t *storage_params)
{
  dt_develop_t dev;
  dt_dev_init(&dev, 0);
  dt_mipmap_buffer_t buf;
  // thumbnails may use the smaller _F buffer when low quality is allowed
  if(thumbnail_export && dt_conf_get_bool("plugins/lighttable/low_quality_thumbnails"))
    dt_mipmap_cache_read_get(darktable.mipmap_cache, &buf, imgid, DT_MIPMAP_F, DT_MIPMAP_BLOCKING);
  else
    dt_mipmap_cache_read_get(darktable.mipmap_cache, &buf, imgid, DT_MIPMAP_FULL, DT_MIPMAP_BLOCKING);
  dt_dev_load_image(&dev, imgid);
  const dt_image_t *img = &dev.image_storage;
  const int wd = img->width;
  const int ht = img->height;
  int res = 0;
  dt_times_t start;
  dt_get_times(&start);
  dt_dev_pixelpipe_t pipe;
  res = thumbnail_export ? dt_dev_pixelpipe_init_thumbnail(&pipe, wd, ht)
                         : dt_dev_pixelpipe_init_export(&pipe, wd, ht, format->levels(format_params));
  if(!res)
  {
    dt_control_log(
        _("failed to allocate memory for %s, please lower the threads used for export or buy more memory."),
        thumbnail_export ? C_("noun", "thumbnail export") : C_("noun", "export"));
    dt_dev_cleanup(&dev);
    dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
    return 1;
  }
  if(!buf.buf)
  {
    dt_control_log(_("image `%s' is not available!"), img->filename);
    dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
    dt_dev_cleanup(&dev);
    return 1;
  }

  // If a style is to be applied during export, add the iop params into the history
  if (!thumbnail_export && format_params->style[0] != '\0')
  {
    GList *stls;

    GList *modules = dev.iop;
    dt_iop_module_t *m = NULL;

    if ((stls=dt_styles_get_item_list(format_params->style, TRUE, -1)) == 0)
    {
      dt_control_log(_("cannot find the style '%s' to apply during export."), format_params->style);
      dt_dev_cleanup(&dev);
      dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
      return 1;
    }

    // Add each params
    while (stls)
    {
      dt_style_item_t *s = (dt_style_item_t *) stls->data;

      modules = dev.iop;
      while (modules)
      {
        m = (dt_iop_module_t *)modules->data;

        // since the name in the style is returned with a possible multi-name, just check the start of the name
        if (strncmp(m->op, s->name, strlen(m->op)) == 0)
        {
          // append a synthetic history item carrying the style's params
          dt_dev_history_item_t *h = malloc(sizeof(dt_dev_history_item_t));
          h->params = s->params;
          h->blend_params = s->blendop_params;
          h->enabled = s->enabled;
          h->module = m;
          h->multi_priority = 1;
          g_strlcpy(h->multi_name, "", sizeof(h->multi_name));

          // upgrade old-format style params to the module's current version
          if(m->legacy_params && (s->module_version != m->version()))
          {
            void *new_params = malloc(m->params_size);
            m->legacy_params (m, h->params, s->module_version, new_params, labs(m->version()));
            free (h->params);
            h->params = new_params;
          }

          dev.history_end++;
          dev.history = g_list_append(dev.history, h);
          break;
        }
        modules = g_list_next(modules);
      }
      stls = g_list_next(stls);
    }
  }

  dt_dev_pixelpipe_set_input(&pipe, &dev, (float *)buf.buf, buf.width, buf.height, 1.0);
  dt_dev_pixelpipe_create_nodes(&pipe, &dev);
  dt_dev_pixelpipe_synch_all(&pipe, &dev);
  dt_dev_pixelpipe_get_dimensions(&pipe, &dev, pipe.iwidth, pipe.iheight, &pipe.processed_width,
                                  &pipe.processed_height);
  // optionally disable part of the pipe before/after a named module
  if(filter)
  {
    if(!strncmp(filter, "pre:", 4)) dt_dev_pixelpipe_disable_after(&pipe, filter+4);
    if(!strncmp(filter, "post:", 5)) dt_dev_pixelpipe_disable_before(&pipe, filter+5);
  }
  dt_show_times(&start, "[export] creating pixelpipe", NULL);

  // find output color profile for this image:
  int sRGB = 1;
  gchar *overprofile = dt_conf_get_string("plugins/lighttable/export/iccprofile");
  if(overprofile && !strcmp(overprofile, "sRGB"))
  {
    sRGB = 1;
  }
  else if(!overprofile || !strcmp(overprofile, "image"))
  {
    // inspect the colorout module's configured profile
    GList *modules = dev.iop;
    dt_iop_module_t *colorout = NULL;
    while (modules)
    {
      colorout = (dt_iop_module_t *)modules->data;
      if(colorout->get_p && strcmp(colorout->op, "colorout") == 0)
      {
        const char *iccprofile = colorout->get_p(colorout->params, "iccprofile");
        if(!strcmp(iccprofile, "sRGB")) sRGB = 1;
        else sRGB = 0;
      }
      modules = g_list_next(modules);
    }
  }
  else
  {
    sRGB = 0;
  }
  g_free(overprofile);

  // get only once at the beginning, in case the user changes it on the way:
  const gboolean high_quality_processing
      = ((format_params->max_width == 0 || format_params->max_width >= pipe.processed_width )
         && (format_params->max_height == 0 || format_params->max_height >= pipe.processed_height))
        ? FALSE
        : high_quality;
  // in high-quality mode process at full resolution, downscale afterwards
  const int width = high_quality_processing ? 0 : format_params->max_width;
  const int height = high_quality_processing ? 0 : format_params->max_height;
  const double scalex = width > 0 ? fminf(width /(double)pipe.processed_width, 1.0) : 1.0;
  const double scaley = height > 0 ? fminf(height/(double)pipe.processed_height, 1.0) : 1.0;
  const double scale = fminf(scalex, scaley);
  int processed_width = scale*pipe.processed_width + .5f;
  int processed_height = scale*pipe.processed_height + .5f;
  const int bpp = format->bpp(format_params);

  // downsampling done last, if high quality processing was requested:
  uint8_t *outbuf = pipe.backbuf;
  uint8_t *moutbuf = NULL; // keep track of alloc'ed memory
  dt_get_times(&start);
  if(high_quality_processing)
  {
    dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    const double scalex = format_params->max_width > 0
        ? fminf(format_params->max_width /(double)pipe.processed_width, 1.0) : 1.0;
    const double scaley = format_params->max_height > 0
        ? fminf(format_params->max_height/(double)pipe.processed_height, 1.0) : 1.0;
    const double scale = fminf(scalex, scaley);
    processed_width = scale*pipe.processed_width + .5f;
    processed_height = scale*pipe.processed_height + .5f;
    moutbuf = (uint8_t *)dt_alloc_align(64, (size_t)sizeof(float)*processed_width*processed_height*4);
    outbuf = moutbuf;
    // now downscale into the new buffer:
    dt_iop_roi_t roi_in, roi_out;
    roi_in.x = roi_in.y = roi_out.x = roi_out.y = 0;
    roi_in.scale = 1.0;
    roi_out.scale = scale;
    roi_in.width = pipe.processed_width;
    roi_in.height = pipe.processed_height;
    roi_out.width = processed_width;
    roi_out.height = processed_height;
    dt_iop_clip_and_zoom((float *)outbuf, (float *)pipe.backbuf, &roi_out, &roi_in, processed_width,
                         pipe.processed_width);
  }
  else
  {
    // do the processing (8-bit with special treatment, to make sure we can use openmp further down):
    if(bpp == 8)
      dt_dev_pixelpipe_process(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    else
      dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, processed_width, processed_height, scale);
    outbuf = pipe.backbuf;
  }
  dt_show_times(&start, thumbnail_export ? "[dev_process_thumbnail] pixel pipeline processing"
                                         : "[dev_process_export] pixel pipeline processing", NULL);

  // downconversion to low-precision formats:
  if(bpp == 8)
  {
    if(display_byteorder)
    {
      if(high_quality_processing)
      {
        // float buffer -> BGR byte order expected by the display
        const float *const inbuf = (float *)outbuf;
        for(size_t k=0; k<(size_t)processed_width*processed_height; k++)
        {
          // convert in place, this is unfortunately very serial..
          const uint8_t r = CLAMP(inbuf[4*k+2]*0xff, 0, 0xff);
          const uint8_t g = CLAMP(inbuf[4*k+1]*0xff, 0, 0xff);
          const uint8_t b = CLAMP(inbuf[4*k+0]*0xff, 0, 0xff);
          outbuf[4*k+0] = r;
          outbuf[4*k+1] = g;
          outbuf[4*k+2] = b;
        }
      }
      // else processing output was 8-bit already, and no need to swap order
    }
    else // need to flip
    {
      // ldr output: char
      if(high_quality_processing)
      {
        // float buffer -> RGB byte order
        const float *const inbuf = (float *)outbuf;
        for(size_t k=0; k<(size_t)processed_width*processed_height; k++)
        {
          // convert in place, this is unfortunately very serial..
          const uint8_t r = CLAMP(inbuf[4*k+0]*0xff, 0, 0xff);
          const uint8_t g = CLAMP(inbuf[4*k+1]*0xff, 0, 0xff);
          const uint8_t b = CLAMP(inbuf[4*k+2]*0xff, 0, 0xff);
          outbuf[4*k+0] = r;
          outbuf[4*k+1] = g;
          outbuf[4*k+2] = b;
        }
      }
      else
      {
        // !display_byteorder, need to swap:
        uint8_t *const buf8 = pipe.backbuf;
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(processed_width, processed_height) schedule(static)
#endif
        // just flip byte order
        for(size_t k=0; k<(size_t)processed_width*processed_height; k++)
        {
          uint8_t tmp = buf8[4*k+0];
          buf8[4*k+0] = buf8[4*k+2];
          buf8[4*k+2] = tmp;
        }
      }
    }
  }
  else if(bpp == 16)
  {
    // uint16_t per color channel
    float *buff = (float *) outbuf;
    uint16_t *buf16 = (uint16_t *)outbuf;
    for(int y=0; y<processed_height; y++)
      for(int x=0; x<processed_width ; x++)
      {
        // convert in place
        const size_t k = (size_t)processed_width*y + x;
        for(int i=0; i<3; i++) buf16[4*k+i] = CLAMP(buff[4*k+i]*0x10000, 0, 0xffff);
      }
  }
  // else output float, no further harm done to the pixels :)

  format_params->width = processed_width;
  format_params->height = processed_height;
  if(!ignore_exif)
  {
    int length;
    uint8_t exif_profile[65535]; // C++ alloc'ed buffer is uncool, so we waste some bits here.
    char pathname[PATH_MAX];
    gboolean from_cache = TRUE;
    dt_image_full_path(imgid, pathname, sizeof(pathname), &from_cache);
    // last param is dng mode, it's false here
    length = dt_exif_read_blob(exif_profile, pathname, imgid, sRGB, processed_width, processed_height, 0);
    res = format->write_image (format_params, filename, outbuf, exif_profile, length, imgid);
  }
  else
  {
    res = format->write_image (format_params, filename, outbuf, NULL, 0, imgid);
  }

  dt_dev_pixelpipe_cleanup(&pipe);
  dt_dev_cleanup(&dev);
  dt_mipmap_cache_read_release(darktable.mipmap_cache, &buf);
  dt_free_align(moutbuf);

  /* now write xmp into that container, if possible */
  if(copy_metadata && (format->flags(format_params) & FORMAT_FLAGS_SUPPORT_XMP))
  {
    dt_exif_xmp_attach(imgid, filename);
    // no need to cancel the export if this fail
  }

  if(!thumbnail_export && strcmp(format->mime(format_params), "memory"))
  {
    dt_control_signal_raise(darktable.signals,DT_SIGNAL_IMAGE_EXPORT_TMPFILE,imgid,filename,format,format_params,storage,storage_params);
  }

  return res;
}
/** Rasterize a group mask into `buffer` for the given region of interest.
 *
 *  Each member shape of the group is rendered into a scratch buffer and then
 *  combined into `buffer` according to its state flags (inverse, union,
 *  intersection, difference, exclusion) and its opacity.
 *
 *  @param module  owning iop module (used to resolve form ids via its dev)
 *  @param piece   pixelpipe piece the mask is evaluated for
 *  @param form    the group form whose member shapes are combined
 *  @param roi     region of interest; defines buffer dimensions
 *  @param buffer  output mask, width*height floats; zeroed here before combining
 *  @return 1 if at least one member shape was rasterized, 0 otherwise
 *          (including scratch-buffer allocation failure)
 */
static int dt_group_get_mask_roi(dt_iop_module_t *module, dt_dev_pixelpipe_iop_t *piece,
                                 dt_masks_form_t *form, const dt_iop_roi_t *roi, float *buffer)
{
  double start2 = dt_get_wtime();
  const guint nb = g_list_length(form->points);
  if(nb == 0) return 0;
  int nb_ok = 0;

  const int width = roi->width;
  const int height = roi->height;

  // we need to allocate a temporary buffer for intermediate creation of individual shapes
  float *bufs = dt_alloc_align(64, (size_t)width * height * sizeof(float));
  if(bufs == NULL) return 0;

  // empty the output buffer
  memset(buffer, 0, (size_t)width * height * sizeof(float));

  // and we get all masks
  GList *fpts = g_list_first(form->points);
  while(fpts)
  {
    dt_masks_point_group_t *fpt = (dt_masks_point_group_t *)fpts->data;
    dt_masks_form_t *sel = dt_masks_get_from_id(module->dev, fpt->formid);
    if(sel)
    {
      const int ok = dt_masks_get_mask_roi(module, piece, sel, roi, bufs);
      const float op = fpt->opacity;
      const int state = fpt->state;
      if(ok)
      {
        // first see if we need to invert this shape
        if(state & DT_MASKS_STATE_INVERSE)
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs)
#else
#pragma omp parallel for shared(bufs)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              bufs[index] = 1.0f - bufs[index];
            }
        }

        if(state & DT_MASKS_STATE_UNION)
        {
          // union: keep the stronger of the existing mask and the new (opacity-scaled) shape
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              buffer[index] = fmaxf(buffer[index], bufs[index] * op);
            }
        }
        else if(state & DT_MASKS_STATE_INTERSECTION)
        {
          // intersection: non-zero only where both masks are non-zero; take the weaker value
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              float b1 = buffer[index];
              // fixed: was the redundant double assignment `b2 = b2 = bufs[index]`
              float b2 = bufs[index];
              if(b1 > 0.0f && b2 > 0.0f)
                buffer[index] = fminf(b1, b2 * op);
              else
                buffer[index] = 0.0f;
            }
        }
        else if(state & DT_MASKS_STATE_DIFFERENCE)
        {
          // difference: attenuate the existing mask where the new shape is present
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              float b1 = buffer[index];
              float b2 = bufs[index] * op;
              if(b1 > 0.0f && b2 > 0.0f) buffer[index] = b1 * (1.0f - b2);
            }
        }
        else if(state & DT_MASKS_STATE_EXCLUSION)
        {
          // exclusion: where both overlap, keep the stronger "one minus the other" term,
          // otherwise behave like a union
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              float b1 = buffer[index];
              float b2 = bufs[index] * op;
              if(b1 > 0.0f && b2 > 0.0f)
                buffer[index] = fmaxf((1.0f - b1) * b2, b1 * (1.0f - b2));
              else
                buffer[index] = fmaxf(b1, b2);
            }
        }
        else // if we are here, this mean that we just have to copy the shape and null other parts
        {
#ifdef _OPENMP
#if !defined(__SUNOS__) && !defined(__NetBSD__)
#pragma omp parallel for default(none) shared(bufs, buffer)
#else
#pragma omp parallel for shared(bufs, buffer)
#endif
#endif
          for(int y = 0; y < height; y++)
            for(int x = 0; x < width; x++)
            {
              size_t index = (size_t)y * width + x;
              buffer[index] = bufs[index] * op;
            }
        }

        if(darktable.unmuted & DT_DEBUG_PERF)
          dt_print(DT_DEBUG_MASKS, "[masks %d] combine took %0.04f sec\n", nb_ok, dt_get_wtime() - start2);
        start2 = dt_get_wtime();

        nb_ok++;
      }
    }
    fpts = g_list_next(fpts);
  }

  // and we free the intermediate buffer
  dt_free_align(bufs);

  return (nb_ok != 0);
}
/** Load an image through GraphicsMagick into the full-size mipmap buffer.
 *
 *  Reads the file row by row into an aligned scratch line ("RGBP" gives four
 *  floats per pixel — presumably R,G,B plus padding; matches img->bpp) and
 *  writes each pixel rotated according to EXIF orientation via
 *  dt_imageio_write_pos().
 *
 *  @param img       image record; width/height/bpp/flags are filled in here
 *  @param filename  path of the file to load
 *  @param a         mipmap cache allocator used for the destination buffer
 *  @return DT_IMAGEIO_OK on success, otherwise an error code
 */
dt_imageio_retval_t dt_imageio_open_gm(dt_image_t *img, const char *filename, dt_mipmap_cache_allocator_t a)
{
  int err = DT_IMAGEIO_FILE_CORRUPTED;
  float *line_buf = NULL;
  ExceptionInfo exception;
  Image *image = NULL;
  ImageInfo *image_info = NULL;
  uint32_t width, height, orientation;

  if(!_supported_image(filename)) return DT_IMAGEIO_FILE_CORRUPTED;

  if(!img->exif_inited) (void)dt_exif_read(img, filename);

  GetExceptionInfo(&exception);
  image_info = CloneImageInfo(NULL);
  g_strlcpy(image_info->filename, filename, sizeof(image_info->filename));

  image = ReadImage(image_info, &exception);
  if(exception.severity != UndefinedException) CatchException(&exception);
  if(!image)
  {
    fprintf(stderr, "[GraphicsMagick_open] image `%s' not found\n", img->filename);
    err = DT_IMAGEIO_FILE_NOT_FOUND;
    goto cleanup;
  }

  width = image->columns;
  height = image->rows;
  orientation = image->orientation;

  // a 90/270-degree orientation swaps the logical dimensions
  if(orientation & 4)
  {
    img->width = height;
    img->height = width;
  }
  else
  {
    img->width = width;
    img->height = height;
  }

  img->bpp = 4 * sizeof(float);

  float *mipbuf = (float *)dt_mipmap_cache_alloc(img, DT_MIPMAP_FULL, a);
  if(!mipbuf)
  {
    fprintf(stderr, "[GraphicsMagick_open] could not alloc full buffer for image `%s'\n", img->filename);
    err = DT_IMAGEIO_CACHE_FULL;
    goto cleanup;
  }

  line_buf = (float *)dt_alloc_align(16, width * img->bpp);
  if(!line_buf) goto cleanup;

  // pretend unrotated here; dt_imageio_write_pos performs the rotation
  const int ht2 = orientation & 4 ? img->width : img->height;
  const int wd2 = orientation & 4 ? img->height : img->width;

  for(uint32_t row = 0; row < height; row++)
  {
    const int ret = DispatchImage(image, 0, row, width, 1, "RGBP", FloatPixel, (void *)line_buf, &exception);
    if(exception.severity != UndefinedException) CatchException(&exception);
    if(ret != MagickPass)
    {
      fprintf(stderr, "[GraphicsMagick_open] error reading image `%s'\n", img->filename);
      err = DT_IMAGEIO_FILE_CORRUPTED;
      goto cleanup;
    }
    for(uint32_t x = 0; x < width; x++)
      for(int c = 0; c < 4; c++)
        mipbuf[4 * dt_imageio_write_pos(x, row, wd2, ht2, wd2, ht2, orientation) + c] = line_buf[4 * x + c];
  }

  err = DT_IMAGEIO_OK;

cleanup:
  // single teardown path for both success and failure
  if(line_buf) dt_free_align(line_buf);
  if(image) DestroyImage(image);
  if(image_info) DestroyImageInfo(image_info);
  DestroyExceptionInfo(&exception);

  if(err == DT_IMAGEIO_OK)
  {
    img->filters = 0;
    img->flags &= ~DT_IMAGE_RAW;
    img->flags &= ~DT_IMAGE_HDR;
    img->flags |= DT_IMAGE_LDR;
  }
  return err;
}
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { dt_iop_global_tonemap_data_t *d = (dt_iop_global_tonemap_data_t *)piece->data; dt_iop_global_tonemap_global_data_t *gd = (dt_iop_global_tonemap_global_data_t *)self->data; dt_iop_global_tonemap_gui_data_t *g = (dt_iop_global_tonemap_gui_data_t *)self->gui_data; dt_bilateral_cl_t *b = NULL; cl_int err = -999; cl_mem dev_m = NULL; cl_mem dev_r = NULL; float *maximum = NULL; const int devid = piece->pipe->devid; int gtkernel = -1; const int width = roi_out->width; const int height = roi_out->height; float parameters[4] = { 0.0f }; switch(d->operator) { case OPERATOR_REINHARD: gtkernel = gd->kernel_global_tonemap_reinhard; break; case OPERATOR_DRAGO: gtkernel = gd->kernel_global_tonemap_drago; break; case OPERATOR_FILMIC: gtkernel = gd->kernel_global_tonemap_filmic; break; } if(d->operator== OPERATOR_DRAGO) { const float eps = 0.0001f; float tmp_lwmax = NAN; // see comments in process() about lwmax value if(self->dev->gui_attached && g && piece->pipe->type == DT_DEV_PIXELPIPE_FULL) { dt_pthread_mutex_lock(&g->lock); const uint64_t hash = g->hash; dt_pthread_mutex_unlock(&g->lock); if(hash != 0 && !dt_dev_sync_pixelpipe_hash(self->dev, piece->pipe, 0, self->priority, &g->lock, &g->hash)) dt_control_log(_("inconsistent output")); dt_pthread_mutex_lock(&g->lock); tmp_lwmax = g->lwmax; dt_pthread_mutex_unlock(&g->lock); } if(isnan(tmp_lwmax)) { dt_opencl_local_buffer_t flocopt = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1, .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 4, .sizey = 1 << 4 }; if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_first, &flocopt)) goto error; const size_t bwidth = ROUNDUP(width, flocopt.sizex); const size_t bheight = ROUNDUP(height, flocopt.sizey); const int bufsize = (bwidth / flocopt.sizex) * (bheight / 
flocopt.sizey); dt_opencl_local_buffer_t slocopt = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1, .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 16, .sizey = 1 }; if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_second, &slocopt)) goto error; const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex); size_t sizes[3]; size_t local[3]; dev_m = dt_opencl_alloc_device_buffer(devid, (size_t)bufsize * sizeof(float)); if(dev_m == NULL) goto error; dev_r = dt_opencl_alloc_device_buffer(devid, (size_t)reducesize * sizeof(float)); if(dev_r == NULL) goto error; sizes[0] = bwidth; sizes[1] = bheight; sizes[2] = 1; local[0] = flocopt.sizex; local[1] = flocopt.sizey; local[2] = 1; dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 0, sizeof(cl_mem), &dev_in); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 1, sizeof(int), &width); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 2, sizeof(int), &height); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 3, sizeof(cl_mem), &dev_m); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 4, flocopt.sizex * flocopt.sizey * sizeof(float), NULL); err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_pixelmax_first, sizes, local); if(err != CL_SUCCESS) goto error; sizes[0] = reducesize * slocopt.sizex; sizes[1] = 1; sizes[2] = 1; local[0] = slocopt.sizex; local[1] = 1; local[2] = 1; dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 0, sizeof(cl_mem), &dev_m); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 1, sizeof(cl_mem), &dev_r); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 2, sizeof(int), &bufsize); dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 3, slocopt.sizex * sizeof(float), NULL); err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_pixelmax_second, sizes, local); if(err != CL_SUCCESS) goto error; maximum = dt_alloc_align(16, 
reducesize * sizeof(float)); err = dt_opencl_read_buffer_from_device(devid, (void *)maximum, dev_r, 0, (size_t)reducesize * sizeof(float), CL_TRUE); if(err != CL_SUCCESS) goto error; dt_opencl_release_mem_object(dev_r); dt_opencl_release_mem_object(dev_m); dev_r = dev_m = NULL; for(int k = 1; k < reducesize; k++) { float mine = maximum[0]; float other = maximum[k]; maximum[0] = (other > mine) ? other : mine; } tmp_lwmax = MAX(eps, (maximum[0] * 0.01f)); dt_free_align(maximum); maximum = NULL; } const float lwmax = tmp_lwmax; const float ldc = d->drago.max_light * 0.01f / log10f(lwmax + 1.0f); const float bl = logf(MAX(eps, d->drago.bias)) / logf(0.5f); parameters[0] = eps; parameters[1] = ldc; parameters[2] = bl; parameters[3] = lwmax; if(self->dev->gui_attached && g && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW) { uint64_t hash = dt_dev_hash_plus(self->dev, piece->pipe, 0, self->priority); dt_pthread_mutex_lock(&g->lock); g->lwmax = lwmax; g->hash = hash; dt_pthread_mutex_unlock(&g->lock); } } const float scale = piece->iscale / roi_in->scale; const float sigma_r = 8.0f; // does not depend on scale const float iw = piece->buf_in.width / scale; const float ih = piece->buf_in.height / scale; const float sigma_s = fminf(iw, ih) * 0.03f; if(d->detail != 0.0f) { b = dt_bilateral_init_cl(devid, roi_in->width, roi_in->height, sigma_s, sigma_r); if(!b) goto error; // get detail from unchanged input buffer err = dt_bilateral_splat_cl(b, dev_in); if(err != CL_SUCCESS) goto error; } size_t sizes[2] = { ROUNDUPWD(width), ROUNDUPHT(height) }; dt_opencl_set_kernel_arg(devid, gtkernel, 0, sizeof(cl_mem), &dev_in); dt_opencl_set_kernel_arg(devid, gtkernel, 1, sizeof(cl_mem), &dev_out); dt_opencl_set_kernel_arg(devid, gtkernel, 2, sizeof(int), &width); dt_opencl_set_kernel_arg(devid, gtkernel, 3, sizeof(int), &height); dt_opencl_set_kernel_arg(devid, gtkernel, 4, 4 * sizeof(float), ¶meters); err = dt_opencl_enqueue_kernel_2d(devid, gtkernel, sizes); if(err != CL_SUCCESS) goto 
error; if(d->detail != 0.0f) { err = dt_bilateral_blur_cl(b); if(err != CL_SUCCESS) goto error; // and apply it to output buffer after logscale err = dt_bilateral_slice_to_output_cl(b, dev_in, dev_out, d->detail); if(err != CL_SUCCESS) goto error; dt_bilateral_free_cl(b); } return TRUE; error: if(b) dt_bilateral_free_cl(b); dt_opencl_release_mem_object(dev_m); dt_opencl_release_mem_object(dev_r); dt_free_align(maximum); dt_print(DT_DEBUG_OPENCL, "[opencl_global_tonemap] couldn't enqueue kernel! %d\n", err); return FALSE; } #endif void tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out, struct dt_develop_tiling_t *tiling) { dt_iop_global_tonemap_data_t *d = (dt_iop_global_tonemap_data_t *)piece->data; const float scale = piece->iscale / roi_in->scale; const float iw = piece->buf_in.width / scale; const float ih = piece->buf_in.height / scale; const float sigma_s = fminf(iw, ih) * 0.03f; const float sigma_r = 8.0f; const int detail = (d->detail != 0.0f); const int width = roi_in->width; const int height = roi_in->height; const int channels = piece->colors; const size_t basebuffer = width * height * channels * sizeof(float); tiling->factor = 2.0f + (detail ? (float)dt_bilateral_memory_use2(width, height, sigma_s, sigma_r) / basebuffer : 0.0f); tiling->maxbuf = (detail ? MAX(1.0f, (float)dt_bilateral_singlebuffer_size2(width, height, sigma_s, sigma_r) / basebuffer) : 1.0f); tiling->overhead = 0; tiling->overlap = (detail ? 
ceilf(4 * sigma_s) : 0); tiling->xalign = 1; tiling->yalign = 1; return; } void commit_params(struct dt_iop_module_t *self, dt_iop_params_t *p1, dt_dev_pixelpipe_t *pipe, dt_dev_pixelpipe_iop_t *piece) { dt_iop_global_tonemap_params_t *p = (dt_iop_global_tonemap_params_t *)p1; dt_iop_global_tonemap_data_t *d = (dt_iop_global_tonemap_data_t *)piece->data; d->operator= p->operator; d->drago.bias = p->drago.bias; d->drago.max_light = p->drago.max_light; d->detail = p->detail; // drago needs the maximum L-value of the whole image so it must not use tiling if(d->operator == OPERATOR_DRAGO) piece->process_tiling_ready = 0; #ifdef HAVE_OPENCL if(d->detail != 0.0f) piece->process_cl_ready = (piece->process_cl_ready && !(darktable.opencl->avoid_atomics)); #endif }
/** Colorout CPU path: convert Lab input to the output RGB space.
 *
 *  Two codepaths: a fast SSE matrix + per-channel LUT path when the profile
 *  reduces to a matrix (d->cmatrix[0] is finite), otherwise a full lcms2
 *  transform; in gamut-check mode out-of-gamut pixels are painted with a
 *  marker color instead.
 *
 *  ivoid/ovoid are ch-strided float images of roi dimensions; the alpha
 *  channel is restored afterwards when mask display is active.
 */
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid,
             const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->mode == DT_PROFILE_GAMUTCHECK);

  if(!isnan(d->cmatrix[0]))
  {
    // matrix codepath: convert to rgb using the 3x3 matrix
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in, roi_out, ivoid, ovoid)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      // matrix columns; loop-invariant, kept inside the parallel loop so each
      // thread has private copies
      const __m128 m0 = _mm_set_ps(0.0f, d->cmatrix[6], d->cmatrix[3], d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f, d->cmatrix[7], d->cmatrix[4], d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f, d->cmatrix[8], d->cmatrix[5], d->cmatrix[2]);
      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        const __m128 t
            = _mm_add_ps(_mm_mul_ps(m0, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(0, 0, 0, 0))),
                         _mm_add_ps(_mm_mul_ps(m1, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(1, 1, 1, 1))),
                                    _mm_mul_ps(m2, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(2, 2, 2, 2)))));
        _mm_stream_ps(out, t);
      }
    }
    _mm_sfence();

    // apply profile curves (LUT below 1.0, extrapolation polynomial above)
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in, roi_out, ivoid, ovoid)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        // fixed: channel index was a second `i`, shadowing the pixel loop variable
        for(int c = 0; c < 3; c++)
          if(d->lut[c][0] >= 0.0f)
          {
            out[c] = (out[c] < 1.0f) ? lerp_lut(d->lut[c], out[c])
                                     : dt_iop_eval_exp(d->unbounded_coeffs[c], out[c]);
          }
      }
    }
  }
  else
  {
    // lcms2 transform codepath
    const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(ivoid, ovoid, roi_out)
#endif
    for(int k = 0; k < roi_out->height; k++)
    {
      const float *in = ((float *)ivoid) + (size_t)ch * k * roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch * k * roi_out->width;

      if(!gamutcheck)
      {
        cmsDoTransform(d->xform, in, out, roi_out->width);
      }
      else
      {
        // transform into a scratch row, then replace out-of-gamut pixels
        // (any negative channel) with the marker color
        void *rgb = dt_alloc_align(16, 4 * sizeof(float) * roi_out->width);
        cmsDoTransform(d->xform, in, rgb, roi_out->width);
        float *rgbptr = (float *)rgb;
        for(int j = 0; j < roi_out->width; j++, rgbptr += 4, out += 4)
        {
          const __m128 pixel = _mm_load_ps(rgbptr);
          __m128 ingamut = _mm_cmplt_ps(pixel, _mm_set_ps(-FLT_MAX, 0.0f, 0.0f, 0.0f));
          // horizontal OR across the three color channels
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));
          const __m128 result
              = _mm_or_ps(_mm_and_ps(ingamut, outofgamutpixel), _mm_andnot_ps(ingamut, pixel));
          _mm_stream_ps(out, result);
        }
        dt_free_align(rgb);
      }
    }
    _mm_sfence();
  }

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}