/**
 * Initialize the image cache and its backing array of image structs.
 *
 * Sets up the underlying dt_cache with a fixed 50 MB quota, registers the
 * allocate/cleanup callbacks, and allocates one dt_image_t slot per cache
 * entry (using the capacity reported back by the cache). Every slot is
 * pre-filled with the default-initialized image produced by dt_image_init().
 *
 * If the backing array cannot be allocated, cache->images is left NULL and
 * the function returns without initializing any slot.
 *
 * @param cache  image cache to initialize; must not be NULL.
 */
void dt_image_cache_init(dt_image_cache_t *cache)
{
  // the image cache does no serialization.
  // (unsafe. data should be in db/xmp, not in any other additional cache,
  // also, it should be relatively fast to get the image_t structs from sql.)
  // TODO: actually an independent conf var?
  //       too large: dangerous and wasteful?
  //       can we get away with a fixed size?
  const uint32_t max_mem = 50 * 1024 * 1024;
  uint32_t num = (uint32_t)(1.5f * max_mem / sizeof(dt_image_t));

  dt_cache_init(&cache->cache, num, 16, 64, max_mem);
  dt_cache_set_allocate_callback(&cache->cache, &dt_image_cache_allocate, cache);
  dt_cache_set_cleanup_callback(&cache->cache, &dt_image_cache_deallocate, cache);

  // might have been rounded to power of two:
  num = dt_cache_capacity(&cache->cache);
  cache->images = dt_alloc_align(64, sizeof(dt_image_t) * num);
  if(cache->images == NULL)
  {
    // never hand a NULL array to dt_image_init()/memcpy() below.
    dt_print(DT_DEBUG_CACHE, "[image_cache] failed to allocate the image array\n");
    return;
  }
  dt_print(DT_DEBUG_CACHE, "[image_cache] has %d entries\n", num);

  // initialize first image as empty data:
  dt_image_init(cache->images);
  for(uint32_t k = 1; k < num; k++)
  {
    // optimized initialization (avoid accessing conf):
    // replicate the already initialized slot 0 instead of calling
    // dt_image_init() once per entry.
    memcpy(cache->images + k, cache->images, sizeof(dt_image_t));
  }
}
/**
 * Initialize the image cache.
 *
 * The underlying dt_cache is created with sizeof(dt_image_t) and a fixed
 * 50 MB quota as arguments, then the allocate and cleanup callbacks are
 * registered with `cache` as their user data.
 *
 * The image cache is deliberately not serialized: image data is expected to
 * live in the db/xmp, and re-reading the image structs from sql should be
 * reasonably fast.
 *
 * @param cache  image cache to initialize.
 */
void dt_image_cache_init(dt_image_cache_t *cache)
{
  // TODO: actually an independent conf var?
  //       too large: dangerous and wasteful?
  //       can we get away with a fixed size?
  const uint32_t quota_bytes = 50 * 1024 * 1024;

  dt_cache_init(&cache->cache, sizeof(dt_image_t), quota_bytes);
  dt_cache_set_allocate_callback(&cache->cache, dt_image_cache_allocate, cache);
  dt_cache_set_cleanup_callback(&cache->cache, dt_image_cache_deallocate, cache);

  // this figure is only an estimate derived from the quota; it is not passed
  // to dt_cache_init() and is used for the debug message alone.
  const uint32_t estimated_entries = (uint32_t)(1.5f * quota_bytes / sizeof(dt_image_t));
  dt_print(DT_DEBUG_CACHE, "[image_cache] has %d entries\n", estimated_entries);
}
int main(int argc, char *arg[]) { dt_cache_t cache; // dt_cache_init(&cache, 110000, 16, 64, 100000); // really hammer it, make quota insanely low: dt_cache_init(&cache, 110000, 16, 64, 100); dt_cache_set_allocate_callback(&cache, alloc_dummy, NULL); #ifdef _OPENMP # pragma omp parallel for default(none) schedule(guided) shared(cache, stderr) num_threads(16) #endif for(int k=0;k<100000;k++) { void *data = (void *)(long int)k; const int size = 0;//dt_cache_size(&cache); const int con1 = dt_cache_contains(&cache, k); const int val1 = (int)(long int)dt_cache_read_get(&cache, k); const int val2 = (int)(long int)dt_cache_read_get(&cache, k); // fprintf(stderr, "\rinserted number %d, size %d, value %d - %d, contains %d - %d", k, size, val1, val2, con1, con2); const int con2 = dt_cache_contains(&cache, k); assert (con1 == 0); assert (con2 == 1); assert (val2 == k); dt_cache_read_release(&cache, k); dt_cache_read_release(&cache, k); } dt_cache_print_locked(&cache); // fprintf(stderr, "\n"); fprintf(stderr, "[passed] inserting 100000 entries concurrently\n"); const int size = dt_cache_size(&cache); const int lru_cnt = lru_check_consistency(&cache); const int lru_cnt_r = lru_check_consistency_reverse(&cache); // fprintf(stderr, "lru list contains %d|%d/%d entries\n", lru_cnt, lru_cnt_r, size); assert(size == lru_cnt); assert(lru_cnt_r == lru_cnt); fprintf(stderr, "[passed] cache lru consistency after removals, have %d entries left.\n", size); dt_cache_cleanup(&cache); exit(0); }
/**
 * Initialize the mipmap (thumbnail) cache.
 *
 * Steps, in order:
 *  - initialize the static "dead image" buffer,
 *  - read the compression setting ("cache_compression") into
 *    cache->compression_type (0 = none, 1 = low quality, 2 = high quality),
 *  - derive the memory budget ("cache_memory", clamped to [100 MB, 8 GB]) and
 *    the level of parallelism ("worker_threads" * "parallel_export", clamped
 *    to [1, 8]),
 *  - compute the maximum dimensions of every mip level from
 *    darktable.thumbnail_width/height,
 *  - if compression is enabled, set up the scratch buffers used for
 *    uncompressed data during thumbnail creation,
 *  - set up the statically allocated caches for mip levels DT_MIPMAP_3..0,
 *    handing each level a share of the remaining budget,
 *  - set up the dynamically allocated caches for DT_MIPMAP_FULL and
 *    DT_MIPMAP_F,
 *  - finally call dt_mipmap_cache_deserialize().
 *
 * NOTE(review): the results of dt_alloc_align() are not checked in this
 * function.
 *
 * @param cache  mipmap cache to initialize.
 */
void dt_mipmap_cache_init(dt_mipmap_cache_t *cache)
{
  // make sure static memory is initialized
  struct dt_mipmap_buffer_dsc *dsc = (struct dt_mipmap_buffer_dsc *)dt_mipmap_cache_static_dead_image;
  dead_image_f((dt_mipmap_buffer_t *)(dsc + 1));

  cache->compression_type = 0;
  gchar *compression = dt_conf_get_string("cache_compression");
  if(compression)
  {
    if(!strcmp(compression, "low quality (fast)"))
      cache->compression_type = 1;
    else if(!strcmp(compression, "high quality (slow)"))
      cache->compression_type = 2;
    g_free(compression);
  }

  dt_print(DT_DEBUG_CACHE, "[mipmap_cache_init] using %s\n",
           cache->compression_type == 0
               ? "no compression"
               : (cache->compression_type == 1 ? "low quality compression" : "slow high quality compression"));

  // adjust numbers to be large enough to hold what mem limit suggests.
  // we want at least 100MB, and consider 8G just still reasonable.
  // max_mem is the budget still available; it shrinks as levels are set up.
  size_t max_mem = CLAMPS(dt_conf_get_int64("cache_memory"), 100u << 20, ((uint64_t)8) << 30);
  const uint32_t parallel
      = CLAMP(dt_conf_get_int("worker_threads") * dt_conf_get_int("parallel_export"), 1, 8);
  const int32_t max_size = 2048, min_size = 32;
  int32_t wd = darktable.thumbnail_width;
  int32_t ht = darktable.thumbnail_height;
  wd = CLAMPS(wd, min_size, max_size);
  ht = CLAMPS(ht, min_size, max_size);
  // round up to a multiple of 16 (more than enough to divide by two 3 times)
  if(wd & 0xf) wd = (wd & ~0xf) + 0x10;
  if(ht & 0xf) ht = (ht & ~0xf) + 0x10;

  // cache these, can't change at runtime:
  cache->mip[DT_MIPMAP_F].max_width = wd;
  cache->mip[DT_MIPMAP_F].max_height = ht;
  cache->mip[DT_MIPMAP_F - 1].max_width = wd;
  cache->mip[DT_MIPMAP_F - 1].max_height = ht;
  // every smaller level is half the size of the next larger one.
  for(int k = DT_MIPMAP_F - 2; k >= DT_MIPMAP_0; k--)
  {
    cache->mip[k].max_width = cache->mip[k + 1].max_width / 2;
    cache->mip[k].max_height = cache->mip[k + 1].max_height / 2;
  }

  // initialize some per-thread cached scratchmem for uncompressed buffers during thumb creation:
  if(cache->compression_type)
  {
    cache->scratchmem.max_width = wd;
    cache->scratchmem.max_height = ht;
    cache->scratchmem.buffer_size = wd * ht * sizeof(uint32_t);
    cache->scratchmem.size = DT_MIPMAP_3; // at max.
    // TODO: use thread local storage instead (zero performance penalty on linux)
    dt_cache_init(&cache->scratchmem.cache, parallel, parallel, 64,
                  0.9f * parallel * wd * ht * sizeof(uint32_t));
    // might have been rounded to power of two:
    const int cnt = dt_cache_capacity(&cache->scratchmem.cache);
    cache->scratchmem.buf = dt_alloc_align(64, cnt * wd * ht * sizeof(uint32_t));
    dt_cache_static_allocation(&cache->scratchmem.cache, (uint8_t *)cache->scratchmem.buf,
                               wd * ht * sizeof(uint32_t));
    dt_cache_set_allocate_callback(&cache->scratchmem.cache, scratchmem_allocate, &cache->scratchmem);
    dt_print(DT_DEBUG_CACHE,
             "[mipmap_cache_init] cache has % 5d entries for temporary compression buffers (% 4.02f MB).\n",
             cnt, cnt * wd * ht * sizeof(uint32_t) / (1024.0 * 1024.0));
  }

  for(int k = DT_MIPMAP_3; k >= 0; k--)
  {
    // clear stats:
    cache->mip[k].stats_requests = 0;
    cache->mip[k].stats_near_match = 0;
    cache->mip[k].stats_misses = 0;
    cache->mip[k].stats_fetches = 0;
    cache->mip[k].stats_standin = 0;

    // buffer stores width and height + actual data
    const int width = cache->mip[k].max_width;
    const int height = cache->mip[k].max_height;
    // header + adjusted for dxt compression:
    cache->mip[k].buffer_size
        = 4 * sizeof(uint32_t) + compressed_buffer_size(cache->compression_type, width, height);
    cache->mip[k].size = k;

    // level of parallelism also gives minimum size (which is twice that)
    // is rounded to a power of two by the cache anyways, we might as well.
    // XXX this needs adjustment for video mode (more full-res thumbs for replay)
    // TODO: collect hit/miss stats and auto-adjust to user browsing behaviour
    // TODO: can #prefetches be collected this way, too?
    // share of the remaining budget for this level: level 0 takes all that
    // is left, level k > 0 takes 1/(k+4) of it.
    const size_t max_mem2 = (k == 0) ? max_mem : max_mem / (k + 4);
    uint32_t thumbnails
        = MAX(2, nearest_power_of_two((uint32_t)((double)max_mem2 / cache->mip[k].buffer_size)));
    while(thumbnails > parallel && (size_t)thumbnails * cache->mip[k].buffer_size > max_mem2)
      thumbnails /= 2;

    // try to utilize that memory well (use 90% quota), the hopscotch paper claims good scalability up to
    // even more than that.
    dt_cache_init(&cache->mip[k].cache, thumbnails, parallel, 64,
                  0.9f * thumbnails * cache->mip[k].buffer_size);

    // might have been rounded to power of two:
    thumbnails = dt_cache_capacity(&cache->mip[k].cache);

    // bytes actually reserved for this level, computed once in size_t.
    const size_t level_bytes = (size_t)thumbnails * cache->mip[k].buffer_size;
    // max_mem is unsigned: a plain subtraction would wrap around to a huge
    // value whenever this level ends up larger than what is left (minimum
    // entry count, capacity rounding), so saturate at zero instead.
    max_mem = (level_bytes < max_mem) ? max_mem - level_bytes : 0;

    cache->mip[k].buf = dt_alloc_align(64, level_bytes);
    dt_cache_static_allocation(&cache->mip[k].cache, (uint8_t *)cache->mip[k].buf,
                               cache->mip[k].buffer_size);
    // no cleanup callback is registered for these statically backed levels.
    dt_cache_set_allocate_callback(&cache->mip[k].cache, dt_mipmap_cache_allocate, &cache->mip[k]);

    dt_print(DT_DEBUG_CACHE, "[mipmap_cache_init] cache has % 5d entries for mip %d (% 4.02f MB).\n",
             thumbnails, k, level_bytes / (1024.0 * 1024.0));
  }

  // full buffer needs dynamic alloc:
  const int full_entries
      = MAX(2, parallel); // even with one thread you want two buffers. one for dr one for thumbs.
  int32_t max_mem_bufs = nearest_power_of_two(full_entries);

  // for this buffer, because it can be very busy during import, we want the minimum
  // number of entries in the hashtable to be 16, but leave the quota as is. the dynamic
  // alloc/free properties of this cache take care that no more memory is required.
  // NOTE(review): the call below passes max_mem_bufs for both the entry count
  // and the quota, not 16 -- the comment above may be stale; confirm.
  dt_cache_init(&cache->mip[DT_MIPMAP_FULL].cache, max_mem_bufs, parallel, 64, max_mem_bufs);
  // only an allocate callback is registered for the full buffer cache.
  dt_cache_set_allocate_callback(&cache->mip[DT_MIPMAP_FULL].cache, dt_mipmap_cache_allocate_dynamic,
                                 &cache->mip[DT_MIPMAP_FULL]);
  cache->mip[DT_MIPMAP_FULL].buffer_size = 0;
  cache->mip[DT_MIPMAP_FULL].size = DT_MIPMAP_FULL;
  cache->mip[DT_MIPMAP_FULL].buf = NULL;

  // same for mipf:
  dt_cache_init(&cache->mip[DT_MIPMAP_F].cache, max_mem_bufs, parallel, 64, max_mem_bufs);
  dt_cache_set_allocate_callback(&cache->mip[DT_MIPMAP_F].cache, dt_mipmap_cache_allocate_dynamic,
                                 &cache->mip[DT_MIPMAP_F]);
  dt_cache_set_cleanup_callback(&cache->mip[DT_MIPMAP_F].cache, dt_mipmap_cache_deallocate_dynamic,
                                &cache->mip[DT_MIPMAP_F]);
  // header of four uint32_t plus four floats per pixel at the maximum size.
  cache->mip[DT_MIPMAP_F].buffer_size = 4 * sizeof(uint32_t)
                                        + 4 * sizeof(float) * cache->mip[DT_MIPMAP_F].max_width
                                              * cache->mip[DT_MIPMAP_F].max_height;
  cache->mip[DT_MIPMAP_F].size = DT_MIPMAP_F;
  cache->mip[DT_MIPMAP_F].buf = NULL;

  dt_mipmap_cache_deserialize(cache);
}