/*
 * Take a backend (shared) reference on the file map lock.
 *
 * The lock counter is incremented by one; if the value seen before the
 * increment is below CFS_GC_LOCK, the reference is granted immediately.
 * Otherwise the counter carries the GC mark:
 *   - outside of recovery, the increment is undone and the attempt is retried
 *     after a sleep that doubles until it reaches CFS_LOCK_MAX_TIMEOUT;
 *   - in recovery, the GC mark is treated as left over from an interrupted GC
 *     and the backup files are used to repair the state before continuing.
 *
 * After the loop, the background GC is started if this process runs under
 * postmaster, GC workers are configured and the gc_started flag was not
 * already set.
 *
 * @param map       mapped file map whose lock counter is updated
 * @param file_path path of the data file; used to build the backup file names
 */
void
cfs_lock_file(FileMap* map, char const* file_path)
{
    long backoff = CFS_LOCK_MIN_TIMEOUT;

    for (;;)
    {
        uint64 holders = pg_atomic_fetch_add_u32(&map->lock, 1);

        if (holders < CFS_GC_LOCK)
        {
            /* No GC mark in the counter: reference acquired */
            break;
        }

        if (!InRecovery)
        {
            /* Drop our increment, wait, and try again with a longer delay */
            pg_atomic_fetch_sub_u32(&map->lock, 1);
            pg_usleep(backoff);
            if (backoff < CFS_LOCK_MAX_TIMEOUT)
            {
                backoff *= 2;
            }
            continue;
        }

        /*
         * In recovery with the GC mark set: looks like the last GC was
         * interrupted, so try to recover the file.
         */
        {
            char* map_bck_path = psprintf("%s.map.bck", file_path);
            char* file_bck_path = psprintf("%s.bck", file_path);

            if (access(file_bck_path, R_OK) == 0)
            {
                /*
                 * The data backup is still present, which means the original
                 * data and map files are unchanged: just drop both backups.
                 */
                unlink(file_bck_path);
                unlink(map_bck_path);
            }
            else
            {
                /* No data backup: restore the map from its backup, if any */
                int bck_md = open(map_bck_path, O_RDWR|PG_BINARY, 0);

                if (bck_md >= 0)
                {
                    if (!cfs_read_file(bck_md, map, sizeof(FileMap)))
                    {
                        elog(LOG, "Failed to read file %s: %m", map_bck_path);
                    }
                    close(bck_md);
                }
            }
            pfree(file_bck_path);
            pfree(map_bck_path);
        }
        break;
    }

    if (IsUnderPostmaster && cfs_gc_workers != 0 && pg_atomic_test_set_flag(&cfs_state->gc_started))
    {
        cfs_start_background_gc();
    }
}
static void test_atomic_uint32(void) { pg_atomic_uint32 var; uint32 expected; int i; pg_atomic_init_u32(&var, 0); if (pg_atomic_read_u32(&var) != 0) elog(ERROR, "atomic_read_u32() #1 wrong"); pg_atomic_write_u32(&var, 3); if (pg_atomic_read_u32(&var) != 3) elog(ERROR, "atomic_read_u32() #2 wrong"); if (pg_atomic_fetch_add_u32(&var, 1) != 3) elog(ERROR, "atomic_fetch_add_u32() #1 wrong"); if (pg_atomic_fetch_sub_u32(&var, 1) != 4) elog(ERROR, "atomic_fetch_sub_u32() #1 wrong"); if (pg_atomic_sub_fetch_u32(&var, 3) != 0) elog(ERROR, "atomic_sub_fetch_u32() #1 wrong"); if (pg_atomic_add_fetch_u32(&var, 10) != 10) elog(ERROR, "atomic_add_fetch_u32() #1 wrong"); if (pg_atomic_exchange_u32(&var, 5) != 10) elog(ERROR, "pg_atomic_exchange_u32() #1 wrong"); if (pg_atomic_exchange_u32(&var, 0) != 5) elog(ERROR, "pg_atomic_exchange_u32() #0 wrong"); /* test around numerical limits */ if (pg_atomic_fetch_add_u32(&var, INT_MAX) != 0) elog(ERROR, "pg_atomic_fetch_add_u32() #2 wrong"); if (pg_atomic_fetch_add_u32(&var, INT_MAX) != INT_MAX) elog(ERROR, "pg_atomic_add_fetch_u32() #3 wrong"); pg_atomic_fetch_add_u32(&var, 1); /* top up to UINT_MAX */ if (pg_atomic_read_u32(&var) != UINT_MAX) elog(ERROR, "atomic_read_u32() #2 wrong"); if (pg_atomic_fetch_sub_u32(&var, INT_MAX) != UINT_MAX) elog(ERROR, "pg_atomic_fetch_sub_u32() #2 wrong"); if (pg_atomic_read_u32(&var) != (uint32) INT_MAX + 1) elog(ERROR, "atomic_read_u32() #3 wrong: %u", pg_atomic_read_u32(&var)); expected = pg_atomic_sub_fetch_u32(&var, INT_MAX); if (expected != 1) elog(ERROR, "pg_atomic_sub_fetch_u32() #3 wrong: %u", expected); pg_atomic_sub_fetch_u32(&var, 1); /* fail exchange because of old expected */ expected = 10; if (pg_atomic_compare_exchange_u32(&var, &expected, 1)) elog(ERROR, "atomic_compare_exchange_u32() changed value spuriously"); /* CAS is allowed to fail due to interrupts, try a couple of times */ for (i = 0; i < 1000; i++) { expected = 0; if (!pg_atomic_compare_exchange_u32(&var, &expected, 1)) 
break; } if (i == 1000) elog(ERROR, "atomic_compare_exchange_u32() never succeeded"); if (pg_atomic_read_u32(&var) != 1) elog(ERROR, "atomic_compare_exchange_u32() didn't set value properly"); pg_atomic_write_u32(&var, 0); /* try setting flagbits */ if (pg_atomic_fetch_or_u32(&var, 1) & 1) elog(ERROR, "pg_atomic_fetch_or_u32() #1 wrong"); if (!(pg_atomic_fetch_or_u32(&var, 2) & 1)) elog(ERROR, "pg_atomic_fetch_or_u32() #2 wrong"); if (pg_atomic_read_u32(&var) != 3) elog(ERROR, "invalid result after pg_atomic_fetch_or_u32()"); /* try clearing flagbits */ if ((pg_atomic_fetch_and_u32(&var, ~2) & 3) != 3) elog(ERROR, "pg_atomic_fetch_and_u32() #1 wrong"); if (pg_atomic_fetch_and_u32(&var, ~1) != 1) elog(ERROR, "pg_atomic_fetch_and_u32() #2 wrong: is %u", pg_atomic_read_u32(&var)); /* no bits set anymore */ if (pg_atomic_fetch_and_u32(&var, ~0) != 0) elog(ERROR, "pg_atomic_fetch_and_u32() #3 wrong"); }
/* * Perform garbage collection (if required) of file * @param map_path path to file map file (*.map). */ static bool cfs_gc_file(char* map_path) { int md = open(map_path, O_RDWR|PG_BINARY, 0); FileMap* map; uint32 physSize; uint32 usedSize; uint32 virtSize; int suf = strlen(map_path)-4; int fd = -1, fd2 = -1, md2 = -1; bool succeed = true; if (md < 0) { elog(LOG, "Failed to open map file %s: %m", map_path); return false; } map = cfs_mmap(md); if (map == MAP_FAILED) { elog(LOG, "Failed to map file %s: %m", map_path); close(md); return false; } usedSize = pg_atomic_read_u32(&map->usedSize); physSize = pg_atomic_read_u32(&map->physSize); virtSize = pg_atomic_read_u32(&map->virtSize); if ((physSize - usedSize)*100 > physSize*cfs_gc_threshold) /* do we need to perform defragmentation? */ { long delay = CFS_LOCK_MIN_TIMEOUT; char* file_path = (char*)palloc(suf+1); char* map_bck_path = (char*)palloc(suf+10); char* file_bck_path = (char*)palloc(suf+5); FileMap* newMap = (FileMap*)palloc0(sizeof(FileMap)); uint32 newSize = 0; inode_t** inodes = (inode_t**)palloc(RELSEG_SIZE*sizeof(inode_t*)); bool remove_backups = true; int n_pages = virtSize / BLCKSZ; TimestampTz startTime, endTime; long secs; int usecs; int i; startTime = GetCurrentTimestamp(); memcpy(file_path, map_path, suf); file_path[suf] = '\0'; strcat(strcpy(map_bck_path, map_path), ".bck"); strcat(strcpy(file_bck_path, file_path), ".bck"); while (true) { uint32 access_count = 0; if (pg_atomic_compare_exchange_u32(&map->lock, &access_count, CFS_GC_LOCK)) { break; } if (access_count >= CFS_GC_LOCK) { /* Uhhh... looks like last GC was interrupted. 
* Try to recover file */ if (access(file_bck_path, R_OK) != 0) { /* There is no backup file: new map should be constructed */ md2 = open(map_bck_path, O_RDWR|PG_BINARY, 0); if (md2 >= 0) { /* Recover map */ if (!cfs_read_file(md2, newMap, sizeof(FileMap))) { elog(LOG, "Failed to read file %s: %m", map_bck_path); goto Cleanup; } close(md2); md2 = -1; newSize = pg_atomic_read_u32(&newMap->usedSize); remove_backups = false; goto ReplaceMap; } } else { /* Presence of backup file means that we still have unchanged data and map files. * Just remove backup files, grab lock and continue processing */ unlink(file_bck_path); unlink(map_bck_path); break; } } pg_usleep(delay); if (delay < CFS_LOCK_MAX_TIMEOUT) { delay *= 2; } } md2 = open(map_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600); if (md2 < 0) { goto Cleanup; } for (i = 0; i < n_pages; i++) { newMap->inodes[i] = map->inodes[i]; inodes[i] = &newMap->inodes[i]; } /* sort inodes by offset to improve read locality */ qsort(inodes, n_pages, sizeof(inode_t*), cfs_cmp_page_offs); fd = open(file_path, O_RDWR|PG_BINARY, 0); if (fd < 0) { goto Cleanup; } fd2 = open(file_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600); if (fd2 < 0) { goto Cleanup; } for (i = 0; i < n_pages; i++) { int size = CFS_INODE_SIZE(*inodes[i]); if (size != 0) { char block[BLCKSZ]; off_t rc PG_USED_FOR_ASSERTS_ONLY; uint32 offs = CFS_INODE_OFFS(*inodes[i]); Assert(size <= BLCKSZ); rc = lseek(fd, offs, SEEK_SET); Assert(rc == offs); if (!cfs_read_file(fd, block, size)) { elog(LOG, "Failed to read file %s: %m", file_path); goto Cleanup; } if (!cfs_write_file(fd2, block, size)) { elog(LOG, "Failed to write file %s: %m", file_bck_path); goto Cleanup; } offs = newSize; newSize += size; *inodes[i] = CFS_INODE(size, offs); } } pg_atomic_write_u32(&map->usedSize, newSize); if (close(fd) < 0) { elog(LOG, "Failed to close file %s: %m", file_path); goto Cleanup; } fd = -1; /* Persist copy of data file */ if (pg_fsync(fd2) < 0) { elog(LOG, "Failed to sync file 
%s: %m", file_bck_path); goto Cleanup; } if (close(fd2) < 0) { elog(LOG, "Failed to close file %s: %m", file_bck_path); goto Cleanup; } fd2 = -1; /* Persist copy of map file */ if (!cfs_write_file(md2, &newMap, sizeof(newMap))) { elog(LOG, "Failed to write file %s: %m", map_bck_path); goto Cleanup; } if (pg_fsync(md2) < 0) { elog(LOG, "Failed to sync file %s: %m", map_bck_path); goto Cleanup; } if (close(md2) < 0) { elog(LOG, "Failed to close file %s: %m", map_bck_path); goto Cleanup; } md2 = -1; /* Persist map with CFS_GC_LOCK set: in case of crash we will know that map may be changed by GC */ if (cfs_msync(map) < 0) { elog(LOG, "Failed to sync map %s: %m", map_path); goto Cleanup; } if (pg_fsync(md) < 0) { elog(LOG, "Failed to sync file %s: %m", map_path); goto Cleanup; } /* * Now all information necessary for recovery is stored. * We are ready to replace existed file with defragmented one. * Use rename and rely on file system to provide atomicity of this operation. */ remove_backups = false; if (rename(file_bck_path, file_path) < 0) { elog(LOG, "Failed to rename file %s: %m", file_path); goto Cleanup; } ReplaceMap: /* At this moment defragmented file version is stored. We can perfrom in-place update of map. 
* If crash happens at this point, map can be recovered from backup file */ memcpy(map->inodes, newMap->inodes, n_pages * sizeof(inode_t)); pg_atomic_write_u32(&map->usedSize, newSize); pg_atomic_write_u32(&map->physSize, newSize); map->generation += 1; /* force all backends to reopen the file */ /* Before removing backup files and releasing locks we need to flush updated map file */ if (cfs_msync(map) < 0) { elog(LOG, "Failed to sync map %s: %m", map_path); goto Cleanup; } if (pg_fsync(md) < 0) { elog(LOG, "Failed to sync file %s: %m", map_path); Cleanup: if (fd >= 0) close(fd); if (fd2 >= 0) close(fd2); if (md2 >= 0) close(md2); if (remove_backups) { unlink(file_bck_path); unlink(map_bck_path); remove_backups = false; } succeed = false; } else { remove_backups = true; /* now backups are not need any more */ } pg_atomic_fetch_sub_u32(&map->lock, CFS_GC_LOCK); /* release lock */ /* remove map backup file */ if (remove_backups && unlink(map_bck_path)) { elog(LOG, "Failed to unlink file %s: %m", map_bck_path); succeed = false; } endTime = GetCurrentTimestamp(); TimestampDifference(startTime, endTime, &secs, &usecs); elog(LOG, "%d: defragment file %s: old size %d, new size %d, logical size %d, used %d, compression ratio %f, time %ld usec", MyProcPid, file_path, physSize, newSize, virtSize, usedSize, (double)virtSize/newSize, secs*USECS_PER_SEC + usecs); pfree(file_path); pfree(file_bck_path); pfree(map_bck_path); pfree(inodes); pfree(newMap); if (cfs_gc_delay != 0) { int rc = WaitLatch(MyLatch, WL_TIMEOUT | WL_POSTMASTER_DEATH, cfs_gc_delay /* ms */ ); if (rc & WL_POSTMASTER_DEATH) { exit(1); } } } else if (cfs_state->max_iterations == 1) { elog(LOG, "%d: file %.*s: physical size %d, logical size %d, used %d, compression ratio %f", MyProcPid, suf, map_path, physSize, virtSize, usedSize, (double)virtSize/physSize); } if (cfs_munmap(map) < 0) { elog(LOG, "Failed to unmap file %s: %m", map_path); succeed = false; } if (close(md) < 0) { elog(LOG, "Failed to close file %s: 
%m", map_path); succeed = false; } return succeed; }
/*
 * Release a backend reference on the file map lock by decrementing the
 * lock counter by one (the counterpart of the increment done in
 * cfs_lock_file).
 *
 * @param map mapped file map whose lock counter is decremented
 */
void
cfs_unlock_file(FileMap* map)
{
    /* The previous counter value is not needed here */
    (void) pg_atomic_fetch_sub_u32(&map->lock, 1);
}