Ejemplo n.º 1
0
/*
 * Take a shared (non-GC) reference on the lock word of a file map.
 *
 * The lock word is incremented by one. If the previous value was below
 * CFS_GC_LOCK the reference is kept and the function returns. Otherwise the
 * garbage collector owns (or owned) the file:
 *   - outside of recovery the increment is undone and the attempt is repeated
 *     after a sleep whose length doubles while it is below
 *     CFS_LOCK_MAX_TIMEOUT;
 *   - during recovery the GC is assumed to have been interrupted, its backup
 *     files are used to repair (or are discarded), and the reference taken
 *     above is kept.
 *
 * Finally, when running under postmaster with GC workers configured, the
 * background GC is started by whichever caller gets a true result from
 * pg_atomic_test_set_flag() on cfs_state->gc_started.
 *
 * @param map        memory image of the file map whose lock word is used
 * @param file_path  path of the data file; "<file_path>.bck" and
 *                   "<file_path>.map.bck" are the GC backup files
 *
 * NOTE(review): on the recovery path the lock word is not reset here, so it
 * may still contain the CFS_GC_LOCK contribution (or whatever value the
 * restored map image carries) -- confirm that this is handled elsewhere.
 */
void cfs_lock_file(FileMap* map, char const* file_path)
{
	long backoff = CFS_LOCK_MIN_TIMEOUT;

	for (;;)
	{
		char* map_backup;
		char* data_backup;
		uint64 prev = pg_atomic_fetch_add_u32(&map->lock, 1);

		if (prev < CFS_GC_LOCK)
		{
			/* GC does not hold the file: our reference is valid */
			break;
		}

		if (!InRecovery)
		{
			/* GC is in progress: drop our reference, back off and retry */
			pg_atomic_fetch_sub_u32(&map->lock, 1);
			pg_usleep(backoff);
			if (backoff < CFS_LOCK_MAX_TIMEOUT)
			{
				backoff *= 2;
			}
			continue;
		}

		/*
		 * We are in recovery and the GC lock is set, so the last GC run
		 * looks interrupted. Try to bring the file back to a usable state.
		 */
		map_backup = psprintf("%s.map.bck", file_path);
		data_backup = psprintf("%s.bck", file_path);

		if (access(data_backup, R_OK) == 0)
		{
			/*
			 * The data backup still exists, so the original data and map
			 * files are unchanged: just discard both backups.
			 */
			unlink(data_backup);
			unlink(map_backup);
		}
		else
		{
			/* No data backup: reload the map image from its backup, if any */
			int backup_fd = open(map_backup, O_RDWR|PG_BINARY, 0);

			if (backup_fd >= 0)
			{
				if (!cfs_read_file(backup_fd, map, sizeof(FileMap)))
				{
					elog(LOG, "Failed to read file %s: %m", map_backup);
				}
				close(backup_fd);
			}
		}

		pfree(data_backup);
		pfree(map_backup);
		break;
	}

	/* Lazily launch background GC (only meaningful under postmaster) */
	if (!IsUnderPostmaster || cfs_gc_workers == 0)
	{
		return;
	}
	if (pg_atomic_test_set_flag(&cfs_state->gc_started))
	{
		cfs_start_background_gc();
	}
}
Ejemplo n.º 2
0
/*
 * Self-test for the 32-bit atomic primitives (pg_atomic_*_u32).
 *
 * Every primitive is applied to a local atomic variable and its return value
 * and/or the resulting stored value is compared with the arithmetically
 * expected one. Any mismatch is reported with elog(ERROR).
 */
static void
test_atomic_uint32(void)
{
	pg_atomic_uint32 var;
	uint32		expected;
	int			i;

	pg_atomic_init_u32(&var, 0);

	if (pg_atomic_read_u32(&var) != 0)
		elog(ERROR, "atomic_read_u32() #1 wrong");

	pg_atomic_write_u32(&var, 3);

	if (pg_atomic_read_u32(&var) != 3)
		elog(ERROR, "atomic_read_u32() #2 wrong");

	/* fetch_* variants return the value before the operation ... */
	if (pg_atomic_fetch_add_u32(&var, 1) != 3)
		elog(ERROR, "atomic_fetch_add_u32() #1 wrong");

	if (pg_atomic_fetch_sub_u32(&var, 1) != 4)
		elog(ERROR, "atomic_fetch_sub_u32() #1 wrong");

	/* ... while *_fetch variants return the value after it */
	if (pg_atomic_sub_fetch_u32(&var, 3) != 0)
		elog(ERROR, "atomic_sub_fetch_u32() #1 wrong");

	if (pg_atomic_add_fetch_u32(&var, 10) != 10)
		elog(ERROR, "atomic_add_fetch_u32() #1 wrong");

	if (pg_atomic_exchange_u32(&var, 5) != 10)
		elog(ERROR, "pg_atomic_exchange_u32() #1 wrong");

	if (pg_atomic_exchange_u32(&var, 0) != 5)
		elog(ERROR, "pg_atomic_exchange_u32() #0 wrong");

	/* test around numerical limits */
	if (pg_atomic_fetch_add_u32(&var, INT_MAX) != 0)
		elog(ERROR, "pg_atomic_fetch_add_u32() #2 wrong");

	if (pg_atomic_fetch_add_u32(&var, INT_MAX) != INT_MAX)
		elog(ERROR, "pg_atomic_fetch_add_u32() #3 wrong");

	pg_atomic_fetch_add_u32(&var, 1);	/* top up to UINT_MAX */

	if (pg_atomic_read_u32(&var) != UINT_MAX)
		elog(ERROR, "atomic_read_u32() #2 wrong");

	if (pg_atomic_fetch_sub_u32(&var, INT_MAX) != UINT_MAX)
		elog(ERROR, "pg_atomic_fetch_sub_u32() #2 wrong");

	if (pg_atomic_read_u32(&var) != (uint32) INT_MAX + 1)
		elog(ERROR, "atomic_read_u32() #3 wrong: %u", pg_atomic_read_u32(&var));

	expected = pg_atomic_sub_fetch_u32(&var, INT_MAX);
	if (expected != 1)
		elog(ERROR, "pg_atomic_sub_fetch_u32() #3 wrong: %u", expected);

	/* bring the variable back to 0 for the compare-exchange tests */
	pg_atomic_sub_fetch_u32(&var, 1);

	/* fail exchange because of old expected */
	expected = 10;
	if (pg_atomic_compare_exchange_u32(&var, &expected, 1))
		elog(ERROR, "atomic_compare_exchange_u32() changed value spuriously");

	/*
	 * CAS is allowed to fail due to interrupts, try a couple of times.
	 * The loop must stop on the first *successful* exchange; reaching the
	 * iteration limit then means that no attempt ever succeeded. "expected"
	 * is reset before every attempt so each attempt compares against 0.
	 */
	for (i = 0; i < 1000; i++)
	{
		expected = 0;
		if (pg_atomic_compare_exchange_u32(&var, &expected, 1))
			break;
	}
	if (i == 1000)
		elog(ERROR, "atomic_compare_exchange_u32() never succeeded");
	if (pg_atomic_read_u32(&var) != 1)
		elog(ERROR, "atomic_compare_exchange_u32() didn't set value properly");

	pg_atomic_write_u32(&var, 0);

	/* try setting flagbits */
	if (pg_atomic_fetch_or_u32(&var, 1) & 1)
		elog(ERROR, "pg_atomic_fetch_or_u32() #1 wrong");

	if (!(pg_atomic_fetch_or_u32(&var, 2) & 1))
		elog(ERROR, "pg_atomic_fetch_or_u32() #2 wrong");

	if (pg_atomic_read_u32(&var) != 3)
		elog(ERROR, "invalid result after pg_atomic_fetch_or_u32()");

	/* try clearing flagbits */
	if ((pg_atomic_fetch_and_u32(&var, ~2) & 3) != 3)
		elog(ERROR, "pg_atomic_fetch_and_u32() #1 wrong");

	if (pg_atomic_fetch_and_u32(&var, ~1) != 1)
		elog(ERROR, "pg_atomic_fetch_and_u32() #2 wrong: is %u",
			 pg_atomic_read_u32(&var));
	/* no bits set anymore */
	if (pg_atomic_fetch_and_u32(&var, ~0) != 0)
		elog(ERROR, "pg_atomic_fetch_and_u32() #3 wrong");
}
Ejemplo n.º 3
0
/*
 * Perform garbage collection (if required) of file
 * @param map_path path to file map file (*.map). 
 */
static bool cfs_gc_file(char* map_path)
{
	int md = open(map_path, O_RDWR|PG_BINARY, 0);
	FileMap* map;
	uint32 physSize;
	uint32 usedSize;
	uint32 virtSize;
	int suf = strlen(map_path)-4;
	int fd = -1, fd2 = -1, md2 = -1;
	bool succeed = true;

	if (md < 0) { 
		elog(LOG, "Failed to open map file %s: %m", map_path);
		return false;
	}
	map = cfs_mmap(md);
	if (map == MAP_FAILED) {
		elog(LOG, "Failed to map file %s: %m", map_path);
		close(md);
		return false;
	}
	usedSize = pg_atomic_read_u32(&map->usedSize);
	physSize = pg_atomic_read_u32(&map->physSize);
	virtSize = pg_atomic_read_u32(&map->virtSize);
		
	if ((physSize - usedSize)*100 > physSize*cfs_gc_threshold) /* do we need to perform defragmentation? */
	{ 
		long delay = CFS_LOCK_MIN_TIMEOUT;		
		char* file_path = (char*)palloc(suf+1);
		char* map_bck_path = (char*)palloc(suf+10);
		char* file_bck_path = (char*)palloc(suf+5);
		FileMap* newMap = (FileMap*)palloc0(sizeof(FileMap));
		uint32 newSize = 0;
		inode_t** inodes = (inode_t**)palloc(RELSEG_SIZE*sizeof(inode_t*));
		bool remove_backups = true;
		int n_pages = virtSize / BLCKSZ;
		TimestampTz startTime, endTime;
		long secs;
		int usecs;
		int i;
		
		startTime = GetCurrentTimestamp();

		memcpy(file_path, map_path, suf);
		file_path[suf] = '\0';
		strcat(strcpy(map_bck_path, map_path), ".bck");
		strcat(strcpy(file_bck_path, file_path), ".bck");

		while (true) { 
			uint32 access_count = 0;
			if (pg_atomic_compare_exchange_u32(&map->lock, &access_count, CFS_GC_LOCK)) {				
				break;
			}
			if (access_count >= CFS_GC_LOCK) { 
				/* Uhhh... looks like last GC was interrupted.
				 * Try to recover file
				 */
				if (access(file_bck_path, R_OK) != 0) {
					/* There is no backup file: new map should be constructed */					
					md2 = open(map_bck_path, O_RDWR|PG_BINARY, 0);
					if (md2 >= 0) { 
						/* Recover map */
						if (!cfs_read_file(md2, newMap, sizeof(FileMap))) { 
							elog(LOG, "Failed to read file %s: %m", map_bck_path);
							goto Cleanup;
						}
						close(md2);
						md2 = -1;
						newSize = pg_atomic_read_u32(&newMap->usedSize);
						remove_backups = false;
						goto ReplaceMap;
					}
				} else { 
					/* Presence of backup file means that we still have unchanged data and map files.
					 * Just remove backup files, grab lock and continue processing
					 */
					unlink(file_bck_path);
					unlink(map_bck_path);
					break;
				}
			}
			pg_usleep(delay);
			if (delay < CFS_LOCK_MAX_TIMEOUT) { 
				delay *= 2;
			}
		}				 			
		md2 = open(map_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600);
		if (md2 < 0) { 
			goto Cleanup;
		}
		for (i = 0; i < n_pages; i++) { 
			newMap->inodes[i] = map->inodes[i];
		    inodes[i] = &newMap->inodes[i];
		}
		/* sort inodes by offset to improve read locality */
		qsort(inodes, n_pages, sizeof(inode_t*), cfs_cmp_page_offs);
		
		fd = open(file_path, O_RDWR|PG_BINARY, 0);
		if (fd < 0) { 
			goto Cleanup;
		}
		
		fd2 = open(file_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600);
		if (fd2 < 0) { 
			goto Cleanup;
		}
		
		for (i = 0; i < n_pages; i++) { 
			int size = CFS_INODE_SIZE(*inodes[i]);
			if (size != 0) { 
				char block[BLCKSZ];
				off_t rc PG_USED_FOR_ASSERTS_ONLY;
				uint32 offs = CFS_INODE_OFFS(*inodes[i]);
				Assert(size <= BLCKSZ);	
				rc = lseek(fd, offs, SEEK_SET);
				Assert(rc == offs);
				
				if (!cfs_read_file(fd, block, size)) { 
					elog(LOG, "Failed to read file %s: %m", file_path);
					goto Cleanup;
				}
				
				if (!cfs_write_file(fd2, block, size)) { 
					elog(LOG, "Failed to write file %s: %m", file_bck_path);
					goto Cleanup;
				}
				offs = newSize;
				newSize += size;
				*inodes[i] = CFS_INODE(size, offs);
			}
		}
		pg_atomic_write_u32(&map->usedSize, newSize);

		if (close(fd) < 0) { 
			elog(LOG, "Failed to close file %s: %m", file_path);
			goto Cleanup;
		}
		fd = -1;

		/* Persist copy of data file */
		if (pg_fsync(fd2) < 0) { 
			elog(LOG, "Failed to sync file %s: %m", file_bck_path);
			goto Cleanup;
		}
		if (close(fd2) < 0) { 
			elog(LOG, "Failed to close file %s: %m", file_bck_path);
			goto Cleanup;
		}
		fd2 = -1;

		/* Persist copy of map file */
		if (!cfs_write_file(md2, &newMap, sizeof(newMap))) { 
			elog(LOG, "Failed to write file %s: %m", map_bck_path);
			goto Cleanup;
		}
		if (pg_fsync(md2) < 0) { 
			elog(LOG, "Failed to sync file %s: %m", map_bck_path);
			goto Cleanup;
		}
		if (close(md2) < 0) { 
			elog(LOG, "Failed to close file %s: %m", map_bck_path);
			goto Cleanup;
		}
		md2 = -1;

		/* Persist map with CFS_GC_LOCK set: in case of crash we will know that map may be changed by GC */
		if (cfs_msync(map) < 0) {
			elog(LOG, "Failed to sync map %s: %m", map_path);
			goto Cleanup;
		}
		if (pg_fsync(md) < 0) { 
			elog(LOG, "Failed to sync file %s: %m", map_path);
			goto Cleanup;
		}
		
		/* 
		 * Now all information necessary for recovery is stored.
		 * We are ready to replace existed file with defragmented one.
		 * Use rename and rely on file system to provide atomicity of this operation.
		 */
		remove_backups = false;
		if (rename(file_bck_path, file_path) < 0) { 
			elog(LOG, "Failed to rename file %s: %m", file_path);
			goto Cleanup;
		}
	  ReplaceMap:
		/* At this moment defragmented file version is stored. We can perfrom in-place update of map.
		 * If crash happens at this point, map can be recovered from backup file */
		memcpy(map->inodes, newMap->inodes, n_pages * sizeof(inode_t));
		pg_atomic_write_u32(&map->usedSize, newSize);
		pg_atomic_write_u32(&map->physSize, newSize);
		map->generation += 1; /* force all backends to reopen the file */
		
		/* Before removing backup files and releasing locks we need to flush updated map file */
		if (cfs_msync(map) < 0) {
			elog(LOG, "Failed to sync map %s: %m", map_path);
			goto Cleanup;
		}
		if (pg_fsync(md) < 0) { 
			elog(LOG, "Failed to sync file %s: %m", map_path);
		  Cleanup:
			if (fd >= 0) close(fd);
			if (fd2 >= 0) close(fd2);
			if (md2 >= 0) close(md2);
			if (remove_backups) { 
				unlink(file_bck_path);
				unlink(map_bck_path);		
				remove_backups = false;
			}	
			succeed = false;
		} else { 
			remove_backups = true; /* now backups are not need any more */
		}
		pg_atomic_fetch_sub_u32(&map->lock, CFS_GC_LOCK); /* release lock */

		/* remove map backup file */
		if (remove_backups && unlink(map_bck_path)) {
			elog(LOG, "Failed to unlink file %s: %m", map_bck_path);
			succeed = false;
		}
		
		endTime = GetCurrentTimestamp();
		TimestampDifference(startTime, endTime, &secs, &usecs);

		elog(LOG, "%d: defragment file %s: old size %d, new size %d, logical size %d, used %d, compression ratio %f, time %ld usec",
			 MyProcPid, file_path, physSize, newSize, virtSize, usedSize, (double)virtSize/newSize,
			 secs*USECS_PER_SEC + usecs);

		pfree(file_path);
		pfree(file_bck_path);
		pfree(map_bck_path);
		pfree(inodes);
		pfree(newMap);
		
		if (cfs_gc_delay != 0) { 
			int rc = WaitLatch(MyLatch,
							   WL_TIMEOUT | WL_POSTMASTER_DEATH,
							   cfs_gc_delay /* ms */ );
			if (rc & WL_POSTMASTER_DEATH) {
				exit(1);
			}
		}
	} else if (cfs_state->max_iterations == 1) { 
		elog(LOG, "%d: file %.*s: physical size %d, logical size %d, used %d, compression ratio %f",
			 MyProcPid, suf, map_path, physSize, virtSize, usedSize, (double)virtSize/physSize);
	}
	
	if (cfs_munmap(map) < 0) { 
		elog(LOG, "Failed to unmap file %s: %m", map_path);
		succeed = false;
	}
	if (close(md) < 0) { 
		elog(LOG, "Failed to close file %s: %m", map_path);
		succeed = false;
	}
	return succeed;
}
Ejemplo n.º 4
0
/*
 * Release the reference on the file map lock word taken by cfs_lock_file().
 *
 * Atomically decrements map->lock by one, undoing the increment performed by
 * cfs_lock_file(). The garbage collector can take its exclusive lock only
 * when the lock word is zero, so the file stays protected from GC until every
 * holder has called this function.
 *
 * @param map file map whose lock word is decremented
 */
void cfs_unlock_file(FileMap* map)
{
	pg_atomic_fetch_sub_u32(&map->lock, 1);
}