Code Example #1
File: camera.c Project: matrix207/webcamera
/* see: http://stackoverflow.com/a/3756466/965672 */
static off_t sync_and_drop_write_cache(int fd, off_t pos) {
#ifdef SYNC_FILE_RANGE_WRITE
	off_t tmp = lseek(fd, 0, SEEK_CUR) & (~(SYNC_BUFFER_SIZE - 1));
	if (tmp == pos)
		return pos;

	// sync pos, async
	if (sync_file_range(fd,
				pos,
				SYNC_BUFFER_SIZE,
				SYNC_FILE_RANGE_WRITE) < 0)
		perror("sync_file_range");
	if (pos == 0)
		return tmp;

	pos -= SYNC_BUFFER_SIZE;
	// wait for previous sync to finish
	if (sync_file_range(fd,
				pos,
				SYNC_BUFFER_SIZE,
				SYNC_FILE_RANGE_WAIT_BEFORE
				| SYNC_FILE_RANGE_WRITE
				| SYNC_FILE_RANGE_WAIT_AFTER) < 0)
		perror("sync_file_range");
	// drop cache pages
	if (posix_fadvise(fd,
				pos,
				SYNC_BUFFER_SIZE,
				POSIX_FADV_DONTNEED) < 0)
		perror("posix_fadvise");
	return tmp;
#else
	return 0;
#endif
}
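
A minimal usage sketch for the helper above: a write loop that calls sync_and_drop_write_cache() after each chunk, so dirty pages are flushed and evicted in SYNC_BUFFER_SIZE windows instead of piling up in the page cache. The loop is illustrative only (it assumes it sits in the same translation unit as the helper, with SYNC_BUFFER_SIZE a power of two); it is not part of the webcamera project.

/* Illustrative caller, same translation unit as sync_and_drop_write_cache(). */
static int stream_chunk(int fd, const void *buf, size_t len, off_t *sync_pos)
{
	size_t done = 0;

	while (done < len) {
		ssize_t n = write(fd, (const char *)buf + done, len - done);
		if (n <= 0)
			return -1;
		done += (size_t)n;
	}
	/* Start writeback of the newest aligned window; wait for and drop the previous one. */
	*sync_pos = sync_and_drop_write_cache(fd, *sync_pos);
	return 0;
}
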
Code Example #2
File: archives.c Project: Minipig/dpkg
static void
tar_writeback_barrier(struct fileinlist *files, struct pkginfo *pkg)
{
  struct fileinlist *cfile;

  for (cfile = files; cfile; cfile = cfile->next) {
    struct filenamenode *usenode;
    const char *usename;
    int fd;

    if (!(cfile->namenode->flags & fnnf_deferred_fsync))
      continue;

    usenode = namenodetouse(cfile->namenode, pkg);
    usename = usenode->name + 1; /* Skip the leading '/'. */

    setupfnamevbs(usename);

    fd = open(fnamenewvb.buf, O_WRONLY);
    if (fd < 0)
      ohshite(_("unable to open '%.255s'"), fnamenewvb.buf);
    sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE);
    if (close(fd))
      ohshite(_("error closing/writing `%.255s'"), fnamenewvb.buf);
  }
}
Code Example #3
File: file_utils.c Project: dreamsxin/postgresql-1
static int
pre_sync_fname(const char *fname, bool isdir, const char *progname)
{
	int			fd;

	fd = open(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		if (errno == EACCES || (isdir && errno == EISDIR))
			return 0;
		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
				progname, fname, strerror(errno));
		return -1;
	}

	/*
	 * We do what pg_flush_data() would do in the backend: prefer to use
	 * sync_file_range, but fall back to posix_fadvise.  We ignore errors
	 * because this is only a hint.
	 */
#if defined(HAVE_SYNC_FILE_RANGE)
	(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	(void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

	(void) close(fd);
	return 0;
}
Code Example #4
  virtual int setup() override {

    init_paths();

    // Create test directory A.
    string dir_path = mnt_dir_ + "/" TEST_DIR_A;
    int res = mkdir(dir_path.c_str(), 0777);
    if (res < 0) {
      return -1;
    }

    // Create file foo in TEST_DIR_A.
    const int fd_foo = open(foo_path.c_str(), O_RDWR | O_CREAT, TEST_FILE_PERMS);
    if (fd_foo < 0) {
      return -1;
    }
    const int fd_foo_backup = open(foo_backup_path.c_str(), O_RDWR | O_CREAT, TEST_FILE_PERMS);
    if (fd_foo_backup < 0) {
      return -1;
    }

    // Write some contents to the file
    if (WriteData(fd_foo, 0, 4096) < 0) {
    	return -2;
    }
    // Write some contents to the backup file (for verifying md5sum in check_test)
    if (WriteData(fd_foo_backup, 0, 4096) < 0) {
    	return -2;
    }

    // Sync the file and backup file
    if (fsync(fd_foo) < 0) {
    	return -1;
    }
    if (fsync(fd_foo_backup) < 0) {
    	return -1;
    }

    // write more contents in a different offset
    if (WriteData(fd_foo, 4096, 4096) < 0) {
    	return -2;
    }
    if (WriteData(fd_foo_backup, 4096, 4096) < 0) {
    	return -2;
    }

    // sync range the foo file
    if (sync_file_range(fd_foo, 4096, 4096, 0) < 0) {
    	return -3;
    }
    // fsync the entire backup file
    if (fsync(fd_foo_backup) < 0) {
    	return -1;
    }

    close(fd_foo);
    close(fd_foo_backup);

    return 0;
  }
Code Example #5
File: virtio-9p-local.c Project: 01org/KVMGT-qemu
static ssize_t local_pwritev(FsContext *ctx, V9fsFidOpenState *fs,
                             const struct iovec *iov,
                             int iovcnt, off_t offset)
{
    ssize_t ret;
#ifdef CONFIG_PREADV
    ret = pwritev(fs->fd, iov, iovcnt, offset);
#else
    int err = lseek(fs->fd, offset, SEEK_SET);
    if (err == -1) {
        return err;
    } else {
        ret = writev(fs->fd, iov, iovcnt);
    }
#endif
#ifdef CONFIG_SYNC_FILE_RANGE
    if (ret > 0 && ctx->export_flags & V9FS_IMMEDIATE_WRITEOUT) {
        /*
         * Initiate a writeback. This is not a data integrity sync.
         * We want to ensure that we don't leave dirty pages in the cache
         * after write when writeout=immediate is specified.
         */
        sync_file_range(fs->fd, offset, ret,
                        SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
    }
#endif
    return ret;
}
Code Example #6
File: file.cpp Project: pdh11/chorale
void File::WriteOut(unsigned int pageno)
{
    if (m_fd != -1)
    {
	msync(Page(pageno), PAGE, MS_ASYNC);
	sync_file_range(m_fd, pageno*4096ull, 4096, SYNC_FILE_RANGE_WRITE);
    }
}
Code Example #7
File: compat-fcntl.c Project: 5kg/lttng-tools
int compat_sync_file_range(int fd, off64_t offset, off64_t nbytes,
		unsigned int flags)
{
#ifdef HAVE_SYNC_FILE_RANGE
	return sync_file_range(fd, offset, nbytes, flags);
#else
	return fdatasync(fd);
#endif
}
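
A possible caller of the wrapper above, flushing bytes it has just written and waiting for them to reach disk; where sync_file_range() is unavailable the same call transparently degrades to fdatasync(). The helper below is an illustration only, not lttng-tools code, and assumes the compat-fcntl.h declarations are in scope.

/* Illustrative helper (not lttng-tools code): append a buffer and push it to disk. */
static int append_and_flush(int fd, const void *buf, size_t len, off64_t offset)
{
	ssize_t written = write(fd, buf, len);

	if (written < 0)
		return -1;
	return compat_sync_file_range(fd, offset, (off64_t)written,
			SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE
			| SYNC_FILE_RANGE_WAIT_AFTER);
}
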
Code Example #8
File: archives.c Project: Minipig/dpkg
static inline void
fd_writeback_init(int fd)
{
#if defined(SYNC_FILE_RANGE_WRITE)
  sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(HAVE_POSIX_FADVISE)
  posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#endif
}
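
Examples #2 and #8 are two halves of the same dpkg strategy: fd_writeback_init() starts asynchronous writeback as soon as a file has been extracted, and tar_writeback_barrier() later reopens each deferred file and waits for that writeback with SYNC_FILE_RANGE_WAIT_BEFORE before the final fsync pass. A condensed sketch of the pattern on a single descriptor (illustrative only, not dpkg code):

/* Condensed sketch (not dpkg code): two-phase writeback on one descriptor. */
static void two_phase_writeback(int fd)
{
	/* Phase 1: right after writing the data, start writeback without blocking. */
	sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);

	/* ... extract more files while the kernel writes this one back ... */

	/* Phase 2: barrier -- wait for the pages submitted in phase 1 to hit the device. */
	sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE);
}
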
Code Example #9
int write_junk(const char *fname, int flags, int sync_options, uint64_t file_size)
{
	int fd, len;
	uint64_t offset, generation = 0;
	char *buf;

	len = posix_memalign((void **)&buf, bufsize, bufsize);
	if (len) {
		errno = len;
		perror("alloc");
		return 66;
	}

	fd = open(fname, flags | O_WRONLY);
	if (fd < 0) {
		perror(fname);
		return 64;
	}

	while (1) {
		len = snprintf(buf, bufsize - 1, "%d - %"PRIu64, getpid(), generation++);
		if (flags & O_DIRECT) {
			len = bufsize;
			offset = get_randnum_align(0, file_size - len, bufsize);
		} else {
			offset = get_randnum(0, file_size - len);
		}

		if (pwrite(fd, buf, len, offset) < 0) {
			perror("pwrite");
			close(fd);
			free(buf);
			return 65;
		}
		if ((sync_options & SYNC_RANGE) &&
		    sync_file_range(fd, offset, len,
				    SYNC_FILE_RANGE_WAIT_BEFORE |
				    SYNC_FILE_RANGE_WRITE |
				    SYNC_FILE_RANGE_WAIT_AFTER) < 0) {
			perror("sync_file_range");
			close(fd);
			free(buf);
			return 67;
		}
		if ((sync_options & SYNC_FILE) && fsync(fd)) {
			perror("fsync");
			close(fd);
			free(buf);
			return 68;
		}
	}

	return 0;
}
Code Example #10
File: pfile.c Project: Zhoutall/stasis
static int pfile_async_force(stasis_handle_t *h) {
  TICK(force_range_hist);
  pfile_impl * impl = h->impl;
#ifdef HAVE_SYNC_FILE_RANGE
  // An nbytes argument of zero syncs through to the end of the file.
  DEBUG("pfile_async_force calling sync_file_range\n"); fflush(stdout);
  int ret = sync_file_range(impl->fd, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE);
  ret |= sync_file_range(impl->fd, 0, 0, SYNC_FILE_RANGE_WRITE);
  if(ret) {
    int error = errno;
    assert(ret == -1);
    // With the possible exceptions of ENOMEM and ENOSPC, all of the sync
    // errors are unrecoverable.
    h->error = EBADF;
    ret = error;
  }
#else
#ifdef HAVE_FDATASYNC
  DEBUG("pfile_force_range() is calling fdatasync()\n");
  fdatasync(impl->fd);
#else
  DEBUG("pfile_force_range() is calling fsync()\n");
  fsync(impl->fd);
#endif
  int ret = 0;
#endif
#ifdef HAVE_POSIX_FADVISE
  if(impl->sequential) {
    int err = posix_fadvise(impl->fd, 0, 0, POSIX_FADV_DONTNEED);
    if(err) perror("Attempt to pass POSIX_FADV_SEQUENTIAL (for a range of a file) to kernel failed");
  }
#endif
  TOCK(force_range_hist);
  return ret;
}
Code Example #11
File: os_fs.c Project: GYGit/mongo
/*
 * __posix_file_sync_nowait --
 *	POSIX fsync.
 */
static int
__posix_file_sync_nowait(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
{
	WT_DECL_RET;
	WT_FILE_HANDLE_POSIX *pfh;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;

	WT_SYSCALL_RETRY(sync_file_range(pfh->fd,
	    (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE), ret);
	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret,
	    "%s: handle-sync-nowait: sync_file_range", file_handle->name);
}
Code Example #12
File: pageFile.c Project: baskard/stasis
static void pfForceRangePageFile(stasis_page_handle_t * h, lsn_t start, lsn_t stop) {
  if(pageFile_isDurable) {
#ifdef HAVE_SYNC_FILE_RANGE
  int ret = sync_file_range(stable, start, stop,
			      SYNC_FILE_RANGE_WAIT_BEFORE |
			      SYNC_FILE_RANGE_WRITE |
			      SYNC_FILE_RANGE_WAIT_AFTER);
  assert(!ret);
#else
#ifdef HAVE_FDATASYNC
  fdatasync(stable);
#else
  fsync(stable);
#endif
#endif
  }
}
Code Example #13
File: os_fsync.c Project: EaseTech/wiredtiger
/*
 * __wt_fsync_async --
 *	Flush a file handle and don't wait for the result.
 */
int
__wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh)
{
#ifdef	HAVE_SYNC_FILE_RANGE
	WT_DECL_RET;

	WT_RET(__wt_verbose(
	    session, WT_VERB_FILEOPS, "%s: sync_file_range", fh->name));

	if ((ret = sync_file_range(fh->fd,
	    (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) == 0)
		return (0);
	WT_RET_MSG(session, ret, "%s: sync_file_range", fh->name);
#else
	WT_UNUSED(session);
	WT_UNUSED(fh);
	return (0);
#endif
}
Code Example #14
int
main(void)
{
	const int fd = -1;
	const off64_t offset = 0xdeadbeefbadc0ded;
	const off64_t nbytes = 0xfacefeedcafef00d;
	const unsigned int flags = -1;

	int rc = sync_file_range(fd, offset, nbytes, flags);
	printf("%s(%d, SYNC_FILE_RANGE_WAIT_BEFORE"
	       "|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER"
	       "|0xfffffff8, %lld, %lld) = %d %s (%m)\n",
	       "sync_file_range2", fd,
	       (long long) offset,
	       (long long) nbytes,
	       rc, errno2name());

	puts("+++ exited with 0 +++");
	return 0;
}
Code Example #15
static void close_was_called(int fd)
{
	struct fd_status *fds;

	if (pagecache_max_bytes == 0)
		return;

	fds = get_fd_status(fd);
	if (fds->bytes_written > 0) {
		if (time(NULL) - fds->seconds > wait_secs) {
			/* >= /proc/sys/vm/dirty_writeback_centisecs */
			sync_file_range(fd, 0, LONG_MAX,
				SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER);
			pagecache_size_write -= fds->bytes_written;
			fds->bytes_written = 0;
#ifdef COUNT_READS
		}
	}
	posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#else
			posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
		}
	}
#endif
}
Code Example #16
File: ldbsim.c Project: mdcallag/mytools
void* compact_func(void* arg) {
  operation_stats* stats = (operation_stats*) arg;
  char** data;
  struct timeval start;
  struct drand48_data ctx;
  int* input_x;
  DFILE* new_dfiles;
  char** old_fnames;
  int* old_fds;
  int i;
  long long read_size = data_block_size * compact_read_blocks;
  char* zbuf = (char*) malloc(data_block_size*3);
  int zbuf_len = data_block_size*3;
  double usecs_per_loop = 1000000.0 * secs_per_loop_per_thr;
  struct timeval rate_start;

  now(&rate_start);

  old_fds = (int*) malloc(sizeof(int) * (fanout+1));
  old_fnames = (char**) malloc(sizeof(char*) * (fanout+1));

  data = (char**) malloc(sizeof(char*) * (fanout+2));
  for (i=0; i < (fanout+2); ++i)
    assert(!posix_memalign((void**) &data[i], data_block_size, read_size));

  input_x = (int*) malloc(sizeof(int) * (fanout+1));
  new_dfiles = (DFILE*) malloc(sizeof(DFILE) * (fanout+1));

  init_rand_ctx(&ctx);

  while (!shutdown) {
    longlong read_offset = 0;
    longlong output_offset = 0;
    int output_x = 0;
    longlong bytes_read = 0;
    longlong bytes_written = 0;
    /* When a compaction uses "cached reads" all of the compaction reads
       use the same input file to simulate most of the reads hitting
       in the OS filesystem cache.
    */
    int uncached_reads = rand_choose(&ctx, 100) < compact_read_miss_pct;

    for (i=0; i < (fanout+1); ++i) {
      int x = lock_random_file(&ctx);

      input_x[i] = x;
      set_compact_options(dfiles[x].dfile_fd);
    }

    now(&start);

    i = get_recycled_files(new_dfiles, fanout+1);

    for (; i < (fanout+1); ++i)
      open_file(1, &new_dfiles[i], NULL);

    while (read_offset < data_file_size) {

      for (i=0; i < (fanout+1); ++i) {
	int b;
        int read_fd_idx = uncached_reads ? input_x[i] : 0;

        check_pread(dfiles[read_fd_idx].dfile_fd, data[i],
                    read_size, read_offset, "compact",
                    dfiles[read_fd_idx].dfile_fname);
        if (uncached_reads)
	  bytes_read += read_size;

	for (b=0; b < compact_read_blocks; ++b) {
	  page_check_checksum(data[i] + (b * data_block_size));
	  if (compress_level)
	    decompress_page(zbuf, zbuf_len);
	}

      }
      read_offset += read_size;

      for (i=0; i < (fanout+1); ++i) {
	int b;

	for (b=0; b < compact_read_blocks; ++b) {
	  page_write_checksum(data[i] + (b * data_block_size), i+b);
	  if (compress_level)
	    compress_page(compressed_page, compressed_page_len, zbuf, zbuf_len);
	}

        check_write(new_dfiles[output_x].dfile_fd, data[i], read_size,
		    "compaction write");
	bytes_written += read_size;

        output_offset += read_size;
        if (output_offset >= data_file_size) {
          new_dfiles[output_x].dfile_len = output_offset;
          if (use_sync_file_range) {
            sync_file_range(new_dfiles[output_x].dfile_fd, 0, 0,
                            SYNC_FILE_RANGE_WRITE);
          } else {
  	    sync_after_writes(new_dfiles[output_x].dfile_fd);
          }
          ++output_x;
          output_offset = 0;
        }
      }
    }

    if (use_sync_file_range)
      for (i=0; i < (fanout+1); ++i)
        sync_after_writes(new_dfiles[i].dfile_fd);

    stats_report(stats, &start, &ctx, bytes_read, bytes_written);

    assert(output_x == (fanout+1));

    pthread_mutex_lock(&dfiles_mutex);
    for (i=0; i < (fanout+1); ++i) {
      DFILE* old_f = &dfiles[input_x[i]];
      DFILE* new_f = &new_dfiles[i];
      
      old_fnames[i] = old_f->dfile_fname;
      old_fds[i] = old_f->dfile_fd;

      *old_f = *new_f;
    }
    pthread_mutex_unlock(&dfiles_mutex);

    handle_old_files(fanout+1, old_fds, old_fnames);

    if (write_bytes_per_second) {
      struct timeval rate_cur;
      long usecs_elapsed;

      now(&rate_cur);
      usecs_elapsed = now_minus_then_usecs(&rate_cur, &rate_start);

      if (usecs_elapsed > 0 && usecs_elapsed < usecs_per_loop) {
        usleep(usecs_per_loop - usecs_elapsed);
        now(&rate_start);
      } else {
        rate_start = rate_cur;
      }
    }
  }

  for (i=0; i < (fanout+2); ++i)
    free(data[i]);
  free(data);
  free(input_x);
  free(new_dfiles);
  free(old_fnames);
  free(old_fds);
  free(zbuf);
  return NULL;
}
Code Example #17
File: fsync-tester.c Project: vdmfernandes/xfstests
/*
 * Randomly write inside of a file, either creating a sparse file or prealloc
 * the file and randomly write within it, depending on the prealloc flag
 */
static int test_three(int *max_blocks, int prealloc, int rand_fsync,
		      int do_sync, int drop_caches)
{
	int size = (random() % 2048) + 4;
	int blocks = size / 2;
	int sync_block = blocks / 2;
	int rand_sync_interval = (random() % blocks) + 1;
	int character = (random() % 126) + 33;

	if (prealloc && fallocate(test_fd, 0, 0, size * 4096)) {
		fprintf(stderr, "Error fallocating %d (%s)\n", errno,
			strerror(errno));
		return 1;
	}

	if (prealloc)
		*max_blocks = size;

	memset(buf, character, 4096);
	while (blocks--) {
		int block = (random() % size);

		if ((block + 1) > *max_blocks)
			*max_blocks = block + 1;

		if (rand_fsync && !(blocks % rand_sync_interval)) {
			if (fsync(test_fd)) {
				fprintf(stderr, "Fsync failed, test results "
					"will be invalid: %d\n", errno);
				return 1;
			}
		}

		/* Force a transaction commit in between just for fun */
		if (blocks == sync_block && (do_sync || drop_caches)) {
			if (do_sync)
				sync();
			else
				sync_file_range(test_fd, 0, 0,
						SYNC_FILE_RANGE_WRITE|
						SYNC_FILE_RANGE_WAIT_AFTER);

			if (drop_caches) {
				close(test_fd);
				drop_all_caches();
				test_fd = open(fname, O_RDWR);
				if (test_fd < 0) {
					test_fd = 0;
					fprintf(stderr, "Error re-opening file: %d\n",
						errno);
					return 1;
				}
			}
		}

		if (pwrite(test_fd, buf, 4096, block * 4096) < 4096) {
			fprintf(stderr, "Short write %d\n", errno);
			return 1;
		}
	}

	return 0;
}
Code Example #18
#define _GNU_SOURCE	/* for the sync_file_range() declaration */
#include <fcntl.h>

int main(void)
{
	return sync_file_range(0, 0, 1024, 0);
}
Code Example #19
File: block_write.c Project: ckoolkarni/wiredtiger
/*
 * __wt_block_write_off --
 *	Write a buffer into a block, returning the block's addr/size and
 * checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	off_t offset;
	uint32_t align_size;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary, this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual write.
	 */
	align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = align_size;

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression support
	 * turn off checksums and assume corrupted blocks won't decompress
	 * correctly.  However, if compression failed to shrink the block, the
	 * block wasn't compressed, in which case our caller will tell us to
	 * checksum the data to detect corruption.   If compression succeeded,
	 * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
	 * because they're not compressed, both to give salvage a quick test
	 * of whether a block is useful and to give us a test so we don't lose
	 * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
	 */
	blk->flags = 0;
	if (data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->cksum = 0;
	blk->cksum = __wt_cksum(
	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

	if (!locked)
		__wt_spin_lock(session, &block->live_lock);
	ret = __wt_block_alloc(session, block, &offset, (off_t)align_size);
	if (!locked)
		__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

#if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE)
	/*
	 * Extend the file in chunks.  We aren't holding a lock and we'd prefer
	 * to limit the number of threads extending the file at the same time,
	 * so choose the one thread that's crossing the extended boundary.  We
	 * don't extend newly created files, and it's theoretically possible we
	 * might wait so long our extension of the file is passed by another
	 * thread writing single blocks, that's why there's a check in case the
	 * extended file size becomes too small: if the file size catches up,
	 * every thread will try to extend it.
	 */
	if (fh->extend_len != 0 &&
	    (fh->extend_size <= fh->size ||
	    (offset + fh->extend_len <= fh->extend_size &&
	    offset + fh->extend_len + align_size >= fh->extend_size))) {
		fh->extend_size = offset + fh->extend_len * 2;
#if defined(HAVE_POSIX_FALLOCATE)
		if ((ret =
		    posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fallocate", fh->name);
#elif defined(HAVE_FTRUNCATE)
		if ((ret = ftruncate(fh->fd, fh->extend_size)) != 0)
			WT_RET_MSG(session, ret, "%s: ftruncate", fh->name);
#endif
	}
#endif
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		if (!locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(
		    __wt_block_off_free(session, block, offset, align_size));
		if (!locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) {
		block->os_cache_dirty = 0;
		if ((ret = sync_file_range(fh->fd,
		    (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: sync_file_range", block->name);
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd,
		    (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	WT_CSTAT_INCR(session, block_write);
	WT_CSTAT_INCRV(session, block_byte_write, align_size);

	WT_VERBOSE_RET(session, write,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, align_size, blk->cksum);

	*offsetp = offset;
	*sizep = align_size;
	*cksump = blk->cksum;

	return (ret);
}
Code Example #20
int main(int argc, char *argv[]) {
	int fd;
	int sem;
	int nrpages = 1;
	int ret = 0;
	int tmp = 0;
	int offset = 0;
	char c;
	char *filename;
	char *actype;
	char *onerror;
	char *p;
	pid_t pid;
	int wait_status;
	uint64_t pflag;
	struct sembuf sembuf;
	struct pagestat pgstat;

	if (argc != 5) {
		printf("Usage: %s filename nrpages accesstype onerror\n", argv[0]);
		exit(EXIT_FAILURE);
	}
	filename = argv[1];
	nrpages = strtol(argv[2], NULL, 10);
	actype = argv[3];
	onerror = argv[4];
	DEB("filename = %s, nrpages = %d, actype = %s, onerror = %s\n",
	       filename, nrpages, actype, onerror);

	if (strcmp(onerror, "onerror") == 0)
		offset = 0;
	else
		offset = PS;

	sem = create_and_init_semaphore();

	fd = open_check(filename, O_RDWR, 0);
	tmp = pread(fd, rbuf, nrpages*PS, 0);
	DEB("parent first read %d [%c,%c]\n", tmp, rbuf[0], rbuf[PS]);

	get_semaphore(sem, &sembuf);
	if ((pid = fork()) == 0) {
		get_semaphore(sem, &sembuf); /* wait parent to dirty page */
		p = mmap_check((void *)REFADDR, nrpages * PS,
			       PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
		if (p != (void *)REFADDR)
			err("mmap");
		if (nrpages == 1) {
			DEB("child read (after dirty) [%c]\n", p[0]);
#ifdef DEBUG
			get_pagestat(p, &pgstat);
#endif
		} else {
			DEB("child read (after dirty) [%c,%c]\n", p[0], p[PS]);
#ifdef DEBUG
			get_pagestat(p, &pgstat);
			get_pagestat(p+PS, &pgstat);
#endif
		}
		DEB("child hwpoison to vaddr %p\n", p);
		madvise(&p[0], PS, 100); /* hwpoison */
		put_semaphore(sem, &sembuf);
		get_semaphore(sem, &sembuf);
		DEB("child terminated\n");
		put_semaphore(sem, &sembuf);
		get_pflags(pgstat.pfn, &pflag, 1);
		exit(EXIT_SUCCESS);
	} else {
		DEB("parent dirty\n");
		usleep(1000);
		memset(wbuf, 49, nrpages * PS);
		pwrite(fd, wbuf, nrpages * PS, 0);
		tmp = pread(fd, rbuf, nrpages * PS, 0);
		DEB("parent second read (after dirty) %d [%c,%c]\n",
		       tmp, rbuf[0], rbuf[PS]);

		put_semaphore(sem, &sembuf); /* kick child to inject error */
		get_semaphore(sem, &sembuf); /* pagecache should be hwpoison */
		DEB("parent check\n");
		if (strcmp(actype, "read") == 0) {
			tmp = pread(fd, rbuf, PS, offset);
			if (tmp < 0)
				DEB("parent first read failed.\n");
			tmp = pread(fd, rbuf, PS, offset);
			DEB("parent read after hwpoison %d [%c,%c]\n",
			       tmp, rbuf[0], rbuf[PS]);
			if (tmp < 0) {
				ret = -1;
				perror("read");
			} else {
				ret = 0;
			}
		} else if (strcmp(actype, "writefull") == 0) {
			memset(wbuf, 50, nrpages * PS);
			tmp = pwrite(fd, wbuf, PS, offset);
			tmp = pwrite(fd, wbuf, PS, offset);
			DEB("parent write after hwpoison %d\n", tmp);
			if (tmp < 0) {
				ret = -1;
				perror("writefull");
			} else {
				ret = 0;
			}
		} else if (strcmp(actype, "writepart") == 0) {
			memset(wbuf, 50, nrpages * PS);
			tmp = pwrite(fd, wbuf, PS / 2, offset);
			tmp = pwrite(fd, wbuf, PS / 2, offset);
			DEB("parent write after hwpoison %d\n", tmp);
			if (tmp < 0) {
				ret = -1;
				perror("writefull");
			} else {
				ret = 0;
			}
		} else if (strcmp(actype, "fsync") == 0) {
			ret = fsync(fd);
			ret = fsync(fd);
			DEB("parent fsync after hwpoison [ret %d]\n", ret);
			if (ret)
				perror("fsync");
		} else if (strcmp(actype, "sync_range_write") == 0) {
			ret = sync_file_range(fd, offset, PS, SYNC_FILE_RANGE_WRITE);
			ret = sync_file_range(fd, offset, PS, SYNC_FILE_RANGE_WRITE);
			if (ret)
				perror("sync_range_write");
		} else if (strcmp(actype, "sync_range_wait") == 0) {
			ret = sync_file_range(fd, offset, PS, SYNC_FILE_RANGE_WAIT_BEFORE);
			ret = sync_file_range(fd, offset, PS, SYNC_FILE_RANGE_WAIT_BEFORE);
			if (ret)
				perror("sync_range_wait");
		} else if (strcmp(actype, "mmapread") == 0) {
			/*
			 * If mmap access failed, this program should be
			 * terminated by segmentation fault with non-zero
			 * returned value. So we don't set ret here.
			 */
			p = mmap_check((void *)REFADDR, nrpages * PS,
				       PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
			if (p != (void *)REFADDR)
				err("mmap");
			c = p[offset];
			DEB("parent mmap() read after hwpoison [%c]\n", p[offset]);
		} else if (strcmp(actype, "mmapwrite") == 0) {
			p = mmap_check((void *)REFADDR, nrpages * PS,
				       PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
			if (p != (void *)REFADDR)
				err("mmap");
			memset(&p[offset], 50, PS);
			DEB("parent mmap() write after hwpoison [%c]\n", p[offset]);
		}
	}
	put_semaphore(sem, &sembuf);

	waitpid(pid, &wait_status, 0);
	if (!WIFEXITED(wait_status))
		err("waitpid");

	delete_semaphore(sem);
	DEB("parent exit %d.\n", ret);
	return ret;
}