Example #1
0
int
main(int argc, char *argv[])
{
	struct stat sb;
	FILE *fp;
	WT_CONNECTION *conn;
	WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_stable;
	WT_RAND_STATE rnd;
	WT_SESSION *session;
	pid_t pid;
	uint64_t absent_coll, absent_local, absent_oplog, count, key, last_key;
	uint64_t first_miss, middle_coll, middle_local, middle_oplog;
	uint64_t stable_fp, stable_val, val[MAX_TH+1];
	uint32_t i, nth, timeout;
	int ch, status, ret;
	const char *working_dir;
	char buf[512], fname[64], kname[64], statname[1024];
	bool fatal, rand_th, rand_time, verify_only;

	(void)testutil_set_progname(argv);

	compat = inmem = false;
	use_ts = true;
	nth = MIN_TH;
	rand_th = rand_time = true;
	timeout = MIN_TIME;
	verify_only = false;
	working_dir = "WT_TEST.timestamp-abort";

	while ((ch = __wt_getopt(progname, argc, argv, "Ch:mT:t:vz")) != EOF)
		switch (ch) {
		case 'C':
			compat = true;
			break;
		case 'h':
			working_dir = __wt_optarg;
			break;
		case 'm':
			inmem = true;
			break;
		case 'T':
			rand_th = false;
			nth = (uint32_t)atoi(__wt_optarg);
			break;
		case 't':
			rand_time = false;
			timeout = (uint32_t)atoi(__wt_optarg);
			break;
		case 'v':
			verify_only = true;
			break;
		case 'z':
			use_ts = false;
			break;
		default:
			usage();
		}
	argc -= __wt_optind;
	argv += __wt_optind;
	if (argc != 0)
		usage();

	testutil_work_dir_from_path(home, sizeof(home), working_dir);
	/*
	 * If the user wants to verify they need to tell us how many threads
	 * there were so we can find the old record files.
	 */
	if (verify_only && rand_th) {
		fprintf(stderr,
		    "Verify option requires specifying number of threads\n");
		exit (EXIT_FAILURE);
	}
	if (!verify_only) {
		testutil_make_work_dir(home);

		__wt_random_init_seed(NULL, &rnd);
		if (rand_time) {
			timeout = __wt_random(&rnd) % MAX_TIME;
			if (timeout < MIN_TIME)
				timeout = MIN_TIME;
		}
		if (rand_th) {
			nth = __wt_random(&rnd) % MAX_TH;
			if (nth < MIN_TH)
				nth = MIN_TH;
		}
		printf("Parent: compatibility: %s, "
		    "in-mem log sync: %s, timestamp in use: %s\n",
		    compat ? "true" : "false",
		    inmem ? "true" : "false",
		    use_ts ? "true" : "false");
		printf("Parent: Create %" PRIu32
		    " threads; sleep %" PRIu32 " seconds\n", nth, timeout);
		/*
		 * Fork a child to insert as many items.  We will then randomly
		 * kill the child, run recovery and make sure all items we wrote
		 * exist after recovery runs.
		 */
		testutil_checksys((pid = fork()) < 0);

		if (pid == 0) { /* child */
			run_workload(nth);
			return (EXIT_SUCCESS);
		}

		/* parent */
		/*
		 * Sleep for the configured amount of time before killing
		 * the child.  Start the timeout from the time we notice that
		 * the file has been created.  That allows the test to run
		 * correctly on really slow machines.  Verify the process ID
		 * still exists in case the child aborts for some reason we
		 * don't stay in this loop forever.
		 */
		testutil_check(__wt_snprintf(
		    statname, sizeof(statname), "%s/%s", home, ckpt_file));
		while (stat(statname, &sb) != 0 && kill(pid, 0) == 0)
			sleep(1);
		sleep(timeout);

		/*
		 * !!! It should be plenty long enough to make sure more than
		 * one log file exists.  If wanted, that check would be added
		 * here.
		 */
		printf("Kill child\n");
		testutil_checksys(kill(pid, SIGKILL) != 0);
		testutil_checksys(waitpid(pid, &status, 0) == -1);
	}
	/*
	 * !!! If we wanted to take a copy of the directory before recovery,
	 * this is the place to do it.
	 */
	if (chdir(home) != 0)
		testutil_die(errno, "parent chdir: %s", home);
	testutil_check(__wt_snprintf(buf, sizeof(buf),
	    "rm -rf ../%s.SAVE && mkdir ../%s.SAVE && "
	    "cp -p WiredTigerLog.* ../%s.SAVE",
	     home, home, home));
	(void)system(buf);
	printf("Open database, run recovery and verify content\n");

	/*
	 * Open the connection which forces recovery to be run.
	 */
	if ((ret = wiredtiger_open(NULL, NULL, ENV_CONFIG_REC, &conn)) != 0)
		testutil_die(ret, "wiredtiger_open");
	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
		testutil_die(ret, "WT_CONNECTION:open_session");
	/*
	 * Open a cursor on all the tables.
	 */
	if ((ret = session->open_cursor(session,
	    uri_collection, NULL, NULL, &cur_coll)) != 0)
		testutil_die(ret, "WT_SESSION.open_cursor: %s", uri_collection);
	if ((ret = session->open_cursor(session,
	    uri_local, NULL, NULL, &cur_local)) != 0)
		testutil_die(ret, "WT_SESSION.open_cursor: %s", uri_local);
	if ((ret = session->open_cursor(session,
	    uri_oplog, NULL, NULL, &cur_oplog)) != 0)
		testutil_die(ret, "WT_SESSION.open_cursor: %s", uri_oplog);
	if ((ret = session->open_cursor(session,
	    stable_store, NULL, NULL, &cur_stable)) != 0)
		testutil_die(ret, "WT_SESSION.open_cursor: %s", stable_store);

	/*
	 * Find the biggest stable timestamp value that was saved.
	 */
	stable_val = 0;
	memset(val, 0, sizeof(val));
	while (cur_stable->next(cur_stable) == 0) {
		cur_stable->get_key(cur_stable, &key);
		cur_stable->get_value(cur_stable, &val[key]);
		if (val[key] > stable_val)
			stable_val = val[key];

		if (use_ts)
			printf("Stable: key %" PRIu64 " value %" PRIu64 "\n",
			    key, val[key]);
	}
	if (use_ts)
		printf("Got stable_val %" PRIu64 "\n", stable_val);

	count = 0;
	absent_coll = absent_local = absent_oplog = 0;
	fatal = false;
	for (i = 1; i <= nth; ++i) {
		first_miss = middle_coll = middle_local = middle_oplog = 0;
		testutil_check(__wt_snprintf(
		    fname, sizeof(fname), RECORDS_FILE, i));
		if ((fp = fopen(fname, "r")) == NULL)
			testutil_die(errno, "fopen: %s", fname);

		/*
		 * For every key in the saved file, verify that the key exists
		 * in the table after recovery.  If we're doing in-memory
		 * log buffering we never expect a record missing in the middle,
		 * but records may be missing at the end.  If we did
		 * write-no-sync, we expect every key to have been recovered.
		 */
		for (last_key = UINT64_MAX;; ++count, last_key = key) {
			ret = fscanf(fp, "%" SCNu64 "%" SCNu64 "\n",
			    &stable_fp, &key);
			if (ret != EOF && ret != 2) {
				/*
				 * If we find a partial line, consider it
				 * like an EOF.
				 */
				if (ret == 1 || ret == 0)
					break;
				testutil_die(errno, "fscanf");
			}
			if (ret == EOF)
				break;
			/*
			 * If we're unlucky, the last line may be a partially
			 * written key at the end that can result in a false
			 * negative error for a missing record.  Detect it.
			 */
			if (last_key != UINT64_MAX && key != last_key + 1) {
				printf("%s: Ignore partial record %" PRIu64
				    " last valid key %" PRIu64 "\n",
				    fname, key, last_key);
				break;
			}
			testutil_check(__wt_snprintf(
			    kname, sizeof(kname), "%" PRIu64, key));
			cur_coll->set_key(cur_coll, kname);
			cur_local->set_key(cur_local, kname);
			cur_oplog->set_key(cur_oplog, kname);
			/*
			 * The collection table should always only have the
			 * data as of the checkpoint.
			 */
			if ((ret = cur_coll->search(cur_coll)) != 0) {
				if (ret != WT_NOTFOUND)
					testutil_die(ret, "search");
				/*
				 * If we don't find a record, the stable
				 * timestamp written to our file better be
				 * larger than the saved one.
				 */
				if (!inmem &&
				    stable_fp != 0 && stable_fp <= val[i]) {
					printf("%s: COLLECTION no record with "
					    "key %" PRIu64 " record ts %" PRIu64
					    " <= stable ts %" PRIu64 "\n",
					    fname, key, stable_fp, val[i]);
					absent_coll++;
				}
				if (middle_coll == 0)
					first_miss = key;
				middle_coll = key;
			} else if (middle_coll != 0) {
				/*
				 * We should never find an existing key after
				 * we have detected one missing.
				 */
				printf("%s: COLLECTION after absent records %"
				    PRIu64 "-%" PRIu64 " key %" PRIu64
				    " exists\n",
				    fname, first_miss, middle_coll, key);
				fatal = true;
			}
			/*
			 * The local table should always have all data.
			 */
			if ((ret = cur_local->search(cur_local)) != 0) {
				if (ret != WT_NOTFOUND)
					testutil_die(ret, "search");
				if (!inmem)
					printf("%s: LOCAL no record with key %"
					    PRIu64 "\n", fname, key);
				absent_local++;
				middle_local = key;
			} else if (middle_local != 0) {
				/*
				 * We should never find an existing key after
				 * we have detected one missing.
				 */
				printf("%s: LOCAL after absent record at %"
				    PRIu64 " key %" PRIu64 " exists\n",
				    fname, middle_local, key);
				fatal = true;
			}
			/*
			 * The oplog table should always have all data.
			 */
			if ((ret = cur_oplog->search(cur_oplog)) != 0) {
				if (ret != WT_NOTFOUND)
					testutil_die(ret, "search");
				if (!inmem)
					printf("%s: OPLOG no record with key %"
					    PRIu64 "\n", fname, key);
				absent_oplog++;
				middle_oplog = key;
			} else if (middle_oplog != 0) {
				/*
				 * We should never find an existing key after
				 * we have detected one missing.
				 */
				printf("%s: OPLOG after absent record at %"
				    PRIu64 " key %" PRIu64 " exists\n",
				    fname, middle_oplog, key);
				fatal = true;
			}
		}
		testutil_checksys(fclose(fp) != 0);
	}
	if ((ret = conn->close(conn, NULL)) != 0)
		testutil_die(ret, "WT_CONNECTION:close");
	if (fatal)
		return (EXIT_FAILURE);
	if (!inmem && absent_coll) {
		printf("COLLECTION: %" PRIu64
		    " record(s) absent from %" PRIu64 "\n",
		    absent_coll, count);
		fatal = true;
	}
	if (!inmem && absent_local) {
		printf("LOCAL: %" PRIu64 " record(s) absent from %" PRIu64 "\n",
		    absent_local, count);
		fatal = true;
	}
	if (!inmem && absent_oplog) {
		printf("OPLOG: %" PRIu64 " record(s) absent from %" PRIu64 "\n",
		    absent_oplog, count);
		fatal = true;
	}
	if (fatal)
		return (EXIT_FAILURE);
	printf("%" PRIu64 " records verified\n", count);
	return (EXIT_SUCCESS);
}
Example #2
0
int
main(int argc, char *argv[])
{
	struct sigaction sa;
	struct stat sb;
	FILE *fp;
	REPORT c_rep[MAX_TH], l_rep[MAX_TH], o_rep[MAX_TH];
	WT_CONNECTION *conn;
	WT_CURSOR *cur_coll, *cur_local, *cur_oplog;
	WT_RAND_STATE rnd;
	WT_SESSION *session;
	pid_t pid;
	uint64_t absent_coll, absent_local, absent_oplog, count, key, last_key;
	uint64_t stable_fp, stable_val;
	uint32_t i, nth, timeout;
	int ch, status, ret;
	const char *working_dir;
	char buf[512], fname[64], kname[64], statname[1024];
	char ts_string[WT_TS_HEX_STRING_SIZE];
	bool fatal, rand_th, rand_time, verify_only;

	(void)testutil_set_progname(argv);

	compat = inmem = false;
	use_ts = true;
	nth = MIN_TH;
	rand_th = rand_time = true;
	timeout = MIN_TIME;
	verify_only = false;
	working_dir = "WT_TEST.timestamp-abort";

	while ((ch = __wt_getopt(progname, argc, argv, "Ch:LmT:t:vz")) != EOF)
		switch (ch) {
		case 'C':
			compat = true;
			break;
		case 'h':
			working_dir = __wt_optarg;
			break;
		case 'L':
			table_pfx = "lsm";
			break;
		case 'm':
			inmem = true;
			break;
		case 'T':
			rand_th = false;
			nth = (uint32_t)atoi(__wt_optarg);
			if (nth > MAX_TH) {
				fprintf(stderr,
				    "Number of threads is larger than the"
				    " maximum %" PRId32 "\n", MAX_TH);
				return (EXIT_FAILURE);
			}
			break;
		case 't':
			rand_time = false;
			timeout = (uint32_t)atoi(__wt_optarg);
			break;
		case 'v':
			verify_only = true;
			break;
		case 'z':
			use_ts = false;
			break;
		default:
			usage();
		}
	argc -= __wt_optind;
	if (argc != 0)
		usage();

	testutil_work_dir_from_path(home, sizeof(home), working_dir);
	testutil_check(pthread_rwlock_init(&ts_lock, NULL));

	/*
	 * If the user wants to verify they need to tell us how many threads
	 * there were so we can find the old record files.
	 */
	if (verify_only && rand_th) {
		fprintf(stderr,
		    "Verify option requires specifying number of threads\n");
		exit (EXIT_FAILURE);
	}
	if (!verify_only) {
		testutil_make_work_dir(home);

		__wt_random_init_seed(NULL, &rnd);
		if (rand_time) {
			timeout = __wt_random(&rnd) % MAX_TIME;
			if (timeout < MIN_TIME)
				timeout = MIN_TIME;
		}
		if (rand_th) {
			nth = __wt_random(&rnd) % MAX_TH;
			if (nth < MIN_TH)
				nth = MIN_TH;
		}

		printf("Parent: compatibility: %s, "
		    "in-mem log sync: %s, timestamp in use: %s\n",
		    compat ? "true" : "false",
		    inmem ? "true" : "false",
		    use_ts ? "true" : "false");
		printf("Parent: Create %" PRIu32
		    " threads; sleep %" PRIu32 " seconds\n", nth, timeout);
		printf("CONFIG: %s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n",
		    progname,
		    compat ? " -C" : "",
		    inmem ? " -m" : "",
		    !use_ts ? " -z" : "",
		    working_dir, nth, timeout);
		/*
		 * Fork a child to insert as many items.  We will then randomly
		 * kill the child, run recovery and make sure all items we wrote
		 * exist after recovery runs.
		 */
		memset(&sa, 0, sizeof(sa));
		sa.sa_handler = handler;
		testutil_checksys(sigaction(SIGCHLD, &sa, NULL));
		testutil_checksys((pid = fork()) < 0);

		if (pid == 0) { /* child */
			run_workload(nth);
			return (EXIT_SUCCESS);
		}

		/* parent */
		/*
		 * Sleep for the configured amount of time before killing
		 * the child.  Start the timeout from the time we notice that
		 * the file has been created.  That allows the test to run
		 * correctly on really slow machines.
		 */
		testutil_check(__wt_snprintf(
		    statname, sizeof(statname), "%s/%s", home, ckpt_file));
		while (stat(statname, &sb) != 0)
			testutil_sleep_wait(1, pid);
		sleep(timeout);
		sa.sa_handler = SIG_DFL;
		testutil_checksys(sigaction(SIGCHLD, &sa, NULL));

		/*
		 * !!! It should be plenty long enough to make sure more than
		 * one log file exists.  If wanted, that check would be added
		 * here.
		 */
		printf("Kill child\n");
		testutil_checksys(kill(pid, SIGKILL) != 0);
		testutil_checksys(waitpid(pid, &status, 0) == -1);
	}
	/*
	 * !!! If we wanted to take a copy of the directory before recovery,
	 * this is the place to do it. Don't do it all the time because
	 * it can use a lot of disk space, which can cause test machine
	 * issues.
	 */
	if (chdir(home) != 0)
		testutil_die(errno, "parent chdir: %s", home);
	/*
	 * The tables can get very large, so while we'd ideally like to
	 * copy the entire database, we only copy the log files for now.
	 * Otherwise it can take far too long to run the test, particularly
	 * in automated testing.
	 */
	testutil_check(__wt_snprintf(buf, sizeof(buf),
	    "rm -rf ../%s.SAVE && mkdir ../%s.SAVE && "
	    "cp -p * ../%s.SAVE",
	     home, home, home));
	if ((status = system(buf)) < 0)
		testutil_die(status, "system: %s", buf);
	printf("Open database, run recovery and verify content\n");

	/*
	 * Open the connection which forces recovery to be run.
	 */
	testutil_check(wiredtiger_open(NULL, NULL, ENV_CONFIG_REC, &conn));
	testutil_check(conn->open_session(conn, NULL, NULL, &session));
	/*
	 * Open a cursor on all the tables.
	 */
	testutil_check(__wt_snprintf(
	    buf, sizeof(buf), "%s:%s", table_pfx, uri_collection));
	testutil_check(session->open_cursor(session,
	    buf, NULL, NULL, &cur_coll));
	testutil_check(__wt_snprintf(
	    buf, sizeof(buf), "%s:%s", table_pfx, uri_local));
	testutil_check(session->open_cursor(session,
	    buf, NULL, NULL, &cur_local));
	testutil_check(__wt_snprintf(
	    buf, sizeof(buf), "%s:%s", table_pfx, uri_oplog));
	testutil_check(session->open_cursor(session,
	    buf, NULL, NULL, &cur_oplog));

	/*
	 * Find the biggest stable timestamp value that was saved.
	 */
	stable_val = 0;
	if (use_ts) {
		testutil_check(
		    conn->query_timestamp(conn, ts_string, "get=recovery"));
		testutil_assert(
		    sscanf(ts_string, "%" SCNx64, &stable_val) == 1);
		printf("Got stable_val %" PRIu64 "\n", stable_val);
	}

	count = 0;
	absent_coll = absent_local = absent_oplog = 0;
	fatal = false;
	for (i = 0; i < nth; ++i) {
		initialize_rep(&c_rep[i]);
		initialize_rep(&l_rep[i]);
		initialize_rep(&o_rep[i]);
		testutil_check(__wt_snprintf(
		    fname, sizeof(fname), RECORDS_FILE, i));
		if ((fp = fopen(fname, "r")) == NULL)
			testutil_die(errno, "fopen: %s", fname);

		/*
		 * For every key in the saved file, verify that the key exists
		 * in the table after recovery.  If we're doing in-memory
		 * log buffering we never expect a record missing in the middle,
		 * but records may be missing at the end.  If we did
		 * write-no-sync, we expect every key to have been recovered.
		 */
		for (last_key = INVALID_KEY;; ++count, last_key = key) {
			ret = fscanf(fp, "%" SCNu64 "%" SCNu64 "\n",
			    &stable_fp, &key);
			if (last_key == INVALID_KEY) {
				c_rep[i].first_key = key;
				l_rep[i].first_key = key;
				o_rep[i].first_key = key;
			}
			if (ret != EOF && ret != 2) {
				/*
				 * If we find a partial line, consider it
				 * like an EOF.
				 */
				if (ret == 1 || ret == 0)
					break;
				testutil_die(errno, "fscanf");
			}
			if (ret == EOF)
				break;
			/*
			 * If we're unlucky, the last line may be a partially
			 * written key at the end that can result in a false
			 * negative error for a missing record.  Detect it.
			 */
			if (last_key != INVALID_KEY && key != last_key + 1) {
				printf("%s: Ignore partial record %" PRIu64
				    " last valid key %" PRIu64 "\n",
				    fname, key, last_key);
				break;
			}
			testutil_check(__wt_snprintf(
			    kname, sizeof(kname), "%" PRIu64, key));
			cur_coll->set_key(cur_coll, kname);
			cur_local->set_key(cur_local, kname);
			cur_oplog->set_key(cur_oplog, kname);
			/*
			 * The collection table should always only have the
			 * data as of the checkpoint.
			 */
			if ((ret = cur_coll->search(cur_coll)) != 0) {
				if (ret != WT_NOTFOUND)
					testutil_die(ret, "search");
				/*
				 * If we don't find a record, the stable
				 * timestamp written to our file better be
				 * larger than the saved one.
				 */
				if (!inmem &&
				    stable_fp != 0 && stable_fp <= stable_val) {
					printf("%s: COLLECTION no record with "
					    "key %" PRIu64 " record ts %" PRIu64
					    " <= stable ts %" PRIu64 "\n",
					    fname, key, stable_fp, stable_val);
					absent_coll++;
				}
				if (c_rep[i].first_miss == INVALID_KEY)
					c_rep[i].first_miss = key;
				c_rep[i].absent_key = key;
			} else if (c_rep[i].absent_key != INVALID_KEY &&
			    c_rep[i].exist_key == INVALID_KEY) {
				/*
				 * If we get here we found a record that exists
				 * after absent records, a hole in our data.
				 */
				c_rep[i].exist_key = key;
				fatal = true;
			} else if (!inmem &&
			    stable_fp != 0 && stable_fp > stable_val) {
				/*
				 * If we found a record, the stable timestamp
				 * written to our file better be no larger
				 * than the checkpoint one.
				 */
				printf("%s: COLLECTION record with "
				    "key %" PRIu64 " record ts %" PRIu64
				    " > stable ts %" PRIu64 "\n",
				    fname, key, stable_fp, stable_val);
				fatal = true;
			}
			/*
			 * The local table should always have all data.
			 */
			if ((ret = cur_local->search(cur_local)) != 0) {
				if (ret != WT_NOTFOUND)
					testutil_die(ret, "search");
				if (!inmem)
					printf("%s: LOCAL no record with key %"
					    PRIu64 "\n", fname, key);
				absent_local++;
				if (l_rep[i].first_miss == INVALID_KEY)
					l_rep[i].first_miss = key;
				l_rep[i].absent_key = key;
			} else if (l_rep[i].absent_key != INVALID_KEY &&
			    l_rep[i].exist_key == INVALID_KEY) {
				/*
				 * We should never find an existing key after
				 * we have detected one missing.
				 */
				l_rep[i].exist_key = key;
				fatal = true;
			}
			/*
			 * The oplog table should always have all data.
			 */
			if ((ret = cur_oplog->search(cur_oplog)) != 0) {
				if (ret != WT_NOTFOUND)
					testutil_die(ret, "search");
				if (!inmem)
					printf("%s: OPLOG no record with key %"
					    PRIu64 "\n", fname, key);
				absent_oplog++;
				if (o_rep[i].first_miss == INVALID_KEY)
					o_rep[i].first_miss = key;
				o_rep[i].absent_key = key;
			} else if (o_rep[i].absent_key != INVALID_KEY &&
			    o_rep[i].exist_key == INVALID_KEY) {
				/*
				 * We should never find an existing key after
				 * we have detected one missing.
				 */
				o_rep[i].exist_key = key;
				fatal = true;
			}
		}
		c_rep[i].last_key = last_key;
		l_rep[i].last_key = last_key;
		o_rep[i].last_key = last_key;
		testutil_checksys(fclose(fp) != 0);
		print_missing(&c_rep[i], fname, "COLLECTION");
		print_missing(&l_rep[i], fname, "LOCAL");
		print_missing(&o_rep[i], fname, "OPLOG");
	}
	testutil_check(conn->close(conn, NULL));
	if (!inmem && absent_coll) {
		printf("COLLECTION: %" PRIu64
		    " record(s) absent from %" PRIu64 "\n",
		    absent_coll, count);
		fatal = true;
	}
	if (!inmem && absent_local) {
		printf("LOCAL: %" PRIu64 " record(s) absent from %" PRIu64 "\n",
		    absent_local, count);
		fatal = true;
	}
	if (!inmem && absent_oplog) {
		printf("OPLOG: %" PRIu64 " record(s) absent from %" PRIu64 "\n",
		    absent_oplog, count);
		fatal = true;
	}
	testutil_check(pthread_rwlock_destroy(&ts_lock));
	if (fatal)
		return (EXIT_FAILURE);
	printf("%" PRIu64 " records verified\n", count);
	return (EXIT_SUCCESS);
}