// Stress test for concurrent push/try_pop: several producer jobs each push
// the ascending sequence 0 .. num_pushes-1 while several consumer jobs drain
// the queue at the same time. Afterwards the queue must be empty and both
// the number of popped items and their sum must match what was pushed.
TEST(ConcurrentQueue, ParallelPushPopAscIntegerAndCalculateTotalSum) {
    tlx::ThreadPool pool(8);

    ConcurrentQueue<size_t, std::allocator<size_t> > queue;
    std::atomic<size_t> count(0);
    std::atomic<size_t> total_sum(0);

    static constexpr size_t num_threads = 4;
    static constexpr size_t num_pushes = 10000;
    // total number of items all producers together will push
    static constexpr size_t expected_items = num_threads * num_pushes;

    // producers: every job pushes 0, 1, ..., num_pushes - 1

    size_t producers_started = 0;
    while (producers_started != num_threads) {
        pool.enqueue([&queue]() {
            size_t value = 0;
            while (value != num_pushes) {
                queue.push(value);
                ++value;
            }
        });
        ++producers_started;
    }

    // consumers: keep draining until all pushed items have been counted

    size_t consumers_started = 0;
    while (consumers_started != num_threads) {
        pool.enqueue([&]() {
            for ( ; count.load() != expected_items; ) {
                size_t popped;
                // empty the queue as far as possible, then re-check the
                // global counter in the outer loop
                while (queue.try_pop(popped)) {
                    total_sum.fetch_add(popped);
                    count.fetch_add(1);
                }
            }
        });
        ++consumers_started;
    }

    pool.loop_until_empty();

    ASSERT_TRUE(queue.empty());
    ASSERT_EQ(count, num_threads * num_pushes);
    // every producer contributes 0 + 1 + ... + (num_pushes - 1); a mismatch
    // here means an item was lost or duplicated
    ASSERT_EQ(total_sum, num_threads * num_pushes * (num_pushes - 1) / 2);
}
int main(int argc, char *argv[]) {
	// read config.ini file
	Config config;
	auto nthreads  = config.get<int>("nthreads");
	auto device    = config.get<const char *>("disk_guest");
	auto traceFile = config.get<const char *>("trace_file");
	auto logDir    = config.get<string>("log_dir");
	
	// parse arg, prioritize argv over config
	if (argc > 1) traceFile = argv[1]; 
	if (argc > 2) nthreads = atoi(argv[2]);
	if (strstr(device, "/dev/sda")) { // avoid accidentally writing to system part 
		fprintf(stderr, "Error trying to write to system partition %s\n", device);
		return 1;
	}
	// use default value if not supplied
	if (strcmp(device, "") == 0) device = DEFAULT_DEVICE;
	if (strcmp(traceFile, "") == 0) traceFile = DEFAULT_TRACE_FILE;
	if (strcmp(logDir.c_str(), "") == 0) logDir = DEFAULT_LOG_DIR;
	if (nthreads == 0) nthreads = DEFAULT_NTHREADS;
	
	srand(time(NULL)); 	// initialize seed

	// print configuration
	printf("trace     : %s\n", traceFile);
	printf("nthreads  : %d\n", nthreads);
	printf("device    : %s\n", device);
	printf("log       : %s\n", logDir.c_str());
	printf("precision : %fms\n", Timer::getResolution());

	printf("Opening device %s\n", device);
	int fd = open(device, O_DIRECT | O_RDWR | O_SYNC); 
	if (fd < 0) {
		fprintf(stderr, "Error opening device '%s'\n", device);
		return 1;
	}

	printf("Allocating buffer\n");
	void *buf; 
	if (posix_memalign(&buf, MEM_ALIGN, LARGEST_REQUEST_SIZE * BYTE_PER_BLOCK)) {
		fprintf(stderr, "Error allocating buffer\n");
		return 1;
	}
	//memset(buf, rand() % 256, LARGEST_REQUEST_SIZE * BYTE_PER_BLOCK);

	printf("Opening trace file\n");
	TraceReader trace(traceFile); // open trace file
	ConcurrentQueue<TraceEvent> queue; // queue of trace events
	bool readDone = false; // whether or not we're done reading trace file
	
	printf("Start reading trace\n");
	thread fileThread([&] { // thread to read trace file 
		TraceEvent event;
		while (trace.read(event)) {
			event.time = event.time * 1000; // to microseconds
			event.size = event.bcount * BYTE_PER_BLOCK;
			queue.push(event);
		}
		readDone = true; 
		queue.notifyAll(); // notify worker we're done  
	});	
	queue.waitUntilFull(); // wait until at least queue's full

	printf("Start replaying trace\n");
	vector<thread> workers(nthreads); // generate worker threads
	atomic<int> lateCount(0), threadId(0); // late I/O count and threadId
	for (auto& t : workers) t = thread([&] { // launch workers 
		int myId = ++threadId; // id for this thread
		int myLateCount = 0; // local lateCount for this thread 
		Logger logger(logDir + traceFile + to_string(myId));
		
		Timer timer; // mark the beginning of worker thread	
		while (!readDone or !queue.empty()) { 
			TraceEvent event;
			if (not queue.pop(event)) continue; // retry 
			long currentTime = timer.elapsedTime(), nextIoTime = event.time;
			if (currentTime <= nextIoTime) { // we're early/on-time
				//printf(". next=%ld current=%ld \n", nextIoTime, currentTime);
				Timer::delay(nextIoTime - currentTime); // delay until ~specified time
			} else { // we're late
				//printf("x next=%ld current=%ld delta=%ld\n", 
				// nextIoTime, currentTime, currentTime-nextIoTime);
				++myLateCount;
			}
			
			performIo(fd, buf, event, logger);
		}
		lateCount += myLateCount; // update global lateCount
	});

	fileThread.join(); // wait for all threads to finish
	for (auto& t : workers) t.join(); 

	printf("Late count: %d\n", lateCount.load());
	Logger logger(logDir + traceFile + to_string(0));
	logger.printf("%d\n", lateCount.load());
	printf("Done\n");
	return 0;
}