Loading...
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
/*
 * Benchmark VM fault throughput.
 * This test faults memory for a configurable amount of time across a
 * configurable number of threads. Currently it only measures zero fill faults.
 * Currently it supports two variants:
 * 1. Each thread gets its own vm objects to fault in
 * 2. Threads share vm objects
 *
 * We'll add more fault types as we identify problematic user-facing workloads
 * in macro benchmarks.
 *
 * Throughput is reported as pages / second using both wall time and cpu time.
 * CPU time is a more reliable metric for regression testing, but wall time can
 * highlight blocking in the VM.
 *
 * Running this benchmark directly is not recommended.
 * Use fault_throughput.lua which provides a nicer interface and outputs
 * perfdata.
 */
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>

#include <sys/mman.h>
#include <sys/types.h>
#include <sys/sysctl.h>

/*
 * TODO: Make this benchmark runnable on linux so we can do a perf comparison.
 * We're mostly using POSIX APIs, but we'll need to replace
 * the sysctls with the /proc equivalents, and replace clock_gettime_nsec_np
 * with the linux equivalent.
 */
#include <mach/mach.h>

#include <TargetConditionals.h>

#include <pthread.h>
#include <stdatomic.h>

#include "benchmark/helpers.h"

#if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
/*
 * On non-embedded platforms we coalesce vm objects up to 128 MB, so
 * we make the objects 128 MB on that platform to ensure they're not
 * merged with anything else.
 */
const static size_t kVmObjectSize = 128 * (1UL << 20);
#else
/*
 * Embedded platforms don't coalesce vm objects. This number
 * needs to be big enough that faulting it in dwarfs the cost of dequeuing
 * it from the work queue, but can't be too large or else we won't be able
 * to allocate one per thread in the separate-objects benchmark.
 */
const static size_t kVmObjectSize = 4 * (1UL << 20);
#endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
/* Clock the worker threads sample around each buffer to measure their own CPU time. */
static const clockid_t kThreadCPUTimeClock = CLOCK_THREAD_CPUTIME_ID;
/* These globals are set dynamically during test setup based on sysctls. */
/* Value of the hw.cachelinesize sysctl; sizes the padding allocated after test_globals_t. */
static uint64_t kCacheLineSize = 0;
/* The VM page size (value of the vm.pagesize sysctl). */
static size_t kPageSize = 0;


/*
 * One entry in the work queue: a contiguous range of mapped memory that a
 * single worker thread claims and faults in.
 */
typedef struct fault_buffer {
	unsigned char* fb_start; /* The start of this buffer. */
	size_t fb_size; /* The size of this buffer in bytes. */
} fault_buffer_t;

/* Which benchmark flavour to run; selected by the first positional argument. */
typedef enum test_variant {
	/* Every work-queue entry is its own mapping of kVmObjectSize bytes. */
	VARIANT_SEPARATE_VM_OBJECTS,
	/* Each mapping is entered into the work queue once per thread, at page offsets. */
	VARIANT_SHARE_VM_OBJECTS
} test_variant_t;

/*
 * State shared between the main thread and the worker threads.
 * Always allocate with allocate_test_globals(): the allocation is larger than
 * sizeof(test_globals_t) because the atomic work-queue index lives past the
 * end of the struct (see next_fault_buffer_index_ptr).
 */
typedef struct test_globals {
	/* This lock protects: tg_cv, tg_running_count, tg_done, tg_current_iteration, and tg_iterations_completed. */
	pthread_mutex_t tg_lock;
	/* Signalled on every state change: workers ready, iteration started, iteration finished, test done. */
	pthread_cond_t tg_cv;
	/* The number of currently running threads */
	unsigned int tg_running_count;
	/* Set during cleanup to indicate that the benchmark is over. */
	bool tg_done;
	/* Incremented by the main thread in start_iteration to release the workers. */
	size_t tg_current_iteration;
	/* Incremented by the last worker to finish an iteration. */
	size_t tg_iterations_completed;
	/* Number of worker threads taking part in the test. */
	unsigned int tg_num_threads;
	/* The variant being run; determines the work-queue stride. */
	test_variant_t tg_variant;
	/* When true, each worker binds itself to a cpu before faulting. */
	bool pin_threads;
	/*
	 * An array of memory objects to fault in.
	 * This is basically a workqueue of
	 * contiguous chunks of memory that the worker threads
	 * will fault in.
	 */
	fault_buffer_t *tg_fault_buffer_arr;
	/* Number of entries in tg_fault_buffer_arr. */
	size_t tg_fault_buffer_arr_length;
	/*
	 * To avoid false sharing, we pad the test globals with an extra cache line and place the atomic
	 * next_fault_buffer_index size_t after the cache line.
	 * The cache line size is only known at runtime, so the padding is a
	 * flexible array member sized by allocate_test_globals.
	 */
	__unused char padding[];
	/*
	 * This field is directly after the padding buffer.
	 * It is used to synchronize access to tg_fault_buffer_arr.
	 * It cannot be declared as a real member because it follows a flexible
	 * array member; it is reached through next_fault_buffer_index_ptr instead.
	 */
	//_Atomic size_t tg_next_fault_buffer_index;
} test_globals_t;

/* Argument block passed to each worker thread's start routine. */
typedef struct {
	/* Points at the shared test_globals_t; stored as void* and converted back in faulting_thread. */
	void *test_globals;
	/* The cpu this worker binds to; only read when thread pinning is enabled. */
	uint32_t cpu_id;
} faulting_thread_args_t;

/*
 * Per-thread argument array handed to the workers. Allocated in
 * spawn_worker_threads and freed in join_background_threads.
 */
static faulting_thread_args_t *faulting_thread_args;

/* Command-line spellings of the two test variants (matched in parse_arguments). */
static const char* kSeparateObjectsArgument = "separate-objects";
static const char* kShareObjectsArgument = "share-objects";

/* Arguments parsed from the command line */
typedef struct test_args {
	/* Number of worker threads to spawn. */
	uint32_t n_threads;
	/* First cpu to pin to; only meaningful when pin_threads is set. */
	uint32_t first_cpu;
	/* Wall-clock time budget for the whole test, in seconds. */
	uint64_t duration_seconds;
	/* Which test variant to run. */
	test_variant_t variant;
	/* Set when a first-cpu argument was supplied on the command line. */
	bool pin_threads;
	/* Set by the -v flag; enables progress logging. */
	bool verbose;
} test_args_t;

/*
 * Fault in the pages in the given buffer.
 */
static void fault_pages(fault_buffer_t *buffer, size_t stride);
/* Get a unique fault buffer from the global work queue. */
static fault_buffer_t *get_fault_buffer(test_globals_t* globals);
/*
 * Grabs buffers from the global test structure and faults them in, using this
 * test variant's stride, until there are no more buffers to grab.
 * Returns the number of microseconds spent on-cpu.
 */
static uint64_t grab_and_fault_pages(test_globals_t* globals);

static bool worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals);
static void worker_thread_iteration_complete(test_globals_t *globals);

static void parse_arguments(int argc, char **argv, test_args_t *args);
/*
 * Sets up the test globals and spawns the background threads to do the faults.
 * Returns an array of size `num_threads`
 * Containing the thread ids of the forked threads.
 */
static pthread_t* setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose);
static test_globals_t *allocate_test_globals(void);
/* Initializes variables in the globals array. */
static void init_globals(test_globals_t *globals, const test_args_t *args);
static inline _Atomic size_t *next_fault_buffer_index_ptr(test_globals_t *globals);
/*
 * Called on the main thread.
 * Waits for the background threads to be ready, sets up the memory objects,
 * and then starts a faulting iteration.
 * Returns the start (wall) time.
 */
static uint64_t start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose);
/*
 * Called on the main thread.
 * Waits for the background threads to complete the iteration and unmaps the
 * iteration's fault buffers.
 * Returns the wall-clock time, in nanoseconds, elapsed since start_time.
 */
static uint64_t finish_iteration(test_globals_t *globals, uint64_t start_time);
/*
 * Called on the main thread.
 * Maps buffers and places them in the work queue.
 */
static void setup_memory(test_globals_t* globals, test_variant_t variant);
/*
 * Dump test results as a csv to stdout.
 * Use fault_throughput.lua to convert to perfdata.
 */
static void output_results(const test_globals_t *globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds);
static void cleanup_test(test_globals_t *globals);
/*
 * Join the background threads and return the total microseconds
 * of cpu time spent faulting across all of the threads.
 * Takes ownership of the threads array and frees it.
 */
static uint64_t join_background_threads(test_globals_t *globals, pthread_t *threads);
static void unmap_fault_buffers(test_globals_t *globals);
/*
 * Get the stride between each vm object in the fault buffer array.
 */
static size_t fault_buffer_stride(const test_globals_t *globals);

/*
 * Entry point: parse the command line, spawn the workers, run faulting
 * iterations until the wall-time budget is spent, then report throughput.
 */
int
main(int argc, char **argv)
{
	/* Per-core memory budget for the test; scaled by the cpu count below. */
#if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
	static const size_t memory_per_core = kVmObjectSize;
#else
	static const size_t memory_per_core = 25 * (1UL << 20);
#endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
	const size_t kMemSize = memory_per_core * (size_t) get_ncpu();
	test_globals_t *globals = allocate_test_globals();
	test_args_t args;
	parse_arguments(argc, argv, &args);
	pthread_t *workers = setup_test(globals, &args, kMemSize, args.verbose);

	/* Wall time accumulated over all completed iterations. */
	uint64_t elapsed_wall_ns = 0;
	for (;;) {
		/* Stop as soon as the (wall) time budget has been used up. */
		if (!(elapsed_wall_ns < args.duration_seconds * kNumNanosecondsInSecond)) {
			break;
		}
		benchmark_log(args.verbose, "----Starting Iteration %lu-----\n", globals->tg_current_iteration + 1);
		uint64_t iteration_start_ns = start_iteration(globals, args.variant, args.verbose);
		elapsed_wall_ns += finish_iteration(globals, iteration_start_ns);
		benchmark_log(args.verbose, "----Completed Iteration %lu----\n", globals->tg_current_iteration);
	}

	benchmark_log(args.verbose, "Hit time budget\nJoining worker threads\n");
	/* Joining also collects the cpu time each worker spent faulting. */
	uint64_t cpu_faulting_us = join_background_threads(globals, workers);
	benchmark_log(args.verbose, "----End Test Output----\n");
	output_results(globals, (double) elapsed_wall_ns / kNumNanosecondsInSecond,
	    (double)cpu_faulting_us / kNumMicrosecondsInSecond);
	cleanup_test(globals);

	return 0;
}


/* The main loop for the worker threads. */
static void*
faulting_thread(void* arg)
{
	test_globals_t* globals = ((faulting_thread_args_t *)arg)->test_globals;
	uint64_t on_cpu_time_faulting = 0;
	size_t current_iteration = 1;

	if (globals->pin_threads) {
		uint32_t cpu_id = ((faulting_thread_args_t *)arg)->cpu_id;
		int err = sysctlbyname("kern.sched_thread_bind_cpu", NULL, 0, &cpu_id, sizeof(cpu_id));
		assert(err == 0);
	}

	while (true) {
		bool should_continue = worker_thread_iteration_setup(current_iteration, globals);
		if (!should_continue) {
			break;
		}
		on_cpu_time_faulting += grab_and_fault_pages(globals);
		worker_thread_iteration_complete(globals);
		current_iteration++;
	}
	return (void*)on_cpu_time_faulting;
}

/*
 * Worker-side barrier at the start of an iteration.
 * Checks this thread in as running, wakes the main thread once every worker
 * has checked in, then sleeps until the main thread either advances
 * tg_current_iteration to `current_iteration` or sets tg_done.
 * Returns true if the iteration should run, false if the test is over.
 */
static bool
worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals)
{
	int rc = pthread_mutex_lock(&globals->tg_lock);
	assert(rc == 0);

	globals->tg_running_count += 1;
	if (globals->tg_running_count == globals->tg_num_threads) {
		/* Last worker to arrive: let the main thread know it can ungate the test. */
		rc = pthread_cond_broadcast(&globals->tg_cv);
		assert(rc == 0);
	}

	/*
	 * start_iteration bumps tg_current_iteration to release us.
	 * join_background_threads sets tg_done to end the test.
	 */
	for (;;) {
		if (globals->tg_done || globals->tg_current_iteration == current_iteration) {
			break;
		}
		rc = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(rc == 0);
	}
	/* Sample tg_done while the lock is still held. */
	const bool keep_going = !globals->tg_done;

	rc = pthread_mutex_unlock(&globals->tg_lock);
	assert(rc == 0);
	return keep_going;
}

/*
 * Worker-side barrier at the end of an iteration.
 * Checks this thread out; the last thread out records the iteration as
 * completed and wakes everybody, all others sleep until that has happened.
 */
static void
worker_thread_iteration_complete(test_globals_t *globals)
{
	int rc = pthread_mutex_lock(&globals->tg_lock);
	assert(rc == 0);

	globals->tg_running_count -= 1;
	if (globals->tg_running_count != 0) {
		/* Somebody is still faulting; wait until the iteration is marked complete. */
		while (globals->tg_iterations_completed != globals->tg_current_iteration) {
			rc = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
			assert(rc == 0);
		}
	} else {
		/* Last one out: publish completion and wake the other workers and the main thread. */
		globals->tg_iterations_completed += 1;
		rc = pthread_cond_broadcast(&globals->tg_cv);
		assert(rc == 0);
	}

	rc = pthread_mutex_unlock(&globals->tg_lock);
	assert(rc == 0);
}

/*
 * Touch one byte every `stride` bytes of `buffer`, starting at its first byte.
 *
 * The buffer is walked by offset rather than by advancing a pointer, so no
 * pointer beyond the end of the buffer is ever formed (the final
 * `ptr += stride` of a pointer walk would step past one-past-the-end, which
 * is undefined behaviour).
 *
 * `stride` must be non-zero, otherwise the loop never terminates.
 */
static void
fault_pages(fault_buffer_t *buffer, size_t stride)
{
	/* Volatile sink so the compiler cannot discard the loads. */
	volatile unsigned char val;
	const unsigned char *base = buffer->fb_start;
	for (size_t offset = 0; offset < buffer->fb_size; offset += stride) {
		val = base[offset];
	}
}

/*
 * Claim the next unclaimed entry of the work queue.
 * The shared index is bumped atomically, so each entry is handed to exactly
 * one caller. Returns NULL once the queue has been exhausted.
 */
static fault_buffer_t *
get_fault_buffer(test_globals_t* globals)
{
	_Atomic size_t *cursor = next_fault_buffer_index_ptr(globals);
	size_t claimed = atomic_fetch_add_explicit(cursor, 1UL, memory_order_acq_rel);
	return (claimed >= globals->tg_fault_buffer_arr_length)
	       ? NULL
	       : &globals->tg_fault_buffer_arr[claimed];
}

/*
 * Drain the work queue: claim buffers one at a time and fault each one in,
 * timing only the faulting itself with the per-thread cpu clock.
 * Returns the summed on-cpu time as reported by timespec_difference_us
 * (microseconds, going by that helper's name).
 */
static uint64_t
grab_and_fault_pages(test_globals_t* globals)
{
	/* Byte distance between touched pages: the variant's stride, in pages, times the page size. */
	size_t stride = fault_buffer_stride(globals) * kPageSize;
	uint64_t on_cpu_time = 0;
	fault_buffer_t *buffer;

	while ((buffer = get_fault_buffer(globals)) != NULL) {
		struct timespec begin, end;
		int ret = clock_gettime(kThreadCPUTimeClock, &begin);
		assert(ret == 0);

		fault_pages(buffer, stride);

		ret = clock_gettime(kThreadCPUTimeClock, &end);
		assert(ret == 0);
		on_cpu_time += (unsigned long) timespec_difference_us(&end, &begin);
	}
	return on_cpu_time;
}

/*
 * Called on the main thread to begin one faulting iteration.
 * Waits (under tg_lock) until every worker has checked in, maps fresh buffers
 * into the work queue, takes the start timestamp and then releases the
 * workers by advancing tg_current_iteration.
 * Returns the start timestamp from current_timestamp_ns().
 */
static uint64_t
start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose)
{
	int ret;
	uint64_t start_time;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	benchmark_log(verbose, "Waiting for workers to catch up before starting next iteration.\n");
	/* Wait until all the threads are ready to go to the next iteration */
	while (globals->tg_running_count != globals->tg_num_threads) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		/* Check the wait like every other wait in this file does. */
		assert(ret == 0);
	}
	benchmark_log(verbose, "Workers are all caught up\n");
	/* Workers are all parked on the condition variable, so the queue can be rebuilt safely. */
	setup_memory(globals, variant);
	benchmark_log(verbose, "Initialized data structures for iteration. Waking workers.\n");
	/* Grab a timestamp, tick the current iteration, and wake up the worker threads */
	start_time = current_timestamp_ns();
	globals->tg_current_iteration++;
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	ret = pthread_cond_broadcast(&globals->tg_cv);
	assert(ret == 0);
	return start_time;
}

/*
 * Called on the main thread to end one faulting iteration.
 * Waits until the workers have marked the current iteration as completed,
 * takes the end timestamp, then unmaps the iteration's buffers (outside the
 * timed region).
 * Returns the time elapsed between `start_time` and the end timestamp.
 */
static uint64_t
finish_iteration(test_globals_t* globals, uint64_t start_time)
{
	int ret;
	uint64_t end_time;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	while (globals->tg_iterations_completed != globals->tg_current_iteration) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		/* Check the wait like every other wait in this file does. */
		assert(ret == 0);
	}
	end_time = current_timestamp_ns();
	ret = pthread_mutex_unlock(&globals->tg_lock);
	/* Check the unlock result before doing unrelated work. */
	assert(ret == 0);
	unmap_fault_buffers(globals);
	return end_time - start_time;
}

/*
 * Fill the work queue for one iteration.
 * One buffer of kVmObjectSize bytes is mapped per stride-sized group of queue
 * entries. In the share-objects variant the remaining entries of the group
 * alias the same mapping, each starting one page further in. Finally the
 * shared queue index is reset to 0.
 * Exits with status 2 on an unknown variant.
 */
static void
setup_memory(test_globals_t* globals, test_variant_t variant)
{
	const size_t group_size = fault_buffer_stride(globals);
	fault_buffer_t *queue = globals->tg_fault_buffer_arr;

	for (size_t base = 0; base < globals->tg_fault_buffer_arr_length; base += group_size) {
		fault_buffer_t *head = &queue[base];
		head->fb_start = mmap_buffer(kVmObjectSize);
		head->fb_size = kVmObjectSize;

		switch (variant) {
		case VARIANT_SHARE_VM_OBJECTS:
			/*
			 * Thread j's view starts j pages into the object. Workers step by
			 * (number of threads * page size), so the views never touch the
			 * same page.
			 */
			for (size_t j = 1; j < globals->tg_num_threads; j++) {
				size_t skip = kPageSize * j;
				fault_buffer_t *view = &queue[base + j];
				view->fb_start = head->fb_start + skip;
				view->fb_size = head->fb_size - skip;
			}
			break;
		case VARIANT_SEPARATE_VM_OBJECTS:
			/* Nothing more to do: one queue entry per mapping. */
			break;
		default:
			fprintf(stderr, "Unknown test variant.\n");
			exit(2);
		}
	}
	atomic_store_explicit(next_fault_buffer_index_ptr(globals), 0, memory_order_release);
}

/*
 * Unmap every buffer mapped by setup_memory.
 * Only the first entry of each stride-sized group owns a mapping, so only
 * those entries are passed to munmap.
 */
static void
unmap_fault_buffers(test_globals_t* globals)
{
	const size_t group_size = fault_buffer_stride(globals);
	const size_t n_entries = globals->tg_fault_buffer_arr_length;
	size_t idx = 0;

	while (idx < n_entries) {
		fault_buffer_t *owner = &globals->tg_fault_buffer_arr[idx];
		int res = munmap(owner->fb_start, owner->fb_size);
		assert(res == 0);
		idx += group_size;
	}
}

static test_globals_t *
allocate_test_globals()
{
	test_globals_t *globals = NULL;
	int ret;
	if (kCacheLineSize == 0) {
		size_t cachelinesize_size = sizeof(kCacheLineSize);
		ret = sysctlbyname("hw.cachelinesize", &kCacheLineSize, &cachelinesize_size, NULL, 0);
		assert(ret == 0);
		assert(kCacheLineSize > 0);
	}
	if (kPageSize == 0) {
		size_t pagesize_size = sizeof(kPageSize);
		ret = sysctlbyname("vm.pagesize", &kPageSize, &pagesize_size, NULL, 0);
		assert(ret == 0);
		assert(kPageSize > 0);
	}
	size_t test_globals_size = sizeof(test_globals_t) + kCacheLineSize + sizeof(_Atomic size_t);
	globals = malloc(test_globals_size);
	assert(globals != NULL);
	memset(globals, 0, test_globals_size);
	return globals;
}

/*
 * Reset the fixed-size part of `globals`, create its lock and condition
 * variable with default attributes, and copy the thread count, variant and
 * pinning choice over from `args`.
 */
static void
init_globals(test_globals_t *globals, const test_args_t *args)
{
	int rc;

	memset(globals, 0, sizeof(test_globals_t));

	/* Mutex, created from a default attribute object that is released right away. */
	pthread_mutexattr_t lock_attrs;
	rc = pthread_mutexattr_init(&lock_attrs);
	assert(rc == 0);
	rc = pthread_mutex_init(&globals->tg_lock, &lock_attrs);
	assert(rc == 0);
	rc = pthread_mutexattr_destroy(&lock_attrs);
	assert(rc == 0);

	/* Condition variable, same pattern. */
	pthread_condattr_t cv_attrs;
	rc = pthread_condattr_init(&cv_attrs);
	assert(rc == 0);
	rc = pthread_cond_init(&globals->tg_cv, &cv_attrs);
	assert(rc == 0);
	rc = pthread_condattr_destroy(&cv_attrs);
	assert(rc == 0);

	globals->pin_threads = args->pin_threads;
	globals->tg_variant = args->variant;
	globals->tg_num_threads = args->n_threads;
}

/*
 * Size and allocate the work queue.
 * The number of vm objects is memory_size / kVmObjectSize. The share-objects
 * variant needs one queue entry per thread for each of them.
 * Exits with status 2 on an unsupported variant.
 */
static void
init_fault_buffer_arr(test_globals_t *globals, const test_args_t *args, size_t memory_size)
{
	switch (args->variant) {
	case VARIANT_SEPARATE_VM_OBJECTS:
		/* One queue entry per vm object. */
		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize;
		break;
	case VARIANT_SHARE_VM_OBJECTS:
		/* Same number of vm objects, but every thread gets its own entry into each one. */
		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize * globals->tg_num_threads;
		break;
	default:
		fprintf(stderr, "Unsupported test variant.\n");
		exit(2);
	}
	/*
	 * More threads than queue entries makes no sense. memory_size scales
	 * with the cpu count, so this only trips when the benchmark is run with
	 * many more threads than cores.
	 */
	assert(globals->tg_fault_buffer_arr_length >= globals->tg_num_threads);
	globals->tg_fault_buffer_arr = calloc(globals->tg_fault_buffer_arr_length, sizeof(fault_buffer_t));
	assert(globals->tg_fault_buffer_arr);
}

/*
 * Create `num_threads` worker threads running faulting_thread.
 *
 * Allocates the file-scope faulting_thread_args array (one entry per thread)
 * and the returned pthread_t array. Both are released by
 * join_background_threads. When pinning is enabled, worker i is assigned cpu
 * (i + first_cpu) modulo the cpu count.
 *
 * Returns the array of thread handles.
 */
static pthread_t *
spawn_worker_threads(test_globals_t *globals, unsigned int num_threads, unsigned int first_cpu)
{
	int ret;
	pthread_attr_t pthread_attrs;
	globals->tg_num_threads = num_threads;
	pthread_t* threads = malloc(sizeof(pthread_t) * num_threads);
	assert(threads);
	/* Zeroed so cpu_id holds a defined value even when threads are not pinned. */
	faulting_thread_args = calloc(num_threads, sizeof(faulting_thread_args_t));
	/* This allocation was previously used unchecked. */
	assert(faulting_thread_args);
	ret = pthread_attr_init(&pthread_attrs);
	assert(ret == 0);
	// Spawn the background threads
	for (unsigned int i = 0; i < num_threads; i++) {
		if (globals->pin_threads) {
			faulting_thread_args[i].cpu_id = (i + first_cpu) % get_ncpu();
		}
		faulting_thread_args[i].test_globals = globals;
		ret = pthread_create(threads + i, &pthread_attrs, faulting_thread, &faulting_thread_args[i]);
		assert(ret == 0);
	}
	ret = pthread_attr_destroy(&pthread_attrs);
	assert(ret == 0);
	return threads;
}

/*
 * Prepare everything needed before the first iteration: initialize the
 * shared globals, allocate the work queue, and start the worker threads.
 * Returns the worker thread handles produced by spawn_worker_threads.
 */
static pthread_t*
setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose)
{
	pthread_t *thread_handles;

	/* Shared state first: the workers read it as soon as they start. */
	init_globals(globals, args);
	init_fault_buffer_arr(globals, args, memory_size);
	benchmark_log(verbose, "Initialized global data structures.\n");

	thread_handles = spawn_worker_threads(globals, args->n_threads, args->first_cpu);
	benchmark_log(verbose, "Spawned workers.\n");

	return thread_handles;
}

/*
 * Tell the workers the test is over, join them, and add up the cpu time each
 * one reports through its exit value.
 *
 * Takes ownership of `threads` and frees it, together with the file-scope
 * faulting_thread_args array.
 *
 * Returns the summed per-thread cpu time spent faulting.
 */
static uint64_t
join_background_threads(test_globals_t *globals, pthread_t *threads)
{
	// Set the done flag so that the background threads exit
	int ret;
	uint64_t total_cputime_spent_faulting = 0;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_done = true;
	ret = pthread_cond_broadcast(&globals->tg_cv);
	assert(ret == 0);
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);

	// Join the background threads
	for (unsigned int i = 0; i < globals->tg_num_threads; i++) {
		/*
		 * Receive the exit value as a real void* and convert it afterwards.
		 * Handing pthread_join a casted uint64_t* makes it write a pointer
		 * through the wrong type.
		 */
		void *thread_result = NULL;
		ret = pthread_join(threads[i], &thread_result);
		assert(ret == 0);
		total_cputime_spent_faulting += (uint64_t)(uintptr_t)thread_result;
	}
	free(threads);
	free(faulting_thread_args);
	/* Don't leave the file-scope pointer dangling. */
	faulting_thread_args = NULL;
	return total_cputime_spent_faulting;
}

/*
 * Tear down the synchronization primitives and release the work queue and
 * the globals allocation itself. `globals` must not be used afterwards.
 */
static void
cleanup_test(test_globals_t* globals)
{
	int rc = pthread_mutex_destroy(&globals->tg_lock);
	assert(rc == 0);

	rc = pthread_cond_destroy(&globals->tg_cv);
	assert(rc == 0);

	/* The queue pointer lives inside globals, so free it first. */
	free(globals->tg_fault_buffer_arr);
	free(globals);
}

/*
 * Print the benchmark results to stdout as a two-column csv.
 *
 * The page count is the size of every mapped buffer (the first entry of each
 * stride-sized group of the work queue) divided by the page size, multiplied
 * by the number of completed iterations. Throughput is that count divided by
 * the wall time and by the cpu time respectively.
 */
static void
output_results(const test_globals_t* globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds)
{
	/*
	 * Start from zero so no indeterminate bytes remain if the sysctl fills
	 * in fewer than sizeof(size_t) bytes.
	 */
	size_t pgsize = 0;
	size_t sysctl_size = sizeof(pgsize);
	int ret = sysctlbyname("vm.pagesize", &pgsize, &sysctl_size, NULL, 0);
	assert(ret == 0);
	/* Same sanity check as allocate_test_globals; pgsize is used as a divisor below. */
	assert(pgsize > 0);
	size_t num_pages = 0;
	double walltime_throughput, cputime_throughput;
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		num_pages += globals->tg_fault_buffer_arr[i].fb_size / pgsize;
	}
	num_pages *= globals->tg_iterations_completed;
	walltime_throughput = num_pages / walltime_elapsed_seconds;
	cputime_throughput = num_pages / cputime_elapsed_seconds;
	printf("-----Results-----\n");
	printf("Throughput (pages / wall second), Throughput (pages / CPU second)\n");
	printf("%f,%f\n", walltime_throughput, cputime_throughput);
}

/*
 * Print the usage message to stderr.
 * The argument order shown matches what parse_arguments accepts: the
 * optional -v flag comes first, and an optional first-cpu argument may follow
 * the thread count (supplying it enables thread pinning).
 */
static void
print_help(char** argv)
{
	fprintf(stderr, "%s: [-v] <test-variant> duration num_threads [first_cpu]\n", argv[0]);
	fprintf(stderr, "\ntest variants:\n");
	fprintf(stderr, "\t%s\tFault in different vm objects in each thread.\n", kSeparateObjectsArgument);
	fprintf(stderr, "\t%s\t\tShare vm objects across faulting threads.\n", kShareObjectsArgument);
}

/*
 * Parse the command line into `args`.
 *
 * Accepted form: [-v] <test-variant> duration num_threads [first_cpu]
 *   -v            enable verbose logging
 *   test-variant  matched case-insensitively against the two variant names
 *   duration      wall-time budget in seconds, must be positive
 *   num_threads   number of workers, must be in 1..ncpu
 *   first_cpu     optional; when present threads are pinned starting here
 *
 * On a malformed command line the usage text is printed and the process
 * exits with status 1. Out-of-range cpu numbers still trip an assert, as
 * before.
 */
static void
parse_arguments(int argc, char** argv, test_args_t *args)
{
	int current_argument = 1;
	memset(args, 0, sizeof(test_args_t));
	if (argc < 4 || argc > 6) {
		print_help(argv);
		exit(1);
	}
	if (argv[current_argument][0] == '-') {
		if (strcmp(argv[current_argument], "-v") == 0) {
			args->verbose = true;
		} else {
			fprintf(stderr, "Unknown argument %s\n", argv[current_argument]);
			print_help(argv);
			exit(1);
		}
		current_argument++;
	}
	/*
	 * The variant, duration and thread count must all still be present.
	 * Without this check, "-v <variant> <duration>" (argc == 4) would read
	 * argv[argc], which is NULL, as the thread count.
	 */
	if (argc - current_argument < 3) {
		print_help(argv);
		exit(1);
	}
	if (strncasecmp(argv[current_argument], kSeparateObjectsArgument, strlen(kSeparateObjectsArgument)) == 0) {
		args->variant = VARIANT_SEPARATE_VM_OBJECTS;
	} else if (strncasecmp(argv[current_argument], kShareObjectsArgument, strlen(kShareObjectsArgument)) == 0) {
		args->variant = VARIANT_SHARE_VM_OBJECTS;
	} else {
		print_help(argv);
		exit(1);
	}
	current_argument++;

	/* strtol yields 0 for non-numeric input; negative values would wrap when stored unsigned. */
	long duration = strtol(argv[current_argument++], NULL, 10);
	if (duration <= 0) {
		print_help(argv);
		exit(1);
	}
	long num_cores = strtol(argv[current_argument++], NULL, 10);
	if (num_cores <= 0) {
		print_help(argv);
		exit(1);
	}
	if (current_argument < argc) {
		/* A trailing argument selects the first cpu and turns pinning on. */
		long first_cpu = strtol(argv[current_argument++], NULL, 10);
		assert(first_cpu >= 0 && first_cpu < get_ncpu());
		args->pin_threads = true;
		args->first_cpu = (unsigned int) first_cpu;
	} else {
		args->pin_threads = false;
	}

	assert(num_cores > 0 && num_cores <= get_ncpu());
	args->n_threads = (unsigned int) num_cores;
	args->duration_seconds = (unsigned long) duration;
}

/*
 * Locate the atomic work-queue index that lives past the end of `globals`:
 * sizeof(test_globals_t) bytes of struct, then kCacheLineSize bytes of
 * padding, then the index. `globals` must come from allocate_test_globals,
 * which reserves that trailing storage.
 *
 * The address is computed with byte-pointer arithmetic rather than by
 * round-tripping the pointer through signed integer types.
 */
static inline
_Atomic size_t *
next_fault_buffer_index_ptr(test_globals_t *globals)
{
	unsigned char *after_globals = (unsigned char *)(globals + 1);
	return (_Atomic size_t *)(void *)(after_globals + kCacheLineSize);
}
/*
 * Number of consecutive work-queue entries that belong to one vm object:
 * 1 when every entry has its own object, the thread count when objects are
 * shared. Exits with status -1 on an unknown variant.
 */
static size_t
fault_buffer_stride(const test_globals_t *globals)
{
	switch (globals->tg_variant) {
	case VARIANT_SEPARATE_VM_OBJECTS:
		return 1;
	case VARIANT_SHARE_VM_OBJECTS:
		return globals->tg_num_threads;
	default:
		fprintf(stderr, "Unknown variant\n");
		exit(-1);
	}
}