Loading...
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/task.h>

#include <kern/ast.h>
#include <kern/kalloc.h>
#include <kern/kern_types.h>
#include <kern/mach_param.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/restartable.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/waitq.h>

#include <os/atomic_private.h>
#include <os/hash.h>
#include <os/refcnt.h>

/**
 * @file osfmk/kern/restartable.c
 *
 * @brief
 * This module implements restartable userspace functions.
 *
 * @discussion
 * task_restartable_ranges_register() allows task to configure
 * the restartable ranges, only once per task,
 * before it has made its second thread.
 *
 * task_restartable_ranges_synchronize() can later be used to trigger
 * restarts for threads with a PC in a restartable region.
 *
 * It is implemented with an AST (AST_RESET_PCS) that will cause threads
 * as they return to userspace to reset PCs in a restartable region
 * to the recovery offset of this region.
 *
 * Because signal delivery would mask the proper saved PC for threads,
 * sigreturn also forcefully sets the AST and will go through the logic
 * every single time.
 */

typedef int (*cmpfunc_t)(const void *a, const void *b);
extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);

#define RR_RANGES_MAX   64
struct restartable_ranges {
	queue_chain_t            rr_link;
	os_refcnt_t              rr_ref;
	uint32_t                 rr_count;
	uint32_t                 rr_hash;
	task_restartable_range_t rr_ranges[RR_RANGES_MAX];
};

#if DEBUG || DEVELOPMENT
#define RR_HASH_SIZE   256
#else
// Release kernel userspace should have shared caches and a single registration
#define RR_HASH_SIZE    16
#endif

static queue_head_t rr_hash[RR_HASH_SIZE];
LCK_GRP_DECLARE(rr_lock_grp, "restartable ranges");
LCK_SPIN_DECLARE(rr_spinlock, &rr_lock_grp);

#define rr_lock()   lck_spin_lock_grp(&rr_spinlock, &rr_lock_grp)
#define rr_unlock() lck_spin_unlock(&rr_spinlock);

#pragma mark internals

/**
 * @function _ranges_cmp
 *
 * @brief
 * Compares two ranges together.
 */
static int
_ranges_cmp(const void *_r1, const void *_r2)
{
	const task_restartable_range_t *r1 = _r1;
	const task_restartable_range_t *r2 = _r2;

	if (r1->location != r2->location) {
		return r1->location < r2->location ? -1 : 1;
	}
	if (r1->length == r2->length) {
		return 0;
	}
	return r1->length < r2->length ? -1 : 1;
}

/**
 * @function _ranges_validate
 *
 * @brief
 * Validates an array of PC ranges for wraps and intersections.
 *
 * @discussion
 * This sorts and modifies the input.
 *
 * The ranges must:
 * - not wrap around,
 * - have a length/recovery offset within a page of the range start
 *
 * @returns
 * - KERN_SUCCESS:          ranges are valid
 * - KERN_INVALID_ARGUMENT: ranges are invalid
 */
static kern_return_t
_ranges_validate(task_t task, task_restartable_range_t *ranges, uint32_t count)
{
	qsort(ranges, count, sizeof(task_restartable_range_t), _ranges_cmp);
	uint64_t limit = task_has_64Bit_data(task) ? UINT64_MAX : UINT32_MAX;
	uint64_t end, recovery;

	if (count == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	for (size_t i = 0; i < count; i++) {
		if (ranges[i].length > TASK_RESTARTABLE_OFFSET_MAX ||
		    ranges[i].recovery_offs > TASK_RESTARTABLE_OFFSET_MAX) {
			return KERN_INVALID_ARGUMENT;
		}
		if (ranges[i].flags) {
			return KERN_INVALID_ARGUMENT;
		}
		if (os_add_overflow(ranges[i].location, ranges[i].length, &end)) {
			return KERN_INVALID_ARGUMENT;
		}
		if (os_add_overflow(ranges[i].location, ranges[i].recovery_offs, &recovery)) {
			return KERN_INVALID_ARGUMENT;
		}
		if (ranges[i].location > limit || end > limit || recovery > limit) {
			return KERN_INVALID_ARGUMENT;
		}
		if (i + 1 < count && end > ranges[i + 1].location) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	return KERN_SUCCESS;
}

/**
 * @function _ranges_lookup
 *
 * @brief
 * Lookup the left side of a range for a given PC within a set of ranges.
 *
 * @returns
 * - 0: no PC range found
 * - the left-side of the range.
 */
__attribute__((always_inline))
static mach_vm_address_t
_ranges_lookup(struct restartable_ranges *rr, mach_vm_address_t pc)
{
	task_restartable_range_t *ranges = rr->rr_ranges;
	uint32_t l = 0, r = rr->rr_count;

	if (pc <= ranges[0].location) {
		return 0;
	}
	if (pc >= ranges[r - 1].location + ranges[r - 1].length) {
		return 0;
	}

	while (l < r) {
		uint32_t i = (r + l) / 2;
		mach_vm_address_t location = ranges[i].location;

		if (pc <= location) {
			/* if the PC is exactly at pc_start, no reset is needed */
			r = i;
		} else if (location + ranges[i].length <= pc) {
			/* if the PC is exactly at the end, it's out of the function */
			l = i + 1;
		} else {
			/* else it's strictly in the range, return the recovery pc */
			return location + ranges[i].recovery_offs;
		}
	}

	return 0;
}

/**
 * @function _restartable_ranges_dispose
 *
 * @brief
 * Helper to dispose of a range that has reached a 0 refcount.
 */
__attribute__((noinline))
static void
_restartable_ranges_dispose(struct restartable_ranges *rr, bool hash_remove)
{
	if (hash_remove) {
		rr_lock();
		remqueue(&rr->rr_link);
		rr_unlock();
	}
	kfree_type(struct restartable_ranges, rr);
}

/**
 * @function _restartable_ranges_equals
 *
 * @brief
 * Helper to compare two restartable ranges.
 */
static bool
_restartable_ranges_equals(
	const struct restartable_ranges *rr1,
	const struct restartable_ranges *rr2)
{
	size_t rr1_size = rr1->rr_count * sizeof(task_restartable_range_t);
	return rr1->rr_hash == rr2->rr_hash &&
	       rr1->rr_count == rr2->rr_count &&
	       memcmp(rr1->rr_ranges, rr2->rr_ranges, rr1_size) == 0;
}

/**
 * @function _restartable_ranges_create
 *
 * @brief
 * Helper to create a uniqued restartable range.
 *
 * @returns
 * - KERN_SUCCESS
 * - KERN_INVALID_ARGUMENT: the validation of the new ranges failed.
 * - KERN_RESOURCE_SHORTAGE: too many ranges, out of memory
 */
static kern_return_t
_restartable_ranges_create(task_t task, task_restartable_range_t *ranges,
    uint32_t count, struct restartable_ranges **rr_storage)
{
	struct restartable_ranges *rr, *rr_found, *rr_base;
	queue_head_t *head;
	uint32_t base_count, total_count;
	size_t base_size, size;
	kern_return_t kr;

	rr_base = *rr_storage;
	base_count = rr_base ? rr_base->rr_count : 0;
	base_size = sizeof(task_restartable_range_t) * base_count;
	size = sizeof(task_restartable_range_t) * count;

	if (os_add_overflow(base_count, count, &total_count)) {
		return KERN_INVALID_ARGUMENT;
	}
	if (total_count > RR_RANGES_MAX) {
		return KERN_RESOURCE_SHORTAGE;
	}

	rr = kalloc_type(struct restartable_ranges,
	    (zalloc_flags_t) (Z_WAITOK | Z_ZERO | Z_NOFAIL));

	queue_chain_init(rr->rr_link);
	os_ref_init(&rr->rr_ref, NULL);
	rr->rr_count = total_count;
	if (base_size) {
		memcpy(rr->rr_ranges, rr_base->rr_ranges, base_size);
	}
	memcpy(rr->rr_ranges + base_count, ranges, size);
	kr = _ranges_validate(task, rr->rr_ranges, total_count);
	if (kr) {
		_restartable_ranges_dispose(rr, false);
		return kr;
	}
	rr->rr_hash = os_hash_jenkins(rr->rr_ranges,
	    rr->rr_count * sizeof(task_restartable_range_t));

	head = &rr_hash[rr->rr_hash % RR_HASH_SIZE];

	rr_lock();
	queue_iterate(head, rr_found, struct restartable_ranges *, rr_link) {
		if (_restartable_ranges_equals(rr, rr_found) &&
		os_ref_retain_try(&rr_found->rr_ref)) {
			goto found;
		}
	}

	enqueue_tail(head, &rr->rr_link);
	rr_found = rr;

found:
	if (rr_base && os_ref_release_relaxed(&rr_base->rr_ref) == 0) {
		remqueue(&rr_base->rr_link);
	} else {
		rr_base = NULL;
	}
	rr_unlock();

	*rr_storage = rr_found;

	if (rr_found != rr) {
		_restartable_ranges_dispose(rr, false);
	}
	if (rr_base) {
		_restartable_ranges_dispose(rr_base, false);
	}
	return KERN_SUCCESS;
}

#pragma mark extern interfaces

__attribute__((always_inline))
void
restartable_ranges_release(struct restartable_ranges *rr)
{
	if (os_ref_release_relaxed(&rr->rr_ref) == 0) {
		_restartable_ranges_dispose(rr, true);
	}
}

__attribute__((always_inline))
void
thread_reset_pcs_will_fault(thread_t thread)
{
	/*
	 * Called in the exception handling code while interrupts
	 * are still disabled.
	 */
	os_atomic_store(&thread->t_rr_state.trr_fault_state,
	    (uint8_t)TRR_FAULT_PENDING, relaxed);
}

__attribute__((always_inline))
void
thread_reset_pcs_done_faulting(struct thread *thread)
{
	thread_rr_state_t state = {
		.trr_ipi_ack_pending = ~0,
	};

	/*
	 * Called by the exception handling code on the way back,
	 * or when the thread is terminated.
	 */
	state.trr_value = os_atomic_and_orig(&thread->t_rr_state.trr_value,
	    state.trr_value, relaxed);

	if (__improbable(state.trr_sync_waiting)) {
		task_t task = get_threadtask(thread);

		task_lock(task);
		wakeup_all_with_inheritor(&thread->t_rr_state, THREAD_AWAKENED);
		task_unlock(task);
	}
}

void
thread_reset_pcs_ack_IPI(struct thread *thread)
{
	thread_rr_state_t trrs;

	/*
	 * Called under the thread lock from IPI or CSwitch context.
	 */
	trrs.trr_value = os_atomic_load(&thread->t_rr_state.trr_value, relaxed);
	if (__improbable(trrs.trr_ipi_ack_pending)) {
		trrs.trr_ipi_ack_pending = false;
		if (trrs.trr_fault_state) {
			assert3u(trrs.trr_fault_state, ==, TRR_FAULT_PENDING);
			trrs.trr_fault_state = TRR_FAULT_OBSERVED;
		}
		os_atomic_store(&thread->t_rr_state.trr_value,
		    trrs.trr_value, relaxed);
	}
}

static bool
thread_rr_wait_if_needed(task_t task, thread_t thread)
{
	thread_rr_state_t state;
	bool did_unlock = false;

	state.trr_value = os_atomic_load(&thread->t_rr_state.trr_value, relaxed);
	if (state.trr_value == 0) {
		return did_unlock;
	}

	assert(state.trr_sync_waiting == 0);

	thread_reference(thread);

	/*
	 * The thread_rr_state state machine is:
	 *
	 *                        ,------------ IPI ack --------------.
	 *                        v                                   |
	 *        .-----> {f:N, w:0, ipi:0} --- IPI sent ---> {f:N, w:0, ipi:1}
	 *        |           |        ^                              |
	 *        |           |        |                              |
	 *      fault       will     fault                          will
	 *      done        fault    done                           fault
	 *        |           |        |                              |
	 *        |           v        |                              v
	 *        |       {f:P, w:0, ipi:0} --- IPI sent ---> {f:P, w:0, ipi:1}
	 *        |               |                                   |
	 *        |               |                                   |
	 *        |     act_set_ast_reset_pcs()                       |
	 *        |               |                                   |
	 *        |               v                                   |
	 *        +------ {f:O, w:0, ipi:0} <--- IPI Ack -------------'
	 *        |               |
	 *        |               |
	 *        |        wait_if_needed()
	 *        |               |
	 *        |               v
	 *        `------ {f:O, w:1, ipi:0}
	 */

	while (state.trr_ipi_ack_pending) {
		disable_preemption();
		task_unlock(task);

		state.trr_value =
		    hw_wait_while_equals32(&thread->t_rr_state.trr_value,
		    state.trr_value);

		enable_preemption();
		task_lock(task);

		did_unlock = true;
	}

	/*
	 * If a VM fault is in flight we must wait for it to resolve
	 * before we can return from task_restartable_ranges_synchronize(),
	 * as the memory we're faulting against might be freed by the caller
	 * as soon as it returns, leading a crash.
	 */
	if (state.trr_fault_state == TRR_FAULT_OBSERVED) {
		thread_rr_state_t nstate = {
			.trr_fault_state  = TRR_FAULT_OBSERVED,
			.trr_sync_waiting = 1,
		};

		if (os_atomic_cmpxchg(&thread->t_rr_state, state,
		    nstate, relaxed)) {
			lck_mtx_sleep_with_inheritor(&task->lock,
			    LCK_SLEEP_DEFAULT, &thread->t_rr_state,
			    thread, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
			did_unlock = true;
		}
	}

#if MACH_ASSERT
	state.trr_value = os_atomic_load(&thread->t_rr_state.trr_value, relaxed);
	assert3u(state.trr_fault_state, !=, TRR_FAULT_OBSERVED);
	assert3u(state.trr_ipi_ack_pending, ==, 0);
	assert3u(state.trr_sync_waiting, ==, 0);
#endif

	thread_deallocate_safe(thread);
	return did_unlock;
}

bool
thread_reset_pcs_in_range(task_t task, thread_t thread)
{
	return _ranges_lookup(task->t_rr_ranges, machine_thread_pc(thread)) != 0;
}

void
thread_reset_pcs_ast(task_t task, thread_t thread)
{
	struct restartable_ranges *rr;
	mach_vm_address_t pc;

	/*
	 * Because restartable_ranges are set while the task only has on thread
	 * and can't be mutated outside of this, no lock is required to read this.
	 */
	rr = task->t_rr_ranges;
	if (thread->active && rr) {
		pc = _ranges_lookup(rr, machine_thread_pc(thread));

		if (pc) {
			machine_thread_reset_pc(thread, pc);
		}
	}

#if MACH_ASSERT
	thread_rr_state_t state;

	state.trr_value = os_atomic_load(&thread->t_rr_state.trr_value, relaxed);
	assert3u(state.trr_fault_state, ==, TRR_FAULT_NONE);
	assert3u(state.trr_sync_waiting, ==, 0);
#endif
}

void
restartable_init(void)
{
	for (size_t i = 0; i < RR_HASH_SIZE; i++) {
		queue_head_init(rr_hash[i]);
	}
}

#pragma mark MiG interfaces

kern_return_t
task_restartable_ranges_register(
	task_t                    task,
	task_restartable_range_t *ranges,
	mach_msg_type_number_t    count)
{
	kern_return_t kr;

	if (task != current_task()) {
		return KERN_FAILURE;
	}

#if CONFIG_ROSETTA
	// <rdar://problem/48527888> Obj-C adoption of task_restartable_ranges_register breaks Cambria
	if (task_is_translated(task)) {
		return KERN_RESOURCE_SHORTAGE;
	}
#endif

	kr = _ranges_validate(task, ranges, count);

	if (kr == KERN_SUCCESS) {
		task_lock(task);

		if (task->thread_count > 1) {
			kr = KERN_NOT_SUPPORTED;
#if !DEBUG && !DEVELOPMENT
		} else if (task->t_rr_ranges) {
			/*
			 * For security reasons, on release kernels,
			 * only allow for this to be configured once.
			 *
			 * But to be able to test the feature we need
			 * to relax this for dev kernels.
			 */
			kr = KERN_NOT_SUPPORTED;
#endif
		} else {
			kr = _restartable_ranges_create(task, ranges, count,
			    &task->t_rr_ranges);
		}

		task_unlock(task);
	}

	return kr;
}

kern_return_t
task_restartable_ranges_synchronize(task_t task)
{
	thread_pri_floor_t token;
	thread_t thread;
	bool needs_wait = false;
	kern_return_t kr = KERN_SUCCESS;

	if (task != current_task()) {
		return KERN_FAILURE;
	}

	/*
	 * t_rr_ranges can only be set if the process is single threaded.
	 * As a result, `t_rr_ranges` can _always_ be looked at
	 * from current_thread() without holding a lock:
	 * - either because it's the only thread in the task
	 * - or because the existence of another thread precludes
	 *   modification
	 */
	if (!task->t_rr_ranges) {
		return KERN_SUCCESS;
	}

	/*
	 * When initiating a GC, artificially raise the priority for the
	 * duration of sending ASTs, we want to be preemptible, but this
	 * sequence has to terminate in a timely fashion.
	 */
	token = thread_priority_floor_start();

	task_lock(task);

	/*
	 * In order to avoid trivial deadlocks of 2 threads trying
	 * to wait on each other while in kernel, disallow
	 * concurrent usage of task_restartable_ranges_synchronize().
	 *
	 * At the time this code was written, the one client (Objective-C)
	 * does this under lock which guarantees ordering. If we ever need
	 * more clients, the library around restartable ranges will have
	 * to synchronize in userspace.
	 */
	if (task->task_rr_in_flight) {
		kr = KERN_ALREADY_WAITING;
		goto out;
	}

	task->task_rr_in_flight = true;

	/*
	 * Pair with the acquire barriers handling RR_TSTATE_ONCORE.
	 *
	 * For threads that weren't on core, we rely on the fact
	 * that we are taking their lock in act_set_ast_reset_pcs()
	 * and that the context switch path will also take it before
	 * resuming them which rovides the required ordering.
	 *
	 * For new threads not existing yet, because the task_lock()
	 * is taken to add them to the task thread list,
	 * which also synchronizes with this code.
	 */
	os_atomic_thread_fence(release);

	/*
	 * Set all the AST_RESET_PCS, and see if any thread needs
	 * actual acknowledgement.
	 */
	queue_iterate(&task->threads, thread, thread_t, task_threads) {
		if (thread != current_thread()) {
			needs_wait |= act_set_ast_reset_pcs(task, thread);
		}
	}

	/*
	 * Now wait for acknowledgement if we need any
	 */
	while (needs_wait) {
		needs_wait = false;

		queue_iterate(&task->threads, thread, thread_t, task_threads) {
			if (thread == current_thread()) {
				continue;
			}

			needs_wait = thread_rr_wait_if_needed(task, thread);
			if (needs_wait) {
				/*
				 * We drop the task lock,
				 * we need to restart enumerating threads.
				 */
				break;
			}
		}
	}

	task->task_rr_in_flight = false;

out:
	task_unlock(task);

	thread_priority_floor_end(&token);

	return kr;
}