1 // SPDX-License-Identifier: GPL-2.0
3 #include <linux/context_tracking.h>
4 #include <linux/entry-common.h>
5 #include <linux/highmem.h>
6 #include <linux/jump_label.h>
7 #include <linux/livepatch.h>
8 #include <linux/audit.h>
9 #include <linux/tick.h>
13 #define CREATE_TRACE_POINTS
14 #include <trace/events/syscalls.h>
16 /* See comment for enter_from_user_mode() in entry-common.h */
/*
 * Common entry-from-user-mode sequence; called in this file by
 * enter_from_user_mode(), syscall_enter_from_user_mode(),
 * syscall_enter_from_user_mode_prepare() and
 * irqentry_enter_from_user_mode().
 *
 * Visible steps, in order: arch_check_user_regs(regs),
 * lockdep_hardirqs_off(), a CT_WARN_ON() that triggers when ct_state()
 * is not CONTEXT_USER, and trace_hardirqs_off_finish() wrapped in
 * instrumentation_begin()/instrumentation_end().
 *
 * NOTE(review): the embedded line numbers jump inside this body, so some
 * original lines are not visible in this view - confirm against the
 * complete file before relying on this list being exhaustive.
 */
17 static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
19 arch_check_user_regs(regs);
20 lockdep_hardirqs_off(CALLER_ADDR0);
22 CT_WARN_ON(ct_state() != CONTEXT_USER);
25 instrumentation_begin();
26 trace_hardirqs_off_finish();
27 instrumentation_end();
/*
 * Out-of-line, noinstr wrapper around the always-inline
 * __enter_from_user_mode(); it does nothing else with @regs.
 */
30 void noinstr enter_from_user_mode(struct pt_regs *regs)
32 __enter_from_user_mode(regs);
/*
 * Report a syscall entry to audit.
 *
 * Nothing happens unless audit_context() returns a non-zero value.  In
 * that case the syscall arguments are read from @regs into a six-slot
 * array and @syscall plus the first four arguments are handed to
 * audit_syscall_entry().
 */
35 static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
37 if (unlikely(audit_context())) {
38 unsigned long args[6];
/* All six slots are filled, but only args[0..3] are forwarded below. */
40 syscall_get_arguments(current, regs, args);
41 audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
/*
 * Syscall entry work selected by the bits in "work" (the third argument;
 * see the call in __syscall_enter_from_user_work()).
 *
 * Visible order of processing:
 *   1. SYSCALL_WORK_SYSCALL_USER_DISPATCH: syscall_user_dispatch(regs).
 *   2. SYSCALL_WORK_SYSCALL_TRACE / SYSCALL_WORK_SYSCALL_EMU:
 *      arch_syscall_enter_tracehook(regs), result stored in ret.
 *   3. SYSCALL_WORK_SECCOMP: __secure_computing(NULL), result stored in
 *      ret.
 *   4. The syscall number is re-read with syscall_get_nr().
 *   5. SYSCALL_WORK_SYSCALL_TRACEPOINT: trace_sys_enter().
 *   6. syscall_enter_audit().
 *
 * Returns ret when it is non-zero, otherwise the re-read syscall number.
 *
 * NOTE(review): the parameter line carrying "work", the declaration of
 * "ret" and the statements guarded by the checks on source lines 56, 63
 * and after 69 are not visible in this view (the embedded numbering
 * skips them) - confirm the early-exit behaviour against the full file.
 */
45 static long syscall_trace_enter(struct pt_regs *regs, long syscall,
51 * Handle Syscall User Dispatch. This must comes first, since
52 * the ABI here can be something that doesn't make sense for
53 * other syscall_work features.
55 if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
56 if (syscall_user_dispatch(regs))
61 if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
62 ret = arch_syscall_enter_tracehook(regs);
63 if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
67 /* Do seccomp after ptrace, to catch any tracer changes. */
68 if (work & SYSCALL_WORK_SECCOMP) {
69 ret = __secure_computing(NULL);
74 /* Either of the above might have changed the syscall number */
75 syscall = syscall_get_nr(current, regs);
77 if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
78 trace_sys_enter(regs, syscall);
80 syscall_enter_audit(regs, syscall);
82 return ret ? : syscall;
/*
 * Read current_thread_info()->syscall_work once via READ_ONCE() and, if
 * any SYSCALL_WORK_ENTER bit is set, replace @syscall with the result of
 * syscall_trace_enter().
 *
 * NOTE(review): the return statement is not visible in this view;
 * presumably the (possibly replaced) syscall value is returned, as the
 * callers use the result - confirm against the full file.
 */
85 static __always_inline long
86 __syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
88 unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
90 if (work & SYSCALL_WORK_ENTER)
91 syscall = syscall_trace_enter(regs, syscall, work);
/*
 * Non-inline entry point that returns whatever
 * __syscall_enter_from_user_work() returns for @regs and @syscall.
 */
96 long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
98 return __syscall_enter_from_user_work(regs, syscall);
/*
 * Combined syscall entry: first the common __enter_from_user_mode()
 * sequence, then - inside an instrumentation_begin()/
 * instrumentation_end() section - the syscall entry work, whose result
 * is stored in ret.
 *
 * NOTE(review): the declaration of "ret", the return statement and the
 * source lines between the visible ones are missing from this view;
 * presumably ret is returned - confirm against the full file.
 */
101 noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
105 __enter_from_user_mode(regs);
107 instrumentation_begin();
109 ret = __syscall_enter_from_user_work(regs, syscall);
110 instrumentation_end();
/*
 * Runs the common __enter_from_user_mode() sequence and then opens and
 * closes an instrumentation section.
 *
 * NOTE(review): the statement that sat between instrumentation_begin()
 * and instrumentation_end() (source line 119) is not visible in this
 * view - confirm what it does against the full file.
 */
115 noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
117 __enter_from_user_mode(regs);
118 instrumentation_begin();
120 instrumentation_end();
123 /* See comment for exit_to_user_mode() in entry-common.h */
/*
 * Final step of the return-to-user paths in this file; called by
 * exit_to_user_mode(), syscall_exit_to_user_mode() and
 * irqentry_exit_to_user_mode().
 *
 * Visible steps, in order: trace_hardirqs_on_prepare() and
 * lockdep_hardirqs_on_prepare() inside an instrumentation section, then
 * arch_exit_to_user_mode() and lockdep_hardirqs_on() outside of it.
 *
 * NOTE(review): source lines 130-131 are not visible in this view, so a
 * statement may be missing between instrumentation_end() and
 * arch_exit_to_user_mode() - confirm against the full file.
 */
124 static __always_inline void __exit_to_user_mode(void)
126 instrumentation_begin();
127 trace_hardirqs_on_prepare();
128 lockdep_hardirqs_on_prepare(CALLER_ADDR0);
129 instrumentation_end();
132 arch_exit_to_user_mode();
133 lockdep_hardirqs_on(CALLER_ADDR0);
/*
 * Out-of-line, noinstr wrapper around the always-inline
 * __exit_to_user_mode().
 */
136 void noinstr exit_to_user_mode(void)
138 __exit_to_user_mode();
141 /* Workaround to allow gradual conversion of architecture code */
/*
 * Empty __weak default.  handle_signal_work() calls this with @regs and
 * a flag derived from _TIF_SIGPENDING; this fallback ignores both.
 */
142 void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
/*
 * Signal-related exit work for exit_to_user_mode_loop().
 *
 * Calls tracehook_notify_signal() when _TIF_NOTIFY_SIGNAL is set in
 * @ti_work, then always calls arch_do_signal_or_restart(); its second
 * argument is non-zero only when _TIF_SIGPENDING is set in @ti_work.
 */
144 static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
146 if (ti_work & _TIF_NOTIFY_SIGNAL)
147 tracehook_notify_signal();
149 arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
/*
 * raise_delayed_signal - raise the signal stored in current->forced_info.
 *
 * With CONFIG_RT_DELAYED_SIGNALS: when current->forced_info.si_signo is
 * non-zero, the stored siginfo is passed by address to force_sig_info()
 * and si_signo is then reset to 0.
 *
 * The second, empty definition is presumably the stub used when the
 * option is off.
 *
 * NOTE(review): the #else/#endif lines and the braces of the first
 * definition are not visible in this view - confirm against the full
 * file.
 */
152 #ifdef CONFIG_RT_DELAYED_SIGNALS
153 static inline void raise_delayed_signal(void)
155 if (unlikely(current->forced_info.si_signo)) {
156 force_sig_info(&current->forced_info);
157 current->forced_info.si_signo = 0;
161 static inline void raise_delayed_signal(void) { }
/*
 * Work loop run before returning to user space.
 *
 * Repeats while any EXIT_TO_USER_MODE_WORK bit is set in @ti_work.  Each
 * pass calls local_irq_enable_exit_to_user(), handles the individual
 * work bits (delayed signal, uprobes, livepatch state, signals,
 * notify-resume, architecture specific work), then calls
 * local_irq_disable_exit_to_user() and tick_nohz_user_enter_prepare()
 * and refreshes ti_work from read_thread_flags() for the next loop test.
 *
 * NOTE(review): several source lines are missing from this view.  In
 * particular the statement guarded by the _TIF_NEED_RESCHED test (source
 * line 176) and the final return are not visible; per the trailing
 * comment the latest work state is returned.  Do not read
 * raise_delayed_signal() as being conditional on _TIF_NEED_RESCHED
 * without checking the full file.
 */
164 static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
165 unsigned long ti_work)
168 * Before returning to user space ensure that all pending work
169 * items have been completed.
171 while (ti_work & EXIT_TO_USER_MODE_WORK) {
173 local_irq_enable_exit_to_user(ti_work);
175 if (ti_work & _TIF_NEED_RESCHED)
178 raise_delayed_signal();
180 if (ti_work & _TIF_UPROBE)
181 uprobe_notify_resume(regs);
183 if (ti_work & _TIF_PATCH_PENDING)
184 klp_update_patch_state(current);
186 if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
187 handle_signal_work(regs, ti_work);
189 if (ti_work & _TIF_NOTIFY_RESUME)
190 tracehook_notify_resume(regs);
192 /* Architecture specific TIF work */
193 arch_exit_to_user_mode_work(regs, ti_work);
196 * Disable interrupts and reevaluate the work flags as they
197 * might have changed while interrupts and preemption was
200 local_irq_disable_exit_to_user();
202 /* Check if any of the above work has queued a deferred wakeup */
203 tick_nohz_user_enter_prepare();
205 ti_work = read_thread_flags();
208 /* Return the latest work state for arch_exit_to_user_mode() */
/*
 * Preparation shared by the syscall and interrupt return-to-user paths
 * (called from __syscall_exit_to_user_mode_work() and
 * irqentry_exit_to_user_mode()).
 *
 * Takes a snapshot of the thread flags, asserts via lockdep that
 * interrupts are disabled, calls tick_nohz_user_enter_prepare(), runs
 * exit_to_user_mode_loop() if any EXIT_TO_USER_MODE_WORK bit is set,
 * passes the resulting flags to arch_exit_to_user_mode_prepare(), calls
 * addr_limit_user_check() and asserts again that interrupts are
 * disabled.
 */
212 static void exit_to_user_mode_prepare(struct pt_regs *regs)
214 unsigned long ti_work = read_thread_flags();
216 lockdep_assert_irqs_disabled();
218 /* Flush pending rcuog wakeup before the last need_resched() check */
219 tick_nohz_user_enter_prepare();
/* The loop hands back the flags it read last; use those from here on. */
221 if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
222 ti_work = exit_to_user_mode_loop(regs, ti_work);
224 arch_exit_to_user_mode_prepare(regs, ti_work);
226 /* Ensure that the address limit is intact and no locks are held */
227 addr_limit_user_check();
229 lockdep_assert_irqs_disabled();
234 * If SYSCALL_EMU is set, then the only reason to report is when
235 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
236 * instruction has already been reported in syscall_enter_from_user_mode().
/*
 * Decide whether syscall exit should report a single step.  The result
 * is stored in "step" by syscall_exit_work().
 *
 * When SYSCALL_WORK_SYSCALL_EMU is clear, the result is true exactly
 * when SYSCALL_WORK_SYSCALL_EXIT_TRAP is set in @work.
 *
 * NOTE(review): the statement executed when SYSCALL_WORK_SYSCALL_EMU is
 * set (source line 241) is not visible in this view - confirm the value
 * returned on that path against the full file.
 */
238 static inline bool report_single_step(unsigned long work)
240 if (work & SYSCALL_WORK_SYSCALL_EMU)
243 return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
/*
 * Syscall exit work selected by the bits in @work; called from
 * syscall_exit_to_user_mode_prepare() when a SYSCALL_WORK_EXIT bit is
 * set.
 *
 * Visible order of processing:
 *   1. SYSCALL_WORK_SYSCALL_USER_DISPATCH: if
 *      current->syscall_dispatch.on_dispatch is set, clear it.
 *   2. audit_syscall_exit(regs).
 *   3. SYSCALL_WORK_SYSCALL_TRACEPOINT: trace_sys_exit() with the value
 *      from syscall_get_return_value().
 *   4. arch_syscall_exit_tracehook(regs, step) when report_single_step()
 *      says so or SYSCALL_WORK_SYSCALL_TRACE is set.
 *
 * NOTE(review): the declaration of "step" and the statements following
 * the on_dispatch reset (source lines 259-262) are not visible in this
 * view; the stranded comment text says the tracers below are not
 * invoked in that case - confirm against the full file.
 */
246 static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
251 * If the syscall was rolled back due to syscall user dispatching,
252 * then the tracers below are not invoked for the same reason as
253 * the entry side was not invoked in syscall_trace_enter(): The ABI
254 * of these syscalls is unknown.
256 if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
257 if (unlikely(current->syscall_dispatch.on_dispatch)) {
258 current->syscall_dispatch.on_dispatch = false;
263 audit_syscall_exit(regs);
265 if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
266 trace_sys_exit(regs, syscall_get_return_value(current, regs));
268 step = report_single_step(work);
269 if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
270 arch_syscall_exit_tracehook(regs, step);
274 * Syscall specific exit to user mode preparation. Runs with interrupts enabled.
/*
 * First stage of the syscall return path; called by
 * __syscall_exit_to_user_mode_work() before interrupts are disabled
 * there.
 *
 * Reads the syscall work flags once via READ_ONCE() and the syscall
 * number from @regs, warns via CT_WARN_ON() if ct_state() is not
 * CONTEXT_KERNEL, warns under CONFIG_PROVE_LOCKING if irqs_disabled()
 * is true, and runs syscall_exit_work() when any SYSCALL_WORK_EXIT bit
 * is set.
 *
 * NOTE(review): the statement executed when the WARN() fires (source
 * line 286) is not visible in this view - confirm against the full
 * file.
 */
277 static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
279 unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
/* nr is only used in the warning message below. */
280 unsigned long nr = syscall_get_nr(current, regs);
282 CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
284 if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
285 if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
292 * Do one-time syscall specific work. If these work items are
293 * enabled, we want to run them exactly once per syscall exit with
294 * interrupts enabled.
296 if (unlikely(work & SYSCALL_WORK_EXIT))
297 syscall_exit_work(regs, work);
/*
 * Syscall return work in three steps: the syscall specific preparation,
 * then local_irq_disable_exit_to_user(), then the common
 * exit_to_user_mode_prepare(), which asserts that interrupts are
 * disabled.
 */
300 static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
302 syscall_exit_to_user_mode_prepare(regs);
303 local_irq_disable_exit_to_user();
304 exit_to_user_mode_prepare(regs);
/*
 * Non-inline entry point for __syscall_exit_to_user_mode_work().  Unlike
 * syscall_exit_to_user_mode() it does not call __exit_to_user_mode().
 */
307 void syscall_exit_to_user_mode_work(struct pt_regs *regs)
309 __syscall_exit_to_user_mode_work(regs);
/*
 * Complete syscall return path: the syscall exit work runs inside an
 * instrumentation_begin()/instrumentation_end() section, followed by
 * __exit_to_user_mode() outside of that section.
 */
312 __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
314 instrumentation_begin();
315 __syscall_exit_to_user_mode_work(regs);
316 instrumentation_end();
317 __exit_to_user_mode();
/*
 * Interrupt/exception entry from user mode: forwards to
 * __enter_from_user_mode().  Called by irqentry_enter() when
 * user_mode(regs) is true.
 */
320 noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
322 __enter_from_user_mode(regs);
/*
 * Interrupt/exception return to user mode: exit_to_user_mode_prepare()
 * runs inside an instrumentation section, followed by
 * __exit_to_user_mode() outside of it.  Called by irqentry_exit() when
 * user_mode(regs) is true.
 */
325 noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
327 instrumentation_begin();
328 exit_to_user_mode_prepare(regs);
329 instrumentation_end();
330 __exit_to_user_mode();
/*
 * Generic interrupt/exception entry.  Produces the irqentry_state_t that
 * irqentry_exit() later consumes (it reads state.exit_rcu there).
 *
 * Three paths are visible:
 *   - user_mode(regs): irqentry_enter_from_user_mode().
 *   - kernel mode while running the idle task (and not CONFIG_TINY_RCU):
 *     lockdep_hardirqs_off() followed by trace_hardirqs_off_finish()
 *     inside an instrumentation section.
 *   - any other kernel mode entry: lockdep_hardirqs_off(), then
 *     rcu_irq_enter_check_tick() and trace_hardirqs_off_finish() inside
 *     an instrumentation section.
 *
 * NOTE(review): the initialiser fields of "ret", every return statement
 * and the statements on source lines 374 and 378-381 are not visible in
 * this view.  The stranded comment text speaks of invoking
 * rcu_irq_enter() on the idle path, but that call itself is not among
 * the visible lines - confirm against the full file.
 */
333 noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
335 irqentry_state_t ret = {
339 if (user_mode(regs)) {
340 irqentry_enter_from_user_mode(regs);
345 * If this entry hit the idle task invoke rcu_irq_enter() whether
346 * RCU is watching or not.
348 * Interrupts can nest when the first interrupt invokes softirq
349 * processing on return which enables interrupts.
351 * Scheduler ticks in the idle task can mark quiescent state and
352 * terminate a grace period, if and only if the timer interrupt is
353 * not nested into another interrupt.
355 * Checking for rcu_is_watching() here would prevent the nesting
356 * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
357 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
358 * assume that it is the first interrupt and eventually claim
359 * quiescent state and end grace periods prematurely.
361 * Unconditionally invoke rcu_irq_enter() so RCU state stays
364 * TINY_RCU does not support EQS, so let the compiler eliminate
365 * this part when enabled.
367 if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
369 * If RCU is not watching then the same careful
370 * sequence vs. lockdep and tracing is required
371 * as in irqentry_enter_from_user_mode().
373 lockdep_hardirqs_off(CALLER_ADDR0);
375 instrumentation_begin();
376 trace_hardirqs_off_finish();
377 instrumentation_end();
384 * If RCU is watching then RCU only wants to check whether it needs
385 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
386 * already contains a warning when RCU is not watching, so no point
387 * in having another one here.
389 lockdep_hardirqs_off(CALLER_ADDR0);
390 instrumentation_begin();
391 rcu_irq_enter_check_tick();
392 trace_hardirqs_off_finish();
393 instrumentation_end();
/*
 * Conditional reschedule on interrupt exit.
 *
 * Acts only when preempt_count() is zero: calls
 * rcu_irq_exit_check_preempt(), warns once under CONFIG_DEBUG_ENTRY if
 * on_thread_stack() is false, and calls preempt_schedule_irq().
 *
 * NOTE(review): source line 405, directly above the
 * preempt_schedule_irq() call, is not visible in this view; it may hold
 * a condition guarding that call - confirm against the full file.
 */
398 void raw_irqentry_exit_cond_resched(void)
400 if (!preempt_count()) {
401 /* Sanity check RCU and thread stack */
402 rcu_irq_exit_check_preempt();
403 if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
404 WARN_ON_ONCE(!on_thread_stack());
406 preempt_schedule_irq();
/*
 * CONFIG_PREEMPT_DYNAMIC plumbing for irqentry_exit_cond_resched().
 *
 * With CONFIG_HAVE_PREEMPT_DYNAMIC_CALL a static call named
 * irqentry_exit_cond_resched is defined with
 * raw_irqentry_exit_cond_resched() as its target.
 *
 * With CONFIG_HAVE_PREEMPT_DYNAMIC_KEY a static key that defaults to
 * true is defined, and dynamic_irqentry_exit_cond_resched() tests that
 * key before calling raw_irqentry_exit_cond_resched().
 *
 * NOTE(review): the braces, source line 417 (the statement guarded by
 * the key test; presumably an early return when the key is disabled)
 * and the closing #endif lines are not visible in this view - confirm
 * against the full file.
 */
409 #ifdef CONFIG_PREEMPT_DYNAMIC
410 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
411 DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
412 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
413 DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
414 void dynamic_irqentry_exit_cond_resched(void)
/* Keys made with DEFINE_STATIC_KEY_TRUE() are tested with static_branch_*(). */
416 if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
418 raw_irqentry_exit_cond_resched();
/*
 * Generic interrupt/exception exit; counterpart of irqentry_enter().
 * @state is the value produced on entry.
 *
 * Starts by asserting via lockdep that interrupts are disabled, then
 * picks one of three visible paths:
 *   - user_mode(regs): irqentry_exit_to_user_mode().
 *   - kernel mode and regs_irqs_disabled(regs) is false:
 *       - if state.exit_rcu is set: trace_hardirqs_on_prepare() and
 *         lockdep_hardirqs_on_prepare() inside an instrumentation
 *         section, then lockdep_hardirqs_on();
 *       - then, inside an instrumentation section,
 *         irqentry_exit_cond_resched() when CONFIG_PREEMPTION is
 *         enabled.
 *   - otherwise: per the stranded comment text the IRQ flags state is
 *     already correct and only RCU needs to be told.
 *
 * NOTE(review): source lines 442, 444-446, 452 and 458-461 are not
 * visible in this view.  That covers whatever ends the state.exit_rcu
 * branch, the statement under the "Covers both tracing and lockdep"
 * comment, and the body of the final branch - confirm against the full
 * file.
 */
423 noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
425 lockdep_assert_irqs_disabled();
427 /* Check whether this returns to user mode */
428 if (user_mode(regs)) {
429 irqentry_exit_to_user_mode(regs);
430 } else if (!regs_irqs_disabled(regs)) {
432 * If RCU was not watching on entry this needs to be done
433 * carefully and needs the same ordering of lockdep/tracing
434 * and RCU as the return to user mode path.
436 if (state.exit_rcu) {
437 instrumentation_begin();
438 /* Tell the tracer that IRET will enable interrupts */
439 trace_hardirqs_on_prepare();
440 lockdep_hardirqs_on_prepare(CALLER_ADDR0);
441 instrumentation_end();
443 lockdep_hardirqs_on(CALLER_ADDR0);
447 instrumentation_begin();
448 if (IS_ENABLED(CONFIG_PREEMPTION))
449 irqentry_exit_cond_resched();
451 /* Covers both tracing and lockdep */
453 instrumentation_end();
456 * IRQ flags state is correct already. Just tell RCU if it
457 * was not watching on entry.
/*
 * NMI entry.
 *
 * Stores the result of lockdep_hardirqs_enabled() in irq_state.lockdep
 * (irqentry_nmi_exit() reads that field), then calls
 * lockdep_hardirqs_off() and lockdep_hardirq_enter(), and finally
 * trace_hardirqs_off_finish() inside an instrumentation section.
 *
 * NOTE(review): source lines 469-470, 473-474, 477 and the return
 * statement are not visible in this view; presumably irq_state is
 * returned - confirm against the full file.
 */
464 irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
466 irqentry_state_t irq_state;
/* Captured before lockdep is told that hardirqs are off. */
468 irq_state.lockdep = lockdep_hardirqs_enabled();
471 lockdep_hardirqs_off(CALLER_ADDR0);
472 lockdep_hardirq_enter();
475 instrumentation_begin();
476 trace_hardirqs_off_finish();
478 instrumentation_end();
/*
 * NMI exit; @irq_state is the value produced by irqentry_nmi_enter().
 *
 * Inside an instrumentation section, and only when irq_state.lockdep is
 * set, calls trace_hardirqs_on_prepare() and
 * lockdep_hardirqs_on_prepare().  Afterwards calls
 * lockdep_hardirq_exit() and, again only when irq_state.lockdep is set,
 * lockdep_hardirqs_on().
 *
 * NOTE(review): source lines 486, 490, 492-493 and everything after 496
 * are not visible in this view, so statements may be missing - confirm
 * against the full file.
 */
483 void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
485 instrumentation_begin();
487 if (irq_state.lockdep) {
488 trace_hardirqs_on_prepare();
489 lockdep_hardirqs_on_prepare(CALLER_ADDR0);
491 instrumentation_end();
494 lockdep_hardirq_exit();
495 if (irq_state.lockdep)
496 lockdep_hardirqs_on(CALLER_ADDR0);