Go to:
Gentoo Home
Documentation
Forums
Lists
Bugs
Planet
Store
Wiki
Get Gentoo!
Gentoo's Bugzilla – Attachment 892741 Details for
Bug 916954
sys-kernel/gentoo-sources-6.{6,7,8,9}.x: modified RT patch with BORE patch
Home
|
New
–
[Ex]
|
Browse
|
Search
|
Privacy Policy
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
[patch]
Combined BORE + RT patch for gentoo-sources-6.8.9
0001-linux6.8.y-bore5.1.0_patch-6.8.2-rt11_gentoo-sources-6.8.9.patch (text/plain), 244.82 KB, created by
deim
on 2024-05-11 14:22:37 UTC
(
hide
)
Description:
Combined BORE + RT patch for gentoo-sources-6.8.9
Filename:
MIME Type:
Creator:
deim
Created:
2024-05-11 14:22:37 UTC
Size:
244.82 KB
patch
obsolete
>From feae72fd7f2403910c157dd679d6ec240ed1dfbf Mon Sep 17 00:00:00 2001 >From: Masahito S <firelzrd@gmail.com> >Date: Mon, 22 Apr 2024 04:12:58 +0900 >Subject: [PATCH] linux6.8.y-bore5.1.0 > >--- > include/linux/sched.h | 10 ++ > init/Kconfig | 17 +++ > kernel/sched/core.c | 143 +++++++++++++++++++++++++ > kernel/sched/debug.c | 60 ++++++++++- > kernel/sched/fair.c | 230 ++++++++++++++++++++++++++++++++++++++-- > kernel/sched/features.h | 4 + > kernel/sched/sched.h | 7 ++ > 7 files changed, 462 insertions(+), 9 deletions(-) > >diff --git a/include/linux/sched.h b/include/linux/sched.h >index ffe8f618ab..0ab0b04240 100644 >--- a/include/linux/sched.h >+++ b/include/linux/sched.h >@@ -547,6 +547,16 @@ struct sched_entity { > u64 sum_exec_runtime; > u64 prev_sum_exec_runtime; > u64 vruntime; >+#ifdef CONFIG_SCHED_BORE >+ u64 burst_time; >+ u8 prev_burst_penalty; >+ u8 curr_burst_penalty; >+ u8 burst_penalty; >+ u8 burst_score; >+ u8 child_burst; >+ u32 child_burst_cnt; >+ u64 child_burst_last_cached; >+#endif // CONFIG_SCHED_BORE > s64 vlag; > u64 slice; > >diff --git a/init/Kconfig b/init/Kconfig >index bee58f7468..13427dbb48 100644 >--- a/init/Kconfig >+++ b/init/Kconfig >@@ -1279,6 +1279,23 @@ config CHECKPOINT_RESTORE > > If unsure, say N here. > >+config SCHED_BORE >+ bool "Burst-Oriented Response Enhancer" >+ default y >+ help >+ In Desktop and Mobile computing, one might prefer interactive >+ tasks to keep responsive no matter what they run in the background. >+ >+ Enabling this kernel feature modifies the scheduler to discriminate >+ tasks by their burst time (runtime since it last went sleeping or >+ yielding state) and prioritize those that run less bursty. >+ Such tasks usually include window compositor, widgets backend, >+ terminal emulator, video playback, games and so on. >+ With a little impact to scheduling fairness, it may improve >+ responsiveness especially under heavy background workload. >+ >+ If unsure, say Y here. 
>+ > config SCHED_AUTOGROUP > bool "Automatic process group scheduling" > select CGROUPS >diff --git a/kernel/sched/core.c b/kernel/sched/core.c >index 9116bcc903..d1711f75f8 100644 >--- a/kernel/sched/core.c >+++ b/kernel/sched/core.c >@@ -4507,6 +4507,138 @@ int wake_up_state(struct task_struct *p, unsigned int state) > return try_to_wake_up(p, state, 0); > } > >+#ifdef CONFIG_SCHED_BORE >+extern u8 sched_burst_fork_atavistic; >+extern uint sched_burst_cache_lifetime; >+ >+static void __init sched_init_bore(void) { >+ init_task.se.burst_time = 0; >+ init_task.se.prev_burst_penalty = 0; >+ init_task.se.curr_burst_penalty = 0; >+ init_task.se.burst_penalty = 0; >+ init_task.se.burst_score = 0; >+ init_task.se.child_burst_last_cached = 0; >+} >+ >+void inline sched_fork_bore(struct task_struct *p) { >+ p->se.burst_time = 0; >+ p->se.curr_burst_penalty = 0; >+ p->se.burst_score = 0; >+ p->se.child_burst_last_cached = 0; >+} >+ >+static u32 count_child_tasks(struct task_struct *p) { >+ struct task_struct *child; >+ u32 cnt = 0; >+ list_for_each_entry(child, &p->children, sibling) {cnt++;} >+ return cnt; >+} >+ >+static inline bool task_is_inheritable(struct task_struct *p) { >+ return (p->sched_class == &fair_sched_class); >+} >+ >+static inline bool child_burst_cache_expired(struct task_struct *p, u64 now) { >+ u64 expiration_time = >+ p->se.child_burst_last_cached + sched_burst_cache_lifetime; >+ return ((s64)(expiration_time - now) < 0); >+} >+ >+static void __update_child_burst_cache( >+ struct task_struct *p, u32 cnt, u32 sum, u64 now) { >+ u8 avg = 0; >+ if (cnt) avg = sum / cnt; >+ p->se.child_burst = max(avg, p->se.burst_penalty); >+ p->se.child_burst_cnt = cnt; >+ p->se.child_burst_last_cached = now; >+} >+ >+static inline void update_child_burst_direct(struct task_struct *p, u64 now) { >+ struct task_struct *child; >+ u32 cnt = 0; >+ u32 sum = 0; >+ >+ list_for_each_entry(child, &p->children, sibling) { >+ if (!task_is_inheritable(child)) continue; >+ cnt++; 
>+ sum += child->se.burst_penalty; >+ } >+ >+ __update_child_burst_cache(p, cnt, sum, now); >+} >+ >+static inline u8 __inherit_burst_direct(struct task_struct *p, u64 now) { >+ struct task_struct *parent = p->real_parent; >+ if (child_burst_cache_expired(parent, now)) >+ update_child_burst_direct(parent, now); >+ >+ return parent->se.child_burst; >+} >+ >+static void update_child_burst_topological( >+ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { >+ struct task_struct *child, *dec; >+ u32 cnt = 0, dcnt = 0; >+ u32 sum = 0; >+ >+ list_for_each_entry(child, &p->children, sibling) { >+ dec = child; >+ while ((dcnt = count_child_tasks(dec)) == 1) >+ dec = list_first_entry(&dec->children, struct task_struct, sibling); >+ >+ if (!dcnt || !depth) { >+ if (!task_is_inheritable(dec)) continue; >+ cnt++; >+ sum += dec->se.burst_penalty; >+ continue; >+ } >+ if (!child_burst_cache_expired(dec, now)) { >+ cnt += dec->se.child_burst_cnt; >+ sum += (u32)dec->se.child_burst * dec->se.child_burst_cnt; >+ continue; >+ } >+ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); >+ } >+ >+ __update_child_burst_cache(p, cnt, sum, now); >+ *acnt += cnt; >+ *asum += sum; >+} >+ >+static inline u8 __inherit_burst_topological(struct task_struct *p, u64 now) { >+ struct task_struct *anc = p->real_parent; >+ u32 cnt = 0, sum = 0; >+ >+ while (anc->real_parent != anc && count_child_tasks(anc) == 1) >+ anc = anc->real_parent; >+ >+ if (child_burst_cache_expired(anc, now)) >+ update_child_burst_topological( >+ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); >+ >+ return anc->se.child_burst; >+} >+ >+static inline void inherit_burst(struct task_struct *p) { >+ u8 burst_cache; >+ u64 now = ktime_get_ns(); >+ >+ read_lock(&tasklist_lock); >+ burst_cache = likely(sched_burst_fork_atavistic)? 
>+ __inherit_burst_topological(p, now): >+ __inherit_burst_direct(p, now); >+ read_unlock(&tasklist_lock); >+ >+ p->se.prev_burst_penalty = max(p->se.prev_burst_penalty, burst_cache); >+} >+ >+static void sched_post_fork_bore(struct task_struct *p) { >+ if (p->sched_class == &fair_sched_class) >+ inherit_burst(p); >+ p->se.burst_penalty = p->se.prev_burst_penalty; >+} >+#endif // CONFIG_SCHED_BORE >+ > /* > * Perform scheduler related setup for a newly forked process p. > * p is forked by current. >@@ -4523,6 +4655,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) > p->se.prev_sum_exec_runtime = 0; > p->se.nr_migrations = 0; > p->se.vruntime = 0; >+#ifdef CONFIG_SCHED_BORE >+ sched_fork_bore(p); >+#endif // CONFIG_SCHED_BORE > p->se.vlag = 0; > p->se.slice = sysctl_sched_base_slice; > INIT_LIST_HEAD(&p->se.group_node); >@@ -4839,6 +4974,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) > > void sched_post_fork(struct task_struct *p) > { >+#ifdef CONFIG_SCHED_BORE >+ sched_post_fork_bore(p); >+#endif // CONFIG_SCHED_BORE > uclamp_post_fork(p); > } > >@@ -9910,6 +10048,11 @@ void __init sched_init(void) > BUG_ON(&dl_sched_class != &stop_sched_class + 1); > #endif > >+#ifdef CONFIG_SCHED_BORE >+ sched_init_bore(); >+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.1.0 by Masahito Suzuki"); >+#endif // CONFIG_SCHED_BORE >+ > wait_bit_init(); > > #ifdef CONFIG_FAIR_GROUP_SCHED >diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c >index 8d5d98a583..b178612617 100644 >--- a/kernel/sched/debug.c >+++ b/kernel/sched/debug.c >@@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { > }; > > #ifdef CONFIG_SMP >+#ifdef CONFIG_SCHED_BORE >+static ssize_t sched_min_base_slice_write(struct file *filp, const char __user *ubuf, >+ size_t cnt, loff_t *ppos) >+{ >+ char buf[16]; >+ unsigned int value; >+ >+ if (cnt > 15) >+ cnt = 15; >+ >+ if 
(copy_from_user(&buf, ubuf, cnt)) >+ return -EFAULT; >+ buf[cnt] = '\0'; >+ >+ if (kstrtouint(buf, 10, &value)) >+ return -EINVAL; > >+ if (!value) >+ return -EINVAL; >+ >+ sysctl_sched_min_base_slice = value; >+ sched_update_min_base_slice(); >+ >+ *ppos += cnt; >+ return cnt; >+} >+ >+static int sched_min_base_slice_show(struct seq_file *m, void *v) >+{ >+ seq_printf(m, "%d\n", sysctl_sched_min_base_slice); >+ return 0; >+} >+ >+static int sched_min_base_slice_open(struct inode *inode, struct file *filp) >+{ >+ return single_open(filp, sched_min_base_slice_show, NULL); >+} >+ >+static const struct file_operations sched_min_base_slice_fops = { >+ .open = sched_min_base_slice_open, >+ .write = sched_min_base_slice_write, >+ .read = seq_read, >+ .llseek = seq_lseek, >+ .release = single_release, >+}; >+#else // !CONFIG_SCHED_BORE > static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, > size_t cnt, loff_t *ppos) > { >@@ -213,7 +258,7 @@ static const struct file_operations sched_scaling_fops = { > .llseek = seq_lseek, > .release = single_release, > }; >- >+#endif // CONFIG_SCHED_BORE > #endif /* SMP */ > > #ifdef CONFIG_PREEMPT_DYNAMIC >@@ -353,14 +353,21 @@ > debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); > #endif > >- debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); >+#ifdef CONFIG_SCHED_BORE >+ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); >+ debugfs_create_u32("base_slice_ns", 0400, debugfs_sched, &sysctl_sched_base_slice); >+#else // !CONFIG_SCHED_BORE >+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); >+#endif // CONFIG_SCHED_BORE > > #ifndef CONFIG_SCHED_ALT > debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); > debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); > > #ifdef CONFIG_SMP >+#if 
!defined(CONFIG_SCHED_BORE) > debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); >+#endif // CONFIG_SCHED_BORE > debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); > debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); > >@@ -595,6 +647,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) > SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), > SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); > >+#ifdef CONFIG_SCHED_BORE >+ SEQ_printf(m, " %2d", p->se.burst_score); >+#endif // CONFIG_SCHED_BORE > #ifdef CONFIG_NUMA_BALANCING > SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); > #endif >@@ -1068,6 +1123,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, > > P(se.load.weight); > #ifdef CONFIG_SMP >+#ifdef CONFIG_SCHED_BORE >+ P(se.burst_score); >+#endif // CONFIG_SCHED_BORE > P(se.avg.load_sum); > P(se.avg.runnable_sum); > P(se.avg.util_sum); >diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >index 533547e3c9..a2346b1b44 100644 >--- a/kernel/sched/fair.c >+++ b/kernel/sched/fair.c >@@ -19,6 +19,9 @@ > * > * Adaptive scheduling granularity, math enhancements by Peter Zijlstra > * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra >+ * >+ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler >+ * Copyright (C) 2021-2024 Masahito Suzuki <firelzrd@gmail.com> > */ > #include <linux/energy_model.h> > #include <linux/mmap_lock.h> >@@ -64,20 +67,125 @@ > * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) > * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus > * >- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) >+ * (BORE default SCHED_TUNABLESCALING_NONE = *1 constant) >+ * (EEVDF default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) > */ >+#ifdef CONFIG_SCHED_BORE >+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; >+#else // 
!CONFIG_SCHED_BORE > unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; >+#endif // CONFIG_SCHED_BORE > > /* > * Minimal preemption granularity for CPU-bound tasks: > * >- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) >+ * (BORE default: max(1 sec / HZ, min_base_slice) constant, units: nanoseconds) >+ * (EEVDF default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) > */ >+#ifdef CONFIG_SCHED_BORE >+unsigned int sysctl_sched_base_slice = 1000000000ULL / HZ; >+static unsigned int configured_sched_base_slice = 1000000000ULL / HZ; >+unsigned int sysctl_sched_min_base_slice = 2000000ULL; >+#else // !CONFIG_SCHED_BORE > unsigned int sysctl_sched_base_slice = 750000ULL; > static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; >+#endif // CONFIG_SCHED_BORE > > const_debug unsigned int sysctl_sched_migration_cost = 500000UL; > >+#ifdef CONFIG_SCHED_BORE >+u8 __read_mostly sched_bore = 1; >+u8 __read_mostly sched_burst_smoothness_long = 1; >+u8 __read_mostly sched_burst_smoothness_short = 0; >+u8 __read_mostly sched_burst_fork_atavistic = 2; >+u8 __read_mostly sched_burst_penalty_offset = 22; >+uint __read_mostly sched_burst_penalty_scale = 1280; >+uint __read_mostly sched_burst_cache_lifetime = 60000000; >+static int __maybe_unused sixty_four = 64; >+static int __maybe_unused maxval_12_bits = 4095; >+ >+#define MAX_BURST_PENALTY (39U <<2) >+ >+static inline u32 log2plus1_u64_u32f8(u64 v) { >+ u32 msb = fls64(v); >+ s32 excess_bits = msb - 9; >+ u8 fractional = (0 <= excess_bits)? 
v >> excess_bits: v << -excess_bits; >+ return msb << 8 | fractional; >+} >+ >+static inline u32 calc_burst_penalty(u64 burst_time) { >+ u32 greed, tolerance, penalty, scaled_penalty; >+ >+ greed = log2plus1_u64_u32f8(burst_time); >+ tolerance = sched_burst_penalty_offset << 8; >+ penalty = max(0, (s32)greed - (s32)tolerance); >+ scaled_penalty = penalty * sched_burst_penalty_scale >> 16; >+ >+ return min(MAX_BURST_PENALTY, scaled_penalty); >+} >+ >+static inline u64 scale_slice(u64 delta, struct sched_entity *se) { >+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->burst_score], 22); >+} >+ >+static inline u64 __unscale_slice(u64 delta, u8 score) { >+ return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10); >+} >+ >+static inline u64 unscale_slice(u64 delta, struct sched_entity *se) { >+ return __unscale_slice(delta, se->burst_score); >+} >+ >+void reweight_task(struct task_struct *p, int prio); >+ >+static void update_burst_score(struct sched_entity *se) { >+ if (!entity_is_task(se)) return; >+ struct task_struct *p = task_of(se); >+ u8 prio = p->static_prio - MAX_RT_PRIO; >+ u8 prev_prio = min(39, prio + se->burst_score); >+ >+ se->burst_score = se->burst_penalty >> 2; >+ >+ u8 new_prio = min(39, prio + se->burst_score); >+ if (new_prio != prev_prio) >+ reweight_task(p, new_prio); >+} >+ >+static void update_burst_penalty(struct sched_entity *se) { >+ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); >+ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); >+ update_burst_score(se); >+} >+ >+static inline u32 binary_smooth(u32 new, u32 old) { >+ int increment = new - old; >+ return (0 <= increment)? 
>+ old + ( increment >> (int)sched_burst_smoothness_long): >+ old - (-increment >> (int)sched_burst_smoothness_short); >+} >+ >+static void restart_burst(struct sched_entity *se) { >+ se->burst_penalty = se->prev_burst_penalty = >+ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); >+ se->curr_burst_penalty = 0; >+ se->burst_time = 0; >+ update_burst_score(se); >+} >+ >+static void restart_burst_rescale_deadline(struct sched_entity *se) { >+ s64 vscaled, wremain, vremain = se->deadline - se->vruntime; >+ u8 prev_score = se->burst_score; >+ restart_burst(se); >+ if (prev_score > se->burst_score) { >+ wremain = __unscale_slice(abs(vremain), prev_score); >+ vscaled = scale_slice(wremain, se); >+ if (unlikely(vremain < 0)) >+ vscaled = -vscaled; >+ se->deadline = se->vruntime + vscaled; >+ } >+} >+#endif // CONFIG_SCHED_BORE >+ > int sched_thermal_decay_shift; > static int __init setup_sched_thermal_decay_shift(char *str) > { >@@ -137,6 +245,69 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; > > #ifdef CONFIG_SYSCTL > static struct ctl_table sched_fair_sysctls[] = { >+#ifdef CONFIG_SCHED_BORE >+ { >+ .procname = "sched_bore", >+ .data = &sched_bore, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ONE, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_smoothness_long", >+ .data = &sched_burst_smoothness_long, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_smoothness_short", >+ .data = &sched_burst_smoothness_short, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_fork_atavistic", >+ .data = &sched_burst_fork_atavistic, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = 
SYSCTL_ZERO, >+ .extra2 = SYSCTL_THREE, >+ }, >+ { >+ .procname = "sched_burst_penalty_offset", >+ .data = &sched_burst_penalty_offset, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = &sixty_four, >+ }, >+ { >+ .procname = "sched_burst_penalty_scale", >+ .data = &sched_burst_penalty_scale, >+ .maxlen = sizeof(uint), >+ .mode = 0644, >+ .proc_handler = proc_douintvec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = &maxval_12_bits, >+ }, >+ { >+ .procname = "sched_burst_cache_lifetime", >+ .data = &sched_burst_cache_lifetime, >+ .maxlen = sizeof(uint), >+ .mode = 0644, >+ .proc_handler = proc_douintvec, >+ }, >+#endif // CONFIG_SCHED_BORE > #ifdef CONFIG_CFS_BANDWIDTH > { > .procname = "sched_cfs_bandwidth_slice_us", >@@ -195,6 +366,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) > * > * This idea comes from the SD scheduler of Con Kolivas: > */ >+#ifdef CONFIG_SCHED_BORE >+static void update_sysctl(void) { >+ sysctl_sched_base_slice = >+ max(sysctl_sched_min_base_slice, configured_sched_base_slice); >+} >+void sched_update_min_base_slice(void) { update_sysctl(); } >+#else // !CONFIG_SCHED_BORE > static unsigned int get_update_sysctl_factor(void) > { > unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); >@@ -225,6 +403,7 @@ static void update_sysctl(void) > SET_SYSCTL(sched_base_slice); > #undef SET_SYSCTL > } >+#endif // CONFIG_SCHED_BORE > > void __init sched_init_granularity(void) > { >@@ -703,6 +703,10 @@ > vlag = avruntime - se->vruntime; > limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); > >+#ifdef CONFIG_SCHED_BORE >+ limit >>= 1; >+#endif // CONFIG_SCHED_BORE >+ > return clamp(vlag, -limit, limit); > } > >@@ -955,6 +1137,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) > * Scheduling class statistics methods: > */ > #ifdef CONFIG_SMP >+#if !defined(CONFIG_SCHED_BORE) > int sched_update_scaling(void) > { > 
unsigned int factor = get_update_sysctl_factor(); >@@ -966,6 +1149,7 @@ int sched_update_scaling(void) > > return 0; > } >+#endif // CONFIG_SCHED_BORE > #endif > #endif > >@@ -1165,7 +1349,13 @@ static void update_curr(struct cfs_rq *cfs_rq) > if (unlikely(delta_exec <= 0)) > return; > >+#ifdef CONFIG_SCHED_BORE >+ curr->burst_time += delta_exec; >+ update_burst_penalty(curr); >+ curr->vruntime += max(1ULL, calc_delta_fair(delta_exec, curr)); >+#else // !CONFIG_SCHED_BORE > curr->vruntime += calc_delta_fair(delta_exec, curr); >+#endif // CONFIG_SCHED_BORE > update_deadline(cfs_rq, curr); > update_min_vruntime(cfs_rq); > >@@ -5171,6 +5362,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) > * > * EEVDF: placement strategy #1 / #2 > */ >+#ifdef CONFIG_SCHED_BORE >+ if (se->vlag) >+#endif // CONFIG_SCHED_BORE > if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { > struct sched_entity *curr = cfs_rq->curr; > unsigned long load; >@@ -6803,6 +6997,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) > bool was_sched_idle = sched_idle_rq(rq); > > util_est_dequeue(&rq->cfs, p); >+#ifdef CONFIG_SCHED_BORE >+ if (task_sleep) { >+ cfs_rq = cfs_rq_of(se); >+ if (cfs_rq->curr == se) >+ update_curr(cfs_rq); >+ restart_burst(se); >+ } >+#endif // CONFIG_SCHED_BORE > > for_each_sched_entity(se) { > cfs_rq = cfs_rq_of(se); >@@ -8552,16 +8754,25 @@ static void yield_task_fair(struct rq *rq) > /* > * Are we the only task in the tree? > */ >+#if !defined(CONFIG_SCHED_BORE) > if (unlikely(rq->nr_running == 1)) > return; > > clear_buddies(cfs_rq, se); >+#endif // CONFIG_SCHED_BORE > > update_rq_clock(rq); > /* > * Update run-time statistics of the 'current'. 
> */ > update_curr(cfs_rq); >+#ifdef CONFIG_SCHED_BORE >+ restart_burst_rescale_deadline(se); >+ if (unlikely(rq->nr_running == 1)) >+ return; >+ >+ clear_buddies(cfs_rq, se); >+#endif // CONFIG_SCHED_BORE > /* > * Tell update_rq_clock() that we've just updated, > * so we don't do microscopic update in schedule() >@@ -12651,6 +12862,9 @@ static void task_fork_fair(struct task_struct *p) > curr = cfs_rq->curr; > if (curr) > update_curr(cfs_rq); >+#ifdef CONFIG_SCHED_BORE >+ update_burst_score(se); >+#endif // CONFIG_SCHED_BORE > place_entity(cfs_rq, se, ENQUEUE_INITIAL); > rq_unlock(rq, &rf); > } >diff --git a/kernel/sched/features.h b/kernel/sched/features.h >index 143f55df89..3f0fe409f5 100644 >--- a/kernel/sched/features.h >+++ b/kernel/sched/features.h >@@ -6,7 +6,11 @@ > */ > SCHED_FEAT(PLACE_LAG, true) > SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) >+#ifdef CONFIG_SCHED_BORE >+SCHED_FEAT(RUN_TO_PARITY, false) >+#else // !CONFIG_SCHED_BORE > SCHED_FEAT(RUN_TO_PARITY, true) >+#endif // CONFIG_SCHED_BORE > > /* > * Prefer to schedule the task we woke last (assuming it failed >diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h >index 001fe047bd..da3ad1d4e1 100644 >--- a/kernel/sched/sched.h >+++ b/kernel/sched/sched.h >@@ -1965,7 +1965,11 @@ static inline void dirty_sched_domain_sysctl(int cpu) > } > #endif > >+#ifdef CONFIG_SCHED_BORE >+extern void sched_update_min_base_slice(void); >+#else // !CONFIG_SCHED_BORE > extern int sched_update_scaling(void); >+#endif // CONFIG_SCHED_BORE > > static inline const struct cpumask *task_user_cpus(struct task_struct *p) > { >@@ -2552,6 +2556,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; > extern const_debug unsigned int sysctl_sched_migration_cost; > > extern unsigned int sysctl_sched_base_slice; >+#ifdef CONFIG_SCHED_BORE >+extern unsigned int sysctl_sched_min_base_slice; >+#endif // CONFIG_SCHED_BORE > > #ifdef CONFIG_SCHED_DEBUG > extern int sysctl_resched_latency_warn_ms; >-- >2.34.1 > >diff --git 
a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt >index 73062d47a462e..86e2745a3e8fb 100644 >--- a/Documentation/admin-guide/kernel-parameters.txt >+++ b/Documentation/admin-guide/kernel-parameters.txt >@@ -6508,6 +6508,18 @@ > Force threading of all interrupt handlers except those > marked explicitly IRQF_NO_THREAD. > >+ threadprintk [KNL] >+ Force threaded printing of all legacy consoles. Be >+ aware that with this option, the shutdown, reboot, and >+ panic messages may not be printed on the legacy >+ consoles. Also, earlycon/earlyprintk printing will be >+ delayed until a regular console or the kthread is >+ available. >+ >+ Users can view /proc/consoles to see if their console >+ driver is legacy or not. Non-legacy (NBCON) console >+ drivers are already threaded and are shown with 'N'. >+ > topology= [S390] > Format: {off | on} > Specify if the kernel should make use of the cpu >diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig >index 0af6709570d14..25424a7468d95 100644 >--- a/arch/arm/Kconfig >+++ b/arch/arm/Kconfig >@@ -36,6 +36,7 @@ config ARM > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF > select ARCH_USE_MEMTEST >@@ -75,7 +76,7 @@ config ARM > select HAS_IOPORT > select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT > select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 >- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU >+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT > select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL > select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU > select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL >@@ -98,7 +99,7 @@ config ARM > select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE > select 
HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU > select HAVE_EXIT_THREAD >- select HAVE_FAST_GUP if ARM_LPAE >+ select HAVE_FAST_GUP if ARM_LPAE && !(PREEMPT_RT && HIGHPTE) > select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL > select HAVE_FUNCTION_ERROR_INJECTION > select HAVE_FUNCTION_GRAPH_TRACER >@@ -120,6 +121,7 @@ config ARM > select HAVE_PERF_EVENTS > select HAVE_PERF_REGS > select HAVE_PERF_USER_STACK_DUMP >+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM > select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RSEQ >diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c >index 07565b593ed68..3761c1e995cf6 100644 >--- a/arch/arm/mm/fault.c >+++ b/arch/arm/mm/fault.c >@@ -436,6 +436,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, > if (addr < TASK_SIZE) > return do_page_fault(addr, fsr, regs); > >+ if (interrupts_enabled(regs)) >+ local_irq_enable(); >+ > if (user_mode(regs)) > goto bad_area; > >@@ -506,6 +509,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, > static int > do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) > { >+ if (interrupts_enabled(regs)) >+ local_irq_enable(); >+ > do_bad_area(addr, fsr, regs); > return 0; > } >diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c >index b68efe643a12c..48745a3c52618 100644 >--- a/arch/arm/vfp/vfpmodule.c >+++ b/arch/arm/vfp/vfpmodule.c >@@ -55,6 +55,34 @@ extern unsigned int VFP_arch_feroceon __alias(VFP_arch); > */ > union vfp_state *vfp_current_hw_state[NR_CPUS]; > >+/* >+ * Claim ownership of the VFP unit. >+ * >+ * The caller may change VFP registers until vfp_unlock() is called. >+ * >+ * local_bh_disable() is used to disable preemption and to disable VFP >+ * processing in softirq context. On PREEMPT_RT kernels local_bh_disable() is >+ * not sufficient because it only serializes soft interrupt related sections >+ * via a local lock, but stays preemptible. 
Disabling preemption is the right >+ * choice here as bottom half processing is always in thread context on RT >+ * kernels so it implicitly prevents bottom half processing as well. >+ */ >+static void vfp_lock(void) >+{ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_bh_disable(); >+ else >+ preempt_disable(); >+} >+ >+static void vfp_unlock(void) >+{ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_bh_enable(); >+ else >+ preempt_enable(); >+} >+ > /* > * Is 'thread's most up to date state stored in this CPUs hardware? > * Must be called from non-preemptible context. >@@ -240,7 +268,7 @@ static void vfp_panic(char *reason, u32 inst) > /* > * Process bitmask of exception conditions. > */ >-static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_regs *regs) >+static int vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr) > { > int si_code = 0; > >@@ -248,8 +276,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ > > if (exceptions == VFP_EXCEPTION_ERROR) { > vfp_panic("unhandled bounce", inst); >- vfp_raise_sigfpe(FPE_FLTINV, regs); >- return; >+ return FPE_FLTINV; > } > > /* >@@ -277,8 +304,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ > RAISE(FPSCR_OFC, FPSCR_OFE, FPE_FLTOVF); > RAISE(FPSCR_IOC, FPSCR_IOE, FPE_FLTINV); > >- if (si_code) >- vfp_raise_sigfpe(si_code, regs); >+ return si_code; > } > > /* >@@ -324,6 +350,8 @@ static u32 vfp_emulate_instruction(u32 inst, u32 fpscr, struct pt_regs *regs) > static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > { > u32 fpscr, orig_fpscr, fpsid, exceptions; >+ int si_code2 = 0; >+ int si_code = 0; > > pr_debug("VFP: bounce: trigger %08x fpexc %08x\n", trigger, fpexc); > >@@ -369,8 +397,8 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > * unallocated VFP instruction but with FPSCR.IXE set and not > * on VFP subarch 1. 
> */ >- vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr, regs); >- return; >+ si_code = vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr); >+ goto exit; > } > > /* >@@ -394,14 +422,14 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > */ > exceptions = vfp_emulate_instruction(trigger, fpscr, regs); > if (exceptions) >- vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); >+ si_code2 = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); > > /* > * If there isn't a second FP instruction, exit now. Note that > * the FPEXC.FP2V bit is valid only if FPEXC.EX is 1. > */ > if ((fpexc & (FPEXC_EX | FPEXC_FP2V)) != (FPEXC_EX | FPEXC_FP2V)) >- return; >+ goto exit; > > /* > * The barrier() here prevents fpinst2 being read >@@ -413,7 +441,13 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > emulate: > exceptions = vfp_emulate_instruction(trigger, orig_fpscr, regs); > if (exceptions) >- vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); >+ si_code = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); >+exit: >+ vfp_unlock(); >+ if (si_code2) >+ vfp_raise_sigfpe(si_code2, regs); >+ if (si_code) >+ vfp_raise_sigfpe(si_code, regs); > } > > static void vfp_enable(void *unused) >@@ -512,11 +546,9 @@ static inline void vfp_pm_init(void) { } > */ > void vfp_sync_hwstate(struct thread_info *thread) > { >- unsigned int cpu = get_cpu(); >+ vfp_lock(); > >- local_bh_disable(); >- >- if (vfp_state_in_hw(cpu, thread)) { >+ if (vfp_state_in_hw(raw_smp_processor_id(), thread)) { > u32 fpexc = fmrx(FPEXC); > > /* >@@ -527,8 +559,7 @@ void vfp_sync_hwstate(struct thread_info *thread) > fmxr(FPEXC, fpexc); > } > >- local_bh_enable(); >- put_cpu(); >+ vfp_unlock(); > } > > /* Ensure that the thread reloads the hardware VFP state on the next use. 
*/ >@@ -683,7 +714,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > if (!user_mode(regs)) > return vfp_kmode_exception(regs, trigger); > >- local_bh_disable(); >+ vfp_lock(); > fpexc = fmrx(FPEXC); > > /* >@@ -748,6 +779,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > * replay the instruction that trapped. > */ > fmxr(FPEXC, fpexc); >+ vfp_unlock(); > } else { > /* Check for synchronous or asynchronous exceptions */ > if (!(fpexc & (FPEXC_EX | FPEXC_DEX))) { >@@ -762,17 +794,17 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > if (!(fpscr & FPSCR_IXE)) { > if (!(fpscr & FPSCR_LENGTH_MASK)) { > pr_debug("not VFP\n"); >- local_bh_enable(); >+ vfp_unlock(); > return -ENOEXEC; > } > fpexc |= FPEXC_DEX; > } > } > bounce: regs->ARM_pc += 4; >+ /* VFP_bounce() will invoke vfp_unlock() */ > VFP_bounce(trigger, fpexc, regs); > } > >- local_bh_enable(); > return 0; > } > >@@ -837,7 +869,7 @@ void kernel_neon_begin(void) > unsigned int cpu; > u32 fpexc; > >- local_bh_disable(); >+ vfp_lock(); > > /* > * Kernel mode NEON is only allowed outside of hardirq context with >@@ -868,7 +900,7 @@ void kernel_neon_end(void) > { > /* Disable the NEON/VFP unit. 
*/ > fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN); >- local_bh_enable(); >+ vfp_unlock(); > } > EXPORT_SYMBOL(kernel_neon_end); > >diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig >index 6b96d75a3a3d8..8ecbaa43f133f 100644 >--- a/arch/arm64/Kconfig >+++ b/arch/arm64/Kconfig >@@ -98,6 +98,7 @@ config ARM64 > select ARCH_SUPPORTS_NUMA_BALANCING > select ARCH_SUPPORTS_PAGE_TABLE_CHECK > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT > select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH > select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT > select ARCH_WANT_DEFAULT_BPF_JIT >diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig >index b9fc064d38d28..e8651e304c888 100644 >--- a/arch/powerpc/Kconfig >+++ b/arch/powerpc/Kconfig >@@ -166,6 +166,7 @@ config PPC > select ARCH_STACKWALK > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x >+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF if PPC64 > select ARCH_USE_MEMTEST >@@ -270,6 +271,7 @@ config PPC > select HAVE_PERF_USER_STACK_DUMP > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RELIABLE_STACKTRACE >+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM > select HAVE_RSEQ > select HAVE_SETUP_PER_CPU_AREA if PPC64 > select HAVE_SOFTIRQ_ON_OWN_STACK >diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h >index 283c346478565..4727f40052ddd 100644 >--- a/arch/powerpc/include/asm/stackprotector.h >+++ b/arch/powerpc/include/asm/stackprotector.h >@@ -19,8 +19,13 @@ > */ > static __always_inline void boot_init_stack_canary(void) > { >- unsigned long canary = get_random_canary(); >+ unsigned long canary; > >+#ifndef CONFIG_PREEMPT_RT >+ canary = get_random_canary(); >+#else >+ canary = ((unsigned long)&canary) & CANARY_MASK; >+#endif > current->stack_canary = canary; > #ifdef CONFIG_PPC64 > get_paca()->canary = canary; >diff --git 
a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c >index 11e062b47d3f8..f7e22276c97b0 100644 >--- a/arch/powerpc/kernel/traps.c >+++ b/arch/powerpc/kernel/traps.c >@@ -261,12 +261,17 @@ static char *get_mmu_str(void) > > static int __die(const char *str, struct pt_regs *regs, long err) > { >+ const char *pr = ""; >+ > printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); > >+ if (IS_ENABLED(CONFIG_PREEMPTION)) >+ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; >+ > printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", > IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", > PAGE_SIZE / 1024, get_mmu_str(), >- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", >+ pr, > IS_ENABLED(CONFIG_SMP) ? " SMP" : "", > IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", > debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", >diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig >index 074263429faf2..96ab63d035e5c 100644 >--- a/arch/powerpc/kvm/Kconfig >+++ b/arch/powerpc/kvm/Kconfig >@@ -222,6 +222,7 @@ config KVM_E500MC > config KVM_MPIC > bool "KVM in-kernel MPIC emulation" > depends on KVM && PPC_E500 >+ depends on !PREEMPT_RT > select HAVE_KVM_IRQCHIP > select HAVE_KVM_IRQ_ROUTING > select HAVE_KVM_MSI >diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig >index afc0f6a613372..dc3f63c2687d4 100644 >--- a/arch/powerpc/platforms/pseries/Kconfig >+++ b/arch/powerpc/platforms/pseries/Kconfig >@@ -2,6 +2,7 @@ > config PPC_PSERIES > depends on PPC64 && PPC_BOOK3S > bool "IBM pSeries & new (POWER5-based) iSeries" >+ select GENERIC_ALLOCATOR > select HAVE_PCSPKR_PLATFORM > select MPIC > select OF_DYNAMIC >diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c >index e8c4129697b14..c61e29deac8df 100644 >--- a/arch/powerpc/platforms/pseries/iommu.c >+++ b/arch/powerpc/platforms/pseries/iommu.c >@@ -25,6 +25,7 @@ > #include <linux/of_address.h> > #include 
<linux/iommu.h> > #include <linux/rculist.h> >+#include <linux/local_lock.h> > #include <asm/io.h> > #include <asm/prom.h> > #include <asm/rtas.h> >@@ -206,7 +207,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, > return ret; > } > >-static DEFINE_PER_CPU(__be64 *, tce_page); >+struct tce_page { >+ __be64 * page; >+ local_lock_t lock; >+}; >+static DEFINE_PER_CPU(struct tce_page, tce_page) = { >+ .lock = INIT_LOCAL_LOCK(lock), >+}; > > static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > long npages, unsigned long uaddr, >@@ -229,9 +236,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > direction, attrs); > } > >- local_irq_save(flags); /* to protect tcep and the page behind it */ >+ /* to protect tcep and the page behind it */ >+ local_lock_irqsave(&tce_page.lock, flags); > >- tcep = __this_cpu_read(tce_page); >+ tcep = __this_cpu_read(tce_page.page); > > /* This is safe to do since interrupts are off when we're called > * from iommu_alloc{,_sg}() >@@ -240,12 +248,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > tcep = (__be64 *)__get_free_page(GFP_ATOMIC); > /* If allocation fails, fall back to the loop implementation */ > if (!tcep) { >- local_irq_restore(flags); >+ local_unlock_irqrestore(&tce_page.lock, flags); > return tce_build_pSeriesLP(tbl->it_index, tcenum, > tceshift, > npages, uaddr, direction, attrs); > } >- __this_cpu_write(tce_page, tcep); >+ __this_cpu_write(tce_page.page, tcep); > } > > rpn = __pa(uaddr) >> tceshift; >@@ -275,7 +283,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > tcenum += limit; > } while (npages > 0 && !rc); > >- local_irq_restore(flags); >+ local_unlock_irqrestore(&tce_page.lock, flags); > > if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { > ret = (int)rc; >@@ -459,16 +467,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, > DMA_BIDIRECTIONAL, 0); > } > >- 
local_irq_disable(); /* to protect tcep and the page behind it */ >- tcep = __this_cpu_read(tce_page); >+ /* to protect tcep and the page behind it */ >+ local_lock_irq(&tce_page.lock); >+ tcep = __this_cpu_read(tce_page.page); > > if (!tcep) { > tcep = (__be64 *)__get_free_page(GFP_ATOMIC); > if (!tcep) { >- local_irq_enable(); >+ local_unlock_irq(&tce_page.lock); > return -ENOMEM; > } >- __this_cpu_write(tce_page, tcep); >+ __this_cpu_write(tce_page.page, tcep); > } > > proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; >@@ -511,7 +520,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, > > /* error cleanup: caller will clear whole range */ > >- local_irq_enable(); >+ local_unlock_irq(&tce_page.lock); > return rc; > } > >diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >index e3142ce531a09..32c5db19cf899 100644 >--- a/arch/riscv/Kconfig >+++ b/arch/riscv/Kconfig >@@ -49,6 +49,7 @@ config RISCV > select ARCH_SUPPORTS_HUGETLBFS if MMU > select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU > select ARCH_SUPPORTS_PER_VMA_LOCK if MMU >+ select ARCH_SUPPORTS_RT > select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK > select ARCH_USE_MEMTEST > select ARCH_USE_QUEUED_RWLOCKS >@@ -142,6 +143,7 @@ config RISCV > select HAVE_PERF_USER_STACK_DUMP > select HAVE_POSIX_CPU_TIMERS_TASK_WORK > select HAVE_PREEMPT_DYNAMIC_KEY if !XIP_KERNEL >+ select HAVE_PREEMPT_AUTO > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RETHOOK if !XIP_KERNEL > select HAVE_RSEQ >diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h >index 5d473343634b9..23b136286e927 100644 >--- a/arch/riscv/include/asm/thread_info.h >+++ b/arch/riscv/include/asm/thread_info.h >@@ -94,6 +94,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); > * - pending work-to-be-done flags are in lowest half-word > * - other flags in upper half-word(s) > */ >+#define TIF_ARCH_RESCHED_LAZY 0 /* Lazy rescheduling */ > #define TIF_NOTIFY_RESUME 1 /* 
callback before returning to user */ > #define TIF_SIGPENDING 2 /* signal pending */ > #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ >@@ -104,6 +105,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); > #define TIF_32BIT 11 /* compat-mode 32bit process */ > #define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ > >+#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) > #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) > #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) > #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) >diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig >index 637e337c332e4..b4ac84206afc9 100644 >--- a/arch/x86/Kconfig >+++ b/arch/x86/Kconfig >@@ -28,6 +28,7 @@ config X86_64 > select ARCH_HAS_GIGANTIC_PAGE > select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT > select HAVE_ARCH_SOFT_DIRTY > select MODULES_USE_ELF_RELA > select NEED_DMA_MAP_STATE >@@ -119,6 +120,7 @@ config X86 > select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG > select ARCH_SUPPORTS_LTO_CLANG > select ARCH_SUPPORTS_LTO_CLANG_THIN >+ select ARCH_SUPPORTS_RT > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 > select ARCH_USE_MEMTEST >@@ -275,6 +277,7 @@ config X86 > select HAVE_STATIC_CALL > select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL > select HAVE_PREEMPT_DYNAMIC_CALL >+ select HAVE_PREEMPT_AUTO > select HAVE_RSEQ > select HAVE_RUST if X86_64 > select HAVE_SYSCALL_TRACEPOINTS >diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h >index d63b02940747f..1ff38ebbd5880 100644 >--- a/arch/x86/include/asm/thread_info.h >+++ b/arch/x86/include/asm/thread_info.h >@@ -81,8 +81,9 @@ struct thread_info { > #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ > #define TIF_SIGPENDING 2 /* signal pending */ > #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ >-#define TIF_SINGLESTEP 4 
/* reenable singlestep on user return*/ >-#define TIF_SSBD 5 /* Speculative store bypass disable */ >+#define TIF_ARCH_RESCHED_LAZY 4 /* Lazy rescheduling */ >+#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ >+#define TIF_SSBD 6 /* Speculative store bypass disable */ > #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ > #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ > #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ >@@ -104,6 +105,7 @@ struct thread_info { > #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) > #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) > #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) >+#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) > #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) > #define _TIF_SSBD (1 << TIF_SSBD) > #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) >diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c >index bd6a7857ce058..d45dfd10b6366 100644 >--- a/drivers/acpi/processor_idle.c >+++ b/drivers/acpi/processor_idle.c >@@ -108,7 +108,7 @@ static const struct dmi_system_id processor_power_dmi_table[] = { > */ > static void __cpuidle acpi_safe_halt(void) > { >- if (!tif_need_resched()) { >+ if (!need_resched()) { > raw_safe_halt(); > raw_local_irq_disable(); > } >diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c >index 6772e0c654fa7..119007f17e13e 100644 >--- a/drivers/block/zram/zram_drv.c >+++ b/drivers/block/zram/zram_drv.c >@@ -57,6 +57,41 @@ static void zram_free_page(struct zram *zram, size_t index); > static int zram_read_page(struct zram *zram, struct page *page, u32 index, > struct bio *parent); > >+#ifdef CONFIG_PREEMPT_RT >+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) >+{ >+ size_t index; >+ >+ for (index = 0; index < num_pages; index++) >+ spin_lock_init(&zram->table[index].lock); >+} >+ >+static int zram_slot_trylock(struct zram *zram, u32 index) >+{ 
>+ int ret; >+ >+ ret = spin_trylock(&zram->table[index].lock); >+ if (ret) >+ __set_bit(ZRAM_LOCK, &zram->table[index].flags); >+ return ret; >+} >+ >+static void zram_slot_lock(struct zram *zram, u32 index) >+{ >+ spin_lock(&zram->table[index].lock); >+ __set_bit(ZRAM_LOCK, &zram->table[index].flags); >+} >+ >+static void zram_slot_unlock(struct zram *zram, u32 index) >+{ >+ __clear_bit(ZRAM_LOCK, &zram->table[index].flags); >+ spin_unlock(&zram->table[index].lock); >+} >+ >+#else >+ >+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } >+ > static int zram_slot_trylock(struct zram *zram, u32 index) > { > return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); >@@ -71,6 +106,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) > { > bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); > } >+#endif > > static inline bool init_done(struct zram *zram) > { >@@ -1241,6 +1277,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) > > if (!huge_class_size) > huge_class_size = zs_huge_class_size(zram->mem_pool); >+ zram_meta_init_table_locks(zram, num_pages); > return true; > } > >diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h >index 3b94d12f41b40..dfc364b0d0727 100644 >--- a/drivers/block/zram/zram_drv.h >+++ b/drivers/block/zram/zram_drv.h >@@ -69,6 +69,9 @@ struct zram_table_entry { > unsigned long element; > }; > unsigned long flags; >+#ifdef CONFIG_PREEMPT_RT >+ spinlock_t lock; >+#endif > #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME > ktime_t ac_time; > #endif >diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig >index 3089029abba48..2d5828c5d3596 100644 >--- a/drivers/gpu/drm/i915/Kconfig >+++ b/drivers/gpu/drm/i915/Kconfig >@@ -3,7 +3,6 @@ config DRM_I915 > tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" > depends on DRM > depends on X86 && PCI >- depends on !PREEMPT_RT > select INTEL_GTT if X86 > select INTERVAL_TREE > # we need shmfs for the swappable backing 
store, and in particular >diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c >index 8a84a31c7b48a..73a561af13d16 100644 >--- a/drivers/gpu/drm/i915/display/intel_crtc.c >+++ b/drivers/gpu/drm/i915/display/intel_crtc.c >@@ -580,7 +580,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > */ > intel_psr_wait_for_idle_locked(new_crtc_state); > >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > > crtc->debug.min_vbl = min; > crtc->debug.max_vbl = max; >@@ -605,11 +606,13 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > break; > } > >- local_irq_enable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_enable(); > > timeout = schedule_timeout(timeout); > >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > } > > finish_wait(wq, &wait); >@@ -642,7 +645,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > return; > > irq_disable: >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > } > > #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) >@@ -744,7 +748,8 @@ void intel_pipe_update_end(struct intel_atomic_state *state, > */ > intel_vrr_send_push(new_crtc_state); > >- local_irq_enable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_enable(); > > if (intel_vgpu_active(dev_priv)) > goto out; >diff --git a/drivers/gpu/drm/i915/display/intel_vblank.c b/drivers/gpu/drm/i915/display/intel_vblank.c >index fe256bf7b485b..a3c3faa8f305a 100644 >--- a/drivers/gpu/drm/i915/display/intel_vblank.c >+++ b/drivers/gpu/drm/i915/display/intel_vblank.c >@@ -275,6 +275,26 @@ int intel_crtc_scanline_to_hw(struct intel_crtc *crtc, int scanline) > * all register accesses to the same cacheline to be serialized, > * otherwise they may hang. 
> */ >+static void intel_vblank_section_enter_irqsave(struct drm_i915_private *i915, unsigned long *flags) >+ __acquires(i915->uncore.lock) >+{ >+#ifdef I915 >+ spin_lock_irqsave(&i915->uncore.lock, *flags); >+#else >+ *flags = 0; >+#endif >+} >+ >+static void intel_vblank_section_exit_irqrestore(struct drm_i915_private *i915, unsigned long flags) >+ __releases(i915->uncore.lock) >+{ >+#ifdef I915 >+ spin_unlock_irqrestore(&i915->uncore.lock, flags); >+#else >+ if (flags) >+ return; >+#endif >+} > static void intel_vblank_section_enter(struct drm_i915_private *i915) > __acquires(i915->uncore.lock) > { >@@ -332,10 +352,10 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, > * timing critical raw register reads, potentially with > * preemption disabled, so the following code must not block. > */ >- local_irq_save(irqflags); >- intel_vblank_section_enter(dev_priv); >+ intel_vblank_section_enter_irqsave(dev_priv, &irqflags); > >- /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) >+ preempt_disable(); > > /* Get optional system timestamp before query. */ > if (stime) >@@ -399,10 +419,10 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, > if (etime) > *etime = ktime_get(); > >- /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. 
*/ >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) >+ preempt_enable(); > >- intel_vblank_section_exit(dev_priv); >- local_irq_restore(irqflags); >+ intel_vblank_section_exit_irqrestore(dev_priv, irqflags); > > /* > * While in vblank, position will be negative >@@ -440,13 +460,11 @@ int intel_get_crtc_scanline(struct intel_crtc *crtc) > unsigned long irqflags; > int position; > >- local_irq_save(irqflags); >- intel_vblank_section_enter(dev_priv); >+ intel_vblank_section_enter_irqsave(dev_priv, &irqflags); > > position = __intel_get_crtc_scanline(crtc); > >- intel_vblank_section_exit(dev_priv); >- local_irq_restore(irqflags); >+ intel_vblank_section_exit_irqrestore(dev_priv, irqflags); > > return position; > } >diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >index d650beb8ed22f..3dd3e516b80c1 100644 >--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >@@ -317,10 +317,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) > /* Kick the work once more to drain the signalers, and disarm the irq */ > irq_work_sync(&b->irq_work); > while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { >- local_irq_disable(); >- signal_irq_work(&b->irq_work); >- local_irq_enable(); >+ irq_work_queue(&b->irq_work); > cond_resched(); >+ irq_work_sync(&b->irq_work); > } > } > >diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >index 42aade0faf2d1..929ca2bad2d2c 100644 >--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >@@ -1303,7 +1303,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > * and context switches) submission. 
> */ > >- spin_lock(&sched_engine->lock); >+ spin_lock_irq(&sched_engine->lock); > > /* > * If the queue is higher priority than the last >@@ -1403,7 +1403,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > * Even if ELSP[1] is occupied and not worthy > * of timeslices, our queue might be. > */ >- spin_unlock(&sched_engine->lock); >+ spin_unlock_irq(&sched_engine->lock); > return; > } > } >@@ -1429,7 +1429,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > > if (last && !can_merge_rq(last, rq)) { > spin_unlock(&ve->base.sched_engine->lock); >- spin_unlock(&engine->sched_engine->lock); >+ spin_unlock_irq(&engine->sched_engine->lock); > return; /* leave this for another sibling */ > } > >@@ -1591,7 +1591,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > */ > sched_engine->queue_priority_hint = queue_prio(sched_engine); > i915_sched_engine_reset_on_empty(sched_engine); >- spin_unlock(&sched_engine->lock); >+ spin_unlock_irq(&sched_engine->lock); > > /* > * We can skip poking the HW if we ended up with exactly the same set >@@ -1617,13 +1617,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > } > } > >-static void execlists_dequeue_irq(struct intel_engine_cs *engine) >-{ >- local_irq_disable(); /* Suspend interrupts across request submission */ >- execlists_dequeue(engine); >- local_irq_enable(); /* flush irq_work (e.g. 
breadcrumb enabling) */ >-} >- > static void clear_ports(struct i915_request **ports, int count) > { > memset_p((void **)ports, NULL, count); >@@ -2478,7 +2471,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) > } > > if (!engine->execlists.pending[0]) { >- execlists_dequeue_irq(engine); >+ execlists_dequeue(engine); > start_timeslice(engine); > } > >diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h >index 813cc888e6fae..ab3483a59b79a 100644 >--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h >+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h >@@ -362,7 +362,7 @@ static inline int intel_guc_send_busy_loop(struct intel_guc *guc, > { > int err; > unsigned int sleep_period_ms = 1; >- bool not_atomic = !in_atomic() && !irqs_disabled(); >+ bool not_atomic = !in_atomic() && !irqs_disabled() && !rcu_preempt_depth(); > > /* > * FIXME: Have caller pass in if we are in an atomic context to avoid >diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c >index f59081066a197..014d02029a415 100644 >--- a/drivers/gpu/drm/i915/i915_request.c >+++ b/drivers/gpu/drm/i915/i915_request.c >@@ -609,7 +609,6 @@ bool __i915_request_submit(struct i915_request *request) > > RQ_TRACE(request, "\n"); > >- GEM_BUG_ON(!irqs_disabled()); > lockdep_assert_held(&engine->sched_engine->lock); > > /* >@@ -718,7 +717,6 @@ void __i915_request_unsubmit(struct i915_request *request) > */ > RQ_TRACE(request, "\n"); > >- GEM_BUG_ON(!irqs_disabled()); > lockdep_assert_held(&engine->sched_engine->lock); > > /* >diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h >index ce1cbee1b39dd..3c51620d011b1 100644 >--- a/drivers/gpu/drm/i915/i915_trace.h >+++ b/drivers/gpu/drm/i915/i915_trace.h >@@ -6,6 +6,10 @@ > #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) > #define _I915_TRACE_H_ > >+#ifdef CONFIG_PREEMPT_RT >+#define NOTRACE >+#endif >+ > #include <linux/stringify.h> > #include 
<linux/types.h> > #include <linux/tracepoint.h> >@@ -322,7 +326,7 @@ DEFINE_EVENT(i915_request, i915_request_add, > TP_ARGS(rq) > ); > >-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) >+#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) > DEFINE_EVENT(i915_request, i915_request_guc_submit, > TP_PROTO(struct i915_request *rq), > TP_ARGS(rq) >diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h >index f98577967b7fc..6cc358aa5b2ff 100644 >--- a/drivers/gpu/drm/i915/i915_utils.h >+++ b/drivers/gpu/drm/i915/i915_utils.h >@@ -288,7 +288,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) > #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) > > /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */ >-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) >+#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) > # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) > #else > # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) >diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c >index b62ad9006780c..4e4f5501d81da 100644 >--- a/drivers/tty/serial/8250/8250_core.c >+++ b/drivers/tty/serial/8250/8250_core.c >@@ -592,6 +592,7 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) > > #ifdef CONFIG_SERIAL_8250_CONSOLE > >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > static void univ8250_console_write(struct console *co, const char *s, > unsigned int count) > { >@@ -599,6 +600,39 @@ static void univ8250_console_write(struct console *co, const char *s, > > serial8250_console_write(up, s, count); > } >+#else >+static void univ8250_console_write_atomic(struct console *co, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_port *up = &serial8250_ports[co->index]; >+ >+ serial8250_console_write_atomic(up, 
wctxt); >+} >+ >+static void univ8250_console_write_thread(struct console *co, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_port *up = &serial8250_ports[co->index]; >+ >+ serial8250_console_write_thread(up, wctxt); >+} >+ >+static void univ8250_console_device_lock(struct console *con, unsigned long *flags) >+{ >+ struct uart_port *up = &serial8250_ports[con->index].port; >+ >+ __uart_port_lock_irqsave(up, flags); >+} >+ >+static void univ8250_console_device_unlock(struct console *con, unsigned long flags) >+{ >+ struct uart_port *up = &serial8250_ports[con->index].port; >+ >+ __uart_port_unlock_irqrestore(up, flags); >+} >+ >+static struct nbcon_drvdata serial8250_nbcon_drvdata; >+#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ > > static int univ8250_console_setup(struct console *co, char *options) > { >@@ -627,11 +661,11 @@ static int univ8250_console_setup(struct console *co, char *options) > > port = &serial8250_ports[co->index].port; > /* link port to console */ >- port->cons = co; >+ uart_port_set_cons(port, co); > > retval = serial8250_console_setup(port, options, false); > if (retval != 0) >- port->cons = NULL; >+ uart_port_set_cons(port, NULL); > return retval; > } > >@@ -689,7 +723,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, > continue; > > co->index = i; >- port->cons = co; >+ uart_port_set_cons(port, co); > return serial8250_console_setup(port, options, true); > } > >@@ -698,12 +732,21 @@ static int univ8250_console_match(struct console *co, char *name, int idx, > > static struct console univ8250_console = { > .name = "ttyS", >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > .write = univ8250_console_write, >+ .flags = CON_PRINTBUFFER | CON_ANYTIME, >+#else >+ .write_atomic = univ8250_console_write_atomic, >+ .write_thread = univ8250_console_write_thread, >+ .device_lock = univ8250_console_device_lock, >+ .device_unlock = univ8250_console_device_unlock, >+ .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_NBCON, 
>+ .nbcon_drvdata = &serial8250_nbcon_drvdata, >+#endif > .device = uart_console_device, > .setup = univ8250_console_setup, > .exit = univ8250_console_exit, > .match = univ8250_console_match, >- .flags = CON_PRINTBUFFER | CON_ANYTIME, > .index = -1, > .data = &serial8250_reg, > }; >diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c >index 8ca061d3bbb92..ed50b434d8c80 100644 >--- a/drivers/tty/serial/8250/8250_port.c >+++ b/drivers/tty/serial/8250/8250_port.c >@@ -550,6 +550,13 @@ static int serial8250_em485_init(struct uart_8250_port *p) > if (!p->em485) > return -ENOMEM; > >+#ifndef CONFIG_SERIAL_8250_LEGACY_CONSOLE >+ if (uart_console(&p->port)) { >+ dev_warn(p->port.dev, "no atomic printing for rs485 consoles\n"); >+ p->port.cons->write_atomic = NULL; >+ } >+#endif >+ > hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, > HRTIMER_MODE_REL); > hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, >@@ -702,7 +709,11 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) > serial8250_rpm_put(p); > } > >-static void serial8250_clear_IER(struct uart_8250_port *up) >+/* >+ * Only to be used by write_atomic() and the legacy write(), which do not >+ * require port lock. >+ */ >+static void __serial8250_clear_IER(struct uart_8250_port *up) > { > if (up->capabilities & UART_CAP_UUE) > serial_out(up, UART_IER, UART_IER_UUE); >@@ -710,6 +721,14 @@ static void serial8250_clear_IER(struct uart_8250_port *up) > serial_out(up, UART_IER, 0); > } > >+static inline void serial8250_clear_IER(struct uart_8250_port *up) >+{ >+ /* Port locked to synchronize UART_IER access against the console. */ >+ lockdep_assert_held_once(&up->port.lock); >+ >+ __serial8250_clear_IER(up); >+} >+ > #ifdef CONFIG_SERIAL_8250_RSA > /* > * Attempts to turn on the RSA FIFO. Returns zero on failure. 
>@@ -3320,6 +3339,11 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) > > wait_for_xmitr(up, UART_LSR_THRE); > serial_port_out(port, UART_TX, ch); >+ >+ if (ch == '\n') >+ up->console_newline_needed = false; >+ else >+ up->console_newline_needed = true; > } > > /* >@@ -3348,6 +3372,7 @@ static void serial8250_console_restore(struct uart_8250_port *up) > serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); > } > >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > /* > * Print a string to the serial port using the device FIFO > * >@@ -3406,7 +3431,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, > * First save the IER then disable the interrupts > */ > ier = serial_port_in(port, UART_IER); >- serial8250_clear_IER(up); >+ __serial8250_clear_IER(up); > > /* check scratch reg to see if port powered off during system sleep */ > if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >@@ -3472,6 +3497,131 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, > if (locked) > uart_port_unlock_irqrestore(port, flags); > } >+#else >+void serial8250_console_write_thread(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_em485 *em485 = up->em485; >+ struct uart_port *port = &up->port; >+ unsigned int ier; >+ >+ touch_nmi_watchdog(); >+ >+ if (!nbcon_enter_unsafe(wctxt)) >+ return; >+ >+ /* First save IER then disable the interrupts. */ >+ ier = serial_port_in(port, UART_IER); >+ serial8250_clear_IER(up); >+ >+ /* Check scratch reg if port powered off during system sleep. */ >+ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >+ serial8250_console_restore(up); >+ up->canary = 0; >+ } >+ >+ if (em485) { >+ if (em485->tx_stopped) >+ up->rs485_start_tx(up); >+ mdelay(port->rs485.delay_rts_before_send); >+ } >+ >+ if (nbcon_exit_unsafe(wctxt)) { >+ int len = READ_ONCE(wctxt->len); >+ int i; >+ >+ /* >+ * Write out the message. 
Toggle unsafe for each byte in order >+ * to give another (higher priority) context the opportunity >+ * for a friendly takeover. If such a takeover occurs, this >+ * context must reacquire ownership in order to perform final >+ * actions (such as re-enabling the interrupts). >+ * >+ * IMPORTANT: wctxt->outbuf and wctxt->len are no longer valid >+ * after a reacquire so writing the message must be >+ * aborted. >+ */ >+ for (i = 0; i < len; i++) { >+ if (!nbcon_enter_unsafe(wctxt)) { >+ nbcon_reacquire(wctxt); >+ break; >+ } >+ >+ uart_console_write(port, wctxt->outbuf + i, 1, serial8250_console_putchar); >+ >+ if (!nbcon_exit_unsafe(wctxt)) { >+ nbcon_reacquire(wctxt); >+ break; >+ } >+ } >+ } else { >+ nbcon_reacquire(wctxt); >+ } >+ >+ while (!nbcon_enter_unsafe(wctxt)) >+ nbcon_reacquire(wctxt); >+ >+ /* Finally, wait for transmitter to become empty and restore IER. */ >+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); >+ if (em485) { >+ mdelay(port->rs485.delay_rts_after_send); >+ if (em485->tx_stopped) >+ up->rs485_stop_tx(up); >+ } >+ serial_port_out(port, UART_IER, ier); >+ >+ /* >+ * The receive handling will happen properly because the receive ready >+ * bit will still be set; it is not cleared on read. However, modem >+ * control will not, we must call it if we have saved something in the >+ * saved flags while processing with interrupts off. >+ */ >+ if (up->msr_saved_flags) >+ serial8250_modem_status(up); >+ >+ nbcon_exit_unsafe(wctxt); >+} >+ >+void serial8250_console_write_atomic(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_port *port = &up->port; >+ unsigned int ier; >+ >+ /* Atomic console not supported for rs485 mode. */ >+ if (WARN_ON_ONCE(up->em485)) >+ return; >+ >+ touch_nmi_watchdog(); >+ >+ if (!nbcon_enter_unsafe(wctxt)) >+ return; >+ >+ /* >+ * First save IER then disable the interrupts. The special variant to >+ * clear IER is used because atomic printing may occur without holding >+ * the port lock. 
>+ */ >+ ier = serial_port_in(port, UART_IER); >+ __serial8250_clear_IER(up); >+ >+ /* Check scratch reg if port powered off during system sleep. */ >+ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >+ serial8250_console_restore(up); >+ up->canary = 0; >+ } >+ >+ if (up->console_newline_needed) >+ uart_console_write(port, "\n", 1, serial8250_console_putchar); >+ uart_console_write(port, wctxt->outbuf, wctxt->len, serial8250_console_putchar); >+ >+ /* Finally, wait for transmitter to become empty and restore IER. */ >+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); >+ serial_port_out(port, UART_IER, ier); >+ >+ nbcon_exit_unsafe(wctxt); >+} >+#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ > > static unsigned int probe_baud(struct uart_port *port) > { >@@ -3490,6 +3640,7 @@ static unsigned int probe_baud(struct uart_port *port) > > int serial8250_console_setup(struct uart_port *port, char *options, bool probe) > { >+ struct uart_8250_port *up = up_to_u8250p(port); > int baud = 9600; > int bits = 8; > int parity = 'n'; >@@ -3499,6 +3650,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) > if (!port->iobase && !port->membase) > return -ENODEV; > >+ up->console_newline_needed = false; >+ > if (options) > uart_parse_options(options, &baud, &parity, &bits, &flow); > else if (probe) >diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c >index cf2c890a560f0..7618a5783adbc 100644 >--- a/drivers/tty/serial/amba-pl011.c >+++ b/drivers/tty/serial/amba-pl011.c >@@ -348,10 +348,7 @@ static int pl011_fifo_to_tty(struct uart_amba_port *uap) > flag = TTY_FRAME; > } > >- uart_port_unlock(&uap->port); >- sysrq = uart_handle_sysrq_char(&uap->port, ch & 255); >- uart_port_lock(&uap->port); >- >+ sysrq = uart_prepare_sysrq_char(&uap->port, ch & 255); > if (!sysrq) > uart_insert_char(&uap->port, ch, UART011_DR_OE, ch, flag); > } >@@ -1017,7 +1014,7 @@ static void pl011_dma_rx_callback(void *data) > ret = 
pl011_dma_rx_trigger_dma(uap); > > pl011_dma_rx_chars(uap, pending, lastbuf, false); >- uart_port_unlock_irq(&uap->port); >+ uart_unlock_and_check_sysrq(&uap->port); > /* > * Do this check after we picked the DMA chars so we don't > * get some IRQ immediately from RX. >@@ -1540,11 +1537,10 @@ static void check_apply_cts_event_workaround(struct uart_amba_port *uap) > static irqreturn_t pl011_int(int irq, void *dev_id) > { > struct uart_amba_port *uap = dev_id; >- unsigned long flags; > unsigned int status, pass_counter = AMBA_ISR_PASS_LIMIT; > int handled = 0; > >- uart_port_lock_irqsave(&uap->port, &flags); >+ uart_port_lock(&uap->port); > status = pl011_read(uap, REG_RIS) & uap->im; > if (status) { > do { >@@ -1573,7 +1569,7 @@ static irqreturn_t pl011_int(int irq, void *dev_id) > handled = 1; > } > >- uart_port_unlock_irqrestore(&uap->port, flags); >+ uart_unlock_and_check_sysrq(&uap->port); > > return IRQ_RETVAL(handled); > } >@@ -2322,13 +2318,10 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) > > clk_enable(uap->clk); > >- local_irq_save(flags); >- if (uap->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&uap->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&uap->port, &flags); > else >- uart_port_lock(&uap->port); >+ uart_port_lock_irqsave(&uap->port, &flags); > > /* > * First save the CR then disable the interrupts >@@ -2354,8 +2347,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) > pl011_write(old_cr, uap, REG_CR); > > if (locked) >- uart_port_unlock(&uap->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&uap->port, flags); > > clk_disable(uap->clk); > } >@@ -2496,7 +2488,7 @@ static int pl011_console_match(struct console *co, char *name, int idx, > continue; > > co->index = i; >- port->cons = co; >+ uart_port_set_cons(port, co); > return pl011_console_setup(co, options); > } > >diff --git 
a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c >index 8d09ace062e59..7790cbc57391a 100644 >--- a/drivers/tty/serial/ar933x_uart.c >+++ b/drivers/tty/serial/ar933x_uart.c >@@ -378,7 +378,7 @@ static void ar933x_uart_rx_chars(struct ar933x_uart_port *up) > up->port.icount.rx++; > ch = rdata & AR933X_UART_DATA_TX_RX_MASK; > >- if (uart_handle_sysrq_char(&up->port, ch)) >+ if (uart_prepare_sysrq_char(&up->port, ch)) > continue; > > if ((up->port.ignore_status_mask & AR933X_DUMMY_STATUS_RD) == 0) >@@ -468,7 +468,7 @@ static irqreturn_t ar933x_uart_interrupt(int irq, void *dev_id) > ar933x_uart_tx_chars(up); > } > >- uart_port_unlock(&up->port); >+ uart_unlock_and_check_sysrq(&up->port); > > return IRQ_HANDLED; > } >@@ -627,14 +627,10 @@ static void ar933x_uart_console_write(struct console *co, const char *s, > unsigned int int_en; > int locked = 1; > >- local_irq_save(flags); >- >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -654,9 +650,7 @@ static void ar933x_uart_console_write(struct console *co, const char *s, > ar933x_uart_write(up, AR933X_UART_INT_REG, AR933X_UART_INT_ALLINTS); > > if (locked) >- uart_port_unlock(&up->port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int ar933x_uart_console_setup(struct console *co, char *options) >diff --git a/drivers/tty/serial/bcm63xx_uart.c b/drivers/tty/serial/bcm63xx_uart.c >index a3cefa153456d..34801a6f300b6 100644 >--- a/drivers/tty/serial/bcm63xx_uart.c >+++ b/drivers/tty/serial/bcm63xx_uart.c >@@ -285,10 +285,9 @@ static void bcm_uart_do_rx(struct uart_port *port) > flag = TTY_PARITY; > } > >- if (uart_handle_sysrq_char(port, c)) >+ if 
(uart_prepare_sysrq_char(port, c)) > continue; > >- > if ((cstat & port->ignore_status_mask) == 0) > tty_insert_flip_char(tty_port, c, flag); > >@@ -353,7 +352,7 @@ static irqreturn_t bcm_uart_interrupt(int irq, void *dev_id) > estat & UART_EXTINP_DCD_MASK); > } > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > return IRQ_HANDLED; > } > >@@ -703,20 +702,14 @@ static void bcm_console_write(struct console *co, const char *s, > { > struct uart_port *port; > unsigned long flags; >- int locked; >+ int locked = 1; > > port = &ports[co->index]; > >- local_irq_save(flags); >- if (port->sysrq) { >- /* bcm_uart_interrupt() already took the lock */ >- locked = 0; >- } else if (oops_in_progress) { >- locked = uart_port_trylock(port); >- } else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > /* call helper to deal with \r\n */ > uart_console_write(port, s, count, bcm_console_putchar); >@@ -725,8 +718,7 @@ static void bcm_console_write(struct console *co, const char *s, > wait_for_xmitr(port); > > if (locked) >- uart_port_unlock(port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > /* >diff --git a/drivers/tty/serial/lpc32xx_hs.c b/drivers/tty/serial/lpc32xx_hs.c >index ec20329f06036..e70fa59dbcc3b 100644 >--- a/drivers/tty/serial/lpc32xx_hs.c >+++ b/drivers/tty/serial/lpc32xx_hs.c >@@ -136,20 +136,16 @@ static void lpc32xx_hsuart_console_write(struct console *co, const char *s, > int locked = 1; > > touch_nmi_watchdog(); >- local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > uart_console_write(&up->port, s, count, lpc32xx_hsuart_console_putchar); > 
wait_for_xmit_empty(&up->port); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int __init lpc32xx_hsuart_console_setup(struct console *co, >@@ -268,7 +264,8 @@ static void __serial_lpc32xx_rx(struct uart_port *port) > tty_insert_flip_char(tport, 0, TTY_FRAME); > } > >- tty_insert_flip_char(tport, (tmp & 0xFF), flag); >+ if (!uart_prepare_sysrq_char(port, tmp & 0xff)) >+ tty_insert_flip_char(tport, (tmp & 0xFF), flag); > > tmp = readl(LPC32XX_HSUART_FIFO(port->membase)); > } >@@ -333,7 +330,7 @@ static irqreturn_t serial_lpc32xx_interrupt(int irq, void *dev_id) > __serial_lpc32xx_tx(port); > } > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >diff --git a/drivers/tty/serial/meson_uart.c b/drivers/tty/serial/meson_uart.c >index 8395688f5ee92..6feac459c0cf4 100644 >--- a/drivers/tty/serial/meson_uart.c >+++ b/drivers/tty/serial/meson_uart.c >@@ -220,7 +220,7 @@ static void meson_receive_chars(struct uart_port *port) > continue; > } > >- if (uart_handle_sysrq_char(port, ch)) >+ if (uart_prepare_sysrq_char(port, ch)) > continue; > > if ((status & port->ignore_status_mask) == 0) >@@ -248,7 +248,7 @@ static irqreturn_t meson_uart_interrupt(int irq, void *dev_id) > meson_uart_start_tx(port); > } > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -556,18 +556,13 @@ static void meson_serial_port_write(struct uart_port *port, const char *s, > u_int count) > { > unsigned long flags; >- int locked; >+ int locked = 1; > u32 val, tmp; > >- local_irq_save(flags); >- if (port->sysrq) { >- locked = 0; >- } else if (oops_in_progress) { >- locked = uart_port_trylock(port); >- } else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > val = readl(port->membase + 
AML_UART_CONTROL); > tmp = val & ~(AML_UART_TX_INT_EN | AML_UART_RX_INT_EN); >@@ -577,8 +572,7 @@ static void meson_serial_port_write(struct uart_port *port, const char *s, > writel(val, port->membase + AML_UART_CONTROL); > > if (locked) >- uart_port_unlock(port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void meson_serial_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/msm_serial.c b/drivers/tty/serial/msm_serial.c >index e24204ad35def..d27c4c8c84e13 100644 >--- a/drivers/tty/serial/msm_serial.c >+++ b/drivers/tty/serial/msm_serial.c >@@ -588,16 +588,14 @@ static void msm_complete_rx_dma(void *args) > if (!(port->read_status_mask & MSM_UART_SR_RX_BREAK)) > flag = TTY_NORMAL; > >- uart_port_unlock_irqrestore(port, flags); >- sysrq = uart_handle_sysrq_char(port, dma->virt[i]); >- uart_port_lock_irqsave(port, &flags); >+ sysrq = uart_prepare_sysrq_char(port, dma->virt[i]); > if (!sysrq) > tty_insert_flip_char(tport, dma->virt[i], flag); > } > > msm_start_rx_dma(msm_port); > done: >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq_irqrestore(port, flags); > > if (count) > tty_flip_buffer_push(tport); >@@ -763,9 +761,7 @@ static void msm_handle_rx_dm(struct uart_port *port, unsigned int misr) > if (!(port->read_status_mask & MSM_UART_SR_RX_BREAK)) > flag = TTY_NORMAL; > >- uart_port_unlock(port); >- sysrq = uart_handle_sysrq_char(port, buf[i]); >- uart_port_lock(port); >+ sysrq = uart_prepare_sysrq_char(port, buf[i]); > if (!sysrq) > tty_insert_flip_char(tport, buf[i], flag); > } >@@ -825,9 +821,7 @@ static void msm_handle_rx(struct uart_port *port) > else if (sr & MSM_UART_SR_PAR_FRAME_ERR) > flag = TTY_FRAME; > >- uart_port_unlock(port); >- sysrq = uart_handle_sysrq_char(port, c); >- uart_port_lock(port); >+ sysrq = uart_prepare_sysrq_char(port, c); > if (!sysrq) > tty_insert_flip_char(tport, c, flag); > } >@@ -948,11 +942,10 @@ static irqreturn_t 
msm_uart_irq(int irq, void *dev_id) > struct uart_port *port = dev_id; > struct msm_port *msm_port = to_msm_port(port); > struct msm_dma *dma = &msm_port->rx_dma; >- unsigned long flags; > unsigned int misr; > u32 val; > >- uart_port_lock_irqsave(port, &flags); >+ uart_port_lock(port); > misr = msm_read(port, MSM_UART_MISR); > msm_write(port, 0, MSM_UART_IMR); /* disable interrupt */ > >@@ -984,7 +977,7 @@ static irqreturn_t msm_uart_irq(int irq, void *dev_id) > msm_handle_delta_cts(port); > > msm_write(port, msm_port->imr, MSM_UART_IMR); /* restore interrupt */ >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -1621,14 +1614,10 @@ static void __msm_console_write(struct uart_port *port, const char *s, > num_newlines++; > count += num_newlines; > >- local_irq_save(flags); >- >- if (port->sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); > else >- uart_port_lock(port); >+ uart_port_lock_irqsave(port, &flags); > > if (is_uartdm) > msm_reset_dm_count(port, count); >@@ -1667,9 +1656,7 @@ static void __msm_console_write(struct uart_port *port, const char *s, > } > > if (locked) >- uart_port_unlock(port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void msm_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c >index f5a0b401af63b..9be1c871cf116 100644 >--- a/drivers/tty/serial/omap-serial.c >+++ b/drivers/tty/serial/omap-serial.c >@@ -508,7 +508,7 @@ static void serial_omap_rdi(struct uart_omap_port *up, unsigned int lsr) > > up->port.icount.rx++; > >- if (uart_handle_sysrq_char(&up->port, ch)) >+ if (uart_prepare_sysrq_char(&up->port, ch)) > return; > > uart_insert_char(&up->port, lsr, UART_LSR_OE, ch, TTY_NORMAL); >@@ -563,7 +563,7 @@ static irqreturn_t 
serial_omap_irq(int irq, void *dev_id) > } > } while (max_count--); > >- uart_port_unlock(&up->port); >+ uart_unlock_and_check_sysrq(&up->port); > > tty_flip_buffer_push(&up->port.state->port); > >@@ -1212,13 +1212,10 @@ serial_omap_console_write(struct console *co, const char *s, > unsigned int ier; > int locked = 1; > >- local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -1245,8 +1242,7 @@ serial_omap_console_write(struct console *co, const char *s, > check_modem_status(up); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int __init >diff --git a/drivers/tty/serial/owl-uart.c b/drivers/tty/serial/owl-uart.c >index d9fe85397741d..8b60ac0ad7cd3 100644 >--- a/drivers/tty/serial/owl-uart.c >+++ b/drivers/tty/serial/owl-uart.c >@@ -199,6 +199,7 @@ static void owl_uart_receive_chars(struct uart_port *port) > stat = owl_uart_read(port, OWL_UART_STAT); > while (!(stat & OWL_UART_STAT_RFEM)) { > char flag = TTY_NORMAL; >+ bool sysrq; > > if (stat & OWL_UART_STAT_RXER) > port->icount.overrun++; >@@ -217,7 +218,9 @@ static void owl_uart_receive_chars(struct uart_port *port) > val = owl_uart_read(port, OWL_UART_RXDAT); > val &= 0xff; > >- if ((stat & port->ignore_status_mask) == 0) >+ sysrq = uart_prepare_sysrq_char(port, val); >+ >+ if (!sysrq && (stat & port->ignore_status_mask) == 0) > tty_insert_flip_char(&port->state->port, val, flag); > > stat = owl_uart_read(port, OWL_UART_STAT); >@@ -229,10 +232,9 @@ static void owl_uart_receive_chars(struct uart_port *port) > static irqreturn_t owl_uart_irq(int irq, void *dev_id) > { > struct uart_port *port = dev_id; >- unsigned long 
flags; > u32 stat; > >- uart_port_lock_irqsave(port, &flags); >+ uart_port_lock(port); > > stat = owl_uart_read(port, OWL_UART_STAT); > >@@ -246,7 +248,7 @@ static irqreturn_t owl_uart_irq(int irq, void *dev_id) > stat |= OWL_UART_STAT_RIP | OWL_UART_STAT_TIP; > owl_uart_write(port, stat, OWL_UART_STAT); > >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -508,18 +510,12 @@ static void owl_uart_port_write(struct uart_port *port, const char *s, > { > u32 old_ctl, val; > unsigned long flags; >- int locked; >+ int locked = 1; > >- local_irq_save(flags); >- >- if (port->sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(port); >- else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > old_ctl = owl_uart_read(port, OWL_UART_CTL); > val = old_ctl | OWL_UART_CTL_TRFS_TX; >@@ -541,9 +537,7 @@ static void owl_uart_port_write(struct uart_port *port, const char *s, > owl_uart_write(port, old_ctl, OWL_UART_CTL); > > if (locked) >- uart_port_unlock(port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void owl_uart_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c >index 436cc6d52a11b..89257cddf5405 100644 >--- a/drivers/tty/serial/pch_uart.c >+++ b/drivers/tty/serial/pch_uart.c >@@ -237,9 +237,6 @@ struct eg20t_port { > > #define IRQ_NAME_SIZE 17 > char irq_name[IRQ_NAME_SIZE]; >- >- /* protect the eg20t_port private structure and io access to membase */ >- spinlock_t lock; > }; > > /** >@@ -567,7 +564,7 @@ static int pch_uart_hal_read(struct eg20t_port *priv, unsigned char *buf, > if (uart_handle_break(port)) > continue; > } >- if (uart_handle_sysrq_char(port, rbr)) >+ if (uart_prepare_sysrq_char(port, rbr)) > continue; > > buf[i++] = rbr; 
>@@ -599,16 +596,14 @@ static void pch_uart_hal_set_break(struct eg20t_port *priv, int on) > iowrite8(lcr, priv->membase + UART_LCR); > } > >-static int push_rx(struct eg20t_port *priv, const unsigned char *buf, >- int size) >+static void push_rx(struct eg20t_port *priv, const unsigned char *buf, >+ int size) > { > struct uart_port *port = &priv->port; > struct tty_port *tport = &port->state->port; > > tty_insert_flip_string(tport, buf, size); > tty_flip_buffer_push(tport); >- >- return 0; > } > > static int dma_push_rx(struct eg20t_port *priv, int size) >@@ -761,7 +756,7 @@ static int handle_rx_to(struct eg20t_port *priv) > { > struct pch_uart_buffer *buf; > int rx_size; >- int ret; >+ > if (!priv->start_rx) { > pch_uart_hal_disable_interrupt(priv, PCH_UART_HAL_RX_INT | > PCH_UART_HAL_RX_ERR_INT); >@@ -770,19 +765,12 @@ static int handle_rx_to(struct eg20t_port *priv) > buf = &priv->rxbuf; > do { > rx_size = pch_uart_hal_read(priv, buf->buf, buf->size); >- ret = push_rx(priv, buf->buf, rx_size); >- if (ret) >- return 0; >+ push_rx(priv, buf->buf, rx_size); > } while (rx_size == buf->size); > > return PCH_UART_HANDLED_RX_INT; > } > >-static int handle_rx(struct eg20t_port *priv) >-{ >- return handle_rx_to(priv); >-} >- > static int dma_handle_rx(struct eg20t_port *priv) > { > struct uart_port *port = &priv->port; >@@ -1019,11 +1007,10 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) > u8 lsr; > int ret = 0; > unsigned char iid; >- unsigned long flags; > int next = 1; > u8 msr; > >- spin_lock_irqsave(&priv->lock, flags); >+ uart_port_lock(&priv->port); > handled = 0; > while (next) { > iid = pch_uart_hal_get_iid(priv); >@@ -1051,7 +1038,7 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) > PCH_UART_HAL_RX_INT | > PCH_UART_HAL_RX_ERR_INT); > } else { >- ret = handle_rx(priv); >+ ret = handle_rx_to(priv); > } > break; > case PCH_UART_IID_RDR_TO: /* Received Data Ready >@@ -1083,7 +1070,7 @@ static irqreturn_t pch_uart_interrupt(int irq, 
void *dev_id) > handled |= (unsigned int)ret; > } > >- spin_unlock_irqrestore(&priv->lock, flags); >+ uart_unlock_and_check_sysrq(&priv->port); > return IRQ_RETVAL(handled); > } > >@@ -1194,9 +1181,9 @@ static void pch_uart_break_ctl(struct uart_port *port, int ctl) > unsigned long flags; > > priv = container_of(port, struct eg20t_port, port); >- spin_lock_irqsave(&priv->lock, flags); >+ uart_port_lock_irqsave(&priv->port, &flags); > pch_uart_hal_set_break(priv, ctl); >- spin_unlock_irqrestore(&priv->lock, flags); >+ uart_port_unlock_irqrestore(&priv->port, flags); > } > > /* Grab any interrupt resources and initialise any low level driver state. */ >@@ -1346,8 +1333,7 @@ static void pch_uart_set_termios(struct uart_port *port, > > baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16); > >- spin_lock_irqsave(&priv->lock, flags); >- uart_port_lock(port); >+ uart_port_lock_irqsave(port, &flags); > > uart_update_timeout(port, termios->c_cflag, baud); > rtn = pch_uart_hal_set_line(priv, baud, parity, bits, stb); >@@ -1360,8 +1346,7 @@ static void pch_uart_set_termios(struct uart_port *port, > tty_termios_encode_baud_rate(termios, baud, baud); > > out: >- uart_port_unlock(port); >- spin_unlock_irqrestore(&priv->lock, flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static const char *pch_uart_type(struct uart_port *port) >@@ -1565,27 +1550,17 @@ pch_console_write(struct console *co, const char *s, unsigned int count) > { > struct eg20t_port *priv; > unsigned long flags; >- int priv_locked = 1; >- int port_locked = 1; >+ int locked = 1; > u8 ier; > > priv = pch_uart_ports[co->index]; > > touch_nmi_watchdog(); > >- local_irq_save(flags); >- if (priv->port.sysrq) { >- /* call to uart_handle_sysrq_char already took the priv lock */ >- priv_locked = 0; >- /* serial8250_handle_port() already took the port lock */ >- port_locked = 0; >- } else if (oops_in_progress) { >- priv_locked = spin_trylock(&priv->lock); >- port_locked = 
uart_port_trylock(&priv->port); >- } else { >- spin_lock(&priv->lock); >- uart_port_lock(&priv->port); >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&priv->port, &flags); >+ else >+ uart_port_lock_irqsave(&priv->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -1603,11 +1578,8 @@ pch_console_write(struct console *co, const char *s, unsigned int count) > wait_for_xmitr(priv, UART_LSR_BOTH_EMPTY); > iowrite8(ier, priv->membase + UART_IER); > >- if (port_locked) >- uart_port_unlock(&priv->port); >- if (priv_locked) >- spin_unlock(&priv->lock); >- local_irq_restore(flags); >+ if (locked) >+ uart_port_unlock_irqrestore(&priv->port, flags); > } > > static int __init pch_console_setup(struct console *co, char *options) >@@ -1704,8 +1676,6 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev, > pci_enable_msi(pdev); > pci_set_master(pdev); > >- spin_lock_init(&priv->lock); >- > iobase = pci_resource_start(pdev, 0); > mapbase = pci_resource_start(pdev, 1); > priv->mapbase = mapbase; >@@ -1735,8 +1705,6 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev, > KBUILD_MODNAME ":" PCH_UART_DRIVER_DEVICE "%d", > priv->port.line); > >- spin_lock_init(&priv->port.lock); >- > pci_set_drvdata(pdev, priv); > priv->trigger_level = 1; > priv->fcr = 0; >diff --git a/drivers/tty/serial/pxa.c b/drivers/tty/serial/pxa.c >index 46e70e155aab2..e395ff29c1a2c 100644 >--- a/drivers/tty/serial/pxa.c >+++ b/drivers/tty/serial/pxa.c >@@ -151,7 +151,7 @@ static inline void receive_chars(struct uart_pxa_port *up, int *status) > flag = TTY_FRAME; > } > >- if (uart_handle_sysrq_char(&up->port, ch)) >+ if (uart_prepare_sysrq_char(&up->port, ch)) > goto ignore_char; > > uart_insert_char(&up->port, *status, UART_LSR_OE, ch, flag); >@@ -232,7 +232,7 @@ static inline irqreturn_t serial_pxa_irq(int irq, void *dev_id) > check_modem_status(up); > if (lsr & UART_LSR_THRE) > transmit_chars(up); >- uart_port_unlock(&up->port); >+ 
uart_unlock_and_check_sysrq(&up->port); > return IRQ_HANDLED; > } > >@@ -604,13 +604,10 @@ serial_pxa_console_write(struct console *co, const char *s, unsigned int count) > int locked = 1; > > clk_enable(up->clk); >- local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -628,10 +625,8 @@ serial_pxa_console_write(struct console *co, const char *s, unsigned int count) > serial_out(up, UART_IER, ier); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > clk_disable(up->clk); >- > } > > #ifdef CONFIG_CONSOLE_POLL >diff --git a/drivers/tty/serial/rda-uart.c b/drivers/tty/serial/rda-uart.c >index 13deb355cf1bc..82def9b8632a5 100644 >--- a/drivers/tty/serial/rda-uart.c >+++ b/drivers/tty/serial/rda-uart.c >@@ -394,7 +394,8 @@ static void rda_uart_receive_chars(struct uart_port *port) > val &= 0xff; > > port->icount.rx++; >- tty_insert_flip_char(&port->state->port, val, flag); >+ if (!uart_prepare_sysrq_char(port, val)) >+ tty_insert_flip_char(&port->state->port, val, flag); > > status = rda_uart_read(port, RDA_UART_STATUS); > } >@@ -405,10 +406,9 @@ static void rda_uart_receive_chars(struct uart_port *port) > static irqreturn_t rda_interrupt(int irq, void *dev_id) > { > struct uart_port *port = dev_id; >- unsigned long flags; > u32 val, irq_mask; > >- uart_port_lock_irqsave(port, &flags); >+ uart_port_lock(port); > > /* Clear IRQ cause */ > val = rda_uart_read(port, RDA_UART_IRQ_CAUSE); >@@ -425,7 +425,7 @@ static irqreturn_t rda_interrupt(int irq, void *dev_id) > rda_uart_send_chars(port); > } > >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq(port); > > return 
IRQ_HANDLED; > } >@@ -590,18 +590,12 @@ static void rda_uart_port_write(struct uart_port *port, const char *s, > { > u32 old_irq_mask; > unsigned long flags; >- int locked; >+ int locked = 1; > >- local_irq_save(flags); >- >- if (port->sysrq) { >- locked = 0; >- } else if (oops_in_progress) { >- locked = uart_port_trylock(port); >- } else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > old_irq_mask = rda_uart_read(port, RDA_UART_IRQ_MASK); > rda_uart_write(port, 0, RDA_UART_IRQ_MASK); >@@ -615,9 +609,7 @@ static void rda_uart_port_write(struct uart_port *port, const char *s, > rda_uart_write(port, old_irq_mask, RDA_UART_IRQ_MASK); > > if (locked) >- uart_port_unlock(port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void rda_uart_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c >index d6a58a9e072a1..0c13ea6a3afaa 100644 >--- a/drivers/tty/serial/serial_core.c >+++ b/drivers/tty/serial/serial_core.c >@@ -3145,8 +3145,15 @@ static int serial_core_add_one_port(struct uart_driver *drv, struct uart_port *u > state->uart_port = uport; > uport->state = state; > >+ /* >+ * If this port is in use as a console then the spinlock is already >+ * initialised. 
>+ */ >+ if (!uart_console_registered(uport)) >+ uart_port_spin_lock_init(uport); >+ > state->pm_state = UART_PM_STATE_UNDEFINED; >- uport->cons = drv->cons; >+ uart_port_set_cons(uport, drv->cons); > uport->minor = drv->tty_driver->minor_start + uport->line; > uport->name = kasprintf(GFP_KERNEL, "%s%d", drv->dev_name, > drv->tty_driver->name_base + uport->line); >@@ -3155,13 +3162,6 @@ static int serial_core_add_one_port(struct uart_driver *drv, struct uart_port *u > goto out; > } > >- /* >- * If this port is in use as a console then the spinlock is already >- * initialised. >- */ >- if (!uart_console_registered(uport)) >- uart_port_spin_lock_init(uport); >- > if (uport->cons && uport->dev) > of_console_check(uport->dev->of_node, uport->cons->name, uport->line); > >diff --git a/drivers/tty/serial/sifive.c b/drivers/tty/serial/sifive.c >index a4cc569a78a25..0670fd9f84967 100644 >--- a/drivers/tty/serial/sifive.c >+++ b/drivers/tty/serial/sifive.c >@@ -412,7 +412,8 @@ static void __ssp_receive_chars(struct sifive_serial_port *ssp) > break; > > ssp->port.icount.rx++; >- uart_insert_char(&ssp->port, 0, 0, ch, TTY_NORMAL); >+ if (!uart_prepare_sysrq_char(&ssp->port, ch)) >+ uart_insert_char(&ssp->port, 0, 0, ch, TTY_NORMAL); > } > > tty_flip_buffer_push(&ssp->port.state->port); >@@ -534,7 +535,7 @@ static irqreturn_t sifive_serial_irq(int irq, void *dev_id) > if (ip & SIFIVE_SERIAL_IP_TXWM_MASK) > __ssp_transmit_chars(ssp); > >- uart_port_unlock(&ssp->port); >+ uart_unlock_and_check_sysrq(&ssp->port); > > return IRQ_HANDLED; > } >@@ -791,13 +792,10 @@ static void sifive_serial_console_write(struct console *co, const char *s, > if (!ssp) > return; > >- local_irq_save(flags); >- if (ssp->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&ssp->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&ssp->port, &flags); > else >- uart_port_lock(&ssp->port); >+ uart_port_lock_irqsave(&ssp->port, &flags); > > ier = 
__ssp_readl(ssp, SIFIVE_SERIAL_IE_OFFS); > __ssp_writel(0, SIFIVE_SERIAL_IE_OFFS, ssp); >@@ -807,8 +805,7 @@ static void sifive_serial_console_write(struct console *co, const char *s, > __ssp_writel(ier, SIFIVE_SERIAL_IE_OFFS, ssp); > > if (locked) >- uart_port_unlock(&ssp->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&ssp->port, flags); > } > > static int sifive_serial_console_setup(struct console *co, char *options) >diff --git a/drivers/tty/serial/sunplus-uart.c b/drivers/tty/serial/sunplus-uart.c >index 99f5285819d4b..f5e29eb4a4ce4 100644 >--- a/drivers/tty/serial/sunplus-uart.c >+++ b/drivers/tty/serial/sunplus-uart.c >@@ -260,7 +260,7 @@ static void receive_chars(struct uart_port *port) > if (port->ignore_status_mask & SUP_DUMMY_READ) > goto ignore_char; > >- if (uart_handle_sysrq_char(port, ch)) >+ if (uart_prepare_sysrq_char(port, ch)) > goto ignore_char; > > uart_insert_char(port, lsr, SUP_UART_LSR_OE, ch, flag); >@@ -287,7 +287,7 @@ static irqreturn_t sunplus_uart_irq(int irq, void *args) > if (isc & SUP_UART_ISC_TX) > transmit_chars(port); > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -512,22 +512,16 @@ static void sunplus_console_write(struct console *co, > unsigned long flags; > int locked = 1; > >- local_irq_save(flags); >- >- if (sunplus_console_ports[co->index]->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&sunplus_console_ports[co->index]->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&sunplus_console_ports[co->index]->port, &flags); > else >- uart_port_lock(&sunplus_console_ports[co->index]->port); >+ uart_port_lock_irqsave(&sunplus_console_ports[co->index]->port, &flags); > > uart_console_write(&sunplus_console_ports[co->index]->port, s, count, > sunplus_uart_console_putchar); > > if (locked) >- uart_port_unlock(&sunplus_console_ports[co->index]->port); >- >- local_irq_restore(flags); >+ 
uart_port_unlock_irqrestore(&sunplus_console_ports[co->index]->port, flags); > } > > static int __init sunplus_console_setup(struct console *co, char *options) >diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c >index 407b0d87b7c10..c9c914bc033c9 100644 >--- a/drivers/tty/tty_io.c >+++ b/drivers/tty/tty_io.c >@@ -3567,8 +3567,13 @@ static ssize_t show_cons_active(struct device *dev, > for_each_console(c) { > if (!c->device) > continue; >- if (!c->write) >- continue; >+ if (c->flags & CON_NBCON) { >+ if (!c->write_atomic && !c->write_thread) >+ continue; >+ } else { >+ if (!c->write) >+ continue; >+ } > if ((c->flags & CON_ENABLED) == 0) > continue; > cs[i++] = c; >diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c >index e0758fe7936dc..2703676549f5e 100644 >--- a/fs/proc/consoles.c >+++ b/fs/proc/consoles.c >@@ -21,12 +21,14 @@ static int show_console_dev(struct seq_file *m, void *v) > { CON_ENABLED, 'E' }, > { CON_CONSDEV, 'C' }, > { CON_BOOT, 'B' }, >+ { CON_NBCON, 'N' }, > { CON_PRINTBUFFER, 'p' }, > { CON_BRL, 'b' }, > { CON_ANYTIME, 'a' }, > }; > char flags[ARRAY_SIZE(con_flags) + 1]; > struct console *con = v; >+ char con_write = '-'; > unsigned int a; > dev_t dev = 0; > >@@ -57,9 +59,15 @@ static int show_console_dev(struct seq_file *m, void *v) > seq_setwidth(m, 21 - 1); > seq_printf(m, "%s%d", con->name, con->index); > seq_pad(m, ' '); >- seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', >- con->write ? 'W' : '-', con->unblank ? 'U' : '-', >- flags); >+ if (con->flags & CON_NBCON) { >+ if (con->write_atomic || con->write_thread) >+ con_write = 'W'; >+ } else { >+ if (con->write) >+ con_write = 'W'; >+ } >+ seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con_write, >+ con->unblank ? 
'U' : '-', flags); > if (dev) > seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); > >diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h >index fc53e0ad56d90..448bbef474564 100644 >--- a/include/linux/bottom_half.h >+++ b/include/linux/bottom_half.h >@@ -35,8 +35,10 @@ static inline void local_bh_enable(void) > > #ifdef CONFIG_PREEMPT_RT > extern bool local_bh_blocked(void); >+extern void softirq_preempt(void); > #else > static inline bool local_bh_blocked(void) { return false; } >+static inline void softirq_preempt(void) { } > #endif > > #endif /* _LINUX_BH_H */ >diff --git a/include/linux/console.h b/include/linux/console.h >index 779d388af8a0a..1eb2d1e58b1c7 100644 >--- a/include/linux/console.h >+++ b/include/linux/console.h >@@ -16,7 +16,9 @@ > > #include <linux/atomic.h> > #include <linux/bits.h> >+#include <linux/irq_work.h> > #include <linux/rculist.h> >+#include <linux/rcuwait.h> > #include <linux/types.h> > > struct vc_data; >@@ -137,7 +139,7 @@ static inline int con_debug_leave(void) > */ > > /** >- * cons_flags - General console flags >+ * enum cons_flags - General console flags > * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate > * output of messages that were already shown by boot > * consoles or read by userspace via syslog() syscall. >@@ -218,7 +220,7 @@ struct nbcon_state { > static_assert(sizeof(struct nbcon_state) <= sizeof(int)); > > /** >- * nbcon_prio - console owner priority for nbcon consoles >+ * enum nbcon_prio - console owner priority for nbcon consoles > * @NBCON_PRIO_NONE: Unused > * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage > * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) 
>@@ -282,10 +284,29 @@ struct nbcon_write_context { > bool unsafe_takeover; > }; > >+/** >+ * struct nbcon_drvdata - Data to allow nbcon acquire in non-print context >+ * @ctxt: The core console context >+ * @srcu_cookie: Storage for a console_srcu_lock cookie, if needed >+ * @owner_index: Storage for the owning console index, if needed >+ * @locked: Storage for the locked state, if needed >+ * >+ * All fields (except for @ctxt) are available exclusively to the driver to >+ * use as needed. They are not used by the printk subsystem. >+ */ >+struct nbcon_drvdata { >+ struct nbcon_context __private ctxt; >+ >+ /* reserved for driver use */ >+ int srcu_cookie; >+ short owner_index; >+ bool locked; >+}; >+ > /** > * struct console - The console descriptor structure > * @name: The name of the console driver >- * @write: Write callback to output messages (Optional) >+ * @write: Legacy write callback to output messages (Optional) > * @read: Read callback for console input (Optional) > * @device: The underlying TTY device driver (Optional) > * @unblank: Callback to unblank the console (Optional) >@@ -302,10 +323,13 @@ struct nbcon_write_context { > * @data: Driver private data > * @node: hlist node for the console list > * >- * @write_atomic: Write callback for atomic context > * @nbcon_state: State for nbcon consoles > * @nbcon_seq: Sequence number of the next record for nbcon to print >+ * @nbcon_prev_seq: Seq num the previous nbcon owner was assigned to print > * @pbufs: Pointer to nbcon private buffer >+ * @kthread: Printer kthread for this console >+ * @rcuwait: RCU-safe wait object for @kthread waking >+ * @irq_work: Defer @kthread waking to IRQ work context > */ > struct console { > char name[16]; >@@ -327,11 +351,122 @@ struct console { > struct hlist_node node; > > /* nbcon console specific members */ >- bool (*write_atomic)(struct console *con, >- struct nbcon_write_context *wctxt); >+ >+ /** >+ * @write_atomic: >+ * >+ * NBCON callback to write out text in any 
context. >+ * >+ * This callback is called with the console already acquired. The >+ * callback can use nbcon_can_proceed() at any time to verify that >+ * it is still the owner of the console. In the case that it has >+ * lost ownership, it is no longer allowed to go forward. In this >+ * case it must back out immediately and carefully. The buffer >+ * content is also no longer trusted since it no longer belongs to >+ * the context. >+ * >+ * If the callback needs to perform actions where ownership is not >+ * allowed to be taken over, nbcon_enter_unsafe() and >+ * nbcon_exit_unsafe() can be used to mark such sections. These >+ * functions are also points of possible ownership transfer. If >+ * either function returns false, ownership has been lost. >+ * >+ * If the driver must reacquire ownership in order to finalize or >+ * revert hardware changes, nbcon_reacquire() can be used. However, >+ * on reacquire the buffer content is no longer available. A >+ * reacquire cannot be used to resume printing. >+ * >+ * This callback can be called from any context (including NMI). >+ * Therefore it must avoid usage of any locking and instead rely >+ * on the console ownership for synchronization. >+ */ >+ void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); >+ >+ /** >+ * @write_thread: >+ * >+ * NBCON callback to write out text in task context. (Optional) >+ * >+ * This callback is called with the console already acquired. Any >+ * additional driver synchronization should have been performed by >+ * device_lock(). >+ * >+ * This callback is always called from task context but with migration >+ * disabled. >+ * >+ * The same criteria for console ownership verification and unsafe >+ * sections applies as with write_atomic(). The difference between >+ * this callback and write_atomic() is that this callback is used >+ * during normal operation and is always called from task context. 
>+ * This provides drivers with a relatively relaxed locking context >+ * for synchronizing output to the hardware. >+ */ >+ void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); >+ >+ /** >+ * @device_lock: >+ * >+ * NBCON callback to begin synchronization with driver code. >+ * >+ * Console drivers typically must deal with access to the hardware >+ * via user input/output (such as an interactive login shell) and >+ * output of kernel messages via printk() calls. This callback is >+ * called by the printk-subsystem whenever it needs to synchronize >+ * with hardware access by the driver. It should be implemented to >+ * use whatever synchronization mechanism the driver is using for >+ * itself (for example, the port lock for uart serial consoles). >+ * >+ * This callback is always called from task context. It may use any >+ * synchronization method required by the driver. BUT this callback >+ * MUST also disable migration. The console driver may be using a >+ * synchronization mechanism that already takes care of this (such as >+ * spinlocks). Otherwise this function must explicitly call >+ * migrate_disable(). >+ * >+ * The flags argument is provided as a convenience to the driver. It >+ * will be passed again to device_unlock(). It can be ignored if the >+ * driver does not need it. >+ */ >+ void (*device_lock)(struct console *con, unsigned long *flags); >+ >+ /** >+ * @device_unlock: >+ * >+ * NBCON callback to finish synchronization with driver code. >+ * >+ * It is the counterpart to device_lock(). >+ * >+ * This callback is always called from task context. It must >+ * appropriately re-enable migration (depending on how device_lock() >+ * disabled migration). >+ * >+ * The flags argument is the value of the same variable that was >+ * passed to device_lock(). 
>+ */ >+ void (*device_unlock)(struct console *con, unsigned long flags); >+ > atomic_t __private nbcon_state; > atomic_long_t __private nbcon_seq; >+ atomic_long_t __private nbcon_prev_seq; >+ >+ /** >+ * @nbcon_drvdata: >+ * >+ * Data for nbcon ownership tracking to allow acquiring nbcon consoles >+ * in non-printing contexts. >+ * >+ * Drivers may need to acquire nbcon consoles in non-printing >+ * contexts. This is achieved by providing a struct nbcon_drvdata. >+ * Then the driver can call nbcon_driver_acquire() and >+ * nbcon_driver_release(). The struct does not require any special >+ * initialization. >+ */ >+ struct nbcon_drvdata *nbcon_drvdata; >+ > struct printk_buffers *pbufs; >+ struct task_struct *kthread; >+ struct rcuwait rcuwait; >+ struct irq_work irq_work; > }; > > #ifdef CONFIG_LOCKDEP >@@ -360,28 +495,29 @@ extern void console_list_unlock(void) __releases(console_mutex); > extern struct hlist_head console_list; > > /** >- * console_srcu_read_flags - Locklessly read the console flags >+ * console_srcu_read_flags - Locklessly read flags of a possibly registered >+ * console > * @con: struct console pointer of console to read flags from > * >- * This function provides the necessary READ_ONCE() and data_race() >- * notation for locklessly reading the console flags. The READ_ONCE() >- * in this function matches the WRITE_ONCE() when @flags are modified >- * for registered consoles with console_srcu_write_flags(). >+ * Locklessly reading @con->flags provides a consistent read value because >+ * there is at most one CPU modifying @con->flags and that CPU is using only >+ * read-modify-write operations to do so. > * >- * Only use this function to read console flags when locklessly >- * iterating the console list via srcu. >+ * Requires console_srcu_read_lock to be held, which implies that @con might >+ * be a registered console. 
If the caller is holding the console_list_lock or >+ * it is certain that the console is not registered, the caller may read >+ * @con->flags directly instead. > * > * Context: Any context. >+ * Return: The current value of the @con->flags field. > */ > static inline short console_srcu_read_flags(const struct console *con) > { > WARN_ON_ONCE(!console_srcu_read_lock_is_held()); > > /* >- * Locklessly reading console->flags provides a consistent >- * read value because there is at most one CPU modifying >- * console->flags and that CPU is using only read-modify-write >- * operations to do so. >+ * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified >+ * for registered consoles with console_srcu_write_flags(). > */ > return data_race(READ_ONCE(con->flags)); > } >@@ -459,13 +595,19 @@ static inline bool console_is_registered(const struct console *con) > hlist_for_each_entry(con, &console_list, node) > > #ifdef CONFIG_PRINTK >+extern void nbcon_cpu_emergency_enter(void); >+extern void nbcon_cpu_emergency_exit(void); > extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); > extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); > extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); >+extern void nbcon_reacquire(struct nbcon_write_context *wctxt); > #else >+static inline void nbcon_cpu_emergency_enter(void) { } >+static inline void nbcon_cpu_emergency_exit(void) { } > static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } > static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } > static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } >+static inline void nbcon_reacquire(struct nbcon_write_context *wctxt) { } > #endif > > extern int console_set_on_cmdline; >diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h >index b0fb775a600d9..f5bb19369973a 100644 >--- a/include/linux/entry-common.h >+++ 
b/include/linux/entry-common.h >@@ -65,7 +65,7 @@ > #define EXIT_TO_USER_MODE_WORK \ > (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ > _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ >- ARCH_EXIT_TO_USER_MODE_WORK) >+ _TIF_NEED_RESCHED_LAZY | ARCH_EXIT_TO_USER_MODE_WORK) > > /** > * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs >diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h >index 6813171afccb2..674a622c91be2 100644 >--- a/include/linux/entry-kvm.h >+++ b/include/linux/entry-kvm.h >@@ -18,7 +18,7 @@ > > #define XFER_TO_GUEST_MODE_WORK \ > (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ >- _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) >+ _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED_LAZY | ARCH_XFER_TO_GUEST_MODE_WORK) > > struct kvm_vcpu; > >diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h >index 76121c2bb4f82..f75f6bc195d18 100644 >--- a/include/linux/interrupt.h >+++ b/include/linux/interrupt.h >@@ -609,6 +609,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq(unsigned int nr); > >+#ifdef CONFIG_PREEMPT_RT >+DECLARE_PER_CPU(struct task_struct *, timersd); >+DECLARE_PER_CPU(unsigned long, pending_timer_softirq); >+ >+extern void raise_timer_softirq(void); >+extern void raise_hrtimer_softirq(void); >+ >+static inline unsigned int local_pending_timers(void) >+{ >+ return __this_cpu_read(pending_timer_softirq); >+} >+ >+#else >+static inline void raise_timer_softirq(void) >+{ >+ raise_softirq(TIMER_SOFTIRQ); >+} >+ >+static inline void raise_hrtimer_softirq(void) >+{ >+ raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+} >+ >+static inline unsigned int local_pending_timers(void) >+{ >+ return local_softirq_pending(); >+} >+#endif >+ > DECLARE_PER_CPU(struct task_struct *, ksoftirqd); > > static inline struct task_struct *this_cpu_ksoftirqd(void) >diff --git 
a/include/linux/netdevice.h b/include/linux/netdevice.h >index dba428b3a87a5..0db375f9c339b 100644 >--- a/include/linux/netdevice.h >+++ b/include/linux/netdevice.h >@@ -3365,6 +3365,7 @@ static inline void dev_xmit_recursion_dec(void) > __this_cpu_dec(softnet_data.xmit.recursion); > } > >+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); > void __netif_schedule(struct Qdisc *q); > void netif_schedule_queue(struct netdev_queue *txq); > >diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h >index d2a15c0c6f8a9..c1c6600541657 100644 >--- a/include/linux/perf_event.h >+++ b/include/linux/perf_event.h >@@ -781,9 +781,9 @@ struct perf_event { > unsigned int pending_wakeup; > unsigned int pending_kill; > unsigned int pending_disable; >- unsigned int pending_sigtrap; > unsigned long pending_addr; /* SIGTRAP */ > struct irq_work pending_irq; >+ struct irq_work pending_disable_irq; > struct callback_head pending_task; > unsigned int pending_work; > >@@ -959,7 +959,7 @@ struct perf_event_context { > struct rcu_head rcu_head; > > /* >- * Sum (event->pending_sigtrap + event->pending_work) >+ * Sum (event->pending_work + event->pending_work) > * > * The SIGTRAP is targeted at ctx->task, as such it won't do changing > * that until the signal is delivered. 
>diff --git a/include/linux/printk.h b/include/linux/printk.h >index 8ef499ab3c1ed..f2074b458d801 100644 >--- a/include/linux/printk.h >+++ b/include/linux/printk.h >@@ -9,6 +9,8 @@ > #include <linux/ratelimit_types.h> > #include <linux/once_lite.h> > >+struct console; >+ > extern const char linux_banner[]; > extern const char linux_proc_banner[]; > >@@ -157,15 +159,16 @@ int _printk(const char *fmt, ...); > */ > __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); > >-extern void __printk_safe_enter(void); >-extern void __printk_safe_exit(void); >+extern void __printk_deferred_enter(void); >+extern void __printk_deferred_exit(void); >+ > /* > * The printk_deferred_enter/exit macros are available only as a hack for > * some code paths that need to defer all printk console printing. Interrupts > * must be disabled for the deferred duration. > */ >-#define printk_deferred_enter __printk_safe_enter >-#define printk_deferred_exit __printk_safe_exit >+#define printk_deferred_enter() __printk_deferred_enter() >+#define printk_deferred_exit() __printk_deferred_exit() > > /* > * Please don't use printk_ratelimit(), because it shares ratelimiting state >@@ -192,6 +195,10 @@ void show_regs_print_info(const char *log_lvl); > extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; > extern asmlinkage void dump_stack(void) __cold; > void printk_trigger_flush(void); >+void printk_legacy_allow_panic_sync(void); >+extern void nbcon_driver_acquire(struct console *con); >+extern void nbcon_driver_release(struct console *con); >+void nbcon_atomic_flush_unsafe(void); > #else > static inline __printf(1, 0) > int vprintk(const char *s, va_list args) >@@ -271,6 +278,23 @@ > static inline void printk_trigger_flush(void) > { > } >+ >+static inline void printk_legacy_allow_panic_sync(void) >+{ >+} >+ >+static inline void nbcon_driver_acquire(struct console *con) >+{ >+} >+ >+static inline void nbcon_driver_release(struct console *con) >+{ >+} >+ >+static inline void 
nbcon_atomic_flush_unsafe(void) >+{ >+} >+ > #endif > > bool this_cpu_in_panic(void); >diff --git a/include/linux/sched.h b/include/linux/sched.h >index ffe8f618ab869..cb4df5d70e3d0 100644 >--- a/include/linux/sched.h >+++ b/include/linux/sched.h >@@ -1791,6 +1791,7 @@ static inline int dl_task_check_affinity(struct task_struct *p, const struct cpu > } > #endif > >+extern bool task_is_pi_boosted(const struct task_struct *p); > extern int yield_to(struct task_struct *p, bool preempt); > extern void set_user_nice(struct task_struct *p, long nice); > extern int task_prio(const struct task_struct *p); >@@ -1933,17 +1934,17 @@ static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, > update_ti_thread_flag(task_thread_info(tsk), flag, value); > } > >-static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); > } > >-static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); > } > >-static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_ti_thread_flag(task_thread_info(tsk), flag); > } >@@ -1956,9 +1957,11 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) > static inline void clear_tsk_need_resched(struct task_struct *tsk) > { > clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); >+ if (IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO)) >+ clear_tsk_thread_flag(tsk, TIF_NEED_RESCHED_LAZY); > } > >-static inline int test_tsk_need_resched(struct task_struct *tsk) >+static inline bool test_tsk_need_resched(struct task_struct *tsk) > { > return 
unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); > } >@@ -2099,7 +2102,7 @@ static inline bool preempt_model_preemptible(void) > > static __always_inline bool need_resched(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(tif_need_resched_lazy() || tif_need_resched()); > } > > /* >diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h >index 478084f9105e1..719416fe8ddc0 100644 >--- a/include/linux/sched/idle.h >+++ b/include/linux/sched/idle.h >@@ -63,7 +63,7 @@ static __always_inline bool __must_check current_set_polling_and_test(void) > */ > smp_mb__after_atomic(); > >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > > static __always_inline bool __must_check current_clr_polling_and_test(void) >@@ -76,7 +76,7 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) > */ > smp_mb__after_atomic(); > >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > > #else >@@ -85,11 +85,11 @@ static inline void __current_clr_polling(void) { } > > static inline bool __must_check current_set_polling_and_test(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > static inline bool __must_check current_clr_polling_and_test(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > #endif > >diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h >index be65de65fe612..ff445a5fca281 100644 >--- a/include/linux/serial_8250.h >+++ b/include/linux/serial_8250.h >@@ -153,6 +153,8 @@ struct uart_8250_port { > #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA > unsigned char msr_saved_flags; > >+ bool console_newline_needed; >+ > struct uart_8250_dma *dma; > const struct uart_8250_ops *ops; > >@@ -204,6 +206,10 @@ void serial8250_init_port(struct uart_8250_port *up); > void serial8250_set_defaults(struct uart_8250_port *up); > void serial8250_console_write(struct uart_8250_port *up, const char 
*s, > unsigned int count); >+void serial8250_console_write_atomic(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt); >+void serial8250_console_write_thread(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt); > int serial8250_console_setup(struct uart_port *port, char *options, bool probe); > int serial8250_console_exit(struct uart_port *port); > >diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h >index 55b1f3ba48ac1..9a73dee32ad9a 100644 >--- a/include/linux/serial_core.h >+++ b/include/linux/serial_core.h >@@ -8,10 +8,13 @@ > #define LINUX_SERIAL_CORE_H > > #include <linux/bitops.h> >+#include <linux/bug.h> > #include <linux/compiler.h> > #include <linux/console.h> > #include <linux/interrupt.h> > #include <linux/circ_buf.h> >+#include <linux/lockdep.h> >+#include <linux/printk.h> > #include <linux/spinlock.h> > #include <linux/sched.h> > #include <linux/tty.h> >@@ -588,6 +591,101 @@ struct uart_port { > void *private_data; /* generic platform data pointer */ > }; > >+/* >+ * Only for console->device_lock()/_unlock() callbacks and internal >+ * port lock wrapper synchronization. >+ */ >+static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) >+{ >+ spin_lock_irqsave(&up->lock, *flags); >+} >+ >+/* >+ * Only for console->device_lock()/_unlock() callbacks and internal >+ * port lock wrapper synchronization. >+ */ >+static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) >+{ >+ spin_unlock_irqrestore(&up->lock, flags); >+} >+ >+/** >+ * uart_port_set_cons - Safely set the @cons field for a uart >+ * @up: The uart port to set >+ * @con: The new console to set to >+ * >+ * This function must be used to set @up->cons. It uses the port lock to >+ * synchronize with the port lock wrappers in order to ensure that the console >+ * cannot change or disappear while another context is holding the port lock. 
>+ */ >+static inline void uart_port_set_cons(struct uart_port *up, struct console *con) >+{ >+ unsigned long flags; >+ >+ __uart_port_lock_irqsave(up, &flags); >+ up->cons = con; >+ __uart_port_unlock_irqrestore(up, flags); >+} >+ >+/* Only for internal port lock wrapper usage. */ >+static inline void __uart_port_nbcon_acquire(struct uart_port *up) >+{ >+ lockdep_assert_held_once(&up->lock); >+ >+ if (likely(!uart_console(up))) >+ return; >+ >+ if (up->cons->nbcon_drvdata) { >+ /* >+ * If @up->cons is registered, prevent it from fully >+ * unregistering until this context releases the nbcon. >+ */ >+ int cookie = console_srcu_read_lock(); >+ >+ /* Ensure console is registered and is an nbcon console. */ >+ if (!hlist_unhashed_lockless(&up->cons->node) && >+ (console_srcu_read_flags(up->cons) & CON_NBCON)) { >+ WARN_ON_ONCE(up->cons->nbcon_drvdata->locked); >+ >+ nbcon_driver_acquire(up->cons); >+ >+ /* >+ * Record @up->line to be used during release because >+ * @up->cons->index can change while the port and >+ * nbcon are locked. >+ */ >+ up->cons->nbcon_drvdata->owner_index = up->line; >+ up->cons->nbcon_drvdata->srcu_cookie = cookie; >+ up->cons->nbcon_drvdata->locked = true; >+ } else { >+ console_srcu_read_unlock(cookie); >+ } >+ } >+} >+ >+/* Only for internal port lock wrapper usage. */ >+static inline void __uart_port_nbcon_release(struct uart_port *up) >+{ >+ lockdep_assert_held_once(&up->lock); >+ >+ /* >+ * uart_console() cannot be used here because @up->cons->index might >+ * have changed. Check against @up->cons->nbcon_drvdata->owner_index >+ * instead. 
>+ */ >+ >+ if (unlikely(up->cons && >+ up->cons->nbcon_drvdata && >+ up->cons->nbcon_drvdata->locked && >+ up->cons->nbcon_drvdata->owner_index == up->line)) { >+ WARN_ON_ONCE(!up->cons->nbcon_drvdata->locked); >+ >+ up->cons->nbcon_drvdata->locked = false; >+ nbcon_driver_release(up->cons); >+ console_srcu_read_unlock(up->cons->nbcon_drvdata->srcu_cookie); >+ } >+} >+ > /** > * uart_port_lock - Lock the UART port > * @up: Pointer to UART port structure >@@ -595,6 +693,7 @@ struct uart_port { > static inline void uart_port_lock(struct uart_port *up) > { > spin_lock(&up->lock); >+ __uart_port_nbcon_acquire(up); > } > > /** >@@ -604,6 +703,7 @@ static inline void uart_port_lock(struct uart_port *up) > static inline void uart_port_lock_irq(struct uart_port *up) > { > spin_lock_irq(&up->lock); >+ __uart_port_nbcon_acquire(up); > } > > /** >@@ -614,6 +714,7 @@ static inline void uart_port_lock_irq(struct uart_port *up) > static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) > { > spin_lock_irqsave(&up->lock, *flags); >+ __uart_port_nbcon_acquire(up); > } > > /** >@@ -624,7 +725,11 @@ static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *f > */ > static inline bool uart_port_trylock(struct uart_port *up) > { >- return spin_trylock(&up->lock); >+ if (!spin_trylock(&up->lock)) >+ return false; >+ >+ __uart_port_nbcon_acquire(up); >+ return true; > } > > /** >@@ -636,7 +741,11 @@ static inline bool uart_port_trylock(struct uart_port *up) > */ > static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags) > { >- return spin_trylock_irqsave(&up->lock, *flags); >+ if (!spin_trylock_irqsave(&up->lock, *flags)) >+ return false; >+ >+ __uart_port_nbcon_acquire(up); >+ return true; > } > > /** >@@ -645,6 +754,7 @@ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long > */ > static inline void uart_port_unlock(struct uart_port *up) > { >+ __uart_port_nbcon_release(up); > 
spin_unlock(&up->lock); > } > >@@ -654,6 +764,7 @@ static inline void uart_port_unlock(struct uart_port *up) > */ > static inline void uart_port_unlock_irq(struct uart_port *up) > { >+ __uart_port_nbcon_release(up); > spin_unlock_irq(&up->lock); > } > >@@ -664,6 +775,7 @@ static inline void uart_port_unlock_irq(struct uart_port *up) > */ > static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) > { >+ __uart_port_nbcon_release(up); > spin_unlock_irqrestore(&up->lock, flags); > } > >diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h >index 9ea0b28068f49..5ded1450ac1a1 100644 >--- a/include/linux/thread_info.h >+++ b/include/linux/thread_info.h >@@ -59,6 +59,16 @@ enum syscall_work_bit { > > #include <asm/thread_info.h> > >+#ifdef CONFIG_PREEMPT_BUILD_AUTO >+# define TIF_NEED_RESCHED_LAZY TIF_ARCH_RESCHED_LAZY >+# define _TIF_NEED_RESCHED_LAZY _TIF_ARCH_RESCHED_LAZY >+# define TIF_NEED_RESCHED_LAZY_OFFSET (TIF_NEED_RESCHED_LAZY - TIF_NEED_RESCHED) >+#else >+# define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED >+# define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED >+# define TIF_NEED_RESCHED_LAZY_OFFSET 0 >+#endif >+ > #ifdef __KERNEL__ > > #ifndef arch_set_restart_data >@@ -185,6 +195,13 @@ static __always_inline bool tif_need_resched(void) > (unsigned long *)(¤t_thread_info()->flags)); > } > >+static __always_inline bool tif_need_resched_lazy(void) >+{ >+ return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && >+ arch_test_bit(TIF_NEED_RESCHED_LAZY, >+ (unsigned long *)(¤t_thread_info()->flags)); >+} >+ > #else > > static __always_inline bool tif_need_resched(void) >@@ -193,6 +210,13 @@ static __always_inline bool tif_need_resched(void) > (unsigned long *)(¤t_thread_info()->flags)); > } > >+static __always_inline bool tif_need_resched_lazy(void) >+{ >+ return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && >+ test_bit(TIF_NEED_RESCHED_LAZY, >+ (unsigned long *)(¤t_thread_info()->flags)); >+} >+ > #endif /* 
_ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ > > #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES >diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h >index d68ff9b1247f9..0681b3d5a85c6 100644 >--- a/include/linux/trace_events.h >+++ b/include/linux/trace_events.h >@@ -178,8 +178,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); > > enum trace_flag_type { > TRACE_FLAG_IRQS_OFF = 0x01, >- TRACE_FLAG_IRQS_NOSUPPORT = 0x02, >- TRACE_FLAG_NEED_RESCHED = 0x04, >+ TRACE_FLAG_NEED_RESCHED = 0x02, >+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x04, > TRACE_FLAG_HARDIRQ = 0x08, > TRACE_FLAG_SOFTIRQ = 0x10, > TRACE_FLAG_PREEMPT_RESCHED = 0x20, >@@ -205,11 +205,11 @@ static inline unsigned int tracing_gen_ctx(void) > > static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) > { >- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); >+ return tracing_gen_ctx_irq_test(0); > } > static inline unsigned int tracing_gen_ctx(void) > { >- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); >+ return tracing_gen_ctx_irq_test(0); > } > #endif > >diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt >index c2f1fd95a8214..0f3d4c2a41cb7 100644 >--- a/kernel/Kconfig.preempt >+++ b/kernel/Kconfig.preempt >@@ -11,6 +11,13 @@ config PREEMPT_BUILD > select PREEMPTION > select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK > >+config PREEMPT_BUILD_AUTO >+ bool >+ select PREEMPT_BUILD >+ >+config HAVE_PREEMPT_AUTO >+ bool >+ > choice > prompt "Preemption Model" > default PREEMPT_NONE >@@ -67,9 +74,17 @@ config PREEMPT > embedded system with latency requirements in the milliseconds > range. 
> >+config PREEMPT_AUTO >+ bool "Automagic preemption mode with runtime tweaking support" >+ depends on HAVE_PREEMPT_AUTO >+ select PREEMPT_BUILD_AUTO >+ help >+ Add some sensible blurb here >+ > config PREEMPT_RT > bool "Fully Preemptible Kernel (Real-Time)" > depends on EXPERT && ARCH_SUPPORTS_RT >+ select PREEMPT_BUILD_AUTO if HAVE_PREEMPT_AUTO > select PREEMPTION > help > This option turns the kernel into a real-time kernel by replacing >@@ -95,7 +110,7 @@ config PREEMPTION > > config PREEMPT_DYNAMIC > bool "Preemption behaviour defined on boot" >- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT >+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT && !PREEMPT_AUTO > select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY > select PREEMPT_BUILD > default y if HAVE_PREEMPT_DYNAMIC_CALL >diff --git a/kernel/entry/common.c b/kernel/entry/common.c >index 88cb3c88aaa5c..d78b109750a3c 100644 >--- a/kernel/entry/common.c >+++ b/kernel/entry/common.c >@@ -92,7 +92,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, > > local_irq_enable_exit_to_user(ti_work); > >- if (ti_work & _TIF_NEED_RESCHED) >+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > schedule(); > > if (ti_work & _TIF_UPROBE) >@@ -301,7 +301,7 @@ void raw_irqentry_exit_cond_resched(void) > rcu_irq_exit_check_preempt(); > if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) > WARN_ON_ONCE(!on_thread_stack()); >- if (need_resched()) >+ if (test_tsk_need_resched(current)) > preempt_schedule_irq(); > } > } >diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c >index 2e0f75bcb7fd1..d952fa5ee8801 100644 >--- a/kernel/entry/kvm.c >+++ b/kernel/entry/kvm.c >@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) > return -EINTR; > } > >- if (ti_work & _TIF_NEED_RESCHED) >+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > schedule(); > > if (ti_work & _TIF_NOTIFY_RESUME) >diff --git a/kernel/events/core.c b/kernel/events/core.c >index 
f0f0f71213a1d..d5af4d03c2680 100644 >--- a/kernel/events/core.c >+++ b/kernel/events/core.c >@@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) > state = PERF_EVENT_STATE_OFF; > } > >- if (event->pending_sigtrap) { >- bool dec = true; >- >- event->pending_sigtrap = 0; >- if (state != PERF_EVENT_STATE_OFF && >- !event->pending_work) { >- event->pending_work = 1; >- dec = false; >- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); >- task_work_add(current, &event->pending_task, TWA_RESUME); >- } >- if (dec) >- local_dec(&event->ctx->nr_pending); >- } >- > perf_event_set_state(event, state); > > if (!is_software_event(event)) >@@ -2464,7 +2449,7 @@ static void __perf_event_disable(struct perf_event *event, > * hold the top-level event's child_mutex, so any descendant that > * goes to exit will block in perf_event_exit_event(). > * >- * When called from perf_pending_irq it's OK because event->ctx >+ * When called from perf_pending_disable it's OK because event->ctx > * is the current context on this CPU and preemption is disabled, > * hence we can't get into perf_event_task_sched_out for this context. > */ >@@ -2504,7 +2489,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable); > void perf_event_disable_inatomic(struct perf_event *event) > { > event->pending_disable = 1; >- irq_work_queue(&event->pending_irq); >+ irq_work_queue(&event->pending_disable_irq); > } > > #define MAX_INTERRUPTS (~0ULL) >@@ -5190,6 +5175,7 @@ static void perf_addr_filters_splice(struct perf_event *event, > static void _free_event(struct perf_event *event) > { > irq_work_sync(&event->pending_irq); >+ irq_work_sync(&event->pending_disable_irq); > > unaccount_event(event); > >@@ -6726,7 +6712,7 @@ static void perf_sigtrap(struct perf_event *event) > /* > * Deliver the pending work in-event-context or follow the context. 
> */ >-static void __perf_pending_irq(struct perf_event *event) >+static void __perf_pending_disable(struct perf_event *event) > { > int cpu = READ_ONCE(event->oncpu); > >@@ -6741,11 +6727,6 @@ static void __perf_pending_irq(struct perf_event *event) > * Yay, we hit home and are in the context of the event. > */ > if (cpu == smp_processor_id()) { >- if (event->pending_sigtrap) { >- event->pending_sigtrap = 0; >- perf_sigtrap(event); >- local_dec(&event->ctx->nr_pending); >- } > if (event->pending_disable) { > event->pending_disable = 0; > perf_event_disable_local(event); >@@ -6769,11 +6750,26 @@ static void __perf_pending_irq(struct perf_event *event) > * irq_work_queue(); // FAILS > * > * irq_work_run() >- * perf_pending_irq() >+ * perf_pending_disable() > * > * But the event runs on CPU-B and wants disabling there. > */ >- irq_work_queue_on(&event->pending_irq, cpu); >+ irq_work_queue_on(&event->pending_disable_irq, cpu); >+} >+ >+static void perf_pending_disable(struct irq_work *entry) >+{ >+ struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); >+ int rctx; >+ >+ /* >+ * If we 'fail' here, that's OK, it means recursion is already disabled >+ * and we won't recurse 'further'. 
>+ */ >+ rctx = perf_swevent_get_recursion_context(); >+ __perf_pending_disable(event); >+ if (rctx >= 0) >+ perf_swevent_put_recursion_context(rctx); > } > > static void perf_pending_irq(struct irq_work *entry) >@@ -6796,8 +6792,6 @@ static void perf_pending_irq(struct irq_work *entry) > perf_event_wakeup(event); > } > >- __perf_pending_irq(event); >- > if (rctx >= 0) > perf_swevent_put_recursion_context(rctx); > } >@@ -6805,14 +6799,6 @@ static void perf_pending_irq(struct irq_work *entry) > static void perf_pending_task(struct callback_head *head) > { > struct perf_event *event = container_of(head, struct perf_event, pending_task); >- int rctx; >- >- /* >- * If we 'fail' here, that's OK, it means recursion is already disabled >- * and we won't recurse 'further'. >- */ >- preempt_disable_notrace(); >- rctx = perf_swevent_get_recursion_context(); > > if (event->pending_work) { > event->pending_work = 0; >@@ -6820,10 +6806,6 @@ static void perf_pending_task(struct callback_head *head) > local_dec(&event->ctx->nr_pending); > } > >- if (rctx >= 0) >- perf_swevent_put_recursion_context(rctx); >- preempt_enable_notrace(); >- > put_event(event); > } > >@@ -9592,13 +9574,23 @@ static int __perf_event_overflow(struct perf_event *event, > > if (regs) > pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; >- if (!event->pending_sigtrap) { >- event->pending_sigtrap = pending_id; >+ if (!event->pending_work) { >+ event->pending_work = pending_id; > local_inc(&event->ctx->nr_pending); >+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); >+ task_work_add(current, &event->pending_task, TWA_RESUME); >+ /* >+ * The NMI path returns directly to userland. The >+ * irq_work is raised as a dummy interrupt to ensure >+ * regular return path to user is taken and task_work >+ * is processed. 
>+ */ >+ if (in_nmi()) >+ irq_work_queue(&event->pending_disable_irq); > } else if (event->attr.exclude_kernel && valid_sample) { > /* > * Should not be able to return to user space without >- * consuming pending_sigtrap; with exceptions: >+ * consuming pending_work; with exceptions: > * > * 1. Where !exclude_kernel, events can overflow again > * in the kernel without returning to user space. >@@ -9608,13 +9600,12 @@ static int __perf_event_overflow(struct perf_event *event, > * To approximate progress (with false negatives), > * check 32-bit hash of the current IP. > */ >- WARN_ON_ONCE(event->pending_sigtrap != pending_id); >+ WARN_ON_ONCE(event->pending_work != pending_id); > } > > event->pending_addr = 0; > if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) > event->pending_addr = data->addr; >- irq_work_queue(&event->pending_irq); > } > > READ_ONCE(event->overflow_handler)(event, data, regs); >@@ -11935,6 +11926,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, > > init_waitqueue_head(&event->waitq); > init_irq_work(&event->pending_irq, perf_pending_irq); >+ event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); > init_task_work(&event->pending_task, perf_pending_task); > > mutex_init(&event->mmap_mutex); >@@ -13049,6 +13041,13 @@ static void sync_child_event(struct perf_event *child_event) > &parent_event->child_total_time_running); > } > >+static bool task_work_cb_match(struct callback_head *cb, void *data) >+{ >+ struct perf_event *event = container_of(cb, struct perf_event, pending_task); >+ >+ return event == data; >+} >+ > static void > perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) > { >@@ -13088,6 +13087,17 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) > * Kick perf_poll() for is_event_hup(); > */ > perf_event_wakeup(parent_event); >+ /* >+ * Cancel pending task_work and update counters if it has not >+ * yet been delivered to userland. 
free_event() expects the >+ * reference counter at 1 and keeping the event around until the >+ * task return to userland will be a unexpected. >+ */ >+ if (event->pending_work && >+ task_work_cancel_match(current, task_work_cb_match, event)) { >+ put_event(event); >+ local_dec(&event->ctx->nr_pending); >+ } > free_event(event); > put_event(parent_event); > return; >diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c >index 1d4bc493b2f4b..486c68c11bbe2 100644 >--- a/kernel/ksysfs.c >+++ b/kernel/ksysfs.c >@@ -179,6 +179,15 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size); > > #endif /* CONFIG_CRASH_CORE */ > >+#if defined(CONFIG_PREEMPT_RT) >+static ssize_t realtime_show(struct kobject *kobj, >+ struct kobj_attribute *attr, char *buf) >+{ >+ return sprintf(buf, "%d\n", 1); >+} >+KERNEL_ATTR_RO(realtime); >+#endif >+ > /* whether file capabilities are enabled */ > static ssize_t fscaps_show(struct kobject *kobj, > struct kobj_attribute *attr, char *buf) >@@ -274,6 +283,9 @@ static struct attribute * kernel_attrs[] = { > #ifndef CONFIG_TINY_RCU > &rcu_expedited_attr.attr, > &rcu_normal_attr.attr, >+#endif >+#ifdef CONFIG_PREEMPT_RT >+ &realtime_attr.attr, > #endif > NULL > }; >diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c >index 151bd3de59363..80cfbe7b340e3 100644 >--- a/kernel/locking/lockdep.c >+++ b/kernel/locking/lockdep.c >@@ -56,6 +56,7 @@ > #include <linux/kprobes.h> > #include <linux/lockdep.h> > #include <linux/context_tracking.h> >+#include <linux/console.h> > > #include <asm/sections.h> > >@@ -574,8 +575,10 @@ static struct lock_trace *save_trace(void) > if (!debug_locks_off_graph_unlock()) > return NULL; > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > > return NULL; > } >@@ -782,6 +785,8 @@ static void lockdep_print_held_locks(struct task_struct *p) > { > int i, depth = READ_ONCE(p->lockdep_depth); > >+ nbcon_cpu_emergency_enter(); >+ > if (!depth) > 
printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); > else >@@ -792,11 +797,13 @@ static void lockdep_print_held_locks(struct task_struct *p) > * and it's not the current task. > */ > if (p != current && task_is_running(p)) >- return; >+ goto out; > for (i = 0; i < depth; i++) { > printk(" #%d: ", i); > print_lock(p->held_locks + i); > } >+out: >+ nbcon_cpu_emergency_exit(); > } > > static void print_kernel_ident(void) >@@ -888,11 +895,13 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) > if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { > instrumentation_begin(); > debug_locks_off(); >+ nbcon_cpu_emergency_enter(); > printk(KERN_ERR > "BUG: looking up invalid subclass: %u\n", subclass); > printk(KERN_ERR > "turning off the locking correctness validator.\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > instrumentation_end(); > return NULL; > } >@@ -969,11 +978,13 @@ static bool assign_lock_key(struct lockdep_map *lock) > else { > /* Debug-check: all keys must be persistent! 
*/ > debug_locks_off(); >+ nbcon_cpu_emergency_enter(); > pr_err("INFO: trying to register non-static key.\n"); > pr_err("The code is fine but needs lockdep annotation, or maybe\n"); > pr_err("you didn't initialize this object before use?\n"); > pr_err("turning off the locking correctness validator.\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return false; > } > >@@ -1317,8 +1328,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) > return NULL; > } > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return NULL; > } > nr_lock_classes++; >@@ -1350,11 +1363,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) > if (verbose(class)) { > graph_unlock(); > >+ nbcon_cpu_emergency_enter(); > printk("\nnew class %px: %s", class->key, class->name); > if (class->name_version > 1) > printk(KERN_CONT "#%d", class->name_version); > printk(KERN_CONT "\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > > if (!graph_lock()) { > return NULL; >@@ -1393,8 +1408,10 @@ static struct lock_list *alloc_list_entry(void) > if (!debug_locks_off_graph_unlock()) > return NULL; > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return NULL; > } > nr_list_entries++; >@@ -2040,6 +2057,8 @@ static noinline void print_circular_bug(struct lock_list *this, > > depth = get_lock_depth(target); > >+ nbcon_cpu_emergency_enter(); >+ > print_circular_bug_header(target, depth, check_src, check_tgt); > > parent = get_lock_parent(target); >@@ -2058,6 +2077,8 @@ static noinline void print_circular_bug(struct lock_list *this, > > printk("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static noinline void print_bfs_bug(int ret) >@@ -2570,6 +2591,8 @@ print_bad_irq_dependency(struct task_struct *curr, > if 
(!debug_locks_off_graph_unlock() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=====================================================\n"); > pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", >@@ -2619,11 +2642,13 @@ print_bad_irq_dependency(struct task_struct *curr, > pr_warn(" and %s-irq-unsafe lock:\n", irqclass); > next_root->trace = save_trace(); > if (!next_root->trace) >- return; >+ goto out; > print_shortest_lock_dependencies(forwards_entry, next_root); > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+out: >+ nbcon_cpu_emergency_exit(); > } > > static const char *state_names[] = { >@@ -2988,6 +3013,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, > if (!debug_locks_off_graph_unlock() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("============================================\n"); > pr_warn("WARNING: possible recursive locking detected\n"); >@@ -3010,6 +3037,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -3607,6 +3636,8 @@ static void print_collision(struct task_struct *curr, > struct held_lock *hlock_next, > struct lock_chain *chain) > { >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("============================\n"); > pr_warn("WARNING: chain_key collision\n"); >@@ -3623,6 +3654,8 @@ static void print_collision(struct task_struct *curr, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > #endif > >@@ -3713,8 +3746,10 @@ static inline int add_chain_cache(struct task_struct *curr, > if (!debug_locks_off_graph_unlock()) > return 0; > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return 0; > } > chain->chain_key = chain_key; >@@ -3731,8 +3766,10 
@@ static inline int add_chain_cache(struct task_struct *curr, > if (!debug_locks_off_graph_unlock()) > return 0; > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return 0; > } > >@@ -3971,6 +4008,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, > if (!debug_locks_off() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("================================\n"); > pr_warn("WARNING: inconsistent lock state\n"); >@@ -3999,6 +4038,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -4033,6 +4074,8 @@ print_irq_inversion_bug(struct task_struct *curr, > if (!debug_locks_off_graph_unlock() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("========================================================\n"); > pr_warn("WARNING: possible irq lock inversion dependency detected\n"); >@@ -4073,11 +4116,13 @@ print_irq_inversion_bug(struct task_struct *curr, > pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); > root->trace = save_trace(); > if (!root->trace) >- return; >+ goto out; > print_shortest_lock_dependencies(other, root); > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+out: >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -4154,6 +4199,8 @@ void print_irqtrace_events(struct task_struct *curr) > { > const struct irqtrace_events *trace = &curr->irqtrace; > >+ nbcon_cpu_emergency_enter(); >+ > printk("irq event stamp: %u\n", trace->irq_events); > printk("hardirqs last enabled at (%u): [<%px>] %pS\n", > trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, >@@ -4167,6 +4214,8 @@ void print_irqtrace_events(struct task_struct *curr) > printk("softirqs last disabled at (%u): [<%px>] %pS\n", > trace->softirq_disable_event, 
(void *)trace->softirq_disable_ip, > (void *)trace->softirq_disable_ip); >+ >+ nbcon_cpu_emergency_exit(); > } > > static int HARDIRQ_verbose(struct lock_class *class) >@@ -4687,10 +4736,12 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, > * We must printk outside of the graph_lock: > */ > if (ret == 2) { >+ nbcon_cpu_emergency_enter(); > printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); > print_lock(this); > print_irqtrace_events(curr); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > } > > return ret; >@@ -4731,6 +4782,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, > if (debug_locks_silent) > return 0; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=============================\n"); > pr_warn("[ BUG: Invalid wait context ]\n"); >@@ -4750,6 +4803,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, > pr_warn("stack backtrace:\n"); > dump_stack(); > >+ nbcon_cpu_emergency_exit(); >+ > return 0; > } > >@@ -4954,6 +5009,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("==================================\n"); > pr_warn("WARNING: Nested lock was not taken\n"); >@@ -4974,6 +5031,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static int __lock_is_held(const struct lockdep_map *lock, int read); >@@ -5019,11 +5078,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, > debug_class_ops_inc(class); > > if (very_verbose(class)) { >+ nbcon_cpu_emergency_enter(); > printk("\nacquire class [%px] %s", class->key, class->name); > if (class->name_version > 1) > printk(KERN_CONT "#%d", class->name_version); > printk(KERN_CONT "\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -5150,6 +5211,7 @@ static int __lock_acquire(struct lockdep_map 
*lock, unsigned int subclass, > #endif > if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { > debug_locks_off(); >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); > printk(KERN_DEBUG "depth: %i max: %lu!\n", > curr->lockdep_depth, MAX_LOCK_DEPTH); >@@ -5157,6 +5219,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, > lockdep_print_held_locks(current); > debug_show_all_locks(); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > > return 0; > } >@@ -5176,6 +5239,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=====================================\n"); > pr_warn("WARNING: bad unlock balance detected!\n"); >@@ -5192,6 +5257,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static noinstr int match_held_lock(const struct held_lock *hlock, >@@ -5895,6 +5962,8 @@ static void print_lock_contention_bug(struct task_struct *curr, > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=================================\n"); > pr_warn("WARNING: bad contention detected!\n"); >@@ -5911,6 +5980,8 @@ static void print_lock_contention_bug(struct task_struct *curr, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static void >@@ -6524,6 +6595,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=========================\n"); > pr_warn("WARNING: held lock freed!\n"); >@@ -6536,6 +6609,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static inline int 
not_in_range(const void* mem_from, unsigned long mem_len, >@@ -6582,6 +6657,8 @@ static void print_held_locks_bug(void) > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("====================================\n"); > pr_warn("WARNING: %s/%d still has locks held!\n", >@@ -6591,6 +6668,8 @@ static void print_held_locks_bug(void) > lockdep_print_held_locks(current); > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > void debug_check_no_locks_held(void) >@@ -6609,6 +6688,7 @@ void debug_show_all_locks(void) > pr_warn("INFO: lockdep is turned off.\n"); > return; > } >+ nbcon_cpu_emergency_enter(); > pr_warn("\nShowing all locks held in the system:\n"); > > rcu_read_lock(); >@@ -6623,6 +6703,7 @@ void debug_show_all_locks(void) > > pr_warn("\n"); > pr_warn("=============================================\n\n"); >+ nbcon_cpu_emergency_exit(); > } > EXPORT_SYMBOL_GPL(debug_show_all_locks); > #endif >@@ -6648,6 +6729,7 @@ asmlinkage __visible void lockdep_sys_exit(void) > if (unlikely(curr->lockdep_depth)) { > if (!debug_locks_off()) > return; >+ nbcon_cpu_emergency_enter(); > pr_warn("\n"); > pr_warn("================================================\n"); > pr_warn("WARNING: lock held when returning to user space!\n"); >@@ -6656,6 +6738,7 @@ asmlinkage __visible void lockdep_sys_exit(void) > pr_warn("%s/%d is leaving the kernel with locks still held!\n", > curr->comm, curr->pid); > lockdep_print_held_locks(curr); >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -6672,6 +6755,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) > bool rcu = warn_rcu_enter(); > > /* Note: the following can be executed concurrently, so be careful. 
*/ >+ nbcon_cpu_emergency_enter(); > pr_warn("\n"); > pr_warn("=============================\n"); > pr_warn("WARNING: suspicious RCU usage\n"); >@@ -6710,6 +6794,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) > lockdep_print_held_locks(curr); > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > warn_rcu_exit(rcu); > } > EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); >diff --git a/kernel/panic.c b/kernel/panic.c >index 2807639aab51d..3754a2471b4ff 100644 >--- a/kernel/panic.c >+++ b/kernel/panic.c >@@ -364,6 +364,8 @@ void panic(const char *fmt, ...) > > panic_other_cpus_shutdown(_crash_kexec_post_notifiers); > >+ printk_legacy_allow_panic_sync(); >+ > /* > * Run any panic handlers, including those that might need to > * add information to the kmsg dump output. >@@ -453,6 +453,7 @@ > * Explicitly flush the kernel log buffer one last time. > */ > console_flush_on_panic(CONSOLE_FLUSH_PENDING); >+ nbcon_atomic_flush_unsafe(); > > local_irq_enable(); > for (i = 0; ; i += PANIC_TIMER_STEP) { >@@ -623,6 +634,7 @@ bool oops_may_print(void) > */ > void oops_enter(void) > { >+ nbcon_cpu_emergency_enter(); > tracing_off(); > /* can't trust the integrity of the kernel anymore: */ > debug_locks_off(); >@@ -645,6 +657,7 @@ void oops_exit(void) > { > do_oops_enter_exit(); > print_oops_end_marker(); >+ nbcon_cpu_emergency_exit(); > kmsg_dump(KMSG_DUMP_OOPS); > } > >@@ -656,6 +669,8 @@ struct warn_args { > void __warn(const char *file, int line, void *caller, unsigned taint, > struct pt_regs *regs, struct warn_args *args) > { >+ nbcon_cpu_emergency_enter(); >+ > disable_trace_on_warning(); > > if (file) >@@ -686,6 +701,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, > > /* Just a warning, don't kill lockdep. 
*/ > add_taint(taint, LOCKDEP_STILL_OK); >+ >+ nbcon_cpu_emergency_exit(); > } > > #ifdef CONFIG_BUG >diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h >index ac2d9750e5f81..fdf455c890338 100644 >--- a/kernel/printk/internal.h >+++ b/kernel/printk/internal.h >@@ -2,11 +2,13 @@ > /* > * internal.h - printk internal definitions > */ >-#include <linux/percpu.h> > #include <linux/console.h> >-#include "printk_ringbuffer.h" >+#include <linux/jump_label.h> >+#include <linux/percpu.h> >+#include <linux/types.h> > > #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) >+struct ctl_table; > void __init printk_sysctl_init(void); > int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, > void *buffer, size_t *lenp, loff_t *ppos); >@@ -20,6 +22,13 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, > (con->flags & CON_BOOT) ? "boot" : "", \ > con->name, con->index, ##__VA_ARGS__) > >+#ifdef CONFIG_PREEMPT_RT >+# define force_printkthreads() (true) >+#else >+DECLARE_STATIC_KEY_FALSE(force_printkthreads_key); >+# define force_printkthreads() (static_branch_unlikely(&force_printkthreads_key)) >+#endif >+ > #ifdef CONFIG_PRINTK > > #ifdef CONFIG_PRINTK_CALLER >@@ -43,7 +52,11 @@ enum printk_info_flags { > LOG_CONT = 8, /* text is a fragment of a continuation line */ > }; > >+struct printk_ringbuffer; >+struct dev_printk_info; >+ > extern struct printk_ringbuffer *prb; >+extern bool printk_threads_enabled; > > __printf(4, 0) > int vprintk_store(int facility, int level, >@@ -53,6 +66,9 @@ int vprintk_store(int facility, int level, > __printf(1, 0) int vprintk_default(const char *fmt, va_list args); > __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); > >+void __printk_safe_enter(void); >+void __printk_safe_exit(void); >+ > bool printk_percpu_data_ready(void); > > #define printk_safe_enter_irqsave(flags) \ >@@ -71,12 +87,79 @@ void defer_console_output(void); > > u16 printk_parse_prefix(const char *text, int *level, > enum 
printk_info_flags *flags); >+void console_lock_spinning_enable(void); >+int console_lock_spinning_disable_and_check(int cookie); > > u64 nbcon_seq_read(struct console *con); > void nbcon_seq_force(struct console *con, u64 seq); > bool nbcon_alloc(struct console *con); > void nbcon_init(struct console *con); > void nbcon_free(struct console *con); >+enum nbcon_prio nbcon_get_default_prio(void); >+void nbcon_atomic_flush_pending(void); >+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, >+ int cookie, bool use_atomic); >+void nbcon_kthread_create(struct console *con); >+void nbcon_wake_threads(void); >+void nbcon_legacy_kthread_create(void); >+ >+/* >+ * Check if the given console is currently capable and allowed to print >+ * records. Note that this function does not consider the current context, >+ * which can also play a role in deciding if @con can be used to print >+ * records. >+ */ >+static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) >+{ >+ if (!(flags & CON_ENABLED)) >+ return false; >+ >+ if ((flags & CON_SUSPENDED)) >+ return false; >+ >+ if (flags & CON_NBCON) { >+ if (use_atomic) { >+ if (!con->write_atomic) >+ return false; >+ } else { >+ if (!con->write_thread) >+ return false; >+ } >+ } else { >+ if (!con->write) >+ return false; >+ } >+ >+ /* >+ * Console drivers may assume that per-cpu resources have been >+ * allocated. So unless they're explicitly marked as being able to >+ * cope (CON_ANYTIME) don't call them until this CPU is officially up. >+ */ >+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) >+ return false; >+ >+ return true; >+} >+ >+/** >+ * nbcon_kthread_wake - Wake up a printk thread >+ * @con: Console to operate on >+ */ >+static inline void nbcon_kthread_wake(struct console *con) >+{ >+ /* >+ * Guarantee any new records can be seen by tasks preparing to wait >+ * before this context checks if the rcuwait is empty. 
>+ * >+ * The full memory barrier in rcuwait_wake_up() pairs with the full >+ * memory barrier within set_current_state() of >+ * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait() >+ * adds the waiter but before it has checked the wait condition. >+ * >+ * This pairs with nbcon_kthread_func:A. >+ */ >+ rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */ >+} > > #else > >@@ -84,6 +167,10 @@ void nbcon_free(struct console *con); > #define PRINTK_MESSAGE_MAX 0 > #define PRINTKRB_RECORD_MAX 0 > >+static inline void nbcon_kthread_wake(struct console *con) { } >+static inline void nbcon_kthread_create(struct console *con) { } >+#define printk_threads_enabled (false) >+ > /* > * In !PRINTK builds we still export console_sem > * semaphore and some of console functions (console_unlock()/etc.), so >@@ -98,9 +185,27 @@ static inline void nbcon_seq_force(struct console *con, u64 seq) { } > static inline bool nbcon_alloc(struct console *con) { return false; } > static inline void nbcon_init(struct console *con) { } > static inline void nbcon_free(struct console *con) { } >+static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; } >+static inline void nbcon_atomic_flush_pending(void) { } >+static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, >+ int cookie, bool use_atomic) { return false; } >+ >+static inline bool console_is_usable(struct console *con, short flags, >+ bool use_atomic) { return false; } > > #endif /* CONFIG_PRINTK */ > >+extern bool have_boot_console; >+extern bool have_legacy_console; >+ >+/* >+ * Specifies if the console lock/unlock dance is needed for console >+ * printing. If @have_boot_console is true, the nbcon consoles will >+ * be printed serially along with the legacy consoles because nbcon >+ * consoles cannot print simultaneously with boot consoles. 
>+ */ >+#define printing_via_unlock (have_legacy_console || have_boot_console) >+ > extern struct printk_buffers printk_shared_pbufs; > > /** >@@ -135,4 +135,5 @@ > > #ifdef CONFIG_PRINTK > void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped); >+void console_prepend_replay(struct printk_message *pmsg); > #endif >diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c >index c8093bcc01fe6..932b888aa4c30 100644 >--- a/kernel/printk/nbcon.c >+++ b/kernel/printk/nbcon.c >@@ -2,11 +2,26 @@ > // Copyright (C) 2022 Linutronix GmbH, John Ogness > // Copyright (C) 2022 Intel, Thomas Gleixner > >-#include <linux/kernel.h> >+#include <linux/atomic.h> >+#include <linux/bug.h> > #include <linux/console.h> > #include <linux/delay.h> >+#include <linux/errno.h> >+#include <linux/export.h> >+#include <linux/init.h> >+#include <linux/irqflags.h> >+#include <linux/kthread.h> >+#include <linux/minmax.h> >+#include <linux/percpu.h> >+#include <linux/preempt.h> > #include <linux/slab.h> >+#include <linux/smp.h> >+#include <linux/stddef.h> >+#include <linux/string.h> >+#include <linux/syscore_ops.h> >+#include <linux/types.h> > #include "internal.h" >+#include "printk_ringbuffer.h" > /* > * Printk console printing implementation for consoles which does not depend > * on the legacy style console_lock mechanism. >@@ -172,9 +187,6 @@ void nbcon_seq_force(struct console *con, u64 seq) > u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); > > atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); >- >- /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. 
*/ >- con->seq = 0; > } > > /** >@@ -201,6 +213,8 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) > } > } > >+bool printk_threads_enabled __ro_after_init; >+ > /** > * nbcon_context_try_acquire_direct - Try to acquire directly > * @ctxt: The context of the caller >@@ -531,6 +545,7 @@ static struct printk_buffers panic_nbcon_pbufs; > * nbcon_context_try_acquire - Try to acquire nbcon console > * @ctxt: The context of the caller > * >+ * Context: Any context which could not be migrated to another CPU. > * Return: True if the console was acquired. False otherwise. > * > * If the caller allowed an unsafe hostile takeover, on success the >@@ -538,7 +553,6 @@ static struct printk_buffers panic_nbcon_pbufs; > * in an unsafe state. Otherwise, on success the caller may assume > * the console is not in an unsafe state. > */ >-__maybe_unused > static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) > { > unsigned int cpu = smp_processor_id(); >@@ -824,9 +838,42 @@ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) > } > EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); > >+/** >+ * nbcon_reacquire - Reacquire a console after losing ownership >+ * @wctxt: The write context that was handed to the write function >+ * >+ * Since ownership can be lost at any time due to handover or takeover, a >+ * printing context _should_ be prepared to back out immediately and >+ * carefully. However, there are many scenarios where the context _must_ >+ * reacquire ownership in order to finalize or revert hardware changes. >+ * >+ * This function allows a context to reacquire ownership using the same >+ * priority as its previous ownership. >+ * >+ * Note that for printing contexts, after a successful reacquire the >+ * context will have no output buffer because that has been lost. This >+ * function cannot be used to resume printing. 
>+ */ >+void nbcon_reacquire(struct nbcon_write_context *wctxt) >+{ >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); >+ struct console *con = ctxt->console; >+ struct nbcon_state cur; >+ >+ while (!nbcon_context_try_acquire(ctxt)) >+ cpu_relax(); >+ >+ wctxt->outbuf = NULL; >+ wctxt->len = 0; >+ nbcon_state_read(con, &cur); >+ wctxt->unsafe_takeover = cur.unsafe_takeover; >+} >+EXPORT_SYMBOL_GPL(nbcon_reacquire); >+ > /** > * nbcon_emit_next_record - Emit a record in the acquired context > * @wctxt: The write context that will be handed to the write function >+ * @use_atomic: True if the write_atomic callback is to be used > * > * Return: True if this context still owns the console. False if > * ownership was handed over or taken. >@@ -840,8 +887,7 @@ EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); > * When true is returned, @wctxt->ctxt.backlog indicates whether there are > * still records pending in the ringbuffer, > */ >-__maybe_unused >-static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) >+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) > { > struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); > struct console *con = ctxt->console; >@@ -852,7 +898,7 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > unsigned long con_dropped; > struct nbcon_state cur; > unsigned long dropped; >- bool done; >+ unsigned long ulseq; > > /* > * The printk buffers are filled within an unsafe section. This >@@ -878,6 +924,28 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > if (dropped && !is_extended) > console_prepend_dropped(&pmsg, dropped); > >+ /* >+ * If the previous owner was assigned the same record, this context >+ * has taken over ownership and is replaying the record. Prepend a >+ * message to let the user know the record is replayed. 
>+ */ >+ ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq)); >+ if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) { >+ console_prepend_replay(&pmsg); >+ } else { >+ /* >+ * Ensure this context is still the owner before trying to >+ * update @nbcon_prev_seq. Otherwise the value in @ulseq may >+ * not be from the previous owner. >+ */ >+ nbcon_state_read(con, &cur); >+ if (!nbcon_context_can_proceed(ctxt, &cur)) >+ return false; >+ >+ atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq, >+ __u64seq_to_ulseq(pmsg.seq)); >+ } >+ > if (!nbcon_context_exit_unsafe(ctxt)) > return false; > >@@ -891,17 +959,32 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > nbcon_state_read(con, &cur); > wctxt->unsafe_takeover = cur.unsafe_takeover; > >- if (con->write_atomic) { >- done = con->write_atomic(con, wctxt); >+ if (use_atomic && >+ con->write_atomic) { >+ con->write_atomic(con, wctxt); >+ >+ } else if (!use_atomic && >+ con->write_thread) { >+ con->write_thread(con, wctxt); >+ > } else { >- nbcon_context_release(ctxt); >+ /* >+ * This function should never be called for legacy consoles. >+ * Handle it as if ownership was lost and try to continue. >+ */ > WARN_ON_ONCE(1); >- done = false; >+ nbcon_context_release(ctxt); >+ return false; > } > >- /* If not done, the emit was aborted. */ >- if (!done) >+ if (!wctxt->outbuf) { >+ /* >+ * Ownership was lost and reacquired by the driver. >+ * Handle it as if ownership was lost and try to continue. 
>+ */ >+ nbcon_context_release(ctxt); > return false; >+ } > > /* > * Since any dropped message was successfully output, reset the >@@ -928,6 +1011,550 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > return nbcon_context_exit_unsafe(ctxt); > } > >+/** >+ * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup >+ * @con: Console to operate on >+ * @ctxt: The acquire context that contains the state >+ * at console_acquire() >+ * >+ * Return: True if the thread should shutdown or if the console is >+ * allowed to print and a record is available. False otherwise. >+ * >+ * After the thread wakes up, it must first check if it should shutdown before >+ * attempting any printing. >+ */ >+static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) >+{ >+ bool ret = false; >+ short flags; >+ int cookie; >+ >+ if (kthread_should_stop()) >+ return true; >+ >+ cookie = console_srcu_read_lock(); >+ >+ flags = console_srcu_read_flags(con); >+ if (console_is_usable(con, flags, false)) { >+ /* Bring the sequence in @ctxt up to date */ >+ ctxt->seq = nbcon_seq_read(con); >+ >+ ret = prb_read_valid(prb, ctxt->seq, NULL); >+ } >+ >+ console_srcu_read_unlock(cookie); >+ return ret; >+} >+ >+/** >+ * nbcon_kthread_func - The printer thread function >+ * @__console: Console to operate on >+ */ >+static int nbcon_kthread_func(void *__console) >+{ >+ struct console *con = __console; >+ struct nbcon_write_context wctxt = { >+ .ctxt.console = con, >+ .ctxt.prio = NBCON_PRIO_NORMAL, >+ }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ unsigned long flags; >+ short con_flags; >+ bool backlog; >+ int cookie; >+ int ret; >+ >+wait_for_event: >+ /* >+ * Guarantee this task is visible on the rcuwait before >+ * checking the wake condition. >+ * >+ * The full memory barrier within set_current_state() of >+ * ___rcuwait_wait_event() pairs with the full memory >+ * barrier within rcuwait_has_sleeper(). 
>+ * >+ * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. >+ */ >+ ret = rcuwait_wait_event(&con->rcuwait, >+ nbcon_kthread_should_wakeup(con, ctxt), >+ TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ >+ >+ if (kthread_should_stop()) >+ return 0; >+ >+ /* Wait was interrupted by a spurious signal, go back to sleep. */ >+ if (ret) >+ goto wait_for_event; >+ >+ do { >+ backlog = false; >+ >+ cookie = console_srcu_read_lock(); >+ >+ con_flags = console_srcu_read_flags(con); >+ >+ if (console_is_usable(con, con_flags, false)) { >+ con->device_lock(con, &flags); >+ >+ /* >+ * Ensure this stays on the CPU to make handover and >+ * takeover possible. >+ */ >+ cant_migrate(); >+ >+ if (nbcon_context_try_acquire(ctxt)) { >+ /* >+ * If the emit fails, this context is no >+ * longer the owner. >+ */ >+ if (nbcon_emit_next_record(&wctxt, false)) { >+ nbcon_context_release(ctxt); >+ backlog = ctxt->backlog; >+ } >+ } >+ >+ con->device_unlock(con, flags); >+ } >+ >+ console_srcu_read_unlock(cookie); >+ >+ } while (backlog); >+ >+ goto wait_for_event; >+} >+ >+/** >+ * nbcon_irq_work - irq work to wake printk thread >+ * @irq_work: The irq work to operate on >+ */ >+static void nbcon_irq_work(struct irq_work *irq_work) >+{ >+ struct console *con = container_of(irq_work, struct console, irq_work); >+ >+ nbcon_kthread_wake(con); >+} >+ >+static inline bool rcuwait_has_sleeper(struct rcuwait *w) >+{ >+ bool has_sleeper; >+ >+ rcu_read_lock(); >+ /* >+ * Guarantee any new records can be seen by tasks preparing to wait >+ * before this context checks if the rcuwait is empty. >+ * >+ * This full memory barrier pairs with the full memory barrier within >+ * set_current_state() of ___rcuwait_wait_event(), which is called >+ * after prepare_to_rcuwait() adds the waiter but before it has >+ * checked the wait condition. >+ * >+ * This pairs with nbcon_kthread_func:A. 
>+ */ >+ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ >+ has_sleeper = !!rcu_dereference(w->task); >+ rcu_read_unlock(); >+ >+ return has_sleeper; >+} >+ >+/** >+ * nbcon_wake_threads - Wake up printing threads using irq_work >+ */ >+void nbcon_wake_threads(void) >+{ >+ struct console *con; >+ int cookie; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ /* >+ * Only schedule irq_work if the printing thread is >+ * actively waiting. If not waiting, the thread will >+ * notice by itself that it has work to do. >+ */ >+ if (con->kthread && rcuwait_has_sleeper(&con->rcuwait)) >+ irq_work_queue(&con->irq_work); >+ } >+ console_srcu_read_unlock(cookie); >+} >+ >+/* Track the nbcon emergency nesting per CPU. */ >+static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); >+static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; >+ >+/** >+ * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer >+ * >+ * Return: Either a pointer to the per CPU emergency nesting counter of >+ * the current CPU or to the init data during early boot. >+ */ >+static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) >+{ >+ /* >+ * The value of __printk_percpu_data_ready gets set in normal >+ * context and before SMP initialization. As a result it could >+ * never change while inside an nbcon emergency section. >+ */ >+ if (!printk_percpu_data_ready()) >+ return &early_nbcon_pcpu_emergency_nesting; >+ >+ return this_cpu_ptr(&nbcon_pcpu_emergency_nesting); >+} >+ >+/** >+ * nbcon_emit_one - Print one record for an nbcon console using the >+ * specified callback >+ * @wctxt: An initialized write context struct to use for this context >+ * @use_atomic: True if the write_atomic callback is to be used >+ * >+ * Return: False if it is known there are no more records to print, >+ * otherwise true. >+ * >+ * This is an internal helper to handle the locking of the console before >+ * calling nbcon_emit_next_record(). 
>+ */ >+static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) >+{ >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); >+ >+ if (!nbcon_context_try_acquire(ctxt)) >+ return true; >+ >+ /* >+ * nbcon_emit_next_record() returns false when the console was >+ * handed over or taken over. In both cases the context is no >+ * longer valid. >+ */ >+ if (!nbcon_emit_next_record(wctxt, use_atomic)) >+ return true; >+ >+ nbcon_context_release(ctxt); >+ >+ return ctxt->backlog; >+} >+ >+/** >+ * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon >+ * printing on the current CPU >+ * >+ * Context: Any context which could not be migrated to another CPU. >+ * Return: The nbcon_prio to use for acquiring an nbcon console in this >+ * context for printing. >+ */ >+enum nbcon_prio nbcon_get_default_prio(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ >+ if (this_cpu_in_panic()) >+ return NBCON_PRIO_PANIC; >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ if (*cpu_emergency_nesting) >+ return NBCON_PRIO_EMERGENCY; >+ >+ return NBCON_PRIO_NORMAL; >+} >+ >+/** >+ * nbcon_legacy_emit_next_record - Print one record for an nbcon console >+ * in legacy contexts >+ * @con: The console to print on >+ * @handover: Will be set to true if a printk waiter has taken over the >+ * console_lock, in which case the caller is no longer holding >+ * both the console_lock and the SRCU read lock. Otherwise it >+ * is set to false. >+ * @cookie: The cookie from the SRCU read lock. >+ * @use_atomic: True if the write_atomic callback is to be used >+ * >+ * Context: Any context except NMI. >+ * Return: False if the given console has no next record to print, >+ * otherwise true. >+ * >+ * This function is meant to be called by console_flush_all() to print records >+ * on nbcon consoles from legacy context (printing via console unlocking). >+ * Essentially it is the nbcon version of console_emit_next_record(). 
>+ */ >+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, >+ int cookie, bool use_atomic) >+{ >+ struct nbcon_write_context wctxt = { }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ unsigned long flags; >+ bool progress; >+ >+ *handover = false; >+ >+ ctxt->console = con; >+ >+ if (use_atomic) { >+ /* Use the same procedure as console_emit_next_record(). */ >+ printk_safe_enter_irqsave(flags); >+ console_lock_spinning_enable(); >+ stop_critical_timings(); >+ >+ ctxt->prio = nbcon_get_default_prio(); >+ progress = nbcon_emit_one(&wctxt, use_atomic); >+ >+ start_critical_timings(); >+ *handover = console_lock_spinning_disable_and_check(cookie); >+ printk_safe_exit_irqrestore(flags); >+ } else { >+ con->device_lock(con, &flags); >+ cant_migrate(); >+ >+ ctxt->prio = nbcon_get_default_prio(); >+ progress = nbcon_emit_one(&wctxt, use_atomic); >+ >+ con->device_unlock(con, flags); >+ } >+ >+ return progress; >+} >+ >+/** >+ * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its >+ * write_atomic() callback >+ * @con: The nbcon console to flush >+ * @stop_seq: Flush up until this record >+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers >+ * >+ * Return: True if taken over while printing. Otherwise false. >+ * >+ * If flushing up to @stop_seq was not successful, it only makes sense for the >+ * caller to try again when true was returned. When false is returned, either >+ * there are no more records available to read or this context is not allowed >+ * to acquire the console. 
>+ */ >+static bool __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, >+ bool allow_unsafe_takeover) >+{ >+ struct nbcon_write_context wctxt = { }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ >+ ctxt->console = con; >+ ctxt->spinwait_max_us = 2000; >+ ctxt->prio = nbcon_get_default_prio(); >+ ctxt->allow_unsafe_takeover = allow_unsafe_takeover; >+ >+ if (!nbcon_context_try_acquire(ctxt)) >+ return false; >+ >+ while (nbcon_seq_read(con) < stop_seq) { >+ /* >+ * nbcon_emit_next_record() returns false when the console was >+ * handed over or taken over. In both cases the context is no >+ * longer valid. >+ */ >+ if (!nbcon_emit_next_record(&wctxt, true)) >+ return true; >+ >+ if (!ctxt->backlog) >+ break; >+ } >+ >+ nbcon_context_release(ctxt); >+ >+ return false; >+} >+ >+/** >+ * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their >+ * write_atomic() callback >+ * @stop_seq: Flush up until this record >+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers >+ */ >+static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) >+{ >+ struct console *con; >+ bool should_retry; >+ int cookie; >+ >+ do { >+ should_retry = false; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ unsigned long irq_flags; >+ >+ if (!(flags & CON_NBCON)) >+ continue; >+ >+ if (!console_is_usable(con, flags, true)) >+ continue; >+ >+ if (nbcon_seq_read(con) >= stop_seq) >+ continue; >+ >+ /* >+ * Atomic flushing does not use console driver >+ * synchronization (i.e. it does not hold the port >+ * lock for uart consoles). Therefore IRQs must be >+ * disabled to avoid being interrupted and then >+ * calling into a driver that will deadlock trying >+ * to acquire console ownership. 
>+ */ >+ local_irq_save(irq_flags); >+ >+ should_retry |= __nbcon_atomic_flush_pending_con(con, stop_seq, >+ allow_unsafe_takeover); >+ local_irq_restore(irq_flags); >+ } >+ console_srcu_read_unlock(cookie); >+ } while (should_retry); >+} >+ >+/** >+ * nbcon_atomic_flush_pending - Flush all nbcon consoles using their >+ * write_atomic() callback >+ * >+ * Flush the backlog up through the currently newest record. Any new >+ * records added while flushing will not be flushed. This is to avoid >+ * one CPU printing unbounded because other CPUs continue to add records. >+ */ >+void nbcon_atomic_flush_pending(void) >+{ >+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); >+} >+ >+/** >+ * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their >+ * write_atomic() callback and allowing unsafe hostile takeovers >+ * >+ * Flush the backlog up through the currently newest record. Unsafe hostile >+ * takeovers will be performed, if necessary. >+ */ >+void nbcon_atomic_flush_unsafe(void) >+{ >+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); >+} >+ >+/** >+ * nbcon_cpu_emergency_enter - Enter an emergency section where printk() >+ * messages for that CPU are only stored >+ * >+ * Upon exiting the emergency section, all stored messages are flushed. >+ * >+ * Context: Any context. Disables preemption. >+ * >+ * When within an emergency section, no printing occurs on that CPU. This >+ * is to allow all emergency messages to be dumped into the ringbuffer before >+ * flushing the ringbuffer. The actual printing occurs when exiting the >+ * outermost emergency section. 
>+ */ >+void nbcon_cpu_emergency_enter(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ >+ preempt_disable(); >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ (*cpu_emergency_nesting)++; >+} >+ >+/** >+ * nbcon_cpu_emergency_exit - Exit an emergency section and flush the >+ * stored messages >+ * >+ * Flushing only occurs when exiting all nesting for the CPU. >+ * >+ * Context: Any context. Enables preemption. >+ */ >+void nbcon_cpu_emergency_exit(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ bool do_trigger_flush = false; >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ >+ WARN_ON_ONCE(*cpu_emergency_nesting == 0); >+ >+ if (*cpu_emergency_nesting == 1) { >+ nbcon_atomic_flush_pending(); >+ do_trigger_flush = true; >+ } >+ >+ /* Undo the nesting count of nbcon_cpu_emergency_enter(). */ >+ (*cpu_emergency_nesting)--; >+ >+ preempt_enable(); >+ >+ if (do_trigger_flush) >+ printk_trigger_flush(); >+} >+ >+/** >+ * nbcon_kthread_stop - Stop a printer thread >+ * @con: Console to operate on >+ */ >+static void nbcon_kthread_stop(struct console *con) >+{ >+ lockdep_assert_console_list_lock_held(); >+ >+ if (!con->kthread) >+ return; >+ >+ kthread_stop(con->kthread); >+ con->kthread = NULL; >+} >+ >+/** >+ * nbcon_kthread_create - Create a printer thread >+ * @con: Console to operate on >+ * >+ * If it fails, let the console proceed. The atomic part might >+ * be usable and useful. >+ */ >+void nbcon_kthread_create(struct console *con) >+{ >+ struct task_struct *kt; >+ >+ lockdep_assert_console_list_lock_held(); >+ >+ if (!(con->flags & CON_NBCON) || !con->write_thread) >+ return; >+ >+ if (!printk_threads_enabled || con->kthread) >+ return; >+ >+ /* >+ * Printer threads cannot be started as long as any boot console is >+ * registered because there is no way to synchronize the hardware >+ * registers between boot console code and regular console code. 
>+ */ >+ if (have_boot_console) >+ return; >+ >+ kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); >+ if (IS_ERR(kt)) { >+ con_printk(KERN_ERR, con, "failed to start printing thread\n"); >+ return; >+ } >+ >+ con->kthread = kt; >+ >+ /* >+ * It is important that console printing threads are scheduled >+ * shortly after a printk call and with generous runtime budgets. >+ */ >+ sched_set_normal(con->kthread, -20); >+} >+ >+static int __init printk_setup_threads(void) >+{ >+ struct console *con; >+ >+ console_list_lock(); >+ printk_threads_enabled = true; >+ for_each_console(con) >+ nbcon_kthread_create(con); >+ if (force_printkthreads() && printing_via_unlock) >+ nbcon_legacy_kthread_create(); >+ console_list_unlock(); >+ return 0; >+} >+early_initcall(printk_setup_threads); >+ > /** > * nbcon_alloc - Allocate buffers needed by the nbcon console > * @con: Console to allocate buffers for >@@ -964,8 +1591,6 @@ bool nbcon_alloc(struct console *con) > * > * nbcon_alloc() *must* be called and succeed before this function > * is called. >- * >- * This function expects that the legacy @con->seq has been set. > */ > void nbcon_init(struct console *con) > { >@@ -974,8 +1599,12 @@ void nbcon_init(struct console *con) > /* nbcon_alloc() must have been called and successful! */ > BUG_ON(!con->pbufs); > >- nbcon_seq_force(con, con->seq); >+ rcuwait_init(&con->rcuwait); >+ init_irq_work(&con->irq_work, nbcon_irq_work); >+ nbcon_seq_force(con, 0); >+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL); > nbcon_state_set(con, &state); >+ nbcon_kthread_create(con); > } > > /** >@@ -986,6 +1615,7 @@ void nbcon_free(struct console *con) > { > struct nbcon_state state = { }; > >+ nbcon_kthread_stop(con); > nbcon_state_set(con, &state); > > /* Boot consoles share global printk buffers. 
*/
>@@ -994,3 +1624,82 @@ void nbcon_free(struct console *con)
>
> con->pbufs = NULL;
> }
>+
>+/**
>+ * nbcon_driver_acquire - Acquire nbcon console and enter unsafe section
>+ * @con: The nbcon console to acquire
>+ *
>+ * Context: Any context which could not be migrated to another CPU.
>+ *
>+ * Console drivers will usually use their own internal synchronization
>+ * mechanism to synchronize between console printing and non-printing
>+ * activities (such as setting baud rates). However, nbcon console drivers
>+ * supporting atomic consoles may also want to mark unsafe sections when
>+ * performing non-printing activities.
>+ *
>+ * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL
>+ * and marks it unsafe for handover/takeover.
>+ *
>+ * Console drivers using this function must have provided @nbcon_drvdata in
>+ * their struct console, which is used to track ownership and state
>+ * information.
>+ */
>+void nbcon_driver_acquire(struct console *con)
>+{
>+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con->nbcon_drvdata, ctxt);
>+
>+ cant_migrate();
>+
>+ do {
>+ do {
>+ memset(ctxt, 0, sizeof(*ctxt));
>+ ctxt->console = con;
>+ ctxt->prio = NBCON_PRIO_NORMAL;
>+ } while (!nbcon_context_try_acquire(ctxt));
>+
>+ } while (!nbcon_context_enter_unsafe(ctxt));
>+}
>+EXPORT_SYMBOL_GPL(nbcon_driver_acquire);
>+
>+/**
>+ * nbcon_driver_release - Exit unsafe section and release the nbcon console
>+ * @con: The nbcon console acquired in nbcon_driver_acquire()
>+ */
>+void nbcon_driver_release(struct console *con)
>+{
>+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con->nbcon_drvdata, ctxt);
>+
>+ if (nbcon_context_exit_unsafe(ctxt))
>+ nbcon_context_release(ctxt);
>+}
>+EXPORT_SYMBOL_GPL(nbcon_driver_release);
>+
>+/**
>+ * printk_kthread_shutdown - shutdown all threaded printers
>+ *
>+ * On system shutdown all threaded printers are stopped. 
This allows printk >+ * to transition back to atomic printing, thus providing a robust mechanism >+ * for the final shutdown/reboot messages to be output. >+ */ >+static void printk_kthread_shutdown(void) >+{ >+ struct console *con; >+ >+ console_list_lock(); >+ for_each_console(con) { >+ if (con->flags & CON_NBCON) >+ nbcon_kthread_stop(con); >+ } >+ console_list_unlock(); >+} >+ >+static struct syscore_ops printk_syscore_ops = { >+ .shutdown = printk_kthread_shutdown, >+}; >+ >+static int __init printk_init_ops(void) >+{ >+ register_syscore_ops(&printk_syscore_ops); >+ return 0; >+} >+device_initcall(printk_init_ops); >diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c >index 72f6a564e832f..8ee6c60b47c4b 100644 >--- a/kernel/printk/printk.c >+++ b/kernel/printk/printk.c >@@ -195,6 +195,17 @@ static int __init control_devkmsg(char *str) > } > __setup("printk.devkmsg=", control_devkmsg); > >+#if !defined(CONFIG_PREEMPT_RT) >+DEFINE_STATIC_KEY_FALSE(force_printkthreads_key); >+ >+static int __init setup_forced_printkthreads(char *arg) >+{ >+ static_branch_enable(&force_printkthreads_key); >+ return 0; >+} >+early_param("threadprintk", setup_forced_printkthreads); >+#endif >+ > char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; > #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) > int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, >@@ -282,6 +293,7 @@ EXPORT_SYMBOL(console_list_unlock); > * Return: A cookie to pass to console_srcu_read_unlock(). 
> */ > int console_srcu_read_lock(void) >+ __acquires(&console_srcu) > { > return srcu_read_lock_nmisafe(&console_srcu); > } >@@ -295,6 +307,7 @@ EXPORT_SYMBOL(console_srcu_read_lock); > * Counterpart to console_srcu_read_lock() > */ > void console_srcu_read_unlock(int cookie) >+ __releases(&console_srcu) > { > srcu_read_unlock_nmisafe(&console_srcu, cookie); > } >@@ -461,14 +474,33 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; > /* syslog_lock protects syslog_* variables and write access to clear_seq. */ > static DEFINE_MUTEX(syslog_lock); > >-#ifdef CONFIG_PRINTK > /* >- * During panic, heavy printk by other CPUs can delay the >- * panic and risk deadlock on console resources. >+ * Specifies if a legacy console is registered. If legacy consoles are >+ * present, it is necessary to perform the console lock/unlock dance >+ * whenever console flushing should occur. > */ >-static int __read_mostly suppress_panic_printk; >+bool have_legacy_console; > >+/* >+ * Specifies if an nbcon console is registered. If nbcon consoles are present, >+ * synchronous printing of legacy consoles will not occur during panic until >+ * the backtrace has been stored to the ringbuffer. >+ */ >+static bool have_nbcon_console; >+ >+/* >+ * Specifies if a boot console is registered. If boot consoles are present, >+ * nbcon consoles cannot print simultaneously and must be synchronized by >+ * the console lock. This is because boot consoles and nbcon consoles may >+ * have mapped the same hardware. >+ */ >+bool have_boot_console; >+ >+#ifdef CONFIG_PRINTK > DECLARE_WAIT_QUEUE_HEAD(log_wait); >+ >+static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); >+ > /* All 3 protected by @syslog_lock. */ > /* the next printk record to read by syslog(READ) or /proc/kmsg */ > static u64 syslog_seq; >@@ -1867,7 +1899,7 @@ static bool console_waiter; > * there may be a waiter spinning (like a spinlock). Also it must be > * ready to hand over the lock at the end of the section. 
> */ >-static void console_lock_spinning_enable(void) >+void console_lock_spinning_enable(void) > { > /* > * Do not use spinning in panic(). The panic CPU wants to keep the lock. >@@ -1906,7 +1938,7 @@ static void console_lock_spinning_enable(void) > * > * Return: 1 if the lock rights were passed, 0 otherwise. > */ >-static int console_lock_spinning_disable_and_check(int cookie) >+int console_lock_spinning_disable_and_check(int cookie) > { > int waiter; > >@@ -2317,54 +2317,116 @@ > return ret; > } > >+static bool legacy_allow_panic_sync; >+ >+/* >+ * This acts as a one-way switch to allow legacy consoles to print from >+ * the printk() caller context on a panic CPU. It also attempts to flush >+ * the legacy consoles in this context. >+ */ >+void printk_legacy_allow_panic_sync(void) >+{ >+ legacy_allow_panic_sync = true; >+ >+ if (printing_via_unlock && !in_nmi()) { >+ if (console_trylock()) >+ console_unlock(); >+ } >+} >+ > asmlinkage int vprintk_emit(int facility, int level, > const struct dev_printk_info *dev_info, > const char *fmt, va_list args) > { >+ bool do_trylock_unlock = printing_via_unlock && >+ !force_printkthreads(); > int printed_len; >- bool in_sched = false; > > /* Suppress unimportant messages after panic happens */ > if (unlikely(suppress_printk)) > return 0; > >- if (unlikely(suppress_panic_printk) && other_cpu_in_panic()) >- return 0; >+ if (level == LOGLEVEL_SCHED) { >+ level = LOGLEVEL_DEFAULT; >+ /* If called from the scheduler, we can not call up(). */ >+ do_trylock_unlock = false; >+ } >+ >+ printk_delay(level); >+ >+ printed_len = vprintk_store(facility, level, dev_info, fmt, args); >+ >+ if (have_nbcon_console && !have_boot_console) { >+ bool is_panic_context = this_cpu_in_panic(); > >- if (level == LOGLEVEL_SCHED) { >- level = LOGLEVEL_DEFAULT; >- in_sched = true; >- } >+ /* >+ * In panic, the legacy consoles are not allowed to print from >+ * the printk calling context unless explicitly allowed. 
This >+ * gives the safe nbcon consoles a chance to print out all the >+ * panic messages first. This restriction only applies if >+ * there are nbcon consoles registered. >+ */ >+ if (is_panic_context) >+ do_trylock_unlock &= legacy_allow_panic_sync; > >- printk_delay(level); >+ /* >+ * There are situations where nbcon atomic printing should >+ * happen in the printk() caller context: >+ * >+ * - When this CPU is in panic. >+ * >+ * - When booting, before the printing threads have been >+ * started. >+ * >+ * - During shutdown, since the printing threads may not get >+ * a chance to print the final messages. >+ * >+ * Note that if boot consoles are registered, the console >+ * lock/unlock dance must be relied upon instead because nbcon >+ * consoles cannot print simultaneously with boot consoles. >+ */ >+ if (is_panic_context || >+ !printk_threads_enabled || >+ (system_state > SYSTEM_RUNNING)) { >+ nbcon_atomic_flush_pending(); >+ } >+ } > >- printed_len = vprintk_store(facility, level, dev_info, fmt, args); >+ nbcon_wake_threads(); > >- /* If called from the scheduler, we can not call up(). */ >- if (!in_sched) { >+ if (do_trylock_unlock) { > /* > * The caller may be holding system-critical or > * timing-sensitive locks. Disable preemption during > * printing of all remaining records to all consoles so that >- * this context can return as soon as possible. Hopefully >- * another printk() caller will take over the printing. >- */ >+ * this context can return as soon as possible. Hopefully >+ * another printk() caller will take over the printing. >+ * >+ * Also, nbcon_get_default_prio() requires migration disabled. >+ */ > preempt_disable(); >+ > /* > * Try to acquire and then immediately release the console > * semaphore. The release will print out buffers. With the > * spinning variant, this context tries to take over the > * printing from another printing context. 
>- */ >- if (console_trylock_spinning()) >- console_unlock(); >+ * >+ * Skip it in EMERGENCY priority. The console will be >+ * explicitly flushed when exiting the emergency section. >+ */ >+ if (nbcon_get_default_prio() != NBCON_PRIO_EMERGENCY) { >+ if (console_trylock_spinning()) >+ console_unlock(); >+ } >+ > preempt_enable(); > } > >- if (in_sched) >- defer_console_output(); >- else >+ if (do_trylock_unlock) > wake_up_klogd(); >+ else >+ defer_console_output(); > > return printed_len; > } >@@ -2387,6 +2488,14 @@ EXPORT_SYMBOL(_printk); > static bool pr_flush(int timeout_ms, bool reset_on_progress); > static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); > >+static struct task_struct *nbcon_legacy_kthread; >+ >+static inline void wake_up_legacy_kthread(void) >+{ >+ if (nbcon_legacy_kthread) >+ wake_up_interruptible(&legacy_wait); >+} >+ > #else /* CONFIG_PRINTK */ > > #define printk_time false >@@ -2400,6 +2509,8 @@ static u64 syslog_seq; > static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } > static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } > >+static inline void nbcon_legacy_kthread_create(void) { } >+static inline void wake_up_legacy_kthread(void) { } > #endif /* CONFIG_PRINTK */ > > #ifdef CONFIG_EARLY_PRINTK >@@ -2615,6 +2726,8 @@ void suspend_console(void) > void resume_console(void) > { > struct console *con; >+ short flags; >+ int cookie; > > if (!console_suspend_enabled) > return; >@@ -2631,6 +2744,20 @@ void resume_console(void) > */ > synchronize_srcu(&console_srcu); > >+ /* >+ * Since this runs in task context, wake the threaded printers >+ * directly rather than scheduling irq_work to do it. 
>+ */ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ flags = console_srcu_read_flags(con); >+ if (flags & CON_NBCON) >+ nbcon_kthread_wake(con); >+ } >+ console_srcu_read_unlock(cookie); >+ >+ wake_up_legacy_kthread(); >+ > pr_flush(1000, true); > } > >@@ -2645,7 +2772,8 @@ void resume_console(void) > */ > static int console_cpu_notify(unsigned int cpu) > { >- if (!cpuhp_tasks_frozen) { >+ if (!cpuhp_tasks_frozen && printing_via_unlock && >+ !force_printkthreads()) { > /* If trylock fails, someone else is doing the printing */ > if (console_trylock()) > console_unlock(); >@@ -2702,36 +2830,6 @@ int is_console_locked(void) > } > EXPORT_SYMBOL(is_console_locked); > >-/* >- * Check if the given console is currently capable and allowed to print >- * records. >- * >- * Requires the console_srcu_read_lock. >- */ >-static inline bool console_is_usable(struct console *con) >-{ >- short flags = console_srcu_read_flags(con); >- >- if (!(flags & CON_ENABLED)) >- return false; >- >- if ((flags & CON_SUSPENDED)) >- return false; >- >- if (!con->write) >- return false; >- >- /* >- * Console drivers may assume that per-cpu resources have been >- * allocated. So unless they're explicitly marked as being able to >- * cope (CON_ANYTIME) don't call them until this CPU is officially up. >- */ >- if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) >- return false; >- >- return true; >-} >- > static void __console_unlock(void) > { > console_locked = 0; >@@ -2741,30 +2839,25 @@ static void __console_unlock(void) > #ifdef CONFIG_PRINTK > > /* >- * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This >- * is achieved by shifting the existing message over and inserting the dropped >- * message. >+ * Prepend the message in @pmsg->pbufs->outbuf with the message in >+ * @pmsg->pbufs->scratchbuf. This is achieved by shifting the existing message >+ * over and inserting the scratchbuf message. 
> * > * @pmsg is the printk message to prepend. > * >- * @dropped is the dropped count to report in the dropped message. >+ * @len is the length of the message in @pmsg->pbufs->scratchbuf. > * > * If the message text in @pmsg->pbufs->outbuf does not have enough space for >- * the dropped message, the message text will be sufficiently truncated. >+ * the scratchbuf message, the message text will be sufficiently truncated. > * > * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. > */ >-void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) >+static void __console_prepend_scratch(struct printk_message *pmsg, size_t len) > { > struct printk_buffers *pbufs = pmsg->pbufs; >- const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); > const size_t outbuf_sz = sizeof(pbufs->outbuf); > char *scratchbuf = &pbufs->scratchbuf[0]; > char *outbuf = &pbufs->outbuf[0]; >- size_t len; >- >- len = scnprintf(scratchbuf, scratchbuf_sz, >- "** %lu printk messages dropped **\n", dropped); > > /* > * Make sure outbuf is sufficiently large before prepending. >@@ -2786,6 +2879,46 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) > pmsg->outbuf_len += len; > } > >+/* >+ * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". >+ * @pmsg->outbuf_len is updated appropriately. >+ * >+ * @pmsg is the printk message to prepend. >+ * >+ * @dropped is the dropped count to report in the dropped message. >+ */ >+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) >+{ >+ struct printk_buffers *pbufs = pmsg->pbufs; >+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); >+ char *scratchbuf = &pbufs->scratchbuf[0]; >+ size_t len; >+ >+ len = scnprintf(scratchbuf, scratchbuf_sz, >+ "** %lu printk messages dropped **\n", dropped); >+ >+ __console_prepend_scratch(pmsg, len); >+} >+ >+/* >+ * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". 
>+ * @pmsg->outbuf_len is updated appropriately. >+ * >+ * @pmsg is the printk message to prepend. >+ */ >+void console_prepend_replay(struct printk_message *pmsg) >+{ >+ struct printk_buffers *pbufs = pmsg->pbufs; >+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); >+ char *scratchbuf = &pbufs->scratchbuf[0]; >+ size_t len; >+ >+ len = scnprintf(scratchbuf, scratchbuf_sz, >+ "** replaying previous printk message **\n"); >+ >+ __console_prepend_scratch(pmsg, len); >+} >+ > /* > * Read and format the specified record (or a later record if the specified > * record is not available). >@@ -2808,8 +2941,6 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) > bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > bool is_extended, bool may_suppress) > { >- static int panic_console_dropped; >- > struct printk_buffers *pbufs = pmsg->pbufs; > const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); > const size_t outbuf_sz = sizeof(pbufs->outbuf); >@@ -2837,17 +2968,6 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > pmsg->seq = r.info->seq; > pmsg->dropped = r.info->seq - seq; > >- /* >- * Check for dropped messages in panic here so that printk >- * suppression can occur as early as possible if necessary. >- */ >- if (pmsg->dropped && >- panic_in_progress() && >- panic_console_dropped++ > 10) { >- suppress_panic_printk = 1; >- pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); >- } >- > /* Skip record that has level above the console loglevel. */ > if (may_suppress && suppress_message_printing(r.info->level)) > goto out; >@@ -2864,6 +2984,33 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > return true; > } > >+/* >+ * Legacy console printing from printk() caller context does not respect >+ * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a >+ * false positive. 
For PREEMPT_RT the false positive condition does not >+ * occur. >+ * >+ * This map is used to establish LD_WAIT_SLEEP context for the console write >+ * callbacks when legacy printing to avoid false positive lockdep complaints, >+ * thus allowing lockdep to continue to function for real issues. >+ */ >+#ifdef CONFIG_PREEMPT_RT >+static inline void printk_legacy_lock_map_acquire_try(void) { } >+static inline void printk_legacy_lock_map_release(void) { } >+#else >+static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); >+ >+static inline void printk_legacy_lock_map_acquire_try(void) >+{ >+ lock_map_acquire_try(&printk_legacy_map); >+} >+ >+static inline void printk_legacy_lock_map_release(void) >+{ >+ lock_map_release(&printk_legacy_map); >+} >+#endif /* CONFIG_PREEMPT_RT */ >+ > /* > * Used as the printk buffers for non-panic, serialized console printing. > * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. >@@ -2913,31 +3060,45 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co > con->dropped = 0; > } > >- /* >- * While actively printing out messages, if another printk() >- * were to occur on another CPU, it may wait for this one to >- * finish. This task can not be preempted if there is a >- * waiter waiting to take over. >- * >- * Interrupts are disabled because the hand over to a waiter >- * must not be interrupted until the hand over is completed >- * (@console_waiter is cleared). >- */ >- printk_safe_enter_irqsave(flags); >- console_lock_spinning_enable(); >- >- /* Do not trace print latency. */ >- stop_critical_timings(); >- > /* Write everything out to the hardware. */ >- con->write(con, outbuf, pmsg.outbuf_len); > >- start_critical_timings(); >+ if (force_printkthreads()) { >+ /* >+ * With forced threading this function is either in a thread >+ * or panic context. So there is no need for concern about >+ * printk reentrance, handovers, or lockdep complaints. 
>+ */ > >- con->seq = pmsg.seq + 1; >+ con->write(con, outbuf, pmsg.outbuf_len); >+ con->seq = pmsg.seq + 1; >+ } else { >+ /* >+ * While actively printing out messages, if another printk() >+ * were to occur on another CPU, it may wait for this one to >+ * finish. This task can not be preempted if there is a >+ * waiter waiting to take over. >+ * >+ * Interrupts are disabled because the hand over to a waiter >+ * must not be interrupted until the hand over is completed >+ * (@console_waiter is cleared). >+ */ >+ printk_safe_enter_irqsave(flags); >+ console_lock_spinning_enable(); > >- *handover = console_lock_spinning_disable_and_check(cookie); >- printk_safe_exit_irqrestore(flags); >+ /* Do not trace print latency. */ >+ stop_critical_timings(); >+ >+ printk_legacy_lock_map_acquire_try(); >+ con->write(con, outbuf, pmsg.outbuf_len); >+ printk_legacy_lock_map_release(); >+ >+ start_critical_timings(); >+ >+ con->seq = pmsg.seq + 1; >+ >+ *handover = console_lock_spinning_disable_and_check(cookie); >+ printk_safe_exit_irqrestore(flags); >+ } > skip: > return true; > } >@@ -2990,13 +3151,29 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > > cookie = console_srcu_read_lock(); > for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ u64 printk_seq; > bool progress; > >- if (!console_is_usable(con)) >+ /* >+ * console_flush_all() is only for legacy consoles, >+ * unless the nbcon console has no kthread printer. 
>+ */ >+ if ((flags & CON_NBCON) && con->kthread) >+ continue; >+ >+ if (!console_is_usable(con, flags, !do_cond_resched)) > continue; > any_usable = true; > >- progress = console_emit_next_record(con, handover, cookie); >+ if (flags & CON_NBCON) { >+ progress = nbcon_legacy_emit_next_record(con, handover, cookie, >+ !do_cond_resched); >+ printk_seq = nbcon_seq_read(con); >+ } else { >+ progress = console_emit_next_record(con, handover, cookie); >+ printk_seq = con->seq; >+ } > > /* > * If a handover has occurred, the SRCU read lock >@@ -3006,8 +3183,8 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > return false; > > /* Track the next of the highest seq flushed. */ >- if (con->seq > *next_seq) >- *next_seq = con->seq; >+ if (printk_seq > *next_seq) >+ *next_seq = printk_seq; > > if (!progress) > continue; >@@ -3030,19 +3207,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > return false; > } > >-/** >- * console_unlock - unblock the console subsystem from printing >- * >- * Releases the console_lock which the caller holds to block printing of >- * the console subsystem. >- * >- * While the console_lock was held, console output may have been buffered >- * by printk(). If this is the case, console_unlock(); emits >- * the output prior to releasing the lock. >- * >- * console_unlock(); may be called from any context. >- */ >-void console_unlock(void) >+static void console_flush_and_unlock(void) > { > bool do_cond_resched; > bool handover; >@@ -3086,6 +3251,32 @@ void console_unlock(void) > */ > } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); > } >+ >+/** >+ * console_unlock - unblock the console subsystem from printing >+ * >+ * Releases the console_lock which the caller holds to block printing of >+ * the console subsystem. >+ * >+ * While the console_lock was held, console output may have been buffered >+ * by printk(). 
If this is the case, console_unlock(); emits >+ * the output prior to releasing the lock. >+ * >+ * console_unlock(); may be called from any context. >+ */ >+void console_unlock(void) >+{ >+ /* >+ * Forced threading relies on kthread and atomic consoles for >+ * printing. It never attempts to print from console_unlock(). >+ */ >+ if (force_printkthreads()) { >+ __console_unlock(); >+ return; >+ } >+ >+ console_flush_and_unlock(); >+} > EXPORT_SYMBOL(console_unlock); > > /** >@@ -3219,7 +3410,10 @@ void console_flush_on_panic(enum con_flush_mode mode) > console_srcu_read_unlock(cookie); > } > >- console_flush_all(false, &next_seq, &handover); >+ nbcon_atomic_flush_pending(); >+ >+ if (printing_via_unlock) >+ console_flush_all(false, &next_seq, &handover); > } > > /* >@@ -3276,13 +3470,122 @@ EXPORT_SYMBOL(console_stop); > > void console_start(struct console *console) > { >+ short flags; >+ > console_list_lock(); > console_srcu_write_flags(console, console->flags | CON_ENABLED); >+ flags = console->flags; > console_list_unlock(); >+ >+ /* >+ * Ensure that all SRCU list walks have completed. The related >+ * printing context must be able to see it is enabled so that >+ * it is guaranteed to wake up and resume printing. >+ */ >+ synchronize_srcu(&console_srcu); >+ >+ if (flags & CON_NBCON) >+ nbcon_kthread_wake(console); >+ else >+ wake_up_legacy_kthread(); >+ > __pr_flush(console, 1000, true); > } > EXPORT_SYMBOL(console_start); > >+#ifdef CONFIG_PRINTK >+static bool printer_should_wake(void) >+{ >+ bool available = false; >+ struct console *con; >+ int cookie; >+ >+ if (kthread_should_stop()) >+ return true; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ u64 printk_seq; >+ >+ /* >+ * The legacy printer thread is only for legacy consoles, >+ * unless the nbcon console has no kthread printer. 
>+ */ >+ if ((flags & CON_NBCON) && con->kthread) >+ continue; >+ >+ if (!console_is_usable(con, flags, true)) >+ continue; >+ >+ if (flags & CON_NBCON) { >+ printk_seq = nbcon_seq_read(con); >+ } else { >+ /* >+ * It is safe to read @seq because only this >+ * thread context updates @seq. >+ */ >+ printk_seq = con->seq; >+ } >+ >+ if (prb_read_valid(prb, printk_seq, NULL)) { >+ available = true; >+ break; >+ } >+ } >+ console_srcu_read_unlock(cookie); >+ >+ return available; >+} >+ >+static int nbcon_legacy_kthread_func(void *unused) >+{ >+ int error; >+ >+ for (;;) { >+ error = wait_event_interruptible(legacy_wait, printer_should_wake()); >+ >+ if (kthread_should_stop()) >+ break; >+ >+ if (error) >+ continue; >+ >+ console_lock(); >+ console_flush_and_unlock(); >+ } >+ >+ return 0; >+} >+ >+void nbcon_legacy_kthread_create(void) >+{ >+ struct task_struct *kt; >+ >+ lockdep_assert_held(&console_mutex); >+ >+ if (!force_printkthreads()) >+ return; >+ >+ if (!printk_threads_enabled || nbcon_legacy_kthread) >+ return; >+ >+ kt = kthread_run(nbcon_legacy_kthread_func, NULL, "pr/legacy"); >+ if (IS_ERR(kt)) { >+ pr_err("unable to start legacy printing thread\n"); >+ return; >+ } >+ >+ nbcon_legacy_kthread = kt; >+ >+ /* >+ * It is important that console printing threads are scheduled >+ * shortly after a printk call and with generous runtime budgets. >+ */ >+ sched_set_normal(nbcon_legacy_kthread, -20); >+} >+#endif /* CONFIG_PRINTK */ >+ > static int __read_mostly keep_bootcon; > > static int __init keep_bootcon_setup(char *str) >@@ -3366,6 +3669,7 @@ static void try_enable_default_console(struct console *newcon) > newcon->flags |= CON_CONSDEV; > } > >+/* Set @newcon->seq to the first record this console should print. 
*/ > static void console_init_seq(struct console *newcon, bool bootcon_registered) > { > struct console *con; >@@ -3414,11 +3718,20 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) > > newcon->seq = prb_next_seq(prb); > for_each_console(con) { >- if ((con->flags & CON_BOOT) && >- (con->flags & CON_ENABLED) && >- con->seq < newcon->seq) { >- newcon->seq = con->seq; >+ u64 seq; >+ >+ if (!((con->flags & CON_BOOT) && >+ (con->flags & CON_ENABLED))) { >+ continue; > } >+ >+ if (con->flags & CON_NBCON) >+ seq = nbcon_seq_read(con); >+ else >+ seq = con->seq; >+ >+ if (seq < newcon->seq) >+ newcon->seq = seq; > } > } > >@@ -3456,6 +3769,7 @@ void register_console(struct console *newcon) > struct console *con; > bool bootcon_registered = false; > bool realcon_registered = false; >+ unsigned long flags; > int err; > > console_list_lock(); >@@ -3535,9 +3849,38 @@ void register_console(struct console *newcon) > newcon->dropped = 0; > console_init_seq(newcon, bootcon_registered); > >- if (newcon->flags & CON_NBCON) >+ if (newcon->flags & CON_NBCON) { >+ have_nbcon_console = true; > nbcon_init(newcon); > >+ /* >+ * nbcon consoles have their own sequence counter. The legacy >+ * sequence counter is reset so that it is clear it is not >+ * being used. >+ */ >+ nbcon_seq_force(newcon, newcon->seq); >+ newcon->seq = 0; >+ } else { >+ have_legacy_console = true; >+ nbcon_legacy_kthread_create(); >+ } >+ >+ if (newcon->flags & CON_BOOT) >+ have_boot_console = true; >+ >+ /* >+ * If another context is actively using the hardware of this new >+ * console, it will not be aware of the nbcon synchronization. This >+ * is a risk that two contexts could access the hardware >+ * simultaneously if this new console is used for atomic printing >+ * and the other context is still using the hardware. >+ * >+ * Use the driver synchronization to ensure that the hardware is not >+ * in use while this new console transitions to being registered. 
>+ */ >+ if ((newcon->flags & CON_NBCON) && newcon->write_atomic) >+ newcon->device_lock(newcon, &flags); >+ > /* > * Put this console in the list - keep the > * preferred driver at the head of the list. >@@ -3562,6 +3905,10 @@ void register_console(struct console *newcon) > * register_console() completes. > */ > >+ /* This new console is now registered. */ >+ if ((newcon->flags & CON_NBCON) && newcon->write_atomic) >+ newcon->device_unlock(newcon, flags); >+ > console_sysfs_notify(); > > /* >@@ -3590,6 +3937,11 @@ EXPORT_SYMBOL(register_console); > /* Must be called under console_list_lock(). */ > static int unregister_console_locked(struct console *console) > { >+ bool is_boot_con = (console->flags & CON_BOOT); >+ bool found_legacy_con = false; >+ bool found_nbcon_con = false; >+ bool found_boot_con = false; >+ struct console *c; > int res; > > lockdep_assert_console_list_lock_held(); >@@ -3637,6 +3989,42 @@ static int unregister_console_locked(struct console *console) > if (console->exit) > res = console->exit(console); > >+ /* >+ * With this console gone, the global flags tracking registered >+ * console types may have changed. Update them. >+ */ >+ for_each_console(c) { >+ if (c->flags & CON_BOOT) >+ found_boot_con = true; >+ >+ if (c->flags & CON_NBCON) >+ found_nbcon_con = true; >+ else >+ found_legacy_con = true; >+ } >+ if (!found_boot_con) >+ have_boot_console = found_boot_con; >+ if (!found_legacy_con) >+ have_legacy_console = found_legacy_con; >+ if (!found_nbcon_con) >+ have_nbcon_console = found_nbcon_con; >+ >+ /* >+ * When the last boot console unregisters, start up the >+ * printing threads. 
>+ */ >+ if (is_boot_con && !have_boot_console) { >+ for_each_console(c) >+ nbcon_kthread_create(c); >+ } >+ >+#ifdef CONFIG_PRINTK >+ if (!printing_via_unlock && nbcon_legacy_kthread) { >+ kthread_stop(nbcon_legacy_kthread); >+ nbcon_legacy_kthread = NULL; >+ } >+#endif >+ > return res; > } > >@@ -3795,23 +4183,39 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > > seq = prb_next_reserve_seq(prb); > >- /* Flush the consoles so that records up to @seq are printed. */ >- console_lock(); >- console_unlock(); >+ /* >+ * Flush the consoles so that records up to @seq are printed. >+ * Otherwise this function will just wait for the threaded printers >+ * to print up to @seq. >+ */ >+ if (printing_via_unlock && !force_printkthreads()) { >+ console_lock(); >+ console_unlock(); >+ } > > for (;;) { > unsigned long begin_jiffies; > unsigned long slept_jiffies; >+ bool use_console_lock = printing_via_unlock; >+ >+ /* >+ * Ensure the compiler does not optimize @use_console_lock to >+ * be @printing_via_unlock since the latter can change at any >+ * time. >+ */ >+ barrier(); > > diff = 0; > >- /* >- * Hold the console_lock to guarantee safe access to >- * console->seq. Releasing console_lock flushes more >- * records in case @seq is still not printed on all >- * usable consoles. >- */ >- console_lock(); >+ if (use_console_lock) { >+ /* >+ * Hold the console_lock to guarantee safe access to >+ * console->seq. Releasing console_lock flushes more >+ * records in case @seq is still not printed on all >+ * usable consoles. >+ */ >+ console_lock(); >+ } > > cookie = console_srcu_read_lock(); > for_each_console_srcu(c) { >@@ -3825,12 +4229,15 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > * that they make forward progress, so only increment > * @diff for usable consoles. 
> */ >- if (!console_is_usable(c)) >+ if (!console_is_usable(c, flags, true) && >+ !console_is_usable(c, flags, false)) { > continue; >+ } > > if (flags & CON_NBCON) { > printk_seq = nbcon_seq_read(c); > } else { >+ WARN_ON_ONCE(!use_console_lock); > printk_seq = c->seq; > } > >@@ -3842,7 +4249,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > if (diff != last_diff && reset_on_progress) > remaining_jiffies = timeout_jiffies; > >- console_unlock(); >+ if (use_console_lock) >+ console_unlock(); > > /* Note: @diff is 0 if there are no usable consoles. */ > if (diff == 0 || remaining_jiffies == 0) >@@ -3894,9 +4302,16 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) > int pending = this_cpu_xchg(printk_pending, 0); > > if (pending & PRINTK_PENDING_OUTPUT) { >- /* If trylock fails, someone else is doing the printing */ >- if (console_trylock()) >- console_unlock(); >+ if (force_printkthreads()) { >+ wake_up_legacy_kthread(); >+ } else { >+ /* >+ * If trylock fails, some other context >+ * will do the printing. >+ */ >+ if (console_trylock()) >+ console_unlock(); >+ } > } > > if (pending & PRINTK_PENDING_WAKEUP) >@@ -3912,6 +4327,7 @@ static void __wake_up_klogd(int val) > return; > > preempt_disable(); >+ > /* > * Guarantee any new records can be seen by tasks preparing to wait > * before this context checks if the wait queue is empty. >@@ -3923,11 +4339,22 @@ static void __wake_up_klogd(int val) > * > * This pairs with devkmsg_read:A and syslog_print:A. > */ >- if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ >- (val & PRINTK_PENDING_OUTPUT)) { >+ if (!wq_has_sleeper(&log_wait)) /* LMM(__wake_up_klogd:A) */ >+ val &= ~PRINTK_PENDING_WAKEUP; >+ >+ /* >+ * Simple read is safe. register_console() would flush a newly >+ * registered legacy console when writing the message about it >+ * being enabled. 
>+ */ >+ if (!printing_via_unlock) >+ val &= ~PRINTK_PENDING_OUTPUT; >+ >+ if (val) { > this_cpu_or(printk_pending, val); > irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); > } >+ > preempt_enable(); > } > >@@ -3969,6 +4396,7 @@ void defer_console_output(void) > > void printk_trigger_flush(void) > { >+ nbcon_wake_threads(); > defer_console_output(); > } > >diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c >index f5a8bb606fe50..88e8f3a619229 100644 >--- a/kernel/printk/printk_ringbuffer.c >+++ b/kernel/printk/printk_ringbuffer.c >@@ -1034,9 +1034,13 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, > unsigned long next_lpos; > > if (size == 0) { >- /* Specify a data-less block. */ >- blk_lpos->begin = NO_LPOS; >- blk_lpos->next = NO_LPOS; >+ /* >+ * Data blocks are not created for empty lines. Instead, the >+ * reader will recognize these special lpos values and handle >+ * it appropriately. >+ */ >+ blk_lpos->begin = EMPTY_LINE_LPOS; >+ blk_lpos->next = EMPTY_LINE_LPOS; > return NULL; > } > >@@ -1214,10 +1218,18 @@ static const char *get_data(struct prb_data_ring *data_ring, > > /* Data-less data block description. */ > if (BLK_DATALESS(blk_lpos)) { >- if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { >+ /* >+ * Records that are just empty lines are also valid, even >+ * though they do not have a data block. For such records >+ * explicitly return empty string data to signify success. >+ */ >+ if (blk_lpos->begin == EMPTY_LINE_LPOS && >+ blk_lpos->next == EMPTY_LINE_LPOS) { > *data_size = 0; > return ""; > } >+ >+ /* Data lost, invalid, or otherwise unavailable. 
*/ > return NULL; > } > >diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h >index cb887489d00f0..bd2a892deac1a 100644 >--- a/kernel/printk/printk_ringbuffer.h >+++ b/kernel/printk/printk_ringbuffer.h >@@ -5,6 +5,8 @@ > > #include <linux/atomic.h> > #include <linux/dev_printk.h> >+#include <linux/stddef.h> >+#include <linux/types.h> > > /* > * Meta information about each stored message. >@@ -127,8 +129,22 @@ enum desc_state { > #define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) > #define DESC_ID_MASK (~DESC_FLAGS_MASK) > #define DESC_ID(sv) ((sv) & DESC_ID_MASK) >+ >+/* >+ * Special data block logical position values (for fields of >+ * @prb_desc.text_blk_lpos). >+ * >+ * - Bit0 is used to identify if the record has no data block. (Implemented in >+ * the LPOS_DATALESS() macro.) >+ * >+ * - Bit1 specifies the reason for not having a data block. >+ * >+ * These special values could never be real lpos values because of the >+ * meta data and alignment padding of data blocks. (See to_blk_size() for >+ * details.) 
>+ */ > #define FAILED_LPOS 0x1 >-#define NO_LPOS 0x3 >+#define EMPTY_LINE_LPOS 0x3 > > #define FAILED_BLK_LPOS \ > { \ >diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c >index 6d10927a07d83..4421ccac31133 100644 >--- a/kernel/printk/printk_safe.c >+++ b/kernel/printk/printk_safe.c >@@ -26,6 +26,18 @@ void __printk_safe_exit(void) > this_cpu_dec(printk_context); > } > >+void __printk_deferred_enter(void) >+{ >+ cant_migrate(); >+ __printk_safe_enter(); >+} >+ >+void __printk_deferred_exit(void) >+{ >+ cant_migrate(); >+ __printk_safe_exit(); >+} >+ > asmlinkage int vprintk(const char *fmt, va_list args) > { > #ifdef CONFIG_KGDB_KDB >diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c >index 7567ca8e743ca..48a9d47ec90eb 100644 >--- a/kernel/rcu/rcutorture.c >+++ b/kernel/rcu/rcutorture.c >@@ -2409,6 +2409,12 @@ static int rcutorture_booster_init(unsigned int cpu) > WARN_ON_ONCE(!t); > sp.sched_priority = 2; > sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); >+#ifdef CONFIG_PREEMPT_RT >+ t = per_cpu(timersd, cpu); >+ WARN_ON_ONCE(!t); >+ sp.sched_priority = 2; >+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); >+#endif > } > > /* Don't allow time recalculation while creating a new task. */ >diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h >index 8107f818455da..b17130b7e522d 100644 >--- a/kernel/rcu/tree_exp.h >+++ b/kernel/rcu/tree_exp.h >@@ -7,6 +7,7 @@ > * Authors: Paul E. 
McKenney <paulmck@linux.ibm.com> > */ > >+#include <linux/console.h> > #include <linux/lockdep.h> > > static void rcu_exp_handler(void *unused); >@@ -636,6 +637,9 @@ static void synchronize_rcu_expedited_wait(void) > return; > if (rcu_stall_is_suppressed()) > continue; >+ >+ nbcon_cpu_emergency_enter(); >+ > j = jiffies; > rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); > trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); >@@ -689,6 +693,9 @@ static void synchronize_rcu_expedited_wait(void) > rcu_exp_print_detail_task_stall_rnp(rnp); > } > jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; >+ >+ nbcon_cpu_emergency_exit(); >+ > panic_on_rcu_stall(); > } > } >diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h >index 5d666428546b0..f4d73ca20c768 100644 >--- a/kernel/rcu/tree_stall.h >+++ b/kernel/rcu/tree_stall.h >@@ -7,6 +7,7 @@ > * Author: Paul E. McKenney <paulmck@linux.ibm.com> > */ > >+#include <linux/console.h> > #include <linux/kvm_para.h> > #include <linux/rcu_notifier.h> > >@@ -604,6 +605,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) > if (rcu_stall_is_suppressed()) > return; > >+ nbcon_cpu_emergency_enter(); >+ > /* > * OK, time to rat on our buddy... > * See Documentation/RCU/stallwarn.rst for info on how to debug >@@ -655,6 +658,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) > rcu_check_gp_kthread_expired_fqs_timer(); > rcu_check_gp_kthread_starvation(); > >+ nbcon_cpu_emergency_exit(); >+ > panic_on_rcu_stall(); > > rcu_force_quiescent_state(); /* Kick them all. */ >@@ -675,6 +680,8 @@ static void print_cpu_stall(unsigned long gps) > if (rcu_stall_is_suppressed()) > return; > >+ nbcon_cpu_emergency_enter(); >+ > /* > * OK, time to rat on ourselves... 
> * See Documentation/RCU/stallwarn.rst for info on how to debug >@@ -703,6 +710,8 @@ static void print_cpu_stall(unsigned long gps) > jiffies + 3 * rcu_jiffies_till_stall_check() + 3); > raw_spin_unlock_irqrestore_rcu_node(rnp, flags); > >+ nbcon_cpu_emergency_exit(); >+ > panic_on_rcu_stall(); > > /* >diff --git a/kernel/sched/core.c b/kernel/sched/core.c >index 9116bcc903467..5015768f10256 100644 >--- a/kernel/sched/core.c >+++ b/kernel/sched/core.c >@@ -899,14 +899,15 @@ static inline void hrtick_rq_init(struct rq *rq) > > #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) > /* >- * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, >+ * Atomically set TIF_NEED_RESCHED[_LAZY] and test for TIF_POLLING_NRFLAG, > * this avoids any races wrt polling state changes and thereby avoids > * spurious IPIs. > */ >-static inline bool set_nr_and_not_polling(struct task_struct *p) >+static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) > { > struct thread_info *ti = task_thread_info(p); >- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); >+ >+ return !(fetch_or(&ti->flags, 1 << tif_bit) & _TIF_POLLING_NRFLAG); > } > > /* >@@ -923,7 +924,7 @@ static bool set_nr_if_polling(struct task_struct *p) > do { > if (!(val & _TIF_POLLING_NRFLAG)) > return false; >- if (val & _TIF_NEED_RESCHED) >+ if (val & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > return true; > } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); > >@@ -931,9 +932,9 @@ static bool set_nr_if_polling(struct task_struct *p) > } > > #else >-static inline bool set_nr_and_not_polling(struct task_struct *p) >+static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) > { >- set_tsk_need_resched(p); >+ set_tsk_thread_flag(p, tif_bit); > return true; > } > >@@ -1038,28 +1039,47 @@ void wake_up_q(struct wake_q_head *head) > * might also involve a cross-CPU call to trigger the scheduler on > * the target CPU. 
> */ >-void resched_curr(struct rq *rq) >+static void __resched_curr(struct rq *rq, int lazy) > { >+ int cpu, tif_bit = TIF_NEED_RESCHED + lazy; > struct task_struct *curr = rq->curr; >- int cpu; > > lockdep_assert_rq_held(rq); > >- if (test_tsk_need_resched(curr)) >+ if (unlikely(test_tsk_thread_flag(curr, tif_bit))) > return; > > cpu = cpu_of(rq); > > if (cpu == smp_processor_id()) { >- set_tsk_need_resched(curr); >- set_preempt_need_resched(); >+ set_tsk_thread_flag(curr, tif_bit); >+ if (!lazy) >+ set_preempt_need_resched(); > return; > } > >- if (set_nr_and_not_polling(curr)) >- smp_send_reschedule(cpu); >- else >+ if (set_nr_and_not_polling(curr, tif_bit)) { >+ if (!lazy) >+ smp_send_reschedule(cpu); >+ } else { > trace_sched_wake_idle_without_ipi(cpu); >+ } >+} >+ >+void resched_curr(struct rq *rq) >+{ >+ __resched_curr(rq, 0); >+} >+ >+void resched_curr_lazy(struct rq *rq) >+{ >+ int lazy = IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && !sched_feat(FORCE_NEED_RESCHED) ? >+ TIF_NEED_RESCHED_LAZY_OFFSET : 0; >+ >+ if (lazy && unlikely(test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED))) >+ return; >+ >+ __resched_curr(rq, lazy); > } > > void resched_cpu(int cpu) >@@ -1154,7 +1174,7 @@ static void wake_up_idle_cpu(int cpu) > * and testing of the above solutions didn't appear to report > * much benefits. > */ >- if (set_nr_and_not_polling(rq->idle)) >+ if (set_nr_and_not_polling(rq->idle, TIF_NEED_RESCHED)) > smp_send_reschedule(cpu); > else > trace_sched_wake_idle_without_ipi(cpu); >@@ -8890,6 +8910,21 @@ static inline void preempt_dynamic_init(void) { } > > #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ > >+/* >+ * task_is_pi_boosted - Check if task has been PI boosted. >+ * @p: Task to check. >+ * >+ * Return true if task is subject to priority inheritance. 
>+ */ >+bool task_is_pi_boosted(const struct task_struct *p) >+{ >+ int prio = p->prio; >+ >+ if (!rt_prio(prio)) >+ return false; >+ return prio != p->normal_prio; >+} >+ > /** > * yield - yield the current processor to other threads. > * >diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c >index 8d5d98a5834df..b462333db26cb 100644 >--- a/kernel/sched/debug.c >+++ b/kernel/sched/debug.c >@@ -333,6 +333,23 @@ static const struct file_operations sched_debug_fops = { > .release = seq_release, > }; > >+static ssize_t sched_hog_write(struct file *filp, const char __user *ubuf, >+ size_t cnt, loff_t *ppos) >+{ >+ unsigned long end = jiffies + 60 * HZ; >+ >+ for (; time_before(jiffies, end) && !signal_pending(current);) >+ cpu_relax(); >+ >+ return cnt; >+} >+ >+static const struct file_operations sched_hog_fops = { >+ .write = sched_hog_write, >+ .open = simple_open, >+ .llseek = default_llseek, >+}; >+ > static struct dentry *debugfs_sched; > > static __init int sched_init_debug(void) >@@ -374,6 +391,8 @@ static __init int sched_init_debug(void) > > debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); > >+ debugfs_create_file("hog", 0200, debugfs_sched, NULL, &sched_hog_fops); >+ > return 0; > } > late_initcall(sched_init_debug); >diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >index e2b4e0396af84..df60743b4d2c0 100644 >--- a/kernel/sched/fair.c >+++ b/kernel/sched/fair.c >@@ -975,8 +975,10 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); > * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i > * this is probably good enough. 
> */ >-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) >+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se, bool tick) > { >+ struct rq *rq = rq_of(cfs_rq); >+ > if ((s64)(se->vruntime - se->deadline) < 0) > return; > >@@ -995,10 +997,19 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) > /* > * The task has consumed its request, reschedule. > */ >- if (cfs_rq->nr_running > 1) { >- resched_curr(rq_of(cfs_rq)); >- clear_buddies(cfs_rq, se); >+ if (cfs_rq->nr_running < 2) >+ return; >+ >+ if (!IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) || sched_feat(FORCE_NEED_RESCHED)) { >+ resched_curr(rq); >+ } else { >+ /* Did the task ignore the lazy reschedule request? */ >+ if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY)) >+ resched_curr(rq); >+ else >+ resched_curr_lazy(rq); > } >+ clear_buddies(cfs_rq, se); > } > > #include "pelt.h" >@@ -1153,7 +1164,7 @@ s64 update_curr_common(struct rq *rq) > /* > * Update the current task's runtime statistics. > */ >-static void update_curr(struct cfs_rq *cfs_rq) >+static void __update_curr(struct cfs_rq *cfs_rq, bool tick) > { > struct sched_entity *curr = cfs_rq->curr; > s64 delta_exec; >@@ -1363,7 +1363,7 @@ > #else // !CONFIG_SCHED_BORE > curr->vruntime += calc_delta_fair(delta_exec, curr); > #endif // CONFIG_SCHED_BORE >- update_deadline(cfs_rq, curr); >+ update_deadline(cfs_rq, curr, tick); > update_min_vruntime(cfs_rq); > > if (entity_is_task(curr)) >@@ -1175,6 +1186,11 @@ static void update_curr(struct cfs_rq *cfs_rq) > account_cfs_rq_runtime(cfs_rq, delta_exec); > } > >+static inline void update_curr(struct cfs_rq *cfs_rq) >+{ >+ __update_curr(cfs_rq, false); >+} >+ > static void update_curr_fair(struct rq *rq) > { > update_curr(cfs_rq_of(&rq->curr->se)); >@@ -5493,7 +5509,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) > /* > * Update run-time statistics of the 'current'. 
> */ >- update_curr(cfs_rq); >+ __update_curr(cfs_rq, true); > > /* > * Ensure that runnable average is periodically updated. >@@ -5507,7 +5523,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) > * validating it and just reschedule. > */ > if (queued) { >- resched_curr(rq_of(cfs_rq)); >+ resched_curr_lazy(rq_of(cfs_rq)); > return; > } > /* >@@ -5653,7 +5669,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) > * hierarchy can be throttled > */ > if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) >- resched_curr(rq_of(cfs_rq)); >+ resched_curr_lazy(rq_of(cfs_rq)); > } > > static __always_inline >@@ -5913,7 +5929,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) > > /* Determine whether we need to wake up potentially idle CPU: */ > if (rq->curr == rq->idle && rq->cfs.nr_running) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > #ifdef CONFIG_SMP >@@ -6628,7 +6644,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) > > if (delta < 0) { > if (task_current(rq, p)) >- resched_curr(rq); >+ resched_curr_lazy(rq); > return; > } > hrtick_start(rq, delta); >@@ -8304,7 +8320,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int > * prevents us from potentially nominating it as a false LAST_BUDDY > * below. > */ >- if (test_tsk_need_resched(curr)) >+ if (need_resched()) > return; > > /* Idle tasks are by definition preempted by non-idle tasks. 
*/ >@@ -8346,7 +8362,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int > return; > > preempt: >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > #ifdef CONFIG_SMP >@@ -12516,7 +12532,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) > */ > if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && > __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > /* >@@ -12681,7 +12697,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) > */ > if (task_current(rq, p)) { > if (p->prio > oldprio) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } else > wakeup_preempt(rq, p, 0); > } >diff --git a/kernel/sched/features.h b/kernel/sched/features.h >index 143f55df890b1..6de570ab30078 100644 >--- a/kernel/sched/features.h >+++ b/kernel/sched/features.h >@@ -87,3 +87,5 @@ SCHED_FEAT(UTIL_EST, true) > SCHED_FEAT(LATENCY_WARN, false) > > SCHED_FEAT(HZ_BW, true) >+ >+SCHED_FEAT(FORCE_NEED_RESCHED, false) >diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c >index 31231925f1ece..58486420f3624 100644 >--- a/kernel/sched/idle.c >+++ b/kernel/sched/idle.c >@@ -57,8 +57,7 @@ static noinline int __cpuidle cpu_idle_poll(void) > ct_cpuidle_enter(); > > raw_local_irq_enable(); >- while (!tif_need_resched() && >- (cpu_idle_force_poll || tick_check_broadcast_expired())) >+ while (!need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) > cpu_relax(); > raw_local_irq_disable(); > >diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c >index 3261b067b67e2..8771140e0de5e 100644 >--- a/kernel/sched/rt.c >+++ b/kernel/sched/rt.c >@@ -2194,8 +2194,11 @@ static int rto_next_cpu(struct root_domain *rd) > > rd->rto_cpu = cpu; > >- if (cpu < nr_cpu_ids) >+ if (cpu < nr_cpu_ids) { >+ if (!has_pushable_tasks(cpu_rq(cpu))) >+ continue; > return cpu; >+ } > > rd->rto_cpu = -1; > >diff --git a/kernel/sched/sched.h 
b/kernel/sched/sched.h >index 001fe047bd5d8..17424c69537f2 100644 >--- a/kernel/sched/sched.h >+++ b/kernel/sched/sched.h >@@ -2463,6 +2463,7 @@ extern void init_sched_fair_class(void); > extern void reweight_task(struct task_struct *p, int prio); > > extern void resched_curr(struct rq *rq); >+extern void resched_curr_lazy(struct rq *rq); > extern void resched_cpu(int cpu); > > extern struct rt_bandwidth def_rt_bandwidth; >diff --git a/kernel/softirq.c b/kernel/softirq.c >index 210cf5f8d92c2..cae0ae2e2b0bb 100644 >--- a/kernel/softirq.c >+++ b/kernel/softirq.c >@@ -247,6 +247,19 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) > } > EXPORT_SYMBOL(__local_bh_enable_ip); > >+void softirq_preempt(void) >+{ >+ if (WARN_ON_ONCE(!preemptible())) >+ return; >+ >+ if (WARN_ON_ONCE(__this_cpu_read(softirq_ctrl.cnt) != SOFTIRQ_OFFSET)) >+ return; >+ >+ __local_bh_enable(SOFTIRQ_OFFSET, true); >+ /* preemption point */ >+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); >+} >+ > /* > * Invoked from ksoftirqd_run() outside of the interrupt disabled section > * to acquire the per CPU local lock for reentrancy protection. 
>@@ -619,6 +632,24 @@ static inline void tick_irq_exit(void) > #endif > } > >+#ifdef CONFIG_PREEMPT_RT >+DEFINE_PER_CPU(struct task_struct *, timersd); >+DEFINE_PER_CPU(unsigned long, pending_timer_softirq); >+ >+static void wake_timersd(void) >+{ >+ struct task_struct *tsk = __this_cpu_read(timersd); >+ >+ if (tsk) >+ wake_up_process(tsk); >+} >+ >+#else >+ >+static inline void wake_timersd(void) { } >+ >+#endif >+ > static inline void __irq_exit_rcu(void) > { > #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED >@@ -631,6 +662,10 @@ static inline void __irq_exit_rcu(void) > if (!in_interrupt() && local_softirq_pending()) > invoke_softirq(); > >+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() && >+ !(in_nmi() | in_hardirq())) >+ wake_timersd(); >+ > tick_irq_exit(); > } > >@@ -963,12 +998,70 @@ static struct smp_hotplug_thread softirq_threads = { > .thread_comm = "ksoftirqd/%u", > }; > >+#ifdef CONFIG_PREEMPT_RT >+static void timersd_setup(unsigned int cpu) >+{ >+ sched_set_fifo_low(current); >+} >+ >+static int timersd_should_run(unsigned int cpu) >+{ >+ return local_pending_timers(); >+} >+ >+static void run_timersd(unsigned int cpu) >+{ >+ unsigned int timer_si; >+ >+ ksoftirqd_run_begin(); >+ >+ timer_si = local_pending_timers(); >+ __this_cpu_write(pending_timer_softirq, 0); >+ or_softirq_pending(timer_si); >+ >+ __do_softirq(); >+ >+ ksoftirqd_run_end(); >+} >+ >+static void raise_ktimers_thread(unsigned int nr) >+{ >+ trace_softirq_raise(nr); >+ __this_cpu_or(pending_timer_softirq, 1 << nr); >+} >+ >+void raise_hrtimer_softirq(void) >+{ >+ raise_ktimers_thread(HRTIMER_SOFTIRQ); >+} >+ >+void raise_timer_softirq(void) >+{ >+ unsigned long flags; >+ >+ local_irq_save(flags); >+ raise_ktimers_thread(TIMER_SOFTIRQ); >+ wake_timersd(); >+ local_irq_restore(flags); >+} >+ >+static struct smp_hotplug_thread timer_threads = { >+ .store = &timersd, >+ .setup = timersd_setup, >+ .thread_should_run = timersd_should_run, >+ .thread_fn = run_timersd, >+ .thread_comm = 
"ktimers/%u", >+}; >+#endif >+ > static __init int spawn_ksoftirqd(void) > { > cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, > takeover_tasklets); > BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); >- >+#ifdef CONFIG_PREEMPT_RT >+ BUG_ON(smpboot_register_percpu_thread(&timer_threads)); >+#endif > return 0; > } > early_initcall(spawn_ksoftirqd); >diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c >index edb0f821dceaa..a7290012179a4 100644 >--- a/kernel/time/hrtimer.c >+++ b/kernel/time/hrtimer.c >@@ -1809,7 +1809,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) > if (!ktime_before(now, cpu_base->softirq_expires_next)) { > cpu_base->softirq_expires_next = KTIME_MAX; > cpu_base->softirq_activated = 1; >- raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+ raise_hrtimer_softirq(); > } > > __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); >@@ -1922,7 +1922,7 @@ void hrtimer_run_queues(void) > if (!ktime_before(now, cpu_base->softirq_expires_next)) { > cpu_base->softirq_expires_next = KTIME_MAX; > cpu_base->softirq_activated = 1; >- raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+ raise_hrtimer_softirq(); > } > > __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); >diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c >index 01fb50c1b17e4..910c04d7fa0d3 100644 >--- a/kernel/time/tick-sched.c >+++ b/kernel/time/tick-sched.c >@@ -796,7 +796,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) > > static inline bool local_timer_softirq_pending(void) > { >- return local_softirq_pending() & BIT(TIMER_SOFTIRQ); >+ return local_pending_timers() & BIT(TIMER_SOFTIRQ); > } > > static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) >diff --git a/kernel/time/timer.c b/kernel/time/timer.c >index 352b161113cda..d6bf128262c93 100644 >--- a/kernel/time/timer.c >+++ b/kernel/time/timer.c >@@ -1470,9 +1470,16 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) > */ > 
static void timer_sync_wait_running(struct timer_base *base) > { >- if (atomic_read(&base->timer_waiters)) { >+ bool need_preempt; >+ >+ need_preempt = task_is_pi_boosted(current); >+ if (need_preempt || atomic_read(&base->timer_waiters)) { > raw_spin_unlock_irq(&base->lock); > spin_unlock(&base->expiry_lock); >+ >+ if (need_preempt) >+ softirq_preempt(); >+ > spin_lock(&base->expiry_lock); > raw_spin_lock_irq(&base->lock); > } >@@ -2070,7 +2077,7 @@ static void run_local_timers(void) > if (time_before(jiffies, base->next_expiry)) > return; > } >- raise_softirq(TIMER_SOFTIRQ); >+ raise_timer_softirq(); > } > > /* >diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c >index c9c8983073485..947b5f1e799dd 100644 >--- a/kernel/trace/trace.c >+++ b/kernel/trace/trace.c >@@ -2717,6 +2717,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) > > if (tif_need_resched()) > trace_flags |= TRACE_FLAG_NEED_RESCHED; >+ if (tif_need_resched_lazy()) >+ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; > if (test_preempt_need_resched()) > trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; > return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | >diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c >index d8b302d010830..4f58a196e14c1 100644 >--- a/kernel/trace/trace_output.c >+++ b/kernel/trace/trace_output.c >@@ -460,17 +460,29 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) > (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : > (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : > bh_off ? 'b' : >- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : >+ !IS_ENABLED(CONFIG_TRACE_IRQFLAGS_SUPPORT) ? 
'X' : > '.'; > >- switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | >+ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | > TRACE_FLAG_PREEMPT_RESCHED)) { >+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: >+ need_resched = 'B'; >+ break; > case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: > need_resched = 'N'; > break; >+ case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: >+ need_resched = 'L'; >+ break; >+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY: >+ need_resched = 'b'; >+ break; > case TRACE_FLAG_NEED_RESCHED: > need_resched = 'n'; > break; >+ case TRACE_FLAG_NEED_RESCHED_LAZY: >+ need_resched = 'l'; >+ break; > case TRACE_FLAG_PREEMPT_RESCHED: > need_resched = 'p'; > break; >diff --git a/localversion-rt b/localversion-rt >new file mode 100644 >index 0000000000000..05c35cb580779 >--- /dev/null >+++ b/localversion-rt >@@ -0,0 +1 @@ >+-rt11 >diff --git a/net/core/dev.c b/net/core/dev.c >index c9b8412f1c9d3..849d2fe8d9221 100644 >--- a/net/core/dev.c >+++ b/net/core/dev.c >@@ -78,6 +78,7 @@ > #include <linux/slab.h> > #include <linux/sched.h> > #include <linux/sched/mm.h> >+#include <linux/smpboot.h> > #include <linux/mutex.h> > #include <linux/rwsem.h> > #include <linux/string.h> >@@ -216,35 +217,60 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) > return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; > } > >-static inline void rps_lock_irqsave(struct softnet_data *sd, >- unsigned long *flags) >+#ifndef CONFIG_PREEMPT_RT >+ >+static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); >+ >+static int __init setup_backlog_napi_threads(char *arg) > { >- if (IS_ENABLED(CONFIG_RPS)) >+ static_branch_enable(&use_backlog_threads_key); >+ return 0; >+} >+early_param("thread_backlog_napi", setup_backlog_napi_threads); >+ >+static bool use_backlog_threads(void) >+{ >+ return 
static_branch_unlikely(&use_backlog_threads_key); >+} >+ >+#else >+ >+static bool use_backlog_threads(void) >+{ >+ return true; >+} >+ >+#endif >+ >+static inline void backlog_lock_irq_save(struct softnet_data *sd, >+ unsigned long *flags) >+{ >+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) > spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); > else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > local_irq_save(*flags); > } > >-static inline void rps_lock_irq_disable(struct softnet_data *sd) >+static inline void backlog_lock_irq_disable(struct softnet_data *sd) > { >- if (IS_ENABLED(CONFIG_RPS)) >+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) > spin_lock_irq(&sd->input_pkt_queue.lock); > else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > local_irq_disable(); > } > >-static inline void rps_unlock_irq_restore(struct softnet_data *sd, >- unsigned long *flags) >+static inline void backlog_unlock_irq_restore(struct softnet_data *sd, >+ unsigned long *flags) > { >- if (IS_ENABLED(CONFIG_RPS)) >+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) > spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); > else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > local_irq_restore(*flags); > } > >-static inline void rps_unlock_irq_enable(struct softnet_data *sd) >+static inline void backlog_unlock_irq_enable(struct softnet_data *sd) > { >- if (IS_ENABLED(CONFIG_RPS)) >+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) > spin_unlock_irq(&sd->input_pkt_queue.lock); > else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > local_irq_enable(); >@@ -4420,6 +4446,7 @@ EXPORT_SYMBOL(__dev_direct_xmit); > /************************************************************************* > * Receiver routines > *************************************************************************/ >+static DEFINE_PER_CPU(struct task_struct *, backlog_napi); > > int netdev_max_backlog __read_mostly = 1000; > EXPORT_SYMBOL(netdev_max_backlog); >@@ -4452,18 +4479,16 @@ static inline void ____napi_schedule(struct 
softnet_data *sd, > */ > thread = READ_ONCE(napi->thread); > if (thread) { >- /* Avoid doing set_bit() if the thread is in >- * INTERRUPTIBLE state, cause napi_thread_wait() >- * makes sure to proceed with napi polling >- * if the thread is explicitly woken from here. >- */ >- if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) >- set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); >+ if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) >+ goto use_local_napi; >+ >+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); > wake_up_process(thread); > return; > } > } > >+use_local_napi: > list_add_tail(&napi->poll_list, &sd->poll_list); > WRITE_ONCE(napi->list_owner, smp_processor_id()); > /* If not called from net_rx_action() >@@ -4709,6 +4734,11 @@ static void napi_schedule_rps(struct softnet_data *sd) > > #ifdef CONFIG_RPS > if (sd != mysd) { >+ if (use_backlog_threads()) { >+ __napi_schedule_irqoff(&sd->backlog); >+ return; >+ } >+ > sd->rps_ipi_next = mysd->rps_ipi_list; > mysd->rps_ipi_list = sd; > >@@ -4723,6 +4753,23 @@ static void napi_schedule_rps(struct softnet_data *sd) > __napi_schedule_irqoff(&mysd->backlog); > } > >+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) >+{ >+ unsigned long flags; >+ >+ if (use_backlog_threads()) { >+ backlog_lock_irq_save(sd, &flags); >+ >+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) >+ __napi_schedule_irqoff(&sd->backlog); >+ >+ backlog_unlock_irq_restore(sd, &flags); >+ >+ } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { >+ smp_call_function_single_async(cpu, &sd->defer_csd); >+ } >+} >+ > #ifdef CONFIG_NET_FLOW_LIMIT > int netdev_flow_limit_table_len __read_mostly = (1 << 12); > #endif >@@ -4778,7 +4825,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, > reason = SKB_DROP_REASON_NOT_SPECIFIED; > sd = &per_cpu(softnet_data, cpu); > >- rps_lock_irqsave(sd, &flags); >+ backlog_lock_irq_save(sd, &flags); > if (!netif_running(skb->dev)) > goto drop; > qlen = 
skb_queue_len(&sd->input_pkt_queue); >@@ -4787,7 +4834,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, > enqueue: > __skb_queue_tail(&sd->input_pkt_queue, skb); > input_queue_tail_incr_save(sd, qtail); >- rps_unlock_irq_restore(sd, &flags); >+ backlog_unlock_irq_restore(sd, &flags); > return NET_RX_SUCCESS; > } > >@@ -4802,7 +4849,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, > > drop: > sd->dropped++; >- rps_unlock_irq_restore(sd, &flags); >+ backlog_unlock_irq_restore(sd, &flags); > > dev_core_stats_rx_dropped_inc(skb->dev); > kfree_skb_reason(skb, reason); >@@ -5833,7 +5880,7 @@ static void flush_backlog(struct work_struct *work) > local_bh_disable(); > sd = this_cpu_ptr(&softnet_data); > >- rps_lock_irq_disable(sd); >+ backlog_lock_irq_disable(sd); > skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { > if (skb->dev->reg_state == NETREG_UNREGISTERING) { > __skb_unlink(skb, &sd->input_pkt_queue); >@@ -5841,7 +5888,7 @@ static void flush_backlog(struct work_struct *work) > input_queue_head_incr(sd); > } > } >- rps_unlock_irq_enable(sd); >+ backlog_unlock_irq_enable(sd); > > skb_queue_walk_safe(&sd->process_queue, skb, tmp) { > if (skb->dev->reg_state == NETREG_UNREGISTERING) { >@@ -5859,14 +5906,14 @@ static bool flush_required(int cpu) > struct softnet_data *sd = &per_cpu(softnet_data, cpu); > bool do_flush; > >- rps_lock_irq_disable(sd); >+ backlog_lock_irq_disable(sd); > > /* as insertion into process_queue happens with the rps lock held, > * process_queue access may race only with dequeue > */ > do_flush = !skb_queue_empty(&sd->input_pkt_queue) || > !skb_queue_empty_lockless(&sd->process_queue); >- rps_unlock_irq_enable(sd); >+ backlog_unlock_irq_enable(sd); > > return do_flush; > #endif >@@ -5932,7 +5979,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) > #ifdef CONFIG_RPS > struct softnet_data *remsd = sd->rps_ipi_list; > >- if (remsd) { >+ if (!use_backlog_threads() && remsd) { > sd->rps_ipi_list 
= NULL; > > local_irq_enable(); >@@ -5947,7 +5994,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) > static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) > { > #ifdef CONFIG_RPS >- return sd->rps_ipi_list != NULL; >+ return !use_backlog_threads() && sd->rps_ipi_list; > #else > return false; > #endif >@@ -5981,7 +6028,7 @@ static int process_backlog(struct napi_struct *napi, int quota) > > } > >- rps_lock_irq_disable(sd); >+ backlog_lock_irq_disable(sd); > if (skb_queue_empty(&sd->input_pkt_queue)) { > /* > * Inline a custom version of __napi_complete(). >@@ -5991,13 +6038,13 @@ static int process_backlog(struct napi_struct *napi, int quota) > * We can use a plain write instead of clear_bit(), > * and we dont need an smp_mb() memory barrier. > */ >- napi->state = 0; >+ napi->state &= NAPIF_STATE_THREADED; > again = false; > } else { > skb_queue_splice_tail_init(&sd->input_pkt_queue, > &sd->process_queue); > } >- rps_unlock_irq_enable(sd); >+ backlog_unlock_irq_enable(sd); > } > > return work; >@@ -6654,8 +6701,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) > > static int napi_thread_wait(struct napi_struct *napi) > { >- bool woken = false; >- > set_current_state(TASK_INTERRUPTIBLE); > > while (!kthread_should_stop()) { >@@ -6664,15 +6709,13 @@ static int napi_thread_wait(struct napi_struct *napi) > * Testing SCHED bit is not enough because SCHED bit might be > * set by some other busy poll thread or by napi_disable(). > */ >- if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { >+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { > WARN_ON(!list_empty(&napi->poll_list)); > __set_current_state(TASK_RUNNING); > return 0; > } > > schedule(); >- /* woken being true indicates this thread owns this napi. 
*/ >- woken = true; > set_current_state(TASK_INTERRUPTIBLE); > } > __set_current_state(TASK_RUNNING); >@@ -6701,43 +6744,48 @@ static void skb_defer_free_flush(struct softnet_data *sd) > } > } > >+static void napi_threaded_poll_loop(struct napi_struct *napi) >+{ >+ struct softnet_data *sd; >+ unsigned long last_qs = jiffies; >+ >+ for (;;) { >+ bool repoll = false; >+ void *have; >+ >+ local_bh_disable(); >+ sd = this_cpu_ptr(&softnet_data); >+ sd->in_napi_threaded_poll = true; >+ >+ have = netpoll_poll_lock(napi); >+ __napi_poll(napi, &repoll); >+ netpoll_poll_unlock(have); >+ >+ sd->in_napi_threaded_poll = false; >+ barrier(); >+ >+ if (sd_has_rps_ipi_waiting(sd)) { >+ local_irq_disable(); >+ net_rps_action_and_irq_enable(sd); >+ } >+ skb_defer_free_flush(sd); >+ local_bh_enable(); >+ >+ if (!repoll) >+ break; >+ >+ rcu_softirq_qs_periodic(last_qs); >+ cond_resched(); >+ } >+} >+ > static int napi_threaded_poll(void *data) > { > struct napi_struct *napi = data; >- struct softnet_data *sd; >- void *have; > >- while (!napi_thread_wait(napi)) { >- unsigned long last_qs = jiffies; >+ while (!napi_thread_wait(napi)) >+ napi_threaded_poll_loop(napi); > >- for (;;) { >- bool repoll = false; >- >- local_bh_disable(); >- sd = this_cpu_ptr(&softnet_data); >- sd->in_napi_threaded_poll = true; >- >- have = netpoll_poll_lock(napi); >- __napi_poll(napi, &repoll); >- netpoll_poll_unlock(have); >- >- sd->in_napi_threaded_poll = false; >- barrier(); >- >- if (sd_has_rps_ipi_waiting(sd)) { >- local_irq_disable(); >- net_rps_action_and_irq_enable(sd); >- } >- skb_defer_free_flush(sd); >- local_bh_enable(); >- >- if (!repoll) >- break; >- >- rcu_softirq_qs_periodic(last_qs); >- cond_resched(); >- } >- } > return 0; > } > >@@ -11336,7 +11384,7 @@ static int dev_cpu_dead(unsigned int oldcpu) > > list_del_init(&napi->poll_list); > if (napi->poll == process_backlog) >- napi->state = 0; >+ napi->state &= NAPIF_STATE_THREADED; > else > ____napi_schedule(sd, napi); > } >@@ -11344,12 
+11392,14 @@ static int dev_cpu_dead(unsigned int oldcpu) > raise_softirq_irqoff(NET_TX_SOFTIRQ); > local_irq_enable(); > >+ if (!use_backlog_threads()) { > #ifdef CONFIG_RPS >- remsd = oldsd->rps_ipi_list; >- oldsd->rps_ipi_list = NULL; >+ remsd = oldsd->rps_ipi_list; >+ oldsd->rps_ipi_list = NULL; > #endif >- /* send out pending IPI's on offline CPU */ >- net_rps_send_ipi(remsd); >+ /* send out pending IPI's on offline CPU */ >+ net_rps_send_ipi(remsd); >+ } > > /* Process offline CPU's input_pkt_queue */ > while ((skb = __skb_dequeue(&oldsd->process_queue))) { >@@ -11669,6 +11719,38 @@ static void __init net_dev_struct_check(void) > * > */ > >+static int backlog_napi_should_run(unsigned int cpu) >+{ >+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); >+ struct napi_struct *napi = &sd->backlog; >+ >+ return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); >+} >+ >+static void run_backlog_napi(unsigned int cpu) >+{ >+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); >+ >+ napi_threaded_poll_loop(&sd->backlog); >+} >+ >+static void backlog_napi_setup(unsigned int cpu) >+{ >+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); >+ struct napi_struct *napi = &sd->backlog; >+ >+ napi->thread = this_cpu_read(backlog_napi); >+ set_bit(NAPI_STATE_THREADED, &napi->state); >+} >+ >+static struct smp_hotplug_thread backlog_threads = { >+ .store = &backlog_napi, >+ .thread_should_run = backlog_napi_should_run, >+ .thread_fn = run_backlog_napi, >+ .thread_comm = "backlog_napi/%u", >+ .setup = backlog_napi_setup, >+}; >+ > /* > * This is called single threaded during boot, so no need > * to take the rtnl semaphore. 
>@@ -11721,7 +11803,10 @@ static int __init net_dev_init(void) > init_gro_hash(&sd->backlog); > sd->backlog.poll = process_backlog; > sd->backlog.weight = weight_p; >+ INIT_LIST_HEAD(&sd->backlog.poll_list); > } >+ if (use_backlog_threads()) >+ smpboot_register_percpu_thread(&backlog_threads); > > dev_boot_phase = 0; > >diff --git a/net/core/skbuff.c b/net/core/skbuff.c >index 71dee435d549d..48570de42a960 100644 >--- a/net/core/skbuff.c >+++ b/net/core/skbuff.c >@@ -6929,8 +6929,8 @@ nodefer: __kfree_skb(skb); > /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU > * if we are unlucky enough (this seems very unlikely). > */ >- if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) >- smp_call_function_single_async(cpu, &sd->defer_csd); >+ if (unlikely(kick)) >+ kick_defer_list_purge(sd, cpu); > } > > static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 916954
:
874143
|
874144
|
874346
|
874895
|
875322
|
875323
|
879000
|
884278
|
884334
|
888012
|
888021
|
889979
|
889980
| 892741 |
894742
|
894745