Go to:
Gentoo Home
Documentation
Forums
Lists
Bugs
Planet
Store
Wiki
Get Gentoo!
Gentoo's Bugzilla – Attachment 892741 Details for
Bug 916954
sys-kernel/gentoo-sources-6.{6,7,8,9}.x: modified RT patch with BORE patch
Home
|
New
–
[Ex]
|
Browse
|
Search
|
Privacy Policy
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
[patch]
Combined BORE + RT patch for gentoo-sources-6.8.9
0001-linux6.8.y-bore5.1.0_patch-6.8.2-rt11_gentoo-sources-6.8.9.patch (text/plain), 244.82 KB, created by
deim
on 2024-05-11 14:22:37 UTC
(
hide
)
Description:
Combined BORE + RT patch for gentoo-sources-6.8.9
Filename:
MIME Type:
Creator:
deim
Created:
2024-05-11 14:22:37 UTC
Size:
244.82 KB
patch
obsolete
>From feae72fd7f2403910c157dd679d6ec240ed1dfbf Mon Sep 17 00:00:00 2001 >From: Masahito S <firelzrd@gmail.com> >Date: Mon, 22 Apr 2024 04:12:58 +0900 >Subject: [PATCH] linux6.8.y-bore5.1.0 > >--- > include/linux/sched.h | 10 ++ > init/Kconfig | 17 +++ > kernel/sched/core.c | 143 +++++++++++++++++++++++++ > kernel/sched/debug.c | 60 ++++++++++- > kernel/sched/fair.c | 230 ++++++++++++++++++++++++++++++++++++++-- > kernel/sched/features.h | 4 + > kernel/sched/sched.h | 7 ++ > 7 files changed, 462 insertions(+), 9 deletions(-) > >diff --git a/include/linux/sched.h b/include/linux/sched.h >index ffe8f618ab..0ab0b04240 100644 >--- a/include/linux/sched.h >+++ b/include/linux/sched.h >@@ -547,6 +547,16 @@ struct sched_entity { > u64 sum_exec_runtime; > u64 prev_sum_exec_runtime; > u64 vruntime; >+#ifdef CONFIG_SCHED_BORE >+ u64 burst_time; >+ u8 prev_burst_penalty; >+ u8 curr_burst_penalty; >+ u8 burst_penalty; >+ u8 burst_score; >+ u8 child_burst; >+ u32 child_burst_cnt; >+ u64 child_burst_last_cached; >+#endif // CONFIG_SCHED_BORE > s64 vlag; > u64 slice; > >diff --git a/init/Kconfig b/init/Kconfig >index bee58f7468..13427dbb48 100644 >--- a/init/Kconfig >+++ b/init/Kconfig >@@ -1279,6 +1279,23 @@ config CHECKPOINT_RESTORE > > If unsure, say N here. > >+config SCHED_BORE >+ bool "Burst-Oriented Response Enhancer" >+ default y >+ help >+ In Desktop and Mobile computing, one might prefer interactive >+ tasks to keep responsive no matter what they run in the background. >+ >+ Enabling this kernel feature modifies the scheduler to discriminate >+ tasks by their burst time (runtime since it last went sleeping or >+ yielding state) and prioritize those that run less bursty. >+ Such tasks usually include window compositor, widgets backend, >+ terminal emulator, video playback, games and so on. >+ With a little impact to scheduling fairness, it may improve >+ responsiveness especially under heavy background workload. >+ >+ If unsure, say Y here. 
>+ > config SCHED_AUTOGROUP > bool "Automatic process group scheduling" > select CGROUPS >diff --git a/kernel/sched/core.c b/kernel/sched/core.c >index 9116bcc903..d1711f75f8 100644 >--- a/kernel/sched/core.c >+++ b/kernel/sched/core.c >@@ -4507,6 +4507,138 @@ int wake_up_state(struct task_struct *p, unsigned int state) > return try_to_wake_up(p, state, 0); > } > >+#ifdef CONFIG_SCHED_BORE >+extern u8 sched_burst_fork_atavistic; >+extern uint sched_burst_cache_lifetime; >+ >+static void __init sched_init_bore(void) { >+ init_task.se.burst_time = 0; >+ init_task.se.prev_burst_penalty = 0; >+ init_task.se.curr_burst_penalty = 0; >+ init_task.se.burst_penalty = 0; >+ init_task.se.burst_score = 0; >+ init_task.se.child_burst_last_cached = 0; >+} >+ >+void inline sched_fork_bore(struct task_struct *p) { >+ p->se.burst_time = 0; >+ p->se.curr_burst_penalty = 0; >+ p->se.burst_score = 0; >+ p->se.child_burst_last_cached = 0; >+} >+ >+static u32 count_child_tasks(struct task_struct *p) { >+ struct task_struct *child; >+ u32 cnt = 0; >+ list_for_each_entry(child, &p->children, sibling) {cnt++;} >+ return cnt; >+} >+ >+static inline bool task_is_inheritable(struct task_struct *p) { >+ return (p->sched_class == &fair_sched_class); >+} >+ >+static inline bool child_burst_cache_expired(struct task_struct *p, u64 now) { >+ u64 expiration_time = >+ p->se.child_burst_last_cached + sched_burst_cache_lifetime; >+ return ((s64)(expiration_time - now) < 0); >+} >+ >+static void __update_child_burst_cache( >+ struct task_struct *p, u32 cnt, u32 sum, u64 now) { >+ u8 avg = 0; >+ if (cnt) avg = sum / cnt; >+ p->se.child_burst = max(avg, p->se.burst_penalty); >+ p->se.child_burst_cnt = cnt; >+ p->se.child_burst_last_cached = now; >+} >+ >+static inline void update_child_burst_direct(struct task_struct *p, u64 now) { >+ struct task_struct *child; >+ u32 cnt = 0; >+ u32 sum = 0; >+ >+ list_for_each_entry(child, &p->children, sibling) { >+ if (!task_is_inheritable(child)) continue; >+ cnt++; 
>+ sum += child->se.burst_penalty; >+ } >+ >+ __update_child_burst_cache(p, cnt, sum, now); >+} >+ >+static inline u8 __inherit_burst_direct(struct task_struct *p, u64 now) { >+ struct task_struct *parent = p->real_parent; >+ if (child_burst_cache_expired(parent, now)) >+ update_child_burst_direct(parent, now); >+ >+ return parent->se.child_burst; >+} >+ >+static void update_child_burst_topological( >+ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { >+ struct task_struct *child, *dec; >+ u32 cnt = 0, dcnt = 0; >+ u32 sum = 0; >+ >+ list_for_each_entry(child, &p->children, sibling) { >+ dec = child; >+ while ((dcnt = count_child_tasks(dec)) == 1) >+ dec = list_first_entry(&dec->children, struct task_struct, sibling); >+ >+ if (!dcnt || !depth) { >+ if (!task_is_inheritable(dec)) continue; >+ cnt++; >+ sum += dec->se.burst_penalty; >+ continue; >+ } >+ if (!child_burst_cache_expired(dec, now)) { >+ cnt += dec->se.child_burst_cnt; >+ sum += (u32)dec->se.child_burst * dec->se.child_burst_cnt; >+ continue; >+ } >+ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); >+ } >+ >+ __update_child_burst_cache(p, cnt, sum, now); >+ *acnt += cnt; >+ *asum += sum; >+} >+ >+static inline u8 __inherit_burst_topological(struct task_struct *p, u64 now) { >+ struct task_struct *anc = p->real_parent; >+ u32 cnt = 0, sum = 0; >+ >+ while (anc->real_parent != anc && count_child_tasks(anc) == 1) >+ anc = anc->real_parent; >+ >+ if (child_burst_cache_expired(anc, now)) >+ update_child_burst_topological( >+ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); >+ >+ return anc->se.child_burst; >+} >+ >+static inline void inherit_burst(struct task_struct *p) { >+ u8 burst_cache; >+ u64 now = ktime_get_ns(); >+ >+ read_lock(&tasklist_lock); >+ burst_cache = likely(sched_burst_fork_atavistic)? 
>+ __inherit_burst_topological(p, now): >+ __inherit_burst_direct(p, now); >+ read_unlock(&tasklist_lock); >+ >+ p->se.prev_burst_penalty = max(p->se.prev_burst_penalty, burst_cache); >+} >+ >+static void sched_post_fork_bore(struct task_struct *p) { >+ if (p->sched_class == &fair_sched_class) >+ inherit_burst(p); >+ p->se.burst_penalty = p->se.prev_burst_penalty; >+} >+#endif // CONFIG_SCHED_BORE >+ > /* > * Perform scheduler related setup for a newly forked process p. > * p is forked by current. >@@ -4523,6 +4655,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) > p->se.prev_sum_exec_runtime = 0; > p->se.nr_migrations = 0; > p->se.vruntime = 0; >+#ifdef CONFIG_SCHED_BORE >+ sched_fork_bore(p); >+#endif // CONFIG_SCHED_BORE > p->se.vlag = 0; > p->se.slice = sysctl_sched_base_slice; > INIT_LIST_HEAD(&p->se.group_node); >@@ -4839,6 +4974,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) > > void sched_post_fork(struct task_struct *p) > { >+#ifdef CONFIG_SCHED_BORE >+ sched_post_fork_bore(p); >+#endif // CONFIG_SCHED_BORE > uclamp_post_fork(p); > } > >@@ -9910,6 +10048,11 @@ void __init sched_init(void) > BUG_ON(&dl_sched_class != &stop_sched_class + 1); > #endif > >+#ifdef CONFIG_SCHED_BORE >+ sched_init_bore(); >+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.1.0 by Masahito Suzuki"); >+#endif // CONFIG_SCHED_BORE >+ > wait_bit_init(); > > #ifdef CONFIG_FAIR_GROUP_SCHED >diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c >index 8d5d98a583..b178612617 100644 >--- a/kernel/sched/debug.c >+++ b/kernel/sched/debug.c >@@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { > }; > > #ifdef CONFIG_SMP >+#ifdef CONFIG_SCHED_BORE >+static ssize_t sched_min_base_slice_write(struct file *filp, const char __user *ubuf, >+ size_t cnt, loff_t *ppos) >+{ >+ char buf[16]; >+ unsigned int value; >+ >+ if (cnt > 15) >+ cnt = 15; >+ >+ if 
(copy_from_user(&buf, ubuf, cnt)) >+ return -EFAULT; >+ buf[cnt] = '\0'; >+ >+ if (kstrtouint(buf, 10, &value)) >+ return -EINVAL; > >+ if (!value) >+ return -EINVAL; >+ >+ sysctl_sched_min_base_slice = value; >+ sched_update_min_base_slice(); >+ >+ *ppos += cnt; >+ return cnt; >+} >+ >+static int sched_min_base_slice_show(struct seq_file *m, void *v) >+{ >+ seq_printf(m, "%d\n", sysctl_sched_min_base_slice); >+ return 0; >+} >+ >+static int sched_min_base_slice_open(struct inode *inode, struct file *filp) >+{ >+ return single_open(filp, sched_min_base_slice_show, NULL); >+} >+ >+static const struct file_operations sched_min_base_slice_fops = { >+ .open = sched_min_base_slice_open, >+ .write = sched_min_base_slice_write, >+ .read = seq_read, >+ .llseek = seq_lseek, >+ .release = single_release, >+}; >+#else // !CONFIG_SCHED_BORE > static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, > size_t cnt, loff_t *ppos) > { >@@ -213,7 +258,7 @@ static const struct file_operations sched_scaling_fops = { > .llseek = seq_lseek, > .release = single_release, > }; >- >+#endif // CONFIG_SCHED_BORE > #endif /* SMP */ > > #ifdef CONFIG_PREEMPT_DYNAMIC >@@ -353,14 +353,21 @@ > debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); > #endif > >- debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); >+#ifdef CONFIG_SCHED_BORE >+ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); >+ debugfs_create_u32("base_slice_ns", 0400, debugfs_sched, &sysctl_sched_base_slice); >+#else // !CONFIG_SCHED_BORE >+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); >+#endif // CONFIG_SCHED_BORE > > #ifndef CONFIG_SCHED_ALT > debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); > debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); > > #ifdef CONFIG_SMP >+#if 
!defined(CONFIG_SCHED_BORE) > debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); >+#endif // CONFIG_SCHED_BORE > debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); > debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); > >@@ -595,6 +647,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) > SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), > SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); > >+#ifdef CONFIG_SCHED_BORE >+ SEQ_printf(m, " %2d", p->se.burst_score); >+#endif // CONFIG_SCHED_BORE > #ifdef CONFIG_NUMA_BALANCING > SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); > #endif >@@ -1068,6 +1123,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, > > P(se.load.weight); > #ifdef CONFIG_SMP >+#ifdef CONFIG_SCHED_BORE >+ P(se.burst_score); >+#endif // CONFIG_SCHED_BORE > P(se.avg.load_sum); > P(se.avg.runnable_sum); > P(se.avg.util_sum); >diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >index 533547e3c9..a2346b1b44 100644 >--- a/kernel/sched/fair.c >+++ b/kernel/sched/fair.c >@@ -19,6 +19,9 @@ > * > * Adaptive scheduling granularity, math enhancements by Peter Zijlstra > * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra >+ * >+ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler >+ * Copyright (C) 2021-2024 Masahito Suzuki <firelzrd@gmail.com> > */ > #include <linux/energy_model.h> > #include <linux/mmap_lock.h> >@@ -64,20 +67,125 @@ > * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) > * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus > * >- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) >+ * (BORE default SCHED_TUNABLESCALING_NONE = *1 constant) >+ * (EEVDF default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) > */ >+#ifdef CONFIG_SCHED_BORE >+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; >+#else // 
!CONFIG_SCHED_BORE > unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; >+#endif // CONFIG_SCHED_BORE > > /* > * Minimal preemption granularity for CPU-bound tasks: > * >- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) >+ * (BORE default: max(1 sec / HZ, min_base_slice) constant, units: nanoseconds) >+ * (EEVDF default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) > */ >+#ifdef CONFIG_SCHED_BORE >+unsigned int sysctl_sched_base_slice = 1000000000ULL / HZ; >+static unsigned int configured_sched_base_slice = 1000000000ULL / HZ; >+unsigned int sysctl_sched_min_base_slice = 2000000ULL; >+#else // !CONFIG_SCHED_BORE > unsigned int sysctl_sched_base_slice = 750000ULL; > static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; >+#endif // CONFIG_SCHED_BORE > > const_debug unsigned int sysctl_sched_migration_cost = 500000UL; > >+#ifdef CONFIG_SCHED_BORE >+u8 __read_mostly sched_bore = 1; >+u8 __read_mostly sched_burst_smoothness_long = 1; >+u8 __read_mostly sched_burst_smoothness_short = 0; >+u8 __read_mostly sched_burst_fork_atavistic = 2; >+u8 __read_mostly sched_burst_penalty_offset = 22; >+uint __read_mostly sched_burst_penalty_scale = 1280; >+uint __read_mostly sched_burst_cache_lifetime = 60000000; >+static int __maybe_unused sixty_four = 64; >+static int __maybe_unused maxval_12_bits = 4095; >+ >+#define MAX_BURST_PENALTY (39U <<2) >+ >+static inline u32 log2plus1_u64_u32f8(u64 v) { >+ u32 msb = fls64(v); >+ s32 excess_bits = msb - 9; >+ u8 fractional = (0 <= excess_bits)? 
v >> excess_bits: v << -excess_bits; >+ return msb << 8 | fractional; >+} >+ >+static inline u32 calc_burst_penalty(u64 burst_time) { >+ u32 greed, tolerance, penalty, scaled_penalty; >+ >+ greed = log2plus1_u64_u32f8(burst_time); >+ tolerance = sched_burst_penalty_offset << 8; >+ penalty = max(0, (s32)greed - (s32)tolerance); >+ scaled_penalty = penalty * sched_burst_penalty_scale >> 16; >+ >+ return min(MAX_BURST_PENALTY, scaled_penalty); >+} >+ >+static inline u64 scale_slice(u64 delta, struct sched_entity *se) { >+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->burst_score], 22); >+} >+ >+static inline u64 __unscale_slice(u64 delta, u8 score) { >+ return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10); >+} >+ >+static inline u64 unscale_slice(u64 delta, struct sched_entity *se) { >+ return __unscale_slice(delta, se->burst_score); >+} >+ >+void reweight_task(struct task_struct *p, int prio); >+ >+static void update_burst_score(struct sched_entity *se) { >+ if (!entity_is_task(se)) return; >+ struct task_struct *p = task_of(se); >+ u8 prio = p->static_prio - MAX_RT_PRIO; >+ u8 prev_prio = min(39, prio + se->burst_score); >+ >+ se->burst_score = se->burst_penalty >> 2; >+ >+ u8 new_prio = min(39, prio + se->burst_score); >+ if (new_prio != prev_prio) >+ reweight_task(p, new_prio); >+} >+ >+static void update_burst_penalty(struct sched_entity *se) { >+ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); >+ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); >+ update_burst_score(se); >+} >+ >+static inline u32 binary_smooth(u32 new, u32 old) { >+ int increment = new - old; >+ return (0 <= increment)? 
>+ old + ( increment >> (int)sched_burst_smoothness_long): >+ old - (-increment >> (int)sched_burst_smoothness_short); >+} >+ >+static void restart_burst(struct sched_entity *se) { >+ se->burst_penalty = se->prev_burst_penalty = >+ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); >+ se->curr_burst_penalty = 0; >+ se->burst_time = 0; >+ update_burst_score(se); >+} >+ >+static void restart_burst_rescale_deadline(struct sched_entity *se) { >+ s64 vscaled, wremain, vremain = se->deadline - se->vruntime; >+ u8 prev_score = se->burst_score; >+ restart_burst(se); >+ if (prev_score > se->burst_score) { >+ wremain = __unscale_slice(abs(vremain), prev_score); >+ vscaled = scale_slice(wremain, se); >+ if (unlikely(vremain < 0)) >+ vscaled = -vscaled; >+ se->deadline = se->vruntime + vscaled; >+ } >+} >+#endif // CONFIG_SCHED_BORE >+ > int sched_thermal_decay_shift; > static int __init setup_sched_thermal_decay_shift(char *str) > { >@@ -137,6 +245,69 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; > > #ifdef CONFIG_SYSCTL > static struct ctl_table sched_fair_sysctls[] = { >+#ifdef CONFIG_SCHED_BORE >+ { >+ .procname = "sched_bore", >+ .data = &sched_bore, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ONE, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_smoothness_long", >+ .data = &sched_burst_smoothness_long, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_smoothness_short", >+ .data = &sched_burst_smoothness_short, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_fork_atavistic", >+ .data = &sched_burst_fork_atavistic, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = 
SYSCTL_ZERO, >+ .extra2 = SYSCTL_THREE, >+ }, >+ { >+ .procname = "sched_burst_penalty_offset", >+ .data = &sched_burst_penalty_offset, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = &sixty_four, >+ }, >+ { >+ .procname = "sched_burst_penalty_scale", >+ .data = &sched_burst_penalty_scale, >+ .maxlen = sizeof(uint), >+ .mode = 0644, >+ .proc_handler = proc_douintvec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = &maxval_12_bits, >+ }, >+ { >+ .procname = "sched_burst_cache_lifetime", >+ .data = &sched_burst_cache_lifetime, >+ .maxlen = sizeof(uint), >+ .mode = 0644, >+ .proc_handler = proc_douintvec, >+ }, >+#endif // CONFIG_SCHED_BORE > #ifdef CONFIG_CFS_BANDWIDTH > { > .procname = "sched_cfs_bandwidth_slice_us", >@@ -195,6 +366,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) > * > * This idea comes from the SD scheduler of Con Kolivas: > */ >+#ifdef CONFIG_SCHED_BORE >+static void update_sysctl(void) { >+ sysctl_sched_base_slice = >+ max(sysctl_sched_min_base_slice, configured_sched_base_slice); >+} >+void sched_update_min_base_slice(void) { update_sysctl(); } >+#else // !CONFIG_SCHED_BORE > static unsigned int get_update_sysctl_factor(void) > { > unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); >@@ -225,6 +403,7 @@ static void update_sysctl(void) > SET_SYSCTL(sched_base_slice); > #undef SET_SYSCTL > } >+#endif // CONFIG_SCHED_BORE > > void __init sched_init_granularity(void) > { >@@ -703,6 +703,10 @@ > vlag = avruntime - se->vruntime; > limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); > >+#ifdef CONFIG_SCHED_BORE >+ limit >>= 1; >+#endif // CONFIG_SCHED_BORE >+ > return clamp(vlag, -limit, limit); > } > >@@ -955,6 +1137,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) > * Scheduling class statistics methods: > */ > #ifdef CONFIG_SMP >+#if !defined(CONFIG_SCHED_BORE) > int sched_update_scaling(void) > { > 
unsigned int factor = get_update_sysctl_factor(); >@@ -966,6 +1149,7 @@ int sched_update_scaling(void) > > return 0; > } >+#endif // CONFIG_SCHED_BORE > #endif > #endif > >@@ -1165,7 +1349,13 @@ static void update_curr(struct cfs_rq *cfs_rq) > if (unlikely(delta_exec <= 0)) > return; > >+#ifdef CONFIG_SCHED_BORE >+ curr->burst_time += delta_exec; >+ update_burst_penalty(curr); >+ curr->vruntime += max(1ULL, calc_delta_fair(delta_exec, curr)); >+#else // !CONFIG_SCHED_BORE > curr->vruntime += calc_delta_fair(delta_exec, curr); >+#endif // CONFIG_SCHED_BORE > update_deadline(cfs_rq, curr); > update_min_vruntime(cfs_rq); > >@@ -5171,6 +5362,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) > * > * EEVDF: placement strategy #1 / #2 > */ >+#ifdef CONFIG_SCHED_BORE >+ if (se->vlag) >+#endif // CONFIG_SCHED_BORE > if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { > struct sched_entity *curr = cfs_rq->curr; > unsigned long load; >@@ -6803,6 +6997,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) > bool was_sched_idle = sched_idle_rq(rq); > > util_est_dequeue(&rq->cfs, p); >+#ifdef CONFIG_SCHED_BORE >+ if (task_sleep) { >+ cfs_rq = cfs_rq_of(se); >+ if (cfs_rq->curr == se) >+ update_curr(cfs_rq); >+ restart_burst(se); >+ } >+#endif // CONFIG_SCHED_BORE > > for_each_sched_entity(se) { > cfs_rq = cfs_rq_of(se); >@@ -8552,16 +8754,25 @@ static void yield_task_fair(struct rq *rq) > /* > * Are we the only task in the tree? > */ >+#if !defined(CONFIG_SCHED_BORE) > if (unlikely(rq->nr_running == 1)) > return; > > clear_buddies(cfs_rq, se); >+#endif // CONFIG_SCHED_BORE > > update_rq_clock(rq); > /* > * Update run-time statistics of the 'current'. 
> */ > update_curr(cfs_rq); >+#ifdef CONFIG_SCHED_BORE >+ restart_burst_rescale_deadline(se); >+ if (unlikely(rq->nr_running == 1)) >+ return; >+ >+ clear_buddies(cfs_rq, se); >+#endif // CONFIG_SCHED_BORE > /* > * Tell update_rq_clock() that we've just updated, > * so we don't do microscopic update in schedule() >@@ -12651,6 +12862,9 @@ static void task_fork_fair(struct task_struct *p) > curr = cfs_rq->curr; > if (curr) > update_curr(cfs_rq); >+#ifdef CONFIG_SCHED_BORE >+ update_burst_score(se); >+#endif // CONFIG_SCHED_BORE > place_entity(cfs_rq, se, ENQUEUE_INITIAL); > rq_unlock(rq, &rf); > } >diff --git a/kernel/sched/features.h b/kernel/sched/features.h >index 143f55df89..3f0fe409f5 100644 >--- a/kernel/sched/features.h >+++ b/kernel/sched/features.h >@@ -6,7 +6,11 @@ > */ > SCHED_FEAT(PLACE_LAG, true) > SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) >+#ifdef CONFIG_SCHED_BORE >+SCHED_FEAT(RUN_TO_PARITY, false) >+#else // !CONFIG_SCHED_BORE > SCHED_FEAT(RUN_TO_PARITY, true) >+#endif // CONFIG_SCHED_BORE > > /* > * Prefer to schedule the task we woke last (assuming it failed >diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h >index 001fe047bd..da3ad1d4e1 100644 >--- a/kernel/sched/sched.h >+++ b/kernel/sched/sched.h >@@ -1965,7 +1965,11 @@ static inline void dirty_sched_domain_sysctl(int cpu) > } > #endif > >+#ifdef CONFIG_SCHED_BORE >+extern void sched_update_min_base_slice(void); >+#else // !CONFIG_SCHED_BORE > extern int sched_update_scaling(void); >+#endif // CONFIG_SCHED_BORE > > static inline const struct cpumask *task_user_cpus(struct task_struct *p) > { >@@ -2552,6 +2556,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; > extern const_debug unsigned int sysctl_sched_migration_cost; > > extern unsigned int sysctl_sched_base_slice; >+#ifdef CONFIG_SCHED_BORE >+extern unsigned int sysctl_sched_min_base_slice; >+#endif // CONFIG_SCHED_BORE > > #ifdef CONFIG_SCHED_DEBUG > extern int sysctl_resched_latency_warn_ms; >-- >2.34.1 > >diff --git 
a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt >index 73062d47a462e..86e2745a3e8fb 100644 >--- a/Documentation/admin-guide/kernel-parameters.txt >+++ b/Documentation/admin-guide/kernel-parameters.txt >@@ -6508,6 +6508,18 @@ > Force threading of all interrupt handlers except those > marked explicitly IRQF_NO_THREAD. > >+ threadprintk [KNL] >+ Force threaded printing of all legacy consoles. Be >+ aware that with this option, the shutdown, reboot, and >+ panic messages may not be printed on the legacy >+ consoles. Also, earlycon/earlyprintk printing will be >+ delayed until a regular console or the kthread is >+ available. >+ >+ Users can view /proc/consoles to see if their console >+ driver is legacy or not. Non-legacy (NBCON) console >+ drivers are already threaded and are shown with 'N'. >+ > topology= [S390] > Format: {off | on} > Specify if the kernel should make use of the cpu >diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig >index 0af6709570d14..25424a7468d95 100644 >--- a/arch/arm/Kconfig >+++ b/arch/arm/Kconfig >@@ -36,6 +36,7 @@ config ARM > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF > select ARCH_USE_MEMTEST >@@ -75,7 +76,7 @@ config ARM > select HAS_IOPORT > select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT > select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 >- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU >+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT > select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL > select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU > select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL >@@ -98,7 +99,7 @@ config ARM > select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE > select 
HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU > select HAVE_EXIT_THREAD >- select HAVE_FAST_GUP if ARM_LPAE >+ select HAVE_FAST_GUP if ARM_LPAE && !(PREEMPT_RT && HIGHPTE) > select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL > select HAVE_FUNCTION_ERROR_INJECTION > select HAVE_FUNCTION_GRAPH_TRACER >@@ -120,6 +121,7 @@ config ARM > select HAVE_PERF_EVENTS > select HAVE_PERF_REGS > select HAVE_PERF_USER_STACK_DUMP >+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM > select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RSEQ >diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c >index 07565b593ed68..3761c1e995cf6 100644 >--- a/arch/arm/mm/fault.c >+++ b/arch/arm/mm/fault.c >@@ -436,6 +436,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, > if (addr < TASK_SIZE) > return do_page_fault(addr, fsr, regs); > >+ if (interrupts_enabled(regs)) >+ local_irq_enable(); >+ > if (user_mode(regs)) > goto bad_area; > >@@ -506,6 +509,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, > static int > do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) > { >+ if (interrupts_enabled(regs)) >+ local_irq_enable(); >+ > do_bad_area(addr, fsr, regs); > return 0; > } >diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c >index b68efe643a12c..48745a3c52618 100644 >--- a/arch/arm/vfp/vfpmodule.c >+++ b/arch/arm/vfp/vfpmodule.c >@@ -55,6 +55,34 @@ extern unsigned int VFP_arch_feroceon __alias(VFP_arch); > */ > union vfp_state *vfp_current_hw_state[NR_CPUS]; > >+/* >+ * Claim ownership of the VFP unit. >+ * >+ * The caller may change VFP registers until vfp_unlock() is called. >+ * >+ * local_bh_disable() is used to disable preemption and to disable VFP >+ * processing in softirq context. On PREEMPT_RT kernels local_bh_disable() is >+ * not sufficient because it only serializes soft interrupt related sections >+ * via a local lock, but stays preemptible. 
Disabling preemption is the right >+ * choice here as bottom half processing is always in thread context on RT >+ * kernels so it implicitly prevents bottom half processing as well. >+ */ >+static void vfp_lock(void) >+{ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_bh_disable(); >+ else >+ preempt_disable(); >+} >+ >+static void vfp_unlock(void) >+{ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_bh_enable(); >+ else >+ preempt_enable(); >+} >+ > /* > * Is 'thread's most up to date state stored in this CPUs hardware? > * Must be called from non-preemptible context. >@@ -240,7 +268,7 @@ static void vfp_panic(char *reason, u32 inst) > /* > * Process bitmask of exception conditions. > */ >-static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_regs *regs) >+static int vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr) > { > int si_code = 0; > >@@ -248,8 +276,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ > > if (exceptions == VFP_EXCEPTION_ERROR) { > vfp_panic("unhandled bounce", inst); >- vfp_raise_sigfpe(FPE_FLTINV, regs); >- return; >+ return FPE_FLTINV; > } > > /* >@@ -277,8 +304,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ > RAISE(FPSCR_OFC, FPSCR_OFE, FPE_FLTOVF); > RAISE(FPSCR_IOC, FPSCR_IOE, FPE_FLTINV); > >- if (si_code) >- vfp_raise_sigfpe(si_code, regs); >+ return si_code; > } > > /* >@@ -324,6 +350,8 @@ static u32 vfp_emulate_instruction(u32 inst, u32 fpscr, struct pt_regs *regs) > static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > { > u32 fpscr, orig_fpscr, fpsid, exceptions; >+ int si_code2 = 0; >+ int si_code = 0; > > pr_debug("VFP: bounce: trigger %08x fpexc %08x\n", trigger, fpexc); > >@@ -369,8 +397,8 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > * unallocated VFP instruction but with FPSCR.IXE set and not > * on VFP subarch 1. 
> */ >- vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr, regs); >- return; >+ si_code = vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr); >+ goto exit; > } > > /* >@@ -394,14 +422,14 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > */ > exceptions = vfp_emulate_instruction(trigger, fpscr, regs); > if (exceptions) >- vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); >+ si_code2 = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); > > /* > * If there isn't a second FP instruction, exit now. Note that > * the FPEXC.FP2V bit is valid only if FPEXC.EX is 1. > */ > if ((fpexc & (FPEXC_EX | FPEXC_FP2V)) != (FPEXC_EX | FPEXC_FP2V)) >- return; >+ goto exit; > > /* > * The barrier() here prevents fpinst2 being read >@@ -413,7 +441,13 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > emulate: > exceptions = vfp_emulate_instruction(trigger, orig_fpscr, regs); > if (exceptions) >- vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); >+ si_code = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); >+exit: >+ vfp_unlock(); >+ if (si_code2) >+ vfp_raise_sigfpe(si_code2, regs); >+ if (si_code) >+ vfp_raise_sigfpe(si_code, regs); > } > > static void vfp_enable(void *unused) >@@ -512,11 +546,9 @@ static inline void vfp_pm_init(void) { } > */ > void vfp_sync_hwstate(struct thread_info *thread) > { >- unsigned int cpu = get_cpu(); >+ vfp_lock(); > >- local_bh_disable(); >- >- if (vfp_state_in_hw(cpu, thread)) { >+ if (vfp_state_in_hw(raw_smp_processor_id(), thread)) { > u32 fpexc = fmrx(FPEXC); > > /* >@@ -527,8 +559,7 @@ void vfp_sync_hwstate(struct thread_info *thread) > fmxr(FPEXC, fpexc); > } > >- local_bh_enable(); >- put_cpu(); >+ vfp_unlock(); > } > > /* Ensure that the thread reloads the hardware VFP state on the next use. 
*/ >@@ -683,7 +714,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > if (!user_mode(regs)) > return vfp_kmode_exception(regs, trigger); > >- local_bh_disable(); >+ vfp_lock(); > fpexc = fmrx(FPEXC); > > /* >@@ -748,6 +779,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > * replay the instruction that trapped. > */ > fmxr(FPEXC, fpexc); >+ vfp_unlock(); > } else { > /* Check for synchronous or asynchronous exceptions */ > if (!(fpexc & (FPEXC_EX | FPEXC_DEX))) { >@@ -762,17 +794,17 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > if (!(fpscr & FPSCR_IXE)) { > if (!(fpscr & FPSCR_LENGTH_MASK)) { > pr_debug("not VFP\n"); >- local_bh_enable(); >+ vfp_unlock(); > return -ENOEXEC; > } > fpexc |= FPEXC_DEX; > } > } > bounce: regs->ARM_pc += 4; >+ /* VFP_bounce() will invoke vfp_unlock() */ > VFP_bounce(trigger, fpexc, regs); > } > >- local_bh_enable(); > return 0; > } > >@@ -837,7 +869,7 @@ void kernel_neon_begin(void) > unsigned int cpu; > u32 fpexc; > >- local_bh_disable(); >+ vfp_lock(); > > /* > * Kernel mode NEON is only allowed outside of hardirq context with >@@ -868,7 +900,7 @@ void kernel_neon_end(void) > { > /* Disable the NEON/VFP unit. 
*/ > fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN); >- local_bh_enable(); >+ vfp_unlock(); > } > EXPORT_SYMBOL(kernel_neon_end); > >diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig >index 6b96d75a3a3d8..8ecbaa43f133f 100644 >--- a/arch/arm64/Kconfig >+++ b/arch/arm64/Kconfig >@@ -98,6 +98,7 @@ config ARM64 > select ARCH_SUPPORTS_NUMA_BALANCING > select ARCH_SUPPORTS_PAGE_TABLE_CHECK > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT > select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH > select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT > select ARCH_WANT_DEFAULT_BPF_JIT >diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig >index b9fc064d38d28..e8651e304c888 100644 >--- a/arch/powerpc/Kconfig >+++ b/arch/powerpc/Kconfig >@@ -166,6 +166,7 @@ config PPC > select ARCH_STACKWALK > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x >+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF if PPC64 > select ARCH_USE_MEMTEST >@@ -270,6 +271,7 @@ config PPC > select HAVE_PERF_USER_STACK_DUMP > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RELIABLE_STACKTRACE >+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM > select HAVE_RSEQ > select HAVE_SETUP_PER_CPU_AREA if PPC64 > select HAVE_SOFTIRQ_ON_OWN_STACK >diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h >index 283c346478565..4727f40052ddd 100644 >--- a/arch/powerpc/include/asm/stackprotector.h >+++ b/arch/powerpc/include/asm/stackprotector.h >@@ -19,8 +19,13 @@ > */ > static __always_inline void boot_init_stack_canary(void) > { >- unsigned long canary = get_random_canary(); >+ unsigned long canary; > >+#ifndef CONFIG_PREEMPT_RT >+ canary = get_random_canary(); >+#else >+ canary = ((unsigned long)&canary) & CANARY_MASK; >+#endif > current->stack_canary = canary; > #ifdef CONFIG_PPC64 > get_paca()->canary = canary; >diff --git 
a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c >index 11e062b47d3f8..f7e22276c97b0 100644 >--- a/arch/powerpc/kernel/traps.c >+++ b/arch/powerpc/kernel/traps.c >@@ -261,12 +261,17 @@ static char *get_mmu_str(void) > > static int __die(const char *str, struct pt_regs *regs, long err) > { >+ const char *pr = ""; >+ > printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); > >+ if (IS_ENABLED(CONFIG_PREEMPTION)) >+ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; >+ > printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", > IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", > PAGE_SIZE / 1024, get_mmu_str(), >- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", >+ pr, > IS_ENABLED(CONFIG_SMP) ? " SMP" : "", > IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", > debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", >diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig >index 074263429faf2..96ab63d035e5c 100644 >--- a/arch/powerpc/kvm/Kconfig >+++ b/arch/powerpc/kvm/Kconfig >@@ -222,6 +222,7 @@ config KVM_E500MC > config KVM_MPIC > bool "KVM in-kernel MPIC emulation" > depends on KVM && PPC_E500 >+ depends on !PREEMPT_RT > select HAVE_KVM_IRQCHIP > select HAVE_KVM_IRQ_ROUTING > select HAVE_KVM_MSI >diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig >index afc0f6a613372..dc3f63c2687d4 100644 >--- a/arch/powerpc/platforms/pseries/Kconfig >+++ b/arch/powerpc/platforms/pseries/Kconfig >@@ -2,6 +2,7 @@ > config PPC_PSERIES > depends on PPC64 && PPC_BOOK3S > bool "IBM pSeries & new (POWER5-based) iSeries" >+ select GENERIC_ALLOCATOR > select HAVE_PCSPKR_PLATFORM > select MPIC > select OF_DYNAMIC >diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c >index e8c4129697b14..c61e29deac8df 100644 >--- a/arch/powerpc/platforms/pseries/iommu.c >+++ b/arch/powerpc/platforms/pseries/iommu.c >@@ -25,6 +25,7 @@ > #include <linux/of_address.h> > #include 
<linux/iommu.h> > #include <linux/rculist.h> >+#include <linux/local_lock.h> > #include <asm/io.h> > #include <asm/prom.h> > #include <asm/rtas.h> >@@ -206,7 +207,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, > return ret; > } > >-static DEFINE_PER_CPU(__be64 *, tce_page); >+struct tce_page { >+ __be64 * page; >+ local_lock_t lock; >+}; >+static DEFINE_PER_CPU(struct tce_page, tce_page) = { >+ .lock = INIT_LOCAL_LOCK(lock), >+}; > > static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > long npages, unsigned long uaddr, >@@ -229,9 +236,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > direction, attrs); > } > >- local_irq_save(flags); /* to protect tcep and the page behind it */ >+ /* to protect tcep and the page behind it */ >+ local_lock_irqsave(&tce_page.lock, flags); > >- tcep = __this_cpu_read(tce_page); >+ tcep = __this_cpu_read(tce_page.page); > > /* This is safe to do since interrupts are off when we're called > * from iommu_alloc{,_sg}() >@@ -240,12 +248,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > tcep = (__be64 *)__get_free_page(GFP_ATOMIC); > /* If allocation fails, fall back to the loop implementation */ > if (!tcep) { >- local_irq_restore(flags); >+ local_unlock_irqrestore(&tce_page.lock, flags); > return tce_build_pSeriesLP(tbl->it_index, tcenum, > tceshift, > npages, uaddr, direction, attrs); > } >- __this_cpu_write(tce_page, tcep); >+ __this_cpu_write(tce_page.page, tcep); > } > > rpn = __pa(uaddr) >> tceshift; >@@ -275,7 +283,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > tcenum += limit; > } while (npages > 0 && !rc); > >- local_irq_restore(flags); >+ local_unlock_irqrestore(&tce_page.lock, flags); > > if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { > ret = (int)rc; >@@ -459,16 +467,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, > DMA_BIDIRECTIONAL, 0); > } > >- 
local_irq_disable(); /* to protect tcep and the page behind it */ >- tcep = __this_cpu_read(tce_page); >+ /* to protect tcep and the page behind it */ >+ local_lock_irq(&tce_page.lock); >+ tcep = __this_cpu_read(tce_page.page); > > if (!tcep) { > tcep = (__be64 *)__get_free_page(GFP_ATOMIC); > if (!tcep) { >- local_irq_enable(); >+ local_unlock_irq(&tce_page.lock); > return -ENOMEM; > } >- __this_cpu_write(tce_page, tcep); >+ __this_cpu_write(tce_page.page, tcep); > } > > proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; >@@ -511,7 +520,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, > > /* error cleanup: caller will clear whole range */ > >- local_irq_enable(); >+ local_unlock_irq(&tce_page.lock); > return rc; > } > >diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >index e3142ce531a09..32c5db19cf899 100644 >--- a/arch/riscv/Kconfig >+++ b/arch/riscv/Kconfig >@@ -49,6 +49,7 @@ config RISCV > select ARCH_SUPPORTS_HUGETLBFS if MMU > select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU > select ARCH_SUPPORTS_PER_VMA_LOCK if MMU >+ select ARCH_SUPPORTS_RT > select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK > select ARCH_USE_MEMTEST > select ARCH_USE_QUEUED_RWLOCKS >@@ -142,6 +143,7 @@ config RISCV > select HAVE_PERF_USER_STACK_DUMP > select HAVE_POSIX_CPU_TIMERS_TASK_WORK > select HAVE_PREEMPT_DYNAMIC_KEY if !XIP_KERNEL >+ select HAVE_PREEMPT_AUTO > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RETHOOK if !XIP_KERNEL > select HAVE_RSEQ >diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h >index 5d473343634b9..23b136286e927 100644 >--- a/arch/riscv/include/asm/thread_info.h >+++ b/arch/riscv/include/asm/thread_info.h >@@ -94,6 +94,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); > * - pending work-to-be-done flags are in lowest half-word > * - other flags in upper half-word(s) > */ >+#define TIF_ARCH_RESCHED_LAZY 0 /* Lazy rescheduling */ > #define TIF_NOTIFY_RESUME 1 /* 
callback before returning to user */ > #define TIF_SIGPENDING 2 /* signal pending */ > #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ >@@ -104,6 +105,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); > #define TIF_32BIT 11 /* compat-mode 32bit process */ > #define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ > >+#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) > #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) > #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) > #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) >diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig >index 637e337c332e4..b4ac84206afc9 100644 >--- a/arch/x86/Kconfig >+++ b/arch/x86/Kconfig >@@ -28,6 +28,7 @@ config X86_64 > select ARCH_HAS_GIGANTIC_PAGE > select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT > select HAVE_ARCH_SOFT_DIRTY > select MODULES_USE_ELF_RELA > select NEED_DMA_MAP_STATE >@@ -119,6 +120,7 @@ config X86 > select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG > select ARCH_SUPPORTS_LTO_CLANG > select ARCH_SUPPORTS_LTO_CLANG_THIN >+ select ARCH_SUPPORTS_RT > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 > select ARCH_USE_MEMTEST >@@ -275,6 +277,7 @@ config X86 > select HAVE_STATIC_CALL > select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL > select HAVE_PREEMPT_DYNAMIC_CALL >+ select HAVE_PREEMPT_AUTO > select HAVE_RSEQ > select HAVE_RUST if X86_64 > select HAVE_SYSCALL_TRACEPOINTS >diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h >index d63b02940747f..1ff38ebbd5880 100644 >--- a/arch/x86/include/asm/thread_info.h >+++ b/arch/x86/include/asm/thread_info.h >@@ -81,8 +81,9 @@ struct thread_info { > #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ > #define TIF_SIGPENDING 2 /* signal pending */ > #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ >-#define TIF_SINGLESTEP 4 
/* reenable singlestep on user return*/ >-#define TIF_SSBD 5 /* Speculative store bypass disable */ >+#define TIF_ARCH_RESCHED_LAZY 4 /* Lazy rescheduling */ >+#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ >+#define TIF_SSBD 6 /* Speculative store bypass disable */ > #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ > #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ > #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ >@@ -104,6 +105,7 @@ struct thread_info { > #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) > #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) > #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) >+#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) > #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) > #define _TIF_SSBD (1 << TIF_SSBD) > #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) >diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c >index bd6a7857ce058..d45dfd10b6366 100644 >--- a/drivers/acpi/processor_idle.c >+++ b/drivers/acpi/processor_idle.c >@@ -108,7 +108,7 @@ static const struct dmi_system_id processor_power_dmi_table[] = { > */ > static void __cpuidle acpi_safe_halt(void) > { >- if (!tif_need_resched()) { >+ if (!need_resched()) { > raw_safe_halt(); > raw_local_irq_disable(); > } >diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c >index 6772e0c654fa7..119007f17e13e 100644 >--- a/drivers/block/zram/zram_drv.c >+++ b/drivers/block/zram/zram_drv.c >@@ -57,6 +57,41 @@ static void zram_free_page(struct zram *zram, size_t index); > static int zram_read_page(struct zram *zram, struct page *page, u32 index, > struct bio *parent); > >+#ifdef CONFIG_PREEMPT_RT >+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) >+{ >+ size_t index; >+ >+ for (index = 0; index < num_pages; index++) >+ spin_lock_init(&zram->table[index].lock); >+} >+ >+static int zram_slot_trylock(struct zram *zram, u32 index) >+{ 
>+ int ret; >+ >+ ret = spin_trylock(&zram->table[index].lock); >+ if (ret) >+ __set_bit(ZRAM_LOCK, &zram->table[index].flags); >+ return ret; >+} >+ >+static void zram_slot_lock(struct zram *zram, u32 index) >+{ >+ spin_lock(&zram->table[index].lock); >+ __set_bit(ZRAM_LOCK, &zram->table[index].flags); >+} >+ >+static void zram_slot_unlock(struct zram *zram, u32 index) >+{ >+ __clear_bit(ZRAM_LOCK, &zram->table[index].flags); >+ spin_unlock(&zram->table[index].lock); >+} >+ >+#else >+ >+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } >+ > static int zram_slot_trylock(struct zram *zram, u32 index) > { > return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); >@@ -71,6 +106,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) > { > bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); > } >+#endif > > static inline bool init_done(struct zram *zram) > { >@@ -1241,6 +1277,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) > > if (!huge_class_size) > huge_class_size = zs_huge_class_size(zram->mem_pool); >+ zram_meta_init_table_locks(zram, num_pages); > return true; > } > >diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h >index 3b94d12f41b40..dfc364b0d0727 100644 >--- a/drivers/block/zram/zram_drv.h >+++ b/drivers/block/zram/zram_drv.h >@@ -69,6 +69,9 @@ struct zram_table_entry { > unsigned long element; > }; > unsigned long flags; >+#ifdef CONFIG_PREEMPT_RT >+ spinlock_t lock; >+#endif > #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME > ktime_t ac_time; > #endif >diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig >index 3089029abba48..2d5828c5d3596 100644 >--- a/drivers/gpu/drm/i915/Kconfig >+++ b/drivers/gpu/drm/i915/Kconfig >@@ -3,7 +3,6 @@ config DRM_I915 > tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" > depends on DRM > depends on X86 && PCI >- depends on !PREEMPT_RT > select INTEL_GTT if X86 > select INTERVAL_TREE > # we need shmfs for the swappable backing 
store, and in particular >diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c >index 8a84a31c7b48a..73a561af13d16 100644 >--- a/drivers/gpu/drm/i915/display/intel_crtc.c >+++ b/drivers/gpu/drm/i915/display/intel_crtc.c >@@ -580,7 +580,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > */ > intel_psr_wait_for_idle_locked(new_crtc_state); > >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > > crtc->debug.min_vbl = min; > crtc->debug.max_vbl = max; >@@ -605,11 +606,13 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > break; > } > >- local_irq_enable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_enable(); > > timeout = schedule_timeout(timeout); > >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > } > > finish_wait(wq, &wait); >@@ -642,7 +645,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > return; > > irq_disable: >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > } > > #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) >@@ -744,7 +748,8 @@ void intel_pipe_update_end(struct intel_atomic_state *state, > */ > intel_vrr_send_push(new_crtc_state); > >- local_irq_enable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_enable(); > > if (intel_vgpu_active(dev_priv)) > goto out; >diff --git a/drivers/gpu/drm/i915/display/intel_vblank.c b/drivers/gpu/drm/i915/display/intel_vblank.c >index fe256bf7b485b..a3c3faa8f305a 100644 >--- a/drivers/gpu/drm/i915/display/intel_vblank.c >+++ b/drivers/gpu/drm/i915/display/intel_vblank.c >@@ -275,6 +275,26 @@ int intel_crtc_scanline_to_hw(struct intel_crtc *crtc, int scanline) > * all register accesses to the same cacheline to be serialized, > * otherwise they may hang. 
> */ >+static void intel_vblank_section_enter_irqsave(struct drm_i915_private *i915, unsigned long *flags) >+ __acquires(i915->uncore.lock) >+{ >+#ifdef I915 >+ spin_lock_irqsave(&i915->uncore.lock, *flags); >+#else >+ *flags = 0; >+#endif >+} >+ >+static void intel_vblank_section_exit_irqrestore(struct drm_i915_private *i915, unsigned long flags) >+ __releases(i915->uncore.lock) >+{ >+#ifdef I915 >+ spin_unlock_irqrestore(&i915->uncore.lock, flags); >+#else >+ if (flags) >+ return; >+#endif >+} > static void intel_vblank_section_enter(struct drm_i915_private *i915) > __acquires(i915->uncore.lock) > { >@@ -332,10 +352,10 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, > * timing critical raw register reads, potentially with > * preemption disabled, so the following code must not block. > */ >- local_irq_save(irqflags); >- intel_vblank_section_enter(dev_priv); >+ intel_vblank_section_enter_irqsave(dev_priv, &irqflags); > >- /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) >+ preempt_disable(); > > /* Get optional system timestamp before query. */ > if (stime) >@@ -399,10 +419,10 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, > if (etime) > *etime = ktime_get(); > >- /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. 
*/ >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) >+ preempt_enable(); > >- intel_vblank_section_exit(dev_priv); >- local_irq_restore(irqflags); >+ intel_vblank_section_exit_irqrestore(dev_priv, irqflags); > > /* > * While in vblank, position will be negative >@@ -440,13 +460,11 @@ int intel_get_crtc_scanline(struct intel_crtc *crtc) > unsigned long irqflags; > int position; > >- local_irq_save(irqflags); >- intel_vblank_section_enter(dev_priv); >+ intel_vblank_section_enter_irqsave(dev_priv, &irqflags); > > position = __intel_get_crtc_scanline(crtc); > >- intel_vblank_section_exit(dev_priv); >- local_irq_restore(irqflags); >+ intel_vblank_section_exit_irqrestore(dev_priv, irqflags); > > return position; > } >diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >index d650beb8ed22f..3dd3e516b80c1 100644 >--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >@@ -317,10 +317,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) > /* Kick the work once more to drain the signalers, and disarm the irq */ > irq_work_sync(&b->irq_work); > while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { >- local_irq_disable(); >- signal_irq_work(&b->irq_work); >- local_irq_enable(); >+ irq_work_queue(&b->irq_work); > cond_resched(); >+ irq_work_sync(&b->irq_work); > } > } > >diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >index 42aade0faf2d1..929ca2bad2d2c 100644 >--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >@@ -1303,7 +1303,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > * and context switches) submission. 
> */ > >- spin_lock(&sched_engine->lock); >+ spin_lock_irq(&sched_engine->lock); > > /* > * If the queue is higher priority than the last >@@ -1403,7 +1403,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > * Even if ELSP[1] is occupied and not worthy > * of timeslices, our queue might be. > */ >- spin_unlock(&sched_engine->lock); >+ spin_unlock_irq(&sched_engine->lock); > return; > } > } >@@ -1429,7 +1429,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > > if (last && !can_merge_rq(last, rq)) { > spin_unlock(&ve->base.sched_engine->lock); >- spin_unlock(&engine->sched_engine->lock); >+ spin_unlock_irq(&engine->sched_engine->lock); > return; /* leave this for another sibling */ > } > >@@ -1591,7 +1591,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > */ > sched_engine->queue_priority_hint = queue_prio(sched_engine); > i915_sched_engine_reset_on_empty(sched_engine); >- spin_unlock(&sched_engine->lock); >+ spin_unlock_irq(&sched_engine->lock); > > /* > * We can skip poking the HW if we ended up with exactly the same set >@@ -1617,13 +1617,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > } > } > >-static void execlists_dequeue_irq(struct intel_engine_cs *engine) >-{ >- local_irq_disable(); /* Suspend interrupts across request submission */ >- execlists_dequeue(engine); >- local_irq_enable(); /* flush irq_work (e.g. 
breadcrumb enabling) */ >-} >- > static void clear_ports(struct i915_request **ports, int count) > { > memset_p((void **)ports, NULL, count); >@@ -2478,7 +2471,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) > } > > if (!engine->execlists.pending[0]) { >- execlists_dequeue_irq(engine); >+ execlists_dequeue(engine); > start_timeslice(engine); > } > >diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h >index 813cc888e6fae..ab3483a59b79a 100644 >--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h >+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h >@@ -362,7 +362,7 @@ static inline int intel_guc_send_busy_loop(struct intel_guc *guc, > { > int err; > unsigned int sleep_period_ms = 1; >- bool not_atomic = !in_atomic() && !irqs_disabled(); >+ bool not_atomic = !in_atomic() && !irqs_disabled() && !rcu_preempt_depth(); > > /* > * FIXME: Have caller pass in if we are in an atomic context to avoid >diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c >index f59081066a197..014d02029a415 100644 >--- a/drivers/gpu/drm/i915/i915_request.c >+++ b/drivers/gpu/drm/i915/i915_request.c >@@ -609,7 +609,6 @@ bool __i915_request_submit(struct i915_request *request) > > RQ_TRACE(request, "\n"); > >- GEM_BUG_ON(!irqs_disabled()); > lockdep_assert_held(&engine->sched_engine->lock); > > /* >@@ -718,7 +717,6 @@ void __i915_request_unsubmit(struct i915_request *request) > */ > RQ_TRACE(request, "\n"); > >- GEM_BUG_ON(!irqs_disabled()); > lockdep_assert_held(&engine->sched_engine->lock); > > /* >diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h >index ce1cbee1b39dd..3c51620d011b1 100644 >--- a/drivers/gpu/drm/i915/i915_trace.h >+++ b/drivers/gpu/drm/i915/i915_trace.h >@@ -6,6 +6,10 @@ > #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) > #define _I915_TRACE_H_ > >+#ifdef CONFIG_PREEMPT_RT >+#define NOTRACE >+#endif >+ > #include <linux/stringify.h> > #include 
<linux/types.h> > #include <linux/tracepoint.h> >@@ -322,7 +326,7 @@ DEFINE_EVENT(i915_request, i915_request_add, > TP_ARGS(rq) > ); > >-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) >+#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) > DEFINE_EVENT(i915_request, i915_request_guc_submit, > TP_PROTO(struct i915_request *rq), > TP_ARGS(rq) >diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h >index f98577967b7fc..6cc358aa5b2ff 100644 >--- a/drivers/gpu/drm/i915/i915_utils.h >+++ b/drivers/gpu/drm/i915/i915_utils.h >@@ -288,7 +288,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) > #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) > > /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */ >-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) >+#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) > # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) > #else > # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) >diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c >index b62ad9006780c..4e4f5501d81da 100644 >--- a/drivers/tty/serial/8250/8250_core.c >+++ b/drivers/tty/serial/8250/8250_core.c >@@ -592,6 +592,7 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) > > #ifdef CONFIG_SERIAL_8250_CONSOLE > >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > static void univ8250_console_write(struct console *co, const char *s, > unsigned int count) > { >@@ -599,6 +600,39 @@ static void univ8250_console_write(struct console *co, const char *s, > > serial8250_console_write(up, s, count); > } >+#else >+static void univ8250_console_write_atomic(struct console *co, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_port *up = &serial8250_ports[co->index]; >+ >+ serial8250_console_write_atomic(up, 
wctxt); >+} >+ >+static void univ8250_console_write_thread(struct console *co, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_port *up = &serial8250_ports[co->index]; >+ >+ serial8250_console_write_thread(up, wctxt); >+} >+ >+static void univ8250_console_device_lock(struct console *con, unsigned long *flags) >+{ >+ struct uart_port *up = &serial8250_ports[con->index].port; >+ >+ __uart_port_lock_irqsave(up, flags); >+} >+ >+static void univ8250_console_device_unlock(struct console *con, unsigned long flags) >+{ >+ struct uart_port *up = &serial8250_ports[con->index].port; >+ >+ __uart_port_unlock_irqrestore(up, flags); >+} >+ >+static struct nbcon_drvdata serial8250_nbcon_drvdata; >+#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ > > static int univ8250_console_setup(struct console *co, char *options) > { >@@ -627,11 +661,11 @@ static int univ8250_console_setup(struct console *co, char *options) > > port = &serial8250_ports[co->index].port; > /* link port to console */ >- port->cons = co; >+ uart_port_set_cons(port, co); > > retval = serial8250_console_setup(port, options, false); > if (retval != 0) >- port->cons = NULL; >+ uart_port_set_cons(port, NULL); > return retval; > } > >@@ -689,7 +723,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, > continue; > > co->index = i; >- port->cons = co; >+ uart_port_set_cons(port, co); > return serial8250_console_setup(port, options, true); > } > >@@ -698,12 +732,21 @@ static int univ8250_console_match(struct console *co, char *name, int idx, > > static struct console univ8250_console = { > .name = "ttyS", >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > .write = univ8250_console_write, >+ .flags = CON_PRINTBUFFER | CON_ANYTIME, >+#else >+ .write_atomic = univ8250_console_write_atomic, >+ .write_thread = univ8250_console_write_thread, >+ .device_lock = univ8250_console_device_lock, >+ .device_unlock = univ8250_console_device_unlock, >+ .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_NBCON, 
>+ .nbcon_drvdata = &serial8250_nbcon_drvdata, >+#endif > .device = uart_console_device, > .setup = univ8250_console_setup, > .exit = univ8250_console_exit, > .match = univ8250_console_match, >- .flags = CON_PRINTBUFFER | CON_ANYTIME, > .index = -1, > .data = &serial8250_reg, > }; >diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c >index 8ca061d3bbb92..ed50b434d8c80 100644 >--- a/drivers/tty/serial/8250/8250_port.c >+++ b/drivers/tty/serial/8250/8250_port.c >@@ -550,6 +550,13 @@ static int serial8250_em485_init(struct uart_8250_port *p) > if (!p->em485) > return -ENOMEM; > >+#ifndef CONFIG_SERIAL_8250_LEGACY_CONSOLE >+ if (uart_console(&p->port)) { >+ dev_warn(p->port.dev, "no atomic printing for rs485 consoles\n"); >+ p->port.cons->write_atomic = NULL; >+ } >+#endif >+ > hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, > HRTIMER_MODE_REL); > hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, >@@ -702,7 +709,11 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) > serial8250_rpm_put(p); > } > >-static void serial8250_clear_IER(struct uart_8250_port *up) >+/* >+ * Only to be used by write_atomic() and the legacy write(), which do not >+ * require port lock. >+ */ >+static void __serial8250_clear_IER(struct uart_8250_port *up) > { > if (up->capabilities & UART_CAP_UUE) > serial_out(up, UART_IER, UART_IER_UUE); >@@ -710,6 +721,14 @@ static void serial8250_clear_IER(struct uart_8250_port *up) > serial_out(up, UART_IER, 0); > } > >+static inline void serial8250_clear_IER(struct uart_8250_port *up) >+{ >+ /* Port locked to synchronize UART_IER access against the console. */ >+ lockdep_assert_held_once(&up->port.lock); >+ >+ __serial8250_clear_IER(up); >+} >+ > #ifdef CONFIG_SERIAL_8250_RSA > /* > * Attempts to turn on the RSA FIFO. Returns zero on failure. 
>@@ -3320,6 +3339,11 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) > > wait_for_xmitr(up, UART_LSR_THRE); > serial_port_out(port, UART_TX, ch); >+ >+ if (ch == '\n') >+ up->console_newline_needed = false; >+ else >+ up->console_newline_needed = true; > } > > /* >@@ -3348,6 +3372,7 @@ static void serial8250_console_restore(struct uart_8250_port *up) > serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); > } > >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > /* > * Print a string to the serial port using the device FIFO > * >@@ -3406,7 +3431,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, > * First save the IER then disable the interrupts > */ > ier = serial_port_in(port, UART_IER); >- serial8250_clear_IER(up); >+ __serial8250_clear_IER(up); > > /* check scratch reg to see if port powered off during system sleep */ > if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >@@ -3472,6 +3497,131 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, > if (locked) > uart_port_unlock_irqrestore(port, flags); > } >+#else >+void serial8250_console_write_thread(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_em485 *em485 = up->em485; >+ struct uart_port *port = &up->port; >+ unsigned int ier; >+ >+ touch_nmi_watchdog(); >+ >+ if (!nbcon_enter_unsafe(wctxt)) >+ return; >+ >+ /* First save IER then disable the interrupts. */ >+ ier = serial_port_in(port, UART_IER); >+ serial8250_clear_IER(up); >+ >+ /* Check scratch reg if port powered off during system sleep. */ >+ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >+ serial8250_console_restore(up); >+ up->canary = 0; >+ } >+ >+ if (em485) { >+ if (em485->tx_stopped) >+ up->rs485_start_tx(up); >+ mdelay(port->rs485.delay_rts_before_send); >+ } >+ >+ if (nbcon_exit_unsafe(wctxt)) { >+ int len = READ_ONCE(wctxt->len); >+ int i; >+ >+ /* >+ * Write out the message. 
Toggle unsafe for each byte in order >+ * to give another (higher priority) context the opportunity >+ * for a friendly takeover. If such a takeover occurs, this >+ * context must reacquire ownership in order to perform final >+ * actions (such as re-enabling the interrupts). >+ * >+ * IMPORTANT: wctxt->outbuf and wctxt->len are no longer valid >+ * after a reacquire so writing the message must be >+ * aborted. >+ */ >+ for (i = 0; i < len; i++) { >+ if (!nbcon_enter_unsafe(wctxt)) { >+ nbcon_reacquire(wctxt); >+ break; >+ } >+ >+ uart_console_write(port, wctxt->outbuf + i, 1, serial8250_console_putchar); >+ >+ if (!nbcon_exit_unsafe(wctxt)) { >+ nbcon_reacquire(wctxt); >+ break; >+ } >+ } >+ } else { >+ nbcon_reacquire(wctxt); >+ } >+ >+ while (!nbcon_enter_unsafe(wctxt)) >+ nbcon_reacquire(wctxt); >+ >+ /* Finally, wait for transmitter to become empty and restore IER. */ >+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); >+ if (em485) { >+ mdelay(port->rs485.delay_rts_after_send); >+ if (em485->tx_stopped) >+ up->rs485_stop_tx(up); >+ } >+ serial_port_out(port, UART_IER, ier); >+ >+ /* >+ * The receive handling will happen properly because the receive ready >+ * bit will still be set; it is not cleared on read. However, modem >+ * control will not, we must call it if we have saved something in the >+ * saved flags while processing with interrupts off. >+ */ >+ if (up->msr_saved_flags) >+ serial8250_modem_status(up); >+ >+ nbcon_exit_unsafe(wctxt); >+} >+ >+void serial8250_console_write_atomic(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_port *port = &up->port; >+ unsigned int ier; >+ >+ /* Atomic console not supported for rs485 mode. */ >+ if (WARN_ON_ONCE(up->em485)) >+ return; >+ >+ touch_nmi_watchdog(); >+ >+ if (!nbcon_enter_unsafe(wctxt)) >+ return; >+ >+ /* >+ * First save IER then disable the interrupts. The special variant to >+ * clear IER is used because atomic printing may occur without holding >+ * the port lock. 
>+ */ >+ ier = serial_port_in(port, UART_IER); >+ __serial8250_clear_IER(up); >+ >+ /* Check scratch reg if port powered off during system sleep. */ >+ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >+ serial8250_console_restore(up); >+ up->canary = 0; >+ } >+ >+ if (up->console_newline_needed) >+ uart_console_write(port, "\n", 1, serial8250_console_putchar); >+ uart_console_write(port, wctxt->outbuf, wctxt->len, serial8250_console_putchar); >+ >+ /* Finally, wait for transmitter to become empty and restore IER. */ >+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); >+ serial_port_out(port, UART_IER, ier); >+ >+ nbcon_exit_unsafe(wctxt); >+} >+#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ > > static unsigned int probe_baud(struct uart_port *port) > { >@@ -3490,6 +3640,7 @@ static unsigned int probe_baud(struct uart_port *port) > > int serial8250_console_setup(struct uart_port *port, char *options, bool probe) > { >+ struct uart_8250_port *up = up_to_u8250p(port); > int baud = 9600; > int bits = 8; > int parity = 'n'; >@@ -3499,6 +3650,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) > if (!port->iobase && !port->membase) > return -ENODEV; > >+ up->console_newline_needed = false; >+ > if (options) > uart_parse_options(options, &baud, &parity, &bits, &flow); > else if (probe) >diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c >index cf2c890a560f0..7618a5783adbc 100644 >--- a/drivers/tty/serial/amba-pl011.c >+++ b/drivers/tty/serial/amba-pl011.c >@@ -348,10 +348,7 @@ static int pl011_fifo_to_tty(struct uart_amba_port *uap) > flag = TTY_FRAME; > } > >- uart_port_unlock(&uap->port); >- sysrq = uart_handle_sysrq_char(&uap->port, ch & 255); >- uart_port_lock(&uap->port); >- >+ sysrq = uart_prepare_sysrq_char(&uap->port, ch & 255); > if (!sysrq) > uart_insert_char(&uap->port, ch, UART011_DR_OE, ch, flag); > } >@@ -1017,7 +1014,7 @@ static void pl011_dma_rx_callback(void *data) > ret = 
pl011_dma_rx_trigger_dma(uap); > > pl011_dma_rx_chars(uap, pending, lastbuf, false); >- uart_port_unlock_irq(&uap->port); >+ uart_unlock_and_check_sysrq(&uap->port); > /* > * Do this check after we picked the DMA chars so we don't > * get some IRQ immediately from RX. >@@ -1540,11 +1537,10 @@ static void check_apply_cts_event_workaround(struct uart_amba_port *uap) > static irqreturn_t pl011_int(int irq, void *dev_id) > { > struct uart_amba_port *uap = dev_id; >- unsigned long flags; > unsigned int status, pass_counter = AMBA_ISR_PASS_LIMIT; > int handled = 0; > >- uart_port_lock_irqsave(&uap->port, &flags); >+ uart_port_lock(&uap->port); > status = pl011_read(uap, REG_RIS) & uap->im; > if (status) { > do { >@@ -1573,7 +1569,7 @@ static irqreturn_t pl011_int(int irq, void *dev_id) > handled = 1; > } > >- uart_port_unlock_irqrestore(&uap->port, flags); >+ uart_unlock_and_check_sysrq(&uap->port); > > return IRQ_RETVAL(handled); > } >@@ -2322,13 +2318,10 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) > > clk_enable(uap->clk); > >- local_irq_save(flags); >- if (uap->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&uap->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&uap->port, &flags); > else >- uart_port_lock(&uap->port); >+ uart_port_lock_irqsave(&uap->port, &flags); > > /* > * First save the CR then disable the interrupts >@@ -2354,8 +2347,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) > pl011_write(old_cr, uap, REG_CR); > > if (locked) >- uart_port_unlock(&uap->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&uap->port, flags); > > clk_disable(uap->clk); > } >@@ -2496,7 +2488,7 @@ static int pl011_console_match(struct console *co, char *name, int idx, > continue; > > co->index = i; >- port->cons = co; >+ uart_port_set_cons(port, co); > return pl011_console_setup(co, options); > } > >diff --git 
a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c >index 8d09ace062e59..7790cbc57391a 100644 >--- a/drivers/tty/serial/ar933x_uart.c >+++ b/drivers/tty/serial/ar933x_uart.c >@@ -378,7 +378,7 @@ static void ar933x_uart_rx_chars(struct ar933x_uart_port *up) > up->port.icount.rx++; > ch = rdata & AR933X_UART_DATA_TX_RX_MASK; > >- if (uart_handle_sysrq_char(&up->port, ch)) >+ if (uart_prepare_sysrq_char(&up->port, ch)) > continue; > > if ((up->port.ignore_status_mask & AR933X_DUMMY_STATUS_RD) == 0) >@@ -468,7 +468,7 @@ static irqreturn_t ar933x_uart_interrupt(int irq, void *dev_id) > ar933x_uart_tx_chars(up); > } > >- uart_port_unlock(&up->port); >+ uart_unlock_and_check_sysrq(&up->port); > > return IRQ_HANDLED; > } >@@ -627,14 +627,10 @@ static void ar933x_uart_console_write(struct console *co, const char *s, > unsigned int int_en; > int locked = 1; > >- local_irq_save(flags); >- >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -654,9 +650,7 @@ static void ar933x_uart_console_write(struct console *co, const char *s, > ar933x_uart_write(up, AR933X_UART_INT_REG, AR933X_UART_INT_ALLINTS); > > if (locked) >- uart_port_unlock(&up->port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int ar933x_uart_console_setup(struct console *co, char *options) >diff --git a/drivers/tty/serial/bcm63xx_uart.c b/drivers/tty/serial/bcm63xx_uart.c >index a3cefa153456d..34801a6f300b6 100644 >--- a/drivers/tty/serial/bcm63xx_uart.c >+++ b/drivers/tty/serial/bcm63xx_uart.c >@@ -285,10 +285,9 @@ static void bcm_uart_do_rx(struct uart_port *port) > flag = TTY_PARITY; > } > >- if (uart_handle_sysrq_char(port, c)) >+ if 
(uart_prepare_sysrq_char(port, c)) > continue; > >- > if ((cstat & port->ignore_status_mask) == 0) > tty_insert_flip_char(tty_port, c, flag); > >@@ -353,7 +352,7 @@ static irqreturn_t bcm_uart_interrupt(int irq, void *dev_id) > estat & UART_EXTINP_DCD_MASK); > } > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > return IRQ_HANDLED; > } > >@@ -703,20 +702,14 @@ static void bcm_console_write(struct console *co, const char *s, > { > struct uart_port *port; > unsigned long flags; >- int locked; >+ int locked = 1; > > port = &ports[co->index]; > >- local_irq_save(flags); >- if (port->sysrq) { >- /* bcm_uart_interrupt() already took the lock */ >- locked = 0; >- } else if (oops_in_progress) { >- locked = uart_port_trylock(port); >- } else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > /* call helper to deal with \r\n */ > uart_console_write(port, s, count, bcm_console_putchar); >@@ -725,8 +718,7 @@ static void bcm_console_write(struct console *co, const char *s, > wait_for_xmitr(port); > > if (locked) >- uart_port_unlock(port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > /* >diff --git a/drivers/tty/serial/lpc32xx_hs.c b/drivers/tty/serial/lpc32xx_hs.c >index ec20329f06036..e70fa59dbcc3b 100644 >--- a/drivers/tty/serial/lpc32xx_hs.c >+++ b/drivers/tty/serial/lpc32xx_hs.c >@@ -136,20 +136,16 @@ static void lpc32xx_hsuart_console_write(struct console *co, const char *s, > int locked = 1; > > touch_nmi_watchdog(); >- local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > uart_console_write(&up->port, s, count, lpc32xx_hsuart_console_putchar); > 
wait_for_xmit_empty(&up->port); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int __init lpc32xx_hsuart_console_setup(struct console *co, >@@ -268,7 +264,8 @@ static void __serial_lpc32xx_rx(struct uart_port *port) > tty_insert_flip_char(tport, 0, TTY_FRAME); > } > >- tty_insert_flip_char(tport, (tmp & 0xFF), flag); >+ if (!uart_prepare_sysrq_char(port, tmp & 0xff)) >+ tty_insert_flip_char(tport, (tmp & 0xFF), flag); > > tmp = readl(LPC32XX_HSUART_FIFO(port->membase)); > } >@@ -333,7 +330,7 @@ static irqreturn_t serial_lpc32xx_interrupt(int irq, void *dev_id) > __serial_lpc32xx_tx(port); > } > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >diff --git a/drivers/tty/serial/meson_uart.c b/drivers/tty/serial/meson_uart.c >index 8395688f5ee92..6feac459c0cf4 100644 >--- a/drivers/tty/serial/meson_uart.c >+++ b/drivers/tty/serial/meson_uart.c >@@ -220,7 +220,7 @@ static void meson_receive_chars(struct uart_port *port) > continue; > } > >- if (uart_handle_sysrq_char(port, ch)) >+ if (uart_prepare_sysrq_char(port, ch)) > continue; > > if ((status & port->ignore_status_mask) == 0) >@@ -248,7 +248,7 @@ static irqreturn_t meson_uart_interrupt(int irq, void *dev_id) > meson_uart_start_tx(port); > } > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -556,18 +556,13 @@ static void meson_serial_port_write(struct uart_port *port, const char *s, > u_int count) > { > unsigned long flags; >- int locked; >+ int locked = 1; > u32 val, tmp; > >- local_irq_save(flags); >- if (port->sysrq) { >- locked = 0; >- } else if (oops_in_progress) { >- locked = uart_port_trylock(port); >- } else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > val = readl(port->membase + 
AML_UART_CONTROL); > tmp = val & ~(AML_UART_TX_INT_EN | AML_UART_RX_INT_EN); >@@ -577,8 +572,7 @@ static void meson_serial_port_write(struct uart_port *port, const char *s, > writel(val, port->membase + AML_UART_CONTROL); > > if (locked) >- uart_port_unlock(port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void meson_serial_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/msm_serial.c b/drivers/tty/serial/msm_serial.c >index e24204ad35def..d27c4c8c84e13 100644 >--- a/drivers/tty/serial/msm_serial.c >+++ b/drivers/tty/serial/msm_serial.c >@@ -588,16 +588,14 @@ static void msm_complete_rx_dma(void *args) > if (!(port->read_status_mask & MSM_UART_SR_RX_BREAK)) > flag = TTY_NORMAL; > >- uart_port_unlock_irqrestore(port, flags); >- sysrq = uart_handle_sysrq_char(port, dma->virt[i]); >- uart_port_lock_irqsave(port, &flags); >+ sysrq = uart_prepare_sysrq_char(port, dma->virt[i]); > if (!sysrq) > tty_insert_flip_char(tport, dma->virt[i], flag); > } > > msm_start_rx_dma(msm_port); > done: >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq_irqrestore(port, flags); > > if (count) > tty_flip_buffer_push(tport); >@@ -763,9 +761,7 @@ static void msm_handle_rx_dm(struct uart_port *port, unsigned int misr) > if (!(port->read_status_mask & MSM_UART_SR_RX_BREAK)) > flag = TTY_NORMAL; > >- uart_port_unlock(port); >- sysrq = uart_handle_sysrq_char(port, buf[i]); >- uart_port_lock(port); >+ sysrq = uart_prepare_sysrq_char(port, buf[i]); > if (!sysrq) > tty_insert_flip_char(tport, buf[i], flag); > } >@@ -825,9 +821,7 @@ static void msm_handle_rx(struct uart_port *port) > else if (sr & MSM_UART_SR_PAR_FRAME_ERR) > flag = TTY_FRAME; > >- uart_port_unlock(port); >- sysrq = uart_handle_sysrq_char(port, c); >- uart_port_lock(port); >+ sysrq = uart_prepare_sysrq_char(port, c); > if (!sysrq) > tty_insert_flip_char(tport, c, flag); > } >@@ -948,11 +942,10 @@ static irqreturn_t 
msm_uart_irq(int irq, void *dev_id) > struct uart_port *port = dev_id; > struct msm_port *msm_port = to_msm_port(port); > struct msm_dma *dma = &msm_port->rx_dma; >- unsigned long flags; > unsigned int misr; > u32 val; > >- uart_port_lock_irqsave(port, &flags); >+ uart_port_lock(port); > misr = msm_read(port, MSM_UART_MISR); > msm_write(port, 0, MSM_UART_IMR); /* disable interrupt */ > >@@ -984,7 +977,7 @@ static irqreturn_t msm_uart_irq(int irq, void *dev_id) > msm_handle_delta_cts(port); > > msm_write(port, msm_port->imr, MSM_UART_IMR); /* restore interrupt */ >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -1621,14 +1614,10 @@ static void __msm_console_write(struct uart_port *port, const char *s, > num_newlines++; > count += num_newlines; > >- local_irq_save(flags); >- >- if (port->sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); > else >- uart_port_lock(port); >+ uart_port_lock_irqsave(port, &flags); > > if (is_uartdm) > msm_reset_dm_count(port, count); >@@ -1667,9 +1656,7 @@ static void __msm_console_write(struct uart_port *port, const char *s, > } > > if (locked) >- uart_port_unlock(port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void msm_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c >index f5a0b401af63b..9be1c871cf116 100644 >--- a/drivers/tty/serial/omap-serial.c >+++ b/drivers/tty/serial/omap-serial.c >@@ -508,7 +508,7 @@ static void serial_omap_rdi(struct uart_omap_port *up, unsigned int lsr) > > up->port.icount.rx++; > >- if (uart_handle_sysrq_char(&up->port, ch)) >+ if (uart_prepare_sysrq_char(&up->port, ch)) > return; > > uart_insert_char(&up->port, lsr, UART_LSR_OE, ch, TTY_NORMAL); >@@ -563,7 +563,7 @@ static irqreturn_t 
serial_omap_irq(int irq, void *dev_id) > } > } while (max_count--); > >- uart_port_unlock(&up->port); >+ uart_unlock_and_check_sysrq(&up->port); > > tty_flip_buffer_push(&up->port.state->port); > >@@ -1212,13 +1212,10 @@ serial_omap_console_write(struct console *co, const char *s, > unsigned int ier; > int locked = 1; > >- local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -1245,8 +1242,7 @@ serial_omap_console_write(struct console *co, const char *s, > check_modem_status(up); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int __init >diff --git a/drivers/tty/serial/owl-uart.c b/drivers/tty/serial/owl-uart.c >index d9fe85397741d..8b60ac0ad7cd3 100644 >--- a/drivers/tty/serial/owl-uart.c >+++ b/drivers/tty/serial/owl-uart.c >@@ -199,6 +199,7 @@ static void owl_uart_receive_chars(struct uart_port *port) > stat = owl_uart_read(port, OWL_UART_STAT); > while (!(stat & OWL_UART_STAT_RFEM)) { > char flag = TTY_NORMAL; >+ bool sysrq; > > if (stat & OWL_UART_STAT_RXER) > port->icount.overrun++; >@@ -217,7 +218,9 @@ static void owl_uart_receive_chars(struct uart_port *port) > val = owl_uart_read(port, OWL_UART_RXDAT); > val &= 0xff; > >- if ((stat & port->ignore_status_mask) == 0) >+ sysrq = uart_prepare_sysrq_char(port, val); >+ >+ if (!sysrq && (stat & port->ignore_status_mask) == 0) > tty_insert_flip_char(&port->state->port, val, flag); > > stat = owl_uart_read(port, OWL_UART_STAT); >@@ -229,10 +232,9 @@ static void owl_uart_receive_chars(struct uart_port *port) > static irqreturn_t owl_uart_irq(int irq, void *dev_id) > { > struct uart_port *port = dev_id; >- unsigned long 
flags; > u32 stat; > >- uart_port_lock_irqsave(port, &flags); >+ uart_port_lock(port); > > stat = owl_uart_read(port, OWL_UART_STAT); > >@@ -246,7 +248,7 @@ static irqreturn_t owl_uart_irq(int irq, void *dev_id) > stat |= OWL_UART_STAT_RIP | OWL_UART_STAT_TIP; > owl_uart_write(port, stat, OWL_UART_STAT); > >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -508,18 +510,12 @@ static void owl_uart_port_write(struct uart_port *port, const char *s, > { > u32 old_ctl, val; > unsigned long flags; >- int locked; >+ int locked = 1; > >- local_irq_save(flags); >- >- if (port->sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(port); >- else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > old_ctl = owl_uart_read(port, OWL_UART_CTL); > val = old_ctl | OWL_UART_CTL_TRFS_TX; >@@ -541,9 +537,7 @@ static void owl_uart_port_write(struct uart_port *port, const char *s, > owl_uart_write(port, old_ctl, OWL_UART_CTL); > > if (locked) >- uart_port_unlock(port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void owl_uart_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c >index 436cc6d52a11b..89257cddf5405 100644 >--- a/drivers/tty/serial/pch_uart.c >+++ b/drivers/tty/serial/pch_uart.c >@@ -237,9 +237,6 @@ struct eg20t_port { > > #define IRQ_NAME_SIZE 17 > char irq_name[IRQ_NAME_SIZE]; >- >- /* protect the eg20t_port private structure and io access to membase */ >- spinlock_t lock; > }; > > /** >@@ -567,7 +564,7 @@ static int pch_uart_hal_read(struct eg20t_port *priv, unsigned char *buf, > if (uart_handle_break(port)) > continue; > } >- if (uart_handle_sysrq_char(port, rbr)) >+ if (uart_prepare_sysrq_char(port, rbr)) > continue; > > buf[i++] = rbr; 
>@@ -599,16 +596,14 @@ static void pch_uart_hal_set_break(struct eg20t_port *priv, int on) > iowrite8(lcr, priv->membase + UART_LCR); > } > >-static int push_rx(struct eg20t_port *priv, const unsigned char *buf, >- int size) >+static void push_rx(struct eg20t_port *priv, const unsigned char *buf, >+ int size) > { > struct uart_port *port = &priv->port; > struct tty_port *tport = &port->state->port; > > tty_insert_flip_string(tport, buf, size); > tty_flip_buffer_push(tport); >- >- return 0; > } > > static int dma_push_rx(struct eg20t_port *priv, int size) >@@ -761,7 +756,7 @@ static int handle_rx_to(struct eg20t_port *priv) > { > struct pch_uart_buffer *buf; > int rx_size; >- int ret; >+ > if (!priv->start_rx) { > pch_uart_hal_disable_interrupt(priv, PCH_UART_HAL_RX_INT | > PCH_UART_HAL_RX_ERR_INT); >@@ -770,19 +765,12 @@ static int handle_rx_to(struct eg20t_port *priv) > buf = &priv->rxbuf; > do { > rx_size = pch_uart_hal_read(priv, buf->buf, buf->size); >- ret = push_rx(priv, buf->buf, rx_size); >- if (ret) >- return 0; >+ push_rx(priv, buf->buf, rx_size); > } while (rx_size == buf->size); > > return PCH_UART_HANDLED_RX_INT; > } > >-static int handle_rx(struct eg20t_port *priv) >-{ >- return handle_rx_to(priv); >-} >- > static int dma_handle_rx(struct eg20t_port *priv) > { > struct uart_port *port = &priv->port; >@@ -1019,11 +1007,10 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) > u8 lsr; > int ret = 0; > unsigned char iid; >- unsigned long flags; > int next = 1; > u8 msr; > >- spin_lock_irqsave(&priv->lock, flags); >+ uart_port_lock(&priv->port); > handled = 0; > while (next) { > iid = pch_uart_hal_get_iid(priv); >@@ -1051,7 +1038,7 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) > PCH_UART_HAL_RX_INT | > PCH_UART_HAL_RX_ERR_INT); > } else { >- ret = handle_rx(priv); >+ ret = handle_rx_to(priv); > } > break; > case PCH_UART_IID_RDR_TO: /* Received Data Ready >@@ -1083,7 +1070,7 @@ static irqreturn_t pch_uart_interrupt(int irq, 
void *dev_id) > handled |= (unsigned int)ret; > } > >- spin_unlock_irqrestore(&priv->lock, flags); >+ uart_unlock_and_check_sysrq(&priv->port); > return IRQ_RETVAL(handled); > } > >@@ -1194,9 +1181,9 @@ static void pch_uart_break_ctl(struct uart_port *port, int ctl) > unsigned long flags; > > priv = container_of(port, struct eg20t_port, port); >- spin_lock_irqsave(&priv->lock, flags); >+ uart_port_lock_irqsave(&priv->port, &flags); > pch_uart_hal_set_break(priv, ctl); >- spin_unlock_irqrestore(&priv->lock, flags); >+ uart_port_unlock_irqrestore(&priv->port, flags); > } > > /* Grab any interrupt resources and initialise any low level driver state. */ >@@ -1346,8 +1333,7 @@ static void pch_uart_set_termios(struct uart_port *port, > > baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16); > >- spin_lock_irqsave(&priv->lock, flags); >- uart_port_lock(port); >+ uart_port_lock_irqsave(port, &flags); > > uart_update_timeout(port, termios->c_cflag, baud); > rtn = pch_uart_hal_set_line(priv, baud, parity, bits, stb); >@@ -1360,8 +1346,7 @@ static void pch_uart_set_termios(struct uart_port *port, > tty_termios_encode_baud_rate(termios, baud, baud); > > out: >- uart_port_unlock(port); >- spin_unlock_irqrestore(&priv->lock, flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static const char *pch_uart_type(struct uart_port *port) >@@ -1565,27 +1550,17 @@ pch_console_write(struct console *co, const char *s, unsigned int count) > { > struct eg20t_port *priv; > unsigned long flags; >- int priv_locked = 1; >- int port_locked = 1; >+ int locked = 1; > u8 ier; > > priv = pch_uart_ports[co->index]; > > touch_nmi_watchdog(); > >- local_irq_save(flags); >- if (priv->port.sysrq) { >- /* call to uart_handle_sysrq_char already took the priv lock */ >- priv_locked = 0; >- /* serial8250_handle_port() already took the port lock */ >- port_locked = 0; >- } else if (oops_in_progress) { >- priv_locked = spin_trylock(&priv->lock); >- port_locked = 
uart_port_trylock(&priv->port); >- } else { >- spin_lock(&priv->lock); >- uart_port_lock(&priv->port); >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&priv->port, &flags); >+ else >+ uart_port_lock_irqsave(&priv->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -1603,11 +1578,8 @@ pch_console_write(struct console *co, const char *s, unsigned int count) > wait_for_xmitr(priv, UART_LSR_BOTH_EMPTY); > iowrite8(ier, priv->membase + UART_IER); > >- if (port_locked) >- uart_port_unlock(&priv->port); >- if (priv_locked) >- spin_unlock(&priv->lock); >- local_irq_restore(flags); >+ if (locked) >+ uart_port_unlock_irqrestore(&priv->port, flags); > } > > static int __init pch_console_setup(struct console *co, char *options) >@@ -1704,8 +1676,6 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev, > pci_enable_msi(pdev); > pci_set_master(pdev); > >- spin_lock_init(&priv->lock); >- > iobase = pci_resource_start(pdev, 0); > mapbase = pci_resource_start(pdev, 1); > priv->mapbase = mapbase; >@@ -1735,8 +1705,6 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev, > KBUILD_MODNAME ":" PCH_UART_DRIVER_DEVICE "%d", > priv->port.line); > >- spin_lock_init(&priv->port.lock); >- > pci_set_drvdata(pdev, priv); > priv->trigger_level = 1; > priv->fcr = 0; >diff --git a/drivers/tty/serial/pxa.c b/drivers/tty/serial/pxa.c >index 46e70e155aab2..e395ff29c1a2c 100644 >--- a/drivers/tty/serial/pxa.c >+++ b/drivers/tty/serial/pxa.c >@@ -151,7 +151,7 @@ static inline void receive_chars(struct uart_pxa_port *up, int *status) > flag = TTY_FRAME; > } > >- if (uart_handle_sysrq_char(&up->port, ch)) >+ if (uart_prepare_sysrq_char(&up->port, ch)) > goto ignore_char; > > uart_insert_char(&up->port, *status, UART_LSR_OE, ch, flag); >@@ -232,7 +232,7 @@ static inline irqreturn_t serial_pxa_irq(int irq, void *dev_id) > check_modem_status(up); > if (lsr & UART_LSR_THRE) > transmit_chars(up); >- uart_port_unlock(&up->port); >+ 
uart_unlock_and_check_sysrq(&up->port); > return IRQ_HANDLED; > } > >@@ -604,13 +604,10 @@ serial_pxa_console_write(struct console *co, const char *s, unsigned int count) > int locked = 1; > > clk_enable(up->clk); >- local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -628,10 +625,8 @@ serial_pxa_console_write(struct console *co, const char *s, unsigned int count) > serial_out(up, UART_IER, ier); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > clk_disable(up->clk); >- > } > > #ifdef CONFIG_CONSOLE_POLL >diff --git a/drivers/tty/serial/rda-uart.c b/drivers/tty/serial/rda-uart.c >index 13deb355cf1bc..82def9b8632a5 100644 >--- a/drivers/tty/serial/rda-uart.c >+++ b/drivers/tty/serial/rda-uart.c >@@ -394,7 +394,8 @@ static void rda_uart_receive_chars(struct uart_port *port) > val &= 0xff; > > port->icount.rx++; >- tty_insert_flip_char(&port->state->port, val, flag); >+ if (!uart_prepare_sysrq_char(port, val)) >+ tty_insert_flip_char(&port->state->port, val, flag); > > status = rda_uart_read(port, RDA_UART_STATUS); > } >@@ -405,10 +406,9 @@ static void rda_uart_receive_chars(struct uart_port *port) > static irqreturn_t rda_interrupt(int irq, void *dev_id) > { > struct uart_port *port = dev_id; >- unsigned long flags; > u32 val, irq_mask; > >- uart_port_lock_irqsave(port, &flags); >+ uart_port_lock(port); > > /* Clear IRQ cause */ > val = rda_uart_read(port, RDA_UART_IRQ_CAUSE); >@@ -425,7 +425,7 @@ static irqreturn_t rda_interrupt(int irq, void *dev_id) > rda_uart_send_chars(port); > } > >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq(port); > > return 
IRQ_HANDLED; > } >@@ -590,18 +590,12 @@ static void rda_uart_port_write(struct uart_port *port, const char *s, > { > u32 old_irq_mask; > unsigned long flags; >- int locked; >+ int locked = 1; > >- local_irq_save(flags); >- >- if (port->sysrq) { >- locked = 0; >- } else if (oops_in_progress) { >- locked = uart_port_trylock(port); >- } else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > old_irq_mask = rda_uart_read(port, RDA_UART_IRQ_MASK); > rda_uart_write(port, 0, RDA_UART_IRQ_MASK); >@@ -615,9 +609,7 @@ static void rda_uart_port_write(struct uart_port *port, const char *s, > rda_uart_write(port, old_irq_mask, RDA_UART_IRQ_MASK); > > if (locked) >- uart_port_unlock(port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void rda_uart_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c >index d6a58a9e072a1..0c13ea6a3afaa 100644 >--- a/drivers/tty/serial/serial_core.c >+++ b/drivers/tty/serial/serial_core.c >@@ -3145,8 +3145,15 @@ static int serial_core_add_one_port(struct uart_driver *drv, struct uart_port *u > state->uart_port = uport; > uport->state = state; > >+ /* >+ * If this port is in use as a console then the spinlock is already >+ * initialised. 
>+ */ >+ if (!uart_console_registered(uport)) >+ uart_port_spin_lock_init(uport); >+ > state->pm_state = UART_PM_STATE_UNDEFINED; >- uport->cons = drv->cons; >+ uart_port_set_cons(uport, drv->cons); > uport->minor = drv->tty_driver->minor_start + uport->line; > uport->name = kasprintf(GFP_KERNEL, "%s%d", drv->dev_name, > drv->tty_driver->name_base + uport->line); >@@ -3155,13 +3162,6 @@ static int serial_core_add_one_port(struct uart_driver *drv, struct uart_port *u > goto out; > } > >- /* >- * If this port is in use as a console then the spinlock is already >- * initialised. >- */ >- if (!uart_console_registered(uport)) >- uart_port_spin_lock_init(uport); >- > if (uport->cons && uport->dev) > of_console_check(uport->dev->of_node, uport->cons->name, uport->line); > >diff --git a/drivers/tty/serial/sifive.c b/drivers/tty/serial/sifive.c >index a4cc569a78a25..0670fd9f84967 100644 >--- a/drivers/tty/serial/sifive.c >+++ b/drivers/tty/serial/sifive.c >@@ -412,7 +412,8 @@ static void __ssp_receive_chars(struct sifive_serial_port *ssp) > break; > > ssp->port.icount.rx++; >- uart_insert_char(&ssp->port, 0, 0, ch, TTY_NORMAL); >+ if (!uart_prepare_sysrq_char(&ssp->port, ch)) >+ uart_insert_char(&ssp->port, 0, 0, ch, TTY_NORMAL); > } > > tty_flip_buffer_push(&ssp->port.state->port); >@@ -534,7 +535,7 @@ static irqreturn_t sifive_serial_irq(int irq, void *dev_id) > if (ip & SIFIVE_SERIAL_IP_TXWM_MASK) > __ssp_transmit_chars(ssp); > >- uart_port_unlock(&ssp->port); >+ uart_unlock_and_check_sysrq(&ssp->port); > > return IRQ_HANDLED; > } >@@ -791,13 +792,10 @@ static void sifive_serial_console_write(struct console *co, const char *s, > if (!ssp) > return; > >- local_irq_save(flags); >- if (ssp->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&ssp->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&ssp->port, &flags); > else >- uart_port_lock(&ssp->port); >+ uart_port_lock_irqsave(&ssp->port, &flags); > > ier = 
__ssp_readl(ssp, SIFIVE_SERIAL_IE_OFFS); > __ssp_writel(0, SIFIVE_SERIAL_IE_OFFS, ssp); >@@ -807,8 +805,7 @@ static void sifive_serial_console_write(struct console *co, const char *s, > __ssp_writel(ier, SIFIVE_SERIAL_IE_OFFS, ssp); > > if (locked) >- uart_port_unlock(&ssp->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&ssp->port, flags); > } > > static int sifive_serial_console_setup(struct console *co, char *options) >diff --git a/drivers/tty/serial/sunplus-uart.c b/drivers/tty/serial/sunplus-uart.c >index 99f5285819d4b..f5e29eb4a4ce4 100644 >--- a/drivers/tty/serial/sunplus-uart.c >+++ b/drivers/tty/serial/sunplus-uart.c >@@ -260,7 +260,7 @@ static void receive_chars(struct uart_port *port) > if (port->ignore_status_mask & SUP_DUMMY_READ) > goto ignore_char; > >- if (uart_handle_sysrq_char(port, ch)) >+ if (uart_prepare_sysrq_char(port, ch)) > goto ignore_char; > > uart_insert_char(port, lsr, SUP_UART_LSR_OE, ch, flag); >@@ -287,7 +287,7 @@ static irqreturn_t sunplus_uart_irq(int irq, void *args) > if (isc & SUP_UART_ISC_TX) > transmit_chars(port); > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -512,22 +512,16 @@ static void sunplus_console_write(struct console *co, > unsigned long flags; > int locked = 1; > >- local_irq_save(flags); >- >- if (sunplus_console_ports[co->index]->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&sunplus_console_ports[co->index]->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&sunplus_console_ports[co->index]->port, &flags); > else >- uart_port_lock(&sunplus_console_ports[co->index]->port); >+ uart_port_lock_irqsave(&sunplus_console_ports[co->index]->port, &flags); > > uart_console_write(&sunplus_console_ports[co->index]->port, s, count, > sunplus_uart_console_putchar); > > if (locked) >- uart_port_unlock(&sunplus_console_ports[co->index]->port); >- >- local_irq_restore(flags); >+ 
uart_port_unlock_irqrestore(&sunplus_console_ports[co->index]->port, flags); > } > > static int __init sunplus_console_setup(struct console *co, char *options) >diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c >index 407b0d87b7c10..c9c914bc033c9 100644 >--- a/drivers/tty/tty_io.c >+++ b/drivers/tty/tty_io.c >@@ -3567,8 +3567,13 @@ static ssize_t show_cons_active(struct device *dev, > for_each_console(c) { > if (!c->device) > continue; >- if (!c->write) >- continue; >+ if (c->flags & CON_NBCON) { >+ if (!c->write_atomic && !c->write_thread) >+ continue; >+ } else { >+ if (!c->write) >+ continue; >+ } > if ((c->flags & CON_ENABLED) == 0) > continue; > cs[i++] = c; >diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c >index e0758fe7936dc..2703676549f5e 100644 >--- a/fs/proc/consoles.c >+++ b/fs/proc/consoles.c >@@ -21,12 +21,14 @@ static int show_console_dev(struct seq_file *m, void *v) > { CON_ENABLED, 'E' }, > { CON_CONSDEV, 'C' }, > { CON_BOOT, 'B' }, >+ { CON_NBCON, 'N' }, > { CON_PRINTBUFFER, 'p' }, > { CON_BRL, 'b' }, > { CON_ANYTIME, 'a' }, > }; > char flags[ARRAY_SIZE(con_flags) + 1]; > struct console *con = v; >+ char con_write = '-'; > unsigned int a; > dev_t dev = 0; > >@@ -57,9 +59,15 @@ static int show_console_dev(struct seq_file *m, void *v) > seq_setwidth(m, 21 - 1); > seq_printf(m, "%s%d", con->name, con->index); > seq_pad(m, ' '); >- seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', >- con->write ? 'W' : '-', con->unblank ? 'U' : '-', >- flags); >+ if (con->flags & CON_NBCON) { >+ if (con->write_atomic || con->write_thread) >+ con_write = 'W'; >+ } else { >+ if (con->write) >+ con_write = 'W'; >+ } >+ seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con_write, >+ con->unblank ? 
'U' : '-', flags); > if (dev) > seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); > >diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h >index fc53e0ad56d90..448bbef474564 100644 >--- a/include/linux/bottom_half.h >+++ b/include/linux/bottom_half.h >@@ -35,8 +35,10 @@ static inline void local_bh_enable(void) > > #ifdef CONFIG_PREEMPT_RT > extern bool local_bh_blocked(void); >+extern void softirq_preempt(void); > #else > static inline bool local_bh_blocked(void) { return false; } >+static inline void softirq_preempt(void) { } > #endif > > #endif /* _LINUX_BH_H */ >diff --git a/include/linux/console.h b/include/linux/console.h >index 779d388af8a0a..1eb2d1e58b1c7 100644 >--- a/include/linux/console.h >+++ b/include/linux/console.h >@@ -16,7 +16,9 @@ > > #include <linux/atomic.h> > #include <linux/bits.h> >+#include <linux/irq_work.h> > #include <linux/rculist.h> >+#include <linux/rcuwait.h> > #include <linux/types.h> > > struct vc_data; >@@ -137,7 +139,7 @@ static inline int con_debug_leave(void) > */ > > /** >- * cons_flags - General console flags >+ * enum cons_flags - General console flags > * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate > * output of messages that were already shown by boot > * consoles or read by userspace via syslog() syscall. >@@ -218,7 +220,7 @@ struct nbcon_state { > static_assert(sizeof(struct nbcon_state) <= sizeof(int)); > > /** >- * nbcon_prio - console owner priority for nbcon consoles >+ * enum nbcon_prio - console owner priority for nbcon consoles > * @NBCON_PRIO_NONE: Unused > * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage > * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) 
>@@ -282,10 +284,29 @@ struct nbcon_write_context { > bool unsafe_takeover; > }; > >+/** >+ * struct nbcon_drvdata - Data to allow nbcon acquire in non-print context >+ * @ctxt: The core console context >+ * @srcu_cookie: Storage for a console_srcu_lock cookie, if needed >+ * @owner_index: Storage for the owning console index, if needed >+ * @locked: Storage for the locked state, if needed >+ * >+ * All fields (except for @ctxt) are available exclusively to the driver to >+ * use as needed. They are not used by the printk subsystem. >+ */ >+struct nbcon_drvdata { >+ struct nbcon_context __private ctxt; >+ >+ /* reserved for driver use */ >+ int srcu_cookie; >+ short owner_index; >+ bool locked; >+}; >+ > /** > * struct console - The console descriptor structure > * @name: The name of the console driver >- * @write: Write callback to output messages (Optional) >+ * @write: Legacy write callback to output messages (Optional) > * @read: Read callback for console input (Optional) > * @device: The underlying TTY device driver (Optional) > * @unblank: Callback to unblank the console (Optional) >@@ -302,10 +323,13 @@ struct nbcon_write_context { > * @data: Driver private data > * @node: hlist node for the console list > * >- * @write_atomic: Write callback for atomic context > * @nbcon_state: State for nbcon consoles > * @nbcon_seq: Sequence number of the next record for nbcon to print >+ * @nbcon_prev_seq: Seq num the previous nbcon owner was assigned to print > * @pbufs: Pointer to nbcon private buffer >+ * @kthread: Printer kthread for this console >+ * @rcuwait: RCU-safe wait object for @kthread waking >+ * @irq_work: Defer @kthread waking to IRQ work context > */ > struct console { > char name[16]; >@@ -327,11 +351,122 @@ struct console { > struct hlist_node node; > > /* nbcon console specific members */ >- bool (*write_atomic)(struct console *con, >- struct nbcon_write_context *wctxt); >+ >+ /** >+ * @write_atomic: >+ * >+ * NBCON callback to write out text in any 
context. >+ * >+ * This callback is called with the console already acquired. The >+ * callback can use nbcon_can_proceed() at any time to verify that >+ * it is still the owner of the console. In the case that it has >+ * lost ownership, it is no longer allowed to go forward. In this >+ * case it must back out immediately and carefully. The buffer >+ * content is also no longer trusted since it no longer belongs to >+ * the context. >+ * >+ * If the callback needs to perform actions where ownership is not >+ * allowed to be taken over, nbcon_enter_unsafe() and >+ * nbcon_exit_unsafe() can be used to mark such sections. These >+ * functions are also points of possible ownership transfer. If >+ * either function returns false, ownership has been lost. >+ * >+ * If the driver must reacquire ownership in order to finalize or >+ * revert hardware changes, nbcon_reacquire() can be used. However, >+ * on reacquire the buffer content is no longer available. A >+ * reacquire cannot be used to resume printing. >+ * >+ * This callback can be called from any context (including NMI). >+ * Therefore it must avoid usage of any locking and instead rely >+ * on the console ownership for synchronization. >+ */ >+ void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); >+ >+ /** >+ * @write_thread: >+ * >+ * NBCON callback to write out text in task context. (Optional) >+ * >+ * This callback is called with the console already acquired. Any >+ * additional driver synchronization should have been performed by >+ * device_lock(). >+ * >+ * This callback is always called from task context but with migration >+ * disabled. >+ * >+ * The same criteria for console ownership verification and unsafe >+ * sections applies as with write_atomic(). The difference between >+ * this callback and write_atomic() is that this callback is used >+ * during normal operation and is always called from task context. 
>+ * This provides drivers with a relatively relaxed locking context >+ * for synchronizing output to the hardware. >+ */ >+ void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); >+ >+ /** >+ * @device_lock: >+ * >+ * NBCON callback to begin synchronization with driver code. >+ * >+ * Console drivers typically must deal with access to the hardware >+ * via user input/output (such as an interactive login shell) and >+ * output of kernel messages via printk() calls. This callback is >+ * called by the printk-subsystem whenever it needs to synchronize >+ * with hardware access by the driver. It should be implemented to >+ * use whatever synchronization mechanism the driver is using for >+ * itself (for example, the port lock for uart serial consoles). >+ * >+ * This callback is always called from task context. It may use any >+ * synchronization method required by the driver. BUT this callback >+ * MUST also disable migration. The console driver may be using a >+ * synchronization mechanism that already takes care of this (such as >+ * spinlocks). Otherwise this function must explicitly call >+ * migrate_disable(). >+ * >+ * The flags argument is provided as a convenience to the driver. It >+ * will be passed again to device_unlock(). It can be ignored if the >+ * driver does not need it. >+ */ >+ void (*device_lock)(struct console *con, unsigned long *flags); >+ >+ /** >+ * @device_unlock: >+ * >+ * NBCON callback to finish synchronization with driver code. >+ * >+ * It is the counterpart to device_lock(). >+ * >+ * This callback is always called from task context. It must >+ * appropriately re-enable migration (depending on how device_lock() >+ * disabled migration). >+ * >+ * The flags argument is the value of the same variable that was >+ * passed to device_lock(). 
>+ */ >+ void (*device_unlock)(struct console *con, unsigned long flags); >+ > atomic_t __private nbcon_state; > atomic_long_t __private nbcon_seq; >+ atomic_long_t __private nbcon_prev_seq; >+ >+ /** >+ * @nbcon_drvdata: >+ * >+ * Data for nbcon ownership tracking to allow acquiring nbcon consoles >+ * in non-printing contexts. >+ * >+ * Drivers may need to acquire nbcon consoles in non-printing >+ * contexts. This is achieved by providing a struct nbcon_drvdata. >+ * Then the driver can call nbcon_driver_acquire() and >+ * nbcon_driver_release(). The struct does not require any special >+ * initialization. >+ */ >+ struct nbcon_drvdata *nbcon_drvdata; >+ > struct printk_buffers *pbufs; >+ struct task_struct *kthread; >+ struct rcuwait rcuwait; >+ struct irq_work irq_work; > }; > > #ifdef CONFIG_LOCKDEP >@@ -360,28 +495,29 @@ extern void console_list_unlock(void) __releases(console_mutex); > extern struct hlist_head console_list; > > /** >- * console_srcu_read_flags - Locklessly read the console flags >+ * console_srcu_read_flags - Locklessly read flags of a possibly registered >+ * console > * @con: struct console pointer of console to read flags from > * >- * This function provides the necessary READ_ONCE() and data_race() >- * notation for locklessly reading the console flags. The READ_ONCE() >- * in this function matches the WRITE_ONCE() when @flags are modified >- * for registered consoles with console_srcu_write_flags(). >+ * Locklessly reading @con->flags provides a consistent read value because >+ * there is at most one CPU modifying @con->flags and that CPU is using only >+ * read-modify-write operations to do so. > * >- * Only use this function to read console flags when locklessly >- * iterating the console list via srcu. >+ * Requires console_srcu_read_lock to be held, which implies that @con might >+ * be a registered console. 
If the caller is holding the console_list_lock or >+ * it is certain that the console is not registered, the caller may read >+ * @con->flags directly instead. > * > * Context: Any context. >+ * Return: The current value of the @con->flags field. > */ > static inline short console_srcu_read_flags(const struct console *con) > { > WARN_ON_ONCE(!console_srcu_read_lock_is_held()); > > /* >- * Locklessly reading console->flags provides a consistent >- * read value because there is at most one CPU modifying >- * console->flags and that CPU is using only read-modify-write >- * operations to do so. >+ * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified >+ * for registered consoles with console_srcu_write_flags(). > */ > return data_race(READ_ONCE(con->flags)); > } >@@ -459,13 +595,19 @@ static inline bool console_is_registered(const struct console *con) > hlist_for_each_entry(con, &console_list, node) > > #ifdef CONFIG_PRINTK >+extern void nbcon_cpu_emergency_enter(void); >+extern void nbcon_cpu_emergency_exit(void); > extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); > extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); > extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); >+extern void nbcon_reacquire(struct nbcon_write_context *wctxt); > #else >+static inline void nbcon_cpu_emergency_enter(void) { } >+static inline void nbcon_cpu_emergency_exit(void) { } > static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } > static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } > static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } >+static inline void nbcon_reacquire(struct nbcon_write_context *wctxt) { } > #endif > > extern int console_set_on_cmdline; >diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h >index b0fb775a600d9..f5bb19369973a 100644 >--- a/include/linux/entry-common.h >+++ 
b/include/linux/entry-common.h >@@ -65,7 +65,7 @@ > #define EXIT_TO_USER_MODE_WORK \ > (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ > _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ >- ARCH_EXIT_TO_USER_MODE_WORK) >+ _TIF_NEED_RESCHED_LAZY | ARCH_EXIT_TO_USER_MODE_WORK) > > /** > * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs >diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h >index 6813171afccb2..674a622c91be2 100644 >--- a/include/linux/entry-kvm.h >+++ b/include/linux/entry-kvm.h >@@ -18,7 +18,7 @@ > > #define XFER_TO_GUEST_MODE_WORK \ > (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ >- _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) >+ _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED_LAZY | ARCH_XFER_TO_GUEST_MODE_WORK) > > struct kvm_vcpu; > >diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h >index 76121c2bb4f82..f75f6bc195d18 100644 >--- a/include/linux/interrupt.h >+++ b/include/linux/interrupt.h >@@ -609,6 +609,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq(unsigned int nr); > >+#ifdef CONFIG_PREEMPT_RT >+DECLARE_PER_CPU(struct task_struct *, timersd); >+DECLARE_PER_CPU(unsigned long, pending_timer_softirq); >+ >+extern void raise_timer_softirq(void); >+extern void raise_hrtimer_softirq(void); >+ >+static inline unsigned int local_pending_timers(void) >+{ >+ return __this_cpu_read(pending_timer_softirq); >+} >+ >+#else >+static inline void raise_timer_softirq(void) >+{ >+ raise_softirq(TIMER_SOFTIRQ); >+} >+ >+static inline void raise_hrtimer_softirq(void) >+{ >+ raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+} >+ >+static inline unsigned int local_pending_timers(void) >+{ >+ return local_softirq_pending(); >+} >+#endif >+ > DECLARE_PER_CPU(struct task_struct *, ksoftirqd); > > static inline struct task_struct *this_cpu_ksoftirqd(void) >diff --git 
a/include/linux/netdevice.h b/include/linux/netdevice.h >index dba428b3a87a5..0db375f9c339b 100644 >--- a/include/linux/netdevice.h >+++ b/include/linux/netdevice.h >@@ -3365,6 +3365,7 @@ static inline void dev_xmit_recursion_dec(void) > __this_cpu_dec(softnet_data.xmit.recursion); > } > >+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); > void __netif_schedule(struct Qdisc *q); > void netif_schedule_queue(struct netdev_queue *txq); > >diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h >index d2a15c0c6f8a9..c1c6600541657 100644 >--- a/include/linux/perf_event.h >+++ b/include/linux/perf_event.h >@@ -781,9 +781,9 @@ struct perf_event { > unsigned int pending_wakeup; > unsigned int pending_kill; > unsigned int pending_disable; >- unsigned int pending_sigtrap; > unsigned long pending_addr; /* SIGTRAP */ > struct irq_work pending_irq; >+ struct irq_work pending_disable_irq; > struct callback_head pending_task; > unsigned int pending_work; > >@@ -959,7 +959,7 @@ struct perf_event_context { > struct rcu_head rcu_head; > > /* >- * Sum (event->pending_sigtrap + event->pending_work) >+ * Sum (event->pending_work + event->pending_work) > * > * The SIGTRAP is targeted at ctx->task, as such it won't do changing > * that until the signal is delivered. 
>diff --git a/include/linux/printk.h b/include/linux/printk.h >index 8ef499ab3c1ed..f2074b458d801 100644 >--- a/include/linux/printk.h >+++ b/include/linux/printk.h >@@ -9,6 +9,8 @@ > #include <linux/ratelimit_types.h> > #include <linux/once_lite.h> > >+struct console; >+ > extern const char linux_banner[]; > extern const char linux_proc_banner[]; > >@@ -157,15 +159,16 @@ int _printk(const char *fmt, ...); > */ > __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); > >-extern void __printk_safe_enter(void); >-extern void __printk_safe_exit(void); >+extern void __printk_deferred_enter(void); >+extern void __printk_deferred_exit(void); >+ > /* > * The printk_deferred_enter/exit macros are available only as a hack for > * some code paths that need to defer all printk console printing. Interrupts > * must be disabled for the deferred duration. > */ >-#define printk_deferred_enter __printk_safe_enter >-#define printk_deferred_exit __printk_safe_exit >+#define printk_deferred_enter() __printk_deferred_enter() >+#define printk_deferred_exit() __printk_deferred_exit() > > /* > * Please don't use printk_ratelimit(), because it shares ratelimiting state >@@ -192,6 +195,10 @@ void show_regs_print_info(const char *log_lvl); > extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; > extern asmlinkage void dump_stack(void) __cold; > void printk_trigger_flush(void); >+void printk_legacy_allow_panic_sync(void); >+extern void nbcon_driver_acquire(struct console *con); >+extern void nbcon_driver_release(struct console *con); >+void nbcon_atomic_flush_unsafe(void); > #else > static inline __printf(1, 0) > int vprintk(const char *s, va_list args) >@@ -271,6 +278,23 @@ > static inline void printk_trigger_flush(void) > { > } >+ >+static inline void printk_legacy_allow_panic_sync(void) >+{ >+} >+ >+static inline void nbcon_driver_acquire(struct console *con) >+{ >+} >+ >+static inline void nbcon_driver_release(struct console *con) >+{ >+} >+ >+static inline void 
nbcon_atomic_flush_unsafe(void) >+{ >+} >+ > #endif > > bool this_cpu_in_panic(void); >diff --git a/include/linux/sched.h b/include/linux/sched.h >index ffe8f618ab869..cb4df5d70e3d0 100644 >--- a/include/linux/sched.h >+++ b/include/linux/sched.h >@@ -1791,6 +1791,7 @@ static inline int dl_task_check_affinity(struct task_struct *p, const struct cpu > } > #endif > >+extern bool task_is_pi_boosted(const struct task_struct *p); > extern int yield_to(struct task_struct *p, bool preempt); > extern void set_user_nice(struct task_struct *p, long nice); > extern int task_prio(const struct task_struct *p); >@@ -1933,17 +1934,17 @@ static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, > update_ti_thread_flag(task_thread_info(tsk), flag, value); > } > >-static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); > } > >-static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); > } > >-static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_ti_thread_flag(task_thread_info(tsk), flag); > } >@@ -1956,9 +1957,11 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) > static inline void clear_tsk_need_resched(struct task_struct *tsk) > { > clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); >+ if (IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO)) >+ clear_tsk_thread_flag(tsk, TIF_NEED_RESCHED_LAZY); > } > >-static inline int test_tsk_need_resched(struct task_struct *tsk) >+static inline bool test_tsk_need_resched(struct task_struct *tsk) > { > return 
unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); > } >@@ -2099,7 +2102,7 @@ static inline bool preempt_model_preemptible(void) > > static __always_inline bool need_resched(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(tif_need_resched_lazy() || tif_need_resched()); > } > > /* >diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h >index 478084f9105e1..719416fe8ddc0 100644 >--- a/include/linux/sched/idle.h >+++ b/include/linux/sched/idle.h >@@ -63,7 +63,7 @@ static __always_inline bool __must_check current_set_polling_and_test(void) > */ > smp_mb__after_atomic(); > >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > > static __always_inline bool __must_check current_clr_polling_and_test(void) >@@ -76,7 +76,7 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) > */ > smp_mb__after_atomic(); > >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > > #else >@@ -85,11 +85,11 @@ static inline void __current_clr_polling(void) { } > > static inline bool __must_check current_set_polling_and_test(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > static inline bool __must_check current_clr_polling_and_test(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > #endif > >diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h >index be65de65fe612..ff445a5fca281 100644 >--- a/include/linux/serial_8250.h >+++ b/include/linux/serial_8250.h >@@ -153,6 +153,8 @@ struct uart_8250_port { > #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA > unsigned char msr_saved_flags; > >+ bool console_newline_needed; >+ > struct uart_8250_dma *dma; > const struct uart_8250_ops *ops; > >@@ -204,6 +206,10 @@ void serial8250_init_port(struct uart_8250_port *up); > void serial8250_set_defaults(struct uart_8250_port *up); > void serial8250_console_write(struct uart_8250_port *up, const char 
*s, > unsigned int count); >+void serial8250_console_write_atomic(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt); >+void serial8250_console_write_thread(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt); > int serial8250_console_setup(struct uart_port *port, char *options, bool probe); > int serial8250_console_exit(struct uart_port *port); > >diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h >index 55b1f3ba48ac1..9a73dee32ad9a 100644 >--- a/include/linux/serial_core.h >+++ b/include/linux/serial_core.h >@@ -8,10 +8,13 @@ > #define LINUX_SERIAL_CORE_H > > #include <linux/bitops.h> >+#include <linux/bug.h> > #include <linux/compiler.h> > #include <linux/console.h> > #include <linux/interrupt.h> > #include <linux/circ_buf.h> >+#include <linux/lockdep.h> >+#include <linux/printk.h> > #include <linux/spinlock.h> > #include <linux/sched.h> > #include <linux/tty.h> >@@ -588,6 +591,101 @@ struct uart_port { > void *private_data; /* generic platform data pointer */ > }; > >+/* >+ * Only for console->device_lock()/_unlock() callbacks and internal >+ * port lock wrapper synchronization. >+ */ >+static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) >+{ >+ spin_lock_irqsave(&up->lock, *flags); >+} >+ >+/* >+ * Only for console->device_lock()/_unlock() callbacks and internal >+ * port lock wrapper synchronization. >+ */ >+static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) >+{ >+ spin_unlock_irqrestore(&up->lock, flags); >+} >+ >+/** >+ * uart_port_set_cons - Safely set the @cons field for a uart >+ * @up: The uart port to set >+ * @con: The new console to set to >+ * >+ * This function must be used to set @up->cons. It uses the port lock to >+ * synchronize with the port lock wrappers in order to ensure that the console >+ * cannot change or disappear while another context is holding the port lock. 
>+ */ >+static inline void uart_port_set_cons(struct uart_port *up, struct console *con) >+{ >+ unsigned long flags; >+ >+ __uart_port_lock_irqsave(up, &flags); >+ up->cons = con; >+ __uart_port_unlock_irqrestore(up, flags); >+} >+ >+/* Only for internal port lock wrapper usage. */ >+static inline void __uart_port_nbcon_acquire(struct uart_port *up) >+{ >+ lockdep_assert_held_once(&up->lock); >+ >+ if (likely(!uart_console(up))) >+ return; >+ >+ if (up->cons->nbcon_drvdata) { >+ /* >+ * If @up->cons is registered, prevent it from fully >+ * unregistering until this context releases the nbcon. >+ */ >+ int cookie = console_srcu_read_lock(); >+ >+ /* Ensure console is registered and is an nbcon console. */ >+ if (!hlist_unhashed_lockless(&up->cons->node) && >+ (console_srcu_read_flags(up->cons) & CON_NBCON)) { >+ WARN_ON_ONCE(up->cons->nbcon_drvdata->locked); >+ >+ nbcon_driver_acquire(up->cons); >+ >+ /* >+ * Record @up->line to be used during release because >+ * @up->cons->index can change while the port and >+ * nbcon are locked. >+ */ >+ up->cons->nbcon_drvdata->owner_index = up->line; >+ up->cons->nbcon_drvdata->srcu_cookie = cookie; >+ up->cons->nbcon_drvdata->locked = true; >+ } else { >+ console_srcu_read_unlock(cookie); >+ } >+ } >+} >+ >+/* Only for internal port lock wrapper usage. */ >+static inline void __uart_port_nbcon_release(struct uart_port *up) >+{ >+ lockdep_assert_held_once(&up->lock); >+ >+ /* >+ * uart_console() cannot be used here because @up->cons->index might >+ * have changed. Check against @up->cons->nbcon_drvdata->owner_index >+ * instead. 
>+ */ >+ >+ if (unlikely(up->cons && >+ up->cons->nbcon_drvdata && >+ up->cons->nbcon_drvdata->locked && >+ up->cons->nbcon_drvdata->owner_index == up->line)) { >+ WARN_ON_ONCE(!up->cons->nbcon_drvdata->locked); >+ >+ up->cons->nbcon_drvdata->locked = false; >+ nbcon_driver_release(up->cons); >+ console_srcu_read_unlock(up->cons->nbcon_drvdata->srcu_cookie); >+ } >+} >+ > /** > * uart_port_lock - Lock the UART port > * @up: Pointer to UART port structure >@@ -595,6 +693,7 @@ struct uart_port { > static inline void uart_port_lock(struct uart_port *up) > { > spin_lock(&up->lock); >+ __uart_port_nbcon_acquire(up); > } > > /** >@@ -604,6 +703,7 @@ static inline void uart_port_lock(struct uart_port *up) > static inline void uart_port_lock_irq(struct uart_port *up) > { > spin_lock_irq(&up->lock); >+ __uart_port_nbcon_acquire(up); > } > > /** >@@ -614,6 +714,7 @@ static inline void uart_port_lock_irq(struct uart_port *up) > static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) > { > spin_lock_irqsave(&up->lock, *flags); >+ __uart_port_nbcon_acquire(up); > } > > /** >@@ -624,7 +725,11 @@ static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *f > */ > static inline bool uart_port_trylock(struct uart_port *up) > { >- return spin_trylock(&up->lock); >+ if (!spin_trylock(&up->lock)) >+ return false; >+ >+ __uart_port_nbcon_acquire(up); >+ return true; > } > > /** >@@ -636,7 +741,11 @@ static inline bool uart_port_trylock(struct uart_port *up) > */ > static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags) > { >- return spin_trylock_irqsave(&up->lock, *flags); >+ if (!spin_trylock_irqsave(&up->lock, *flags)) >+ return false; >+ >+ __uart_port_nbcon_acquire(up); >+ return true; > } > > /** >@@ -645,6 +754,7 @@ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long > */ > static inline void uart_port_unlock(struct uart_port *up) > { >+ __uart_port_nbcon_release(up); > 
spin_unlock(&up->lock); > } > >@@ -654,6 +764,7 @@ static inline void uart_port_unlock(struct uart_port *up) > */ > static inline void uart_port_unlock_irq(struct uart_port *up) > { >+ __uart_port_nbcon_release(up); > spin_unlock_irq(&up->lock); > } > >@@ -664,6 +775,7 @@ static inline void uart_port_unlock_irq(struct uart_port *up) > */ > static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) > { >+ __uart_port_nbcon_release(up); > spin_unlock_irqrestore(&up->lock, flags); > } > >diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h >index 9ea0b28068f49..5ded1450ac1a1 100644 >--- a/include/linux/thread_info.h >+++ b/include/linux/thread_info.h >@@ -59,6 +59,16 @@ enum syscall_work_bit { > > #include <asm/thread_info.h> > >+#ifdef CONFIG_PREEMPT_BUILD_AUTO >+# define TIF_NEED_RESCHED_LAZY TIF_ARCH_RESCHED_LAZY >+# define _TIF_NEED_RESCHED_LAZY _TIF_ARCH_RESCHED_LAZY >+# define TIF_NEED_RESCHED_LAZY_OFFSET (TIF_NEED_RESCHED_LAZY - TIF_NEED_RESCHED) >+#else >+# define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED >+# define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED >+# define TIF_NEED_RESCHED_LAZY_OFFSET 0 >+#endif >+ > #ifdef __KERNEL__ > > #ifndef arch_set_restart_data >@@ -185,6 +195,13 @@ static __always_inline bool tif_need_resched(void) > (unsigned long *)(¤t_thread_info()->flags)); > } > >+static __always_inline bool tif_need_resched_lazy(void) >+{ >+ return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && >+ arch_test_bit(TIF_NEED_RESCHED_LAZY, >+ (unsigned long *)(¤t_thread_info()->flags)); >+} >+ > #else > > static __always_inline bool tif_need_resched(void) >@@ -193,6 +210,13 @@ static __always_inline bool tif_need_resched(void) > (unsigned long *)(¤t_thread_info()->flags)); > } > >+static __always_inline bool tif_need_resched_lazy(void) >+{ >+ return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && >+ test_bit(TIF_NEED_RESCHED_LAZY, >+ (unsigned long *)(¤t_thread_info()->flags)); >+} >+ > #endif /* 
_ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ > > #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES >diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h >index d68ff9b1247f9..0681b3d5a85c6 100644 >--- a/include/linux/trace_events.h >+++ b/include/linux/trace_events.h >@@ -178,8 +178,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); > > enum trace_flag_type { > TRACE_FLAG_IRQS_OFF = 0x01, >- TRACE_FLAG_IRQS_NOSUPPORT = 0x02, >- TRACE_FLAG_NEED_RESCHED = 0x04, >+ TRACE_FLAG_NEED_RESCHED = 0x02, >+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x04, > TRACE_FLAG_HARDIRQ = 0x08, > TRACE_FLAG_SOFTIRQ = 0x10, > TRACE_FLAG_PREEMPT_RESCHED = 0x20, >@@ -205,11 +205,11 @@ static inline unsigned int tracing_gen_ctx(void) > > static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) > { >- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); >+ return tracing_gen_ctx_irq_test(0); > } > static inline unsigned int tracing_gen_ctx(void) > { >- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); >+ return tracing_gen_ctx_irq_test(0); > } > #endif > >diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt >index c2f1fd95a8214..0f3d4c2a41cb7 100644 >--- a/kernel/Kconfig.preempt >+++ b/kernel/Kconfig.preempt >@@ -11,6 +11,13 @@ config PREEMPT_BUILD > select PREEMPTION > select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK > >+config PREEMPT_BUILD_AUTO >+ bool >+ select PREEMPT_BUILD >+ >+config HAVE_PREEMPT_AUTO >+ bool >+ > choice > prompt "Preemption Model" > default PREEMPT_NONE >@@ -67,9 +74,17 @@ config PREEMPT > embedded system with latency requirements in the milliseconds > range. 
> >+config PREEMPT_AUTO >+ bool "Automagic preemption mode with runtime tweaking support" >+ depends on HAVE_PREEMPT_AUTO >+ select PREEMPT_BUILD_AUTO >+ help >+ Add some sensible blurb here >+ > config PREEMPT_RT > bool "Fully Preemptible Kernel (Real-Time)" > depends on EXPERT && ARCH_SUPPORTS_RT >+ select PREEMPT_BUILD_AUTO if HAVE_PREEMPT_AUTO > select PREEMPTION > help > This option turns the kernel into a real-time kernel by replacing >@@ -95,7 +110,7 @@ config PREEMPTION > > config PREEMPT_DYNAMIC > bool "Preemption behaviour defined on boot" >- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT >+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT && !PREEMPT_AUTO > select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY > select PREEMPT_BUILD > default y if HAVE_PREEMPT_DYNAMIC_CALL >diff --git a/kernel/entry/common.c b/kernel/entry/common.c >index 88cb3c88aaa5c..d78b109750a3c 100644 >--- a/kernel/entry/common.c >+++ b/kernel/entry/common.c >@@ -92,7 +92,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, > > local_irq_enable_exit_to_user(ti_work); > >- if (ti_work & _TIF_NEED_RESCHED) >+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > schedule(); > > if (ti_work & _TIF_UPROBE) >@@ -301,7 +301,7 @@ void raw_irqentry_exit_cond_resched(void) > rcu_irq_exit_check_preempt(); > if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) > WARN_ON_ONCE(!on_thread_stack()); >- if (need_resched()) >+ if (test_tsk_need_resched(current)) > preempt_schedule_irq(); > } > } >diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c >index 2e0f75bcb7fd1..d952fa5ee8801 100644 >--- a/kernel/entry/kvm.c >+++ b/kernel/entry/kvm.c >@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) > return -EINTR; > } > >- if (ti_work & _TIF_NEED_RESCHED) >+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > schedule(); > > if (ti_work & _TIF_NOTIFY_RESUME) >diff --git a/kernel/events/core.c b/kernel/events/core.c >index 
f0f0f71213a1d..d5af4d03c2680 100644 >--- a/kernel/events/core.c >+++ b/kernel/events/core.c >@@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) > state = PERF_EVENT_STATE_OFF; > } > >- if (event->pending_sigtrap) { >- bool dec = true; >- >- event->pending_sigtrap = 0; >- if (state != PERF_EVENT_STATE_OFF && >- !event->pending_work) { >- event->pending_work = 1; >- dec = false; >- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); >- task_work_add(current, &event->pending_task, TWA_RESUME); >- } >- if (dec) >- local_dec(&event->ctx->nr_pending); >- } >- > perf_event_set_state(event, state); > > if (!is_software_event(event)) >@@ -2464,7 +2449,7 @@ static void __perf_event_disable(struct perf_event *event, > * hold the top-level event's child_mutex, so any descendant that > * goes to exit will block in perf_event_exit_event(). > * >- * When called from perf_pending_irq it's OK because event->ctx >+ * When called from perf_pending_disable it's OK because event->ctx > * is the current context on this CPU and preemption is disabled, > * hence we can't get into perf_event_task_sched_out for this context. > */ >@@ -2504,7 +2489,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable); > void perf_event_disable_inatomic(struct perf_event *event) > { > event->pending_disable = 1; >- irq_work_queue(&event->pending_irq); >+ irq_work_queue(&event->pending_disable_irq); > } > > #define MAX_INTERRUPTS (~0ULL) >@@ -5190,6 +5175,7 @@ static void perf_addr_filters_splice(struct perf_event *event, > static void _free_event(struct perf_event *event) > { > irq_work_sync(&event->pending_irq); >+ irq_work_sync(&event->pending_disable_irq); > > unaccount_event(event); > >@@ -6726,7 +6712,7 @@ static void perf_sigtrap(struct perf_event *event) > /* > * Deliver the pending work in-event-context or follow the context. 
> */ >-static void __perf_pending_irq(struct perf_event *event) >+static void __perf_pending_disable(struct perf_event *event) > { > int cpu = READ_ONCE(event->oncpu); > >@@ -6741,11 +6727,6 @@ static void __perf_pending_irq(struct perf_event *event) > * Yay, we hit home and are in the context of the event. > */ > if (cpu == smp_processor_id()) { >- if (event->pending_sigtrap) { >- event->pending_sigtrap = 0; >- perf_sigtrap(event); >- local_dec(&event->ctx->nr_pending); >- } > if (event->pending_disable) { > event->pending_disable = 0; > perf_event_disable_local(event); >@@ -6769,11 +6750,26 @@ static void __perf_pending_irq(struct perf_event *event) > * irq_work_queue(); // FAILS > * > * irq_work_run() >- * perf_pending_irq() >+ * perf_pending_disable() > * > * But the event runs on CPU-B and wants disabling there. > */ >- irq_work_queue_on(&event->pending_irq, cpu); >+ irq_work_queue_on(&event->pending_disable_irq, cpu); >+} >+ >+static void perf_pending_disable(struct irq_work *entry) >+{ >+ struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); >+ int rctx; >+ >+ /* >+ * If we 'fail' here, that's OK, it means recursion is already disabled >+ * and we won't recurse 'further'. 
>+ */ >+ rctx = perf_swevent_get_recursion_context(); >+ __perf_pending_disable(event); >+ if (rctx >= 0) >+ perf_swevent_put_recursion_context(rctx); > } > > static void perf_pending_irq(struct irq_work *entry) >@@ -6796,8 +6792,6 @@ static void perf_pending_irq(struct irq_work *entry) > perf_event_wakeup(event); > } > >- __perf_pending_irq(event); >- > if (rctx >= 0) > perf_swevent_put_recursion_context(rctx); > } >@@ -6805,14 +6799,6 @@ static void perf_pending_irq(struct irq_work *entry) > static void perf_pending_task(struct callback_head *head) > { > struct perf_event *event = container_of(head, struct perf_event, pending_task); >- int rctx; >- >- /* >- * If we 'fail' here, that's OK, it means recursion is already disabled >- * and we won't recurse 'further'. >- */ >- preempt_disable_notrace(); >- rctx = perf_swevent_get_recursion_context(); > > if (event->pending_work) { > event->pending_work = 0; >@@ -6820,10 +6806,6 @@ static void perf_pending_task(struct callback_head *head) > local_dec(&event->ctx->nr_pending); > } > >- if (rctx >= 0) >- perf_swevent_put_recursion_context(rctx); >- preempt_enable_notrace(); >- > put_event(event); > } > >@@ -9592,13 +9574,23 @@ static int __perf_event_overflow(struct perf_event *event, > > if (regs) > pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; >- if (!event->pending_sigtrap) { >- event->pending_sigtrap = pending_id; >+ if (!event->pending_work) { >+ event->pending_work = pending_id; > local_inc(&event->ctx->nr_pending); >+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); >+ task_work_add(current, &event->pending_task, TWA_RESUME); >+ /* >+ * The NMI path returns directly to userland. The >+ * irq_work is raised as a dummy interrupt to ensure >+ * regular return path to user is taken and task_work >+ * is processed. 
>+ */ >+ if (in_nmi()) >+ irq_work_queue(&event->pending_disable_irq); > } else if (event->attr.exclude_kernel && valid_sample) { > /* > * Should not be able to return to user space without >- * consuming pending_sigtrap; with exceptions: >+ * consuming pending_work; with exceptions: > * > * 1. Where !exclude_kernel, events can overflow again > * in the kernel without returning to user space. >@@ -9608,13 +9600,12 @@ static int __perf_event_overflow(struct perf_event *event, > * To approximate progress (with false negatives), > * check 32-bit hash of the current IP. > */ >- WARN_ON_ONCE(event->pending_sigtrap != pending_id); >+ WARN_ON_ONCE(event->pending_work != pending_id); > } > > event->pending_addr = 0; > if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) > event->pending_addr = data->addr; >- irq_work_queue(&event->pending_irq); > } > > READ_ONCE(event->overflow_handler)(event, data, regs); >@@ -11935,6 +11926,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, > > init_waitqueue_head(&event->waitq); > init_irq_work(&event->pending_irq, perf_pending_irq); >+ event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); > init_task_work(&event->pending_task, perf_pending_task); > > mutex_init(&event->mmap_mutex); >@@ -13049,6 +13041,13 @@ static void sync_child_event(struct perf_event *child_event) > &parent_event->child_total_time_running); > } > >+static bool task_work_cb_match(struct callback_head *cb, void *data) >+{ >+ struct perf_event *event = container_of(cb, struct perf_event, pending_task); >+ >+ return event == data; >+} >+ > static void > perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) > { >@@ -13088,6 +13087,17 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) > * Kick perf_poll() for is_event_hup(); > */ > perf_event_wakeup(parent_event); >+ /* >+ * Cancel pending task_work and update counters if it has not >+ * yet been delivered to userland. 
free_event() expects the >+ * reference counter at 1 and keeping the event around until the >+ * task return to userland will be a unexpected. >+ */ >+ if (event->pending_work && >+ task_work_cancel_match(current, task_work_cb_match, event)) { >+ put_event(event); >+ local_dec(&event->ctx->nr_pending); >+ } > free_event(event); > put_event(parent_event); > return; >diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c >index 1d4bc493b2f4b..486c68c11bbe2 100644 >--- a/kernel/ksysfs.c >+++ b/kernel/ksysfs.c >@@ -179,6 +179,15 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size); > > #endif /* CONFIG_CRASH_CORE */ > >+#if defined(CONFIG_PREEMPT_RT) >+static ssize_t realtime_show(struct kobject *kobj, >+ struct kobj_attribute *attr, char *buf) >+{ >+ return sprintf(buf, "%d\n", 1); >+} >+KERNEL_ATTR_RO(realtime); >+#endif >+ > /* whether file capabilities are enabled */ > static ssize_t fscaps_show(struct kobject *kobj, > struct kobj_attribute *attr, char *buf) >@@ -274,6 +283,9 @@ static struct attribute * kernel_attrs[] = { > #ifndef CONFIG_TINY_RCU > &rcu_expedited_attr.attr, > &rcu_normal_attr.attr, >+#endif >+#ifdef CONFIG_PREEMPT_RT >+ &realtime_attr.attr, > #endif > NULL > }; >diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c >index 151bd3de59363..80cfbe7b340e3 100644 >--- a/kernel/locking/lockdep.c >+++ b/kernel/locking/lockdep.c >@@ -56,6 +56,7 @@ > #include <linux/kprobes.h> > #include <linux/lockdep.h> > #include <linux/context_tracking.h> >+#include <linux/console.h> > > #include <asm/sections.h> > >@@ -574,8 +575,10 @@ static struct lock_trace *save_trace(void) > if (!debug_locks_off_graph_unlock()) > return NULL; > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > > return NULL; > } >@@ -782,6 +785,8 @@ static void lockdep_print_held_locks(struct task_struct *p) > { > int i, depth = READ_ONCE(p->lockdep_depth); > >+ nbcon_cpu_emergency_enter(); >+ > if (!depth) > 
printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); > else >@@ -792,11 +797,13 @@ static void lockdep_print_held_locks(struct task_struct *p) > * and it's not the current task. > */ > if (p != current && task_is_running(p)) >- return; >+ goto out; > for (i = 0; i < depth; i++) { > printk(" #%d: ", i); > print_lock(p->held_locks + i); > } >+out: >+ nbcon_cpu_emergency_exit(); > } > > static void print_kernel_ident(void) >@@ -888,11 +895,13 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) > if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { > instrumentation_begin(); > debug_locks_off(); >+ nbcon_cpu_emergency_enter(); > printk(KERN_ERR > "BUG: looking up invalid subclass: %u\n", subclass); > printk(KERN_ERR > "turning off the locking correctness validator.\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > instrumentation_end(); > return NULL; > } >@@ -969,11 +978,13 @@ static bool assign_lock_key(struct lockdep_map *lock) > else { > /* Debug-check: all keys must be persistent! 
*/ > debug_locks_off(); >+ nbcon_cpu_emergency_enter(); > pr_err("INFO: trying to register non-static key.\n"); > pr_err("The code is fine but needs lockdep annotation, or maybe\n"); > pr_err("you didn't initialize this object before use?\n"); > pr_err("turning off the locking correctness validator.\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return false; > } > >@@ -1317,8 +1328,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) > return NULL; > } > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return NULL; > } > nr_lock_classes++; >@@ -1350,11 +1363,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) > if (verbose(class)) { > graph_unlock(); > >+ nbcon_cpu_emergency_enter(); > printk("\nnew class %px: %s", class->key, class->name); > if (class->name_version > 1) > printk(KERN_CONT "#%d", class->name_version); > printk(KERN_CONT "\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > > if (!graph_lock()) { > return NULL; >@@ -1393,8 +1408,10 @@ static struct lock_list *alloc_list_entry(void) > if (!debug_locks_off_graph_unlock()) > return NULL; > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return NULL; > } > nr_list_entries++; >@@ -2040,6 +2057,8 @@ static noinline void print_circular_bug(struct lock_list *this, > > depth = get_lock_depth(target); > >+ nbcon_cpu_emergency_enter(); >+ > print_circular_bug_header(target, depth, check_src, check_tgt); > > parent = get_lock_parent(target); >@@ -2058,6 +2077,8 @@ static noinline void print_circular_bug(struct lock_list *this, > > printk("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static noinline void print_bfs_bug(int ret) >@@ -2570,6 +2591,8 @@ print_bad_irq_dependency(struct task_struct *curr, > if 
(!debug_locks_off_graph_unlock() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=====================================================\n"); > pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", >@@ -2619,11 +2642,13 @@ print_bad_irq_dependency(struct task_struct *curr, > pr_warn(" and %s-irq-unsafe lock:\n", irqclass); > next_root->trace = save_trace(); > if (!next_root->trace) >- return; >+ goto out; > print_shortest_lock_dependencies(forwards_entry, next_root); > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+out: >+ nbcon_cpu_emergency_exit(); > } > > static const char *state_names[] = { >@@ -2988,6 +3013,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, > if (!debug_locks_off_graph_unlock() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("============================================\n"); > pr_warn("WARNING: possible recursive locking detected\n"); >@@ -3010,6 +3037,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -3607,6 +3636,8 @@ static void print_collision(struct task_struct *curr, > struct held_lock *hlock_next, > struct lock_chain *chain) > { >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("============================\n"); > pr_warn("WARNING: chain_key collision\n"); >@@ -3623,6 +3654,8 @@ static void print_collision(struct task_struct *curr, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > #endif > >@@ -3713,8 +3746,10 @@ static inline int add_chain_cache(struct task_struct *curr, > if (!debug_locks_off_graph_unlock()) > return 0; > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return 0; > } > chain->chain_key = chain_key; >@@ -3731,8 +3766,10 
@@ static inline int add_chain_cache(struct task_struct *curr, > if (!debug_locks_off_graph_unlock()) > return 0; > >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > return 0; > } > >@@ -3971,6 +4008,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, > if (!debug_locks_off() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("================================\n"); > pr_warn("WARNING: inconsistent lock state\n"); >@@ -3999,6 +4038,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -4033,6 +4074,8 @@ print_irq_inversion_bug(struct task_struct *curr, > if (!debug_locks_off_graph_unlock() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("========================================================\n"); > pr_warn("WARNING: possible irq lock inversion dependency detected\n"); >@@ -4073,11 +4116,13 @@ print_irq_inversion_bug(struct task_struct *curr, > pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); > root->trace = save_trace(); > if (!root->trace) >- return; >+ goto out; > print_shortest_lock_dependencies(other, root); > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+out: >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -4154,6 +4199,8 @@ void print_irqtrace_events(struct task_struct *curr) > { > const struct irqtrace_events *trace = &curr->irqtrace; > >+ nbcon_cpu_emergency_enter(); >+ > printk("irq event stamp: %u\n", trace->irq_events); > printk("hardirqs last enabled at (%u): [<%px>] %pS\n", > trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, >@@ -4167,6 +4214,8 @@ void print_irqtrace_events(struct task_struct *curr) > printk("softirqs last disabled at (%u): [<%px>] %pS\n", > trace->softirq_disable_event, 
(void *)trace->softirq_disable_ip, > (void *)trace->softirq_disable_ip); >+ >+ nbcon_cpu_emergency_exit(); > } > > static int HARDIRQ_verbose(struct lock_class *class) >@@ -4687,10 +4736,12 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, > * We must printk outside of the graph_lock: > */ > if (ret == 2) { >+ nbcon_cpu_emergency_enter(); > printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); > print_lock(this); > print_irqtrace_events(curr); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > } > > return ret; >@@ -4731,6 +4782,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, > if (debug_locks_silent) > return 0; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=============================\n"); > pr_warn("[ BUG: Invalid wait context ]\n"); >@@ -4750,6 +4803,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, > pr_warn("stack backtrace:\n"); > dump_stack(); > >+ nbcon_cpu_emergency_exit(); >+ > return 0; > } > >@@ -4954,6 +5009,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("==================================\n"); > pr_warn("WARNING: Nested lock was not taken\n"); >@@ -4974,6 +5031,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static int __lock_is_held(const struct lockdep_map *lock, int read); >@@ -5019,11 +5078,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, > debug_class_ops_inc(class); > > if (very_verbose(class)) { >+ nbcon_cpu_emergency_enter(); > printk("\nacquire class [%px] %s", class->key, class->name); > if (class->name_version > 1) > printk(KERN_CONT "#%d", class->name_version); > printk(KERN_CONT "\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -5150,6 +5211,7 @@ static int __lock_acquire(struct lockdep_map 
*lock, unsigned int subclass, > #endif > if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { > debug_locks_off(); >+ nbcon_cpu_emergency_enter(); > print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); > printk(KERN_DEBUG "depth: %i max: %lu!\n", > curr->lockdep_depth, MAX_LOCK_DEPTH); >@@ -5157,6 +5219,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, > lockdep_print_held_locks(current); > debug_show_all_locks(); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > > return 0; > } >@@ -5176,6 +5239,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=====================================\n"); > pr_warn("WARNING: bad unlock balance detected!\n"); >@@ -5192,6 +5257,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static noinstr int match_held_lock(const struct held_lock *hlock, >@@ -5895,6 +5962,8 @@ static void print_lock_contention_bug(struct task_struct *curr, > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=================================\n"); > pr_warn("WARNING: bad contention detected!\n"); >@@ -5911,6 +5980,8 @@ static void print_lock_contention_bug(struct task_struct *curr, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static void >@@ -6524,6 +6595,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("=========================\n"); > pr_warn("WARNING: held lock freed!\n"); >@@ -6536,6 +6609,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > static inline int 
not_in_range(const void* mem_from, unsigned long mem_len, >@@ -6582,6 +6657,8 @@ static void print_held_locks_bug(void) > if (debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("====================================\n"); > pr_warn("WARNING: %s/%d still has locks held!\n", >@@ -6591,6 +6668,8 @@ static void print_held_locks_bug(void) > lockdep_print_held_locks(current); > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > void debug_check_no_locks_held(void) >@@ -6609,6 +6688,7 @@ void debug_show_all_locks(void) > pr_warn("INFO: lockdep is turned off.\n"); > return; > } >+ nbcon_cpu_emergency_enter(); > pr_warn("\nShowing all locks held in the system:\n"); > > rcu_read_lock(); >@@ -6623,6 +6703,7 @@ void debug_show_all_locks(void) > > pr_warn("\n"); > pr_warn("=============================================\n\n"); >+ nbcon_cpu_emergency_exit(); > } > EXPORT_SYMBOL_GPL(debug_show_all_locks); > #endif >@@ -6648,6 +6729,7 @@ asmlinkage __visible void lockdep_sys_exit(void) > if (unlikely(curr->lockdep_depth)) { > if (!debug_locks_off()) > return; >+ nbcon_cpu_emergency_enter(); > pr_warn("\n"); > pr_warn("================================================\n"); > pr_warn("WARNING: lock held when returning to user space!\n"); >@@ -6656,6 +6738,7 @@ asmlinkage __visible void lockdep_sys_exit(void) > pr_warn("%s/%d is leaving the kernel with locks still held!\n", > curr->comm, curr->pid); > lockdep_print_held_locks(curr); >+ nbcon_cpu_emergency_exit(); > } > > /* >@@ -6672,6 +6755,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) > bool rcu = warn_rcu_enter(); > > /* Note: the following can be executed concurrently, so be careful. 
*/ >+ nbcon_cpu_emergency_enter(); > pr_warn("\n"); > pr_warn("=============================\n"); > pr_warn("WARNING: suspicious RCU usage\n"); >@@ -6710,6 +6794,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) > lockdep_print_held_locks(curr); > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ nbcon_cpu_emergency_exit(); > warn_rcu_exit(rcu); > } > EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); >diff --git a/kernel/panic.c b/kernel/panic.c >index 2807639aab51d..3754a2471b4ff 100644 >--- a/kernel/panic.c >+++ b/kernel/panic.c >@@ -364,6 +364,8 @@ void panic(const char *fmt, ...) > > panic_other_cpus_shutdown(_crash_kexec_post_notifiers); > >+ printk_legacy_allow_panic_sync(); >+ > /* > * Run any panic handlers, including those that might need to > * add information to the kmsg dump output. >@@ -453,6 +453,7 @@ > * Explicitly flush the kernel log buffer one last time. > */ > console_flush_on_panic(CONSOLE_FLUSH_PENDING); >+ nbcon_atomic_flush_unsafe(); > > local_irq_enable(); > for (i = 0; ; i += PANIC_TIMER_STEP) { >@@ -623,6 +634,7 @@ bool oops_may_print(void) > */ > void oops_enter(void) > { >+ nbcon_cpu_emergency_enter(); > tracing_off(); > /* can't trust the integrity of the kernel anymore: */ > debug_locks_off(); >@@ -645,6 +657,7 @@ void oops_exit(void) > { > do_oops_enter_exit(); > print_oops_end_marker(); >+ nbcon_cpu_emergency_exit(); > kmsg_dump(KMSG_DUMP_OOPS); > } > >@@ -656,6 +669,8 @@ struct warn_args { > void __warn(const char *file, int line, void *caller, unsigned taint, > struct pt_regs *regs, struct warn_args *args) > { >+ nbcon_cpu_emergency_enter(); >+ > disable_trace_on_warning(); > > if (file) >@@ -686,6 +701,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, > > /* Just a warning, don't kill lockdep. 
*/ > add_taint(taint, LOCKDEP_STILL_OK); >+ >+ nbcon_cpu_emergency_exit(); > } > > #ifdef CONFIG_BUG >diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h >index ac2d9750e5f81..fdf455c890338 100644 >--- a/kernel/printk/internal.h >+++ b/kernel/printk/internal.h >@@ -2,11 +2,13 @@ > /* > * internal.h - printk internal definitions > */ >-#include <linux/percpu.h> > #include <linux/console.h> >-#include "printk_ringbuffer.h" >+#include <linux/jump_label.h> >+#include <linux/percpu.h> >+#include <linux/types.h> > > #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) >+struct ctl_table; > void __init printk_sysctl_init(void); > int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, > void *buffer, size_t *lenp, loff_t *ppos); >@@ -20,6 +22,13 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, > (con->flags & CON_BOOT) ? "boot" : "", \ > con->name, con->index, ##__VA_ARGS__) > >+#ifdef CONFIG_PREEMPT_RT >+# define force_printkthreads() (true) >+#else >+DECLARE_STATIC_KEY_FALSE(force_printkthreads_key); >+# define force_printkthreads() (static_branch_unlikely(&force_printkthreads_key)) >+#endif >+ > #ifdef CONFIG_PRINTK > > #ifdef CONFIG_PRINTK_CALLER >@@ -43,7 +52,11 @@ enum printk_info_flags { > LOG_CONT = 8, /* text is a fragment of a continuation line */ > }; > >+struct printk_ringbuffer; >+struct dev_printk_info; >+ > extern struct printk_ringbuffer *prb; >+extern bool printk_threads_enabled; > > __printf(4, 0) > int vprintk_store(int facility, int level, >@@ -53,6 +66,9 @@ int vprintk_store(int facility, int level, > __printf(1, 0) int vprintk_default(const char *fmt, va_list args); > __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); > >+void __printk_safe_enter(void); >+void __printk_safe_exit(void); >+ > bool printk_percpu_data_ready(void); > > #define printk_safe_enter_irqsave(flags) \ >@@ -71,12 +87,79 @@ void defer_console_output(void); > > u16 printk_parse_prefix(const char *text, int *level, > enum 
printk_info_flags *flags); >+void console_lock_spinning_enable(void); >+int console_lock_spinning_disable_and_check(int cookie); > > u64 nbcon_seq_read(struct console *con); > void nbcon_seq_force(struct console *con, u64 seq); > bool nbcon_alloc(struct console *con); > void nbcon_init(struct console *con); > void nbcon_free(struct console *con); >+enum nbcon_prio nbcon_get_default_prio(void); >+void nbcon_atomic_flush_pending(void); >+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, >+ int cookie, bool use_atomic); >+void nbcon_kthread_create(struct console *con); >+void nbcon_wake_threads(void); >+void nbcon_legacy_kthread_create(void); >+ >+/* >+ * Check if the given console is currently capable and allowed to print >+ * records. Note that this function does not consider the current context, >+ * which can also play a role in deciding if @con can be used to print >+ * records. >+ */ >+static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) >+{ >+ if (!(flags & CON_ENABLED)) >+ return false; >+ >+ if ((flags & CON_SUSPENDED)) >+ return false; >+ >+ if (flags & CON_NBCON) { >+ if (use_atomic) { >+ if (!con->write_atomic) >+ return false; >+ } else { >+ if (!con->write_thread) >+ return false; >+ } >+ } else { >+ if (!con->write) >+ return false; >+ } >+ >+ /* >+ * Console drivers may assume that per-cpu resources have been >+ * allocated. So unless they're explicitly marked as being able to >+ * cope (CON_ANYTIME) don't call them until this CPU is officially up. >+ */ >+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) >+ return false; >+ >+ return true; >+} >+ >+/** >+ * nbcon_kthread_wake - Wake up a printk thread >+ * @con: Console to operate on >+ */ >+static inline void nbcon_kthread_wake(struct console *con) >+{ >+ /* >+ * Guarantee any new records can be seen by tasks preparing to wait >+ * before this context checks if the rcuwait is empty. 
>+ * >+ * The full memory barrier in rcuwait_wake_up() pairs with the full >+ * memory barrier within set_current_state() of >+ * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait() >+ * adds the waiter but before it has checked the wait condition. >+ * >+ * This pairs with nbcon_kthread_func:A. >+ */ >+ rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */ >+} > > #else > >@@ -84,6 +167,10 @@ void nbcon_free(struct console *con); > #define PRINTK_MESSAGE_MAX 0 > #define PRINTKRB_RECORD_MAX 0 > >+static inline void nbcon_kthread_wake(struct console *con) { } >+static inline void nbcon_kthread_create(struct console *con) { } >+#define printk_threads_enabled (false) >+ > /* > * In !PRINTK builds we still export console_sem > * semaphore and some of console functions (console_unlock()/etc.), so >@@ -98,9 +185,27 @@ static inline void nbcon_seq_force(struct console *con, u64 seq) { } > static inline bool nbcon_alloc(struct console *con) { return false; } > static inline void nbcon_init(struct console *con) { } > static inline void nbcon_free(struct console *con) { } >+static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; } >+static inline void nbcon_atomic_flush_pending(void) { } >+static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, >+ int cookie, bool use_atomic) { return false; } >+ >+static inline bool console_is_usable(struct console *con, short flags, >+ bool use_atomic) { return false; } > > #endif /* CONFIG_PRINTK */ > >+extern bool have_boot_console; >+extern bool have_legacy_console; >+ >+/* >+ * Specifies if the console lock/unlock dance is needed for console >+ * printing. If @have_boot_console is true, the nbcon consoles will >+ * be printed serially along with the legacy consoles because nbcon >+ * consoles cannot print simultaneously with boot consoles. 
>+ */ >+#define printing_via_unlock (have_legacy_console || have_boot_console) >+ > extern struct printk_buffers printk_shared_pbufs; > > /** >@@ -135,4 +135,5 @@ > > #ifdef CONFIG_PRINTK > void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped); >+void console_prepend_replay(struct printk_message *pmsg); > #endif >diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c >index c8093bcc01fe6..932b888aa4c30 100644 >--- a/kernel/printk/nbcon.c >+++ b/kernel/printk/nbcon.c >@@ -2,11 +2,26 @@ > // Copyright (C) 2022 Linutronix GmbH, John Ogness > // Copyright (C) 2022 Intel, Thomas Gleixner > >-#include <linux/kernel.h> >+#include <linux/atomic.h> >+#include <linux/bug.h> > #include <linux/console.h> > #include <linux/delay.h> >+#include <linux/errno.h> >+#include <linux/export.h> >+#include <linux/init.h> >+#include <linux/irqflags.h> >+#include <linux/kthread.h> >+#include <linux/minmax.h> >+#include <linux/percpu.h> >+#include <linux/preempt.h> > #include <linux/slab.h> >+#include <linux/smp.h> >+#include <linux/stddef.h> >+#include <linux/string.h> >+#include <linux/syscore_ops.h> >+#include <linux/types.h> > #include "internal.h" >+#include "printk_ringbuffer.h" > /* > * Printk console printing implementation for consoles which does not depend > * on the legacy style console_lock mechanism. >@@ -172,9 +187,6 @@ void nbcon_seq_force(struct console *con, u64 seq) > u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); > > atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); >- >- /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. 
*/ >- con->seq = 0; > } > > /** >@@ -201,6 +213,8 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) > } > } > >+bool printk_threads_enabled __ro_after_init; >+ > /** > * nbcon_context_try_acquire_direct - Try to acquire directly > * @ctxt: The context of the caller >@@ -531,6 +545,7 @@ static struct printk_buffers panic_nbcon_pbufs; > * nbcon_context_try_acquire - Try to acquire nbcon console > * @ctxt: The context of the caller > * >+ * Context: Any context which could not be migrated to another CPU. > * Return: True if the console was acquired. False otherwise. > * > * If the caller allowed an unsafe hostile takeover, on success the >@@ -538,7 +553,6 @@ static struct printk_buffers panic_nbcon_pbufs; > * in an unsafe state. Otherwise, on success the caller may assume > * the console is not in an unsafe state. > */ >-__maybe_unused > static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) > { > unsigned int cpu = smp_processor_id(); >@@ -824,9 +838,42 @@ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) > } > EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); > >+/** >+ * nbcon_reacquire - Reacquire a console after losing ownership >+ * @wctxt: The write context that was handed to the write function >+ * >+ * Since ownership can be lost at any time due to handover or takeover, a >+ * printing context _should_ be prepared to back out immediately and >+ * carefully. However, there are many scenarios where the context _must_ >+ * reacquire ownership in order to finalize or revert hardware changes. >+ * >+ * This function allows a context to reacquire ownership using the same >+ * priority as its previous ownership. >+ * >+ * Note that for printing contexts, after a successful reacquire the >+ * context will have no output buffer because that has been lost. This >+ * function cannot be used to resume printing. 
>+ */ >+void nbcon_reacquire(struct nbcon_write_context *wctxt) >+{ >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); >+ struct console *con = ctxt->console; >+ struct nbcon_state cur; >+ >+ while (!nbcon_context_try_acquire(ctxt)) >+ cpu_relax(); >+ >+ wctxt->outbuf = NULL; >+ wctxt->len = 0; >+ nbcon_state_read(con, &cur); >+ wctxt->unsafe_takeover = cur.unsafe_takeover; >+} >+EXPORT_SYMBOL_GPL(nbcon_reacquire); >+ > /** > * nbcon_emit_next_record - Emit a record in the acquired context > * @wctxt: The write context that will be handed to the write function >+ * @use_atomic: True if the write_atomic callback is to be used > * > * Return: True if this context still owns the console. False if > * ownership was handed over or taken. >@@ -840,8 +887,7 @@ EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); > * When true is returned, @wctxt->ctxt.backlog indicates whether there are > * still records pending in the ringbuffer, > */ >-__maybe_unused >-static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) >+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) > { > struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); > struct console *con = ctxt->console; >@@ -852,7 +898,7 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > unsigned long con_dropped; > struct nbcon_state cur; > unsigned long dropped; >- bool done; >+ unsigned long ulseq; > > /* > * The printk buffers are filled within an unsafe section. This >@@ -878,6 +924,28 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > if (dropped && !is_extended) > console_prepend_dropped(&pmsg, dropped); > >+ /* >+ * If the previous owner was assigned the same record, this context >+ * has taken over ownership and is replaying the record. Prepend a >+ * message to let the user know the record is replayed. 
>+ */ >+ ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq)); >+ if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) { >+ console_prepend_replay(&pmsg); >+ } else { >+ /* >+ * Ensure this context is still the owner before trying to >+ * update @nbcon_prev_seq. Otherwise the value in @ulseq may >+ * not be from the previous owner. >+ */ >+ nbcon_state_read(con, &cur); >+ if (!nbcon_context_can_proceed(ctxt, &cur)) >+ return false; >+ >+ atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq, >+ __u64seq_to_ulseq(pmsg.seq)); >+ } >+ > if (!nbcon_context_exit_unsafe(ctxt)) > return false; > >@@ -891,17 +959,32 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > nbcon_state_read(con, &cur); > wctxt->unsafe_takeover = cur.unsafe_takeover; > >- if (con->write_atomic) { >- done = con->write_atomic(con, wctxt); >+ if (use_atomic && >+ con->write_atomic) { >+ con->write_atomic(con, wctxt); >+ >+ } else if (!use_atomic && >+ con->write_thread) { >+ con->write_thread(con, wctxt); >+ > } else { >- nbcon_context_release(ctxt); >+ /* >+ * This function should never be called for legacy consoles. >+ * Handle it as if ownership was lost and try to continue. >+ */ > WARN_ON_ONCE(1); >- done = false; >+ nbcon_context_release(ctxt); >+ return false; > } > >- /* If not done, the emit was aborted. */ >- if (!done) >+ if (!wctxt->outbuf) { >+ /* >+ * Ownership was lost and reacquired by the driver. >+ * Handle it as if ownership was lost and try to continue. 
>+ */ >+ nbcon_context_release(ctxt); > return false; >+ } > > /* > * Since any dropped message was successfully output, reset the >@@ -928,6 +1011,550 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > return nbcon_context_exit_unsafe(ctxt); > } > >+/** >+ * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup >+ * @con: Console to operate on >+ * @ctxt: The acquire context that contains the state >+ * at console_acquire() >+ * >+ * Return: True if the thread should shutdown or if the console is >+ * allowed to print and a record is available. False otherwise. >+ * >+ * After the thread wakes up, it must first check if it should shutdown before >+ * attempting any printing. >+ */ >+static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) >+{ >+ bool ret = false; >+ short flags; >+ int cookie; >+ >+ if (kthread_should_stop()) >+ return true; >+ >+ cookie = console_srcu_read_lock(); >+ >+ flags = console_srcu_read_flags(con); >+ if (console_is_usable(con, flags, false)) { >+ /* Bring the sequence in @ctxt up to date */ >+ ctxt->seq = nbcon_seq_read(con); >+ >+ ret = prb_read_valid(prb, ctxt->seq, NULL); >+ } >+ >+ console_srcu_read_unlock(cookie); >+ return ret; >+} >+ >+/** >+ * nbcon_kthread_func - The printer thread function >+ * @__console: Console to operate on >+ */ >+static int nbcon_kthread_func(void *__console) >+{ >+ struct console *con = __console; >+ struct nbcon_write_context wctxt = { >+ .ctxt.console = con, >+ .ctxt.prio = NBCON_PRIO_NORMAL, >+ }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ unsigned long flags; >+ short con_flags; >+ bool backlog; >+ int cookie; >+ int ret; >+ >+wait_for_event: >+ /* >+ * Guarantee this task is visible on the rcuwait before >+ * checking the wake condition. >+ * >+ * The full memory barrier within set_current_state() of >+ * ___rcuwait_wait_event() pairs with the full memory >+ * barrier within rcuwait_has_sleeper(). 
>+ * >+ * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. >+ */ >+ ret = rcuwait_wait_event(&con->rcuwait, >+ nbcon_kthread_should_wakeup(con, ctxt), >+ TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ >+ >+ if (kthread_should_stop()) >+ return 0; >+ >+ /* Wait was interrupted by a spurious signal, go back to sleep. */ >+ if (ret) >+ goto wait_for_event; >+ >+ do { >+ backlog = false; >+ >+ cookie = console_srcu_read_lock(); >+ >+ con_flags = console_srcu_read_flags(con); >+ >+ if (console_is_usable(con, con_flags, false)) { >+ con->device_lock(con, &flags); >+ >+ /* >+ * Ensure this stays on the CPU to make handover and >+ * takeover possible. >+ */ >+ cant_migrate(); >+ >+ if (nbcon_context_try_acquire(ctxt)) { >+ /* >+ * If the emit fails, this context is no >+ * longer the owner. >+ */ >+ if (nbcon_emit_next_record(&wctxt, false)) { >+ nbcon_context_release(ctxt); >+ backlog = ctxt->backlog; >+ } >+ } >+ >+ con->device_unlock(con, flags); >+ } >+ >+ console_srcu_read_unlock(cookie); >+ >+ } while (backlog); >+ >+ goto wait_for_event; >+} >+ >+/** >+ * nbcon_irq_work - irq work to wake printk thread >+ * @irq_work: The irq work to operate on >+ */ >+static void nbcon_irq_work(struct irq_work *irq_work) >+{ >+ struct console *con = container_of(irq_work, struct console, irq_work); >+ >+ nbcon_kthread_wake(con); >+} >+ >+static inline bool rcuwait_has_sleeper(struct rcuwait *w) >+{ >+ bool has_sleeper; >+ >+ rcu_read_lock(); >+ /* >+ * Guarantee any new records can be seen by tasks preparing to wait >+ * before this context checks if the rcuwait is empty. >+ * >+ * This full memory barrier pairs with the full memory barrier within >+ * set_current_state() of ___rcuwait_wait_event(), which is called >+ * after prepare_to_rcuwait() adds the waiter but before it has >+ * checked the wait condition. >+ * >+ * This pairs with nbcon_kthread_func:A. 
>+ */ >+ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ >+ has_sleeper = !!rcu_dereference(w->task); >+ rcu_read_unlock(); >+ >+ return has_sleeper; >+} >+ >+/** >+ * nbcon_wake_threads - Wake up printing threads using irq_work >+ */ >+void nbcon_wake_threads(void) >+{ >+ struct console *con; >+ int cookie; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ /* >+ * Only schedule irq_work if the printing thread is >+ * actively waiting. If not waiting, the thread will >+ * notice by itself that it has work to do. >+ */ >+ if (con->kthread && rcuwait_has_sleeper(&con->rcuwait)) >+ irq_work_queue(&con->irq_work); >+ } >+ console_srcu_read_unlock(cookie); >+} >+ >+/* Track the nbcon emergency nesting per CPU. */ >+static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); >+static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; >+ >+/** >+ * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer >+ * >+ * Return: Either a pointer to the per CPU emergency nesting counter of >+ * the current CPU or to the init data during early boot. >+ */ >+static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) >+{ >+ /* >+ * The value of __printk_percpu_data_ready gets set in normal >+ * context and before SMP initialization. As a result it could >+ * never change while inside an nbcon emergency section. >+ */ >+ if (!printk_percpu_data_ready()) >+ return &early_nbcon_pcpu_emergency_nesting; >+ >+ return this_cpu_ptr(&nbcon_pcpu_emergency_nesting); >+} >+ >+/** >+ * nbcon_emit_one - Print one record for an nbcon console using the >+ * specified callback >+ * @wctxt: An initialized write context struct to use for this context >+ * @use_atomic: True if the write_atomic callback is to be used >+ * >+ * Return: False if it is known there are no more records to print, >+ * otherwise true. >+ * >+ * This is an internal helper to handle the locking of the console before >+ * calling nbcon_emit_next_record(). 
>+ */ >+static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) >+{ >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); >+ >+ if (!nbcon_context_try_acquire(ctxt)) >+ return true; >+ >+ /* >+ * nbcon_emit_next_record() returns false when the console was >+ * handed over or taken over. In both cases the context is no >+ * longer valid. >+ */ >+ if (!nbcon_emit_next_record(wctxt, use_atomic)) >+ return true; >+ >+ nbcon_context_release(ctxt); >+ >+ return ctxt->backlog; >+} >+ >+/** >+ * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon >+ * printing on the current CPU >+ * >+ * Context: Any context which could not be migrated to another CPU. >+ * Return: The nbcon_prio to use for acquiring an nbcon console in this >+ * context for printing. >+ */ >+enum nbcon_prio nbcon_get_default_prio(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ >+ if (this_cpu_in_panic()) >+ return NBCON_PRIO_PANIC; >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ if (*cpu_emergency_nesting) >+ return NBCON_PRIO_EMERGENCY; >+ >+ return NBCON_PRIO_NORMAL; >+} >+ >+/** >+ * nbcon_legacy_emit_next_record - Print one record for an nbcon console >+ * in legacy contexts >+ * @con: The console to print on >+ * @handover: Will be set to true if a printk waiter has taken over the >+ * console_lock, in which case the caller is no longer holding >+ * both the console_lock and the SRCU read lock. Otherwise it >+ * is set to false. >+ * @cookie: The cookie from the SRCU read lock. >+ * @use_atomic: True if the write_atomic callback is to be used >+ * >+ * Context: Any context except NMI. >+ * Return: False if the given console has no next record to print, >+ * otherwise true. >+ * >+ * This function is meant to be called by console_flush_all() to print records >+ * on nbcon consoles from legacy context (printing via console unlocking). >+ * Essentially it is the nbcon version of console_emit_next_record(). 
>+ */ >+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, >+ int cookie, bool use_atomic) >+{ >+ struct nbcon_write_context wctxt = { }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ unsigned long flags; >+ bool progress; >+ >+ *handover = false; >+ >+ ctxt->console = con; >+ >+ if (use_atomic) { >+ /* Use the same procedure as console_emit_next_record(). */ >+ printk_safe_enter_irqsave(flags); >+ console_lock_spinning_enable(); >+ stop_critical_timings(); >+ >+ ctxt->prio = nbcon_get_default_prio(); >+ progress = nbcon_emit_one(&wctxt, use_atomic); >+ >+ start_critical_timings(); >+ *handover = console_lock_spinning_disable_and_check(cookie); >+ printk_safe_exit_irqrestore(flags); >+ } else { >+ con->device_lock(con, &flags); >+ cant_migrate(); >+ >+ ctxt->prio = nbcon_get_default_prio(); >+ progress = nbcon_emit_one(&wctxt, use_atomic); >+ >+ con->device_unlock(con, flags); >+ } >+ >+ return progress; >+} >+ >+/** >+ * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its >+ * write_atomic() callback >+ * @con: The nbcon console to flush >+ * @stop_seq: Flush up until this record >+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers >+ * >+ * Return: True if taken over while printing. Otherwise false. >+ * >+ * If flushing up to @stop_seq was not successful, it only makes sense for the >+ * caller to try again when true was returned. When false is returned, either >+ * there are no more records available to read or this context is not allowed >+ * to acquire the console. 
>+ */ >+static bool __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, >+ bool allow_unsafe_takeover) >+{ >+ struct nbcon_write_context wctxt = { }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ >+ ctxt->console = con; >+ ctxt->spinwait_max_us = 2000; >+ ctxt->prio = nbcon_get_default_prio(); >+ ctxt->allow_unsafe_takeover = allow_unsafe_takeover; >+ >+ if (!nbcon_context_try_acquire(ctxt)) >+ return false; >+ >+ while (nbcon_seq_read(con) < stop_seq) { >+ /* >+ * nbcon_emit_next_record() returns false when the console was >+ * handed over or taken over. In both cases the context is no >+ * longer valid. >+ */ >+ if (!nbcon_emit_next_record(&wctxt, true)) >+ return true; >+ >+ if (!ctxt->backlog) >+ break; >+ } >+ >+ nbcon_context_release(ctxt); >+ >+ return false; >+} >+ >+/** >+ * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their >+ * write_atomic() callback >+ * @stop_seq: Flush up until this record >+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers >+ */ >+static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) >+{ >+ struct console *con; >+ bool should_retry; >+ int cookie; >+ >+ do { >+ should_retry = false; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ unsigned long irq_flags; >+ >+ if (!(flags & CON_NBCON)) >+ continue; >+ >+ if (!console_is_usable(con, flags, true)) >+ continue; >+ >+ if (nbcon_seq_read(con) >= stop_seq) >+ continue; >+ >+ /* >+ * Atomic flushing does not use console driver >+ * synchronization (i.e. it does not hold the port >+ * lock for uart consoles). Therefore IRQs must be >+ * disabled to avoid being interrupted and then >+ * calling into a driver that will deadlock trying >+ * to acquire console ownership. 
>+ */ >+ local_irq_save(irq_flags); >+ >+ should_retry |= __nbcon_atomic_flush_pending_con(con, stop_seq, >+ allow_unsafe_takeover); >+ local_irq_restore(irq_flags); >+ } >+ console_srcu_read_unlock(cookie); >+ } while (should_retry); >+} >+ >+/** >+ * nbcon_atomic_flush_pending - Flush all nbcon consoles using their >+ * write_atomic() callback >+ * >+ * Flush the backlog up through the currently newest record. Any new >+ * records added while flushing will not be flushed. This is to avoid >+ * one CPU printing unbounded because other CPUs continue to add records. >+ */ >+void nbcon_atomic_flush_pending(void) >+{ >+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); >+} >+ >+/** >+ * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their >+ * write_atomic() callback and allowing unsafe hostile takeovers >+ * >+ * Flush the backlog up through the currently newest record. Unsafe hostile >+ * takeovers will be performed, if necessary. >+ */ >+void nbcon_atomic_flush_unsafe(void) >+{ >+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); >+} >+ >+/** >+ * nbcon_cpu_emergency_enter - Enter an emergency section where printk() >+ * messages for that CPU are only stored >+ * >+ * Upon exiting the emergency section, all stored messages are flushed. >+ * >+ * Context: Any context. Disables preemption. >+ * >+ * When within an emergency section, no printing occurs on that CPU. This >+ * is to allow all emergency messages to be dumped into the ringbuffer before >+ * flushing the ringbuffer. The actual printing occurs when exiting the >+ * outermost emergency section. 
>+ */ >+void nbcon_cpu_emergency_enter(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ >+ preempt_disable(); >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ (*cpu_emergency_nesting)++; >+} >+ >+/** >+ * nbcon_cpu_emergency_exit - Exit an emergency section and flush the >+ * stored messages >+ * >+ * Flushing only occurs when exiting all nesting for the CPU. >+ * >+ * Context: Any context. Enables preemption. >+ */ >+void nbcon_cpu_emergency_exit(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ bool do_trigger_flush = false; >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ >+ WARN_ON_ONCE(*cpu_emergency_nesting == 0); >+ >+ if (*cpu_emergency_nesting == 1) { >+ nbcon_atomic_flush_pending(); >+ do_trigger_flush = true; >+ } >+ >+ /* Undo the nesting count of nbcon_cpu_emergency_enter(). */ >+ (*cpu_emergency_nesting)--; >+ >+ preempt_enable(); >+ >+ if (do_trigger_flush) >+ printk_trigger_flush(); >+} >+ >+/** >+ * nbcon_kthread_stop - Stop a printer thread >+ * @con: Console to operate on >+ */ >+static void nbcon_kthread_stop(struct console *con) >+{ >+ lockdep_assert_console_list_lock_held(); >+ >+ if (!con->kthread) >+ return; >+ >+ kthread_stop(con->kthread); >+ con->kthread = NULL; >+} >+ >+/** >+ * nbcon_kthread_create - Create a printer thread >+ * @con: Console to operate on >+ * >+ * If it fails, let the console proceed. The atomic part might >+ * be usable and useful. >+ */ >+void nbcon_kthread_create(struct console *con) >+{ >+ struct task_struct *kt; >+ >+ lockdep_assert_console_list_lock_held(); >+ >+ if (!(con->flags & CON_NBCON) || !con->write_thread) >+ return; >+ >+ if (!printk_threads_enabled || con->kthread) >+ return; >+ >+ /* >+ * Printer threads cannot be started as long as any boot console is >+ * registered because there is no way to synchronize the hardware >+ * registers between boot console code and regular console code. 
>+ */ >+ if (have_boot_console) >+ return; >+ >+ kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); >+ if (IS_ERR(kt)) { >+ con_printk(KERN_ERR, con, "failed to start printing thread\n"); >+ return; >+ } >+ >+ con->kthread = kt; >+ >+ /* >+ * It is important that console printing threads are scheduled >+ * shortly after a printk call and with generous runtime budgets. >+ */ >+ sched_set_normal(con->kthread, -20); >+} >+ >+static int __init printk_setup_threads(void) >+{ >+ struct console *con; >+ >+ console_list_lock(); >+ printk_threads_enabled = true; >+ for_each_console(con) >+ nbcon_kthread_create(con); >+ if (force_printkthreads() && printing_via_unlock) >+ nbcon_legacy_kthread_create(); >+ console_list_unlock(); >+ return 0; >+} >+early_initcall(printk_setup_threads); >+ > /** > * nbcon_alloc - Allocate buffers needed by the nbcon console > * @con: Console to allocate buffers for >@@ -964,8 +1591,6 @@ bool nbcon_alloc(struct console *con) > * > * nbcon_alloc() *must* be called and succeed before this function > * is called. >- * >- * This function expects that the legacy @con->seq has been set. > */ > void nbcon_init(struct console *con) > { >@@ -974,8 +1599,12 @@ void nbcon_init(struct console *con) > /* nbcon_alloc() must have been called and successful! */ > BUG_ON(!con->pbufs); > >- nbcon_seq_force(con, con->seq); >+ rcuwait_init(&con->rcuwait); >+ init_irq_work(&con->irq_work, nbcon_irq_work); >+ nbcon_seq_force(con, 0); >+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL); > nbcon_state_set(con, &state); >+ nbcon_kthread_create(con); > } > > /** >@@ -986,6 +1615,7 @@ void nbcon_free(struct console *con) > { > struct nbcon_state state = { }; > >+ nbcon_kthread_stop(con); > nbcon_state_set(con, &state); > > /* Boot consoles share global printk buffers. 
*/
>@@ -994,3 +1624,82 @@ void nbcon_free(struct console *con)
>
> con->pbufs = NULL;
> }
>+
>+/**
>+ * nbcon_driver_acquire - Acquire nbcon console and enter unsafe section
>+ * @con: The nbcon console to acquire
>+ *
>+ * Context: Any context which could not be migrated to another CPU.
>+ *
>+ * Console drivers will usually use their own internal synchronization
>+ * mechanism to synchronize between console printing and non-printing
>+ * activities (such as setting baud rates). However, nbcon console drivers
>+ * supporting atomic consoles may also want to mark unsafe sections when
>+ * performing non-printing activities.
>+ *
>+ * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL
>+ * and marks it unsafe for handover/takeover.
>+ *
>+ * Console drivers using this function must have provided @nbcon_drvdata in
>+ * their struct console, which is used to track ownership and state
>+ * information.
>+ */
>+void nbcon_driver_acquire(struct console *con)
>+{
>+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con->nbcon_drvdata, ctxt);
>+
>+ cant_migrate();
>+
>+ do {
>+ do {
>+ memset(ctxt, 0, sizeof(*ctxt));
>+ ctxt->console = con;
>+ ctxt->prio = NBCON_PRIO_NORMAL;
>+ } while (!nbcon_context_try_acquire(ctxt));
>+
>+ } while (!nbcon_context_enter_unsafe(ctxt));
>+}
>+EXPORT_SYMBOL_GPL(nbcon_driver_acquire);
>+
>+/**
>+ * nbcon_driver_release - Exit unsafe section and release the nbcon console
>+ * @con: The nbcon console acquired in nbcon_driver_acquire()
>+ */
>+void nbcon_driver_release(struct console *con)
>+{
>+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con->nbcon_drvdata, ctxt);
>+
>+ if (nbcon_context_exit_unsafe(ctxt))
>+ nbcon_context_release(ctxt);
>+}
>+EXPORT_SYMBOL_GPL(nbcon_driver_release);
>+
>+/**
>+ * printk_kthread_shutdown - shutdown all threaded printers
>+ *
>+ * On system shutdown all threaded printers are stopped. 
This allows printk >+ * to transition back to atomic printing, thus providing a robust mechanism >+ * for the final shutdown/reboot messages to be output. >+ */ >+static void printk_kthread_shutdown(void) >+{ >+ struct console *con; >+ >+ console_list_lock(); >+ for_each_console(con) { >+ if (con->flags & CON_NBCON) >+ nbcon_kthread_stop(con); >+ } >+ console_list_unlock(); >+} >+ >+static struct syscore_ops printk_syscore_ops = { >+ .shutdown = printk_kthread_shutdown, >+}; >+ >+static int __init printk_init_ops(void) >+{ >+ register_syscore_ops(&printk_syscore_ops); >+ return 0; >+} >+device_initcall(printk_init_ops); >diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c >index 72f6a564e832f..8ee6c60b47c4b 100644 >--- a/kernel/printk/printk.c >+++ b/kernel/printk/printk.c >@@ -195,6 +195,17 @@ static int __init control_devkmsg(char *str) > } > __setup("printk.devkmsg=", control_devkmsg); > >+#if !defined(CONFIG_PREEMPT_RT) >+DEFINE_STATIC_KEY_FALSE(force_printkthreads_key); >+ >+static int __init setup_forced_printkthreads(char *arg) >+{ >+ static_branch_enable(&force_printkthreads_key); >+ return 0; >+} >+early_param("threadprintk", setup_forced_printkthreads); >+#endif >+ > char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; > #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) > int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, >@@ -282,6 +293,7 @@ EXPORT_SYMBOL(console_list_unlock); > * Return: A cookie to pass to console_srcu_read_unlock(). 
> */ > int console_srcu_read_lock(void) >+ __acquires(&console_srcu) > { > return srcu_read_lock_nmisafe(&console_srcu); > } >@@ -295,6 +307,7 @@ EXPORT_SYMBOL(console_srcu_read_lock); > * Counterpart to console_srcu_read_lock() > */ > void console_srcu_read_unlock(int cookie) >+ __releases(&console_srcu) > { > srcu_read_unlock_nmisafe(&console_srcu, cookie); > } >@@ -461,14 +474,33 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; > /* syslog_lock protects syslog_* variables and write access to clear_seq. */ > static DEFINE_MUTEX(syslog_lock); > >-#ifdef CONFIG_PRINTK > /* >- * During panic, heavy printk by other CPUs can delay the >- * panic and risk deadlock on console resources. >+ * Specifies if a legacy console is registered. If legacy consoles are >+ * present, it is necessary to perform the console lock/unlock dance >+ * whenever console flushing should occur. > */ >-static int __read_mostly suppress_panic_printk; >+bool have_legacy_console; > >+/* >+ * Specifies if an nbcon console is registered. If nbcon consoles are present, >+ * synchronous printing of legacy consoles will not occur during panic until >+ * the backtrace has been stored to the ringbuffer. >+ */ >+static bool have_nbcon_console; >+ >+/* >+ * Specifies if a boot console is registered. If boot consoles are present, >+ * nbcon consoles cannot print simultaneously and must be synchronized by >+ * the console lock. This is because boot consoles and nbcon consoles may >+ * have mapped the same hardware. >+ */ >+bool have_boot_console; >+ >+#ifdef CONFIG_PRINTK > DECLARE_WAIT_QUEUE_HEAD(log_wait); >+ >+static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); >+ > /* All 3 protected by @syslog_lock. */ > /* the next printk record to read by syslog(READ) or /proc/kmsg */ > static u64 syslog_seq; >@@ -1867,7 +1899,7 @@ static bool console_waiter; > * there may be a waiter spinning (like a spinlock). Also it must be > * ready to hand over the lock at the end of the section. 
> */ >-static void console_lock_spinning_enable(void) >+void console_lock_spinning_enable(void) > { > /* > * Do not use spinning in panic(). The panic CPU wants to keep the lock. >@@ -1906,7 +1938,7 @@ static void console_lock_spinning_enable(void) > * > * Return: 1 if the lock rights were passed, 0 otherwise. > */ >-static int console_lock_spinning_disable_and_check(int cookie) >+int console_lock_spinning_disable_and_check(int cookie) > { > int waiter; > >@@ -2317,54 +2317,116 @@ > return ret; > } > >+static bool legacy_allow_panic_sync; >+ >+/* >+ * This acts as a one-way switch to allow legacy consoles to print from >+ * the printk() caller context on a panic CPU. It also attempts to flush >+ * the legacy consoles in this context. >+ */ >+void printk_legacy_allow_panic_sync(void) >+{ >+ legacy_allow_panic_sync = true; >+ >+ if (printing_via_unlock && !in_nmi()) { >+ if (console_trylock()) >+ console_unlock(); >+ } >+} >+ > asmlinkage int vprintk_emit(int facility, int level, > const struct dev_printk_info *dev_info, > const char *fmt, va_list args) > { >+ bool do_trylock_unlock = printing_via_unlock && >+ !force_printkthreads(); > int printed_len; >- bool in_sched = false; > > /* Suppress unimportant messages after panic happens */ > if (unlikely(suppress_printk)) > return 0; > >- if (unlikely(suppress_panic_printk) && other_cpu_in_panic()) >- return 0; >+ if (level == LOGLEVEL_SCHED) { >+ level = LOGLEVEL_DEFAULT; >+ /* If called from the scheduler, we can not call up(). */ >+ do_trylock_unlock = false; >+ } >+ >+ printk_delay(level); >+ >+ printed_len = vprintk_store(facility, level, dev_info, fmt, args); >+ >+ if (have_nbcon_console && !have_boot_console) { >+ bool is_panic_context = this_cpu_in_panic(); > >- if (level == LOGLEVEL_SCHED) { >- level = LOGLEVEL_DEFAULT; >- in_sched = true; >- } >+ /* >+ * In panic, the legacy consoles are not allowed to print from >+ * the printk calling context unless explicitly allowed. 
This >+ * gives the safe nbcon consoles a chance to print out all the >+ * panic messages first. This restriction only applies if >+ * there are nbcon consoles registered. >+ */ >+ if (is_panic_context) >+ do_trylock_unlock &= legacy_allow_panic_sync; > >- printk_delay(level); >+ /* >+ * There are situations where nbcon atomic printing should >+ * happen in the printk() caller context: >+ * >+ * - When this CPU is in panic. >+ * >+ * - When booting, before the printing threads have been >+ * started. >+ * >+ * - During shutdown, since the printing threads may not get >+ * a chance to print the final messages. >+ * >+ * Note that if boot consoles are registered, the console >+ * lock/unlock dance must be relied upon instead because nbcon >+ * consoles cannot print simultaneously with boot consoles. >+ */ >+ if (is_panic_context || >+ !printk_threads_enabled || >+ (system_state > SYSTEM_RUNNING)) { >+ nbcon_atomic_flush_pending(); >+ } >+ } > >- printed_len = vprintk_store(facility, level, dev_info, fmt, args); >+ nbcon_wake_threads(); > >- /* If called from the scheduler, we can not call up(). */ >- if (!in_sched) { >+ if (do_trylock_unlock) { > /* > * The caller may be holding system-critical or > * timing-sensitive locks. Disable preemption during > * printing of all remaining records to all consoles so that >- * this context can return as soon as possible. Hopefully >- * another printk() caller will take over the printing. >- */ >+ * this context can return as soon as possible. Hopefully >+ * another printk() caller will take over the printing. >+ * >+ * Also, nbcon_get_default_prio() requires migration disabled. >+ */ > preempt_disable(); >+ > /* > * Try to acquire and then immediately release the console > * semaphore. The release will print out buffers. With the > * spinning variant, this context tries to take over the > * printing from another printing context. 
>- */ >- if (console_trylock_spinning()) >- console_unlock(); >+ * >+ * Skip it in EMERGENCY priority. The console will be >+ * explicitly flushed when exiting the emergency section. >+ */ >+ if (nbcon_get_default_prio() != NBCON_PRIO_EMERGENCY) { >+ if (console_trylock_spinning()) >+ console_unlock(); >+ } >+ > preempt_enable(); > } > >- if (in_sched) >- defer_console_output(); >- else >+ if (do_trylock_unlock) > wake_up_klogd(); >+ else >+ defer_console_output(); > > return printed_len; > } >@@ -2387,6 +2488,14 @@ EXPORT_SYMBOL(_printk); > static bool pr_flush(int timeout_ms, bool reset_on_progress); > static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); > >+static struct task_struct *nbcon_legacy_kthread; >+ >+static inline void wake_up_legacy_kthread(void) >+{ >+ if (nbcon_legacy_kthread) >+ wake_up_interruptible(&legacy_wait); >+} >+ > #else /* CONFIG_PRINTK */ > > #define printk_time false >@@ -2400,6 +2509,8 @@ static u64 syslog_seq; > static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } > static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } > >+static inline void nbcon_legacy_kthread_create(void) { } >+static inline void wake_up_legacy_kthread(void) { } > #endif /* CONFIG_PRINTK */ > > #ifdef CONFIG_EARLY_PRINTK >@@ -2615,6 +2726,8 @@ void suspend_console(void) > void resume_console(void) > { > struct console *con; >+ short flags; >+ int cookie; > > if (!console_suspend_enabled) > return; >@@ -2631,6 +2744,20 @@ void resume_console(void) > */ > synchronize_srcu(&console_srcu); > >+ /* >+ * Since this runs in task context, wake the threaded printers >+ * directly rather than scheduling irq_work to do it. 
>+ */ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ flags = console_srcu_read_flags(con); >+ if (flags & CON_NBCON) >+ nbcon_kthread_wake(con); >+ } >+ console_srcu_read_unlock(cookie); >+ >+ wake_up_legacy_kthread(); >+ > pr_flush(1000, true); > } > >@@ -2645,7 +2772,8 @@ void resume_console(void) > */ > static int console_cpu_notify(unsigned int cpu) > { >- if (!cpuhp_tasks_frozen) { >+ if (!cpuhp_tasks_frozen && printing_via_unlock && >+ !force_printkthreads()) { > /* If trylock fails, someone else is doing the printing */ > if (console_trylock()) > console_unlock(); >@@ -2702,36 +2830,6 @@ int is_console_locked(void) > } > EXPORT_SYMBOL(is_console_locked); > >-/* >- * Check if the given console is currently capable and allowed to print >- * records. >- * >- * Requires the console_srcu_read_lock. >- */ >-static inline bool console_is_usable(struct console *con) >-{ >- short flags = console_srcu_read_flags(con); >- >- if (!(flags & CON_ENABLED)) >- return false; >- >- if ((flags & CON_SUSPENDED)) >- return false; >- >- if (!con->write) >- return false; >- >- /* >- * Console drivers may assume that per-cpu resources have been >- * allocated. So unless they're explicitly marked as being able to >- * cope (CON_ANYTIME) don't call them until this CPU is officially up. >- */ >- if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) >- return false; >- >- return true; >-} >- > static void __console_unlock(void) > { > console_locked = 0; >@@ -2741,30 +2839,25 @@ static void __console_unlock(void) > #ifdef CONFIG_PRINTK > > /* >- * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This >- * is achieved by shifting the existing message over and inserting the dropped >- * message. >+ * Prepend the message in @pmsg->pbufs->outbuf with the message in >+ * @pmsg->pbufs->scratchbuf. This is achieved by shifting the existing message >+ * over and inserting the scratchbuf message. 
> * > * @pmsg is the printk message to prepend. > * >- * @dropped is the dropped count to report in the dropped message. >+ * @len is the length of the message in @pmsg->pbufs->scratchbuf. > * > * If the message text in @pmsg->pbufs->outbuf does not have enough space for >- * the dropped message, the message text will be sufficiently truncated. >+ * the scratchbuf message, the message text will be sufficiently truncated. > * > * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. > */ >-void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) >+static void __console_prepend_scratch(struct printk_message *pmsg, size_t len) > { > struct printk_buffers *pbufs = pmsg->pbufs; >- const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); > const size_t outbuf_sz = sizeof(pbufs->outbuf); > char *scratchbuf = &pbufs->scratchbuf[0]; > char *outbuf = &pbufs->outbuf[0]; >- size_t len; >- >- len = scnprintf(scratchbuf, scratchbuf_sz, >- "** %lu printk messages dropped **\n", dropped); > > /* > * Make sure outbuf is sufficiently large before prepending. >@@ -2786,6 +2879,46 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) > pmsg->outbuf_len += len; > } > >+/* >+ * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". >+ * @pmsg->outbuf_len is updated appropriately. >+ * >+ * @pmsg is the printk message to prepend. >+ * >+ * @dropped is the dropped count to report in the dropped message. >+ */ >+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) >+{ >+ struct printk_buffers *pbufs = pmsg->pbufs; >+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); >+ char *scratchbuf = &pbufs->scratchbuf[0]; >+ size_t len; >+ >+ len = scnprintf(scratchbuf, scratchbuf_sz, >+ "** %lu printk messages dropped **\n", dropped); >+ >+ __console_prepend_scratch(pmsg, len); >+} >+ >+/* >+ * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". 
>+ * @pmsg->outbuf_len is updated appropriately. >+ * >+ * @pmsg is the printk message to prepend. >+ */ >+void console_prepend_replay(struct printk_message *pmsg) >+{ >+ struct printk_buffers *pbufs = pmsg->pbufs; >+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); >+ char *scratchbuf = &pbufs->scratchbuf[0]; >+ size_t len; >+ >+ len = scnprintf(scratchbuf, scratchbuf_sz, >+ "** replaying previous printk message **\n"); >+ >+ __console_prepend_scratch(pmsg, len); >+} >+ > /* > * Read and format the specified record (or a later record if the specified > * record is not available). >@@ -2808,8 +2941,6 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) > bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > bool is_extended, bool may_suppress) > { >- static int panic_console_dropped; >- > struct printk_buffers *pbufs = pmsg->pbufs; > const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); > const size_t outbuf_sz = sizeof(pbufs->outbuf); >@@ -2837,17 +2968,6 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > pmsg->seq = r.info->seq; > pmsg->dropped = r.info->seq - seq; > >- /* >- * Check for dropped messages in panic here so that printk >- * suppression can occur as early as possible if necessary. >- */ >- if (pmsg->dropped && >- panic_in_progress() && >- panic_console_dropped++ > 10) { >- suppress_panic_printk = 1; >- pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); >- } >- > /* Skip record that has level above the console loglevel. */ > if (may_suppress && suppress_message_printing(r.info->level)) > goto out; >@@ -2864,6 +2984,33 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > return true; > } > >+/* >+ * Legacy console printing from printk() caller context does not respect >+ * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a >+ * false positive. 
For PREEMPT_RT the false positive condition does not >+ * occur. >+ * >+ * This map is used to establish LD_WAIT_SLEEP context for the console write >+ * callbacks when legacy printing to avoid false positive lockdep complaints, >+ * thus allowing lockdep to continue to function for real issues. >+ */ >+#ifdef CONFIG_PREEMPT_RT >+static inline void printk_legacy_lock_map_acquire_try(void) { } >+static inline void printk_legacy_lock_map_release(void) { } >+#else >+static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); >+ >+static inline void printk_legacy_lock_map_acquire_try(void) >+{ >+ lock_map_acquire_try(&printk_legacy_map); >+} >+ >+static inline void printk_legacy_lock_map_release(void) >+{ >+ lock_map_release(&printk_legacy_map); >+} >+#endif /* CONFIG_PREEMPT_RT */ >+ > /* > * Used as the printk buffers for non-panic, serialized console printing. > * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. >@@ -2913,31 +3060,45 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co > con->dropped = 0; > } > >- /* >- * While actively printing out messages, if another printk() >- * were to occur on another CPU, it may wait for this one to >- * finish. This task can not be preempted if there is a >- * waiter waiting to take over. >- * >- * Interrupts are disabled because the hand over to a waiter >- * must not be interrupted until the hand over is completed >- * (@console_waiter is cleared). >- */ >- printk_safe_enter_irqsave(flags); >- console_lock_spinning_enable(); >- >- /* Do not trace print latency. */ >- stop_critical_timings(); >- > /* Write everything out to the hardware. */ >- con->write(con, outbuf, pmsg.outbuf_len); > >- start_critical_timings(); >+ if (force_printkthreads()) { >+ /* >+ * With forced threading this function is either in a thread >+ * or panic context. So there is no need for concern about >+ * printk reentrance, handovers, or lockdep complaints. 
>+ */ > >- con->seq = pmsg.seq + 1; >+ con->write(con, outbuf, pmsg.outbuf_len); >+ con->seq = pmsg.seq + 1; >+ } else { >+ /* >+ * While actively printing out messages, if another printk() >+ * were to occur on another CPU, it may wait for this one to >+ * finish. This task can not be preempted if there is a >+ * waiter waiting to take over. >+ * >+ * Interrupts are disabled because the hand over to a waiter >+ * must not be interrupted until the hand over is completed >+ * (@console_waiter is cleared). >+ */ >+ printk_safe_enter_irqsave(flags); >+ console_lock_spinning_enable(); > >- *handover = console_lock_spinning_disable_and_check(cookie); >- printk_safe_exit_irqrestore(flags); >+ /* Do not trace print latency. */ >+ stop_critical_timings(); >+ >+ printk_legacy_lock_map_acquire_try(); >+ con->write(con, outbuf, pmsg.outbuf_len); >+ printk_legacy_lock_map_release(); >+ >+ start_critical_timings(); >+ >+ con->seq = pmsg.seq + 1; >+ >+ *handover = console_lock_spinning_disable_and_check(cookie); >+ printk_safe_exit_irqrestore(flags); >+ } > skip: > return true; > } >@@ -2990,13 +3151,29 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > > cookie = console_srcu_read_lock(); > for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ u64 printk_seq; > bool progress; > >- if (!console_is_usable(con)) >+ /* >+ * console_flush_all() is only for legacy consoles, >+ * unless the nbcon console has no kthread printer. 
>+ */ >+ if ((flags & CON_NBCON) && con->kthread) >+ continue; >+ >+ if (!console_is_usable(con, flags, !do_cond_resched)) > continue; > any_usable = true; > >- progress = console_emit_next_record(con, handover, cookie); >+ if (flags & CON_NBCON) { >+ progress = nbcon_legacy_emit_next_record(con, handover, cookie, >+ !do_cond_resched); >+ printk_seq = nbcon_seq_read(con); >+ } else { >+ progress = console_emit_next_record(con, handover, cookie); >+ printk_seq = con->seq; >+ } > > /* > * If a handover has occurred, the SRCU read lock >@@ -3006,8 +3183,8 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > return false; > > /* Track the next of the highest seq flushed. */ >- if (con->seq > *next_seq) >- *next_seq = con->seq; >+ if (printk_seq > *next_seq) >+ *next_seq = printk_seq; > > if (!progress) > continue; >@@ -3030,19 +3207,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > return false; > } > >-/** >- * console_unlock - unblock the console subsystem from printing >- * >- * Releases the console_lock which the caller holds to block printing of >- * the console subsystem. >- * >- * While the console_lock was held, console output may have been buffered >- * by printk(). If this is the case, console_unlock(); emits >- * the output prior to releasing the lock. >- * >- * console_unlock(); may be called from any context. >- */ >-void console_unlock(void) >+static void console_flush_and_unlock(void) > { > bool do_cond_resched; > bool handover; >@@ -3086,6 +3251,32 @@ void console_unlock(void) > */ > } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); > } >+ >+/** >+ * console_unlock - unblock the console subsystem from printing >+ * >+ * Releases the console_lock which the caller holds to block printing of >+ * the console subsystem. >+ * >+ * While the console_lock was held, console output may have been buffered >+ * by printk(). 
If this is the case, console_unlock(); emits >+ * the output prior to releasing the lock. >+ * >+ * console_unlock(); may be called from any context. >+ */ >+void console_unlock(void) >+{ >+ /* >+ * Forced threading relies on kthread and atomic consoles for >+ * printing. It never attempts to print from console_unlock(). >+ */ >+ if (force_printkthreads()) { >+ __console_unlock(); >+ return; >+ } >+ >+ console_flush_and_unlock(); >+} > EXPORT_SYMBOL(console_unlock); > > /** >@@ -3219,7 +3410,10 @@ void console_flush_on_panic(enum con_flush_mode mode) > console_srcu_read_unlock(cookie); > } > >- console_flush_all(false, &next_seq, &handover); >+ nbcon_atomic_flush_pending(); >+ >+ if (printing_via_unlock) >+ console_flush_all(false, &next_seq, &handover); > } > > /* >@@ -3276,13 +3470,122 @@ EXPORT_SYMBOL(console_stop); > > void console_start(struct console *console) > { >+ short flags; >+ > console_list_lock(); > console_srcu_write_flags(console, console->flags | CON_ENABLED); >+ flags = console->flags; > console_list_unlock(); >+ >+ /* >+ * Ensure that all SRCU list walks have completed. The related >+ * printing context must be able to see it is enabled so that >+ * it is guaranteed to wake up and resume printing. >+ */ >+ synchronize_srcu(&console_srcu); >+ >+ if (flags & CON_NBCON) >+ nbcon_kthread_wake(console); >+ else >+ wake_up_legacy_kthread(); >+ > __pr_flush(console, 1000, true); > } > EXPORT_SYMBOL(console_start); > >+#ifdef CONFIG_PRINTK >+static bool printer_should_wake(void) >+{ >+ bool available = false; >+ struct console *con; >+ int cookie; >+ >+ if (kthread_should_stop()) >+ return true; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ u64 printk_seq; >+ >+ /* >+ * The legacy printer thread is only for legacy consoles, >+ * unless the nbcon console has no kthread printer. 
>+ */ >+ if ((flags & CON_NBCON) && con->kthread) >+ continue; >+ >+ if (!console_is_usable(con, flags, true)) >+ continue; >+ >+ if (flags & CON_NBCON) { >+ printk_seq = nbcon_seq_read(con); >+ } else { >+ /* >+ * It is safe to read @seq because only this >+ * thread context updates @seq. >+ */ >+ printk_seq = con->seq; >+ } >+ >+ if (prb_read_valid(prb, printk_seq, NULL)) { >+ available = true; >+ break; >+ } >+ } >+ console_srcu_read_unlock(cookie); >+ >+ return available; >+} >+ >+static int nbcon_legacy_kthread_func(void *unused) >+{ >+ int error; >+ >+ for (;;) { >+ error = wait_event_interruptible(legacy_wait, printer_should_wake()); >+ >+ if (kthread_should_stop()) >+ break; >+ >+ if (error) >+ continue; >+ >+ console_lock(); >+ console_flush_and_unlock(); >+ } >+ >+ return 0; >+} >+ >+void nbcon_legacy_kthread_create(void) >+{ >+ struct task_struct *kt; >+ >+ lockdep_assert_held(&console_mutex); >+ >+ if (!force_printkthreads()) >+ return; >+ >+ if (!printk_threads_enabled || nbcon_legacy_kthread) >+ return; >+ >+ kt = kthread_run(nbcon_legacy_kthread_func, NULL, "pr/legacy"); >+ if (IS_ERR(kt)) { >+ pr_err("unable to start legacy printing thread\n"); >+ return; >+ } >+ >+ nbcon_legacy_kthread = kt; >+ >+ /* >+ * It is important that console printing threads are scheduled >+ * shortly after a printk call and with generous runtime budgets. >+ */ >+ sched_set_normal(nbcon_legacy_kthread, -20); >+} >+#endif /* CONFIG_PRINTK */ >+ > static int __read_mostly keep_bootcon; > > static int __init keep_bootcon_setup(char *str) >@@ -3366,6 +3669,7 @@ static void try_enable_default_console(struct console *newcon) > newcon->flags |= CON_CONSDEV; > } > >+/* Set @newcon->seq to the first record this console should print. 
*/ > static void console_init_seq(struct console *newcon, bool bootcon_registered) > { > struct console *con; >@@ -3414,11 +3718,20 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) > > newcon->seq = prb_next_seq(prb); > for_each_console(con) { >- if ((con->flags & CON_BOOT) && >- (con->flags & CON_ENABLED) && >- con->seq < newcon->seq) { >- newcon->seq = con->seq; >+ u64 seq; >+ >+ if (!((con->flags & CON_BOOT) && >+ (con->flags & CON_ENABLED))) { >+ continue; > } >+ >+ if (con->flags & CON_NBCON) >+ seq = nbcon_seq_read(con); >+ else >+ seq = con->seq; >+ >+ if (seq < newcon->seq) >+ newcon->seq = seq; > } > } > >@@ -3456,6 +3769,7 @@ void register_console(struct console *newcon) > struct console *con; > bool bootcon_registered = false; > bool realcon_registered = false; >+ unsigned long flags; > int err; > > console_list_lock(); >@@ -3535,9 +3849,38 @@ void register_console(struct console *newcon) > newcon->dropped = 0; > console_init_seq(newcon, bootcon_registered); > >- if (newcon->flags & CON_NBCON) >+ if (newcon->flags & CON_NBCON) { >+ have_nbcon_console = true; > nbcon_init(newcon); > >+ /* >+ * nbcon consoles have their own sequence counter. The legacy >+ * sequence counter is reset so that it is clear it is not >+ * being used. >+ */ >+ nbcon_seq_force(newcon, newcon->seq); >+ newcon->seq = 0; >+ } else { >+ have_legacy_console = true; >+ nbcon_legacy_kthread_create(); >+ } >+ >+ if (newcon->flags & CON_BOOT) >+ have_boot_console = true; >+ >+ /* >+ * If another context is actively using the hardware of this new >+ * console, it will not be aware of the nbcon synchronization. This >+ * is a risk that two contexts could access the hardware >+ * simultaneously if this new console is used for atomic printing >+ * and the other context is still using the hardware. >+ * >+ * Use the driver synchronization to ensure that the hardware is not >+ * in use while this new console transitions to being registered. 
>+ */ >+ if ((newcon->flags & CON_NBCON) && newcon->write_atomic) >+ newcon->device_lock(newcon, &flags); >+ > /* > * Put this console in the list - keep the > * preferred driver at the head of the list. >@@ -3562,6 +3905,10 @@ void register_console(struct console *newcon) > * register_console() completes. > */ > >+ /* This new console is now registered. */ >+ if ((newcon->flags & CON_NBCON) && newcon->write_atomic) >+ newcon->device_unlock(newcon, flags); >+ > console_sysfs_notify(); > > /* >@@ -3590,6 +3937,11 @@ EXPORT_SYMBOL(register_console); > /* Must be called under console_list_lock(). */ > static int unregister_console_locked(struct console *console) > { >+ bool is_boot_con = (console->flags & CON_BOOT); >+ bool found_legacy_con = false; >+ bool found_nbcon_con = false; >+ bool found_boot_con = false; >+ struct console *c; > int res; > > lockdep_assert_console_list_lock_held(); >@@ -3637,6 +3989,42 @@ static int unregister_console_locked(struct console *console) > if (console->exit) > res = console->exit(console); > >+ /* >+ * With this console gone, the global flags tracking registered >+ * console types may have changed. Update them. >+ */ >+ for_each_console(c) { >+ if (c->flags & CON_BOOT) >+ found_boot_con = true; >+ >+ if (c->flags & CON_NBCON) >+ found_nbcon_con = true; >+ else >+ found_legacy_con = true; >+ } >+ if (!found_boot_con) >+ have_boot_console = found_boot_con; >+ if (!found_legacy_con) >+ have_legacy_console = found_legacy_con; >+ if (!found_nbcon_con) >+ have_nbcon_console = found_nbcon_con; >+ >+ /* >+ * When the last boot console unregisters, start up the >+ * printing threads. 
>+ */ >+ if (is_boot_con && !have_boot_console) { >+ for_each_console(c) >+ nbcon_kthread_create(c); >+ } >+ >+#ifdef CONFIG_PRINTK >+ if (!printing_via_unlock && nbcon_legacy_kthread) { >+ kthread_stop(nbcon_legacy_kthread); >+ nbcon_legacy_kthread = NULL; >+ } >+#endif >+ > return res; > } > >@@ -3795,23 +4183,39 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > > seq = prb_next_reserve_seq(prb); > >- /* Flush the consoles so that records up to @seq are printed. */ >- console_lock(); >- console_unlock(); >+ /* >+ * Flush the consoles so that records up to @seq are printed. >+ * Otherwise this function will just wait for the threaded printers >+ * to print up to @seq. >+ */ >+ if (printing_via_unlock && !force_printkthreads()) { >+ console_lock(); >+ console_unlock(); >+ } > > for (;;) { > unsigned long begin_jiffies; > unsigned long slept_jiffies; >+ bool use_console_lock = printing_via_unlock; >+ >+ /* >+ * Ensure the compiler does not optimize @use_console_lock to >+ * be @printing_via_unlock since the latter can change at any >+ * time. >+ */ >+ barrier(); > > diff = 0; > >- /* >- * Hold the console_lock to guarantee safe access to >- * console->seq. Releasing console_lock flushes more >- * records in case @seq is still not printed on all >- * usable consoles. >- */ >- console_lock(); >+ if (use_console_lock) { >+ /* >+ * Hold the console_lock to guarantee safe access to >+ * console->seq. Releasing console_lock flushes more >+ * records in case @seq is still not printed on all >+ * usable consoles. >+ */ >+ console_lock(); >+ } > > cookie = console_srcu_read_lock(); > for_each_console_srcu(c) { >@@ -3825,12 +4229,15 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > * that they make forward progress, so only increment > * @diff for usable consoles. 
> */ >- if (!console_is_usable(c)) >+ if (!console_is_usable(c, flags, true) && >+ !console_is_usable(c, flags, false)) { > continue; >+ } > > if (flags & CON_NBCON) { > printk_seq = nbcon_seq_read(c); > } else { >+ WARN_ON_ONCE(!use_console_lock); > printk_seq = c->seq; > } > >@@ -3842,7 +4249,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > if (diff != last_diff && reset_on_progress) > remaining_jiffies = timeout_jiffies; > >- console_unlock(); >+ if (use_console_lock) >+ console_unlock(); > > /* Note: @diff is 0 if there are no usable consoles. */ > if (diff == 0 || remaining_jiffies == 0) >@@ -3894,9 +4302,16 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) > int pending = this_cpu_xchg(printk_pending, 0); > > if (pending & PRINTK_PENDING_OUTPUT) { >- /* If trylock fails, someone else is doing the printing */ >- if (console_trylock()) >- console_unlock(); >+ if (force_printkthreads()) { >+ wake_up_legacy_kthread(); >+ } else { >+ /* >+ * If trylock fails, some other context >+ * will do the printing. >+ */ >+ if (console_trylock()) >+ console_unlock(); >+ } > } > > if (pending & PRINTK_PENDING_WAKEUP) >@@ -3912,6 +4327,7 @@ static void __wake_up_klogd(int val) > return; > > preempt_disable(); >+ > /* > * Guarantee any new records can be seen by tasks preparing to wait > * before this context checks if the wait queue is empty. >@@ -3923,11 +4339,22 @@ static void __wake_up_klogd(int val) > * > * This pairs with devkmsg_read:A and syslog_print:A. > */ >- if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ >- (val & PRINTK_PENDING_OUTPUT)) { >+ if (!wq_has_sleeper(&log_wait)) /* LMM(__wake_up_klogd:A) */ >+ val &= ~PRINTK_PENDING_WAKEUP; >+ >+ /* >+ * Simple read is safe. register_console() would flush a newly >+ * registered legacy console when writing the message about it >+ * being enabled. 
>+ */ >+ if (!printing_via_unlock) >+ val &= ~PRINTK_PENDING_OUTPUT; >+ >+ if (val) { > this_cpu_or(printk_pending, val); > irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); > } >+ > preempt_enable(); > } > >@@ -3969,6 +4396,7 @@ void defer_console_output(void) > > void printk_trigger_flush(void) > { >+ nbcon_wake_threads(); > defer_console_output(); > } > >diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c >index f5a8bb606fe50..88e8f3a619229 100644 >--- a/kernel/printk/printk_ringbuffer.c >+++ b/kernel/printk/printk_ringbuffer.c >@@ -1034,9 +1034,13 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, > unsigned long next_lpos; > > if (size == 0) { >- /* Specify a data-less block. */ >- blk_lpos->begin = NO_LPOS; >- blk_lpos->next = NO_LPOS; >+ /* >+ * Data blocks are not created for empty lines. Instead, the >+ * reader will recognize these special lpos values and handle >+ * it appropriately. >+ */ >+ blk_lpos->begin = EMPTY_LINE_LPOS; >+ blk_lpos->next = EMPTY_LINE_LPOS; > return NULL; > } > >@@ -1214,10 +1218,18 @@ static const char *get_data(struct prb_data_ring *data_ring, > > /* Data-less data block description. */ > if (BLK_DATALESS(blk_lpos)) { >- if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { >+ /* >+ * Records that are just empty lines are also valid, even >+ * though they do not have a data block. For such records >+ * explicitly return empty string data to signify success. >+ */ >+ if (blk_lpos->begin == EMPTY_LINE_LPOS && >+ blk_lpos->next == EMPTY_LINE_LPOS) { > *data_size = 0; > return ""; > } >+ >+ /* Data lost, invalid, or otherwise unavailable. 
*/ > return NULL; > } > >diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h >index cb887489d00f0..bd2a892deac1a 100644 >--- a/kernel/printk/printk_ringbuffer.h >+++ b/kernel/printk/printk_ringbuffer.h >@@ -5,6 +5,8 @@ > > #include <linux/atomic.h> > #include <linux/dev_printk.h> >+#include <linux/stddef.h> >+#include <linux/types.h> > > /* > * Meta information about each stored message. >@@ -127,8 +129,22 @@ enum desc_state { > #define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) > #define DESC_ID_MASK (~DESC_FLAGS_MASK) > #define DESC_ID(sv) ((sv) & DESC_ID_MASK) >+ >+/* >+ * Special data block logical position values (for fields of >+ * @prb_desc.text_blk_lpos). >+ * >+ * - Bit0 is used to identify if the record has no data block. (Implemented in >+ * the LPOS_DATALESS() macro.) >+ * >+ * - Bit1 specifies the reason for not having a data block. >+ * >+ * These special values could never be real lpos values because of the >+ * meta data and alignment padding of data blocks. (See to_blk_size() for >+ * details.) 
>+ */ > #define FAILED_LPOS 0x1 >-#define NO_LPOS 0x3 >+#define EMPTY_LINE_LPOS 0x3 > > #define FAILED_BLK_LPOS \ > { \ >diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c >index 6d10927a07d83..4421ccac31133 100644 >--- a/kernel/printk/printk_safe.c >+++ b/kernel/printk/printk_safe.c >@@ -26,6 +26,18 @@ void __printk_safe_exit(void) > this_cpu_dec(printk_context); > } > >+void __printk_deferred_enter(void) >+{ >+ cant_migrate(); >+ __printk_safe_enter(); >+} >+ >+void __printk_deferred_exit(void) >+{ >+ cant_migrate(); >+ __printk_safe_exit(); >+} >+ > asmlinkage int vprintk(const char *fmt, va_list args) > { > #ifdef CONFIG_KGDB_KDB >diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c >index 7567ca8e743ca..48a9d47ec90eb 100644 >--- a/kernel/rcu/rcutorture.c >+++ b/kernel/rcu/rcutorture.c >@@ -2409,6 +2409,12 @@ static int rcutorture_booster_init(unsigned int cpu) > WARN_ON_ONCE(!t); > sp.sched_priority = 2; > sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); >+#ifdef CONFIG_PREEMPT_RT >+ t = per_cpu(timersd, cpu); >+ WARN_ON_ONCE(!t); >+ sp.sched_priority = 2; >+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); >+#endif > } > > /* Don't allow time recalculation while creating a new task. */ >diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h >index 8107f818455da..b17130b7e522d 100644 >--- a/kernel/rcu/tree_exp.h >+++ b/kernel/rcu/tree_exp.h >@@ -7,6 +7,7 @@ > * Authors: Paul E. 
McKenney <paulmck@linux.ibm.com> > */ > >+#include <linux/console.h> > #include <linux/lockdep.h> > > static void rcu_exp_handler(void *unused); >@@ -636,6 +637,9 @@ static void synchronize_rcu_expedited_wait(void) > return; > if (rcu_stall_is_suppressed()) > continue; >+ >+ nbcon_cpu_emergency_enter(); >+ > j = jiffies; > rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); > trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); >@@ -689,6 +693,9 @@ static void synchronize_rcu_expedited_wait(void) > rcu_exp_print_detail_task_stall_rnp(rnp); > } > jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; >+ >+ nbcon_cpu_emergency_exit(); >+ > panic_on_rcu_stall(); > } > } >diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h >index 5d666428546b0..f4d73ca20c768 100644 >--- a/kernel/rcu/tree_stall.h >+++ b/kernel/rcu/tree_stall.h >@@ -7,6 +7,7 @@ > * Author: Paul E. McKenney <paulmck@linux.ibm.com> > */ > >+#include <linux/console.h> > #include <linux/kvm_para.h> > #include <linux/rcu_notifier.h> > >@@ -604,6 +605,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) > if (rcu_stall_is_suppressed()) > return; > >+ nbcon_cpu_emergency_enter(); >+ > /* > * OK, time to rat on our buddy... > * See Documentation/RCU/stallwarn.rst for info on how to debug >@@ -655,6 +658,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) > rcu_check_gp_kthread_expired_fqs_timer(); > rcu_check_gp_kthread_starvation(); > >+ nbcon_cpu_emergency_exit(); >+ > panic_on_rcu_stall(); > > rcu_force_quiescent_state(); /* Kick them all. */ >@@ -675,6 +680,8 @@ static void print_cpu_stall(unsigned long gps) > if (rcu_stall_is_suppressed()) > return; > >+ nbcon_cpu_emergency_enter(); >+ > /* > * OK, time to rat on ourselves... 
> * See Documentation/RCU/stallwarn.rst for info on how to debug >@@ -703,6 +710,8 @@ static void print_cpu_stall(unsigned long gps) > jiffies + 3 * rcu_jiffies_till_stall_check() + 3); > raw_spin_unlock_irqrestore_rcu_node(rnp, flags); > >+ nbcon_cpu_emergency_exit(); >+ > panic_on_rcu_stall(); > > /* >diff --git a/kernel/sched/core.c b/kernel/sched/core.c >index 9116bcc903467..5015768f10256 100644 >--- a/kernel/sched/core.c >+++ b/kernel/sched/core.c >@@ -899,14 +899,15 @@ static inline void hrtick_rq_init(struct rq *rq) > > #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) > /* >- * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, >+ * Atomically set TIF_NEED_RESCHED[_LAZY] and test for TIF_POLLING_NRFLAG, > * this avoids any races wrt polling state changes and thereby avoids > * spurious IPIs. > */ >-static inline bool set_nr_and_not_polling(struct task_struct *p) >+static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) > { > struct thread_info *ti = task_thread_info(p); >- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); >+ >+ return !(fetch_or(&ti->flags, 1 << tif_bit) & _TIF_POLLING_NRFLAG); > } > > /* >@@ -923,7 +924,7 @@ static bool set_nr_if_polling(struct task_struct *p) > do { > if (!(val & _TIF_POLLING_NRFLAG)) > return false; >- if (val & _TIF_NEED_RESCHED) >+ if (val & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > return true; > } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); > >@@ -931,9 +932,9 @@ static bool set_nr_if_polling(struct task_struct *p) > } > > #else >-static inline bool set_nr_and_not_polling(struct task_struct *p) >+static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) > { >- set_tsk_need_resched(p); >+ set_tsk_thread_flag(p, tif_bit); > return true; > } > >@@ -1038,28 +1039,47 @@ void wake_up_q(struct wake_q_head *head) > * might also involve a cross-CPU call to trigger the scheduler on > * the target CPU. 
> */ >-void resched_curr(struct rq *rq) >+static void __resched_curr(struct rq *rq, int lazy) > { >+ int cpu, tif_bit = TIF_NEED_RESCHED + lazy; > struct task_struct *curr = rq->curr; >- int cpu; > > lockdep_assert_rq_held(rq); > >- if (test_tsk_need_resched(curr)) >+ if (unlikely(test_tsk_thread_flag(curr, tif_bit))) > return; > > cpu = cpu_of(rq); > > if (cpu == smp_processor_id()) { >- set_tsk_need_resched(curr); >- set_preempt_need_resched(); >+ set_tsk_thread_flag(curr, tif_bit); >+ if (!lazy) >+ set_preempt_need_resched(); > return; > } > >- if (set_nr_and_not_polling(curr)) >- smp_send_reschedule(cpu); >- else >+ if (set_nr_and_not_polling(curr, tif_bit)) { >+ if (!lazy) >+ smp_send_reschedule(cpu); >+ } else { > trace_sched_wake_idle_without_ipi(cpu); >+ } >+} >+ >+void resched_curr(struct rq *rq) >+{ >+ __resched_curr(rq, 0); >+} >+ >+void resched_curr_lazy(struct rq *rq) >+{ >+ int lazy = IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && !sched_feat(FORCE_NEED_RESCHED) ? >+ TIF_NEED_RESCHED_LAZY_OFFSET : 0; >+ >+ if (lazy && unlikely(test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED))) >+ return; >+ >+ __resched_curr(rq, lazy); > } > > void resched_cpu(int cpu) >@@ -1154,7 +1174,7 @@ static void wake_up_idle_cpu(int cpu) > * and testing of the above solutions didn't appear to report > * much benefits. > */ >- if (set_nr_and_not_polling(rq->idle)) >+ if (set_nr_and_not_polling(rq->idle, TIF_NEED_RESCHED)) > smp_send_reschedule(cpu); > else > trace_sched_wake_idle_without_ipi(cpu); >@@ -8890,6 +8910,21 @@ static inline void preempt_dynamic_init(void) { } > > #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ > >+/* >+ * task_is_pi_boosted - Check if task has been PI boosted. >+ * @p: Task to check. >+ * >+ * Return true if task is subject to priority inheritance. 
>+ */ >+bool task_is_pi_boosted(const struct task_struct *p) >+{ >+ int prio = p->prio; >+ >+ if (!rt_prio(prio)) >+ return false; >+ return prio != p->normal_prio; >+} >+ > /** > * yield - yield the current processor to other threads. > * >diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c >index 8d5d98a5834df..b462333db26cb 100644 >--- a/kernel/sched/debug.c >+++ b/kernel/sched/debug.c >@@ -333,6 +333,23 @@ static const struct file_operations sched_debug_fops = { > .release = seq_release, > }; > >+static ssize_t sched_hog_write(struct file *filp, const char __user *ubuf, >+ size_t cnt, loff_t *ppos) >+{ >+ unsigned long end = jiffies + 60 * HZ; >+ >+ for (; time_before(jiffies, end) && !signal_pending(current);) >+ cpu_relax(); >+ >+ return cnt; >+} >+ >+static const struct file_operations sched_hog_fops = { >+ .write = sched_hog_write, >+ .open = simple_open, >+ .llseek = default_llseek, >+}; >+ > static struct dentry *debugfs_sched; > > static __init int sched_init_debug(void) >@@ -374,6 +391,8 @@ static __init int sched_init_debug(void) > > debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); > >+ debugfs_create_file("hog", 0200, debugfs_sched, NULL, &sched_hog_fops); >+ > return 0; > } > late_initcall(sched_init_debug); >diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >index e2b4e0396af84..df60743b4d2c0 100644 >--- a/kernel/sched/fair.c >+++ b/kernel/sched/fair.c >@@ -975,8 +975,10 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); > * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i > * this is probably good enough. 
> */ >-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) >+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se, bool tick) > { >+ struct rq *rq = rq_of(cfs_rq); >+ > if ((s64)(se->vruntime - se->deadline) < 0) > return; > >@@ -995,10 +997,19 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) > /* > * The task has consumed its request, reschedule. > */ >- if (cfs_rq->nr_running > 1) { >- resched_curr(rq_of(cfs_rq)); >- clear_buddies(cfs_rq, se); >+ if (cfs_rq->nr_running < 2) >+ return; >+ >+ if (!IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) || sched_feat(FORCE_NEED_RESCHED)) { >+ resched_curr(rq); >+ } else { >+ /* Did the task ignore the lazy reschedule request? */ >+ if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY)) >+ resched_curr(rq); >+ else >+ resched_curr_lazy(rq); > } >+ clear_buddies(cfs_rq, se); > } > > #include "pelt.h" >@@ -1153,7 +1164,7 @@ s64 update_curr_common(struct rq *rq) > /* > * Update the current task's runtime statistics. > */ >-static void update_curr(struct cfs_rq *cfs_rq) >+static void __update_curr(struct cfs_rq *cfs_rq, bool tick) > { > struct sched_entity *curr = cfs_rq->curr; > s64 delta_exec; >@@ -1363,7 +1363,7 @@ > #else // !CONFIG_SCHED_BORE > curr->vruntime += calc_delta_fair(delta_exec, curr); > #endif // CONFIG_SCHED_BORE >- update_deadline(cfs_rq, curr); >+ update_deadline(cfs_rq, curr, tick); > update_min_vruntime(cfs_rq); > > if (entity_is_task(curr)) >@@ -1175,6 +1186,11 @@ static void update_curr(struct cfs_rq *cfs_rq) > account_cfs_rq_runtime(cfs_rq, delta_exec); > } > >+static inline void update_curr(struct cfs_rq *cfs_rq) >+{ >+ __update_curr(cfs_rq, false); >+} >+ > static void update_curr_fair(struct rq *rq) > { > update_curr(cfs_rq_of(&rq->curr->se)); >@@ -5493,7 +5509,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) > /* > * Update run-time statistics of the 'current'. 
> */ >- update_curr(cfs_rq); >+ __update_curr(cfs_rq, true); > > /* > * Ensure that runnable average is periodically updated. >@@ -5507,7 +5523,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) > * validating it and just reschedule. > */ > if (queued) { >- resched_curr(rq_of(cfs_rq)); >+ resched_curr_lazy(rq_of(cfs_rq)); > return; > } > /* >@@ -5653,7 +5669,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) > * hierarchy can be throttled > */ > if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) >- resched_curr(rq_of(cfs_rq)); >+ resched_curr_lazy(rq_of(cfs_rq)); > } > > static __always_inline >@@ -5913,7 +5929,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) > > /* Determine whether we need to wake up potentially idle CPU: */ > if (rq->curr == rq->idle && rq->cfs.nr_running) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > #ifdef CONFIG_SMP >@@ -6628,7 +6644,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) > > if (delta < 0) { > if (task_current(rq, p)) >- resched_curr(rq); >+ resched_curr_lazy(rq); > return; > } > hrtick_start(rq, delta); >@@ -8304,7 +8320,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int > * prevents us from potentially nominating it as a false LAST_BUDDY > * below. > */ >- if (test_tsk_need_resched(curr)) >+ if (need_resched()) > return; > > /* Idle tasks are by definition preempted by non-idle tasks. 
*/ >@@ -8346,7 +8362,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int > return; > > preempt: >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > #ifdef CONFIG_SMP >@@ -12516,7 +12532,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) > */ > if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && > __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > /* >@@ -12681,7 +12697,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) > */ > if (task_current(rq, p)) { > if (p->prio > oldprio) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } else > wakeup_preempt(rq, p, 0); > } >diff --git a/kernel/sched/features.h b/kernel/sched/features.h >index 143f55df890b1..6de570ab30078 100644 >--- a/kernel/sched/features.h >+++ b/kernel/sched/features.h >@@ -87,3 +87,5 @@ SCHED_FEAT(UTIL_EST, true) > SCHED_FEAT(LATENCY_WARN, false) > > SCHED_FEAT(HZ_BW, true) >+ >+SCHED_FEAT(FORCE_NEED_RESCHED, false) >diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c >index 31231925f1ece..58486420f3624 100644 >--- a/kernel/sched/idle.c >+++ b/kernel/sched/idle.c >@@ -57,8 +57,7 @@ static noinline int __cpuidle cpu_idle_poll(void) > ct_cpuidle_enter(); > > raw_local_irq_enable(); >- while (!tif_need_resched() && >- (cpu_idle_force_poll || tick_check_broadcast_expired())) >+ while (!need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) > cpu_relax(); > raw_local_irq_disable(); > >diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c >index 3261b067b67e2..8771140e0de5e 100644 >--- a/kernel/sched/rt.c >+++ b/kernel/sched/rt.c >@@ -2194,8 +2194,11 @@ static int rto_next_cpu(struct root_domain *rd) > > rd->rto_cpu = cpu; > >- if (cpu < nr_cpu_ids) >+ if (cpu < nr_cpu_ids) { >+ if (!has_pushable_tasks(cpu_rq(cpu))) >+ continue; > return cpu; >+ } > > rd->rto_cpu = -1; > >diff --git a/kernel/sched/sched.h 
b/kernel/sched/sched.h >index 001fe047bd5d8..17424c69537f2 100644 >--- a/kernel/sched/sched.h >+++ b/kernel/sched/sched.h >@@ -2463,6 +2463,7 @@ extern void init_sched_fair_class(void); > extern void reweight_task(struct task_struct *p, int prio); > > extern void resched_curr(struct rq *rq); >+extern void resched_curr_lazy(struct rq *rq); > extern void resched_cpu(int cpu); > > extern struct rt_bandwidth def_rt_bandwidth; >diff --git a/kernel/softirq.c b/kernel/softirq.c >index 210cf5f8d92c2..cae0ae2e2b0bb 100644 >--- a/kernel/softirq.c >+++ b/kernel/softirq.c >@@ -247,6 +247,19 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) > } > EXPORT_SYMBOL(__local_bh_enable_ip); > >+void softirq_preempt(void) >+{ >+ if (WARN_ON_ONCE(!preemptible())) >+ return; >+ >+ if (WARN_ON_ONCE(__this_cpu_read(softirq_ctrl.cnt) != SOFTIRQ_OFFSET)) >+ return; >+ >+ __local_bh_enable(SOFTIRQ_OFFSET, true); >+ /* preemption point */ >+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); >+} >+ > /* > * Invoked from ksoftirqd_run() outside of the interrupt disabled section > * to acquire the per CPU local lock for reentrancy protection. 
>@@ -619,6 +632,24 @@ static inline void tick_irq_exit(void) > #endif > } > >+#ifdef CONFIG_PREEMPT_RT >+DEFINE_PER_CPU(struct task_struct *, timersd); >+DEFINE_PER_CPU(unsigned long, pending_timer_softirq); >+ >+static void wake_timersd(void) >+{ >+ struct task_struct *tsk = __this_cpu_read(timersd); >+ >+ if (tsk) >+ wake_up_process(tsk); >+} >+ >+#else >+ >+static inline void wake_timersd(void) { } >+ >+#endif >+ > static inline void __irq_exit_rcu(void) > { > #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED >@@ -631,6 +662,10 @@ static inline void __irq_exit_rcu(void) > if (!in_interrupt() && local_softirq_pending()) > invoke_softirq(); > >+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() && >+ !(in_nmi() | in_hardirq())) >+ wake_timersd(); >+ > tick_irq_exit(); > } > >@@ -963,12 +998,70 @@ static struct smp_hotplug_thread softirq_threads = { > .thread_comm = "ksoftirqd/%u", > }; > >+#ifdef CONFIG_PREEMPT_RT >+static void timersd_setup(unsigned int cpu) >+{ >+ sched_set_fifo_low(current); >+} >+ >+static int timersd_should_run(unsigned int cpu) >+{ >+ return local_pending_timers(); >+} >+ >+static void run_timersd(unsigned int cpu) >+{ >+ unsigned int timer_si; >+ >+ ksoftirqd_run_begin(); >+ >+ timer_si = local_pending_timers(); >+ __this_cpu_write(pending_timer_softirq, 0); >+ or_softirq_pending(timer_si); >+ >+ __do_softirq(); >+ >+ ksoftirqd_run_end(); >+} >+ >+static void raise_ktimers_thread(unsigned int nr) >+{ >+ trace_softirq_raise(nr); >+ __this_cpu_or(pending_timer_softirq, 1 << nr); >+} >+ >+void raise_hrtimer_softirq(void) >+{ >+ raise_ktimers_thread(HRTIMER_SOFTIRQ); >+} >+ >+void raise_timer_softirq(void) >+{ >+ unsigned long flags; >+ >+ local_irq_save(flags); >+ raise_ktimers_thread(TIMER_SOFTIRQ); >+ wake_timersd(); >+ local_irq_restore(flags); >+} >+ >+static struct smp_hotplug_thread timer_threads = { >+ .store = &timersd, >+ .setup = timersd_setup, >+ .thread_should_run = timersd_should_run, >+ .thread_fn = run_timersd, >+ .thread_comm = 
"ktimers/%u", >+}; >+#endif >+ > static __init int spawn_ksoftirqd(void) > { > cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, > takeover_tasklets); > BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); >- >+#ifdef CONFIG_PREEMPT_RT >+ BUG_ON(smpboot_register_percpu_thread(&timer_threads)); >+#endif > return 0; > } > early_initcall(spawn_ksoftirqd); >diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c >index edb0f821dceaa..a7290012179a4 100644 >--- a/kernel/time/hrtimer.c >+++ b/kernel/time/hrtimer.c >@@ -1809,7 +1809,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) > if (!ktime_before(now, cpu_base->softirq_expires_next)) { > cpu_base->softirq_expires_next = KTIME_MAX; > cpu_base->softirq_activated = 1; >- raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+ raise_hrtimer_softirq(); > } > > __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); >@@ -1922,7 +1922,7 @@ void hrtimer_run_queues(void) > if (!ktime_before(now, cpu_base->softirq_expires_next)) { > cpu_base->softirq_expires_next = KTIME_MAX; > cpu_base->softirq_activated = 1; >- raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+ raise_hrtimer_softirq(); > } > > __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); >diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c >index 01fb50c1b17e4..910c04d7fa0d3 100644 >--- a/kernel/time/tick-sched.c >+++ b/kernel/time/tick-sched.c >@@ -796,7 +796,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) > > static inline bool local_timer_softirq_pending(void) > { >- return local_softirq_pending() & BIT(TIMER_SOFTIRQ); >+ return local_pending_timers() & BIT(TIMER_SOFTIRQ); > } > > static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) >diff --git a/kernel/time/timer.c b/kernel/time/timer.c >index 352b161113cda..d6bf128262c93 100644 >--- a/kernel/time/timer.c >+++ b/kernel/time/timer.c >@@ -1470,9 +1470,16 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) > */ > 
static void timer_sync_wait_running(struct timer_base *base) > { >- if (atomic_read(&base->timer_waiters)) { >+ bool need_preempt; >+ >+ need_preempt = task_is_pi_boosted(current); >+ if (need_preempt || atomic_read(&base->timer_waiters)) { > raw_spin_unlock_irq(&base->lock); > spin_unlock(&base->expiry_lock); >+ >+ if (need_preempt) >+ softirq_preempt(); >+ > spin_lock(&base->expiry_lock); > raw_spin_lock_irq(&base->lock); > } >@@ -2070,7 +2077,7 @@ static void run_local_timers(void) > if (time_before(jiffies, base->next_expiry)) > return; > } >- raise_softirq(TIMER_SOFTIRQ); >+ raise_timer_softirq(); > } > > /* >diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c >index c9c8983073485..947b5f1e799dd 100644 >--- a/kernel/trace/trace.c >+++ b/kernel/trace/trace.c >@@ -2717,6 +2717,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) > > if (tif_need_resched()) > trace_flags |= TRACE_FLAG_NEED_RESCHED; >+ if (tif_need_resched_lazy()) >+ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; > if (test_preempt_need_resched()) > trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; > return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | >diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c >index d8b302d010830..4f58a196e14c1 100644 >--- a/kernel/trace/trace_output.c >+++ b/kernel/trace/trace_output.c >@@ -460,17 +460,29 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) > (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : > (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : > bh_off ? 'b' : >- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : >+ !IS_ENABLED(CONFIG_TRACE_IRQFLAGS_SUPPORT) ? 
'X' : > '.'; > >- switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | >+ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | > TRACE_FLAG_PREEMPT_RESCHED)) { >+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: >+ need_resched = 'B'; >+ break; > case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: > need_resched = 'N'; > break; >+ case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: >+ need_resched = 'L'; >+ break; >+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY: >+ need_resched = 'b'; >+ break; > case TRACE_FLAG_NEED_RESCHED: > need_resched = 'n'; > break; >+ case TRACE_FLAG_NEED_RESCHED_LAZY: >+ need_resched = 'l'; >+ break; > case TRACE_FLAG_PREEMPT_RESCHED: > need_resched = 'p'; > break; >diff --git a/localversion-rt b/localversion-rt >new file mode 100644 >index 0000000000000..05c35cb580779 >--- /dev/null >+++ b/localversion-rt >@@ -0,0 +1 @@ >+-rt11 >diff --git a/net/core/dev.c b/net/core/dev.c >index c9b8412f1c9d3..849d2fe8d9221 100644 >--- a/net/core/dev.c >+++ b/net/core/dev.c >@@ -78,6 +78,7 @@ > #include <linux/slab.h> > #include <linux/sched.h> > #include <linux/sched/mm.h> >+#include <linux/smpboot.h> > #include <linux/mutex.h> > #include <linux/rwsem.h> > #include <linux/string.h> >@@ -216,35 +217,60 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) > return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; > } > >-static inline void rps_lock_irqsave(struct softnet_data *sd, >- unsigned long *flags) >+#ifndef CONFIG_PREEMPT_RT >+ >+static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); >+ >+static int __init setup_backlog_napi_threads(char *arg) > { >- if (IS_ENABLED(CONFIG_RPS)) >+ static_branch_enable(&use_backlog_threads_key); >+ return 0; >+} >+early_param("thread_backlog_napi", setup_backlog_napi_threads); >+ >+static bool use_backlog_threads(void) >+{ >+ return 
static_branch_unlikely(&use_backlog_threads_key); >+} >+ >+#else >+ >+static bool use_backlog_threads(void) >+{ >+ return true; >+} >+ >+#endif >+ >+static inline void backlog_lock_irq_save(struct softnet_data *sd, >+ unsigned long *flags) >+{ >+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) > spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); > else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > local_irq_save(*flags); > } > >-static inline void rps_lock_irq_disable(struct softnet_data *sd) >+static inline void backlog_lock_irq_disable(struct softnet_data *sd) > { >- if (IS_ENABLED(CONFIG_RPS)) >+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) > spin_lock_irq(&sd->input_pkt_queue.lock); > else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > local_irq_disable(); > } > >-static inline void rps_unlock_irq_restore(struct softnet_data *sd, >- unsigned long *flags) >+static inline void backlog_unlock_irq_restore(struct softnet_data *sd, >+ unsigned long *flags) > { >- if (IS_ENABLED(CONFIG_RPS)) >+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) > spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); > else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > local_irq_restore(*flags); > } > >-static inline void rps_unlock_irq_enable(struct softnet_data *sd) >+static inline void backlog_unlock_irq_enable(struct softnet_data *sd) > { >- if (IS_ENABLED(CONFIG_RPS)) >+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) > spin_unlock_irq(&sd->input_pkt_queue.lock); > else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > local_irq_enable(); >@@ -4420,6 +4446,7 @@ EXPORT_SYMBOL(__dev_direct_xmit); > /************************************************************************* > * Receiver routines > *************************************************************************/ >+static DEFINE_PER_CPU(struct task_struct *, backlog_napi); > > int netdev_max_backlog __read_mostly = 1000; > EXPORT_SYMBOL(netdev_max_backlog); >@@ -4452,18 +4479,16 @@ static inline void ____napi_schedule(struct 
softnet_data *sd, > */ > thread = READ_ONCE(napi->thread); > if (thread) { >- /* Avoid doing set_bit() if the thread is in >- * INTERRUPTIBLE state, cause napi_thread_wait() >- * makes sure to proceed with napi polling >- * if the thread is explicitly woken from here. >- */ >- if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) >- set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); >+ if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) >+ goto use_local_napi; >+ >+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); > wake_up_process(thread); > return; > } > } > >+use_local_napi: > list_add_tail(&napi->poll_list, &sd->poll_list); > WRITE_ONCE(napi->list_owner, smp_processor_id()); > /* If not called from net_rx_action() >@@ -4709,6 +4734,11 @@ static void napi_schedule_rps(struct softnet_data *sd) > > #ifdef CONFIG_RPS > if (sd != mysd) { >+ if (use_backlog_threads()) { >+ __napi_schedule_irqoff(&sd->backlog); >+ return; >+ } >+ > sd->rps_ipi_next = mysd->rps_ipi_list; > mysd->rps_ipi_list = sd; > >@@ -4723,6 +4753,23 @@ static void napi_schedule_rps(struct softnet_data *sd) > __napi_schedule_irqoff(&mysd->backlog); > } > >+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) >+{ >+ unsigned long flags; >+ >+ if (use_backlog_threads()) { >+ backlog_lock_irq_save(sd, &flags); >+ >+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) >+ __napi_schedule_irqoff(&sd->backlog); >+ >+ backlog_unlock_irq_restore(sd, &flags); >+ >+ } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { >+ smp_call_function_single_async(cpu, &sd->defer_csd); >+ } >+} >+ > #ifdef CONFIG_NET_FLOW_LIMIT > int netdev_flow_limit_table_len __read_mostly = (1 << 12); > #endif >@@ -4778,7 +4825,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, > reason = SKB_DROP_REASON_NOT_SPECIFIED; > sd = &per_cpu(softnet_data, cpu); > >- rps_lock_irqsave(sd, &flags); >+ backlog_lock_irq_save(sd, &flags); > if (!netif_running(skb->dev)) > goto drop; > qlen = 
skb_queue_len(&sd->input_pkt_queue); >@@ -4787,7 +4834,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, > enqueue: > __skb_queue_tail(&sd->input_pkt_queue, skb); > input_queue_tail_incr_save(sd, qtail); >- rps_unlock_irq_restore(sd, &flags); >+ backlog_unlock_irq_restore(sd, &flags); > return NET_RX_SUCCESS; > } > >@@ -4802,7 +4849,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, > > drop: > sd->dropped++; >- rps_unlock_irq_restore(sd, &flags); >+ backlog_unlock_irq_restore(sd, &flags); > > dev_core_stats_rx_dropped_inc(skb->dev); > kfree_skb_reason(skb, reason); >@@ -5833,7 +5880,7 @@ static void flush_backlog(struct work_struct *work) > local_bh_disable(); > sd = this_cpu_ptr(&softnet_data); > >- rps_lock_irq_disable(sd); >+ backlog_lock_irq_disable(sd); > skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { > if (skb->dev->reg_state == NETREG_UNREGISTERING) { > __skb_unlink(skb, &sd->input_pkt_queue); >@@ -5841,7 +5888,7 @@ static void flush_backlog(struct work_struct *work) > input_queue_head_incr(sd); > } > } >- rps_unlock_irq_enable(sd); >+ backlog_unlock_irq_enable(sd); > > skb_queue_walk_safe(&sd->process_queue, skb, tmp) { > if (skb->dev->reg_state == NETREG_UNREGISTERING) { >@@ -5859,14 +5906,14 @@ static bool flush_required(int cpu) > struct softnet_data *sd = &per_cpu(softnet_data, cpu); > bool do_flush; > >- rps_lock_irq_disable(sd); >+ backlog_lock_irq_disable(sd); > > /* as insertion into process_queue happens with the rps lock held, > * process_queue access may race only with dequeue > */ > do_flush = !skb_queue_empty(&sd->input_pkt_queue) || > !skb_queue_empty_lockless(&sd->process_queue); >- rps_unlock_irq_enable(sd); >+ backlog_unlock_irq_enable(sd); > > return do_flush; > #endif >@@ -5932,7 +5979,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) > #ifdef CONFIG_RPS > struct softnet_data *remsd = sd->rps_ipi_list; > >- if (remsd) { >+ if (!use_backlog_threads() && remsd) { > sd->rps_ipi_list 
= NULL; > > local_irq_enable(); >@@ -5947,7 +5994,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) > static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) > { > #ifdef CONFIG_RPS >- return sd->rps_ipi_list != NULL; >+ return !use_backlog_threads() && sd->rps_ipi_list; > #else > return false; > #endif >@@ -5981,7 +6028,7 @@ static int process_backlog(struct napi_struct *napi, int quota) > > } > >- rps_lock_irq_disable(sd); >+ backlog_lock_irq_disable(sd); > if (skb_queue_empty(&sd->input_pkt_queue)) { > /* > * Inline a custom version of __napi_complete(). >@@ -5991,13 +6038,13 @@ static int process_backlog(struct napi_struct *napi, int quota) > * We can use a plain write instead of clear_bit(), > * and we dont need an smp_mb() memory barrier. > */ >- napi->state = 0; >+ napi->state &= NAPIF_STATE_THREADED; > again = false; > } else { > skb_queue_splice_tail_init(&sd->input_pkt_queue, > &sd->process_queue); > } >- rps_unlock_irq_enable(sd); >+ backlog_unlock_irq_enable(sd); > } > > return work; >@@ -6654,8 +6701,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) > > static int napi_thread_wait(struct napi_struct *napi) > { >- bool woken = false; >- > set_current_state(TASK_INTERRUPTIBLE); > > while (!kthread_should_stop()) { >@@ -6664,15 +6709,13 @@ static int napi_thread_wait(struct napi_struct *napi) > * Testing SCHED bit is not enough because SCHED bit might be > * set by some other busy poll thread or by napi_disable(). > */ >- if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { >+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { > WARN_ON(!list_empty(&napi->poll_list)); > __set_current_state(TASK_RUNNING); > return 0; > } > > schedule(); >- /* woken being true indicates this thread owns this napi. 
*/ >- woken = true; > set_current_state(TASK_INTERRUPTIBLE); > } > __set_current_state(TASK_RUNNING); >@@ -6701,43 +6744,48 @@ static void skb_defer_free_flush(struct softnet_data *sd) > } > } > >+static void napi_threaded_poll_loop(struct napi_struct *napi) >+{ >+ struct softnet_data *sd; >+ unsigned long last_qs = jiffies; >+ >+ for (;;) { >+ bool repoll = false; >+ void *have; >+ >+ local_bh_disable(); >+ sd = this_cpu_ptr(&softnet_data); >+ sd->in_napi_threaded_poll = true; >+ >+ have = netpoll_poll_lock(napi); >+ __napi_poll(napi, &repoll); >+ netpoll_poll_unlock(have); >+ >+ sd->in_napi_threaded_poll = false; >+ barrier(); >+ >+ if (sd_has_rps_ipi_waiting(sd)) { >+ local_irq_disable(); >+ net_rps_action_and_irq_enable(sd); >+ } >+ skb_defer_free_flush(sd); >+ local_bh_enable(); >+ >+ if (!repoll) >+ break; >+ >+ rcu_softirq_qs_periodic(last_qs); >+ cond_resched(); >+ } >+} >+ > static int napi_threaded_poll(void *data) > { > struct napi_struct *napi = data; >- struct softnet_data *sd; >- void *have; > >- while (!napi_thread_wait(napi)) { >- unsigned long last_qs = jiffies; >+ while (!napi_thread_wait(napi)) >+ napi_threaded_poll_loop(napi); > >- for (;;) { >- bool repoll = false; >- >- local_bh_disable(); >- sd = this_cpu_ptr(&softnet_data); >- sd->in_napi_threaded_poll = true; >- >- have = netpoll_poll_lock(napi); >- __napi_poll(napi, &repoll); >- netpoll_poll_unlock(have); >- >- sd->in_napi_threaded_poll = false; >- barrier(); >- >- if (sd_has_rps_ipi_waiting(sd)) { >- local_irq_disable(); >- net_rps_action_and_irq_enable(sd); >- } >- skb_defer_free_flush(sd); >- local_bh_enable(); >- >- if (!repoll) >- break; >- >- rcu_softirq_qs_periodic(last_qs); >- cond_resched(); >- } >- } > return 0; > } > >@@ -11336,7 +11384,7 @@ static int dev_cpu_dead(unsigned int oldcpu) > > list_del_init(&napi->poll_list); > if (napi->poll == process_backlog) >- napi->state = 0; >+ napi->state &= NAPIF_STATE_THREADED; > else > ____napi_schedule(sd, napi); > } >@@ -11344,12 
+11392,14 @@ static int dev_cpu_dead(unsigned int oldcpu) > raise_softirq_irqoff(NET_TX_SOFTIRQ); > local_irq_enable(); > >+ if (!use_backlog_threads()) { > #ifdef CONFIG_RPS >- remsd = oldsd->rps_ipi_list; >- oldsd->rps_ipi_list = NULL; >+ remsd = oldsd->rps_ipi_list; >+ oldsd->rps_ipi_list = NULL; > #endif >- /* send out pending IPI's on offline CPU */ >- net_rps_send_ipi(remsd); >+ /* send out pending IPI's on offline CPU */ >+ net_rps_send_ipi(remsd); >+ } > > /* Process offline CPU's input_pkt_queue */ > while ((skb = __skb_dequeue(&oldsd->process_queue))) { >@@ -11669,6 +11719,38 @@ static void __init net_dev_struct_check(void) > * > */ > >+static int backlog_napi_should_run(unsigned int cpu) >+{ >+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); >+ struct napi_struct *napi = &sd->backlog; >+ >+ return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); >+} >+ >+static void run_backlog_napi(unsigned int cpu) >+{ >+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); >+ >+ napi_threaded_poll_loop(&sd->backlog); >+} >+ >+static void backlog_napi_setup(unsigned int cpu) >+{ >+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); >+ struct napi_struct *napi = &sd->backlog; >+ >+ napi->thread = this_cpu_read(backlog_napi); >+ set_bit(NAPI_STATE_THREADED, &napi->state); >+} >+ >+static struct smp_hotplug_thread backlog_threads = { >+ .store = &backlog_napi, >+ .thread_should_run = backlog_napi_should_run, >+ .thread_fn = run_backlog_napi, >+ .thread_comm = "backlog_napi/%u", >+ .setup = backlog_napi_setup, >+}; >+ > /* > * This is called single threaded during boot, so no need > * to take the rtnl semaphore. 
>@@ -11721,7 +11803,10 @@ static int __init net_dev_init(void) > init_gro_hash(&sd->backlog); > sd->backlog.poll = process_backlog; > sd->backlog.weight = weight_p; >+ INIT_LIST_HEAD(&sd->backlog.poll_list); > } >+ if (use_backlog_threads()) >+ smpboot_register_percpu_thread(&backlog_threads); > > dev_boot_phase = 0; > >diff --git a/net/core/skbuff.c b/net/core/skbuff.c >index 71dee435d549d..48570de42a960 100644 >--- a/net/core/skbuff.c >+++ b/net/core/skbuff.c >@@ -6929,8 +6929,8 @@ nodefer: __kfree_skb(skb); > /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU > * if we are unlucky enough (this seems very unlikely). > */ >- if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) >- smp_call_function_single_async(cpu, &sd->defer_csd); >+ if (unlikely(kick)) >+ kick_defer_list_purge(sd, cpu); > } > > static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 916954
:
874143
|
874144
|
874346
|
874895
|
875322
|
875323
|
879000
|
884278
|
884334
|
888012
|
888021
|
889979
|
889980
| 892741 |
894742
|
894745