Gentoo's Bugzilla – Attachment 888021 Details for Bug 916954
sys-kernel/gentoo-sources-6.{6,7,8}.x: modified RT patch with BORE patch
Description: Combined BORE + RT patch for gentoo-sources 6.8.1
Filename: gentoo_sources_6.8.1_experimental_bore4.5.0_rt8.patch
MIME Type: text/plain
Creator: deim
Created: 2024-03-21 05:47:56 UTC
Size: 244.01 KB
Flags: patch, obsolete
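The core of the BORE half of this patch is the per-task burst penalty computed in kernel/sched/fair.c: a task's runtime since it last slept or yielded is converted to a fixed-point log2, a tolerance offset is subtracted, and the result is scaled and capped. The standalone C sketch below models that arithmetic in userspace with the patch's default sysctl values (offset 22, scale 1280); the file name and the fls64() stand-in are illustrative, not kernel API.

/*
 * bore_penalty.c -- standalone model of the burst-penalty math from the
 * calc_burst_penalty() hunk in kernel/sched/fair.c below. Illustrative
 * only; constants mirror the patch's default sysctl initializers.
 * Build: cc -O2 -o bore_penalty bore_penalty.c
 */
#include <stdio.h>
#include <stdint.h>

static const uint32_t penalty_offset = 22;   /* sched_burst_penalty_offset */
static const uint32_t penalty_scale  = 1280; /* sched_burst_penalty_scale  */
#define MAX_BURST_PENALTY (39U << 2)

/* Stand-in for the kernel's fls64(): 1-based index of the highest set bit. */
static uint32_t fls64_model(uint64_t v)
{
	return v ? 64 - (uint32_t)__builtin_clzll(v) : 0;
}

/* log2(v) + 1 as an unsigned 24.8 fixed-point value, as in the patch. */
static uint32_t log2plus1_u64_u32f8(uint64_t v)
{
	uint32_t msb = fls64_model(v);
	int32_t excess_bits = (int32_t)msb - 9;
	uint8_t fractional = (0 <= excess_bits) ? (uint8_t)(v >> excess_bits)
	                                        : (uint8_t)(v << -excess_bits);
	return msb << 8 | fractional;
}

static uint32_t calc_burst_penalty(uint64_t burst_time_ns)
{
	uint32_t greed = log2plus1_u64_u32f8(burst_time_ns);
	uint32_t tolerance = penalty_offset << 8;
	uint32_t penalty = (greed > tolerance) ? greed - tolerance : 0;
	uint32_t scaled = penalty * penalty_scale >> 16;
	return scaled < MAX_BURST_PENALTY ? scaled : MAX_BURST_PENALTY;
}

int main(void)
{
	/* Penalty is zero for short bursts and grows with log2 of the
	 * burst once it passes the offset threshold (~2 ms by default). */
	const uint64_t ns[] = { 1000, 1000000, 4194304, 100000000,
	                        10000000000ULL };
	for (unsigned i = 0; i < sizeof(ns) / sizeof(ns[0]); i++)
		printf("burst %11llu ns -> penalty %3u, score %2u\n",
		       (unsigned long long)ns[i], calc_burst_penalty(ns[i]),
		       calc_burst_penalty(ns[i]) >> 2);
	return 0;
}

The resulting burst_score (penalty >> 2) feeds into the vruntime and slice scaling visible in the fair.c hunks of the patch that follows.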
>From d8d9f2ff3cbf43b41269da34772b59bc9a11c5c2 Mon Sep 17 00:00:00 2001 >From: Masahito S <firelzrd@gmail.com> >Date: Sun, 3 Mar 2024 05:14:31 +0900 >Subject: [PATCH] linux6.8.y-bore4.5.0 > >--- > include/linux/sched.h | 12 ++ > init/Kconfig | 19 +++ > kernel/sched/core.c | 148 ++++++++++++++++++++ > kernel/sched/debug.c | 61 +++++++- > kernel/sched/fair.c | 302 ++++++++++++++++++++++++++++++++++++++-- > kernel/sched/features.h | 4 + > kernel/sched/sched.h | 7 + > 7 files changed, 538 insertions(+), 15 deletions(-) > >diff --git a/include/linux/sched.h b/include/linux/sched.h >index ffe8f618ab..7ac6163f90 100644 >--- a/include/linux/sched.h >+++ b/include/linux/sched.h >@@ -547,6 +547,18 @@ struct sched_entity { > u64 sum_exec_runtime; > u64 prev_sum_exec_runtime; > u64 vruntime; >+#ifdef CONFIG_SCHED_BORE >+ u64 burst_time; >+ u8 prev_burst_penalty; >+ u8 curr_burst_penalty; >+ u8 burst_penalty; >+ u8 burst_score; >+ u32 burst_load; >+ bool on_cfs_rq; >+ u8 child_burst; >+ u32 child_burst_cnt; >+ u64 child_burst_last_cached; >+#endif // CONFIG_SCHED_BORE > s64 vlag; > u64 slice; > >diff --git a/init/Kconfig b/init/Kconfig >index deda3d1413..9fa6b45c03 100644 >--- a/init/Kconfig >+++ b/init/Kconfig >@@ -1270,6 +1270,25 @@ config CHECKPOINT_RESTORE > > If unsure, say N here. > >+config SCHED_BORE >+ bool "Burst-Oriented Response Enhancer" >+ default y >+ help >+ In Desktop and Mobile computing, one might prefer interactive >+ tasks to keep responsive no matter what they run in the background. >+ >+ Enabling this kernel feature modifies the scheduler to discriminate >+ tasks by their burst time (runtime since it last went sleeping or >+ yielding state) and prioritize those that run less bursty. >+ Such tasks usually include window compositor, widgets backend, >+ terminal emulator, video playback, games and so on. >+ With a little impact to scheduling fairness, it may improve >+ responsiveness especially under heavy background workload. >+ >+ You can turn it off by setting the sysctl kernel.sched_bore = 0. >+ >+ If unsure, say Y here. 
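The help text above says tasks are ranked by how "bursty" they run. Concretely, when a burst ends (the task sleeps or yields), the penalty it earned is folded into the task's history by the patch's binary_smooth()/restart_burst() pair in fair.c, an asymmetric shift-based filter: with the default sysctls (sched_burst_smoothness_long = 1, sched_burst_smoothness_short = 0), a rising penalty is only half-applied per burst while a falling one takes effect immediately. A minimal userspace model of that behavior, assuming those defaults:

/*
 * bore_smooth.c -- model of the patch's binary_smooth()/restart_burst():
 * the penalty earned during a burst is merged into the task's history
 * through an asymmetric shift filter. Illustrative only.
 * Build: cc -O2 -o bore_smooth bore_smooth.c
 */
#include <stdio.h>
#include <stdint.h>

static const int smoothness_long  = 1; /* sched_burst_smoothness_long  */
static const int smoothness_short = 0; /* sched_burst_smoothness_short */

static uint32_t binary_smooth(uint32_t new, uint32_t old)
{
	int increment = (int)new - (int)old;
	return (0 <= increment)
		? old + (uint32_t)(increment >> smoothness_long)
		: old - (uint32_t)(-increment >> smoothness_short);
}

int main(void)
{
	/* curr_burst_penalty observed at the end of five bursts: three
	 * CPU-heavy bursts followed by two idle ones. The history rises
	 * by halves but collapses immediately once the task calms down. */
	const uint32_t curr[] = { 100, 100, 100, 0, 0 };
	uint32_t prev = 0;
	for (unsigned i = 0; i < sizeof(curr) / sizeof(curr[0]); i++) {
		prev = binary_smooth(curr[i], prev);
		printf("burst %u: curr=%3u -> prev_burst_penalty=%3u\n",
		       i + 1, curr[i], prev);
	}
	return 0;
}

This asymmetry is why interactive tasks recover their priority quickly after a one-off heavy burst, while consistently CPU-bound tasks accumulate a high penalty.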
>+ > config SCHED_AUTOGROUP > bool "Automatic process group scheduling" > select CGROUPS >diff --git a/kernel/sched/core.c b/kernel/sched/core.c >index 9116bcc903..b4bd85b3bf 100644 >--- a/kernel/sched/core.c >+++ b/kernel/sched/core.c >@@ -4507,6 +4507,143 @@ int wake_up_state(struct task_struct *p, unsigned int state) > return try_to_wake_up(p, state, 0); > } > >+#ifdef CONFIG_SCHED_BORE >+extern bool sched_bore; >+extern u8 sched_burst_fork_atavistic; >+extern uint sched_burst_cache_lifetime; >+ >+static void __init sched_init_bore(void) { >+ init_task.se.burst_time = 0; >+ init_task.se.prev_burst_penalty = 0; >+ init_task.se.curr_burst_penalty = 0; >+ init_task.se.burst_penalty = 0; >+ init_task.se.burst_score = 0; >+ init_task.se.on_cfs_rq = false; >+ init_task.se.child_burst_last_cached = 0; >+ init_task.se.burst_load = 0; >+} >+ >+void inline sched_fork_bore(struct task_struct *p) { >+ p->se.burst_time = 0; >+ p->se.curr_burst_penalty = 0; >+ p->se.burst_score = 0; >+ p->se.on_cfs_rq = false; >+ p->se.child_burst_last_cached = 0; >+ p->se.burst_load = 0; >+} >+ >+static u32 count_child_tasks(struct task_struct *p) { >+ struct task_struct *child; >+ u32 cnt = 0; >+ list_for_each_entry(child, &p->children, sibling) {cnt++;} >+ return cnt; >+} >+ >+static inline bool task_is_inheritable(struct task_struct *p) { >+ return (p->sched_class == &fair_sched_class); >+} >+ >+static inline bool child_burst_cache_expired(struct task_struct *p, u64 now) { >+ u64 expiration_time = >+ p->se.child_burst_last_cached + sched_burst_cache_lifetime; >+ return ((s64)(expiration_time - now) < 0); >+} >+ >+static void __update_child_burst_cache( >+ struct task_struct *p, u32 cnt, u32 sum, u64 now) { >+ u8 avg = 0; >+ if (cnt) avg = sum / cnt; >+ p->se.child_burst = max(avg, p->se.burst_penalty); >+ p->se.child_burst_cnt = cnt; >+ p->se.child_burst_last_cached = now; >+} >+ >+static inline void update_child_burst_direct(struct task_struct *p, u64 now) { >+ struct task_struct *child; >+ u32 cnt = 0; >+ u32 sum = 0; >+ >+ list_for_each_entry(child, &p->children, sibling) { >+ if (!task_is_inheritable(child)) continue; >+ cnt++; >+ sum += child->se.burst_penalty; >+ } >+ >+ __update_child_burst_cache(p, cnt, sum, now); >+} >+ >+static inline u8 __inherit_burst_direct(struct task_struct *p, u64 now) { >+ struct task_struct *parent = p->real_parent; >+ if (child_burst_cache_expired(parent, now)) >+ update_child_burst_direct(parent, now); >+ >+ return parent->se.child_burst; >+} >+ >+static void update_child_burst_topological( >+ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { >+ struct task_struct *child, *dec; >+ u32 cnt = 0, dcnt = 0; >+ u32 sum = 0; >+ >+ list_for_each_entry(child, &p->children, sibling) { >+ dec = child; >+ while ((dcnt = count_child_tasks(dec)) == 1) >+ dec = list_first_entry(&dec->children, struct task_struct, sibling); >+ >+ if (!dcnt || !depth) { >+ if (!task_is_inheritable(dec)) continue; >+ cnt++; >+ sum += dec->se.burst_penalty; >+ continue; >+ } >+ if (!child_burst_cache_expired(dec, now)) { >+ cnt += dec->se.child_burst_cnt; >+ sum += (u32)dec->se.child_burst * dec->se.child_burst_cnt; >+ continue; >+ } >+ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); >+ } >+ >+ __update_child_burst_cache(p, cnt, sum, now); >+ *acnt += cnt; >+ *asum += sum; >+} >+ >+static inline u8 __inherit_burst_topological(struct task_struct *p, u64 now) { >+ struct task_struct *anc = p->real_parent; >+ u32 cnt = 0, sum = 0; >+ >+ while (anc->real_parent != anc && 
count_child_tasks(anc) == 1) >+ anc = anc->real_parent; >+ >+ if (child_burst_cache_expired(anc, now)) >+ update_child_burst_topological( >+ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); >+ >+ return anc->se.child_burst; >+} >+ >+static inline void inherit_burst(struct task_struct *p) { >+ u8 burst_cache; >+ u64 now = ktime_get_ns(); >+ >+ read_lock(&tasklist_lock); >+ burst_cache = likely(sched_burst_fork_atavistic)? >+ __inherit_burst_topological(p, now): >+ __inherit_burst_direct(p, now); >+ read_unlock(&tasklist_lock); >+ >+ p->se.prev_burst_penalty = max(p->se.prev_burst_penalty, burst_cache); >+} >+ >+static void sched_post_fork_bore(struct task_struct *p) { >+ if (p->sched_class == &fair_sched_class && likely(sched_bore)) >+ inherit_burst(p); >+ p->se.burst_penalty = p->se.prev_burst_penalty; >+} >+#endif // CONFIG_SCHED_BORE >+ > /* > * Perform scheduler related setup for a newly forked process p. > * p is forked by current. >@@ -4523,6 +4660,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) > p->se.prev_sum_exec_runtime = 0; > p->se.nr_migrations = 0; > p->se.vruntime = 0; >+#ifdef CONFIG_SCHED_BORE >+ sched_fork_bore(p); >+#endif // CONFIG_SCHED_BORE > p->se.vlag = 0; > p->se.slice = sysctl_sched_base_slice; > INIT_LIST_HEAD(&p->se.group_node); >@@ -4839,6 +4979,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) > > void sched_post_fork(struct task_struct *p) > { >+#ifdef CONFIG_SCHED_BORE >+ sched_post_fork_bore(p); >+#endif // CONFIG_SCHED_BORE > uclamp_post_fork(p); > } > >@@ -9910,6 +10053,11 @@ void __init sched_init(void) > BUG_ON(&dl_sched_class != &stop_sched_class + 1); > #endif > >+#ifdef CONFIG_SCHED_BORE >+ sched_init_bore(); >+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 4.5.0 by Masahito Suzuki"); >+#endif // CONFIG_SCHED_BORE >+ > wait_bit_init(); > > #ifdef CONFIG_FAIR_GROUP_SCHED >diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c >index 8d5d98a583..a565363fdd 100644 >--- a/kernel/sched/debug.c >+++ b/kernel/sched/debug.c >@@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { > }; > > #ifdef CONFIG_SMP >+#ifdef CONFIG_SCHED_BORE >+static ssize_t sched_min_base_slice_write(struct file *filp, const char __user *ubuf, >+ size_t cnt, loff_t *ppos) >+{ >+ char buf[16]; >+ unsigned int value; >+ >+ if (cnt > 15) >+ cnt = 15; >+ >+ if (copy_from_user(&buf, ubuf, cnt)) >+ return -EFAULT; >+ buf[cnt] = '\0'; >+ >+ if (kstrtouint(buf, 10, &value)) >+ return -EINVAL; > >+ if (!value) >+ return -EINVAL; >+ >+ sysctl_sched_min_base_slice = value; >+ sched_update_min_base_slice(); >+ >+ *ppos += cnt; >+ return cnt; >+} >+ >+static int sched_min_base_slice_show(struct seq_file *m, void *v) >+{ >+ seq_printf(m, "%d\n", sysctl_sched_min_base_slice); >+ return 0; >+} >+ >+static int sched_min_base_slice_open(struct inode *inode, struct file *filp) >+{ >+ return single_open(filp, sched_min_base_slice_show, NULL); >+} >+ >+static const struct file_operations sched_min_base_slice_fops = { >+ .open = sched_min_base_slice_open, >+ .write = sched_min_base_slice_write, >+ .read = seq_read, >+ .llseek = seq_lseek, >+ .release = single_release, >+}; >+#else // !CONFIG_SCHED_BORE > static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, > size_t cnt, loff_t *ppos) > { >@@ -213,7 +258,7 @@ static const struct file_operations sched_scaling_fops = { > .llseek = seq_lseek, > .release = single_release, > }; >- >+#endif // 
CONFIG_SCHED_BORE > #endif /* SMP */ > > #ifdef CONFIG_PREEMPT_DYNAMIC >@@ -353,14 +353,21 @@ > debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); > #endif > >- debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); >+#ifdef CONFIG_SCHED_BORE >+ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); >+ debugfs_create_u32("base_slice_ns", 0400, debugfs_sched, &sysctl_sched_base_slice); >+#else // !CONFIG_SCHED_BORE >+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); >+#endif // CONFIG_SCHED_BORE > > #ifndef CONFIG_SCHED_ALT > debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); >- debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); >+ debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); > > #ifdef CONFIG_SMP >- debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); >+#if !defined(CONFIG_SCHED_BORE) >+ debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); >+#endif // CONFIG_SCHED_BORE > debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); > debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); > >@@ -595,6 +647,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) > SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), > SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); > >+#ifdef CONFIG_SCHED_BORE >+ SEQ_printf(m, " %2d", p->se.burst_score); >+#endif // CONFIG_SCHED_BORE > #ifdef CONFIG_NUMA_BALANCING > SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); > #endif >@@ -1068,6 +1123,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, > > P(se.load.weight); > #ifdef CONFIG_SMP >+#ifdef CONFIG_SCHED_BORE >+ P(se.burst_load); >+ P(se.burst_score); >+#endif // CONFIG_SCHED_BORE > P(se.avg.load_sum); > P(se.avg.runnable_sum); > P(se.avg.util_sum); >diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >index 533547e3c9..429753c008 100644 >--- a/kernel/sched/fair.c >+++ b/kernel/sched/fair.c >@@ -19,6 +19,9 @@ > * > * Adaptive scheduling granularity, math enhancements by Peter Zijlstra > * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra >+ * >+ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler >+ * Copyright (C) 2021-2024 Masahito Suzuki <firelzrd@gmail.com> > */ > #include <linux/energy_model.h> > #include <linux/mmap_lock.h> >@@ -64,20 +67,129 @@ > * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) > * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus > * >- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) >+ * (BORE default SCHED_TUNABLESCALING_NONE = *1 constant) >+ * (EEVDF default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) > */ >+#ifdef CONFIG_SCHED_BORE >+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; >+#else // !CONFIG_SCHED_BORE > unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; >+#endif // CONFIG_SCHED_BORE > > /* > * Minimal preemption granularity for CPU-bound tasks: > * >- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) >+ * (BORE default: max(1 sec / HZ, min_base_slice) constant, units: nanoseconds) >+ * (EEVDF default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) > */ >+#ifdef CONFIG_SCHED_BORE >+unsigned int 
sysctl_sched_base_slice = 1000000000ULL / HZ; >+static unsigned int configured_sched_base_slice = 1000000000ULL / HZ; >+unsigned int sysctl_sched_min_base_slice = 2000000ULL; >+#else // !CONFIG_SCHED_BORE > unsigned int sysctl_sched_base_slice = 750000ULL; > static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; >+#endif // CONFIG_SCHED_BORE > > const_debug unsigned int sysctl_sched_migration_cost = 500000UL; > >+#ifdef CONFIG_SCHED_BORE >+u8 __read_mostly sched_bore = 1; >+u8 __read_mostly sched_burst_score_rounding = 0; >+u8 __read_mostly sched_burst_smoothness_long = 1; >+u8 __read_mostly sched_burst_smoothness_short = 0; >+u8 __read_mostly sched_burst_fork_atavistic = 2; >+u8 __read_mostly sched_burst_penalty_offset = 22; >+uint __read_mostly sched_burst_penalty_scale = 1280; >+uint __read_mostly sched_burst_cache_lifetime = 60000000; >+u8 __read_mostly sched_vlag_deviation_limit = 8; >+static int __maybe_unused thirty_two = 32; >+static int __maybe_unused sixty_four = 64; >+static int __maybe_unused maxval_12_bits = 4095; >+ >+#define MAX_BURST_PENALTY (39U <<2) >+ >+static inline u32 log2plus1_u64_u32f8(u64 v) { >+ u32 msb = fls64(v); >+ s32 excess_bits = msb - 9; >+ u8 fractional = (0 <= excess_bits)? v >> excess_bits: v << -excess_bits; >+ return msb << 8 | fractional; >+} >+ >+static inline u32 calc_burst_penalty(u64 burst_time) { >+ u32 greed, tolerance, penalty, scaled_penalty; >+ >+ greed = log2plus1_u64_u32f8(burst_time); >+ tolerance = sched_burst_penalty_offset << 8; >+ penalty = max(0, (s32)greed - (s32)tolerance); >+ scaled_penalty = penalty * sched_burst_penalty_scale >> 16; >+ >+ return min(MAX_BURST_PENALTY, scaled_penalty); >+} >+ >+static inline u64 scale_slice(u64 delta, struct sched_entity *se) { >+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->burst_score], 22); >+} >+ >+static inline u64 __unscale_slice(u64 delta, u8 score) { >+ return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10); >+} >+ >+static inline u64 unscale_slice(u64 delta, struct sched_entity *se) { >+ return __unscale_slice(delta, se->burst_score); >+} >+ >+static void avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se); >+static void avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se); >+ >+static void update_burst_score(struct sched_entity *se) { >+ struct cfs_rq *cfs_rq = cfs_rq_of(se); >+ u8 prev_score = se->burst_score; >+ u32 penalty = se->burst_penalty; >+ if (sched_burst_score_rounding) penalty += 0x2U; >+ se->burst_score = penalty >> 2; >+ >+ if ((se->burst_score != prev_score) && se->on_cfs_rq) { >+ avg_vruntime_sub(cfs_rq, se); >+ avg_vruntime_add(cfs_rq, se); >+ } >+} >+ >+static void update_burst_penalty(struct sched_entity *se) { >+ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); >+ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); >+ update_burst_score(se); >+} >+ >+static inline u32 binary_smooth(u32 new, u32 old) { >+ int increment = new - old; >+ return (0 <= increment)? 
>+ old + ( increment >> (int)sched_burst_smoothness_long): >+ old - (-increment >> (int)sched_burst_smoothness_short); >+} >+ >+static void restart_burst(struct sched_entity *se) { >+ se->burst_penalty = se->prev_burst_penalty = >+ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); >+ se->curr_burst_penalty = 0; >+ se->burst_time = 0; >+ update_burst_score(se); >+} >+ >+static void restart_burst_rescale_deadline(struct sched_entity *se) { >+ s64 vscaled, wremain, vremain = se->deadline - se->vruntime; >+ u8 prev_score = se->burst_score; >+ restart_burst(se); >+ if (prev_score > se->burst_score) { >+ wremain = __unscale_slice(abs(vremain), prev_score); >+ vscaled = scale_slice(wremain, se); >+ if (unlikely(vremain < 0)) >+ vscaled = -vscaled; >+ se->deadline = se->vruntime + vscaled; >+ } >+} >+#endif // CONFIG_SCHED_BORE >+ > int sched_thermal_decay_shift; > static int __init setup_sched_thermal_decay_shift(char *str) > { >@@ -137,6 +249,87 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; > > #ifdef CONFIG_SYSCTL > static struct ctl_table sched_fair_sysctls[] = { >+#ifdef CONFIG_SCHED_BORE >+ { >+ .procname = "sched_bore", >+ .data = &sched_bore, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_score_rounding", >+ .data = &sched_burst_score_rounding, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_smoothness_long", >+ .data = &sched_burst_smoothness_long, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_smoothness_short", >+ .data = &sched_burst_smoothness_short, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_ONE, >+ }, >+ { >+ .procname = "sched_burst_fork_atavistic", >+ .data = &sched_burst_fork_atavistic, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_THREE, >+ }, >+ { >+ .procname = "sched_burst_penalty_offset", >+ .data = &sched_burst_penalty_offset, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = &sixty_four, >+ }, >+ { >+ .procname = "sched_burst_penalty_scale", >+ .data = &sched_burst_penalty_scale, >+ .maxlen = sizeof(uint), >+ .mode = 0644, >+ .proc_handler = proc_douintvec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = &maxval_12_bits, >+ }, >+ { >+ .procname = "sched_burst_cache_lifetime", >+ .data = &sched_burst_cache_lifetime, >+ .maxlen = sizeof(uint), >+ .mode = 0644, >+ .proc_handler = proc_douintvec, >+ }, >+ { >+ .procname = "sched_vlag_deviation_limit", >+ .data = &sched_vlag_deviation_limit, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = &thirty_two, >+ }, >+#endif // CONFIG_SCHED_BORE > #ifdef CONFIG_CFS_BANDWIDTH > { > .procname = "sched_cfs_bandwidth_slice_us", >@@ -195,6 +388,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) > * > * This idea comes from the SD scheduler of Con Kolivas: > */ >+#ifdef CONFIG_SCHED_BORE >+static void update_sysctl(void) { >+ sysctl_sched_base_slice = >+ max(sysctl_sched_min_base_slice, 
configured_sched_base_slice); >+} >+void sched_update_min_base_slice(void) { update_sysctl(); } >+#else // !CONFIG_SCHED_BORE > static unsigned int get_update_sysctl_factor(void) > { > unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); >@@ -225,6 +425,7 @@ static void update_sysctl(void) > SET_SYSCTL(sched_base_slice); > #undef SET_SYSCTL > } >+#endif // CONFIG_SCHED_BORE > > void __init sched_init_granularity(void) > { >@@ -298,6 +499,9 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) > if (unlikely(se->load.weight != NICE_0_LOAD)) > delta = __calc_delta(delta, NICE_0_LOAD, &se->load); > >+#ifdef CONFIG_SCHED_BORE >+ if (likely(sched_bore)) delta = scale_slice(delta, se); >+#endif // CONFIG_SCHED_BORE > return delta; > } > >@@ -624,10 +828,26 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) > * > * As measured, the max (key * weight) value was ~44 bits for a kernel build. > */ >+#if !defined(CONFIG_SCHED_BORE) >+#define entity_weight(se) scale_load_down(se->load.weight) >+#else // CONFIG_SCHED_BORE >+static unsigned long entity_weight(struct sched_entity *se) { >+ unsigned long weight = se->load.weight; >+ if (likely(sched_bore)) weight = unscale_slice(weight, se); >+#ifdef CONFIG_64BIT >+ weight >>= SCHED_FIXEDPOINT_SHIFT - 3; >+#endif // CONFIG_64BIT >+ return weight; >+} >+#endif // CONFIG_SCHED_BORE >+ > static void > avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) > { >- unsigned long weight = scale_load_down(se->load.weight); >+ unsigned long weight = entity_weight(se); >+#ifdef CONFIG_SCHED_BORE >+ se->burst_load = weight; >+#endif // CONFIG_SCHED_BORE > s64 key = entity_key(cfs_rq, se); > > cfs_rq->avg_vruntime += key * weight; >@@ -637,7 +857,12 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) > static void > avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) > { >- unsigned long weight = scale_load_down(se->load.weight); >+#if !defined(CONFIG_SCHED_BORE) >+ unsigned long weight = entity_weight(se); >+#else // CONFIG_SCHED_BORE >+ unsigned long weight = se->burst_load; >+ se->burst_load = 0; >+#endif // CONFIG_SCHED_BORE > s64 key = entity_key(cfs_rq, se); > > cfs_rq->avg_vruntime -= key * weight; >@@ -657,14 +882,14 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) > * Specifically: avg_runtime() + 0 must result in entity_eligible() := true > * For this to be so, the result of this function must have a left bias. 
> */ >-u64 avg_vruntime(struct cfs_rq *cfs_rq) >+static u64 avg_key(struct cfs_rq *cfs_rq) > { > struct sched_entity *curr = cfs_rq->curr; > s64 avg = cfs_rq->avg_vruntime; > long load = cfs_rq->avg_load; > > if (curr && curr->on_rq) { >- unsigned long weight = scale_load_down(curr->load.weight); >+ unsigned long weight = entity_weight(curr); > > avg += entity_key(cfs_rq, curr) * weight; > load += weight; >@@ -674,12 +899,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) > /* sign flips effective floor / ceil */ > if (avg < 0) > avg -= (load - 1); >- avg = div_s64(avg, load); >+ avg = div64_s64(avg, load); > } > >- return cfs_rq->min_vruntime + avg; >+ return avg; > } > >+u64 avg_vruntime(struct cfs_rq *cfs_rq) { >+ return cfs_rq->min_vruntime + avg_key(cfs_rq); >+} > /* > * lag_i = S - s_i = w_i * (V - v_i) > * >@@ -704,6 +932,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) > lag = avg_vruntime(cfs_rq) - se->vruntime; > > limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); >+#ifdef CONFIG_SCHED_BORE >+ if (likely(sched_bore)) limit >>= 1; >+#endif // CONFIG_SCHED_BORE > se->vlag = clamp(lag, -limit, limit); > } > >@@ -731,7 +962,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) > long load = cfs_rq->avg_load; > > if (curr && curr->on_rq) { >- unsigned long weight = scale_load_down(curr->load.weight); >+ unsigned long weight = entity_weight(curr); > > avg += entity_key(cfs_rq, curr) * weight; > load += weight; >@@ -827,10 +1058,16 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) > se->min_vruntime = se->vruntime; > rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, > __entity_less, &min_vruntime_cb); >+#ifdef CONFIG_SCHED_BORE >+ se->on_cfs_rq = true; >+#endif // CONFIG_SCHED_BORE > } > > static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) > { >+#ifdef CONFIG_SCHED_BORE >+ se->on_cfs_rq = false; >+#endif // CONFIG_SCHED_BORE > rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, > &min_vruntime_cb); > avg_vruntime_sub(cfs_rq, se); >@@ -955,6 +1192,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) > * Scheduling class statistics methods: > */ > #ifdef CONFIG_SMP >+#if !defined(CONFIG_SCHED_BORE) > int sched_update_scaling(void) > { > unsigned int factor = get_update_sysctl_factor(); >@@ -966,6 +1204,7 @@ int sched_update_scaling(void) > > return 0; > } >+#endif // CONFIG_SCHED_BORE > #endif > #endif > >@@ -1165,7 +1404,13 @@ static void update_curr(struct cfs_rq *cfs_rq) > if (unlikely(delta_exec <= 0)) > return; > >+#ifdef CONFIG_SCHED_BORE >+ curr->burst_time += delta_exec; >+ update_burst_penalty(curr); >+ curr->vruntime += max(1ULL, calc_delta_fair(delta_exec, curr)); >+#else // !CONFIG_SCHED_BORE > curr->vruntime += calc_delta_fair(delta_exec, curr); >+#endif // CONFIG_SCHED_BORE > update_deadline(cfs_rq, curr); > update_min_vruntime(cfs_rq); > >@@ -5157,8 +5402,8 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} > static void > place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) > { >- u64 vslice, vruntime = avg_vruntime(cfs_rq); >- s64 lag = 0; >+ s64 lag = 0, key = avg_key(cfs_rq); >+ u64 vslice, vruntime = cfs_rq->min_vruntime + key; > > se->slice = sysctl_sched_base_slice; > vslice = calc_delta_fair(se->slice, se); >@@ -5171,6 +5416,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) > * > * EEVDF: placement strategy #1 / #2 > */ >+#ifdef 
CONFIG_SCHED_BORE >+ if (unlikely(!sched_bore) || se->vlag) >+#endif // CONFIG_SCHED_BORE > if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { > struct sched_entity *curr = cfs_rq->curr; > unsigned long load; >@@ -5231,12 +5479,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) > */ > load = cfs_rq->avg_load; > if (curr && curr->on_rq) >- load += scale_load_down(curr->load.weight); >+ load += entity_weight(curr); > >- lag *= load + scale_load_down(se->load.weight); >+ lag *= load + entity_weight(se); > if (WARN_ON_ONCE(!load)) > load = 1; >- lag = div_s64(lag, load); >+ lag = div64_s64(lag, load); >+#ifdef CONFIG_SCHED_BORE >+ if (likely(sched_bore)) { >+ s64 limit = vslice << sched_vlag_deviation_limit; >+ lag = clamp(lag, -limit, limit); >+ } >+#endif // CONFIG_SCHED_BORE > } > > se->vruntime = vruntime - lag; >@@ -6803,6 +7057,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) > bool was_sched_idle = sched_idle_rq(rq); > > util_est_dequeue(&rq->cfs, p); >+#ifdef CONFIG_SCHED_BORE >+ if (task_sleep) { >+ cfs_rq = cfs_rq_of(se); >+ if (cfs_rq->curr == se) >+ update_curr(cfs_rq); >+ restart_burst(se); >+ } >+#endif // CONFIG_SCHED_BORE > > for_each_sched_entity(se) { > cfs_rq = cfs_rq_of(se); >@@ -8552,16 +8814,25 @@ static void yield_task_fair(struct rq *rq) > /* > * Are we the only task in the tree? > */ >+#if !defined(CONFIG_SCHED_BORE) > if (unlikely(rq->nr_running == 1)) > return; > > clear_buddies(cfs_rq, se); >+#endif // CONFIG_SCHED_BORE > > update_rq_clock(rq); > /* > * Update run-time statistics of the 'current'. > */ > update_curr(cfs_rq); >+#ifdef CONFIG_SCHED_BORE >+ restart_burst_rescale_deadline(se); >+ if (unlikely(rq->nr_running == 1)) >+ return; >+ >+ clear_buddies(cfs_rq, se); >+#endif // CONFIG_SCHED_BORE > /* > * Tell update_rq_clock() that we've just updated, > * so we don't do microscopic update in schedule() >@@ -12651,6 +12922,9 @@ static void task_fork_fair(struct task_struct *p) > curr = cfs_rq->curr; > if (curr) > update_curr(cfs_rq); >+#ifdef CONFIG_SCHED_BORE >+ update_burst_score(se); >+#endif // CONFIG_SCHED_BORE > place_entity(cfs_rq, se, ENQUEUE_INITIAL); > rq_unlock(rq, &rf); > } >diff --git a/kernel/sched/features.h b/kernel/sched/features.h >index 143f55df89..3f0fe409f5 100644 >--- a/kernel/sched/features.h >+++ b/kernel/sched/features.h >@@ -6,7 +6,11 @@ > */ > SCHED_FEAT(PLACE_LAG, true) > SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) >+#ifdef CONFIG_SCHED_BORE >+SCHED_FEAT(RUN_TO_PARITY, false) >+#else // !CONFIG_SCHED_BORE > SCHED_FEAT(RUN_TO_PARITY, true) >+#endif // CONFIG_SCHED_BORE > > /* > * Prefer to schedule the task we woke last (assuming it failed >diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h >index 001fe047bd..da3ad1d4e1 100644 >--- a/kernel/sched/sched.h >+++ b/kernel/sched/sched.h >@@ -1965,7 +1965,11 @@ static inline void dirty_sched_domain_sysctl(int cpu) > } > #endif > >+#ifdef CONFIG_SCHED_BORE >+extern void sched_update_min_base_slice(void); >+#else // !CONFIG_SCHED_BORE > extern int sched_update_scaling(void); >+#endif // CONFIG_SCHED_BORE > > static inline const struct cpumask *task_user_cpus(struct task_struct *p) > { >@@ -2552,6 +2556,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; > extern const_debug unsigned int sysctl_sched_migration_cost; > > extern unsigned int sysctl_sched_base_slice; >+#ifdef CONFIG_SCHED_BORE >+extern unsigned int sysctl_sched_min_base_slice; >+#endif // CONFIG_SCHED_BORE > > #ifdef CONFIG_SCHED_DEBUG > extern int 
sysctl_resched_latency_warn_ms; >-- >2.34.1 > >diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt >index 31b3a25680d08..b77aa4b1674d5 100644 >--- a/Documentation/admin-guide/kernel-parameters.txt >+++ b/Documentation/admin-guide/kernel-parameters.txt >@@ -6487,6 +6487,18 @@ > Force threading of all interrupt handlers except those > marked explicitly IRQF_NO_THREAD. > >+ threadprintk [KNL] >+ Force threaded printing of all legacy consoles. Be >+ aware that with this option, the shutdown, reboot, and >+ panic messages may not be printed on the legacy >+ consoles. Also, earlycon/earlyprintk printing will be >+ delayed until a regular console or the kthread is >+ available. >+ >+ Users can view /proc/consoles to see if their console >+ driver is legacy or not. Non-legacy (NBCON) console >+ drivers are already threaded and are shown with 'N'. >+ > topology= [S390] > Format: {off | on} > Specify if the kernel should make use of the cpu >diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig >index 0af6709570d14..25424a7468d95 100644 >--- a/arch/arm/Kconfig >+++ b/arch/arm/Kconfig >@@ -36,6 +36,7 @@ config ARM > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF > select ARCH_USE_MEMTEST >@@ -75,7 +76,7 @@ config ARM > select HAS_IOPORT > select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT > select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 >- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU >+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT > select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL > select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU > select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL >@@ -98,7 +99,7 @@ config ARM > select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE > select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU > select HAVE_EXIT_THREAD >- select HAVE_FAST_GUP if ARM_LPAE >+ select HAVE_FAST_GUP if ARM_LPAE && !(PREEMPT_RT && HIGHPTE) > select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL > select HAVE_FUNCTION_ERROR_INJECTION > select HAVE_FUNCTION_GRAPH_TRACER >@@ -120,6 +121,7 @@ config ARM > select HAVE_PERF_EVENTS > select HAVE_PERF_REGS > select HAVE_PERF_USER_STACK_DUMP >+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM > select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RSEQ >diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c >index 07565b593ed68..3761c1e995cf6 100644 >--- a/arch/arm/mm/fault.c >+++ b/arch/arm/mm/fault.c >@@ -436,6 +436,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, > if (addr < TASK_SIZE) > return do_page_fault(addr, fsr, regs); > >+ if (interrupts_enabled(regs)) >+ local_irq_enable(); >+ > if (user_mode(regs)) > goto bad_area; > >@@ -506,6 +509,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, > static int > do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) > { >+ if (interrupts_enabled(regs)) >+ local_irq_enable(); >+ > do_bad_area(addr, fsr, regs); > return 0; > } >diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c >index b68efe643a12c..48745a3c52618 100644 >--- a/arch/arm/vfp/vfpmodule.c >+++ b/arch/arm/vfp/vfpmodule.c >@@ -55,6 +55,34 @@ extern unsigned int VFP_arch_feroceon 
__alias(VFP_arch); > */ > union vfp_state *vfp_current_hw_state[NR_CPUS]; > >+/* >+ * Claim ownership of the VFP unit. >+ * >+ * The caller may change VFP registers until vfp_unlock() is called. >+ * >+ * local_bh_disable() is used to disable preemption and to disable VFP >+ * processing in softirq context. On PREEMPT_RT kernels local_bh_disable() is >+ * not sufficient because it only serializes soft interrupt related sections >+ * via a local lock, but stays preemptible. Disabling preemption is the right >+ * choice here as bottom half processing is always in thread context on RT >+ * kernels so it implicitly prevents bottom half processing as well. >+ */ >+static void vfp_lock(void) >+{ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_bh_disable(); >+ else >+ preempt_disable(); >+} >+ >+static void vfp_unlock(void) >+{ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_bh_enable(); >+ else >+ preempt_enable(); >+} >+ > /* > * Is 'thread's most up to date state stored in this CPUs hardware? > * Must be called from non-preemptible context. >@@ -240,7 +268,7 @@ static void vfp_panic(char *reason, u32 inst) > /* > * Process bitmask of exception conditions. > */ >-static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_regs *regs) >+static int vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr) > { > int si_code = 0; > >@@ -248,8 +276,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ > > if (exceptions == VFP_EXCEPTION_ERROR) { > vfp_panic("unhandled bounce", inst); >- vfp_raise_sigfpe(FPE_FLTINV, regs); >- return; >+ return FPE_FLTINV; > } > > /* >@@ -277,8 +304,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ > RAISE(FPSCR_OFC, FPSCR_OFE, FPE_FLTOVF); > RAISE(FPSCR_IOC, FPSCR_IOE, FPE_FLTINV); > >- if (si_code) >- vfp_raise_sigfpe(si_code, regs); >+ return si_code; > } > > /* >@@ -324,6 +350,8 @@ static u32 vfp_emulate_instruction(u32 inst, u32 fpscr, struct pt_regs *regs) > static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > { > u32 fpscr, orig_fpscr, fpsid, exceptions; >+ int si_code2 = 0; >+ int si_code = 0; > > pr_debug("VFP: bounce: trigger %08x fpexc %08x\n", trigger, fpexc); > >@@ -369,8 +397,8 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > * unallocated VFP instruction but with FPSCR.IXE set and not > * on VFP subarch 1. > */ >- vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr, regs); >- return; >+ si_code = vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr); >+ goto exit; > } > > /* >@@ -394,14 +422,14 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > */ > exceptions = vfp_emulate_instruction(trigger, fpscr, regs); > if (exceptions) >- vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); >+ si_code2 = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); > > /* > * If there isn't a second FP instruction, exit now. Note that > * the FPEXC.FP2V bit is valid only if FPEXC.EX is 1. 
> */ > if ((fpexc & (FPEXC_EX | FPEXC_FP2V)) != (FPEXC_EX | FPEXC_FP2V)) >- return; >+ goto exit; > > /* > * The barrier() here prevents fpinst2 being read >@@ -413,7 +441,13 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > emulate: > exceptions = vfp_emulate_instruction(trigger, orig_fpscr, regs); > if (exceptions) >- vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); >+ si_code = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); >+exit: >+ vfp_unlock(); >+ if (si_code2) >+ vfp_raise_sigfpe(si_code2, regs); >+ if (si_code) >+ vfp_raise_sigfpe(si_code, regs); > } > > static void vfp_enable(void *unused) >@@ -512,11 +546,9 @@ static inline void vfp_pm_init(void) { } > */ > void vfp_sync_hwstate(struct thread_info *thread) > { >- unsigned int cpu = get_cpu(); >+ vfp_lock(); > >- local_bh_disable(); >- >- if (vfp_state_in_hw(cpu, thread)) { >+ if (vfp_state_in_hw(raw_smp_processor_id(), thread)) { > u32 fpexc = fmrx(FPEXC); > > /* >@@ -527,8 +559,7 @@ void vfp_sync_hwstate(struct thread_info *thread) > fmxr(FPEXC, fpexc); > } > >- local_bh_enable(); >- put_cpu(); >+ vfp_unlock(); > } > > /* Ensure that the thread reloads the hardware VFP state on the next use. */ >@@ -683,7 +714,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > if (!user_mode(regs)) > return vfp_kmode_exception(regs, trigger); > >- local_bh_disable(); >+ vfp_lock(); > fpexc = fmrx(FPEXC); > > /* >@@ -748,6 +779,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > * replay the instruction that trapped. > */ > fmxr(FPEXC, fpexc); >+ vfp_unlock(); > } else { > /* Check for synchronous or asynchronous exceptions */ > if (!(fpexc & (FPEXC_EX | FPEXC_DEX))) { >@@ -762,17 +794,17 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > if (!(fpscr & FPSCR_IXE)) { > if (!(fpscr & FPSCR_LENGTH_MASK)) { > pr_debug("not VFP\n"); >- local_bh_enable(); >+ vfp_unlock(); > return -ENOEXEC; > } > fpexc |= FPEXC_DEX; > } > } > bounce: regs->ARM_pc += 4; >+ /* VFP_bounce() will invoke vfp_unlock() */ > VFP_bounce(trigger, fpexc, regs); > } > >- local_bh_enable(); > return 0; > } > >@@ -837,7 +869,7 @@ void kernel_neon_begin(void) > unsigned int cpu; > u32 fpexc; > >- local_bh_disable(); >+ vfp_lock(); > > /* > * Kernel mode NEON is only allowed outside of hardirq context with >@@ -868,7 +900,7 @@ void kernel_neon_end(void) > { > /* Disable the NEON/VFP unit. 
*/ > fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN); >- local_bh_enable(); >+ vfp_unlock(); > } > EXPORT_SYMBOL(kernel_neon_end); > >diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig >index aa7c1d4351396..fc970ce1f3cfa 100644 >--- a/arch/arm64/Kconfig >+++ b/arch/arm64/Kconfig >@@ -98,6 +98,7 @@ config ARM64 > select ARCH_SUPPORTS_NUMA_BALANCING > select ARCH_SUPPORTS_PAGE_TABLE_CHECK > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT > select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH > select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT > select ARCH_WANT_DEFAULT_BPF_JIT >diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig >index b9fc064d38d28..e8651e304c888 100644 >--- a/arch/powerpc/Kconfig >+++ b/arch/powerpc/Kconfig >@@ -166,6 +166,7 @@ config PPC > select ARCH_STACKWALK > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x >+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF if PPC64 > select ARCH_USE_MEMTEST >@@ -270,6 +271,7 @@ config PPC > select HAVE_PERF_USER_STACK_DUMP > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RELIABLE_STACKTRACE >+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM > select HAVE_RSEQ > select HAVE_SETUP_PER_CPU_AREA if PPC64 > select HAVE_SOFTIRQ_ON_OWN_STACK >diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h >index 283c346478565..4727f40052ddd 100644 >--- a/arch/powerpc/include/asm/stackprotector.h >+++ b/arch/powerpc/include/asm/stackprotector.h >@@ -19,8 +19,13 @@ > */ > static __always_inline void boot_init_stack_canary(void) > { >- unsigned long canary = get_random_canary(); >+ unsigned long canary; > >+#ifndef CONFIG_PREEMPT_RT >+ canary = get_random_canary(); >+#else >+ canary = ((unsigned long)&canary) & CANARY_MASK; >+#endif > current->stack_canary = canary; > #ifdef CONFIG_PPC64 > get_paca()->canary = canary; >diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c >index 11e062b47d3f8..f7e22276c97b0 100644 >--- a/arch/powerpc/kernel/traps.c >+++ b/arch/powerpc/kernel/traps.c >@@ -261,12 +261,17 @@ static char *get_mmu_str(void) > > static int __die(const char *str, struct pt_regs *regs, long err) > { >+ const char *pr = ""; >+ > printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); > >+ if (IS_ENABLED(CONFIG_PREEMPTION)) >+ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; >+ > printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", > IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", > PAGE_SIZE / 1024, get_mmu_str(), >- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", >+ pr, > IS_ENABLED(CONFIG_SMP) ? " SMP" : "", > IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", > debug_pagealloc_enabled() ? 
" DEBUG_PAGEALLOC" : "", >diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig >index 074263429faf2..96ab63d035e5c 100644 >--- a/arch/powerpc/kvm/Kconfig >+++ b/arch/powerpc/kvm/Kconfig >@@ -222,6 +222,7 @@ config KVM_E500MC > config KVM_MPIC > bool "KVM in-kernel MPIC emulation" > depends on KVM && PPC_E500 >+ depends on !PREEMPT_RT > select HAVE_KVM_IRQCHIP > select HAVE_KVM_IRQ_ROUTING > select HAVE_KVM_MSI >diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig >index afc0f6a613372..dc3f63c2687d4 100644 >--- a/arch/powerpc/platforms/pseries/Kconfig >+++ b/arch/powerpc/platforms/pseries/Kconfig >@@ -2,6 +2,7 @@ > config PPC_PSERIES > depends on PPC64 && PPC_BOOK3S > bool "IBM pSeries & new (POWER5-based) iSeries" >+ select GENERIC_ALLOCATOR > select HAVE_PCSPKR_PLATFORM > select MPIC > select OF_DYNAMIC >diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c >index e8c4129697b14..c61e29deac8df 100644 >--- a/arch/powerpc/platforms/pseries/iommu.c >+++ b/arch/powerpc/platforms/pseries/iommu.c >@@ -25,6 +25,7 @@ > #include <linux/of_address.h> > #include <linux/iommu.h> > #include <linux/rculist.h> >+#include <linux/local_lock.h> > #include <asm/io.h> > #include <asm/prom.h> > #include <asm/rtas.h> >@@ -206,7 +207,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, > return ret; > } > >-static DEFINE_PER_CPU(__be64 *, tce_page); >+struct tce_page { >+ __be64 * page; >+ local_lock_t lock; >+}; >+static DEFINE_PER_CPU(struct tce_page, tce_page) = { >+ .lock = INIT_LOCAL_LOCK(lock), >+}; > > static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > long npages, unsigned long uaddr, >@@ -229,9 +236,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > direction, attrs); > } > >- local_irq_save(flags); /* to protect tcep and the page behind it */ >+ /* to protect tcep and the page behind it */ >+ local_lock_irqsave(&tce_page.lock, flags); > >- tcep = __this_cpu_read(tce_page); >+ tcep = __this_cpu_read(tce_page.page); > > /* This is safe to do since interrupts are off when we're called > * from iommu_alloc{,_sg}() >@@ -240,12 +248,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > tcep = (__be64 *)__get_free_page(GFP_ATOMIC); > /* If allocation fails, fall back to the loop implementation */ > if (!tcep) { >- local_irq_restore(flags); >+ local_unlock_irqrestore(&tce_page.lock, flags); > return tce_build_pSeriesLP(tbl->it_index, tcenum, > tceshift, > npages, uaddr, direction, attrs); > } >- __this_cpu_write(tce_page, tcep); >+ __this_cpu_write(tce_page.page, tcep); > } > > rpn = __pa(uaddr) >> tceshift; >@@ -275,7 +283,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > tcenum += limit; > } while (npages > 0 && !rc); > >- local_irq_restore(flags); >+ local_unlock_irqrestore(&tce_page.lock, flags); > > if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { > ret = (int)rc; >@@ -459,16 +467,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, > DMA_BIDIRECTIONAL, 0); > } > >- local_irq_disable(); /* to protect tcep and the page behind it */ >- tcep = __this_cpu_read(tce_page); >+ /* to protect tcep and the page behind it */ >+ local_lock_irq(&tce_page.lock); >+ tcep = __this_cpu_read(tce_page.page); > > if (!tcep) { > tcep = (__be64 *)__get_free_page(GFP_ATOMIC); > if (!tcep) { >- local_irq_enable(); >+ local_unlock_irq(&tce_page.lock); > return -ENOMEM; > } >- 
__this_cpu_write(tce_page, tcep); >+ __this_cpu_write(tce_page.page, tcep); > } > > proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; >@@ -511,7 +520,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, > > /* error cleanup: caller will clear whole range */ > >- local_irq_enable(); >+ local_unlock_irq(&tce_page.lock); > return rc; > } > >diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >index e3142ce531a09..32c5db19cf899 100644 >--- a/arch/riscv/Kconfig >+++ b/arch/riscv/Kconfig >@@ -49,6 +49,7 @@ config RISCV > select ARCH_SUPPORTS_HUGETLBFS if MMU > select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU > select ARCH_SUPPORTS_PER_VMA_LOCK if MMU >+ select ARCH_SUPPORTS_RT > select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK > select ARCH_USE_MEMTEST > select ARCH_USE_QUEUED_RWLOCKS >@@ -142,6 +143,7 @@ config RISCV > select HAVE_PERF_USER_STACK_DUMP > select HAVE_POSIX_CPU_TIMERS_TASK_WORK > select HAVE_PREEMPT_DYNAMIC_KEY if !XIP_KERNEL >+ select HAVE_PREEMPT_AUTO > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RETHOOK if !XIP_KERNEL > select HAVE_RSEQ >diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h >index 5d473343634b9..23b136286e927 100644 >--- a/arch/riscv/include/asm/thread_info.h >+++ b/arch/riscv/include/asm/thread_info.h >@@ -94,6 +94,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); > * - pending work-to-be-done flags are in lowest half-word > * - other flags in upper half-word(s) > */ >+#define TIF_ARCH_RESCHED_LAZY 0 /* Lazy rescheduling */ > #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ > #define TIF_SIGPENDING 2 /* signal pending */ > #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ >@@ -104,6 +105,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); > #define TIF_32BIT 11 /* compat-mode 32bit process */ > #define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ > >+#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) > #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) > #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) > #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) >diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig >index 5edec175b9bfc..b5c4f365af372 100644 >--- a/arch/x86/Kconfig >+++ b/arch/x86/Kconfig >@@ -28,6 +28,7 @@ config X86_64 > select ARCH_HAS_GIGANTIC_PAGE > select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT > select HAVE_ARCH_SOFT_DIRTY > select MODULES_USE_ELF_RELA > select NEED_DMA_MAP_STATE >@@ -119,6 +120,7 @@ config X86 > select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG > select ARCH_SUPPORTS_LTO_CLANG > select ARCH_SUPPORTS_LTO_CLANG_THIN >+ select ARCH_SUPPORTS_RT > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 > select ARCH_USE_MEMTEST >@@ -275,6 +277,7 @@ config X86 > select HAVE_STATIC_CALL > select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL > select HAVE_PREEMPT_DYNAMIC_CALL >+ select HAVE_PREEMPT_AUTO > select HAVE_RSEQ > select HAVE_RUST if X86_64 > select HAVE_SYSCALL_TRACEPOINTS >diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h >index d63b02940747f..1ff38ebbd5880 100644 >--- a/arch/x86/include/asm/thread_info.h >+++ b/arch/x86/include/asm/thread_info.h >@@ -81,8 +81,9 @@ struct thread_info { > #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ > #define TIF_SIGPENDING 2 /* signal pending */ > #define TIF_NEED_RESCHED 3 /* 
rescheduling necessary */ >-#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ >-#define TIF_SSBD 5 /* Speculative store bypass disable */ >+#define TIF_ARCH_RESCHED_LAZY 4 /* Lazy rescheduling */ >+#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ >+#define TIF_SSBD 6 /* Speculative store bypass disable */ > #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ > #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ > #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ >@@ -104,6 +105,7 @@ struct thread_info { > #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) > #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) > #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) >+#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) > #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) > #define _TIF_SSBD (1 << TIF_SSBD) > #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) >diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c >index 55437f5e0c3ae..7fc47007b9263 100644 >--- a/drivers/acpi/processor_idle.c >+++ b/drivers/acpi/processor_idle.c >@@ -108,7 +108,7 @@ static const struct dmi_system_id processor_power_dmi_table[] = { > */ > static void __cpuidle acpi_safe_halt(void) > { >- if (!tif_need_resched()) { >+ if (!need_resched()) { > raw_safe_halt(); > raw_local_irq_disable(); > } >diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c >index 6772e0c654fa7..119007f17e13e 100644 >--- a/drivers/block/zram/zram_drv.c >+++ b/drivers/block/zram/zram_drv.c >@@ -57,6 +57,41 @@ static void zram_free_page(struct zram *zram, size_t index); > static int zram_read_page(struct zram *zram, struct page *page, u32 index, > struct bio *parent); > >+#ifdef CONFIG_PREEMPT_RT >+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) >+{ >+ size_t index; >+ >+ for (index = 0; index < num_pages; index++) >+ spin_lock_init(&zram->table[index].lock); >+} >+ >+static int zram_slot_trylock(struct zram *zram, u32 index) >+{ >+ int ret; >+ >+ ret = spin_trylock(&zram->table[index].lock); >+ if (ret) >+ __set_bit(ZRAM_LOCK, &zram->table[index].flags); >+ return ret; >+} >+ >+static void zram_slot_lock(struct zram *zram, u32 index) >+{ >+ spin_lock(&zram->table[index].lock); >+ __set_bit(ZRAM_LOCK, &zram->table[index].flags); >+} >+ >+static void zram_slot_unlock(struct zram *zram, u32 index) >+{ >+ __clear_bit(ZRAM_LOCK, &zram->table[index].flags); >+ spin_unlock(&zram->table[index].lock); >+} >+ >+#else >+ >+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } >+ > static int zram_slot_trylock(struct zram *zram, u32 index) > { > return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); >@@ -71,6 +106,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) > { > bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); > } >+#endif > > static inline bool init_done(struct zram *zram) > { >@@ -1241,6 +1277,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) > > if (!huge_class_size) > huge_class_size = zs_huge_class_size(zram->mem_pool); >+ zram_meta_init_table_locks(zram, num_pages); > return true; > } > >diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h >index 3b94d12f41b40..dfc364b0d0727 100644 >--- a/drivers/block/zram/zram_drv.h >+++ b/drivers/block/zram/zram_drv.h >@@ -69,6 +69,9 @@ struct zram_table_entry { > unsigned long element; > }; > unsigned long flags; >+#ifdef CONFIG_PREEMPT_RT >+ spinlock_t lock; >+#endif > 
#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME > ktime_t ac_time; > #endif >diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig >index 3089029abba48..2d5828c5d3596 100644 >--- a/drivers/gpu/drm/i915/Kconfig >+++ b/drivers/gpu/drm/i915/Kconfig >@@ -3,7 +3,6 @@ config DRM_I915 > tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" > depends on DRM > depends on X86 && PCI >- depends on !PREEMPT_RT > select INTEL_GTT if X86 > select INTERVAL_TREE > # we need shmfs for the swappable backing store, and in particular >diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c >index 8a84a31c7b48a..73a561af13d16 100644 >--- a/drivers/gpu/drm/i915/display/intel_crtc.c >+++ b/drivers/gpu/drm/i915/display/intel_crtc.c >@@ -580,7 +580,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > */ > intel_psr_wait_for_idle_locked(new_crtc_state); > >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > > crtc->debug.min_vbl = min; > crtc->debug.max_vbl = max; >@@ -605,11 +606,13 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > break; > } > >- local_irq_enable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_enable(); > > timeout = schedule_timeout(timeout); > >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > } > > finish_wait(wq, &wait); >@@ -642,7 +645,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > return; > > irq_disable: >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > } > > #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) >@@ -744,7 +748,8 @@ void intel_pipe_update_end(struct intel_atomic_state *state, > */ > intel_vrr_send_push(new_crtc_state); > >- local_irq_enable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_enable(); > > if (intel_vgpu_active(dev_priv)) > goto out; >diff --git a/drivers/gpu/drm/i915/display/intel_vblank.c b/drivers/gpu/drm/i915/display/intel_vblank.c >index fe256bf7b485b..a3c3faa8f305a 100644 >--- a/drivers/gpu/drm/i915/display/intel_vblank.c >+++ b/drivers/gpu/drm/i915/display/intel_vblank.c >@@ -275,6 +275,26 @@ int intel_crtc_scanline_to_hw(struct intel_crtc *crtc, int scanline) > * all register accesses to the same cacheline to be serialized, > * otherwise they may hang. > */ >+static void intel_vblank_section_enter_irqsave(struct drm_i915_private *i915, unsigned long *flags) >+ __acquires(i915->uncore.lock) >+{ >+#ifdef I915 >+ spin_lock_irqsave(&i915->uncore.lock, *flags); >+#else >+ *flags = 0; >+#endif >+} >+ >+static void intel_vblank_section_exit_irqrestore(struct drm_i915_private *i915, unsigned long flags) >+ __releases(i915->uncore.lock) >+{ >+#ifdef I915 >+ spin_unlock_irqrestore(&i915->uncore.lock, flags); >+#else >+ if (flags) >+ return; >+#endif >+} > static void intel_vblank_section_enter(struct drm_i915_private *i915) > __acquires(i915->uncore.lock) > { >@@ -332,10 +352,10 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, > * timing critical raw register reads, potentially with > * preemption disabled, so the following code must not block. > */ >- local_irq_save(irqflags); >- intel_vblank_section_enter(dev_priv); >+ intel_vblank_section_enter_irqsave(dev_priv, &irqflags); > >- /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) >+ preempt_disable(); > > /* Get optional system timestamp before query. 
*/ > if (stime) >@@ -399,10 +419,10 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, > if (etime) > *etime = ktime_get(); > >- /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) >+ preempt_enable(); > >- intel_vblank_section_exit(dev_priv); >- local_irq_restore(irqflags); >+ intel_vblank_section_exit_irqrestore(dev_priv, irqflags); > > /* > * While in vblank, position will be negative >@@ -440,13 +460,11 @@ int intel_get_crtc_scanline(struct intel_crtc *crtc) > unsigned long irqflags; > int position; > >- local_irq_save(irqflags); >- intel_vblank_section_enter(dev_priv); >+ intel_vblank_section_enter_irqsave(dev_priv, &irqflags); > > position = __intel_get_crtc_scanline(crtc); > >- intel_vblank_section_exit(dev_priv); >- local_irq_restore(irqflags); >+ intel_vblank_section_exit_irqrestore(dev_priv, irqflags); > > return position; > } >diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >index d650beb8ed22f..3dd3e516b80c1 100644 >--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >@@ -317,10 +317,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) > /* Kick the work once more to drain the signalers, and disarm the irq */ > irq_work_sync(&b->irq_work); > while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { >- local_irq_disable(); >- signal_irq_work(&b->irq_work); >- local_irq_enable(); >+ irq_work_queue(&b->irq_work); > cond_resched(); >+ irq_work_sync(&b->irq_work); > } > } > >diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >index 42aade0faf2d1..929ca2bad2d2c 100644 >--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >@@ -1303,7 +1303,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > * and context switches) submission. > */ > >- spin_lock(&sched_engine->lock); >+ spin_lock_irq(&sched_engine->lock); > > /* > * If the queue is higher priority than the last >@@ -1403,7 +1403,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > * Even if ELSP[1] is occupied and not worthy > * of timeslices, our queue might be. > */ >- spin_unlock(&sched_engine->lock); >+ spin_unlock_irq(&sched_engine->lock); > return; > } > } >@@ -1429,7 +1429,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > > if (last && !can_merge_rq(last, rq)) { > spin_unlock(&ve->base.sched_engine->lock); >- spin_unlock(&engine->sched_engine->lock); >+ spin_unlock_irq(&engine->sched_engine->lock); > return; /* leave this for another sibling */ > } > >@@ -1591,7 +1591,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > */ > sched_engine->queue_priority_hint = queue_prio(sched_engine); > i915_sched_engine_reset_on_empty(sched_engine); >- spin_unlock(&sched_engine->lock); >+ spin_unlock_irq(&sched_engine->lock); > > /* > * We can skip poking the HW if we ended up with exactly the same set >@@ -1617,13 +1617,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > } > } > >-static void execlists_dequeue_irq(struct intel_engine_cs *engine) >-{ >- local_irq_disable(); /* Suspend interrupts across request submission */ >- execlists_dequeue(engine); >- local_irq_enable(); /* flush irq_work (e.g. 
breadcrumb enabling) */ >-} >- > static void clear_ports(struct i915_request **ports, int count) > { > memset_p((void **)ports, NULL, count); >@@ -2478,7 +2471,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) > } > > if (!engine->execlists.pending[0]) { >- execlists_dequeue_irq(engine); >+ execlists_dequeue(engine); > start_timeslice(engine); > } > >diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h >index 813cc888e6fae..ab3483a59b79a 100644 >--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h >+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h >@@ -362,7 +362,7 @@ static inline int intel_guc_send_busy_loop(struct intel_guc *guc, > { > int err; > unsigned int sleep_period_ms = 1; >- bool not_atomic = !in_atomic() && !irqs_disabled(); >+ bool not_atomic = !in_atomic() && !irqs_disabled() && !rcu_preempt_depth(); > > /* > * FIXME: Have caller pass in if we are in an atomic context to avoid >diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c >index f59081066a197..014d02029a415 100644 >--- a/drivers/gpu/drm/i915/i915_request.c >+++ b/drivers/gpu/drm/i915/i915_request.c >@@ -609,7 +609,6 @@ bool __i915_request_submit(struct i915_request *request) > > RQ_TRACE(request, "\n"); > >- GEM_BUG_ON(!irqs_disabled()); > lockdep_assert_held(&engine->sched_engine->lock); > > /* >@@ -718,7 +717,6 @@ void __i915_request_unsubmit(struct i915_request *request) > */ > RQ_TRACE(request, "\n"); > >- GEM_BUG_ON(!irqs_disabled()); > lockdep_assert_held(&engine->sched_engine->lock); > > /* >diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h >index ce1cbee1b39dd..3c51620d011b1 100644 >--- a/drivers/gpu/drm/i915/i915_trace.h >+++ b/drivers/gpu/drm/i915/i915_trace.h >@@ -6,6 +6,10 @@ > #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) > #define _I915_TRACE_H_ > >+#ifdef CONFIG_PREEMPT_RT >+#define NOTRACE >+#endif >+ > #include <linux/stringify.h> > #include <linux/types.h> > #include <linux/tracepoint.h> >@@ -322,7 +326,7 @@ DEFINE_EVENT(i915_request, i915_request_add, > TP_ARGS(rq) > ); > >-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) >+#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) > DEFINE_EVENT(i915_request, i915_request_guc_submit, > TP_PROTO(struct i915_request *rq), > TP_ARGS(rq) >diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h >index f98577967b7fc..6cc358aa5b2ff 100644 >--- a/drivers/gpu/drm/i915/i915_utils.h >+++ b/drivers/gpu/drm/i915/i915_utils.h >@@ -288,7 +288,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) > #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) > > /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. 
*/ >-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) >+#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) > # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) > #else > # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) >diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c >index b62ad9006780c..ae637155fe7cd 100644 >--- a/drivers/tty/serial/8250/8250_core.c >+++ b/drivers/tty/serial/8250/8250_core.c >@@ -592,6 +592,7 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) > > #ifdef CONFIG_SERIAL_8250_CONSOLE > >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > static void univ8250_console_write(struct console *co, const char *s, > unsigned int count) > { >@@ -599,6 +600,37 @@ static void univ8250_console_write(struct console *co, const char *s, > > serial8250_console_write(up, s, count); > } >+#else >+static bool univ8250_console_write_atomic(struct console *co, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_port *up = &serial8250_ports[co->index]; >+ >+ return serial8250_console_write_atomic(up, wctxt); >+} >+ >+static bool univ8250_console_write_thread(struct console *co, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_port *up = &serial8250_ports[co->index]; >+ >+ return serial8250_console_write_thread(up, wctxt); >+} >+ >+static void univ8250_console_driver_enter(struct console *con, unsigned long *flags) >+{ >+ struct uart_port *up = &serial8250_ports[con->index].port; >+ >+ __uart_port_lock_irqsave(up, flags); >+} >+ >+static void univ8250_console_driver_exit(struct console *con, unsigned long flags) >+{ >+ struct uart_port *up = &serial8250_ports[con->index].port; >+ >+ __uart_port_unlock_irqrestore(up, flags); >+} >+#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ > > static int univ8250_console_setup(struct console *co, char *options) > { >@@ -698,12 +730,20 @@ static int univ8250_console_match(struct console *co, char *name, int idx, > > static struct console univ8250_console = { > .name = "ttyS", >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > .write = univ8250_console_write, >+ .flags = CON_PRINTBUFFER | CON_ANYTIME, >+#else >+ .write_atomic = univ8250_console_write_atomic, >+ .write_thread = univ8250_console_write_thread, >+ .driver_enter = univ8250_console_driver_enter, >+ .driver_exit = univ8250_console_driver_exit, >+ .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_NBCON, >+#endif > .device = uart_console_device, > .setup = univ8250_console_setup, > .exit = univ8250_console_exit, > .match = univ8250_console_match, >- .flags = CON_PRINTBUFFER | CON_ANYTIME, > .index = -1, > .data = &serial8250_reg, > }; >diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c >index 8ca061d3bbb92..f799c34f1603c 100644 >--- a/drivers/tty/serial/8250/8250_port.c >+++ b/drivers/tty/serial/8250/8250_port.c >@@ -550,6 +550,11 @@ static int serial8250_em485_init(struct uart_8250_port *p) > if (!p->em485) > return -ENOMEM; > >+#ifndef CONFIG_SERIAL_8250_LEGACY_CONSOLE >+ if (uart_console(&p->port)) >+ dev_warn(p->port.dev, "no atomic printing for rs485 consoles\n"); >+#endif >+ > hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, > HRTIMER_MODE_REL); > hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, >@@ -702,7 +707,11 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) > serial8250_rpm_put(p); > } > >-static void serial8250_clear_IER(struct uart_8250_port *up) >+/* 
>+ * Only to be used by write_atomic() and the legacy write(), which do not >+ * require port lock. >+ */ >+static void __serial8250_clear_IER(struct uart_8250_port *up) > { > if (up->capabilities & UART_CAP_UUE) > serial_out(up, UART_IER, UART_IER_UUE); >@@ -710,6 +719,14 @@ static void serial8250_clear_IER(struct uart_8250_port *up) > serial_out(up, UART_IER, 0); > } > >+static inline void serial8250_clear_IER(struct uart_8250_port *up) >+{ >+ /* Port locked to synchronize UART_IER access against the console. */ >+ lockdep_assert_held_once(&up->port.lock); >+ >+ __serial8250_clear_IER(up); >+} >+ > #ifdef CONFIG_SERIAL_8250_RSA > /* > * Attempts to turn on the RSA FIFO. Returns zero on failure. >@@ -3276,6 +3293,7 @@ void serial8250_init_port(struct uart_8250_port *up) > struct uart_port *port = &up->port; > > spin_lock_init(&port->lock); >+ port->nbcon_locked_port = false; > port->ctrl_id = 0; > port->pm = NULL; > port->ops = &serial8250_pops; >@@ -3320,6 +3338,11 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) > > wait_for_xmitr(up, UART_LSR_THRE); > serial_port_out(port, UART_TX, ch); >+ >+ if (ch == '\n') >+ up->console_newline_needed = false; >+ else >+ up->console_newline_needed = true; > } > > /* >@@ -3348,6 +3371,7 @@ static void serial8250_console_restore(struct uart_8250_port *up) > serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); > } > >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > /* > * Print a string to the serial port using the device FIFO > * >@@ -3406,7 +3430,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, > * First save the IER then disable the interrupts > */ > ier = serial_port_in(port, UART_IER); >- serial8250_clear_IER(up); >+ __serial8250_clear_IER(up); > > /* check scratch reg to see if port powered off during system sleep */ > if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >@@ -3472,6 +3496,135 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, > if (locked) > uart_port_unlock_irqrestore(port, flags); > } >+#else >+bool serial8250_console_write_thread(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_em485 *em485 = up->em485; >+ struct uart_port *port = &up->port; >+ bool done = false; >+ unsigned int ier; >+ >+ touch_nmi_watchdog(); >+ >+ if (!nbcon_enter_unsafe(wctxt)) >+ return false; >+ >+ /* First save IER then disable the interrupts. */ >+ ier = serial_port_in(port, UART_IER); >+ serial8250_clear_IER(up); >+ >+ /* Check scratch reg if port powered off during system sleep. */ >+ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >+ serial8250_console_restore(up); >+ up->canary = 0; >+ } >+ >+ if (em485) { >+ if (em485->tx_stopped) >+ up->rs485_start_tx(up); >+ mdelay(port->rs485.delay_rts_before_send); >+ } >+ >+ if (nbcon_exit_unsafe(wctxt)) { >+ int len = READ_ONCE(wctxt->len); >+ int i; >+ >+ /* >+ * Write out the message. Toggle unsafe for each byte in order >+ * to give another (higher priority) context the opportunity >+ * for a friendly takeover. If such a takeover occurs, this >+ * context must reacquire ownership in order to perform final >+ * actions (such as re-enabling the interrupts). >+ * >+ * IMPORTANT: wctxt->outbuf and wctxt->len are no longer valid >+ * after a reacquire so writing the message must be >+ * aborted. 
>+ */ >+ for (i = 0; i < len; i++) { >+ if (!nbcon_enter_unsafe(wctxt)) { >+ nbcon_reacquire(wctxt); >+ break; >+ } >+ >+ uart_console_write(port, wctxt->outbuf + i, 1, serial8250_console_putchar); >+ >+ if (!nbcon_exit_unsafe(wctxt)) { >+ nbcon_reacquire(wctxt); >+ break; >+ } >+ } >+ done = (i == len); >+ } else { >+ nbcon_reacquire(wctxt); >+ } >+ >+ while (!nbcon_enter_unsafe(wctxt)) >+ nbcon_reacquire(wctxt); >+ >+ /* Finally, wait for transmitter to become empty and restore IER. */ >+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); >+ if (em485) { >+ mdelay(port->rs485.delay_rts_after_send); >+ if (em485->tx_stopped) >+ up->rs485_stop_tx(up); >+ } >+ serial_port_out(port, UART_IER, ier); >+ >+ /* >+ * The receive handling will happen properly because the receive ready >+ * bit will still be set; it is not cleared on read. However, modem >+ * control will not, we must call it if we have saved something in the >+ * saved flags while processing with interrupts off. >+ */ >+ if (up->msr_saved_flags) >+ serial8250_modem_status(up); >+ >+ /* Success if no handover/takeover and message fully printed. */ >+ return (nbcon_exit_unsafe(wctxt) && done); >+} >+ >+bool serial8250_console_write_atomic(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_port *port = &up->port; >+ unsigned int ier; >+ >+ /* Atomic console not supported for rs485 mode. */ >+ if (up->em485) >+ return false; >+ >+ touch_nmi_watchdog(); >+ >+ if (!nbcon_enter_unsafe(wctxt)) >+ return false; >+ >+ /* >+ * First save IER then disable the interrupts. The special variant to >+ * clear IER is used because atomic printing may occur without holding >+ * the port lock. >+ */ >+ ier = serial_port_in(port, UART_IER); >+ __serial8250_clear_IER(up); >+ >+ /* Check scratch reg if port powered off during system sleep. */ >+ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >+ serial8250_console_restore(up); >+ up->canary = 0; >+ } >+ >+ if (up->console_newline_needed) >+ uart_console_write(port, "\n", 1, serial8250_console_putchar); >+ uart_console_write(port, wctxt->outbuf, wctxt->len, serial8250_console_putchar); >+ >+ /* Finally, wait for transmitter to become empty and restore IER. */ >+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); >+ serial_port_out(port, UART_IER, ier); >+ >+ /* Success if no handover/takeover. 
*/ >+ return nbcon_exit_unsafe(wctxt); >+} >+#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ > > static unsigned int probe_baud(struct uart_port *port) > { >@@ -3490,6 +3643,7 @@ static unsigned int probe_baud(struct uart_port *port) > > int serial8250_console_setup(struct uart_port *port, char *options, bool probe) > { >+ struct uart_8250_port *up = up_to_u8250p(port); > int baud = 9600; > int bits = 8; > int parity = 'n'; >@@ -3499,6 +3653,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) > if (!port->iobase && !port->membase) > return -ENODEV; > >+ up->console_newline_needed = false; >+ > if (options) > uart_parse_options(options, &baud, &parity, &bits, &flow); > else if (probe) >diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c >index cf2c890a560f0..2fa3fb30dc6c7 100644 >--- a/drivers/tty/serial/amba-pl011.c >+++ b/drivers/tty/serial/amba-pl011.c >@@ -348,10 +348,7 @@ static int pl011_fifo_to_tty(struct uart_amba_port *uap) > flag = TTY_FRAME; > } > >- uart_port_unlock(&uap->port); >- sysrq = uart_handle_sysrq_char(&uap->port, ch & 255); >- uart_port_lock(&uap->port); >- >+ sysrq = uart_prepare_sysrq_char(&uap->port, ch & 255); > if (!sysrq) > uart_insert_char(&uap->port, ch, UART011_DR_OE, ch, flag); > } >@@ -1017,7 +1014,7 @@ static void pl011_dma_rx_callback(void *data) > ret = pl011_dma_rx_trigger_dma(uap); > > pl011_dma_rx_chars(uap, pending, lastbuf, false); >- uart_port_unlock_irq(&uap->port); >+ uart_unlock_and_check_sysrq(&uap->port); > /* > * Do this check after we picked the DMA chars so we don't > * get some IRQ immediately from RX. >@@ -1540,11 +1537,10 @@ static void check_apply_cts_event_workaround(struct uart_amba_port *uap) > static irqreturn_t pl011_int(int irq, void *dev_id) > { > struct uart_amba_port *uap = dev_id; >- unsigned long flags; > unsigned int status, pass_counter = AMBA_ISR_PASS_LIMIT; > int handled = 0; > >- uart_port_lock_irqsave(&uap->port, &flags); >+ uart_port_lock(&uap->port); > status = pl011_read(uap, REG_RIS) & uap->im; > if (status) { > do { >@@ -1573,7 +1569,7 @@ static irqreturn_t pl011_int(int irq, void *dev_id) > handled = 1; > } > >- uart_port_unlock_irqrestore(&uap->port, flags); >+ uart_unlock_and_check_sysrq(&uap->port); > > return IRQ_RETVAL(handled); > } >@@ -2322,13 +2318,10 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) > > clk_enable(uap->clk); > >- local_irq_save(flags); >- if (uap->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&uap->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&uap->port, &flags); > else >- uart_port_lock(&uap->port); >+ uart_port_lock_irqsave(&uap->port, &flags); > > /* > * First save the CR then disable the interrupts >@@ -2354,8 +2347,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) > pl011_write(old_cr, uap, REG_CR); > > if (locked) >- uart_port_unlock(&uap->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&uap->port, flags); > > clk_disable(uap->clk); > } >diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c >index 8d09ace062e59..7790cbc57391a 100644 >--- a/drivers/tty/serial/ar933x_uart.c >+++ b/drivers/tty/serial/ar933x_uart.c >@@ -378,7 +378,7 @@ static void ar933x_uart_rx_chars(struct ar933x_uart_port *up) > up->port.icount.rx++; > ch = rdata & AR933X_UART_DATA_TX_RX_MASK; > >- if (uart_handle_sysrq_char(&up->port, ch)) >+ if (uart_prepare_sysrq_char(&up->port, ch)) > 
continue; > > if ((up->port.ignore_status_mask & AR933X_DUMMY_STATUS_RD) == 0) >@@ -468,7 +468,7 @@ static irqreturn_t ar933x_uart_interrupt(int irq, void *dev_id) > ar933x_uart_tx_chars(up); > } > >- uart_port_unlock(&up->port); >+ uart_unlock_and_check_sysrq(&up->port); > > return IRQ_HANDLED; > } >@@ -627,14 +627,10 @@ static void ar933x_uart_console_write(struct console *co, const char *s, > unsigned int int_en; > int locked = 1; > >- local_irq_save(flags); >- >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -654,9 +650,7 @@ static void ar933x_uart_console_write(struct console *co, const char *s, > ar933x_uart_write(up, AR933X_UART_INT_REG, AR933X_UART_INT_ALLINTS); > > if (locked) >- uart_port_unlock(&up->port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int ar933x_uart_console_setup(struct console *co, char *options) >diff --git a/drivers/tty/serial/bcm63xx_uart.c b/drivers/tty/serial/bcm63xx_uart.c >index a3cefa153456d..34801a6f300b6 100644 >--- a/drivers/tty/serial/bcm63xx_uart.c >+++ b/drivers/tty/serial/bcm63xx_uart.c >@@ -285,10 +285,9 @@ static void bcm_uart_do_rx(struct uart_port *port) > flag = TTY_PARITY; > } > >- if (uart_handle_sysrq_char(port, c)) >+ if (uart_prepare_sysrq_char(port, c)) > continue; > >- > if ((cstat & port->ignore_status_mask) == 0) > tty_insert_flip_char(tty_port, c, flag); > >@@ -353,7 +352,7 @@ static irqreturn_t bcm_uart_interrupt(int irq, void *dev_id) > estat & UART_EXTINP_DCD_MASK); > } > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > return IRQ_HANDLED; > } > >@@ -703,20 +702,14 @@ static void bcm_console_write(struct console *co, const char *s, > { > struct uart_port *port; > unsigned long flags; >- int locked; >+ int locked = 1; > > port = &ports[co->index]; > >- local_irq_save(flags); >- if (port->sysrq) { >- /* bcm_uart_interrupt() already took the lock */ >- locked = 0; >- } else if (oops_in_progress) { >- locked = uart_port_trylock(port); >- } else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > /* call helper to deal with \r\n */ > uart_console_write(port, s, count, bcm_console_putchar); >@@ -725,8 +718,7 @@ static void bcm_console_write(struct console *co, const char *s, > wait_for_xmitr(port); > > if (locked) >- uart_port_unlock(port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > /* >diff --git a/drivers/tty/serial/lpc32xx_hs.c b/drivers/tty/serial/lpc32xx_hs.c >index ec20329f06036..e70fa59dbcc3b 100644 >--- a/drivers/tty/serial/lpc32xx_hs.c >+++ b/drivers/tty/serial/lpc32xx_hs.c >@@ -136,20 +136,16 @@ static void lpc32xx_hsuart_console_write(struct console *co, const char *s, > int locked = 1; > > touch_nmi_watchdog(); >- local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > uart_console_write(&up->port, s, count, lpc32xx_hsuart_console_putchar); > 
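
All of these console ->write() conversions (pl011, ar933x, bcm63xx, lpc32xx above, and the remaining drivers below) follow one shape: the open-coded local_irq_save() plus the port->sysrq special case are collapsed into the irqsave variants of the port-lock helpers. A minimal sketch of the resulting pattern, with foo_* names standing in for a hypothetical driver:

	#include <linux/console.h>
	#include <linux/serial_core.h>

	static void foo_console_write(struct console *co, const char *s,
				      unsigned int count)
	{
		struct uart_port *port = &foo_ports[co->index];
		unsigned long flags;
		int locked = 1;

		/* Under oops we must not spin on a lock we may already hold. */
		if (oops_in_progress)
			locked = uart_port_trylock_irqsave(port, &flags);
		else
			uart_port_lock_irqsave(port, &flags);

		/* mask the IRQs, emit the text, wait for the FIFO to drain */
		uart_console_write(port, s, count, foo_console_putchar);

		if (locked)
			uart_port_unlock_irqrestore(port, flags);
	}

The port->sysrq test can go away because, after the uart_prepare_sysrq_char() conversions, sysrq characters are never handled while the port lock is held, so the console path can no longer deadlock against the interrupt handler.
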
wait_for_xmit_empty(&up->port); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int __init lpc32xx_hsuart_console_setup(struct console *co, >@@ -268,7 +264,8 @@ static void __serial_lpc32xx_rx(struct uart_port *port) > tty_insert_flip_char(tport, 0, TTY_FRAME); > } > >- tty_insert_flip_char(tport, (tmp & 0xFF), flag); >+ if (!uart_prepare_sysrq_char(port, tmp & 0xff)) >+ tty_insert_flip_char(tport, (tmp & 0xFF), flag); > > tmp = readl(LPC32XX_HSUART_FIFO(port->membase)); > } >@@ -333,7 +330,7 @@ static irqreturn_t serial_lpc32xx_interrupt(int irq, void *dev_id) > __serial_lpc32xx_tx(port); > } > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >diff --git a/drivers/tty/serial/meson_uart.c b/drivers/tty/serial/meson_uart.c >index 8395688f5ee92..6feac459c0cf4 100644 >--- a/drivers/tty/serial/meson_uart.c >+++ b/drivers/tty/serial/meson_uart.c >@@ -220,7 +220,7 @@ static void meson_receive_chars(struct uart_port *port) > continue; > } > >- if (uart_handle_sysrq_char(port, ch)) >+ if (uart_prepare_sysrq_char(port, ch)) > continue; > > if ((status & port->ignore_status_mask) == 0) >@@ -248,7 +248,7 @@ static irqreturn_t meson_uart_interrupt(int irq, void *dev_id) > meson_uart_start_tx(port); > } > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -556,18 +556,13 @@ static void meson_serial_port_write(struct uart_port *port, const char *s, > u_int count) > { > unsigned long flags; >- int locked; >+ int locked = 1; > u32 val, tmp; > >- local_irq_save(flags); >- if (port->sysrq) { >- locked = 0; >- } else if (oops_in_progress) { >- locked = uart_port_trylock(port); >- } else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > val = readl(port->membase + AML_UART_CONTROL); > tmp = val & ~(AML_UART_TX_INT_EN | AML_UART_RX_INT_EN); >@@ -577,8 +572,7 @@ static void meson_serial_port_write(struct uart_port *port, const char *s, > writel(val, port->membase + AML_UART_CONTROL); > > if (locked) >- uart_port_unlock(port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void meson_serial_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/msm_serial.c b/drivers/tty/serial/msm_serial.c >index e24204ad35def..d27c4c8c84e13 100644 >--- a/drivers/tty/serial/msm_serial.c >+++ b/drivers/tty/serial/msm_serial.c >@@ -588,16 +588,14 @@ static void msm_complete_rx_dma(void *args) > if (!(port->read_status_mask & MSM_UART_SR_RX_BREAK)) > flag = TTY_NORMAL; > >- uart_port_unlock_irqrestore(port, flags); >- sysrq = uart_handle_sysrq_char(port, dma->virt[i]); >- uart_port_lock_irqsave(port, &flags); >+ sysrq = uart_prepare_sysrq_char(port, dma->virt[i]); > if (!sysrq) > tty_insert_flip_char(tport, dma->virt[i], flag); > } > > msm_start_rx_dma(msm_port); > done: >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq_irqrestore(port, flags); > > if (count) > tty_flip_buffer_push(tport); >@@ -763,9 +761,7 @@ static void msm_handle_rx_dm(struct uart_port *port, unsigned int misr) > if (!(port->read_status_mask & MSM_UART_SR_RX_BREAK)) > flag = TTY_NORMAL; > >- uart_port_unlock(port); >- sysrq = uart_handle_sysrq_char(port, buf[i]); >- uart_port_lock(port); >+ sysrq = uart_prepare_sysrq_char(port, buf[i]); > if (!sysrq) > 
tty_insert_flip_char(tport, buf[i], flag); > } >@@ -825,9 +821,7 @@ static void msm_handle_rx(struct uart_port *port) > else if (sr & MSM_UART_SR_PAR_FRAME_ERR) > flag = TTY_FRAME; > >- uart_port_unlock(port); >- sysrq = uart_handle_sysrq_char(port, c); >- uart_port_lock(port); >+ sysrq = uart_prepare_sysrq_char(port, c); > if (!sysrq) > tty_insert_flip_char(tport, c, flag); > } >@@ -948,11 +942,10 @@ static irqreturn_t msm_uart_irq(int irq, void *dev_id) > struct uart_port *port = dev_id; > struct msm_port *msm_port = to_msm_port(port); > struct msm_dma *dma = &msm_port->rx_dma; >- unsigned long flags; > unsigned int misr; > u32 val; > >- uart_port_lock_irqsave(port, &flags); >+ uart_port_lock(port); > misr = msm_read(port, MSM_UART_MISR); > msm_write(port, 0, MSM_UART_IMR); /* disable interrupt */ > >@@ -984,7 +977,7 @@ static irqreturn_t msm_uart_irq(int irq, void *dev_id) > msm_handle_delta_cts(port); > > msm_write(port, msm_port->imr, MSM_UART_IMR); /* restore interrupt */ >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -1621,14 +1614,10 @@ static void __msm_console_write(struct uart_port *port, const char *s, > num_newlines++; > count += num_newlines; > >- local_irq_save(flags); >- >- if (port->sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); > else >- uart_port_lock(port); >+ uart_port_lock_irqsave(port, &flags); > > if (is_uartdm) > msm_reset_dm_count(port, count); >@@ -1667,9 +1656,7 @@ static void __msm_console_write(struct uart_port *port, const char *s, > } > > if (locked) >- uart_port_unlock(port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void msm_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c >index f5a0b401af63b..9be1c871cf116 100644 >--- a/drivers/tty/serial/omap-serial.c >+++ b/drivers/tty/serial/omap-serial.c >@@ -508,7 +508,7 @@ static void serial_omap_rdi(struct uart_omap_port *up, unsigned int lsr) > > up->port.icount.rx++; > >- if (uart_handle_sysrq_char(&up->port, ch)) >+ if (uart_prepare_sysrq_char(&up->port, ch)) > return; > > uart_insert_char(&up->port, lsr, UART_LSR_OE, ch, TTY_NORMAL); >@@ -563,7 +563,7 @@ static irqreturn_t serial_omap_irq(int irq, void *dev_id) > } > } while (max_count--); > >- uart_port_unlock(&up->port); >+ uart_unlock_and_check_sysrq(&up->port); > > tty_flip_buffer_push(&up->port.state->port); > >@@ -1212,13 +1212,10 @@ serial_omap_console_write(struct console *co, const char *s, > unsigned int ier; > int locked = 1; > >- local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -1245,8 +1242,7 @@ serial_omap_console_write(struct console *co, const char *s, > check_modem_status(up); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int __init >diff --git a/drivers/tty/serial/owl-uart.c b/drivers/tty/serial/owl-uart.c >index d9fe85397741d..8b60ac0ad7cd3 100644 >--- a/drivers/tty/serial/owl-uart.c >+++ b/drivers/tty/serial/owl-uart.c >@@ 
-199,6 +199,7 @@ static void owl_uart_receive_chars(struct uart_port *port) > stat = owl_uart_read(port, OWL_UART_STAT); > while (!(stat & OWL_UART_STAT_RFEM)) { > char flag = TTY_NORMAL; >+ bool sysrq; > > if (stat & OWL_UART_STAT_RXER) > port->icount.overrun++; >@@ -217,7 +218,9 @@ static void owl_uart_receive_chars(struct uart_port *port) > val = owl_uart_read(port, OWL_UART_RXDAT); > val &= 0xff; > >- if ((stat & port->ignore_status_mask) == 0) >+ sysrq = uart_prepare_sysrq_char(port, val); >+ >+ if (!sysrq && (stat & port->ignore_status_mask) == 0) > tty_insert_flip_char(&port->state->port, val, flag); > > stat = owl_uart_read(port, OWL_UART_STAT); >@@ -229,10 +232,9 @@ static void owl_uart_receive_chars(struct uart_port *port) > static irqreturn_t owl_uart_irq(int irq, void *dev_id) > { > struct uart_port *port = dev_id; >- unsigned long flags; > u32 stat; > >- uart_port_lock_irqsave(port, &flags); >+ uart_port_lock(port); > > stat = owl_uart_read(port, OWL_UART_STAT); > >@@ -246,7 +248,7 @@ static irqreturn_t owl_uart_irq(int irq, void *dev_id) > stat |= OWL_UART_STAT_RIP | OWL_UART_STAT_TIP; > owl_uart_write(port, stat, OWL_UART_STAT); > >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -508,18 +510,12 @@ static void owl_uart_port_write(struct uart_port *port, const char *s, > { > u32 old_ctl, val; > unsigned long flags; >- int locked; >+ int locked = 1; > >- local_irq_save(flags); >- >- if (port->sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(port); >- else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > old_ctl = owl_uart_read(port, OWL_UART_CTL); > val = old_ctl | OWL_UART_CTL_TRFS_TX; >@@ -541,9 +537,7 @@ static void owl_uart_port_write(struct uart_port *port, const char *s, > owl_uart_write(port, old_ctl, OWL_UART_CTL); > > if (locked) >- uart_port_unlock(port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void owl_uart_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c >index 436cc6d52a11b..89257cddf5405 100644 >--- a/drivers/tty/serial/pch_uart.c >+++ b/drivers/tty/serial/pch_uart.c >@@ -237,9 +237,6 @@ struct eg20t_port { > > #define IRQ_NAME_SIZE 17 > char irq_name[IRQ_NAME_SIZE]; >- >- /* protect the eg20t_port private structure and io access to membase */ >- spinlock_t lock; > }; > > /** >@@ -567,7 +564,7 @@ static int pch_uart_hal_read(struct eg20t_port *priv, unsigned char *buf, > if (uart_handle_break(port)) > continue; > } >- if (uart_handle_sysrq_char(port, rbr)) >+ if (uart_prepare_sysrq_char(port, rbr)) > continue; > > buf[i++] = rbr; >@@ -599,16 +596,14 @@ static void pch_uart_hal_set_break(struct eg20t_port *priv, int on) > iowrite8(lcr, priv->membase + UART_LCR); > } > >-static int push_rx(struct eg20t_port *priv, const unsigned char *buf, >- int size) >+static void push_rx(struct eg20t_port *priv, const unsigned char *buf, >+ int size) > { > struct uart_port *port = &priv->port; > struct tty_port *tport = &port->state->port; > > tty_insert_flip_string(tport, buf, size); > tty_flip_buffer_push(tport); >- >- return 0; > } > > static int dma_push_rx(struct eg20t_port *priv, int size) >@@ -761,7 +756,7 @@ static int handle_rx_to(struct eg20t_port *priv) > { > struct pch_uart_buffer *buf; > int rx_size; >- int ret; >+ > 
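
The interrupt-side half of the same conversion, as in owl_uart_irq() just above: the handler takes the port lock plainly, records any sysrq candidate with uart_prepare_sysrq_char() while the lock is held, and lets uart_unlock_and_check_sysrq() run the sysrq handler only after the lock is dropped. A sketch, again with hypothetical foo_* register accessors:

	#include <linux/interrupt.h>
	#include <linux/serial_core.h>
	#include <linux/tty_flip.h>

	static irqreturn_t foo_uart_irq(int irq, void *dev_id)
	{
		struct uart_port *port = dev_id;
		unsigned int status;
		u8 ch;

		uart_port_lock(port);

		while ((status = foo_read_status(port)) & FOO_RX_READY) {
			ch = foo_read_char(port);
			port->icount.rx++;

			/* Defer sysrq handling until the lock is dropped. */
			if (uart_prepare_sysrq_char(port, ch))
				continue;

			uart_insert_char(port, status, FOO_OVERRUN, ch, TTY_NORMAL);
		}

		tty_flip_buffer_push(&port->state->port);

		/* Unlocks the port and, if one was queued, handles sysrq. */
		uart_unlock_and_check_sysrq(port);

		return IRQ_HANDLED;
	}

This is also why the surrounding pch_uart hunks can drop the driver-private eg20t_port spinlock entirely: once sysrq no longer needs the unlock/relock dance, the uart port lock alone covers both the interrupt handler and the console.
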
if (!priv->start_rx) { > pch_uart_hal_disable_interrupt(priv, PCH_UART_HAL_RX_INT | > PCH_UART_HAL_RX_ERR_INT); >@@ -770,19 +765,12 @@ static int handle_rx_to(struct eg20t_port *priv) > buf = &priv->rxbuf; > do { > rx_size = pch_uart_hal_read(priv, buf->buf, buf->size); >- ret = push_rx(priv, buf->buf, rx_size); >- if (ret) >- return 0; >+ push_rx(priv, buf->buf, rx_size); > } while (rx_size == buf->size); > > return PCH_UART_HANDLED_RX_INT; > } > >-static int handle_rx(struct eg20t_port *priv) >-{ >- return handle_rx_to(priv); >-} >- > static int dma_handle_rx(struct eg20t_port *priv) > { > struct uart_port *port = &priv->port; >@@ -1019,11 +1007,10 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) > u8 lsr; > int ret = 0; > unsigned char iid; >- unsigned long flags; > int next = 1; > u8 msr; > >- spin_lock_irqsave(&priv->lock, flags); >+ uart_port_lock(&priv->port); > handled = 0; > while (next) { > iid = pch_uart_hal_get_iid(priv); >@@ -1051,7 +1038,7 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) > PCH_UART_HAL_RX_INT | > PCH_UART_HAL_RX_ERR_INT); > } else { >- ret = handle_rx(priv); >+ ret = handle_rx_to(priv); > } > break; > case PCH_UART_IID_RDR_TO: /* Received Data Ready >@@ -1083,7 +1070,7 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) > handled |= (unsigned int)ret; > } > >- spin_unlock_irqrestore(&priv->lock, flags); >+ uart_unlock_and_check_sysrq(&priv->port); > return IRQ_RETVAL(handled); > } > >@@ -1194,9 +1181,9 @@ static void pch_uart_break_ctl(struct uart_port *port, int ctl) > unsigned long flags; > > priv = container_of(port, struct eg20t_port, port); >- spin_lock_irqsave(&priv->lock, flags); >+ uart_port_lock_irqsave(&priv->port, &flags); > pch_uart_hal_set_break(priv, ctl); >- spin_unlock_irqrestore(&priv->lock, flags); >+ uart_port_unlock_irqrestore(&priv->port, flags); > } > > /* Grab any interrupt resources and initialise any low level driver state. 
*/ >@@ -1346,8 +1333,7 @@ static void pch_uart_set_termios(struct uart_port *port, > > baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16); > >- spin_lock_irqsave(&priv->lock, flags); >- uart_port_lock(port); >+ uart_port_lock_irqsave(port, &flags); > > uart_update_timeout(port, termios->c_cflag, baud); > rtn = pch_uart_hal_set_line(priv, baud, parity, bits, stb); >@@ -1360,8 +1346,7 @@ static void pch_uart_set_termios(struct uart_port *port, > tty_termios_encode_baud_rate(termios, baud, baud); > > out: >- uart_port_unlock(port); >- spin_unlock_irqrestore(&priv->lock, flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static const char *pch_uart_type(struct uart_port *port) >@@ -1565,27 +1550,17 @@ pch_console_write(struct console *co, const char *s, unsigned int count) > { > struct eg20t_port *priv; > unsigned long flags; >- int priv_locked = 1; >- int port_locked = 1; >+ int locked = 1; > u8 ier; > > priv = pch_uart_ports[co->index]; > > touch_nmi_watchdog(); > >- local_irq_save(flags); >- if (priv->port.sysrq) { >- /* call to uart_handle_sysrq_char already took the priv lock */ >- priv_locked = 0; >- /* serial8250_handle_port() already took the port lock */ >- port_locked = 0; >- } else if (oops_in_progress) { >- priv_locked = spin_trylock(&priv->lock); >- port_locked = uart_port_trylock(&priv->port); >- } else { >- spin_lock(&priv->lock); >- uart_port_lock(&priv->port); >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&priv->port, &flags); >+ else >+ uart_port_lock_irqsave(&priv->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -1603,11 +1578,8 @@ pch_console_write(struct console *co, const char *s, unsigned int count) > wait_for_xmitr(priv, UART_LSR_BOTH_EMPTY); > iowrite8(ier, priv->membase + UART_IER); > >- if (port_locked) >- uart_port_unlock(&priv->port); >- if (priv_locked) >- spin_unlock(&priv->lock); >- local_irq_restore(flags); >+ if (locked) >+ uart_port_unlock_irqrestore(&priv->port, flags); > } > > static int __init pch_console_setup(struct console *co, char *options) >@@ -1704,8 +1676,6 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev, > pci_enable_msi(pdev); > pci_set_master(pdev); > >- spin_lock_init(&priv->lock); >- > iobase = pci_resource_start(pdev, 0); > mapbase = pci_resource_start(pdev, 1); > priv->mapbase = mapbase; >@@ -1735,8 +1705,6 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev, > KBUILD_MODNAME ":" PCH_UART_DRIVER_DEVICE "%d", > priv->port.line); > >- spin_lock_init(&priv->port.lock); >- > pci_set_drvdata(pdev, priv); > priv->trigger_level = 1; > priv->fcr = 0; >diff --git a/drivers/tty/serial/pxa.c b/drivers/tty/serial/pxa.c >index 46e70e155aab2..e395ff29c1a2c 100644 >--- a/drivers/tty/serial/pxa.c >+++ b/drivers/tty/serial/pxa.c >@@ -151,7 +151,7 @@ static inline void receive_chars(struct uart_pxa_port *up, int *status) > flag = TTY_FRAME; > } > >- if (uart_handle_sysrq_char(&up->port, ch)) >+ if (uart_prepare_sysrq_char(&up->port, ch)) > goto ignore_char; > > uart_insert_char(&up->port, *status, UART_LSR_OE, ch, flag); >@@ -232,7 +232,7 @@ static inline irqreturn_t serial_pxa_irq(int irq, void *dev_id) > check_modem_status(up); > if (lsr & UART_LSR_THRE) > transmit_chars(up); >- uart_port_unlock(&up->port); >+ uart_unlock_and_check_sysrq(&up->port); > return IRQ_HANDLED; > } > >@@ -604,13 +604,10 @@ serial_pxa_console_write(struct console *co, const char *s, unsigned int count) > int locked = 1; > > clk_enable(up->clk); >- 
local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -628,10 +625,8 @@ serial_pxa_console_write(struct console *co, const char *s, unsigned int count) > serial_out(up, UART_IER, ier); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > clk_disable(up->clk); >- > } > > #ifdef CONFIG_CONSOLE_POLL >diff --git a/drivers/tty/serial/rda-uart.c b/drivers/tty/serial/rda-uart.c >index 13deb355cf1bc..82def9b8632a5 100644 >--- a/drivers/tty/serial/rda-uart.c >+++ b/drivers/tty/serial/rda-uart.c >@@ -394,7 +394,8 @@ static void rda_uart_receive_chars(struct uart_port *port) > val &= 0xff; > > port->icount.rx++; >- tty_insert_flip_char(&port->state->port, val, flag); >+ if (!uart_prepare_sysrq_char(port, val)) >+ tty_insert_flip_char(&port->state->port, val, flag); > > status = rda_uart_read(port, RDA_UART_STATUS); > } >@@ -405,10 +406,9 @@ static void rda_uart_receive_chars(struct uart_port *port) > static irqreturn_t rda_interrupt(int irq, void *dev_id) > { > struct uart_port *port = dev_id; >- unsigned long flags; > u32 val, irq_mask; > >- uart_port_lock_irqsave(port, &flags); >+ uart_port_lock(port); > > /* Clear IRQ cause */ > val = rda_uart_read(port, RDA_UART_IRQ_CAUSE); >@@ -425,7 +425,7 @@ static irqreturn_t rda_interrupt(int irq, void *dev_id) > rda_uart_send_chars(port); > } > >- uart_port_unlock_irqrestore(port, flags); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -590,18 +590,12 @@ static void rda_uart_port_write(struct uart_port *port, const char *s, > { > u32 old_irq_mask; > unsigned long flags; >- int locked; >+ int locked = 1; > >- local_irq_save(flags); >- >- if (port->sysrq) { >- locked = 0; >- } else if (oops_in_progress) { >- locked = uart_port_trylock(port); >- } else { >- uart_port_lock(port); >- locked = 1; >- } >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(port, &flags); >+ else >+ uart_port_lock_irqsave(port, &flags); > > old_irq_mask = rda_uart_read(port, RDA_UART_IRQ_MASK); > rda_uart_write(port, 0, RDA_UART_IRQ_MASK); >@@ -615,9 +609,7 @@ static void rda_uart_port_write(struct uart_port *port, const char *s, > rda_uart_write(port, old_irq_mask, RDA_UART_IRQ_MASK); > > if (locked) >- uart_port_unlock(port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(port, flags); > } > > static void rda_uart_console_write(struct console *co, const char *s, >diff --git a/drivers/tty/serial/sifive.c b/drivers/tty/serial/sifive.c >index a4cc569a78a25..0670fd9f84967 100644 >--- a/drivers/tty/serial/sifive.c >+++ b/drivers/tty/serial/sifive.c >@@ -412,7 +412,8 @@ static void __ssp_receive_chars(struct sifive_serial_port *ssp) > break; > > ssp->port.icount.rx++; >- uart_insert_char(&ssp->port, 0, 0, ch, TTY_NORMAL); >+ if (!uart_prepare_sysrq_char(&ssp->port, ch)) >+ uart_insert_char(&ssp->port, 0, 0, ch, TTY_NORMAL); > } > > tty_flip_buffer_push(&ssp->port.state->port); >@@ -534,7 +535,7 @@ static irqreturn_t sifive_serial_irq(int irq, void *dev_id) > if (ip & SIFIVE_SERIAL_IP_TXWM_MASK) > __ssp_transmit_chars(ssp); > >- uart_port_unlock(&ssp->port); >+ uart_unlock_and_check_sysrq(&ssp->port); > > return IRQ_HANDLED; > } >@@ -791,13 +792,10 @@ static void 
sifive_serial_console_write(struct console *co, const char *s, > if (!ssp) > return; > >- local_irq_save(flags); >- if (ssp->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&ssp->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&ssp->port, &flags); > else >- uart_port_lock(&ssp->port); >+ uart_port_lock_irqsave(&ssp->port, &flags); > > ier = __ssp_readl(ssp, SIFIVE_SERIAL_IE_OFFS); > __ssp_writel(0, SIFIVE_SERIAL_IE_OFFS, ssp); >@@ -807,8 +805,7 @@ static void sifive_serial_console_write(struct console *co, const char *s, > __ssp_writel(ier, SIFIVE_SERIAL_IE_OFFS, ssp); > > if (locked) >- uart_port_unlock(&ssp->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&ssp->port, flags); > } > > static int sifive_serial_console_setup(struct console *co, char *options) >diff --git a/drivers/tty/serial/sunplus-uart.c b/drivers/tty/serial/sunplus-uart.c >index 99f5285819d4b..f5e29eb4a4ce4 100644 >--- a/drivers/tty/serial/sunplus-uart.c >+++ b/drivers/tty/serial/sunplus-uart.c >@@ -260,7 +260,7 @@ static void receive_chars(struct uart_port *port) > if (port->ignore_status_mask & SUP_DUMMY_READ) > goto ignore_char; > >- if (uart_handle_sysrq_char(port, ch)) >+ if (uart_prepare_sysrq_char(port, ch)) > goto ignore_char; > > uart_insert_char(port, lsr, SUP_UART_LSR_OE, ch, flag); >@@ -287,7 +287,7 @@ static irqreturn_t sunplus_uart_irq(int irq, void *args) > if (isc & SUP_UART_ISC_TX) > transmit_chars(port); > >- uart_port_unlock(port); >+ uart_unlock_and_check_sysrq(port); > > return IRQ_HANDLED; > } >@@ -512,22 +512,16 @@ static void sunplus_console_write(struct console *co, > unsigned long flags; > int locked = 1; > >- local_irq_save(flags); >- >- if (sunplus_console_ports[co->index]->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&sunplus_console_ports[co->index]->port); >+ if (oops_in_progress) >+ locked = uart_port_trylock_irqsave(&sunplus_console_ports[co->index]->port, &flags); > else >- uart_port_lock(&sunplus_console_ports[co->index]->port); >+ uart_port_lock_irqsave(&sunplus_console_ports[co->index]->port, &flags); > > uart_console_write(&sunplus_console_ports[co->index]->port, s, count, > sunplus_uart_console_putchar); > > if (locked) >- uart_port_unlock(&sunplus_console_ports[co->index]->port); >- >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&sunplus_console_ports[co->index]->port, flags); > } > > static int __init sunplus_console_setup(struct console *co, char *options) >diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c >index 407b0d87b7c10..c9c914bc033c9 100644 >--- a/drivers/tty/tty_io.c >+++ b/drivers/tty/tty_io.c >@@ -3567,8 +3567,13 @@ static ssize_t show_cons_active(struct device *dev, > for_each_console(c) { > if (!c->device) > continue; >- if (!c->write) >- continue; >+ if (c->flags & CON_NBCON) { >+ if (!c->write_atomic && !c->write_thread) >+ continue; >+ } else { >+ if (!c->write) >+ continue; >+ } > if ((c->flags & CON_ENABLED) == 0) > continue; > cs[i++] = c; >diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c >index e0758fe7936dc..2703676549f5e 100644 >--- a/fs/proc/consoles.c >+++ b/fs/proc/consoles.c >@@ -21,12 +21,14 @@ static int show_console_dev(struct seq_file *m, void *v) > { CON_ENABLED, 'E' }, > { CON_CONSDEV, 'C' }, > { CON_BOOT, 'B' }, >+ { CON_NBCON, 'N' }, > { CON_PRINTBUFFER, 'p' }, > { CON_BRL, 'b' }, > { CON_ANYTIME, 'a' }, > }; > char flags[ARRAY_SIZE(con_flags) + 1]; > struct console *con = v; >+ char con_write = '-'; > 
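
Both the tty_io.c hunk above and this consoles.c hunk encode the same rule: for an NBCON console the legacy ->write callback may legitimately be NULL, so "can this console emit output?" has to branch on CON_NBCON and look at ->write_atomic/->write_thread instead. A small hypothetical helper capturing the test both call sites now perform inline:

	#include <linux/console.h>

	/* True if @con has some way of writing output. */
	static bool console_has_write_callback(const struct console *con)
	{
		if (con->flags & CON_NBCON)
			return con->write_atomic || con->write_thread;

		return con->write != NULL;
	}
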
unsigned int a; > dev_t dev = 0; > >@@ -57,9 +59,15 @@ static int show_console_dev(struct seq_file *m, void *v) > seq_setwidth(m, 21 - 1); > seq_printf(m, "%s%d", con->name, con->index); > seq_pad(m, ' '); >- seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', >- con->write ? 'W' : '-', con->unblank ? 'U' : '-', >- flags); >+ if (con->flags & CON_NBCON) { >+ if (con->write_atomic || con->write_thread) >+ con_write = 'W'; >+ } else { >+ if (con->write) >+ con_write = 'W'; >+ } >+ seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con_write, >+ con->unblank ? 'U' : '-', flags); > if (dev) > seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); > >diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h >index fc53e0ad56d90..448bbef474564 100644 >--- a/include/linux/bottom_half.h >+++ b/include/linux/bottom_half.h >@@ -35,8 +35,10 @@ static inline void local_bh_enable(void) > > #ifdef CONFIG_PREEMPT_RT > extern bool local_bh_blocked(void); >+extern void softirq_preempt(void); > #else > static inline bool local_bh_blocked(void) { return false; } >+static inline void softirq_preempt(void) { } > #endif > > #endif /* _LINUX_BH_H */ >diff --git a/include/linux/console.h b/include/linux/console.h >index 779d388af8a0a..02d6cabbe5009 100644 >--- a/include/linux/console.h >+++ b/include/linux/console.h >@@ -16,7 +16,9 @@ > > #include <linux/atomic.h> > #include <linux/bits.h> >+#include <linux/irq_work.h> > #include <linux/rculist.h> >+#include <linux/rcuwait.h> > #include <linux/types.h> > > struct vc_data; >@@ -137,7 +139,7 @@ static inline int con_debug_leave(void) > */ > > /** >- * cons_flags - General console flags >+ * enum cons_flags - General console flags > * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate > * output of messages that were already shown by boot > * consoles or read by userspace via syslog() syscall. >@@ -218,7 +220,7 @@ struct nbcon_state { > static_assert(sizeof(struct nbcon_state) <= sizeof(int)); > > /** >- * nbcon_prio - console owner priority for nbcon consoles >+ * enum nbcon_prio - console owner priority for nbcon consoles > * @NBCON_PRIO_NONE: Unused > * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage > * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) >@@ -285,7 +287,7 @@ struct nbcon_write_context { > /** > * struct console - The console descriptor structure > * @name: The name of the console driver >- * @write: Write callback to output messages (Optional) >+ * @write: Legacy write callback to output messages (Optional) > * @read: Read callback for console input (Optional) > * @device: The underlying TTY device driver (Optional) > * @unblank: Callback to unblank the console (Optional) >@@ -302,10 +304,12 @@ struct nbcon_write_context { > * @data: Driver private data > * @node: hlist node for the console list > * >- * @write_atomic: Write callback for atomic context > * @nbcon_state: State for nbcon consoles > * @nbcon_seq: Sequence number of the next record for nbcon to print > * @pbufs: Pointer to nbcon private buffer >+ * @kthread: Printer kthread for this console >+ * @rcuwait: RCU-safe wait object for @kthread waking >+ * @irq_work: Defer @kthread waking to IRQ work context > */ > struct console { > char name[16]; >@@ -327,11 +331,116 @@ struct console { > struct hlist_node node; > > /* nbcon console specific members */ >- bool (*write_atomic)(struct console *con, >- struct nbcon_write_context *wctxt); >+ >+ /** >+ * @write_atomic: >+ * >+ * NBCON callback to write out text in any context. 
(Optional) >+ * >+ * This callback is called with the console already acquired. The >+ * callback can use nbcon_can_proceed() at any time to verify that >+ * it is still the owner of the console. In the case that it has >+ * lost ownership, it is no longer allowed to go forward. In this >+ * case it must back out immediately and carefully. The buffer >+ * content is also no longer trusted since it no longer belongs to >+ * the context. >+ * >+ * If the callback needs to perform actions where ownership is not >+ * allowed to be taken over, nbcon_enter_unsafe() and >+ * nbcon_exit_unsafe() can be used to mark such sections. These >+ * functions are also points of possible ownership transfer. If >+ * either function returns false, ownership has been lost. >+ * >+ * If the driver must reacquire ownership in order to finalize or >+ * revert hardware changes, nbcon_reacquire() can be used. However, >+ * on reacquire the buffer content is no longer available. A >+ * reacquire cannot be used to resume printing. >+ * >+ * This callback can be called from any context (including NMI). >+ * Therefore it must avoid usage of any locking and instead rely >+ * on the console ownership for synchronization. >+ * >+ * Returns true if all text was successfully written out and >+ * ownership was never lost, otherwise false. >+ */ >+ bool (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); >+ >+ /** >+ * @write_thread: >+ * >+ * NBCON callback to write out text in task context. (Optional) >+ * >+ * This callback is called with the console already acquired. Any >+ * additional driver synchronization should have been performed by >+ * driver_enter(). >+ * >+ * This callback is always called from task context but with migration >+ * disabled. >+ * >+ * The same criteria for console ownership verification and unsafe >+ * sections applies as with write_atomic(). The difference between >+ * this callback and write_atomic() is that this callback is used >+ * during normal operation and is always called from task context. >+ * This provides drivers with a relatively relaxed locking context >+ * for synchronizing output to the hardware. >+ * >+ * Returns true if all text was successfully written out, otherwise >+ * false. >+ */ >+ bool (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); >+ >+ /** >+ * @driver_enter: >+ * >+ * NBCON callback to begin synchronization with driver code. >+ * (Required for NBCON if write_thread is provided) >+ * >+ * Console drivers typically must deal with access to the hardware >+ * via user input/output (such as an interactive login shell) and >+ * output of kernel messages via printk() calls. This callback is >+ * called before the kernel begins output via the write_thread() >+ * callback due to printk() calls. The driver can use this >+ * callback to acquire some driver lock in order to synchronize >+ * against user input/output (or any other driver functionality). >+ * >+ * This callback is always called from task context. It may use any >+ * synchronization method required by the driver. BUT this callback >+ * MUST disable migration. The console driver may be using a >+ * sychronization mechanism that already takes care of this (such as >+ * spinlocks). Otherwise this function must explicitly call >+ * migrate_disable(). >+ * >+ * The flags argument is provided as a convenience to the driver. It >+ * will be passed again to driver_exit() when printing is completed >+ * (for example, if spin_lock_irqsave() was used). 
It can be ignored >+ * if the driver does not need it. >+ */ >+ void (*driver_enter)(struct console *con, unsigned long *flags); >+ >+ /** >+ * @driver_exit: >+ * >+ * NBCON callback to finish synchronization with driver code. >+ * (Required for NBCON if write_thread is provided) >+ * >+ * This callback is called after the kernel has finished printing a >+ * printk message. It is the counterpart to driver_enter(). >+ * >+ * This callback is always called from task context. It must >+ * appropriately re-enable migration (depending on how driver_enter() >+ * disabled migration). >+ * >+ * The flags argument is the value of the same variable that was >+ * passed to driver_enter(). >+ */ >+ void (*driver_exit)(struct console *con, unsigned long flags); >+ > atomic_t __private nbcon_state; > atomic_long_t __private nbcon_seq; > struct printk_buffers *pbufs; >+ struct task_struct *kthread; >+ struct rcuwait rcuwait; >+ struct irq_work irq_work; > }; > > #ifdef CONFIG_LOCKDEP >@@ -459,13 +568,19 @@ static inline bool console_is_registered(const struct console *con) > hlist_for_each_entry(con, &console_list, node) > > #ifdef CONFIG_PRINTK >+extern void nbcon_cpu_emergency_enter(void); >+extern void nbcon_cpu_emergency_exit(void); > extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); > extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); > extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); >+extern void nbcon_reacquire(struct nbcon_write_context *wctxt); > #else >+static inline void nbcon_cpu_emergency_enter(void) { } >+static inline void nbcon_cpu_emergency_exit(void) { } > static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } > static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } > static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } >+static inline void nbcon_reacquire(struct nbcon_write_context *wctxt) { } > #endif > > extern int console_set_on_cmdline; >diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h >index b0fb775a600d9..f5bb19369973a 100644 >--- a/include/linux/entry-common.h >+++ b/include/linux/entry-common.h >@@ -65,7 +65,7 @@ > #define EXIT_TO_USER_MODE_WORK \ > (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ > _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ >- ARCH_EXIT_TO_USER_MODE_WORK) >+ _TIF_NEED_RESCHED_LAZY | ARCH_EXIT_TO_USER_MODE_WORK) > > /** > * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs >diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h >index 6813171afccb2..674a622c91be2 100644 >--- a/include/linux/entry-kvm.h >+++ b/include/linux/entry-kvm.h >@@ -18,7 +18,7 @@ > > #define XFER_TO_GUEST_MODE_WORK \ > (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ >- _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) >+ _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED_LAZY | ARCH_XFER_TO_GUEST_MODE_WORK) > > struct kvm_vcpu; > >diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h >index 76121c2bb4f82..f75f6bc195d18 100644 >--- a/include/linux/interrupt.h >+++ b/include/linux/interrupt.h >@@ -609,6 +609,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq(unsigned int nr); > >+#ifdef CONFIG_PREEMPT_RT >+DECLARE_PER_CPU(struct task_struct *, timersd); >+DECLARE_PER_CPU(unsigned long, pending_timer_softirq); >+ >+extern void 
raise_timer_softirq(void); >+extern void raise_hrtimer_softirq(void); >+ >+static inline unsigned int local_pending_timers(void) >+{ >+ return __this_cpu_read(pending_timer_softirq); >+} >+ >+#else >+static inline void raise_timer_softirq(void) >+{ >+ raise_softirq(TIMER_SOFTIRQ); >+} >+ >+static inline void raise_hrtimer_softirq(void) >+{ >+ raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+} >+ >+static inline unsigned int local_pending_timers(void) >+{ >+ return local_softirq_pending(); >+} >+#endif >+ > DECLARE_PER_CPU(struct task_struct *, ksoftirqd); > > static inline struct task_struct *this_cpu_ksoftirqd(void) >diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h >index 78a09af89e39b..0a29b856786e7 100644 >--- a/include/linux/netdevice.h >+++ b/include/linux/netdevice.h >@@ -3365,6 +3365,7 @@ static inline void dev_xmit_recursion_dec(void) > __this_cpu_dec(softnet_data.xmit.recursion); > } > >+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); > void __netif_schedule(struct Qdisc *q); > void netif_schedule_queue(struct netdev_queue *txq); > >diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h >index d2a15c0c6f8a9..c1c6600541657 100644 >--- a/include/linux/perf_event.h >+++ b/include/linux/perf_event.h >@@ -781,9 +781,9 @@ struct perf_event { > unsigned int pending_wakeup; > unsigned int pending_kill; > unsigned int pending_disable; >- unsigned int pending_sigtrap; > unsigned long pending_addr; /* SIGTRAP */ > struct irq_work pending_irq; >+ struct irq_work pending_disable_irq; > struct callback_head pending_task; > unsigned int pending_work; > >@@ -959,7 +959,7 @@ struct perf_event_context { > struct rcu_head rcu_head; > > /* >- * Sum (event->pending_sigtrap + event->pending_work) >+ * Sum (event->pending_work + event->pending_work) > * > * The SIGTRAP is targeted at ctx->task, as such it won't do changing > * that until the signal is delivered. >diff --git a/include/linux/printk.h b/include/linux/printk.h >index 8ef499ab3c1ed..a2d40a6372266 100644 >--- a/include/linux/printk.h >+++ b/include/linux/printk.h >@@ -9,6 +9,8 @@ > #include <linux/ratelimit_types.h> > #include <linux/once_lite.h> > >+struct uart_port; >+ > extern const char linux_banner[]; > extern const char linux_proc_banner[]; > >@@ -159,13 +161,16 @@ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); > > extern void __printk_safe_enter(void); > extern void __printk_safe_exit(void); >+extern void __printk_deferred_enter(void); >+extern void __printk_deferred_exit(void); >+ > /* > * The printk_deferred_enter/exit macros are available only as a hack for > * some code paths that need to defer all printk console printing. Interrupts > * must be disabled for the deferred duration. 
> */ >-#define printk_deferred_enter __printk_safe_enter >-#define printk_deferred_exit __printk_safe_exit >+#define printk_deferred_enter() __printk_deferred_enter() >+#define printk_deferred_exit() __printk_deferred_exit() > > /* > * Please don't use printk_ratelimit(), because it shares ratelimiting state >@@ -192,6 +197,10 @@ void show_regs_print_info(const char *log_lvl); > extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; > extern asmlinkage void dump_stack(void) __cold; > void printk_trigger_flush(void); >+void printk_legacy_allow_panic_sync(void); >+extern void uart_nbcon_acquire(struct uart_port *up); >+extern void uart_nbcon_release(struct uart_port *up); >+void nbcon_atomic_flush_unsafe(void); > #else > static inline __printf(1, 0) > int vprintk(const char *s, va_list args) >@@ -271,8 +280,27 @@ static inline void dump_stack(void) > static inline void printk_trigger_flush(void) > { > } >+ >+static inline void printk_legacy_allow_panic_sync(void) >+{ >+} >+ >+static inline void uart_nbcon_acquire(struct uart_port *up) >+{ >+} >+ >+static inline void uart_nbcon_release(struct uart_port *up) >+{ >+} >+ >+static inline void nbcon_atomic_flush_unsafe(void) >+{ >+} >+ > #endif > >+bool this_cpu_in_panic(void); >+ > #ifdef CONFIG_SMP > extern int __printk_cpu_sync_try_get(void); > extern void __printk_cpu_sync_wait(void); >diff --git a/include/linux/sched.h b/include/linux/sched.h >index ffe8f618ab869..cb4df5d70e3d0 100644 >--- a/include/linux/sched.h >+++ b/include/linux/sched.h >@@ -1791,6 +1791,7 @@ static inline int dl_task_check_affinity(struct task_struct *p, const struct cpu > } > #endif > >+extern bool task_is_pi_boosted(const struct task_struct *p); > extern int yield_to(struct task_struct *p, bool preempt); > extern void set_user_nice(struct task_struct *p, long nice); > extern int task_prio(const struct task_struct *p); >@@ -1933,17 +1934,17 @@ static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, > update_ti_thread_flag(task_thread_info(tsk), flag, value); > } > >-static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); > } > >-static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); > } > >-static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_ti_thread_flag(task_thread_info(tsk), flag); > } >@@ -1956,9 +1957,11 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) > static inline void clear_tsk_need_resched(struct task_struct *tsk) > { > clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); >+ if (IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO)) >+ clear_tsk_thread_flag(tsk, TIF_NEED_RESCHED_LAZY); > } > >-static inline int test_tsk_need_resched(struct task_struct *tsk) >+static inline bool test_tsk_need_resched(struct task_struct *tsk) > { > return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); > } >@@ -2099,7 +2102,7 @@ static inline bool preempt_model_preemptible(void) > > static __always_inline bool need_resched(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(tif_need_resched_lazy() || tif_need_resched()); > } > > /* >diff --git 
a/include/linux/sched/idle.h b/include/linux/sched/idle.h >index 478084f9105e1..719416fe8ddc0 100644 >--- a/include/linux/sched/idle.h >+++ b/include/linux/sched/idle.h >@@ -63,7 +63,7 @@ static __always_inline bool __must_check current_set_polling_and_test(void) > */ > smp_mb__after_atomic(); > >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > > static __always_inline bool __must_check current_clr_polling_and_test(void) >@@ -76,7 +76,7 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) > */ > smp_mb__after_atomic(); > >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > > #else >@@ -85,11 +85,11 @@ static inline void __current_clr_polling(void) { } > > static inline bool __must_check current_set_polling_and_test(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > static inline bool __must_check current_clr_polling_and_test(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > #endif > >diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h >index be65de65fe612..ec46e3b49ee99 100644 >--- a/include/linux/serial_8250.h >+++ b/include/linux/serial_8250.h >@@ -153,6 +153,8 @@ struct uart_8250_port { > #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA > unsigned char msr_saved_flags; > >+ bool console_newline_needed; >+ > struct uart_8250_dma *dma; > const struct uart_8250_ops *ops; > >@@ -204,6 +206,10 @@ void serial8250_init_port(struct uart_8250_port *up); > void serial8250_set_defaults(struct uart_8250_port *up); > void serial8250_console_write(struct uart_8250_port *up, const char *s, > unsigned int count); >+bool serial8250_console_write_atomic(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt); >+bool serial8250_console_write_thread(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt); > int serial8250_console_setup(struct uart_port *port, char *options, bool probe); > int serial8250_console_exit(struct uart_port *port); > >diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h >index 55b1f3ba48ac1..b003e658bb39e 100644 >--- a/include/linux/serial_core.h >+++ b/include/linux/serial_core.h >@@ -488,6 +488,7 @@ struct uart_port { > struct uart_icount icount; /* statistics */ > > struct console *cons; /* struct console, if any */ >+ bool nbcon_locked_port; /* True, if the port is locked by nbcon */ > /* flags must be updated while holding port mutex */ > upf_t flags; > >@@ -595,6 +596,7 @@ struct uart_port { > static inline void uart_port_lock(struct uart_port *up) > { > spin_lock(&up->lock); >+ uart_nbcon_acquire(up); > } > > /** >@@ -604,6 +606,7 @@ static inline void uart_port_lock(struct uart_port *up) > static inline void uart_port_lock_irq(struct uart_port *up) > { > spin_lock_irq(&up->lock); >+ uart_nbcon_acquire(up); > } > > /** >@@ -614,6 +617,7 @@ static inline void uart_port_lock_irq(struct uart_port *up) > static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) > { > spin_lock_irqsave(&up->lock, *flags); >+ uart_nbcon_acquire(up); > } > > /** >@@ -624,7 +628,11 @@ static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *f > */ > static inline bool uart_port_trylock(struct uart_port *up) > { >- return spin_trylock(&up->lock); >+ if (!spin_trylock(&up->lock)) >+ return false; >+ >+ uart_nbcon_acquire(up); >+ return true; > } > > /** >@@ -636,7 +644,11 @@ static inline bool uart_port_trylock(struct uart_port *up) > */ 
> static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags) > { >- return spin_trylock_irqsave(&up->lock, *flags); >+ if (!spin_trylock_irqsave(&up->lock, *flags)) >+ return false; >+ >+ uart_nbcon_acquire(up); >+ return true; > } > > /** >@@ -645,6 +657,7 @@ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long > */ > static inline void uart_port_unlock(struct uart_port *up) > { >+ uart_nbcon_release(up); > spin_unlock(&up->lock); > } > >@@ -654,6 +667,7 @@ static inline void uart_port_unlock(struct uart_port *up) > */ > static inline void uart_port_unlock_irq(struct uart_port *up) > { >+ uart_nbcon_release(up); > spin_unlock_irq(&up->lock); > } > >@@ -663,6 +677,19 @@ static inline void uart_port_unlock_irq(struct uart_port *up) > * @flags: The saved interrupt flags for restore > */ > static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) >+{ >+ uart_nbcon_release(up); >+ spin_unlock_irqrestore(&up->lock, flags); >+} >+ >+/* Only for use in the console->driver_enter() callback. */ >+static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) >+{ >+ spin_lock_irqsave(&up->lock, *flags); >+} >+ >+/* Only for use in the console->driver_exit() callback. */ >+static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) > { > spin_unlock_irqrestore(&up->lock, flags); > } >diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h >index 9ea0b28068f49..5ded1450ac1a1 100644 >--- a/include/linux/thread_info.h >+++ b/include/linux/thread_info.h >@@ -59,6 +59,16 @@ enum syscall_work_bit { > > #include <asm/thread_info.h> > >+#ifdef CONFIG_PREEMPT_BUILD_AUTO >+# define TIF_NEED_RESCHED_LAZY TIF_ARCH_RESCHED_LAZY >+# define _TIF_NEED_RESCHED_LAZY _TIF_ARCH_RESCHED_LAZY >+# define TIF_NEED_RESCHED_LAZY_OFFSET (TIF_NEED_RESCHED_LAZY - TIF_NEED_RESCHED) >+#else >+# define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED >+# define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED >+# define TIF_NEED_RESCHED_LAZY_OFFSET 0 >+#endif >+ > #ifdef __KERNEL__ > > #ifndef arch_set_restart_data >@@ -185,6 +195,13 @@ static __always_inline bool tif_need_resched(void) > (unsigned long *)(¤t_thread_info()->flags)); > } > >+static __always_inline bool tif_need_resched_lazy(void) >+{ >+ return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && >+ arch_test_bit(TIF_NEED_RESCHED_LAZY, >+ (unsigned long *)(¤t_thread_info()->flags)); >+} >+ > #else > > static __always_inline bool tif_need_resched(void) >@@ -193,6 +210,13 @@ static __always_inline bool tif_need_resched(void) > (unsigned long *)(¤t_thread_info()->flags)); > } > >+static __always_inline bool tif_need_resched_lazy(void) >+{ >+ return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && >+ test_bit(TIF_NEED_RESCHED_LAZY, >+ (unsigned long *)(¤t_thread_info()->flags)); >+} >+ > #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ > > #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES >diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h >index d68ff9b1247f9..0681b3d5a85c6 100644 >--- a/include/linux/trace_events.h >+++ b/include/linux/trace_events.h >@@ -178,8 +178,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); > > enum trace_flag_type { > TRACE_FLAG_IRQS_OFF = 0x01, >- TRACE_FLAG_IRQS_NOSUPPORT = 0x02, >- TRACE_FLAG_NEED_RESCHED = 0x04, >+ TRACE_FLAG_NEED_RESCHED = 0x02, >+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x04, > TRACE_FLAG_HARDIRQ = 0x08, > TRACE_FLAG_SOFTIRQ = 0x10, > TRACE_FLAG_PREEMPT_RESCHED 
= 0x20, >@@ -205,11 +205,11 @@ static inline unsigned int tracing_gen_ctx(void) > > static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) > { >- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); >+ return tracing_gen_ctx_irq_test(0); > } > static inline unsigned int tracing_gen_ctx(void) > { >- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); >+ return tracing_gen_ctx_irq_test(0); > } > #endif > >diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt >index c2f1fd95a8214..0f3d4c2a41cb7 100644 >--- a/kernel/Kconfig.preempt >+++ b/kernel/Kconfig.preempt >@@ -11,6 +11,13 @@ config PREEMPT_BUILD > select PREEMPTION > select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK > >+config PREEMPT_BUILD_AUTO >+ bool >+ select PREEMPT_BUILD >+ >+config HAVE_PREEMPT_AUTO >+ bool >+ > choice > prompt "Preemption Model" > default PREEMPT_NONE >@@ -67,9 +74,17 @@ config PREEMPT > embedded system with latency requirements in the milliseconds > range. > >+config PREEMPT_AUTO >+ bool "Automagic preemption mode with runtime tweaking support" >+ depends on HAVE_PREEMPT_AUTO >+ select PREEMPT_BUILD_AUTO >+ help >+ This option builds a fully preemptible kernel in which the >+ scheduler requests lazy rescheduling (TIF_NEED_RESCHED_LAZY) >+ whenever an immediate preemption is not strictly required. This >+ reduces preemption overhead while keeping latencies low, and the >+ effective preemption behaviour can still be tweaked at run time. >+ > config PREEMPT_RT > bool "Fully Preemptible Kernel (Real-Time)" > depends on EXPERT && ARCH_SUPPORTS_RT >+ select PREEMPT_BUILD_AUTO if HAVE_PREEMPT_AUTO > select PREEMPTION > help > This option turns the kernel into a real-time kernel by replacing >@@ -95,7 +110,7 @@ config PREEMPTION > > config PREEMPT_DYNAMIC > bool "Preemption behaviour defined on boot" >- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT >+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT && !PREEMPT_AUTO > select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY > select PREEMPT_BUILD > default y if HAVE_PREEMPT_DYNAMIC_CALL >diff --git a/kernel/entry/common.c b/kernel/entry/common.c >index 88cb3c88aaa5c..d78b109750a3c 100644 >--- a/kernel/entry/common.c >+++ b/kernel/entry/common.c >@@ -92,7 +92,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, > > local_irq_enable_exit_to_user(ti_work); > >- if (ti_work & _TIF_NEED_RESCHED) >+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > schedule(); > > if (ti_work & _TIF_UPROBE) >@@ -301,7 +301,7 @@ void raw_irqentry_exit_cond_resched(void) > rcu_irq_exit_check_preempt(); > if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) > WARN_ON_ONCE(!on_thread_stack()); >- if (need_resched()) >+ if (test_tsk_need_resched(current)) > preempt_schedule_irq(); > } > } >diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c >index 2e0f75bcb7fd1..d952fa5ee8801 100644 >--- a/kernel/entry/kvm.c >+++ b/kernel/entry/kvm.c >@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) > return -EINTR; > } > >- if (ti_work & _TIF_NEED_RESCHED) >+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > schedule(); > > if (ti_work & _TIF_NOTIFY_RESUME) >diff --git a/kernel/events/core.c b/kernel/events/core.c >index f0f0f71213a1d..d5af4d03c2680 100644 >--- a/kernel/events/core.c >+++ b/kernel/events/core.c >@@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) > state = PERF_EVENT_STATE_OFF; > } > >- if (event->pending_sigtrap) { >- bool dec = true; >- >- event->pending_sigtrap = 0; >- if (state != PERF_EVENT_STATE_OFF && >- !event->pending_work) { >- event->pending_work = 1; >- dec = false; >- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); >- task_work_add(current, &event->pending_task, TWA_RESUME); >- } >- if (dec) >- 
local_dec(&event->ctx->nr_pending); >- } >- > perf_event_set_state(event, state); > > if (!is_software_event(event)) >@@ -2464,7 +2449,7 @@ static void __perf_event_disable(struct perf_event *event, > * hold the top-level event's child_mutex, so any descendant that > * goes to exit will block in perf_event_exit_event(). > * >- * When called from perf_pending_irq it's OK because event->ctx >+ * When called from perf_pending_disable it's OK because event->ctx > * is the current context on this CPU and preemption is disabled, > * hence we can't get into perf_event_task_sched_out for this context. > */ >@@ -2504,7 +2489,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable); > void perf_event_disable_inatomic(struct perf_event *event) > { > event->pending_disable = 1; >- irq_work_queue(&event->pending_irq); >+ irq_work_queue(&event->pending_disable_irq); > } > > #define MAX_INTERRUPTS (~0ULL) >@@ -5190,6 +5175,7 @@ static void perf_addr_filters_splice(struct perf_event *event, > static void _free_event(struct perf_event *event) > { > irq_work_sync(&event->pending_irq); >+ irq_work_sync(&event->pending_disable_irq); > > unaccount_event(event); > >@@ -6726,7 +6712,7 @@ static void perf_sigtrap(struct perf_event *event) > /* > * Deliver the pending work in-event-context or follow the context. > */ >-static void __perf_pending_irq(struct perf_event *event) >+static void __perf_pending_disable(struct perf_event *event) > { > int cpu = READ_ONCE(event->oncpu); > >@@ -6741,11 +6727,6 @@ static void __perf_pending_irq(struct perf_event *event) > * Yay, we hit home and are in the context of the event. > */ > if (cpu == smp_processor_id()) { >- if (event->pending_sigtrap) { >- event->pending_sigtrap = 0; >- perf_sigtrap(event); >- local_dec(&event->ctx->nr_pending); >- } > if (event->pending_disable) { > event->pending_disable = 0; > perf_event_disable_local(event); >@@ -6769,11 +6750,26 @@ static void __perf_pending_irq(struct perf_event *event) > * irq_work_queue(); // FAILS > * > * irq_work_run() >- * perf_pending_irq() >+ * perf_pending_disable() > * > * But the event runs on CPU-B and wants disabling there. > */ >- irq_work_queue_on(&event->pending_irq, cpu); >+ irq_work_queue_on(&event->pending_disable_irq, cpu); >+} >+ >+static void perf_pending_disable(struct irq_work *entry) >+{ >+ struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); >+ int rctx; >+ >+ /* >+ * If we 'fail' here, that's OK, it means recursion is already disabled >+ * and we won't recurse 'further'. >+ */ >+ rctx = perf_swevent_get_recursion_context(); >+ __perf_pending_disable(event); >+ if (rctx >= 0) >+ perf_swevent_put_recursion_context(rctx); > } > > static void perf_pending_irq(struct irq_work *entry) >@@ -6796,8 +6792,6 @@ static void perf_pending_irq(struct irq_work *entry) > perf_event_wakeup(event); > } > >- __perf_pending_irq(event); >- > if (rctx >= 0) > perf_swevent_put_recursion_context(rctx); > } >@@ -6805,14 +6799,6 @@ static void perf_pending_irq(struct irq_work *entry) > static void perf_pending_task(struct callback_head *head) > { > struct perf_event *event = container_of(head, struct perf_event, pending_task); >- int rctx; >- >- /* >- * If we 'fail' here, that's OK, it means recursion is already disabled >- * and we won't recurse 'further'. 
>- */ >- preempt_disable_notrace(); >- rctx = perf_swevent_get_recursion_context(); > > if (event->pending_work) { > event->pending_work = 0; >@@ -6820,10 +6806,6 @@ static void perf_pending_task(struct callback_head *head) > local_dec(&event->ctx->nr_pending); > } > >- if (rctx >= 0) >- perf_swevent_put_recursion_context(rctx); >- preempt_enable_notrace(); >- > put_event(event); > } > >@@ -9592,13 +9574,23 @@ static int __perf_event_overflow(struct perf_event *event, > > if (regs) > pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; >- if (!event->pending_sigtrap) { >- event->pending_sigtrap = pending_id; >+ if (!event->pending_work) { >+ event->pending_work = pending_id; > local_inc(&event->ctx->nr_pending); >+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); >+ task_work_add(current, &event->pending_task, TWA_RESUME); >+ /* >+ * The NMI path returns directly to userland. The >+ * irq_work is raised as a dummy interrupt to ensure >+ * regular return path to user is taken and task_work >+ * is processed. >+ */ >+ if (in_nmi()) >+ irq_work_queue(&event->pending_disable_irq); > } else if (event->attr.exclude_kernel && valid_sample) { > /* > * Should not be able to return to user space without >- * consuming pending_sigtrap; with exceptions: >+ * consuming pending_work; with exceptions: > * > * 1. Where !exclude_kernel, events can overflow again > * in the kernel without returning to user space. >@@ -9608,13 +9600,12 @@ static int __perf_event_overflow(struct perf_event *event, > * To approximate progress (with false negatives), > * check 32-bit hash of the current IP. > */ >- WARN_ON_ONCE(event->pending_sigtrap != pending_id); >+ WARN_ON_ONCE(event->pending_work != pending_id); > } > > event->pending_addr = 0; > if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) > event->pending_addr = data->addr; >- irq_work_queue(&event->pending_irq); > } > > READ_ONCE(event->overflow_handler)(event, data, regs); >@@ -11935,6 +11926,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, > > init_waitqueue_head(&event->waitq); > init_irq_work(&event->pending_irq, perf_pending_irq); >+ event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); > init_task_work(&event->pending_task, perf_pending_task); > > mutex_init(&event->mmap_mutex); >@@ -13049,6 +13041,13 @@ static void sync_child_event(struct perf_event *child_event) > &parent_event->child_total_time_running); > } > >+static bool task_work_cb_match(struct callback_head *cb, void *data) >+{ >+ struct perf_event *event = container_of(cb, struct perf_event, pending_task); >+ >+ return event == data; >+} >+ > static void > perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) > { >@@ -13088,6 +13087,17 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) > * Kick perf_poll() for is_event_hup(); > */ > perf_event_wakeup(parent_event); >+ /* >+ * Cancel pending task_work and update counters if it has not >+ * yet been delivered to userland. free_event() expects the >+ * reference counter to be 1, and keeping the event around until the >+ * task returns to userland would be unexpected.
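>+ *
>+ * Note: task_work_cancel_match() succeeds only if pending_task is
>+ * still queued, i.e. the work never ran. Only in that case must the
>+ * extra reference and the nr_pending count taken in
>+ * __perf_event_overflow() be dropped here.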
>+ */ >+ if (event->pending_work && >+ task_work_cancel_match(current, task_work_cb_match, event)) { >+ put_event(event); >+ local_dec(&event->ctx->nr_pending); >+ } > free_event(event); > put_event(parent_event); > return; >diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c >index 1d4bc493b2f4b..486c68c11bbe2 100644 >--- a/kernel/ksysfs.c >+++ b/kernel/ksysfs.c >@@ -179,6 +179,15 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size); > > #endif /* CONFIG_CRASH_CORE */ > >+#if defined(CONFIG_PREEMPT_RT) >+static ssize_t realtime_show(struct kobject *kobj, >+ struct kobj_attribute *attr, char *buf) >+{ >+ return sprintf(buf, "%d\n", 1); >+} >+KERNEL_ATTR_RO(realtime); >+#endif >+ > /* whether file capabilities are enabled */ > static ssize_t fscaps_show(struct kobject *kobj, > struct kobj_attribute *attr, char *buf) >@@ -274,6 +283,9 @@ static struct attribute * kernel_attrs[] = { > #ifndef CONFIG_TINY_RCU > &rcu_expedited_attr.attr, > &rcu_normal_attr.attr, >+#endif >+#ifdef CONFIG_PREEMPT_RT >+ &realtime_attr.attr, > #endif > NULL > }; >diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c >index 151bd3de59363..5c21ba41e3087 100644 >--- a/kernel/locking/lockdep.c >+++ b/kernel/locking/lockdep.c >@@ -56,6 +56,7 @@ > #include <linux/kprobes.h> > #include <linux/lockdep.h> > #include <linux/context_tracking.h> >+#include <linux/console.h> > > #include <asm/sections.h> > >@@ -3971,6 +3972,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, > if (!debug_locks_off() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("================================\n"); > pr_warn("WARNING: inconsistent lock state\n"); >@@ -3999,6 +4002,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > /* >diff --git a/kernel/panic.c b/kernel/panic.c >index 2807639aab51d..9fa44bc38f466 100644 >--- a/kernel/panic.c >+++ b/kernel/panic.c >@@ -370,6 +370,8 @@ void panic(const char *fmt, ...) > */ > atomic_notifier_call_chain(&panic_notifier_list, 0, buf); > >+ printk_legacy_allow_panic_sync(); >+ > panic_print_sys_info(false); > > kmsg_dump(KMSG_DUMP_PANIC); >@@ -446,6 +448,15 @@ void panic(const char *fmt, ...) > > /* Do not scroll important messages printed above */ > suppress_printk = 1; >+ >+ /* >+ * The final messages may not have been printed if in a context that >+ * defers printing (such as NMI) and irq_work is not available. >+ * Explicitly flush the kernel log buffer one last time. >+ */ >+ console_flush_on_panic(CONSOLE_FLUSH_PENDING); >+ nbcon_atomic_flush_unsafe(); >+ > local_irq_enable(); > for (i = 0; ; i += PANIC_TIMER_STEP) { > touch_softlockup_watchdog(); >@@ -623,6 +634,7 @@ bool oops_may_print(void) > */ > void oops_enter(void) > { >+ nbcon_cpu_emergency_enter(); > tracing_off(); > /* can't trust the integrity of the kernel anymore: */ > debug_locks_off(); >@@ -645,6 +657,7 @@ void oops_exit(void) > { > do_oops_enter_exit(); > print_oops_end_marker(); >+ nbcon_cpu_emergency_exit(); > kmsg_dump(KMSG_DUMP_OOPS); > } > >@@ -656,6 +669,8 @@ struct warn_args { > void __warn(const char *file, int line, void *caller, unsigned taint, > struct pt_regs *regs, struct warn_args *args) > { >+ nbcon_cpu_emergency_enter(); >+ > disable_trace_on_warning(); > > if (file) >@@ -686,6 +701,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, > > /* Just a warning, don't kill lockdep. 
*/ > add_taint(taint, LOCKDEP_STILL_OK); >+ >+ nbcon_cpu_emergency_exit(); > } > > #ifdef CONFIG_BUG >diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h >index 6c2afee5ef620..1abb4207186f0 100644 >--- a/kernel/printk/internal.h >+++ b/kernel/printk/internal.h >@@ -4,6 +4,7 @@ > */ > #include <linux/percpu.h> > #include <linux/console.h> >+#include <linux/jump_label.h> > #include "printk_ringbuffer.h" > > #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) >@@ -20,6 +21,13 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, > (con->flags & CON_BOOT) ? "boot" : "", \ > con->name, con->index, ##__VA_ARGS__) > >+#ifdef CONFIG_PREEMPT_RT >+# define force_printkthreads() (true) >+#else >+DECLARE_STATIC_KEY_FALSE(force_printkthreads_key); >+# define force_printkthreads() (static_branch_unlikely(&force_printkthreads_key)) >+#endif >+ > #ifdef CONFIG_PRINTK > > #ifdef CONFIG_PRINTK_CALLER >@@ -44,6 +52,17 @@ enum printk_info_flags { > }; > > extern struct printk_ringbuffer *prb; >+extern bool printk_threads_enabled; >+extern bool have_legacy_console; >+extern bool have_boot_console; >+ >+/* >+ * Specifies if the console lock/unlock dance is needed for console >+ * printing. If @have_boot_console is true, the nbcon consoles will >+ * be printed serially along with the legacy consoles because nbcon >+ * consoles cannot print simultaneously with boot consoles. >+ */ >+#define printing_via_unlock (have_legacy_console || have_boot_console) > > __printf(4, 0) > int vprintk_store(int facility, int level, >@@ -71,12 +90,79 @@ void defer_console_output(void); > > u16 printk_parse_prefix(const char *text, int *level, > enum printk_info_flags *flags); >+void console_lock_spinning_enable(void); >+int console_lock_spinning_disable_and_check(int cookie); > > u64 nbcon_seq_read(struct console *con); > void nbcon_seq_force(struct console *con, u64 seq); > bool nbcon_alloc(struct console *con); > void nbcon_init(struct console *con); > void nbcon_free(struct console *con); >+enum nbcon_prio nbcon_get_default_prio(void); >+void nbcon_atomic_flush_all(void); >+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, >+ int cookie, bool use_atomic); >+void nbcon_kthread_create(struct console *con); >+void nbcon_wake_threads(void); >+void nbcon_legacy_kthread_create(void); >+ >+/* >+ * Check if the given console is currently capable and allowed to print >+ * records. Note that this function does not consider the current context, >+ * which can also play a role in deciding if @con can be used to print >+ * records. >+ */ >+static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) >+{ >+ if (!(flags & CON_ENABLED)) >+ return false; >+ >+ if ((flags & CON_SUSPENDED)) >+ return false; >+ >+ if (flags & CON_NBCON) { >+ if (use_atomic) { >+ if (!con->write_atomic) >+ return false; >+ } else { >+ if (!con->write_thread) >+ return false; >+ } >+ } else { >+ if (!con->write) >+ return false; >+ } >+ >+ /* >+ * Console drivers may assume that per-cpu resources have been >+ * allocated. So unless they're explicitly marked as being able to >+ * cope (CON_ANYTIME) don't call them until this CPU is officially up. 
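>+ *
>+ * Typical caller pattern, mirroring the printing paths in this
>+ * patch (sketch):
>+ *
>+ *   cookie = console_srcu_read_lock();
>+ *   flags = console_srcu_read_flags(con);
>+ *   if (console_is_usable(con, flags, use_atomic))
>+ *       ... emit a record via the matching write callback ...
>+ *   console_srcu_read_unlock(cookie);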
>+ */ >+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) >+ return false; >+ >+ return true; >+} >+ >+/** >+ * nbcon_kthread_wake - Wake up a printk thread >+ * @con: Console to operate on >+ */ >+static inline void nbcon_kthread_wake(struct console *con) >+{ >+ /* >+ * Guarantee any new records can be seen by tasks preparing to wait >+ * before this context checks if the rcuwait is empty. >+ * >+ * The full memory barrier in rcuwait_wake_up() pairs with the full >+ * memory barrier within set_current_state() of >+ * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait() >+ * adds the waiter but before it has checked the wait condition. >+ * >+ * This pairs with nbcon_kthread_func:A. >+ */ >+ rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */ >+} > > #else > >@@ -84,6 +170,11 @@ void nbcon_free(struct console *con); > #define PRINTK_MESSAGE_MAX 0 > #define PRINTKRB_RECORD_MAX 0 > >+static inline void nbcon_kthread_wake(struct console *con) { } >+static inline void nbcon_kthread_create(struct console *con) { } >+#define printk_threads_enabled (false) >+#define printing_via_unlock (false) >+ > /* > * In !PRINTK builds we still export console_sem > * semaphore and some of console functions (console_unlock()/etc.), so >@@ -98,6 +189,13 @@ static inline void nbcon_seq_force(struct console *con, u64 seq) { } > static inline bool nbcon_alloc(struct console *con) { return false; } > static inline void nbcon_init(struct console *con) { } > static inline void nbcon_free(struct console *con) { } >+static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; } >+static inline void nbcon_atomic_flush_all(void) { } >+static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, >+ int cookie, bool use_atomic) { return false; } >+ >+static inline bool console_is_usable(struct console *con, short flags, >+ bool use_atomic) { return false; } > > #endif /* CONFIG_PRINTK */ > >diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c >index b96077152f49d..f2d59a0a93e58 100644 >--- a/kernel/printk/nbcon.c >+++ b/kernel/printk/nbcon.c >@@ -5,7 +5,11 @@ > #include <linux/kernel.h> > #include <linux/console.h> > #include <linux/delay.h> >+#include <linux/kthread.h> > #include <linux/slab.h> >+#include <linux/serial_core.h> >+#include <linux/syscore_ops.h> >+#include "printk_ringbuffer.h" > #include "internal.h" > /* > * Printk console printing implementation for consoles which does not depend >@@ -140,39 +144,6 @@ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_sta > return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom); > } > >-#ifdef CONFIG_64BIT >- >-#define __seq_to_nbcon_seq(seq) (seq) >-#define __nbcon_seq_to_seq(seq) (seq) >- >-#else /* CONFIG_64BIT */ >- >-#define __seq_to_nbcon_seq(seq) ((u32)seq) >- >-static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq) >-{ >- u64 seq; >- u64 rb_next_seq; >- >- /* >- * The provided sequence is only the lower 32 bits of the ringbuffer >- * sequence. It needs to be expanded to 64bit. Get the next sequence >- * number from the ringbuffer and fold it. >- * >- * Having a 32bit representation in the console is sufficient. >- * If a console ever gets more than 2^31 records behind >- * the ringbuffer then this is the least of the problems. >- * >- * Also the access to the ring buffer is always safe. 
>- */ >- rb_next_seq = prb_next_seq(prb); >- seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq); >- >- return seq; >-} >- >-#endif /* CONFIG_64BIT */ >- > /** > * nbcon_seq_read - Read the current console sequence > * @con: Console to read the sequence of >@@ -183,7 +154,7 @@ u64 nbcon_seq_read(struct console *con) > { > unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq)); > >- return __nbcon_seq_to_seq(nbcon_seq); >+ return __ulseq_to_u64seq(prb, nbcon_seq); > } > > /** >@@ -204,7 +175,7 @@ void nbcon_seq_force(struct console *con, u64 seq) > */ > u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); > >- atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq)); >+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); > > /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */ > con->seq = 0; >@@ -223,17 +194,19 @@ void nbcon_seq_force(struct console *con, u64 seq) > */ > static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) > { >- unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq); >+ unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq); > struct console *con = ctxt->console; > > if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq, >- __seq_to_nbcon_seq(new_seq))) { >+ __u64seq_to_ulseq(new_seq))) { > ctxt->seq = new_seq; > } else { > ctxt->seq = nbcon_seq_read(con); > } > } > >+bool printk_threads_enabled __ro_after_init; >+ > /** > * nbcon_context_try_acquire_direct - Try to acquire directly > * @ctxt: The context of the caller >@@ -564,6 +537,7 @@ static struct printk_buffers panic_nbcon_pbufs; > * nbcon_context_try_acquire - Try to acquire nbcon console > * @ctxt: The context of the caller > * >+ * Context: Any context which could not be migrated to another CPU. > * Return: True if the console was acquired. False otherwise. > * > * If the caller allowed an unsafe hostile takeover, on success the >@@ -571,7 +545,6 @@ static struct printk_buffers panic_nbcon_pbufs; > * in an unsafe state. Otherwise, on success the caller may assume > * the console is not in an unsafe state. > */ >-__maybe_unused > static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) > { > unsigned int cpu = smp_processor_id(); >@@ -857,9 +830,42 @@ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) > } > EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); > >+/** >+ * nbcon_reacquire - Reacquire a console after losing ownership >+ * @wctxt: The write context that was handed to the write function >+ * >+ * Since ownership can be lost at any time due to handover or takeover, a >+ * printing context _should_ be prepared to back out immediately and >+ * carefully. However, there are many scenarios where the context _must_ >+ * reacquire ownership in order to finalize or revert hardware changes. >+ * >+ * This function allows a context to reacquire ownership using the same >+ * priority as its previous ownership. >+ * >+ * Note that for printing contexts, after a successful reacquire the >+ * context will have no output buffer because that has been lost. This >+ * function cannot be used to resume printing. 
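>+ *
>+ * Sketch of the intended use in a console write callback (hypothetical
>+ * driver code, not part of this patch):
>+ *
>+ *   if (!nbcon_enter_unsafe(wctxt)) {
>+ *       nbcon_reacquire(wctxt);
>+ *       ... finalize or revert hardware state ...
>+ *       return false;   /\* no more output possible \*\/
>+ *   }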
>+ */ >+void nbcon_reacquire(struct nbcon_write_context *wctxt) >+{ >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); >+ struct console *con = ctxt->console; >+ struct nbcon_state cur; >+ >+ while (!nbcon_context_try_acquire(ctxt)) >+ cpu_relax(); >+ >+ wctxt->outbuf = NULL; >+ wctxt->len = 0; >+ nbcon_state_read(con, &cur); >+ wctxt->unsafe_takeover = cur.unsafe_takeover; >+} >+EXPORT_SYMBOL_GPL(nbcon_reacquire); >+ > /** > * nbcon_emit_next_record - Emit a record in the acquired context > * @wctxt: The write context that will be handed to the write function >+ * @use_atomic: True if the write_atomic callback is to be used > * > * Return: True if this context still owns the console. False if > * ownership was handed over or taken. >@@ -873,8 +879,7 @@ EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); > * When true is returned, @wctxt->ctxt.backlog indicates whether there are > * still records pending in the ringbuffer, > */ >-__maybe_unused >-static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) >+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) > { > struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); > struct console *con = ctxt->console; >@@ -885,7 +890,7 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > unsigned long con_dropped; > struct nbcon_state cur; > unsigned long dropped; >- bool done; >+ bool done = false; > > /* > * The printk buffers are filled within an unsafe section. This >@@ -924,17 +929,24 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > nbcon_state_read(con, &cur); > wctxt->unsafe_takeover = cur.unsafe_takeover; > >- if (con->write_atomic) { >+ if (use_atomic && >+ con->write_atomic) { > done = con->write_atomic(con, wctxt); >- } else { >- nbcon_context_release(ctxt); >- WARN_ON_ONCE(1); >- done = false; >+ >+ } else if (!use_atomic && >+ con->write_thread) { >+ done = con->write_thread(con, wctxt); > } > >- /* If not done, the emit was aborted. */ >- if (!done) >+ if (!done) { >+ /* >+ * The emit was aborted, probably due to a loss of ownership. >+ * Ensure ownership was lost or released before reporting the >+ * loss. >+ */ >+ nbcon_context_release(ctxt); > return false; >+ } > > /* > * Since any dropped message was successfully output, reset the >@@ -961,6 +973,514 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > return nbcon_context_exit_unsafe(ctxt); > } > >+/** >+ * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup >+ * @con: Console to operate on >+ * @ctxt: The acquire context that contains the state >+ * at console_acquire() >+ * >+ * Return: True if the thread should shutdown or if the console is >+ * allowed to print and a record is available. False otherwise. >+ * >+ * After the thread wakes up, it must first check if it should shutdown before >+ * attempting any printing. 
>+ */ >+static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) >+{ >+ bool is_usable; >+ short flags; >+ int cookie; >+ >+ if (kthread_should_stop()) >+ return true; >+ >+ cookie = console_srcu_read_lock(); >+ flags = console_srcu_read_flags(con); >+ is_usable = console_is_usable(con, flags, false); >+ console_srcu_read_unlock(cookie); >+ >+ if (!is_usable) >+ return false; >+ >+ /* Bring the sequence in @ctxt up to date */ >+ ctxt->seq = nbcon_seq_read(con); >+ >+ return prb_read_valid(prb, ctxt->seq, NULL); >+} >+ >+/** >+ * nbcon_kthread_func - The printer thread function >+ * @__console: Console to operate on >+ */ >+static int nbcon_kthread_func(void *__console) >+{ >+ struct console *con = __console; >+ struct nbcon_write_context wctxt = { >+ .ctxt.console = con, >+ .ctxt.prio = NBCON_PRIO_NORMAL, >+ }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ unsigned long flags; >+ short con_flags; >+ bool backlog; >+ int cookie; >+ int ret; >+ >+wait_for_event: >+ /* >+ * Guarantee this task is visible on the rcuwait before >+ * checking the wake condition. >+ * >+ * The full memory barrier within set_current_state() of >+ * ___rcuwait_wait_event() pairs with the full memory >+ * barrier within rcuwait_has_sleeper(). >+ * >+ * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. >+ */ >+ ret = rcuwait_wait_event(&con->rcuwait, >+ nbcon_kthread_should_wakeup(con, ctxt), >+ TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ >+ >+ if (kthread_should_stop()) >+ return 0; >+ >+ /* Wait was interrupted by a spurious signal, go back to sleep. */ >+ if (ret) >+ goto wait_for_event; >+ >+ do { >+ backlog = false; >+ >+ cookie = console_srcu_read_lock(); >+ >+ con_flags = console_srcu_read_flags(con); >+ >+ if (console_is_usable(con, con_flags, false)) { >+ con->driver_enter(con, &flags); >+ >+ /* >+ * Ensure this stays on the CPU to make handover and >+ * takeover possible. >+ */ >+ cant_migrate(); >+ >+ if (nbcon_context_try_acquire(ctxt)) { >+ /* >+ * If the emit fails, this context is no >+ * longer the owner. >+ */ >+ if (nbcon_emit_next_record(&wctxt, false)) { >+ nbcon_context_release(ctxt); >+ backlog = ctxt->backlog; >+ } >+ } >+ >+ con->driver_exit(con, flags); >+ } >+ >+ console_srcu_read_unlock(cookie); >+ >+ } while (backlog); >+ >+ goto wait_for_event; >+} >+ >+/** >+ * nbcon_irq_work - irq work to wake printk thread >+ * @irq_work: The irq work to operate on >+ */ >+static void nbcon_irq_work(struct irq_work *irq_work) >+{ >+ struct console *con = container_of(irq_work, struct console, irq_work); >+ >+ nbcon_kthread_wake(con); >+} >+ >+static inline bool rcuwait_has_sleeper(struct rcuwait *w) >+{ >+ bool has_sleeper; >+ >+ rcu_read_lock(); >+ /* >+ * Guarantee any new records can be seen by tasks preparing to wait >+ * before this context checks if the rcuwait is empty. >+ * >+ * This full memory barrier pairs with the full memory barrier within >+ * set_current_state() of ___rcuwait_wait_event(), which is called >+ * after prepare_to_rcuwait() adds the waiter but before it has >+ * checked the wait condition. >+ * >+ * This pairs with nbcon_kthread_func:A. 
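>+ *
>+ * Ordering sketch (illustrative):
>+ *
>+ *   waker (nbcon_wake_threads)      waiter (nbcon_kthread_func)
>+ *   store new record                prepare_to_rcuwait()
>+ *   smp_mb()                        set_current_state()
>+ *   read w->task                    re-check wake condition
>+ *
>+ * Either the waker observes the waiter's task pointer, or the
>+ * waiter's condition check observes the new record.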
>+ */ >+ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ >+ has_sleeper = !!rcu_dereference(w->task); >+ rcu_read_unlock(); >+ >+ return has_sleeper; >+} >+ >+/** >+ * nbcon_wake_threads - Wake up printing threads using irq_work >+ */ >+void nbcon_wake_threads(void) >+{ >+ struct console *con; >+ int cookie; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ /* >+ * Only schedule irq_work if the printing thread is >+ * actively waiting. If not waiting, the thread will >+ * notice by itself that it has work to do. >+ */ >+ if (con->kthread && rcuwait_has_sleeper(&con->rcuwait)) >+ irq_work_queue(&con->irq_work); >+ } >+ console_srcu_read_unlock(cookie); >+} >+ >+/* Track the nbcon emergency nesting per CPU. */ >+static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); >+static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; >+ >+/** >+ * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer >+ * >+ * Return: Either a pointer to the per CPU emergency nesting counter of >+ * the current CPU or to the init data during early boot. >+ */ >+static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) >+{ >+ /* >+ * The value of __printk_percpu_data_ready gets set in normal >+ * context and before SMP initialization. As a result it could >+ * never change while inside an nbcon emergency section. >+ */ >+ if (!printk_percpu_data_ready()) >+ return &early_nbcon_pcpu_emergency_nesting; >+ >+ return this_cpu_ptr(&nbcon_pcpu_emergency_nesting); >+} >+ >+/** >+ * nbcon_emit_one - Print one record for an nbcon console using the >+ * specified callback >+ * @wctxt: An initialized write context struct to use >+ * for this context >+ * @use_atomic: True if the write_atomic callback is to be used >+ * >+ * Return: False if the given console could not print a record or there >+ * are no more records to print, otherwise true. >+ * >+ * This is an internal helper to handle the locking of the console before >+ * calling nbcon_emit_next_record(). >+ */ >+static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) >+{ >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); >+ >+ if (!nbcon_context_try_acquire(ctxt)) >+ return false; >+ >+ /* >+ * nbcon_emit_next_record() returns false when the console was >+ * handed over or taken over. In both cases the context is no >+ * longer valid. >+ */ >+ if (!nbcon_emit_next_record(wctxt, use_atomic)) >+ return false; >+ >+ nbcon_context_release(ctxt); >+ >+ return ctxt->backlog; >+} >+ >+/** >+ * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon >+ * printing on the current CPU >+ * >+ * Context: Any context which could not be migrated to another CPU. >+ * Return: The nbcon_prio to use for acquiring an nbcon console in this >+ * context for printing. >+ */ >+enum nbcon_prio nbcon_get_default_prio(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ >+ if (this_cpu_in_panic()) >+ return NBCON_PRIO_PANIC; >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ if (*cpu_emergency_nesting) >+ return NBCON_PRIO_EMERGENCY; >+ >+ return NBCON_PRIO_NORMAL; >+} >+ >+/** >+ * nbcon_legacy_emit_next_record - Print one record for an nbcon console >+ * in legacy contexts >+ * @con: The console to print on >+ * @handover: Will be set to true if a printk waiter has taken over the >+ * console_lock, in which case the caller is no longer holding >+ * both the console_lock and the SRCU read lock. Otherwise it >+ * is set to false. 
>+ * @cookie: The cookie from the SRCU read lock. >+ * @use_atomic: True if the write_atomic callback is to be used >+ * >+ * Context: Any context which could not be migrated to another CPU. >+ * Return: True if a record could be printed, otherwise false. >+ * >+ * This function is meant to be called by console_flush_all() to print records >+ * on nbcon consoles from legacy context (printing via console unlocking). >+ * Essentially it is the nbcon version of console_emit_next_record(). >+ */ >+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, >+ int cookie, bool use_atomic) >+{ >+ struct nbcon_write_context wctxt = { }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ bool progress = false; >+ unsigned long flags; >+ >+ *handover = false; >+ >+ ctxt->console = con; >+ >+ if (use_atomic) { >+ /* Use the same procedure as console_emit_next_record(). */ >+ printk_safe_enter_irqsave(flags); >+ console_lock_spinning_enable(); >+ stop_critical_timings(); >+ >+ ctxt->prio = nbcon_get_default_prio(); >+ progress = nbcon_emit_one(&wctxt, use_atomic); >+ >+ start_critical_timings(); >+ *handover = console_lock_spinning_disable_and_check(cookie); >+ printk_safe_exit_irqrestore(flags); >+ } else { >+ con->driver_enter(con, &flags); >+ cant_migrate(); >+ >+ ctxt->prio = nbcon_get_default_prio(); >+ progress = nbcon_emit_one(&wctxt, use_atomic); >+ >+ con->driver_exit(con, flags); >+ } >+ >+ return progress; >+} >+ >+/** >+ * __nbcon_atomic_flush_all - Flush all nbcon consoles using their >+ * write_atomic() callback >+ * @stop_seq: Flush up until this record >+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers >+ */ >+static void __nbcon_atomic_flush_all(u64 stop_seq, bool allow_unsafe_takeover) >+{ >+ struct nbcon_write_context wctxt = { }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ struct console *con; >+ bool any_progress; >+ int cookie; >+ >+ do { >+ any_progress = false; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ unsigned long irq_flags; >+ >+ if (!(flags & CON_NBCON)) >+ continue; >+ >+ if (!console_is_usable(con, flags, true)) >+ continue; >+ >+ if (nbcon_seq_read(con) >= stop_seq) >+ continue; >+ >+ memset(ctxt, 0, sizeof(*ctxt)); >+ ctxt->console = con; >+ ctxt->spinwait_max_us = 2000; >+ ctxt->allow_unsafe_takeover = allow_unsafe_takeover; >+ >+ /* >+ * Atomic flushing does not use console driver >+ * synchronization (i.e. it does not hold the port >+ * lock for uart consoles). Therefore IRQs must be >+ * disabled to avoid being interrupted and then >+ * calling into a driver that will deadlock trying >+ * to acquire console ownership. >+ * >+ * This also disables migration in order to get the >+ * current CPU priority. >+ */ >+ local_irq_save(irq_flags); >+ >+ ctxt->prio = nbcon_get_default_prio(); >+ >+ any_progress |= nbcon_emit_one(&wctxt, true); >+ >+ local_irq_restore(irq_flags); >+ } >+ console_srcu_read_unlock(cookie); >+ } while (any_progress); >+} >+ >+/** >+ * nbcon_atomic_flush_all - Flush all nbcon consoles using their >+ * write_atomic() callback >+ * >+ * Flush the backlog up through the currently newest record. Any new >+ * records added while flushing will not be flushed. This is to avoid >+ * one CPU printing unbounded because other CPUs continue to add records.
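>+ *
>+ * For example, the panic() path in this patch relies on the atomic
>+ * flush (via the _unsafe variant below) to get the final messages out
>+ * when irq_work may no longer run:
>+ *
>+ *   console_flush_on_panic(CONSOLE_FLUSH_PENDING);
>+ *   nbcon_atomic_flush_unsafe();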
>+ */ >+void nbcon_atomic_flush_all(void) >+{ >+ __nbcon_atomic_flush_all(prb_next_reserve_seq(prb), false); >+} >+ >+/** >+ * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their >+ * write_atomic() callback and allowing unsafe hostile takeovers >+ * >+ * Flush the backlog up through the currently newest record. Unsafe hostile >+ * takeovers will be performed, if necessary. >+ */ >+void nbcon_atomic_flush_unsafe(void) >+{ >+ __nbcon_atomic_flush_all(prb_next_reserve_seq(prb), true); >+} >+ >+/** >+ * nbcon_cpu_emergency_enter - Enter an emergency section where printk() >+ * messages for that CPU are only stored >+ * >+ * Upon exiting the emergency section, all stored messages are flushed. >+ * >+ * Context: Any context. Disables preemption. >+ * >+ * When within an emergency section, no printing occurs on that CPU. This >+ * is to allow all emergency messages to be dumped into the ringbuffer before >+ * flushing the ringbuffer. The actual printing occurs when exiting the >+ * outermost emergency section. >+ */ >+void nbcon_cpu_emergency_enter(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ >+ preempt_disable(); >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ (*cpu_emergency_nesting)++; >+} >+ >+/** >+ * nbcon_cpu_emergency_exit - Exit an emergency section and flush the >+ * stored messages >+ * >+ * Flushing only occurs when exiting all nesting for the CPU. >+ * >+ * Context: Any context. Enables preemption. >+ */ >+void nbcon_cpu_emergency_exit(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ bool do_trigger_flush = false; >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ >+ WARN_ON_ONCE(*cpu_emergency_nesting == 0); >+ >+ if (*cpu_emergency_nesting == 1) >+ do_trigger_flush = true; >+ >+ /* Undo the nesting count of nbcon_cpu_emergency_enter(). */ >+ (*cpu_emergency_nesting)--; >+ >+ preempt_enable(); >+ >+ if (do_trigger_flush) >+ printk_trigger_flush(); >+} >+ >+/** >+ * nbcon_kthread_stop - Stop a printer thread >+ * @con: Console to operate on >+ */ >+static void nbcon_kthread_stop(struct console *con) >+{ >+ lockdep_assert_console_list_lock_held(); >+ >+ if (!con->kthread) >+ return; >+ >+ kthread_stop(con->kthread); >+ con->kthread = NULL; >+} >+ >+/** >+ * nbcon_kthread_create - Create a printer thread >+ * @con: Console to operate on >+ * >+ * If it fails, let the console proceed. The atomic part might >+ * be usable and useful. >+ */ >+void nbcon_kthread_create(struct console *con) >+{ >+ struct task_struct *kt; >+ >+ lockdep_assert_console_list_lock_held(); >+ >+ if (!(con->flags & CON_NBCON) || !con->write_thread) >+ return; >+ >+ if (!printk_threads_enabled || con->kthread) >+ return; >+ >+ /* >+ * Printer threads cannot be started as long as any boot console is >+ * registered because there is no way to synchronize the hardware >+ * registers between boot console code and regular console code. >+ */ >+ if (have_boot_console) >+ return; >+ >+ kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); >+ if (IS_ERR(kt)) { >+ con_printk(KERN_ERR, con, "failed to start printing thread\n"); >+ return; >+ } >+ >+ con->kthread = kt; >+ >+ /* >+ * It is important that console printing threads are scheduled >+ * shortly after a printk call and with generous runtime budgets. 
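>+ *
>+ * A niceness of -20 keeps SCHED_NORMAL policy but grants the maximum
>+ * weight: with the default weight table that is 88761/1024, roughly
>+ * 87 times the CPU share of a nice-0 task.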
>+ */ >+ sched_set_normal(con->kthread, -20); >+} >+ >+static int __init printk_setup_threads(void) >+{ >+ struct console *con; >+ >+ console_list_lock(); >+ printk_threads_enabled = true; >+ for_each_console(con) >+ nbcon_kthread_create(con); >+ if (force_printkthreads() && printing_via_unlock) >+ nbcon_legacy_kthread_create(); >+ console_list_unlock(); >+ return 0; >+} >+early_initcall(printk_setup_threads); >+ > /** > * nbcon_alloc - Allocate buffers needed by the nbcon console > * @con: Console to allocate buffers for >@@ -1007,8 +1527,11 @@ void nbcon_init(struct console *con) > /* nbcon_alloc() must have been called and successful! */ > BUG_ON(!con->pbufs); > >+ rcuwait_init(&con->rcuwait); >+ init_irq_work(&con->irq_work, nbcon_irq_work); > nbcon_seq_force(con, con->seq); > nbcon_state_set(con, &state); >+ nbcon_kthread_create(con); > } > > /** >@@ -1019,6 +1542,7 @@ void nbcon_free(struct console *con) > { > struct nbcon_state state = { }; > >+ nbcon_kthread_stop(con); > nbcon_state_set(con, &state); > > /* Boot consoles share global printk buffers. */ >@@ -1027,3 +1551,115 @@ void nbcon_free(struct console *con) > > con->pbufs = NULL; > } >+ >+static inline bool uart_is_nbcon(struct uart_port *up) >+{ >+ int cookie; >+ bool ret; >+ >+ if (!uart_console(up)) >+ return false; >+ >+ cookie = console_srcu_read_lock(); >+ ret = (console_srcu_read_flags(up->cons) & CON_NBCON); >+ console_srcu_read_unlock(cookie); >+ return ret; >+} >+ >+/** >+ * uart_nbcon_acquire - The second half of the port locking wrapper >+ * @up: The uart port whose @lock was locked >+ * >+ * The uart_port_lock() wrappers will first lock the spin_lock @up->lock. >+ * Then this function is called to implement nbcon-specific processing. >+ * >+ * If @up is an nbcon console, this console will be acquired and marked as >+ * unsafe. Otherwise this function does nothing. >+ * >+ * nbcon consoles acquired via the port lock wrapper always use priority >+ * NBCON_PRIO_NORMAL. >+ */ >+void uart_nbcon_acquire(struct uart_port *up) >+{ >+ struct console *con = up->cons; >+ struct nbcon_context ctxt; >+ >+ if (!uart_is_nbcon(up)) >+ return; >+ >+ WARN_ON_ONCE(up->nbcon_locked_port); >+ >+ do { >+ do { >+ memset(&ctxt, 0, sizeof(ctxt)); >+ ctxt.console = con; >+ ctxt.prio = NBCON_PRIO_NORMAL; >+ } while (!nbcon_context_try_acquire(&ctxt)); >+ >+ } while (!nbcon_context_enter_unsafe(&ctxt)); >+ >+ up->nbcon_locked_port = true; >+} >+EXPORT_SYMBOL_GPL(uart_nbcon_acquire); >+ >+/** >+ * uart_nbcon_release - The first half of the port unlocking wrapper >+ * @up: The uart port whose @lock is about to be unlocked >+ * >+ * The uart_port_unlock() wrappers will first call this function to implement >+ * nbcon-specific processing. Then afterwards the uart_port_unlock() wrappers >+ * will unlock the spin_lock @up->lock. >+ * >+ * If @up is an nbcon console, the console will be marked as safe and >+ * released. Otherwise this function does nothing. >+ * >+ * nbcon consoles acquired via the port lock wrapper always use priority >+ * NBCON_PRIO_NORMAL. 
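>+ *
>+ * The resulting pairing, as seen from a serial driver (sketch):
>+ *
>+ *   uart_port_lock_irqsave(port, &flags);     // spin_lock + uart_nbcon_acquire()
>+ *   ... access the port hardware ...
>+ *   uart_port_unlock_irqrestore(port, flags); // uart_nbcon_release() + spin_unlock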
>+ */ >+void uart_nbcon_release(struct uart_port *up) >+{ >+ struct console *con = up->cons; >+ struct nbcon_context ctxt = { >+ .console = con, >+ .prio = NBCON_PRIO_NORMAL, >+ }; >+ >+ if (!up->nbcon_locked_port) >+ return; >+ >+ if (nbcon_context_exit_unsafe(&ctxt)) >+ nbcon_context_release(&ctxt); >+ >+ up->nbcon_locked_port = false; >+} >+EXPORT_SYMBOL_GPL(uart_nbcon_release); >+ >+/** >+ * printk_kthread_shutdown - shutdown all threaded printers >+ * >+ * On system shutdown all threaded printers are stopped. This allows printk >+ * to transition back to atomic printing, thus providing a robust mechanism >+ * for the final shutdown/reboot messages to be output. >+ */ >+static void printk_kthread_shutdown(void) >+{ >+ struct console *con; >+ >+ console_list_lock(); >+ for_each_console(con) { >+ if (con->flags & CON_NBCON) >+ nbcon_kthread_stop(con); >+ } >+ console_list_unlock(); >+} >+ >+static struct syscore_ops printk_syscore_ops = { >+ .shutdown = printk_kthread_shutdown, >+}; >+ >+static int __init printk_init_ops(void) >+{ >+ register_syscore_ops(&printk_syscore_ops); >+ return 0; >+} >+device_initcall(printk_init_ops); >diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c >index f2444b581e16c..e29f77f4f8b46 100644 >--- a/kernel/printk/printk.c >+++ b/kernel/printk/printk.c >@@ -195,6 +195,17 @@ static int __init control_devkmsg(char *str) > } > __setup("printk.devkmsg=", control_devkmsg); > >+#if !defined(CONFIG_PREEMPT_RT) >+DEFINE_STATIC_KEY_FALSE(force_printkthreads_key); >+ >+static int __init setup_forced_printkthreads(char *arg) >+{ >+ static_branch_enable(&force_printkthreads_key); >+ return 0; >+} >+early_param("threadprintk", setup_forced_printkthreads); >+#endif >+ > char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; > #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) > int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, >@@ -282,6 +293,7 @@ EXPORT_SYMBOL(console_list_unlock); > * Return: A cookie to pass to console_srcu_read_unlock(). > */ > int console_srcu_read_lock(void) >+ __acquires(&console_srcu) > { > return srcu_read_lock_nmisafe(&console_srcu); > } >@@ -295,6 +307,7 @@ EXPORT_SYMBOL(console_srcu_read_lock); > * Counterpart to console_srcu_read_lock() > */ > void console_srcu_read_unlock(int cookie) >+ __releases(&console_srcu) > { > srcu_read_unlock_nmisafe(&console_srcu, cookie); > } >@@ -347,6 +360,29 @@ static bool panic_in_progress(void) > return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); > } > >+/* Return true if a panic is in progress on the current CPU. */ >+bool this_cpu_in_panic(void) >+{ >+ /* >+ * We can use raw_smp_processor_id() here because it is impossible for >+ * the task to be migrated to the panic_cpu, or away from it. If >+ * panic_cpu has already been set, and we're not currently executing on >+ * that CPU, then we never will be. >+ */ >+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); >+} >+ >+/* >+ * Return true if a panic is in progress on a remote CPU. >+ * >+ * On true, the local CPU should immediately release any printing resources >+ * that may be needed by the panic CPU. >+ */ >+bool other_cpu_in_panic(void) >+{ >+ return (panic_in_progress() && !this_cpu_in_panic()); >+} >+ > /* > * This is used for debugging the mess that is the VT code by > * keeping track if we have the console semaphore held. It's >@@ -438,14 +474,33 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; > /* syslog_lock protects syslog_* variables and write access to clear_seq. 
*/ > static DEFINE_MUTEX(syslog_lock); > >-#ifdef CONFIG_PRINTK > /* >- * During panic, heavy printk by other CPUs can delay the >- * panic and risk deadlock on console resources. >+ * Specifies if a legacy console is registered. If legacy consoles are >+ * present, it is necessary to perform the console_lock/console_unlock dance >+ * whenever console flushing should occur. > */ >-static int __read_mostly suppress_panic_printk; >+bool have_legacy_console; > >+/* >+ * Specifies if an nbcon console is registered. If nbcon consoles are present, >+ * synchronous printing of legacy consoles will not occur during panic until >+ * the backtrace has been stored to the ringbuffer. >+ */ >+bool have_nbcon_console; >+ >+/* >+ * Specifies if a boot console is registered. If boot consoles are present, >+ * nbcon consoles cannot print simultaneously and must be synchronized by >+ * the console lock. This is because boot consoles and nbcon consoles may >+ * have mapped the same hardware. >+ */ >+bool have_boot_console; >+ >+#ifdef CONFIG_PRINTK > DECLARE_WAIT_QUEUE_HEAD(log_wait); >+ >+static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); >+ > /* All 3 protected by @syslog_lock. */ > /* the next printk record to read by syslog(READ) or /proc/kmsg */ > static u64 syslog_seq; >@@ -1844,12 +1899,25 @@ static bool console_waiter; > * there may be a waiter spinning (like a spinlock). Also it must be > * ready to hand over the lock at the end of the section. > */ >-static void console_lock_spinning_enable(void) >+void console_lock_spinning_enable(void) > { >+ /* >+ * Do not use spinning in panic(). The panic CPU wants to keep the lock. >+ * Non-panic CPUs abandon the flush anyway. >+ * >+ * Just keep the lockdep annotation. The panic-CPU should avoid >+ * taking console_owner_lock because it might cause a deadlock. >+ * This looks like the easiest way to prevent false lockdep >+ * reports without handling races a lockless way. >+ */ >+ if (panic_in_progress()) >+ goto lockdep; >+ > raw_spin_lock(&console_owner_lock); > console_owner = current; > raw_spin_unlock(&console_owner_lock); > >+lockdep: > /* The waiter may spin on us after setting console_owner */ > spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); > } >@@ -1870,10 +1938,26 @@ static void console_lock_spinning_enable(void) > * > * Return: 1 if the lock rights were passed, 0 otherwise. > */ >-static int console_lock_spinning_disable_and_check(int cookie) >+int console_lock_spinning_disable_and_check(int cookie) > { > int waiter; > >+ /* >+ * Ignore spinning waiters during panic() because they might get stopped >+ * or blocked at any time. >+ * >+ * It is safe because nobody is allowed to start spinning during panic >+ * in the first place. If there has been a waiter then non-panic CPUs >+ * might stay spinning. They would get stopped anyway. The panic context >+ * will never start spinning and an interrupted spin on panic CPU will >+ * never continue. >+ */ >+ if (panic_in_progress()) { >+ /* Keep lockdep happy. */ >+ spin_release(&console_owner_dep_map, _THIS_IP_); >+ return 0; >+ } >+ > raw_spin_lock(&console_owner_lock); > waiter = READ_ONCE(console_waiter); > console_owner = NULL; >@@ -2259,55 +2343,123 @@ int vprintk_store(int facility, int level, > return ret; > } > >+static bool legacy_allow_panic_sync; >+ >+/* >+ * This acts as a one-way switch to allow legacy consoles to print from >+ * the printk() caller context on a panic CPU.
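>+ *
>+ * panic() flips this switch right after the panic notifiers have run
>+ * (see the kernel/panic.c hunk above). Until then the printk() caller
>+ * context on the panic CPU flushes only the nbcon consoles, giving
>+ * them a chance to get the panic messages out first.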
>+ */ >+void printk_legacy_allow_panic_sync(void) >+{ >+ legacy_allow_panic_sync = true; >+} >+ > asmlinkage int vprintk_emit(int facility, int level, > const struct dev_printk_info *dev_info, > const char *fmt, va_list args) > { >+ bool do_trylock_unlock = printing_via_unlock && >+ !force_printkthreads(); > int printed_len; >- bool in_sched = false; > > /* Suppress unimportant messages after panic happens */ > if (unlikely(suppress_printk)) > return 0; > >- if (unlikely(suppress_panic_printk) && >- atomic_read(&panic_cpu) != raw_smp_processor_id()) >+ /* >+ * The messages on the panic CPU are the most important. If >+ * non-panic CPUs are generating any messages, they will be >+ * silently dropped. >+ */ >+ if (other_cpu_in_panic()) > return 0; > > if (level == LOGLEVEL_SCHED) { > level = LOGLEVEL_DEFAULT; >- in_sched = true; >+ /* If called from the scheduler, we can not call up(). */ >+ do_trylock_unlock = false; > } > > printk_delay(level); > > printed_len = vprintk_store(facility, level, dev_info, fmt, args); > >- /* If called from the scheduler, we can not call up(). */ >- if (!in_sched) { >+ if (!have_boot_console && have_nbcon_console) { >+ bool is_panic_context = this_cpu_in_panic(); >+ >+ /* >+ * In panic, the legacy consoles are not allowed to print from >+ * the printk calling context unless explicitly allowed. This >+ * gives the safe nbcon consoles a chance to print out all the >+ * panic messages first. This restriction only applies if >+ * there are nbcon consoles registered. >+ */ >+ if (is_panic_context) >+ do_trylock_unlock &= legacy_allow_panic_sync; >+ >+ /* >+ * There are situations where nbcon atomic printing should >+ * happen in the printk() caller context: >+ * >+ * - When this CPU is in panic. >+ * >+ * - When booting, before the printing threads have been >+ * started. >+ * >+ * - During shutdown, since the printing threads may not get >+ * a chance to print the final messages. >+ * >+ * Note that if boot consoles are registered, the >+ * console_lock/console_unlock dance must be relied upon >+ * instead because nbcon consoles cannot print simultaneously >+ * with boot consoles. >+ */ >+ if (is_panic_context || >+ !printk_threads_enabled || >+ (system_state > SYSTEM_RUNNING)) { >+ nbcon_atomic_flush_all(); >+ } >+ } >+ >+ nbcon_wake_threads(); >+ >+ if (do_trylock_unlock) { > /* > * The caller may be holding system-critical or > * timing-sensitive locks. Disable preemption during > * printing of all remaining records to all consoles so that > * this context can return as soon as possible. Hopefully > * another printk() caller will take over the printing. >+ * >+ * Also, nbcon_get_default_prio() requires migration disabled. > */ > preempt_disable(); >+ > /* >- * Try to acquire and then immediately release the console >- * semaphore. The release will print out buffers. With the >- * spinning variant, this context tries to take over the >- * printing from another printing context. >+ * Do not emit for EMERGENCY priority. The console will be >+ * explicitly flushed when exiting the emergency section. > */ >- if (console_trylock_spinning()) >- console_unlock(); >+ if (nbcon_get_default_prio() == NBCON_PRIO_EMERGENCY) { >+ do_trylock_unlock = false; >+ } else { >+ /* >+ * Try to acquire and then immediately release the >+ * console semaphore. The release will print out >+ * buffers. With the spinning variant, this context >+ * tries to take over the printing from another >+ * printing context. 
>+ */ >+ if (console_trylock_spinning()) >+ console_unlock(); >+ } >+ > preempt_enable(); > } > >- if (in_sched) >- defer_console_output(); >- else >+ if (do_trylock_unlock) > wake_up_klogd(); >+ else >+ defer_console_output(); > > return printed_len; > } >@@ -2335,6 +2487,14 @@ EXPORT_SYMBOL(_printk); > static bool pr_flush(int timeout_ms, bool reset_on_progress); > static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); > >+static struct task_struct *nbcon_legacy_kthread; >+ >+static inline void wake_up_legacy_kthread(void) >+{ >+ if (nbcon_legacy_kthread) >+ wake_up_interruptible(&legacy_wait); >+} >+ > #else /* CONFIG_PRINTK */ > > #define printk_time false >@@ -2348,6 +2508,8 @@ static u64 syslog_seq; > static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } > static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } > >+static inline void nbcon_legacy_kthread_create(void) { } >+static inline void wake_up_legacy_kthread(void) { } > #endif /* CONFIG_PRINTK */ > > #ifdef CONFIG_EARLY_PRINTK >@@ -2563,6 +2725,8 @@ void suspend_console(void) > void resume_console(void) > { > struct console *con; >+ short flags; >+ int cookie; > > if (!console_suspend_enabled) > return; >@@ -2579,6 +2743,20 @@ void resume_console(void) > */ > synchronize_srcu(&console_srcu); > >+ /* >+ * Since this runs in task context, wake the threaded printers >+ * directly rather than scheduling irq_work to do it. >+ */ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ flags = console_srcu_read_flags(con); >+ if (flags & CON_NBCON) >+ nbcon_kthread_wake(con); >+ } >+ console_srcu_read_unlock(cookie); >+ >+ wake_up_legacy_kthread(); >+ > pr_flush(1000, true); > } > >@@ -2593,7 +2771,8 @@ void resume_console(void) > */ > static int console_cpu_notify(unsigned int cpu) > { >- if (!cpuhp_tasks_frozen) { >+ if (!cpuhp_tasks_frozen && printing_via_unlock && >+ !force_printkthreads()) { > /* If trylock fails, someone else is doing the printing */ > if (console_trylock()) > console_unlock(); >@@ -2601,26 +2780,6 @@ static int console_cpu_notify(unsigned int cpu) > return 0; > } > >-/* >- * Return true if a panic is in progress on a remote CPU. >- * >- * On true, the local CPU should immediately release any printing resources >- * that may be needed by the panic CPU. >- */ >-bool other_cpu_in_panic(void) >-{ >- if (!panic_in_progress()) >- return false; >- >- /* >- * We can use raw_smp_processor_id() here because it is impossible for >- * the task to be migrated to the panic_cpu, or away from it. If >- * panic_cpu has already been set, and we're not currently executing on >- * that CPU, then we never will be. >- */ >- return atomic_read(&panic_cpu) != raw_smp_processor_id(); >-} >- > /** > * console_lock - block the console subsystem from printing > * >@@ -2670,36 +2829,6 @@ int is_console_locked(void) > } > EXPORT_SYMBOL(is_console_locked); > >-/* >- * Check if the given console is currently capable and allowed to print >- * records. >- * >- * Requires the console_srcu_read_lock. >- */ >-static inline bool console_is_usable(struct console *con) >-{ >- short flags = console_srcu_read_flags(con); >- >- if (!(flags & CON_ENABLED)) >- return false; >- >- if ((flags & CON_SUSPENDED)) >- return false; >- >- if (!con->write) >- return false; >- >- /* >- * Console drivers may assume that per-cpu resources have been >- * allocated. 
So unless they're explicitly marked as being able to >- * cope (CON_ANYTIME) don't call them until this CPU is officially up. >- */ >- if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) >- return false; >- >- return true; >-} >- > static void __console_unlock(void) > { > console_locked = 0; >@@ -2776,8 +2905,6 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) > bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > bool is_extended, bool may_suppress) > { >- static int panic_console_dropped; >- > struct printk_buffers *pbufs = pmsg->pbufs; > const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); > const size_t outbuf_sz = sizeof(pbufs->outbuf); >@@ -2805,17 +2932,6 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > pmsg->seq = r.info->seq; > pmsg->dropped = r.info->seq - seq; > >- /* >- * Check for dropped messages in panic here so that printk >- * suppression can occur as early as possible if necessary. >- */ >- if (pmsg->dropped && >- panic_in_progress() && >- panic_console_dropped++ > 10) { >- suppress_panic_printk = 1; >- pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); >- } >- > /* Skip record that has level above the console loglevel. */ > if (may_suppress && suppress_message_printing(r.info->level)) > goto out; >@@ -2832,6 +2948,33 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > return true; > } > >+/* >+ * Legacy console printing from printk() caller context does not respect >+ * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a >+ * false positive. For PREEMPT_RT the false positive condition does not >+ * occur. >+ * >+ * This map is used to establish LD_WAIT_SLEEP context for the console write >+ * callbacks when legacy printing to avoid false positive lockdep complaints, >+ * thus allowing lockdep to continue to function for real issues. >+ */ >+#ifdef CONFIG_PREEMPT_RT >+static inline void printk_legacy_lock_map_acquire_try(void) { } >+static inline void printk_legacy_lock_map_release(void) { } >+#else >+DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); >+ >+static inline void printk_legacy_lock_map_acquire_try(void) >+{ >+ lock_map_acquire_try(&printk_legacy_map); >+} >+ >+static inline void printk_legacy_lock_map_release(void) >+{ >+ lock_map_release(&printk_legacy_map); >+} >+#endif /* CONFIG_PREEMPT_RT */ >+ > /* > * Used as the printk buffers for non-panic, serialized console printing. > * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. >@@ -2881,31 +3024,45 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co > con->dropped = 0; > } > >- /* >- * While actively printing out messages, if another printk() >- * were to occur on another CPU, it may wait for this one to >- * finish. This task can not be preempted if there is a >- * waiter waiting to take over. >- * >- * Interrupts are disabled because the hand over to a waiter >- * must not be interrupted until the hand over is completed >- * (@console_waiter is cleared). >- */ >- printk_safe_enter_irqsave(flags); >- console_lock_spinning_enable(); >- >- /* Do not trace print latency. */ >- stop_critical_timings(); >- > /* Write everything out to the hardware. */ >- con->write(con, outbuf, pmsg.outbuf_len); > >- start_critical_timings(); >+ if (force_printkthreads()) { >+ /* >+ * With forced threading this function is either in a thread >+ * or panic context. 
So there is no need for concern about >+ * printk reentrance, handovers, or lockdep complaints. >+ */ > >- con->seq = pmsg.seq + 1; >+ con->write(con, outbuf, pmsg.outbuf_len); >+ con->seq = pmsg.seq + 1; >+ } else { >+ /* >+ * While actively printing out messages, if another printk() >+ * were to occur on another CPU, it may wait for this one to >+ * finish. This task can not be preempted if there is a >+ * waiter waiting to take over. >+ * >+ * Interrupts are disabled because the hand over to a waiter >+ * must not be interrupted until the hand over is completed >+ * (@console_waiter is cleared). >+ */ >+ printk_safe_enter_irqsave(flags); >+ console_lock_spinning_enable(); > >- *handover = console_lock_spinning_disable_and_check(cookie); >- printk_safe_exit_irqrestore(flags); >+ /* Do not trace print latency. */ >+ stop_critical_timings(); >+ >+ printk_legacy_lock_map_acquire_try(); >+ con->write(con, outbuf, pmsg.outbuf_len); >+ printk_legacy_lock_map_release(); >+ >+ start_critical_timings(); >+ >+ con->seq = pmsg.seq + 1; >+ >+ *handover = console_lock_spinning_disable_and_check(cookie); >+ printk_safe_exit_irqrestore(flags); >+ } > skip: > return true; > } >@@ -2958,13 +3115,30 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > > cookie = console_srcu_read_lock(); > for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ u64 printk_seq; > bool progress; > >- if (!console_is_usable(con)) >+ /* >+ * console_flush_all() is only for legacy consoles, >+ * unless the nbcon console has no kthread printer. >+ */ >+ if ((flags & CON_NBCON) && con->kthread) >+ continue; >+ >+ if (!console_is_usable(con, flags, !do_cond_resched)) > continue; > any_usable = true; > >- progress = console_emit_next_record(con, handover, cookie); >+ if (flags & CON_NBCON) { >+ progress = nbcon_legacy_emit_next_record(con, handover, cookie, >+ !do_cond_resched); >+ printk_seq = nbcon_seq_read(con); >+ } else { >+ progress = console_emit_next_record(con, handover, cookie); >+ >+ printk_seq = con->seq; >+ } > > /* > * If a handover has occurred, the SRCU read lock >@@ -2974,8 +3148,8 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > return false; > > /* Track the next of the highest seq flushed. */ >- if (con->seq > *next_seq) >- *next_seq = con->seq; >+ if (printk_seq > *next_seq) >+ *next_seq = printk_seq; > > if (!progress) > continue; >@@ -2998,19 +3172,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > return false; > } > >-/** >- * console_unlock - unblock the console subsystem from printing >- * >- * Releases the console_lock which the caller holds to block printing of >- * the console subsystem. >- * >- * While the console_lock was held, console output may have been buffered >- * by printk(). If this is the case, console_unlock(); emits >- * the output prior to releasing the lock. >- * >- * console_unlock(); may be called from any context. >- */ >-void console_unlock(void) >+static void console_flush_and_unlock(void) > { > bool do_cond_resched; > bool handover; >@@ -3054,6 +3216,32 @@ void console_unlock(void) > */ > } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); > } >+ >+/** >+ * console_unlock - unblock the console subsystem from printing >+ * >+ * Releases the console_lock which the caller holds to block printing of >+ * the console subsystem. >+ * >+ * While the console_lock was held, console output may have been buffered >+ * by printk(). 
If this is the case, console_unlock(); emits >+ * the output prior to releasing the lock. >+ * >+ * console_unlock(); may be called from any context. >+ */ >+void console_unlock(void) >+{ >+ /* >+ * Forced threading relies on kthread and atomic consoles for >+ * printing. It never attempts to print from console_unlock(). >+ */ >+ if (force_printkthreads()) { >+ __console_unlock(); >+ return; >+ } >+ >+ console_flush_and_unlock(); >+} > EXPORT_SYMBOL(console_unlock); > > /** >@@ -3187,7 +3375,10 @@ void console_flush_on_panic(enum con_flush_mode mode) > console_srcu_read_unlock(cookie); > } > >- console_flush_all(false, &next_seq, &handover); >+ nbcon_atomic_flush_all(); >+ >+ if (printing_via_unlock) >+ console_flush_all(false, &next_seq, &handover); > } > > /* >@@ -3244,13 +3435,122 @@ EXPORT_SYMBOL(console_stop); > > void console_start(struct console *console) > { >+ short flags; >+ > console_list_lock(); > console_srcu_write_flags(console, console->flags | CON_ENABLED); >+ flags = console->flags; > console_list_unlock(); >+ >+ /* >+ * Ensure that all SRCU list walks have completed. The related >+ * printing context must be able to see it is enabled so that >+ * it is guaranteed to wake up and resume printing. >+ */ >+ synchronize_srcu(&console_srcu); >+ >+ if (flags & CON_NBCON) >+ nbcon_kthread_wake(console); >+ else >+ wake_up_legacy_kthread(); >+ > __pr_flush(console, 1000, true); > } > EXPORT_SYMBOL(console_start); > >+#ifdef CONFIG_PRINTK >+static bool printer_should_wake(void) >+{ >+ bool available = false; >+ struct console *con; >+ int cookie; >+ >+ if (kthread_should_stop()) >+ return true; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ u64 printk_seq; >+ >+ /* >+ * The legacy printer thread is only for legacy consoles, >+ * unless the nbcon console has no kthread printer. >+ */ >+ if ((flags & CON_NBCON) && con->kthread) >+ continue; >+ >+ if (!console_is_usable(con, flags, true)) >+ continue; >+ >+ if (flags & CON_NBCON) { >+ printk_seq = nbcon_seq_read(con); >+ } else { >+ /* >+ * It is safe to read @seq because only this >+ * thread context updates @seq. >+ */ >+ printk_seq = con->seq; >+ } >+ >+ if (prb_read_valid(prb, printk_seq, NULL)) { >+ available = true; >+ break; >+ } >+ } >+ console_srcu_read_unlock(cookie); >+ >+ return available; >+} >+ >+static int nbcon_legacy_kthread_func(void *unused) >+{ >+ int error; >+ >+ for (;;) { >+ error = wait_event_interruptible(legacy_wait, printer_should_wake()); >+ >+ if (kthread_should_stop()) >+ break; >+ >+ if (error) >+ continue; >+ >+ console_lock(); >+ console_flush_and_unlock(); >+ } >+ >+ return 0; >+} >+ >+void nbcon_legacy_kthread_create(void) >+{ >+ struct task_struct *kt; >+ >+ lockdep_assert_held(&console_mutex); >+ >+ if (!force_printkthreads()) >+ return; >+ >+ if (!printk_threads_enabled || nbcon_legacy_kthread) >+ return; >+ >+ kt = kthread_run(nbcon_legacy_kthread_func, NULL, "pr/legacy"); >+ if (IS_ERR(kt)) { >+ pr_err("unable to start legacy printing thread\n"); >+ return; >+ } >+ >+ nbcon_legacy_kthread = kt; >+ >+ /* >+ * It is important that console printing threads are scheduled >+ * shortly after a printk call and with generous runtime budgets. 
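>+ *
>+ * A nice value of -20 (set below) is the highest non-realtime
>+ * priority. A realtime alternative, as a sketch only (not what this
>+ * patch chooses), would be:
>+ *
>+ *   sched_set_fifo_low(nbcon_legacy_kthread);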
>+ */ >+ sched_set_normal(nbcon_legacy_kthread, -20); >+} >+#endif /* CONFIG_PRINTK */ >+ > static int __read_mostly keep_bootcon; > > static int __init keep_bootcon_setup(char *str) >@@ -3382,11 +3682,20 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) > > newcon->seq = prb_next_seq(prb); > for_each_console(con) { >- if ((con->flags & CON_BOOT) && >- (con->flags & CON_ENABLED) && >- con->seq < newcon->seq) { >- newcon->seq = con->seq; >+ u64 seq; >+ >+ if (!((con->flags & CON_BOOT) && >+ (con->flags & CON_ENABLED))) { >+ continue; > } >+ >+ if (con->flags & CON_NBCON) >+ seq = nbcon_seq_read(con); >+ else >+ seq = con->seq; >+ >+ if (seq < newcon->seq) >+ newcon->seq = seq; > } > } > >@@ -3503,8 +3812,16 @@ void register_console(struct console *newcon) > newcon->dropped = 0; > console_init_seq(newcon, bootcon_registered); > >- if (newcon->flags & CON_NBCON) >+ if (newcon->flags & CON_NBCON) { >+ have_nbcon_console = true; > nbcon_init(newcon); >+ } else { >+ have_legacy_console = true; >+ nbcon_legacy_kthread_create(); >+ } >+ >+ if (newcon->flags & CON_BOOT) >+ have_boot_console = true; > > /* > * Put this console in the list - keep the >@@ -3558,6 +3875,11 @@ EXPORT_SYMBOL(register_console); > /* Must be called under console_list_lock(). */ > static int unregister_console_locked(struct console *console) > { >+ bool is_boot_con = (console->flags & CON_BOOT); >+ bool found_legacy_con = false; >+ bool found_nbcon_con = false; >+ bool found_boot_con = false; >+ struct console *c; > int res; > > lockdep_assert_console_list_lock_held(); >@@ -3605,6 +3927,42 @@ static int unregister_console_locked(struct console *console) > if (console->exit) > res = console->exit(console); > >+ /* >+ * With this console gone, the global flags tracking registered >+ * console types may have changed. Update them. >+ */ >+ for_each_console(c) { >+ if (c->flags & CON_BOOT) >+ found_boot_con = true; >+ >+ if (c->flags & CON_NBCON) >+ found_nbcon_con = true; >+ else >+ found_legacy_con = true; >+ } >+ if (!found_boot_con) >+ have_boot_console = false; >+ if (!found_legacy_con) >+ have_legacy_console = false; >+ if (!found_nbcon_con) >+ have_nbcon_console = false; >+ >+ /* >+ * When the last boot console unregisters, start up the >+ * printing threads. >+ */ >+ if (is_boot_con && !have_boot_console) { >+ for_each_console(c) >+ nbcon_kthread_create(c); >+ } >+ >+#ifdef CONFIG_PRINTK >+ if (!printing_via_unlock && nbcon_legacy_kthread) { >+ kthread_stop(nbcon_legacy_kthread); >+ nbcon_legacy_kthread = NULL; >+ } >+#endif >+ > return res; > } > >@@ -3755,31 +4113,42 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > u64 last_diff = 0; > u64 printk_seq; > short flags; >+ bool locked; > int cookie; > u64 diff; > u64 seq; > > might_sleep(); > >- seq = prb_next_seq(prb); >+ seq = prb_next_reserve_seq(prb); > >- /* Flush the consoles so that records up to @seq are printed. */ >- console_lock(); >- console_unlock(); >+ /* >+ * Flush the consoles so that records up to @seq are printed. >+ * Otherwise this function will just wait for the threaded printers >+ * to print up to @seq. >+ */ >+ if (printing_via_unlock && !force_printkthreads()) { >+ console_lock(); >+ console_unlock(); >+ } > > for (;;) { > unsigned long begin_jiffies; > unsigned long slept_jiffies; > >+ locked = false; > diff = 0; > >- /* >- * Hold the console_lock to guarantee safe access to >- * console->seq. 
Releasing console_lock flushes more >- * records in case @seq is still not printed on all >- * usable consoles. >- */ >- console_lock(); >+ if (printing_via_unlock) { >+ /* >+ * Hold the console_lock to guarantee safe access to >+ * console->seq. Releasing console_lock flushes more >+ * records in case @seq is still not printed on all >+ * usable consoles. >+ */ >+ console_lock(); >+ locked = true; >+ } > > cookie = console_srcu_read_lock(); > for_each_console_srcu(c) { >@@ -3793,12 +4162,15 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > * that they make forward progress, so only increment > * @diff for usable consoles. > */ >- if (!console_is_usable(c)) >+ if (!console_is_usable(c, flags, true) && >+ !console_is_usable(c, flags, false)) { > continue; >+ } > > if (flags & CON_NBCON) { > printk_seq = nbcon_seq_read(c); > } else { >+ WARN_ON_ONCE(!locked); > printk_seq = c->seq; > } > >@@ -3810,7 +4182,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > if (diff != last_diff && reset_on_progress) > remaining_jiffies = timeout_jiffies; > >- console_unlock(); >+ if (locked) >+ console_unlock(); > > /* Note: @diff is 0 if there are no usable consoles. */ > if (diff == 0 || remaining_jiffies == 0) >@@ -3862,9 +4235,16 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) > int pending = this_cpu_xchg(printk_pending, 0); > > if (pending & PRINTK_PENDING_OUTPUT) { >- /* If trylock fails, someone else is doing the printing */ >- if (console_trylock()) >- console_unlock(); >+ if (force_printkthreads()) { >+ wake_up_legacy_kthread(); >+ } else { >+ /* >+ * If trylock fails, some other context >+ * will do the printing. >+ */ >+ if (console_trylock()) >+ console_unlock(); >+ } > } > > if (pending & PRINTK_PENDING_WAKEUP) >@@ -3932,11 +4312,16 @@ void defer_console_output(void) > * New messages may have been added directly to the ringbuffer > * using vprintk_store(), so wake any waiters as well. > */ >- __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); >+ int val = PRINTK_PENDING_WAKEUP; >+ >+ if (printing_via_unlock) >+ val |= PRINTK_PENDING_OUTPUT; >+ __wake_up_klogd(val); > } > > void printk_trigger_flush(void) > { >+ nbcon_wake_threads(); > defer_console_output(); > } > >diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c >index fde338606ce83..88e8f3a619229 100644 >--- a/kernel/printk/printk_ringbuffer.c >+++ b/kernel/printk/printk_ringbuffer.c >@@ -6,6 +6,7 @@ > #include <linux/errno.h> > #include <linux/bug.h> > #include "printk_ringbuffer.h" >+#include "internal.h" > > /** > * DOC: printk_ringbuffer overview >@@ -303,6 +304,9 @@ > * > * desc_push_tail:B / desc_reserve:D > * set descriptor reusable (state), then push descriptor tail (id) >+ * >+ * desc_update_last_finalized:A / desc_last_finalized_seq:A >+ * store finalized record, then set new highest finalized sequence number > */ > > #define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) >@@ -1030,9 +1034,13 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, > unsigned long next_lpos; > > if (size == 0) { >- /* Specify a data-less block. */ >- blk_lpos->begin = NO_LPOS; >- blk_lpos->next = NO_LPOS; >+ /* >+ * Data blocks are not created for empty lines. Instead, the >+ * reader will recognize these special lpos values and handle >+ * it appropriately. 
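>+ *
>+ * Reader-side contract, as a sketch (see get_data() further down):
>+ *
>+ *   data = get_data(data_ring, blk_lpos, &size);
>+ *   // "" with size == 0: valid empty line; NULL: data lost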
>+ */ >+ blk_lpos->begin = EMPTY_LINE_LPOS; >+ blk_lpos->next = EMPTY_LINE_LPOS; > return NULL; > } > >@@ -1210,10 +1218,18 @@ static const char *get_data(struct prb_data_ring *data_ring, > > /* Data-less data block description. */ > if (BLK_DATALESS(blk_lpos)) { >- if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { >+ /* >+ * Records that are just empty lines are also valid, even >+ * though they do not have a data block. For such records >+ * explicitly return empty string data to signify success. >+ */ >+ if (blk_lpos->begin == EMPTY_LINE_LPOS && >+ blk_lpos->next == EMPTY_LINE_LPOS) { > *data_size = 0; > return ""; > } >+ >+ /* Data lost, invalid, or otherwise unavailable. */ > return NULL; > } > >@@ -1441,20 +1457,118 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer > return false; > } > >+/* >+ * @last_finalized_seq value guarantees that all records up to and including >+ * this sequence number are finalized and can be read. The only exception are >+ * too old records which have already been overwritten. >+ * >+ * It is also guaranteed that @last_finalized_seq only increases. >+ * >+ * Be aware that finalized records following non-finalized records are not >+ * reported because they are not yet available to the reader. For example, >+ * a new record stored via printk() will not be available to a printer if >+ * it follows a record that has not been finalized yet. However, once that >+ * non-finalized record becomes finalized, @last_finalized_seq will be >+ * appropriately updated and the full set of finalized records will be >+ * available to the printer. And since each printk() caller will either >+ * directly print or trigger deferred printing of all available unprinted >+ * records, all printk() messages will get printed. >+ */ >+static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb) >+{ >+ struct prb_desc_ring *desc_ring = &rb->desc_ring; >+ unsigned long ulseq; >+ >+ /* >+ * Guarantee the sequence number is loaded before loading the >+ * associated record in order to guarantee that the record can be >+ * seen by this CPU. This pairs with desc_update_last_finalized:A. >+ */ >+ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq >+ ); /* LMM(desc_last_finalized_seq:A) */ >+ >+ return __ulseq_to_u64seq(rb, ulseq); >+} >+ >+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, >+ struct printk_record *r, unsigned int *line_count); >+ >+/* >+ * Check if there are records directly following @last_finalized_seq that are >+ * finalized. If so, update @last_finalized_seq to the latest of these >+ * records. It is not allowed to skip over records that are not yet finalized. >+ */ >+static void desc_update_last_finalized(struct printk_ringbuffer *rb) >+{ >+ struct prb_desc_ring *desc_ring = &rb->desc_ring; >+ u64 old_seq = desc_last_finalized_seq(rb); >+ unsigned long oldval; >+ unsigned long newval; >+ u64 finalized_seq; >+ u64 try_seq; >+ >+try_again: >+ finalized_seq = old_seq; >+ try_seq = finalized_seq + 1; >+ >+ /* Try to find later finalized records. */ >+ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) { >+ finalized_seq = try_seq; >+ try_seq++; >+ } >+ >+ /* No update needed if no later finalized record was found. */ >+ if (finalized_seq == old_seq) >+ return; >+ >+ oldval = __u64seq_to_ulseq(old_seq); >+ newval = __u64seq_to_ulseq(finalized_seq); >+ >+ /* >+ * Set the sequence number of a later finalized record that has been >+ * seen. 
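>+ *
>+ * As a sketch, the pairing described below is plain message passing:
>+ *
>+ *   writer (this function)            reader (desc_last_finalized_seq)
>+ *   finalize record @seq              lseq = read_acquire(last_finalized_seq)
>+ *   cmpxchg_release(..., @seq)        records up to @lseq are readable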
>+ * >+ * Guarantee the record data is visible to other CPUs before storing >+ * its sequence number. This pairs with desc_last_finalized_seq:A. >+ * >+ * Memory barrier involvement: >+ * >+ * If desc_last_finalized_seq:A reads from >+ * desc_update_last_finalized:A, then desc_read:A reads from >+ * _prb_commit:B. >+ * >+ * Relies on: >+ * >+ * RELEASE from _prb_commit:B to desc_update_last_finalized:A >+ * matching >+ * ACQUIRE from desc_last_finalized_seq:A to desc_read:A >+ * >+ * Note: _prb_commit:B and desc_update_last_finalized:A can be >+ * different CPUs. However, the desc_update_last_finalized:A >+ * CPU (which performs the release) must have previously seen >+ * _prb_commit:B. >+ */ >+ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq, >+ &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */ >+ old_seq = __ulseq_to_u64seq(rb, oldval); >+ goto try_again; >+ } >+} >+ > /* > * Attempt to finalize a specified descriptor. If this fails, the descriptor > * is either already final or it will finalize itself when the writer commits. > */ >-static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) >+static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id) > { >+ struct prb_desc_ring *desc_ring = &rb->desc_ring; > unsigned long prev_state_val = DESC_SV(id, desc_committed); > struct prb_desc *d = to_desc(desc_ring, id); > >- atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, >- DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ >- >- /* Best effort to remember the last finalized @id. */ >- atomic_long_set(&desc_ring->last_finalized_id, id); >+ if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val, >+ DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */ >+ desc_update_last_finalized(rb); >+ } > } > > /** >@@ -1550,7 +1664,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, > * readers. (For seq==0 there is no previous descriptor.) > */ > if (info->seq > 0) >- desc_make_final(desc_ring, DESC_ID(id - 1)); >+ desc_make_final(rb, DESC_ID(id - 1)); > > r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); > /* If text data allocation fails, a data-less record is committed. */ >@@ -1643,7 +1757,7 @@ void prb_commit(struct prb_reserved_entry *e) > */ > head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ > if (head_id != e->id) >- desc_make_final(desc_ring, e->id); >+ desc_make_final(e->rb, e->id); > } > > /** >@@ -1663,12 +1777,9 @@ void prb_commit(struct prb_reserved_entry *e) > */ > void prb_final_commit(struct prb_reserved_entry *e) > { >- struct prb_desc_ring *desc_ring = &e->rb->desc_ring; >- > _prb_commit(e, desc_finalized); > >- /* Best effort to remember the last finalized @id. */ >- atomic_long_set(&desc_ring->last_finalized_id, e->id); >+ desc_update_last_finalized(e->rb); > } > > /* >@@ -1832,7 +1943,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, > } > > /* Get the sequence number of the tail descriptor. */ >-static u64 prb_first_seq(struct printk_ringbuffer *rb) >+u64 prb_first_seq(struct printk_ringbuffer *rb) > { > struct prb_desc_ring *desc_ring = &rb->desc_ring; > enum desc_state d_state; >@@ -1875,12 +1986,123 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) > return seq; > } > >-/* >- * Non-blocking read of a record. Updates @seq to the last finalized record >- * (which may have no data available). 
>+/**
>+ * prb_next_reserve_seq() - Get the sequence number after the most recently
>+ * reserved record.
> *
>- * See the description of prb_read_valid() and prb_read_valid_info()
>- * for details.
>+ * @rb: The ringbuffer to get the sequence number from.
>+ *
>+ * This is the public function available to readers to see what sequence
>+ * number will be assigned to the next reserved record.
>+ *
>+ * Note that depending on the situation, this value can be equal to or
>+ * higher than the sequence number returned by prb_next_seq().
>+ *
>+ * Context: Any context.
>+ * Return: The sequence number that will be assigned to the next record
>+ * reserved.
>+ */
>+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb)
>+{
>+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
>+ unsigned long last_finalized_id;
>+ atomic_long_t *state_var;
>+ u64 last_finalized_seq;
>+ unsigned long head_id;
>+ struct prb_desc desc;
>+ unsigned long diff;
>+ struct prb_desc *d;
>+ int err;
>+
>+ /*
>+ * It may not be possible to read a sequence number for @head_id.
>+ * So the ID of @last_finalized_seq is used to calculate what the
>+ * sequence number of @head_id will be.
>+ */
>+
>+try_again:
>+ last_finalized_seq = desc_last_finalized_seq(rb);
>+
>+ /*
>+ * @head_id is loaded after @last_finalized_seq to ensure that
>+ * it points to the record with @last_finalized_seq or newer.
>+ *
>+ * Memory barrier involvement:
>+ *
>+ * If desc_last_finalized_seq:A reads from
>+ * desc_update_last_finalized:A, then
>+ * prb_next_reserve_seq:A reads from desc_reserve:D.
>+ *
>+ * Relies on:
>+ *
>+ * RELEASE from desc_reserve:D to desc_update_last_finalized:A
>+ * matching
>+ * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A
>+ *
>+ * Note: desc_reserve:D and desc_update_last_finalized:A can be
>+ * different CPUs. However, the desc_update_last_finalized:A CPU
>+ * (which performs the release) must have previously seen
>+ * desc_read:C, which implies desc_reserve:D can be seen.
>+ */
>+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */
>+
>+ d = to_desc(desc_ring, last_finalized_seq);
>+ state_var = &d->state_var;
>+
>+ /* Extract the ID, used to specify the descriptor to read. */
>+ last_finalized_id = DESC_ID(atomic_long_read(state_var));
>+
>+ /* Ensure @last_finalized_id is correct. */
>+ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc);
>+
>+ if (err == -EINVAL) {
>+ if (last_finalized_seq == 0) {
>+ /*
>+ * No record has been finalized or even reserved yet.
>+ *
>+ * The @head_id is initialized such that the first
>+ * increment will yield the first record (seq=0).
>+ * Handle it separately to avoid a negative @diff
>+ * below.
>+ */
>+ if (head_id == DESC0_ID(desc_ring->count_bits))
>+ return 0;
>+
>+ /*
>+ * One or more descriptors are already reserved. Use
>+ * the descriptor ID of the first one (@seq=0) for
>+ * the @diff below.
>+ */
>+ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1;
>+ } else {
>+ /* Record must have been overwritten. Try again. */
>+ goto try_again;
>+ }
>+ }
>+
>+ /* Diff of known descriptor IDs to compute related sequence numbers. */
>+ diff = head_id - last_finalized_id;
>+
>+ /*
>+ * @head_id points to the most recently reserved record, but this
>+ * function returns the sequence number that will be assigned to the
>+ * next (not yet reserved) record. Thus +1 is needed.
>+ */
>+ return (last_finalized_seq + diff + 1);
>+}
>+
>+/*
>+ * Non-blocking read of a record.
>+ * >+ * On success @seq is updated to the record that was read and (if provided) >+ * @r and @line_count will contain the read/calculated data. >+ * >+ * On failure @seq is updated to a record that is not yet available to the >+ * reader, but it will be the next record available to the reader. >+ * >+ * Note: When the current CPU is in panic, this function will skip over any >+ * non-existent/non-finalized records in order to allow the panic CPU >+ * to print any and all records that have been finalized. > */ > static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, > struct printk_record *r, unsigned int *line_count) >@@ -1899,12 +2121,32 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, > *seq = tail_seq; > > } else if (err == -ENOENT) { >- /* Record exists, but no data available. Skip. */ >+ /* Record exists, but the data was lost. Skip. */ > (*seq)++; > > } else { >- /* Non-existent/non-finalized record. Must stop. */ >- return false; >+ /* >+ * Non-existent/non-finalized record. Must stop. >+ * >+ * For panic situations it cannot be expected that >+ * non-finalized records will become finalized. But >+ * there may be other finalized records beyond that >+ * need to be printed for a panic situation. If this >+ * is the panic CPU, skip this >+ * non-existent/non-finalized record unless it is >+ * at or beyond the head, in which case it is not >+ * possible to continue. >+ * >+ * Note that new messages printed on panic CPU are >+ * finalized when we are here. The only exception >+ * might be the last message without trailing newline. >+ * But it would have the sequence number returned >+ * by "prb_next_reserve_seq() - 1". >+ */ >+ if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb))) >+ (*seq)++; >+ else >+ return false; > } > } > >@@ -1932,7 +2174,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, > * On success, the reader must check r->info.seq to see which record was > * actually read. This allows the reader to detect dropped records. > * >- * Failure means @seq refers to a not yet written record. >+ * Failure means @seq refers to a record not yet available to the reader. > */ > bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, > struct printk_record *r) >@@ -1962,7 +2204,7 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, > * On success, the reader must check info->seq to see which record meta data > * was actually read. This allows the reader to detect dropped records. > * >- * Failure means @seq refers to a not yet written record. >+ * Failure means @seq refers to a record not yet available to the reader. > */ > bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, > struct printk_info *info, unsigned int *line_count) >@@ -2008,7 +2250,9 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) > * newest sequence number available to readers will be. > * > * This provides readers a sequence number to jump to if all currently >- * available records should be skipped. >+ * available records should be skipped. It is guaranteed that all records >+ * previous to the returned value have been finalized and are (or were) >+ * available to the reader. > * > * Context: Any context. 
> * Return: The sequence number of the next newest (not yet available) record
>@@ -2016,34 +2260,19 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
> */
> u64 prb_next_seq(struct printk_ringbuffer *rb)
> {
>- struct prb_desc_ring *desc_ring = &rb->desc_ring;
>- enum desc_state d_state;
>- unsigned long id;
> u64 seq;
>
>- /* Check if the cached @id still points to a valid @seq. */
>- id = atomic_long_read(&desc_ring->last_finalized_id);
>- d_state = desc_read(desc_ring, id, NULL, &seq, NULL);
>+ seq = desc_last_finalized_seq(rb);
>
>- if (d_state == desc_finalized || d_state == desc_reusable) {
>- /*
>- * Begin searching after the last finalized record.
>- *
>- * On 0, the search must begin at 0 because of hack#2
>- * of the bootstrapping phase it is not known if a
>- * record at index 0 exists.
>- */
>- if (seq != 0)
>- seq++;
>- } else {
>- /*
>- * The information about the last finalized sequence number
>- * has gone. It should happen only when there is a flood of
>- * new messages and the ringbuffer is rapidly recycled.
>- * Give up and start from the beginning.
>- */
>- seq = 0;
>- }
>+ /*
>+ * Begin searching after the last finalized record.
>+ *
>+ * On 0, the search must begin at 0 because, due to hack#2
>+ * of the bootstrapping phase, it is not known if a
>+ * record at index 0 exists.
>+ */
>+ if (seq != 0)
>+ seq++;
>
> /*
> * The information about the last finalized @seq might be inaccurate.
>@@ -2085,7 +2314,7 @@ void prb_init(struct printk_ringbuffer *rb,
> rb->desc_ring.infos = infos;
> atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
> atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
>- atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits));
>+ atomic_long_set(&rb->desc_ring.last_finalized_seq, 0);
>
> rb->text_data_ring.size_bits = textbits;
> rb->text_data_ring.data = text_buf;
>diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
>index 18cd25e489b89..52626d0f1fa37 100644
>--- a/kernel/printk/printk_ringbuffer.h
>+++ b/kernel/printk/printk_ringbuffer.h
>@@ -75,7 +75,7 @@ struct prb_desc_ring {
> struct printk_info *infos;
> atomic_long_t head_id;
> atomic_long_t tail_id;
>- atomic_long_t last_finalized_id;
>+ atomic_long_t last_finalized_seq;
> };
>
> /*
>@@ -127,8 +127,22 @@ enum desc_state {
> #define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id)
> #define DESC_ID_MASK (~DESC_FLAGS_MASK)
> #define DESC_ID(sv) ((sv) & DESC_ID_MASK)
>+
>+/*
>+ * Special data block logical position values (for fields of
>+ * @prb_desc.text_blk_lpos).
>+ *
>+ * - Bit0 is used to identify if the record has no data block. (Implemented in
>+ * the LPOS_DATALESS() macro.)
>+ *
>+ * - Bit1 specifies the reason for not having a data block.
>+ *
>+ * These special values could never be real lpos values because of the
>+ * meta data and alignment padding of data blocks. (See to_blk_size() for
>+ * details.)
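>+ *
>+ * In summary (sketch):
>+ *
>+ *   FAILED_LPOS     = 0x1   bit0 set: no data block (data lost/unavailable)
>+ *   EMPTY_LINE_LPOS = 0x3   bit0 and bit1 set: no data block, empty line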
>+ */ > #define FAILED_LPOS 0x1 >-#define NO_LPOS 0x3 >+#define EMPTY_LINE_LPOS 0x3 > > #define FAILED_BLK_LPOS \ > { \ >@@ -259,7 +273,7 @@ static struct printk_ringbuffer name = { \ > .infos = &_##name##_infos[0], \ > .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ > .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ >- .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \ >+ .last_finalized_seq = ATOMIC_INIT(0), \ > }, \ > .text_data_ring = { \ > .size_bits = (avgtextbits) + (descbits), \ >@@ -378,7 +392,41 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, > bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, > struct printk_info *info, unsigned int *line_count); > >+u64 prb_first_seq(struct printk_ringbuffer *rb); > u64 prb_first_valid_seq(struct printk_ringbuffer *rb); > u64 prb_next_seq(struct printk_ringbuffer *rb); >+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb); >+ >+#ifdef CONFIG_64BIT >+ >+#define __u64seq_to_ulseq(u64seq) (u64seq) >+#define __ulseq_to_u64seq(rb, ulseq) (ulseq) >+ >+#else /* CONFIG_64BIT */ >+ >+#define __u64seq_to_ulseq(u64seq) ((u32)u64seq) >+ >+static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq) >+{ >+ u64 rb_first_seq = prb_first_seq(rb); >+ u64 seq; >+ >+ /* >+ * The provided sequence is only the lower 32 bits of the ringbuffer >+ * sequence. It needs to be expanded to 64bit. Get the first sequence >+ * number from the ringbuffer and fold it. >+ * >+ * Having a 32bit representation in the console is sufficient. >+ * If a console ever gets more than 2^31 records behind >+ * the ringbuffer then this is the least of the problems. >+ * >+ * Also the access to the ring buffer is always safe. >+ */ >+ seq = rb_first_seq - (s32)((u32)rb_first_seq - ulseq); >+ >+ return seq; >+} >+ >+#endif /* CONFIG_64BIT */ > > #endif /* _KERNEL_PRINTK_RINGBUFFER_H */ >diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c >index 6d10927a07d83..8d9408d653de5 100644 >--- a/kernel/printk/printk_safe.c >+++ b/kernel/printk/printk_safe.c >@@ -26,6 +26,18 @@ void __printk_safe_exit(void) > this_cpu_dec(printk_context); > } > >+void __printk_deferred_enter(void) >+{ >+ cant_migrate(); >+ this_cpu_inc(printk_context); >+} >+ >+void __printk_deferred_exit(void) >+{ >+ cant_migrate(); >+ this_cpu_dec(printk_context); >+} >+ > asmlinkage int vprintk(const char *fmt, va_list args) > { > #ifdef CONFIG_KGDB_KDB >diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c >index 7567ca8e743ca..48a9d47ec90eb 100644 >--- a/kernel/rcu/rcutorture.c >+++ b/kernel/rcu/rcutorture.c >@@ -2409,6 +2409,12 @@ static int rcutorture_booster_init(unsigned int cpu) > WARN_ON_ONCE(!t); > sp.sched_priority = 2; > sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); >+#ifdef CONFIG_PREEMPT_RT >+ t = per_cpu(timersd, cpu); >+ WARN_ON_ONCE(!t); >+ sp.sched_priority = 2; >+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); >+#endif > } > > /* Don't allow time recalculation while creating a new task. 
*/ >diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h >index 5d666428546b0..5887c00e67389 100644 >--- a/kernel/rcu/tree_stall.h >+++ b/kernel/rcu/tree_stall.h >@@ -9,6 +9,7 @@ > > #include <linux/kvm_para.h> > #include <linux/rcu_notifier.h> >+#include <linux/console.h> > > ////////////////////////////////////////////////////////////////////////////// > // >@@ -604,6 +605,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) > if (rcu_stall_is_suppressed()) > return; > >+ nbcon_cpu_emergency_enter(); >+ > /* > * OK, time to rat on our buddy... > * See Documentation/RCU/stallwarn.rst for info on how to debug >@@ -658,6 +661,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) > panic_on_rcu_stall(); > > rcu_force_quiescent_state(); /* Kick them all. */ >+ >+ nbcon_cpu_emergency_exit(); > } > > static void print_cpu_stall(unsigned long gps) >diff --git a/kernel/sched/core.c b/kernel/sched/core.c >index 9116bcc903467..5015768f10256 100644 >--- a/kernel/sched/core.c >+++ b/kernel/sched/core.c >@@ -899,14 +899,15 @@ static inline void hrtick_rq_init(struct rq *rq) > > #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) > /* >- * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, >+ * Atomically set TIF_NEED_RESCHED[_LAZY] and test for TIF_POLLING_NRFLAG, > * this avoids any races wrt polling state changes and thereby avoids > * spurious IPIs. > */ >-static inline bool set_nr_and_not_polling(struct task_struct *p) >+static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) > { > struct thread_info *ti = task_thread_info(p); >- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); >+ >+ return !(fetch_or(&ti->flags, 1 << tif_bit) & _TIF_POLLING_NRFLAG); > } > > /* >@@ -923,7 +924,7 @@ static bool set_nr_if_polling(struct task_struct *p) > do { > if (!(val & _TIF_POLLING_NRFLAG)) > return false; >- if (val & _TIF_NEED_RESCHED) >+ if (val & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > return true; > } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); > >@@ -931,9 +932,9 @@ static bool set_nr_if_polling(struct task_struct *p) > } > > #else >-static inline bool set_nr_and_not_polling(struct task_struct *p) >+static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) > { >- set_tsk_need_resched(p); >+ set_tsk_thread_flag(p, tif_bit); > return true; > } > >@@ -1038,28 +1039,47 @@ void wake_up_q(struct wake_q_head *head) > * might also involve a cross-CPU call to trigger the scheduler on > * the target CPU. 
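> *
> * With CONFIG_PREEMPT_BUILD_AUTO, the added LAZY variant (sketch of the
> * semantics below) only marks the task and sends no IPI; the request is
> * acted upon at the next tick or return to user space, roughly:
> *
> *   if (set_nr_and_not_polling(curr, tif_bit) && !lazy)
> *           smp_send_reschedule(cpu);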
> */ >-void resched_curr(struct rq *rq) >+static void __resched_curr(struct rq *rq, int lazy) > { >+ int cpu, tif_bit = TIF_NEED_RESCHED + lazy; > struct task_struct *curr = rq->curr; >- int cpu; > > lockdep_assert_rq_held(rq); > >- if (test_tsk_need_resched(curr)) >+ if (unlikely(test_tsk_thread_flag(curr, tif_bit))) > return; > > cpu = cpu_of(rq); > > if (cpu == smp_processor_id()) { >- set_tsk_need_resched(curr); >- set_preempt_need_resched(); >+ set_tsk_thread_flag(curr, tif_bit); >+ if (!lazy) >+ set_preempt_need_resched(); > return; > } > >- if (set_nr_and_not_polling(curr)) >- smp_send_reschedule(cpu); >- else >+ if (set_nr_and_not_polling(curr, tif_bit)) { >+ if (!lazy) >+ smp_send_reschedule(cpu); >+ } else { > trace_sched_wake_idle_without_ipi(cpu); >+ } >+} >+ >+void resched_curr(struct rq *rq) >+{ >+ __resched_curr(rq, 0); >+} >+ >+void resched_curr_lazy(struct rq *rq) >+{ >+ int lazy = IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && !sched_feat(FORCE_NEED_RESCHED) ? >+ TIF_NEED_RESCHED_LAZY_OFFSET : 0; >+ >+ if (lazy && unlikely(test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED))) >+ return; >+ >+ __resched_curr(rq, lazy); > } > > void resched_cpu(int cpu) >@@ -1154,7 +1174,7 @@ static void wake_up_idle_cpu(int cpu) > * and testing of the above solutions didn't appear to report > * much benefits. > */ >- if (set_nr_and_not_polling(rq->idle)) >+ if (set_nr_and_not_polling(rq->idle, TIF_NEED_RESCHED)) > smp_send_reschedule(cpu); > else > trace_sched_wake_idle_without_ipi(cpu); >@@ -8890,6 +8910,21 @@ static inline void preempt_dynamic_init(void) { } > > #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ > >+/* >+ * task_is_pi_boosted - Check if task has been PI boosted. >+ * @p: Task to check. >+ * >+ * Return true if task is subject to priority inheritance. >+ */ >+bool task_is_pi_boosted(const struct task_struct *p) >+{ >+ int prio = p->prio; >+ >+ if (!rt_prio(prio)) >+ return false; >+ return prio != p->normal_prio; >+} >+ > /** > * yield - yield the current processor to other threads. > * >diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c >index 8d5d98a5834df..b462333db26cb 100644 >--- a/kernel/sched/debug.c >+++ b/kernel/sched/debug.c >@@ -333,6 +333,23 @@ static const struct file_operations sched_debug_fops = { > .release = seq_release, > }; > >+static ssize_t sched_hog_write(struct file *filp, const char __user *ubuf, >+ size_t cnt, loff_t *ppos) >+{ >+ unsigned long end = jiffies + 60 * HZ; >+ >+ for (; time_before(jiffies, end) && !signal_pending(current);) >+ cpu_relax(); >+ >+ return cnt; >+} >+ >+static const struct file_operations sched_hog_fops = { >+ .write = sched_hog_write, >+ .open = simple_open, >+ .llseek = default_llseek, >+}; >+ > static struct dentry *debugfs_sched; > > static __init int sched_init_debug(void) >@@ -374,6 +391,8 @@ static __init int sched_init_debug(void) > > debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); > >+ debugfs_create_file("hog", 0200, debugfs_sched, NULL, &sched_hog_fops); >+ > return 0; > } > late_initcall(sched_init_debug); >diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >index 533547e3c90a7..05d7fa9a499ed 100644 >--- a/kernel/sched/fair.c >+++ b/kernel/sched/fair.c >@@ -975,8 +975,10 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); > * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i > * this is probably good enough. 
> */ >-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) >+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se, bool tick) > { >+ struct rq *rq = rq_of(cfs_rq); >+ > if ((s64)(se->vruntime - se->deadline) < 0) > return; > >@@ -995,10 +997,19 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) > /* > * The task has consumed its request, reschedule. > */ >- if (cfs_rq->nr_running > 1) { >- resched_curr(rq_of(cfs_rq)); >- clear_buddies(cfs_rq, se); >+ if (cfs_rq->nr_running < 2) >+ return; >+ >+ if (!IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) || sched_feat(FORCE_NEED_RESCHED)) { >+ resched_curr(rq); >+ } else { >+ /* Did the task ignore the lazy reschedule request? */ >+ if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY)) >+ resched_curr(rq); >+ else >+ resched_curr_lazy(rq); > } >+ clear_buddies(cfs_rq, se); > } > > #include "pelt.h" >@@ -1153,7 +1164,7 @@ s64 update_curr_common(struct rq *rq) > /* > * Update the current task's runtime statistics. > */ >-static void update_curr(struct cfs_rq *cfs_rq) >+static void __update_curr(struct cfs_rq *cfs_rq, bool tick) > { > struct sched_entity *curr = cfs_rq->curr; > s64 delta_exec; >@@ -1411,7 +1411,7 @@ > #else // !CONFIG_SCHED_BORE > curr->vruntime += calc_delta_fair(delta_exec, curr); > #endif // CONFIG_SCHED_BORE >- update_deadline(cfs_rq, curr); >+ update_deadline(cfs_rq, curr, tick); > update_min_vruntime(cfs_rq); > > if (entity_is_task(curr)) >@@ -1175,6 +1186,11 @@ static void update_curr(struct cfs_rq *cfs_rq) > account_cfs_rq_runtime(cfs_rq, delta_exec); > } > >+static inline void update_curr(struct cfs_rq *cfs_rq) >+{ >+ __update_curr(cfs_rq, false); >+} >+ > static void update_curr_fair(struct rq *rq) > { > update_curr(cfs_rq_of(&rq->curr->se)); >@@ -5493,7 +5509,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) > /* > * Update run-time statistics of the 'current'. > */ >- update_curr(cfs_rq); >+ __update_curr(cfs_rq, true); > > /* > * Ensure that runnable average is periodically updated. >@@ -5507,7 +5523,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) > * validating it and just reschedule. > */ > if (queued) { >- resched_curr(rq_of(cfs_rq)); >+ resched_curr_lazy(rq_of(cfs_rq)); > return; > } > /* >@@ -5653,7 +5669,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) > * hierarchy can be throttled > */ > if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) >- resched_curr(rq_of(cfs_rq)); >+ resched_curr_lazy(rq_of(cfs_rq)); > } > > static __always_inline >@@ -5913,7 +5929,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) > > /* Determine whether we need to wake up potentially idle CPU: */ > if (rq->curr == rq->idle && rq->cfs.nr_running) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > #ifdef CONFIG_SMP >@@ -6628,7 +6644,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) > > if (delta < 0) { > if (task_current(rq, p)) >- resched_curr(rq); >+ resched_curr_lazy(rq); > return; > } > hrtick_start(rq, delta); >@@ -8298,7 +8314,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int > * prevents us from potentially nominating it as a false LAST_BUDDY > * below. > */ >- if (test_tsk_need_resched(curr)) >+ if (need_resched()) > return; > > /* Idle tasks are by definition preempted by non-idle tasks. 
*/ >@@ -8340,7 +8356,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int > return; > > preempt: >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > #ifdef CONFIG_SMP >@@ -12510,7 +12526,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) > */ > if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && > __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > /* >@@ -12675,7 +12691,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) > */ > if (task_current(rq, p)) { > if (p->prio > oldprio) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } else > wakeup_preempt(rq, p, 0); > } >diff --git a/kernel/sched/features.h b/kernel/sched/features.h >index 143f55df890b1..6de570ab30078 100644 >--- a/kernel/sched/features.h >+++ b/kernel/sched/features.h >@@ -87,3 +87,5 @@ SCHED_FEAT(UTIL_EST, true) > SCHED_FEAT(LATENCY_WARN, false) > > SCHED_FEAT(HZ_BW, true) >+ >+SCHED_FEAT(FORCE_NEED_RESCHED, false) >diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c >index 31231925f1ece..58486420f3624 100644 >--- a/kernel/sched/idle.c >+++ b/kernel/sched/idle.c >@@ -57,8 +57,7 @@ static noinline int __cpuidle cpu_idle_poll(void) > ct_cpuidle_enter(); > > raw_local_irq_enable(); >- while (!tif_need_resched() && >- (cpu_idle_force_poll || tick_check_broadcast_expired())) >+ while (!need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) > cpu_relax(); > raw_local_irq_disable(); > >diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c >index 3261b067b67e2..8771140e0de5e 100644 >--- a/kernel/sched/rt.c >+++ b/kernel/sched/rt.c >@@ -2194,8 +2194,11 @@ static int rto_next_cpu(struct root_domain *rd) > > rd->rto_cpu = cpu; > >- if (cpu < nr_cpu_ids) >+ if (cpu < nr_cpu_ids) { >+ if (!has_pushable_tasks(cpu_rq(cpu))) >+ continue; > return cpu; >+ } > > rd->rto_cpu = -1; > >diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h >index 001fe047bd5d8..17424c69537f2 100644 >--- a/kernel/sched/sched.h >+++ b/kernel/sched/sched.h >@@ -2463,6 +2463,7 @@ extern void init_sched_fair_class(void); > extern void reweight_task(struct task_struct *p, int prio); > > extern void resched_curr(struct rq *rq); >+extern void resched_curr_lazy(struct rq *rq); > extern void resched_cpu(int cpu); > > extern struct rt_bandwidth def_rt_bandwidth; >diff --git a/kernel/softirq.c b/kernel/softirq.c >index 210cf5f8d92c2..cae0ae2e2b0bb 100644 >--- a/kernel/softirq.c >+++ b/kernel/softirq.c >@@ -247,6 +247,19 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) > } > EXPORT_SYMBOL(__local_bh_enable_ip); > >+void softirq_preempt(void) >+{ >+ if (WARN_ON_ONCE(!preemptible())) >+ return; >+ >+ if (WARN_ON_ONCE(__this_cpu_read(softirq_ctrl.cnt) != SOFTIRQ_OFFSET)) >+ return; >+ >+ __local_bh_enable(SOFTIRQ_OFFSET, true); >+ /* preemption point */ >+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); >+} >+ > /* > * Invoked from ksoftirqd_run() outside of the interrupt disabled section > * to acquire the per CPU local lock for reentrancy protection. 
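The PREEMPT_RT hunks below move TIMER/HRTIMER softirq handling into a
per-CPU "ktimers/N" thread and key off a local_pending_timers() helper.
A minimal sketch of that helper (its real definition belongs in
include/linux/interrupt.h and is not part of this excerpt):

	#ifdef CONFIG_PREEMPT_RT
	/* Timer softirqs pending on this CPU, handled by the ktimers thread. */
	DECLARE_PER_CPU(unsigned long, pending_timer_softirq);

	static inline unsigned long local_pending_timers(void)
	{
		return __this_cpu_read(pending_timer_softirq);
	}
	#else
	static inline unsigned long local_pending_timers(void)
	{
		/* Without RT, timer softirqs stay in the normal pending mask. */
		return local_softirq_pending();
	}
	#endif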
>@@ -619,6 +632,24 @@ static inline void tick_irq_exit(void)
> #endif
> }
> 
>+#ifdef CONFIG_PREEMPT_RT
>+DEFINE_PER_CPU(struct task_struct *, timersd);
>+DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
>+
>+static void wake_timersd(void)
>+{
>+ struct task_struct *tsk = __this_cpu_read(timersd);
>+
>+ if (tsk)
>+ wake_up_process(tsk);
>+}
>+
>+#else
>+
>+static inline void wake_timersd(void) { }
>+
>+#endif
>+
> static inline void __irq_exit_rcu(void)
> {
> #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
>@@ -631,6 +662,10 @@ static inline void __irq_exit_rcu(void)
> if (!in_interrupt() && local_softirq_pending())
> invoke_softirq();
> 
>+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() &&
>+ !(in_nmi() | in_hardirq()))
>+ wake_timersd();
>+
> tick_irq_exit();
> }
> 
>@@ -963,12 +998,70 @@ static struct smp_hotplug_thread softirq_threads = {
> .thread_comm = "ksoftirqd/%u",
> };
> 
>+#ifdef CONFIG_PREEMPT_RT
>+static void timersd_setup(unsigned int cpu)
>+{
>+ sched_set_fifo_low(current);
>+}
>+
>+static int timersd_should_run(unsigned int cpu)
>+{
>+ return local_pending_timers();
>+}
>+
>+static void run_timersd(unsigned int cpu)
>+{
>+ unsigned int timer_si;
>+
>+ ksoftirqd_run_begin();
>+
>+ timer_si = local_pending_timers();
>+ __this_cpu_write(pending_timer_softirq, 0);
>+ or_softirq_pending(timer_si);
>+
>+ __do_softirq();
>+
>+ ksoftirqd_run_end();
>+}
>+
>+static void raise_ktimers_thread(unsigned int nr)
>+{
>+ trace_softirq_raise(nr);
>+ __this_cpu_or(pending_timer_softirq, 1 << nr);
>+}
>+
>+void raise_hrtimer_softirq(void)
>+{
>+ raise_ktimers_thread(HRTIMER_SOFTIRQ);
>+}
>+
>+void raise_timer_softirq(void)
>+{
>+ unsigned long flags;
>+
>+ local_irq_save(flags);
>+ raise_ktimers_thread(TIMER_SOFTIRQ);
>+ wake_timersd();
>+ local_irq_restore(flags);
>+}
>+
>+static struct smp_hotplug_thread timer_threads = {
>+ .store = &timersd,
>+ .setup = timersd_setup,
>+ .thread_should_run = timersd_should_run,
>+ .thread_fn = run_timersd,
>+ .thread_comm = "ktimers/%u",
>+};
>+#endif
>+
> static __init int spawn_ksoftirqd(void)
> {
> cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
> takeover_tasklets);
> BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
>-
>+#ifdef CONFIG_PREEMPT_RT
>+ BUG_ON(smpboot_register_percpu_thread(&timer_threads));
>+#endif
> return 0;
> }
> early_initcall(spawn_ksoftirqd);
>diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
>index edb0f821dceaa..a7290012179a4 100644
>--- a/kernel/time/hrtimer.c
>+++ b/kernel/time/hrtimer.c
>@@ -1809,7 +1809,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
> if (!ktime_before(now, cpu_base->softirq_expires_next)) {
> cpu_base->softirq_expires_next = KTIME_MAX;
> cpu_base->softirq_activated = 1;
>- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
>+ raise_hrtimer_softirq();
> }
> 
> __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
>@@ -1922,7 +1922,7 @@ void hrtimer_run_queues(void)
> if (!ktime_before(now, cpu_base->softirq_expires_next)) {
> cpu_base->softirq_expires_next = KTIME_MAX;
> cpu_base->softirq_activated = 1;
>- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
>+ raise_hrtimer_softirq();
> }
> 
> __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
>diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
>index 01fb50c1b17e4..910c04d7fa0d3 100644
>--- a/kernel/time/tick-sched.c
>+++ b/kernel/time/tick-sched.c
>@@ -796,7 +796,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
> 
> static inline bool local_timer_softirq_pending(void)
> {
>- return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
>+ return local_pending_timers() & BIT(TIMER_SOFTIRQ);
> }
> 
> static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
>diff --git a/kernel/time/timer.c b/kernel/time/timer.c
>index 352b161113cda..d6bf128262c93 100644
>--- a/kernel/time/timer.c
>+++ b/kernel/time/timer.c
>@@ -1470,9 +1470,16 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
> */
> static void timer_sync_wait_running(struct timer_base *base)
> {
>- if (atomic_read(&base->timer_waiters)) {
>+ bool need_preempt;
>+
>+ need_preempt = task_is_pi_boosted(current);
>+ if (need_preempt || atomic_read(&base->timer_waiters)) {
> raw_spin_unlock_irq(&base->lock);
> spin_unlock(&base->expiry_lock);
>+
>+ if (need_preempt)
>+ softirq_preempt();
>+
> spin_lock(&base->expiry_lock);
> raw_spin_lock_irq(&base->lock);
> }
>@@ -2070,7 +2077,7 @@ static void run_local_timers(void)
> if (time_before(jiffies, base->next_expiry))
> return;
> }
>- raise_softirq(TIMER_SOFTIRQ);
>+ raise_timer_softirq();
> }
> 
> /*
>diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
>index c9c8983073485..947b5f1e799dd 100644
>--- a/kernel/trace/trace.c
>+++ b/kernel/trace/trace.c
>@@ -2717,6 +2717,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
> 
> if (tif_need_resched())
> trace_flags |= TRACE_FLAG_NEED_RESCHED;
>+ if (tif_need_resched_lazy())
>+ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY;
> if (test_preempt_need_resched())
> trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
> return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
>diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
>index d8b302d010830..4f58a196e14c1 100644
>--- a/kernel/trace/trace_output.c
>+++ b/kernel/trace/trace_output.c
>@@ -460,17 +460,29 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
> (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' :
> (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
> bh_off ? 'b' :
>- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
>+ !IS_ENABLED(CONFIG_TRACE_IRQFLAGS_SUPPORT) ? 'X' :
> '.';
> 
>- switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
>+ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY |
> TRACE_FLAG_PREEMPT_RESCHED)) {
>+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED:
>+ need_resched = 'B';
>+ break;
> case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
> need_resched = 'N';
> break;
>+ case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED:
>+ need_resched = 'L';
>+ break;
>+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY:
>+ need_resched = 'b';
>+ break;
> case TRACE_FLAG_NEED_RESCHED:
> need_resched = 'n';
> break;
>+ case TRACE_FLAG_NEED_RESCHED_LAZY:
>+ need_resched = 'l';
>+ break;
> case TRACE_FLAG_PREEMPT_RESCHED:
> need_resched = 'p';
> break;
>diff --git a/lib/dump_stack.c b/lib/dump_stack.c
>index 83471e81501a7..222c6d6c8281a 100644
>--- a/lib/dump_stack.c
>+++ b/lib/dump_stack.c
>@@ -96,15 +96,25 @@ static void __dump_stack(const char *log_lvl)
> */
> asmlinkage __visible void dump_stack_lvl(const char *log_lvl)
> {
>+ bool in_panic = this_cpu_in_panic();
> unsigned long flags;
> 
> /*
> * Permit this cpu to perform nested stack dumps while serialising
>- * against other CPUs
>+ * against other CPUs, unless this CPU is in panic.
>+ *
>+ * When in panic, non-panic CPUs are not permitted to store new
>+ * printk messages so there is no need to synchronize the output.
>+ * This avoids potential deadlock in panic() if another CPU is
>+ * holding and unable to release the printk_cpu_sync.
> */
>- printk_cpu_sync_get_irqsave(flags);
>+ if (!in_panic)
>+ printk_cpu_sync_get_irqsave(flags);
>+
> __dump_stack(log_lvl);
>- printk_cpu_sync_put_irqrestore(flags);
>+
>+ if (!in_panic)
>+ printk_cpu_sync_put_irqrestore(flags);
> }
> EXPORT_SYMBOL(dump_stack_lvl);
> 
>diff --git a/localversion-rt b/localversion-rt
>new file mode 100644
>index 0000000000000..700c857efd9ba
>--- /dev/null
>+++ b/localversion-rt
>@@ -0,0 +1 @@
>+-rt8
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 76e6438f4858e..9e66559611e73 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -78,6 +78,7 @@
> #include <linux/slab.h>
> #include <linux/sched.h>
> #include <linux/sched/mm.h>
>+#include <linux/smpboot.h>
> #include <linux/mutex.h>
> #include <linux/rwsem.h>
> #include <linux/string.h>
>@@ -216,35 +217,60 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
> return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
> }
> 
>-static inline void rps_lock_irqsave(struct softnet_data *sd,
>- unsigned long *flags)
>+#ifndef CONFIG_PREEMPT_RT
>+
>+static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
>+
>+static int __init setup_backlog_napi_threads(char *arg)
> {
>- if (IS_ENABLED(CONFIG_RPS))
>+ static_branch_enable(&use_backlog_threads_key);
>+ return 0;
>+}
>+early_param("thread_backlog_napi", setup_backlog_napi_threads);
>+
>+static bool use_backlog_threads(void)
>+{
>+ return static_branch_unlikely(&use_backlog_threads_key);
>+}
>+
>+#else
>+
>+static bool use_backlog_threads(void)
>+{
>+ return true;
>+}
>+
>+#endif
>+
>+static inline void backlog_lock_irq_save(struct softnet_data *sd,
>+ unsigned long *flags)
>+{
>+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
> spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
> else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> local_irq_save(*flags);
> }
> 
>-static inline void rps_lock_irq_disable(struct softnet_data *sd)
>+static inline void backlog_lock_irq_disable(struct softnet_data *sd)
> {
>- if (IS_ENABLED(CONFIG_RPS))
>+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
> spin_lock_irq(&sd->input_pkt_queue.lock);
> else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> local_irq_disable();
> }
> 
>-static inline void rps_unlock_irq_restore(struct softnet_data *sd,
>- unsigned long *flags)
>+static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
>+ unsigned long *flags)
> {
>- if (IS_ENABLED(CONFIG_RPS))
>+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
> spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
> else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> local_irq_restore(*flags);
> }
> 
>-static inline void rps_unlock_irq_enable(struct softnet_data *sd)
>+static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
> {
>- if (IS_ENABLED(CONFIG_RPS))
>+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
> spin_unlock_irq(&sd->input_pkt_queue.lock);
> else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> local_irq_enable();
>@@ -4420,6 +4446,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
> /*************************************************************************
> * Receiver routines
> *************************************************************************/
>+static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
> 
> int netdev_max_backlog __read_mostly = 1000;
> EXPORT_SYMBOL(netdev_max_backlog);
>@@ -4452,18 +4479,16 @@ static inline void ____napi_schedule(struct softnet_data *sd,
> */
> thread = READ_ONCE(napi->thread);
> if (thread) {
>- /* Avoid doing set_bit() if the thread is in
>- * INTERRUPTIBLE state, cause napi_thread_wait()
>- * makes sure to proceed with napi polling
>- * if the thread is explicitly woken from here.
>- */
>- if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
>- set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
>+ if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
>+ goto use_local_napi;
>+
>+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
> wake_up_process(thread);
> return;
> }
> }
> 
>+use_local_napi:
> list_add_tail(&napi->poll_list, &sd->poll_list);
> WRITE_ONCE(napi->list_owner, smp_processor_id());
> /* If not called from net_rx_action()
>@@ -4709,6 +4734,11 @@ static void napi_schedule_rps(struct softnet_data *sd)
> 
> #ifdef CONFIG_RPS
> if (sd != mysd) {
>+ if (use_backlog_threads()) {
>+ __napi_schedule_irqoff(&sd->backlog);
>+ return;
>+ }
>+
> sd->rps_ipi_next = mysd->rps_ipi_list;
> mysd->rps_ipi_list = sd;
> 
>@@ -4723,6 +4753,23 @@ static void napi_schedule_rps(struct softnet_data *sd)
> __napi_schedule_irqoff(&mysd->backlog);
> }
> 
>+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
>+{
>+ unsigned long flags;
>+
>+ if (use_backlog_threads()) {
>+ backlog_lock_irq_save(sd, &flags);
>+
>+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
>+ __napi_schedule_irqoff(&sd->backlog);
>+
>+ backlog_unlock_irq_restore(sd, &flags);
>+
>+ } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
>+ smp_call_function_single_async(cpu, &sd->defer_csd);
>+ }
>+}
>+
> #ifdef CONFIG_NET_FLOW_LIMIT
> int netdev_flow_limit_table_len __read_mostly = (1 << 12);
> #endif
>@@ -4778,7 +4825,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
> reason = SKB_DROP_REASON_NOT_SPECIFIED;
> sd = &per_cpu(softnet_data, cpu);
> 
>- rps_lock_irqsave(sd, &flags);
>+ backlog_lock_irq_save(sd, &flags);
> if (!netif_running(skb->dev))
> goto drop;
> qlen = skb_queue_len(&sd->input_pkt_queue);
>@@ -4787,7 +4834,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
> enqueue:
> __skb_queue_tail(&sd->input_pkt_queue, skb);
> input_queue_tail_incr_save(sd, qtail);
>- rps_unlock_irq_restore(sd, &flags);
>+ backlog_unlock_irq_restore(sd, &flags);
> return NET_RX_SUCCESS;
> }
> 
>@@ -4802,7 +4849,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
> 
> drop:
> sd->dropped++;
>- rps_unlock_irq_restore(sd, &flags);
>+ backlog_unlock_irq_restore(sd, &flags);
> 
> dev_core_stats_rx_dropped_inc(skb->dev);
> kfree_skb_reason(skb, reason);
>@@ -5833,7 +5880,7 @@ static void flush_backlog(struct work_struct *work)
> local_bh_disable();
> sd = this_cpu_ptr(&softnet_data);
> 
>- rps_lock_irq_disable(sd);
>+ backlog_lock_irq_disable(sd);
> skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
> if (skb->dev->reg_state == NETREG_UNREGISTERING) {
> __skb_unlink(skb, &sd->input_pkt_queue);
>@@ -5841,7 +5888,7 @@ static void flush_backlog(struct work_struct *work)
> input_queue_head_incr(sd);
> }
> }
>- rps_unlock_irq_enable(sd);
>+ backlog_unlock_irq_enable(sd);
> 
> skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
> if (skb->dev->reg_state == NETREG_UNREGISTERING) {
>@@ -5859,14 +5906,14 @@ static bool flush_required(int cpu)
> struct softnet_data *sd = &per_cpu(softnet_data, cpu);
> bool do_flush;
> 
>- rps_lock_irq_disable(sd);
>+ backlog_lock_irq_disable(sd);
> 
> /* as insertion into process_queue happens with the rps lock held,
> * process_queue access may race only with dequeue
> */
> do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
> !skb_queue_empty_lockless(&sd->process_queue);
>- rps_unlock_irq_enable(sd);
>+ backlog_unlock_irq_enable(sd);
> 
> return do_flush;
> #endif
>@@ -5932,7 +5979,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
> #ifdef CONFIG_RPS
> struct softnet_data *remsd = sd->rps_ipi_list;
> 
>- if (remsd) {
>+ if (!use_backlog_threads() && remsd) {
> sd->rps_ipi_list = NULL;
> 
> local_irq_enable();
>@@ -5947,7 +5994,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
> static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
> {
> #ifdef CONFIG_RPS
>- return sd->rps_ipi_list != NULL;
>+ return !use_backlog_threads() && sd->rps_ipi_list;
> #else
> return false;
> #endif
>@@ -5981,7 +6028,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
> 
> }
> 
>- rps_lock_irq_disable(sd);
>+ backlog_lock_irq_disable(sd);
> if (skb_queue_empty(&sd->input_pkt_queue)) {
> /*
> * Inline a custom version of __napi_complete().
>@@ -5991,13 +6038,13 @@ static int process_backlog(struct napi_struct *napi, int quota)
> * We can use a plain write instead of clear_bit(),
> * and we dont need an smp_mb() memory barrier.
> */
>- napi->state = 0;
>+ napi->state &= NAPIF_STATE_THREADED;
> again = false;
> } else {
> skb_queue_splice_tail_init(&sd->input_pkt_queue,
> &sd->process_queue);
> }
>- rps_unlock_irq_enable(sd);
>+ backlog_unlock_irq_enable(sd);
> }
> 
> return work;
>@@ -6654,8 +6701,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
> 
> static int napi_thread_wait(struct napi_struct *napi)
> {
>- bool woken = false;
>-
> set_current_state(TASK_INTERRUPTIBLE);
> 
> while (!kthread_should_stop()) {
>@@ -6664,15 +6709,13 @@ static int napi_thread_wait(struct napi_struct *napi)
> * Testing SCHED bit is not enough because SCHED bit might be
> * set by some other busy poll thread or by napi_disable().
> */
>- if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
>+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
> WARN_ON(!list_empty(&napi->poll_list));
> __set_current_state(TASK_RUNNING);
> return 0;
> }
> 
> schedule();
>- /* woken being true indicates this thread owns this napi. */
>- woken = true;
> set_current_state(TASK_INTERRUPTIBLE);
> }
> __set_current_state(TASK_RUNNING);
>@@ -6701,40 +6744,46 @@ static void skb_defer_free_flush(struct softnet_data *sd)
> }
> }
> 
>+static void napi_threaded_poll_loop(struct napi_struct *napi)
>+{
>+ struct softnet_data *sd;
>+
>+ for (;;) {
>+ bool repoll = false;
>+ void *have;
>+
>+ local_bh_disable();
>+ sd = this_cpu_ptr(&softnet_data);
>+ sd->in_napi_threaded_poll = true;
>+
>+ have = netpoll_poll_lock(napi);
>+ __napi_poll(napi, &repoll);
>+ netpoll_poll_unlock(have);
>+
>+ sd->in_napi_threaded_poll = false;
>+ barrier();
>+
>+ if (sd_has_rps_ipi_waiting(sd)) {
>+ local_irq_disable();
>+ net_rps_action_and_irq_enable(sd);
>+ }
>+ skb_defer_free_flush(sd);
>+ local_bh_enable();
>+
>+ if (!repoll)
>+ break;
>+
>+ cond_resched();
>+ }
>+}
>+
> static int napi_threaded_poll(void *data)
> {
> struct napi_struct *napi = data;
>- struct softnet_data *sd;
>- void *have;
> 
>- while (!napi_thread_wait(napi)) {
>- for (;;) {
>- bool repoll = false;
>+ while (!napi_thread_wait(napi))
>+ napi_threaded_poll_loop(napi);
> 
>- local_bh_disable();
>- sd = this_cpu_ptr(&softnet_data);
>- sd->in_napi_threaded_poll = true;
>-
>- have = netpoll_poll_lock(napi);
>- __napi_poll(napi, &repoll);
>- netpoll_poll_unlock(have);
>-
>- sd->in_napi_threaded_poll = false;
>- barrier();
>-
>- if (sd_has_rps_ipi_waiting(sd)) {
>- local_irq_disable();
>- net_rps_action_and_irq_enable(sd);
>- }
>- skb_defer_free_flush(sd);
>- local_bh_enable();
>-
>- if (!repoll)
>- break;
>-
>- cond_resched();
>- }
>- }
> return 0;
> }
> 
>@@ -11333,7 +11382,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
> 
> list_del_init(&napi->poll_list);
> if (napi->poll == process_backlog)
>- napi->state = 0;
>+ napi->state &= NAPIF_STATE_THREADED;
> else
> ____napi_schedule(sd, napi);
> }
>@@ -11341,12 +11390,14 @@ static int dev_cpu_dead(unsigned int oldcpu)
> raise_softirq_irqoff(NET_TX_SOFTIRQ);
> local_irq_enable();
> 
>+ if (!use_backlog_threads()) {
> #ifdef CONFIG_RPS
>- remsd = oldsd->rps_ipi_list;
>- oldsd->rps_ipi_list = NULL;
>+ remsd = oldsd->rps_ipi_list;
>+ oldsd->rps_ipi_list = NULL;
> #endif
>- /* send out pending IPI's on offline CPU */
>- net_rps_send_ipi(remsd);
>+ /* send out pending IPI's on offline CPU */
>+ net_rps_send_ipi(remsd);
>+ }
> 
> /* Process offline CPU's input_pkt_queue */
> while ((skb = __skb_dequeue(&oldsd->process_queue))) {
>@@ -11665,6 +11716,38 @@ static void __init net_dev_struct_check(void)
> *
> */
> 
>+static int backlog_napi_should_run(unsigned int cpu)
>+{
>+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
>+ struct napi_struct *napi = &sd->backlog;
>+
>+ return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
>+}
>+
>+static void run_backlog_napi(unsigned int cpu)
>+{
>+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
>+
>+ napi_threaded_poll_loop(&sd->backlog);
>+}
>+
>+static void backlog_napi_setup(unsigned int cpu)
>+{
>+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
>+ struct napi_struct *napi = &sd->backlog;
>+
>+ napi->thread = this_cpu_read(backlog_napi);
>+ set_bit(NAPI_STATE_THREADED, &napi->state);
>+}
>+
>+static struct smp_hotplug_thread backlog_threads = {
>+ .store = &backlog_napi,
>+ .thread_should_run = backlog_napi_should_run,
>+ .thread_fn = run_backlog_napi,
>+ .thread_comm = "backlog_napi/%u",
>+ .setup = backlog_napi_setup,
>+};
>+
> /*
> * This is called single threaded during boot, so no need
> * to take the rtnl semaphore.
>@@ -11717,7 +11800,10 @@ static int __init net_dev_init(void)
> init_gro_hash(&sd->backlog);
> sd->backlog.poll = process_backlog;
> sd->backlog.weight = weight_p;
>+ INIT_LIST_HEAD(&sd->backlog.poll_list);
> }
>+ if (use_backlog_threads())
>+ smpboot_register_percpu_thread(&backlog_threads);
> 
> dev_boot_phase = 0;
> 
>diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>index edbbef563d4d9..bd4985ff5717a 100644
>--- a/net/core/skbuff.c
>+++ b/net/core/skbuff.c
>@@ -6921,8 +6921,8 @@ nodefer: __kfree_skb(skb);
> /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
> * if we are unlucky enough (this seems very unlikely).
> */
>- if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
>- smp_call_function_single_async(cpu, &sd->defer_csd);
>+ if (unlikely(kick))
>+ kick_defer_list_purge(sd, cpu);
> }
> 
> static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,