Gentoo's Bugzilla – Attachment 884278, details for Bug 916954
sys-kernel/gentoo-sources-6.{6,7,8}.x: modified RT patch with BORE patch
[patch] Combined BORE + RT patch for gentoo-sources 6.7.2

Description: Combined BORE + RT patch for gentoo-sources 6.7.2
Filename:    patch-6.7-rt6_deim.patch
MIME Type:   text/plain
Creator:     deim
Created:     2024-02-05 17:15:36 UTC
Size:        189.43 KB
Flags:       patch, obsolete
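The BORE half of the patch below scores each task by its burst time: in kernel/sched/fair.c, log2plus1_u64_u32f8() converts the accumulated runtime to a 24.8 fixed-point logarithm, calc_burst_penalty() subtracts a tolerance offset and scales what remains, and the resulting penalty (capped at 39 << 2) drives the slice score that stretches or shrinks the task's deadline. The following is a minimal userspace sketch of that math, not part of the attachment, assuming the default tunables the patch installs (sched_burst_penalty_offset = 22, sched_burst_penalty_scale = 1280); fls64_user() is a hypothetical stand-in for the kernel's fls64().

/*
 * Illustrative userspace sketch of the BORE burst-penalty math added by the
 * patch below. Defaults are hard-coded here; the in-kernel code reads them
 * from the sched_burst_* sysctls.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_BURST_PENALTY (39U << 2)           /* penalty cap; score = penalty >> 2 */

static const uint32_t penalty_offset = 22;     /* sched_burst_penalty_offset default */
static const uint32_t penalty_scale  = 1280;   /* sched_burst_penalty_scale default */

/* fls64() equivalent: 1-based index of the most significant set bit (GCC/Clang builtin). */
static uint32_t fls64_user(uint64_t v)
{
	return v ? 64 - __builtin_clzll(v) : 0;
}

/* log2(v) + 1 in unsigned 24.8 fixed point, as in the patch. */
static uint32_t log2plus1_u64_u32f8(uint64_t v)
{
	uint32_t msb = fls64_user(v);
	int32_t excess_bits = msb - 9;
	uint8_t fractional = (0 <= excess_bits) ? v >> excess_bits : v << -excess_bits;
	return msb << 8 | fractional;
}

static uint32_t calc_burst_penalty(uint64_t burst_time_ns)
{
	uint32_t greed = log2plus1_u64_u32f8(burst_time_ns);
	uint32_t tolerance = penalty_offset << 8;
	int32_t diff = (int32_t)greed - (int32_t)tolerance;
	uint32_t penalty = diff > 0 ? (uint32_t)diff : 0;
	uint32_t scaled = penalty * penalty_scale >> 16;
	return scaled < MAX_BURST_PENALTY ? scaled : MAX_BURST_PENALTY;
}

int main(void)
{
	/* Burst times from 1 us upward; penalty >> 2 is the slice score. */
	for (uint64_t ns = 1000; ns < (1ULL << 35); ns *= 8)
		printf("burst %12llu ns -> penalty %3u (score %2u)\n",
		       (unsigned long long)ns,
		       (unsigned)calc_burst_penalty(ns),
		       (unsigned)(calc_burst_penalty(ns) >> 2));
	return 0;
}

With these defaults a burst shorter than about 2 ms (2^21 ns) scores zero, so short-running interactive tasks keep their full weight; runtime can also be disabled entirely via the sysctl kernel.sched_bore = 0, as the Kconfig help text in the patch notes.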
>From 67192bfc56aaf53324d34091a5d8487e278d9b58 Mon Sep 17 00:00:00 2001 >From: Masahito S <firelzrd@gmail.com> >Date: Thu, 18 Jan 2024 15:17:39 +0900 >Subject: [PATCH] linux6.7.y-bore4.1.3 > >--- > include/linux/sched.h | 11 ++ > init/Kconfig | 19 +++ > kernel/sched/core.c | 140 +++++++++++++++++++++ > kernel/sched/debug.c | 57 ++++++++- > kernel/sched/fair.c | 273 +++++++++++++++++++++++++++++++++++++--- > kernel/sched/features.h | 4 + > kernel/sched/sched.h | 18 +++ > 7 files changed, 501 insertions(+), 21 deletions(-) > >diff --git a/include/linux/sched.h b/include/linux/sched.h >index 292c316972..8a9e843ec5 100644 >--- a/include/linux/sched.h >+++ b/include/linux/sched.h >@@ -562,6 +562,17 @@ struct sched_entity { > u64 sum_exec_runtime; > u64 prev_sum_exec_runtime; > u64 vruntime; >+#ifdef CONFIG_SCHED_BORE >+ u64 burst_time; >+ u8 prev_burst_penalty; >+ u8 curr_burst_penalty; >+ u8 burst_penalty; >+ u8 slice_score; >+ u8 child_burst; >+ u16 child_burst_cnt; >+ u64 child_burst_last_cached; >+ u32 slice_load; >+#endif // CONFIG_SCHED_BORE > s64 vlag; > u64 slice; > >diff --git a/init/Kconfig b/init/Kconfig >index 9ffb103fc9..4492c5de88 100644 >--- a/init/Kconfig >+++ b/init/Kconfig >@@ -1258,6 +1258,25 @@ config CHECKPOINT_RESTORE > > If unsure, say N here. > >+config SCHED_BORE >+ bool "Burst-Oriented Response Enhancer" >+ default y >+ help >+ In Desktop and Mobile computing, one might prefer interactive >+ tasks to keep responsive no matter what they run in the background. >+ >+ Enabling this kernel feature modifies the scheduler to discriminate >+ tasks by their burst time (runtime since it last went sleeping or >+ yielding state) and prioritize those that run less bursty. >+ Such tasks usually include window compositor, widgets backend, >+ terminal emulator, video playback, games and so on. >+ With a little impact to scheduling fairness, it may improve >+ responsiveness especially under heavy background workload. >+ >+ You can turn it off by setting the sysctl kernel.sched_bore = 0. >+ >+ If unsure, say Y here. 
>+ > config SCHED_AUTOGROUP > bool "Automatic process group scheduling" > select CGROUPS >diff --git a/kernel/sched/core.c b/kernel/sched/core.c >index a708d225c2..d11854a6be 100644 >--- a/kernel/sched/core.c >+++ b/kernel/sched/core.c >@@ -4480,6 +4480,135 @@ int wake_up_state(struct task_struct *p, unsigned int state) > return try_to_wake_up(p, state, 0); > } > >+#ifdef CONFIG_SCHED_BORE >+extern bool sched_bore; >+extern u8 sched_burst_fork_atavistic; >+extern uint sched_burst_cache_lifetime; >+ >+void __init sched_init_bore(void) { >+ init_task.se.burst_time = 0; >+ init_task.se.prev_burst_penalty = 0; >+ init_task.se.curr_burst_penalty = 0; >+ init_task.se.burst_penalty = 0; >+ init_task.se.slice_score = 0; >+ init_task.se.child_burst_last_cached = 0; >+ init_task.se.slice_load = 0; >+} >+ >+void inline sched_fork_bore(struct task_struct *p) { >+ p->se.burst_time = 0; >+ p->se.curr_burst_penalty = 0; >+ p->se.slice_score = 0; >+ p->se.child_burst_last_cached = 0; >+ p->se.slice_load = 0; >+} >+ >+static u32 count_child_tasks(struct task_struct *p) { >+ struct task_struct *child; >+ u32 cnt = 0; >+ list_for_each_entry(child, &p->children, sibling) {cnt++;} >+ return cnt; >+} >+ >+static inline bool child_burst_cache_expired(struct task_struct *p, u64 now) { >+ return (p->se.child_burst_last_cached + sched_burst_cache_lifetime < now); >+} >+ >+static void __update_child_burst_cache( >+ struct task_struct *p, u32 cnt, u32 sum, u64 now) { >+ u8 avg = 0; >+ if (cnt) avg = sum / cnt; >+ p->se.child_burst = max(avg, p->se.burst_penalty); >+ p->se.child_burst_cnt = cnt; >+ p->se.child_burst_last_cached = now; >+} >+ >+static inline void update_child_burst_direct(struct task_struct *p, u64 now) { >+ struct task_struct *child; >+ u32 cnt = 0; >+ u32 sum = 0; >+ >+ list_for_each_entry(child, &p->children, sibling) { >+ if (child->sched_class != &fair_sched_class) continue; >+ cnt++; >+ sum += child->se.burst_penalty; >+ } >+ >+ __update_child_burst_cache(p, cnt, sum, now); >+} >+ >+static inline u8 __inherit_burst_direct(struct task_struct *p, u64 now) { >+ struct task_struct *parent = p->real_parent; >+ if (child_burst_cache_expired(parent, now)) >+ update_child_burst_direct(parent, now); >+ >+ return parent->se.child_burst; >+} >+ >+static inline void update_child_burst_topological( >+ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { >+ struct task_struct *child, *dec; >+ u32 cnt = 0, dcnt = 0; >+ u32 sum = 0; >+ >+ list_for_each_entry(child, &p->children, sibling) { >+ dec = child; >+ while ((dcnt = count_child_tasks(dec)) == 1) >+ dec = list_first_entry(&dec->children, struct task_struct, sibling); >+ >+ if (!dcnt || !depth) { >+ if (dec->sched_class != &fair_sched_class) continue; >+ cnt++; >+ sum += dec->se.burst_penalty; >+ continue; >+ } >+ if (!child_burst_cache_expired(dec, now)) { >+ cnt += dec->se.child_burst_cnt; >+ sum += (u32)dec->se.child_burst * dec->se.child_burst_cnt; >+ continue; >+ } >+ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); >+ } >+ >+ __update_child_burst_cache(p, cnt, sum, now); >+ *acnt += cnt; >+ *asum += sum; >+} >+ >+static inline u8 __inherit_burst_topological(struct task_struct *p, u64 now) { >+ struct task_struct *anc = p->real_parent; >+ u32 cnt = 0, sum = 0; >+ >+ while (anc->real_parent != anc && count_child_tasks(anc) == 1) >+ anc = anc->real_parent; >+ >+ if (child_burst_cache_expired(anc, now)) >+ update_child_burst_topological( >+ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); >+ >+ return 
anc->se.child_burst; >+} >+ >+static inline void inherit_burst(struct task_struct *p) { >+ u8 burst_cache; >+ u64 now = ktime_get_ns(); >+ >+ read_lock(&tasklist_lock); >+ burst_cache = likely(sched_burst_fork_atavistic)? >+ __inherit_burst_topological(p, now): >+ __inherit_burst_direct(p, now); >+ read_unlock(&tasklist_lock); >+ >+ p->se.prev_burst_penalty = max(p->se.prev_burst_penalty, burst_cache); >+} >+ >+static inline void sched_post_fork_bore(struct task_struct *p) { >+ if (p->sched_class == &fair_sched_class && likely(sched_bore)) >+ inherit_burst(p); >+ p->se.burst_penalty = p->se.prev_burst_penalty; >+} >+#endif // CONFIG_SCHED_BORE >+ > /* > * Perform scheduler related setup for a newly forked process p. > * p is forked by current. >@@ -4496,6 +4625,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) > p->se.prev_sum_exec_runtime = 0; > p->se.nr_migrations = 0; > p->se.vruntime = 0; >+#ifdef CONFIG_SCHED_BORE >+ sched_fork_bore(p); >+#endif // CONFIG_SCHED_BORE > p->se.vlag = 0; > p->se.slice = sysctl_sched_base_slice; > INIT_LIST_HEAD(&p->se.group_node); >@@ -4815,6 +4947,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) > > void sched_post_fork(struct task_struct *p) > { >+#ifdef CONFIG_SCHED_BORE >+ sched_post_fork_bore(p); >+#endif // CONFIG_SCHED_BORE > uclamp_post_fork(p); > } > >@@ -9885,6 +10020,11 @@ void __init sched_init(void) > BUG_ON(&dl_sched_class != &stop_sched_class + 1); > #endif > >+#ifdef CONFIG_SCHED_BORE >+ sched_init_bore(); >+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 4.1.3 by Masahito Suzuki"); >+#endif // CONFIG_SCHED_BORE >+ > wait_bit_init(); > > #ifdef CONFIG_FAIR_GROUP_SCHED >diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c >index 4580a45070..f079b57758 100644 >--- a/kernel/sched/debug.c >+++ b/kernel/sched/debug.c >@@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { > }; > > #ifdef CONFIG_SMP >+#ifdef CONFIG_SCHED_BORE >+static ssize_t sched_min_base_slice_write(struct file *filp, const char __user *ubuf, >+ size_t cnt, loff_t *ppos) >+{ >+ char buf[16]; >+ unsigned int value; >+ >+ if (cnt > 15) >+ cnt = 15; >+ >+ if (copy_from_user(&buf, ubuf, cnt)) >+ return -EFAULT; >+ buf[cnt] = '\0'; >+ >+ if (kstrtouint(buf, 10, &value)) >+ return -EINVAL; > >+ if (!value) >+ return -EINVAL; >+ >+ sysctl_sched_min_base_slice = value; >+ sched_update_min_base_slice(); >+ >+ *ppos += cnt; >+ return cnt; >+} >+ >+static int sched_min_base_slice_show(struct seq_file *m, void *v) >+{ >+ seq_printf(m, "%d\n", sysctl_sched_min_base_slice); >+ return 0; >+} >+ >+static int sched_min_base_slice_open(struct inode *inode, struct file *filp) >+{ >+ return single_open(filp, sched_min_base_slice_show, NULL); >+} >+ >+static const struct file_operations sched_min_base_slice_fops = { >+ .open = sched_min_base_slice_open, >+ .write = sched_min_base_slice_write, >+ .read = seq_read, >+ .llseek = seq_lseek, >+ .release = single_release, >+}; >+#else // CONFIG_SCHED_BORE > static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, > size_t cnt, loff_t *ppos) > { >@@ -213,7 +258,7 @@ static const struct file_operations sched_scaling_fops = { > .llseek = seq_lseek, > .release = single_release, > }; >- >+#endif // CONFIG_SCHED_BORE > #endif /* SMP */ > > #ifdef CONFIG_PREEMPT_DYNAMIC >@@ -355,15 +355,22 @@ > #endif > > #ifndef CONFIG_SCHED_ALT >- debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, 
&sysctl_sched_base_slice); >- >- debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); >- debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); >- >-#ifdef CONFIG_SMP >- debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); >- debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); >- debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); >+#ifdef CONFIG_SCHED_BORE >+ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); >+ debugfs_create_u32("base_slice_ns", 0400, debugfs_sched, &sysctl_sched_base_slice); >+#else // CONFIG_SCHED_BORE >+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); >+#endif // CONFIG_SCHED_BORE >+ >+ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); >+ debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); >+ >+ #ifdef CONFIG_SMP >+#if !defined(CONFIG_SCHED_BORE) >+ debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); >+#endif // CONFIG_SCHED_BORE >+ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); >+ debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); > > mutex_lock(&sched_domains_mutex); > update_sched_domain_debugfs(); >@@ -595,6 +647,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) > SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), > SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); > >+#ifdef CONFIG_SCHED_BORE >+ SEQ_printf(m, " %2d", p->se.slice_score); >+#endif // CONFIG_SCHED_BORE > #ifdef CONFIG_NUMA_BALANCING > SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); > #endif >diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >index d7a3c63a21..2c50268245 100644 >--- a/kernel/sched/fair.c >+++ b/kernel/sched/fair.c >@@ -19,6 +19,9 @@ > * > * Adaptive scheduling granularity, math enhancements by Peter Zijlstra > * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra >+ * >+ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler >+ * Copyright (C) 2021-2024 Masahito Suzuki <firelzrd@gmail.com> > */ > #include <linux/energy_model.h> > #include <linux/mmap_lock.h> >@@ -64,20 +67,123 @@ > * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) > * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus > * >- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) >+ * (BORE default SCHED_TUNABLESCALING_NONE = *1 constant) >+ * (EEVDF default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) > */ >+#ifdef CONFIG_SCHED_BORE >+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; >+#else // CONFIG_SCHED_BORE > unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; >+#endif // CONFIG_SCHED_BORE > > /* > * Minimal preemption granularity for CPU-bound tasks: > * >- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) >+ * (BORE default: max(1 sec / HZ, min_base_slice) constant, units: nanoseconds) >+ * (EEVDF default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) > */ >+#ifdef CONFIG_SCHED_BORE >+unsigned int sysctl_sched_base_slice = 1000000000ULL / HZ; >+static unsigned int configured_sched_base_slice = 1000000000ULL / HZ; >+unsigned int sysctl_sched_min_base_slice = 2000000ULL; >+#else // CONFIG_SCHED_BORE > unsigned int 
sysctl_sched_base_slice = 750000ULL; > static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; >+#endif // CONFIG_SCHED_BORE > > const_debug unsigned int sysctl_sched_migration_cost = 500000UL; > >+#ifdef CONFIG_SCHED_BORE >+bool __read_mostly sched_bore = 1; >+bool __read_mostly sched_burst_score_rounding = 0; >+bool __read_mostly sched_burst_smoothness_long = 1; >+bool __read_mostly sched_burst_smoothness_short = 0; >+u8 __read_mostly sched_burst_fork_atavistic = 2; >+u8 __read_mostly sched_burst_penalty_offset = 22; >+uint __read_mostly sched_burst_penalty_scale = 1280; >+uint __read_mostly sched_burst_cache_lifetime = 60000000; >+static u8 sixty_four = 64; >+static uint maxval_12_bits = 4095; >+ >+#define MAX_BURST_PENALTY (39U <<2) >+ >+static inline u32 log2plus1_u64_u32f8(u64 v) { >+ u32 msb = fls64(v); >+ s32 excess_bits = msb - 9; >+ u8 fractional = (0 <= excess_bits)? v >> excess_bits: v << -excess_bits; >+ return msb << 8 | fractional; >+} >+ >+static inline u32 calc_burst_penalty(u64 burst_time) { >+ u32 greed, tolerance, penalty, scaled_penalty; >+ >+ greed = log2plus1_u64_u32f8(burst_time); >+ tolerance = sched_burst_penalty_offset << 8; >+ penalty = max(0, (s32)greed - (s32)tolerance); >+ scaled_penalty = penalty * sched_burst_penalty_scale >> 16; >+ >+ return min(MAX_BURST_PENALTY, scaled_penalty); >+} >+ >+static inline void update_burst_penalty(struct sched_entity *se) { >+ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); >+ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); >+} >+ >+static inline u64 scale_slice(u64 delta, struct sched_entity *se) { >+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->slice_score], 22); >+} >+ >+static inline u64 __unscale_slice(u64 delta, u8 score) { >+ return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10); >+} >+ >+static inline u64 unscale_slice(u64 delta, struct sched_entity *se) { >+ return __unscale_slice(delta, se->slice_score); >+} >+ >+static void avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se); >+static void avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se); >+ >+static void update_slice_score(struct sched_entity *se) { >+ struct cfs_rq *cfs_rq = cfs_rq_of(se); >+ u8 prev_score = se->slice_score; >+ u32 penalty = se->burst_penalty; >+ if (sched_burst_score_rounding) penalty += 0x2U; >+ se->slice_score = penalty >> 2; >+ >+ if (se->slice_score != prev_score && se->slice_load) { >+ avg_vruntime_sub(cfs_rq, se); >+ avg_vruntime_add(cfs_rq, se); >+ } >+} >+ >+static inline u32 binary_smooth(u32 new, u32 old) { >+ int increment = new - old; >+ return (0 <= increment)? 
>+ old + ( increment >> (int)sched_burst_smoothness_long): >+ old - (-increment >> (int)sched_burst_smoothness_short); >+} >+ >+static void restart_burst(struct sched_entity *se) { >+ se->burst_penalty = se->prev_burst_penalty = >+ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); >+ se->curr_burst_penalty = 0; >+ se->burst_time = 0; >+ update_slice_score(se); >+} >+ >+static inline void restart_burst_rescale_deadline(struct sched_entity *se) { >+ u64 wremain, vremain = se->deadline - se->vruntime; >+ u8 prev_score = se->slice_score; >+ restart_burst(se); >+ if (prev_score > se->slice_score) { >+ wremain = __unscale_slice(vremain, prev_score); >+ se->deadline = se->vruntime + scale_slice(wremain, se); >+ } >+} >+#endif // CONFIG_SCHED_BORE >+ > int sched_thermal_decay_shift; > static int __init setup_sched_thermal_decay_shift(char *str) > { >@@ -137,6 +243,70 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; > > #ifdef CONFIG_SYSCTL > static struct ctl_table sched_fair_sysctls[] = { >+#ifdef CONFIG_SCHED_BORE >+ { >+ .procname = "sched_bore", >+ .data = &sched_bore, >+ .maxlen = sizeof(bool), >+ .mode = 0644, >+ .proc_handler = &proc_dobool, >+ }, >+ { >+ .procname = "sched_burst_cache_lifetime", >+ .data = &sched_burst_cache_lifetime, >+ .maxlen = sizeof(uint), >+ .mode = 0644, >+ .proc_handler = proc_douintvec, >+ }, >+ { >+ .procname = "sched_burst_fork_atavistic", >+ .data = &sched_burst_fork_atavistic, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = &proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = SYSCTL_THREE, >+ }, >+ { >+ .procname = "sched_burst_penalty_offset", >+ .data = &sched_burst_penalty_offset, >+ .maxlen = sizeof(u8), >+ .mode = 0644, >+ .proc_handler = &proc_dou8vec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = &sixty_four, >+ }, >+ { >+ .procname = "sched_burst_penalty_scale", >+ .data = &sched_burst_penalty_scale, >+ .maxlen = sizeof(uint), >+ .mode = 0644, >+ .proc_handler = &proc_douintvec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ .extra2 = &maxval_12_bits, >+ }, >+ { >+ .procname = "sched_burst_score_rounding", >+ .data = &sched_burst_score_rounding, >+ .maxlen = sizeof(bool), >+ .mode = 0644, >+ .proc_handler = &proc_dobool, >+ }, >+ { >+ .procname = "sched_burst_smoothness_long", >+ .data = &sched_burst_smoothness_long, >+ .maxlen = sizeof(bool), >+ .mode = 0644, >+ .proc_handler = &proc_dobool, >+ }, >+ { >+ .procname = "sched_burst_smoothness_short", >+ .data = &sched_burst_smoothness_short, >+ .maxlen = sizeof(bool), >+ .mode = 0644, >+ .proc_handler = &proc_dobool, >+ }, >+#endif // CONFIG_SCHED_BORE > #ifdef CONFIG_CFS_BANDWIDTH > { > .procname = "sched_cfs_bandwidth_slice_us", >@@ -195,6 +365,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) > * > * This idea comes from the SD scheduler of Con Kolivas: > */ >+#ifdef CONFIG_SCHED_BORE >+static void update_sysctl(void) { >+ sysctl_sched_base_slice = >+ max(sysctl_sched_min_base_slice, configured_sched_base_slice); >+} >+void sched_update_min_base_slice(void) { update_sysctl(); } >+#else // CONFIG_SCHED_BORE > static unsigned int get_update_sysctl_factor(void) > { > unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); >@@ -225,6 +402,7 @@ static void update_sysctl(void) > SET_SYSCTL(sched_base_slice); > #undef SET_SYSCTL > } >+#endif // CONFIG_SCHED_BORE > > void __init sched_init_granularity(void) > { >@@ -298,6 +476,9 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) > if 
(unlikely(se->load.weight != NICE_0_LOAD)) > delta = __calc_delta(delta, NICE_0_LOAD, &se->load); > >+#ifdef CONFIG_SCHED_BORE >+ if (likely(sched_bore)) delta = scale_slice(delta, se); >+#endif // CONFIG_SCHED_BORE > return delta; > } > >@@ -620,10 +801,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) > * > * As measured, the max (key * weight) value was ~44 bits for a kernel build. > */ >+#if !defined(CONFIG_SCHED_BORE) >+#define entity_weight(se) scale_load_down(se->load.weight) >+#else // CONFIG_SCHED_BORE >+static unsigned long entity_weight(struct sched_entity *se) { >+ unsigned long weight = se->load.weight >> SCHED_AVG_LOAD_SHIFT; >+ if (likely(weight)) { >+ weight >>= SCHED_AVG_LOAD_SHIFT; >+ if (likely(sched_bore)) weight = unscale_slice(weight, se); >+ weight = max(2UL, weight); >+ } >+ return weight; >+} >+#endif // CONFIG_SCHED_BORE >+ > static void > avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) > { >- unsigned long weight = scale_load_down(se->load.weight); >+ unsigned long weight = entity_weight(se); >+#ifdef CONFIG_SCHED_BORE >+ se->slice_load = weight; >+#endif // CONFIG_SCHED_BORE > s64 key = entity_key(cfs_rq, se); > > cfs_rq->avg_vruntime += key * weight; >@@ -633,7 +831,13 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) > static void > avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) > { >- unsigned long weight = scale_load_down(se->load.weight); >+ unsigned long weight; >+#if !defined(CONFIG_SCHED_BORE) >+ weight = scale_load_down(se->load.weight); >+#else // CONFIG_SCHED_BORE >+ weight = se->slice_load; >+ se->slice_load = 0; >+#endif // CONFIG_SCHED_BORE > s64 key = entity_key(cfs_rq, se); > > cfs_rq->avg_vruntime -= key * weight; >@@ -653,14 +857,14 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) > * Specifically: avg_runtime() + 0 must result in entity_eligible() := true > * For this to be so, the result of this function must have a left bias. 
> */ >-u64 avg_vruntime(struct cfs_rq *cfs_rq) >+static u64 avg_key(struct cfs_rq *cfs_rq) > { > struct sched_entity *curr = cfs_rq->curr; > s64 avg = cfs_rq->avg_vruntime; > long load = cfs_rq->avg_load; > > if (curr && curr->on_rq) { >- unsigned long weight = scale_load_down(curr->load.weight); >+ unsigned long weight = entity_weight(curr); > > avg += entity_key(cfs_rq, curr) * weight; > load += weight; >@@ -673,7 +877,11 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) > avg = div_s64(avg, load); > } > >- return cfs_rq->min_vruntime + avg; >+ return avg; >+} >+ >+inline u64 avg_vruntime(struct cfs_rq *cfs_rq) { >+ return cfs_rq->min_vruntime + avg_key(cfs_rq); > } > > /* >@@ -694,13 +902,8 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) > */ > static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) > { >- s64 lag, limit; >- > SCHED_WARN_ON(!se->on_rq); >- lag = avg_vruntime(cfs_rq) - se->vruntime; >- >- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); >- se->vlag = clamp(lag, -limit, limit); >+ se->vlag = avg_vruntime(cfs_rq) - se->vruntime; > } > > /* >@@ -727,7 +930,7 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) > long load = cfs_rq->avg_load; > > if (curr && curr->on_rq) { >- unsigned long weight = scale_load_down(curr->load.weight); >+ unsigned long weight = entity_weight(curr); > > avg += entity_key(cfs_rq, curr) * weight; > load += weight; >@@ -981,6 +1184,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) > * Scheduling class statistics methods: > */ > #ifdef CONFIG_SMP >+#if !defined(CONFIG_SCHED_BORE) > int sched_update_scaling(void) > { > unsigned int factor = get_update_sysctl_factor(); >@@ -992,6 +1196,7 @@ int sched_update_scaling(void) > > return 0; > } >+#endif // CONFIG_SCHED_BORE > #endif > #endif > >@@ -1016,6 +1221,9 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) > /* > * EEVDF: vd_i = ve_i + r_i / w_i > */ >+#ifdef CONFIG_SCHED_BORE >+ update_slice_score(se); >+#endif // CONFIG_SCHED_BORE > se->deadline = se->vruntime + calc_delta_fair(se->slice, se); > > /* >@@ -1158,7 +1366,11 @@ static void update_curr(struct cfs_rq *cfs_rq) > curr->sum_exec_runtime += delta_exec; > schedstat_add(cfs_rq->exec_clock, delta_exec); > >- curr->vruntime += calc_delta_fair(delta_exec, curr); >+#ifdef CONFIG_SCHED_BORE >+ curr->burst_time += delta_exec; >+ update_burst_penalty(curr); >+#endif // CONFIG_SCHED_BORE >+ curr->vruntime += max(1ULL, calc_delta_fair(delta_exec, curr)); > update_deadline(cfs_rq, curr); > update_min_vruntime(cfs_rq); > >@@ -5131,7 +5343,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) > struct sched_entity *curr = cfs_rq->curr; > unsigned long load; > >- lag = se->vlag; >+ u64 limit = calc_delta_fair(max_t(u64, se->slice*2, TICK_NSEC), se); >+ s64 overmet = limit, undermet = limit; >+#ifdef CONFIG_SCHED_BORE >+ if (likely(sched_bore)) overmet = div_s64(overmet, 2); >+#endif // CONFIG_SCHED_BORE >+ lag = clamp(se->vlag, -overmet, undermet); > > /* > * If we want to place a task and preserve lag, we have to >@@ -5187,9 +5404,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) > */ > load = cfs_rq->avg_load; > if (curr && curr->on_rq) >- load += scale_load_down(curr->load.weight); >+ load += entity_weight(curr); > >- lag *= load + scale_load_down(se->load.weight); >+ lag *= load + entity_weight(se); > if (WARN_ON_ONCE(!load)) > load = 1; > lag = div_s64(lag, load); >@@ -6759,6 +6976,12 @@ static void 
dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) > bool was_sched_idle = sched_idle_rq(rq); > > util_est_dequeue(&rq->cfs, p); >+#ifdef CONFIG_SCHED_BORE >+ if (task_sleep) { >+ update_curr(cfs_rq_of(se)); >+ restart_burst(se); >+ } >+#endif // CONFIG_SCHED_BORE > > for_each_sched_entity(se) { > cfs_rq = cfs_rq_of(se); >@@ -8494,16 +8717,21 @@ static void yield_task_fair(struct rq *rq) > /* > * Are we the only task in the tree? > */ >+#ifdef CONFIG_SCHED_BORE >+ if (unlikely(!sched_bore)) >+#endif // CONFIG_SCHED_BORE > if (unlikely(rq->nr_running == 1)) > return; > >- clear_buddies(cfs_rq, se); >- > update_rq_clock(rq); > /* > * Update run-time statistics of the 'current'. > */ > update_curr(cfs_rq); >+#ifdef CONFIG_SCHED_BORE >+ restart_burst_rescale_deadline(se); >+ if (unlikely(rq->nr_running == 1)) return; >+#endif // CONFIG_SCHED_BORE > /* > * Tell update_rq_clock() that we've just updated, > * so we don't do microscopic update in schedule() >@@ -8511,6 +8739,8 @@ static void yield_task_fair(struct rq *rq) > */ > rq_clock_skip_update(rq); > >+ clear_buddies(cfs_rq, se); >+ > se->deadline += calc_delta_fair(se->slice, se); > } > >@@ -12590,6 +12820,9 @@ static void task_fork_fair(struct task_struct *p) > curr = cfs_rq->curr; > if (curr) > update_curr(cfs_rq); >+#ifdef CONFIG_SCHED_BORE >+ update_slice_score(se); >+#endif // CONFIG_SCHED_BORE > place_entity(cfs_rq, se, ENQUEUE_INITIAL); > rq_unlock(rq, &rf); > } >diff --git a/kernel/sched/features.h b/kernel/sched/features.h >index a3ddf84de4..841a428579 100644 >--- a/kernel/sched/features.h >+++ b/kernel/sched/features.h >@@ -6,7 +6,11 @@ > */ > SCHED_FEAT(PLACE_LAG, true) > SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) >+#ifdef CONFIG_SCHED_BORE >+SCHED_FEAT(RUN_TO_PARITY, false) >+#else // CONFIG_SCHED_BORE > SCHED_FEAT(RUN_TO_PARITY, true) >+#endif // CONFIG_SCHED_BORE > > /* > * Prefer to schedule the task we woke last (assuming it failed >diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h >index 2e5a95486a..14951de9ee 100644 >--- a/kernel/sched/sched.h >+++ b/kernel/sched/sched.h >@@ -144,6 +144,17 @@ extern int sched_rr_timeslice; > # define scale_load_down(w) (w) > #endif > >+#ifdef CONFIG_SCHED_BORE >+# ifdef CONFIG_64BIT >+# define SCHED_AVG_LOAD_EXTRA_RESOLUTION 5 >+# define SCHED_AVG_LOAD_SHIFT \ >+ (SCHED_FIXEDPOINT_SHIFT - SCHED_AVG_LOAD_EXTRA_RESOLUTION) >+# else // CONFIG_64BIT >+# define SCHED_AVG_LOAD_EXTRA_RESOLUTION 0 >+# define SCHED_AVG_LOAD_SHIFT 0 >+# endif // CONFIG_64BIT >+#endif // CONFIG_SCHED_BORE >+ > /* > * Task weight (visible to users) and its load (invisible to users) have > * independent resolution, but they should be well calibrated. 
We use >@@ -1929,7 +1940,11 @@ static inline void dirty_sched_domain_sysctl(int cpu) > } > #endif > >+#ifdef CONFIG_SCHED_BORE >+extern void sched_update_min_base_slice(void); >+#else // CONFIG_SCHED_BORE > extern int sched_update_scaling(void); >+#endif // CONFIG_SCHED_BORE > > static inline const struct cpumask *task_user_cpus(struct task_struct *p) > { >@@ -2509,6 +2524,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; > extern const_debug unsigned int sysctl_sched_migration_cost; > > extern unsigned int sysctl_sched_base_slice; >+#ifdef CONFIG_SCHED_BORE >+extern unsigned int sysctl_sched_min_base_slice; >+#endif // CONFIG_SCHED_BORE > > #ifdef CONFIG_SCHED_DEBUG > extern int sysctl_resched_latency_warn_ms; >-- >2.34.1 > >diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig >index f8567e95f98be..bf6b5e6a2edc6 100644 >--- a/arch/arm/Kconfig >+++ b/arch/arm/Kconfig >@@ -35,6 +35,7 @@ config ARM > select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE >+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF > select ARCH_USE_MEMTEST >@@ -74,7 +75,7 @@ config ARM > select HAS_IOPORT > select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT > select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 >- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU >+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT > select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL > select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU > select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL >@@ -119,6 +120,7 @@ config ARM > select HAVE_PERF_EVENTS > select HAVE_PERF_REGS > select HAVE_PERF_USER_STACK_DUMP >+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM > select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RSEQ >diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c >index fef62e4a9edde..622a30243f4a6 100644 >--- a/arch/arm/mm/fault.c >+++ b/arch/arm/mm/fault.c >@@ -404,6 +404,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, > if (addr < TASK_SIZE) > return do_page_fault(addr, fsr, regs); > >+ if (interrupts_enabled(regs)) >+ local_irq_enable(); >+ > if (user_mode(regs)) > goto bad_area; > >@@ -474,6 +477,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, > static int > do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) > { >+ if (interrupts_enabled(regs)) >+ local_irq_enable(); >+ > do_bad_area(addr, fsr, regs); > return 0; > } >diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c >index 7e8773a2d99d0..9fde36fcb80c2 100644 >--- a/arch/arm/vfp/vfpmodule.c >+++ b/arch/arm/vfp/vfpmodule.c >@@ -55,6 +55,34 @@ extern unsigned int VFP_arch_feroceon __alias(VFP_arch); > */ > union vfp_state *vfp_current_hw_state[NR_CPUS]; > >+/* >+ * Claim ownership of the VFP unit. >+ * >+ * The caller may change VFP registers until vfp_unlock() is called. >+ * >+ * local_bh_disable() is used to disable preemption and to disable VFP >+ * processing in softirq context. On PREEMPT_RT kernels local_bh_disable() is >+ * not sufficient because it only serializes soft interrupt related sections >+ * via a local lock, but stays preemptible. Disabling preemption is the right >+ * choice here as bottom half processing is always in thread context on RT >+ * kernels so it implicitly prevents bottom half processing as well. 
>+ */ >+static void vfp_lock(void) >+{ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_bh_disable(); >+ else >+ preempt_disable(); >+} >+ >+static void vfp_unlock(void) >+{ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_bh_enable(); >+ else >+ preempt_enable(); >+} >+ > /* > * Is 'thread's most up to date state stored in this CPUs hardware? > * Must be called from non-preemptible context. >@@ -240,7 +268,7 @@ static void vfp_panic(char *reason, u32 inst) > /* > * Process bitmask of exception conditions. > */ >-static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_regs *regs) >+static int vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr) > { > int si_code = 0; > >@@ -248,8 +276,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ > > if (exceptions == VFP_EXCEPTION_ERROR) { > vfp_panic("unhandled bounce", inst); >- vfp_raise_sigfpe(FPE_FLTINV, regs); >- return; >+ return FPE_FLTINV; > } > > /* >@@ -277,8 +304,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ > RAISE(FPSCR_OFC, FPSCR_OFE, FPE_FLTOVF); > RAISE(FPSCR_IOC, FPSCR_IOE, FPE_FLTINV); > >- if (si_code) >- vfp_raise_sigfpe(si_code, regs); >+ return si_code; > } > > /* >@@ -324,6 +350,8 @@ static u32 vfp_emulate_instruction(u32 inst, u32 fpscr, struct pt_regs *regs) > static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > { > u32 fpscr, orig_fpscr, fpsid, exceptions; >+ int si_code2 = 0; >+ int si_code = 0; > > pr_debug("VFP: bounce: trigger %08x fpexc %08x\n", trigger, fpexc); > >@@ -369,8 +397,8 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > * unallocated VFP instruction but with FPSCR.IXE set and not > * on VFP subarch 1. > */ >- vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr, regs); >- return; >+ si_code = vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr); >+ goto exit; > } > > /* >@@ -394,14 +422,14 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > */ > exceptions = vfp_emulate_instruction(trigger, fpscr, regs); > if (exceptions) >- vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); >+ si_code2 = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); > > /* > * If there isn't a second FP instruction, exit now. Note that > * the FPEXC.FP2V bit is valid only if FPEXC.EX is 1. 
> */ > if ((fpexc & (FPEXC_EX | FPEXC_FP2V)) != (FPEXC_EX | FPEXC_FP2V)) >- return; >+ goto exit; > > /* > * The barrier() here prevents fpinst2 being read >@@ -413,7 +441,13 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) > emulate: > exceptions = vfp_emulate_instruction(trigger, orig_fpscr, regs); > if (exceptions) >- vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); >+ si_code = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); >+exit: >+ vfp_unlock(); >+ if (si_code2) >+ vfp_raise_sigfpe(si_code2, regs); >+ if (si_code) >+ vfp_raise_sigfpe(si_code, regs); > } > > static void vfp_enable(void *unused) >@@ -512,11 +546,9 @@ static inline void vfp_pm_init(void) { } > */ > void vfp_sync_hwstate(struct thread_info *thread) > { >- unsigned int cpu = get_cpu(); >+ vfp_lock(); > >- local_bh_disable(); >- >- if (vfp_state_in_hw(cpu, thread)) { >+ if (vfp_state_in_hw(raw_smp_processor_id(), thread)) { > u32 fpexc = fmrx(FPEXC); > > /* >@@ -527,8 +559,7 @@ void vfp_sync_hwstate(struct thread_info *thread) > fmxr(FPEXC, fpexc); > } > >- local_bh_enable(); >- put_cpu(); >+ vfp_unlock(); > } > > /* Ensure that the thread reloads the hardware VFP state on the next use. */ >@@ -683,7 +714,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > if (!user_mode(regs)) > return vfp_kmode_exception(regs, trigger); > >- local_bh_disable(); >+ vfp_lock(); > fpexc = fmrx(FPEXC); > > /* >@@ -748,6 +779,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > * replay the instruction that trapped. > */ > fmxr(FPEXC, fpexc); >+ vfp_unlock(); > } else { > /* Check for synchronous or asynchronous exceptions */ > if (!(fpexc & (FPEXC_EX | FPEXC_DEX))) { >@@ -762,17 +794,17 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) > if (!(fpscr & FPSCR_IXE)) { > if (!(fpscr & FPSCR_LENGTH_MASK)) { > pr_debug("not VFP\n"); >- local_bh_enable(); >+ vfp_unlock(); > return -ENOEXEC; > } > fpexc |= FPEXC_DEX; > } > } > bounce: regs->ARM_pc += 4; >+ /* VFP_bounce() will invoke vfp_unlock() */ > VFP_bounce(trigger, fpexc, regs); > } > >- local_bh_enable(); > return 0; > } > >@@ -819,7 +851,7 @@ void kernel_neon_begin(void) > unsigned int cpu; > u32 fpexc; > >- local_bh_disable(); >+ vfp_lock(); > > /* > * Kernel mode NEON is only allowed outside of hardirq context with >@@ -850,7 +882,7 @@ void kernel_neon_end(void) > { > /* Disable the NEON/VFP unit. 
*/ > fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN); >- local_bh_enable(); >+ vfp_unlock(); > } > EXPORT_SYMBOL(kernel_neon_end); > >diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig >index 7b071a00425d2..c5210e9b11a20 100644 >--- a/arch/arm64/Kconfig >+++ b/arch/arm64/Kconfig >@@ -97,6 +97,7 @@ config ARM64 > select ARCH_SUPPORTS_NUMA_BALANCING > select ARCH_SUPPORTS_PAGE_TABLE_CHECK > select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT > select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH > select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT > select ARCH_WANT_DEFAULT_BPF_JIT >diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig >index 1f11a62809f20..c0c5e02fa2abe 100644 >--- a/arch/powerpc/Kconfig >+++ b/arch/powerpc/Kconfig >@@ -166,6 +166,7 @@ config PPC > select ARCH_STACKWALK > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x >+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF if PPC64 > select ARCH_USE_MEMTEST >@@ -269,6 +270,7 @@ config PPC > select HAVE_PERF_USER_STACK_DUMP > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RELIABLE_STACKTRACE >+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM > select HAVE_RSEQ > select HAVE_SETUP_PER_CPU_AREA if PPC64 > select HAVE_SOFTIRQ_ON_OWN_STACK >diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h >index 283c346478565..4727f40052ddd 100644 >--- a/arch/powerpc/include/asm/stackprotector.h >+++ b/arch/powerpc/include/asm/stackprotector.h >@@ -19,8 +19,13 @@ > */ > static __always_inline void boot_init_stack_canary(void) > { >- unsigned long canary = get_random_canary(); >+ unsigned long canary; > >+#ifndef CONFIG_PREEMPT_RT >+ canary = get_random_canary(); >+#else >+ canary = ((unsigned long)&canary) & CANARY_MASK; >+#endif > current->stack_canary = canary; > #ifdef CONFIG_PPC64 > get_paca()->canary = canary; >diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c >index 5ea2014aff90d..9cffe23e93572 100644 >--- a/arch/powerpc/kernel/traps.c >+++ b/arch/powerpc/kernel/traps.c >@@ -261,12 +261,17 @@ static char *get_mmu_str(void) > > static int __die(const char *str, struct pt_regs *regs, long err) > { >+ const char *pr = ""; >+ > printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); > >+ if (IS_ENABLED(CONFIG_PREEMPTION)) >+ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; >+ > printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", > IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", > PAGE_SIZE / 1024, get_mmu_str(), >- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", >+ pr, > IS_ENABLED(CONFIG_SMP) ? " SMP" : "", > IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", > debug_pagealloc_enabled() ? 
" DEBUG_PAGEALLOC" : "", >diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig >index 902611954200d..2f188137f830f 100644 >--- a/arch/powerpc/kvm/Kconfig >+++ b/arch/powerpc/kvm/Kconfig >@@ -224,6 +224,7 @@ config KVM_E500MC > config KVM_MPIC > bool "KVM in-kernel MPIC emulation" > depends on KVM && PPC_E500 >+ depends on !PREEMPT_RT > select HAVE_KVM_IRQCHIP > select HAVE_KVM_IRQFD > select HAVE_KVM_IRQ_ROUTING >diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig >index afc0f6a613372..dc3f63c2687d4 100644 >--- a/arch/powerpc/platforms/pseries/Kconfig >+++ b/arch/powerpc/platforms/pseries/Kconfig >@@ -2,6 +2,7 @@ > config PPC_PSERIES > depends on PPC64 && PPC_BOOK3S > bool "IBM pSeries & new (POWER5-based) iSeries" >+ select GENERIC_ALLOCATOR > select HAVE_PCSPKR_PLATFORM > select MPIC > select OF_DYNAMIC >diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c >index 496e16c588aaa..05cee07aafbbb 100644 >--- a/arch/powerpc/platforms/pseries/iommu.c >+++ b/arch/powerpc/platforms/pseries/iommu.c >@@ -25,6 +25,7 @@ > #include <linux/of_address.h> > #include <linux/iommu.h> > #include <linux/rculist.h> >+#include <linux/local_lock.h> > #include <asm/io.h> > #include <asm/prom.h> > #include <asm/rtas.h> >@@ -206,7 +207,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, > return ret; > } > >-static DEFINE_PER_CPU(__be64 *, tce_page); >+struct tce_page { >+ __be64 * page; >+ local_lock_t lock; >+}; >+static DEFINE_PER_CPU(struct tce_page, tce_page) = { >+ .lock = INIT_LOCAL_LOCK(lock), >+}; > > static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > long npages, unsigned long uaddr, >@@ -229,9 +236,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > direction, attrs); > } > >- local_irq_save(flags); /* to protect tcep and the page behind it */ >+ /* to protect tcep and the page behind it */ >+ local_lock_irqsave(&tce_page.lock, flags); > >- tcep = __this_cpu_read(tce_page); >+ tcep = __this_cpu_read(tce_page.page); > > /* This is safe to do since interrupts are off when we're called > * from iommu_alloc{,_sg}() >@@ -240,12 +248,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > tcep = (__be64 *)__get_free_page(GFP_ATOMIC); > /* If allocation fails, fall back to the loop implementation */ > if (!tcep) { >- local_irq_restore(flags); >+ local_unlock_irqrestore(&tce_page.lock, flags); > return tce_build_pSeriesLP(tbl->it_index, tcenum, > tceshift, > npages, uaddr, direction, attrs); > } >- __this_cpu_write(tce_page, tcep); >+ __this_cpu_write(tce_page.page, tcep); > } > > rpn = __pa(uaddr) >> tceshift; >@@ -275,7 +283,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, > tcenum += limit; > } while (npages > 0 && !rc); > >- local_irq_restore(flags); >+ local_unlock_irqrestore(&tce_page.lock, flags); > > if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { > ret = (int)rc; >@@ -459,16 +467,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, > DMA_BIDIRECTIONAL, 0); > } > >- local_irq_disable(); /* to protect tcep and the page behind it */ >- tcep = __this_cpu_read(tce_page); >+ /* to protect tcep and the page behind it */ >+ local_lock_irq(&tce_page.lock); >+ tcep = __this_cpu_read(tce_page.page); > > if (!tcep) { > tcep = (__be64 *)__get_free_page(GFP_ATOMIC); > if (!tcep) { >- local_irq_enable(); >+ local_unlock_irq(&tce_page.lock); > return -ENOMEM; > } >- 
__this_cpu_write(tce_page, tcep); >+ __this_cpu_write(tce_page.page, tcep); > } > > proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; >@@ -511,7 +520,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, > > /* error cleanup: caller will clear whole range */ > >- local_irq_enable(); >+ local_unlock_irq(&tce_page.lock); > return rc; > } > >diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >index cd4c9a204d08c..1cc38fc2c4711 100644 >--- a/arch/riscv/Kconfig >+++ b/arch/riscv/Kconfig >@@ -49,6 +49,7 @@ config RISCV > select ARCH_SUPPORTS_HUGETLBFS if MMU > select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU > select ARCH_SUPPORTS_PER_VMA_LOCK if MMU >+ select ARCH_SUPPORTS_RT > select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK > select ARCH_USE_MEMTEST > select ARCH_USE_QUEUED_RWLOCKS >@@ -137,6 +138,7 @@ config RISCV > select HAVE_PERF_USER_STACK_DUMP > select HAVE_POSIX_CPU_TIMERS_TASK_WORK > select HAVE_PREEMPT_DYNAMIC_KEY if !XIP_KERNEL >+ select HAVE_PREEMPT_AUTO > select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RETHOOK if !XIP_KERNEL > select HAVE_RSEQ >diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h >index 574779900bfb3..fd9ddc27b29ba 100644 >--- a/arch/riscv/include/asm/thread_info.h >+++ b/arch/riscv/include/asm/thread_info.h >@@ -95,6 +95,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); > * - pending work-to-be-done flags are in lowest half-word > * - other flags in upper half-word(s) > */ >+#define TIF_ARCH_RESCHED_LAZY 0 /* Lazy rescheduling */ > #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ > #define TIF_SIGPENDING 2 /* signal pending */ > #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ >@@ -109,6 +110,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); > #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) > #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) > #define _TIF_UPROBE (1 << TIF_UPROBE) >+#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) > > #define _TIF_WORK_MASK \ > (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED | \ >diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig >index 1566748f16c42..77744230399f5 100644 >--- a/arch/x86/Kconfig >+++ b/arch/x86/Kconfig >@@ -27,7 +27,7 @@ config X86_64 > # Options that are inherently 64-bit kernel only: > select ARCH_HAS_GIGANTIC_PAGE > select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 >- select ARCH_SUPPORTS_PER_VMA_LOCK >+ select ARCH_SUPPORTS_RT > select HAVE_ARCH_SOFT_DIRTY > select MODULES_USE_ELF_RELA > select NEED_DMA_MAP_STATE >@@ -116,6 +116,7 @@ config X86 > select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG > select ARCH_SUPPORTS_LTO_CLANG > select ARCH_SUPPORTS_LTO_CLANG_THIN >+ select ARCH_SUPPORTS_RT > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 > select ARCH_USE_MEMTEST >@@ -271,6 +272,7 @@ config X86 > select HAVE_STATIC_CALL > select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL > select HAVE_PREEMPT_DYNAMIC_CALL >+ select HAVE_PREEMPT_AUTO > select HAVE_RSEQ > select HAVE_RUST if X86_64 > select HAVE_SYSCALL_TRACEPOINTS >diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h >index d63b02940747f..1ff38ebbd5880 100644 >--- a/arch/x86/include/asm/thread_info.h >+++ b/arch/x86/include/asm/thread_info.h >@@ -81,8 +81,9 @@ struct thread_info { > #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ > #define TIF_SIGPENDING 2 /* signal pending */ > #define TIF_NEED_RESCHED 3 /* 
rescheduling necessary */ >-#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ >-#define TIF_SSBD 5 /* Speculative store bypass disable */ >+#define TIF_ARCH_RESCHED_LAZY 4 /* Lazy rescheduling */ >+#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ >+#define TIF_SSBD 6 /* Speculative store bypass disable */ > #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ > #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ > #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ >@@ -104,6 +105,7 @@ struct thread_info { > #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) > #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) > #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) >+#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) > #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) > #define _TIF_SSBD (1 << TIF_SSBD) > #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) >diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c >index 55437f5e0c3ae..7fc47007b9263 100644 >--- a/drivers/acpi/processor_idle.c >+++ b/drivers/acpi/processor_idle.c >@@ -108,7 +108,7 @@ static const struct dmi_system_id processor_power_dmi_table[] = { > */ > static void __cpuidle acpi_safe_halt(void) > { >- if (!tif_need_resched()) { >+ if (!need_resched()) { > raw_safe_halt(); > raw_local_irq_disable(); > } >diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c >index d77d3664ca080..d083a5dfb244a 100644 >--- a/drivers/block/zram/zram_drv.c >+++ b/drivers/block/zram/zram_drv.c >@@ -57,6 +57,41 @@ static void zram_free_page(struct zram *zram, size_t index); > static int zram_read_page(struct zram *zram, struct page *page, u32 index, > struct bio *parent); > >+#ifdef CONFIG_PREEMPT_RT >+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) >+{ >+ size_t index; >+ >+ for (index = 0; index < num_pages; index++) >+ spin_lock_init(&zram->table[index].lock); >+} >+ >+static int zram_slot_trylock(struct zram *zram, u32 index) >+{ >+ int ret; >+ >+ ret = spin_trylock(&zram->table[index].lock); >+ if (ret) >+ __set_bit(ZRAM_LOCK, &zram->table[index].flags); >+ return ret; >+} >+ >+static void zram_slot_lock(struct zram *zram, u32 index) >+{ >+ spin_lock(&zram->table[index].lock); >+ __set_bit(ZRAM_LOCK, &zram->table[index].flags); >+} >+ >+static void zram_slot_unlock(struct zram *zram, u32 index) >+{ >+ __clear_bit(ZRAM_LOCK, &zram->table[index].flags); >+ spin_unlock(&zram->table[index].lock); >+} >+ >+#else >+ >+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } >+ > static int zram_slot_trylock(struct zram *zram, u32 index) > { > return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); >@@ -71,6 +106,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) > { > bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); > } >+#endif > > static inline bool init_done(struct zram *zram) > { >@@ -1242,6 +1278,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) > > if (!huge_class_size) > huge_class_size = zs_huge_class_size(zram->mem_pool); >+ zram_meta_init_table_locks(zram, num_pages); > return true; > } > >diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h >index d090753f97bec..833abc17d4140 100644 >--- a/drivers/block/zram/zram_drv.h >+++ b/drivers/block/zram/zram_drv.h >@@ -69,6 +69,9 @@ struct zram_table_entry { > unsigned long element; > }; > unsigned long flags; >+#ifdef CONFIG_PREEMPT_RT >+ spinlock_t lock; >+#endif > 
#ifdef CONFIG_ZRAM_MEMORY_TRACKING > ktime_t ac_time; > #endif >diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig >index ce397a8797f7b..98c3f532822d0 100644 >--- a/drivers/gpu/drm/i915/Kconfig >+++ b/drivers/gpu/drm/i915/Kconfig >@@ -3,7 +3,6 @@ config DRM_I915 > tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" > depends on DRM > depends on X86 && PCI >- depends on !PREEMPT_RT > select INTEL_GTT if X86 > select INTERVAL_TREE > # we need shmfs for the swappable backing store, and in particular >diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c >index 1fd068e6e26ca..8a82a7ebaf1e0 100644 >--- a/drivers/gpu/drm/i915/display/intel_crtc.c >+++ b/drivers/gpu/drm/i915/display/intel_crtc.c >@@ -573,7 +573,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > */ > intel_psr_wait_for_idle_locked(new_crtc_state); > >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > > crtc->debug.min_vbl = min; > crtc->debug.max_vbl = max; >@@ -598,11 +599,13 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > break; > } > >- local_irq_enable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_enable(); > > timeout = schedule_timeout(timeout); > >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > } > > finish_wait(wq, &wait); >@@ -635,7 +638,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, > return; > > irq_disable: >- local_irq_disable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_disable(); > } > > #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) >@@ -737,7 +741,8 @@ void intel_pipe_update_end(struct intel_atomic_state *state, > */ > intel_vrr_send_push(new_crtc_state); > >- local_irq_enable(); >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ local_irq_enable(); > > if (intel_vgpu_active(dev_priv)) > goto out; >diff --git a/drivers/gpu/drm/i915/display/intel_vblank.c b/drivers/gpu/drm/i915/display/intel_vblank.c >index 2cec2abf97466..2e4f0ed417b5a 100644 >--- a/drivers/gpu/drm/i915/display/intel_vblank.c >+++ b/drivers/gpu/drm/i915/display/intel_vblank.c >@@ -308,7 +308,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, > */ > spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); > >- /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) >+ preempt_disable(); > > /* Get optional system timestamp before query. */ > if (stime) >@@ -372,7 +373,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, > if (etime) > *etime = ktime_get(); > >- /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. 
*/ >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) >+ preempt_enable(); > > spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); > >diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >index ecc990ec1b952..8d04b10681f0d 100644 >--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c >@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) > /* Kick the work once more to drain the signalers, and disarm the irq */ > irq_work_sync(&b->irq_work); > while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { >- local_irq_disable(); >- signal_irq_work(&b->irq_work); >- local_irq_enable(); >+ irq_work_queue(&b->irq_work); > cond_resched(); >+ irq_work_sync(&b->irq_work); > } > } > >diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >index e8f42ec6b1b47..3274d71a5ce6a 100644 >--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c >@@ -1303,7 +1303,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > * and context switches) submission. > */ > >- spin_lock(&sched_engine->lock); >+ spin_lock_irq(&sched_engine->lock); > > /* > * If the queue is higher priority than the last >@@ -1403,7 +1403,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > * Even if ELSP[1] is occupied and not worthy > * of timeslices, our queue might be. > */ >- spin_unlock(&sched_engine->lock); >+ spin_unlock_irq(&sched_engine->lock); > return; > } > } >@@ -1429,7 +1429,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > > if (last && !can_merge_rq(last, rq)) { > spin_unlock(&ve->base.sched_engine->lock); >- spin_unlock(&engine->sched_engine->lock); >+ spin_unlock_irq(&engine->sched_engine->lock); > return; /* leave this for another sibling */ > } > >@@ -1591,7 +1591,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > */ > sched_engine->queue_priority_hint = queue_prio(sched_engine); > i915_sched_engine_reset_on_empty(sched_engine); >- spin_unlock(&sched_engine->lock); >+ spin_unlock_irq(&sched_engine->lock); > > /* > * We can skip poking the HW if we ended up with exactly the same set >@@ -1617,13 +1617,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > } > } > >-static void execlists_dequeue_irq(struct intel_engine_cs *engine) >-{ >- local_irq_disable(); /* Suspend interrupts across request submission */ >- execlists_dequeue(engine); >- local_irq_enable(); /* flush irq_work (e.g. 
breadcrumb enabling) */ >-} >- > static void clear_ports(struct i915_request **ports, int count) > { > memset_p((void **)ports, NULL, count); >@@ -2478,7 +2471,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) > } > > if (!engine->execlists.pending[0]) { >- execlists_dequeue_irq(engine); >+ execlists_dequeue(engine); > start_timeslice(engine); > } > >diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h >index 2b6dfe62c8f2a..28243efcaec2e 100644 >--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h >+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h >@@ -349,7 +349,7 @@ static inline int intel_guc_send_busy_loop(struct intel_guc *guc, > { > int err; > unsigned int sleep_period_ms = 1; >- bool not_atomic = !in_atomic() && !irqs_disabled(); >+ bool not_atomic = !in_atomic() && !irqs_disabled() && !rcu_preempt_depth(); > > /* > * FIXME: Have caller pass in if we are in an atomic context to avoid >diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c >index f59081066a197..014d02029a415 100644 >--- a/drivers/gpu/drm/i915/i915_request.c >+++ b/drivers/gpu/drm/i915/i915_request.c >@@ -609,7 +609,6 @@ bool __i915_request_submit(struct i915_request *request) > > RQ_TRACE(request, "\n"); > >- GEM_BUG_ON(!irqs_disabled()); > lockdep_assert_held(&engine->sched_engine->lock); > > /* >@@ -718,7 +717,6 @@ void __i915_request_unsubmit(struct i915_request *request) > */ > RQ_TRACE(request, "\n"); > >- GEM_BUG_ON(!irqs_disabled()); > lockdep_assert_held(&engine->sched_engine->lock); > > /* >diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h >index ce1cbee1b39dd..3c51620d011b1 100644 >--- a/drivers/gpu/drm/i915/i915_trace.h >+++ b/drivers/gpu/drm/i915/i915_trace.h >@@ -6,6 +6,10 @@ > #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) > #define _I915_TRACE_H_ > >+#ifdef CONFIG_PREEMPT_RT >+#define NOTRACE >+#endif >+ > #include <linux/stringify.h> > #include <linux/types.h> > #include <linux/tracepoint.h> >@@ -322,7 +326,7 @@ DEFINE_EVENT(i915_request, i915_request_add, > TP_ARGS(rq) > ); > >-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) >+#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) > DEFINE_EVENT(i915_request, i915_request_guc_submit, > TP_PROTO(struct i915_request *rq), > TP_ARGS(rq) >diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h >index c61066498bf2f..48e19e55d6b07 100644 >--- a/drivers/gpu/drm/i915/i915_utils.h >+++ b/drivers/gpu/drm/i915/i915_utils.h >@@ -288,7 +288,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) > #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) > > /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. 
*/ >-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) >+#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) > # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) > #else > # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) >diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c >index 9127331518587..6b89e9c2374a5 100644 >--- a/drivers/tty/serial/8250/8250_core.c >+++ b/drivers/tty/serial/8250/8250_core.c >@@ -592,6 +592,7 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) > > #ifdef CONFIG_SERIAL_8250_CONSOLE > >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > static void univ8250_console_write(struct console *co, const char *s, > unsigned int count) > { >@@ -599,6 +600,37 @@ static void univ8250_console_write(struct console *co, const char *s, > > serial8250_console_write(up, s, count); > } >+#else >+static bool univ8250_console_write_atomic(struct console *co, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_port *up = &serial8250_ports[co->index]; >+ >+ return serial8250_console_write_atomic(up, wctxt); >+} >+ >+static bool univ8250_console_write_thread(struct console *co, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_port *up = &serial8250_ports[co->index]; >+ >+ return serial8250_console_write_thread(up, wctxt); >+} >+ >+static void univ8250_console_driver_enter(struct console *con, unsigned long *flags) >+{ >+ struct uart_port *up = &serial8250_ports[con->index].port; >+ >+ __uart_port_lock_irqsave(up, flags); >+} >+ >+static void univ8250_console_driver_exit(struct console *con, unsigned long flags) >+{ >+ struct uart_port *up = &serial8250_ports[con->index].port; >+ >+ __uart_port_unlock_irqrestore(up, flags); >+} >+#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ > > static int univ8250_console_setup(struct console *co, char *options) > { >@@ -698,12 +730,20 @@ static int univ8250_console_match(struct console *co, char *name, int idx, > > static struct console univ8250_console = { > .name = "ttyS", >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > .write = univ8250_console_write, >+ .flags = CON_PRINTBUFFER | CON_ANYTIME, >+#else >+ .write_atomic = univ8250_console_write_atomic, >+ .write_thread = univ8250_console_write_thread, >+ .driver_enter = univ8250_console_driver_enter, >+ .driver_exit = univ8250_console_driver_exit, >+ .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_NBCON, >+#endif > .device = uart_console_device, > .setup = univ8250_console_setup, > .exit = univ8250_console_exit, > .match = univ8250_console_match, >- .flags = CON_PRINTBUFFER | CON_ANYTIME, > .index = -1, > .data = &serial8250_reg, > }; >diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c >index 8ca061d3bbb92..3b26105f01570 100644 >--- a/drivers/tty/serial/8250/8250_port.c >+++ b/drivers/tty/serial/8250/8250_port.c >@@ -550,6 +550,11 @@ static int serial8250_em485_init(struct uart_8250_port *p) > if (!p->em485) > return -ENOMEM; > >+#ifndef CONFIG_SERIAL_8250_LEGACY_CONSOLE >+ if (uart_console(&p->port)) >+ dev_warn(p->port.dev, "no atomic printing for rs485 consoles\n"); >+#endif >+ > hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, > HRTIMER_MODE_REL); > hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, >@@ -702,7 +707,11 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) > serial8250_rpm_put(p); > } > >-static void serial8250_clear_IER(struct uart_8250_port *up) >+/* 
>+ * Only to be used by write_atomic() and the legacy write(), which do not >+ * require port lock. >+ */ >+static void __serial8250_clear_IER(struct uart_8250_port *up) > { > if (up->capabilities & UART_CAP_UUE) > serial_out(up, UART_IER, UART_IER_UUE); >@@ -710,6 +719,14 @@ static void serial8250_clear_IER(struct uart_8250_port *up) > serial_out(up, UART_IER, 0); > } > >+static inline void serial8250_clear_IER(struct uart_8250_port *up) >+{ >+ /* Port locked to synchronize UART_IER access against the console. */ >+ lockdep_assert_held_once(&up->port.lock); >+ >+ __serial8250_clear_IER(up); >+} >+ > #ifdef CONFIG_SERIAL_8250_RSA > /* > * Attempts to turn on the RSA FIFO. Returns zero on failure. >@@ -3320,6 +3337,11 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) > > wait_for_xmitr(up, UART_LSR_THRE); > serial_port_out(port, UART_TX, ch); >+ >+ if (ch == '\n') >+ up->console_newline_needed = false; >+ else >+ up->console_newline_needed = true; > } > > /* >@@ -3348,6 +3370,7 @@ static void serial8250_console_restore(struct uart_8250_port *up) > serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); > } > >+#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE > /* > * Print a string to the serial port using the device FIFO > * >@@ -3406,7 +3429,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, > * First save the IER then disable the interrupts > */ > ier = serial_port_in(port, UART_IER); >- serial8250_clear_IER(up); >+ __serial8250_clear_IER(up); > > /* check scratch reg to see if port powered off during system sleep */ > if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >@@ -3472,6 +3495,135 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, > if (locked) > uart_port_unlock_irqrestore(port, flags); > } >+#else >+bool serial8250_console_write_thread(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_8250_em485 *em485 = up->em485; >+ struct uart_port *port = &up->port; >+ bool done = false; >+ unsigned int ier; >+ >+ touch_nmi_watchdog(); >+ >+ if (!nbcon_enter_unsafe(wctxt)) >+ return false; >+ >+ /* First save IER then disable the interrupts. */ >+ ier = serial_port_in(port, UART_IER); >+ serial8250_clear_IER(up); >+ >+ /* Check scratch reg if port powered off during system sleep. */ >+ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >+ serial8250_console_restore(up); >+ up->canary = 0; >+ } >+ >+ if (em485) { >+ if (em485->tx_stopped) >+ up->rs485_start_tx(up); >+ mdelay(port->rs485.delay_rts_before_send); >+ } >+ >+ if (nbcon_exit_unsafe(wctxt)) { >+ int len = READ_ONCE(wctxt->len); >+ int i; >+ >+ /* >+ * Write out the message. Toggle unsafe for each byte in order >+ * to give another (higher priority) context the opportunity >+ * for a friendly takeover. If such a takeover occurs, this >+ * context must reacquire ownership in order to perform final >+ * actions (such as re-enabling the interrupts). >+ * >+ * IMPORTANT: wctxt->outbuf and wctxt->len are no longer valid >+ * after a reacquire so writing the message must be >+ * aborted. 
>+ */ >+ for (i = 0; i < len; i++) { >+ if (!nbcon_enter_unsafe(wctxt)) { >+ nbcon_reacquire(wctxt); >+ break; >+ } >+ >+ uart_console_write(port, wctxt->outbuf + i, 1, serial8250_console_putchar); >+ >+ if (!nbcon_exit_unsafe(wctxt)) { >+ nbcon_reacquire(wctxt); >+ break; >+ } >+ } >+ done = (i == len); >+ } else { >+ nbcon_reacquire(wctxt); >+ } >+ >+ while (!nbcon_enter_unsafe(wctxt)) >+ nbcon_reacquire(wctxt); >+ >+ /* Finally, wait for transmitter to become empty and restore IER. */ >+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); >+ if (em485) { >+ mdelay(port->rs485.delay_rts_after_send); >+ if (em485->tx_stopped) >+ up->rs485_stop_tx(up); >+ } >+ serial_port_out(port, UART_IER, ier); >+ >+ /* >+ * The receive handling will happen properly because the receive ready >+ * bit will still be set; it is not cleared on read. However, modem >+ * control will not, we must call it if we have saved something in the >+ * saved flags while processing with interrupts off. >+ */ >+ if (up->msr_saved_flags) >+ serial8250_modem_status(up); >+ >+ /* Success if no handover/takeover and message fully printed. */ >+ return (nbcon_exit_unsafe(wctxt) && done); >+} >+ >+bool serial8250_console_write_atomic(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt) >+{ >+ struct uart_port *port = &up->port; >+ unsigned int ier; >+ >+ /* Atomic console not supported for rs485 mode. */ >+ if (up->em485) >+ return false; >+ >+ touch_nmi_watchdog(); >+ >+ if (!nbcon_enter_unsafe(wctxt)) >+ return false; >+ >+ /* >+ * First save IER then disable the interrupts. The special variant to >+ * clear IER is used because atomic printing may occur without holding >+ * the port lock. >+ */ >+ ier = serial_port_in(port, UART_IER); >+ __serial8250_clear_IER(up); >+ >+ /* Check scratch reg if port powered off during system sleep. */ >+ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { >+ serial8250_console_restore(up); >+ up->canary = 0; >+ } >+ >+ if (up->console_newline_needed) >+ uart_console_write(port, "\n", 1, serial8250_console_putchar); >+ uart_console_write(port, wctxt->outbuf, wctxt->len, serial8250_console_putchar); >+ >+ /* Finally, wait for transmitter to become empty and restore IER. */ >+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); >+ serial_port_out(port, UART_IER, ier); >+ >+ /* Success if no handover/takeover. 
*/ >+ return nbcon_exit_unsafe(wctxt); >+} >+#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ > > static unsigned int probe_baud(struct uart_port *port) > { >@@ -3490,6 +3642,7 @@ static unsigned int probe_baud(struct uart_port *port) > > int serial8250_console_setup(struct uart_port *port, char *options, bool probe) > { >+ struct uart_8250_port *up = up_to_u8250p(port); > int baud = 9600; > int bits = 8; > int parity = 'n'; >@@ -3499,6 +3652,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) > if (!port->iobase && !port->membase) > return -ENODEV; > >+ up->console_newline_needed = false; >+ > if (options) > uart_parse_options(options, &baud, &parity, &bits, &flow); > else if (probe) >diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c >index b7635363373e2..afb4f03e44e70 100644 >--- a/drivers/tty/serial/amba-pl011.c >+++ b/drivers/tty/serial/amba-pl011.c >@@ -2328,13 +2328,10 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) > > clk_enable(uap->clk); > >- local_irq_save(flags); >- if (uap->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&uap->port); >+ if (uap->port.sysrq || oops_in_progress) >+ locked = uart_port_trylock_irqsave(&uap->port, &flags); > else >- uart_port_lock(&uap->port); >+ uart_port_lock_irqsave(&uap->port, &flags); > > /* > * First save the CR then disable the interrupts >@@ -2360,8 +2357,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) > pl011_write(old_cr, uap, REG_CR); > > if (locked) >- uart_port_unlock(&uap->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&uap->port, flags); > > clk_disable(uap->clk); > } >diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c >index ad4c1c5d0a7f0..90369adc33f03 100644 >--- a/drivers/tty/serial/omap-serial.c >+++ b/drivers/tty/serial/omap-serial.c >@@ -1212,13 +1212,10 @@ serial_omap_console_write(struct console *co, const char *s, > unsigned int ier; > int locked = 1; > >- local_irq_save(flags); >- if (up->port.sysrq) >- locked = 0; >- else if (oops_in_progress) >- locked = uart_port_trylock(&up->port); >+ if (up->port.sysrq || oops_in_progress) >+ locked = uart_port_trylock_irqsave(&up->port, &flags); > else >- uart_port_lock(&up->port); >+ uart_port_lock_irqsave(&up->port, &flags); > > /* > * First save the IER then disable the interrupts >@@ -1245,8 +1242,7 @@ serial_omap_console_write(struct console *co, const char *s, > check_modem_status(up); > > if (locked) >- uart_port_unlock(&up->port); >- local_irq_restore(flags); >+ uart_port_unlock_irqrestore(&up->port, flags); > } > > static int __init >diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c >index 06414e43e0b53..d6aebf5ebfa21 100644 >--- a/drivers/tty/tty_io.c >+++ b/drivers/tty/tty_io.c >@@ -3544,8 +3544,15 @@ static ssize_t show_cons_active(struct device *dev, > for_each_console(c) { > if (!c->device) > continue; >- if (!c->write) >- continue; >+ if (c->flags & CON_NBCON) { >+ if (!c->write_atomic && >+ !(c->write_thread && c->kthread)) { >+ continue; >+ } >+ } else { >+ if (!c->write) >+ continue; >+ } > if ((c->flags & CON_ENABLED) == 0) > continue; > cs[i++] = c; >diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c >index e0758fe7936dc..2703676549f5e 100644 >--- a/fs/proc/consoles.c >+++ b/fs/proc/consoles.c >@@ -21,12 +21,14 @@ static int show_console_dev(struct seq_file *m, void *v) > { CON_ENABLED, 'E' }, > { CON_CONSDEV, 'C' }, > { CON_BOOT, 'B' }, >+ { 
CON_NBCON, 'N' }, > { CON_PRINTBUFFER, 'p' }, > { CON_BRL, 'b' }, > { CON_ANYTIME, 'a' }, > }; > char flags[ARRAY_SIZE(con_flags) + 1]; > struct console *con = v; >+ char con_write = '-'; > unsigned int a; > dev_t dev = 0; > >@@ -57,9 +59,15 @@ static int show_console_dev(struct seq_file *m, void *v) > seq_setwidth(m, 21 - 1); > seq_printf(m, "%s%d", con->name, con->index); > seq_pad(m, ' '); >- seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', >- con->write ? 'W' : '-', con->unblank ? 'U' : '-', >- flags); >+ if (con->flags & CON_NBCON) { >+ if (con->write_atomic || con->write_thread) >+ con_write = 'W'; >+ } else { >+ if (con->write) >+ con_write = 'W'; >+ } >+ seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con_write, >+ con->unblank ? 'U' : '-', flags); > if (dev) > seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); > >diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h >index fc53e0ad56d90..448bbef474564 100644 >--- a/include/linux/bottom_half.h >+++ b/include/linux/bottom_half.h >@@ -35,8 +35,10 @@ static inline void local_bh_enable(void) > > #ifdef CONFIG_PREEMPT_RT > extern bool local_bh_blocked(void); >+extern void softirq_preempt(void); > #else > static inline bool local_bh_blocked(void) { return false; } >+static inline void softirq_preempt(void) { } > #endif > > #endif /* _LINUX_BH_H */ >diff --git a/include/linux/console.h b/include/linux/console.h >index 779d388af8a0a..79ef2fd2bd155 100644 >--- a/include/linux/console.h >+++ b/include/linux/console.h >@@ -16,7 +16,9 @@ > > #include <linux/atomic.h> > #include <linux/bits.h> >+#include <linux/irq_work.h> > #include <linux/rculist.h> >+#include <linux/rcuwait.h> > #include <linux/types.h> > > struct vc_data; >@@ -303,9 +305,16 @@ struct nbcon_write_context { > * @node: hlist node for the console list > * > * @write_atomic: Write callback for atomic context >+ * @write_thread: Write callback for non-atomic context >+ * @driver_enter: Callback to begin synchronization with driver code >+ * @driver_exit: Callback to finish synchronization with driver code > * @nbcon_state: State for nbcon consoles > * @nbcon_seq: Sequence number of the next record for nbcon to print > * @pbufs: Pointer to nbcon private buffer >+ * @locked_port: True, if the port lock is locked by nbcon >+ * @kthread: Printer kthread for this console >+ * @rcuwait: RCU-safe wait object for @kthread waking >+ * @irq_work: Defer @kthread waking to IRQ work context > */ > struct console { > char name[16]; >@@ -329,9 +338,17 @@ struct console { > /* nbcon console specific members */ > bool (*write_atomic)(struct console *con, > struct nbcon_write_context *wctxt); >+ bool (*write_thread)(struct console *con, >+ struct nbcon_write_context *wctxt); >+ void (*driver_enter)(struct console *con, unsigned long *flags); >+ void (*driver_exit)(struct console *con, unsigned long flags); > atomic_t __private nbcon_state; > atomic_long_t __private nbcon_seq; > struct printk_buffers *pbufs; >+ bool locked_port; >+ struct task_struct *kthread; >+ struct rcuwait rcuwait; >+ struct irq_work irq_work; > }; > > #ifdef CONFIG_LOCKDEP >@@ -459,13 +476,19 @@ static inline bool console_is_registered(const struct console *con) > hlist_for_each_entry(con, &console_list, node) > > #ifdef CONFIG_PRINTK >+extern void nbcon_cpu_emergency_enter(void); >+extern void nbcon_cpu_emergency_exit(void); > extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); > extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); > extern bool nbcon_exit_unsafe(struct 
nbcon_write_context *wctxt); >+extern void nbcon_reacquire(struct nbcon_write_context *wctxt); > #else >+static inline void nbcon_cpu_emergency_enter(void) { } >+static inline void nbcon_cpu_emergency_exit(void) { } > static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } > static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } > static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } >+static inline void nbcon_reacquire(struct nbcon_write_context *wctxt) { } > #endif > > extern int console_set_on_cmdline; >diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h >index d95ab85f96ba5..8b3ab0cc1334d 100644 >--- a/include/linux/entry-common.h >+++ b/include/linux/entry-common.h >@@ -60,7 +60,7 @@ > #define EXIT_TO_USER_MODE_WORK \ > (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ > _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ >- ARCH_EXIT_TO_USER_MODE_WORK) >+ _TIF_NEED_RESCHED_LAZY | ARCH_EXIT_TO_USER_MODE_WORK) > > /** > * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs >diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h >index 6813171afccb2..674a622c91be2 100644 >--- a/include/linux/entry-kvm.h >+++ b/include/linux/entry-kvm.h >@@ -18,7 +18,7 @@ > > #define XFER_TO_GUEST_MODE_WORK \ > (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ >- _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) >+ _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED_LAZY | ARCH_XFER_TO_GUEST_MODE_WORK) > > struct kvm_vcpu; > >diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h >index 76121c2bb4f82..f75f6bc195d18 100644 >--- a/include/linux/interrupt.h >+++ b/include/linux/interrupt.h >@@ -609,6 +609,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq(unsigned int nr); > >+#ifdef CONFIG_PREEMPT_RT >+DECLARE_PER_CPU(struct task_struct *, timersd); >+DECLARE_PER_CPU(unsigned long, pending_timer_softirq); >+ >+extern void raise_timer_softirq(void); >+extern void raise_hrtimer_softirq(void); >+ >+static inline unsigned int local_pending_timers(void) >+{ >+ return __this_cpu_read(pending_timer_softirq); >+} >+ >+#else >+static inline void raise_timer_softirq(void) >+{ >+ raise_softirq(TIMER_SOFTIRQ); >+} >+ >+static inline void raise_hrtimer_softirq(void) >+{ >+ raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+} >+ >+static inline unsigned int local_pending_timers(void) >+{ >+ return local_softirq_pending(); >+} >+#endif >+ > DECLARE_PER_CPU(struct task_struct *, ksoftirqd); > > static inline struct task_struct *this_cpu_ksoftirqd(void) >diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h >index 2564e209465ea..1b1df7306627a 100644 >--- a/include/linux/netdevice.h >+++ b/include/linux/netdevice.h >@@ -3288,7 +3288,11 @@ struct softnet_data { > int defer_count; > int defer_ipi_scheduled; > struct sk_buff *defer_list; >+#ifndef CONFIG_PREEMPT_RT > call_single_data_t defer_csd; >+#else >+ struct work_struct defer_work; >+#endif > }; > > static inline void input_queue_head_incr(struct softnet_data *sd) >diff --git a/include/linux/preempt.h b/include/linux/preempt.h >index 9aa6358a1a16b..cd16f0330fba2 100644 >--- a/include/linux/preempt.h >+++ b/include/linux/preempt.h >@@ -230,15 +230,21 @@ do { \ > #define preempt_enable() \ > do { \ > barrier(); \ >- if (unlikely(preempt_count_dec_and_test())) \ >+ if 
(unlikely(preempt_count_dec_and_test())) { \ >+ instrumentation_begin(); \ > __preempt_schedule(); \ >+ instrumentation_end(); \ >+ } \ > } while (0) > > #define preempt_enable_notrace() \ > do { \ > barrier(); \ >- if (unlikely(__preempt_count_dec_and_test())) \ >+ if (unlikely(__preempt_count_dec_and_test())) { \ >+ instrumentation_begin(); \ > __preempt_schedule_notrace(); \ >+ instrumentation_end(); \ >+ } \ > } while (0) > > #define preempt_check_resched() \ >diff --git a/include/linux/printk.h b/include/linux/printk.h >index 8ef499ab3c1ed..7a942e987b165 100644 >--- a/include/linux/printk.h >+++ b/include/linux/printk.h >@@ -9,6 +9,8 @@ > #include <linux/ratelimit_types.h> > #include <linux/once_lite.h> > >+struct uart_port; >+ > extern const char linux_banner[]; > extern const char linux_proc_banner[]; > >@@ -159,13 +161,16 @@ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); > > extern void __printk_safe_enter(void); > extern void __printk_safe_exit(void); >+extern void __printk_deferred_enter(void); >+extern void __printk_deferred_exit(void); >+ > /* > * The printk_deferred_enter/exit macros are available only as a hack for > * some code paths that need to defer all printk console printing. Interrupts > * must be disabled for the deferred duration. > */ >-#define printk_deferred_enter __printk_safe_enter >-#define printk_deferred_exit __printk_safe_exit >+#define printk_deferred_enter() __printk_deferred_enter() >+#define printk_deferred_exit() __printk_deferred_exit() > > /* > * Please don't use printk_ratelimit(), because it shares ratelimiting state >@@ -192,6 +197,10 @@ void show_regs_print_info(const char *log_lvl); > extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; > extern asmlinkage void dump_stack(void) __cold; > void printk_trigger_flush(void); >+void printk_legacy_allow_panic_sync(void); >+extern void nbcon_acquire(struct uart_port *up); >+extern void nbcon_release(struct uart_port *up); >+void nbcon_atomic_flush_unsafe(void); > #else > static inline __printf(1, 0) > int vprintk(const char *s, va_list args) >@@ -271,6 +280,23 @@ static inline void dump_stack(void) > static inline void printk_trigger_flush(void) > { > } >+ >+static inline void printk_legacy_allow_panic_sync(void) >+{ >+} >+ >+static inline void nbcon_acquire(struct uart_port *up) >+{ >+} >+ >+static inline void nbcon_release(struct uart_port *up) >+{ >+} >+ >+static inline void nbcon_atomic_flush_unsafe(void) >+{ >+} >+ > #endif > > #ifdef CONFIG_SMP >diff --git a/include/linux/sched.h b/include/linux/sched.h >index 292c316972485..23d2153a90777 100644 >--- a/include/linux/sched.h >+++ b/include/linux/sched.h >@@ -1910,6 +1910,7 @@ static inline int dl_task_check_affinity(struct task_struct *p, const struct cpu > } > #endif > >+extern bool task_is_pi_boosted(const struct task_struct *p); > extern int yield_to(struct task_struct *p, bool preempt); > extern void set_user_nice(struct task_struct *p, long nice); > extern int task_prio(const struct task_struct *p); >@@ -2054,17 +2055,17 @@ static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, > update_ti_thread_flag(task_thread_info(tsk), flag, value); > } > >-static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); > } > >-static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool 
test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); > } > >-static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) >+static inline bool test_tsk_thread_flag(struct task_struct *tsk, int flag) > { > return test_ti_thread_flag(task_thread_info(tsk), flag); > } >@@ -2077,9 +2078,11 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) > static inline void clear_tsk_need_resched(struct task_struct *tsk) > { > clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); >+ if (IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO)) >+ clear_tsk_thread_flag(tsk, TIF_NEED_RESCHED_LAZY); > } > >-static inline int test_tsk_need_resched(struct task_struct *tsk) >+static inline bool test_tsk_need_resched(struct task_struct *tsk) > { > return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); > } >@@ -2260,7 +2263,7 @@ static inline int rwlock_needbreak(rwlock_t *lock) > > static __always_inline bool need_resched(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(tif_need_resched_lazy() || tif_need_resched()); > } > > /* >diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h >index 478084f9105e1..719416fe8ddc0 100644 >--- a/include/linux/sched/idle.h >+++ b/include/linux/sched/idle.h >@@ -63,7 +63,7 @@ static __always_inline bool __must_check current_set_polling_and_test(void) > */ > smp_mb__after_atomic(); > >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > > static __always_inline bool __must_check current_clr_polling_and_test(void) >@@ -76,7 +76,7 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) > */ > smp_mb__after_atomic(); > >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > > #else >@@ -85,11 +85,11 @@ static inline void __current_clr_polling(void) { } > > static inline bool __must_check current_set_polling_and_test(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > static inline bool __must_check current_clr_polling_and_test(void) > { >- return unlikely(tif_need_resched()); >+ return unlikely(need_resched()); > } > #endif > >diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h >index be65de65fe612..ec46e3b49ee99 100644 >--- a/include/linux/serial_8250.h >+++ b/include/linux/serial_8250.h >@@ -153,6 +153,8 @@ struct uart_8250_port { > #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA > unsigned char msr_saved_flags; > >+ bool console_newline_needed; >+ > struct uart_8250_dma *dma; > const struct uart_8250_ops *ops; > >@@ -204,6 +206,10 @@ void serial8250_init_port(struct uart_8250_port *up); > void serial8250_set_defaults(struct uart_8250_port *up); > void serial8250_console_write(struct uart_8250_port *up, const char *s, > unsigned int count); >+bool serial8250_console_write_atomic(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt); >+bool serial8250_console_write_thread(struct uart_8250_port *up, >+ struct nbcon_write_context *wctxt); > int serial8250_console_setup(struct uart_port *port, char *options, bool probe); > int serial8250_console_exit(struct uart_port *port); > >diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h >index 89f7b6c63598c..a49ce4bf7fae8 100644 >--- a/include/linux/serial_core.h >+++ b/include/linux/serial_core.h >@@ -595,6 +595,7 @@ struct uart_port { > static inline void uart_port_lock(struct uart_port *up) > { > spin_lock(&up->lock); >+ nbcon_acquire(up); > } > > /** >@@ 
-604,6 +605,7 @@ static inline void uart_port_lock(struct uart_port *up) > static inline void uart_port_lock_irq(struct uart_port *up) > { > spin_lock_irq(&up->lock); >+ nbcon_acquire(up); > } > > /** >@@ -614,6 +616,7 @@ static inline void uart_port_lock_irq(struct uart_port *up) > static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) > { > spin_lock_irqsave(&up->lock, *flags); >+ nbcon_acquire(up); > } > > /** >@@ -624,7 +627,11 @@ static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *f > */ > static inline bool uart_port_trylock(struct uart_port *up) > { >- return spin_trylock(&up->lock); >+ if (!spin_trylock(&up->lock)) >+ return false; >+ >+ nbcon_acquire(up); >+ return true; > } > > /** >@@ -636,7 +643,11 @@ static inline bool uart_port_trylock(struct uart_port *up) > */ > static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags) > { >- return spin_trylock_irqsave(&up->lock, *flags); >+ if (!spin_trylock_irqsave(&up->lock, *flags)) >+ return false; >+ >+ nbcon_acquire(up); >+ return true; > } > > /** >@@ -645,6 +656,7 @@ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long > */ > static inline void uart_port_unlock(struct uart_port *up) > { >+ nbcon_release(up); > spin_unlock(&up->lock); > } > >@@ -654,6 +666,7 @@ static inline void uart_port_unlock(struct uart_port *up) > */ > static inline void uart_port_unlock_irq(struct uart_port *up) > { >+ nbcon_release(up); > spin_unlock_irq(&up->lock); > } > >@@ -663,6 +676,19 @@ static inline void uart_port_unlock_irq(struct uart_port *up) > * @flags: The saved interrupt flags for restore > */ > static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) >+{ >+ nbcon_release(up); >+ spin_unlock_irqrestore(&up->lock, flags); >+} >+ >+/* Only for use in the console->driver_enter() callback. */ >+static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) >+{ >+ spin_lock_irqsave(&up->lock, *flags); >+} >+ >+/* Only for use in the console->driver_exit() callback. 
*/ >+static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) > { > spin_unlock_irqrestore(&up->lock, flags); > } >diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h >index 9ea0b28068f49..5ded1450ac1a1 100644 >--- a/include/linux/thread_info.h >+++ b/include/linux/thread_info.h >@@ -59,6 +59,16 @@ enum syscall_work_bit { > > #include <asm/thread_info.h> > >+#ifdef CONFIG_PREEMPT_BUILD_AUTO >+# define TIF_NEED_RESCHED_LAZY TIF_ARCH_RESCHED_LAZY >+# define _TIF_NEED_RESCHED_LAZY _TIF_ARCH_RESCHED_LAZY >+# define TIF_NEED_RESCHED_LAZY_OFFSET (TIF_NEED_RESCHED_LAZY - TIF_NEED_RESCHED) >+#else >+# define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED >+# define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED >+# define TIF_NEED_RESCHED_LAZY_OFFSET 0 >+#endif >+ > #ifdef __KERNEL__ > > #ifndef arch_set_restart_data >@@ -185,6 +195,13 @@ static __always_inline bool tif_need_resched(void) > (unsigned long *)(¤t_thread_info()->flags)); > } > >+static __always_inline bool tif_need_resched_lazy(void) >+{ >+ return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && >+ arch_test_bit(TIF_NEED_RESCHED_LAZY, >+ (unsigned long *)(¤t_thread_info()->flags)); >+} >+ > #else > > static __always_inline bool tif_need_resched(void) >@@ -193,6 +210,13 @@ static __always_inline bool tif_need_resched(void) > (unsigned long *)(¤t_thread_info()->flags)); > } > >+static __always_inline bool tif_need_resched_lazy(void) >+{ >+ return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && >+ test_bit(TIF_NEED_RESCHED_LAZY, >+ (unsigned long *)(¤t_thread_info()->flags)); >+} >+ > #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ > > #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES >diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h >index d68ff9b1247f9..0681b3d5a85c6 100644 >--- a/include/linux/trace_events.h >+++ b/include/linux/trace_events.h >@@ -178,8 +178,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); > > enum trace_flag_type { > TRACE_FLAG_IRQS_OFF = 0x01, >- TRACE_FLAG_IRQS_NOSUPPORT = 0x02, >- TRACE_FLAG_NEED_RESCHED = 0x04, >+ TRACE_FLAG_NEED_RESCHED = 0x02, >+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x04, > TRACE_FLAG_HARDIRQ = 0x08, > TRACE_FLAG_SOFTIRQ = 0x10, > TRACE_FLAG_PREEMPT_RESCHED = 0x20, >@@ -205,11 +205,11 @@ static inline unsigned int tracing_gen_ctx(void) > > static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) > { >- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); >+ return tracing_gen_ctx_irq_test(0); > } > static inline unsigned int tracing_gen_ctx(void) > { >- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); >+ return tracing_gen_ctx_irq_test(0); > } > #endif > >diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt >index c2f1fd95a8214..0f3d4c2a41cb7 100644 >--- a/kernel/Kconfig.preempt >+++ b/kernel/Kconfig.preempt >@@ -11,6 +11,13 @@ config PREEMPT_BUILD > select PREEMPTION > select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK > >+config PREEMPT_BUILD_AUTO >+ bool >+ select PREEMPT_BUILD >+ >+config HAVE_PREEMPT_AUTO >+ bool >+ > choice > prompt "Preemption Model" > default PREEMPT_NONE >@@ -67,9 +74,17 @@ config PREEMPT > embedded system with latency requirements in the milliseconds > range. 
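The PREEMPT_AUTO plumbing above splits the reschedule request into an immediate flag (TIF_NEED_RESCHED) and a lazy one (TIF_NEED_RESCHED_LAZY), widens need_resched() to test both, and switches the idle-polling helpers from tif_need_resched() to need_resched() accordingly. A minimal sketch of the consequence for driver-style polling loops follows; it is not part of the patch, and the device and its ready bit are hypothetical:

/*
 * Illustrative sketch only. With PREEMPT_BUILD_AUTO the scheduler may
 * set only TIF_NEED_RESCHED_LAZY, which tif_need_resched() does not
 * see, so busy-wait loops should check need_resched(), which now
 * covers both flags.
 */
#include <linux/sched.h>
#include <linux/io.h>

static int wait_for_ready(void __iomem *status_reg)
{
	while (!(readl(status_reg) & 0x1)) {	/* hypothetical ready bit */
		if (need_resched())
			cond_resched();		/* honours lazy and immediate requests */
		cpu_relax();
	}
	return 0;
}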
> >+config PREEMPT_AUTO >+ bool "Automagic preemption mode with runtime tweaking support" >+ depends on HAVE_PREEMPT_AUTO >+ select PREEMPT_BUILD_AUTO >+ help >+ Add some sensible blurb here >+ > config PREEMPT_RT > bool "Fully Preemptible Kernel (Real-Time)" > depends on EXPERT && ARCH_SUPPORTS_RT >+ select PREEMPT_BUILD_AUTO if HAVE_PREEMPT_AUTO > select PREEMPTION > help > This option turns the kernel into a real-time kernel by replacing >@@ -95,7 +110,7 @@ config PREEMPTION > > config PREEMPT_DYNAMIC > bool "Preemption behaviour defined on boot" >- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT >+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT && !PREEMPT_AUTO > select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY > select PREEMPT_BUILD > default y if HAVE_PREEMPT_DYNAMIC_CALL >diff --git a/kernel/entry/common.c b/kernel/entry/common.c >index d7ee4bc3f2ba3..c1f706038637c 100644 >--- a/kernel/entry/common.c >+++ b/kernel/entry/common.c >@@ -155,7 +155,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, > > local_irq_enable_exit_to_user(ti_work); > >- if (ti_work & _TIF_NEED_RESCHED) >+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > schedule(); > > if (ti_work & _TIF_UPROBE) >@@ -385,7 +385,7 @@ void raw_irqentry_exit_cond_resched(void) > rcu_irq_exit_check_preempt(); > if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) > WARN_ON_ONCE(!on_thread_stack()); >- if (need_resched()) >+ if (test_tsk_need_resched(current)) > preempt_schedule_irq(); > } > } >diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c >index 2e0f75bcb7fd1..d952fa5ee8801 100644 >--- a/kernel/entry/kvm.c >+++ b/kernel/entry/kvm.c >@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) > return -EINTR; > } > >- if (ti_work & _TIF_NEED_RESCHED) >+ if (ti_work & (_TIF_NEED_RESCHED | TIF_NEED_RESCHED_LAZY)) > schedule(); > > if (ti_work & _TIF_NOTIFY_RESUME) >diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c >index 1d4bc493b2f4b..486c68c11bbe2 100644 >--- a/kernel/ksysfs.c >+++ b/kernel/ksysfs.c >@@ -179,6 +179,15 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size); > > #endif /* CONFIG_CRASH_CORE */ > >+#if defined(CONFIG_PREEMPT_RT) >+static ssize_t realtime_show(struct kobject *kobj, >+ struct kobj_attribute *attr, char *buf) >+{ >+ return sprintf(buf, "%d\n", 1); >+} >+KERNEL_ATTR_RO(realtime); >+#endif >+ > /* whether file capabilities are enabled */ > static ssize_t fscaps_show(struct kobject *kobj, > struct kobj_attribute *attr, char *buf) >@@ -274,6 +283,9 @@ static struct attribute * kernel_attrs[] = { > #ifndef CONFIG_TINY_RCU > &rcu_expedited_attr.attr, > &rcu_normal_attr.attr, >+#endif >+#ifdef CONFIG_PREEMPT_RT >+ &realtime_attr.attr, > #endif > NULL > }; >diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c >index 151bd3de59363..5c21ba41e3087 100644 >--- a/kernel/locking/lockdep.c >+++ b/kernel/locking/lockdep.c >@@ -56,6 +56,7 @@ > #include <linux/kprobes.h> > #include <linux/lockdep.h> > #include <linux/context_tracking.h> >+#include <linux/console.h> > > #include <asm/sections.h> > >@@ -3971,6 +3972,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, > if (!debug_locks_off() || debug_locks_silent) > return; > >+ nbcon_cpu_emergency_enter(); >+ > pr_warn("\n"); > pr_warn("================================\n"); > pr_warn("WARNING: inconsistent lock state\n"); >@@ -3999,6 +4002,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, > > pr_warn("\nstack backtrace:\n"); > dump_stack(); >+ >+ nbcon_cpu_emergency_exit(); > } > > 
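The ksysfs hunk above exposes a read-only /sys/kernel/realtime attribute that contains "1" on PREEMPT_RT kernels and is absent otherwise. A small userspace sketch (not part of the patch) of probing it:

/*
 * Userspace illustration only: detect a PREEMPT_RT kernel through the
 * /sys/kernel/realtime attribute added above. The file exists only on
 * CONFIG_PREEMPT_RT kernels, so a missing file means "not RT".
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/realtime", "r");
	int rt = 0;

	if (f) {
		if (fscanf(f, "%d", &rt) != 1)
			rt = 0;
		fclose(f);
	}
	printf("PREEMPT_RT: %s\n", rt ? "yes" : "no");
	return 0;
}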
/* >diff --git a/kernel/panic.c b/kernel/panic.c >index 2807639aab51d..9fa44bc38f466 100644 >--- a/kernel/panic.c >+++ b/kernel/panic.c >@@ -370,6 +370,8 @@ void panic(const char *fmt, ...) > */ > atomic_notifier_call_chain(&panic_notifier_list, 0, buf); > >+ printk_legacy_allow_panic_sync(); >+ > panic_print_sys_info(false); > > kmsg_dump(KMSG_DUMP_PANIC); >@@ -446,6 +448,15 @@ void panic(const char *fmt, ...) > > /* Do not scroll important messages printed above */ > suppress_printk = 1; >+ >+ /* >+ * The final messages may not have been printed if in a context that >+ * defers printing (such as NMI) and irq_work is not available. >+ * Explicitly flush the kernel log buffer one last time. >+ */ >+ console_flush_on_panic(CONSOLE_FLUSH_PENDING); >+ nbcon_atomic_flush_unsafe(); >+ > local_irq_enable(); > for (i = 0; ; i += PANIC_TIMER_STEP) { > touch_softlockup_watchdog(); >@@ -623,6 +634,7 @@ bool oops_may_print(void) > */ > void oops_enter(void) > { >+ nbcon_cpu_emergency_enter(); > tracing_off(); > /* can't trust the integrity of the kernel anymore: */ > debug_locks_off(); >@@ -645,6 +657,7 @@ void oops_exit(void) > { > do_oops_enter_exit(); > print_oops_end_marker(); >+ nbcon_cpu_emergency_exit(); > kmsg_dump(KMSG_DUMP_OOPS); > } > >@@ -656,6 +669,8 @@ struct warn_args { > void __warn(const char *file, int line, void *caller, unsigned taint, > struct pt_regs *regs, struct warn_args *args) > { >+ nbcon_cpu_emergency_enter(); >+ > disable_trace_on_warning(); > > if (file) >@@ -686,6 +701,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, > > /* Just a warning, don't kill lockdep. */ > add_taint(taint, LOCKDEP_STILL_OK); >+ >+ nbcon_cpu_emergency_exit(); > } > > #ifdef CONFIG_BUG >diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h >index 6c2afee5ef620..7db6992c54f38 100644 >--- a/kernel/printk/internal.h >+++ b/kernel/printk/internal.h >@@ -44,6 +44,17 @@ enum printk_info_flags { > }; > > extern struct printk_ringbuffer *prb; >+extern bool printk_threads_enabled; >+extern bool have_legacy_console; >+extern bool have_boot_console; >+ >+/* >+ * Specifies if the console lock/unlock dance is needed for console >+ * printing. If @have_boot_console is true, the nbcon consoles will >+ * be printed serially along with the legacy consoles because nbcon >+ * consoles cannot print simultaneously with boot consoles. >+ */ >+#define printing_via_unlock (have_legacy_console || have_boot_console) > > __printf(4, 0) > int vprintk_store(int facility, int level, >@@ -71,12 +82,78 @@ void defer_console_output(void); > > u16 printk_parse_prefix(const char *text, int *level, > enum printk_info_flags *flags); >+void console_lock_spinning_enable(void); >+int console_lock_spinning_disable_and_check(int cookie); > > u64 nbcon_seq_read(struct console *con); > void nbcon_seq_force(struct console *con, u64 seq); > bool nbcon_alloc(struct console *con); > void nbcon_init(struct console *con); > void nbcon_free(struct console *con); >+enum nbcon_prio nbcon_get_default_prio(void); >+void nbcon_atomic_flush_all(void); >+bool nbcon_atomic_emit_next_record(struct console *con, bool *handover, int cookie); >+void nbcon_kthread_create(struct console *con); >+void nbcon_wake_threads(void); >+void nbcon_legacy_kthread_create(void); >+ >+/* >+ * Check if the given console is currently capable and allowed to print >+ * records. Note that this function does not consider the current context, >+ * which can also play a role in deciding if @con can be used to print >+ * records. 
>+ */ >+static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) >+{ >+ if (!(flags & CON_ENABLED)) >+ return false; >+ >+ if ((flags & CON_SUSPENDED)) >+ return false; >+ >+ if (flags & CON_NBCON) { >+ if (use_atomic) { >+ if (!con->write_atomic) >+ return false; >+ } else { >+ if (!con->write_thread || !con->kthread) >+ return false; >+ } >+ } else { >+ if (!con->write) >+ return false; >+ } >+ >+ /* >+ * Console drivers may assume that per-cpu resources have been >+ * allocated. So unless they're explicitly marked as being able to >+ * cope (CON_ANYTIME) don't call them until this CPU is officially up. >+ */ >+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) >+ return false; >+ >+ return true; >+} >+ >+/** >+ * nbcon_kthread_wake - Wake up a printk thread >+ * @con: Console to operate on >+ */ >+static inline void nbcon_kthread_wake(struct console *con) >+{ >+ /* >+ * Guarantee any new records can be seen by tasks preparing to wait >+ * before this context checks if the rcuwait is empty. >+ * >+ * The full memory barrier in rcuwait_wake_up() pairs with the full >+ * memory barrier within set_current_state() of >+ * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait() >+ * adds the waiter but before it has checked the wait condition. >+ * >+ * This pairs with nbcon_kthread_func:A. >+ */ >+ rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */ >+} > > #else > >@@ -84,6 +161,11 @@ void nbcon_free(struct console *con); > #define PRINTK_MESSAGE_MAX 0 > #define PRINTKRB_RECORD_MAX 0 > >+static inline void nbcon_kthread_wake(struct console *con) { } >+static inline void nbcon_kthread_create(struct console *con) { } >+#define printk_threads_enabled (false) >+#define printing_via_unlock (false) >+ > /* > * In !PRINTK builds we still export console_sem > * semaphore and some of console functions (console_unlock()/etc.), so >@@ -98,6 +180,13 @@ static inline void nbcon_seq_force(struct console *con, u64 seq) { } > static inline bool nbcon_alloc(struct console *con) { return false; } > static inline void nbcon_init(struct console *con) { } > static inline void nbcon_free(struct console *con) { } >+static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; } >+static inline void nbcon_atomic_flush_all(void) { } >+static inline bool nbcon_atomic_emit_next_record(struct console *con, bool *handover, >+ int cookie) { return false; } >+ >+static inline bool console_is_usable(struct console *con, short flags, >+ bool use_atomic) { return false; } > > #endif /* CONFIG_PRINTK */ > >@@ -130,6 +219,7 @@ struct printk_message { > }; > > bool other_cpu_in_panic(void); >+bool this_cpu_in_panic(void); > bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > bool is_extended, bool may_supress); > >diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c >index b96077152f49d..1b1b585b1675b 100644 >--- a/kernel/printk/nbcon.c >+++ b/kernel/printk/nbcon.c >@@ -5,7 +5,11 @@ > #include <linux/kernel.h> > #include <linux/console.h> > #include <linux/delay.h> >+#include <linux/kthread.h> > #include <linux/slab.h> >+#include <linux/serial_core.h> >+#include <linux/syscore_ops.h> >+#include "printk_ringbuffer.h" > #include "internal.h" > /* > * Printk console printing implementation for consoles which does not depend >@@ -140,39 +144,6 @@ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_sta > return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom); > } 
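The hunk that follows removes the nbcon-local 32-bit sequence folding in favour of the shared __ulseq_to_u64seq()/__u64seq_to_ulseq() helpers. For reference, the arithmetic is sketched below with a worked example; the function name is hypothetical and the snippet is not part of the patch:

/*
 * Illustration only. On 32-bit builds a console stores just the low 32
 * bits of the 64-bit ringbuffer sequence; the full value is recovered
 * by anchoring to the ringbuffer's next sequence.
 *
 * Example: rb_next_seq = 0x100000010, stored_seq = 0xfffffff0.
 * The 32-bit subtraction (u32)rb_next_seq - stored_seq wraps to 0x20,
 * so the result is 0x100000010 - 0x20 = 0xfffffff0: the console is
 * 0x20 records behind, now expressed as a full 64-bit sequence.
 */
#include <linux/types.h>

static inline u64 demo_ulseq_to_u64seq(u64 rb_next_seq, u32 stored_seq)
{
	return rb_next_seq - (u32)((u32)rb_next_seq - stored_seq);
}

This stays correct as long as the console is fewer than 2^31 records behind the ringbuffer, as the removed comment below also notes.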
> >-#ifdef CONFIG_64BIT >- >-#define __seq_to_nbcon_seq(seq) (seq) >-#define __nbcon_seq_to_seq(seq) (seq) >- >-#else /* CONFIG_64BIT */ >- >-#define __seq_to_nbcon_seq(seq) ((u32)seq) >- >-static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq) >-{ >- u64 seq; >- u64 rb_next_seq; >- >- /* >- * The provided sequence is only the lower 32 bits of the ringbuffer >- * sequence. It needs to be expanded to 64bit. Get the next sequence >- * number from the ringbuffer and fold it. >- * >- * Having a 32bit representation in the console is sufficient. >- * If a console ever gets more than 2^31 records behind >- * the ringbuffer then this is the least of the problems. >- * >- * Also the access to the ring buffer is always safe. >- */ >- rb_next_seq = prb_next_seq(prb); >- seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq); >- >- return seq; >-} >- >-#endif /* CONFIG_64BIT */ >- > /** > * nbcon_seq_read - Read the current console sequence > * @con: Console to read the sequence of >@@ -183,7 +154,7 @@ u64 nbcon_seq_read(struct console *con) > { > unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq)); > >- return __nbcon_seq_to_seq(nbcon_seq); >+ return __ulseq_to_u64seq(prb, nbcon_seq); > } > > /** >@@ -204,7 +175,7 @@ void nbcon_seq_force(struct console *con, u64 seq) > */ > u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); > >- atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq)); >+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); > > /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */ > con->seq = 0; >@@ -223,17 +194,19 @@ void nbcon_seq_force(struct console *con, u64 seq) > */ > static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) > { >- unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq); >+ unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq); > struct console *con = ctxt->console; > > if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq, >- __seq_to_nbcon_seq(new_seq))) { >+ __u64seq_to_ulseq(new_seq))) { > ctxt->seq = new_seq; > } else { > ctxt->seq = nbcon_seq_read(con); > } > } > >+bool printk_threads_enabled __ro_after_init; >+ > /** > * nbcon_context_try_acquire_direct - Try to acquire directly > * @ctxt: The context of the caller >@@ -564,6 +537,7 @@ static struct printk_buffers panic_nbcon_pbufs; > * nbcon_context_try_acquire - Try to acquire nbcon console > * @ctxt: The context of the caller > * >+ * Context: Any context which could not be migrated to another CPU. > * Return: True if the console was acquired. False otherwise. > * > * If the caller allowed an unsafe hostile takeover, on success the >@@ -571,7 +545,6 @@ static struct printk_buffers panic_nbcon_pbufs; > * in an unsafe state. Otherwise, on success the caller may assume > * the console is not in an unsafe state. > */ >-__maybe_unused > static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) > { > unsigned int cpu = smp_processor_id(); >@@ -857,9 +830,42 @@ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) > } > EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); > >+/** >+ * nbcon_reacquire - Reacquire a console after losing ownership >+ * @wctxt: The write context that was handed to the write function >+ * >+ * Since ownership can be lost at any time due to handover or takeover, a >+ * printing context _should_ be prepared to back out immediately and >+ * carefully. 
However, there are many scenarios where the context _must_ >+ * reacquire ownership in order to finalize or revert hardware changes. >+ * >+ * This function allows a context to reacquire ownership using the same >+ * priority as its previous ownership. >+ * >+ * Note that for printing contexts, after a successful reacquire the >+ * context will have no output buffer because that has been lost. This >+ * function cannot be used to resume printing. >+ */ >+void nbcon_reacquire(struct nbcon_write_context *wctxt) >+{ >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); >+ struct console *con = ctxt->console; >+ struct nbcon_state cur; >+ >+ while (!nbcon_context_try_acquire(ctxt)) >+ cpu_relax(); >+ >+ wctxt->outbuf = NULL; >+ wctxt->len = 0; >+ nbcon_state_read(con, &cur); >+ wctxt->unsafe_takeover = cur.unsafe_takeover; >+} >+EXPORT_SYMBOL_GPL(nbcon_reacquire); >+ > /** > * nbcon_emit_next_record - Emit a record in the acquired context > * @wctxt: The write context that will be handed to the write function >+ * @use_atomic: True if the write_atomic callback is to be used > * > * Return: True if this context still owns the console. False if > * ownership was handed over or taken. >@@ -873,8 +879,7 @@ EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); > * When true is returned, @wctxt->ctxt.backlog indicates whether there are > * still records pending in the ringbuffer, > */ >-__maybe_unused >-static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) >+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) > { > struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); > struct console *con = ctxt->console; >@@ -885,7 +890,7 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > unsigned long con_dropped; > struct nbcon_state cur; > unsigned long dropped; >- bool done; >+ bool done = false; > > /* > * The printk buffers are filled within an unsafe section. This >@@ -924,17 +929,26 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > nbcon_state_read(con, &cur); > wctxt->unsafe_takeover = cur.unsafe_takeover; > >- if (con->write_atomic) { >+ if (use_atomic && >+ con->write_atomic) { > done = con->write_atomic(con, wctxt); >- } else { >- nbcon_context_release(ctxt); >- WARN_ON_ONCE(1); >- done = false; >+ >+ } else if (!use_atomic && >+ con->write_thread && >+ con->kthread) { >+ WARN_ON_ONCE(con->kthread != current); >+ done = con->write_thread(con, wctxt); > } > >- /* If not done, the emit was aborted. */ >- if (!done) >+ if (!done) { >+ /* >+ * The emit was aborted, probably due to a loss of ownership. >+ * Ensure ownership was lost or released before reporting the >+ * loss. >+ */ >+ nbcon_context_release(ctxt); > return false; >+ } > > /* > * Since any dropped message was successfully output, reset the >@@ -961,6 +975,511 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) > return nbcon_context_exit_unsafe(ctxt); > } > >+/** >+ * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup >+ * @con: Console to operate on >+ * @ctxt: The acquire context that contains the state >+ * at console_acquire() >+ * >+ * Return: True if the thread should shutdown or if the console is >+ * allowed to print and a record is available. False otherwise. >+ * >+ * After the thread wakes up, it must first check if it should shutdown before >+ * attempting any printing. 
>+ */ >+static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) >+{ >+ bool is_usable; >+ short flags; >+ int cookie; >+ >+ if (kthread_should_stop()) >+ return true; >+ >+ cookie = console_srcu_read_lock(); >+ flags = console_srcu_read_flags(con); >+ is_usable = console_is_usable(con, flags, false); >+ console_srcu_read_unlock(cookie); >+ >+ if (!is_usable) >+ return false; >+ >+ /* Bring the sequence in @ctxt up to date */ >+ ctxt->seq = nbcon_seq_read(con); >+ >+ return prb_read_valid(prb, ctxt->seq, NULL); >+} >+ >+/** >+ * nbcon_kthread_func - The printer thread function >+ * @__console: Console to operate on >+ */ >+static int nbcon_kthread_func(void *__console) >+{ >+ struct console *con = __console; >+ struct nbcon_write_context wctxt = { >+ .ctxt.console = con, >+ .ctxt.prio = NBCON_PRIO_NORMAL, >+ }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ unsigned long flags; >+ short con_flags; >+ bool backlog; >+ int cookie; >+ int ret; >+ >+wait_for_event: >+ /* >+ * Guarantee this task is visible on the rcuwait before >+ * checking the wake condition. >+ * >+ * The full memory barrier within set_current_state() of >+ * ___rcuwait_wait_event() pairs with the full memory >+ * barrier within rcuwait_has_sleeper(). >+ * >+ * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. >+ */ >+ ret = rcuwait_wait_event(&con->rcuwait, >+ nbcon_kthread_should_wakeup(con, ctxt), >+ TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ >+ >+ if (kthread_should_stop()) >+ return 0; >+ >+ /* Wait was interrupted by a spurious signal, go back to sleep. */ >+ if (ret) >+ goto wait_for_event; >+ >+ do { >+ backlog = false; >+ >+ cookie = console_srcu_read_lock(); >+ >+ con_flags = console_srcu_read_flags(con); >+ >+ if (console_is_usable(con, con_flags, false)) { >+ con->driver_enter(con, &flags); >+ >+ /* >+ * Ensure this stays on the CPU to make handover and >+ * takeover possible. >+ */ >+ cant_migrate(); >+ >+ if (nbcon_context_try_acquire(ctxt)) { >+ /* >+ * If the emit fails, this context is no >+ * longer the owner. >+ */ >+ if (nbcon_emit_next_record(&wctxt, false)) { >+ nbcon_context_release(ctxt); >+ backlog = ctxt->backlog; >+ } >+ } >+ >+ con->driver_exit(con, flags); >+ } >+ >+ console_srcu_read_unlock(cookie); >+ >+ } while (backlog); >+ >+ goto wait_for_event; >+} >+ >+/** >+ * nbcon_irq_work - irq work to wake printk thread >+ * @irq_work: The irq work to operate on >+ */ >+static void nbcon_irq_work(struct irq_work *irq_work) >+{ >+ struct console *con = container_of(irq_work, struct console, irq_work); >+ >+ nbcon_kthread_wake(con); >+} >+ >+static inline bool rcuwait_has_sleeper(struct rcuwait *w) >+{ >+ bool has_sleeper; >+ >+ rcu_read_lock(); >+ /* >+ * Guarantee any new records can be seen by tasks preparing to wait >+ * before this context checks if the rcuwait is empty. >+ * >+ * This full memory barrier pairs with the full memory barrier within >+ * set_current_state() of ___rcuwait_wait_event(), which is called >+ * after prepare_to_rcuwait() adds the waiter but before it has >+ * checked the wait condition. >+ * >+ * This pairs with nbcon_kthread_func:A. 
>+ */ >+ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ >+ has_sleeper = !!rcu_dereference(w->task); >+ rcu_read_unlock(); >+ >+ return has_sleeper; >+} >+ >+/** >+ * nbcon_wake_threads - Wake up printing threads using irq_work >+ */ >+void nbcon_wake_threads(void) >+{ >+ struct console *con; >+ int cookie; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ /* >+ * Only schedule irq_work if the printing thread is >+ * actively waiting. If not waiting, the thread will >+ * notice by itself that it has work to do. >+ */ >+ if (con->kthread && rcuwait_has_sleeper(&con->rcuwait)) >+ irq_work_queue(&con->irq_work); >+ } >+ console_srcu_read_unlock(cookie); >+} >+ >+/* Track the nbcon emergency nesting per CPU. */ >+static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); >+static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; >+ >+/** >+ * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer >+ * >+ * Return: Either a pointer to the per CPU emergency nesting counter of >+ * the current CPU or to the init data during early boot. >+ */ >+static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) >+{ >+ /* >+ * The value of __printk_percpu_data_ready gets set in normal >+ * context and before SMP initialization. As a result it could >+ * never change while inside an nbcon emergency section. >+ */ >+ if (!printk_percpu_data_ready()) >+ return &early_nbcon_pcpu_emergency_nesting; >+ >+ return this_cpu_ptr(&nbcon_pcpu_emergency_nesting); >+} >+ >+/** >+ * nbcon_atomic_emit_one - Print one record for an nbcon console using the >+ * write_atomic() callback >+ * @wctxt: An initialized write context struct to use >+ * for this context >+ * >+ * Return: False if the given console could not print a record or there >+ * are no more records to print, otherwise true. >+ * >+ * This is an internal helper to handle the locking of the console before >+ * calling nbcon_emit_next_record(). >+ */ >+static bool nbcon_atomic_emit_one(struct nbcon_write_context *wctxt) >+{ >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); >+ >+ if (!nbcon_context_try_acquire(ctxt)) >+ return false; >+ >+ /* >+ * nbcon_emit_next_record() returns false when the console was >+ * handed over or taken over. In both cases the context is no >+ * longer valid. >+ */ >+ if (!nbcon_emit_next_record(wctxt, true)) >+ return false; >+ >+ nbcon_context_release(ctxt); >+ >+ return ctxt->backlog; >+} >+ >+/** >+ * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon >+ * printing on the current CPU >+ * >+ * Context: Any context which could not be migrated to another CPU. >+ * Return: The nbcon_prio to use for acquiring an nbcon console in this >+ * context for printing. >+ */ >+enum nbcon_prio nbcon_get_default_prio(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ >+ if (this_cpu_in_panic()) >+ return NBCON_PRIO_PANIC; >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ if (*cpu_emergency_nesting) >+ return NBCON_PRIO_EMERGENCY; >+ >+ return NBCON_PRIO_NORMAL; >+} >+ >+/** >+ * nbcon_atomic_emit_next_record - Print one record for an nbcon console >+ * using the write_atomic() callback >+ * @con: The console to print on >+ * @handover: Will be set to true if a printk waiter has taken over the >+ * console_lock, in which case the caller is no longer holding >+ * both the console_lock and the SRCU read lock. Otherwise it >+ * is set to false. >+ * @cookie: The cookie from the SRCU read lock. 
>+ * >+ * Context: Any context which could not be migrated to another CPU. >+ * Return: True if a record could be printed, otherwise false. >+ * >+ * This function is meant to be called by console_flush_all() to print records >+ * on nbcon consoles using the write_atomic() callback. Essentially it is the >+ * nbcon version of console_emit_next_record(). >+ */ >+bool nbcon_atomic_emit_next_record(struct console *con, bool *handover, int cookie) >+{ >+ struct nbcon_write_context wctxt = { }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ unsigned long driver_flags; >+ bool progress = false; >+ unsigned long flags; >+ >+ *handover = false; >+ >+ /* Use the same locking order as console_emit_next_record(). */ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { >+ printk_safe_enter_irqsave(flags); >+ console_lock_spinning_enable(); >+ stop_critical_timings(); >+ } >+ >+ con->driver_enter(con, &driver_flags); >+ cant_migrate(); >+ >+ ctxt->console = con; >+ ctxt->prio = nbcon_get_default_prio(); >+ >+ progress = nbcon_atomic_emit_one(&wctxt); >+ >+ con->driver_exit(con, driver_flags); >+ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { >+ start_critical_timings(); >+ *handover = console_lock_spinning_disable_and_check(cookie); >+ printk_safe_exit_irqrestore(flags); >+ } >+ >+ return progress; >+} >+ >+/** >+ * __nbcon_atomic_flush_all - Flush all nbcon consoles using their >+ * write_atomic() callback >+ * @stop_seq: Flush up until this record >+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers >+ */ >+static void __nbcon_atomic_flush_all(u64 stop_seq, bool allow_unsafe_takeover) >+{ >+ struct nbcon_write_context wctxt = { }; >+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); >+ struct console *con; >+ bool any_progress; >+ int cookie; >+ >+ do { >+ any_progress = false; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ unsigned long irq_flags; >+ >+ if (!(flags & CON_NBCON)) >+ continue; >+ >+ if (!console_is_usable(con, flags, true)) >+ continue; >+ >+ if (nbcon_seq_read(con) >= stop_seq) >+ continue; >+ >+ memset(ctxt, 0, sizeof(*ctxt)); >+ ctxt->console = con; >+ ctxt->spinwait_max_us = 2000; >+ ctxt->allow_unsafe_takeover = allow_unsafe_takeover; >+ >+ /* >+ * Atomic flushing does not use console driver >+ * synchronization (i.e. it does not hold the port >+ * lock for uart consoles). Therefore IRQs must be >+ * disabled to avoid being interrupted and then >+ * calling into a driver that will deadlock trying >+ * acquire console ownership. >+ * >+ * This also disables migration in order to get the >+ * current CPU priority. >+ */ >+ local_irq_save(irq_flags); >+ >+ ctxt->prio = nbcon_get_default_prio(); >+ >+ any_progress |= nbcon_atomic_emit_one(&wctxt); >+ >+ local_irq_restore(irq_flags); >+ } >+ console_srcu_read_unlock(cookie); >+ } while (any_progress); >+} >+ >+/** >+ * nbcon_atomic_flush_all - Flush all nbcon consoles using their >+ * write_atomic() callback >+ * >+ * Flush the backlog up through the currently newest record. Any new >+ * records added while flushing will not be flushed. This is to avoid >+ * one CPU printing unbounded because other CPUs continue to add records. 
>+ */ >+void nbcon_atomic_flush_all(void) >+{ >+ __nbcon_atomic_flush_all(prb_next_reserve_seq(prb), false); >+} >+ >+/** >+ * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their >+ * write_atomic() callback and allowing unsafe hostile takeovers >+ * >+ * Flush the backlog up through the currently newest record. Unsafe hostile >+ * takeovers will be performed, if necessary. >+ */ >+void nbcon_atomic_flush_unsafe(void) >+{ >+ __nbcon_atomic_flush_all(prb_next_reserve_seq(prb), true); >+} >+ >+/** >+ * nbcon_cpu_emergency_enter - Enter an emergency section where printk() >+ * messages for that CPU are only stored >+ * >+ * Upon exiting the emergency section, all stored messages are flushed. >+ * >+ * Context: Any context. Disables preemption. >+ * >+ * When within an emergency section, no printing occurs on that CPU. This >+ * is to allow all emergency messages to be dumped into the ringbuffer before >+ * flushing the ringbuffer. The actual printing occurs when exiting the >+ * outermost emergency section. >+ */ >+void nbcon_cpu_emergency_enter(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ >+ preempt_disable(); >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ (*cpu_emergency_nesting)++; >+} >+ >+/** >+ * nbcon_cpu_emergency_exit - Exit an emergency section and flush the >+ * stored messages >+ * >+ * Flushing only occurs when exiting all nesting for the CPU. >+ * >+ * Context: Any context. Enables preemption. >+ */ >+void nbcon_cpu_emergency_exit(void) >+{ >+ unsigned int *cpu_emergency_nesting; >+ bool do_trigger_flush = false; >+ >+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); >+ >+ WARN_ON_ONCE(*cpu_emergency_nesting == 0); >+ >+ if (*cpu_emergency_nesting == 1) >+ do_trigger_flush = true; >+ >+ /* Undo the nesting count of nbcon_cpu_emergency_enter(). */ >+ (*cpu_emergency_nesting)--; >+ >+ preempt_enable(); >+ >+ if (do_trigger_flush) >+ printk_trigger_flush(); >+} >+ >+/** >+ * nbcon_kthread_stop - Stop a printer thread >+ * @con: Console to operate on >+ */ >+static void nbcon_kthread_stop(struct console *con) >+{ >+ lockdep_assert_console_list_lock_held(); >+ >+ if (!con->kthread) >+ return; >+ >+ kthread_stop(con->kthread); >+ con->kthread = NULL; >+} >+ >+/** >+ * nbcon_kthread_create - Create a printer thread >+ * @con: Console to operate on >+ * >+ * If it fails, let the console proceed. The atomic part might >+ * be usable and useful. >+ */ >+void nbcon_kthread_create(struct console *con) >+{ >+ struct task_struct *kt; >+ >+ lockdep_assert_console_list_lock_held(); >+ >+ if (!(con->flags & CON_NBCON) || !con->write_thread) >+ return; >+ >+ if (!printk_threads_enabled || con->kthread) >+ return; >+ >+ /* >+ * Printer threads cannot be started as long as any boot console is >+ * registered because there is no way to synchronize the hardware >+ * registers between boot console code and regular console code. >+ */ >+ if (have_boot_console) >+ return; >+ >+ kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); >+ if (IS_ERR(kt)) { >+ con_printk(KERN_ERR, con, "failed to start printing thread\n"); >+ return; >+ } >+ >+ con->kthread = kt; >+ >+ /* >+ * It is important that console printing threads are scheduled >+ * shortly after a printk call and with generous runtime budgets. 
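
nbcon_cpu_emergency_enter()/nbcon_cpu_emergency_exit() earlier in this hunk keep a per-CPU nesting count and only trigger the flush when the outermost emergency section exits. A minimal single-threaded sketch of that rule (plain C, illustrative names; the real code uses a per-CPU counter, disables preemption, and calls printk_trigger_flush()):

#include <stdio.h>

/* Toy model of the emergency nesting rule: messages are only stored while
 * the counter is non-zero, and the flush is triggered once, when the
 * outermost emergency section exits. */
static unsigned int emergency_nesting;

static void emergency_enter(void)
{
        emergency_nesting++;
}

static void emergency_exit(void)
{
        int do_flush = (emergency_nesting == 1);

        emergency_nesting--;
        if (do_flush)
                printf("outermost exit: flush stored messages now\n");
        else
                printf("still nested (%u levels left), no flush\n",
                       emergency_nesting);
}

int main(void)
{
        emergency_enter();              /* e.g. a WARN handler */
        emergency_enter();              /* nested dump inside it */
        emergency_exit();               /* no flush yet */
        emergency_exit();               /* flush happens here */
        return 0;
}
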
>+ */ >+ sched_set_normal(con->kthread, -20); >+} >+ >+static int __init printk_setup_threads(void) >+{ >+ struct console *con; >+ >+ console_list_lock(); >+ printk_threads_enabled = true; >+ for_each_console(con) >+ nbcon_kthread_create(con); >+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && printing_via_unlock) >+ nbcon_legacy_kthread_create(); >+ console_list_unlock(); >+ return 0; >+} >+early_initcall(printk_setup_threads); >+ > /** > * nbcon_alloc - Allocate buffers needed by the nbcon console > * @con: Console to allocate buffers for >@@ -1007,8 +1526,11 @@ void nbcon_init(struct console *con) > /* nbcon_alloc() must have been called and successful! */ > BUG_ON(!con->pbufs); > >+ rcuwait_init(&con->rcuwait); >+ init_irq_work(&con->irq_work, nbcon_irq_work); > nbcon_seq_force(con, con->seq); > nbcon_state_set(con, &state); >+ nbcon_kthread_create(con); > } > > /** >@@ -1019,6 +1541,7 @@ void nbcon_free(struct console *con) > { > struct nbcon_state state = { }; > >+ nbcon_kthread_stop(con); > nbcon_state_set(con, &state); > > /* Boot consoles share global printk buffers. */ >@@ -1027,3 +1550,115 @@ void nbcon_free(struct console *con) > > con->pbufs = NULL; > } >+ >+static inline bool uart_is_nbcon(struct uart_port *up) >+{ >+ int cookie; >+ bool ret; >+ >+ if (!uart_console(up)) >+ return false; >+ >+ cookie = console_srcu_read_lock(); >+ ret = (console_srcu_read_flags(up->cons) & CON_NBCON); >+ console_srcu_read_unlock(cookie); >+ return ret; >+} >+ >+/** >+ * nbcon_acquire - The second half of the port locking wrapper >+ * @up: The uart port whose @lock was locked >+ * >+ * The uart_port_lock() wrappers will first lock the spin_lock @up->lock. >+ * Then this function is called to implement nbcon-specific processing. >+ * >+ * If @up is an nbcon console, this console will be acquired and marked as >+ * unsafe. Otherwise this function does nothing. >+ * >+ * nbcon consoles acquired via the port lock wrapper always use priority >+ * NBCON_PRIO_NORMAL. >+ */ >+void nbcon_acquire(struct uart_port *up) >+{ >+ struct console *con = up->cons; >+ struct nbcon_context ctxt; >+ >+ if (!uart_is_nbcon(up)) >+ return; >+ >+ WARN_ON_ONCE(con->locked_port); >+ >+ do { >+ do { >+ memset(&ctxt, 0, sizeof(ctxt)); >+ ctxt.console = con; >+ ctxt.prio = NBCON_PRIO_NORMAL; >+ } while (!nbcon_context_try_acquire(&ctxt)); >+ >+ } while (!nbcon_context_enter_unsafe(&ctxt)); >+ >+ con->locked_port = true; >+} >+EXPORT_SYMBOL_GPL(nbcon_acquire); >+ >+/** >+ * nbcon_release - The first half of the port unlocking wrapper >+ * @up: The uart port whose @lock is about to be unlocked >+ * >+ * The uart_port_unlock() wrappers will first call this function to implement >+ * nbcon-specific processing. Then afterwards the uart_port_unlock() wrappers >+ * will unlock the spin_lock @up->lock. >+ * >+ * If @up is an nbcon console, the console will be marked as safe and >+ * released. Otherwise this function does nothing. >+ * >+ * nbcon consoles acquired via the port lock wrapper always use priority >+ * NBCON_PRIO_NORMAL. >+ */ >+void nbcon_release(struct uart_port *up) >+{ >+ struct console *con = up->cons; >+ struct nbcon_context ctxt = { >+ .console = con, >+ .prio = NBCON_PRIO_NORMAL, >+ }; >+ >+ if (!con->locked_port) >+ return; >+ >+ if (nbcon_context_exit_unsafe(&ctxt)) >+ nbcon_context_release(&ctxt); >+ >+ con->locked_port = false; >+} >+EXPORT_SYMBOL_GPL(nbcon_release); >+ >+/** >+ * printk_kthread_shutdown - shutdown all threaded printers >+ * >+ * On system shutdown all threaded printers are stopped. 
This allows printk >+ * to transition back to atomic printing, thus providing a robust mechanism >+ * for the final shutdown/reboot messages to be output. >+ */ >+static void printk_kthread_shutdown(void) >+{ >+ struct console *con; >+ >+ console_list_lock(); >+ for_each_console(con) { >+ if (con->flags & CON_NBCON) >+ nbcon_kthread_stop(con); >+ } >+ console_list_unlock(); >+} >+ >+static struct syscore_ops printk_syscore_ops = { >+ .shutdown = printk_kthread_shutdown, >+}; >+ >+static int __init printk_init_ops(void) >+{ >+ register_syscore_ops(&printk_syscore_ops); >+ return 0; >+} >+device_initcall(printk_init_ops); >diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c >index f2444b581e16c..90e951bd02524 100644 >--- a/kernel/printk/printk.c >+++ b/kernel/printk/printk.c >@@ -282,6 +282,7 @@ EXPORT_SYMBOL(console_list_unlock); > * Return: A cookie to pass to console_srcu_read_unlock(). > */ > int console_srcu_read_lock(void) >+ __acquires(&console_srcu) > { > return srcu_read_lock_nmisafe(&console_srcu); > } >@@ -295,6 +296,7 @@ EXPORT_SYMBOL(console_srcu_read_lock); > * Counterpart to console_srcu_read_lock() > */ > void console_srcu_read_unlock(int cookie) >+ __releases(&console_srcu) > { > srcu_read_unlock_nmisafe(&console_srcu, cookie); > } >@@ -347,6 +349,29 @@ static bool panic_in_progress(void) > return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); > } > >+/* Return true if a panic is in progress on the current CPU. */ >+bool this_cpu_in_panic(void) >+{ >+ /* >+ * We can use raw_smp_processor_id() here because it is impossible for >+ * the task to be migrated to the panic_cpu, or away from it. If >+ * panic_cpu has already been set, and we're not currently executing on >+ * that CPU, then we never will be. >+ */ >+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); >+} >+ >+/* >+ * Return true if a panic is in progress on a remote CPU. >+ * >+ * On true, the local CPU should immediately release any printing resources >+ * that may be needed by the panic CPU. >+ */ >+bool other_cpu_in_panic(void) >+{ >+ return (panic_in_progress() && !this_cpu_in_panic()); >+} >+ > /* > * This is used for debugging the mess that is the VT code by > * keeping track if we have the console semaphore held. It's >@@ -438,14 +463,33 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; > /* syslog_lock protects syslog_* variables and write access to clear_seq. */ > static DEFINE_MUTEX(syslog_lock); > >-#ifdef CONFIG_PRINTK > /* >- * During panic, heavy printk by other CPUs can delay the >- * panic and risk deadlock on console resources. >+ * Specifies if a legacy console is registered. If legacy consoles are >+ * present, it is necessary to perform the console_lock/console_unlock dance >+ * whenever console flushing should occur. > */ >-static int __read_mostly suppress_panic_printk; >+bool have_legacy_console; > >+/* >+ * Specifies if an nbcon console is registered. If nbcon consoles are present, >+ * synchronous printing of legacy consoles will not occur during panic until >+ * the backtrace has been stored to the ringbuffer. >+ */ >+bool have_nbcon_console; >+ >+/* >+ * Specifies if a boot console is registered. If boot consoles are present, >+ * nbcon consoles cannot print simultaneously and must be synchronized by >+ * the console lock. This is because boot consoles and nbcon consoles may >+ * have mapped the same hardware. 
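
this_cpu_in_panic()/other_cpu_in_panic() above reduce to a single comparison against the recorded panic CPU: the CPU that set it keeps printing, every other CPU backs off. A small standalone model of the two checks (plain C; panic_cpu is a plain int here rather than an atomic_t, and the CPU id is passed in instead of read from the hardware):

#include <stdio.h>

#define PANIC_CPU_INVALID       (-1)

/* Toy model of the two panic-CPU checks: once panic_cpu is set, the CPU
 * that owns the panic keeps printing, every other CPU releases printing
 * resources. */
static int panic_cpu = PANIC_CPU_INVALID;

static int this_cpu_in_panic(int this_cpu)
{
        return panic_cpu == this_cpu;
}

static int other_cpu_in_panic(int this_cpu)
{
        return panic_cpu != PANIC_CPU_INVALID && !this_cpu_in_panic(this_cpu);
}

int main(void)
{
        panic_cpu = 2;                  /* CPU 2 entered panic() */

        printf("cpu2: mine=%d other=%d\n",
               this_cpu_in_panic(2), other_cpu_in_panic(2));
        printf("cpu0: mine=%d other=%d\n",
               this_cpu_in_panic(0), other_cpu_in_panic(0));
        return 0;
}
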
>+ */ >+bool have_boot_console; >+ >+#ifdef CONFIG_PRINTK > DECLARE_WAIT_QUEUE_HEAD(log_wait); >+ >+static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); >+ > /* All 3 protected by @syslog_lock. */ > /* the next printk record to read by syslog(READ) or /proc/kmsg */ > static u64 syslog_seq; >@@ -1844,12 +1888,25 @@ static bool console_waiter; > * there may be a waiter spinning (like a spinlock). Also it must be > * ready to hand over the lock at the end of the section. > */ >-static void console_lock_spinning_enable(void) >+void console_lock_spinning_enable(void) > { >+ /* >+ * Do not use spinning in panic(). The panic CPU wants to keep the lock. >+ * Non-panic CPUs abandon the flush anyway. >+ * >+ * Just keep the lockdep annotation. The panic-CPU should avoid >+ * taking console_owner_lock because it might cause a deadlock. >+ * This looks like the easiest way how to prevent false lockdep >+ * reports without handling races a lockless way. >+ */ >+ if (panic_in_progress()) >+ goto lockdep; >+ > raw_spin_lock(&console_owner_lock); > console_owner = current; > raw_spin_unlock(&console_owner_lock); > >+lockdep: > /* The waiter may spin on us after setting console_owner */ > spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); > } >@@ -1870,10 +1927,26 @@ static void console_lock_spinning_enable(void) > * > * Return: 1 if the lock rights were passed, 0 otherwise. > */ >-static int console_lock_spinning_disable_and_check(int cookie) >+int console_lock_spinning_disable_and_check(int cookie) > { > int waiter; > >+ /* >+ * Ignore spinning waiters during panic() because they might get stopped >+ * or blocked at any time, >+ * >+ * It is safe because nobody is allowed to start spinning during panic >+ * in the first place. If there has been a waiter then non panic CPUs >+ * might stay spinning. They would get stopped anyway. The panic context >+ * will never start spinning and an interrupted spin on panic CPU will >+ * never continue. >+ */ >+ if (panic_in_progress()) { >+ /* Keep lockdep happy. */ >+ spin_release(&console_owner_dep_map, _THIS_IP_); >+ return 0; >+ } >+ > raw_spin_lock(&console_owner_lock); > waiter = READ_ONCE(console_waiter); > console_owner = NULL; >@@ -2259,55 +2332,123 @@ int vprintk_store(int facility, int level, > return ret; > } > >+static bool legacy_allow_panic_sync; >+ >+/* >+ * This acts as a one-way switch to allow legacy consoles to print from >+ * the printk() caller context on a panic CPU. >+ */ >+void printk_legacy_allow_panic_sync(void) >+{ >+ legacy_allow_panic_sync = true; >+} >+ > asmlinkage int vprintk_emit(int facility, int level, > const struct dev_printk_info *dev_info, > const char *fmt, va_list args) > { >+ bool do_trylock_unlock = printing_via_unlock && >+ !IS_ENABLED(CONFIG_PREEMPT_RT); > int printed_len; >- bool in_sched = false; > > /* Suppress unimportant messages after panic happens */ > if (unlikely(suppress_printk)) > return 0; > >- if (unlikely(suppress_panic_printk) && >- atomic_read(&panic_cpu) != raw_smp_processor_id()) >+ /* >+ * The messages on the panic CPU are the most important. If >+ * non-panic CPUs are generating any messages, they will be >+ * silently dropped. >+ */ >+ if (other_cpu_in_panic()) > return 0; > > if (level == LOGLEVEL_SCHED) { > level = LOGLEVEL_DEFAULT; >- in_sched = true; >+ /* If called from the scheduler, we can not call up(). */ >+ do_trylock_unlock = false; > } > > printk_delay(level); > > printed_len = vprintk_store(facility, level, dev_info, fmt, args); > >- /* If called from the scheduler, we can not call up(). 
*/ >- if (!in_sched) { >+ if (!have_boot_console && have_nbcon_console) { >+ bool is_panic_context = this_cpu_in_panic(); >+ >+ /* >+ * In panic, the legacy consoles are not allowed to print from >+ * the printk calling context unless explicitly allowed. This >+ * gives the safe nbcon consoles a chance to print out all the >+ * panic messages first. This restriction only applies if >+ * there are nbcon consoles registered. >+ */ >+ if (is_panic_context) >+ do_trylock_unlock &= legacy_allow_panic_sync; >+ >+ /* >+ * There are situations where nbcon atomic printing should >+ * happen in the printk() caller context: >+ * >+ * - When this CPU is in panic. >+ * >+ * - When booting, before the printing threads have been >+ * started. >+ * >+ * - During shutdown, since the printing threads may not get >+ * a chance to print the final messages. >+ * >+ * Note that if boot consoles are registered, the >+ * console_lock/console_unlock dance must be relied upon >+ * instead because nbcon consoles cannot print simultaneously >+ * with boot consoles. >+ */ >+ if (is_panic_context || >+ !printk_threads_enabled || >+ (system_state > SYSTEM_RUNNING)) { >+ nbcon_atomic_flush_all(); >+ } >+ } >+ >+ nbcon_wake_threads(); >+ >+ if (do_trylock_unlock) { > /* > * The caller may be holding system-critical or > * timing-sensitive locks. Disable preemption during > * printing of all remaining records to all consoles so that > * this context can return as soon as possible. Hopefully > * another printk() caller will take over the printing. >+ * >+ * Also, nbcon_get_default_prio() requires migration disabled. > */ > preempt_disable(); >+ > /* >- * Try to acquire and then immediately release the console >- * semaphore. The release will print out buffers. With the >- * spinning variant, this context tries to take over the >- * printing from another printing context. >+ * Do not emit for EMERGENCY priority. The console will be >+ * explicitly flushed when exiting the emergency section. > */ >- if (console_trylock_spinning()) >- console_unlock(); >+ if (nbcon_get_default_prio() == NBCON_PRIO_EMERGENCY) { >+ do_trylock_unlock = false; >+ } else { >+ /* >+ * Try to acquire and then immediately release the >+ * console semaphore. The release will print out >+ * buffers. With the spinning variant, this context >+ * tries to take over the printing from another >+ * printing context. 
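
The checks added to vprintk_emit() above decide two things in the printk() caller context: whether to flush the nbcon consoles atomically right away, and whether legacy consoles may still be driven via the trylock/unlock path. A rough standalone model of the main gates (plain C; it deliberately ignores the LOGLEVEL_SCHED and emergency-priority cases, and struct printk_state plus decide() are purely illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Toy decision table for the printk() caller context, following the checks
 * in the patched vprintk_emit(): field names mirror the patch, but the
 * struct and function here are illustrative only. */
struct printk_state {
        bool preempt_rt;
        bool printing_via_unlock;       /* legacy consoles present */
        bool have_nbcon_console;
        bool have_boot_console;
        bool threads_enabled;
        bool in_panic;                  /* this CPU is the panic CPU */
        bool legacy_allow_panic_sync;
        bool shutting_down;             /* system_state > SYSTEM_RUNNING */
};

static void decide(const struct printk_state *s)
{
        bool trylock_unlock = s->printing_via_unlock && !s->preempt_rt;
        bool atomic_flush = false;

        if (s->have_nbcon_console && !s->have_boot_console) {
                if (s->in_panic)
                        trylock_unlock &= s->legacy_allow_panic_sync;
                if (s->in_panic || !s->threads_enabled || s->shutting_down)
                        atomic_flush = true;
        }

        printf("nbcon atomic flush: %s, legacy trylock/unlock: %s\n",
               atomic_flush ? "yes" : "no", trylock_unlock ? "yes" : "no");
}

int main(void)
{
        struct printk_state early_boot = {
                .printing_via_unlock = true,
                .have_nbcon_console = true,
        };
        struct printk_state rt_panic = {
                .preempt_rt = true,
                .printing_via_unlock = true,
                .have_nbcon_console = true,
                .threads_enabled = true,
                .in_panic = true,
        };

        decide(&early_boot);    /* flush: yes, trylock: yes */
        decide(&rt_panic);      /* flush: yes, trylock: no  */
        return 0;
}
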
>+ */ >+ if (console_trylock_spinning()) >+ console_unlock(); >+ } >+ > preempt_enable(); > } > >- if (in_sched) >- defer_console_output(); >- else >+ if (do_trylock_unlock) > wake_up_klogd(); >+ else >+ defer_console_output(); > > return printed_len; > } >@@ -2335,6 +2476,14 @@ EXPORT_SYMBOL(_printk); > static bool pr_flush(int timeout_ms, bool reset_on_progress); > static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); > >+static struct task_struct *nbcon_legacy_kthread; >+ >+static inline void wake_up_legacy_kthread(void) >+{ >+ if (nbcon_legacy_kthread) >+ wake_up_interruptible(&legacy_wait); >+} >+ > #else /* CONFIG_PRINTK */ > > #define printk_time false >@@ -2348,6 +2497,8 @@ static u64 syslog_seq; > static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } > static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } > >+static inline void nbcon_legacy_kthread_create(void) { } >+static inline void wake_up_legacy_kthread(void) { } > #endif /* CONFIG_PRINTK */ > > #ifdef CONFIG_EARLY_PRINTK >@@ -2563,6 +2714,8 @@ void suspend_console(void) > void resume_console(void) > { > struct console *con; >+ short flags; >+ int cookie; > > if (!console_suspend_enabled) > return; >@@ -2579,6 +2732,20 @@ void resume_console(void) > */ > synchronize_srcu(&console_srcu); > >+ /* >+ * Since this runs in task context, wake the threaded printers >+ * directly rather than scheduling irq_work to do it. >+ */ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ flags = console_srcu_read_flags(con); >+ if (flags & CON_NBCON) >+ nbcon_kthread_wake(con); >+ } >+ console_srcu_read_unlock(cookie); >+ >+ wake_up_legacy_kthread(); >+ > pr_flush(1000, true); > } > >@@ -2593,7 +2760,8 @@ void resume_console(void) > */ > static int console_cpu_notify(unsigned int cpu) > { >- if (!cpuhp_tasks_frozen) { >+ if (!cpuhp_tasks_frozen && printing_via_unlock && >+ !IS_ENABLED(CONFIG_PREEMPT_RT)) { > /* If trylock fails, someone else is doing the printing */ > if (console_trylock()) > console_unlock(); >@@ -2601,26 +2769,6 @@ static int console_cpu_notify(unsigned int cpu) > return 0; > } > >-/* >- * Return true if a panic is in progress on a remote CPU. >- * >- * On true, the local CPU should immediately release any printing resources >- * that may be needed by the panic CPU. >- */ >-bool other_cpu_in_panic(void) >-{ >- if (!panic_in_progress()) >- return false; >- >- /* >- * We can use raw_smp_processor_id() here because it is impossible for >- * the task to be migrated to the panic_cpu, or away from it. If >- * panic_cpu has already been set, and we're not currently executing on >- * that CPU, then we never will be. >- */ >- return atomic_read(&panic_cpu) != raw_smp_processor_id(); >-} >- > /** > * console_lock - block the console subsystem from printing > * >@@ -2670,42 +2818,14 @@ int is_console_locked(void) > } > EXPORT_SYMBOL(is_console_locked); > >-/* >- * Check if the given console is currently capable and allowed to print >- * records. >- * >- * Requires the console_srcu_read_lock. >- */ >-static inline bool console_is_usable(struct console *con) >-{ >- short flags = console_srcu_read_flags(con); >- >- if (!(flags & CON_ENABLED)) >- return false; >- >- if ((flags & CON_SUSPENDED)) >- return false; >- >- if (!con->write) >- return false; >- >- /* >- * Console drivers may assume that per-cpu resources have been >- * allocated. 
So unless they're explicitly marked as being able to >- * cope (CON_ANYTIME) don't call them until this CPU is officially up. >- */ >- if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) >- return false; >- >- return true; >-} >- > static void __console_unlock(void) > { > console_locked = 0; > up_console_sem(); > } > >+static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); >+ > #ifdef CONFIG_PRINTK > > /* >@@ -2776,8 +2896,6 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) > bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > bool is_extended, bool may_suppress) > { >- static int panic_console_dropped; >- > struct printk_buffers *pbufs = pmsg->pbufs; > const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); > const size_t outbuf_sz = sizeof(pbufs->outbuf); >@@ -2805,17 +2923,6 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, > pmsg->seq = r.info->seq; > pmsg->dropped = r.info->seq - seq; > >- /* >- * Check for dropped messages in panic here so that printk >- * suppression can occur as early as possible if necessary. >- */ >- if (pmsg->dropped && >- panic_in_progress() && >- panic_console_dropped++ > 10) { >- suppress_panic_printk = 1; >- pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); >- } >- > /* Skip record that has level above the console loglevel. */ > if (may_suppress && suppress_message_printing(r.info->level)) > goto out; >@@ -2881,31 +2988,45 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co > con->dropped = 0; > } > >- /* >- * While actively printing out messages, if another printk() >- * were to occur on another CPU, it may wait for this one to >- * finish. This task can not be preempted if there is a >- * waiter waiting to take over. >- * >- * Interrupts are disabled because the hand over to a waiter >- * must not be interrupted until the hand over is completed >- * (@console_waiter is cleared). >- */ >- printk_safe_enter_irqsave(flags); >- console_lock_spinning_enable(); >- >- /* Do not trace print latency. */ >- stop_critical_timings(); >- > /* Write everything out to the hardware. */ >- con->write(con, outbuf, pmsg.outbuf_len); > >- start_critical_timings(); >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { >+ /* >+ * On PREEMPT_RT this function is either in a thread or >+ * panic context. So there is no need for concern about >+ * printk reentrance, handovers, or lockdep complaints. >+ */ > >- con->seq = pmsg.seq + 1; >+ con->write(con, outbuf, pmsg.outbuf_len); >+ con->seq = pmsg.seq + 1; >+ } else { >+ /* >+ * While actively printing out messages, if another printk() >+ * were to occur on another CPU, it may wait for this one to >+ * finish. This task can not be preempted if there is a >+ * waiter waiting to take over. >+ * >+ * Interrupts are disabled because the hand over to a waiter >+ * must not be interrupted until the hand over is completed >+ * (@console_waiter is cleared). >+ */ >+ printk_safe_enter_irqsave(flags); >+ console_lock_spinning_enable(); > >- *handover = console_lock_spinning_disable_and_check(cookie); >- printk_safe_exit_irqrestore(flags); >+ /* Do not trace print latency. 
*/ >+ stop_critical_timings(); >+ >+ lock_map_acquire_try(&printk_legacy_map); >+ con->write(con, outbuf, pmsg.outbuf_len); >+ lock_map_release(&printk_legacy_map); >+ >+ start_critical_timings(); >+ >+ con->seq = pmsg.seq + 1; >+ >+ *handover = console_lock_spinning_disable_and_check(cookie); >+ printk_safe_exit_irqrestore(flags); >+ } > skip: > return true; > } >@@ -2958,13 +3079,33 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > > cookie = console_srcu_read_lock(); > for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ u64 printk_seq; > bool progress; > >- if (!console_is_usable(con)) >+ /* >+ * console_flush_all() is only for legacy consoles, >+ * unless the nbcon console has no kthread printer. >+ */ >+ if ((flags & CON_NBCON) && con->kthread) >+ continue; >+ >+ if (!console_is_usable(con, flags, true)) > continue; > any_usable = true; > >- progress = console_emit_next_record(con, handover, cookie); >+ if (flags & CON_NBCON) { >+ >+ lock_map_acquire_try(&printk_legacy_map); >+ progress = nbcon_atomic_emit_next_record(con, handover, cookie); >+ lock_map_release(&printk_legacy_map); >+ >+ printk_seq = nbcon_seq_read(con); >+ } else { >+ progress = console_emit_next_record(con, handover, cookie); >+ >+ printk_seq = con->seq; >+ } > > /* > * If a handover has occurred, the SRCU read lock >@@ -2974,8 +3115,8 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > return false; > > /* Track the next of the highest seq flushed. */ >- if (con->seq > *next_seq) >- *next_seq = con->seq; >+ if (printk_seq > *next_seq) >+ *next_seq = printk_seq; > > if (!progress) > continue; >@@ -2998,19 +3139,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove > return false; > } > >-/** >- * console_unlock - unblock the console subsystem from printing >- * >- * Releases the console_lock which the caller holds to block printing of >- * the console subsystem. >- * >- * While the console_lock was held, console output may have been buffered >- * by printk(). If this is the case, console_unlock(); emits >- * the output prior to releasing the lock. >- * >- * console_unlock(); may be called from any context. >- */ >-void console_unlock(void) >+static void console_flush_and_unlock(void) > { > bool do_cond_resched; > bool handover; >@@ -3054,6 +3183,32 @@ void console_unlock(void) > */ > } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); > } >+ >+/** >+ * console_unlock - unblock the console subsystem from printing >+ * >+ * Releases the console_lock which the caller holds to block printing of >+ * the console subsystem. >+ * >+ * While the console_lock was held, console output may have been buffered >+ * by printk(). If this is the case, console_unlock(); emits >+ * the output prior to releasing the lock. >+ * >+ * console_unlock(); may be called from any context. >+ */ >+void console_unlock(void) >+{ >+ /* >+ * PREEMPT_RT relies on kthread and atomic consoles for printing. >+ * It never attempts to print from console_unlock(). 
>+ */ >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { >+ __console_unlock(); >+ return; >+ } >+ >+ console_flush_and_unlock(); >+} > EXPORT_SYMBOL(console_unlock); > > /** >@@ -3187,7 +3342,10 @@ void console_flush_on_panic(enum con_flush_mode mode) > console_srcu_read_unlock(cookie); > } > >- console_flush_all(false, &next_seq, &handover); >+ nbcon_atomic_flush_all(); >+ >+ if (printing_via_unlock) >+ console_flush_all(false, &next_seq, &handover); > } > > /* >@@ -3244,13 +3402,122 @@ EXPORT_SYMBOL(console_stop); > > void console_start(struct console *console) > { >+ short flags; >+ > console_list_lock(); > console_srcu_write_flags(console, console->flags | CON_ENABLED); >+ flags = console->flags; > console_list_unlock(); >+ >+ /* >+ * Ensure that all SRCU list walks have completed. The related >+ * printing context must be able to see it is enabled so that >+ * it is guaranteed to wake up and resume printing. >+ */ >+ synchronize_srcu(&console_srcu); >+ >+ if (flags & CON_NBCON) >+ nbcon_kthread_wake(console); >+ else >+ wake_up_legacy_kthread(); >+ > __pr_flush(console, 1000, true); > } > EXPORT_SYMBOL(console_start); > >+#ifdef CONFIG_PRINTK >+static bool printer_should_wake(void) >+{ >+ bool available = false; >+ struct console *con; >+ int cookie; >+ >+ if (kthread_should_stop()) >+ return true; >+ >+ cookie = console_srcu_read_lock(); >+ for_each_console_srcu(con) { >+ short flags = console_srcu_read_flags(con); >+ u64 printk_seq; >+ >+ /* >+ * The legacy printer thread is only for legacy consoles, >+ * unless the nbcon console has no kthread printer. >+ */ >+ if ((flags & CON_NBCON) && con->kthread) >+ continue; >+ >+ if (!console_is_usable(con, flags, true)) >+ continue; >+ >+ if (flags & CON_NBCON) { >+ printk_seq = nbcon_seq_read(con); >+ } else { >+ /* >+ * It is safe to read @seq because only this >+ * thread context updates @seq. >+ */ >+ printk_seq = con->seq; >+ } >+ >+ if (prb_read_valid(prb, printk_seq, NULL)) { >+ available = true; >+ break; >+ } >+ } >+ console_srcu_read_unlock(cookie); >+ >+ return available; >+} >+ >+static int nbcon_legacy_kthread_func(void *unused) >+{ >+ int error; >+ >+ for (;;) { >+ error = wait_event_interruptible(legacy_wait, printer_should_wake()); >+ >+ if (kthread_should_stop()) >+ break; >+ >+ if (error) >+ continue; >+ >+ console_lock(); >+ console_flush_and_unlock(); >+ } >+ >+ return 0; >+} >+ >+void nbcon_legacy_kthread_create(void) >+{ >+ struct task_struct *kt; >+ >+ lockdep_assert_held(&console_mutex); >+ >+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >+ return; >+ >+ if (!printk_threads_enabled || nbcon_legacy_kthread) >+ return; >+ >+ kt = kthread_run(nbcon_legacy_kthread_func, NULL, "pr/legacy"); >+ if (IS_ERR(kt)) { >+ pr_err("unable to start legacy printing thread\n"); >+ return; >+ } >+ >+ nbcon_legacy_kthread = kt; >+ >+ /* >+ * It is important that console printing threads are scheduled >+ * shortly after a printk call and with generous runtime budgets. 
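
On PREEMPT_RT the legacy consoles are driven by the dedicated thread created above: it sleeps until printer_should_wake() reports work (or a stop request), then takes the console lock and flushes. A userspace sketch of that wait-then-flush loop, using a pthread condition variable as a stand-in for wait_event_interruptible()/kthread_should_stop() (illustrative only, not the kernel's primitives):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy version of the legacy printer thread: the worker sleeps until there
 * is something to print or it is told to stop, then "flushes" everything. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static int pending;             /* records waiting to be printed */
static bool stop;

static void *legacy_kthread(void *unused)
{
        (void)unused;
        pthread_mutex_lock(&lock);
        for (;;) {
                while (!pending && !stop)       /* printer_should_wake() */
                        pthread_cond_wait(&wake, &lock);
                if (pending) {
                        /* console_lock(); console_flush_and_unlock(); */
                        printf("flushing %d record(s)\n", pending);
                        pending = 0;
                }
                if (stop)
                        break;
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t kt;

        pthread_create(&kt, NULL, legacy_kthread, NULL);

        pthread_mutex_lock(&lock);
        pending += 3;                   /* printk() stored records... */
        pthread_cond_signal(&wake);     /* ...and wakes the printer */
        pthread_mutex_unlock(&lock);

        pthread_mutex_lock(&lock);
        stop = true;                    /* kthread_stop() equivalent */
        pthread_cond_signal(&wake);
        pthread_mutex_unlock(&lock);

        pthread_join(kt, NULL);
        return 0;
}
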
>+ */ >+ sched_set_normal(nbcon_legacy_kthread, -20); >+} >+#endif /* CONFIG_PRINTK */ >+ > static int __read_mostly keep_bootcon; > > static int __init keep_bootcon_setup(char *str) >@@ -3382,11 +3649,20 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) > > newcon->seq = prb_next_seq(prb); > for_each_console(con) { >- if ((con->flags & CON_BOOT) && >- (con->flags & CON_ENABLED) && >- con->seq < newcon->seq) { >- newcon->seq = con->seq; >+ u64 seq; >+ >+ if (!((con->flags & CON_BOOT) && >+ (con->flags & CON_ENABLED))) { >+ continue; > } >+ >+ if (con->flags & CON_NBCON) >+ seq = nbcon_seq_read(con); >+ else >+ seq = con->seq; >+ >+ if (seq < newcon->seq) >+ newcon->seq = seq; > } > } > >@@ -3503,8 +3779,16 @@ void register_console(struct console *newcon) > newcon->dropped = 0; > console_init_seq(newcon, bootcon_registered); > >- if (newcon->flags & CON_NBCON) >+ if (newcon->flags & CON_NBCON) { >+ have_nbcon_console = true; > nbcon_init(newcon); >+ } else { >+ have_legacy_console = true; >+ nbcon_legacy_kthread_create(); >+ } >+ >+ if (newcon->flags & CON_BOOT) >+ have_boot_console = true; > > /* > * Put this console in the list - keep the >@@ -3558,6 +3842,11 @@ EXPORT_SYMBOL(register_console); > /* Must be called under console_list_lock(). */ > static int unregister_console_locked(struct console *console) > { >+ bool is_boot_con = (console->flags & CON_BOOT); >+ bool found_legacy_con = false; >+ bool found_nbcon_con = false; >+ bool found_boot_con = false; >+ struct console *c; > int res; > > lockdep_assert_console_list_lock_held(); >@@ -3605,6 +3894,42 @@ static int unregister_console_locked(struct console *console) > if (console->exit) > res = console->exit(console); > >+ /* >+ * With this console gone, the global flags tracking registered >+ * console types may have changed. Update them. >+ */ >+ for_each_console(c) { >+ if (c->flags & CON_BOOT) >+ found_boot_con = true; >+ >+ if (c->flags & CON_NBCON) >+ found_nbcon_con = true; >+ else >+ found_legacy_con = true; >+ } >+ if (!found_boot_con) >+ have_boot_console = false; >+ if (!found_legacy_con) >+ have_legacy_console = false; >+ if (!found_nbcon_con) >+ have_nbcon_console = false; >+ >+ /* >+ * When the last boot console unregisters, start up the >+ * printing threads. >+ */ >+ if (is_boot_con && !have_boot_console) { >+ for_each_console(c) >+ nbcon_kthread_create(c); >+ } >+ >+#ifdef CONFIG_PRINTK >+ if (!printing_via_unlock && nbcon_legacy_kthread) { >+ kthread_stop(nbcon_legacy_kthread); >+ nbcon_legacy_kthread = NULL; >+ } >+#endif >+ > return res; > } > >@@ -3755,31 +4080,42 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > u64 last_diff = 0; > u64 printk_seq; > short flags; >+ bool locked; > int cookie; > u64 diff; > u64 seq; > > might_sleep(); > >- seq = prb_next_seq(prb); >+ seq = prb_next_reserve_seq(prb); > >- /* Flush the consoles so that records up to @seq are printed. */ >- console_lock(); >- console_unlock(); >+ /* >+ * Flush the consoles so that records up to @seq are printed. >+ * Otherwise this function will just wait for the threaded printers >+ * to print up to @seq. >+ */ >+ if (printing_via_unlock && !IS_ENABLED(CONFIG_PREEMPT_RT)) { >+ console_lock(); >+ console_unlock(); >+ } > > for (;;) { > unsigned long begin_jiffies; > unsigned long slept_jiffies; > >+ locked = false; > diff = 0; > >- /* >- * Hold the console_lock to guarantee safe access to >- * console->seq. 
Releasing console_lock flushes more >- * records in case @seq is still not printed on all >- * usable consoles. >- */ >- console_lock(); >+ if (printing_via_unlock) { >+ /* >+ * Hold the console_lock to guarantee safe access to >+ * console->seq. Releasing console_lock flushes more >+ * records in case @seq is still not printed on all >+ * usable consoles. >+ */ >+ console_lock(); >+ locked = true; >+ } > > cookie = console_srcu_read_lock(); > for_each_console_srcu(c) { >@@ -3793,12 +4129,15 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > * that they make forward progress, so only increment > * @diff for usable consoles. > */ >- if (!console_is_usable(c)) >+ if (!console_is_usable(c, flags, true) && >+ !console_is_usable(c, flags, false)) { > continue; >+ } > > if (flags & CON_NBCON) { > printk_seq = nbcon_seq_read(c); > } else { >+ WARN_ON_ONCE(!locked); > printk_seq = c->seq; > } > >@@ -3810,7 +4149,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre > if (diff != last_diff && reset_on_progress) > remaining_jiffies = timeout_jiffies; > >- console_unlock(); >+ if (locked) >+ console_unlock(); > > /* Note: @diff is 0 if there are no usable consoles. */ > if (diff == 0 || remaining_jiffies == 0) >@@ -3862,9 +4202,16 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) > int pending = this_cpu_xchg(printk_pending, 0); > > if (pending & PRINTK_PENDING_OUTPUT) { >- /* If trylock fails, someone else is doing the printing */ >- if (console_trylock()) >- console_unlock(); >+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { >+ wake_up_interruptible(&legacy_wait); >+ } else { >+ /* >+ * If trylock fails, some other context >+ * will do the printing. >+ */ >+ if (console_trylock()) >+ console_unlock(); >+ } > } > > if (pending & PRINTK_PENDING_WAKEUP) >@@ -3932,11 +4279,16 @@ void defer_console_output(void) > * New messages may have been added directly to the ringbuffer > * using vprintk_store(), so wake any waiters as well. > */ >- __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); >+ int val = PRINTK_PENDING_WAKEUP; >+ >+ if (printing_via_unlock) >+ val |= PRINTK_PENDING_OUTPUT; >+ __wake_up_klogd(val); > } > > void printk_trigger_flush(void) > { >+ nbcon_wake_threads(); > defer_console_output(); > } > >diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c >index fde338606ce83..e7b808b829a04 100644 >--- a/kernel/printk/printk_ringbuffer.c >+++ b/kernel/printk/printk_ringbuffer.c >@@ -6,6 +6,7 @@ > #include <linux/errno.h> > #include <linux/bug.h> > #include "printk_ringbuffer.h" >+#include "internal.h" > > /** > * DOC: printk_ringbuffer overview >@@ -303,6 +304,9 @@ > * > * desc_push_tail:B / desc_reserve:D > * set descriptor reusable (state), then push descriptor tail (id) >+ * >+ * desc_update_last_finalized:A / desc_last_finalized_seq:A >+ * store finalized record, then set new highest finalized sequence number > */ > > #define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) >@@ -1030,9 +1034,13 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, > unsigned long next_lpos; > > if (size == 0) { >- /* Specify a data-less block. */ >- blk_lpos->begin = NO_LPOS; >- blk_lpos->next = NO_LPOS; >+ /* >+ * Data blocks are not created for empty lines. Instead, the >+ * reader will recognize these special lpos values and handle >+ * it appropriately. 
>+ */ >+ blk_lpos->begin = EMPTY_LINE_LPOS; >+ blk_lpos->next = EMPTY_LINE_LPOS; > return NULL; > } > >@@ -1210,10 +1218,18 @@ static const char *get_data(struct prb_data_ring *data_ring, > > /* Data-less data block description. */ > if (BLK_DATALESS(blk_lpos)) { >- if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { >+ /* >+ * Records that are just empty lines are also valid, even >+ * though they do not have a data block. For such records >+ * explicitly return empty string data to signify success. >+ */ >+ if (blk_lpos->begin == EMPTY_LINE_LPOS && >+ blk_lpos->next == EMPTY_LINE_LPOS) { > *data_size = 0; > return ""; > } >+ >+ /* Data lost, invalid, or otherwise unavailable. */ > return NULL; > } > >@@ -1441,20 +1457,118 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer > return false; > } > >+/* >+ * @last_finalized_seq value guarantees that all records up to and including >+ * this sequence number are finalized and can be read. The only exception are >+ * too old records which have already been overwritten. >+ * >+ * It is also guaranteed that @last_finalized_seq only increases. >+ * >+ * Be aware that finalized records following non-finalized records are not >+ * reported because they are not yet available to the reader. For example, >+ * a new record stored via printk() will not be available to a printer if >+ * it follows a record that has not been finalized yet. However, once that >+ * non-finalized record becomes finalized, @last_finalized_seq will be >+ * appropriately updated and the full set of finalized records will be >+ * available to the printer. And since each printk() caller will either >+ * directly print or trigger deferred printing of all available unprinted >+ * records, all printk() messages will get printed. >+ */ >+static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb) >+{ >+ struct prb_desc_ring *desc_ring = &rb->desc_ring; >+ unsigned long ulseq; >+ >+ /* >+ * Guarantee the sequence number is loaded before loading the >+ * associated record in order to guarantee that the record can be >+ * seen by this CPU. This pairs with desc_update_last_finalized:A. >+ */ >+ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq >+ ); /* LMM(desc_last_finalized_seq:A) */ >+ >+ return __ulseq_to_u64seq(rb, ulseq); >+} >+ >+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, >+ struct printk_record *r, unsigned int *line_count); >+ >+/* >+ * Check if there are records directly following @last_finalized_seq that are >+ * finalized. If so, update @last_finalized_seq to the latest of these >+ * records. It is not allowed to skip over records that are not yet finalized. >+ */ >+static void desc_update_last_finalized(struct printk_ringbuffer *rb) >+{ >+ struct prb_desc_ring *desc_ring = &rb->desc_ring; >+ u64 old_seq = desc_last_finalized_seq(rb); >+ unsigned long oldval; >+ unsigned long newval; >+ u64 finalized_seq; >+ u64 try_seq; >+ >+try_again: >+ finalized_seq = old_seq; >+ try_seq = finalized_seq + 1; >+ >+ /* Try to find later finalized records. */ >+ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) { >+ finalized_seq = try_seq; >+ try_seq++; >+ } >+ >+ /* No update needed if no later finalized record was found. */ >+ if (finalized_seq == old_seq) >+ return; >+ >+ oldval = __u64seq_to_ulseq(old_seq); >+ newval = __u64seq_to_ulseq(finalized_seq); >+ >+ /* >+ * Set the sequence number of a later finalized record that has been >+ * seen. 
>+ * >+ * Guarantee the record data is visible to other CPUs before storing >+ * its sequence number. This pairs with desc_last_finalized_seq:A. >+ * >+ * Memory barrier involvement: >+ * >+ * If desc_last_finalized_seq:A reads from >+ * desc_update_last_finalized:A, then desc_read:A reads from >+ * _prb_commit:B. >+ * >+ * Relies on: >+ * >+ * RELEASE from _prb_commit:B to desc_update_last_finalized:A >+ * matching >+ * ACQUIRE from desc_last_finalized_seq:A to desc_read:A >+ * >+ * Note: _prb_commit:B and desc_update_last_finalized:A can be >+ * different CPUs. However, the desc_update_last_finalized:A >+ * CPU (which performs the release) must have previously seen >+ * _prb_commit:B. >+ */ >+ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq, >+ &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */ >+ old_seq = __ulseq_to_u64seq(rb, oldval); >+ goto try_again; >+ } >+} >+ > /* > * Attempt to finalize a specified descriptor. If this fails, the descriptor > * is either already final or it will finalize itself when the writer commits. > */ >-static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) >+static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id) > { >+ struct prb_desc_ring *desc_ring = &rb->desc_ring; > unsigned long prev_state_val = DESC_SV(id, desc_committed); > struct prb_desc *d = to_desc(desc_ring, id); > >- atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, >- DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ >- >- /* Best effort to remember the last finalized @id. */ >- atomic_long_set(&desc_ring->last_finalized_id, id); >+ if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val, >+ DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */ >+ desc_update_last_finalized(rb); >+ } > } > > /** >@@ -1550,7 +1664,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, > * readers. (For seq==0 there is no previous descriptor.) > */ > if (info->seq > 0) >- desc_make_final(desc_ring, DESC_ID(id - 1)); >+ desc_make_final(rb, DESC_ID(id - 1)); > > r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); > /* If text data allocation fails, a data-less record is committed. */ >@@ -1643,7 +1757,7 @@ void prb_commit(struct prb_reserved_entry *e) > */ > head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ > if (head_id != e->id) >- desc_make_final(desc_ring, e->id); >+ desc_make_final(e->rb, e->id); > } > > /** >@@ -1663,12 +1777,9 @@ void prb_commit(struct prb_reserved_entry *e) > */ > void prb_final_commit(struct prb_reserved_entry *e) > { >- struct prb_desc_ring *desc_ring = &e->rb->desc_ring; >- > _prb_commit(e, desc_finalized); > >- /* Best effort to remember the last finalized @id. */ >- atomic_long_set(&desc_ring->last_finalized_id, e->id); >+ desc_update_last_finalized(e->rb); > } > > /* >@@ -1746,6 +1857,8 @@ static bool copy_data(struct prb_data_ring *data_ring, > * descriptor. However, it also verifies that the record is finalized and has > * the sequence number @seq. On success, 0 is returned. > * >+ * For the panic CPU, committed descriptors are also considered finalized. >+ * > * Error return values: > * -EINVAL: A finalized record with sequence number @seq does not exist. 
> * -ENOENT: A finalized record with sequence number @seq exists, but its data >@@ -1764,16 +1877,25 @@ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, > > /* > * An unexpected @id (desc_miss) or @seq mismatch means the record >- * does not exist. A descriptor in the reserved or committed state >- * means the record does not yet exist for the reader. >+ * does not exist. A descriptor in the reserved state means the >+ * record does not yet exist for the reader. > */ > if (d_state == desc_miss || > d_state == desc_reserved || >- d_state == desc_committed || > s != seq) { > return -EINVAL; > } > >+ /* >+ * A descriptor in the committed state means the record does not yet >+ * exist for the reader. However, for the panic CPU, committed >+ * records are also handled as finalized records since they contain >+ * message data in a consistent state and may contain additional >+ * hints as to the cause of the panic. >+ */ >+ if (d_state == desc_committed && !this_cpu_in_panic()) >+ return -EINVAL; >+ > /* > * A descriptor in the reusable state may no longer have its data > * available; report it as existing but with lost data. Or the record >@@ -1832,7 +1954,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, > } > > /* Get the sequence number of the tail descriptor. */ >-static u64 prb_first_seq(struct printk_ringbuffer *rb) >+u64 prb_first_seq(struct printk_ringbuffer *rb) > { > struct prb_desc_ring *desc_ring = &rb->desc_ring; > enum desc_state d_state; >@@ -1875,12 +1997,131 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) > return seq; > } > >-/* >- * Non-blocking read of a record. Updates @seq to the last finalized record >- * (which may have no data available). >+/** >+ * prb_next_reserve_seq() - Get the sequence number after the most recently >+ * reserved record. > * >- * See the description of prb_read_valid() and prb_read_valid_info() >- * for details. >+ * @rb: The ringbuffer to get the sequence number from. >+ * >+ * This is the public function available to readers to see what sequence >+ * number will be assigned to the next reserved record. >+ * >+ * Note that depending on the situation, this value can be equal to or >+ * higher than the sequence number returned by prb_next_seq(). >+ * >+ * Context: Any context. >+ * Return: The sequence number that will be assigned to the next record >+ * reserved. >+ */ >+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb) >+{ >+ struct prb_desc_ring *desc_ring = &rb->desc_ring; >+ unsigned long last_finalized_id; >+ atomic_long_t *state_var; >+ u64 last_finalized_seq; >+ unsigned long head_id; >+ struct prb_desc desc; >+ unsigned long diff; >+ struct prb_desc *d; >+ int err; >+ >+ /* >+ * It may not be possible to read a sequence number for @head_id. >+ * So the ID of @last_finailzed_seq is used to calculate what the >+ * sequence number of @head_id will be. >+ */ >+ >+try_again: >+ last_finalized_seq = desc_last_finalized_seq(rb); >+ >+ /* >+ * @head_id is loaded after @last_finalized_seq to ensure that it is >+ * at or beyond @last_finalized_seq. >+ * >+ * Memory barrier involvement: >+ * >+ * If desc_last_finalized_seq:A reads from >+ * desc_update_last_finalized:A, then >+ * prb_next_reserve_seq:A reads from desc_reserve:D. >+ * >+ * Relies on: >+ * >+ * RELEASE from desc_reserve:D to desc_update_last_finalized:A >+ * matching >+ * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A >+ * >+ * Note: desc_reserve:D and desc_update_last_finalized:A can be >+ * different CPUs. 
However, the desc_update_last_finalized:A CPU >+ * (which performs the release) must have previously seen >+ * desc_read:C, which implies desc_reserve:D can be seen. >+ */ >+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */ >+ >+ d = to_desc(desc_ring, last_finalized_seq); >+ state_var = &d->state_var; >+ >+ /* Extract the ID, used to specify the descriptor to read. */ >+ last_finalized_id = DESC_ID(atomic_long_read(state_var)); >+ >+ /* Ensure @last_finalized_id is correct. */ >+ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc); >+ >+ if (err == -EINVAL) { >+ if (last_finalized_seq == 0) { >+ /* >+ * @last_finalized_seq still contains its initial >+ * value. Probably no record has been finalized yet. >+ * This means the ringbuffer is not yet full and the >+ * @head_id value can be used directly (subtracting >+ * off the id value corresponding to seq=0). >+ */ >+ >+ /* >+ * Because of hack#2 of the bootstrapping phase, the >+ * @head_id initial value must be handled separately. >+ */ >+ if (head_id == DESC0_ID(desc_ring->count_bits)) >+ return 0; >+ >+ /* >+ * The @head_id is initialized such that the first >+ * increment will yield the first record (seq=0). >+ * Therefore use the initial value +1 as the base to >+ * subtract from @head_id. >+ */ >+ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1; >+ } else { >+ /* Record must have been overwritten. Try again. */ >+ goto try_again; >+ } >+ } >+ >+ /* >+ * @diff is the number of records beyond the last record available >+ * to readers. >+ */ >+ diff = head_id - last_finalized_id; >+ >+ /* >+ * @head_id points to the most recently reserved record, but this >+ * function returns the sequence number that will be assigned to the >+ * next (not yet reserved) record. Thus +1 is needed. >+ */ >+ return (last_finalized_seq + diff + 1); >+} >+ >+/* >+ * Non-blocking read of a record. >+ * >+ * On success @seq is updated to the record that was read and (if provided) >+ * @r and @line_count will contain the read/calculated data. >+ * >+ * On failure @seq is updated to a record that is not yet available to the >+ * reader, but it will be the next record available to the reader. >+ * >+ * Note: When the current CPU is in panic, this function will skip over any >+ * non-existent/non-finalized records in order to allow the panic CPU >+ * to print any and all records that have been finalized. > */ > static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, > struct printk_record *r, unsigned int *line_count) >@@ -1899,12 +2140,32 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, > *seq = tail_seq; > > } else if (err == -ENOENT) { >- /* Record exists, but no data available. Skip. */ >+ /* Record exists, but the data was lost. Skip. */ > (*seq)++; > > } else { >- /* Non-existent/non-finalized record. Must stop. */ >- return false; >+ /* >+ * Non-existent/non-finalized record. Must stop. >+ * >+ * For panic situations it cannot be expected that >+ * non-finalized records will become finalized. But >+ * there may be other finalized records beyond that >+ * need to be printed for a panic situation. If this >+ * is the panic CPU, skip this >+ * non-existent/non-finalized record unless it is >+ * at or beyond the head, in which case it is not >+ * possible to continue. >+ * >+ * Note that new messages printed on panic CPU are >+ * finalized when we are here. The only exception >+ * might be the last message without trailing newline. 
>+ * But it would have the sequence number returned >+ * by "prb_next_reserve_seq() - 1". >+ */ >+ if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb))) >+ (*seq)++; >+ else >+ return false; > } > } > >@@ -1932,7 +2193,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, > * On success, the reader must check r->info.seq to see which record was > * actually read. This allows the reader to detect dropped records. > * >- * Failure means @seq refers to a not yet written record. >+ * Failure means @seq refers to a record not yet available to the reader. > */ > bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, > struct printk_record *r) >@@ -1962,7 +2223,7 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, > * On success, the reader must check info->seq to see which record meta data > * was actually read. This allows the reader to detect dropped records. > * >- * Failure means @seq refers to a not yet written record. >+ * Failure means @seq refers to a record not yet available to the reader. > */ > bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, > struct printk_info *info, unsigned int *line_count) >@@ -2008,7 +2269,9 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) > * newest sequence number available to readers will be. > * > * This provides readers a sequence number to jump to if all currently >- * available records should be skipped. >+ * available records should be skipped. It is guaranteed that all records >+ * previous to the returned value have been finalized and are (or were) >+ * available to the reader. > * > * Context: Any context. > * Return: The sequence number of the next newest (not yet available) record >@@ -2016,34 +2279,19 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) > */ > u64 prb_next_seq(struct printk_ringbuffer *rb) > { >- struct prb_desc_ring *desc_ring = &rb->desc_ring; >- enum desc_state d_state; >- unsigned long id; > u64 seq; > >- /* Check if the cached @id still points to a valid @seq. */ >- id = atomic_long_read(&desc_ring->last_finalized_id); >- d_state = desc_read(desc_ring, id, NULL, &seq, NULL); >+ seq = desc_last_finalized_seq(rb); > >- if (d_state == desc_finalized || d_state == desc_reusable) { >- /* >- * Begin searching after the last finalized record. >- * >- * On 0, the search must begin at 0 because of hack#2 >- * of the bootstrapping phase it is not known if a >- * record at index 0 exists. >- */ >- if (seq != 0) >- seq++; >- } else { >- /* >- * The information about the last finalized sequence number >- * has gone. It should happen only when there is a flood of >- * new messages and the ringbuffer is rapidly recycled. >- * Give up and start from the beginning. >- */ >- seq = 0; >- } >+ /* >+ * Begin searching after the last finalized record. >+ * >+ * On 0, the search must begin at 0 because of hack#2 >+ * of the bootstrapping phase it is not known if a >+ * record at index 0 exists. >+ */ >+ if (seq != 0) >+ seq++; > > /* > * The information about the last finalized @seq might be inaccurate. 
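
desc_update_last_finalized() and desc_last_finalized_seq() in this hunk maintain a watermark that only ever moves forward: updates may race, but a release cmpxchg loop guarantees the published value never goes backwards, and readers pair it with an acquire load. A compact C11 sketch of that pattern (standalone; the kernel code additionally re-walks the ring to find the newest finalized record before publishing):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy version of the "only move the watermark forward" update used for
 * last_finalized_seq: a release cmpxchg loop that never publishes a
 * smaller value, no matter how racing updaters interleave. */
static _Atomic uint64_t last_finalized_seq;

static void update_last_finalized(uint64_t newly_finalized)
{
        uint64_t old = atomic_load_explicit(&last_finalized_seq,
                                            memory_order_acquire);

        while (old < newly_finalized) {
                /* On failure 'old' is reloaded and the check repeats. */
                if (atomic_compare_exchange_weak_explicit(&last_finalized_seq,
                                                          &old, newly_finalized,
                                                          memory_order_release,
                                                          memory_order_acquire))
                        break;
        }
}

int main(void)
{
        update_last_finalized(5);
        update_last_finalized(3);       /* stale update, must not go backwards */
        update_last_finalized(7);
        printf("last_finalized_seq = %llu\n",
               (unsigned long long)atomic_load(&last_finalized_seq));
        return 0;
}
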
>@@ -2085,7 +2333,7 @@ void prb_init(struct printk_ringbuffer *rb, > rb->desc_ring.infos = infos; > atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); > atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); >- atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits)); >+ atomic_long_set(&rb->desc_ring.last_finalized_seq, 0); > > rb->text_data_ring.size_bits = textbits; > rb->text_data_ring.data = text_buf; >diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h >index 18cd25e489b89..52626d0f1fa37 100644 >--- a/kernel/printk/printk_ringbuffer.h >+++ b/kernel/printk/printk_ringbuffer.h >@@ -75,7 +75,7 @@ struct prb_desc_ring { > struct printk_info *infos; > atomic_long_t head_id; > atomic_long_t tail_id; >- atomic_long_t last_finalized_id; >+ atomic_long_t last_finalized_seq; > }; > > /* >@@ -127,8 +127,22 @@ enum desc_state { > #define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) > #define DESC_ID_MASK (~DESC_FLAGS_MASK) > #define DESC_ID(sv) ((sv) & DESC_ID_MASK) >+ >+/* >+ * Special data block logical position values (for fields of >+ * @prb_desc.text_blk_lpos). >+ * >+ * - Bit0 is used to identify if the record has no data block. (Implemented in >+ * the LPOS_DATALESS() macro.) >+ * >+ * - Bit1 specifies the reason for not having a data block. >+ * >+ * These special values could never be real lpos values because of the >+ * meta data and alignment padding of data blocks. (See to_blk_size() for >+ * details.) >+ */ > #define FAILED_LPOS 0x1 >-#define NO_LPOS 0x3 >+#define EMPTY_LINE_LPOS 0x3 > > #define FAILED_BLK_LPOS \ > { \ >@@ -259,7 +273,7 @@ static struct printk_ringbuffer name = { \ > .infos = &_##name##_infos[0], \ > .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ > .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ >- .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \ >+ .last_finalized_seq = ATOMIC_INIT(0), \ > }, \ > .text_data_ring = { \ > .size_bits = (avgtextbits) + (descbits), \ >@@ -378,7 +392,41 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, > bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, > struct printk_info *info, unsigned int *line_count); > >+u64 prb_first_seq(struct printk_ringbuffer *rb); > u64 prb_first_valid_seq(struct printk_ringbuffer *rb); > u64 prb_next_seq(struct printk_ringbuffer *rb); >+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb); >+ >+#ifdef CONFIG_64BIT >+ >+#define __u64seq_to_ulseq(u64seq) (u64seq) >+#define __ulseq_to_u64seq(rb, ulseq) (ulseq) >+ >+#else /* CONFIG_64BIT */ >+ >+#define __u64seq_to_ulseq(u64seq) ((u32)u64seq) >+ >+static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq) >+{ >+ u64 rb_first_seq = prb_first_seq(rb); >+ u64 seq; >+ >+ /* >+ * The provided sequence is only the lower 32 bits of the ringbuffer >+ * sequence. It needs to be expanded to 64bit. Get the first sequence >+ * number from the ringbuffer and fold it. >+ * >+ * Having a 32bit representation in the console is sufficient. >+ * If a console ever gets more than 2^31 records behind >+ * the ringbuffer then this is the least of the problems. >+ * >+ * Also the access to the ring buffer is always safe. 
>+ */ >+ seq = rb_first_seq - (s32)((u32)rb_first_seq - ulseq); >+ >+ return seq; >+} >+ >+#endif /* CONFIG_64BIT */ > > #endif /* _KERNEL_PRINTK_RINGBUFFER_H */ >diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c >index 6d10927a07d83..8d9408d653de5 100644 >--- a/kernel/printk/printk_safe.c >+++ b/kernel/printk/printk_safe.c >@@ -26,6 +26,18 @@ void __printk_safe_exit(void) > this_cpu_dec(printk_context); > } > >+void __printk_deferred_enter(void) >+{ >+ cant_migrate(); >+ this_cpu_inc(printk_context); >+} >+ >+void __printk_deferred_exit(void) >+{ >+ cant_migrate(); >+ this_cpu_dec(printk_context); >+} >+ > asmlinkage int vprintk(const char *fmt, va_list args) > { > #ifdef CONFIG_KGDB_KDB >diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c >index 30fc9d34e3297..a0b0358352de0 100644 >--- a/kernel/rcu/rcutorture.c >+++ b/kernel/rcu/rcutorture.c >@@ -2409,6 +2409,12 @@ static int rcutorture_booster_init(unsigned int cpu) > WARN_ON_ONCE(!t); > sp.sched_priority = 2; > sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); >+#ifdef CONFIG_PREEMPT_RT >+ t = per_cpu(timersd, cpu); >+ WARN_ON_ONCE(!t); >+ sp.sched_priority = 2; >+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); >+#endif > } > > /* Don't allow time recalculation while creating a new task. */ >diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h >index ac8e86babe449..efb2be8939a2d 100644 >--- a/kernel/rcu/tree_stall.h >+++ b/kernel/rcu/tree_stall.h >@@ -9,6 +9,7 @@ > > #include <linux/kvm_para.h> > #include <linux/rcu_notifier.h> >+#include <linux/console.h> > > ////////////////////////////////////////////////////////////////////////////// > // >@@ -604,6 +605,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) > if (rcu_stall_is_suppressed()) > return; > >+ nbcon_cpu_emergency_enter(); >+ > /* > * OK, time to rat on our buddy... > * See Documentation/RCU/stallwarn.rst for info on how to debug >@@ -658,6 +661,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) > panic_on_rcu_stall(); > > rcu_force_quiescent_state(); /* Kick them all. */ >+ >+ nbcon_cpu_emergency_exit(); > } > > static void print_cpu_stall(unsigned long gps) >diff --git a/kernel/sched/core.c b/kernel/sched/core.c >index a708d225c28e8..d0db38a6978a5 100644 >--- a/kernel/sched/core.c >+++ b/kernel/sched/core.c >@@ -898,14 +898,15 @@ static inline void hrtick_rq_init(struct rq *rq) > > #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) > /* >- * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, >+ * Atomically set TIF_NEED_RESCHED[_LAZY] and test for TIF_POLLING_NRFLAG, > * this avoids any races wrt polling state changes and thereby avoids > * spurious IPIs. 
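
The 32-bit variant of __ulseq_to_u64seq() above reconstructs a full 64-bit sequence number from the 32 bits a console stores by folding it against the ringbuffer's first sequence number; this is exact as long as the console is within 2^31 records of that reference, which the comment argues is always the case in practice. A small self-checking sketch of the same arithmetic (standalone C, illustrative constants):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy check of the 32-bit fold: the low 32 bits of a sequence number are
 * expanded back to 64 bits relative to rb_first_seq, assuming the value is
 * within 2^31 records of that reference. */
static uint64_t ulseq_to_u64seq(uint64_t rb_first_seq, uint32_t ulseq)
{
        return rb_first_seq - (int32_t)((uint32_t)rb_first_seq - ulseq);
}

int main(void)
{
        uint64_t first = 0x100000005ULL;        /* low 32 bits are 5 */

        /* A console slightly behind and slightly ahead of 'first'. */
        assert(ulseq_to_u64seq(first, 3) == 0x100000003ULL);
        assert(ulseq_to_u64seq(first, 7) == 0x100000007ULL);

        printf("fold round-trips as expected\n");
        return 0;
}
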
> */ >-static inline bool set_nr_and_not_polling(struct task_struct *p) >+static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) > { > struct thread_info *ti = task_thread_info(p); >- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); >+ >+ return !(fetch_or(&ti->flags, 1 << tif_bit) & _TIF_POLLING_NRFLAG); > } > > /* >@@ -922,7 +923,7 @@ static bool set_nr_if_polling(struct task_struct *p) > do { > if (!(val & _TIF_POLLING_NRFLAG)) > return false; >- if (val & _TIF_NEED_RESCHED) >+ if (val & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) > return true; > } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); > >@@ -930,9 +931,9 @@ static bool set_nr_if_polling(struct task_struct *p) > } > > #else >-static inline bool set_nr_and_not_polling(struct task_struct *p) >+static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) > { >- set_tsk_need_resched(p); >+ set_tsk_thread_flag(p, tif_bit); > return true; > } > >@@ -1037,28 +1038,47 @@ void wake_up_q(struct wake_q_head *head) > * might also involve a cross-CPU call to trigger the scheduler on > * the target CPU. > */ >-void resched_curr(struct rq *rq) >+static void __resched_curr(struct rq *rq, int lazy) > { >+ int cpu, tif_bit = TIF_NEED_RESCHED + lazy; > struct task_struct *curr = rq->curr; >- int cpu; > > lockdep_assert_rq_held(rq); > >- if (test_tsk_need_resched(curr)) >+ if (unlikely(test_tsk_thread_flag(curr, tif_bit))) > return; > > cpu = cpu_of(rq); > > if (cpu == smp_processor_id()) { >- set_tsk_need_resched(curr); >- set_preempt_need_resched(); >+ set_tsk_thread_flag(curr, tif_bit); >+ if (!lazy) >+ set_preempt_need_resched(); > return; > } > >- if (set_nr_and_not_polling(curr)) >- smp_send_reschedule(cpu); >- else >+ if (set_nr_and_not_polling(curr, tif_bit)) { >+ if (!lazy) >+ smp_send_reschedule(cpu); >+ } else { > trace_sched_wake_idle_without_ipi(cpu); >+ } >+} >+ >+void resched_curr(struct rq *rq) >+{ >+ __resched_curr(rq, 0); >+} >+ >+void resched_curr_lazy(struct rq *rq) >+{ >+ int lazy = IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && !sched_feat(FORCE_NEED_RESCHED) ? >+ TIF_NEED_RESCHED_LAZY_OFFSET : 0; >+ >+ if (lazy && unlikely(test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED))) >+ return; >+ >+ __resched_curr(rq, lazy); > } > > void resched_cpu(int cpu) >@@ -1131,7 +1151,7 @@ static void wake_up_idle_cpu(int cpu) > if (cpu == smp_processor_id()) > return; > >- if (set_nr_and_not_polling(rq->idle)) >+ if (set_nr_and_not_polling(rq->idle, TIF_NEED_RESCHED)) > smp_send_reschedule(cpu); > else > trace_sched_wake_idle_without_ipi(cpu); >@@ -8865,6 +8885,21 @@ static inline void preempt_dynamic_init(void) { } > > #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ > >+/* >+ * task_is_pi_boosted - Check if task has been PI boosted. >+ * @p: Task to check. >+ * >+ * Return true if task is subject to priority inheritance. >+ */ >+bool task_is_pi_boosted(const struct task_struct *p) >+{ >+ int prio = p->prio; >+ >+ if (!rt_prio(prio)) >+ return false; >+ return prio != p->normal_prio; >+} >+ > /** > * yield - yield the current processor to other threads. 
> * >diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c >index 4580a450700ec..b40a080e5272d 100644 >--- a/kernel/sched/debug.c >+++ b/kernel/sched/debug.c >@@ -333,6 +333,23 @@ static const struct file_operations sched_debug_fops = { > .release = seq_release, > }; > >+static ssize_t sched_hog_write(struct file *filp, const char __user *ubuf, >+ size_t cnt, loff_t *ppos) >+{ >+ unsigned long end = jiffies + 60 * HZ; >+ >+ for (; time_before(jiffies, end) && !signal_pending(current);) >+ cpu_relax(); >+ >+ return cnt; >+} >+ >+static const struct file_operations sched_hog_fops = { >+ .write = sched_hog_write, >+ .open = simple_open, >+ .llseek = default_llseek, >+}; >+ > static struct dentry *debugfs_sched; > > static __init int sched_init_debug(void) >@@ -374,6 +391,8 @@ static __init int sched_init_debug(void) > > debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); > >+ debugfs_create_file("hog", 0200, debugfs_sched, NULL, &sched_hog_fops); >+ > return 0; > } > late_initcall(sched_init_debug); >diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >index d7a3c63a2171a..a2ca9589880a1 100644 >--- a/kernel/sched/fair.c >+++ b/kernel/sched/fair.c >@@ -1001,8 +1001,10 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); > * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i > * this is probably good enough. > */ >-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) >+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se, bool tick) > { >+ struct rq *rq = rq_of(cfs_rq); >+ > if ((s64)(se->vruntime - se->deadline) < 0) > return; > >@@ -1021,10 +1023,19 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) > /* > * The task has consumed its request, reschedule. > */ >- if (cfs_rq->nr_running > 1) { >- resched_curr(rq_of(cfs_rq)); >- clear_buddies(cfs_rq, se); >+ if (cfs_rq->nr_running < 2) >+ return; >+ >+ if (!IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) || sched_feat(FORCE_NEED_RESCHED)) { >+ resched_curr(rq); >+ } else { >+ /* Did the task ignore the lazy reschedule request? */ >+ if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY)) >+ resched_curr(rq); >+ else >+ resched_curr_lazy(rq); > } >+ clear_buddies(cfs_rq, se); > } > > #include "pelt.h" >@@ -1132,7 +1143,7 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) > /* > * Update the current task's runtime statistics. > */ >-static void update_curr(struct cfs_rq *cfs_rq) >+static void __update_curr(struct cfs_rq *cfs_rq, bool tick) > { > struct sched_entity *curr = cfs_rq->curr; > u64 now = rq_clock_task(rq_of(cfs_rq)); >@@ -1371,7 +1371,7 @@ > update_burst_penalty(curr); > #endif // CONFIG_SCHED_BORE > curr->vruntime += max(1ULL, calc_delta_fair(delta_exec, curr)); >- update_deadline(cfs_rq, curr); >+ update_deadline(cfs_rq, curr, tick); > update_min_vruntime(cfs_rq); > > if (entity_is_task(curr)) { >@@ -1173,6 +1184,11 @@ static void update_curr(struct cfs_rq *cfs_rq) > account_cfs_rq_runtime(cfs_rq, delta_exec); > } > >+static inline void update_curr(struct cfs_rq *cfs_rq) >+{ >+ __update_curr(cfs_rq, false); >+} >+ > static void update_curr_fair(struct rq *rq) > { > update_curr(cfs_rq_of(&rq->curr->se)); >@@ -5449,7 +5465,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) > /* > * Update run-time statistics of the 'current'. > */ >- update_curr(cfs_rq); >+ __update_curr(cfs_rq, true); > > /* > * Ensure that runnable average is periodically updated. 
>@@ -5463,7 +5479,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) > * validating it and just reschedule. > */ > if (queued) { >- resched_curr(rq_of(cfs_rq)); >+ resched_curr_lazy(rq_of(cfs_rq)); > return; > } > /* >@@ -5609,7 +5625,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) > * hierarchy can be throttled > */ > if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) >- resched_curr(rq_of(cfs_rq)); >+ resched_curr_lazy(rq_of(cfs_rq)); > } > > static __always_inline >@@ -5869,7 +5885,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) > > /* Determine whether we need to wake up potentially idle CPU: */ > if (rq->curr == rq->idle && rq->cfs.nr_running) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > #ifdef CONFIG_SMP >@@ -6584,7 +6600,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) > > if (delta < 0) { > if (task_current(rq, p)) >- resched_curr(rq); >+ resched_curr_lazy(rq); > return; > } > hrtick_start(rq, delta); >@@ -8240,7 +8256,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int > * prevents us from potentially nominating it as a false LAST_BUDDY > * below. > */ >- if (test_tsk_need_resched(curr)) >+ if (need_resched()) > return; > > /* Idle tasks are by definition preempted by non-idle tasks. */ >@@ -8282,7 +8298,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int > return; > > preempt: >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > #ifdef CONFIG_SMP >@@ -12449,7 +12465,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) > */ > if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && > __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } > > /* >@@ -12614,7 +12630,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) > */ > if (task_current(rq, p)) { > if (p->prio > oldprio) >- resched_curr(rq); >+ resched_curr_lazy(rq); > } else > wakeup_preempt(rq, p, 0); > } >diff --git a/kernel/sched/features.h b/kernel/sched/features.h >index a3ddf84de430f..c3f7142ff1f84 100644 >--- a/kernel/sched/features.h >+++ b/kernel/sched/features.h >@@ -88,3 +88,5 @@ SCHED_FEAT(UTIL_EST_FASTUP, true) > SCHED_FEAT(LATENCY_WARN, false) > > SCHED_FEAT(HZ_BW, true) >+ >+SCHED_FEAT(FORCE_NEED_RESCHED, false) >diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c >index 565f8374ddbbf..22d70000ab595 100644 >--- a/kernel/sched/idle.c >+++ b/kernel/sched/idle.c >@@ -57,8 +57,7 @@ static noinline int __cpuidle cpu_idle_poll(void) > ct_cpuidle_enter(); > > raw_local_irq_enable(); >- while (!tif_need_resched() && >- (cpu_idle_force_poll || tick_check_broadcast_expired())) >+ while (!need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) > cpu_relax(); > raw_local_irq_disable(); > >diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c >index 6aaf0a3d6081d..46a1784a49f7e 100644 >--- a/kernel/sched/rt.c >+++ b/kernel/sched/rt.c >@@ -2203,8 +2203,11 @@ static int rto_next_cpu(struct root_domain *rd) > > rd->rto_cpu = cpu; > >- if (cpu < nr_cpu_ids) >+ if (cpu < nr_cpu_ids) { >+ if (!has_pushable_tasks(cpu_rq(cpu))) >+ continue; > return cpu; >+ } > > rd->rto_cpu = -1; > >diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h >index 2e5a95486a422..68989416d046c 100644 >--- a/kernel/sched/sched.h >+++ b/kernel/sched/sched.h >@@ -2419,6 +2419,7 @@ extern void init_sched_fair_class(void); > extern void 
reweight_task(struct task_struct *p, int prio); > > extern void resched_curr(struct rq *rq); >+extern void resched_curr_lazy(struct rq *rq); > extern void resched_cpu(int cpu); > > extern struct rt_bandwidth def_rt_bandwidth; >diff --git a/kernel/softirq.c b/kernel/softirq.c >index 210cf5f8d92c2..cae0ae2e2b0bb 100644 >--- a/kernel/softirq.c >+++ b/kernel/softirq.c >@@ -247,6 +247,19 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) > } > EXPORT_SYMBOL(__local_bh_enable_ip); > >+void softirq_preempt(void) >+{ >+ if (WARN_ON_ONCE(!preemptible())) >+ return; >+ >+ if (WARN_ON_ONCE(__this_cpu_read(softirq_ctrl.cnt) != SOFTIRQ_OFFSET)) >+ return; >+ >+ __local_bh_enable(SOFTIRQ_OFFSET, true); >+ /* preemption point */ >+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); >+} >+ > /* > * Invoked from ksoftirqd_run() outside of the interrupt disabled section > * to acquire the per CPU local lock for reentrancy protection. >@@ -619,6 +632,24 @@ static inline void tick_irq_exit(void) > #endif > } > >+#ifdef CONFIG_PREEMPT_RT >+DEFINE_PER_CPU(struct task_struct *, timersd); >+DEFINE_PER_CPU(unsigned long, pending_timer_softirq); >+ >+static void wake_timersd(void) >+{ >+ struct task_struct *tsk = __this_cpu_read(timersd); >+ >+ if (tsk) >+ wake_up_process(tsk); >+} >+ >+#else >+ >+static inline void wake_timersd(void) { } >+ >+#endif >+ > static inline void __irq_exit_rcu(void) > { > #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED >@@ -631,6 +662,10 @@ static inline void __irq_exit_rcu(void) > if (!in_interrupt() && local_softirq_pending()) > invoke_softirq(); > >+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() && >+ !(in_nmi() | in_hardirq())) >+ wake_timersd(); >+ > tick_irq_exit(); > } > >@@ -963,12 +998,70 @@ static struct smp_hotplug_thread softirq_threads = { > .thread_comm = "ksoftirqd/%u", > }; > >+#ifdef CONFIG_PREEMPT_RT >+static void timersd_setup(unsigned int cpu) >+{ >+ sched_set_fifo_low(current); >+} >+ >+static int timersd_should_run(unsigned int cpu) >+{ >+ return local_pending_timers(); >+} >+ >+static void run_timersd(unsigned int cpu) >+{ >+ unsigned int timer_si; >+ >+ ksoftirqd_run_begin(); >+ >+ timer_si = local_pending_timers(); >+ __this_cpu_write(pending_timer_softirq, 0); >+ or_softirq_pending(timer_si); >+ >+ __do_softirq(); >+ >+ ksoftirqd_run_end(); >+} >+ >+static void raise_ktimers_thread(unsigned int nr) >+{ >+ trace_softirq_raise(nr); >+ __this_cpu_or(pending_timer_softirq, 1 << nr); >+} >+ >+void raise_hrtimer_softirq(void) >+{ >+ raise_ktimers_thread(HRTIMER_SOFTIRQ); >+} >+ >+void raise_timer_softirq(void) >+{ >+ unsigned long flags; >+ >+ local_irq_save(flags); >+ raise_ktimers_thread(TIMER_SOFTIRQ); >+ wake_timersd(); >+ local_irq_restore(flags); >+} >+ >+static struct smp_hotplug_thread timer_threads = { >+ .store = &timersd, >+ .setup = timersd_setup, >+ .thread_should_run = timersd_should_run, >+ .thread_fn = run_timersd, >+ .thread_comm = "ktimers/%u", >+}; >+#endif >+ > static __init int spawn_ksoftirqd(void) > { > cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, > takeover_tasklets); > BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); >- >+#ifdef CONFIG_PREEMPT_RT >+ BUG_ON(smpboot_register_percpu_thread(&timer_threads)); >+#endif > return 0; > } > early_initcall(spawn_ksoftirqd); >diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c >index 760793998cdd7..9f4d7ab03e398 100644 >--- a/kernel/time/hrtimer.c >+++ b/kernel/time/hrtimer.c >@@ -1808,7 +1808,7 @@ void hrtimer_interrupt(struct clock_event_device 
*dev) > if (!ktime_before(now, cpu_base->softirq_expires_next)) { > cpu_base->softirq_expires_next = KTIME_MAX; > cpu_base->softirq_activated = 1; >- raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+ raise_hrtimer_softirq(); > } > > __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); >@@ -1921,7 +1921,7 @@ void hrtimer_run_queues(void) > if (!ktime_before(now, cpu_base->softirq_expires_next)) { > cpu_base->softirq_expires_next = KTIME_MAX; > cpu_base->softirq_activated = 1; >- raise_softirq_irqoff(HRTIMER_SOFTIRQ); >+ raise_hrtimer_softirq(); > } > > __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); >diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c >index be77b021e5d63..547acecec6de6 100644 >--- a/kernel/time/tick-sched.c >+++ b/kernel/time/tick-sched.c >@@ -796,7 +796,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) > > static inline bool local_timer_softirq_pending(void) > { >- return local_softirq_pending() & BIT(TIMER_SOFTIRQ); >+ return local_pending_timers() & BIT(TIMER_SOFTIRQ); > } > > static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) >diff --git a/kernel/time/timer.c b/kernel/time/timer.c >index 63a8ce7177dd4..b3fbe97d1e342 100644 >--- a/kernel/time/timer.c >+++ b/kernel/time/timer.c >@@ -1470,9 +1470,16 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) > */ > static void timer_sync_wait_running(struct timer_base *base) > { >- if (atomic_read(&base->timer_waiters)) { >+ bool need_preempt; >+ >+ need_preempt = task_is_pi_boosted(current); >+ if (need_preempt || atomic_read(&base->timer_waiters)) { > raw_spin_unlock_irq(&base->lock); > spin_unlock(&base->expiry_lock); >+ >+ if (need_preempt) >+ softirq_preempt(); >+ > spin_lock(&base->expiry_lock); > raw_spin_lock_irq(&base->lock); > } >@@ -2054,7 +2061,7 @@ static void run_local_timers(void) > if (time_before(jiffies, base->next_expiry)) > return; > } >- raise_softirq(TIMER_SOFTIRQ); >+ raise_timer_softirq(); > } > > /* >diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c >index a0defe156b571..3b0f2cc1617f2 100644 >--- a/kernel/trace/trace.c >+++ b/kernel/trace/trace.c >@@ -2695,6 +2695,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) > > if (tif_need_resched()) > trace_flags |= TRACE_FLAG_NEED_RESCHED; >+ if (tif_need_resched_lazy()) >+ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; > if (test_preempt_need_resched()) > trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; > return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | >diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c >index 3e7fa44dc2b24..5e120c2404cf5 100644 >--- a/kernel/trace/trace_output.c >+++ b/kernel/trace/trace_output.c >@@ -460,17 +460,29 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) > (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : > (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : > bh_off ? 'b' : >- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : >+ !IS_ENABLED(CONFIG_TRACE_IRQFLAGS_SUPPORT) ? 
'X' : > '.'; > >- switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | >+ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | > TRACE_FLAG_PREEMPT_RESCHED)) { >+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: >+ need_resched = 'B'; >+ break; > case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: > need_resched = 'N'; > break; >+ case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: >+ need_resched = 'L'; >+ break; >+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY: >+ need_resched = 'b'; >+ break; > case TRACE_FLAG_NEED_RESCHED: > need_resched = 'n'; > break; >+ case TRACE_FLAG_NEED_RESCHED_LAZY: >+ need_resched = 'l'; >+ break; > case TRACE_FLAG_PREEMPT_RESCHED: > need_resched = 'p'; > break; >diff --git a/localversion-rt b/localversion-rt >new file mode 100644 >index 0000000000000..8fc605d806670 >--- /dev/null >+++ b/localversion-rt >@@ -0,0 +1 @@ >+-rt6 >diff --git a/net/core/dev.c b/net/core/dev.c >index ad20bebe153fc..e32cb67c5bf32 100644 >--- a/net/core/dev.c >+++ b/net/core/dev.c >@@ -4682,15 +4682,6 @@ static void rps_trigger_softirq(void *data) > > #endif /* CONFIG_RPS */ > >-/* Called from hardirq (IPI) context */ >-static void trigger_rx_softirq(void *data) >-{ >- struct softnet_data *sd = data; >- >- __raise_softirq_irqoff(NET_RX_SOFTIRQ); >- smp_store_release(&sd->defer_ipi_scheduled, 0); >-} >- > /* > * After we queued a packet into sd->input_pkt_queue, > * we need to make sure this queue is serviced soon. >@@ -6661,6 +6652,32 @@ static void skb_defer_free_flush(struct softnet_data *sd) > } > } > >+#ifndef CONFIG_PREEMPT_RT >+ >+/* Called from hardirq (IPI) context */ >+static void trigger_rx_softirq(void *data) >+{ >+ struct softnet_data *sd = data; >+ >+ __raise_softirq_irqoff(NET_RX_SOFTIRQ); >+ smp_store_release(&sd->defer_ipi_scheduled, 0); >+} >+ >+#else >+ >+static void trigger_rx_softirq(struct work_struct *defer_work) >+{ >+ struct softnet_data *sd; >+ >+ sd = container_of(defer_work, struct softnet_data, defer_work); >+ smp_store_release(&sd->defer_ipi_scheduled, 0); >+ local_bh_disable(); >+ skb_defer_free_flush(sd); >+ local_bh_enable(); >+} >+ >+#endif >+ > static int napi_threaded_poll(void *data) > { > struct napi_struct *napi = data; >@@ -11624,7 +11641,11 @@ static int __init net_dev_init(void) > INIT_CSD(&sd->csd, rps_trigger_softirq, sd); > sd->cpu = i; > #endif >+#ifndef CONFIG_PREEMPT_RT > INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); >+#else >+ INIT_WORK(&sd->defer_work, trigger_rx_softirq); >+#endif > spin_lock_init(&sd->defer_lock); > > init_gro_hash(&sd->backlog); >diff --git a/net/core/skbuff.c b/net/core/skbuff.c >index 94cc40a6f7975..889ef4c4d9548 100644 >--- a/net/core/skbuff.c >+++ b/net/core/skbuff.c >@@ -6861,8 +6861,13 @@ nodefer: __kfree_skb(skb); > /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU > * if we are unlucky enough (this seems very unlikely). > */ >- if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) >+ if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { >+#ifndef CONFIG_PREEMPT_RT > smp_call_function_single_async(cpu, &sd->defer_csd); >+#else >+ schedule_work_on(cpu, &sd->defer_work); >+#endif >+ } > } > > static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
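
For readers skimming the diff: the least obvious piece of the printk changes above is probably the 32-bit sequence handling added to printk_ringbuffer.h, where a console stores only the low 32 bits of a record sequence and __ulseq_to_u64seq() folds it back to 64 bits against prb_first_seq(). Below is a minimal user-space sketch of that folding arithmetic only, not part of the patch; fold_seq() and the test values are hypothetical and exist purely for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Recover a full 64-bit sequence number from its low 32 bits, given a
 * 64-bit reference sequence known to be within 2^31 records of the
 * truncated value (the kernel uses prb_first_seq() as the reference).
 */
static uint64_t fold_seq(uint64_t ref_seq, uint32_t ulseq)
{
	/* A signed 32-bit difference handles wraparound in either direction. */
	return ref_seq - (int32_t)((uint32_t)ref_seq - ulseq);
}

int main(void)
{
	/* Hypothetical reference just above a 32-bit wrap. */
	uint64_t ref = 0x100000010ULL;

	/* A record slightly behind the reference, truncated to 32 bits. */
	uint64_t full = 0xfffffff8ULL;
	assert(fold_seq(ref, (uint32_t)full) == full);

	/* And one slightly ahead of the reference. */
	full = 0x100000020ULL;
	assert(fold_seq(ref, (uint32_t)full) == full);

	printf("32-bit sequence folding round-trips correctly\n");
	return 0;
}

The relative-difference trick is also why the in-patch comment can say that a console falling more than 2^31 records behind the ringbuffer is the only case where the expansion would go wrong.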