---
 Documentation/scheduler/sched-BFS.txt     |  347 +
 Documentation/sysctl/kernel.txt           |   26 
 arch/powerpc/platforms/cell/spufs/sched.c |    5 
 drivers/cpufreq/cpufreq.c                 |    7 
 drivers/cpufreq/cpufreq_conservative.c    |    4 
 drivers/cpufreq/cpufreq_ondemand.c        |    8 
 fs/proc/base.c                            |    2 
 include/linux/init_task.h                 |   64 
 include/linux/ioprio.h                    |    2 
 include/linux/jiffies.h                   |    2 
 include/linux/sched.h                     |   88 
 include/linux/sched/rt.h                  |   13 
 include/uapi/linux/sched.h                |    9 
 init/Kconfig                              |   54 
 init/main.c                               |    3 
 kernel/delayacct.c                        |    2 
 kernel/exit.c                             |    2 
 kernel/posix-cpu-timers.c                 |   14 
 kernel/sched/Makefile                     |    8 
 kernel/sched/bfs.c                        | 7423 ++++++++++++++++++++++++++++++
 kernel/stop_machine.c                     |    3 
 kernel/sysctl.c                           |   31 
 kernel/time/Kconfig                       |    2 
 lib/Kconfig.debug                         |    2 
 24 files changed, 8048 insertions(+), 73 deletions(-)
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
Lines 64-74
 static struct timer_list spuloadavg_timer;
 
 /*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO		120
-
-/*
  * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
  * tick for every 10 CPU scheduler ticks.
  */
--- /dev/null
+++ b/Documentation/scheduler/sched-BFS.txt
Line 0
+BFS - The Brain Fuck Scheduler by Con Kolivas.
+
+Goals.
+
+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
+completely do away with the complex designs of the past for the cpu process
+scheduler and instead implement one that is very simple in basic design.
+The main focus of BFS is to achieve excellent desktop interactivity and
+responsiveness without heuristics and tuning knobs that are difficult to
+understand, impossible to model and predict the effect of, and when tuned to
+one workload cause massive detriment to another.
+
+
+Design summary.
+
+BFS is best described as a single runqueue, O(n) lookup, earliest effective
+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
+deadline first) and my previous Staircase Deadline scheduler. Each component
+shall be described in order to understand the significance of, and reasoning for
+it. The codebase when the first stable version was released was approximately
+9000 lines less code than the existing mainline linux kernel scheduler (in
+2.6.31). This does not even take into account the removal of documentation and
+the cgroups code that is not used.
+
+Design reasoning.
+
+The single runqueue refers to the queued but not running processes for the
+entire system, regardless of the number of CPUs. The reason for going back to
+a single runqueue design is that once multiple runqueues are introduced,
+per-CPU or otherwise, there will be complex interactions as each runqueue will
+be responsible for the scheduling latency and fairness of the tasks only on its
+own runqueue, and to achieve fairness and low latency across multiple CPUs, any
+advantage in throughput of having CPU local tasks causes other disadvantages.
+This is due to requiring a very complex balancing system to at best achieve some
+semblance of fairness across CPUs and can only maintain relatively low latency
+for tasks bound to the same CPUs, not across them. To increase said fairness
+and latency across CPUs, the advantage of local runqueue locking, which makes
+for better scalability, is lost due to having to grab multiple locks.
+
+A significant feature of BFS is that all accounting is done purely based on CPU
+used and nowhere is sleep time used in any way to determine entitlement or
+interactivity. Interactivity "estimators" that use some kind of sleep/run
+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
+tasks that aren't interactive as being so. The reason for this is that it is
+close to impossible to determine that when a task is sleeping, whether it is
+doing it voluntarily, as in a userspace application waiting for input in the
+form of a mouse click or otherwise, or involuntarily, because it is waiting for
+another thread, process, I/O, kernel activity or whatever. Thus, such an
+estimator will introduce corner cases, and more heuristics will be required to
+cope with those corner cases, introducing more corner cases and failed
+interactivity detection and so on. Interactivity in BFS is built into the design
+by virtue of the fact that tasks that are waking up have not used up their quota
+of CPU time, and have earlier effective deadlines, thereby making it very likely
+they will preempt any CPU bound task of equivalent nice level. See below for
+more information on the virtual deadline mechanism. Even if they do not preempt
+a running task, because the rr interval is guaranteed to have a bound upper
+limit on how long a task will wait for, it will be scheduled within a timeframe
+that will not cause visible interface jitter.
+
+
+Design details.
+
+Task insertion.
+
+BFS inserts tasks into each relevant queue as an O(1) insertion into a double
+linked list. On insertion, *every* running queue is checked to see if the newly
+queued task can run on any idle queue, or preempt the lowest running task on the
+system. This is how the cross-CPU scheduling of BFS achieves significantly lower
+latency per extra CPU the system has. In this case the lookup is, in the worst
+case scenario, O(n) where n is the number of CPUs on the system.
+
+Data protection.
+
+BFS has one single lock protecting the process local data of every task in the
+global queue. Thus every insertion, removal and modification of task data in the
+global runqueue needs to grab the global lock. However, once a task is taken by
+a CPU, the CPU has its own local data copy of the running process' accounting
+information which only that CPU accesses and modifies (such as during a
+timer tick) thus allowing the accounting data to be updated lockless. Once a
+CPU has taken a task to run, it removes it from the global queue. Thus the
+global queue only ever has, at most,
+
+	(number of tasks requesting cpu time) - (number of logical CPUs) + 1
+
+tasks in the global queue. This value is relevant for the time taken to look up
+tasks during scheduling. This will increase if many tasks with CPU affinity set
+in their policy to limit which CPUs they're allowed to run on if they outnumber
+the number of CPUs. The +1 is because when rescheduling a task, the CPU's
+currently running task is put back on the queue. Lookup will be described after
+the virtual deadline mechanism is explained.
+
+Virtual deadline.
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in BFS is entirely in the virtual deadline mechanism. The one
+tunable in BFS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in jiffies by this equation:
+
+	jiffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases. Once a task is descheduled, it is put back on the queue, and an
+O(n) lookup of all queued-but-not-running tasks is done to determine which has
+the earliest deadline and that task is chosen to receive CPU next.
+
+The CPU proportion of different nice tasks works out to be approximately the
+
+	(prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (jiffies) is
+constantly moving.
+
+Task lookup.
+
+BFS has 103 priority queues. 100 of these are dedicated to the static priority
+of realtime tasks, and the remaining 3 are, in order of best to worst priority,
+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
+scheduling). When a task of these priorities is queued, a bitmap of running
+priorities is set showing which of these priorities has tasks waiting for CPU
+time. When a CPU is made to reschedule, the lookup for the next task to get
+CPU time is performed in the following way:
+
+First the bitmap is checked to see what static priority tasks are queued. If
+any realtime priorities are found, the corresponding queue is checked and the
+first task listed there is taken (provided CPU affinity is suitable) and lookup
+is complete. If the priority corresponds to a SCHED_ISO task, they are also
+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
+stage, every task in the runlist that corresponds to that priority is checked
+to see which has the earliest set deadline, and (provided it has suitable CPU
+affinity) it is taken off the runqueue and given the CPU. If a task has an
+expired deadline, it is taken and the rest of the lookup aborted (as they are
+chosen in FIFO order).
+
+Thus, the lookup is O(n) in the worst case only, where n is as described
+earlier, as tasks may be chosen before the whole task list is looked over.
+
+
+Scalability.
+
+The major limitations of BFS will be that of scalability, as the separate
+runqueue designs will have less lock contention as the number of CPUs rises.
+However they do not scale linearly even with separate runqueues as multiple
+runqueues will need to be locked concurrently on such designs to be able to
+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
+across CPUs, and to achieve low enough latency for tasks on a busy CPU when
+other CPUs would be more suited. BFS has the advantage that it requires no
+balancing algorithm whatsoever, as balancing occurs by proxy simply because
+all CPUs draw off the global runqueue, in priority and deadline order. Despite
+the fact that scalability is _not_ the prime concern of BFS, it both shows very
+good scalability to smaller numbers of CPUs and is likely a more scalable design
+at these numbers of CPUs.
+
+It also has some very low overhead scalability features built into the design
+when it has been deemed their overhead is so marginal that they're worth adding.
+The first is the local copy of the running process' data to the CPU it's running
+on to allow that data to be updated lockless where possible. Then there is
+deference paid to the last CPU a task was running on, by trying that CPU first
+when looking for an idle CPU to use the next time it's scheduled. Finally there
+is the notion of "sticky" tasks that are flagged when they are involuntarily
+descheduled, meaning they still want further CPU time. This sticky flag is
+used to bias heavily against those tasks being scheduled on a different CPU
+unless that CPU would be otherwise idle. When a cpu frequency governor is used
+that scales with CPU load, such as ondemand, sticky tasks are not scheduled
+on a different CPU at all, preferring instead to go idle. This means the CPU
+they were bound to is more likely to increase its speed while the other CPU
+will go idle, thus speeding up total task execution time and likely decreasing
+power usage. This is the only scenario where BFS will allow a CPU to go idle
+in preference to scheduling a task on the earliest available spare CPU.
+
+The real cost of migrating a task from one CPU to another is entirely dependant
+on the cache footprint of the task, how cache intensive the task is, how long
+it's been running on that CPU to take up the bulk of its cache, how big the CPU
+cache is, how fast and how layered the CPU cache is, how fast a context switch
+is... and so on. In other words, it's close to random in the real world where we
+do more than just one sole workload. The only thing we can be sure of is that
+it's not free. So BFS uses the principle that an idle CPU is a wasted CPU and
+utilising idle CPUs is more important than cache locality, and cache locality
+only plays a part after that.
+
+When choosing an idle CPU for a waking task, the cache locality is determined
+according to where the task last ran and then idle CPUs are ranked from best
+to worst to choose the most suitable idle CPU based on cache locality, NUMA
+node locality and hyperthread sibling business. They are chosen in the
+following preference (if idle):
+
+* Same core, idle or busy cache, idle threads
+* Other core, same cache, idle or busy cache, idle threads.
+* Same node, other CPU, idle cache, idle threads.
+* Same node, other CPU, busy cache, idle threads.
+* Same core, busy threads.
+* Other core, same cache, busy threads.
+* Same node, other CPU, busy threads.
+* Other node, other CPU, idle cache, idle threads.
+* Other node, other CPU, busy cache, idle threads.
+* Other node, other CPU, busy threads.
+
+This shows the SMT or "hyperthread" awareness in the design as well which will
+choose a real idle core first before a logical SMT sibling which already has
+tasks on the physical CPU.
+
+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
+However this benchmarking was performed on an earlier design that was far less
+scalable than the current one so it's hard to know how scalable it is in terms
+of both CPUs (due to the global runqueue) and heavily loaded machines (due to
+O(n) lookup) at this stage. Note that in terms of scalability, the number of
+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
+results are very promising indeed, without needing to tweak any knobs, features
+or options. Benchmark contributions are most welcome.
+
+
+Features
+
+As the initial prime target audience for BFS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
+support for CGROUPS. The average user should neither need to know what these
+are, nor should they need to be using them to have good desktop behaviour.
+
+rr_interval
+
+There is only one "scheduler" tunable, the round robin interval. This can be
+accessed in
+
+	/proc/sys/kernel/rr_interval
+
+The value is in milliseconds, and the default value is set to 6ms. Valid values
+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
+decreasing throughput, while increasing it will improve throughput, but at the
+cost of worsening latencies. The accuracy of the rr interval is limited by HZ
+resolution of the kernel configuration. Thus, the worst case latencies are
+usually slightly higher than this actual value. BFS uses "dithering" to try and
+minimise the effect the Hz limitation has. The default value of 6 is not an
+arbitrary one. It is based on the fact that humans can detect jitter at
+approximately 7ms, so aiming for much lower latencies is pointless under most
+circumstances. It is worth noting this fact when comparing the latency
+performance of BFS to other schedulers. Worst case latencies being higher than
+7ms are far worse than average latencies not being in the microsecond range.
+Experimentation has shown that rr intervals being increased up to 300 can
+improve throughput but beyond that, scheduling noise from elsewhere prevents
+further demonstrable throughput.
+
+Isochronous scheduling.
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+	schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of _total CPU_ available across the machine, configurable
+as a percentage in the following "resource handling" tunable (as opposed to a
+scheduler tunable):
+
+	/proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running as realtime scheduling indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
+
+A feature of BFS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+Because some applications constantly set their policy as well as their nice
+level, there is potential for them to undo the override specified by the user
+on the command line of setting the policy to SCHED_ISO. To counter this, once
+a task has been set to SCHED_ISO policy, it needs superuser privileges to set
+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
+processes and threads will also inherit the ISO policy.
+
+Idleprio scheduling.
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start
+a video encode or so on without any slowdown of other tasks. To avoid this
+policy from grabbing shared resources and holding them indefinitely, if it
+detects a state where the task is waiting on I/O, the machine is about to
+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
+per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
+be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+	schedtool -D -e ./mprime
+
+Subtick accounting.
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the
+timer tick frequency (HZ) is lowered. It is possible to create an application
+which uses almost 100% CPU, yet by being descheduled at the right time, records
+zero CPU usage. While the main problem with this is that there are possible
+security implications, it is also difficult to determine how much CPU a task
+really does use. BFS tries to use the sub-tick accounting from the TSC clock,
+where possible, to determine real CPU usage. This is not entirely reliable, but
+is far more likely to produce accurate CPU usage data than the existing designs
+and will not show tasks as consuming no CPU usage when they actually are. Thus,
+the amount of CPU reported as being used by BFS will more accurately represent
+how much CPU the task itself is using (as is shown for example by the 'time'
+application), so the reported values may be quite different to other schedulers.
+Values reported as the 'load' are more prone to problems with this design, but
+per process values are closer to real usage. When comparing throughput of BFS
+to other designs, it is important to compare the actual completed work in terms
+of total wall clock time taken and total work done, rather than the reported
+"cpu usage".
+
+
+Con Kolivas <kernel@kolivas.org> Tue, 5 Apr 2011
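
To make the deadline mechanism documented above concrete, here is a minimal C sketch of the bookkeeping it describes. It is an illustration only: the names (toy_task, toy_refill, toy_pick_next) are hypothetical rather than taken from kernel/sched/bfs.c, the quota is kept in milliseconds for simplicity, and CPU affinity, the priority bitmap and the global-runqueue lock are omitted.

/*
 * Illustrative sketch: models the deadline arithmetic and the O(n)
 * earliest-effective-deadline scan described in sched-BFS.txt.
 */
#include <stddef.h>

#define TOY_RR_INTERVAL 6	/* ms, mirrors the rr_interval default */

struct toy_task {
	unsigned long deadline;		/* jiffies-based virtual deadline */
	int time_slice;			/* remaining quota, in ms */
	int prio_ratio;			/* grows ~10% per nice level above -20 */
	struct toy_task *next;		/* single global run list */
};

/* Refill the quota and set a new virtual deadline:
 * deadline = jiffies + prio_ratio * rr_interval. */
static void toy_refill(struct toy_task *p, unsigned long jiffies_now)
{
	p->time_slice = TOY_RR_INTERVAL;
	p->deadline = jiffies_now +
		(unsigned long)p->prio_ratio * TOY_RR_INTERVAL;
}

/* Scan every queued-but-not-running task and take the earliest deadline;
 * a task whose deadline has already passed is taken immediately, which
 * gives FIFO behaviour among expired tasks. */
static struct toy_task *toy_pick_next(struct toy_task *head,
				      unsigned long jiffies_now)
{
	struct toy_task *best = NULL, *p;

	for (p = head; p; p = p->next) {
		if ((long)(jiffies_now - p->deadline) >= 0)
			return p;	/* expired: stop the scan */
		if (!best || (long)(p->deadline - best->deadline) < 0)
			best = p;
	}
	return best;
}

A task that merely sleeps keeps both fields untouched, so on wakeup its older (and therefore earlier) deadline is what lets it preempt CPU-bound tasks of the same nice level.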
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
Lines 33-38
 - domainname
 - hostname
 - hotplug
+- iso_cpu
 - kptr_restrict
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
Lines 60-65
 - randomize_va_space
 - real-root-dev               ==> Documentation/initrd.txt
 - reboot-cmd                  [ SPARC only ]
+- rr_interval
 - rtsig-max
 - rtsig-nr
 - sem
Lines 306-311
 
 ==============================================================
 
+iso_cpu: (BFS CPU scheduler only).
+
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling five
+seconds over the -whole- system, meaning all cpus.
+
+Set to 70 (percent) by default.
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
Lines 538-543
 
 ==============================================================
 
+rr_interval: (BFS CPU scheduler only)
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. Conversely decreasing it will decrease average and maximum
+latencies but at the expense of throughput. This value is in
+milliseconds and the default value chosen depends on the number of
+cpus available at scheduler initialisation with a minimum of 6.
+
+Valid values are from 1-1000.
+
+==============================================================
+
 rtsig-max & rtsig-nr:
 
 The file rtsig-max can be used to tune the maximum number
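
Both sysctls documented above appear as plain /proc files, so they can be read (and, with root, written) from userspace. A minimal sketch, assuming a kernel built with CONFIG_SCHED_BFS; on any other kernel the files simply do not exist and the reads fail:

#include <stdio.h>

static int read_tunable(const char *path)
{
	FILE *f = fopen(path, "r");
	int val = -1;

	if (f) {
		if (fscanf(f, "%d", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	/* rr_interval: 1..1000 ms (default 6); iso_cpu: 0..100 % (default 70) */
	printf("rr_interval = %d\n", read_tunable("/proc/sys/kernel/rr_interval"));
	printf("iso_cpu     = %d\n", read_tunable("/proc/sys/kernel/iso_cpu"));
	return 0;
}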
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
Lines 339-345
 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 {
 	return sprintf(buffer, "%llu %llu %lu\n",
-			(unsigned long long)task->se.sum_exec_runtime,
+			(unsigned long long)tsk_seruntime(task),
 			(unsigned long long)task->sched_info.run_delay,
 			task->sched_info.pcount);
 }
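
This is one of several call sites converted from task->se.sum_exec_runtime to the tsk_seruntime() helper; the helper itself is introduced in the include/linux/sched.h hunk further down. Reduced to its essentials it is a config-dependent accessor, roughly:

/* Simplified restatement of the helper added in include/linux/sched.h:
 * BFS accounts runtime in p->sched_time, CFS in p->se.sum_exec_runtime. */
#ifdef CONFIG_SCHED_BFS
#define tsk_seruntime(t)	((t)->sched_time)
#else
#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)
#endif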
-- a/include/linux/init_task.h
346
++ b/include/linux/init_task.h
Lines 152-163 Link Here
152
# define INIT_VTIME(tsk)
152
# define INIT_VTIME(tsk)
153
#endif
153
#endif
154
154
155
#define INIT_TASK_COMM "swapper"
156
157
/*
155
/*
158
 *  INIT_TASK is used to set up the first task table, touch at
156
 *  INIT_TASK is used to set up the first task table, touch at
159
 * your own risk!. Base=0, limit=0x1fffff (=2MB)
157
 * your own risk!. Base=0, limit=0x1fffff (=2MB)
160
 */
158
 */
159
#ifdef CONFIG_SCHED_BFS
160
#define INIT_TASK_COMM "BFS"
161
#define INIT_TASK(tsk)	\
162
{									\
163
	.state		= 0,						\
164
	.stack		= &init_thread_info,				\
165
	.usage		= ATOMIC_INIT(2),				\
166
	.flags		= PF_KTHREAD,					\
167
	.prio		= NORMAL_PRIO,					\
168
	.static_prio	= MAX_PRIO-20,					\
169
	.normal_prio	= NORMAL_PRIO,					\
170
	.deadline	= 0,						\
171
	.policy		= SCHED_NORMAL,					\
172
	.cpus_allowed	= CPU_MASK_ALL,					\
173
	.mm		= NULL,						\
174
	.active_mm	= &init_mm,					\
175
	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
176
	.time_slice	= HZ,					\
177
	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
178
	INIT_PUSHABLE_TASKS(tsk)					\
179
	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
180
	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
181
	.real_parent	= &tsk,						\
182
	.parent		= &tsk,						\
183
	.children	= LIST_HEAD_INIT(tsk.children),			\
184
	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
185
	.group_leader	= &tsk,						\
186
	RCU_POINTER_INITIALIZER(real_cred, &init_cred),			\
187
	RCU_POINTER_INITIALIZER(cred, &init_cred),			\
188
	.comm		= INIT_TASK_COMM,				\
189
	.thread		= INIT_THREAD,					\
190
	.fs		= &init_fs,					\
191
	.files		= &init_files,					\
192
	.signal		= &init_signals,				\
193
	.sighand	= &init_sighand,				\
194
	.nsproxy	= &init_nsproxy,				\
195
	.pending	= {						\
196
		.list = LIST_HEAD_INIT(tsk.pending.list),		\
197
		.signal = {{0}}},					\
198
	.blocked	= {{0}},					\
199
	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
200
	.journal_info	= NULL,						\
201
	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
202
	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
203
	.timer_slack_ns = 50000, /* 50 usec default slack */		\
204
	.pids = {							\
205
		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
206
		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
207
		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
208
	},								\
209
	INIT_IDS							\
210
	INIT_PERF_EVENTS(tsk)						\
211
	INIT_TRACE_IRQFLAGS						\
212
	INIT_LOCKDEP							\
213
	INIT_FTRACE_GRAPH						\
214
	INIT_TRACE_RECURSION						\
215
	INIT_TASK_RCU_PREEMPT(tsk)					\
216
}
217
#else /* CONFIG_SCHED_BFS */
218
#define INIT_TASK_COMM "swapper"
161
#define INIT_TASK(tsk)	\
219
#define INIT_TASK(tsk)	\
162
{									\
220
{									\
163
	.state		= 0,						\
221
	.state		= 0,						\
Lines 223-229 Link Here
223
	INIT_CPUSET_SEQ							\
281
	INIT_CPUSET_SEQ							\
224
	INIT_VTIME(tsk)							\
282
	INIT_VTIME(tsk)							\
225
}
283
}
226
284
#endif /* CONFIG_SCHED_BFS */
227
285
228
#define INIT_CPU_TIMERS(cpu_timers)					\
286
#define INIT_CPU_TIMERS(cpu_timers)					\
229
{									\
287
{									\
230
-- a/include/linux/ioprio.h
288
++ b/include/linux/ioprio.h
Lines 52-57 Link Here
52
 */
52
 */
53
static inline int task_nice_ioprio(struct task_struct *task)
53
static inline int task_nice_ioprio(struct task_struct *task)
54
{
54
{
55
	if (iso_task(task))
56
		return 0;
55
	return (task_nice(task) + 20) / 5;
57
	return (task_nice(task) + 20) / 5;
56
}
58
}
57
59
58
-- a/include/linux/sched.h
60
++ b/include/linux/sched.h
Lines 229-236 Link Here
229
extern void init_idle(struct task_struct *idle, int cpu);
229
extern void init_idle(struct task_struct *idle, int cpu);
230
extern void init_idle_bootup_task(struct task_struct *idle);
230
extern void init_idle_bootup_task(struct task_struct *idle);
231
231
232
extern int runqueue_is_locked(int cpu);
233
234
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
232
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
235
extern void nohz_balance_enter_idle(int cpu);
233
extern void nohz_balance_enter_idle(int cpu);
236
extern void set_cpu_sd_state_idle(void);
234
extern void set_cpu_sd_state_idle(void);
Lines 1040-1057 Link Here
1040
1038
1041
#ifdef CONFIG_SMP
1039
#ifdef CONFIG_SMP
1042
	struct llist_node wake_entry;
1040
	struct llist_node wake_entry;
1043
	int on_cpu;
1044
#endif
1041
#endif
1045
	int on_rq;
1042
#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BFS)
1043
	bool on_cpu;
1044
#endif
1045
#ifndef CONFIG_SCHED_BFS
1046
	bool on_rq;
1047
#endif
1046
1048
1047
	int prio, static_prio, normal_prio;
1049
	int prio, static_prio, normal_prio;
1048
	unsigned int rt_priority;
1050
	unsigned int rt_priority;
1051
#ifdef CONFIG_SCHED_BFS
1052
	int time_slice;
1053
	u64 deadline;
1054
	struct list_head run_list;
1055
	u64 last_ran;
1056
	u64 sched_time; /* sched_clock time spent running */
1057
#ifdef CONFIG_SMP
1058
	bool sticky; /* Soft affined flag */
1059
#endif
1060
	unsigned long rt_timeout;
1061
#else /* CONFIG_SCHED_BFS */
1049
	const struct sched_class *sched_class;
1062
	const struct sched_class *sched_class;
1050
	struct sched_entity se;
1063
	struct sched_entity se;
1051
	struct sched_rt_entity rt;
1064
	struct sched_rt_entity rt;
1065
1052
#ifdef CONFIG_CGROUP_SCHED
1066
#ifdef CONFIG_CGROUP_SCHED
1053
	struct task_group *sched_task_group;
1067
	struct task_group *sched_task_group;
1054
#endif
1068
#endif
1069
#endif
1055
1070
1056
#ifdef CONFIG_PREEMPT_NOTIFIERS
1071
#ifdef CONFIG_PREEMPT_NOTIFIERS
1057
	/* list of struct preempt_notifier: */
1072
	/* list of struct preempt_notifier: */
Lines 1162-1167 Link Here
1162
	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
1177
	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
1163
1178
1164
	cputime_t utime, stime, utimescaled, stimescaled;
1179
	cputime_t utime, stime, utimescaled, stimescaled;
1180
#ifdef CONFIG_SCHED_BFS
1181
	unsigned long utime_pc, stime_pc;
1182
#endif
1165
	cputime_t gtime;
1183
	cputime_t gtime;
1166
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1184
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1167
	struct cputime prev_cputime;
1185
	struct cputime prev_cputime;
Lines 1418-1423 Link Here
1418
#endif
1436
#endif
1419
};
1437
};
1420
1438
1439
#ifdef CONFIG_SCHED_BFS
1440
bool grunqueue_is_locked(void);
1441
void grq_unlock_wait(void);
1442
void cpu_scaling(int cpu);
1443
void cpu_nonscaling(int cpu);
1444
bool above_background_load(void);
1445
#define tsk_seruntime(t)		((t)->sched_time)
1446
#define tsk_rttimeout(t)		((t)->rt_timeout)
1447
1448
static inline void tsk_cpus_current(struct task_struct *p)
1449
{
1450
}
1451
1452
static inline int runqueue_is_locked(int cpu)
1453
{
1454
	return grunqueue_is_locked();
1455
}
1456
1457
void print_scheduler_version(void);
1458
1459
static inline bool iso_task(struct task_struct *p)
1460
{
1461
	return (p->policy == SCHED_ISO);
1462
}
1463
#else /* CFS */
1464
extern int runqueue_is_locked(int cpu);
1465
static inline void cpu_scaling(int cpu)
1466
{
1467
}
1468
1469
static inline void cpu_nonscaling(int cpu)
1470
{
1471
}
1472
#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)
1473
#define tsk_rttimeout(t)	((t)->rt.timeout)
1474
1475
static inline void tsk_cpus_current(struct task_struct *p)
1476
{
1477
	p->nr_cpus_allowed = current->nr_cpus_allowed;
1478
}
1479
1480
static inline void print_scheduler_version(void)
1481
{
1482
	printk(KERN_INFO"CFS CPU scheduler.\n");
1483
}
1484
1485
static inline bool iso_task(struct task_struct *p)
1486
{
1487
	return false;
1488
}
1489
1490
/* Anyone feel like implementing this? */
1491
static inline bool above_background_load(void)
1492
{
1493
	return false;
1494
}
1495
#endif /* CONFIG_SCHED_BFS */
1496
1421
/* Future-safe accessor for struct task_struct's cpus_allowed. */
1497
/* Future-safe accessor for struct task_struct's cpus_allowed. */
1422
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
1498
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
1423
1499
Lines 1844-1850 Link Here
1844
task_sched_runtime(struct task_struct *task);
1920
task_sched_runtime(struct task_struct *task);
1845
1921
1846
/* sched_exec is called by processes performing an exec */
1922
/* sched_exec is called by processes performing an exec */
1847
#ifdef CONFIG_SMP
1923
#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BFS)
1848
extern void sched_exec(void);
1924
extern void sched_exec(void);
1849
#else
1925
#else
1850
#define sched_exec()   {}
1926
#define sched_exec()   {}
Lines 2549-2555 Link Here
2549
	return 0;
2625
	return 0;
2550
}
2626
}
2551
2627
2552
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
2628
static inline void set_task_cpu(struct task_struct *p, int cpu)
2553
{
2629
{
2554
}
2630
}
2555
2631
2556
-- a/init/Kconfig
2632
++ b/init/Kconfig
Lines 28-33 Link Here
28
28
29
menu "General setup"
29
menu "General setup"
30
30
31
config SCHED_BFS
32
	bool "BFS cpu scheduler"
33
	---help---
34
	  The Brain Fuck CPU Scheduler for excellent interactivity and
35
	  responsiveness on the desktop and solid scalability on normal
36
          hardware and commodity servers. Not recommended for 4096 CPUs.
37
38
	  Currently incompatible with the Group CPU scheduler, and RCU TORTURE
39
          TEST so these options are disabled.
40
41
          Say Y here.
42
	default y
43
44
31
config BROKEN
45
config BROKEN
32
	bool
46
	bool
33
47
Lines 302-308 Link Here
302
# Kind of a stub config for the pure tick based cputime accounting
316
# Kind of a stub config for the pure tick based cputime accounting
303
config TICK_CPU_ACCOUNTING
317
config TICK_CPU_ACCOUNTING
304
	bool "Simple tick based cputime accounting"
318
	bool "Simple tick based cputime accounting"
305
	depends on !S390 && !NO_HZ_FULL
319
	depends on !S390 && !NO_HZ_FULL && !SCHED_BFS
306
	help
320
	help
307
	  This is the basic tick based cputime accounting that maintains
321
	  This is the basic tick based cputime accounting that maintains
308
	  statistics about user, system and idle time spent on per jiffies
322
	  statistics about user, system and idle time spent on per jiffies
Lines 325-331 Link Here
325
339
326
config VIRT_CPU_ACCOUNTING_GEN
340
config VIRT_CPU_ACCOUNTING_GEN
327
	bool "Full dynticks CPU time accounting"
341
	bool "Full dynticks CPU time accounting"
328
	depends on HAVE_CONTEXT_TRACKING && 64BIT
342
	depends on HAVE_CONTEXT_TRACKING && 64BIT && !SCHED_BFS
329
	select VIRT_CPU_ACCOUNTING
343
	select VIRT_CPU_ACCOUNTING
330
	select CONTEXT_TRACKING
344
	select CONTEXT_TRACKING
331
	help
345
	help
Lines 488-494 Link Here
488
502
489
config RCU_USER_QS
503
config RCU_USER_QS
490
	bool "Consider userspace as in RCU extended quiescent state"
504
	bool "Consider userspace as in RCU extended quiescent state"
491
	depends on HAVE_CONTEXT_TRACKING && SMP
505
	depends on HAVE_CONTEXT_TRACKING && SMP && !SCHED_BFS
492
	select CONTEXT_TRACKING
506
	select CONTEXT_TRACKING
493
	help
507
	help
494
	  This option sets hooks on kernel / userspace boundaries and
508
	  This option sets hooks on kernel / userspace boundaries and
Lines 657-663 Link Here
657
671
658
config RCU_NOCB_CPU
672
config RCU_NOCB_CPU
659
	bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL"
673
	bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL"
660
	depends on TREE_RCU || TREE_PREEMPT_RCU
674
	depends on (TREE_RCU || TREE_PREEMPT_RCU) && !SCHED_BFS
661
	default n
675
	default n
662
	help
676
	help
663
	  Use this option to reduce OS jitter for aggressive HPC or
677
	  Use this option to reduce OS jitter for aggressive HPC or
Lines 795-800 Link Here
795
	depends on ARCH_SUPPORTS_NUMA_BALANCING
809
	depends on ARCH_SUPPORTS_NUMA_BALANCING
796
	depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
810
	depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
797
	depends on SMP && NUMA && MIGRATION
811
	depends on SMP && NUMA && MIGRATION
812
	depends on !SCHED_BFS
798
	help
813
	help
799
	  This option adds support for automatic NUMA aware memory/task placement.
814
	  This option adds support for automatic NUMA aware memory/task placement.
800
	  The mechanism is quite primitive and is based on migrating memory when
815
	  The mechanism is quite primitive and is based on migrating memory when
Lines 857-862 Link Here
857
872
858
config CGROUP_CPUACCT
873
config CGROUP_CPUACCT
859
	bool "Simple CPU accounting cgroup subsystem"
874
	bool "Simple CPU accounting cgroup subsystem"
875
	depends on !SCHED_BFS
860
	help
876
	help
861
	  Provides a simple Resource Controller for monitoring the
877
	  Provides a simple Resource Controller for monitoring the
862
	  total CPU consumed by the tasks in a cgroup.
878
	  total CPU consumed by the tasks in a cgroup.
Lines 959-964 Link Here
959
975
960
menuconfig CGROUP_SCHED
976
menuconfig CGROUP_SCHED
961
	bool "Group CPU scheduler"
977
	bool "Group CPU scheduler"
978
	depends on !SCHED_BFS
962
	default n
979
	default n
963
	help
980
	help
964
	  This feature lets CPU scheduler recognize task groups and control CPU
981
	  This feature lets CPU scheduler recognize task groups and control CPU
Lines 1123-1128 Link Here
1123
1140
1124
config SCHED_AUTOGROUP
1141
config SCHED_AUTOGROUP
1125
	bool "Automatic process group scheduling"
1142
	bool "Automatic process group scheduling"
1143
	depends on !SCHED_BFS
1126
	select EVENTFD
1144
	select EVENTFD
1127
	select CGROUPS
1145
	select CGROUPS
1128
	select CGROUP_SCHED
1146
	select CGROUP_SCHED
Lines 1526-1563 Link Here
1526
1544
1527
	  On non-ancient distros (post-2000 ones) N is usually a safe choice.
1545
	  On non-ancient distros (post-2000 ones) N is usually a safe choice.
1528
1546
1529
choice
1530
	prompt "Choose SLAB allocator"
1531
	default SLUB
1532
	help
1533
	   This option allows to select a slab allocator.
1534
1535
config SLAB
1536
	bool "SLAB"
1537
	help
1538
	  The regular slab allocator that is established and known to work
1539
	  well in all environments. It organizes cache hot objects in
1540
	  per cpu and per node queues.
1541
1542
config SLUB
1547
config SLUB
1543
	bool "SLUB (Unqueued Allocator)"
1548
	def_bool y
1544
	help
1545
	   SLUB is a slab allocator that minimizes cache line usage
1546
	   instead of managing queues of cached objects (SLAB approach).
1547
	   Per cpu caching is realized using slabs of objects instead
1548
	   of queues of objects. SLUB can use memory efficiently
1549
	   and has enhanced diagnostics. SLUB is the default choice for
1550
	   a slab allocator.
1551
1552
config SLOB
1553
	depends on EXPERT
1554
	bool "SLOB (Simple Allocator)"
1555
	help
1556
	   SLOB replaces the stock allocator with a drastically simpler
1557
	   allocator. SLOB is generally more space efficient but
1558
	   does not perform as well on large systems.
1559
1560
endchoice
1561
1549
1562
config MMAP_ALLOW_UNINITIALIZED
1550
config MMAP_ALLOW_UNINITIALIZED
1563
	bool "Allow mmapped anonymous memory to be uninitialized"
1551
	bool "Allow mmapped anonymous memory to be uninitialized"
1564
-- a/init/main.c
1552
++ b/init/main.c
Lines 700-706 Link Here
700
	return ret;
700
	return ret;
701
}
701
}
702
702
703
704
extern initcall_t __initcall_start[];
703
extern initcall_t __initcall_start[];
705
extern initcall_t __initcall0_start[];
704
extern initcall_t __initcall0_start[];
706
extern initcall_t __initcall1_start[];
705
extern initcall_t __initcall1_start[];
Lines 820-825 Link Here
820
819
821
	flush_delayed_fput();
820
	flush_delayed_fput();
822
821
822
	print_scheduler_version();
823
823
	if (ramdisk_execute_command) {
824
	if (ramdisk_execute_command) {
824
		if (!run_init_process(ramdisk_execute_command))
825
		if (!run_init_process(ramdisk_execute_command))
825
			return 0;
826
			return 0;
826
-- a/kernel/delayacct.c
827
++ b/kernel/delayacct.c
Lines 133-139 Link Here
133
	 */
133
	 */
134
	t1 = tsk->sched_info.pcount;
134
	t1 = tsk->sched_info.pcount;
135
	t2 = tsk->sched_info.run_delay;
135
	t2 = tsk->sched_info.run_delay;
136
	t3 = tsk->se.sum_exec_runtime;
136
	t3 = tsk_seruntime(tsk);
137
137
138
	d->cpu_count += t1;
138
	d->cpu_count += t1;
139
139
140
-- a/kernel/exit.c
140
++ b/kernel/exit.c
Lines 135-141 Link Here
135
		sig->inblock += task_io_get_inblock(tsk);
135
		sig->inblock += task_io_get_inblock(tsk);
136
		sig->oublock += task_io_get_oublock(tsk);
136
		sig->oublock += task_io_get_oublock(tsk);
137
		task_io_accounting_add(&sig->ioac, &tsk->ioac);
137
		task_io_accounting_add(&sig->ioac, &tsk->ioac);
138
		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
138
		sig->sum_sched_runtime += tsk_seruntime(tsk);
139
	}
139
	}
140
140
141
	sig->nr_threads--;
141
	sig->nr_threads--;
142
-- a/kernel/posix-cpu-timers.c
142
++ b/kernel/posix-cpu-timers.c
Lines 498-508 Link Here
498
{
498
{
499
	cputime_t utime, stime;
499
	cputime_t utime, stime;
500
500
501
	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
501
	add_device_randomness((const void*) &tsk_seruntime(tsk),
502
						sizeof(unsigned long long));
502
						sizeof(unsigned long long));
503
	task_cputime(tsk, &utime, &stime);
503
	task_cputime(tsk, &utime, &stime);
504
	cleanup_timers(tsk->cpu_timers,
504
	cleanup_timers(tsk->cpu_timers,
505
		       utime, stime, tsk->se.sum_exec_runtime);
505
		       utime, stime, tsk_seruntime(tsk));
506
506
507
}
507
}
508
void posix_cpu_timers_exit_group(struct task_struct *tsk)
508
void posix_cpu_timers_exit_group(struct task_struct *tsk)
Lines 513-519 Link Here
513
	task_cputime(tsk, &utime, &stime);
513
	task_cputime(tsk, &utime, &stime);
514
	cleanup_timers(tsk->signal->cpu_timers,
514
	cleanup_timers(tsk->signal->cpu_timers,
515
		       utime + sig->utime, stime + sig->stime,
515
		       utime + sig->utime, stime + sig->stime,
516
		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
516
		       tsk_seruntime(tsk) + sig->sum_sched_runtime);
517
}
517
}
518
518
519
static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
519
static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
Lines 976-982 Link Here
976
		struct cpu_timer_list *t = list_first_entry(timers,
976
		struct cpu_timer_list *t = list_first_entry(timers,
977
						      struct cpu_timer_list,
977
						      struct cpu_timer_list,
978
						      entry);
978
						      entry);
979
		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
979
		if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) {
980
			tsk->cputime_expires.sched_exp = t->expires.sched;
980
			tsk->cputime_expires.sched_exp = t->expires.sched;
981
			break;
981
			break;
982
		}
982
		}
Lines 993-999 Link Here
993
			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
993
			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
994
994
995
		if (hard != RLIM_INFINITY &&
995
		if (hard != RLIM_INFINITY &&
996
		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
996
		    tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
997
			/*
997
			/*
998
			 * At the hard limit, we just die.
998
			 * At the hard limit, we just die.
999
			 * No need to calculate anything else now.
999
			 * No need to calculate anything else now.
Lines 1001-1007 Link Here
1001
			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1001
			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1002
			return;
1002
			return;
1003
		}
1003
		}
1004
		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1004
		if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1005
			/*
1005
			/*
1006
			 * At the soft limit, send a SIGXCPU every second.
1006
			 * At the soft limit, send a SIGXCPU every second.
1007
			 */
1007
			 */
Lines 1282-1288 Link Here
1282
		struct task_cputime task_sample = {
1282
		struct task_cputime task_sample = {
1283
			.utime = utime,
1283
			.utime = utime,
1284
			.stime = stime,
1284
			.stime = stime,
1285
			.sum_exec_runtime = tsk->se.sum_exec_runtime
1285
			.sum_exec_runtime = tsk_seruntime(tsk)
1286
		};
1286
		};
1287
1287
1288
		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1288
		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1289
-- a/kernel/sysctl.c
1289
++ b/kernel/sysctl.c
Lines 128-134 Link Here
128
static int __maybe_unused two = 2;
128
static int __maybe_unused two = 2;
129
static int __maybe_unused three = 3;
129
static int __maybe_unused three = 3;
130
static unsigned long one_ul = 1;
130
static unsigned long one_ul = 1;
131
static int one_hundred = 100;
131
static int __maybe_unused one_hundred = 100;
132
#ifdef CONFIG_SCHED_BFS
133
extern int rr_interval;
134
extern int sched_iso_cpu;
135
static int __read_mostly one_thousand = 1000;
136
#endif
132
#ifdef CONFIG_PRINTK
137
#ifdef CONFIG_PRINTK
133
static int ten_thousand = 10000;
138
static int ten_thousand = 10000;
134
#endif
139
#endif
Lines 256-262 Link Here
256
	{ }
261
	{ }
257
};
262
};
258
263
259
#ifdef CONFIG_SCHED_DEBUG
264
#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
260
static int min_sched_granularity_ns = 100000;		/* 100 usecs */
265
static int min_sched_granularity_ns = 100000;		/* 100 usecs */
261
static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
266
static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
262
static int min_wakeup_granularity_ns;			/* 0 usecs */
267
static int min_wakeup_granularity_ns;			/* 0 usecs */
Lines 273-278 Link Here
273
#endif
278
#endif
274
279
275
static struct ctl_table kern_table[] = {
280
static struct ctl_table kern_table[] = {
281
#ifndef CONFIG_SCHED_BFS
276
	{
282
	{
277
		.procname	= "sched_child_runs_first",
283
		.procname	= "sched_child_runs_first",
278
		.data		= &sysctl_sched_child_runs_first,
284
		.data		= &sysctl_sched_child_runs_first,
Lines 436-441 Link Here
436
		.extra1		= &one,
442
		.extra1		= &one,
437
	},
443
	},
438
#endif
444
#endif
445
#endif /* !CONFIG_SCHED_BFS */
439
#ifdef CONFIG_PROVE_LOCKING
446
#ifdef CONFIG_PROVE_LOCKING
440
	{
447
	{
441
		.procname	= "prove_locking",
448
		.procname	= "prove_locking",
Lines 907-912 Link Here
907
		.proc_handler	= proc_dointvec,
914
		.proc_handler	= proc_dointvec,
908
	},
915
	},
909
#endif
916
#endif
917
#ifdef CONFIG_SCHED_BFS
918
	{
919
		.procname	= "rr_interval",
920
		.data		= &rr_interval,
921
		.maxlen		= sizeof (int),
922
		.mode		= 0644,
923
		.proc_handler	= &proc_dointvec_minmax,
924
		.extra1		= &one,
925
		.extra2		= &one_thousand,
926
	},
927
	{
928
		.procname	= "iso_cpu",
929
		.data		= &sched_iso_cpu,
930
		.maxlen		= sizeof (int),
931
		.mode		= 0644,
932
		.proc_handler	= &proc_dointvec_minmax,
933
		.extra1		= &zero,
934
		.extra2		= &one_hundred,
935
	},
936
#endif
910
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
937
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
911
	{
938
	{
912
		.procname	= "spin_retry",
939
		.procname	= "spin_retry",
913
-- a/lib/Kconfig.debug
940
++ b/lib/Kconfig.debug
Lines 940-946 Link Here
940
940
941
config RCU_TORTURE_TEST
941
config RCU_TORTURE_TEST
942
	tristate "torture tests for RCU"
942
	tristate "torture tests for RCU"
943
	depends on DEBUG_KERNEL
943
	depends on DEBUG_KERNEL && !SCHED_BFS
944
	default n
944
	default n
945
	help
945
	help
946
	  This option provides a kernel module that runs torture tests
946
	  This option provides a kernel module that runs torture tests
947
-- a/include/linux/jiffies.h
947
++ b/include/linux/jiffies.h
Lines 159-165 Link Here
159
 * Have the 32 bit jiffies value wrap 5 minutes after boot
159
 * Have the 32 bit jiffies value wrap 5 minutes after boot
160
 * so jiffies wrap bugs show up earlier.
160
 * so jiffies wrap bugs show up earlier.
161
 */
161
 */
162
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
162
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ))
163
163
164
/*
164
/*
165
 * Change timeval to jiffies, trying to avoid the
165
 * Change timeval to jiffies, trying to avoid the
166
-- a/drivers/cpufreq/cpufreq.c
166
++ b/drivers/cpufreq/cpufreq.c
Lines 30-35 Link Here
30
#include <linux/cpu.h>
30
#include <linux/cpu.h>
31
#include <linux/completion.h>
31
#include <linux/completion.h>
32
#include <linux/mutex.h>
32
#include <linux/mutex.h>
33
#include <linux/sched.h>
33
#include <linux/syscore_ops.h>
34
#include <linux/syscore_ops.h>
34
35
35
#include <trace/events/power.h>
36
#include <trace/events/power.h>
Lines 1474-1479 Link Here
1474
1475
1475
	if (cpufreq_driver->target)
1476
	if (cpufreq_driver->target)
1476
		retval = cpufreq_driver->target(policy, target_freq, relation);
1477
		retval = cpufreq_driver->target(policy, target_freq, relation);
1478
	if (likely(retval != -EINVAL)) {
1479
		if (target_freq == policy->max)
1480
			cpu_nonscaling(policy->cpu);
1481
		else
1482
			cpu_scaling(policy->cpu);
1483
	}
1477
1484
1478
	return retval;
1485
	return retval;
1479
}
1486
}
1480
-- a/drivers/cpufreq/cpufreq_ondemand.c
1487
++ b/drivers/cpufreq/cpufreq_ondemand.c
Lines 29-36 Link Here
29
#include "cpufreq_governor.h"
29
#include "cpufreq_governor.h"
30
30
31
/* On-demand governor macros */
31
/* On-demand governor macros */
32
#define DEF_FREQUENCY_DOWN_DIFFERENTIAL		(10)
32
#define DEF_FREQUENCY_DOWN_DIFFERENTIAL		(26)
33
#define DEF_FREQUENCY_UP_THRESHOLD		(80)
33
#define DEF_FREQUENCY_UP_THRESHOLD		(63)
34
#define DEF_SAMPLING_DOWN_FACTOR		(1)
34
#define DEF_SAMPLING_DOWN_FACTOR		(1)
35
#define MAX_SAMPLING_DOWN_FACTOR		(100000)
35
#define MAX_SAMPLING_DOWN_FACTOR		(100000)
36
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL	(3)
36
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL	(3)
Lines 160-169 Link Here
160
}
160
}
161
161
162
/*
162
/*
163
 * Every sampling_rate, we check, if current idle time is less than 20%
163
 * Every sampling_rate, we check, if current idle time is less than 37%
164
 * (default), then we try to increase frequency. Every sampling_rate, we look
164
 * (default), then we try to increase frequency. Every sampling_rate, we look
165
 * for the lowest frequency which can sustain the load while keeping idle time
165
 * for the lowest frequency which can sustain the load while keeping idle time
166
 * over 30%. If such a frequency exist, we try to decrease to this frequency.
166
 * over 63%. If such a frequency exist, we try to decrease to this frequency.
167
 *
167
 *
168
 * Any frequency increase takes it to the maximum frequency. Frequency reduction
168
 * Any frequency increase takes it to the maximum frequency. Frequency reduction
169
 * happens at minimum steps of 5% (default) of current frequency
169
 * happens at minimum steps of 5% (default) of current frequency
170
-- /dev/null
170
++ b/kernel/sched/bfs.c
Line 0 Link Here
0
-- a/include/uapi/linux/sched.h
1
/*
2
 *  kernel/sched/bfs.c, was kernel/sched.c
3
 *
4
 *  Kernel scheduler and related syscalls
5
 *
6
 *  Copyright (C) 1991-2002  Linus Torvalds
7
 *
8
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9
 *		make semaphores SMP safe
10
 *  1998-11-19	Implemented schedule_timeout() and related stuff
11
 *		by Andrea Arcangeli
12
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
13
 *		hybrid priority-list and round-robin design with
14
 *		an array-switch method of distributing timeslices
15
 *		and per-CPU runqueues.  Cleanups and useful suggestions
16
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
17
 *  2003-09-03	Interactivity tuning by Con Kolivas.
18
 *  2004-04-02	Scheduler domains code by Nick Piggin
19
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
20
 *              fair scheduling design by Con Kolivas.
21
 *  2007-05-05  Load balancing (smp-nice) and other improvements
22
 *              by Peter Williams
23
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
24
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
25
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
26
 *              Thomas Gleixner, Mike Kravetz
27
 *  now		Brainfuck deadline scheduling policy by Con Kolivas deletes
28
 *              a whole lot of those previous things.
29
 */
30
31
#include <linux/mm.h>
32
#include <linux/module.h>
33
#include <linux/nmi.h>
34
#include <linux/init.h>
35
#include <asm/uaccess.h>
36
#include <linux/highmem.h>
37
#include <asm/mmu_context.h>
38
#include <linux/interrupt.h>
39
#include <linux/capability.h>
40
#include <linux/completion.h>
41
#include <linux/kernel_stat.h>
42
#include <linux/debug_locks.h>
43
#include <linux/perf_event.h>
44
#include <linux/security.h>
45
#include <linux/notifier.h>
46
#include <linux/profile.h>
47
#include <linux/freezer.h>
48
#include <linux/vmalloc.h>
49
#include <linux/blkdev.h>
50
#include <linux/delay.h>
51
#include <linux/smp.h>
52
#include <linux/threads.h>
53
#include <linux/timer.h>
54
#include <linux/rcupdate.h>
55
#include <linux/cpu.h>
56
#include <linux/cpuset.h>
57
#include <linux/cpumask.h>
58
#include <linux/percpu.h>
59
#include <linux/proc_fs.h>
60
#include <linux/seq_file.h>
61
#include <linux/syscalls.h>
62
#include <linux/times.h>
63
#include <linux/tsacct_kern.h>
64
#include <linux/kprobes.h>
65
#include <linux/delayacct.h>
66
#include <linux/log2.h>
67
#include <linux/bootmem.h>
68
#include <linux/ftrace.h>
69
#include <linux/slab.h>
70
#include <linux/init_task.h>
71
#include <linux/binfmts.h>
72
#include <linux/context_tracking.h>
73
74
#include <asm/switch_to.h>
75
#include <asm/tlb.h>
76
#include <asm/unistd.h>
77
#include <asm/mutex.h>
78
#ifdef CONFIG_PARAVIRT
79
#include <asm/paravirt.h>
80
#endif
81
82
#include "cpupri.h"
83
#include "../workqueue_internal.h"
84
#include "../smpboot.h"
85
86
#define CREATE_TRACE_POINTS
87
#include <trace/events/sched.h>
88
89
#include "bfs_sched.h"
90
91
#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
92
#define rt_task(p)		rt_prio((p)->prio)
93
#define rt_queue(rq)		rt_prio((rq)->rq_prio)
94
#define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
95
#define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
96
					(policy) == SCHED_RR)
97
#define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
98
#define idleprio_task(p)	unlikely((p)->policy == SCHED_IDLEPRIO)
99
#define iso_task(p)		unlikely((p)->policy == SCHED_ISO)
100
#define iso_queue(rq)		unlikely((rq)->rq_policy == SCHED_ISO)
101
#define rq_running_iso(rq)	((rq)->rq_prio == ISO_PRIO)
102
103
#define ISO_PERIOD		((5 * HZ * grq.noc) + 1)
104
105
/*
106
 * Convert user-nice values [ -20 ... 0 ... 19 ]
107
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
108
 * and back.
109
 */
110
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
111
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
112
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
113
114
/*
115
 * 'User priority' is the nice value converted to something we
116
 * can work with better when scaling various scheduler parameters,
117
 * it's a [ 0 ... 39 ] range.
118
 */
119
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
120
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
121
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
122
#define SCHED_PRIO(p)		((p) + MAX_RT_PRIO)
123
#define STOP_PRIO		(MAX_RT_PRIO - 1)
124
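
As a worked illustration of the two comment blocks above, this stand-alone sketch re-creates the conversion macros with mainline's usual constants (MAX_RT_PRIO = 100, MAX_PRIO = 140 are assumed here, not taken from this hunk) and prints the round trip for a few nice values.

#include <stdio.h>

#define MAX_RT_PRIO	100			/* assumed, as in mainline */
#define MAX_PRIO	(MAX_RT_PRIO + 40)

#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)

int main(void)
{
	int nice;

	/* nice -20..19 maps to static priority 100..139 and user prio 0..39 */
	for (nice = -20; nice <= 19; nice += 13)
		printf("nice %3d -> static_prio %3d -> user_prio %2d -> nice %3d\n",
		       nice, NICE_TO_PRIO(nice), USER_PRIO(NICE_TO_PRIO(nice)),
		       PRIO_TO_NICE(NICE_TO_PRIO(nice)));
	return 0;
}
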
125
/*
126
 * Some helpers for converting to/from various scales. Use shifts to get
127
 * approximate powers of ten for less overhead.
128
 */
129
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
130
#define JIFFY_NS		(1000000000 / HZ)
131
#define HALF_JIFFY_NS		(1000000000 / HZ / 2)
132
#define HALF_JIFFY_US		(1000000 / HZ / 2)
133
#define MS_TO_NS(TIME)		((TIME) << 20)
134
#define MS_TO_US(TIME)		((TIME) << 10)
135
#define NS_TO_MS(TIME)		((TIME) >> 20)
136
#define NS_TO_US(TIME)		((TIME) >> 10)
137
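
The shift-based helpers above trade accuracy for speed: 1 << 20 stands in for one million and 1 << 10 for one thousand. This small sketch shows the size of that approximation for the default rr_interval of 6 ms (the exact conversion is computed only for comparison).

#include <stdio.h>

#define MS_TO_NS(t)	((t) << 20)	/* roughly t * 1,000,000 */
#define MS_TO_US(t)	((t) << 10)	/* roughly t * 1,000 */

int main(void)
{
	long ms = 6;	/* default rr_interval */

	printf("MS_TO_NS(%ld) = %ld (exact %ld, ~%.1f%% high)\n",
	       ms, MS_TO_NS(ms), ms * 1000000L,
	       100.0 * (MS_TO_NS(ms) - ms * 1000000L) / (ms * 1000000L));
	printf("MS_TO_US(%ld) = %ld (exact %ld)\n",
	       ms, MS_TO_US(ms), ms * 1000L);
	return 0;
}
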
138
#define RESCHED_US	(100) /* Reschedule if less than this many μs left */
139
140
void print_scheduler_version(void)
141
{
142
	printk(KERN_INFO "BFS CPU scheduler v0.442 by Con Kolivas.\n");
143
}
144
145
/*
146
 * This is the time all tasks within the same priority round robin.
147
 * Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
148
 * Tunable via /proc interface.
149
 */
150
int rr_interval __read_mostly = 6;
151
152
/*
153
 * sched_iso_cpu - sysctl which determines the percentage of cpu time, over a
154
 * rolling five second average, that SCHED_ISO tasks are allowed to run as
155
 * real time tasks. This is the total over all online cpus.
156
 */
157
int sched_iso_cpu __read_mostly = 70;
158
159
/*
160
 * The relative length of deadline for each priority(nice) level.
161
 */
162
static int prio_ratios[PRIO_RANGE] __read_mostly;
163
164
/*
165
 * The quota handed out to tasks of all priority levels when refilling their
166
 * time_slice.
167
 */
168
static inline int timeslice(void)
169
{
170
	return MS_TO_US(rr_interval);
171
}
172
173
/*
174
 * The global runqueue data that all CPUs work off. Data is protected either
175
 * by the global grq lock, or the discrete lock that precedes the data in this
176
 * struct.
177
 */
178
struct global_rq {
179
	raw_spinlock_t lock;
180
	unsigned long nr_running;
181
	unsigned long nr_uninterruptible;
182
	unsigned long long nr_switches;
183
	struct list_head queue[PRIO_LIMIT];
184
	DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
185
#ifdef CONFIG_SMP
186
	unsigned long qnr; /* queued not running */
187
	cpumask_t cpu_idle_map;
188
	bool idle_cpus;
189
#endif
190
	int noc; /* num_online_cpus stored and updated when it changes */
191
	u64 niffies; /* Nanosecond jiffies */
192
	unsigned long last_jiffy; /* Last jiffy we updated niffies */
193
194
	raw_spinlock_t iso_lock;
195
	int iso_ticks;
196
	bool iso_refractory;
197
};
198
199
#ifdef CONFIG_SMP
200
201
/*
202
 * We add the notion of a root-domain which will be used to define per-domain
203
 * variables. Each exclusive cpuset essentially defines an island domain by
204
 * fully partitioning the member cpus from any other cpuset. Whenever a new
205
 * exclusive cpuset is created, we also create and attach a new root-domain
206
 * object.
207
 *
208
 */
209
struct root_domain {
210
	atomic_t refcount;
211
	atomic_t rto_count;
212
	struct rcu_head rcu;
213
	cpumask_var_t span;
214
	cpumask_var_t online;
215
216
	/*
217
	 * The "RT overload" flag: it gets set if a CPU has more than
218
	 * one runnable RT task.
219
	 */
220
	cpumask_var_t rto_mask;
221
	struct cpupri cpupri;
222
};
223
224
/*
225
 * By default the system creates a single root-domain with all cpus as
226
 * members (mimicking the global state we have today).
227
 */
228
static struct root_domain def_root_domain;
229
230
#endif /* CONFIG_SMP */
231
232
/* There can be only one */
233
static struct global_rq grq;
234
235
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
236
static DEFINE_MUTEX(sched_hotcpu_mutex);
237
238
#ifdef CONFIG_SMP
239
struct rq *cpu_rq(int cpu)
240
{
241
	return &per_cpu(runqueues, (cpu));
242
}
243
#define this_rq()		(&__get_cpu_var(runqueues))
244
#define task_rq(p)		cpu_rq(task_cpu(p))
245
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
246
/*
247
 * sched_domains_mutex serialises calls to init_sched_domains,
248
 * detach_destroy_domains and partition_sched_domains.
249
 */
250
static DEFINE_MUTEX(sched_domains_mutex);
251
252
/*
253
 * By default the system creates a single root-domain with all cpus as
254
 * members (mimicking the global state we have today).
255
 */
256
static struct root_domain def_root_domain;
257
258
int __weak arch_sd_sibling_asym_packing(void)
259
{
260
       return 0*SD_ASYM_PACKING;
261
}
262
#endif /* CONFIG_SMP */
263
264
static inline void update_rq_clock(struct rq *rq);
265
266
/*
267
 * Sanity check should sched_clock return bogus values. We make sure it does
268
 * not appear to go backwards, and use jiffies to determine the maximum and
269
 * minimum it could possibly have increased, and round down to the nearest
270
 * jiffy when it falls outside this.
271
 */
272
static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
273
{
274
	unsigned long min_diff, max_diff;
275
276
	if (jiff_diff > 1)
277
		min_diff = JIFFIES_TO_NS(jiff_diff - 1);
278
	else
279
		min_diff = 1;
280
	/*  Round up to the nearest tick for maximum */
281
	max_diff = JIFFIES_TO_NS(jiff_diff + 1);
282
283
	if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
284
		*niff_diff = min_diff;
285
}
286
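
The clamp in niffy_diff() can be exercised in isolation. Below is a user-space restatement of the same logic (JIFFIES_TO_NS reduced to a fixed HZ of 250 purely for the example), fed with one plausible and one bogus clock delta.

#include <stdio.h>

#define HZ	250
#define JIFFIES_TO_NS(t)	((t) * (1000000000L / HZ))

/* Same sanity check as niffy_diff(): clamp a clock delta to the window
 * that the elapsed jiffies say is plausible, rounding down when outside. */
static void niffy_diff(long long *niff_diff, int jiff_diff)
{
	long long min_diff, max_diff;

	if (jiff_diff > 1)
		min_diff = JIFFIES_TO_NS(jiff_diff - 1);
	else
		min_diff = 1;
	max_diff = JIFFIES_TO_NS(jiff_diff + 1);

	if (*niff_diff < min_diff || *niff_diff > max_diff)
		*niff_diff = min_diff;
}

int main(void)
{
	long long ok = 7900000, bogus = -123456789;

	niffy_diff(&ok, 2);	/* within [4ms, 12ms] at HZ=250: kept */
	niffy_diff(&bogus, 2);	/* clamped to the 4ms minimum */
	printf("ok=%lld bogus=%lld\n", ok, bogus);
	return 0;
}
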
287
#ifdef CONFIG_SMP
288
static inline int cpu_of(struct rq *rq)
289
{
290
	return rq->cpu;
291
}
292
293
/*
294
 * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
295
 * clock is updated with the grq.lock held, it is an opportunity to update the
296
 * niffies value. Any CPU can update it by adding how much its clock has
297
 * increased since it last updated niffies, minus any added niffies by other
298
 * CPUs.
299
 */
300
static inline void update_clocks(struct rq *rq)
301
{
302
	s64 ndiff;
303
	long jdiff;
304
305
	update_rq_clock(rq);
306
	ndiff = rq->clock - rq->old_clock;
307
	/* old_clock is only updated when we are updating niffies */
308
	rq->old_clock = rq->clock;
309
	ndiff -= grq.niffies - rq->last_niffy;
310
	jdiff = jiffies - grq.last_jiffy;
311
	niffy_diff(&ndiff, jdiff);
312
	grq.last_jiffy += jdiff;
313
	grq.niffies += ndiff;
314
	rq->last_niffy = grq.niffies;
315
}
316
#else /* CONFIG_SMP */
317
static struct rq *uprq;
318
#define cpu_rq(cpu)	(uprq)
319
#define this_rq()	(uprq)
320
#define task_rq(p)	(uprq)
321
#define cpu_curr(cpu)	((uprq)->curr)
322
static inline int cpu_of(struct rq *rq)
323
{
324
	return 0;
325
}
326
327
static inline void update_clocks(struct rq *rq)
328
{
329
	s64 ndiff;
330
	long jdiff;
331
332
	update_rq_clock(rq);
333
	ndiff = rq->clock - rq->old_clock;
334
	rq->old_clock = rq->clock;
335
	jdiff = jiffies - grq.last_jiffy;
336
	niffy_diff(&ndiff, jdiff);
337
	grq.last_jiffy += jdiff;
338
	grq.niffies += ndiff;
339
}
340
#endif
341
#define raw_rq()	(&__raw_get_cpu_var(runqueues))
342
343
#include "stats.h"
344
345
#ifndef prepare_arch_switch
346
# define prepare_arch_switch(next)	do { } while (0)
347
#endif
348
#ifndef finish_arch_switch
349
# define finish_arch_switch(prev)	do { } while (0)
350
#endif
351
#ifndef finish_arch_post_lock_switch
352
# define finish_arch_post_lock_switch()	do { } while (0)
353
#endif
354
355
/*
356
 * All common locking functions performed on grq.lock. rq->clock is local to
357
 * the CPU accessing it so it can be modified just with interrupts disabled
358
 * when we're not updating niffies.
359
 * Looking up task_rq must be done under grq.lock to be safe.
360
 */
361
static void update_rq_clock_task(struct rq *rq, s64 delta);
362
363
static inline void update_rq_clock(struct rq *rq)
364
{
365
	s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
366
367
	rq->clock += delta;
368
	update_rq_clock_task(rq, delta);
369
}
370
371
static inline bool task_running(struct task_struct *p)
372
{
373
	return p->on_cpu;
374
}
375
376
static inline void grq_lock(void)
377
	__acquires(grq.lock)
378
{
379
	raw_spin_lock(&grq.lock);
380
}
381
382
static inline void grq_unlock(void)
383
	__releases(grq.lock)
384
{
385
	raw_spin_unlock(&grq.lock);
386
}
387
388
static inline void grq_lock_irq(void)
389
	__acquires(grq.lock)
390
{
391
	raw_spin_lock_irq(&grq.lock);
392
}
393
394
static inline void time_lock_grq(struct rq *rq)
395
	__acquires(grq.lock)
396
{
397
	grq_lock();
398
	update_clocks(rq);
399
}
400
401
static inline void grq_unlock_irq(void)
402
	__releases(grq.lock)
403
{
404
	raw_spin_unlock_irq(&grq.lock);
405
}
406
407
static inline void grq_lock_irqsave(unsigned long *flags)
408
	__acquires(grq.lock)
409
{
410
	raw_spin_lock_irqsave(&grq.lock, *flags);
411
}
412
413
static inline void grq_unlock_irqrestore(unsigned long *flags)
414
	__releases(grq.lock)
415
{
416
	raw_spin_unlock_irqrestore(&grq.lock, *flags);
417
}
418
419
static inline struct rq
420
*task_grq_lock(struct task_struct *p, unsigned long *flags)
421
	__acquires(grq.lock)
422
{
423
	grq_lock_irqsave(flags);
424
	return task_rq(p);
425
}
426
427
static inline struct rq
428
*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
429
	__acquires(grq.lock)
430
{
431
	struct rq *rq = task_grq_lock(p, flags);
432
	update_clocks(rq);
433
	return rq;
434
}
435
436
static inline struct rq *task_grq_lock_irq(struct task_struct *p)
437
	__acquires(grq.lock)
438
{
439
	grq_lock_irq();
440
	return task_rq(p);
441
}
442
443
static inline void time_task_grq_lock_irq(struct task_struct *p)
444
	__acquires(grq.lock)
445
{
446
	struct rq *rq = task_grq_lock_irq(p);
447
	update_clocks(rq);
448
}
449
450
static inline void task_grq_unlock_irq(void)
451
	__releases(grq.lock)
452
{
453
	grq_unlock_irq();
454
}
455
456
static inline void task_grq_unlock(unsigned long *flags)
457
	__releases(grq.lock)
458
{
459
	grq_unlock_irqrestore(flags);
460
}
461
462
/**
463
 * grunqueue_is_locked
464
 *
465
 * Returns true if the global runqueue is locked.
466
 * This interface allows printk to be called with the runqueue lock
467
 * held and know whether or not it is OK to wake up the klogd.
468
 */
469
bool grunqueue_is_locked(void)
470
{
471
	return raw_spin_is_locked(&grq.lock);
472
}
473
474
void grq_unlock_wait(void)
475
	__releases(grq.lock)
476
{
477
	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
478
	raw_spin_unlock_wait(&grq.lock);
479
}
480
481
static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
482
	__acquires(grq.lock)
483
{
484
	local_irq_save(*flags);
485
	time_lock_grq(rq);
486
}
487
488
static inline struct rq *__task_grq_lock(struct task_struct *p)
489
	__acquires(grq.lock)
490
{
491
	grq_lock();
492
	return task_rq(p);
493
}
494
495
static inline void __task_grq_unlock(void)
496
	__releases(grq.lock)
497
{
498
	grq_unlock();
499
}
500
501
/*
502
 * Look for any tasks *anywhere* that are running nice 0 or better. We do
503
 * this lockless for overhead reasons since the occasional wrong result
504
 * is harmless.
505
 */
506
bool above_background_load(void)
507
{
508
	int cpu;
509
510
	for_each_online_cpu(cpu) {
511
		struct task_struct *cpu_curr = cpu_rq(cpu)->curr;
512
513
		if (unlikely(!cpu_curr))
514
			continue;
515
		if (PRIO_TO_NICE(cpu_curr->static_prio) < 1) {
516
			return true;
517
		}
518
	}
519
	return false;
520
}
521
522
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
523
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
524
{
525
}
526
527
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
528
{
529
#ifdef CONFIG_DEBUG_SPINLOCK
530
	/* this is a valid case when another task releases the spinlock */
531
	grq.lock.owner = current;
532
#endif
533
	/*
534
	 * If we are tracking spinlock dependencies then we have to
535
	 * fix up the runqueue lock - which gets 'carried over' from
536
	 * prev into current:
537
	 */
538
	spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_);
539
540
	grq_unlock_irq();
541
}
542
543
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
544
545
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
546
{
547
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
548
	grq_unlock_irq();
549
#else
550
	grq_unlock();
551
#endif
552
}
553
554
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
555
{
556
	smp_wmb();
557
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
558
	local_irq_enable();
559
#endif
560
}
561
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
562
563
static inline bool deadline_before(u64 deadline, u64 time)
564
{
565
	return (deadline < time);
566
}
567
568
static inline bool deadline_after(u64 deadline, u64 time)
569
{
570
	return (deadline > time);
571
}
572
573
/*
574
 * A task that is queued but not running will be on the grq run list.
575
 * A task that is not running or queued will not be on the grq run list.
576
 * A task that is currently running will have ->on_cpu set but not on the
577
 * grq run list.
578
 */
579
static inline bool task_queued(struct task_struct *p)
580
{
581
	return (!list_empty(&p->run_list));
582
}
583
584
/*
585
 * Removing from the global runqueue. Enter with grq locked.
586
 */
587
static void dequeue_task(struct task_struct *p)
588
{
589
	list_del_init(&p->run_list);
590
	if (list_empty(grq.queue + p->prio))
591
		__clear_bit(p->prio, grq.prio_bitmap);
592
}
593
594
/*
595
 * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
596
 * an idle task, we ensure none of the following conditions are met.
597
 */
598
static bool idleprio_suitable(struct task_struct *p)
599
{
600
	return (!freezing(p) && !signal_pending(p) &&
601
		!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
602
}
603
604
/*
605
 * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
606
 * that the iso_refractory flag is not set.
607
 */
608
static bool isoprio_suitable(void)
609
{
610
	return !grq.iso_refractory;
611
}
612
613
/*
614
 * Adding to the global runqueue. Enter with grq locked.
615
 */
616
static void enqueue_task(struct task_struct *p)
617
{
618
	if (!rt_task(p)) {
619
		/* Check it hasn't gotten rt from PI */
620
		if ((idleprio_task(p) && idleprio_suitable(p)) ||
621
		   (iso_task(p) && isoprio_suitable()))
622
			p->prio = p->normal_prio;
623
		else
624
			p->prio = NORMAL_PRIO;
625
	}
626
	__set_bit(p->prio, grq.prio_bitmap);
627
	list_add_tail(&p->run_list, grq.queue + p->prio);
628
	sched_info_queued(p);
629
}
630
631
/* Only the idle task does this, as a real time task */
632
static inline void enqueue_task_head(struct task_struct *p)
633
{
634
	__set_bit(p->prio, grq.prio_bitmap);
635
	list_add(&p->run_list, grq.queue + p->prio);
636
	sched_info_queued(p);
637
}
638
639
static inline void requeue_task(struct task_struct *p)
640
{
641
	sched_info_queued(p);
642
}
643
644
/*
645
 * Returns the relative length of deadline compared to the shortest
646
 * deadline, which is that of nice -20.
647
 */
648
static inline int task_prio_ratio(struct task_struct *p)
649
{
650
	return prio_ratios[TASK_USER_PRIO(p)];
651
}
652
653
/*
654
 * task_timeslice - all tasks of all priorities get the exact same timeslice
655
 * length. CPU distribution is handled by giving different deadlines to
656
 * tasks of different priorities. Use 128 as the base value for fast shifts.
657
 */
658
static inline int task_timeslice(struct task_struct *p)
659
{
660
	return (rr_interval * task_prio_ratio(p) / 128);
661
}
662
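
prio_ratios[] is filled in elsewhere in the patch; assuming the usual BFS scheme of starting at 128 for nice -20 and growing each level by roughly 10% (an assumption here, not visible in this hunk), the relative deadline lengths produced by task_timeslice() work out as in this sketch.

#include <stdio.h>

#define PRIO_RANGE	40
#define RR_INTERVAL	6	/* default rr_interval, in ms */

int main(void)
{
	int prio_ratios[PRIO_RANGE];
	int i;

	/* Assumed init: nice -20 gets 128, each less-nice level ~10% longer. */
	prio_ratios[0] = 128;
	for (i = 1; i < PRIO_RANGE; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

	/* task_timeslice(): rr_interval scaled by the priority ratio. */
	printf("nice -20: %d ms relative deadline length\n",
	       RR_INTERVAL * prio_ratios[0] / 128);
	printf("nice   0: %d ms\n", RR_INTERVAL * prio_ratios[20] / 128);
	printf("nice  19: %d ms\n", RR_INTERVAL * prio_ratios[39] / 128);
	return 0;
}
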
663
#ifdef CONFIG_SMP
664
/*
665
 * qnr is the "queued but not running" count which is the total number of
666
 * tasks on the global runqueue list waiting for cpu time but not actually
667
 * currently running on a cpu.
668
 */
669
static inline void inc_qnr(void)
670
{
671
	grq.qnr++;
672
}
673
674
static inline void dec_qnr(void)
675
{
676
	grq.qnr--;
677
}
678
679
static inline int queued_notrunning(void)
680
{
681
	return grq.qnr;
682
}
683
684
/*
685
 * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
686
 * allow easy lookup of whether any suitable idle CPUs are available.
687
 * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
688
 * idle_cpus variable than to do a full bitmask check when we are busy.
689
 */
690
static inline void set_cpuidle_map(int cpu)
691
{
692
	if (likely(cpu_online(cpu))) {
693
		cpu_set(cpu, grq.cpu_idle_map);
694
		grq.idle_cpus = true;
695
	}
696
}
697
698
static inline void clear_cpuidle_map(int cpu)
699
{
700
	cpu_clear(cpu, grq.cpu_idle_map);
701
	if (cpus_empty(grq.cpu_idle_map))
702
		grq.idle_cpus = false;
703
}
704
705
static bool suitable_idle_cpus(struct task_struct *p)
706
{
707
	if (!grq.idle_cpus)
708
		return false;
709
	return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map));
710
}
711
712
#define CPUIDLE_DIFF_THREAD	(1)
713
#define CPUIDLE_DIFF_CORE	(2)
714
#define CPUIDLE_CACHE_BUSY	(4)
715
#define CPUIDLE_DIFF_CPU	(8)
716
#define CPUIDLE_THREAD_BUSY	(16)
717
#define CPUIDLE_THROTTLED	(32)
718
#define CPUIDLE_DIFF_NODE	(64)
719
720
static void resched_task(struct task_struct *p);
721
static inline bool scaling_rq(struct rq *rq);
722
723
/*
724
 * The best idle CPU is chosen according to the CPUIDLE ranking above where the
725
 * lowest value would give the most suitable CPU to schedule p onto next. The
726
 * order works out to be the following:
727
 *
728
 * Same core, idle or busy cache, idle or busy threads
729
 * Other core, same cache, idle or busy cache, idle threads.
730
 * Same node, other CPU, idle cache, idle threads.
731
 * Same node, other CPU, busy cache, idle threads.
732
 * Other core, same cache, busy threads.
733
 * Same node, other CPU, busy threads.
734
 * Other node, other CPU, idle cache, idle threads.
735
 * Other node, other CPU, busy cache, idle threads.
736
 * Other node, other CPU, busy threads.
737
 */
738
static void
739
resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
740
{
741
	int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THROTTLED |
742
		CPUIDLE_THREAD_BUSY | CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY |
743
		CPUIDLE_DIFF_CORE | CPUIDLE_DIFF_THREAD;
744
	int cpu_tmp;
745
746
	if (cpu_isset(best_cpu, *tmpmask))
747
		goto out;
748
749
	for_each_cpu_mask(cpu_tmp, *tmpmask) {
750
		int ranking, locality;
751
		struct rq *tmp_rq;
752
753
		ranking = 0;
754
		tmp_rq = cpu_rq(cpu_tmp);
755
756
		locality = rq->cpu_locality[cpu_tmp];
757
#ifdef CONFIG_NUMA
758
		if (locality > 3)
759
			ranking |= CPUIDLE_DIFF_NODE;
760
		else
761
#endif
762
		if (locality > 2)
763
			ranking |= CPUIDLE_DIFF_CPU;
764
#ifdef CONFIG_SCHED_MC
765
		else if (locality == 2)
766
			ranking |= CPUIDLE_DIFF_CORE;
767
		if (!(tmp_rq->cache_idle(cpu_tmp)))
768
			ranking |= CPUIDLE_CACHE_BUSY;
769
#endif
770
#ifdef CONFIG_SCHED_SMT
771
		if (locality == 1)
772
			ranking |= CPUIDLE_DIFF_THREAD;
773
		if (!(tmp_rq->siblings_idle(cpu_tmp)))
774
			ranking |= CPUIDLE_THREAD_BUSY;
775
#endif
776
		if (scaling_rq(tmp_rq))
777
			ranking |= CPUIDLE_THROTTLED;
778
779
		if (ranking < best_ranking) {
780
			best_cpu = cpu_tmp;
781
			best_ranking = ranking;
782
		}
783
	}
784
out:
785
	resched_task(cpu_rq(best_cpu)->curr);
786
}
787
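
The CPUIDLE_* values above are powers of two, so OR-ing a candidate CPU's penalties yields a single comparable rank, and the preference order listed in the comment falls directly out of the bit weights. A minimal sketch of how two candidates would compare (the locality assignments are made up for illustration):

#include <stdio.h>

#define CPUIDLE_DIFF_THREAD	(1)
#define CPUIDLE_DIFF_CORE	(2)
#define CPUIDLE_CACHE_BUSY	(4)
#define CPUIDLE_DIFF_CPU	(8)
#define CPUIDLE_THREAD_BUSY	(16)
#define CPUIDLE_THROTTLED	(32)
#define CPUIDLE_DIFF_NODE	(64)

int main(void)
{
	/* Candidate A: other core, same cache, idle threads. */
	int a = CPUIDLE_DIFF_CORE;
	/* Candidate B: same node, different CPU, busy cache. */
	int b = CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY;

	/* Lower ranking wins, exactly as in resched_best_mask(). */
	printf("A ranks %d, B ranks %d -> %s is preferred\n",
	       a, b, a < b ? "A" : "B");
	return 0;
}
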
788
bool cpus_share_cache(int this_cpu, int that_cpu)
789
{
790
	struct rq *this_rq = cpu_rq(this_cpu);
791
792
	return (this_rq->cpu_locality[that_cpu] < 3);
793
}
794
795
static void resched_best_idle(struct task_struct *p)
796
{
797
	cpumask_t tmpmask;
798
799
	cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
800
	resched_best_mask(task_cpu(p), task_rq(p), &tmpmask);
801
}
802
803
static inline void resched_suitable_idle(struct task_struct *p)
804
{
805
	if (suitable_idle_cpus(p))
806
		resched_best_idle(p);
807
}
808
/*
809
 * Flags to tell us whether this CPU is running a CPU frequency governor that
810
 * has slowed its speed or not. No locking required, as the occasional
811
 * stale value read here is harmless.
812
 */
813
void cpu_scaling(int cpu)
814
{
815
	cpu_rq(cpu)->scaling = true;
816
}
817
818
void cpu_nonscaling(int cpu)
819
{
820
	cpu_rq(cpu)->scaling = false;
821
}
822
823
static inline bool scaling_rq(struct rq *rq)
824
{
825
	return rq->scaling;
826
}
827
828
static inline int locality_diff(struct task_struct *p, struct rq *rq)
829
{
830
	return rq->cpu_locality[task_cpu(p)];
831
}
832
#else /* CONFIG_SMP */
833
static inline void inc_qnr(void)
834
{
835
}
836
837
static inline void dec_qnr(void)
838
{
839
}
840
841
static inline int queued_notrunning(void)
842
{
843
	return grq.nr_running;
844
}
845
846
static inline void set_cpuidle_map(int cpu)
847
{
848
}
849
850
static inline void clear_cpuidle_map(int cpu)
851
{
852
}
853
854
static inline bool suitable_idle_cpus(struct task_struct *p)
855
{
856
	return uprq->curr == uprq->idle;
857
}
858
859
static inline void resched_suitable_idle(struct task_struct *p)
860
{
861
}
862
863
void cpu_scaling(int __unused)
864
{
865
}
866
867
void cpu_nonscaling(int __unused)
868
{
869
}
870
871
/*
872
 * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
873
 * always returns 0.
874
 */
875
static inline bool scaling_rq(struct rq *rq)
876
{
877
	return false;
878
}
879
880
static inline int locality_diff(struct task_struct *p, struct rq *rq)
881
{
882
	return 0;
883
}
884
#endif /* CONFIG_SMP */
885
EXPORT_SYMBOL_GPL(cpu_scaling);
886
EXPORT_SYMBOL_GPL(cpu_nonscaling);
887
888
/*
889
 * activate_idle_task - move idle task to the _front_ of runqueue.
890
 */
891
static inline void activate_idle_task(struct task_struct *p)
892
{
893
	enqueue_task_head(p);
894
	grq.nr_running++;
895
	inc_qnr();
896
}
897
898
static inline int normal_prio(struct task_struct *p)
899
{
900
	if (has_rt_policy(p))
901
		return MAX_RT_PRIO - 1 - p->rt_priority;
902
	if (idleprio_task(p))
903
		return IDLE_PRIO;
904
	if (iso_task(p))
905
		return ISO_PRIO;
906
	return NORMAL_PRIO;
907
}
908
909
/*
910
 * Calculate the current priority, i.e. the priority
911
 * taken into account by the scheduler. This value might
912
 * be boosted by RT tasks as it will be RT if the task got
913
 * RT-boosted. If not then it returns p->normal_prio.
914
 */
915
static int effective_prio(struct task_struct *p)
916
{
917
	p->normal_prio = normal_prio(p);
918
	/*
919
	 * If we are RT tasks or we were boosted to RT priority,
920
	 * keep the priority unchanged. Otherwise, update priority
921
	 * to the normal priority:
922
	 */
923
	if (!rt_prio(p->prio))
924
		return p->normal_prio;
925
	return p->prio;
926
}
927
928
/*
929
 * activate_task - move a task to the runqueue. Enter with grq locked.
930
 */
931
static void activate_task(struct task_struct *p, struct rq *rq)
932
{
933
	update_clocks(rq);
934
935
	/*
936
	 * Sleep time is in units of nanosecs, so shift by 20 to get a
937
	 * milliseconds-range estimation of the amount of time that the task
938
	 * spent sleeping:
939
	 */
940
	if (unlikely(prof_on == SLEEP_PROFILING)) {
941
		if (p->state == TASK_UNINTERRUPTIBLE)
942
			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
943
				     (rq->clock_task - p->last_ran) >> 20);
944
	}
945
946
	p->prio = effective_prio(p);
947
	if (task_contributes_to_load(p))
948
		grq.nr_uninterruptible--;
949
	enqueue_task(p);
950
	grq.nr_running++;
951
	inc_qnr();
952
}
953
954
static inline void clear_sticky(struct task_struct *p);
955
956
/*
957
 * deactivate_task - If it's running, it's not on the grq and we can just
958
 * decrement the nr_running. Enter with grq locked.
959
 */
960
static inline void deactivate_task(struct task_struct *p)
961
{
962
	if (task_contributes_to_load(p))
963
		grq.nr_uninterruptible++;
964
	grq.nr_running--;
965
	clear_sticky(p);
966
}
967
968
static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
969
970
void register_task_migration_notifier(struct notifier_block *n)
971
{
972
	atomic_notifier_chain_register(&task_migration_notifier, n);
973
}
974
975
#ifdef CONFIG_SMP
976
void set_task_cpu(struct task_struct *p, unsigned int cpu)
977
{
978
#ifdef CONFIG_LOCKDEP
979
	/*
980
	 * The caller should hold grq lock.
981
	 */
982
	WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
983
#endif
984
	trace_sched_migrate_task(p, cpu);
985
	if (task_cpu(p) != cpu)
986
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
987
988
	/*
989
	 * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
990
	 * successfully executed on another CPU. We must ensure that updates of
991
	 * per-task data have been completed by this moment.
992
	 */
993
	smp_wmb();
994
	task_thread_info(p)->cpu = cpu;
995
}
996
997
static inline void clear_sticky(struct task_struct *p)
998
{
999
	p->sticky = false;
1000
}
1001
1002
static inline bool task_sticky(struct task_struct *p)
1003
{
1004
	return p->sticky;
1005
}
1006
1007
/* Reschedule the best idle CPU that is not this one. */
1008
static void
1009
resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p)
1010
{
1011
	cpumask_t tmpmask;
1012
1013
	cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
1014
	cpu_clear(cpu, tmpmask);
1015
	if (cpus_empty(tmpmask))
1016
		return;
1017
	resched_best_mask(cpu, rq, &tmpmask);
1018
}
1019
1020
/*
1021
 * We set the sticky flag on a task that is descheduled involuntarily meaning
1022
 * it is awaiting further CPU time. If the last sticky task is still sticky
1023
 * but unlucky enough to not be the next task scheduled, we unstick it and try
1024
 * to find it an idle CPU. Realtime tasks do not stick to minimise their
1025
 * latency at all times.
1026
 */
1027
static inline void
1028
swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
1029
{
1030
	if (rq->sticky_task) {
1031
		if (rq->sticky_task == p) {
1032
			p->sticky = true;
1033
			return;
1034
		}
1035
		if (task_sticky(rq->sticky_task)) {
1036
			clear_sticky(rq->sticky_task);
1037
			resched_closest_idle(rq, cpu, rq->sticky_task);
1038
		}
1039
	}
1040
	if (!rt_task(p)) {
1041
		p->sticky = true;
1042
		rq->sticky_task = p;
1043
	} else {
1044
		resched_closest_idle(rq, cpu, p);
1045
		rq->sticky_task = NULL;
1046
	}
1047
}
1048
1049
static inline void unstick_task(struct rq *rq, struct task_struct *p)
1050
{
1051
	rq->sticky_task = NULL;
1052
	clear_sticky(p);
1053
}
1054
#else
1055
static inline void clear_sticky(struct task_struct *p)
1056
{
1057
}
1058
1059
static inline bool task_sticky(struct task_struct *p)
1060
{
1061
	return false;
1062
}
1063
1064
static inline void
1065
swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
1066
{
1067
}
1068
1069
static inline void unstick_task(struct rq *rq, struct task_struct *p)
1070
{
1071
}
1072
#endif
1073
1074
/*
1075
 * Move a task off the global queue and take it to a cpu where it will
1076
 * become the running task.
1077
 */
1078
static inline void take_task(int cpu, struct task_struct *p)
1079
{
1080
	set_task_cpu(p, cpu);
1081
	dequeue_task(p);
1082
	clear_sticky(p);
1083
	dec_qnr();
1084
}
1085
1086
/*
1087
 * Returns a descheduling task to the grq runqueue unless it is being
1088
 * deactivated.
1089
 */
1090
static inline void return_task(struct task_struct *p, bool deactivate)
1091
{
1092
	if (deactivate)
1093
		deactivate_task(p);
1094
	else {
1095
		inc_qnr();
1096
		enqueue_task(p);
1097
	}
1098
}
1099
1100
/*
1101
 * resched_task - mark a task 'to be rescheduled now'.
1102
 *
1103
 * On UP this means the setting of the need_resched flag, on SMP it
1104
 * might also involve a cross-CPU call to trigger the scheduler on
1105
 * the target CPU.
1106
 */
1107
#ifdef CONFIG_SMP
1108
1109
#ifndef tsk_is_polling
1110
#define tsk_is_polling(t) 0
1111
#endif
1112
1113
static void resched_task(struct task_struct *p)
1114
{
1115
	int cpu;
1116
1117
	assert_raw_spin_locked(&grq.lock);
1118
1119
	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1120
		return;
1121
1122
	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1123
1124
	cpu = task_cpu(p);
1125
	if (cpu == smp_processor_id())
1126
		return;
1127
1128
	/* NEED_RESCHED must be visible before we test polling */
1129
	smp_mb();
1130
	if (!tsk_is_polling(p))
1131
		smp_send_reschedule(cpu);
1132
}
1133
1134
#else
1135
static inline void resched_task(struct task_struct *p)
1136
{
1137
	assert_raw_spin_locked(&grq.lock);
1138
	set_tsk_need_resched(p);
1139
}
1140
#endif
1141
1142
/**
1143
 * task_curr - is this task currently executing on a CPU?
1144
 * @p: the task in question.
1145
 *
1146
 * Return: 1 if the task is currently executing. 0 otherwise.
1147
 */
1148
inline int task_curr(const struct task_struct *p)
1149
{
1150
	return cpu_curr(task_cpu(p)) == p;
1151
}
1152
1153
#ifdef CONFIG_SMP
1154
struct migration_req {
1155
	struct task_struct *task;
1156
	int dest_cpu;
1157
};
1158
1159
/*
1160
 * wait_task_inactive - wait for a thread to unschedule.
1161
 *
1162
 * If @match_state is nonzero, it's the @p->state value just checked and
1163
 * not expected to change.  If it changes, i.e. @p might have woken up,
1164
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
1165
 * we return a positive number (its total switch count).  If a second call
1166
 * a short while later returns the same number, the caller can be sure that
1167
 * @p has remained unscheduled the whole time.
1168
 *
1169
 * The caller must ensure that the task *will* unschedule sometime soon,
1170
 * else this function might spin for a *long* time. This function can't
1171
 * be called with interrupts off, or it may introduce deadlock with
1172
 * smp_call_function() if an IPI is sent by the same process we are
1173
 * waiting to become inactive.
1174
 */
1175
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1176
{
1177
	unsigned long flags;
1178
	bool running, on_rq;
1179
	unsigned long ncsw;
1180
	struct rq *rq;
1181
1182
	for (;;) {
1183
		/*
1184
		 * We do the initial early heuristics without holding
1185
		 * any task-queue locks at all. We'll only try to get
1186
		 * the runqueue lock when things look like they will
1187
		 * work out! In the unlikely event rq is dereferenced
1188
		 * since we're lockless, grab it again.
1189
		 */
1190
#ifdef CONFIG_SMP
1191
retry_rq:
1192
		rq = task_rq(p);
1193
		if (unlikely(!rq))
1194
			goto retry_rq;
1195
#else /* CONFIG_SMP */
1196
		rq = task_rq(p);
1197
#endif
1198
		/*
1199
		 * If the task is actively running on another CPU
1200
		 * still, just relax and busy-wait without holding
1201
		 * any locks.
1202
		 *
1203
		 * NOTE! Since we don't hold any locks, it's not
1204
		 * even sure that "rq" stays as the right runqueue!
1205
		 * But we don't care, since this will return false
1206
		 * if the runqueue has changed and p is actually now
1207
		 * running somewhere else!
1208
		 */
1209
		while (task_running(p) && p == rq->curr) {
1210
			if (match_state && unlikely(p->state != match_state))
1211
				return 0;
1212
			cpu_relax();
1213
		}
1214
1215
		/*
1216
		 * Ok, time to look more closely! We need the grq
1217
		 * lock now, to be *sure*. If we're wrong, we'll
1218
		 * just go back and repeat.
1219
		 */
1220
		rq = task_grq_lock(p, &flags);
1221
		trace_sched_wait_task(p);
1222
		running = task_running(p);
1223
		on_rq = task_queued(p);
1224
		ncsw = 0;
1225
		if (!match_state || p->state == match_state)
1226
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1227
		task_grq_unlock(&flags);
1228
1229
		/*
1230
		 * If it changed from the expected state, bail out now.
1231
		 */
1232
		if (unlikely(!ncsw))
1233
			break;
1234
1235
		/*
1236
		 * Was it really running after all now that we
1237
		 * checked with the proper locks actually held?
1238
		 *
1239
		 * Oops. Go back and try again..
1240
		 */
1241
		if (unlikely(running)) {
1242
			cpu_relax();
1243
			continue;
1244
		}
1245
1246
		/*
1247
		 * It's not enough that it's not actively running,
1248
		 * it must be off the runqueue _entirely_, and not
1249
		 * preempted!
1250
		 *
1251
		 * So if it was still runnable (but just not actively
1252
		 * running right now), it's preempted, and we should
1253
		 * yield - it could be a while.
1254
		 */
1255
		if (unlikely(on_rq)) {
1256
			ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
1257
1258
			set_current_state(TASK_UNINTERRUPTIBLE);
1259
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1260
			continue;
1261
		}
1262
1263
		/*
1264
		 * Ahh, all good. It wasn't running, and it wasn't
1265
		 * runnable, which means that it will never become
1266
		 * running in the future either. We're all done!
1267
		 */
1268
		break;
1269
	}
1270
1271
	return ncsw;
1272
}
1273
1274
/***
1275
 * kick_process - kick a running thread to enter/exit the kernel
1276
 * @p: the to-be-kicked thread
1277
 *
1278
 * Cause a process which is running on another CPU to enter
1279
 * kernel-mode, without any delay. (to get signals handled.)
1280
 *
1281
 * NOTE: this function doesn't have to take the runqueue lock,
1282
 * because all it wants to ensure is that the remote task enters
1283
 * the kernel. If the IPI races and the task has been migrated
1284
 * to another CPU then no harm is done and the purpose has been
1285
 * achieved as well.
1286
 */
1287
void kick_process(struct task_struct *p)
1288
{
1289
	int cpu;
1290
1291
	preempt_disable();
1292
	cpu = task_cpu(p);
1293
	if ((cpu != smp_processor_id()) && task_curr(p))
1294
		smp_send_reschedule(cpu);
1295
	preempt_enable();
1296
}
1297
EXPORT_SYMBOL_GPL(kick_process);
1298
#endif
1299
1300
#define rq_idle(rq)	((rq)->rq_prio == PRIO_LIMIT)
1301
1302
/*
1303
 * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
1304
 * basis of earlier deadlines. SCHED_IDLEPRIO tasks don't preempt anything
1305
 * else or each other; they cooperatively multitask. An idle rq scores as
1306
 * prio PRIO_LIMIT so it is always preempted.
1307
 */
1308
static inline bool
1309
can_preempt(struct task_struct *p, int prio, u64 deadline)
1310
{
1311
	/* Better static priority RT task or better policy preemption */
1312
	if (p->prio < prio)
1313
		return true;
1314
	if (p->prio > prio)
1315
		return false;
1316
	/* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */
1317
	if (!deadline_before(p->deadline, deadline))
1318
		return false;
1319
	return true;
1320
}
1321
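
The preemption rule above reduces to "better priority wins; on equal priority, the earlier deadline wins". Here is a stand-alone restatement with a couple of sample comparisons (the priority and deadline numbers are made up for illustration):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Same shape as can_preempt(): p beats the running task if its priority is
 * numerically lower, or, at equal priority, if its deadline is earlier. */
static bool can_preempt(int p_prio, uint64_t p_deadline,
			int rq_prio, uint64_t rq_deadline)
{
	if (p_prio < rq_prio)
		return true;
	if (p_prio > rq_prio)
		return false;
	return p_deadline < rq_deadline;
}

int main(void)
{
	printf("%d\n", can_preempt(110, 500, 120, 100));	/* 1: better prio */
	printf("%d\n", can_preempt(120, 100, 120, 500));	/* 1: earlier deadline */
	printf("%d\n", can_preempt(120, 500, 120, 100));	/* 0: later deadline */
	return 0;
}
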
1322
#ifdef CONFIG_SMP
1323
#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
1324
#ifdef CONFIG_HOTPLUG_CPU
1325
/*
1326
 * Check to see if there is a task that is affined only to offline CPUs but
1327
 * still wants runtime. This happens to kernel threads during suspend/halt and
1328
 * disabling of CPUs.
1329
 */
1330
static inline bool online_cpus(struct task_struct *p)
1331
{
1332
	return (likely(cpus_intersects(cpu_online_map, p->cpus_allowed)));
1333
}
1334
#else /* CONFIG_HOTPLUG_CPU */
1335
/* All available CPUs are always online without hotplug. */
1336
static inline bool online_cpus(struct task_struct *p)
1337
{
1338
	return true;
1339
}
1340
#endif
1341
1342
/*
1343
 * Check to see if p can run on cpu, and if not, whether there are any online
1344
 * CPUs it can run on instead.
1345
 */
1346
static inline bool needs_other_cpu(struct task_struct *p, int cpu)
1347
{
1348
	if (unlikely(!cpu_isset(cpu, p->cpus_allowed)))
1349
		return true;
1350
	return false;
1351
}
1352
1353
/*
1354
 * When all else is equal, still prefer this_rq.
1355
 */
1356
static void try_preempt(struct task_struct *p, struct rq *this_rq)
1357
{
1358
	struct rq *highest_prio_rq = NULL;
1359
	int cpu, highest_prio;
1360
	u64 latest_deadline;
1361
	cpumask_t tmp;
1362
1363
	/*
1364
	 * We clear the sticky flag here because for a task to have called
1365
	 * try_preempt with the sticky flag enabled means some complicated
1366
	 * re-scheduling has occurred and we should ignore the sticky flag.
1367
	 */
1368
	clear_sticky(p);
1369
1370
	if (suitable_idle_cpus(p)) {
1371
		resched_best_idle(p);
1372
		return;
1373
	}
1374
1375
	/* IDLEPRIO tasks never preempt anything but idle */
1376
	if (p->policy == SCHED_IDLEPRIO)
1377
		return;
1378
1379
	if (likely(online_cpus(p)))
1380
		cpus_and(tmp, cpu_online_map, p->cpus_allowed);
1381
	else
1382
		return;
1383
1384
	highest_prio = latest_deadline = 0;
1385
1386
	for_each_cpu_mask(cpu, tmp) {
1387
		struct rq *rq;
1388
		int rq_prio;
1389
1390
		rq = cpu_rq(cpu);
1391
		rq_prio = rq->rq_prio;
1392
		if (rq_prio < highest_prio)
1393
			continue;
1394
1395
		if (rq_prio > highest_prio ||
1396
		    deadline_after(rq->rq_deadline, latest_deadline)) {
1397
			latest_deadline = rq->rq_deadline;
1398
			highest_prio = rq_prio;
1399
			highest_prio_rq = rq;
1400
		}
1401
	}
1402
1403
	if (likely(highest_prio_rq)) {
1404
		if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline))
1405
			resched_task(highest_prio_rq->curr);
1406
	}
1407
}
1408
#else /* CONFIG_SMP */
1409
static inline bool needs_other_cpu(struct task_struct *p, int cpu)
1410
{
1411
	return false;
1412
}
1413
1414
static void try_preempt(struct task_struct *p, struct rq *this_rq)
1415
{
1416
	if (p->policy == SCHED_IDLEPRIO)
1417
		return;
1418
	if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
1419
		resched_task(uprq->curr);
1420
}
1421
#endif /* CONFIG_SMP */
1422
1423
static void
1424
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1425
{
1426
#ifdef CONFIG_SCHEDSTATS
1427
	struct rq *rq = this_rq();
1428
1429
#ifdef CONFIG_SMP
1430
	int this_cpu = smp_processor_id();
1431
1432
	if (cpu == this_cpu)
1433
		schedstat_inc(rq, ttwu_local);
1434
	else {
1435
		struct sched_domain *sd;
1436
1437
		rcu_read_lock();
1438
		for_each_domain(this_cpu, sd) {
1439
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1440
				schedstat_inc(sd, ttwu_wake_remote);
1441
				break;
1442
			}
1443
		}
1444
		rcu_read_unlock();
1445
	}
1446
1447
#endif /* CONFIG_SMP */
1448
1449
	schedstat_inc(rq, ttwu_count);
1450
#endif /* CONFIG_SCHEDSTATS */
1451
}
1452
1453
static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
1454
				 bool is_sync)
1455
{
1456
	activate_task(p, rq);
1457
1458
	/*
1459
	 * Sync wakeups (i.e. those types of wakeups where the waker
1460
	 * has indicated that it will leave the CPU in short order)
1461
	 * don't trigger a preemption if there are no idle cpus,
1462
	 * instead waiting for current to deschedule.
1463
	 */
1464
	if (!is_sync || suitable_idle_cpus(p))
1465
		try_preempt(p, rq);
1466
}
1467
1468
static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
1469
					bool success)
1470
{
1471
	trace_sched_wakeup(p, success);
1472
	p->state = TASK_RUNNING;
1473
1474
	/*
1475
	 * if a worker is waking up, notify workqueue. Note that on BFS, we
1476
	 * don't really know what cpu it will be, so we fake it for
1477
	 * wq_worker_waking_up :/
1478
	 */
1479
	if ((p->flags & PF_WQ_WORKER) && success)
1480
		wq_worker_waking_up(p, cpu_of(rq));
1481
}
1482
1483
#ifdef CONFIG_SMP
1484
static void
1485
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1486
{
1487
	ttwu_activate(p, rq, false);
1488
	ttwu_post_activation(p, rq, true);
1489
}
1490
1491
static void sched_ttwu_pending(void)
1492
{
1493
	struct rq *rq = this_rq();
1494
	struct llist_node *llist = llist_del_all(&rq->wake_list);
1495
	struct task_struct *p;
1496
1497
	grq_lock();
1498
1499
	while (llist) {
1500
		p = llist_entry(llist, struct task_struct, wake_entry);
1501
		llist = llist_next(llist);
1502
		ttwu_do_activate(rq, p, 0);
1503
	}
1504
1505
	grq_unlock();
1506
}
1507
1508
void scheduler_ipi(void)
1509
{
1510
	if (llist_empty(&this_rq()->wake_list))
1511
		return;
1512
1513
	/*
1514
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1515
	 * traditionally all their work was done from the interrupt return
1516
	 * path. Now that we actually do some work, we need to make sure
1517
	 * we do call them.
1518
	 *
1519
	 * Some archs already do call them, luckily irq_enter/exit nest
1520
	 * properly.
1521
	 *
1522
	 * Arguably we should visit all archs and update all handlers,
1523
	 * however a fair share of IPIs are still resched only so this would
1524
	 * somewhat pessimize the simple resched case.
1525
	 */
1526
	irq_enter();
1527
	sched_ttwu_pending();
1528
1529
	irq_exit();
1530
}
1531
#endif /* CONFIG_SMP */
1532
1533
/*
1534
 * wake flags
1535
 */
1536
#define WF_SYNC		0x01		/* waker goes to sleep after wakeup */
1537
#define WF_FORK		0x02		/* child wakeup after fork */
1538
#define WF_MIGRATED	0x4		/* internal use, task got migrated */
1539
1540
/***
1541
 * try_to_wake_up - wake up a thread
1542
 * @p: the thread to be awakened
1543
 * @state: the mask of task states that can be woken
1544
 * @wake_flags: wake modifier flags (WF_*)
1545
 *
1546
 * Put it on the run-queue if it's not already there. The "current"
1547
 * thread is always on the run-queue (except when the actual
1548
 * re-schedule is in progress), and as such you're allowed to do
1549
 * the simpler "current->state = TASK_RUNNING" to mark yourself
1550
 * runnable without the overhead of this.
1551
 *
1552
 * Return: %true if @p was woken up, %false if it was already running
1553
 * or @state didn't match @p's state.
1554
 */
1555
static bool try_to_wake_up(struct task_struct *p, unsigned int state,
1556
			  int wake_flags)
1557
{
1558
	bool success = false;
1559
	unsigned long flags;
1560
	struct rq *rq;
1561
	int cpu;
1562
1563
	get_cpu();
1564
1565
	/*
1566
	 * If we are going to wake up a thread waiting for CONDITION we
1567
	 * need to ensure that CONDITION=1 done by the caller can not be
1568
	 * reordered with p->state check below. This pairs with mb() in
1569
	 * set_current_state() the waiting thread does.
1570
	 */
1571
	smp_mb__before_spinlock();
1572
1573
	/*
1574
	 * No need to do time_lock_grq as we only need to update the rq clock
1575
	 * if we activate the task
1576
	 */
1577
	rq = task_grq_lock(p, &flags);
1578
	cpu = task_cpu(p);
1579
1580
	/* state is a volatile long, why? I don't understand */
1581
	if (!((unsigned int)p->state & state))
1582
		goto out_unlock;
1583
1584
	if (task_queued(p) || task_running(p))
1585
		goto out_running;
1586
1587
	ttwu_activate(p, rq, wake_flags & WF_SYNC);
1588
	success = true;
1589
1590
out_running:
1591
	ttwu_post_activation(p, rq, success);
1592
out_unlock:
1593
	task_grq_unlock(&flags);
1594
1595
	ttwu_stat(p, cpu, wake_flags);
1596
1597
	put_cpu();
1598
1599
	return success;
1600
}
1601
1602
/**
1603
 * try_to_wake_up_local - try to wake up a local task with grq lock held
1604
 * @p: the thread to be awakened
1605
 *
1606
 * Put @p on the run-queue if it's not already there. The caller must
1607
 * ensure that grq is locked and @p is not the current task.
1608
 * grq stays locked over invocation.
1609
 */
1610
static void try_to_wake_up_local(struct task_struct *p)
1611
{
1612
	struct rq *rq = task_rq(p);
1613
	bool success = false;
1614
1615
	lockdep_assert_held(&grq.lock);
1616
1617
	if (!(p->state & TASK_NORMAL))
1618
		return;
1619
1620
	if (!task_queued(p)) {
1621
		if (likely(!task_running(p))) {
1622
			schedstat_inc(rq, ttwu_count);
1623
			schedstat_inc(rq, ttwu_local);
1624
		}
1625
		ttwu_activate(p, rq, false);
1626
		ttwu_stat(p, smp_processor_id(), 0);
1627
		success = true;
1628
	}
1629
	ttwu_post_activation(p, rq, success);
1630
}
1631
1632
/**
1633
 * wake_up_process - Wake up a specific process
1634
 * @p: The process to be woken up.
1635
 *
1636
 * Attempt to wake up the nominated process and move it to the set of runnable
1637
 * processes.
1638
 *
1639
 * Return: 1 if the process was woken up, 0 if it was already running.
1640
 *
1641
 * It may be assumed that this function implies a write memory barrier before
1642
 * changing the task state if and only if any tasks are woken up.
1643
 */
1644
int wake_up_process(struct task_struct *p)
1645
{
1646
	WARN_ON(task_is_stopped_or_traced(p));
1647
	return try_to_wake_up(p, TASK_NORMAL, 0);
1648
}
1649
EXPORT_SYMBOL(wake_up_process);
1650
1651
int wake_up_state(struct task_struct *p, unsigned int state)
1652
{
1653
	return try_to_wake_up(p, state, 0);
1654
}
1655
1656
static void time_slice_expired(struct task_struct *p);
1657
1658
/*
1659
 * Perform scheduler related setup for a newly forked process p.
1660
 * p is forked by current.
1661
 */
1662
void sched_fork(struct task_struct *p)
1663
{
1664
#ifdef CONFIG_PREEMPT_NOTIFIERS
1665
	INIT_HLIST_HEAD(&p->preempt_notifiers);
1666
#endif
1667
	/*
1668
	 * The process state is set to the same value of the process executing
1669
	 * do_fork() code. That is running. This guarantees that nobody will
1670
	 * actually run it, and a signal or other external event cannot wake
1671
	 * it up and insert it on the runqueue either.
1672
	 */
1673
1674
	/* Should be reset in fork.c but done here for ease of bfs patching */
1675
	p->utime =
1676
	p->stime =
1677
	p->utimescaled =
1678
	p->stimescaled =
1679
	p->sched_time =
1680
	p->stime_pc =
1681
	p->utime_pc = 0;
1682
1683
	/*
1684
	 * Revert to default priority/policy on fork if requested.
1685
	 */
1686
	if (unlikely(p->sched_reset_on_fork)) {
1687
		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
1688
			p->policy = SCHED_NORMAL;
1689
			p->normal_prio = normal_prio(p);
1690
		}
1691
1692
		if (PRIO_TO_NICE(p->static_prio) < 0) {
1693
			p->static_prio = NICE_TO_PRIO(0);
1694
			p->normal_prio = p->static_prio;
1695
		}
1696
1697
		/*
1698
		 * We don't need the reset flag anymore after the fork. It has
1699
		 * fulfilled its duty:
1700
		 */
1701
		p->sched_reset_on_fork = 0;
1702
	}
1703
1704
	INIT_LIST_HEAD(&p->run_list);
1705
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1706
	if (unlikely(sched_info_on()))
1707
		memset(&p->sched_info, 0, sizeof(p->sched_info));
1708
#endif
1709
	p->on_cpu = false;
1710
	clear_sticky(p);
1711
1712
#ifdef CONFIG_PREEMPT_COUNT
1713
	/* Want to start with kernel preemption disabled. */
1714
	task_thread_info(p)->preempt_count = 1;
1715
#endif
1716
}
1717
1718
/*
1719
 * wake_up_new_task - wake up a newly created task for the first time.
1720
 *
1721
 * This function will do some initial scheduler statistics housekeeping
1722
 * that must be done for every newly created context, then puts the task
1723
 * on the runqueue and wakes it.
1724
 */
1725
void wake_up_new_task(struct task_struct *p)
1726
{
1727
	struct task_struct *parent;
1728
	unsigned long flags;
1729
	struct rq *rq;
1730
1731
	parent = p->parent;
1732
	rq = task_grq_lock(p, &flags);
1733
1734
	/*
1735
	 * Reinit new task deadline as its creator deadline could have changed
1736
	 * since call to dup_task_struct().
1737
	 */
1738
	p->deadline = rq->rq_deadline;
1739
1740
	/*
1741
	 * If the task is a new process, current and parent are the same. If
1742
	 * the task is a new thread in the thread group, it will have much more
1743
	 * in common with current than with the parent.
1744
	 */
1745
	set_task_cpu(p, task_cpu(rq->curr));
1746
1747
	/*
1748
	 * Make sure we do not leak PI boosting priority to the child.
1749
	 */
1750
	p->prio = rq->curr->normal_prio;
1751
1752
	activate_task(p, rq);
1753
	trace_sched_wakeup_new(p, 1);
1754
	if (unlikely(p->policy == SCHED_FIFO))
1755
		goto after_ts_init;
1756
1757
	/*
1758
	 * Share the timeslice between parent and child, thus the
1759
	 * total amount of pending timeslices in the system doesn't change,
1760
	 * resulting in more scheduling fairness. If it's negative, it won't
1761
	 * matter since that's the same as being 0. current's time_slice is
1762
	 * actually in rq_time_slice when it's running, as is its last_ran
1763
	 * value. rq->rq_deadline is only modified within schedule() so it
1764
	 * is always equal to current->deadline.
1765
	 */
1766
	p->last_ran = rq->rq_last_ran;
1767
	if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
1768
		rq->rq_time_slice /= 2;
1769
		p->time_slice = rq->rq_time_slice;
1770
after_ts_init:
1771
		if (rq->curr == parent && !suitable_idle_cpus(p)) {
1772
			/*
1773
			 * The VM isn't cloned, so we're in a good position to
1774
			 * do child-runs-first in anticipation of an exec. This
1775
			 * usually avoids a lot of COW overhead.
1776
			 */
1777
			set_tsk_need_resched(parent);
1778
		} else
1779
			try_preempt(p, rq);
1780
	} else {
1781
		if (rq->curr == parent) {
1782
			/*
1783
			 * Forking task has run out of timeslice. Reschedule it and
1784
			 * start its child with a new time slice and deadline. The
1785
			 * child will end up running first because its deadline will
1786
			 * be slightly earlier.
1787
			 */
1788
			rq->rq_time_slice = 0;
1789
			set_tsk_need_resched(parent);
1790
		}
1791
		time_slice_expired(p);
1792
	}
1793
	task_grq_unlock(&flags);
1794
}
1795
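
The timeslice handling in wake_up_new_task() splits the parent's remaining quota rather than minting new time, so forking cannot multiply CPU entitlement. This sketch isolates just that arithmetic, with RESCHED_US as defined earlier in the file (the sample values, in microseconds, are picked for illustration):

#include <stdio.h>

#define RESCHED_US	100	/* as defined earlier in bfs.c */

/* Split a parent's remaining timeslice with its new child, as
 * wake_up_new_task() does; a nearly spent parent is rescheduled and the
 * child starts from a fresh slice instead (signalled here by returning 0). */
static int share_timeslice(int *parent_us)
{
	if (*parent_us >= RESCHED_US * 2) {
		*parent_us /= 2;
		return *parent_us;	/* child inherits the other half */
	}
	*parent_us = 0;			/* parent: reschedule */
	return 0;			/* child: new slice via time_slice_expired() */
}

int main(void)
{
	int parent = 4000, child;

	child = share_timeslice(&parent);
	printf("parent %d us, child %d us\n", parent, child);

	parent = 150;			/* parent nearly out of quota */
	child = share_timeslice(&parent);
	printf("parent %d us, child %d us (fresh slice)\n", parent, child);
	return 0;
}
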
1796
#ifdef CONFIG_PREEMPT_NOTIFIERS
1797
1798
/**
1799
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
1800
 * @notifier: notifier struct to register
1801
 */
1802
void preempt_notifier_register(struct preempt_notifier *notifier)
1803
{
1804
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
1805
}
1806
EXPORT_SYMBOL_GPL(preempt_notifier_register);
1807
1808
/**
1809
 * preempt_notifier_unregister - no longer interested in preemption notifications
1810
 * @notifier: notifier struct to unregister
1811
 *
1812
 * This is safe to call from within a preemption notifier.
1813
 */
1814
void preempt_notifier_unregister(struct preempt_notifier *notifier)
1815
{
1816
	hlist_del(&notifier->link);
1817
}
1818
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1819
1820
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1821
{
1822
	struct preempt_notifier *notifier;
1823
1824
	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1825
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
1826
}
1827
1828
static void
1829
fire_sched_out_preempt_notifiers(struct task_struct *curr,
1830
				 struct task_struct *next)
1831
{
1832
	struct preempt_notifier *notifier;
1833
1834
	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1835
		notifier->ops->sched_out(notifier, next);
1836
}
1837
1838
#else /* !CONFIG_PREEMPT_NOTIFIERS */
1839
1840
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1841
{
1842
}
1843
1844
static void
1845
fire_sched_out_preempt_notifiers(struct task_struct *curr,
1846
				 struct task_struct *next)
1847
{
1848
}
1849
1850
#endif /* CONFIG_PREEMPT_NOTIFIERS */
1851
1852
/**
1853
 * prepare_task_switch - prepare to switch tasks
1854
 * @rq: the runqueue preparing to switch
1855
 * @next: the task we are going to switch to.
1856
 *
1857
 * This is called with the rq lock held and interrupts off. It must
1858
 * be paired with a subsequent finish_task_switch after the context
1859
 * switch.
1860
 *
1861
 * prepare_task_switch sets up locking and calls architecture specific
1862
 * hooks.
1863
 */
1864
static inline void
1865
prepare_task_switch(struct rq *rq, struct task_struct *prev,
1866
		    struct task_struct *next)
1867
{
1868
	sched_info_switch(prev, next);
1869
	perf_event_task_sched_out(prev, next);
1870
	fire_sched_out_preempt_notifiers(prev, next);
1871
	prepare_lock_switch(rq, next);
1872
	prepare_arch_switch(next);
1873
	trace_sched_switch(prev, next);
1874
}
1875
1876
/**
1877
 * finish_task_switch - clean up after a task-switch
1878
 * @rq: runqueue associated with task-switch
1879
 * @prev: the thread we just switched away from.
1880
 *
1881
 * finish_task_switch must be called after the context switch, paired
1882
 * with a prepare_task_switch call before the context switch.
1883
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1884
 * and do any other architecture-specific cleanup actions.
1885
 *
1886
 * Note that we may have delayed dropping an mm in context_switch(). If
1887
 * so, we finish that here outside of the runqueue lock.  (Doing it
1888
 * with the lock held can cause deadlocks; see schedule() for
1889
 * details.)
1890
 */
1891
static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1892
	__releases(grq.lock)
1893
{
1894
	struct mm_struct *mm = rq->prev_mm;
1895
	long prev_state;
1896
1897
	rq->prev_mm = NULL;
1898
1899
	/*
1900
	 * A task struct has one reference for the use as "current".
1901
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1902
	 * schedule one last time. The schedule call will never return, and
1903
	 * the scheduled task must drop that reference.
1904
	 * The test for TASK_DEAD must occur while the runqueue locks are
1905
	 * still held, otherwise prev could be scheduled on another cpu, die
1906
	 * there before we look at prev->state, and then the reference would
1907
	 * be dropped twice.
1908
	 *		Manfred Spraul <manfred@colorfullife.com>
1909
	 */
1910
	prev_state = prev->state;
1911
	vtime_task_switch(prev);
1912
	finish_arch_switch(prev);
1913
	perf_event_task_sched_in(prev, current);
1914
	finish_lock_switch(rq, prev);
1915
	finish_arch_post_lock_switch();
1916
1917
	fire_sched_in_preempt_notifiers(current);
1918
	if (mm)
1919
		mmdrop(mm);
1920
	if (unlikely(prev_state == TASK_DEAD)) {
1921
		/*
1922
		 * Remove function-return probe instances associated with this
1923
		 * task and put them back on the free list.
1924
		 */
1925
		kprobe_flush_task(prev);
1926
		put_task_struct(prev);
1927
	}
1928
}
1929
1930
/**
1931
 * schedule_tail - first thing a freshly forked thread must call.
1932
 * @prev: the thread we just switched away from.
1933
 */
1934
asmlinkage void schedule_tail(struct task_struct *prev)
1935
	__releases(grq.lock)
1936
{
1937
	struct rq *rq = this_rq();
1938
1939
	finish_task_switch(rq, prev);
1940
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1941
	/* In this case, finish_task_switch does not reenable preemption */
1942
	preempt_enable();
1943
#endif
1944
	if (current->set_child_tid)
1945
		put_user(current->pid, current->set_child_tid);
1946
}
1947
1948
/*
1949
 * context_switch - switch to the new MM and the new
1950
 * thread's register state.
1951
 */
1952
static inline void
1953
context_switch(struct rq *rq, struct task_struct *prev,
1954
	       struct task_struct *next)
1955
{
1956
	struct mm_struct *mm, *oldmm;
1957
1958
	prepare_task_switch(rq, prev, next);
1959
1960
	mm = next->mm;
1961
	oldmm = prev->active_mm;
1962
	/*
1963
	 * For paravirt, this is coupled with an exit in switch_to to
1964
	 * combine the page table reload and the switch backend into
1965
	 * one hypercall.
1966
	 */
1967
	arch_start_context_switch(prev);
1968
1969
	if (!mm) {
1970
		next->active_mm = oldmm;
1971
		atomic_inc(&oldmm->mm_count);
1972
		enter_lazy_tlb(oldmm, next);
1973
	} else
1974
		switch_mm(oldmm, mm, next);
1975
1976
	if (!prev->mm) {
1977
		prev->active_mm = NULL;
1978
		rq->prev_mm = oldmm;
1979
	}
1980
	/*
1981
	 * Since the runqueue lock will be released by the next
1982
	 * task (which is an invalid locking op but in the case
1983
	 * of the scheduler it's an obvious special-case), so we
1984
	 * do an early lockdep release here:
1985
	 */
1986
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1987
	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
1988
#endif
1989
1990
	/* Here we just switch the register state and the stack. */
1991
	context_tracking_task_switch(prev, next);
1992
	switch_to(prev, next, prev);
1993
1994
	barrier();
1995
	/*
1996
	 * this_rq must be evaluated again because prev may have moved
1997
	 * CPUs since it called schedule(), thus the 'rq' on its stack
1998
	 * frame will be invalid.
1999
	 */
2000
	finish_task_switch(this_rq(), prev);
2001
}
2002
2003
/*
2004
 * nr_running, nr_uninterruptible and nr_context_switches:
2005
 *
2006
 * externally visible scheduler statistics: current number of runnable
2007
 * threads, total number of context switches performed since bootup. All are
2008
 * measured without grabbing the grq lock but the occasional inaccurate result
2009
 * doesn't matter so long as it's positive.
2010
 */
2011
unsigned long nr_running(void)
2012
{
2013
	long nr = grq.nr_running;
2014
2015
	if (unlikely(nr < 0))
2016
		nr = 0;
2017
	return (unsigned long)nr;
2018
}
2019
2020
static unsigned long nr_uninterruptible(void)
2021
{
2022
	long nu = grq.nr_uninterruptible;
2023
2024
	if (unlikely(nu < 0))
2025
		nu = 0;
2026
	return nu;
2027
}
2028
2029
unsigned long long nr_context_switches(void)
2030
{
2031
	long long ns = grq.nr_switches;
2032
2033
	/* This is of course impossible */
2034
	if (unlikely(ns < 0))
2035
		ns = 1;
2036
	return (unsigned long long)ns;
2037
}
2038
2039
unsigned long nr_iowait(void)
2040
{
2041
	unsigned long i, sum = 0;
2042
2043
	for_each_possible_cpu(i)
2044
		sum += atomic_read(&cpu_rq(i)->nr_iowait);
2045
2046
	return sum;
2047
}
2048
2049
unsigned long nr_iowait_cpu(int cpu)
2050
{
2051
	struct rq *this = cpu_rq(cpu);
2052
	return atomic_read(&this->nr_iowait);
2053
}
2054
2055
unsigned long nr_active(void)
2056
{
2057
	return nr_running() + nr_uninterruptible();
2058
}
2059
2060
/* Beyond a task running on this CPU, load is equal everywhere on BFS */
2061
unsigned long this_cpu_load(void)
2062
{
2063
	return this_rq()->rq_running +
2064
		((queued_notrunning() + nr_uninterruptible()) / grq.noc);
2065
}
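/*
 * Illustrative sketch, not part of the patch: assuming grq.noc is the
 * number of online CPUs, the load reported for a CPU is its own running
 * task plus an equal share of everything queued or uninterruptible
 * globally. For example, with one task running here, six tasks queued
 * but not running, two uninterruptible sleepers and four online CPUs:
 *
 *	this_cpu_load() = 1 + (6 + 2) / 4 = 3
 */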
2066
2067
/* Variables and functions for calc_load */
2068
static unsigned long calc_load_update;
2069
unsigned long avenrun[3];
2070
EXPORT_SYMBOL(avenrun);
2071
2072
/**
2073
 * get_avenrun - get the load average array
2074
 * @loads:	pointer to dest load array
2075
 * @offset:	offset to add
2076
 * @shift:	shift count to shift the result left
2077
 *
2078
 * These values are estimates at best, so no need for locking.
2079
 */
2080
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2081
{
2082
	loads[0] = (avenrun[0] + offset) << shift;
2083
	loads[1] = (avenrun[1] + offset) << shift;
2084
	loads[2] = (avenrun[2] + offset) << shift;
2085
}
2086
2087
static unsigned long
2088
calc_load(unsigned long load, unsigned long exp, unsigned long active)
2089
{
2090
	load *= exp;
2091
	load += active * (FIXED_1 - exp);
2092
	return load >> FSHIFT;
2093
}
2094
2095
/*
2096
 * calc_load - update the avenrun load estimates every LOAD_FREQ seconds.
2097
 */
2098
void calc_global_load(unsigned long ticks)
2099
{
2100
	long active;
2101
2102
	if (time_before(jiffies, calc_load_update))
2103
		return;
2104
	active = nr_active() * FIXED_1;
2105
2106
	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2107
	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2108
	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2109
2110
	calc_load_update = jiffies + LOAD_FREQ;
2111
}
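/*
 * Illustrative sketch, not part of the patch: calc_load() is a
 * fixed-point exponential moving average. Assuming the usual constants
 * FSHIFT = 11 (FIXED_1 = 2048) and EXP_1 = 1884, a 1-minute average of
 * 0.50 (avenrun[0] = 1024) with three active tasks updates as:
 *
 *	load = (1024 * 1884 + (3 * 2048) * (2048 - 1884)) >> 11
 *	     = (1929216 + 1007616) >> 11
 *	     = 1434			(~0.70)
 *
 * so each LOAD_FREQ interval moves the average part of the way towards
 * the instantaneous nr_active() value.
 */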
2112
2113
DEFINE_PER_CPU(struct kernel_stat, kstat);
2114
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2115
2116
EXPORT_PER_CPU_SYMBOL(kstat);
2117
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2118
2119
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2120
2121
/*
2122
 * There are no locks covering percpu hardirq/softirq time.
2123
 * They are only modified in account_system_vtime, on corresponding CPU
2124
 * with interrupts disabled. So, writes are safe.
2125
 * They are read and saved off onto struct rq in update_rq_clock().
2126
 * This may result in another CPU reading this CPU's irq time and can
2127
 * race with irq/account_system_vtime on this CPU. We would either get old
2128
 * or the new value, with a side effect of accounting a slice of irq time to the wrong
2129
 * task when irq is in progress while we read rq->clock. That is a worthy
2130
 * compromise in place of having locks on each irq in account_system_time.
2131
 */
2132
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
2133
static DEFINE_PER_CPU(u64, cpu_softirq_time);
2134
2135
static DEFINE_PER_CPU(u64, irq_start_time);
2136
static int sched_clock_irqtime;
2137
2138
void enable_sched_clock_irqtime(void)
2139
{
2140
	sched_clock_irqtime = 1;
2141
}
2142
2143
void disable_sched_clock_irqtime(void)
2144
{
2145
	sched_clock_irqtime = 0;
2146
}
2147
2148
#ifndef CONFIG_64BIT
2149
static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
2150
2151
static inline void irq_time_write_begin(void)
2152
{
2153
	__this_cpu_inc(irq_time_seq.sequence);
2154
	smp_wmb();
2155
}
2156
2157
static inline void irq_time_write_end(void)
2158
{
2159
	smp_wmb();
2160
	__this_cpu_inc(irq_time_seq.sequence);
2161
}
2162
2163
static inline u64 irq_time_read(int cpu)
2164
{
2165
	u64 irq_time;
2166
	unsigned seq;
2167
2168
	do {
2169
		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
2170
		irq_time = per_cpu(cpu_softirq_time, cpu) +
2171
			   per_cpu(cpu_hardirq_time, cpu);
2172
	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
2173
2174
	return irq_time;
2175
}
2176
#else /* CONFIG_64BIT */
2177
static inline void irq_time_write_begin(void)
2178
{
2179
}
2180
2181
static inline void irq_time_write_end(void)
2182
{
2183
}
2184
2185
static inline u64 irq_time_read(int cpu)
2186
{
2187
	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
2188
}
2189
#endif /* CONFIG_64BIT */
2190
2191
/*
2192
 * Called before incrementing preempt_count on {soft,}irq_enter
2193
 * and before decrementing preempt_count on {soft,}irq_exit.
2194
 */
2195
void irqtime_account_irq(struct task_struct *curr)
2196
{
2197
	unsigned long flags;
2198
	s64 delta;
2199
	int cpu;
2200
2201
	if (!sched_clock_irqtime)
2202
		return;
2203
2204
	local_irq_save(flags);
2205
2206
	cpu = smp_processor_id();
2207
	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
2208
	__this_cpu_add(irq_start_time, delta);
2209
2210
	irq_time_write_begin();
2211
	/*
2212
	 * We do not account for softirq time from ksoftirqd here.
2213
	 * We want to continue accounting softirq time to ksoftirqd thread
2214
	 * in that case, so as not to confuse the scheduler with a special task
2215
	 * that does not consume any time, but still wants to run.
2216
	 */
2217
	if (hardirq_count())
2218
		__this_cpu_add(cpu_hardirq_time, delta);
2219
	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
2220
		__this_cpu_add(cpu_softirq_time, delta);
2221
2222
	irq_time_write_end();
2223
	local_irq_restore(flags);
2224
}
2225
EXPORT_SYMBOL_GPL(irqtime_account_irq);
2226
2227
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2228
2229
#ifdef CONFIG_PARAVIRT
2230
static inline u64 steal_ticks(u64 steal)
2231
{
2232
	if (unlikely(steal > NSEC_PER_SEC))
2233
		return div_u64(steal, TICK_NSEC);
2234
2235
	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
2236
}
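/*
 * Illustrative sketch, not part of the patch: with HZ = 1000 (TICK_NSEC
 * roughly 1,000,000), a reported steal of 3,500,000 ns converts to 3
 * ticks via the iterative path above; the div_u64() fallback is only
 * taken once more than a full second of steal time has accumulated.
 */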
2237
#endif
2238
2239
static void update_rq_clock_task(struct rq *rq, s64 delta)
2240
{
2241
/*
2242
 * In theory, the compiler should just see 0 here, and optimize out the call
2243
 * to sched_rt_avg_update. But I don't trust it...
2244
 */
2245
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2246
	s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2247
2248
	/*
2249
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
2250
	 * this case when a previous update_rq_clock() happened inside a
2251
	 * {soft,}irq region.
2252
	 *
2253
	 * When this happens, we stop ->clock_task and only update the
2254
	 * prev_irq_time stamp to account for the part that fit, so that a next
2255
	 * update will consume the rest. This ensures ->clock_task is
2256
	 * monotonic.
2257
	 *
2258
	 * It does however cause some slight misattribution of {soft,}irq
2259
	 * time, a more accurate solution would be to update the irq_time using
2260
	 * the current rq->clock timestamp, except that would require using
2261
	 * atomic ops.
2262
	 */
2263
	if (irq_delta > delta)
2264
		irq_delta = delta;
2265
2266
	rq->prev_irq_time += irq_delta;
2267
	delta -= irq_delta;
2268
#endif
2269
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
2270
	if (static_key_false((&paravirt_steal_rq_enabled))) {
2271
		s64 steal = paravirt_steal_clock(cpu_of(rq));
2272
		u64 st;
2273
2274
		steal -= rq->prev_steal_time_rq;
2275
2276
		if (unlikely(steal > delta))
2277
			steal = delta;
2278
2279
		st = steal_ticks(steal);
2280
		steal = st * TICK_NSEC;
2281
2282
		rq->prev_steal_time_rq += steal;
2283
2284
		delta -= steal;
2285
	}
2286
#endif
2287
2288
	rq->clock_task += delta;
2289
}
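/*
 * Illustrative sketch, not part of the patch: suppose 1,000,000 ns of
 * wall clock passed since the last update but 1,200,000 ns of irq time
 * was observed because the previous update ran inside an irq region.
 * irq_delta is then clamped to 1,000,000, clock_task does not advance
 * this round, and the remaining 200,000 ns of irq time is consumed by
 * the next update, keeping clock_task monotonic at the cost of the
 * slight misattribution described above.
 */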
2290
2291
#ifndef nsecs_to_cputime
2292
# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
2293
#endif
2294
2295
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2296
static void irqtime_account_hi_si(void)
2297
{
2298
	u64 *cpustat = kcpustat_this_cpu->cpustat;
2299
	u64 latest_ns;
2300
2301
	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
2302
	if (latest_ns > cpustat[CPUTIME_IRQ])
2303
		cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
2304
2305
	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
2306
	if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
2307
		cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
2308
}
2309
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2310
2311
#define sched_clock_irqtime	(0)
2312
2313
static inline void irqtime_account_hi_si(void)
2314
{
2315
}
2316
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2317
2318
static __always_inline bool steal_account_process_tick(void)
2319
{
2320
#ifdef CONFIG_PARAVIRT
2321
	if (static_key_false(&paravirt_steal_enabled)) {
2322
		u64 steal, st = 0;
2323
2324
		steal = paravirt_steal_clock(smp_processor_id());
2325
		steal -= this_rq()->prev_steal_time;
2326
2327
		st = steal_ticks(steal);
2328
		this_rq()->prev_steal_time += st * TICK_NSEC;
2329
2330
		account_steal_time(st);
2331
		return st;
2332
	}
2333
#endif
2334
	return false;
2335
}
2336
2337
/*
2338
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
2339
 * tasks (sum on group iteration) belonging to @tsk's group.
2340
 */
2341
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
2342
{
2343
	struct signal_struct *sig = tsk->signal;
2344
	cputime_t utime, stime;
2345
	struct task_struct *t;
2346
2347
	times->utime = sig->utime;
2348
	times->stime = sig->stime;
2349
	times->sum_exec_runtime = sig->sum_sched_runtime;
2350
2351
	rcu_read_lock();
2352
	/* make sure we can trust tsk->thread_group list */
2353
	if (!likely(pid_alive(tsk)))
2354
		goto out;
2355
2356
	t = tsk;
2357
	do {
2358
		task_cputime(t, &utime, &stime);
2359
		times->utime += utime;
2360
		times->stime += stime;
2361
		times->sum_exec_runtime += task_sched_runtime(t);
2362
	} while_each_thread(tsk, t);
2363
out:
2364
	rcu_read_unlock();
2365
}
2366
2367
/*
2368
 * On each tick, see what percentage of that tick was attributed to each
2369
 * component and add the percentage to the _pc values. Once a _pc value has
2370
 * accumulated one tick's worth, account for that. This means the total
2371
 * percentage of load components will always be 128 (pseudo 100) per tick.
2372
 */
2373
static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc)
2374
{
2375
	u64 *cpustat = kcpustat_this_cpu->cpustat;
2376
2377
	if (atomic_read(&rq->nr_iowait) > 0) {
2378
		rq->iowait_pc += pc;
2379
		if (rq->iowait_pc >= 128) {
2380
			cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128;
2381
			rq->iowait_pc %= 128;
2382
		}
2383
	} else {
2384
		rq->idle_pc += pc;
2385
		if (rq->idle_pc >= 128) {
2386
			cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128;
2387
			rq->idle_pc %= 128;
2388
		}
2389
	}
2390
	acct_update_integrals(idle);
2391
}
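/*
 * Illustrative sketch, not part of the patch: every *_pc field follows
 * the same accumulate-and-carry pattern in 128ths of a tick:
 *
 *	acc += pc;
 *	if (acc >= 128) {
 *		cpustat_field += (__force u64)cputime_one_jiffy * acc / 128;
 *		acc %= 128;
 *	}
 *
 * i.e. whole ticks are banked into cpustat as soon as they have
 * accumulated, and the sub-tick remainder is carried forward.
 */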
2392
2393
static void
2394
pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
2395
	       unsigned long pc, unsigned long ns)
2396
{
2397
	u64 *cpustat = kcpustat_this_cpu->cpustat;
2398
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
2399
2400
	p->stime_pc += pc;
2401
	if (p->stime_pc >= 128) {
2402
		int jiffs = p->stime_pc / 128;
2403
2404
		p->stime_pc %= 128;
2405
		p->stime += (__force u64)cputime_one_jiffy * jiffs;
2406
		p->stimescaled += one_jiffy_scaled * jiffs;
2407
		account_group_system_time(p, cputime_one_jiffy * jiffs);
2408
	}
2409
	p->sched_time += ns;
2410
	/*
2411
	 * Do not update the cputimer if the task is already released by
2412
	 * release_task().
2413
	 *
2414
	 * This could be executed if a tick happens when a task is inside
2415
	 * do_exit() between the call to release_task() and its final
2416
	 * schedule() call for autoreaping tasks.
2417
	 */
2418
	if (likely(p->sighand))
2419
		account_group_exec_runtime(p, ns);
2420
2421
	if (hardirq_count() - hardirq_offset) {
2422
		rq->irq_pc += pc;
2423
		if (rq->irq_pc >= 128) {
2424
			cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128;
2425
			rq->irq_pc %= 128;
2426
		}
2427
	} else if (in_serving_softirq()) {
2428
		rq->softirq_pc += pc;
2429
		if (rq->softirq_pc >= 128) {
2430
			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
2431
			rq->softirq_pc %= 128;
2432
		}
2433
	} else {
2434
		rq->system_pc += pc;
2435
		if (rq->system_pc >= 128) {
2436
			cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128;
2437
			rq->system_pc %= 128;
2438
		}
2439
	}
2440
	acct_update_integrals(p);
2441
}
2442
2443
static void pc_user_time(struct rq *rq, struct task_struct *p,
2444
			 unsigned long pc, unsigned long ns)
2445
{
2446
	u64 *cpustat = kcpustat_this_cpu->cpustat;
2447
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
2448
2449
	p->utime_pc += pc;
2450
	if (p->utime_pc >= 128) {
2451
		int jiffs = p->utime_pc / 128;
2452
2453
		p->utime_pc %= 128;
2454
		p->utime += (__force u64)cputime_one_jiffy * jiffs;
2455
		p->utimescaled += one_jiffy_scaled * jiffs;
2456
		account_group_user_time(p, cputime_one_jiffy * jiffs);
2457
	}
2458
	p->sched_time += ns;
2459
	/*
2460
	 * Do not update the cputimer if the task is already released by
2461
	 * release_task().
2462
	 *
2463
	 * It would be preferable to defer the autoreap release_task
2464
	 * until after the last context switch, but that is harder to do.
2465
	 */
2466
	if (likely(p->sighand))
2467
		account_group_exec_runtime(p, ns);
2468
2469
	if (this_cpu_ksoftirqd() == p) {
2470
		/*
2471
		 * ksoftirqd time does not get accounted in cpu_softirq_time.
2472
		 * So, we have to handle it separately here.
2473
		 */
2474
		rq->softirq_pc += pc;
2475
		if (rq->softirq_pc >= 128) {
2476
			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
2477
			rq->softirq_pc %= 128;
2478
		}
2479
	}
2480
2481
	if (TASK_NICE(p) > 0 || idleprio_task(p)) {
2482
		rq->nice_pc += pc;
2483
		if (rq->nice_pc >= 128) {
2484
			cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128;
2485
			rq->nice_pc %= 128;
2486
		}
2487
	} else {
2488
		rq->user_pc += pc;
2489
		if (rq->user_pc >= 128) {
2490
			cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128;
2491
			rq->user_pc %= 128;
2492
		}
2493
	}
2494
	acct_update_integrals(p);
2495
}
2496
2497
/*
2498
 * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
2499
 * shifts instead of 100
2500
 */
2501
#define NS_TO_PC(NS)	(NS * 128 / JIFFY_NS)
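/*
 * Illustrative sketch, not part of the patch: with HZ = 1000 a jiffy is
 * about 1,000,000 ns, so a 250,000 ns interval converts to
 * NS_TO_PC(250000) = 250000 * 128 / 1000000 = 32, i.e. 32/128 (25%) of
 * one tick in the pseudo-percent units used by the _pc accounting above.
 */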
2502
2503
/*
2504
 * This is called on clock ticks.
2505
 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2506
 * CPU scheduler quota accounting is also performed here in microseconds.
2507
 */
2508
static void
2509
update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
2510
{
2511
	long account_ns = rq->clock_task - rq->rq_last_ran;
2512
	struct task_struct *idle = rq->idle;
2513
	unsigned long account_pc;
2514
2515
	if (unlikely(account_ns < 0) || steal_account_process_tick())
2516
		goto ts_account;
2517
2518
	account_pc = NS_TO_PC(account_ns);
2519
2520
	/* Accurate tick timekeeping */
2521
	if (user_mode(get_irq_regs()))
2522
		pc_user_time(rq, p, account_pc, account_ns);
2523
	else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
2524
		pc_system_time(rq, p, HARDIRQ_OFFSET,
2525
			       account_pc, account_ns);
2526
	else
2527
		pc_idle_time(rq, idle, account_pc);
2528
2529
	if (sched_clock_irqtime)
2530
		irqtime_account_hi_si();
2531
2532
ts_account:
2533
	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
2534
	if (rq->rq_policy != SCHED_FIFO && p != idle) {
2535
		s64 time_diff = rq->clock - rq->timekeep_clock;
2536
2537
		niffy_diff(&time_diff, 1);
2538
		rq->rq_time_slice -= NS_TO_US(time_diff);
2539
	}
2540
2541
	rq->rq_last_ran = rq->clock_task;
2542
	rq->timekeep_clock = rq->clock;
2543
}
2544
2545
/*
2546
 * This is called on context switches.
2547
 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2548
 * CPU scheduler quota accounting is also performed here in microseconds.
2549
 */
2550
static void
2551
update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
2552
{
2553
	long account_ns = rq->clock_task - rq->rq_last_ran;
2554
	struct task_struct *idle = rq->idle;
2555
	unsigned long account_pc;
2556
2557
	if (unlikely(account_ns < 0))
2558
		goto ts_account;
2559
2560
	account_pc = NS_TO_PC(account_ns);
2561
2562
	/* Accurate subtick timekeeping */
2563
	if (p != idle) {
2564
		pc_user_time(rq, p, account_pc, account_ns);
2565
	}
2566
	else
2567
		pc_idle_time(rq, idle, account_pc);
2568
2569
ts_account:
2570
	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
2571
	if (rq->rq_policy != SCHED_FIFO && p != idle) {
2572
		s64 time_diff = rq->clock - rq->timekeep_clock;
2573
2574
		niffy_diff(&time_diff, 1);
2575
		rq->rq_time_slice -= NS_TO_US(time_diff);
2576
	}
2577
2578
	rq->rq_last_ran = rq->clock_task;
2579
	rq->timekeep_clock = rq->clock;
2580
}
2581
2582
/*
2583
 * Return any ns on the sched_clock that have not yet been accounted in
2584
 * @p in case that task is currently running.
2585
 *
2586
 * Called with task_grq_lock() held.
2587
 */
2588
static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2589
{
2590
	u64 ns = 0;
2591
2592
	if (p == rq->curr) {
2593
		update_clocks(rq);
2594
		ns = rq->clock_task - rq->rq_last_ran;
2595
		if (unlikely((s64)ns < 0))
2596
			ns = 0;
2597
	}
2598
2599
	return ns;
2600
}
2601
2602
unsigned long long task_delta_exec(struct task_struct *p)
2603
{
2604
	unsigned long flags;
2605
	struct rq *rq;
2606
	u64 ns;
2607
2608
	rq = task_grq_lock(p, &flags);
2609
	ns = do_task_delta_exec(p, rq);
2610
	task_grq_unlock(&flags);
2611
2612
	return ns;
2613
}
2614
2615
/*
2616
 * Return accounted runtime for the task.
2617
 * Return separately the current task's pending runtime that has not been
2618
 * accounted yet.
2619
 *
2620
 * grq lock already acquired.
2621
 */
2622
unsigned long long task_sched_runtime(struct task_struct *p)
2623
{
2624
	unsigned long flags;
2625
	struct rq *rq;
2626
	u64 ns;
2627
2628
	rq = task_grq_lock(p, &flags);
2629
	ns = p->sched_time + do_task_delta_exec(p, rq);
2630
	task_grq_unlock(&flags);
2631
2632
	return ns;
2633
}
2634
2635
/*
2636
 * Return accounted runtime for the task.
2637
 * Return separately the current task's pending runtime that has not been
2638
 * accounted yet.
2639
 */
2640
unsigned long long task_sched_runtime_nodelta(struct task_struct *p, unsigned long long *delta)
2641
{
2642
	unsigned long flags;
2643
	struct rq *rq;
2644
	u64 ns;
2645
2646
	rq = task_grq_lock(p, &flags);
2647
	ns = p->sched_time;
2648
	*delta = do_task_delta_exec(p, rq);
2649
	task_grq_unlock(&flags);
2650
2651
	return ns;
2652
}
2653
2654
/* Compatibility crap */
2655
void account_user_time(struct task_struct *p, cputime_t cputime,
2656
		       cputime_t cputime_scaled)
2657
{
2658
}
2659
2660
void account_idle_time(cputime_t cputime)
2661
{
2662
}
2663
2664
void update_cpu_load_nohz(void)
2665
{
2666
}
2667
2668
#ifdef CONFIG_NO_HZ_COMMON
2669
void calc_load_enter_idle(void)
2670
{
2671
}
2672
2673
void calc_load_exit_idle(void)
2674
{
2675
}
2676
#endif /* CONFIG_NO_HZ_COMMON */
2677
2678
/*
2679
 * Account guest cpu time to a process.
2680
 * @p: the process that the cpu time gets accounted to
2681
 * @cputime: the cpu time spent in virtual machine since the last update
2682
 * @cputime_scaled: cputime scaled by cpu frequency
2683
 */
2684
static void account_guest_time(struct task_struct *p, cputime_t cputime,
2685
			       cputime_t cputime_scaled)
2686
{
2687
	u64 *cpustat = kcpustat_this_cpu->cpustat;
2688
2689
	/* Add guest time to process. */
2690
	p->utime += (__force u64)cputime;
2691
	p->utimescaled += (__force u64)cputime_scaled;
2692
	account_group_user_time(p, cputime);
2693
	p->gtime += (__force u64)cputime;
2694
2695
	/* Add guest time to cpustat. */
2696
	if (TASK_NICE(p) > 0) {
2697
		cpustat[CPUTIME_NICE] += (__force u64)cputime;
2698
		cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
2699
	} else {
2700
		cpustat[CPUTIME_USER] += (__force u64)cputime;
2701
		cpustat[CPUTIME_GUEST] += (__force u64)cputime;
2702
	}
2703
}
2704
2705
/*
2706
 * Account system cpu time to a process and desired cpustat field
2707
 * @p: the process that the cpu time gets accounted to
2708
 * @cputime: the cpu time spent in kernel space since the last update
2709
 * @cputime_scaled: cputime scaled by cpu frequency
2710
 * @target_cputime64: pointer to cpustat field that has to be updated
2711
 */
2712
static inline
2713
void __account_system_time(struct task_struct *p, cputime_t cputime,
2714
			cputime_t cputime_scaled, cputime64_t *target_cputime64)
2715
{
2716
	/* Add system time to process. */
2717
	p->stime += (__force u64)cputime;
2718
	p->stimescaled += (__force u64)cputime_scaled;
2719
	account_group_system_time(p, cputime);
2720
2721
	/* Add system time to cpustat. */
2722
	*target_cputime64 += (__force u64)cputime;
2723
2724
	/* Account for system time used */
2725
	acct_update_integrals(p);
2726
}
2727
2728
/*
2729
 * Account system cpu time to a process.
2730
 * @p: the process that the cpu time gets accounted to
2731
 * @hardirq_offset: the offset to subtract from hardirq_count()
2732
 * @cputime: the cpu time spent in kernel space since the last update
2733
 * @cputime_scaled: cputime scaled by cpu frequency
2734
 * This is for guest only now.
2735
 */
2736
void account_system_time(struct task_struct *p, int hardirq_offset,
2737
			 cputime_t cputime, cputime_t cputime_scaled)
2738
{
2739
2740
	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
2741
		account_guest_time(p, cputime, cputime_scaled);
2742
}
2743
2744
/*
2745
 * Account for involuntary wait time.
2746
 * @steal: the cpu time spent in involuntary wait
2747
 */
2748
void account_steal_time(cputime_t cputime)
2749
{
2750
	u64 *cpustat = kcpustat_this_cpu->cpustat;
2751
2752
	cpustat[CPUTIME_STEAL] += (__force u64)cputime;
2753
}
2754
2755
/*
2756
 * Account for idle time.
2757
 * @cputime: the cpu time spent in idle wait
2758
 */
2759
static void account_idle_times(cputime_t cputime)
2760
{
2761
	u64 *cpustat = kcpustat_this_cpu->cpustat;
2762
	struct rq *rq = this_rq();
2763
2764
	if (atomic_read(&rq->nr_iowait) > 0)
2765
		cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
2766
	else
2767
		cpustat[CPUTIME_IDLE] += (__force u64)cputime;
2768
}
2769
2770
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
2771
2772
void account_process_tick(struct task_struct *p, int user_tick)
2773
{
2774
}
2775
2776
/*
2777
 * Account multiple ticks of steal time.
2778
 * @p: the process from which the cpu time has been stolen
2779
 * @ticks: number of stolen ticks
2780
 */
2781
void account_steal_ticks(unsigned long ticks)
2782
{
2783
	account_steal_time(jiffies_to_cputime(ticks));
2784
}
2785
2786
/*
2787
 * Account multiple ticks of idle time.
2788
 * @ticks: number of stolen ticks
2789
 */
2790
void account_idle_ticks(unsigned long ticks)
2791
{
2792
	account_idle_times(jiffies_to_cputime(ticks));
2793
}
2794
#endif
2795
2796
static inline void grq_iso_lock(void)
2797
	__acquires(grq.iso_lock)
2798
{
2799
	raw_spin_lock(&grq.iso_lock);
2800
}
2801
2802
static inline void grq_iso_unlock(void)
2803
	__releases(grq.iso_lock)
2804
{
2805
	raw_spin_unlock(&grq.iso_lock);
2806
}
2807
2808
/*
2809
 * Functions to test for when SCHED_ISO tasks have used their allocated
2810
 * quota as real time scheduling and convert them back to SCHED_NORMAL.
2811
 * Where possible, the data is tested lockless, to avoid grabbing iso_lock
2812
 * because the occasional inaccurate result won't matter. However the
2813
 * tick data is only ever modified under lock. iso_refractory is only simply
2814
 * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
2815
 */
2816
static bool set_iso_refractory(void)
2817
{
2818
	grq.iso_refractory = true;
2819
	return grq.iso_refractory;
2820
}
2821
2822
static bool clear_iso_refractory(void)
2823
{
2824
	grq.iso_refractory = false;
2825
	return grq.iso_refractory;
2826
}
2827
2828
/*
2829
 * Test if SCHED_ISO tasks have run longer than their allotted period as RT
2830
 * tasks and set the refractory flag if necessary. There is 10% hysteresis
2831
 * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
2832
 * slow division.
2833
 */
2834
static bool test_ret_isorefractory(struct rq *rq)
2835
{
2836
	if (likely(!grq.iso_refractory)) {
2837
		if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
2838
			return set_iso_refractory();
2839
	} else {
2840
		if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
2841
			return clear_iso_refractory();
2842
	}
2843
	return grq.iso_refractory;
2844
}
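/*
 * Illustrative sketch, not part of the patch: assuming the default
 * sched_iso_cpu of 70, the refractory flag is set once iso_ticks exceeds
 * ISO_PERIOD * 70 and is only cleared again once it drops below
 * ISO_PERIOD * (70 * 115 / 128) = ISO_PERIOD * 62, roughly 90% of the
 * trigger level, giving the ~10% hysteresis described above.
 */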
2845
2846
static void iso_tick(void)
2847
{
2848
	grq_iso_lock();
2849
	grq.iso_ticks += 100;
2850
	grq_iso_unlock();
2851
}
2852
2853
/* No SCHED_ISO task was running so decrease rq->iso_ticks */
2854
static inline void no_iso_tick(void)
2855
{
2856
	if (grq.iso_ticks) {
2857
		grq_iso_lock();
2858
		grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
2859
		if (unlikely(grq.iso_refractory && grq.iso_ticks <
2860
		    ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
2861
			clear_iso_refractory();
2862
		grq_iso_unlock();
2863
	}
2864
}
2865
2866
/* This manages tasks that have run out of timeslice during a scheduler_tick */
2867
static void task_running_tick(struct rq *rq)
2868
{
2869
	struct task_struct *p;
2870
2871
	/*
2872
	 * If a SCHED_ISO task is running we increment the iso_ticks. In
2873
	 * order to prevent SCHED_ISO tasks from causing starvation in the
2874
	 * presence of true RT tasks we account those as iso_ticks as well.
2875
	 */
2876
	if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
2877
		if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
2878
			iso_tick();
2879
	} else
2880
		no_iso_tick();
2881
2882
	if (iso_queue(rq)) {
2883
		if (unlikely(test_ret_isorefractory(rq))) {
2884
			if (rq_running_iso(rq)) {
2885
				/*
2886
				 * SCHED_ISO task is running as RT and limit
2887
				 * has been hit. Force it to reschedule as
2888
				 * SCHED_NORMAL by zeroing its time_slice
2889
				 */
2890
				rq->rq_time_slice = 0;
2891
			}
2892
		}
2893
	}
2894
2895
	/* SCHED_FIFO tasks never run out of timeslice. */
2896
	if (rq->rq_policy == SCHED_FIFO)
2897
		return;
2898
	/*
2899
	 * Tasks that were scheduled in the first half of a tick are not
2900
	 * allowed to run into the 2nd half of the next tick if they will
2901
	 * run out of time slice in the interim. Otherwise, if they have
2902
	 * less than RESCHED_US μs of time slice left they will be rescheduled.
2903
	 */
2904
	if (rq->dither) {
2905
		if (rq->rq_time_slice > HALF_JIFFY_US)
2906
			return;
2907
		else
2908
			rq->rq_time_slice = 0;
2909
	} else if (rq->rq_time_slice >= RESCHED_US)
2910
			return;
2911
2912
	/* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
2913
	p = rq->curr;
2914
	grq_lock();
2915
	requeue_task(p);
2916
	set_tsk_need_resched(p);
2917
	grq_unlock();
2918
}
2919
2920
/*
2921
 * This function gets called by the timer code, with HZ frequency.
2922
 * We call it with interrupts disabled. The data modified is all
2923
 * local to struct rq so we don't need to grab grq lock.
2924
 */
2925
void scheduler_tick(void)
2926
{
2927
	int cpu __maybe_unused = smp_processor_id();
2928
	struct rq *rq = cpu_rq(cpu);
2929
2930
	sched_clock_tick();
2931
	/* grq lock not grabbed, so only update rq clock */
2932
	update_rq_clock(rq);
2933
	update_cpu_clock_tick(rq, rq->curr);
2934
	if (!rq_idle(rq))
2935
		task_running_tick(rq);
2936
	else
2937
		no_iso_tick();
2938
	rq->last_tick = rq->clock;
2939
	perf_event_task_tick();
2940
}
2941
2942
notrace unsigned long get_parent_ip(unsigned long addr)
2943
{
2944
	if (in_lock_functions(addr)) {
2945
		addr = CALLER_ADDR2;
2946
		if (in_lock_functions(addr))
2947
			addr = CALLER_ADDR3;
2948
	}
2949
	return addr;
2950
}
2951
2952
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2953
				defined(CONFIG_PREEMPT_TRACER))
2954
void __kprobes add_preempt_count(int val)
2955
{
2956
#ifdef CONFIG_DEBUG_PREEMPT
2957
	/*
2958
	 * Underflow?
2959
	 */
2960
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2961
		return;
2962
#endif
2963
	preempt_count() += val;
2964
#ifdef CONFIG_DEBUG_PREEMPT
2965
	/*
2966
	 * Spinlock count overflowing soon?
2967
	 */
2968
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2969
				PREEMPT_MASK - 10);
2970
#endif
2971
	if (preempt_count() == val)
2972
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2973
}
2974
EXPORT_SYMBOL(add_preempt_count);
2975
2976
void __kprobes sub_preempt_count(int val)
2977
{
2978
#ifdef CONFIG_DEBUG_PREEMPT
2979
	/*
2980
	 * Underflow?
2981
	 */
2982
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2983
		return;
2984
	/*
2985
	 * Is the spinlock portion underflowing?
2986
	 */
2987
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2988
			!(preempt_count() & PREEMPT_MASK)))
2989
		return;
2990
#endif
2991
2992
	if (preempt_count() == val)
2993
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2994
	preempt_count() -= val;
2995
}
2996
EXPORT_SYMBOL(sub_preempt_count);
2997
#endif
2998
2999
/*
3000
 * Deadline is "now" in niffies + (offset by priority). Setting the deadline
3001
 * is the key to everything. It distributes cpu fairly amongst tasks of the
3002
 * same nice value, it proportions cpu according to nice level, it means the
3003
 * task that woke up the longest ago has the earliest deadline, thus
3004
 * ensuring that interactive tasks get low latency on wake up. The CPU
3005
 * proportion works out to the square of the virtual deadline difference, so
3006
 * this equation gives a nice 19 task about 3% CPU compared to nice 0.
3007
 */
3008
static inline u64 prio_deadline_diff(int user_prio)
3009
{
3010
	return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
3011
}
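/*
 * Illustrative sketch, not part of the patch: assuming prio_ratios[0] is
 * 128 for nice 0 and the default rr_interval of 6 (ms), the nice 0
 * deadline offset is
 *
 *	128 * 6 * (MS_TO_NS(1) / 128) ~= 6ms,
 *
 * i.e. about one rr_interval, and since each successive ratio is roughly
 * 10% larger, lower priority tasks receive proportionally later
 * deadlines.
 */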
3012
3013
static inline u64 task_deadline_diff(struct task_struct *p)
3014
{
3015
	return prio_deadline_diff(TASK_USER_PRIO(p));
3016
}
3017
3018
static inline u64 static_deadline_diff(int static_prio)
3019
{
3020
	return prio_deadline_diff(USER_PRIO(static_prio));
3021
}
3022
3023
static inline int longest_deadline_diff(void)
3024
{
3025
	return prio_deadline_diff(39);
3026
}
3027
3028
static inline int ms_longest_deadline_diff(void)
3029
{
3030
	return NS_TO_MS(longest_deadline_diff());
3031
}
3032
3033
/*
3034
 * The time_slice is only refilled when it is empty and that is when we set a
3035
 * new deadline.
3036
 */
3037
static void time_slice_expired(struct task_struct *p)
3038
{
3039
	p->time_slice = timeslice();
3040
	p->deadline = grq.niffies + task_deadline_diff(p);
3041
}
3042
3043
/*
3044
 * Timeslices below RESCHED_US are considered as good as expired as there's no
3045
 * point rescheduling when there's so little time left. SCHED_BATCH tasks
3046
 * have been flagged as not latency sensitive and likely to be fully CPU
3047
 * bound so every time they're rescheduled they have their time_slice
3048
 * refilled, but get a new later deadline to have little effect on
3049
 * SCHED_NORMAL tasks.
3050
3051
 */
3052
static inline void check_deadline(struct task_struct *p)
3053
{
3054
	if (p->time_slice < RESCHED_US || batch_task(p))
3055
		time_slice_expired(p);
3056
}
3057
3058
#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
3059
3060
/*
3061
 * Scheduler queue bitmap specific find next bit.
3062
 */
3063
static inline unsigned long
3064
next_sched_bit(const unsigned long *addr, unsigned long offset)
3065
{
3066
	const unsigned long *p;
3067
	unsigned long result;
3068
	unsigned long size;
3069
	unsigned long tmp;
3070
3071
	size = PRIO_LIMIT;
3072
	if (offset >= size)
3073
		return size;
3074
3075
	p = addr + BITOP_WORD(offset);
3076
	result = offset & ~(BITS_PER_LONG-1);
3077
	size -= result;
3078
	offset %= BITS_PER_LONG;
3079
	if (offset) {
3080
		tmp = *(p++);
3081
		tmp &= (~0UL << offset);
3082
		if (size < BITS_PER_LONG)
3083
			goto found_first;
3084
		if (tmp)
3085
			goto found_middle;
3086
		size -= BITS_PER_LONG;
3087
		result += BITS_PER_LONG;
3088
	}
3089
	while (size & ~(BITS_PER_LONG-1)) {
3090
		if ((tmp = *(p++)))
3091
			goto found_middle;
3092
		result += BITS_PER_LONG;
3093
		size -= BITS_PER_LONG;
3094
	}
3095
	if (!size)
3096
		return result;
3097
	tmp = *p;
3098
3099
found_first:
3100
	tmp &= (~0UL >> (BITS_PER_LONG - size));
3101
	if (tmp == 0UL)		/* Are any bits set? */
3102
		return result + size;	/* Nope. */
3103
found_middle:
3104
	return result + __ffs(tmp);
3105
}
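/*
 * Illustrative sketch, not part of the patch: next_sched_bit() is a
 * PRIO_LIMIT-bounded find-next-bit, so a caller can walk every non-empty
 * priority level with a loop of this shape (this mirrors the loop used
 * by earliest_deadline_task() below):
 *
 *	unsigned long idx = -1;
 *
 *	while ((idx = next_sched_bit(grq.prio_bitmap, ++idx)) < PRIO_LIMIT) {
 *		struct list_head *queue = grq.queue + idx;
 *
 *		...inspect the tasks queued at priority idx...
 *	}
 */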
3106
3107
/*
3108
 * O(n) lookup of all tasks in the global runqueue. This is the real
3109
 * brainfuck of lock contention and O(n) cost. It's not truly O(n) in all
3110
 * tasks: only the queued, but not running, tasks are scanned, and even that
3111
 * is the worst case, since the right task can often be found before
3112
 * scanning all of them.
3113
 * Tasks are selected in this order:
3114
 * Real time tasks are selected purely by their static priority and in the
3115
 * order they were queued, so the lowest value idx, and the first queued task
3116
 * of that priority value is chosen.
3117
 * If no real time tasks are found, the SCHED_ISO priority is checked, and
3118
 * all SCHED_ISO tasks have the same priority value, so they're selected by
3119
 * the earliest deadline value.
3120
 * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
3121
 * earliest deadline.
3122
 * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
3123
 * selected by the earliest deadline.
3124
 */
3125
static inline struct
3126
task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
3127
{
3128
	struct task_struct *edt = NULL;
3129
	unsigned long idx = -1;
3130
3131
	do {
3132
		struct list_head *queue;
3133
		struct task_struct *p;
3134
		u64 earliest_deadline;
3135
3136
		idx = next_sched_bit(grq.prio_bitmap, ++idx);
3137
		if (idx >= PRIO_LIMIT)
3138
			return idle;
3139
		queue = grq.queue + idx;
3140
3141
		if (idx < MAX_RT_PRIO) {
3142
			/* We found an rt task */
3143
			list_for_each_entry(p, queue, run_list) {
3144
				/* Make sure cpu affinity is ok */
3145
				if (needs_other_cpu(p, cpu))
3146
					continue;
3147
				edt = p;
3148
				goto out_take;
3149
			}
3150
			/*
3151
			 * None of the RT tasks at this priority can run on
3152
			 * this cpu
3153
			 */
3154
			continue;
3155
		}
3156
3157
		/*
3158
		 * No rt tasks. Find the earliest deadline task. Now we're in
3159
		 * O(n) territory.
3160
		 */
3161
		earliest_deadline = ~0ULL;
3162
		list_for_each_entry(p, queue, run_list) {
3163
			u64 dl;
3164
3165
			/* Make sure cpu affinity is ok */
3166
			if (needs_other_cpu(p, cpu))
3167
				continue;
3168
3169
			/*
3170
			 * Soft affinity happens here: when this CPU is scaling, a
3171
			 * task with its sticky flag set that last ran on a
3172
			 * different CPU is not scheduled here at all; when not
3173
			 * scaling, such a task's deadline is instead greatly
3174
			 * biased against, based on cpu cache locality.
3175
			 */
3176
			if (task_sticky(p) && task_rq(p) != rq) {
3177
				if (scaling_rq(rq))
3178
					continue;
3179
				dl = p->deadline << locality_diff(p, rq);
3180
			} else
3181
				dl = p->deadline;
3182
3183
			if (deadline_before(dl, earliest_deadline)) {
3184
				earliest_deadline = dl;
3185
				edt = p;
3186
			}
3187
		}
3188
	} while (!edt);
3189
3190
out_take:
3191
	take_task(cpu, edt);
3192
	return edt;
3193
}
3194
3195
3196
/*
3197
 * Print scheduling while atomic bug:
3198
 */
3199
static noinline void __schedule_bug(struct task_struct *prev)
3200
{
3201
	if (oops_in_progress)
3202
		return;
3203
3204
	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3205
		prev->comm, prev->pid, preempt_count());
3206
3207
	debug_show_held_locks(prev);
3208
	print_modules();
3209
	if (irqs_disabled())
3210
		print_irqtrace_events(prev);
3211
	dump_stack();
3212
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3213
}
3214
3215
/*
3216
 * Various schedule()-time debugging checks and statistics:
3217
 */
3218
static inline void schedule_debug(struct task_struct *prev)
3219
{
3220
	/*
3221
	 * Test if we are atomic. Since do_exit() needs to call into
3222
	 * schedule() atomically, we ignore that path for now.
3223
	 * Otherwise, whine if we are scheduling when we should not be.
3224
	 */
3225
	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3226
		__schedule_bug(prev);
3227
	rcu_sleep_check();
3228
3229
	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3230
3231
	schedstat_inc(this_rq(), sched_count);
3232
}
3233
3234
/*
3235
 * The currently running task's information is all stored in rq local data
3236
 * which is only modified by the local CPU, thereby allowing the data to be
3237
 * changed without grabbing the grq lock.
3238
 */
3239
static inline void set_rq_task(struct rq *rq, struct task_struct *p)
3240
{
3241
	rq->rq_time_slice = p->time_slice;
3242
	rq->rq_deadline = p->deadline;
3243
	rq->rq_last_ran = p->last_ran = rq->clock_task;
3244
	rq->rq_policy = p->policy;
3245
	rq->rq_prio = p->prio;
3246
	if (p != rq->idle)
3247
		rq->rq_running = true;
3248
	else
3249
		rq->rq_running = false;
3250
}
3251
3252
static void reset_rq_task(struct rq *rq, struct task_struct *p)
3253
{
3254
	rq->rq_policy = p->policy;
3255
	rq->rq_prio = p->prio;
3256
}
3257
3258
/*
3259
 * schedule() is the main scheduler function.
3260
 *
3261
 * The main means of driving the scheduler and thus entering this function are:
3262
 *
3263
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
3264
 *
3265
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
3266
 *      paths. For example, see arch/x86/entry_64.S.
3267
 *
3268
 *      To drive preemption between tasks, the scheduler sets the flag in timer
3269
 *      interrupt handler scheduler_tick().
3270
 *
3271
 *   3. Wakeups don't really cause entry into schedule(). They add a
3272
 *      task to the run-queue and that's it.
3273
 *
3274
 *      Now, if the new task added to the run-queue preempts the current
3275
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
3276
 *      called on the nearest possible occasion:
3277
 *
3278
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
3279
 *
3280
 *         - in syscall or exception context, at the next outmost
3281
 *           preempt_enable(). (this might be as soon as the wake_up()'s
3282
 *           spin_unlock()!)
3283
 *
3284
 *         - in IRQ context, return from interrupt-handler to
3285
 *           preemptible context
3286
 *
3287
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
3288
 *         then at the next:
3289
 *
3290
 *          - cond_resched() call
3291
 *          - explicit schedule() call
3292
 *          - return from syscall or exception to user-space
3293
 *          - return from interrupt-handler to user-space
3294
 */
3295
asmlinkage void __sched schedule(void)
3296
{
3297
	struct task_struct *prev, *next, *idle;
3298
	unsigned long *switch_count;
3299
	bool deactivate;
3300
	struct rq *rq;
3301
	int cpu;
3302
3303
need_resched:
3304
	preempt_disable();
3305
	cpu = smp_processor_id();
3306
	rq = cpu_rq(cpu);
3307
	rcu_note_context_switch(cpu);
3308
	prev = rq->curr;
3309
3310
	deactivate = false;
3311
	schedule_debug(prev);
3312
3313
	/*
3314
	 * Make sure that signal_pending_state()->signal_pending() below
3315
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
3316
	 * done by the caller to avoid the race with signal_wake_up().
3317
	 */
3318
	smp_mb__before_spinlock();
3319
	grq_lock_irq();
3320
3321
	switch_count = &prev->nivcsw;
3322
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3323
		if (unlikely(signal_pending_state(prev->state, prev))) {
3324
			prev->state = TASK_RUNNING;
3325
		} else {
3326
			deactivate = true;
3327
			/*
3328
			 * If a worker is going to sleep, notify and
3329
			 * ask workqueue whether it wants to wake up a
3330
			 * task to maintain concurrency.  If so, wake
3331
			 * up the task.
3332
			 */
3333
			if (prev->flags & PF_WQ_WORKER) {
3334
				struct task_struct *to_wakeup;
3335
3336
				to_wakeup = wq_worker_sleeping(prev, cpu);
3337
				if (to_wakeup) {
3338
					/* This shouldn't happen, but does */
3339
					if (unlikely(to_wakeup == prev))
3340
						deactivate = false;
3341
					else
3342
						try_to_wake_up_local(to_wakeup);
3343
				}
3344
			}
3345
		}
3346
		switch_count = &prev->nvcsw;
3347
	}
3348
3349
	/*
3350
	 * If we are going to sleep and we have plugged IO queued, make
3351
	 * sure to submit it to avoid deadlocks.
3352
	 */
3353
	if (unlikely(deactivate && blk_needs_flush_plug(prev))) {
3354
		grq_unlock_irq();
3355
		preempt_enable_no_resched();
3356
		blk_schedule_flush_plug(prev);
3357
		goto need_resched;
3358
	}
3359
3360
	update_clocks(rq);
3361
	update_cpu_clock_switch(rq, prev);
3362
	if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
3363
		rq->dither = false;
3364
	else
3365
		rq->dither = true;
3366
3367
	clear_tsk_need_resched(prev);
3368
3369
	idle = rq->idle;
3370
	if (idle != prev) {
3371
		/* Update all the information stored on struct rq */
3372
		prev->time_slice = rq->rq_time_slice;
3373
		prev->deadline = rq->rq_deadline;
3374
		check_deadline(prev);
3375
		prev->last_ran = rq->clock_task;
3376
3377
		/* Task changed affinity off this CPU */
3378
		if (needs_other_cpu(prev, cpu)) {
3379
			if (!deactivate)
3380
				resched_suitable_idle(prev);
3381
		} else if (!deactivate) {
3382
			if (!queued_notrunning()) {
3383
				/*
3384
				* We now know prev is the only thing that is
3385
				* awaiting CPU so we can bypass rechecking for
3386
				* the earliest deadline task and just run it
3387
				* again.
3388
				*/
3389
				set_rq_task(rq, prev);
3390
				grq_unlock_irq();
3391
				goto rerun_prev_unlocked;
3392
			} else
3393
				swap_sticky(rq, cpu, prev);
3394
		}
3395
		return_task(prev, deactivate);
3396
	}
3397
3398
	if (unlikely(!queued_notrunning())) {
3399
		/*
3400
		 * This CPU is now truly idle as opposed to when idle is
3401
		 * scheduled as a high priority task in its own right.
3402
		 */
3403
		next = idle;
3404
		schedstat_inc(rq, sched_goidle);
3405
		set_cpuidle_map(cpu);
3406
	} else {
3407
		next = earliest_deadline_task(rq, cpu, idle);
3408
		if (likely(next->prio != PRIO_LIMIT))
3409
			clear_cpuidle_map(cpu);
3410
		else
3411
			set_cpuidle_map(cpu);
3412
	}
3413
3414
	if (likely(prev != next)) {
3415
		resched_suitable_idle(prev);
3416
		/*
3417
		 * Don't stick tasks when a real time task is going to run as
3418
		 * they may literally get stuck.
3419
		 */
3420
		if (rt_task(next))
3421
			unstick_task(rq, prev);
3422
		set_rq_task(rq, next);
3423
		grq.nr_switches++;
3424
		prev->on_cpu = false;
3425
		next->on_cpu = true;
3426
		rq->curr = next;
3427
		++*switch_count;
3428
3429
		context_switch(rq, prev, next); /* unlocks the grq */
3430
		/*
3431
		 * The context switch has flipped the stack from under us
3432
		 * and restored the local variables which were saved when
3433
		 * this task called schedule() in the past. prev == current
3434
		 * is still correct, but it can be moved to another cpu/rq.
3435
		 */
3436
		cpu = smp_processor_id();
3437
		rq = cpu_rq(cpu);
3438
		idle = rq->idle;
3439
	} else
3440
		grq_unlock_irq();
3441
3442
rerun_prev_unlocked:
3443
	sched_preempt_enable_no_resched();
3444
	if (unlikely(need_resched()))
3445
		goto need_resched;
3446
}
3447
EXPORT_SYMBOL(schedule);
3448
3449
#ifdef CONFIG_RCU_USER_QS
3450
asmlinkage void __sched schedule_user(void)
3451
{
3452
	/*
3453
	 * If we come here after a random call to set_need_resched(),
3454
	 * or we have been woken up remotely but the IPI has not yet arrived,
3455
	 * we haven't yet exited the RCU idle mode. Do it here manually until
3456
	 * we find a better solution.
3457
	 */
3458
	user_exit();
3459
	schedule();
3460
	user_enter();
3461
}
3462
#endif
3463
3464
/**
3465
 * schedule_preempt_disabled - called with preemption disabled
3466
 *
3467
 * Returns with preemption disabled. Note: preempt_count must be 1
3468
 */
3469
void __sched schedule_preempt_disabled(void)
3470
{
3471
	sched_preempt_enable_no_resched();
3472
	schedule();
3473
	preempt_disable();
3474
}
3475
3476
#ifdef CONFIG_PREEMPT
3477
/*
3478
 * this is the entry point to schedule() from in-kernel preemption
3479
 * off of preempt_enable. Kernel preemptions off of return from interrupt
3480
 * occur there and call schedule directly.
3481
 */
3482
asmlinkage void __sched notrace preempt_schedule(void)
3483
{
3484
	struct thread_info *ti = current_thread_info();
3485
3486
	/*
3487
	 * If there is a non-zero preempt_count or interrupts are disabled,
3488
	 * we do not want to preempt the current task. Just return..
3489
	 */
3490
	if (likely(ti->preempt_count || irqs_disabled()))
3491
		return;
3492
3493
	do {
3494
		add_preempt_count_notrace(PREEMPT_ACTIVE);
3495
		schedule();
3496
		sub_preempt_count_notrace(PREEMPT_ACTIVE);
3497
3498
		/*
3499
		 * Check again in case we missed a preemption opportunity
3500
		 * between schedule and now.
3501
		 */
3502
		barrier();
3503
	} while (need_resched());
3504
}
3505
EXPORT_SYMBOL(preempt_schedule);
3506
3507
/*
3508
 * this is the entry point to schedule() from kernel preemption
3509
 * off of irq context.
3510
 * Note that this is called and returns with irqs disabled. This will
3511
 * protect us against recursive calling from irq.
3512
 */
3513
asmlinkage void __sched preempt_schedule_irq(void)
3514
{
3515
	struct thread_info *ti = current_thread_info();
3516
	enum ctx_state prev_state;
3517
3518
	/* Catch callers which need to be fixed */
3519
	BUG_ON(ti->preempt_count || !irqs_disabled());
3520
3521
	prev_state = exception_enter();
3522
3523
	do {
3524
		add_preempt_count(PREEMPT_ACTIVE);
3525
		local_irq_enable();
3526
		schedule();
3527
		local_irq_disable();
3528
		sub_preempt_count(PREEMPT_ACTIVE);
3529
3530
		/*
3531
		 * Check again in case we missed a preemption opportunity
3532
		 * between schedule and now.
3533
		 */
3534
		barrier();
3535
	} while (need_resched());
3536
3537
	exception_exit(prev_state);
3538
}
3539
3540
#endif /* CONFIG_PREEMPT */
3541
3542
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3543
			  void *key)
3544
{
3545
	return try_to_wake_up(curr->private, mode, wake_flags);
3546
}
3547
EXPORT_SYMBOL(default_wake_function);
3548
3549
/*
3550
 * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
3551
 * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
3552
 * number) then we wake all the non-exclusive tasks and one exclusive task.
3553
 *
3554
 * There are circumstances in which we can try to wake a task which has already
3555
 * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
3556
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3557
 */
3558
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3559
			int nr_exclusive, int wake_flags, void *key)
3560
{
3561
	struct list_head *tmp, *next;
3562
3563
	list_for_each_safe(tmp, next, &q->task_list) {
3564
		wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3565
		unsigned int flags = curr->flags;
3566
3567
		if (curr->func(curr, mode, wake_flags, key) &&
3568
				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3569
			break;
3570
	}
3571
}
3572
3573
/**
3574
 * __wake_up - wake up threads blocked on a waitqueue.
3575
 * @q: the waitqueue
3576
 * @mode: which threads
3577
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3578
 * @key: is directly passed to the wakeup function
3579
 *
3580
 * It may be assumed that this function implies a write memory barrier before
3581
 * changing the task state if and only if any tasks are woken up.
3582
 */
3583
void __wake_up(wait_queue_head_t *q, unsigned int mode,
3584
			int nr_exclusive, void *key)
3585
{
3586
	unsigned long flags;
3587
3588
	spin_lock_irqsave(&q->lock, flags);
3589
	__wake_up_common(q, mode, nr_exclusive, 0, key);
3590
	spin_unlock_irqrestore(&q->lock, flags);
3591
}
3592
EXPORT_SYMBOL(__wake_up);
3593
3594
/*
3595
 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3596
 */
3597
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3598
{
3599
	__wake_up_common(q, mode, nr, 0, NULL);
3600
}
3601
EXPORT_SYMBOL_GPL(__wake_up_locked);
3602
3603
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3604
{
3605
	__wake_up_common(q, mode, 1, 0, key);
3606
}
3607
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3608
3609
/**
3610
 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3611
 * @q: the waitqueue
3612
 * @mode: which threads
3613
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3614
 * @key: opaque value to be passed to wakeup targets
3615
 *
3616
 * The sync wakeup differs in that the waker knows that it will schedule
3617
 * away soon, so while the target thread will be woken up, it will not
3618
 * be migrated to another CPU - ie. the two threads are 'synchronised'
3619
 * with each other. This can prevent needless bouncing between CPUs.
3620
 *
3621
 * On UP it can prevent extra preemption.
3622
 *
3623
 * It may be assumed that this function implies a write memory barrier before
3624
 * changing the task state if and only if any tasks are woken up.
3625
 */
3626
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3627
			int nr_exclusive, void *key)
3628
{
3629
	unsigned long flags;
3630
	int wake_flags = WF_SYNC;
3631
3632
	if (unlikely(!q))
3633
		return;
3634
3635
	if (unlikely(!nr_exclusive))
3636
		wake_flags = 0;
3637
3638
	spin_lock_irqsave(&q->lock, flags);
3639
	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3640
	spin_unlock_irqrestore(&q->lock, flags);
3641
}
3642
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3643
3644
/**
3645
 * __wake_up_sync - wake up threads blocked on a waitqueue.
3646
 * @q: the waitqueue
3647
 * @mode: which threads
3648
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3649
 *
3650
 * The sync wakeup differs in that the waker knows that it will schedule
3651
 * away soon, so while the target thread will be woken up, it will not
3652
 * be migrated to another CPU - ie. the two threads are 'synchronised'
3653
 * with each other. This can prevent needless bouncing between CPUs.
3654
 *
3655
 * On UP it can prevent extra preemption.
3656
 */
3657
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3658
{
3659
	unsigned long flags;
3660
	int sync = 1;
3661
3662
	if (unlikely(!q))
3663
		return;
3664
3665
	if (unlikely(!nr_exclusive))
3666
		sync = 0;
3667
3668
	spin_lock_irqsave(&q->lock, flags);
3669
	__wake_up_common(q, mode, nr_exclusive, sync, NULL);
3670
	spin_unlock_irqrestore(&q->lock, flags);
3671
}
3672
EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
3673
3674
/**
3675
 * complete: - signals a single thread waiting on this completion
3676
 * @x:  holds the state of this particular completion
3677
 *
3678
 * This will wake up a single thread waiting on this completion. Threads will be
3679
 * awakened in the same order in which they were queued.
3680
 *
3681
 * See also complete_all(), wait_for_completion() and related routines.
3682
 *
3683
 * It may be assumed that this function implies a write memory barrier before
3684
 * changing the task state if and only if any tasks are woken up.
3685
 */
3686
void complete(struct completion *x)
3687
{
3688
	unsigned long flags;
3689
3690
	spin_lock_irqsave(&x->wait.lock, flags);
3691
	x->done++;
3692
	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3693
	spin_unlock_irqrestore(&x->wait.lock, flags);
3694
}
3695
EXPORT_SYMBOL(complete);
3696
3697
/**
3698
 * complete_all: - signals all threads waiting on this completion
3699
 * @x:  holds the state of this particular completion
3700
 *
3701
 * This will wake up all threads waiting on this particular completion event.
3702
 *
3703
 * It may be assumed that this function implies a write memory barrier before
3704
 * changing the task state if and only if any tasks are woken up.
3705
 */
3706
void complete_all(struct completion *x)
3707
{
3708
	unsigned long flags;
3709
3710
	spin_lock_irqsave(&x->wait.lock, flags);
3711
	x->done += UINT_MAX/2;
3712
	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3713
	spin_unlock_irqrestore(&x->wait.lock, flags);
3714
}
3715
EXPORT_SYMBOL(complete_all);
3716
3717
static inline long __sched
3718
do_wait_for_common(struct completion *x,
3719
		   long (*action)(long), long timeout, int state)
3720
{
3721
	if (!x->done) {
3722
		DECLARE_WAITQUEUE(wait, current);
3723
3724
		__add_wait_queue_tail_exclusive(&x->wait, &wait);
3725
		do {
3726
			if (signal_pending_state(state, current)) {
3727
				timeout = -ERESTARTSYS;
3728
				break;
3729
			}
3730
			__set_current_state(state);
3731
			spin_unlock_irq(&x->wait.lock);
3732
			timeout = action(timeout);
3733
			spin_lock_irq(&x->wait.lock);
3734
		} while (!x->done && timeout);
3735
		__remove_wait_queue(&x->wait, &wait);
3736
		if (!x->done)
3737
			return timeout;
3738
	}
3739
	x->done--;
3740
	return timeout ?: 1;
3741
}
3742
3743
static inline long __sched
3744
__wait_for_common(struct completion *x,
3745
		  long (*action)(long), long timeout, int state)
3746
{
3747
	might_sleep();
3748
3749
	spin_lock_irq(&x->wait.lock);
3750
	timeout = do_wait_for_common(x, action, timeout, state);
3751
	spin_unlock_irq(&x->wait.lock);
3752
	return timeout;
3753
}
3754
3755
static long __sched
3756
wait_for_common(struct completion *x, long timeout, int state)
3757
{
3758
	return __wait_for_common(x, schedule_timeout, timeout, state);
3759
}
3760
3761
static long __sched
3762
wait_for_common_io(struct completion *x, long timeout, int state)
3763
{
3764
	return __wait_for_common(x, io_schedule_timeout, timeout, state);
3765
}
3766
3767
/**
3768
 * wait_for_completion: - waits for completion of a task
3769
 * @x:  holds the state of this particular completion
3770
 *
3771
 * This waits to be signaled for completion of a specific task. It is NOT
3772
 * interruptible and there is no timeout.
3773
 *
3774
 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
3775
 * and interrupt capability. Also see complete().
3776
 */
3777
void __sched wait_for_completion(struct completion *x)
3778
{
3779
	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3780
}
3781
EXPORT_SYMBOL(wait_for_completion);
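/*
 * Illustrative usage sketch, not part of the patch (hypothetical names):
 * a minimal pairing of a waiter and a signaller using the long-standing
 * completion API (DECLARE_COMPLETION(), complete() and
 * wait_for_completion() above). The waiter blocks until the signaller
 * calls complete(), which wakes exactly one waiter in queue order:
 *
 *	static DECLARE_COMPLETION(setup_done);
 *
 *	static int waiter_thread(void *unused)
 *	{
 *		wait_for_completion(&setup_done);
 *		return 0;
 *	}
 *
 *	static void finish_setup(void)
 *	{
 *		complete(&setup_done);
 *	}
 */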
3782
3783
/**
3784
 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
3785
 * @x:  holds the state of this particular completion
3786
 * @timeout:  timeout value in jiffies
3787
 *
3788
 * This waits for either a completion of a specific task to be signaled or for a
3789
 * specified timeout to expire. The timeout is in jiffies. It is not
3790
 * interruptible.
3791
 *
3792
 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
3793
 * till timeout) if completed.
3794
 */
3795
unsigned long __sched
3796
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3797
{
3798
	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3799
}
3800
EXPORT_SYMBOL(wait_for_completion_timeout);
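/*
 * Hedged example (not part of this patch) of handling the return value
 * described above; cmd_done and the 100ms bound are made-up names/values.
 */
static DECLARE_COMPLETION(cmd_done);

static int wait_for_cmd(void)
{
	unsigned long left;

	left = wait_for_completion_timeout(&cmd_done, msecs_to_jiffies(100));
	if (!left)
		return -ETIMEDOUT;	/* 0 means the timeout expired first */
	return 0;			/* > 0: completed, jiffies left over */
}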
3801
3802
/**
3803
 * wait_for_completion_io: - waits for completion of a task
3804
 * @x:  holds the state of this particular completion
3805
 *
3806
 * This waits to be signaled for completion of a specific task. It is NOT
3807
 * interruptible and there is no timeout. The caller is accounted as waiting
3808
 * for IO.
3809
 */
3810
void __sched wait_for_completion_io(struct completion *x)
3811
{
3812
	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3813
}
3814
EXPORT_SYMBOL(wait_for_completion_io);
3815
3816
/**
3817
 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
3818
 * @x:  holds the state of this particular completion
3819
 * @timeout:  timeout value in jiffies
3820
 *
3821
 * This waits for either a completion of a specific task to be signaled or for a
3822
 * specified timeout to expire. The timeout is in jiffies. It is not
3823
 * interruptible. The caller is accounted as waiting for IO.
3824
 *
3825
 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
3826
 * till timeout) if completed.
3827
 */
3828
unsigned long __sched
3829
wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
3830
{
3831
	return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
3832
}
3833
EXPORT_SYMBOL(wait_for_completion_io_timeout);
3834
3835
/**
3836
 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3837
 * @x:  holds the state of this particular completion
3838
 *
3839
 * This waits for completion of a specific task to be signaled. It is
3840
 * interruptible.
3841
 *
3842
 * Return: -ERESTARTSYS if interrupted, 0 if completed.
3843
 */
3844
int __sched wait_for_completion_interruptible(struct completion *x)
3845
{
3846
	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3847
	if (t == -ERESTARTSYS)
3848
		return t;
3849
	return 0;
3850
}
3851
EXPORT_SYMBOL(wait_for_completion_interruptible);
3852
3853
/**
3854
 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
3855
 * @x:  holds the state of this particular completion
3856
 * @timeout:  timeout value in jiffies
3857
 *
3858
 * This waits for either a completion of a specific task to be signaled or for a
3859
 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3860
 *
3861
 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
3862
 * or number of jiffies left till timeout) if completed.
3863
 */
3864
long __sched
3865
wait_for_completion_interruptible_timeout(struct completion *x,
3866
					  unsigned long timeout)
3867
{
3868
	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3869
}
3870
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3871
3872
/**
3873
 * wait_for_completion_killable: - waits for completion of a task (killable)
3874
 * @x:  holds the state of this particular completion
3875
 *
3876
 * This waits to be signaled for completion of a specific task. It can be
3877
 * interrupted by a kill signal.
3878
 *
3879
 * Return: -ERESTARTSYS if interrupted, 0 if completed.
3880
 */
3881
int __sched wait_for_completion_killable(struct completion *x)
3882
{
3883
	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3884
	if (t == -ERESTARTSYS)
3885
		return t;
3886
	return 0;
3887
}
3888
EXPORT_SYMBOL(wait_for_completion_killable);
3889
3890
/**
3891
 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
3892
 * @x:  holds the state of this particular completion
3893
 * @timeout:  timeout value in jiffies
3894
 *
3895
 * This waits for either a completion of a specific task to be
3896
 * signaled or for a specified timeout to expire. It can be
3897
 * interrupted by a kill signal. The timeout is in jiffies.
3898
 *
3899
 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
3900
 * or number of jiffies left till timeout) if completed.
3901
 */
3902
long __sched
3903
wait_for_completion_killable_timeout(struct completion *x,
3904
				     unsigned long timeout)
3905
{
3906
	return wait_for_common(x, timeout, TASK_KILLABLE);
3907
}
3908
EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3909
3910
/**
3911
 *	try_wait_for_completion - try to decrement a completion without blocking
3912
 *	@x:	completion structure
3913
 *
3914
 *	Return: 0 if a decrement cannot be done without blocking
3915
 *		1 if a decrement succeeded.
3916
 *
3917
 *	If a completion is being used as a counting completion,
3918
 *	attempt to decrement the counter without blocking. This
3919
 *	enables us to avoid waiting if the resource the completion
3920
 *	is protecting is not available.
3921
 */
3922
bool try_wait_for_completion(struct completion *x)
3923
{
3924
	unsigned long flags;
3925
	int ret = 1;
3926
3927
	spin_lock_irqsave(&x->wait.lock, flags);
3928
	if (!x->done)
3929
		ret = 0;
3930
	else
3931
		x->done--;
3932
	spin_unlock_irqrestore(&x->wait.lock, flags);
3933
	return ret;
3934
}
3935
EXPORT_SYMBOL(try_wait_for_completion);
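/*
 * Illustrative, non-authoritative sketch of the non-blocking variant above;
 * resource_ready is a made-up name.  try_wait_for_completion() consumes one
 * 'done' count when available and never sleeps.
 */
static DECLARE_COMPLETION(resource_ready);

static bool grab_resource_nonblock(void)
{
	if (try_wait_for_completion(&resource_ready))
		return true;	/* decremented x->done, resource is ours */
	return false;		/* would have blocked; caller may retry later */
}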
3936
3937
/**
3938
 *	completion_done - Test to see if a completion has any waiters
3939
 *	@x:	completion structure
3940
 *
3941
 *	Return: 0 if there are waiters (wait_for_completion() in progress)
3942
 *		1 if there are no waiters.
3943
 *
3944
 */
3945
bool completion_done(struct completion *x)
3946
{
3947
	unsigned long flags;
3948
	int ret = 1;
3949
3950
	spin_lock_irqsave(&x->wait.lock, flags);
3951
	if (!x->done)
3952
		ret = 0;
3953
	spin_unlock_irqrestore(&x->wait.lock, flags);
3954
	return ret;
3955
}
3956
EXPORT_SYMBOL(completion_done);
3957
3958
static long __sched
3959
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3960
{
3961
	unsigned long flags;
3962
	wait_queue_t wait;
3963
3964
	init_waitqueue_entry(&wait, current);
3965
3966
	__set_current_state(state);
3967
3968
	spin_lock_irqsave(&q->lock, flags);
3969
	__add_wait_queue(q, &wait);
3970
	spin_unlock(&q->lock);
3971
	timeout = schedule_timeout(timeout);
3972
	spin_lock_irq(&q->lock);
3973
	__remove_wait_queue(q, &wait);
3974
	spin_unlock_irqrestore(&q->lock, flags);
3975
3976
	return timeout;
3977
}
3978
3979
void __sched interruptible_sleep_on(wait_queue_head_t *q)
3980
{
3981
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3982
}
3983
EXPORT_SYMBOL(interruptible_sleep_on);
3984
3985
long __sched
3986
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3987
{
3988
	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3989
}
3990
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3991
3992
void __sched sleep_on(wait_queue_head_t *q)
3993
{
3994
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3995
}
3996
EXPORT_SYMBOL(sleep_on);
3997
3998
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3999
{
4000
	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4001
}
4002
EXPORT_SYMBOL(sleep_on_timeout);
4003
4004
#ifdef CONFIG_RT_MUTEXES
4005
4006
/*
4007
 * rt_mutex_setprio - set the current priority of a task
4008
 * @p: task
4009
 * @prio: prio value (kernel-internal form)
4010
 *
4011
 * This function changes the 'effective' priority of a task. It does
4012
 * not touch ->normal_prio like __setscheduler().
4013
 *
4014
 * Used by the rt_mutex code to implement priority inheritance logic.
4015
 */
4016
void rt_mutex_setprio(struct task_struct *p, int prio)
4017
{
4018
	unsigned long flags;
4019
	int queued, oldprio;
4020
	struct rq *rq;
4021
4022
	BUG_ON(prio < 0 || prio > MAX_PRIO);
4023
4024
	rq = task_grq_lock(p, &flags);
4025
4026
	/*
4027
	 * Idle task boosting is a no-no in general. There is one
4028
	 * exception, when PREEMPT_RT and NOHZ are active:
4029
	 *
4030
	 * The idle task calls get_next_timer_interrupt() and holds
4031
	 * the timer wheel base->lock on the CPU and another CPU wants
4032
	 * to access the timer (probably to cancel it). We can safely
4033
	 * ignore the boosting request, as the idle CPU runs this code
4034
	 * with interrupts disabled and will complete the lock
4035
	 * protected section without being interrupted. So there is no
4036
	 * real need to boost.
4037
	 */
4038
	if (unlikely(p == rq->idle)) {
4039
		WARN_ON(p != rq->curr);
4040
		WARN_ON(p->pi_blocked_on);
4041
		goto out_unlock;
4042
	}
4043
4044
	trace_sched_pi_setprio(p, prio);
4045
	oldprio = p->prio;
4046
	queued = task_queued(p);
4047
	if (queued)
4048
		dequeue_task(p);
4049
	p->prio = prio;
4050
	if (task_running(p) && prio > oldprio)
4051
		resched_task(p);
4052
	if (queued) {
4053
		enqueue_task(p);
4054
		try_preempt(p, rq);
4055
	}
4056
4057
out_unlock:
4058
	task_grq_unlock(&flags);
4059
}
4060
4061
#endif
4062
4063
/*
4064
 * Adjust the deadline for when the priority is to change, before it's
4065
 * changed.
4066
 */
4067
static inline void adjust_deadline(struct task_struct *p, int new_prio)
4068
{
4069
	p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
4070
}
4071
4072
void set_user_nice(struct task_struct *p, long nice)
4073
{
4074
	int queued, new_static, old_static;
4075
	unsigned long flags;
4076
	struct rq *rq;
4077
4078
	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4079
		return;
4080
	new_static = NICE_TO_PRIO(nice);
4081
	/*
4082
	 * We have to be careful, if called from sys_setpriority(),
4083
	 * the task might be in the middle of scheduling on another CPU.
4084
	 */
4085
	rq = time_task_grq_lock(p, &flags);
4086
	/*
4087
	 * The RT priorities are set via sched_setscheduler(), but we still
4088
	 * allow the 'normal' nice value to be set - but as expected
4089
	 * it won't have any effect on scheduling until the task's
4090
	 * policy is no longer SCHED_NORMAL/SCHED_BATCH:
4091
	 */
4092
	if (has_rt_policy(p)) {
4093
		p->static_prio = new_static;
4094
		goto out_unlock;
4095
	}
4096
	queued = task_queued(p);
4097
	if (queued)
4098
		dequeue_task(p);
4099
4100
	adjust_deadline(p, new_static);
4101
	old_static = p->static_prio;
4102
	p->static_prio = new_static;
4103
	p->prio = effective_prio(p);
4104
4105
	if (queued) {
4106
		enqueue_task(p);
4107
		if (new_static < old_static)
4108
			try_preempt(p, rq);
4109
	} else if (task_running(p)) {
4110
		reset_rq_task(rq, p);
4111
		if (old_static < new_static)
4112
			resched_task(p);
4113
	}
4114
out_unlock:
4115
	task_grq_unlock(&flags);
4116
}
4117
EXPORT_SYMBOL(set_user_nice);
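/*
 * Minimal in-kernel usage sketch, not part of this patch; the helper name is
 * hypothetical.  set_user_nice() is the usual way to deprioritise a kernel
 * thread, here dropping the calling task to nice 10.
 */
static void deprioritise_current(void)
{
	set_user_nice(current, 10);
}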
4118
4119
/*
4120
 * can_nice - check if a task can reduce its nice value
4121
 * @p: task
4122
 * @nice: nice value
4123
 */
4124
int can_nice(const struct task_struct *p, const int nice)
4125
{
4126
	/* convert nice value [19,-20] to rlimit style value [1,40] */
4127
	int nice_rlim = 20 - nice;
4128
4129
	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4130
		capable(CAP_SYS_NICE));
4131
}
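/*
 * Worked example of the conversion above (illustration only, not part of the
 * patch): nice 19 maps to nice_rlim 1, nice 0 to 20 and nice -20 to 40, so a
 * task may lower itself to nice -10 only if RLIMIT_NICE is at least 30 or it
 * has CAP_SYS_NICE.
 */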
4132
4133
#ifdef __ARCH_WANT_SYS_NICE
4134
4135
/*
4136
 * sys_nice - change the priority of the current process.
4137
 * @increment: priority increment
4138
 *
4139
 * sys_setpriority is a more generic, but much slower function that
4140
 * does similar things.
4141
 */
4142
SYSCALL_DEFINE1(nice, int, increment)
4143
{
4144
	long nice, retval;
4145
4146
	/*
4147
	 * Setpriority might change our priority at the same moment.
4148
	 * We don't have to worry. Conceptually one call occurs first
4149
	 * and we have a single winner.
4150
	 */
4151
	if (increment < -40)
4152
		increment = -40;
4153
	if (increment > 40)
4154
		increment = 40;
4155
4156
	nice = TASK_NICE(current) + increment;
4157
	if (nice < -20)
4158
		nice = -20;
4159
	if (nice > 19)
4160
		nice = 19;
4161
4162
	if (increment < 0 && !can_nice(current, nice))
4163
		return -EPERM;
4164
4165
	retval = security_task_setnice(current, nice);
4166
	if (retval)
4167
		return retval;
4168
4169
	set_user_nice(current, nice);
4170
	return 0;
4171
}
4172
4173
#endif
4174
4175
/**
4176
 * task_prio - return the priority value of a given task.
4177
 * @p: the task in question.
4178
 *
4179
 * Return: The priority value as seen by users in /proc.
4180
 * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
4181
 * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
4182
 */
4183
int task_prio(const struct task_struct *p)
4184
{
4185
	int delta, prio = p->prio - MAX_RT_PRIO;
4186
4187
	/* rt tasks and iso tasks */
4188
	if (prio <= 0)
4189
		goto out;
4190
4191
	/* Convert to ms to avoid overflows */
4192
	delta = NS_TO_MS(p->deadline - grq.niffies);
4193
	delta = delta * 40 / ms_longest_deadline_diff();
4194
	if (delta > 0 && delta <= 80)
4195
		prio += delta;
4196
	if (idleprio_task(p))
4197
		prio += 40;
4198
out:
4199
	return prio;
4200
}
4201
4202
/**
4203
 * task_nice - return the nice value of a given task.
4204
 * @p: the task in question.
4205
 *
4206
 * Return: The nice value [ -20 ... 0 ... 19 ].
4207
 */
4208
int task_nice(const struct task_struct *p)
4209
{
4210
	return TASK_NICE(p);
4211
}
4212
EXPORT_SYMBOL_GPL(task_nice);
4213
4214
/**
4215
 * idle_cpu - is a given cpu idle currently?
4216
 * @cpu: the processor in question.
4217
 *
4218
 * Return: 1 if the CPU is currently idle. 0 otherwise.
4219
 */
4220
int idle_cpu(int cpu)
4221
{
4222
#ifdef CONFIG_SMP
4223
	struct rq *rq = cpu_rq(cpu);
4224
4225
	if (!llist_empty(&rq->wake_list))
4226
		return 0;
4227
#endif
4228
	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4229
}
4230
4231
/**
4232
 * idle_task - return the idle task for a given cpu.
4233
 * @cpu: the processor in question.
4234
 *
4235
 * Return: The idle task for the cpu @cpu.
4236
 */
4237
struct task_struct *idle_task(int cpu)
4238
{
4239
	return cpu_rq(cpu)->idle;
4240
}
4241
4242
/**
4243
 * find_process_by_pid - find a process with a matching PID value.
4244
 * @pid: the pid in question.
4245
 *
4246
 * The task of @pid, if found. %NULL otherwise.
4247
 */
4248
static inline struct task_struct *find_process_by_pid(pid_t pid)
4249
{
4250
	return pid ? find_task_by_vpid(pid) : current;
4251
}
4252
4253
/* Actually do priority change: must hold grq lock. */
4254
static void
4255
__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio)
4256
{
4257
	int oldrtprio, oldprio;
4258
4259
	p->policy = policy;
4260
	oldrtprio = p->rt_priority;
4261
	p->rt_priority = prio;
4262
	p->normal_prio = normal_prio(p);
4263
	oldprio = p->prio;
4264
	/* we are holding p->pi_lock already */
4265
	p->prio = rt_mutex_getprio(p);
4266
	if (task_running(p)) {
4267
		reset_rq_task(rq, p);
4268
		/* Resched only if we might now be preempted */
4269
		if (p->prio > oldprio || p->rt_priority > oldrtprio)
4270
			resched_task(p);
4271
	}
4272
}
4273
4274
/*
4275
 * check the target process has a UID that matches the current process's
4276
 */
4277
static bool check_same_owner(struct task_struct *p)
4278
{
4279
	const struct cred *cred = current_cred(), *pcred;
4280
	bool match;
4281
4282
	rcu_read_lock();
4283
	pcred = __task_cred(p);
4284
	match = (uid_eq(cred->euid, pcred->euid) ||
4285
		 uid_eq(cred->euid, pcred->uid));
4286
	rcu_read_unlock();
4287
	return match;
4288
}
4289
4290
static int __sched_setscheduler(struct task_struct *p, int policy,
4291
				const struct sched_param *param, bool user)
4292
{
4293
	struct sched_param zero_param = { .sched_priority = 0 };
4294
	int queued, retval, oldpolicy = -1;
4295
	unsigned long flags, rlim_rtprio = 0;
4296
	int reset_on_fork;
4297
	struct rq *rq;
4298
4299
	/* may grab non-irq protected spin_locks */
4300
	BUG_ON(in_interrupt());
4301
4302
	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
4303
		unsigned long lflags;
4304
4305
		if (!lock_task_sighand(p, &lflags))
4306
			return -ESRCH;
4307
		rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4308
		unlock_task_sighand(p, &lflags);
4309
		if (rlim_rtprio)
4310
			goto recheck;
4311
		/*
4312
		 * If the caller requested an RT policy without having the
4313
		 * necessary rights, we downgrade the policy to SCHED_ISO.
4314
		 * We also set the parameter to zero to pass the checks.
4315
		 */
4316
		policy = SCHED_ISO;
4317
		param = &zero_param;
4318
	}
4319
recheck:
4320
	/* double check policy once rq lock held */
4321
	if (policy < 0) {
4322
		reset_on_fork = p->sched_reset_on_fork;
4323
		policy = oldpolicy = p->policy;
4324
	} else {
4325
		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4326
		policy &= ~SCHED_RESET_ON_FORK;
4327
4328
		if (!SCHED_RANGE(policy))
4329
			return -EINVAL;
4330
	}
4331
4332
	/*
4333
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
4334
	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
4335
	 * SCHED_BATCH is 0.
4336
	 */
4337
	if (param->sched_priority < 0 ||
4338
	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
4339
	    (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
4340
		return -EINVAL;
4341
	if (is_rt_policy(policy) != (param->sched_priority != 0))
4342
		return -EINVAL;
4343
4344
	/*
4345
	 * Allow unprivileged RT tasks to decrease priority:
4346
	 */
4347
	if (user && !capable(CAP_SYS_NICE)) {
4348
		if (is_rt_policy(policy)) {
4349
			unsigned long rlim_rtprio =
4350
					task_rlimit(p, RLIMIT_RTPRIO);
4351
4352
			/* can't set/change the rt policy */
4353
			if (policy != p->policy && !rlim_rtprio)
4354
				return -EPERM;
4355
4356
			/* can't increase priority */
4357
			if (param->sched_priority > p->rt_priority &&
4358
			    param->sched_priority > rlim_rtprio)
4359
				return -EPERM;
4360
		} else {
4361
			switch (p->policy) {
4362
				/*
4363
				 * Can only downgrade policies but not back to
4364
				 * SCHED_NORMAL
4365
				 */
4366
				case SCHED_ISO:
4367
					if (policy == SCHED_ISO)
4368
						goto out;
4369
					if (policy == SCHED_NORMAL)
4370
						return -EPERM;
4371
					break;
4372
				case SCHED_BATCH:
4373
					if (policy == SCHED_BATCH)
4374
						goto out;
4375
					if (policy != SCHED_IDLEPRIO)
4376
						return -EPERM;
4377
					break;
4378
				case SCHED_IDLEPRIO:
4379
					if (policy == SCHED_IDLEPRIO)
4380
						goto out;
4381
					return -EPERM;
4382
				default:
4383
					break;
4384
			}
4385
		}
4386
4387
		/* can't change other user's priorities */
4388
		if (!check_same_owner(p))
4389
			return -EPERM;
4390
4391
		/* Normal users shall not reset the sched_reset_on_fork flag */
4392
		if (p->sched_reset_on_fork && !reset_on_fork)
4393
			return -EPERM;
4394
	}
4395
4396
	if (user) {
4397
		retval = security_task_setscheduler(p);
4398
		if (retval)
4399
			return retval;
4400
	}
4401
4402
	/*
4403
	 * make sure no PI-waiters arrive (or leave) while we are
4404
	 * changing the priority of the task:
4405
	 */
4406
	raw_spin_lock_irqsave(&p->pi_lock, flags);
4407
	/*
4408
	 * To be able to change p->policy safely, the grunqueue lock must be
4409
	 * held.
4410
	 */
4411
	rq = __task_grq_lock(p);
4412
4413
	/*
4414
	 * Changing the policy of the stop threads is a very bad idea
4415
	 */
4416
	if (p == rq->stop) {
4417
		__task_grq_unlock();
4418
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4419
		return -EINVAL;
4420
	}
4421
4422
	/*
4423
	 * If not changing anything there's no need to proceed further:
4424
	 */
4425
	if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
4426
			param->sched_priority == p->rt_priority))) {
4427
4428
		__task_grq_unlock();
4429
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4430
		return 0;
4431
	}
4432
4433
	/* recheck policy now with rq lock held */
4434
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4435
		policy = oldpolicy = -1;
4436
		__task_grq_unlock();
4437
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4438
		goto recheck;
4439
	}
4440
	update_clocks(rq);
4441
	p->sched_reset_on_fork = reset_on_fork;
4442
4443
	queued = task_queued(p);
4444
	if (queued)
4445
		dequeue_task(p);
4446
	__setscheduler(p, rq, policy, param->sched_priority);
4447
	if (queued) {
4448
		enqueue_task(p);
4449
		try_preempt(p, rq);
4450
	}
4451
	__task_grq_unlock();
4452
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4453
4454
	rt_mutex_adjust_pi(p);
4455
out:
4456
	return 0;
4457
}
4458
4459
/**
4460
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4461
 * @p: the task in question.
4462
 * @policy: new policy.
4463
 * @param: structure containing the new RT priority.
4464
 *
4465
 * Return: 0 on success. An error code otherwise.
4466
 *
4467
 * NOTE that the task may be already dead.
4468
 */
4469
int sched_setscheduler(struct task_struct *p, int policy,
4470
		       const struct sched_param *param)
4471
{
4472
	return __sched_setscheduler(p, policy, param, true);
4473
}
4474
4475
EXPORT_SYMBOL_GPL(sched_setscheduler);
4476
4477
/**
4478
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4479
 * @p: the task in question.
4480
 * @policy: new policy.
4481
 * @param: structure containing the new RT priority.
4482
 *
4483
 * Just like sched_setscheduler, only don't bother checking if the
4484
 * current context has permission.  For example, this is needed in
4485
 * stop_machine(): we create temporary high priority worker threads,
4486
 * but our caller might not have that capability.
4487
 *
4488
 * Return: 0 on success. An error code otherwise.
4489
 */
4490
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4491
			       const struct sched_param *param)
4492
{
4493
	return __sched_setscheduler(p, policy, param, false);
4494
}
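/*
 * Hedged sketch only, not part of this patch: how kernel code can promote an
 * already-created helper thread to SCHED_FIFO without the capability checks,
 * mirroring the stop_machine() case mentioned above.  make_thread_fifo() is
 * an illustrative name.
 */
static int make_thread_fifo(struct task_struct *p)
{
	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO - 1 };

	return sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
}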
4495
4496
static int
4497
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4498
{
4499
	struct sched_param lparam;
4500
	struct task_struct *p;
4501
	int retval;
4502
4503
	if (!param || pid < 0)
4504
		return -EINVAL;
4505
	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4506
		return -EFAULT;
4507
4508
	rcu_read_lock();
4509
	retval = -ESRCH;
4510
	p = find_process_by_pid(pid);
4511
	if (p != NULL)
4512
		retval = sched_setscheduler(p, policy, &lparam);
4513
	rcu_read_unlock();
4514
4515
	return retval;
4516
}
4517
4518
/**
4519
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4520
 * @pid: the pid in question.
4521
 * @policy: new policy.
4522
 * @param: structure containing the new RT priority.
4523
 *
4524
 * Return: 0 on success. An error code otherwise.
4525
 */
4526
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4527
				       struct sched_param __user *param)
4528
{
4529
	/* negative values for policy are not valid */
4530
	if (policy < 0)
4531
		return -EINVAL;
4532
4533
	return do_sched_setscheduler(pid, policy, param);
4534
}
4535
4536
/**
4537
 * sys_sched_setparam - set/change the RT priority of a thread
4538
 * @pid: the pid in question.
4539
 * @param: structure containing the new RT priority.
4540
 *
4541
 * Return: 0 on success. An error code otherwise.
4542
 */
4543
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4544
{
4545
	return do_sched_setscheduler(pid, -1, param);
4546
}
4547
4548
/**
4549
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4550
 * @pid: the pid in question.
4551
 *
4552
 * Return: On success, the policy of the thread. Otherwise, a negative error
4553
 * code.
4554
 */
4555
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4556
{
4557
	struct task_struct *p;
4558
	int retval = -EINVAL;
4559
4560
	if (pid < 0)
4561
		goto out_nounlock;
4562
4563
	retval = -ESRCH;
4564
	rcu_read_lock();
4565
	p = find_process_by_pid(pid);
4566
	if (p) {
4567
		retval = security_task_getscheduler(p);
4568
		if (!retval)
4569
			retval = p->policy;
4570
	}
4571
	rcu_read_unlock();
4572
4573
out_nounlock:
4574
	return retval;
4575
}
4576
4577
/**
4578
 * sys_sched_getparam - get the RT priority of a thread
4579
 * @pid: the pid in question.
4580
 * @param: structure containing the RT priority.
4581
 *
4582
 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
4583
 * code.
4584
 */
4585
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4586
{
4587
	struct sched_param lp;
4588
	struct task_struct *p;
4589
	int retval = -EINVAL;
4590
4591
	if (!param || pid < 0)
4592
		goto out_nounlock;
4593
4594
	rcu_read_lock();
4595
	p = find_process_by_pid(pid);
4596
	retval = -ESRCH;
4597
	if (!p)
4598
		goto out_unlock;
4599
4600
	retval = security_task_getscheduler(p);
4601
	if (retval)
4602
		goto out_unlock;
4603
4604
	lp.sched_priority = p->rt_priority;
4605
	rcu_read_unlock();
4606
4607
	/*
4608
	 * This one might sleep, we cannot do it with a spinlock held ...
4609
	 */
4610
	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4611
4612
out_nounlock:
4613
	return retval;
4614
4615
out_unlock:
4616
	rcu_read_unlock();
4617
	return retval;
4618
}
4619
4620
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4621
{
4622
	cpumask_var_t cpus_allowed, new_mask;
4623
	struct task_struct *p;
4624
	int retval;
4625
4626
	get_online_cpus();
4627
	rcu_read_lock();
4628
4629
	p = find_process_by_pid(pid);
4630
	if (!p) {
4631
		rcu_read_unlock();
4632
		put_online_cpus();
4633
		return -ESRCH;
4634
	}
4635
4636
	/* Prevent p going away */
4637
	get_task_struct(p);
4638
	rcu_read_unlock();
4639
4640
	if (p->flags & PF_NO_SETAFFINITY) {
4641
		retval = -EINVAL;
4642
		goto out_put_task;
4643
	}
4644
	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4645
		retval = -ENOMEM;
4646
		goto out_put_task;
4647
	}
4648
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4649
		retval = -ENOMEM;
4650
		goto out_free_cpus_allowed;
4651
	}
4652
	retval = -EPERM;
4653
	if (!check_same_owner(p)) {
4654
		rcu_read_lock();
4655
		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4656
			rcu_read_unlock();
4657
			goto out_unlock;
4658
		}
4659
		rcu_read_unlock();
4660
	}
4661
4662
	retval = security_task_setscheduler(p);
4663
	if (retval)
4664
		goto out_unlock;
4665
4666
	cpuset_cpus_allowed(p, cpus_allowed);
4667
	cpumask_and(new_mask, in_mask, cpus_allowed);
4668
again:
4669
	retval = set_cpus_allowed_ptr(p, new_mask);
4670
4671
	if (!retval) {
4672
		cpuset_cpus_allowed(p, cpus_allowed);
4673
		if (!cpumask_subset(new_mask, cpus_allowed)) {
4674
			/*
4675
			 * We must have raced with a concurrent cpuset
4676
			 * update. Just reset the cpus_allowed to the
4677
			 * cpuset's cpus_allowed
4678
			 */
4679
			cpumask_copy(new_mask, cpus_allowed);
4680
			goto again;
4681
		}
4682
	}
4683
out_unlock:
4684
	free_cpumask_var(new_mask);
4685
out_free_cpus_allowed:
4686
	free_cpumask_var(cpus_allowed);
4687
out_put_task:
4688
	put_task_struct(p);
4689
	put_online_cpus();
4690
	return retval;
4691
}
4692
4693
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4694
			     cpumask_t *new_mask)
4695
{
4696
	if (len < sizeof(cpumask_t)) {
4697
		memset(new_mask, 0, sizeof(cpumask_t));
4698
	} else if (len > sizeof(cpumask_t)) {
4699
		len = sizeof(cpumask_t);
4700
	}
4701
	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4702
}
4703
4704
4705
/**
4706
 * sys_sched_setaffinity - set the cpu affinity of a process
4707
 * @pid: pid of the process
4708
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4709
 * @user_mask_ptr: user-space pointer to the new cpu mask
4710
 *
4711
 * Return: 0 on success. An error code otherwise.
4712
 */
4713
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4714
		unsigned long __user *, user_mask_ptr)
4715
{
4716
	cpumask_var_t new_mask;
4717
	int retval;
4718
4719
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4720
		return -ENOMEM;
4721
4722
	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4723
	if (retval == 0)
4724
		retval = sched_setaffinity(pid, new_mask);
4725
	free_cpumask_var(new_mask);
4726
	return retval;
4727
}
4728
4729
long sched_getaffinity(pid_t pid, cpumask_t *mask)
4730
{
4731
	struct task_struct *p;
4732
	unsigned long flags;
4733
	int retval;
4734
4735
	get_online_cpus();
4736
	rcu_read_lock();
4737
4738
	retval = -ESRCH;
4739
	p = find_process_by_pid(pid);
4740
	if (!p)
4741
		goto out_unlock;
4742
4743
	retval = security_task_getscheduler(p);
4744
	if (retval)
4745
		goto out_unlock;
4746
4747
	grq_lock_irqsave(&flags);
4748
	cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask);
4749
	grq_unlock_irqrestore(&flags);
4750
4751
out_unlock:
4752
	rcu_read_unlock();
4753
	put_online_cpus();
4754
4755
	return retval;
4756
}
4757
4758
/**
4759
 * sys_sched_getaffinity - get the cpu affinity of a process
4760
 * @pid: pid of the process
4761
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4762
 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4763
 *
4764
 * Return: 0 on success. An error code otherwise.
4765
 */
4766
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4767
		unsigned long __user *, user_mask_ptr)
4768
{
4769
	int ret;
4770
	cpumask_var_t mask;
4771
4772
	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4773
		return -EINVAL;
4774
	if (len & (sizeof(unsigned long)-1))
4775
		return -EINVAL;
4776
4777
	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4778
		return -ENOMEM;
4779
4780
	ret = sched_getaffinity(pid, mask);
4781
	if (ret == 0) {
4782
		size_t retlen = min_t(size_t, len, cpumask_size());
4783
4784
		if (copy_to_user(user_mask_ptr, mask, retlen))
4785
			ret = -EFAULT;
4786
		else
4787
			ret = retlen;
4788
	}
4789
	free_cpumask_var(mask);
4790
4791
	return ret;
4792
}
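/*
 * Worked example of the size checks above (illustrative, not part of the
 * patch): with nr_cpu_ids == 8 the mask needs only one byte, but @len must
 * also be a multiple of sizeof(unsigned long), so the smallest valid @len on
 * a 64-bit kernel is 8.  On success the syscall returns the number of bytes
 * copied, min(len, cpumask_size()), rather than 0.
 */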
4793
4794
/**
4795
 * sys_sched_yield - yield the current processor to other threads.
4796
 *
4797
 * This function yields the current CPU to other tasks. It does this by
4798
 * scheduling away the current task. If it still has the earliest deadline
4799
 * it will be scheduled again as the next task.
4800
 *
4801
 * Return: 0.
4802
 */
4803
SYSCALL_DEFINE0(sched_yield)
4804
{
4805
	struct task_struct *p;
4806
4807
	p = current;
4808
	grq_lock_irq();
4809
	schedstat_inc(task_rq(p), yld_count);
4810
	requeue_task(p);
4811
4812
	/*
4813
	 * Since we are going to call schedule() anyway, there's
4814
	 * no need to preempt or enable interrupts:
4815
	 */
4816
	__release(grq.lock);
4817
	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
4818
	do_raw_spin_unlock(&grq.lock);
4819
	sched_preempt_enable_no_resched();
4820
4821
	schedule();
4822
4823
	return 0;
4824
}
4825
4826
static inline bool should_resched(void)
4827
{
4828
	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4829
}
4830
4831
static void __cond_resched(void)
4832
{
4833
	add_preempt_count(PREEMPT_ACTIVE);
4834
	schedule();
4835
	sub_preempt_count(PREEMPT_ACTIVE);
4836
}
4837
4838
int __sched _cond_resched(void)
4839
{
4840
	if (should_resched()) {
4841
		__cond_resched();
4842
		return 1;
4843
	}
4844
	return 0;
4845
}
4846
EXPORT_SYMBOL(_cond_resched);
4847
4848
/*
4849
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4850
 * call schedule, and on return reacquire the lock.
4851
 *
4852
 * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
4853
 * operations here to prevent schedule() from being called twice (once via
4854
 * spin_unlock(), once by hand).
4855
 */
4856
int __cond_resched_lock(spinlock_t *lock)
4857
{
4858
	int resched = should_resched();
4859
	int ret = 0;
4860
4861
	lockdep_assert_held(lock);
4862
4863
	if (spin_needbreak(lock) || resched) {
4864
		spin_unlock(lock);
4865
		if (resched)
4866
			__cond_resched();
4867
		else
4868
			cpu_relax();
4869
		ret = 1;
4870
		spin_lock(lock);
4871
	}
4872
	return ret;
4873
}
4874
EXPORT_SYMBOL(__cond_resched_lock);
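/*
 * Usage sketch for the helper above (not part of this patch): inside a long
 * loop run under a spinlock, cond_resched_lock() - the wrapper around
 * __cond_resched_lock() - drops the lock, reschedules if needed and retakes
 * it.  burn_with_lock() and the iteration count are illustrative.
 */
static void burn_with_lock(spinlock_t *my_lock)
{
	int i;

	spin_lock(my_lock);
	for (i = 0; i < 100000; i++) {
		/* ... one unit of work done under my_lock ... */
		cond_resched_lock(my_lock);	/* may drop and retake my_lock */
	}
	spin_unlock(my_lock);
}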
4875
4876
int __sched __cond_resched_softirq(void)
4877
{
4878
	BUG_ON(!in_softirq());
4879
4880
	if (should_resched()) {
4881
		local_bh_enable();
4882
		__cond_resched();
4883
		local_bh_disable();
4884
		return 1;
4885
	}
4886
	return 0;
4887
}
4888
EXPORT_SYMBOL(__cond_resched_softirq);
4889
4890
/**
4891
 * yield - yield the current processor to other threads.
4892
 *
4893
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4894
 *
4895
 * The scheduler is at all times free to pick the calling task as the most
4896
 * eligible task to run, if removing the yield() call from your code breaks
4897
 * it, it's already broken.
4898
 *
4899
 * Typical broken usage is:
4900
 *
4901
 * while (!event)
4902
 * 	yield();
4903
 *
4904
 * where one assumes that yield() will let 'the other' process run that will
4905
 * make event true. If the current task is a SCHED_FIFO task that will never
4906
 * happen. Never use yield() as a progress guarantee!!
4907
 *
4908
 * If you want to use yield() to wait for something, use wait_event().
4909
 * If you want to use yield() to be 'nice' for others, use cond_resched().
4910
 * If you still want to use yield(), do not!
4911
 */
4912
void __sched yield(void)
4913
{
4914
	set_current_state(TASK_RUNNING);
4915
	sys_sched_yield();
4916
}
4917
EXPORT_SYMBOL(yield);
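/*
 * Hedged sketch, not part of this patch, of the wait_event() pattern the
 * comment above recommends instead of a yield() loop.  my_wq and my_event
 * are illustrative names.
 */
static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int my_event;

static void consumer(void)
{
	/* Sleeps until my_event becomes non-zero; no busy yield() loop. */
	wait_event(my_wq, my_event != 0);
}

static void producer(void)
{
	my_event = 1;
	wake_up(&my_wq);
}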
4918
4919
/**
4920
 * yield_to - yield the current processor to another thread in
4921
 * your thread group, or accelerate that thread toward the
4922
 * processor it's on.
4923
 * @p: target task
4924
 * @preempt: whether task preemption is allowed or not
4925
 *
4926
 * It's the caller's job to ensure that the target task struct
4927
 * can't go away on us before we can do any checks.
4928
 *
4929
 * Return:
4930
 *	true (>0) if we indeed boosted the target task.
4931
 *	false (0) if we failed to boost the target.
4932
 *	-ESRCH if there's no task to yield to.
4933
 */
4934
bool __sched yield_to(struct task_struct *p, bool preempt)
4935
{
4936
	unsigned long flags;
4937
	int yielded = 0;
4938
	struct rq *rq;
4939
4940
	rq = this_rq();
4941
	grq_lock_irqsave(&flags);
4942
	if (task_running(p) || p->state) {
4943
		yielded = -ESRCH;
4944
		goto out_unlock;
4945
	}
4946
	yielded = 1;
4947
	if (p->deadline > rq->rq_deadline)
4948
		p->deadline = rq->rq_deadline;
4949
	p->time_slice += rq->rq_time_slice;
4950
	rq->rq_time_slice = 0;
4951
	if (p->time_slice > timeslice())
4952
		p->time_slice = timeslice();
4953
	set_tsk_need_resched(rq->curr);
4954
out_unlock:
4955
	grq_unlock_irqrestore(&flags);
4956
4957
	if (yielded > 0)
4958
		schedule();
4959
	return yielded;
4960
}
4961
EXPORT_SYMBOL_GPL(yield_to);
4962
4963
/*
4964
 * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
4965
 * that process accounting knows that this is a task in IO wait state.
4966
 *
4967
 * But don't do that if it is a deliberate, throttling IO wait (this task
4968
 * has set its backing_dev_info: the queue against which it should throttle)
4969
 */
4970
void __sched io_schedule(void)
4971
{
4972
	struct rq *rq = raw_rq();
4973
4974
	delayacct_blkio_start();
4975
	atomic_inc(&rq->nr_iowait);
4976
	blk_flush_plug(current);
4977
	current->in_iowait = 1;
4978
	schedule();
4979
	current->in_iowait = 0;
4980
	atomic_dec(&rq->nr_iowait);
4981
	delayacct_blkio_end();
4982
}
4983
EXPORT_SYMBOL(io_schedule);
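/*
 * Illustrative only, not part of this patch: a driver-style wait that is
 * accounted as IO wait.  dma_wq and dma_finished are made-up names; the
 * prepare_to_wait()/io_schedule()/finish_wait() pattern sleeps in
 * TASK_UNINTERRUPTIBLE while counting towards rq->nr_iowait.
 */
static DECLARE_WAIT_QUEUE_HEAD(dma_wq);
static int dma_finished;

static void wait_for_dma(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&dma_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (dma_finished)
			break;
		io_schedule();
	}
	finish_wait(&dma_wq, &wait);
}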
4984
4985
long __sched io_schedule_timeout(long timeout)
4986
{
4987
	struct rq *rq = raw_rq();
4988
	long ret;
4989
4990
	delayacct_blkio_start();
4991
	atomic_inc(&rq->nr_iowait);
4992
	blk_flush_plug(current);
4993
	current->in_iowait = 1;
4994
	ret = schedule_timeout(timeout);
4995
	current->in_iowait = 0;
4996
	atomic_dec(&rq->nr_iowait);
4997
	delayacct_blkio_end();
4998
	return ret;
4999
}
5000
5001
/**
5002
 * sys_sched_get_priority_max - return maximum RT priority.
5003
 * @policy: scheduling class.
5004
 *
5005
 * Return: On success, this syscall returns the maximum
5006
 * rt_priority that can be used by a given scheduling class.
5007
 * On failure, a negative error code is returned.
5008
 */
5009
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5010
{
5011
	int ret = -EINVAL;
5012
5013
	switch (policy) {
5014
	case SCHED_FIFO:
5015
	case SCHED_RR:
5016
		ret = MAX_USER_RT_PRIO-1;
5017
		break;
5018
	case SCHED_NORMAL:
5019
	case SCHED_BATCH:
5020
	case SCHED_ISO:
5021
	case SCHED_IDLEPRIO:
5022
		ret = 0;
5023
		break;
5024
	}
5025
	return ret;
5026
}
5027
5028
/**
5029
 * sys_sched_get_priority_min - return minimum RT priority.
5030
 * @policy: scheduling class.
5031
 *
5032
 * Return: On success, this syscall returns the minimum
5033
 * rt_priority that can be used by a given scheduling class.
5034
 * On failure, a negative error code is returned.
5035
 */
5036
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5037
{
5038
	int ret = -EINVAL;
5039
5040
	switch (policy) {
5041
	case SCHED_FIFO:
5042
	case SCHED_RR:
5043
		ret = 1;
5044
		break;
5045
	case SCHED_NORMAL:
5046
	case SCHED_BATCH:
5047
	case SCHED_ISO:
5048
	case SCHED_IDLEPRIO:
5049
		ret = 0;
5050
		break;
5051
	}
5052
	return ret;
5053
}
5054
5055
/**
5056
 * sys_sched_rr_get_interval - return the default timeslice of a process.
5057
 * @pid: pid of the process.
5058
 * @interval: userspace pointer to the timeslice value.
5059
 *
5060
5061
 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
5062
 * an error code.
5063
 */
5064
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5065
		struct timespec __user *, interval)
5066
{
5067
	struct task_struct *p;
5068
	unsigned int time_slice;
5069
	unsigned long flags;
5070
	int retval;
5071
	struct timespec t;
5072
5073
	if (pid < 0)
5074
		return -EINVAL;
5075
5076
	retval = -ESRCH;
5077
	rcu_read_lock();
5078
	p = find_process_by_pid(pid);
5079
	if (!p)
5080
		goto out_unlock;
5081
5082
	retval = security_task_getscheduler(p);
5083
	if (retval)
5084
		goto out_unlock;
5085
5086
	grq_lock_irqsave(&flags);
5087
	time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
5088
	grq_unlock_irqrestore(&flags);
5089
5090
	rcu_read_unlock();
5091
	t = ns_to_timespec(time_slice);
5092
	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5093
	return retval;
5094
5095
out_unlock:
5096
	rcu_read_unlock();
5097
	return retval;
5098
}
5099
5100
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5101
5102
void sched_show_task(struct task_struct *p)
5103
{
5104
	unsigned long free = 0;
5105
	int ppid;
5106
	unsigned state;
5107
5108
	state = p->state ? __ffs(p->state) + 1 : 0;
5109
	printk(KERN_INFO "%-15.15s %c", p->comm,
5110
		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5111
#if BITS_PER_LONG == 32
5112
	if (state == TASK_RUNNING)
5113
		printk(KERN_CONT " running  ");
5114
	else
5115
		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5116
#else
5117
	if (state == TASK_RUNNING)
5118
		printk(KERN_CONT "  running task    ");
5119
	else
5120
		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5121
#endif
5122
#ifdef CONFIG_DEBUG_STACK_USAGE
5123
	free = stack_not_used(p);
5124
#endif
5125
	rcu_read_lock();
5126
	ppid = task_pid_nr(rcu_dereference(p->real_parent));
5127
	rcu_read_unlock();
5128
	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5129
		task_pid_nr(p), ppid,
5130
		(unsigned long)task_thread_info(p)->flags);
5131
5132
	print_worker_info(KERN_INFO, p);
5133
	show_stack(p, NULL);
5134
}
5135
5136
void show_state_filter(unsigned long state_filter)
5137
{
5138
	struct task_struct *g, *p;
5139
5140
#if BITS_PER_LONG == 32
5141
	printk(KERN_INFO
5142
		"  task                PC stack   pid father\n");
5143
#else
5144
	printk(KERN_INFO
5145
		"  task                        PC stack   pid father\n");
5146
#endif
5147
	rcu_read_lock();
5148
	do_each_thread(g, p) {
5149
		/*
5150
		 * reset the NMI-timeout, listing all tasks on a slow
5151
		 * console might take a lot of time:
5152
		 */
5153
		touch_nmi_watchdog();
5154
		if (!state_filter || (p->state & state_filter))
5155
			sched_show_task(p);
5156
	} while_each_thread(g, p);
5157
5158
	touch_all_softlockup_watchdogs();
5159
5160
	rcu_read_unlock();
5161
	/*
5162
	 * Only show locks if all tasks are dumped:
5163
	 */
5164
	if (!state_filter)
5165
		debug_show_all_locks();
5166
}
5167
5168
void dump_cpu_task(int cpu)
5169
{
5170
	pr_info("Task dump for CPU %d:\n", cpu);
5171
	sched_show_task(cpu_curr(cpu));
5172
}
5173
5174
#ifdef CONFIG_SMP
5175
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5176
{
5177
	cpumask_copy(tsk_cpus_allowed(p), new_mask);
5178
}
5179
#endif
5180
5181
/**
5182
 * init_idle - set up an idle thread for a given CPU
5183
 * @idle: task in question
5184
 * @cpu: cpu the idle task belongs to
5185
 *
5186
 * NOTE: this function does not set the idle thread's NEED_RESCHED
5187
 * flag, to make booting more robust.
5188
 */
5189
void init_idle(struct task_struct *idle, int cpu)
5190
{
5191
	struct rq *rq = cpu_rq(cpu);
5192
	unsigned long flags;
5193
5194
	time_grq_lock(rq, &flags);
5195
	idle->last_ran = rq->clock_task;
5196
	idle->state = TASK_RUNNING;
5197
	/* Setting prio to illegal value shouldn't matter when never queued */
5198
	idle->prio = PRIO_LIMIT;
5199
	set_rq_task(rq, idle);
5200
	do_set_cpus_allowed(idle, &cpumask_of_cpu(cpu));
5201
	/* Silence PROVE_RCU */
5202
	rcu_read_lock();
5203
	set_task_cpu(idle, cpu);
5204
	rcu_read_unlock();
5205
	rq->curr = rq->idle = idle;
5206
	idle->on_cpu = 1;
5207
	grq_unlock_irqrestore(&flags);
5208
5209
	/* Set the preempt count _outside_ the spinlocks! */
5210
	task_thread_info(idle)->preempt_count = 0;
5211
5212
	ftrace_graph_init_idle_task(idle, cpu);
5213
#if defined(CONFIG_SMP)
5214
	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5215
#endif
5216
}
5217
5218
#ifdef CONFIG_SMP
5219
#ifdef CONFIG_NO_HZ_COMMON
5220
void nohz_balance_enter_idle(int cpu)
5221
{
5222
}
5223
5224
void select_nohz_load_balancer(int stop_tick)
5225
{
5226
}
5227
5228
void set_cpu_sd_state_idle(void) {}
5229
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
5230
/**
5231
 * lowest_flag_domain - Return lowest sched_domain containing flag.
5232
 * @cpu:	The cpu whose lowest level of sched domain is to
5233
 *		be returned.
5234
 * @flag:	The flag to check for the lowest sched_domain
5235
 *		for the given cpu.
5236
 *
5237
 * Returns the lowest sched_domain of a cpu which contains the given flag.
5238
 */
5239
static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
5240
{
5241
	struct sched_domain *sd;
5242
5243
	for_each_domain(cpu, sd)
5244
		if (sd && (sd->flags & flag))
5245
			break;
5246
5247
	return sd;
5248
}
5249
5250
/**
5251
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
5252
 * @cpu:	The cpu whose domains we're iterating over.
5253
 * @sd:		variable holding the value of the power_savings_sd
5254
 *		for cpu.
5255
 * @flag:	The flag to filter the sched_domains to be iterated.
5256
 *
5257
 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
5258
 * set, starting from the lowest sched_domain to the highest.
5259
 */
5260
#define for_each_flag_domain(cpu, sd, flag) \
5261
	for (sd = lowest_flag_domain(cpu, flag); \
5262
		(sd && (sd->flags & flag)); sd = sd->parent)
5263
5264
#endif /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
5265
5266
static inline void resched_cpu(int cpu)
5267
{
5268
	unsigned long flags;
5269
5270
	grq_lock_irqsave(&flags);
5271
	resched_task(cpu_curr(cpu));
5272
	grq_unlock_irqrestore(&flags);
5273
}
5274
5275
/*
5276
 * In the semi idle case, use the nearest busy cpu for migrating timers
5277
 * from an idle cpu.  This is good for power-savings.
5278
 *
5279
 * We don't do a similar optimization for a completely idle system, as
5280
 * selecting an idle cpu will add more delays to the timers than intended
5281
 * (as that cpu's timer base may not be up to date w.r.t. jiffies etc).
5282
 */
5283
int get_nohz_timer_target(void)
5284
{
5285
	int cpu = smp_processor_id();
5286
	int i;
5287
	struct sched_domain *sd;
5288
5289
	rcu_read_lock();
5290
	for_each_domain(cpu, sd) {
5291
		for_each_cpu(i, sched_domain_span(sd)) {
5292
			if (!idle_cpu(i))
5293
				cpu = i;
5294
			goto unlock;
5295
		}
5296
	}
5297
unlock:
5298
	rcu_read_unlock();
5299
	return cpu;
5300
}
5301
5302
/*
5303
 * When add_timer_on() enqueues a timer into the timer wheel of an
5304
 * idle CPU then this timer might expire before the next timer event
5305
 * which is scheduled to wake up that CPU. In case of a completely
5306
 * idle system the next event might even be infinite time into the
5307
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
5308
 * leaves the inner idle loop so the newly added timer is taken into
5309
 * account when the CPU goes back to idle and evaluates the timer
5310
 * wheel for the next timer event.
5311
 */
5312
void wake_up_idle_cpu(int cpu)
5313
{
5314
	struct task_struct *idle;
5315
	struct rq *rq;
5316
5317
	if (cpu == smp_processor_id())
5318
		return;
5319
5320
	rq = cpu_rq(cpu);
5321
	idle = rq->idle;
5322
5323
	/*
5324
	 * This is safe, as this function is called with the timer
5325
	 * wheel base lock of (cpu) held. When the CPU is on the way
5326
	 * to idle and has not yet set rq->curr to idle then it will
5327
	 * be serialised on the timer wheel base lock and take the new
5328
	 * timer into account automatically.
5329
	 */
5330
	if (unlikely(rq->curr != idle))
5331
		return;
5332
5333
	/*
5334
	 * We can set TIF_RESCHED on the idle task of the other CPU
5335
	 * lockless. The worst case is that the other CPU runs the
5336
	 * idle task through an additional NOOP schedule()
5337
	 */
5338
	set_tsk_need_resched(idle);
5339
5340
	/* NEED_RESCHED must be visible before we test polling */
5341
	smp_mb();
5342
	if (!tsk_is_polling(idle))
5343
		smp_send_reschedule(cpu);
5344
}
5345
5346
void wake_up_nohz_cpu(int cpu)
5347
{
5348
	wake_up_idle_cpu(cpu);
5349
}
5350
#endif /* CONFIG_NO_HZ_COMMON */
5351
5352
/*
5353
 * Change a given task's CPU affinity. Migrate the thread to a
5354
 * proper CPU and schedule it away if the CPU it's executing on
5355
 * is removed from the allowed bitmask.
5356
 *
5357
 * NOTE: the caller must have a valid reference to the task, the
5358
 * task must not exit() & deallocate itself prematurely. The
5359
 * call is not atomic; no spinlocks may be held.
5360
 */
5361
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5362
{
5363
	bool running_wrong = false;
5364
	bool queued = false;
5365
	unsigned long flags;
5366
	struct rq *rq;
5367
	int ret = 0;
5368
5369
	rq = task_grq_lock(p, &flags);
5370
5371
	if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
5372
		goto out;
5373
5374
	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5375
		ret = -EINVAL;
5376
		goto out;
5377
	}
5378
5379
	queued = task_queued(p);
5380
5381
	do_set_cpus_allowed(p, new_mask);
5382
5383
	/* Can the task run on the task's current CPU? If so, we're done */
5384
	if (cpumask_test_cpu(task_cpu(p), new_mask))
5385
		goto out;
5386
5387
	if (task_running(p)) {
5388
		/* Task is running on the wrong cpu now, reschedule it. */
5389
		if (rq == this_rq()) {
5390
			set_tsk_need_resched(p);
5391
			running_wrong = true;
5392
		} else
5393
			resched_task(p);
5394
	} else
5395
		set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask));
5396
5397
out:
5398
	if (queued)
5399
		try_preempt(p, rq);
5400
	task_grq_unlock(&flags);
5401
5402
	if (running_wrong)
5403
		_cond_resched();
5404
5405
	return ret;
5406
}
5407
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
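/*
 * Usage sketch, not part of this patch: restrict an existing task to one CPU
 * through the exported helper above.  pin_task_to_cpu() is an illustrative
 * name; cpumask_of() names a single-CPU mask.
 */
static int pin_task_to_cpu(struct task_struct *p, int cpu)
{
	return set_cpus_allowed_ptr(p, cpumask_of(cpu));
}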
5408
5409
#ifdef CONFIG_HOTPLUG_CPU
5410
extern struct task_struct *cpu_stopper_task;
5411
/* Run through the task list and find tasks affined to just the dead cpu, then
5412
 * allocate a new affinity */
5413
static void break_sole_affinity(int src_cpu, struct task_struct *idle)
5414
{
5415
	struct task_struct *p, *t, *stopper;
5416
5417
	stopper = per_cpu(cpu_stopper_task, src_cpu);
5418
	do_each_thread(t, p) {
5419
		if (p != stopper && p != idle && !online_cpus(p)) {
5420
			cpumask_copy(tsk_cpus_allowed(p), cpu_possible_mask);
5421
			/*
5422
			 * Don't tell them about moving exiting tasks or
5423
			 * kernel threads (both mm NULL), since they never
5424
			 * leave the kernel.
5425
			 */
5426
			if (p->mm && printk_ratelimit()) {
5427
				printk(KERN_INFO "process %d (%s) no "
5428
				       "longer affine to cpu %d\n",
5429
				       task_pid_nr(p), p->comm, src_cpu);
5430
			}
5431
		}
5432
		clear_sticky(p);
5433
	} while_each_thread(t, p);
5434
}
5435
5436
/*
5437
 * Ensures that the idle task is using init_mm right before its cpu goes
5438
 * offline.
5439
 */
5440
void idle_task_exit(void)
5441
{
5442
	struct mm_struct *mm = current->active_mm;
5443
5444
	BUG_ON(cpu_online(smp_processor_id()));
5445
5446
	if (mm != &init_mm)
5447
		switch_mm(mm, &init_mm, current);
5448
	mmdrop(mm);
5449
}
5450
#endif /* CONFIG_HOTPLUG_CPU */
5451
void sched_set_stop_task(int cpu, struct task_struct *stop)
5452
{
5453
	struct sched_param stop_param = { .sched_priority = STOP_PRIO };
5454
	struct sched_param start_param = { .sched_priority = 0 };
5455
	struct task_struct *old_stop = cpu_rq(cpu)->stop;
5456
5457
	if (stop) {
5458
		/*
5459
		 * Make it appear like a SCHED_FIFO task; it's something
5460
		 * userspace knows about and won't get confused about.
5461
		 *
5462
		 * Also, it will make PI more or less work without too
5463
		 * much confusion -- but then, stop work should not
5464
		 * rely on PI working anyway.
5465
		 */
5466
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
5467
	}
5468
5469
	cpu_rq(cpu)->stop = stop;
5470
5471
	if (old_stop) {
5472
		/*
5473
		 * Reset it back to a normal scheduling policy so that
5474
		 * it can die in pieces.
5475
		 */
5476
		sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
5477
	}
5478
}
5479
5480
5481
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5482
5483
static struct ctl_table sd_ctl_dir[] = {
5484
	{
5485
		.procname	= "sched_domain",
5486
		.mode		= 0555,
5487
	},
5488
	{}
5489
};
5490
5491
static struct ctl_table sd_ctl_root[] = {
5492
	{
5493
		.procname	= "kernel",
5494
		.mode		= 0555,
5495
		.child		= sd_ctl_dir,
5496
	},
5497
	{}
5498
};
5499
5500
static struct ctl_table *sd_alloc_ctl_entry(int n)
5501
{
5502
	struct ctl_table *entry =
5503
		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5504
5505
	return entry;
5506
}
5507
5508
static void sd_free_ctl_entry(struct ctl_table **tablep)
5509
{
5510
	struct ctl_table *entry;
5511
5512
	/*
5513
	 * In the intermediate directories, both the child directory and
5514
	 * procname are dynamically allocated and could fail but the mode
5515
	 * will always be set. In the lowest directory the names are
5516
	 * static strings and all have proc handlers.
5517
	 */
5518
	for (entry = *tablep; entry->mode; entry++) {
5519
		if (entry->child)
5520
			sd_free_ctl_entry(&entry->child);
5521
		if (entry->proc_handler == NULL)
5522
			kfree(entry->procname);
5523
	}
5524
5525
	kfree(*tablep);
5526
	*tablep = NULL;
5527
}
5528
5529
static void
5530
set_table_entry(struct ctl_table *entry,
5531
		const char *procname, void *data, int maxlen,
5532
		mode_t mode, proc_handler *proc_handler)
5533
{
5534
	entry->procname = procname;
5535
	entry->data = data;
5536
	entry->maxlen = maxlen;
5537
	entry->mode = mode;
5538
	entry->proc_handler = proc_handler;
5539
}
5540
5541
static struct ctl_table *
5542
sd_alloc_ctl_domain_table(struct sched_domain *sd)
5543
{
5544
	struct ctl_table *table = sd_alloc_ctl_entry(13);
5545
5546
	if (table == NULL)
5547
		return NULL;
5548
5549
	set_table_entry(&table[0], "min_interval", &sd->min_interval,
5550
		sizeof(long), 0644, proc_doulongvec_minmax);
5551
	set_table_entry(&table[1], "max_interval", &sd->max_interval,
5552
		sizeof(long), 0644, proc_doulongvec_minmax);
5553
	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5554
		sizeof(int), 0644, proc_dointvec_minmax);
5555
	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5556
		sizeof(int), 0644, proc_dointvec_minmax);
5557
	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5558
		sizeof(int), 0644, proc_dointvec_minmax);
5559
	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5560
		sizeof(int), 0644, proc_dointvec_minmax);
5561
	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5562
		sizeof(int), 0644, proc_dointvec_minmax);
5563
	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5564
		sizeof(int), 0644, proc_dointvec_minmax);
5565
	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5566
		sizeof(int), 0644, proc_dointvec_minmax);
5567
	set_table_entry(&table[9], "cache_nice_tries",
5568
		&sd->cache_nice_tries,
5569
		sizeof(int), 0644, proc_dointvec_minmax);
5570
	set_table_entry(&table[10], "flags", &sd->flags,
5571
		sizeof(int), 0644, proc_dointvec_minmax);
5572
	set_table_entry(&table[11], "name", sd->name,
5573
		CORENAME_MAX_SIZE, 0444, proc_dostring);
5574
	/* &table[12] is terminator */
5575
5576
	return table;
5577
}
5578
5579
static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5580
{
5581
	struct ctl_table *entry, *table;
5582
	struct sched_domain *sd;
5583
	int domain_num = 0, i;
5584
	char buf[32];
5585
5586
	for_each_domain(cpu, sd)
5587
		domain_num++;
5588
	entry = table = sd_alloc_ctl_entry(domain_num + 1);
5589
	if (table == NULL)
5590
		return NULL;
5591
5592
	i = 0;
5593
	for_each_domain(cpu, sd) {
5594
		snprintf(buf, 32, "domain%d", i);
5595
		entry->procname = kstrdup(buf, GFP_KERNEL);
5596
		entry->mode = 0555;
5597
		entry->child = sd_alloc_ctl_domain_table(sd);
5598
		entry++;
5599
		i++;
5600
	}
5601
	return table;
5602
}
5603
5604
static struct ctl_table_header *sd_sysctl_header;
5605
static void register_sched_domain_sysctl(void)
5606
{
5607
	int i, cpu_num = num_possible_cpus();
5608
	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5609
	char buf[32];
5610
5611
	WARN_ON(sd_ctl_dir[0].child);
5612
	sd_ctl_dir[0].child = entry;
5613
5614
	if (entry == NULL)
5615
		return;
5616
5617
	for_each_possible_cpu(i) {
5618
		snprintf(buf, 32, "cpu%d", i);
5619
		entry->procname = kstrdup(buf, GFP_KERNEL);
5620
		entry->mode = 0555;
5621
		entry->child = sd_alloc_ctl_cpu_table(i);
5622
		entry++;
5623
	}
5624
5625
	WARN_ON(sd_sysctl_header);
5626
	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5627
}
5628
5629
/* may be called multiple times per register */
5630
static void unregister_sched_domain_sysctl(void)
5631
{
5632
	if (sd_sysctl_header)
5633
		unregister_sysctl_table(sd_sysctl_header);
5634
	sd_sysctl_header = NULL;
5635
	if (sd_ctl_dir[0].child)
5636
		sd_free_ctl_entry(&sd_ctl_dir[0].child);
5637
}
5638
#else
5639
static void register_sched_domain_sysctl(void)
5640
{
5641
}
5642
static void unregister_sched_domain_sysctl(void)
5643
{
5644
}
5645
#endif
5646
5647
static void set_rq_online(struct rq *rq)
5648
{
5649
	if (!rq->online) {
5650
		cpumask_set_cpu(cpu_of(rq), rq->rd->online);
5651
		rq->online = true;
5652
	}
5653
}
5654
5655
static void set_rq_offline(struct rq *rq)
5656
{
5657
	if (rq->online) {
5658
		cpumask_clear_cpu(cpu_of(rq), rq->rd->online);
5659
		rq->online = false;
5660
	}
5661
}
5662
5663
/*
5664
 * migration_call - callback that gets triggered when a CPU is added.
5665
 */
5666
static int
5667
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5668
{
5669
	int cpu = (long)hcpu;
5670
	unsigned long flags;
5671
	struct rq *rq = cpu_rq(cpu);
5672
#ifdef CONFIG_HOTPLUG_CPU
5673
	struct task_struct *idle = rq->idle;
5674
#endif
5675
5676
	switch (action & ~CPU_TASKS_FROZEN) {
5677
5678
	case CPU_UP_PREPARE:
5679
		break;
5680
5681
	case CPU_ONLINE:
5682
		/* Update our root-domain */
5683
		grq_lock_irqsave(&flags);
5684
		if (rq->rd) {
5685
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5686
5687
			set_rq_online(rq);
5688
		}
5689
		grq.noc = num_online_cpus();
5690
		grq_unlock_irqrestore(&flags);
5691
		break;
5692
5693
#ifdef CONFIG_HOTPLUG_CPU
5694
	case CPU_DEAD:
5695
		/* Idle task back to normal (off runqueue, low prio) */
5696
		grq_lock_irq();
5697
		return_task(idle, true);
5698
		idle->static_prio = MAX_PRIO;
5699
		__setscheduler(idle, rq, SCHED_NORMAL, 0);
5700
		idle->prio = PRIO_LIMIT;
5701
		set_rq_task(rq, idle);
5702
		update_clocks(rq);
5703
		grq_unlock_irq();
5704
		break;
5705
5706
	case CPU_DYING:
5707
		sched_ttwu_pending();
5708
		/* Update our root-domain */
5709
		grq_lock_irqsave(&flags);
5710
		if (rq->rd) {
5711
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5712
			set_rq_offline(rq);
5713
		}
5714
		break_sole_affinity(cpu, idle);
5715
		grq.noc = num_online_cpus();
5716
		grq_unlock_irqrestore(&flags);
5717
		break;
5718
#endif
5719
	}
5720
	return NOTIFY_OK;
5721
}
5722
5723
/*
5724
 * Register at high priority so that task migration (migrate_all_tasks)
5725
 * happens before everything else.  This has to be lower priority than
5726
 * the notifier in the perf_counter subsystem, though.
5727
 */
5728
static struct notifier_block  migration_notifier = {
5729
	.notifier_call = migration_call,
5730
	.priority = CPU_PRI_MIGRATION,
5731
};
5732
5733
static int sched_cpu_active(struct notifier_block *nfb,
5734
				      unsigned long action, void *hcpu)
5735
{
5736
	switch (action & ~CPU_TASKS_FROZEN) {
5737
	case CPU_STARTING:
5738
	case CPU_DOWN_FAILED:
5739
		set_cpu_active((long)hcpu, true);
5740
		return NOTIFY_OK;
5741
	default:
5742
		return NOTIFY_DONE;
5743
	}
5744
}
5745
5746
static int sched_cpu_inactive(struct notifier_block *nfb,
5747
					unsigned long action, void *hcpu)
5748
{
5749
	switch (action & ~CPU_TASKS_FROZEN) {
5750
	case CPU_DOWN_PREPARE:
5751
		set_cpu_active((long)hcpu, false);
5752
		return NOTIFY_OK;
5753
	default:
5754
		return NOTIFY_DONE;
5755
	}
5756
}
5757
5758
int __init migration_init(void)
5759
{
5760
	void *cpu = (void *)(long)smp_processor_id();
5761
	int err;
5762
5763
	/* Initialise migration for the boot CPU */
5764
	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5765
	BUG_ON(err == NOTIFY_BAD);
5766
	migration_call(&migration_notifier, CPU_ONLINE, cpu);
5767
	register_cpu_notifier(&migration_notifier);
5768
5769
	/* Register cpu active notifiers */
5770
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5771
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5772
5773
	return 0;
5774
}
5775
early_initcall(migration_init);
5776
#endif
5777
5778
#ifdef CONFIG_SMP
5779
5780
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5781
5782
#ifdef CONFIG_SCHED_DEBUG
5783
5784
static __read_mostly int sched_debug_enabled;
5785
5786
static int __init sched_debug_setup(char *str)
5787
{
5788
	sched_debug_enabled = 1;
5789
5790
	return 0;
5791
}
5792
early_param("sched_debug", sched_debug_setup);
5793
5794
static inline bool sched_debug(void)
5795
{
5796
	return sched_debug_enabled;
5797
}
5798
5799
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5800
				  struct cpumask *groupmask)
5801
{
5802
	char str[256];
5803
5804
	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5805
	cpumask_clear(groupmask);
5806
5807
	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5808
5809
	if (!(sd->flags & SD_LOAD_BALANCE)) {
5810
		printk("does not load-balance\n");
5811
		if (sd->parent)
5812
			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5813
					" has parent");
5814
		return -1;
5815
	}
5816
5817
	printk(KERN_CONT "span %s level %s\n", str, sd->name);
5818
5819
	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5820
		printk(KERN_ERR "ERROR: domain->span does not contain "
5821
				"CPU%d\n", cpu);
5822
	}
5823
5824
	printk(KERN_CONT "\n");
5825
5826
	if (!cpumask_equal(sched_domain_span(sd), groupmask))
5827
		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5828
5829
	if (sd->parent &&
5830
	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5831
		printk(KERN_ERR "ERROR: parent span is not a superset "
5832
			"of domain->span\n");
5833
	return 0;
5834
}
5835
5836
static void sched_domain_debug(struct sched_domain *sd, int cpu)
5837
{
5838
	int level = 0;
5839
5840
	if (!sched_debug_enabled)
5841
		return;
5842
5843
	if (!sd) {
5844
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5845
		return;
5846
	}
5847
5848
	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5849
5850
	for (;;) {
5851
		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5852
			break;
5853
		level++;
5854
		sd = sd->parent;
5855
		if (!sd)
5856
			break;
5857
	}
5858
}
5859
#else /* !CONFIG_SCHED_DEBUG */
5860
# define sched_domain_debug(sd, cpu) do { } while (0)
5861
static inline bool sched_debug(void)
5862
{
5863
	return false;
5864
}
5865
#endif /* CONFIG_SCHED_DEBUG */
5866
5867
static int sd_degenerate(struct sched_domain *sd)
5868
{
5869
	if (cpumask_weight(sched_domain_span(sd)) == 1)
5870
		return 1;
5871
5872
	/* Following flags don't use groups */
5873
	if (sd->flags & (SD_WAKE_AFFINE))
5874
		return 0;
5875
5876
	return 1;
5877
}
5878
5879
static int
5880
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5881
{
5882
	unsigned long cflags = sd->flags, pflags = parent->flags;
5883
5884
	if (sd_degenerate(parent))
5885
		return 1;
5886
5887
	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5888
		return 0;
5889
5890
	if (~cflags & pflags)
5891
		return 0;
5892
5893
	return 1;
5894
}
5895
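The ~cflags & pflags test above asks whether the parent sets any flag that the child does not; when it does not, and the spans match, the parent adds nothing and can be collapsed. A minimal stand-alone sketch of that subset test, using made-up flag values rather than the real SD_* constants:

#include <stdio.h>

#define F_LOAD_BALANCE		0x01	/* made-up values, illustration only */
#define F_BALANCE_NEWIDLE	0x02
#define F_WAKE_AFFINE		0x04
#define F_SERIALIZE		0x08

int main(void)
{
	unsigned long cflags = F_LOAD_BALANCE | F_BALANCE_NEWIDLE | F_WAKE_AFFINE;
	unsigned long pflags = F_LOAD_BALANCE | F_BALANCE_NEWIDLE;

	/* Every parent flag is already set on the child: parent is redundant. */
	printf("redundant parent: %s\n", (~cflags & pflags) ? "no" : "yes");

	/* Give the parent one flag the child lacks and it must be kept. */
	pflags |= F_SERIALIZE;
	printf("redundant parent: %s\n", (~cflags & pflags) ? "no" : "yes");
	return 0;
}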
5896
static void free_rootdomain(struct rcu_head *rcu)
5897
{
5898
	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5899
5900
	cpupri_cleanup(&rd->cpupri);
5901
	free_cpumask_var(rd->rto_mask);
5902
	free_cpumask_var(rd->online);
5903
	free_cpumask_var(rd->span);
5904
	kfree(rd);
5905
}
5906
5907
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5908
{
5909
	struct root_domain *old_rd = NULL;
5910
	unsigned long flags;
5911
5912
	grq_lock_irqsave(&flags);
5913
5914
	if (rq->rd) {
5915
		old_rd = rq->rd;
5916
5917
		if (cpumask_test_cpu(rq->cpu, old_rd->online))
5918
			set_rq_offline(rq);
5919
5920
		cpumask_clear_cpu(rq->cpu, old_rd->span);
5921
5922
		/*
5923
		 * If we don't want to free the old_rd yet then
5924
		 * set old_rd to NULL to skip the freeing later
5925
		 * in this function:
5926
		 */
5927
		if (!atomic_dec_and_test(&old_rd->refcount))
5928
			old_rd = NULL;
5929
	}
5930
5931
	atomic_inc(&rd->refcount);
5932
	rq->rd = rd;
5933
5934
	cpumask_set_cpu(rq->cpu, rd->span);
5935
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5936
		set_rq_online(rq);
5937
5938
	grq_unlock_irqrestore(&flags);
5939
5940
	if (old_rd)
5941
		call_rcu_sched(&old_rd->rcu, free_rootdomain);
5942
}
5943
5944
static int init_rootdomain(struct root_domain *rd)
5945
{
5946
	memset(rd, 0, sizeof(*rd));
5947
5948
	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5949
		goto out;
5950
	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5951
		goto free_span;
5952
	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5953
		goto free_online;
5954
5955
	if (cpupri_init(&rd->cpupri) != 0)
5956
		goto free_rto_mask;
5957
	return 0;
5958
5959
free_rto_mask:
5960
	free_cpumask_var(rd->rto_mask);
5961
free_online:
5962
	free_cpumask_var(rd->online);
5963
free_span:
5964
	free_cpumask_var(rd->span);
5965
out:
5966
	return -ENOMEM;
5967
}
5968
5969
static void init_defrootdomain(void)
5970
{
5971
	init_rootdomain(&def_root_domain);
5972
5973
	atomic_set(&def_root_domain.refcount, 1);
5974
}
5975
5976
static struct root_domain *alloc_rootdomain(void)
5977
{
5978
	struct root_domain *rd;
5979
5980
	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5981
	if (!rd)
5982
		return NULL;
5983
5984
	if (init_rootdomain(rd) != 0) {
5985
		kfree(rd);
5986
		return NULL;
5987
	}
5988
5989
	return rd;
5990
}
5991
5992
static void free_sched_domain(struct rcu_head *rcu)
5993
{
5994
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5995
5996
	kfree(sd);
5997
}
5998
5999
static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6000
{
6001
	call_rcu(&sd->rcu, free_sched_domain);
6002
}
6003
6004
static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6005
{
6006
	for (; sd; sd = sd->parent)
6007
		destroy_sched_domain(sd, cpu);
6008
}
6009
6010
/*
6011
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6012
 * hold the hotplug lock.
6013
 */
6014
static void
6015
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6016
{
6017
	struct rq *rq = cpu_rq(cpu);
6018
	struct sched_domain *tmp;
6019
6020
	/* Remove the sched domains which do not contribute to scheduling. */
6021
	for (tmp = sd; tmp; ) {
6022
		struct sched_domain *parent = tmp->parent;
6023
		if (!parent)
6024
			break;
6025
6026
		if (sd_parent_degenerate(tmp, parent)) {
6027
			tmp->parent = parent->parent;
6028
			if (parent->parent)
6029
				parent->parent->child = tmp;
6030
			destroy_sched_domain(parent, cpu);
6031
		} else
6032
			tmp = tmp->parent;
6033
	}
6034
6035
	if (sd && sd_degenerate(sd)) {
6036
		tmp = sd;
6037
		sd = sd->parent;
6038
		destroy_sched_domain(tmp, cpu);
6039
		if (sd)
6040
			sd->child = NULL;
6041
	}
6042
6043
	sched_domain_debug(sd, cpu);
6044
6045
	rq_attach_root(rq, rd);
6046
	tmp = rq->sd;
6047
	rcu_assign_pointer(rq->sd, sd);
6048
	destroy_sched_domains(tmp, cpu);
6049
}
6050
6051
/* cpus with isolated domains */
6052
static cpumask_var_t cpu_isolated_map;
6053
6054
/* Setup the mask of cpus configured for isolated domains */
6055
static int __init isolated_cpu_setup(char *str)
6056
{
6057
	alloc_bootmem_cpumask_var(&cpu_isolated_map);
6058
	cpulist_parse(str, cpu_isolated_map);
6059
	return 1;
6060
}
6061
6062
__setup("isolcpus=", isolated_cpu_setup);
6063
6064
static const struct cpumask *cpu_cpu_mask(int cpu)
6065
{
6066
	return cpumask_of_node(cpu_to_node(cpu));
6067
}
6068
6069
struct sd_data {
6070
	struct sched_domain **__percpu sd;
6071
};
6072
6073
struct s_data {
6074
	struct sched_domain ** __percpu sd;
6075
	struct root_domain	*rd;
6076
};
6077
6078
enum s_alloc {
6079
	sa_rootdomain,
6080
	sa_sd,
6081
	sa_sd_storage,
6082
	sa_none,
6083
};
6084
6085
struct sched_domain_topology_level;
6086
6087
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6088
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6089
6090
#define SDTL_OVERLAP	0x01
6091
6092
struct sched_domain_topology_level {
6093
	sched_domain_init_f init;
6094
	sched_domain_mask_f mask;
6095
	int		    flags;
6096
	int		    numa_level;
6097
	struct sd_data      data;
6098
};
6099
6100
/*
6101
 * Initializers for schedule domains
6102
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6103
 */
6104
6105
#ifdef CONFIG_SCHED_DEBUG
6106
# define SD_INIT_NAME(sd, type)		sd->name = #type
6107
#else
6108
# define SD_INIT_NAME(sd, type)		do { } while (0)
6109
#endif
6110
6111
#define SD_INIT_FUNC(type)						\
6112
static noinline struct sched_domain *					\
6113
sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
6114
{									\
6115
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
6116
	*sd = SD_##type##_INIT;						\
6117
	SD_INIT_NAME(sd, type);						\
6118
	sd->private = &tl->data;					\
6119
	return sd;							\
6120
}
6121
6122
SD_INIT_FUNC(CPU)
6123
#ifdef CONFIG_SCHED_SMT
6124
 SD_INIT_FUNC(SIBLING)
6125
#endif
6126
#ifdef CONFIG_SCHED_MC
6127
 SD_INIT_FUNC(MC)
6128
#endif
6129
#ifdef CONFIG_SCHED_BOOK
6130
 SD_INIT_FUNC(BOOK)
6131
#endif
6132
6133
static int default_relax_domain_level = -1;
6134
int sched_domain_level_max;
6135
6136
static int __init setup_relax_domain_level(char *str)
6137
{
6138
	if (kstrtoint(str, 0, &default_relax_domain_level))
6139
		pr_warn("Unable to set relax_domain_level\n");
6140
6141
	return 1;
6142
}
6143
__setup("relax_domain_level=", setup_relax_domain_level);
6144
6145
static void set_domain_attribute(struct sched_domain *sd,
6146
				 struct sched_domain_attr *attr)
6147
{
6148
	int request;
6149
6150
	if (!attr || attr->relax_domain_level < 0) {
6151
		if (default_relax_domain_level < 0)
6152
			return;
6153
		else
6154
			request = default_relax_domain_level;
6155
	} else
6156
		request = attr->relax_domain_level;
6157
	if (request < sd->level) {
6158
		/* turn off idle balance on this domain */
6159
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6160
	} else {
6161
		/* turn on idle balance on this domain */
6162
		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6163
	}
6164
}
6165
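In other words, a relax_domain_level request keeps wake and new-idle balancing on every domain whose level is at or below the request and strips it from anything higher. A small stand-alone sketch of that cutoff (the level numbering here is illustrative; the real numbers depend on which domains get built):

#include <stdio.h>

int main(void)
{
	const char *name[] = { "SIBLING", "MC", "CPU", "NUMA" };
	int request = 2;	/* e.g. booted with relax_domain_level=2 */
	int level;

	for (level = 0; level < 4; level++)
		printf("level %d (%-7s): wake/new-idle balance %s\n",
		       level, name[level], request < level ? "off" : "on");
	return 0;
}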
6166
static void __sdt_free(const struct cpumask *cpu_map);
6167
static int __sdt_alloc(const struct cpumask *cpu_map);
6168
6169
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6170
				 const struct cpumask *cpu_map)
6171
{
6172
	switch (what) {
6173
	case sa_rootdomain:
6174
		if (!atomic_read(&d->rd->refcount))
6175
			free_rootdomain(&d->rd->rcu); /* fall through */
6176
	case sa_sd:
6177
		free_percpu(d->sd); /* fall through */
6178
	case sa_sd_storage:
6179
		__sdt_free(cpu_map); /* fall through */
6180
	case sa_none:
6181
		break;
6182
	}
6183
}
6184
6185
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6186
						   const struct cpumask *cpu_map)
6187
{
6188
	memset(d, 0, sizeof(*d));
6189
6190
	if (__sdt_alloc(cpu_map))
6191
		return sa_sd_storage;
6192
	d->sd = alloc_percpu(struct sched_domain *);
6193
	if (!d->sd)
6194
		return sa_sd_storage;
6195
	d->rd = alloc_rootdomain();
6196
	if (!d->rd)
6197
		return sa_sd;
6198
	return sa_rootdomain;
6199
}
6200
6201
/*
6202
 * NULL the sd_data elements we've used to build the sched_domain
6203
 * structure so that the subsequent __free_domain_allocs()
6204
 * will not free the data we're using.
6205
 */
6206
static void claim_allocations(int cpu, struct sched_domain *sd)
6207
{
6208
	struct sd_data *sdd = sd->private;
6209
6210
	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6211
	*per_cpu_ptr(sdd->sd, cpu) = NULL;
6212
}
6213
6214
#ifdef CONFIG_SCHED_SMT
6215
static const struct cpumask *cpu_smt_mask(int cpu)
6216
{
6217
	return topology_thread_cpumask(cpu);
6218
}
6219
#endif
6220
6221
/*
6222
 * Topology list, bottom-up.
6223
 */
6224
static struct sched_domain_topology_level default_topology[] = {
6225
#ifdef CONFIG_SCHED_SMT
6226
	{ sd_init_SIBLING, cpu_smt_mask, },
6227
#endif
6228
#ifdef CONFIG_SCHED_MC
6229
	{ sd_init_MC, cpu_coregroup_mask, },
6230
#endif
6231
#ifdef CONFIG_SCHED_BOOK
6232
	{ sd_init_BOOK, cpu_book_mask, },
6233
#endif
6234
	{ sd_init_CPU, cpu_cpu_mask, },
6235
	{ NULL, },
6236
};
6237
6238
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6239
6240
#define for_each_sd_topology(tl)			\
6241
	for (tl = sched_domain_topology; tl->init; tl++)
6242
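build_sched_domains() walks this table in array order through for_each_sd_topology(), so the SMT level (when present) becomes the innermost domain and the package-wide CPU level, plus any NUMA levels appended by sched_init_numa(), the outermost. A stand-alone sketch of that bottom-up walk over a NULL-terminated table:

#include <stdio.h>
#include <stddef.h>

struct topo_level {
	const char *name;	/* stands in for the real init/mask callbacks */
};

static const struct topo_level topology[] = {
	{ "SIBLING" },		/* CONFIG_SCHED_SMT */
	{ "MC" },		/* CONFIG_SCHED_MC */
	{ "CPU" },
	{ NULL },		/* sentinel, like the { NULL, } entry above */
};

int main(void)
{
	const struct topo_level *tl;
	int depth = 0;

	/* mirrors for_each_sd_topology(): stop at the sentinel entry */
	for (tl = topology; tl->name; tl++)
		printf("level %d: %s\n", depth++, tl->name);
	return 0;
}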
6243
#ifdef CONFIG_NUMA
6244
6245
static int sched_domains_numa_levels;
6246
static int *sched_domains_numa_distance;
6247
static struct cpumask ***sched_domains_numa_masks;
6248
static int sched_domains_curr_level;
6249
6250
static inline int sd_local_flags(int level)
6251
{
6252
	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6253
		return 0;
6254
6255
	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6256
}
6257
6258
static struct sched_domain *
6259
sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6260
{
6261
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6262
	int level = tl->numa_level;
6263
	int sd_weight = cpumask_weight(
6264
			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6265
6266
	*sd = (struct sched_domain){
6267
		.min_interval		= sd_weight,
6268
		.max_interval		= 2*sd_weight,
6269
		.busy_factor		= 32,
6270
		.imbalance_pct		= 125,
6271
		.cache_nice_tries	= 2,
6272
		.busy_idx		= 3,
6273
		.idle_idx		= 2,
6274
		.newidle_idx		= 0,
6275
		.wake_idx		= 0,
6276
		.forkexec_idx		= 0,
6277
6278
		.flags			= 1*SD_LOAD_BALANCE
6279
					| 1*SD_BALANCE_NEWIDLE
6280
					| 0*SD_BALANCE_EXEC
6281
					| 0*SD_BALANCE_FORK
6282
					| 0*SD_BALANCE_WAKE
6283
					| 0*SD_WAKE_AFFINE
6284
					| 0*SD_SHARE_CPUPOWER
6285
					| 0*SD_SHARE_PKG_RESOURCES
6286
					| 1*SD_SERIALIZE
6287
					| 0*SD_PREFER_SIBLING
6288
					| sd_local_flags(level)
6289
					,
6290
		.last_balance		= jiffies,
6291
		.balance_interval	= sd_weight,
6292
	};
6293
	SD_INIT_NAME(sd, NUMA);
6294
	sd->private = &tl->data;
6295
6296
	/*
6297
	 * Ugly hack to pass state to sd_numa_mask()...
6298
	 */
6299
	sched_domains_curr_level = tl->numa_level;
6300
6301
	return sd;
6302
}
6303
6304
static const struct cpumask *sd_numa_mask(int cpu)
6305
{
6306
	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6307
}
6308
6309
static void sched_numa_warn(const char *str)
6310
{
6311
	static int done = false;
6312
	int i,j;
6313
6314
	if (done)
6315
		return;
6316
6317
	done = true;
6318
6319
	printk(KERN_WARNING "ERROR: %s\n\n", str);
6320
6321
	for (i = 0; i < nr_node_ids; i++) {
6322
		printk(KERN_WARNING "  ");
6323
		for (j = 0; j < nr_node_ids; j++)
6324
			printk(KERN_CONT "%02d ", node_distance(i,j));
6325
		printk(KERN_CONT "\n");
6326
	}
6327
	printk(KERN_WARNING "\n");
6328
}
6329
6330
static bool find_numa_distance(int distance)
6331
{
6332
	int i;
6333
6334
	if (distance == node_distance(0, 0))
6335
		return true;
6336
6337
	for (i = 0; i < sched_domains_numa_levels; i++) {
6338
		if (sched_domains_numa_distance[i] == distance)
6339
			return true;
6340
	}
6341
6342
	return false;
6343
}
6344
6345
static void sched_init_numa(void)
6346
{
6347
	int next_distance, curr_distance = node_distance(0, 0);
6348
	struct sched_domain_topology_level *tl;
6349
	int level = 0;
6350
	int i, j, k;
6351
6352
	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6353
	if (!sched_domains_numa_distance)
6354
		return;
6355
6356
	/*
6357
	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6358
	 * unique distances in the node_distance() table.
6359
	 *
6360
	 * Assumes node_distance(0,j) includes all distances in
6361
	 * node_distance(i,j) in order to avoid cubic time.
6362
	 */
6363
	next_distance = curr_distance;
6364
	for (i = 0; i < nr_node_ids; i++) {
6365
		for (j = 0; j < nr_node_ids; j++) {
6366
			for (k = 0; k < nr_node_ids; k++) {
6367
				int distance = node_distance(i, k);
6368
6369
				if (distance > curr_distance &&
6370
				    (distance < next_distance ||
6371
				     next_distance == curr_distance))
6372
					next_distance = distance;
6373
6374
				/*
6375
				 * While not a strong assumption, it would be nice to know
6376
				 * about cases where node A is connected to B but B is not
6377
				 * equally connected to A.
6378
				 */
6379
				if (sched_debug() && node_distance(k, i) != distance)
6380
					sched_numa_warn("Node-distance not symmetric");
6381
6382
				if (sched_debug() && i && !find_numa_distance(distance))
6383
					sched_numa_warn("Node-0 not representative");
6384
			}
6385
			if (next_distance != curr_distance) {
6386
				sched_domains_numa_distance[level++] = next_distance;
6387
				sched_domains_numa_levels = level;
6388
				curr_distance = next_distance;
6389
			} else break;
6390
		}
6391
6392
		/*
6393
		 * In case of sched_debug() we verify the above assumption.
6394
		 */
6395
		if (!sched_debug())
6396
			break;
6397
	}
6398
	/*
6399
	 * 'level' contains the number of unique distances, excluding the
6400
	 * identity distance node_distance(i,i).
6401
	 *
6402
	 * The sched_domains_numa_distance[] array includes the actual distance
6403
	 * numbers.
6404
	 */
6405
6406
	/*
6407
	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
6408
	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
6409
	 * the array will contain less than 'level' members. This could be
6410
	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
6411
	 * in other functions.
6412
	 *
6413
	 * We reset it to 'level' at the end of this function.
6414
	 */
6415
	sched_domains_numa_levels = 0;
6416
6417
	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6418
	if (!sched_domains_numa_masks)
6419
		return;
6420
6421
	/*
6422
	 * Now for each level, construct a mask per node which contains all
6423
	 * cpus of nodes that are that many hops away from us.
6424
	 */
6425
	for (i = 0; i < level; i++) {
6426
		sched_domains_numa_masks[i] =
6427
			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6428
		if (!sched_domains_numa_masks[i])
6429
			return;
6430
6431
		for (j = 0; j < nr_node_ids; j++) {
6432
			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6433
			if (!mask)
6434
				return;
6435
6436
			sched_domains_numa_masks[i][j] = mask;
6437
6438
			for (k = 0; k < nr_node_ids; k++) {
6439
				if (node_distance(j, k) > sched_domains_numa_distance[i])
6440
					continue;
6441
6442
				cpumask_or(mask, mask, cpumask_of_node(k));
6443
			}
6444
		}
6445
	}
6446
6447
	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6448
			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6449
	if (!tl)
6450
		return;
6451
6452
	/*
6453
	 * Copy the default topology bits..
6454
	 */
6455
	for (i = 0; default_topology[i].init; i++)
6456
		tl[i] = default_topology[i];
6457
6458
	/*
6459
	 * .. and append 'j' levels of NUMA goodness.
6460
	 */
6461
	for (j = 0; j < level; i++, j++) {
6462
		tl[i] = (struct sched_domain_topology_level){
6463
			.init = sd_numa_init,
6464
			.mask = sd_numa_mask,
6465
			.flags = SDTL_OVERLAP,
6466
			.numa_level = j,
6467
		};
6468
	}
6469
6470
	sched_domain_topology = tl;
6471
6472
	sched_domains_numa_levels = level;
6473
}
6474
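For a concrete picture of what the loop above computes, here is a stand-alone model of the distance deduplication, fed with a made-up three-node distance table (10 local, 20 one hop, 30 two hops); it finds two levels, 20 and 30, which matches what sched_domains_numa_distance[] ends up holding for such a table:

#include <stdio.h>

#define NR_NODES 3

static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30 },
	{ 20, 10, 20 },
	{ 30, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES];
	int level = 0;
	int curr = dist[0][0];
	int i, j;

	for (;;) {
		int next = curr;

		for (i = 0; i < NR_NODES; i++)
			for (j = 0; j < NR_NODES; j++)
				if (dist[i][j] > curr &&
				    (dist[i][j] < next || next == curr))
					next = dist[i][j];
		if (next == curr)
			break;		/* no distance larger than curr left */
		levels[level++] = next;
		curr = next;
	}

	printf("%d NUMA levels beyond local:", level);
	for (i = 0; i < level; i++)
		printf(" %d", levels[i]);
	printf("\n");			/* prints: 2 NUMA levels beyond local: 20 30 */
	return 0;
}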
6475
static void sched_domains_numa_masks_set(int cpu)
6476
{
6477
	int i, j;
6478
	int node = cpu_to_node(cpu);
6479
6480
	for (i = 0; i < sched_domains_numa_levels; i++) {
6481
		for (j = 0; j < nr_node_ids; j++) {
6482
			if (node_distance(j, node) <= sched_domains_numa_distance[i])
6483
				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6484
		}
6485
	}
6486
}
6487
6488
static void sched_domains_numa_masks_clear(int cpu)
6489
{
6490
	int i, j;
6491
	for (i = 0; i < sched_domains_numa_levels; i++) {
6492
		for (j = 0; j < nr_node_ids; j++)
6493
			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6494
	}
6495
}
6496
6497
/*
6498
 * Update sched_domains_numa_masks[level][node] array when new cpus
6499
 * are onlined.
6500
 */
6501
static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6502
					   unsigned long action,
6503
					   void *hcpu)
6504
{
6505
	int cpu = (long)hcpu;
6506
6507
	switch (action & ~CPU_TASKS_FROZEN) {
6508
	case CPU_ONLINE:
6509
		sched_domains_numa_masks_set(cpu);
6510
		break;
6511
6512
	case CPU_DEAD:
6513
		sched_domains_numa_masks_clear(cpu);
6514
		break;
6515
6516
	default:
6517
		return NOTIFY_DONE;
6518
	}
6519
6520
	return NOTIFY_OK;
6521
}
6522
#else
6523
static inline void sched_init_numa(void)
6524
{
6525
}
6526
6527
static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6528
					   unsigned long action,
6529
					   void *hcpu)
6530
{
6531
	return 0;
6532
}
6533
#endif /* CONFIG_NUMA */
6534
6535
static int __sdt_alloc(const struct cpumask *cpu_map)
6536
{
6537
	struct sched_domain_topology_level *tl;
6538
	int j;
6539
6540
	for_each_sd_topology(tl) {
6541
		struct sd_data *sdd = &tl->data;
6542
6543
		sdd->sd = alloc_percpu(struct sched_domain *);
6544
		if (!sdd->sd)
6545
			return -ENOMEM;
6546
6547
		for_each_cpu(j, cpu_map) {
6548
			struct sched_domain *sd;
6549
6550
		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6551
					GFP_KERNEL, cpu_to_node(j));
6552
			if (!sd)
6553
				return -ENOMEM;
6554
6555
			*per_cpu_ptr(sdd->sd, j) = sd;
6556
		}
6557
	}
6558
6559
	return 0;
6560
}
6561
6562
static void __sdt_free(const struct cpumask *cpu_map)
6563
{
6564
	struct sched_domain_topology_level *tl;
6565
	int j;
6566
6567
	for_each_sd_topology(tl) {
6568
		struct sd_data *sdd = &tl->data;
6569
6570
		for_each_cpu(j, cpu_map) {
6571
			struct sched_domain *sd;
6572
6573
			if (sdd->sd) {
6574
				sd = *per_cpu_ptr(sdd->sd, j);
6575
				kfree(*per_cpu_ptr(sdd->sd, j));
6576
			}
6577
		}
6578
		free_percpu(sdd->sd);
6579
		sdd->sd = NULL;
6580
	}
6581
}
6582
6583
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6584
		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6585
		struct sched_domain *child, int cpu)
6586
{
6587
	struct sched_domain *sd = tl->init(tl, cpu);
6588
	if (!sd)
6589
		return child;
6590
6591
	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6592
	if (child) {
6593
		sd->level = child->level + 1;
6594
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
6595
		child->parent = sd;
6596
		sd->child = child;
6597
	}
6598
	set_domain_attribute(sd, attr);
6599
6600
	return sd;
6601
}
6602
6603
/*
6604
 * Build sched domains for a given set of cpus and attach the sched domains
6605
 * to the individual cpus
6606
 */
6607
static int build_sched_domains(const struct cpumask *cpu_map,
6608
			       struct sched_domain_attr *attr)
6609
{
6610
	enum s_alloc alloc_state;
6611
	struct sched_domain *sd;
6612
	struct s_data d;
6613
	int i, ret = -ENOMEM;
6614
6615
	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6616
	if (alloc_state != sa_rootdomain)
6617
		goto error;
6618
6619
	/* Set up domains for cpus specified by the cpu_map. */
6620
	for_each_cpu(i, cpu_map) {
6621
		struct sched_domain_topology_level *tl;
6622
6623
		sd = NULL;
6624
		for_each_sd_topology(tl) {
6625
			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6626
			if (tl == sched_domain_topology)
6627
				*per_cpu_ptr(d.sd, i) = sd;
6628
			if (tl->flags & SDTL_OVERLAP)
6629
				sd->flags |= SD_OVERLAP;
6630
			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6631
				break;
6632
		}
6633
	}
6634
6635
	/* Calculate CPU power for physical packages and nodes */
6636
	for (i = nr_cpumask_bits-1; i >= 0; i--) {
6637
		if (!cpumask_test_cpu(i, cpu_map))
6638
			continue;
6639
6640
		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6641
			claim_allocations(i, sd);
6642
		}
6643
	}
6644
6645
	/* Attach the domains */
6646
	rcu_read_lock();
6647
	for_each_cpu(i, cpu_map) {
6648
		sd = *per_cpu_ptr(d.sd, i);
6649
		cpu_attach_domain(sd, d.rd, i);
6650
	}
6651
	rcu_read_unlock();
6652
6653
	ret = 0;
6654
error:
6655
	__free_domain_allocs(&d, alloc_state, cpu_map);
6656
	return ret;
6657
}
6658
6659
static cpumask_var_t *doms_cur;	/* current sched domains */
6660
static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
6661
static struct sched_domain_attr *dattr_cur;
6662
				/* attributes of custom domains in 'doms_cur' */
6663
6664
/*
6665
 * Special case: If a kmalloc of a doms_cur partition (array of
6666
 * cpumask) fails, then fallback to a single sched domain,
6667
 * as determined by the single cpumask fallback_doms.
6668
 */
6669
static cpumask_var_t fallback_doms;
6670
6671
/*
6672
 * arch_update_cpu_topology lets virtualized architectures update the
6673
 * cpu core maps. It is supposed to return 1 if the topology changed
6674
 * or 0 if it stayed the same.
6675
 */
6676
int __attribute__((weak)) arch_update_cpu_topology(void)
6677
{
6678
	return 0;
6679
}
6680
6681
cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6682
{
6683
	int i;
6684
	cpumask_var_t *doms;
6685
6686
	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6687
	if (!doms)
6688
		return NULL;
6689
	for (i = 0; i < ndoms; i++) {
6690
		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6691
			free_sched_domains(doms, i);
6692
			return NULL;
6693
		}
6694
	}
6695
	return doms;
6696
}
6697
6698
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6699
{
6700
	unsigned int i;
6701
	for (i = 0; i < ndoms; i++)
6702
		free_cpumask_var(doms[i]);
6703
	kfree(doms);
6704
}
6705
6706
/*
6707
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6708
 * For now this just excludes isolated cpus, but could be used to
6709
 * exclude other special cases in the future.
6710
 */
6711
static int init_sched_domains(const struct cpumask *cpu_map)
6712
{
6713
	int err;
6714
6715
	arch_update_cpu_topology();
6716
	ndoms_cur = 1;
6717
	doms_cur = alloc_sched_domains(ndoms_cur);
6718
	if (!doms_cur)
6719
		doms_cur = &fallback_doms;
6720
	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6721
	err = build_sched_domains(doms_cur[0], NULL);
6722
	register_sched_domain_sysctl();
6723
6724
	return err;
6725
}
6726
6727
/*
6728
 * Detach sched domains from a group of cpus specified in cpu_map
6729
 * These cpus will now be attached to the NULL domain
6730
 */
6731
static void detach_destroy_domains(const struct cpumask *cpu_map)
6732
{
6733
	int i;
6734
6735
	rcu_read_lock();
6736
	for_each_cpu(i, cpu_map)
6737
		cpu_attach_domain(NULL, &def_root_domain, i);
6738
	rcu_read_unlock();
6739
}
6740
6741
/* handle null as "default" */
6742
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6743
			struct sched_domain_attr *new, int idx_new)
6744
{
6745
	struct sched_domain_attr tmp;
6746
6747
	/* fast path */
6748
	if (!new && !cur)
6749
		return 1;
6750
6751
	tmp = SD_ATTR_INIT;
6752
	return !memcmp(cur ? (cur + idx_cur) : &tmp,
6753
			new ? (new + idx_new) : &tmp,
6754
			sizeof(struct sched_domain_attr));
6755
}
6756
6757
/*
6758
 * Partition sched domains as specified by the 'ndoms_new'
6759
 * cpumasks in the array doms_new[] of cpumasks. This compares
6760
 * doms_new[] to the current sched domain partitioning, doms_cur[].
6761
 * It destroys each deleted domain and builds each new domain.
6762
 *
6763
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6764
 * The masks don't intersect (don't overlap). We should set up one
6765
 * sched domain for each mask. CPUs not in any of the cpumasks will
6766
 * not be load balanced. If the same cpumask appears both in the
6767
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6768
 * it as it is.
6769
 *
6770
 * The passed in 'doms_new' should be allocated using
6771
 * alloc_sched_domains.  This routine takes ownership of it and will
6772
 * free_sched_domains it when done with it. If the caller failed the
6773
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6774
 * and partition_sched_domains() will fall back to the single partition
6775
 * 'fallback_doms', and it also forces the domains to be rebuilt.
6776
 *
6777
 * If doms_new == NULL it will be replaced with cpu_online_mask.
6778
 * ndoms_new == 0 is a special case for destroying existing domains,
6779
 * and it will not create the default domain.
6780
 *
6781
 * Call with hotplug lock held
6782
 */
6783
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6784
			     struct sched_domain_attr *dattr_new)
6785
{
6786
	int i, j, n;
6787
	int new_topology;
6788
6789
	mutex_lock(&sched_domains_mutex);
6790
6791
	/* always unregister in case we don't destroy any domains */
6792
	unregister_sched_domain_sysctl();
6793
6794
	/* Let architecture update cpu core mappings. */
6795
	new_topology = arch_update_cpu_topology();
6796
6797
	n = doms_new ? ndoms_new : 0;
6798
6799
	/* Destroy deleted domains */
6800
	for (i = 0; i < ndoms_cur; i++) {
6801
		for (j = 0; j < n && !new_topology; j++) {
6802
			if (cpumask_equal(doms_cur[i], doms_new[j])
6803
			    && dattrs_equal(dattr_cur, i, dattr_new, j))
6804
				goto match1;
6805
		}
6806
		/* no match - a current sched domain not in new doms_new[] */
6807
		detach_destroy_domains(doms_cur[i]);
6808
match1:
6809
		;
6810
	}
6811
6812
	if (doms_new == NULL) {
6813
		ndoms_cur = 0;
6814
		doms_new = &fallback_doms;
6815
		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6816
		WARN_ON_ONCE(dattr_new);
6817
	}
6818
6819
	/* Build new domains */
6820
	for (i = 0; i < ndoms_new; i++) {
6821
		for (j = 0; j < ndoms_cur && !new_topology; j++) {
6822
			if (cpumask_equal(doms_new[i], doms_cur[j])
6823
			    && dattrs_equal(dattr_new, i, dattr_cur, j))
6824
				goto match2;
6825
		}
6826
		/* no match - add a new doms_new */
6827
		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6828
match2:
6829
		;
6830
	}
6831
6832
	/* Remember the new sched domains */
6833
	if (doms_cur != &fallback_doms)
6834
		free_sched_domains(doms_cur, ndoms_cur);
6835
	kfree(dattr_cur);	/* kfree(NULL) is safe */
6836
	doms_cur = doms_new;
6837
	dattr_cur = dattr_new;
6838
	ndoms_cur = ndoms_new;
6839
6840
	register_sched_domain_sysctl();
6841
6842
	mutex_unlock(&sched_domains_mutex);
6843
}
6844
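Because partition_sched_domains() takes ownership of doms_new, a caller allocates the array with alloc_sched_domains(), fills in the masks, and never frees it itself. A hedged sketch of that calling pattern (the helper name and the two input masks are invented for illustration; the in-tree caller is the cpuset code, and the hotplug lock is assumed to be held):

static int example_repartition(const struct cpumask *left,
			       const struct cpumask *right)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms)
		return -ENOMEM;
	cpumask_copy(doms[0], left);
	cpumask_copy(doms[1], right);
	/* Ownership of 'doms' passes to the scheduler here. */
	partition_sched_domains(2, doms, NULL);
	return 0;
}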
6845
/*
6846
 * Update cpusets according to cpu_active mask.  If cpusets are
6847
 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6848
 * around partition_sched_domains().
6849
 */
6850
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6851
			     void *hcpu)
6852
{
6853
	switch (action & ~CPU_TASKS_FROZEN) {
6854
	case CPU_ONLINE:
6855
	case CPU_DOWN_FAILED:
6856
		cpuset_update_active_cpus(true);
6857
		return NOTIFY_OK;
6858
	default:
6859
		return NOTIFY_DONE;
6860
	}
6861
}
6862
6863
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6864
			       void *hcpu)
6865
{
6866
	switch (action & ~CPU_TASKS_FROZEN) {
6867
	case CPU_DOWN_PREPARE:
6868
		cpuset_update_active_cpus(false);
6869
		return NOTIFY_OK;
6870
	default:
6871
		return NOTIFY_DONE;
6872
	}
6873
}
6874
6875
#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
6876
/*
6877
 * Cheaper version of the below functions in case support for SMT and MC is
6878
 * compiled in but CPUs have no siblings.
6879
 */
6880
static bool sole_cpu_idle(int cpu)
6881
{
6882
	return rq_idle(cpu_rq(cpu));
6883
}
6884
#endif
6885
#ifdef CONFIG_SCHED_SMT
6886
/* All this CPU's SMT siblings are idle */
6887
static bool siblings_cpu_idle(int cpu)
6888
{
6889
	return cpumask_subset(&(cpu_rq(cpu)->smt_siblings),
6890
			      &grq.cpu_idle_map);
6891
}
6892
#endif
6893
#ifdef CONFIG_SCHED_MC
6894
/* All this CPU's shared cache siblings are idle */
6895
static bool cache_cpu_idle(int cpu)
6896
{
6897
	return cpumask_subset(&(cpu_rq(cpu)->cache_siblings),
6898
			      &grq.cpu_idle_map);
6899
}
6900
#endif
6901
6902
enum sched_domain_level {
6903
	SD_LV_NONE = 0,
6904
	SD_LV_SIBLING,
6905
	SD_LV_MC,
6906
	SD_LV_BOOK,
6907
	SD_LV_CPU,
6908
	SD_LV_NODE,
6909
	SD_LV_ALLNODES,
6910
	SD_LV_MAX
6911
};
6912
6913
void __init sched_init_smp(void)
6914
{
6915
	struct sched_domain *sd;
6916
	int cpu;
6917
6918
	cpumask_var_t non_isolated_cpus;
6919
6920
	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6921
	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6922
6923
	sched_init_numa();
6924
6925
	get_online_cpus();
6926
	mutex_lock(&sched_domains_mutex);
6927
	init_sched_domains(cpu_active_mask);
6928
	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6929
	if (cpumask_empty(non_isolated_cpus))
6930
		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6931
	mutex_unlock(&sched_domains_mutex);
6932
	put_online_cpus();
6933
6934
	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6935
	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6936
	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6937
6938
	/* Move init over to a non-isolated CPU */
6939
	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6940
		BUG();
6941
	free_cpumask_var(non_isolated_cpus);
6942
6943
	grq_lock_irq();
6944
	/*
6945
	 * Set up the relative cache distance of each online cpu from each
6946
	 * other in a simple array for quick lookup. Locality is determined
6947
	 * by the closest sched_domain that CPUs are separated by. CPUs with
6948
	 * shared cache in SMT and MC are treated as local. Separate CPUs
6949
	 * (within the same package or physically) within the same node are
6950
	 * treated as not local. CPUs not even in the same domain (different
6951
	 * nodes) are treated as very distant.
6952
	 */
6953
	for_each_online_cpu(cpu) {
6954
		struct rq *rq = cpu_rq(cpu);
6955
6956
		mutex_lock(&sched_domains_mutex);
6957
		for_each_domain(cpu, sd) {
6958
			int locality, other_cpu;
6959
6960
#ifdef CONFIG_SCHED_SMT
6961
			if (sd->level == SD_LV_SIBLING) {
6962
				for_each_cpu_mask(other_cpu, *sched_domain_span(sd))
6963
					cpumask_set_cpu(other_cpu, &rq->smt_siblings);
6964
			}
6965
#endif
6966
#ifdef CONFIG_SCHED_MC
6967
			if (sd->level == SD_LV_MC) {
6968
				for_each_cpu_mask(other_cpu, *sched_domain_span(sd))
6969
					cpumask_set_cpu(other_cpu, &rq->cache_siblings);
6970
			}
6971
#endif
6972
			if (sd->level <= SD_LV_SIBLING)
6973
				locality = 1;
6974
			else if (sd->level <= SD_LV_MC)
6975
				locality = 2;
6976
			else if (sd->level <= SD_LV_NODE)
6977
				locality = 3;
6978
			else
6979
				continue;
6980
6981
			for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) {
6982
				if (locality < rq->cpu_locality[other_cpu])
6983
					rq->cpu_locality[other_cpu] = locality;
6984
			}
6985
		}
6986
		mutex_unlock(&sched_domains_mutex);
6987
6988
		/*
6989
		 * Each runqueue has its own function in case it doesn't have
6990
		 * siblings of its own, allowing mixed topologies.
6991
		 */
6992
#ifdef CONFIG_SCHED_SMT
6993
		if (cpus_weight(rq->smt_siblings) > 1)
6994
			rq->siblings_idle = siblings_cpu_idle;
6995
#endif
6996
#ifdef CONFIG_SCHED_MC
6997
		if (cpus_weight(rq->cache_siblings) > 1)
6998
			rq->cache_idle = cache_cpu_idle;
6999
#endif
7000
	}
7001
	grq_unlock_irq();
7002
}
7003
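As a concrete example of what the loop above ends up building, here is the cpu_locality matrix for a hypothetical single-package box with two cores of two SMT threads each (cpu0/cpu1 share a core, cpu2/cpu3 the other): 0 is self, 1 an SMT sibling, 2 a core sharing cache, 3 a same-node CPU without shared cache, and 4 the "very distant" default left over from sched_init().

#include <stdio.h>

static const int cpu_locality[4][4] = {
	{ 0, 1, 2, 2 },
	{ 1, 0, 2, 2 },
	{ 2, 2, 0, 1 },
	{ 2, 2, 1, 0 },
};

int main(void)
{
	int i, j;

	for (i = 0; i < 4; i++) {
		for (j = 0; j < 4; j++)
			printf("%d ", cpu_locality[i][j]);
		printf("\n");
	}
	return 0;
}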
#else
7004
void __init sched_init_smp(void)
7005
{
7006
}
7007
#endif /* CONFIG_SMP */
7008
7009
unsigned int sysctl_timer_migration = 1;
7010
7011
int in_sched_functions(unsigned long addr)
7012
{
7013
	return in_lock_functions(addr) ||
7014
		(addr >= (unsigned long)__sched_text_start
7015
		&& addr < (unsigned long)__sched_text_end);
7016
}
7017
7018
void __init sched_init(void)
7019
{
7020
	int i;
7021
	struct rq *rq;
7022
7023
	prio_ratios[0] = 128;
7024
	for (i = 1 ; i < PRIO_RANGE ; i++)
7025
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
7026
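	/*
	 * Each step is ~10% more in integer arithmetic, so the table runs
	 * 128, 140, 154, 169, ... up to prio_ratios[39] = 5089; the two
	 * extremes therefore differ by a factor of roughly 40.
	 */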
7027
	raw_spin_lock_init(&grq.lock);
7028
	grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
7029
	grq.niffies = 0;
7030
	grq.last_jiffy = jiffies;
7031
	raw_spin_lock_init(&grq.iso_lock);
7032
	grq.iso_ticks = 0;
7033
	grq.iso_refractory = false;
7034
	grq.noc = 1;
7035
#ifdef CONFIG_SMP
7036
	init_defrootdomain();
7037
	grq.qnr = grq.idle_cpus = 0;
7038
	cpumask_clear(&grq.cpu_idle_map);
7039
#else
7040
	uprq = &per_cpu(runqueues, 0);
7041
#endif
7042
	for_each_possible_cpu(i) {
7043
		rq = cpu_rq(i);
7044
		rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
7045
			      rq->iowait_pc = rq->idle_pc = 0;
7046
		rq->dither = false;
7047
#ifdef CONFIG_SMP
7048
		rq->sticky_task = NULL;
7049
		rq->last_niffy = 0;
7050
		rq->sd = NULL;
7051
		rq->rd = NULL;
7052
		rq->online = false;
7053
		rq->cpu = i;
7054
		rq_attach_root(rq, &def_root_domain);
7055
#endif
7056
		atomic_set(&rq->nr_iowait, 0);
7057
	}
7058
7059
#ifdef CONFIG_SMP
7060
	nr_cpu_ids = i;
7061
	/*
7062
	 * Set the base locality for cpu cache distance calculation to
7063
	 * "distant" (3). Make sure the distance from a CPU to itself is 0.
7064
	 */
7065
	for_each_possible_cpu(i) {
7066
		int j;
7067
7068
		rq = cpu_rq(i);
7069
#ifdef CONFIG_SCHED_SMT
7070
		cpumask_clear(&rq->smt_siblings);
7071
		cpumask_set_cpu(i, &rq->smt_siblings);
7072
		rq->siblings_idle = sole_cpu_idle;
7073
		cpumask_set_cpu(i, &rq->smt_siblings);
7074
#endif
7075
#ifdef CONFIG_SCHED_MC
7076
		cpumask_clear(&rq->cache_siblings);
7077
		cpumask_set_cpu(i, &rq->cache_siblings);
7078
		rq->cache_idle = sole_cpu_idle;
7079
		cpumask_set_cpu(i, &rq->cache_siblings);
7080
#endif
7081
		rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(int *), GFP_ATOMIC);
7082
		for_each_possible_cpu(j) {
7083
			if (i == j)
7084
				rq->cpu_locality[j] = 0;
7085
			else
7086
				rq->cpu_locality[j] = 4;
7087
		}
7088
	}
7089
#endif
7090
7091
	for (i = 0; i < PRIO_LIMIT; i++)
7092
		INIT_LIST_HEAD(grq.queue + i);
7093
	/* delimiter for bitsearch */
7094
	__set_bit(PRIO_LIMIT, grq.prio_bitmap);
7095
7096
#ifdef CONFIG_PREEMPT_NOTIFIERS
7097
	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7098
#endif
7099
7100
#ifdef CONFIG_RT_MUTEXES
7101
	plist_head_init(&init_task.pi_waiters);
7102
#endif
7103
7104
	/*
7105
	 * The boot idle thread does lazy MMU switching as well:
7106
	 */
7107
	atomic_inc(&init_mm.mm_count);
7108
	enter_lazy_tlb(&init_mm, current);
7109
7110
	/*
7111
	 * Make us the idle thread. Technically, schedule() should not be
7112
	 * called from this thread, however somewhere below it might be,
7113
	 * but because we are the idle thread, we just pick up running again
7114
	 * when this runqueue becomes "idle".
7115
	 */
7116
	init_idle(current, smp_processor_id());
7117
7118
#ifdef CONFIG_SMP
7119
	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7120
	/* May be allocated at isolcpus cmdline parse time */
7121
	if (cpu_isolated_map == NULL)
7122
		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7123
	idle_thread_set_boot_cpu();
7124
#endif /* SMP */
7125
}
7126
7127
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7128
static inline int preempt_count_equals(int preempt_offset)
7129
{
7130
	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7131
7132
	return (nested == preempt_offset);
7133
}
7134
7135
void __might_sleep(const char *file, int line, int preempt_offset)
7136
{
7137
	static unsigned long prev_jiffy;	/* ratelimiting */
7138
7139
	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7140
	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7141
	    system_state != SYSTEM_RUNNING || oops_in_progress)
7142
		return;
7143
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7144
		return;
7145
	prev_jiffy = jiffies;
7146
7147
	printk(KERN_ERR
7148
		"BUG: sleeping function called from invalid context at %s:%d\n",
7149
			file, line);
7150
	printk(KERN_ERR
7151
		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7152
			in_atomic(), irqs_disabled(),
7153
			current->pid, current->comm);
7154
7155
	debug_show_held_locks(current);
7156
	if (irqs_disabled())
7157
		print_irqtrace_events(current);
7158
	dump_stack();
7159
}
7160
EXPORT_SYMBOL(__might_sleep);
7161
#endif
7162
7163
#ifdef CONFIG_MAGIC_SYSRQ
7164
void normalize_rt_tasks(void)
7165
{
7166
	struct task_struct *g, *p;
7167
	unsigned long flags;
7168
	struct rq *rq;
7169
	int queued;
7170
7171
	read_lock_irqsave(&tasklist_lock, flags);
7172
7173
	do_each_thread(g, p) {
7174
		if (!rt_task(p) && !iso_task(p))
7175
			continue;
7176
7177
		raw_spin_lock(&p->pi_lock);
7178
		rq = __task_grq_lock(p);
7179
7180
		queued = task_queued(p);
7181
		if (queued)
7182
			dequeue_task(p);
7183
		__setscheduler(p, rq, SCHED_NORMAL, 0);
7184
		if (queued) {
7185
			enqueue_task(p);
7186
			try_preempt(p, rq);
7187
		}
7188
7189
		__task_grq_unlock();
7190
		raw_spin_unlock(&p->pi_lock);
7191
	} while_each_thread(g, p);
7192
7193
	read_unlock_irqrestore(&tasklist_lock, flags);
7194
}
7195
#endif /* CONFIG_MAGIC_SYSRQ */
7196
7197
#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7198
/*
7199
 * These functions are only useful for the IA64 MCA handling, or kdb.
7200
 *
7201
 * They can only be called when the whole system has been
7202
 * stopped - every CPU needs to be quiescent, and no scheduling
7203
 * activity can take place. Using them for anything else would
7204
 * be a serious bug, and as a result, they aren't even visible
7205
 * under any other configuration.
7206
 */
7207
7208
/**
7209
 * curr_task - return the current task for a given cpu.
7210
 * @cpu: the processor in question.
7211
 *
7212
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7213
 *
7214
 * Return: The current task for @cpu.
7215
 */
7216
struct task_struct *curr_task(int cpu)
7217
{
7218
	return cpu_curr(cpu);
7219
}
7220
7221
#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7222
7223
#ifdef CONFIG_IA64
7224
/**
7225
 * set_curr_task - set the current task for a given cpu.
7226
 * @cpu: the processor in question.
7227
 * @p: the task pointer to set.
7228
 *
7229
 * Description: This function must only be used when non-maskable interrupts
7230
 * are serviced on a separate stack.  It allows the architecture to switch the
7231
 * notion of the current task on a cpu in a non-blocking manner.  This function
7232
 * must be called with all CPU's synchronised, and interrupts disabled, the
7233
 * and caller must save the original value of the current task (see
7234
 * curr_task() above) and restore that value before reenabling interrupts and
7235
 * re-starting the system.
7236
 *
7237
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7238
 */
7239
void set_curr_task(int cpu, struct task_struct *p)
7240
{
7241
	cpu_curr(cpu) = p;
7242
}
7243
7244
#endif
7245
7246
/*
7247
 * Use precise platform statistics if available:
7248
 */
7249
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
7250
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
7251
{
7252
	*ut = p->utime;
7253
	*st = p->stime;
7254
}
7255
7256
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
7257
{
7258
	struct task_cputime cputime;
7259
7260
	thread_group_cputime(p, &cputime);
7261
7262
	*ut = cputime.utime;
7263
	*st = cputime.stime;
7264
}
7265
7266
void vtime_account_system_irqsafe(struct task_struct *tsk)
7267
{
7268
	unsigned long flags;
7269
7270
	local_irq_save(flags);
7271
	vtime_account_system(tsk);
7272
	local_irq_restore(flags);
7273
}
7274
EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
7275
7276
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
7277
void vtime_task_switch(struct task_struct *prev)
7278
{
7279
	if (is_idle_task(prev))
7280
		vtime_account_idle(prev);
7281
	else
7282
		vtime_account_system(prev);
7283
7284
	vtime_account_user(prev);
7285
	arch_vtime_task_switch(prev);
7286
}
7287
#endif
7288
7289
#else
7290
/*
7291
 * Perform (stime * rtime) / total, but avoid multiplication overflow by
7292
 * losing precision when the numbers are big.
7293
 */
7294
static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
7295
{
7296
	u64 scaled;
7297
7298
	for (;;) {
7299
		/* Make sure "rtime" is the bigger of stime/rtime */
7300
		if (stime > rtime) {
7301
			u64 tmp = rtime; rtime = stime; stime = tmp;
7302
		}
7303
7304
		/* Make sure 'total' fits in 32 bits */
7305
		if (total >> 32)
7306
			goto drop_precision;
7307
7308
		/* Does rtime (and thus stime) fit in 32 bits? */
7309
		if (!(rtime >> 32))
7310
			break;
7311
7312
		/* Can we just balance rtime/stime rather than dropping bits? */
7313
		if (stime >> 31)
7314
			goto drop_precision;
7315
7316
		/* We can grow stime and shrink rtime and try to make them both fit */
7317
		stime <<= 1;
7318
		rtime >>= 1;
7319
		continue;
7320
7321
drop_precision:
7322
		/* We drop from rtime, it has more bits than stime */
7323
		rtime >>= 1;
7324
		total >>= 1;
7325
	}
7326
7327
	/*
7328
	 * Make sure gcc understands that this is a 32x32->64 multiply,
7329
	 * followed by a 64/32->64 divide.
7330
	 */
7331
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
7332
	return (__force cputime_t) scaled;
7333
}
7334
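A stand-alone model of the loop above (uint64_t standing in for u64/cputime_t), fed numbers large enough that a naive stime * rtime would overflow 64 bits; the precision-dropping path still recovers the exact answer here:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime) {
			uint64_t tmp = rtime; rtime = stime; stime = tmp;
		}
		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;
		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;
		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;
		/* Grow stime and shrink rtime to make both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;
drop_precision:
		/* Drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}
	return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
}

int main(void)
{
	uint64_t stime = 1ULL << 32;	/* system time, in made-up "ticks"  */
	uint64_t rtime = 3ULL << 32;	/* scheduler-accounted runtime      */
	uint64_t total = 4ULL << 32;	/* stime + utime                    */

	/* Exact answer is stime * rtime / total = 3 * 2^30 = 3221225472. */
	printf("scaled stime = %" PRIu64 "\n", scale_stime(stime, rtime, total));
	return 0;
}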
7335
/*
7336
 * Adjust the variable precision of tick based cputime against scheduler
7337
 * runtime accounting.
7338
 */
7339
static void cputime_adjust(struct task_cputime *curr,
7340
			   struct cputime *prev,
7341
			   cputime_t *ut, cputime_t *st)
7342
{
7343
	cputime_t rtime, stime, utime, total;
7344
7345
	stime = curr->stime;
7346
	total = stime + curr->utime;
7347
7348
	/*
7349
	 * Tick based cputime accounting depends on random scheduling
7350
	 * timeslices of a task to be interrupted or not by the timer.
7351
	 * Depending on these circumstances, the number of these interrupts
7352
	 * may be over or under-optimistic, matching the real user and system
7353
	 * cputime with a variable precision.
7354
	 *
7355
	 * Fix this by scaling these tick based values against the total
7356
	 * runtime accounted by the scheduler.
7357
	 */
7358
	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
7359
7360
	/*
7361
	 * Update userspace visible utime/stime values only if actual execution
7362
	 * time is bigger than already exported. Note that it can happen that we
7363
	 * provided bigger values due to scaling inaccuracy on big numbers.
7364
	 */
7365
	if (prev->stime + prev->utime >= rtime)
7366
		goto out;
7367
7368
	if (total) {
7369
		stime = scale_stime((__force u64)stime,
7370
				    (__force u64)rtime, (__force u64)total);
7371
		utime = rtime - stime;
7372
	} else {
7373
		stime = rtime;
7374
		utime = 0;
7375
	}
7376
7377
	/*
7378
	 * If the tick based count grows faster than the scheduler one,
7379
	 * the result of the scaling may go backward.
7380
	 * Let's enforce monotonicity.
7381
	 */
7382
	prev->stime = max(prev->stime, stime);
7383
	prev->utime = max(prev->utime, utime);
7384
7385
out:
7386
	*ut = prev->utime;
7387
	*st = prev->stime;
7388
}
7389
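The max() pair above is easiest to see with numbers: if a previous read already reported utime=100 and stime=50, and the fresh scaled split comes out as 95/55, only stime is allowed to move. A trivial stand-alone illustration:

#include <stdio.h>

int main(void)
{
	unsigned long prev_utime = 100, prev_stime = 50;	/* already reported */
	unsigned long new_utime = 95, new_stime = 55;		/* fresh scaled split */

	prev_utime = prev_utime > new_utime ? prev_utime : new_utime;
	prev_stime = prev_stime > new_stime ? prev_stime : new_stime;

	/* Reported values: utime stays 100, stime moves up to 55. */
	printf("utime=%lu stime=%lu\n", prev_utime, prev_stime);
	return 0;
}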
7390
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
7391
{
7392
	struct task_cputime cputime = {
7393
		.sum_exec_runtime = tsk_seruntime(p),
7394
	};
7395
7396
	task_cputime(p, &cputime.utime, &cputime.stime);
7397
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
7398
}
7399
7400
/*
7401
 * Must be called with siglock held.
7402
 */
7403
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
7404
{
7405
	struct task_cputime cputime;
7406
7407
	thread_group_cputime(p, &cputime);
7408
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
7409
}
7410
#endif
7411
7412
void init_idle_bootup_task(struct task_struct *idle)
7413
{}
7414
7415
#ifdef CONFIG_SCHED_DEBUG
7416
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
7417
{}
7418
7419
void proc_sched_set_task(struct task_struct *p)
7420
{}
7421
#endif
7422
7423
#ifdef CONFIG_SMP
7424
#define SCHED_LOAD_SHIFT	(10)
7425
#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
7426
7427
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
7428
{
7429
	return SCHED_LOAD_SCALE;
7430
}
7431
7432
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
7433
{
7434
	unsigned long weight = cpumask_weight(sched_domain_span(sd));
7435
	unsigned long smt_gain = sd->smt_gain;
7436
7437
	smt_gain /= weight;
7438
7439
	return smt_gain;
7440
}
7441
#endif
7442
++ b/include/uapi/linux/sched.h
Lines 37-44 Link Here
37
#define SCHED_FIFO		1
37
#define SCHED_FIFO		1
38
#define SCHED_RR		2
38
#define SCHED_RR		2
39
#define SCHED_BATCH		3
39
#define SCHED_BATCH		3
40
/* SCHED_ISO: reserved but not implemented yet */
40
/* SCHED_ISO: Implemented on BFS only */
41
#define SCHED_IDLE		5
41
#define SCHED_IDLE		5
42
#ifdef CONFIG_SCHED_BFS
43
#define SCHED_ISO		4
44
#define SCHED_IDLEPRIO		SCHED_IDLE
45
#define SCHED_MAX		(SCHED_IDLEPRIO)
46
#define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
47
#endif
48
42
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
49
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
43
#define SCHED_RESET_ON_FORK     0x40000000
50
#define SCHED_RESET_ON_FORK     0x40000000
44
51
45
-- a/include/linux/sched/rt.h
52
++ b/include/linux/sched/rt.h
Lines 14-24 Link Here
14
 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
14
 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
15
 */
15
 */
16
16
17
#ifdef CONFIG_SCHED_BFS
18
#define MAX_USER_RT_PRIO	100
19
#define MAX_RT_PRIO		(MAX_USER_RT_PRIO + 1)
20
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
21
22
#define PRIO_RANGE		(40)
23
#define MAX_PRIO		(MAX_RT_PRIO + PRIO_RANGE)
24
#define ISO_PRIO		(MAX_RT_PRIO)
25
#define NORMAL_PRIO		(MAX_RT_PRIO + 1)
26
#define IDLE_PRIO		(MAX_RT_PRIO + 2)
27
#define PRIO_LIMIT		((IDLE_PRIO) + 1)
28
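/*
 * With the default MAX_USER_RT_PRIO of 100 this works out to: realtime
 * priorities 0..100, ISO_PRIO = 101, NORMAL_PRIO = 102, IDLE_PRIO = 103
 * and PRIO_LIMIT = 104 global runqueue slots, while DEFAULT_PRIO = 121
 * and MAX_PRIO = 141 keep the usual 40-level nice mapping for static_prio.
 */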
#else /* CONFIG_SCHED_BFS */
17
#define MAX_USER_RT_PRIO	100
29
#define MAX_USER_RT_PRIO	100
18
#define MAX_RT_PRIO		MAX_USER_RT_PRIO
30
#define MAX_RT_PRIO		MAX_USER_RT_PRIO
19
31
20
#define MAX_PRIO		(MAX_RT_PRIO + 40)
32
#define MAX_PRIO		(MAX_RT_PRIO + 40)
21
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
33
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
34
#endif /* CONFIG_SCHED_BFS */
22
35
23
static inline int rt_prio(int prio)
36
static inline int rt_prio(int prio)
24
{
37
{
25
-- a/kernel/stop_machine.c
38
++ b/kernel/stop_machine.c
Lines 40-46 Link Here
40
};
40
};
41
41
42
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
42
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43
static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
43
DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44
44
static bool stop_machine_initialized = false;
45
static bool stop_machine_initialized = false;
45
46
46
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47
-- a/drivers/cpufreq/cpufreq_conservative.c
48
++ b/drivers/cpufreq/cpufreq_conservative.c
Lines 27-34 Link Here
27
#include "cpufreq_governor.h"
27
#include "cpufreq_governor.h"
28
28
29
/* Conservative governor macros */
29
/* Conservative governor macros */
30
#define DEF_FREQUENCY_UP_THRESHOLD		(80)
30
#define DEF_FREQUENCY_UP_THRESHOLD		(63)
31
#define DEF_FREQUENCY_DOWN_THRESHOLD		(20)
31
#define DEF_FREQUENCY_DOWN_THRESHOLD		(26)
32
#define DEF_FREQUENCY_STEP			(5)
32
#define DEF_FREQUENCY_STEP			(5)
33
#define DEF_SAMPLING_DOWN_FACTOR		(1)
33
#define DEF_SAMPLING_DOWN_FACTOR		(1)
34
#define MAX_SAMPLING_DOWN_FACTOR		(10)
34
#define MAX_SAMPLING_DOWN_FACTOR		(10)
35
-- linux-3.11-bfs.orig/kernel/time/Kconfig
35
++ b/kernel/time/Kconfig
Lines 94-100 Link Here
94
config NO_HZ_FULL
94
config NO_HZ_FULL
95
	bool "Full dynticks system (tickless)"
95
	bool "Full dynticks system (tickless)"
96
	# NO_HZ_COMMON dependency
96
	# NO_HZ_COMMON dependency
97
	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
97
	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS
98
	# We need at least one periodic CPU for timekeeping
98
	# We need at least one periodic CPU for timekeeping
99
	depends on SMP
99
	depends on SMP
100
	# RCU_USER_QS dependency
100
	# RCU_USER_QS dependency
101
-- a/kernel/sched/Makefile
101
++ b/kernel/sched/Makefile
Lines 11-19 Link Here
11
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
11
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12
endif
12
endif
13
13
14
ifdef CONFIG_SCHED_BFS
15
obj-y += bfs.o clock.o
16
else
14
obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
17
obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15
obj-$(CONFIG_SMP) += cpupri.o
16
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
18
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17
obj-$(CONFIG_SCHEDSTATS) += stats.o
18
obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
20
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
20
-- /dev/null
21
endif
22
obj-$(CONFIG_SMP) += cpupri.o
23
obj-$(CONFIG_SCHEDSTATS) += stats.o
24
++ b/kernel/sched/bfs_sched.h
Line 0 Link Here
0
-- a/kernel/sched/stats.c
1
#include <linux/sched.h>
2
3
#ifndef BFS_SCHED_H
4
#define BFS_SCHED_H
5
6
/*
7
 * This is the main, per-CPU runqueue data structure.
8
 * This data should only be modified by the local cpu.
9
 */
10
struct rq {
11
	struct task_struct *curr, *idle, *stop;
12
	struct mm_struct *prev_mm;
13
14
	/* Stored data about rq->curr to work outside grq lock */
15
	u64 rq_deadline;
16
	unsigned int rq_policy;
17
	int rq_time_slice;
18
	u64 rq_last_ran;
19
	int rq_prio;
20
	bool rq_running; /* There is a task running */
21
22
	/* Accurate timekeeping data */
23
	u64 timekeep_clock;
24
	unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
25
		iowait_pc, idle_pc;
26
	atomic_t nr_iowait;
27
28
#ifdef CONFIG_SMP
29
	int cpu;		/* cpu of this runqueue */
30
	bool online;
31
	bool scaling; /* This CPU is managed by a scaling CPU freq governor */
32
	struct task_struct *sticky_task;
33
34
	struct root_domain *rd;
35
	struct sched_domain *sd;
36
	int *cpu_locality; /* CPU relative cache distance */
37
#ifdef CONFIG_SCHED_SMT
38
	bool (*siblings_idle)(int cpu);
39
	/* See if all smt siblings are idle */
40
	cpumask_t smt_siblings;
41
#endif /* CONFIG_SCHED_SMT */
42
#ifdef CONFIG_SCHED_MC
43
	bool (*cache_idle)(int cpu);
44
	/* See if all cache siblings are idle */
45
	cpumask_t cache_siblings;
46
#endif /* CONFIG_SCHED_MC */
47
	u64 last_niffy; /* Last time this RQ updated grq.niffies */
48
#endif /* CONFIG_SMP */
49
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
50
	u64 prev_irq_time;
51
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
52
#ifdef CONFIG_PARAVIRT
53
	u64 prev_steal_time;
54
#endif /* CONFIG_PARAVIRT */
55
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
56
	u64 prev_steal_time_rq;
57
#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
58
59
	u64 clock, old_clock, last_tick;
60
	u64 clock_task;
61
	bool dither;
62
63
#ifdef CONFIG_SCHEDSTATS
64
65
	/* latency stats */
66
	struct sched_info rq_sched_info;
67
	unsigned long long rq_cpu_time;
68
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
69
70
	/* sys_sched_yield() stats */
71
	unsigned int yld_count;
72
73
	/* schedule() stats */
74
	unsigned int sched_switch;
75
	unsigned int sched_count;
76
	unsigned int sched_goidle;
77
78
	/* try_to_wake_up() stats */
79
	unsigned int ttwu_count;
80
	unsigned int ttwu_local;
81
#endif /* CONFIG_SCHEDSTATS */
82
83
#ifdef CONFIG_SMP
84
	struct llist_head wake_list;
85
#endif
86
};
87
88
#ifdef CONFIG_SMP
89
struct rq *cpu_rq(int cpu);
90
#endif
91
92
static inline u64 rq_clock(struct rq *rq)
93
{
94
	return rq->clock;
95
}
96
97
static inline u64 rq_clock_task(struct rq *rq)
98
{
99
	return rq->clock_task;
100
}
101
102
#define rcu_dereference_check_sched_domain(p) \
103
	rcu_dereference_check((p), \
104
			      lockdep_is_held(&sched_domains_mutex))
105
106
/*
107
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
108
 * See detach_destroy_domains: synchronize_sched for details.
109
 *
110
 * The domain tree of any CPU may only be accessed from within
111
 * preempt-disabled sections.
112
 */
113
#define for_each_domain(cpu, __sd) \
114
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
115
116
#endif
117
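for_each_domain() walks one CPU's domain chain from the innermost level up through the ->parent pointers and, per the comment above, must run under RCU with preemption disabled. A hedged sketch of such a walk (the function name is invented; it only uses fields and helpers that appear elsewhere in this patch):

static void print_domain_levels(int cpu)
{
	struct sched_domain *sd;

	rcu_read_lock();
	for_each_domain(cpu, sd)
		printk(KERN_INFO "cpu%d: level %d spans %u CPUs\n",
		       cpu, sd->level, cpumask_weight(sched_domain_span(sd)));
	rcu_read_unlock();
}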
++ b/kernel/sched/stats.c
Lines 4-10 Link Here
4
#include <linux/seq_file.h>
4
#include <linux/seq_file.h>
5
#include <linux/proc_fs.h>
5
#include <linux/proc_fs.h>
6
6
7
#ifndef CONFIG_SCHED_BFS
7
#include "sched.h"
8
#include "sched.h"
9
#else
10
#include "bfs_sched.h"
11
#endif
8
12
9
/*
13
/*
10
 * bump this up when changing the output format or the meaning of an existing
14
 * bump this up when changing the output format or the meaning of an existing
11
-- a/include/linux/spinlock.h
15
++ b/include/linux/spinlock.h
Lines 117-122 Link Here
117
#endif /*arch_spin_is_contended*/
117
#endif /*arch_spin_is_contended*/
118
#endif
118
#endif
119
119
120
#ifndef smp_mb__before_spinlock
121
#define smp_mb__before_spinlock()  smp_wmb()
122
#endif
123
120
/* The lock does not imply full memory barrier. */
124
/* The lock does not imply full memory barrier. */
121
#ifndef ARCH_HAS_SMP_MB_AFTER_LOCK
125
#ifndef ARCH_HAS_SMP_MB_AFTER_LOCK
122
static inline void smp_mb__after_lock(void) { smp_mb(); }
126
static inline void smp_mb__after_lock(void) { smp_mb(); }
