 Documentation/scheduler/sched-BFS.txt     |  347 +
 Documentation/sysctl/kernel.txt           |   26
 arch/powerpc/platforms/cell/spufs/sched.c |    5
 drivers/cpufreq/cpufreq.c                 |    7
 drivers/cpufreq/cpufreq_conservative.c    |    4
 drivers/cpufreq/cpufreq_ondemand.c        |    8
 fs/proc/base.c                            |    2
 include/linux/init_task.h                 |   64
 include/linux/ioprio.h                    |    2
 include/linux/jiffies.h                   |    2
 include/linux/sched.h                     |   88
 include/linux/sched/rt.h                  |   13
 include/uapi/linux/sched.h                |    9
 init/Kconfig                              |   54
 init/main.c                               |    3
 kernel/delayacct.c                        |    2
 kernel/exit.c                             |    2
 kernel/posix-cpu-timers.c                 |   14
 kernel/sched/Makefile                     |    8
 kernel/sched/bfs.c                        | 7423 ++++++++++++++++++++++++++++++
 kernel/stop_machine.c                     |    3
 kernel/sysctl.c                           |   31
 kernel/time/Kconfig                       |    2
 lib/Kconfig.debug                         |    2
 24 files changed, 8048 insertions(+), 73 deletions(-)
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -64,11 +64,6 @@
 static struct timer_list spuloadavg_timer;
 
 /*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO		120
-
-/*
  * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
  * tick for every 10 CPU scheduler ticks.
  */
--- /dev/null
+++ b/Documentation/scheduler/sched-BFS.txt
BFS - The Brain Fuck Scheduler by Con Kolivas.

Goals.

The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
completely do away with the complex designs of the past for the cpu process
scheduler and instead implement one that is very simple in basic design.
The main focus of BFS is to achieve excellent desktop interactivity and
responsiveness without heuristics and tuning knobs that are difficult to
understand, impossible to model and predict the effect of, and when tuned to
one workload cause massive detriment to another.


Design summary.

BFS is best described as a single runqueue, O(n) lookup, earliest effective
virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
deadline first) and my previous Staircase Deadline scheduler. Each component
shall be described in order to understand the significance of, and reasoning
for, it. The codebase when the first stable version was released was
approximately 9000 lines less code than the existing mainline linux kernel
scheduler (in 2.6.31). This does not even take into account the removal of
documentation and the cgroups code that is not used.

Design reasoning.

The single runqueue refers to the queued but not running processes for the
entire system, regardless of the number of CPUs. The reason for going back to
a single runqueue design is that once multiple runqueues are introduced,
per-CPU or otherwise, there will be complex interactions as each runqueue
becomes responsible for the scheduling latency and fairness of only the tasks
on its own runqueue. Any throughput advantage from keeping tasks CPU-local
then brings other disadvantages: a very complex balancing system is required
to achieve, at best, some semblance of fairness across CPUs, and it can only
maintain relatively low latency for tasks bound to the same CPUs, not across
them. To increase said fairness and latency across CPUs, the advantage of
local runqueue locking, which makes for better scalability, is lost due to
having to grab multiple locks.

A significant feature of BFS is that all accounting is done purely based on CPU
used and nowhere is sleep time used in any way to determine entitlement or
interactivity. Interactivity "estimators" that use some kind of sleep/run
algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
tasks that aren't interactive as being so. The reason for this is that it is
close to impossible to determine, when a task is sleeping, whether it is doing
so voluntarily, as in a userspace application waiting for input in the form of
a mouse click or otherwise, or involuntarily, because it is waiting for another
thread, process, I/O, kernel activity or whatever. Thus, such an estimator will
introduce corner cases, and more heuristics will be required to cope with those
corner cases, introducing more corner cases and failed interactivity detection
and so on. Interactivity in BFS is built into the design by virtue of the fact
that tasks that are waking up have not used up their quota of CPU time, and
have earlier effective deadlines, thereby making it very likely they will
preempt any CPU bound task of equivalent nice level. See below for more
information on the virtual deadline mechanism. Even if they do not preempt a
running task, because the rr interval is guaranteed to have a bounded upper
limit on how long a task will wait for, it will be scheduled within a timeframe
that will not cause visible interface jitter.


Design details.

Task insertion.

BFS inserts tasks into each relevant queue as an O(1) insertion into a doubly
linked list. On insertion, *every* running queue is checked to see if the newly
queued task can run on any idle queue, or preempt the lowest running task on the
system. This is how the cross-CPU scheduling of BFS achieves significantly lower
latency per extra CPU the system has. In this case the lookup is, in the worst
case scenario, O(n) where n is the number of CPUs on the system.
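
As an illustrative sketch of the insertion (the names grq, prio_index and
try_preempt are assumptions made here for illustration, not the patch's
actual identifiers):

	/* Illustrative only: O(1) tail insertion plus bitmap update,
	 * followed by the O(CPUs) idle/preemption check described above. */
	static void bfs_enqueue(struct task_struct *p)
	{
		list_add_tail(&p->run_list, &grq.queue[prio_index(p)]);
		__set_bit(prio_index(p), grq.prio_bitmap);
		try_preempt(p);		/* checks every CPU once */
	}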

Data protection.

BFS has one single lock protecting the process local data of every task in the
global queue. Thus every insertion, removal and modification of task data in the
global runqueue needs to grab the global lock. However, once a task is taken by
a CPU, the CPU has its own local data copy of the running process' accounting
information which only that CPU accesses and modifies (such as during a
timer tick), thus allowing the accounting data to be updated locklessly. Once a
CPU has taken a task to run, it removes it from the global queue. Thus the
global queue only ever has, at most,

	(number of tasks requesting cpu time) - (number of logical CPUs) + 1

tasks in the global queue. This value is relevant for the time taken to look up
tasks during scheduling. It can grow beyond that if many tasks have their CPU
affinity set to limit which CPUs they are allowed to run on and those tasks
outnumber the CPUs they are restricted to. The +1 is because when rescheduling
a task, the CPU's currently running task is put back on the queue. Lookup will
be described after the virtual deadline mechanism is explained.
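
For example (an illustrative calculation, not from the original text): on a
machine with 4 logical CPUs and 12 tasks requesting cpu time, 4 tasks are
running on their CPUs, so the global queue holds at most 12 - 4 + 1 = 9 tasks,
the +1 appearing while a rescheduling CPU has just put its current task back
on the queue.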

Virtual deadline.

The key to achieving low latency, scheduling fairness, and "nice level"
distribution in BFS is entirely in the virtual deadline mechanism. The one
tunable in BFS is the rr_interval, or "round robin interval". This is the
maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
tasks of the same nice level will be running for, or looking at it the other
way around, the longest duration two tasks of the same nice level will be
delayed for. When a task requests cpu time, it is given a quota (time_slice)
equal to the rr_interval and a virtual deadline. The virtual deadline is
offset from the current time in jiffies by this equation:

	jiffies + (prio_ratio * rr_interval)

The prio_ratio is determined as a ratio compared to the baseline of nice -20
and increases by 10% per nice level. The deadline is a virtual one only in that
no guarantee is placed that a task will actually be scheduled by this time, but
it is used to compare which task should go next. There are three components to
how a task is next chosen. First is time_slice expiration. If a task runs out
of its time_slice, it is descheduled, the time_slice is refilled, and the
deadline reset to that formula above. Second is sleep, where a task is no
longer requesting CPU for whatever reason. The time_slice and deadline are
_not_ adjusted in this case and are just carried over for when the task is
next scheduled. Third is preemption, and that is when a newly waking task is
deemed higher priority than a currently running task on any cpu by virtue of
the fact that it has an earlier virtual deadline than the currently running
task. The earlier deadline is the key to which task is next chosen for the
first and second cases. Once a task is descheduled, it is put back on the
queue, and an O(n) lookup of all queued-but-not-running tasks is done to
determine which has the earliest deadline and that task is chosen to receive
CPU next.
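
A minimal sketch of the formula (illustrative only, not code from the patch;
representing prio_ratio as a percentage is purely an assumption made here for
clarity - the real code uses its own scaling):

	/* deadline = jiffies + prio_ratio * rr_interval, with prio_ratio
	 * taken as a percentage: 100 at nice -20, ~10% more per level. */
	static unsigned long virtual_deadline(unsigned long now_jiffies,
					      unsigned long prio_ratio_pct,
					      unsigned long rr_interval_jiffies)
	{
		return now_jiffies +
		       (prio_ratio_pct * rr_interval_jiffies) / 100;
	}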

The CPU proportion of different nice tasks works out to be approximately the

	(prio_ratio difference)^2

The reason it is squared is that a task's deadline does not change while it is
running unless it runs out of time_slice. Thus, even if the time actually
passes the deadline of another task that is queued, it will not get CPU time
unless the current running task deschedules, and the time "base" (jiffies) is
constantly moving.
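
As an illustrative worked example (assuming the 10% increase per nice level
compounds, which is how the prio_ratios table in bfs.c is generated): tasks at
nice 0 and nice 5 differ in prio_ratio by a factor of about 1.1^5 ~= 1.61, so
the nice 0 task receives roughly 1.61^2 ~= 2.6 times as much CPU.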

Task lookup.

BFS has 103 priority queues. 100 of these are dedicated to the static priority
of realtime tasks, and the remaining 3 are, in order of best to worst priority,
SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
scheduling). When a task of these priorities is queued, a bitmap of running
priorities is set showing which of these priorities have tasks waiting for CPU
time. When a CPU is made to reschedule, the lookup for the next task to get
CPU time is performed in the following way:

First the bitmap is checked to see what static priority tasks are queued. If
any realtime priorities are found, the corresponding queue is checked and the
first task listed there is taken (provided CPU affinity is suitable) and lookup
is complete. If the priority corresponds to a SCHED_ISO task, they are also
taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
stage, every task in the runlist that corresponds to that priority is checked
to see which has the earliest set deadline, and (provided it has suitable CPU
affinity) it is taken off the runqueue and given the CPU. If a task has an
expired deadline, it is taken and the rest of the lookup aborted (as expired
deadlines are effectively chosen in FIFO order).
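
A sketch of that two-stage lookup, for illustration only (grq, queue_of,
PRIO_LIMIT, ISO_PRIO, first_suitable_task and suitable_cpu are assumed names
standing in for the real identifiers, and deadline wrap handling is omitted):

	static struct task_struct *earliest_deadline_task(int cpu)
	{
		int prio = find_first_bit(grq.prio_bitmap, PRIO_LIMIT);
		struct task_struct *p, *best = NULL;

		/* Realtime and ISO queues are taken in FIFO order. */
		if (prio < MAX_RT_PRIO || prio == ISO_PRIO)
			return first_suitable_task(queue_of(prio), cpu);

		/* SCHED_NORMAL/SCHED_IDLEPRIO: O(n) deadline scan. */
		list_for_each_entry(p, queue_of(prio), run_list) {
			if (!suitable_cpu(p, cpu))
				continue;
			if (p->deadline <= jiffies)
				return p;  /* expired: take it, abort scan */
			if (!best || p->deadline < best->deadline)
				best = p;
		}
		return best;
	}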

Thus, the lookup is O(n) in the worst case only, where n is as described
earlier, as tasks may be chosen before the whole task list is looked over.


Scalability.

The major limitation of BFS will be that of scalability, as the separate
runqueue designs will have less lock contention as the number of CPUs rises.
However they do not scale linearly even with separate runqueues as multiple
runqueues will need to be locked concurrently on such designs to be able to
achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
across CPUs, and to achieve low enough latency for tasks on a busy CPU when
other CPUs would be more suited. BFS has the advantage that it requires no
balancing algorithm whatsoever, as balancing occurs by proxy simply because
all CPUs draw off the global runqueue, in priority and deadline order. Despite
the fact that scalability is _not_ the prime concern of BFS, it both shows very
good scalability to smaller numbers of CPUs and is likely a more scalable design
at these numbers of CPUs.

It also has some very low overhead scalability features built into the design
when it has been deemed their overhead is so marginal that they're worth adding.
The first is the local copy of the running process' data to the CPU it's running
on to allow that data to be updated lockless where possible. Then there is
deference paid to the last CPU a task was running on, by trying that CPU first
when looking for an idle CPU to use the next time it's scheduled. Finally there
is the notion of "sticky" tasks that are flagged when they are involuntarily
descheduled, meaning they still want further CPU time. This sticky flag is
used to bias heavily against those tasks being scheduled on a different CPU
unless that CPU would be otherwise idle. When a cpu frequency governor is used
that scales with CPU load, such as ondemand, sticky tasks are not scheduled
on a different CPU at all, preferring instead to go idle. This means the CPU
they were bound to is more likely to increase its speed while the other CPU
will go idle, thus speeding up total task execution time and likely decreasing
power usage. This is the only scenario where BFS will allow a CPU to go idle
in preference to scheduling a task on the earliest available spare CPU.

The real cost of migrating a task from one CPU to another is entirely dependent
on the cache footprint of the task, how cache intensive the task is, how long
it's been running on that CPU to take up the bulk of its cache, how big the CPU
cache is, how fast and how layered the CPU cache is, how fast a context switch
is... and so on. In other words, it's close to random in the real world where we
do more than just one sole workload. The only thing we can be sure of is that
it's not free. So BFS uses the principle that an idle CPU is a wasted CPU and
utilising idle CPUs is more important than cache locality, and cache locality
only plays a part after that.

When choosing an idle CPU for a waking task, the cache locality is determined
according to where the task last ran and then idle CPUs are ranked from best
to worst to choose the most suitable idle CPU based on cache locality, NUMA
node locality and hyperthread sibling busyness. They are chosen in the
following preference (if idle):

* Same core, idle or busy cache, idle threads
* Other core, same cache, idle or busy cache, idle threads.
* Same node, other CPU, idle cache, idle threads.
* Same node, other CPU, busy cache, idle threads.
* Same core, busy threads.
* Other core, same cache, busy threads.
* Same node, other CPU, busy threads.
* Other node, other CPU, idle cache, idle threads.
* Other node, other CPU, busy cache, idle threads.
* Other node, other CPU, busy threads.

This shows the SMT or "hyperthread" awareness in the design as well, which will
choose a real idle core first before a logical SMT sibling which already has
tasks on the physical CPU.
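
(An illustrative note, not from the original text: two busy threads on one
hyperthreaded core contend for that core's execution units and each runs well
below full speed, which is why an idle SMT sibling ranks so much lower in the
list above than a genuinely idle core elsewhere.)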

Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
However this benchmarking was performed on an earlier design that was far less
scalable than the current one so it's hard to know how scalable it is in terms
of both CPUs (due to the global runqueue) and heavily loaded machines (due to
O(n) lookup) at this stage. Note that in terms of scalability, the number of
_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
quad core (4x) hyperthreaded (2x) machine is effectively a 16x. Newer benchmark
results are very promising indeed, without needing to tweak any knobs, features
or options. Benchmark contributions are most welcome.


Features

As the initial prime target audience for BFS was the average desktop user, it
was designed to not need tweaking, tuning or have features set to obtain benefit
from it. Thus the number of knobs and features has been kept to an absolute
minimum and should not require extra user input for the vast majority of cases.
There are precisely 2 tunables and 2 extra scheduling policies: the rr_interval
and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
support for CGROUPS. The average user should neither need to know what these
are, nor should they need to be using them to have good desktop behaviour.

rr_interval

There is only one "scheduler" tunable, the round robin interval. This can be
accessed in

	/proc/sys/kernel/rr_interval

The value is in milliseconds, and the default value is set to 6ms. Valid values
are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
decreasing throughput, while increasing it will improve throughput, but at the
cost of worsening latencies. The accuracy of the rr interval is limited by HZ
resolution of the kernel configuration. Thus, the worst case latencies are
usually slightly higher than this actual value. BFS uses "dithering" to try and
minimise the effect the HZ limitation has. The default value of 6 is not an
arbitrary one. It is based on the fact that humans can detect jitter at
approximately 7ms, so aiming for much lower latencies is pointless under most
circumstances. It is worth noting this fact when comparing the latency
performance of BFS to other schedulers. Worst case latencies being higher than
7ms are far worse than average latencies not being in the microsecond range.
Experimentation has shown that rr intervals being increased up to 300 can
improve throughput, but beyond that, scheduling noise from elsewhere prevents
further demonstrable throughput gains.
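
Illustrative usage of the tunable (the values here are examples only):

	echo 2 > /proc/sys/kernel/rr_interval		(favour latency)
	echo 300 > /proc/sys/kernel/rr_interval		(favour throughput)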

Isochronous scheduling.

Isochronous scheduling is a unique scheduling policy designed to provide
near-real-time performance to unprivileged (ie non-root) users without the
ability to starve the machine indefinitely. Isochronous tasks (which means
"same time") are set using, for example, the schedtool application like so:

	schedtool -I -e amarok

This will start the audio application "amarok" as SCHED_ISO. SCHED_ISO tasks
have a priority level between true realtime tasks and SCHED_NORMAL, which
allows them to preempt all normal tasks, in a SCHED_RR fashion (ie, if
multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
rate). However if ISO tasks run for more than a tunable finite amount of time,
they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
time is the percentage of _total CPU_ available across the machine,
configurable as a percentage in the following "resource handling" tunable (as
opposed to a scheduler tunable):

	/proc/sys/kernel/iso_cpu

and is set to 70% by default. It is calculated over a rolling 5 second average.
Because it is the total CPU available, it means that on a multi CPU machine, it
is possible to have an ISO task running with realtime scheduling indefinitely
on just one CPU, as the other CPUs will be available. Setting this to 100 is
the equivalent of giving all users SCHED_RR access and setting it to 0 removes
the ability to run any pseudo-realtime tasks.
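
Illustrative usage (values are examples only; the semantics are as described
above):

	echo 0 > /proc/sys/kernel/iso_cpu	(no unprivileged pseudo-realtime)
	echo 100 > /proc/sys/kernel/iso_cpu	(like granting all users SCHED_RR)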

A feature of BFS is that it detects when an application tries to obtain a
realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
appropriate privileges to use those policies. When it detects this, it will
give the task SCHED_ISO policy instead. Thus it is transparent to the user.
Because some applications constantly set their policy as well as their nice
level, there is potential for them to undo a user's explicit command-line
choice of SCHED_ISO policy. To counter this, once a task has been set to
SCHED_ISO policy, it needs superuser privileges to set it back to
SCHED_NORMAL. This will ensure the task remains ISO and all child processes
and threads will also inherit the ISO policy.
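
An illustrative userspace example of that behaviour (assuming a BFS kernel,
where an unprivileged SCHED_RR request may be granted as SCHED_ISO instead of
being refused; on other kernels the call simply fails with EPERM and the
policy is unchanged):

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 1 };

		/* Unprivileged request for a realtime policy. */
		sched_setscheduler(0, SCHED_RR, &sp);
		printf("policy is now %d\n", sched_getscheduler(0));
		return 0;
	}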

Idleprio scheduling.

Idleprio scheduling is a scheduling policy designed to give out CPU to a task
_only_ when the CPU would be otherwise idle. The idea behind this is to allow
ultra low priority tasks to be run in the background that have virtually no
effect on the foreground tasks. This is ideally suited to distributed computing
clients (like setiathome, folding, mprime etc) but can also be used to start
a video encode or so on without any slowdown of other tasks. To prevent tasks
under this policy from grabbing shared resources and holding them indefinitely,
if BFS detects a state where the task is waiting on I/O, the machine is about
to suspend to ram, and so on, it will transiently schedule them as
SCHED_NORMAL. As per the Isochronous task management, once a task has been
scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without superuser
privileges. Tasks can be set to start as SCHED_IDLEPRIO with the schedtool
command like so:

	schedtool -D -e ./mprime

Subtick accounting.

It is surprisingly difficult to get accurate CPU accounting, and in many cases,
the accounting is done by simply determining what is happening at the precise
moment a timer tick fires off. This becomes increasingly inaccurate as the
timer tick frequency (HZ) is lowered. It is possible to create an application
which uses almost 100% CPU, yet by being descheduled at the right time, records
zero CPU usage. While the main problem with this is that there are possible
security implications, it is also difficult to determine how much CPU a task
really does use. BFS tries to use the sub-tick accounting from the TSC clock,
where possible, to determine real CPU usage. This is not entirely reliable, but
is far more likely to produce accurate CPU usage data than the existing designs
and will not show tasks as consuming no CPU usage when they actually are. Thus,
the amount of CPU reported as being used by BFS will more accurately represent
how much CPU the task itself is using (as is shown for example by the 'time'
application), so the reported values may be quite different to other schedulers.
Values reported as the 'load' are more prone to problems with this design, but
per process values are closer to real usage. When comparing throughput of BFS
to other designs, it is important to compare the actual completed work in terms
of total wall clock time taken and total work done, rather than the reported
"cpu usage".


Con Kolivas <kernel@kolivas.org> Tue, 5 Apr 2011

--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -33,6 +33,7 @@
 - domainname
 - hostname
 - hotplug
+- iso_cpu
 - kptr_restrict
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
@@ -60,6 +61,7 @@
 - randomize_va_space
 - real-root-dev               ==> Documentation/initrd.txt
 - reboot-cmd                  [ SPARC only ]
+- rr_interval
 - rtsig-max
 - rtsig-nr
 - sem
@@ -306,6 +308,16 @@
 
 ==============================================================
 
+iso_cpu: (BFS CPU scheduler only).
+
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling five
+seconds over the -whole- system, meaning all cpus.
+
+Set to 70 (percent) by default.
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
@@ -538,6 +550,20 @@
 
 ==============================================================
 
+rr_interval: (BFS CPU scheduler only)
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. Conversely decreasing it will decrease average and maximum
+latencies but at the expense of throughput. This value is in
+milliseconds and the default value chosen depends on the number of
+cpus available at scheduler initialisation with a minimum of 6.
+
+Valid values are from 1-1000.
+
+==============================================================
+
 rtsig-max & rtsig-nr:
 
 The file rtsig-max can be used to tune the maximum number
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -339,7 +339,7 @@
 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 {
 	return sprintf(buffer, "%llu %llu %lu\n",
-			(unsigned long long)task->se.sum_exec_runtime,
+			(unsigned long long)tsk_seruntime(task),
 			(unsigned long long)task->sched_info.run_delay,
 			task->sched_info.pcount);
 }
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -152,12 +152,70 @@
 # define INIT_VTIME(tsk)
 #endif
 
-#define INIT_TASK_COMM "swapper"
-
 /*
  * INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
  */
+#ifdef CONFIG_SCHED_BFS
+#define INIT_TASK_COMM "BFS"
+#define INIT_TASK(tsk)	\
+{								\
+	.state		= 0,					\
+	.stack		= &init_thread_info,			\
+	.usage		= ATOMIC_INIT(2),			\
+	.flags		= PF_KTHREAD,				\
+	.prio		= NORMAL_PRIO,				\
+	.static_prio	= MAX_PRIO-20,				\
+	.normal_prio	= NORMAL_PRIO,				\
+	.deadline	= 0,					\
+	.policy		= SCHED_NORMAL,				\
+	.cpus_allowed	= CPU_MASK_ALL,				\
+	.mm		= NULL,					\
+	.active_mm	= &init_mm,				\
+	.run_list	= LIST_HEAD_INIT(tsk.run_list),		\
+	.time_slice	= HZ,					\
+	.tasks		= LIST_HEAD_INIT(tsk.tasks),		\
+	INIT_PUSHABLE_TASKS(tsk)				\
+	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),		\
+	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),	\
+	.real_parent	= &tsk,					\
+	.parent		= &tsk,					\
+	.children	= LIST_HEAD_INIT(tsk.children),		\
+	.sibling	= LIST_HEAD_INIT(tsk.sibling),		\
+	.group_leader	= &tsk,					\
+	RCU_POINTER_INITIALIZER(real_cred, &init_cred),		\
+	RCU_POINTER_INITIALIZER(cred, &init_cred),		\
+	.comm		= INIT_TASK_COMM,			\
+	.thread		= INIT_THREAD,				\
+	.fs		= &init_fs,				\
+	.files		= &init_files,				\
+	.signal		= &init_signals,			\
+	.sighand	= &init_sighand,			\
+	.nsproxy	= &init_nsproxy,			\
+	.pending	= {					\
+		.list = LIST_HEAD_INIT(tsk.pending.list),	\
+		.signal = {{0}}},				\
+	.blocked	= {{0}},				\
+	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),	\
+	.journal_info	= NULL,					\
+	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),	\
+	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),	\
+	.timer_slack_ns = 50000, /* 50 usec default slack */	\
+	.pids = {						\
+		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),	\
+		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),	\
+		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),	\
+	},							\
+	INIT_IDS						\
+	INIT_PERF_EVENTS(tsk)					\
+	INIT_TRACE_IRQFLAGS					\
+	INIT_LOCKDEP						\
+	INIT_FTRACE_GRAPH					\
+	INIT_TRACE_RECURSION					\
+	INIT_TASK_RCU_PREEMPT(tsk)				\
+}
+#else /* CONFIG_SCHED_BFS */
+#define INIT_TASK_COMM "swapper"
 #define INIT_TASK(tsk)	\
 {								\
 	.state		= 0,					\
@@ -223,7 +281,7 @@
 	INIT_CPUSET_SEQ						\
 	INIT_VTIME(tsk)						\
 }
-
+#endif /* CONFIG_SCHED_BFS */
 
 #define INIT_CPU_TIMERS(cpu_timers)				\
 {								\
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -52,6 +52,8 @@
  */
 static inline int task_nice_ioprio(struct task_struct *task)
 {
+	if (iso_task(task))
+		return 0;
 	return (task_nice(task) + 20) / 5;
 }
 
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -229,8 +229,6 @@
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
-extern int runqueue_is_locked(int cpu);
-
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
@@ -1040,18 +1038,35 @@
 
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
-	int on_cpu;
 #endif
-	int on_rq;
+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BFS)
+	bool on_cpu;
+#endif
+#ifndef CONFIG_SCHED_BFS
+	bool on_rq;
+#endif
 
 	int prio, static_prio, normal_prio;
 	unsigned int rt_priority;
+#ifdef CONFIG_SCHED_BFS
+	int time_slice;
+	u64 deadline;
+	struct list_head run_list;
+	u64 last_ran;
+	u64 sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_SMP
+	bool sticky; /* Soft affined flag */
+#endif
+	unsigned long rt_timeout;
+#else /* CONFIG_SCHED_BFS */
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
 
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
 #endif
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* list of struct preempt_notifier: */
@@ -1162,6 +1177,9 @@
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	cputime_t utime, stime, utimescaled, stimescaled;
+#ifdef CONFIG_SCHED_BFS
+	unsigned long utime_pc, stime_pc;
+#endif
 	cputime_t gtime;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	struct cputime prev_cputime;
@@ -1418,6 +1436,64 @@
 #endif
 };
 
+#ifdef CONFIG_SCHED_BFS
+bool grunqueue_is_locked(void);
+void grq_unlock_wait(void);
+void cpu_scaling(int cpu);
+void cpu_nonscaling(int cpu);
+bool above_background_load(void);
+#define tsk_seruntime(t)	((t)->sched_time)
+#define tsk_rttimeout(t)	((t)->rt_timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+}
+
+static inline int runqueue_is_locked(int cpu)
+{
+	return grunqueue_is_locked();
+}
+
+void print_scheduler_version(void);
+
+static inline bool iso_task(struct task_struct *p)
+{
+	return (p->policy == SCHED_ISO);
+}
+#else /* CFS */
+extern int runqueue_is_locked(int cpu);
+static inline void cpu_scaling(int cpu)
+{
+}
+
+static inline void cpu_nonscaling(int cpu)
+{
+}
+#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)
+#define tsk_rttimeout(t)	((t)->rt.timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+	p->nr_cpus_allowed = current->nr_cpus_allowed;
+}
+
+static inline void print_scheduler_version(void)
+{
+	printk(KERN_INFO"CFS CPU scheduler.\n");
+}
+
+static inline bool iso_task(struct task_struct *p)
+{
+	return false;
+}
+
+/* Anyone feel like implementing this? */
+static inline bool above_background_load(void)
+{
+	return false;
+}
+#endif /* CONFIG_SCHED_BFS */
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
@@ -1844,7 +1920,7 @@
 task_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BFS)
 extern void sched_exec(void);
 #else
 #define sched_exec()   {}
Link Here
|
2549 |
return 0; |
2625 |
return 0; |
2550 |
} |
2626 |
} |
2551 |
|
2627 |
|
2552 |
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) |
2628 |
static inline void set_task_cpu(struct task_struct *p, int cpu) |
2553 |
{ |
2629 |
{ |
2554 |
} |
2630 |
} |
2555 |
|
2631 |
|
2556 |
-- a/init/Kconfig |
2632 |
++ b/init/Kconfig |
Lines 28-33
Link Here
|
28 |
|
28 |
|
29 |
menu "General setup" |
29 |
menu "General setup" |
30 |
|
30 |
|
|
|
31 |
config SCHED_BFS |
32 |
bool "BFS cpu scheduler" |
33 |
---help--- |
34 |
The Brain Fuck CPU Scheduler for excellent interactivity and |
35 |
responsiveness on the desktop and solid scalability on normal |
36 |
hardware and commodity servers. Not recommended for 4096 CPUs. |
37 |
|
38 |
Currently incompatible with the Group CPU scheduler, and RCU TORTURE |
39 |
TEST so these options are disabled. |
40 |
|
41 |
Say Y here. |
42 |
default y |
43 |
|
44 |
|
31 |
config BROKEN |
45 |
config BROKEN |
32 |
bool |
46 |
bool |
33 |
|
47 |
|
Lines 302-308
Link Here
|
302 |
# Kind of a stub config for the pure tick based cputime accounting |
316 |
# Kind of a stub config for the pure tick based cputime accounting |
303 |
config TICK_CPU_ACCOUNTING |
317 |
config TICK_CPU_ACCOUNTING |
304 |
bool "Simple tick based cputime accounting" |
318 |
bool "Simple tick based cputime accounting" |
305 |
depends on !S390 && !NO_HZ_FULL |
319 |
depends on !S390 && !NO_HZ_FULL && !SCHED_BFS |
306 |
help |
320 |
help |
307 |
This is the basic tick based cputime accounting that maintains |
321 |
This is the basic tick based cputime accounting that maintains |
308 |
statistics about user, system and idle time spent on per jiffies |
322 |
statistics about user, system and idle time spent on per jiffies |
Lines 325-331
Link Here
|
325 |
|
339 |
|
326 |
config VIRT_CPU_ACCOUNTING_GEN |
340 |
config VIRT_CPU_ACCOUNTING_GEN |
327 |
bool "Full dynticks CPU time accounting" |
341 |
bool "Full dynticks CPU time accounting" |
328 |
depends on HAVE_CONTEXT_TRACKING && 64BIT |
342 |
depends on HAVE_CONTEXT_TRACKING && 64BIT && !SCHED_BFS |
329 |
select VIRT_CPU_ACCOUNTING |
343 |
select VIRT_CPU_ACCOUNTING |
330 |
select CONTEXT_TRACKING |
344 |
select CONTEXT_TRACKING |
331 |
help |
345 |
help |
Lines 488-494
Link Here
|
488 |
|
502 |
|
489 |
config RCU_USER_QS |
503 |
config RCU_USER_QS |
490 |
bool "Consider userspace as in RCU extended quiescent state" |
504 |
bool "Consider userspace as in RCU extended quiescent state" |
491 |
depends on HAVE_CONTEXT_TRACKING && SMP |
505 |
depends on HAVE_CONTEXT_TRACKING && SMP && !SCHED_BFS |
492 |
select CONTEXT_TRACKING |
506 |
select CONTEXT_TRACKING |
493 |
help |
507 |
help |
494 |
This option sets hooks on kernel / userspace boundaries and |
508 |
This option sets hooks on kernel / userspace boundaries and |
Lines 657-663
Link Here
|
657 |
|
671 |
|
658 |
config RCU_NOCB_CPU |
672 |
config RCU_NOCB_CPU |
659 |
bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL" |
673 |
bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL" |
660 |
depends on TREE_RCU || TREE_PREEMPT_RCU |
674 |
depends on (TREE_RCU || TREE_PREEMPT_RCU) && !SCHED_BFS |
661 |
default n |
675 |
default n |
662 |
help |
676 |
help |
663 |
Use this option to reduce OS jitter for aggressive HPC or |
677 |
Use this option to reduce OS jitter for aggressive HPC or |
Lines 795-800
Link Here
|
795 |
depends on ARCH_SUPPORTS_NUMA_BALANCING |
809 |
depends on ARCH_SUPPORTS_NUMA_BALANCING |
796 |
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY |
810 |
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY |
797 |
depends on SMP && NUMA && MIGRATION |
811 |
depends on SMP && NUMA && MIGRATION |
|
|
812 |
depends on !SCHED_BFS |
798 |
help |
813 |
help |
799 |
This option adds support for automatic NUMA aware memory/task placement. |
814 |
This option adds support for automatic NUMA aware memory/task placement. |
800 |
The mechanism is quite primitive and is based on migrating memory when |
815 |
The mechanism is quite primitive and is based on migrating memory when |
Lines 857-862
Link Here
|
857 |
|
872 |
|
858 |
config CGROUP_CPUACCT |
873 |
config CGROUP_CPUACCT |
859 |
bool "Simple CPU accounting cgroup subsystem" |
874 |
bool "Simple CPU accounting cgroup subsystem" |
|
|
875 |
depends on !SCHED_BFS |
860 |
help |
876 |
help |
861 |
Provides a simple Resource Controller for monitoring the |
877 |
Provides a simple Resource Controller for monitoring the |
862 |
total CPU consumed by the tasks in a cgroup. |
878 |
total CPU consumed by the tasks in a cgroup. |
Lines 959-964
Link Here
|
959 |
|
975 |
|
960 |
menuconfig CGROUP_SCHED |
976 |
menuconfig CGROUP_SCHED |
961 |
bool "Group CPU scheduler" |
977 |
bool "Group CPU scheduler" |
|
|
978 |
depends on !SCHED_BFS |
962 |
default n |
979 |
default n |
963 |
help |
980 |
help |
964 |
This feature lets CPU scheduler recognize task groups and control CPU |
981 |
This feature lets CPU scheduler recognize task groups and control CPU |
Lines 1123-1128
Link Here
|
1123 |
|
1140 |
|
1124 |
config SCHED_AUTOGROUP |
1141 |
config SCHED_AUTOGROUP |
1125 |
bool "Automatic process group scheduling" |
1142 |
bool "Automatic process group scheduling" |
|
|
1143 |
depends on !SCHED_BFS |
1126 |
select EVENTFD |
1144 |
select EVENTFD |
1127 |
select CGROUPS |
1145 |
select CGROUPS |
1128 |
select CGROUP_SCHED |
1146 |
select CGROUP_SCHED |
@@ -1526,38 +1544,8 @@
 
 	  On non-ancient distros (post-2000 ones) N is usually a safe choice.
 
-choice
-	prompt "Choose SLAB allocator"
-	default SLUB
-	help
-	   This option allows to select a slab allocator.
-
-config SLAB
-	bool "SLAB"
-	help
-	  The regular slab allocator that is established and known to work
-	  well in all environments. It organizes cache hot objects in
-	  per cpu and per node queues.
-
 config SLUB
-	bool "SLUB (Unqueued Allocator)"
-	help
-	   SLUB is a slab allocator that minimizes cache line usage
-	   instead of managing queues of cached objects (SLAB approach).
-	   Per cpu caching is realized using slabs of objects instead
-	   of queues of objects. SLUB can use memory efficiently
-	   and has enhanced diagnostics. SLUB is the default choice for
-	   a slab allocator.
-
-config SLOB
-	depends on EXPERT
-	bool "SLOB (Simple Allocator)"
-	help
-	   SLOB replaces the stock allocator with a drastically simpler
-	   allocator. SLOB is generally more space efficient but
-	   does not perform as well on large systems.
-
-endchoice
+	def_bool y
 
 config MMAP_ALLOW_UNINITIALIZED
 	bool "Allow mmapped anonymous memory to be uninitialized"
--- a/init/main.c
+++ b/init/main.c
@@ -700,7 +700,6 @@
 	return ret;
 }
 
-
 extern initcall_t __initcall_start[];
 extern initcall_t __initcall0_start[];
 extern initcall_t __initcall1_start[];
@@ -820,6 +819,8 @@
 
 	flush_delayed_fput();
 
+	print_scheduler_version();
+
 	if (ramdisk_execute_command) {
 		if (!run_init_process(ramdisk_execute_command))
 			return 0;
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -133,7 +133,7 @@
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->se.sum_exec_runtime;
+	t3 = tsk_seruntime(tsk);
 
 	d->cpu_count += t1;
 
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -135,7 +135,7 @@
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
-		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+		sig->sum_sched_runtime += tsk_seruntime(tsk);
 	}
 
 	sig->nr_threads--;
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -498,11 +498,11 @@
 {
 	cputime_t utime, stime;
 
-	add_device_randomness((const void *) &tsk->se.sum_exec_runtime,
+	add_device_randomness((const void *) &tsk_seruntime(tsk),
 						sizeof(unsigned long long));
 	task_cputime(tsk, &utime, &stime);
 	cleanup_timers(tsk->cpu_timers,
-		       utime, stime, tsk->se.sum_exec_runtime);
+		       utime, stime, tsk_seruntime(tsk));
 
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -513,7 +513,7 @@
 	task_cputime(tsk, &utime, &stime);
 	cleanup_timers(tsk->signal->cpu_timers,
 		       utime + sig->utime, stime + sig->stime,
-		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
+		       tsk_seruntime(tsk) + sig->sum_sched_runtime);
 }
 
 static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -976,7 +976,7 @@
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
+		if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) {
 			tsk->cputime_expires.sched_exp = t->expires.sched;
 			break;
 		}
@@ -993,7 +993,7 @@
 			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
-		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+		    tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the hard limit, we just die.
 			 * No need to calculate anything else now.
@@ -1001,7 +1001,7 @@
 			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
 			return;
 		}
-		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+		if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the soft limit, send a SIGXCPU every second.
 			 */
@@ -1282,7 +1282,7 @@
 		struct task_cputime task_sample = {
 			.utime = utime,
 			.stime = stime,
-			.sum_exec_runtime = tsk->se.sum_exec_runtime
+			.sum_exec_runtime = tsk_seruntime(tsk)
 		};
 
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -128,7 +128,12 @@
 static int __maybe_unused two = 2;
 static int __maybe_unused three = 3;
 static unsigned long one_ul = 1;
-static int one_hundred = 100;
+static int __maybe_unused one_hundred = 100;
+#ifdef CONFIG_SCHED_BFS
+extern int rr_interval;
+extern int sched_iso_cpu;
+static int __read_mostly one_thousand = 1000;
+#endif
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -256,7 +261,7 @@
 	{ }
 };
 
-#ifdef CONFIG_SCHED_DEBUG
+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
 static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
@@ -273,6 +278,7 @@
 #endif
 
 static struct ctl_table kern_table[] = {
+#ifndef CONFIG_SCHED_BFS
 	{
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
@@ -436,6 +442,7 @@
 		.extra1		= &one,
 	},
 #endif
+#endif /* !CONFIG_SCHED_BFS */
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
@@ -907,6 +914,26 @@
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_SCHED_BFS
+	{
+		.procname	= "rr_interval",
+		.data		= &rr_interval,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one_thousand,
+	},
+	{
+		.procname	= "iso_cpu",
+		.data		= &sched_iso_cpu,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
 		.procname	= "spin_retry",
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -940,7 +940,7 @@
 
 config RCU_TORTURE_TEST
 	tristate "torture tests for RCU"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && !SCHED_BFS
 	default n
 	help
 	  This option provides a kernel module that runs torture tests
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -159,7 +159,7 @@
  * Have the 32 bit jiffies value wrap 5 minutes after boot
  * so jiffies wrap bugs show up earlier.
  */
-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ))
 
 /*
  * Change timeval to jiffies, trying to avoid the
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -30,6 +30,7 @@
 #include <linux/cpu.h>
 #include <linux/completion.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
 #include <linux/syscore_ops.h>
 
 #include <trace/events/power.h>
@@ -1474,6 +1475,12 @@
 
 	if (cpufreq_driver->target)
 		retval = cpufreq_driver->target(policy, target_freq, relation);
+	if (likely(retval != -EINVAL)) {
+		if (target_freq == policy->max)
+			cpu_nonscaling(policy->cpu);
+		else
+			cpu_scaling(policy->cpu);
+	}
 
 	return retval;
 }
-- a/drivers/cpufreq/cpufreq_ondemand.c |
1487 |
++ b/drivers/cpufreq/cpufreq_ondemand.c |
Lines 29-36
Link Here
|
29 |
#include "cpufreq_governor.h" |
29 |
#include "cpufreq_governor.h" |
30 |
|
30 |
|
31 |
/* On-demand governor macros */ |
31 |
/* On-demand governor macros */ |
32 |
#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) |
32 |
#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (26) |
33 |
#define DEF_FREQUENCY_UP_THRESHOLD (80) |
33 |
#define DEF_FREQUENCY_UP_THRESHOLD (63) |
34 |
#define DEF_SAMPLING_DOWN_FACTOR (1) |
34 |
#define DEF_SAMPLING_DOWN_FACTOR (1) |
35 |
#define MAX_SAMPLING_DOWN_FACTOR (100000) |
35 |
#define MAX_SAMPLING_DOWN_FACTOR (100000) |
36 |
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) |
36 |
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) |
Lines 160-169
}

/*
- * Every sampling_rate, we check if current idle time is less than 20%
+ * Every sampling_rate, we check if current idle time is less than 37%
* (default), then we try to increase frequency. Every sampling_rate, we look
* for the lowest frequency which can sustain the load while keeping idle time
- * over 30%. If such a frequency exists, we try to decrease to this frequency.
+ * over 63%. If such a frequency exists, we try to decrease to this frequency.
*
* Any frequency increase takes it to the maximum frequency. Frequency reduction
* happens at minimum steps of 5% (default) of current frequency
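(Illustrative example at the new defaults: a CPU under 70% load, i.e. 30%
idle and below the 37% trigger, jumps straight to its maximum frequency; it
steps back down only once a lower frequency could carry the load while
keeping idle time above 63%.)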
-- /dev/null
++ b/kernel/sched/bfs.c
Line 0
/*
* kernel/sched/bfs.c, was kernel/sched.c
*
* Kernel scheduler and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
*
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
* 1998-11-19 Implemented schedule_timeout() and related stuff
* by Andrea Arcangeli
* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
* hybrid priority-list and round-robin design with
* an array-switch method of distributing timeslices
* and per-CPU runqueues. Cleanups and useful suggestions
* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
* 2007-04-15 Work begun on replacing all interactivity tuning with a
* fair scheduling design by Con Kolivas.
* 2007-05-05 Load balancing (smp-nice) and other improvements
* by Peter Williams
* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
* Thomas Gleixner, Mike Kravetz
* now Brainfuck deadline scheduling policy by Con Kolivas deletes
* a whole lot of those previous things.
*/

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/log2.h>
#include <linux/bootmem.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/unistd.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "cpupri.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

#include "bfs_sched.h"

#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
#define rt_task(p) rt_prio((p)->prio)
#define rt_queue(rq) rt_prio((rq)->rq_prio)
#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
(policy) == SCHED_RR)
#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO)
#define iso_task(p) unlikely((p)->policy == SCHED_ISO)
#define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO)
#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO)

#define ISO_PERIOD ((5 * HZ * grq.noc) + 1)
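/*
* Illustrative: with HZ == 250 and 4 online CPUs, ISO_PERIOD works out to
* 5001 ticks, i.e. the window over which SCHED_ISO cpu use is accounted
* scales with the number of online CPUs (grq.noc).
*/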

/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
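/*
* Worked example (illustrative only): with MAX_RT_PRIO == 100, nice 0 maps
* to static priority 120 (NICE_TO_PRIO(0) == 100 + 0 + 20), nice -20 maps
* to 100, and PRIO_TO_NICE(120) == 0 round-trips as expected.
*/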

/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p) - MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO)
#define STOP_PRIO (MAX_RT_PRIO - 1)

/*
* Some helpers for converting to/from various scales. Use shifts to get
* approximate multiples of ten for less overhead.
*/
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
#define JIFFY_NS (1000000000 / HZ)
#define HALF_JIFFY_NS (1000000000 / HZ / 2)
#define HALF_JIFFY_US (1000000 / HZ / 2)
#define MS_TO_NS(TIME) ((TIME) << 20)
#define MS_TO_US(TIME) ((TIME) << 10)
#define NS_TO_MS(TIME) ((TIME) >> 20)
#define NS_TO_US(TIME) ((TIME) >> 10)
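/*
* Illustrative note: these shifts are power-of-two approximations, not
* exact conversions. MS_TO_NS(6) == 6 << 20 == 6291456 rather than
* 6000000, i.e. results come out roughly 5% high, which is fine since
* only approximate multiples of ten are wanted here.
*/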

#define RESCHED_US (100) /* Reschedule if less than this many μs left */

void print_scheduler_version(void)
{
printk(KERN_INFO "BFS CPU scheduler v0.442 by Con Kolivas.\n");
}

/*
* This is the time all tasks within the same priority round robin.
* Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
* Tunable via /proc interface.
*/
int rr_interval __read_mostly = 6;

/*
* sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
* are allowed to run, over a rolling five second period, as real time tasks.
* This is the total over all online cpus.
*/
int sched_iso_cpu __read_mostly = 70;

/*
* The relative length of deadline for each priority (nice) level.
*/
static int prio_ratios[PRIO_RANGE] __read_mostly;

/*
* The quota handed out to tasks of all priority levels when refilling their
* time_slice.
*/
static inline int timeslice(void)
{
return MS_TO_US(rr_interval);
}
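/*
* Example (assuming the default rr_interval of 6): timeslice() returns
* MS_TO_US(6) == 6 << 10 == 6144us, so every refill grants a little over
* 6ms of quota regardless of nice level.
*/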

/*
* The global runqueue data that all CPUs work off. Data is protected either
* by the global grq lock, or the discrete lock that precedes the data in this
* struct.
*/
struct global_rq {
raw_spinlock_t lock;
unsigned long nr_running;
unsigned long nr_uninterruptible;
unsigned long long nr_switches;
struct list_head queue[PRIO_LIMIT];
DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
#ifdef CONFIG_SMP
unsigned long qnr; /* queued not running */
cpumask_t cpu_idle_map;
bool idle_cpus;
#endif
int noc; /* num_online_cpus stored and updated when it changes */
u64 niffies; /* Nanosecond jiffies */
unsigned long last_jiffy; /* Last jiffy we updated niffies */

raw_spinlock_t iso_lock;
int iso_ticks;
bool iso_refractory;
};
|
199 |
#ifdef CONFIG_SMP |
200 |
|
201 |
/* |
202 |
* We add the notion of a root-domain which will be used to define per-domain |
203 |
* variables. Each exclusive cpuset essentially defines an island domain by |
204 |
* fully partitioning the member cpus from any other cpuset. Whenever a new |
205 |
* exclusive cpuset is created, we also create and attach a new root-domain |
206 |
* object. |
207 |
* |
208 |
*/ |
209 |
struct root_domain { |
210 |
atomic_t refcount; |
211 |
atomic_t rto_count; |
212 |
struct rcu_head rcu; |
213 |
cpumask_var_t span; |
214 |
cpumask_var_t online; |
215 |
|
216 |
/* |
217 |
* The "RT overload" flag: it gets set if a CPU has more than |
218 |
* one runnable RT task. |
219 |
*/ |
220 |
cpumask_var_t rto_mask; |
221 |
struct cpupri cpupri; |
222 |
}; |
223 |
|
224 |
/* |
225 |
* By default the system creates a single root-domain with all cpus as |
226 |
* members (mimicking the global state we have today). |
227 |
*/ |
228 |
static struct root_domain def_root_domain; |
229 |
|
230 |
#endif /* CONFIG_SMP */ |
231 |
|
232 |
/* There can be only one */ |
233 |
static struct global_rq grq; |
234 |
|
235 |
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
236 |
static DEFINE_MUTEX(sched_hotcpu_mutex); |
237 |
|
238 |
#ifdef CONFIG_SMP |
239 |
struct rq *cpu_rq(int cpu) |
240 |
{ |
241 |
return &per_cpu(runqueues, (cpu)); |
242 |
} |
243 |
#define this_rq() (&__get_cpu_var(runqueues)) |
244 |
#define task_rq(p) cpu_rq(task_cpu(p)) |
245 |
#define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
246 |
/* |
247 |
* sched_domains_mutex serialises calls to init_sched_domains, |
248 |
* detach_destroy_domains and partition_sched_domains. |
249 |
*/ |
250 |
static DEFINE_MUTEX(sched_domains_mutex); |
251 |
|
252 |
/* |
253 |
* By default the system creates a single root-domain with all cpus as |
254 |
* members (mimicking the global state we have today). |
255 |
*/ |
256 |
static struct root_domain def_root_domain; |
257 |
|
258 |
int __weak arch_sd_sibling_asym_packing(void) |
259 |
{ |
260 |
return 0*SD_ASYM_PACKING; |
261 |
} |
262 |
#endif /* CONFIG_SMP */ |
263 |
|
264 |
static inline void update_rq_clock(struct rq *rq); |
265 |
|
266 |
/* |
267 |
* Sanity check should sched_clock return bogus values. We make sure it does |
268 |
* not appear to go backwards, and use jiffies to determine the maximum and |
269 |
* minimum it could possibly have increased, and round down to the nearest |
270 |
* jiffy when it falls outside this. |
271 |
*/ |
272 |
static inline void niffy_diff(s64 *niff_diff, int jiff_diff) |
273 |
{ |
274 |
unsigned long min_diff, max_diff; |
275 |
|
276 |
if (jiff_diff > 1) |
277 |
min_diff = JIFFIES_TO_NS(jiff_diff - 1); |
278 |
else |
279 |
min_diff = 1; |
280 |
/* Round up to the nearest tick for maximum */ |
281 |
max_diff = JIFFIES_TO_NS(jiff_diff + 1); |
282 |
|
283 |
if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff)) |
284 |
*niff_diff = min_diff; |
285 |
} |
286 |
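/*
* Illustrative example: at HZ == 250 a jiffy is 4ms. If two jiffies
* elapsed but a bogus sched_clock produced a 500ms niff_diff, the value
* falls outside [JIFFIES_TO_NS(1), JIFFIES_TO_NS(3)] and is clamped to
* the 4ms minimum, keeping niffies monotonic and close to wall time.
*/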
|
287 |
#ifdef CONFIG_SMP |
288 |
static inline int cpu_of(struct rq *rq) |
289 |
{ |
290 |
return rq->cpu; |
291 |
} |
292 |
|
293 |
/* |
294 |
* Niffies are a globally increasing nanosecond counter. Whenever a runqueue |
295 |
* clock is updated with the grq.lock held, it is an opportunity to update the |
296 |
* niffies value. Any CPU can update it by adding how much its clock has |
297 |
* increased since it last updated niffies, minus any added niffies by other |
298 |
* CPUs. |
299 |
*/ |
300 |
static inline void update_clocks(struct rq *rq) |
301 |
{ |
302 |
s64 ndiff; |
303 |
long jdiff; |
304 |
|
305 |
update_rq_clock(rq); |
306 |
ndiff = rq->clock - rq->old_clock; |
307 |
/* old_clock is only updated when we are updating niffies */ |
308 |
rq->old_clock = rq->clock; |
309 |
ndiff -= grq.niffies - rq->last_niffy; |
310 |
jdiff = jiffies - grq.last_jiffy; |
311 |
niffy_diff(&ndiff, jdiff); |
312 |
grq.last_jiffy += jdiff; |
313 |
grq.niffies += ndiff; |
314 |
rq->last_niffy = grq.niffies; |
315 |
} |
316 |
#else /* CONFIG_SMP */ |
317 |
static struct rq *uprq; |
318 |
#define cpu_rq(cpu) (uprq) |
319 |
#define this_rq() (uprq) |
320 |
#define task_rq(p) (uprq) |
321 |
#define cpu_curr(cpu) ((uprq)->curr) |
322 |
static inline int cpu_of(struct rq *rq) |
323 |
{ |
324 |
return 0; |
325 |
} |
326 |
|
327 |
static inline void update_clocks(struct rq *rq) |
328 |
{ |
329 |
s64 ndiff; |
330 |
long jdiff; |
331 |
|
332 |
update_rq_clock(rq); |
333 |
ndiff = rq->clock - rq->old_clock; |
334 |
rq->old_clock = rq->clock; |
335 |
jdiff = jiffies - grq.last_jiffy; |
336 |
niffy_diff(&ndiff, jdiff); |
337 |
grq.last_jiffy += jdiff; |
338 |
grq.niffies += ndiff; |
339 |
} |
340 |
#endif |
341 |
#define raw_rq() (&__raw_get_cpu_var(runqueues)) |
342 |
|
343 |
#include "stats.h" |
344 |
|
345 |
#ifndef prepare_arch_switch |
346 |
# define prepare_arch_switch(next) do { } while (0) |
347 |
#endif |
348 |
#ifndef finish_arch_switch |
349 |
# define finish_arch_switch(prev) do { } while (0) |
350 |
#endif |
351 |
#ifndef finish_arch_post_lock_switch |
352 |
# define finish_arch_post_lock_switch() do { } while (0) |
353 |
#endif |
354 |
|
355 |
/* |
356 |
* All common locking functions performed on grq.lock. rq->clock is local to |
357 |
* the CPU accessing it so it can be modified just with interrupts disabled |
358 |
* when we're not updating niffies. |
359 |
* Looking up task_rq must be done under grq.lock to be safe. |
360 |
*/ |
361 |
static void update_rq_clock_task(struct rq *rq, s64 delta); |
362 |
|
363 |
static inline void update_rq_clock(struct rq *rq) |
364 |
{ |
365 |
s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
366 |
|
367 |
rq->clock += delta; |
368 |
update_rq_clock_task(rq, delta); |
369 |
} |
370 |
|
371 |
static inline bool task_running(struct task_struct *p) |
372 |
{ |
373 |
return p->on_cpu; |
374 |
} |
375 |
|
376 |
static inline void grq_lock(void) |
377 |
__acquires(grq.lock) |
378 |
{ |
379 |
raw_spin_lock(&grq.lock); |
380 |
} |
381 |
|
382 |
static inline void grq_unlock(void) |
383 |
__releases(grq.lock) |
384 |
{ |
385 |
raw_spin_unlock(&grq.lock); |
386 |
} |
387 |
|
388 |
static inline void grq_lock_irq(void) |
389 |
__acquires(grq.lock) |
390 |
{ |
391 |
raw_spin_lock_irq(&grq.lock); |
392 |
} |
393 |
|
394 |
static inline void time_lock_grq(struct rq *rq) |
395 |
__acquires(grq.lock) |
396 |
{ |
397 |
grq_lock(); |
398 |
update_clocks(rq); |
399 |
} |
400 |
|
401 |
static inline void grq_unlock_irq(void) |
402 |
__releases(grq.lock) |
403 |
{ |
404 |
raw_spin_unlock_irq(&grq.lock); |
405 |
} |
406 |
|
407 |
static inline void grq_lock_irqsave(unsigned long *flags) |
408 |
__acquires(grq.lock) |
409 |
{ |
410 |
raw_spin_lock_irqsave(&grq.lock, *flags); |
411 |
} |
412 |
|
413 |
static inline void grq_unlock_irqrestore(unsigned long *flags) |
414 |
__releases(grq.lock) |
415 |
{ |
416 |
raw_spin_unlock_irqrestore(&grq.lock, *flags); |
417 |
} |
418 |
|
419 |
static inline struct rq |
420 |
*task_grq_lock(struct task_struct *p, unsigned long *flags) |
421 |
__acquires(grq.lock) |
422 |
{ |
423 |
grq_lock_irqsave(flags); |
424 |
return task_rq(p); |
425 |
} |
426 |
|
427 |
static inline struct rq |
428 |
*time_task_grq_lock(struct task_struct *p, unsigned long *flags) |
429 |
__acquires(grq.lock) |
430 |
{ |
431 |
struct rq *rq = task_grq_lock(p, flags); |
432 |
update_clocks(rq); |
433 |
return rq; |
434 |
} |
435 |
|
436 |
static inline struct rq *task_grq_lock_irq(struct task_struct *p) |
437 |
__acquires(grq.lock) |
438 |
{ |
439 |
grq_lock_irq(); |
440 |
return task_rq(p); |
441 |
} |
442 |
|
443 |
static inline void time_task_grq_lock_irq(struct task_struct *p) |
444 |
__acquires(grq.lock) |
445 |
{ |
446 |
struct rq *rq = task_grq_lock_irq(p); |
447 |
update_clocks(rq); |
448 |
} |
449 |
|
450 |
static inline void task_grq_unlock_irq(void) |
451 |
__releases(grq.lock) |
452 |
{ |
453 |
grq_unlock_irq(); |
454 |
} |
455 |
|
456 |
static inline void task_grq_unlock(unsigned long *flags) |
457 |
__releases(grq.lock) |
458 |
{ |
459 |
grq_unlock_irqrestore(flags); |
460 |
} |
461 |
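/*
* Typical usage sketch for the helpers above (illustrative only):
*
*	unsigned long flags;
*	struct rq *rq = time_task_grq_lock(p, &flags);
*	... inspect or requeue p with the rq clock freshly updated ...
*	task_grq_unlock(&flags);
*/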
|
462 |
/** |
463 |
* grunqueue_is_locked |
464 |
* |
465 |
* Returns true if the global runqueue is locked. |
466 |
* This interface allows printk to be called with the runqueue lock |
467 |
* held and know whether or not it is OK to wake up the klogd. |
468 |
*/ |
469 |
bool grunqueue_is_locked(void) |
470 |
{ |
471 |
return raw_spin_is_locked(&grq.lock); |
472 |
} |
473 |
|
474 |
void grq_unlock_wait(void) |
475 |
__releases(grq.lock) |
476 |
{ |
477 |
smp_mb(); /* spin-unlock-wait is not a full memory barrier */ |
478 |
raw_spin_unlock_wait(&grq.lock); |
479 |
} |
480 |
|
481 |
static inline void time_grq_lock(struct rq *rq, unsigned long *flags) |
482 |
__acquires(grq.lock) |
483 |
{ |
484 |
local_irq_save(*flags); |
485 |
time_lock_grq(rq); |
486 |
} |
487 |
|
488 |
static inline struct rq *__task_grq_lock(struct task_struct *p) |
489 |
__acquires(grq.lock) |
490 |
{ |
491 |
grq_lock(); |
492 |
return task_rq(p); |
493 |
} |
494 |
|
495 |
static inline void __task_grq_unlock(void) |
496 |
__releases(grq.lock) |
497 |
{ |
498 |
grq_unlock(); |
499 |
} |
500 |
|
501 |
/* |
502 |
* Look for any tasks *anywhere* that are running nice 0 or better. We do |
503 |
* this lockless for overhead reasons since the occasional wrong result |
504 |
* is harmless. |
505 |
*/ |
506 |
bool above_background_load(void) |
507 |
{ |
508 |
int cpu; |
509 |
|
510 |
for_each_online_cpu(cpu) { |
511 |
struct task_struct *cpu_curr = cpu_rq(cpu)->curr; |
512 |
|
513 |
if (unlikely(!cpu_curr)) |
514 |
continue; |
515 |
if (PRIO_TO_NICE(cpu_curr->static_prio) < 1) { |
516 |
return true; |
517 |
} |
518 |
} |
519 |
return false; |
520 |
} |
521 |
|
522 |
#ifndef __ARCH_WANT_UNLOCKED_CTXSW |
523 |
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
524 |
{ |
525 |
} |
526 |
|
527 |
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
528 |
{ |
529 |
#ifdef CONFIG_DEBUG_SPINLOCK |
530 |
/* this is a valid case when another task releases the spinlock */ |
531 |
grq.lock.owner = current; |
532 |
#endif |
533 |
/* |
534 |
* If we are tracking spinlock dependencies then we have to |
535 |
* fix up the runqueue lock - which gets 'carried over' from |
536 |
* prev into current: |
537 |
*/ |
538 |
spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); |
539 |
|
540 |
grq_unlock_irq(); |
541 |
} |
542 |
|
543 |
#else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
544 |
|
545 |
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
546 |
{ |
547 |
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
548 |
grq_unlock_irq(); |
549 |
#else |
550 |
grq_unlock(); |
551 |
#endif |
552 |
} |
553 |
|
554 |
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
555 |
{ |
556 |
smp_wmb(); |
557 |
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
558 |
local_irq_enable(); |
559 |
#endif |
560 |
} |
561 |
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
562 |
|
563 |
static inline bool deadline_before(u64 deadline, u64 time) |
564 |
{ |
565 |
return (deadline < time); |
566 |
} |
567 |
|
568 |
static inline bool deadline_after(u64 deadline, u64 time) |
569 |
{ |
570 |
return (deadline > time); |
571 |
} |
572 |
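/*
* Note (illustrative): these are plain u64 comparisons with no wraparound
* handling; since deadlines are in nanosecond niffies, 2^63 ns is roughly
* 292 years, so overflow is not a practical concern.
*/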
|
573 |
/* |
574 |
* A task that is queued but not running will be on the grq run list. |
575 |
* A task that is not running or queued will not be on the grq run list. |
576 |
* A task that is currently running will have ->on_cpu set but will not be on the
577 |
* grq run list. |
578 |
*/ |
579 |
static inline bool task_queued(struct task_struct *p) |
580 |
{ |
581 |
return (!list_empty(&p->run_list)); |
582 |
} |
583 |
|
584 |
/* |
585 |
* Removing from the global runqueue. Enter with grq locked. |
586 |
*/ |
587 |
static void dequeue_task(struct task_struct *p) |
588 |
{ |
589 |
list_del_init(&p->run_list); |
590 |
if (list_empty(grq.queue + p->prio)) |
591 |
__clear_bit(p->prio, grq.prio_bitmap); |
592 |
} |
593 |
|
594 |
/* |
595 |
* To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as |
596 |
* an idle task, we ensure none of the following conditions are met. |
597 |
*/ |
598 |
static bool idleprio_suitable(struct task_struct *p) |
599 |
{ |
600 |
return (!freezing(p) && !signal_pending(p) && |
601 |
!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); |
602 |
} |
603 |
|
604 |
/* |
605 |
* To determine if a task of SCHED_ISO can run in pseudo-realtime, we check |
606 |
* that the iso_refractory flag is not set. |
607 |
*/ |
608 |
static bool isoprio_suitable(void) |
609 |
{ |
610 |
return !grq.iso_refractory; |
611 |
} |
612 |
|
613 |
/* |
614 |
* Adding to the global runqueue. Enter with grq locked. |
615 |
*/ |
616 |
static void enqueue_task(struct task_struct *p) |
617 |
{ |
618 |
if (!rt_task(p)) { |
619 |
/* Check it hasn't gotten rt from PI */ |
620 |
if ((idleprio_task(p) && idleprio_suitable(p)) || |
621 |
(iso_task(p) && isoprio_suitable())) |
622 |
p->prio = p->normal_prio; |
623 |
else |
624 |
p->prio = NORMAL_PRIO; |
625 |
} |
626 |
__set_bit(p->prio, grq.prio_bitmap); |
627 |
list_add_tail(&p->run_list, grq.queue + p->prio); |
628 |
sched_info_queued(p); |
629 |
} |
630 |
|
631 |
/* Only the idle task does this, as a real time task. */
632 |
static inline void enqueue_task_head(struct task_struct *p) |
633 |
{ |
634 |
__set_bit(p->prio, grq.prio_bitmap); |
635 |
list_add(&p->run_list, grq.queue + p->prio); |
636 |
sched_info_queued(p); |
637 |
} |
638 |
|
639 |
static inline void requeue_task(struct task_struct *p) |
640 |
{ |
641 |
sched_info_queued(p); |
642 |
} |
643 |
|
644 |
/* |
645 |
* Returns the relative length of deadline all compared to the shortest |
646 |
* deadline which is that of nice -20. |
647 |
*/ |
648 |
static inline int task_prio_ratio(struct task_struct *p) |
649 |
{ |
650 |
return prio_ratios[TASK_USER_PRIO(p)]; |
651 |
} |
652 |
|
653 |
/* |
654 |
* task_timeslice - all tasks of all priorities get the exact same timeslice |
655 |
* length. CPU distribution is handled by giving different deadlines to |
656 |
* tasks of different priorities. Use 128 as the base value for fast shifts. |
657 |
*/ |
658 |
static inline int task_timeslice(struct task_struct *p) |
659 |
{ |
660 |
return (rr_interval * task_prio_ratio(p) / 128); |
661 |
} |
662 |
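/*
* Worked example (illustrative, assuming prio_ratios[] grows ~10% per nice
* level from a base of 128): nice -20 gives 6 * 128 / 128 == 6ms, while
* nice 0 (ratio roughly 836) gives about 6 * 836 / 128, i.e. ~39ms.
*/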
|
663 |
#ifdef CONFIG_SMP |
664 |
/* |
665 |
* qnr is the "queued but not running" count which is the total number of |
666 |
* tasks on the global runqueue list waiting for cpu time but not actually |
667 |
* currently running on a cpu. |
668 |
*/ |
669 |
static inline void inc_qnr(void) |
670 |
{ |
671 |
grq.qnr++; |
672 |
} |
673 |
|
674 |
static inline void dec_qnr(void) |
675 |
{ |
676 |
grq.qnr--; |
677 |
} |
678 |
|
679 |
static inline int queued_notrunning(void) |
680 |
{ |
681 |
return grq.qnr; |
682 |
} |
683 |
|
684 |
/* |
685 |
* The cpu_idle_map stores a bitmap of all the CPUs currently idle to |
686 |
* allow easy lookup of whether any suitable idle CPUs are available. |
687 |
* It's cheaper to maintain a binary yes/no if there are any idle CPUs on the |
688 |
* idle_cpus variable than to do a full bitmask check when we are busy. |
689 |
*/ |
690 |
static inline void set_cpuidle_map(int cpu) |
691 |
{ |
692 |
if (likely(cpu_online(cpu))) { |
693 |
cpu_set(cpu, grq.cpu_idle_map); |
694 |
grq.idle_cpus = true; |
695 |
} |
696 |
} |
697 |
|
698 |
static inline void clear_cpuidle_map(int cpu) |
699 |
{ |
700 |
cpu_clear(cpu, grq.cpu_idle_map); |
701 |
if (cpus_empty(grq.cpu_idle_map)) |
702 |
grq.idle_cpus = false; |
703 |
} |
704 |
|
705 |
static bool suitable_idle_cpus(struct task_struct *p) |
706 |
{ |
707 |
if (!grq.idle_cpus) |
708 |
return false; |
709 |
return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map)); |
710 |
} |
711 |
|
712 |
#define CPUIDLE_DIFF_THREAD (1) |
713 |
#define CPUIDLE_DIFF_CORE (2) |
714 |
#define CPUIDLE_CACHE_BUSY (4) |
715 |
#define CPUIDLE_DIFF_CPU (8) |
716 |
#define CPUIDLE_THREAD_BUSY (16) |
717 |
#define CPUIDLE_THROTTLED (32) |
718 |
#define CPUIDLE_DIFF_NODE (64) |
719 |
|
720 |
static void resched_task(struct task_struct *p); |
721 |
static inline bool scaling_rq(struct rq *rq); |
722 |
|
723 |
/* |
724 |
* The best idle CPU is chosen according to the CPUIDLE ranking above where the |
725 |
* lowest value would give the most suitable CPU to schedule p onto next. The |
726 |
* order works out to be the following: |
727 |
* |
728 |
* Same core, idle or busy cache, idle or busy threads |
729 |
* Other core, same cache, idle or busy cache, idle threads. |
730 |
* Same node, other CPU, idle cache, idle threads. |
731 |
* Same node, other CPU, busy cache, idle threads. |
732 |
* Other core, same cache, busy threads. |
733 |
* Same node, other CPU, busy threads. |
734 |
* Other node, other CPU, idle cache, idle threads. |
735 |
* Other node, other CPU, busy cache, idle threads. |
736 |
* Other node, other CPU, busy threads. |
737 |
*/ |
738 |
static void |
739 |
resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask) |
740 |
{ |
741 |
int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THROTTLED | |
742 |
CPUIDLE_THREAD_BUSY | CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | |
743 |
CPUIDLE_DIFF_CORE | CPUIDLE_DIFF_THREAD; |
744 |
int cpu_tmp; |
745 |
|
746 |
if (cpu_isset(best_cpu, *tmpmask)) |
747 |
goto out; |
748 |
|
749 |
for_each_cpu_mask(cpu_tmp, *tmpmask) { |
750 |
int ranking, locality; |
751 |
struct rq *tmp_rq; |
752 |
|
753 |
ranking = 0; |
754 |
tmp_rq = cpu_rq(cpu_tmp); |
755 |
|
756 |
locality = rq->cpu_locality[cpu_tmp]; |
757 |
#ifdef CONFIG_NUMA |
758 |
if (locality > 3) |
759 |
ranking |= CPUIDLE_DIFF_NODE; |
760 |
else |
761 |
#endif |
762 |
if (locality > 2) |
763 |
ranking |= CPUIDLE_DIFF_CPU; |
764 |
#ifdef CONFIG_SCHED_MC |
765 |
else if (locality == 2) |
766 |
ranking |= CPUIDLE_DIFF_CORE; |
767 |
if (!(tmp_rq->cache_idle(cpu_tmp))) |
768 |
ranking |= CPUIDLE_CACHE_BUSY; |
769 |
#endif |
770 |
#ifdef CONFIG_SCHED_SMT |
771 |
if (locality == 1) |
772 |
ranking |= CPUIDLE_DIFF_THREAD; |
773 |
if (!(tmp_rq->siblings_idle(cpu_tmp))) |
774 |
ranking |= CPUIDLE_THREAD_BUSY; |
775 |
#endif |
776 |
if (scaling_rq(tmp_rq)) |
777 |
ranking |= CPUIDLE_THROTTLED; |
778 |
|
779 |
if (ranking < best_ranking) { |
780 |
best_cpu = cpu_tmp; |
781 |
best_ranking = ranking; |
782 |
} |
783 |
} |
784 |
out: |
785 |
resched_task(cpu_rq(best_cpu)->curr); |
786 |
} |
787 |
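/*
* Worked ranking example (illustrative): an SMT sibling whose thread unit
* is busy scores CPUIDLE_DIFF_THREAD | CPUIDLE_THREAD_BUSY == 17, while an
* idle CPU on another NUMA node scores at least CPUIDLE_DIFF_NODE == 64,
* so the nearby sibling still wins despite sharing execution resources.
*/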
|
788 |
bool cpus_share_cache(int this_cpu, int that_cpu) |
789 |
{ |
790 |
struct rq *this_rq = cpu_rq(this_cpu); |
791 |
|
792 |
return (this_rq->cpu_locality[that_cpu] < 3); |
793 |
} |
794 |
|
795 |
static void resched_best_idle(struct task_struct *p) |
796 |
{ |
797 |
cpumask_t tmpmask; |
798 |
|
799 |
cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); |
800 |
resched_best_mask(task_cpu(p), task_rq(p), &tmpmask); |
801 |
} |
802 |
|
803 |
static inline void resched_suitable_idle(struct task_struct *p) |
804 |
{ |
805 |
if (suitable_idle_cpus(p)) |
806 |
resched_best_idle(p); |
807 |
} |
808 |
/* |
809 |
* Flags to tell us whether this CPU is running a CPU frequency governor that |
810 |
* has slowed its speed or not. No locking required as the very rare wrongly |
811 |
* read value would be harmless. |
812 |
*/ |
813 |
void cpu_scaling(int cpu) |
814 |
{ |
815 |
cpu_rq(cpu)->scaling = true; |
816 |
} |
817 |
|
818 |
void cpu_nonscaling(int cpu) |
819 |
{ |
820 |
cpu_rq(cpu)->scaling = false; |
821 |
} |
822 |
|
823 |
static inline bool scaling_rq(struct rq *rq) |
824 |
{ |
825 |
return rq->scaling; |
826 |
} |
827 |
|
828 |
static inline int locality_diff(struct task_struct *p, struct rq *rq) |
829 |
{ |
830 |
return rq->cpu_locality[task_cpu(p)]; |
831 |
} |
832 |
#else /* CONFIG_SMP */ |
833 |
static inline void inc_qnr(void) |
834 |
{ |
835 |
} |
836 |
|
837 |
static inline void dec_qnr(void) |
838 |
{ |
839 |
} |
840 |
|
841 |
static inline int queued_notrunning(void) |
842 |
{ |
843 |
return grq.nr_running; |
844 |
} |
845 |
|
846 |
static inline void set_cpuidle_map(int cpu) |
847 |
{ |
848 |
} |
849 |
|
850 |
static inline void clear_cpuidle_map(int cpu) |
851 |
{ |
852 |
} |
853 |
|
854 |
static inline bool suitable_idle_cpus(struct task_struct *p) |
855 |
{ |
856 |
return uprq->curr == uprq->idle; |
857 |
} |
858 |
|
859 |
static inline void resched_suitable_idle(struct task_struct *p) |
860 |
{ |
861 |
} |
862 |
|
863 |
void cpu_scaling(int __unused) |
864 |
{ |
865 |
} |
866 |
|
867 |
void cpu_nonscaling(int __unused) |
868 |
{ |
869 |
} |
870 |
|
871 |
/* |
872 |
* Although CPUs can scale in UP, there is nowhere else for tasks to go so this
* always returns false.
874 |
*/ |
875 |
static inline bool scaling_rq(struct rq *rq) |
876 |
{ |
877 |
return false; |
878 |
} |
879 |
|
880 |
static inline int locality_diff(struct task_struct *p, struct rq *rq) |
881 |
{ |
882 |
return 0; |
883 |
} |
884 |
#endif /* CONFIG_SMP */ |
885 |
EXPORT_SYMBOL_GPL(cpu_scaling); |
886 |
EXPORT_SYMBOL_GPL(cpu_nonscaling); |
887 |
|
888 |
/* |
889 |
* activate_idle_task - move idle task to the _front_ of runqueue. |
890 |
*/ |
891 |
static inline void activate_idle_task(struct task_struct *p) |
892 |
{ |
893 |
enqueue_task_head(p); |
894 |
grq.nr_running++; |
895 |
inc_qnr(); |
896 |
} |
897 |
|
898 |
static inline int normal_prio(struct task_struct *p) |
899 |
{ |
900 |
if (has_rt_policy(p)) |
901 |
return MAX_RT_PRIO - 1 - p->rt_priority; |
902 |
if (idleprio_task(p)) |
903 |
return IDLE_PRIO; |
904 |
if (iso_task(p)) |
905 |
return ISO_PRIO; |
906 |
return NORMAL_PRIO; |
907 |
} |
908 |
|
909 |
/* |
910 |
* Calculate the current priority, i.e. the priority |
911 |
* taken into account by the scheduler. This value might |
912 |
* be boosted by RT tasks as it will be RT if the task got |
913 |
* RT-boosted. If not then it returns p->normal_prio. |
914 |
*/ |
915 |
static int effective_prio(struct task_struct *p) |
916 |
{ |
917 |
p->normal_prio = normal_prio(p); |
918 |
/* |
919 |
* If we are RT tasks or we were boosted to RT priority, |
920 |
* keep the priority unchanged. Otherwise, update priority |
921 |
* to the normal priority: |
922 |
*/ |
923 |
if (!rt_prio(p->prio)) |
924 |
return p->normal_prio; |
925 |
return p->prio; |
926 |
} |
927 |
|
928 |
/* |
929 |
* activate_task - move a task to the runqueue. Enter with grq locked. |
930 |
*/ |
931 |
static void activate_task(struct task_struct *p, struct rq *rq) |
932 |
{ |
933 |
update_clocks(rq); |
934 |
|
935 |
/* |
936 |
* Sleep time is in units of nanosecs, so shift by 20 to get a |
937 |
* milliseconds-range estimation of the amount of time that the task |
938 |
* spent sleeping: |
939 |
*/ |
940 |
if (unlikely(prof_on == SLEEP_PROFILING)) { |
941 |
if (p->state == TASK_UNINTERRUPTIBLE) |
942 |
profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), |
943 |
(rq->clock_task - p->last_ran) >> 20); |
944 |
} |
945 |
|
946 |
p->prio = effective_prio(p); |
947 |
if (task_contributes_to_load(p)) |
948 |
grq.nr_uninterruptible--; |
949 |
enqueue_task(p); |
950 |
grq.nr_running++; |
951 |
inc_qnr(); |
952 |
} |
953 |
|
954 |
static inline void clear_sticky(struct task_struct *p); |
955 |
|
956 |
/* |
957 |
* deactivate_task - If it's running, it's not on the grq and we can just |
958 |
* decrement the nr_running. Enter with grq locked. |
959 |
*/ |
960 |
static inline void deactivate_task(struct task_struct *p) |
961 |
{ |
962 |
if (task_contributes_to_load(p)) |
963 |
grq.nr_uninterruptible++; |
964 |
grq.nr_running--; |
965 |
clear_sticky(p); |
966 |
} |
967 |
|
968 |
static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); |
969 |
|
970 |
void register_task_migration_notifier(struct notifier_block *n) |
971 |
{ |
972 |
atomic_notifier_chain_register(&task_migration_notifier, n); |
973 |
} |
974 |
|
975 |
#ifdef CONFIG_SMP |
976 |
void set_task_cpu(struct task_struct *p, unsigned int cpu) |
977 |
{ |
978 |
#ifdef CONFIG_LOCKDEP |
979 |
/* |
980 |
* The caller should hold grq lock. |
981 |
*/ |
982 |
WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); |
983 |
#endif |
984 |
trace_sched_migrate_task(p, cpu); |
985 |
if (task_cpu(p) != cpu) |
986 |
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); |
987 |
|
988 |
/* |
989 |
* After ->cpu is set up to a new value, task_grq_lock(p, ...) can be |
990 |
* successfully executed on another CPU. We must ensure that updates of |
991 |
* per-task data have been completed by this moment. |
992 |
*/ |
993 |
smp_wmb(); |
994 |
task_thread_info(p)->cpu = cpu; |
995 |
} |
996 |
|
997 |
static inline void clear_sticky(struct task_struct *p) |
998 |
{ |
999 |
p->sticky = false; |
1000 |
} |
1001 |
|
1002 |
static inline bool task_sticky(struct task_struct *p) |
1003 |
{ |
1004 |
return p->sticky; |
1005 |
} |
1006 |
|
1007 |
/* Reschedule the best idle CPU that is not this one. */ |
1008 |
static void |
1009 |
resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p) |
1010 |
{ |
1011 |
cpumask_t tmpmask; |
1012 |
|
1013 |
cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); |
1014 |
cpu_clear(cpu, tmpmask); |
1015 |
if (cpus_empty(tmpmask)) |
1016 |
return; |
1017 |
resched_best_mask(cpu, rq, &tmpmask); |
1018 |
} |
1019 |
|
1020 |
/* |
1021 |
* We set the sticky flag on a task that is descheduled involuntarily meaning |
1022 |
* it is awaiting further CPU time. If the last sticky task is still sticky |
1023 |
* but unlucky enough to not be the next task scheduled, we unstick it and try |
1024 |
* to find it an idle CPU. Realtime tasks do not stick to minimise their |
1025 |
* latency at all times. |
1026 |
*/ |
1027 |
static inline void |
1028 |
swap_sticky(struct rq *rq, int cpu, struct task_struct *p) |
1029 |
{ |
1030 |
if (rq->sticky_task) { |
1031 |
if (rq->sticky_task == p) { |
1032 |
p->sticky = true; |
1033 |
return; |
1034 |
} |
1035 |
if (task_sticky(rq->sticky_task)) { |
1036 |
clear_sticky(rq->sticky_task); |
1037 |
resched_closest_idle(rq, cpu, rq->sticky_task); |
1038 |
} |
1039 |
} |
1040 |
if (!rt_task(p)) { |
1041 |
p->sticky = true; |
1042 |
rq->sticky_task = p; |
1043 |
} else { |
1044 |
resched_closest_idle(rq, cpu, p); |
1045 |
rq->sticky_task = NULL; |
1046 |
} |
1047 |
} |
1048 |
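/*
* Scenario (illustrative): task A is involuntarily descheduled on CPU2 and
* marked sticky there. If CPU2 then picks some other task instead, A is
* unstuck and resched_closest_idle() nudges the nearest idle CPU to take
* it, trading a little cache warmth for latency.
*/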
|
1049 |
static inline void unstick_task(struct rq *rq, struct task_struct *p) |
1050 |
{ |
1051 |
rq->sticky_task = NULL; |
1052 |
clear_sticky(p); |
1053 |
} |
1054 |
#else |
1055 |
static inline void clear_sticky(struct task_struct *p) |
1056 |
{ |
1057 |
} |
1058 |
|
1059 |
static inline bool task_sticky(struct task_struct *p) |
1060 |
{ |
1061 |
return false; |
1062 |
} |
1063 |
|
1064 |
static inline void |
1065 |
swap_sticky(struct rq *rq, int cpu, struct task_struct *p) |
1066 |
{ |
1067 |
} |
1068 |
|
1069 |
static inline void unstick_task(struct rq *rq, struct task_struct *p) |
1070 |
{ |
1071 |
} |
1072 |
#endif |
1073 |
|
1074 |
/* |
1075 |
* Move a task off the global queue and take it to a cpu where it will
* become the running task.
1077 |
*/ |
1078 |
static inline void take_task(int cpu, struct task_struct *p) |
1079 |
{ |
1080 |
set_task_cpu(p, cpu); |
1081 |
dequeue_task(p); |
1082 |
clear_sticky(p); |
1083 |
dec_qnr(); |
1084 |
} |
1085 |
|
1086 |
/* |
1087 |
* Returns a descheduling task to the grq runqueue unless it is being |
1088 |
* deactivated. |
1089 |
*/ |
1090 |
static inline void return_task(struct task_struct *p, bool deactivate) |
1091 |
{ |
1092 |
if (deactivate) |
1093 |
deactivate_task(p); |
1094 |
else { |
1095 |
inc_qnr(); |
1096 |
enqueue_task(p); |
1097 |
} |
1098 |
} |
1099 |
|
1100 |
/* |
1101 |
* resched_task - mark a task 'to be rescheduled now'. |
1102 |
* |
1103 |
* On UP this means the setting of the need_resched flag, on SMP it |
1104 |
* might also involve a cross-CPU call to trigger the scheduler on |
1105 |
* the target CPU. |
1106 |
*/ |
1107 |
#ifdef CONFIG_SMP |
1108 |
|
1109 |
#ifndef tsk_is_polling |
1110 |
#define tsk_is_polling(t) 0 |
1111 |
#endif |
1112 |
|
1113 |
static void resched_task(struct task_struct *p) |
1114 |
{ |
1115 |
int cpu; |
1116 |
|
1117 |
assert_raw_spin_locked(&grq.lock); |
1118 |
|
1119 |
if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) |
1120 |
return; |
1121 |
|
1122 |
set_tsk_thread_flag(p, TIF_NEED_RESCHED); |
1123 |
|
1124 |
cpu = task_cpu(p); |
1125 |
if (cpu == smp_processor_id()) |
1126 |
return; |
1127 |
|
1128 |
/* NEED_RESCHED must be visible before we test polling */ |
1129 |
smp_mb(); |
1130 |
if (!tsk_is_polling(p)) |
1131 |
smp_send_reschedule(cpu); |
1132 |
} |
1133 |
|
1134 |
#else |
1135 |
static inline void resched_task(struct task_struct *p) |
1136 |
{ |
1137 |
assert_raw_spin_locked(&grq.lock); |
1138 |
set_tsk_need_resched(p); |
1139 |
} |
1140 |
#endif |
1141 |
|
1142 |
/** |
1143 |
* task_curr - is this task currently executing on a CPU? |
1144 |
* @p: the task in question. |
1145 |
* |
1146 |
* Return: 1 if the task is currently executing. 0 otherwise. |
1147 |
*/ |
1148 |
inline int task_curr(const struct task_struct *p) |
1149 |
{ |
1150 |
return cpu_curr(task_cpu(p)) == p; |
1151 |
} |
1152 |
|
1153 |
#ifdef CONFIG_SMP |
1154 |
struct migration_req { |
1155 |
struct task_struct *task; |
1156 |
int dest_cpu; |
1157 |
}; |
1158 |
|
1159 |
/* |
1160 |
* wait_task_inactive - wait for a thread to unschedule. |
1161 |
* |
1162 |
* If @match_state is nonzero, it's the @p->state value just checked and |
1163 |
* not expected to change. If it changes, i.e. @p might have woken up, |
1164 |
* then return zero. When we succeed in waiting for @p to be off its CPU, |
1165 |
* we return a positive number (its total switch count). If a second call |
1166 |
* a short while later returns the same number, the caller can be sure that |
1167 |
* @p has remained unscheduled the whole time. |
1168 |
* |
1169 |
* The caller must ensure that the task *will* unschedule sometime soon, |
1170 |
* else this function might spin for a *long* time. This function can't |
1171 |
* be called with interrupts off, or it may introduce deadlock with |
1172 |
* smp_call_function() if an IPI is sent by the same process we are |
1173 |
* waiting to become inactive. |
1174 |
*/ |
1175 |
unsigned long wait_task_inactive(struct task_struct *p, long match_state) |
1176 |
{ |
1177 |
unsigned long flags; |
1178 |
bool running, on_rq; |
1179 |
unsigned long ncsw; |
1180 |
struct rq *rq; |
1181 |
|
1182 |
for (;;) { |
1183 |
/* |
1184 |
* We do the initial early heuristics without holding |
1185 |
* any task-queue locks at all. We'll only try to get |
1186 |
* the runqueue lock when things look like they will |
1187 |
* work out! In the unlikely event rq is dereferenced |
1188 |
* since we're lockless, grab it again. |
1189 |
*/ |
1190 |
#ifdef CONFIG_SMP |
1191 |
retry_rq: |
1192 |
rq = task_rq(p); |
1193 |
if (unlikely(!rq)) |
1194 |
goto retry_rq; |
1195 |
#else /* CONFIG_SMP */ |
1196 |
rq = task_rq(p); |
1197 |
#endif |
1198 |
/* |
1199 |
* If the task is actively running on another CPU |
1200 |
* still, just relax and busy-wait without holding |
1201 |
* any locks. |
1202 |
* |
1203 |
* NOTE! Since we don't hold any locks, it's not |
1204 |
* even sure that "rq" stays as the right runqueue! |
1205 |
* But we don't care, since this will return false |
1206 |
* if the runqueue has changed and p is actually now |
1207 |
* running somewhere else! |
1208 |
*/ |
1209 |
while (task_running(p) && p == rq->curr) { |
1210 |
if (match_state && unlikely(p->state != match_state)) |
1211 |
return 0; |
1212 |
cpu_relax(); |
1213 |
} |
1214 |
|
1215 |
/* |
1216 |
* Ok, time to look more closely! We need the grq |
1217 |
* lock now, to be *sure*. If we're wrong, we'll |
1218 |
* just go back and repeat. |
1219 |
*/ |
1220 |
rq = task_grq_lock(p, &flags); |
1221 |
trace_sched_wait_task(p); |
1222 |
running = task_running(p); |
1223 |
on_rq = task_queued(p); |
1224 |
ncsw = 0; |
1225 |
if (!match_state || p->state == match_state) |
1226 |
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
1227 |
task_grq_unlock(&flags); |
1228 |
|
1229 |
/* |
1230 |
* If it changed from the expected state, bail out now. |
1231 |
*/ |
1232 |
if (unlikely(!ncsw)) |
1233 |
break; |
1234 |
|
1235 |
/* |
1236 |
* Was it really running after all now that we |
1237 |
* checked with the proper locks actually held? |
1238 |
* |
1239 |
* Oops. Go back and try again.. |
1240 |
*/ |
1241 |
if (unlikely(running)) { |
1242 |
cpu_relax(); |
1243 |
continue; |
1244 |
} |
1245 |
|
1246 |
/* |
1247 |
* It's not enough that it's not actively running, |
1248 |
* it must be off the runqueue _entirely_, and not |
1249 |
* preempted! |
1250 |
* |
1251 |
* So if it was still runnable (but just not actively |
1252 |
* running right now), it's preempted, and we should |
1253 |
* yield - it could be a while. |
1254 |
*/ |
1255 |
if (unlikely(on_rq)) { |
1256 |
ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); |
1257 |
|
1258 |
set_current_state(TASK_UNINTERRUPTIBLE); |
1259 |
schedule_hrtimeout(&to, HRTIMER_MODE_REL); |
1260 |
continue; |
1261 |
} |
1262 |
|
1263 |
/* |
1264 |
* Ahh, all good. It wasn't running, and it wasn't |
1265 |
* runnable, which means that it will never become |
1266 |
* running in the future either. We're all done! |
1267 |
*/ |
1268 |
break; |
1269 |
} |
1270 |
|
1271 |
return ncsw; |
1272 |
} |
1273 |
|
1274 |
/*** |
1275 |
* kick_process - kick a running thread to enter/exit the kernel |
1276 |
* @p: the to-be-kicked thread |
1277 |
* |
1278 |
* Cause a process which is running on another CPU to enter |
1279 |
* kernel-mode, without any delay. (to get signals handled.) |
1280 |
* |
1281 |
* NOTE: this function doesn't have to take the runqueue lock, |
1282 |
* because all it wants to ensure is that the remote task enters |
1283 |
* the kernel. If the IPI races and the task has been migrated |
1284 |
* to another CPU then no harm is done and the purpose has been |
1285 |
* achieved as well. |
1286 |
*/ |
1287 |
void kick_process(struct task_struct *p) |
1288 |
{ |
1289 |
int cpu; |
1290 |
|
1291 |
preempt_disable(); |
1292 |
cpu = task_cpu(p); |
1293 |
if ((cpu != smp_processor_id()) && task_curr(p)) |
1294 |
smp_send_reschedule(cpu); |
1295 |
preempt_enable(); |
1296 |
} |
1297 |
EXPORT_SYMBOL_GPL(kick_process); |
1298 |
#endif |
1299 |
|
1300 |
#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) |
1301 |
|
1302 |
/* |
1303 |
* RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the |
1304 |
* basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or |
1305 |
* between themselves, they cooperatively multitask. An idle rq scores as |
1306 |
* prio PRIO_LIMIT so it is always preempted. |
1307 |
*/ |
1308 |
static inline bool |
1309 |
can_preempt(struct task_struct *p, int prio, u64 deadline) |
1310 |
{ |
1311 |
/* Better static priority RT task or better policy preemption */ |
1312 |
if (p->prio < prio) |
1313 |
return true; |
1314 |
if (p->prio > prio) |
1315 |
return false; |
1316 |
/* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ |
1317 |
if (!deadline_before(p->deadline, deadline)) |
1318 |
return false; |
1319 |
return true; |
1320 |
} |
1321 |
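/*
* Example (illustrative): a SCHED_ISO task (lower prio number) always
* preempts a SCHED_NORMAL one regardless of deadlines, while two
* SCHED_NORMAL tasks at equal prio only preempt on a strictly earlier
* virtual deadline.
*/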
|
1322 |
#ifdef CONFIG_SMP |
1323 |
#define cpu_online_map (*(cpumask_t *)cpu_online_mask) |
1324 |
#ifdef CONFIG_HOTPLUG_CPU |
1325 |
/* |
1326 |
* Check to see if there is a task that is affined only to offline CPUs but |
1327 |
* still wants runtime. This happens to kernel threads during suspend/halt and |
1328 |
* disabling of CPUs. |
1329 |
*/ |
1330 |
static inline bool online_cpus(struct task_struct *p) |
1331 |
{ |
1332 |
return (likely(cpus_intersects(cpu_online_map, p->cpus_allowed))); |
1333 |
} |
1334 |
#else /* CONFIG_HOTPLUG_CPU */ |
1335 |
/* All available CPUs are always online without hotplug. */ |
1336 |
static inline bool online_cpus(struct task_struct *p) |
1337 |
{ |
1338 |
return true; |
1339 |
} |
1340 |
#endif |
1341 |
|
1342 |
/* |
1343 |
* Check to see if p can run on cpu, and if not, whether there are any online |
1344 |
* CPUs it can run on instead. |
1345 |
*/ |
1346 |
static inline bool needs_other_cpu(struct task_struct *p, int cpu) |
1347 |
{ |
1348 |
if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) |
1349 |
return true; |
1350 |
return false; |
1351 |
} |
1352 |
|
1353 |
/* |
1354 |
* When all else is equal, still prefer this_rq. |
1355 |
*/ |
1356 |
static void try_preempt(struct task_struct *p, struct rq *this_rq) |
1357 |
{ |
1358 |
struct rq *highest_prio_rq = NULL; |
1359 |
int cpu, highest_prio; |
1360 |
u64 latest_deadline; |
1361 |
cpumask_t tmp; |
1362 |
|
1363 |
/* |
1364 |
* We clear the sticky flag here because for a task to have called |
1365 |
* try_preempt with the sticky flag enabled means some complicated |
1366 |
* re-scheduling has occurred and we should ignore the sticky flag. |
1367 |
*/ |
1368 |
clear_sticky(p); |
1369 |
|
1370 |
if (suitable_idle_cpus(p)) { |
1371 |
resched_best_idle(p); |
1372 |
return; |
1373 |
} |
1374 |
|
1375 |
/* IDLEPRIO tasks never preempt anything but idle */ |
1376 |
if (p->policy == SCHED_IDLEPRIO) |
1377 |
return; |
1378 |
|
1379 |
if (likely(online_cpus(p))) |
1380 |
cpus_and(tmp, cpu_online_map, p->cpus_allowed); |
1381 |
else |
1382 |
return; |
1383 |
|
1384 |
highest_prio = latest_deadline = 0; |
1385 |
|
1386 |
for_each_cpu_mask(cpu, tmp) { |
1387 |
struct rq *rq; |
1388 |
int rq_prio; |
1389 |
|
1390 |
rq = cpu_rq(cpu); |
1391 |
rq_prio = rq->rq_prio; |
1392 |
if (rq_prio < highest_prio) |
1393 |
continue; |
1394 |
|
1395 |
if (rq_prio > highest_prio || |
1396 |
deadline_after(rq->rq_deadline, latest_deadline)) { |
1397 |
latest_deadline = rq->rq_deadline; |
1398 |
highest_prio = rq_prio; |
1399 |
highest_prio_rq = rq; |
1400 |
} |
1401 |
} |
1402 |
|
1403 |
if (likely(highest_prio_rq)) { |
1404 |
if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline)) |
1405 |
resched_task(highest_prio_rq->curr); |
1406 |
} |
1407 |
} |
1408 |
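/*
* Example (illustrative): with an RT task on CPU0 and SCHED_NORMAL tasks
* on CPU1-3, the scan settles on whichever NORMAL cpu runs the task with
* the latest deadline (the numerically highest rq_prio marks the least
* important victim), and p preempts there only if can_preempt() agrees.
*/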
#else /* CONFIG_SMP */ |
1409 |
static inline bool needs_other_cpu(struct task_struct *p, int cpu) |
1410 |
{ |
1411 |
return false; |
1412 |
} |
1413 |
|
1414 |
static void try_preempt(struct task_struct *p, struct rq *this_rq) |
1415 |
{ |
1416 |
if (p->policy == SCHED_IDLEPRIO) |
1417 |
return; |
1418 |
if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) |
1419 |
resched_task(uprq->curr); |
1420 |
} |
1421 |
#endif /* CONFIG_SMP */ |
1422 |
|
1423 |
static void |
1424 |
ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
1425 |
{ |
1426 |
#ifdef CONFIG_SCHEDSTATS |
1427 |
struct rq *rq = this_rq(); |
1428 |
|
1429 |
#ifdef CONFIG_SMP |
1430 |
int this_cpu = smp_processor_id(); |
1431 |
|
1432 |
if (cpu == this_cpu) |
1433 |
schedstat_inc(rq, ttwu_local); |
1434 |
else { |
1435 |
struct sched_domain *sd; |
1436 |
|
1437 |
rcu_read_lock(); |
1438 |
for_each_domain(this_cpu, sd) { |
1439 |
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
1440 |
schedstat_inc(sd, ttwu_wake_remote); |
1441 |
break; |
1442 |
} |
1443 |
} |
1444 |
rcu_read_unlock(); |
1445 |
} |
1446 |
|
1447 |
#endif /* CONFIG_SMP */ |
1448 |
|
1449 |
schedstat_inc(rq, ttwu_count); |
1450 |
#endif /* CONFIG_SCHEDSTATS */ |
1451 |
} |
1452 |
|
1453 |
static inline void ttwu_activate(struct task_struct *p, struct rq *rq, |
1454 |
bool is_sync) |
1455 |
{ |
1456 |
activate_task(p, rq); |
1457 |
|
1458 |
/* |
1459 |
* Sync wakeups (i.e. those types of wakeups where the waker |
1460 |
* has indicated that it will leave the CPU in short order) |
1461 |
* don't trigger a preemption if there are no idle cpus, |
1462 |
* instead waiting for current to deschedule. |
1463 |
*/ |
1464 |
if (!is_sync || suitable_idle_cpus(p)) |
1465 |
try_preempt(p, rq); |
1466 |
} |
1467 |
|
1468 |
static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, |
1469 |
bool success) |
1470 |
{ |
1471 |
trace_sched_wakeup(p, success); |
1472 |
p->state = TASK_RUNNING; |
1473 |
|
1474 |
/* |
1475 |
* if a worker is waking up, notify workqueue. Note that on BFS, we |
1476 |
* don't really know what cpu it will be, so we fake it for |
1477 |
* wq_worker_waking_up :/ |
1478 |
*/ |
1479 |
if ((p->flags & PF_WQ_WORKER) && success) |
1480 |
wq_worker_waking_up(p, cpu_of(rq)); |
1481 |
} |
1482 |
|
1483 |
#ifdef CONFIG_SMP |
1484 |
static void |
1485 |
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) |
1486 |
{ |
1487 |
ttwu_activate(p, rq, false); |
1488 |
ttwu_post_activation(p, rq, true); |
1489 |
} |
1490 |
|
1491 |
static void sched_ttwu_pending(void) |
1492 |
{ |
1493 |
struct rq *rq = this_rq(); |
1494 |
struct llist_node *llist = llist_del_all(&rq->wake_list); |
1495 |
struct task_struct *p; |
1496 |
|
1497 |
grq_lock(); |
1498 |
|
1499 |
while (llist) { |
1500 |
p = llist_entry(llist, struct task_struct, wake_entry); |
1501 |
llist = llist_next(llist); |
1502 |
ttwu_do_activate(rq, p, 0); |
1503 |
} |
1504 |
|
1505 |
grq_unlock(); |
1506 |
} |
1507 |
|
1508 |
void scheduler_ipi(void) |
1509 |
{ |
1510 |
if (llist_empty(&this_rq()->wake_list)) |
1511 |
return; |
1512 |
|
1513 |
/* |
1514 |
* Not all reschedule IPI handlers call irq_enter/irq_exit, since |
1515 |
* traditionally all their work was done from the interrupt return |
1516 |
* path. Now that we actually do some work, we need to make sure |
1517 |
* we do call them. |
1518 |
* |
1519 |
* Some archs already do call them, luckily irq_enter/exit nest |
1520 |
* properly. |
1521 |
* |
1522 |
* Arguably we should visit all archs and update all handlers, |
1523 |
* however a fair share of IPIs are still resched only so this would |
1524 |
* somewhat pessimize the simple resched case. |
1525 |
*/ |
1526 |
irq_enter(); |
1527 |
sched_ttwu_pending(); |
1528 |
|
1529 |
irq_exit(); |
1530 |
} |
1531 |
#endif /* CONFIG_SMP */ |
1532 |
|
1533 |
/* |
1534 |
* wake flags |
1535 |
*/ |
1536 |
#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ |
1537 |
#define WF_FORK 0x02 /* child wakeup after fork */ |
1538 |
#define WF_MIGRATED 0x4 /* internal use, task got migrated */ |
1539 |
|
1540 |
/*** |
1541 |
* try_to_wake_up - wake up a thread |
1542 |
* @p: the thread to be awakened |
1543 |
* @state: the mask of task states that can be woken |
1544 |
* @wake_flags: wake modifier flags (WF_*) |
1545 |
* |
1546 |
* Put it on the run-queue if it's not already there. The "current" |
1547 |
* thread is always on the run-queue (except when the actual |
1548 |
* re-schedule is in progress), and as such you're allowed to do |
1549 |
* the simpler "current->state = TASK_RUNNING" to mark yourself |
1550 |
* runnable without the overhead of this. |
1551 |
* |
1552 |
* Return: %true if @p was woken up, %false if it was already running
* or @state didn't match @p's state.
1554 |
*/ |
1555 |
static bool try_to_wake_up(struct task_struct *p, unsigned int state, |
1556 |
int wake_flags) |
1557 |
{ |
1558 |
bool success = false; |
1559 |
unsigned long flags; |
1560 |
struct rq *rq; |
1561 |
int cpu; |
1562 |
|
1563 |
get_cpu(); |
1564 |
|
1565 |
/* |
1566 |
* If we are going to wake up a thread waiting for CONDITION we |
1567 |
* need to ensure that CONDITION=1 done by the caller can not be |
1568 |
* reordered with p->state check below. This pairs with mb() in |
1569 |
* set_current_state() the waiting thread does. |
1570 |
*/ |
1571 |
smp_mb__before_spinlock(); |
1572 |
|
1573 |
/* |
1574 |
* No need to do time_lock_grq as we only need to update the rq clock |
1575 |
* if we activate the task |
1576 |
*/ |
1577 |
rq = task_grq_lock(p, &flags); |
1578 |
cpu = task_cpu(p); |
1579 |
|
1580 |
/* state is a volatile long, why that is I don't know */
1581 |
if (!((unsigned int)p->state & state)) |
1582 |
goto out_unlock; |
1583 |
|
1584 |
if (task_queued(p) || task_running(p)) |
1585 |
goto out_running; |
1586 |
|
1587 |
ttwu_activate(p, rq, wake_flags & WF_SYNC); |
1588 |
success = true; |
1589 |
|
1590 |
out_running: |
1591 |
ttwu_post_activation(p, rq, success); |
1592 |
out_unlock: |
1593 |
task_grq_unlock(&flags); |
1594 |
|
1595 |
ttwu_stat(p, cpu, wake_flags); |
1596 |
|
1597 |
put_cpu(); |
1598 |
|
1599 |
return success; |
1600 |
} |
1601 |
|
1602 |
/** |
1603 |
* try_to_wake_up_local - try to wake up a local task with grq lock held |
1604 |
* @p: the thread to be awakened |
1605 |
* |
1606 |
* Put @p on the run-queue if it's not already there. The caller must |
1607 |
* ensure that grq is locked and @p is not the current task.
1608 |
* grq stays locked over invocation. |
1609 |
*/ |
1610 |
static void try_to_wake_up_local(struct task_struct *p) |
1611 |
{ |
1612 |
struct rq *rq = task_rq(p); |
1613 |
bool success = false; |
1614 |
|
1615 |
lockdep_assert_held(&grq.lock); |
1616 |
|
1617 |
if (!(p->state & TASK_NORMAL)) |
1618 |
return; |
1619 |
|
1620 |
if (!task_queued(p)) { |
1621 |
if (likely(!task_running(p))) { |
1622 |
schedstat_inc(rq, ttwu_count); |
1623 |
schedstat_inc(rq, ttwu_local); |
1624 |
} |
1625 |
ttwu_activate(p, rq, false); |
1626 |
ttwu_stat(p, smp_processor_id(), 0); |
1627 |
success = true; |
1628 |
} |
1629 |
ttwu_post_activation(p, rq, success); |
1630 |
} |
1631 |
|
1632 |
/** |
1633 |
* wake_up_process - Wake up a specific process |
1634 |
* @p: The process to be woken up. |
1635 |
* |
1636 |
* Attempt to wake up the nominated process and move it to the set of runnable |
1637 |
* processes. |
1638 |
* |
1639 |
* Return: 1 if the process was woken up, 0 if it was already running. |
1640 |
* |
1641 |
* It may be assumed that this function implies a write memory barrier before |
1642 |
* changing the task state if and only if any tasks are woken up. |
1643 |
*/ |
1644 |
int wake_up_process(struct task_struct *p) |
1645 |
{ |
1646 |
WARN_ON(task_is_stopped_or_traced(p)); |
1647 |
return try_to_wake_up(p, TASK_NORMAL, 0); |
1648 |
} |
1649 |
EXPORT_SYMBOL(wake_up_process); |
1650 |
|
1651 |
int wake_up_state(struct task_struct *p, unsigned int state) |
1652 |
{ |
1653 |
return try_to_wake_up(p, state, 0); |
1654 |
} |
1655 |
|
1656 |
static void time_slice_expired(struct task_struct *p); |
1657 |
|
1658 |
/* |
1659 |
* Perform scheduler related setup for a newly forked process p. |
1660 |
* p is forked by current. |
1661 |
*/ |
1662 |
void sched_fork(struct task_struct *p) |
1663 |
{ |
1664 |
#ifdef CONFIG_PREEMPT_NOTIFIERS |
1665 |
INIT_HLIST_HEAD(&p->preempt_notifiers); |
1666 |
#endif |
1667 |
/* |
1668 |
* The process state is set to the same value as that of the process
* executing the do_fork() code, i.e. running. This guarantees that nobody will
1670 |
* actually run it, and a signal or other external event cannot wake |
1671 |
* it up and insert it on the runqueue either. |
1672 |
*/ |
1673 |
|
1674 |
/* Should be reset in fork.c but done here for ease of bfs patching */ |
1675 |
p->utime = |
1676 |
p->stime = |
1677 |
p->utimescaled = |
1678 |
p->stimescaled = |
1679 |
p->sched_time = |
1680 |
p->stime_pc = |
1681 |
p->utime_pc = 0; |
1682 |
|
1683 |
/* |
1684 |
* Revert to default priority/policy on fork if requested. |
1685 |
*/ |
1686 |
if (unlikely(p->sched_reset_on_fork)) { |
1687 |
if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { |
1688 |
p->policy = SCHED_NORMAL; |
1689 |
p->normal_prio = normal_prio(p); |
1690 |
} |
1691 |
|
1692 |
if (PRIO_TO_NICE(p->static_prio) < 0) { |
1693 |
p->static_prio = NICE_TO_PRIO(0); |
1694 |
p->normal_prio = p->static_prio; |
1695 |
} |
1696 |
|
1697 |
/* |
1698 |
* We don't need the reset flag anymore after the fork. It has |
1699 |
* fulfilled its duty: |
1700 |
*/ |
1701 |
p->sched_reset_on_fork = 0; |
1702 |
} |
1703 |
|
1704 |
INIT_LIST_HEAD(&p->run_list); |
1705 |
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1706 |
if (unlikely(sched_info_on())) |
1707 |
memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1708 |
#endif |
1709 |
p->on_cpu = false; |
1710 |
clear_sticky(p); |
1711 |
|
1712 |
#ifdef CONFIG_PREEMPT_COUNT |
1713 |
/* Want to start with kernel preemption disabled. */ |
1714 |
task_thread_info(p)->preempt_count = 1; |
1715 |
#endif |
1716 |
} |
1717 |
|
1718 |
/* |
1719 |
* wake_up_new_task - wake up a newly created task for the first time. |
1720 |
* |
1721 |
* This function will do some initial scheduler statistics housekeeping |
1722 |
* that must be done for every newly created context, then puts the task |
1723 |
* on the runqueue and wakes it. |
1724 |
*/ |
1725 |
void wake_up_new_task(struct task_struct *p) |
1726 |
{ |
1727 |
struct task_struct *parent; |
1728 |
unsigned long flags; |
1729 |
struct rq *rq; |
1730 |
|
1731 |
parent = p->parent; |
1732 |
rq = task_grq_lock(p, &flags); |
1733 |
|
1734 |
/* |
1735 |
* Reinit new task deadline as its creator deadline could have changed |
1736 |
* since call to dup_task_struct(). |
1737 |
*/ |
1738 |
p->deadline = rq->rq_deadline; |
1739 |
|
1740 |
/* |
1741 |
* If the task is a new process, current and parent are the same. If |
1742 |
* the task is a new thread in the thread group, it will have much more |
1743 |
* in common with current than with the parent. |
1744 |
*/ |
1745 |
set_task_cpu(p, task_cpu(rq->curr)); |
1746 |
|
1747 |
/* |
1748 |
* Make sure we do not leak PI boosting priority to the child. |
1749 |
*/ |
1750 |
p->prio = rq->curr->normal_prio; |
1751 |
|
1752 |
activate_task(p, rq); |
1753 |
trace_sched_wakeup_new(p, 1); |
1754 |
if (unlikely(p->policy == SCHED_FIFO)) |
1755 |
goto after_ts_init; |
1756 |
|
1757 |
/* |
1758 |
* Share the timeslice between parent and child, thus the |
1759 |
* total amount of pending timeslices in the system doesn't change, |
1760 |
* resulting in more scheduling fairness. If it's negative, it won't |
1761 |
* matter since that's the same as being 0. current's time_slice is |
1762 |
* actually in rq_time_slice when it's running, as is its last_ran |
1763 |
* value. rq->rq_deadline is only modified within schedule() so it |
1764 |
* is always equal to current->deadline. |
1765 |
*/ |
1766 |
p->last_ran = rq->rq_last_ran; |
1767 |
if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { |
1768 |
rq->rq_time_slice /= 2; |
1769 |
p->time_slice = rq->rq_time_slice; |
1770 |
after_ts_init: |
1771 |
if (rq->curr == parent && !suitable_idle_cpus(p)) { |
1772 |
/* |
1773 |
* The VM isn't cloned, so we're in a good position to |
1774 |
* do child-runs-first in anticipation of an exec. This |
1775 |
* usually avoids a lot of COW overhead. |
1776 |
*/ |
1777 |
set_tsk_need_resched(parent); |
1778 |
} else |
1779 |
try_preempt(p, rq); |
1780 |
} else { |
1781 |
if (rq->curr == parent) { |
1782 |
/* |
1783 |
* Forking task has run out of timeslice. Reschedule it and |
1784 |
* start its child with a new time slice and deadline. The |
1785 |
* child will end up running first because its deadline will |
1786 |
* be slightly earlier. |
1787 |
*/ |
1788 |
rq->rq_time_slice = 0; |
1789 |
set_tsk_need_resched(parent); |
1790 |
} |
1791 |
time_slice_expired(p); |
1792 |
} |
1793 |
task_grq_unlock(&flags); |
1794 |
} |
1795 |
|
1796 |
#ifdef CONFIG_PREEMPT_NOTIFIERS |
1797 |
|
1798 |
/** |
1799 |
* preempt_notifier_register - tell me when current is being preempted & rescheduled |
1800 |
* @notifier: notifier struct to register |
1801 |
*/ |
1802 |
void preempt_notifier_register(struct preempt_notifier *notifier) |
1803 |
{ |
1804 |
hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); |
1805 |
} |
1806 |
EXPORT_SYMBOL_GPL(preempt_notifier_register); |
1807 |
|
1808 |
/** |
1809 |
* preempt_notifier_unregister - no longer interested in preemption notifications |
1810 |
* @notifier: notifier struct to unregister |
1811 |
* |
1812 |
* This is safe to call from within a preemption notifier. |
1813 |
*/ |
1814 |
void preempt_notifier_unregister(struct preempt_notifier *notifier) |
1815 |
{ |
1816 |
hlist_del(¬ifier->link); |
1817 |
} |
1818 |
EXPORT_SYMBOL_GPL(preempt_notifier_unregister); |
1819 |
|
1820 |
static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
1821 |
{ |
1822 |
struct preempt_notifier *notifier; |
1823 |
|
1824 |
hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) |
1825 |
notifier->ops->sched_in(notifier, raw_smp_processor_id()); |
1826 |
} |
1827 |
|
1828 |
static void |
1829 |
fire_sched_out_preempt_notifiers(struct task_struct *curr, |
1830 |
struct task_struct *next) |
1831 |
{ |
1832 |
struct preempt_notifier *notifier; |
1833 |
|
1834 |
hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) |
1835 |
notifier->ops->sched_out(notifier, next); |
1836 |
} |
1837 |
|
1838 |
#else /* !CONFIG_PREEMPT_NOTIFIERS */ |
1839 |
|
1840 |
static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
1841 |
{ |
1842 |
} |
1843 |
|
1844 |
static void |
1845 |
fire_sched_out_preempt_notifiers(struct task_struct *curr, |
1846 |
struct task_struct *next) |
1847 |
{ |
1848 |
} |
1849 |
|
1850 |
#endif /* CONFIG_PREEMPT_NOTIFIERS */ |
1851 |
|
1852 |
/** |
1853 |
* prepare_task_switch - prepare to switch tasks |
1854 |
* @rq: the runqueue preparing to switch |
1855 |
* @next: the task we are going to switch to. |
1856 |
* |
1857 |
* This is called with the rq lock held and interrupts off. It must |
1858 |
* be paired with a subsequent finish_task_switch after the context |
1859 |
* switch. |
1860 |
* |
1861 |
* prepare_task_switch sets up locking and calls architecture specific |
1862 |
* hooks. |
1863 |
*/ |
1864 |
static inline void |
1865 |
prepare_task_switch(struct rq *rq, struct task_struct *prev, |
1866 |
struct task_struct *next) |
1867 |
{ |
1868 |
sched_info_switch(prev, next); |
1869 |
perf_event_task_sched_out(prev, next); |
1870 |
fire_sched_out_preempt_notifiers(prev, next); |
1871 |
prepare_lock_switch(rq, next); |
1872 |
prepare_arch_switch(next); |
1873 |
trace_sched_switch(prev, next); |
1874 |
} |
1875 |
|
1876 |
/** |
1877 |
* finish_task_switch - clean up after a task-switch |
1878 |
* @rq: runqueue associated with task-switch |
1879 |
* @prev: the thread we just switched away from. |
1880 |
* |
1881 |
* finish_task_switch must be called after the context switch, paired |
1882 |
* with a prepare_task_switch call before the context switch. |
1883 |
* finish_task_switch will reconcile locking set up by prepare_task_switch, |
1884 |
* and do any other architecture-specific cleanup actions. |
1885 |
* |
1886 |
* Note that we may have delayed dropping an mm in context_switch(). If |
1887 |
* so, we finish that here outside of the runqueue lock. (Doing it |
1888 |
* with the lock held can cause deadlocks; see schedule() for |
1889 |
* details.) |
1890 |
*/ |
1891 |
static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) |
1892 |
__releases(grq.lock) |
1893 |
{ |
1894 |
struct mm_struct *mm = rq->prev_mm; |
1895 |
long prev_state; |
1896 |
|
1897 |
rq->prev_mm = NULL; |
1898 |
|
1899 |
/* |
1900 |
* A task struct has one reference for the use as "current". |
1901 |
* If a task dies, then it sets TASK_DEAD in tsk->state and calls |
1902 |
* schedule one last time. The schedule call will never return, and |
1903 |
* the scheduled task must drop that reference. |
1904 |
* The test for TASK_DEAD must occur while the runqueue locks are |
1905 |
* still held, otherwise prev could be scheduled on another cpu, die |
1906 |
* there before we look at prev->state, and then the reference would |
1907 |
* be dropped twice. |
1908 |
* Manfred Spraul <manfred@colorfullife.com> |
1909 |
*/ |
1910 |
prev_state = prev->state; |
1911 |
vtime_task_switch(prev); |
1912 |
finish_arch_switch(prev); |
1913 |
perf_event_task_sched_in(prev, current); |
1914 |
finish_lock_switch(rq, prev); |
1915 |
finish_arch_post_lock_switch(); |
1916 |
|
1917 |
fire_sched_in_preempt_notifiers(current); |
1918 |
if (mm) |
1919 |
mmdrop(mm); |
1920 |
if (unlikely(prev_state == TASK_DEAD)) { |
1921 |
/* |
1922 |
* Remove function-return probe instances associated with this |
1923 |
* task and put them back on the free list. |
1924 |
*/ |
1925 |
kprobe_flush_task(prev); |
1926 |
put_task_struct(prev); |
1927 |
} |
1928 |
} |
1929 |
|
1930 |
/** |
1931 |
* schedule_tail - first thing a freshly forked thread must call. |
1932 |
* @prev: the thread we just switched away from. |
1933 |
*/ |
1934 |
asmlinkage void schedule_tail(struct task_struct *prev) |
1935 |
__releases(grq.lock) |
1936 |
{ |
1937 |
struct rq *rq = this_rq(); |
1938 |
|
1939 |
finish_task_switch(rq, prev); |
1940 |
#ifdef __ARCH_WANT_UNLOCKED_CTXSW |
1941 |
/* In this case, finish_task_switch does not reenable preemption */ |
1942 |
preempt_enable(); |
1943 |
#endif |
1944 |
if (current->set_child_tid) |
1945 |
put_user(current->pid, current->set_child_tid); |
1946 |
} |
1947 |
|
1948 |
/* |
1949 |
* context_switch - switch to the new MM and the new |
1950 |
* thread's register state. |
1951 |
*/ |
1952 |
static inline void |
1953 |
context_switch(struct rq *rq, struct task_struct *prev, |
1954 |
struct task_struct *next) |
1955 |
{ |
1956 |
struct mm_struct *mm, *oldmm; |
1957 |
|
1958 |
prepare_task_switch(rq, prev, next); |
1959 |
|
1960 |
mm = next->mm; |
1961 |
oldmm = prev->active_mm; |
1962 |
/* |
1963 |
* For paravirt, this is coupled with an exit in switch_to to |
1964 |
* combine the page table reload and the switch backend into |
1965 |
* one hypercall. |
1966 |
*/ |
1967 |
arch_start_context_switch(prev); |
1968 |
|
1969 |
if (!mm) { |
1970 |
next->active_mm = oldmm; |
1971 |
atomic_inc(&oldmm->mm_count); |
1972 |
enter_lazy_tlb(oldmm, next); |
1973 |
} else |
1974 |
switch_mm(oldmm, mm, next); |
1975 |
|
1976 |
if (!prev->mm) { |
1977 |
prev->active_mm = NULL; |
1978 |
rq->prev_mm = oldmm; |
1979 |
} |
1980 |
/* |
1981 |
* Since the runqueue lock will be released by the next |
1982 |
* task (which is an invalid locking op but in the case |
1983 |
* of the scheduler it's an obvious special-case), so we |
1984 |
* do an early lockdep release here: |
1985 |
*/ |
1986 |
#ifndef __ARCH_WANT_UNLOCKED_CTXSW |
1987 |
spin_release(&grq.lock.dep_map, 1, _THIS_IP_); |
1988 |
#endif |
1989 |
|
1990 |
/* Here we just switch the register state and the stack. */ |
1991 |
context_tracking_task_switch(prev, next); |
1992 |
switch_to(prev, next, prev); |
1993 |
|
1994 |
barrier(); |
1995 |
/* |
1996 |
* this_rq must be evaluated again because prev may have moved |
1997 |
* CPUs since it called schedule(), thus the 'rq' on its stack |
1998 |
* frame will be invalid. |
1999 |
*/ |
2000 |
finish_task_switch(this_rq(), prev); |
2001 |
} |
2002 |
|
2003 |
/* |
2004 |
* nr_running, nr_uninterruptible and nr_context_switches: |
2005 |
* |
2006 |
* externally visible scheduler statistics: current number of runnable |
2007 |
* threads, total number of context switches performed since bootup. All are |
2008 |
* measured without grabbing the grq lock but the occasional inaccurate result |
2009 |
* doesn't matter so long as it's positive. |
2010 |
*/ |
2011 |
unsigned long nr_running(void) |
2012 |
{ |
2013 |
long nr = grq.nr_running; |
2014 |
|
2015 |
if (unlikely(nr < 0)) |
2016 |
nr = 0; |
2017 |
return (unsigned long)nr; |
2018 |
} |
2019 |
|
2020 |
static unsigned long nr_uninterruptible(void) |
2021 |
{ |
2022 |
long nu = grq.nr_uninterruptible; |
2023 |
|
2024 |
if (unlikely(nu < 0)) |
2025 |
nu = 0; |
2026 |
return nu; |
2027 |
} |
2028 |
|
2029 |
unsigned long long nr_context_switches(void) |
2030 |
{ |
2031 |
long long ns = grq.nr_switches; |
2032 |
|
2033 |
/* This is of course impossible */ |
2034 |
if (unlikely(ns < 0)) |
2035 |
ns = 1; |
2036 |
return (unsigned long long)ns; |
2037 |
} |
2038 |
|
2039 |
unsigned long nr_iowait(void) |
2040 |
{ |
2041 |
unsigned long i, sum = 0; |
2042 |
|
2043 |
for_each_possible_cpu(i) |
2044 |
sum += atomic_read(&cpu_rq(i)->nr_iowait); |
2045 |
|
2046 |
return sum; |
2047 |
} |
2048 |
|
2049 |
unsigned long nr_iowait_cpu(int cpu) |
2050 |
{ |
2051 |
struct rq *this = cpu_rq(cpu); |
2052 |
return atomic_read(&this->nr_iowait); |
2053 |
} |
2054 |
|
2055 |
unsigned long nr_active(void) |
2056 |
{ |
2057 |
return nr_running() + nr_uninterruptible(); |
2058 |
} |
2059 |
|
2060 |
/* Beyond a task running on this CPU, load is equal everywhere on BFS */ |
2061 |
unsigned long this_cpu_load(void) |
2062 |
{ |
2063 |
return this_rq()->rq_running + |
2064 |
((queued_notrunning() + nr_uninterruptible()) / grq.noc); |
2065 |
} |
2066 |
|
2067 |
/* Variables and functions for calc_load */ |
2068 |
static unsigned long calc_load_update; |
2069 |
unsigned long avenrun[3]; |
2070 |
EXPORT_SYMBOL(avenrun); |
2071 |
|
2072 |
/** |
2073 |
* get_avenrun - get the load average array |
2074 |
* @loads: pointer to dest load array |
2075 |
* @offset: offset to add |
2076 |
* @shift: shift count to shift the result left |
2077 |
* |
2078 |
* These values are estimates at best, so no need for locking. |
2079 |
*/ |
2080 |
void get_avenrun(unsigned long *loads, unsigned long offset, int shift) |
2081 |
{ |
2082 |
loads[0] = (avenrun[0] + offset) << shift; |
2083 |
loads[1] = (avenrun[1] + offset) << shift; |
2084 |
loads[2] = (avenrun[2] + offset) << shift; |
2085 |
} |
2086 |
|
2087 |
static unsigned long |
2088 |
calc_load(unsigned long load, unsigned long exp, unsigned long active) |
2089 |
{ |
2090 |
load *= exp; |
2091 |
load += active * (FIXED_1 - exp); |
2092 |
return load >> FSHIFT; |
2093 |
} |
2094 |
|
2095 |
/* |
2096 |
* calc_load - update the avenrun load estimates every LOAD_FREQ seconds. |
2097 |
*/ |
2098 |
void calc_global_load(unsigned long ticks) |
2099 |
{ |
2100 |
long active; |
2101 |
|
2102 |
if (time_before(jiffies, calc_load_update)) |
2103 |
return; |
2104 |
active = nr_active() * FIXED_1; |
2105 |
|
2106 |
avenrun[0] = calc_load(avenrun[0], EXP_1, active); |
2107 |
avenrun[1] = calc_load(avenrun[1], EXP_5, active); |
2108 |
avenrun[2] = calc_load(avenrun[2], EXP_15, active); |
2109 |
|
2110 |
calc_load_update = jiffies + LOAD_FREQ; |
2111 |
} |
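
/*
 * Reviewer's note: a minimal standalone sketch of the fixed-point decay
 * used by calc_load() above. The FSHIFT/FIXED_1/EXP_1 values mirror the
 * definitions in include/linux/sched.h; the constant-load loop and the
 * demo_ names below are purely illustrative, not part of the patch.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed-point */

static unsigned long demo_calc_load(unsigned long load, unsigned long exp,
				    unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avg = 0;
	int i;

	/* Feed a constant 2 runnable tasks for 24 LOAD_FREQ (5s) samples
	 * and watch the 1-minute average converge towards 2.0. */
	for (i = 0; i < 24; i++)
		avg = demo_calc_load(avg, EXP_1, 2 * FIXED_1);

	printf("1min loadavg ~= %lu.%02lu\n", avg >> FSHIFT,
	       ((avg & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}
#endif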
DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in account_system_vtime, on corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in another CPU reading this CPU's irq time and racing
 * with irq/account_system_vtime on this CPU. We would then read either
 * the old or the new value, with the side effect of accounting a slice
 * of irq time to the wrong task when an irq is in progress while we read
 * rq->clock. That is a worthwhile compromise in place of having locks
 * on each irq in account_system_time.
 */
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);

static DEFINE_PER_CPU(u64, irq_start_time);
static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}

#ifndef CONFIG_64BIT
static DEFINE_PER_CPU(seqcount_t, irq_time_seq);

static inline void irq_time_write_begin(void)
{
	__this_cpu_inc(irq_time_seq.sequence);
	smp_wmb();
}

static inline void irq_time_write_end(void)
{
	smp_wmb();
	__this_cpu_inc(irq_time_seq.sequence);
}

static inline u64 irq_time_read(int cpu)
{
	u64 irq_time;
	unsigned seq;

	do {
		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
		irq_time = per_cpu(cpu_softirq_time, cpu) +
			   per_cpu(cpu_hardirq_time, cpu);
	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));

	return irq_time;
}
#else /* CONFIG_64BIT */
static inline void irq_time_write_begin(void)
{
}

static inline void irq_time_write_end(void)
{
}

static inline u64 irq_time_read(int cpu)
{
	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
#endif /* CONFIG_64BIT */

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr)
{
	unsigned long flags;
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
	__this_cpu_add(irq_start_time, delta);

	irq_time_write_begin();
	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to ksoftirqd thread
	 * in that case, so as not to confuse the scheduler with a special
	 * task that does not appear to consume any time, but still wants
	 * to run.
	 */
	if (hardirq_count())
		__this_cpu_add(cpu_hardirq_time, delta);
	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
		__this_cpu_add(cpu_softirq_time, delta);

	irq_time_write_end();
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);

#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

#ifdef CONFIG_PARAVIRT
static inline u64 steal_ticks(u64 steal)
{
	if (unlikely(steal > NSEC_PER_SEC))
		return div_u64(steal, TICK_NSEC);

	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
}
#endif

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimise out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight misattribution of {soft,}irq
	 * time; a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		s64 steal = paravirt_steal_clock(cpu_of(rq));
		u64 st;

		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	rq->clock_task += delta;
}

#ifndef nsecs_to_cputime
# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static void irqtime_account_hi_si(void)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	u64 latest_ns;

	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
	if (latest_ns > cpustat[CPUTIME_IRQ])
		cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;

	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
	if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
		cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime	(0)

static inline void irqtime_account_hi_si(void)
{
}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal, st = 0;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;

		st = steal_ticks(steal);
		this_rq()->prev_steal_time += st * TICK_NSEC;

		account_steal_time(st);
		return st;
	}
#endif
	return false;
}
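
/*
 * Reviewer's note: steal_ticks() above turns stolen nanoseconds into whole
 * ticks, taking a real division only in the rare large case. A standalone
 * sketch of the same conversion; TICK_NSEC assumes HZ=100 here and the
 * subtraction loop stands in for __iter_div_u64_rem(), both assumptions
 * of the illustration.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000ULL
#define TICK_NSEC	10000000ULL	/* 10ms tick, i.e. HZ=100 */

static uint64_t demo_steal_ticks(uint64_t steal)
{
	uint64_t ticks = 0;

	if (steal > NSEC_PER_SEC)	/* large: one real division */
		return steal / TICK_NSEC;

	while (steal >= TICK_NSEC) {	/* common case: iterate, cheap on 32bit */
		steal -= TICK_NSEC;
		ticks++;
	}
	return ticks;
}

int main(void)
{
	printf("25ms stolen = %llu ticks\n",
	       (unsigned long long)demo_steal_ticks(25000000ULL));
	return 0;
}
#endif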
/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	cputime_t utime, stime;
	struct task_struct *t;

	times->utime = sig->utime;
	times->stime = sig->stime;
	times->sum_exec_runtime = sig->sum_sched_runtime;

	rcu_read_lock();
	/* make sure we can trust tsk->thread_group list */
	if (!likely(pid_alive(tsk)))
		goto out;

	t = tsk;
	do {
		task_cputime(t, &utime, &stime);
		times->utime += utime;
		times->stime += stime;
		times->sum_exec_runtime += task_sched_runtime(t);
	} while_each_thread(tsk, t);
out:
	rcu_read_unlock();
}

/*
 * On each tick, see what percentage of that tick was attributed to each
 * component and add the percentage to the _pc values. Once a _pc value has
 * accumulated one tick's worth, account for that. This means the total
 * percentage of load components will always be 128 (pseudo 100) per tick.
 */
static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	if (atomic_read(&rq->nr_iowait) > 0) {
		rq->iowait_pc += pc;
		if (rq->iowait_pc >= 128) {
			cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128;
			rq->iowait_pc %= 128;
		}
	} else {
		rq->idle_pc += pc;
		if (rq->idle_pc >= 128) {
			cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128;
			rq->idle_pc %= 128;
		}
	}
	acct_update_integrals(idle);
}

static void
pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
	       unsigned long pc, unsigned long ns)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);

	p->stime_pc += pc;
	if (p->stime_pc >= 128) {
		int jiffs = p->stime_pc / 128;

		p->stime_pc %= 128;
		p->stime += (__force u64)cputime_one_jiffy * jiffs;
		p->stimescaled += one_jiffy_scaled * jiffs;
		account_group_system_time(p, cputime_one_jiffy * jiffs);
	}
	p->sched_time += ns;
	/*
	 * Do not update the cputimer if the task is already released by
	 * release_task().
	 *
	 * This could be executed if a tick happens when a task is inside
	 * do_exit() between the call to release_task() and its final
	 * schedule() call for autoreaping tasks.
	 */
	if (likely(p->sighand))
		account_group_exec_runtime(p, ns);

	if (hardirq_count() - hardirq_offset) {
		rq->irq_pc += pc;
		if (rq->irq_pc >= 128) {
			cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128;
			rq->irq_pc %= 128;
		}
	} else if (in_serving_softirq()) {
		rq->softirq_pc += pc;
		if (rq->softirq_pc >= 128) {
			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
			rq->softirq_pc %= 128;
		}
	} else {
		rq->system_pc += pc;
		if (rq->system_pc >= 128) {
			cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128;
			rq->system_pc %= 128;
		}
	}
	acct_update_integrals(p);
}

static void pc_user_time(struct rq *rq, struct task_struct *p,
			 unsigned long pc, unsigned long ns)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);

	p->utime_pc += pc;
	if (p->utime_pc >= 128) {
		int jiffs = p->utime_pc / 128;

		p->utime_pc %= 128;
		p->utime += (__force u64)cputime_one_jiffy * jiffs;
		p->utimescaled += one_jiffy_scaled * jiffs;
		account_group_user_time(p, cputime_one_jiffy * jiffs);
	}
	p->sched_time += ns;
	/*
	 * Do not update the cputimer if the task is already released by
	 * release_task().
	 *
	 * It would be preferable to defer the autoreap release_task() until
	 * after the last context switch, but that is harder to do.
	 */
	if (likely(p->sighand))
		account_group_exec_runtime(p, ns);

	if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time does not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 */
		rq->softirq_pc += pc;
		if (rq->softirq_pc >= 128) {
			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
			rq->softirq_pc %= 128;
		}
	}

	if (TASK_NICE(p) > 0 || idleprio_task(p)) {
		rq->nice_pc += pc;
		if (rq->nice_pc >= 128) {
			cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128;
			rq->nice_pc %= 128;
		}
	} else {
		rq->user_pc += pc;
		if (rq->user_pc >= 128) {
			cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128;
			rq->user_pc %= 128;
		}
	}
	acct_update_integrals(p);
}

/*
 * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
 * shifts instead of 100.
 */
#define NS_TO_PC(NS)	(NS * 128 / JIFFY_NS)
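
/*
 * Reviewer's note: to see how the _pc accumulators behave, here is a small
 * hypothetical simulation of the scheme above. Fractions of a tick are
 * banked in units of 1/128th and a whole jiffy is only charged once 128
 * units have accumulated. JIFFY_NS is assumed to be 10ms (HZ=100) purely
 * for this sketch.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>

#define JIFFY_NS	10000000UL	/* assumes HZ=100 for this sketch */
#define NS_TO_PC(NS)	((NS) * 128 / JIFFY_NS)

int main(void)
{
	unsigned long user_pc = 0, jiffies_charged = 0;
	int i;

	/* Bank 40 slices of 0.75 ticks each; since 128 "pseudo percent"
	 * units equal one full tick, exactly 30 jiffies get charged. */
	for (i = 0; i < 40; i++) {
		user_pc += NS_TO_PC(7500000UL);	/* 7.5ms of user time */
		if (user_pc >= 128) {
			jiffies_charged += user_pc / 128;
			user_pc %= 128;
		}
	}
	printf("charged %lu jiffies, %lu/128ths banked\n",
	       jiffies_charged, user_pc);
	return 0;
}
#endif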
/*
 * This is called on clock ticks.
 * Bank in p->sched_time the ns elapsed since the last tick or switch.
 * CPU scheduler quota accounting is also performed here in microseconds.
 */
static void
update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
{
	long account_ns = rq->clock_task - rq->rq_last_ran;
	struct task_struct *idle = rq->idle;
	unsigned long account_pc;

	if (unlikely(account_ns < 0) || steal_account_process_tick())
		goto ts_account;

	account_pc = NS_TO_PC(account_ns);

	/* Accurate tick timekeeping */
	if (user_mode(get_irq_regs()))
		pc_user_time(rq, p, account_pc, account_ns);
	else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
		pc_system_time(rq, p, HARDIRQ_OFFSET,
			       account_pc, account_ns);
	else
		pc_idle_time(rq, idle, account_pc);

	if (sched_clock_irqtime)
		irqtime_account_hi_si();

ts_account:
	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
	if (rq->rq_policy != SCHED_FIFO && p != idle) {
		s64 time_diff = rq->clock - rq->timekeep_clock;

		niffy_diff(&time_diff, 1);
		rq->rq_time_slice -= NS_TO_US(time_diff);
	}

	rq->rq_last_ran = rq->clock_task;
	rq->timekeep_clock = rq->clock;
}

/*
 * This is called on context switches.
 * Bank in p->sched_time the ns elapsed since the last tick or switch.
 * CPU scheduler quota accounting is also performed here in microseconds.
 */
static void
update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
{
	long account_ns = rq->clock_task - rq->rq_last_ran;
	struct task_struct *idle = rq->idle;
	unsigned long account_pc;

	if (unlikely(account_ns < 0))
		goto ts_account;

	account_pc = NS_TO_PC(account_ns);

	/* Accurate subtick timekeeping */
	if (p != idle)
		pc_user_time(rq, p, account_pc, account_ns);
	else
		pc_idle_time(rq, idle, account_pc);

ts_account:
	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
	if (rq->rq_policy != SCHED_FIFO && p != idle) {
		s64 time_diff = rq->clock - rq->timekeep_clock;

		niffy_diff(&time_diff, 1);
		rq->rq_time_slice -= NS_TO_US(time_diff);
	}

	rq->rq_last_ran = rq->clock_task;
	rq->timekeep_clock = rq->clock;
}

/*
 * Return any ns on the sched_clock that have not yet been accounted in
 * @p in case that task is currently running.
 *
 * Called with task_grq_lock() held.
 */
static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
	u64 ns = 0;

	if (p == rq->curr) {
		update_clocks(rq);
		ns = rq->clock_task - rq->rq_last_ran;
		if (unlikely((s64)ns < 0))
			ns = 0;
	}

	return ns;
}

unsigned long long task_delta_exec(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns;

	rq = task_grq_lock(p, &flags);
	ns = do_task_delta_exec(p, rq);
	task_grq_unlock(&flags);

	return ns;
}

/*
 * Return accounted runtime for the task, including any pending runtime
 * that has not been accounted yet.
 *
 * grq lock already acquired.
 */
unsigned long long task_sched_runtime(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns;

	rq = task_grq_lock(p, &flags);
	ns = p->sched_time + do_task_delta_exec(p, rq);
	task_grq_unlock(&flags);

	return ns;
}

/*
 * Return accounted runtime for the task.
 * Return separately the pending runtime that has not been accounted yet.
 */
unsigned long long task_sched_runtime_nodelta(struct task_struct *p, unsigned long long *delta)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns;

	rq = task_grq_lock(p, &flags);
	ns = p->sched_time;
	*delta = do_task_delta_exec(p, rq);
	task_grq_unlock(&flags);

	return ns;
}

/* Compatibility crap */
void account_user_time(struct task_struct *p, cputime_t cputime,
		       cputime_t cputime_scaled)
{
}

void account_idle_time(cputime_t cputime)
{
}

void update_cpu_load_nohz(void)
{
}

#ifdef CONFIG_NO_HZ_COMMON
void calc_load_enter_idle(void)
{
}

void calc_load_exit_idle(void)
{
}
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Account guest cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in virtual machine since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
static void account_guest_time(struct task_struct *p, cputime_t cputime,
			       cputime_t cputime_scaled)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += (__force u64)cputime;
	p->utimescaled += (__force u64)cputime_scaled;
	account_group_user_time(p, cputime);
	p->gtime += (__force u64)cputime;

	/* Add guest time to cpustat. */
	if (TASK_NICE(p) > 0) {
		cpustat[CPUTIME_NICE] += (__force u64)cputime;
		cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
	} else {
		cpustat[CPUTIME_USER] += (__force u64)cputime;
		cpustat[CPUTIME_GUEST] += (__force u64)cputime;
	}
}

/*
 * Account system cpu time to a process and desired cpustat field.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 * @target_cputime64: pointer to cpustat field that has to be updated
 */
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
			   cputime_t cputime_scaled, cputime64_t *target_cputime64)
{
	/* Add system time to process. */
	p->stime += (__force u64)cputime;
	p->stimescaled += (__force u64)cputime_scaled;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	*target_cputime64 += (__force u64)cputime;

	/* Account for system time used */
	acct_update_integrals(p);
}

/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 * This is for guest only now.
 */
void account_system_time(struct task_struct *p, int hardirq_offset,
			 cputime_t cputime, cputime_t cputime_scaled)
{
	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
		account_guest_time(p, cputime, cputime_scaled);
}

/*
 * Account for involuntary wait time.
 * @cputime: the cpu time spent in involuntary wait
 */
void account_steal_time(cputime_t cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += (__force u64)cputime;
}

/*
 * Account for idle time.
 * @cputime: the cpu time spent in idle wait
 */
static void account_idle_times(cputime_t cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
	else
		cpustat[CPUTIME_IDLE] += (__force u64)cputime;
}

#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE

void account_process_tick(struct task_struct *p, int user_tick)
{
}

/*
 * Account multiple ticks of steal time.
 * @ticks: number of stolen ticks
 */
void account_steal_ticks(unsigned long ticks)
{
	account_steal_time(jiffies_to_cputime(ticks));
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of idle ticks
 */
void account_idle_ticks(unsigned long ticks)
{
	account_idle_times(jiffies_to_cputime(ticks));
}
#endif

static inline void grq_iso_lock(void)
	__acquires(grq.iso_lock)
{
	raw_spin_lock(&grq.iso_lock);
}

static inline void grq_iso_unlock(void)
	__releases(grq.iso_lock)
{
	raw_spin_unlock(&grq.iso_lock);
}

/*
 * Functions to test for when SCHED_ISO tasks have used their allocated
 * quota as real time scheduling and convert them back to SCHED_NORMAL.
 * Where possible, the data is tested lockless, to avoid grabbing iso_lock
 * because the occasional inaccurate result won't matter. However the
 * tick data is only ever modified under lock. iso_refractory is simply
 * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
 */
static bool set_iso_refractory(void)
{
	grq.iso_refractory = true;
	return grq.iso_refractory;
}

static bool clear_iso_refractory(void)
{
	grq.iso_refractory = false;
	return grq.iso_refractory;
}

/*
 * Test if SCHED_ISO tasks have run longer than their allotted period as RT
 * tasks and set the refractory flag if necessary. There is 10% hysteresis
 * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
 * slow division.
 */
static bool test_ret_isorefractory(struct rq *rq)
{
	if (likely(!grq.iso_refractory)) {
		if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
			return set_iso_refractory();
	} else {
		if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
			return clear_iso_refractory();
	}
	return grq.iso_refractory;
}
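
/*
 * Reviewer's note: a quick illustration of the hysteresis arithmetic above.
 * With the default sched_iso_cpu of 70 (%), the refractory flag trips once
 * the decayed iso_ticks exceed ISO_PERIOD * 70 and clears again below
 * roughly 90% of that, since 115/128 ~= 0.898. ISO_PERIOD is taken here as
 * 5 seconds' worth of HZ=100 ticks, a simplifying assumption of the sketch.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>
#include <stdbool.h>

#define HZ		100		/* assumption for this sketch */
#define ISO_PERIOD	(5 * HZ)	/* ~5s of ticks, simplified */

static int sched_iso_cpu = 70;		/* default ISO cpu percentage */
static bool iso_refractory;

/* Same shape as test_ret_isorefractory(), minus the locking */
static bool demo_test_refractory(unsigned long iso_ticks)
{
	if (!iso_refractory) {
		if (iso_ticks > (unsigned long)ISO_PERIOD * sched_iso_cpu)
			iso_refractory = true;
	} else {
		if (iso_ticks < (unsigned long)ISO_PERIOD *
		    (sched_iso_cpu * 115 / 128))
			iso_refractory = false;
	}
	return iso_refractory;
}

int main(void)
{
	printf("set threshold:   %d\n", ISO_PERIOD * sched_iso_cpu);
	printf("clear threshold: %d\n",
	       ISO_PERIOD * (sched_iso_cpu * 115 / 128));
	demo_test_refractory(35001);	/* above 35000: flag set */
	printf("refractory after burst: %d\n", iso_refractory);
	demo_test_refractory(30000);	/* below 31000: flag cleared */
	printf("refractory after decay: %d\n", iso_refractory);
	return 0;
}
#endif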
static void iso_tick(void)
{
	grq_iso_lock();
	grq.iso_ticks += 100;
	grq_iso_unlock();
}

/* No SCHED_ISO task was running so decrease grq.iso_ticks */
static inline void no_iso_tick(void)
{
	if (grq.iso_ticks) {
		grq_iso_lock();
		grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
		if (unlikely(grq.iso_refractory && grq.iso_ticks <
			     ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
			clear_iso_refractory();
		grq_iso_unlock();
	}
}

/* This manages tasks that have run out of timeslice during a scheduler_tick */
static void task_running_tick(struct rq *rq)
{
	struct task_struct *p;

	/*
	 * If a SCHED_ISO task is running we increment the iso_ticks. In
	 * order to prevent SCHED_ISO tasks from causing starvation in the
	 * presence of true RT tasks we account those as iso_ticks as well.
	 */
	if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
		if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
			iso_tick();
	} else
		no_iso_tick();

	if (iso_queue(rq)) {
		if (unlikely(test_ret_isorefractory(rq))) {
			if (rq_running_iso(rq)) {
				/*
				 * SCHED_ISO task is running as RT and limit
				 * has been hit. Force it to reschedule as
				 * SCHED_NORMAL by zeroing its time_slice
				 */
				rq->rq_time_slice = 0;
			}
		}
	}

	/* SCHED_FIFO tasks never run out of timeslice. */
	if (rq->rq_policy == SCHED_FIFO)
		return;
	/*
	 * Tasks that were scheduled in the first half of a tick are not
	 * allowed to run into the 2nd half of the next tick if they will
	 * run out of time slice in the interim. Otherwise, if they have
	 * less than RESCHED_US μs of time slice left they will be rescheduled.
	 */
	if (rq->dither) {
		if (rq->rq_time_slice > HALF_JIFFY_US)
			return;
		else
			rq->rq_time_slice = 0;
	} else if (rq->rq_time_slice >= RESCHED_US)
		return;

	/* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
	p = rq->curr;
	grq_lock();
	requeue_task(p);
	set_tsk_need_resched(p);
	grq_unlock();
}

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled. The data modified is all
 * local to struct rq so we don't need to grab grq lock.
 */
void scheduler_tick(void)
{
	int cpu __maybe_unused = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);

	sched_clock_tick();
	/* grq lock not grabbed, so only update rq clock */
	update_rq_clock(rq);
	update_cpu_clock_tick(rq, rq->curr);
	if (!rq_idle(rq))
		task_running_tick(rq);
	else
		no_iso_tick();
	rq->last_tick = rq->clock;
	perf_event_task_tick();
}

notrace unsigned long get_parent_ip(unsigned long addr)
{
	if (in_lock_functions(addr)) {
		addr = CALLER_ADDR2;
		if (in_lock_functions(addr))
			addr = CALLER_ADDR3;
	}
	return addr;
}

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
				defined(CONFIG_PREEMPT_TRACER))
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Spinlock count overflowing soon?
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
			    PREEMPT_MASK - 10);
#endif
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);

void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * Is the spinlock portion underflowing?
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
				!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
#endif

/*
 * Deadline is "now" in niffies + (offset by priority). Setting the deadline
 * is the key to everything. It distributes cpu fairly amongst tasks of the
 * same nice value, it proportions cpu according to nice level, it means the
 * task that last woke up the longest ago has the earliest deadline, thus
 * ensuring that interactive tasks get low latency on wake up. The CPU
 * proportion works out to the square of the virtual deadline difference, so
 * this equation will give nice 19 about 3% CPU compared to nice 0.
 */
static inline u64 prio_deadline_diff(int user_prio)
{
	return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
}

static inline u64 task_deadline_diff(struct task_struct *p)
{
	return prio_deadline_diff(TASK_USER_PRIO(p));
}

static inline u64 static_deadline_diff(int static_prio)
{
	return prio_deadline_diff(USER_PRIO(static_prio));
}

static inline int longest_deadline_diff(void)
{
	return prio_deadline_diff(39);
}

static inline int ms_longest_deadline_diff(void)
{
	return NS_TO_MS(longest_deadline_diff());
}
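
/*
 * Reviewer's note: a rough worked example of prio_deadline_diff(). BFS
 * fills prio_ratios[] at sched_init() by growing ~10% per nice level from
 * a base of 128 at nice -20, and rr_interval defaults to 6ms, so deadline
 * offsets scale from ~6ms at nice -20 up to a few hundred ms at nice 19.
 * The standalone sketch below mirrors that construction; the exact printed
 * values are illustrative, not taken from the patch.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>
#include <stdint.h>

#define PRIO_RANGE	40		/* nice -20..19 maps to 0..39 */
#define MS_TO_NS(x)	((x) * 1000000ULL)

static int prio_ratios[PRIO_RANGE];
static int rr_interval = 6;		/* default BFS timeslice in ms */

/* Same equation as prio_deadline_diff() in the patch */
static uint64_t demo_prio_deadline_diff(int user_prio)
{
	return prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128);
}

int main(void)
{
	int i;

	/* +10% per nice level from a base of 128, as in sched_init() */
	prio_ratios[0] = 128;
	for (i = 1; i < PRIO_RANGE; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

	printf("nice -20 offset: %llu ms\n",
	       (unsigned long long)(demo_prio_deadline_diff(0) / MS_TO_NS(1)));
	printf("nice   0 offset: %llu ms\n",
	       (unsigned long long)(demo_prio_deadline_diff(20) / MS_TO_NS(1)));
	printf("nice  19 offset: %llu ms\n",
	       (unsigned long long)(demo_prio_deadline_diff(39) / MS_TO_NS(1)));
	return 0;
}
#endif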
/*
 * The time_slice is only refilled when it is empty and that is when we set a
 * new deadline.
 */
static void time_slice_expired(struct task_struct *p)
{
	p->time_slice = timeslice();
	p->deadline = grq.niffies + task_deadline_diff(p);
}

/*
 * Timeslices below RESCHED_US are considered as good as expired as there's no
 * point rescheduling when there's so little time left. SCHED_BATCH tasks
 * have been flagged as not latency sensitive and likely to be fully CPU
 * bound so every time they're rescheduled they have their time_slice
 * refilled, but get a new later deadline to have little effect on
 * SCHED_NORMAL tasks.
 */
static inline void check_deadline(struct task_struct *p)
{
	if (p->time_slice < RESCHED_US || batch_task(p))
		time_slice_expired(p);
}

#define BITOP_WORD(nr)	((nr) / BITS_PER_LONG)

/*
 * Scheduler queue bitmap specific find next bit.
 */
static inline unsigned long
next_sched_bit(const unsigned long *addr, unsigned long offset)
{
	const unsigned long *p;
	unsigned long result;
	unsigned long size;
	unsigned long tmp;

	size = PRIO_LIMIT;
	if (offset >= size)
		return size;

	p = addr + BITOP_WORD(offset);
	result = offset & ~(BITS_PER_LONG-1);
	size -= result;
	offset %= BITS_PER_LONG;
	if (offset) {
		tmp = *(p++);
		tmp &= (~0UL << offset);
		if (size < BITS_PER_LONG)
			goto found_first;
		if (tmp)
			goto found_middle;
		size -= BITS_PER_LONG;
		result += BITS_PER_LONG;
	}
	while (size & ~(BITS_PER_LONG-1)) {
		if ((tmp = *(p++)))
			goto found_middle;
		result += BITS_PER_LONG;
		size -= BITS_PER_LONG;
	}
	if (!size)
		return result;
	tmp = *p;

found_first:
	tmp &= (~0UL >> (BITS_PER_LONG - size));
	if (tmp == 0UL)			/* Are any bits set? */
		return result + size;	/* Nope. */
found_middle:
	return result + __ffs(tmp);
}
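
/*
 * Reviewer's note: this is the classic find-next-bit loop specialised to a
 * PRIO_LIMIT-sized bitmap. The standalone sketch below shows how set bits
 * (priority levels with queued tasks) are walked in ascending order; the
 * PRIO_LIMIT of 103 (100 RT levels plus ISO, NORMAL and IDLEPRIO) and the
 * naive bit-by-bit scan are assumptions of the illustration -- the kernel
 * version skips whole zero words at a time, but the result is the same.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG	(8 * (int)sizeof(long))
#define PRIO_LIMIT	103
#define BITMAP_LONGS	((PRIO_LIMIT + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long demo_next_sched_bit(const unsigned long *addr,
					 unsigned long offset)
{
	while (offset < PRIO_LIMIT) {
		if (addr[offset / BITS_PER_LONG] &
		    (1UL << (offset % BITS_PER_LONG)))
			return offset;
		offset++;
	}
	return PRIO_LIMIT;
}

int main(void)
{
	unsigned long bitmap[BITMAP_LONGS];
	unsigned long idx = -1;

	memset(bitmap, 0, sizeof(bitmap));
	/* Pretend priority 5 (an RT level) and 101 (SCHED_NORMAL in this
	 * sketch's numbering) have queued tasks. */
	bitmap[5 / BITS_PER_LONG] |= 1UL << (5 % BITS_PER_LONG);
	bitmap[101 / BITS_PER_LONG] |= 1UL << (101 % BITS_PER_LONG);

	/* Same scan loop shape as earliest_deadline_task() below */
	while ((idx = demo_next_sched_bit(bitmap, ++idx)) < PRIO_LIMIT)
		printf("queued priority level: %lu\n", idx);
	return 0;
}
#endif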
/*
 * O(n) lookup of all tasks in the global runqueue. The real brainfuck
 * of lock contention and O(n). It's not really O(n) in all tasks, as only
 * queued but not running tasks are scanned, and it is only O(n) in the
 * number of queued tasks in the worst case, because the right task may be
 * found before all of them have been scanned.
 * Tasks are selected in this order:
 * Real time tasks are selected purely by their static priority and in the
 * order they were queued, so the lowest value idx, and the first queued task
 * of that priority value is chosen.
 * If no real time tasks are found, the SCHED_ISO priority is checked, and
 * all SCHED_ISO tasks have the same priority value, so they're selected by
 * the earliest deadline value.
 * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
 * earliest deadline.
 * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
 * selected by the earliest deadline.
 */
static inline struct
task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
{
	struct task_struct *edt = NULL;
	unsigned long idx = -1;

	do {
		struct list_head *queue;
		struct task_struct *p;
		u64 earliest_deadline;

		idx = next_sched_bit(grq.prio_bitmap, ++idx);
		if (idx >= PRIO_LIMIT)
			return idle;
		queue = grq.queue + idx;

		if (idx < MAX_RT_PRIO) {
			/* We found an rt task */
			list_for_each_entry(p, queue, run_list) {
				/* Make sure cpu affinity is ok */
				if (needs_other_cpu(p, cpu))
					continue;
				edt = p;
				goto out_take;
			}
			/*
			 * None of the RT tasks at this priority can run on
			 * this cpu
			 */
			continue;
		}

		/*
		 * No rt tasks. Find the earliest deadline task. Now we're in
		 * O(n) territory.
		 */
		earliest_deadline = ~0ULL;
		list_for_each_entry(p, queue, run_list) {
			u64 dl;

			/* Make sure cpu affinity is ok */
			if (needs_other_cpu(p, cpu))
				continue;

			/*
			 * Soft affinity happens here by not scheduling a task
			 * with its sticky flag set that ran on a different CPU
			 * last when the CPU is scaling, or by greatly biasing
			 * against its deadline when not, based on cpu cache
			 * locality.
			 */
			if (task_sticky(p) && task_rq(p) != rq) {
				if (scaling_rq(rq))
					continue;
				dl = p->deadline << locality_diff(p, rq);
			} else
				dl = p->deadline;

			if (deadline_before(dl, earliest_deadline)) {
				earliest_deadline = dl;
				edt = p;
			}
		}
	} while (!edt);

out_take:
	take_task(cpu, edt);
	return edt;
}

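/*
 * Reviewer's note: the left-shift in the sticky-task path above doubles the
 * effective deadline per step of cache distance, so a sticky task that last
 * ran nearby only loses narrowly while one from a remote node is strongly
 * biased against. A toy illustration; the locality values 0/1/2 for
 * same-cache/same-package/other-node are an assumed convention for the
 * sketch, not the patch's actual locality_diff() encoding.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t deadline = 1000000;	/* some niffy-based deadline */
	int locality;

	for (locality = 0; locality <= 2; locality++)
		printf("locality %d -> effective deadline %llu\n",
		       locality,
		       (unsigned long long)(deadline << locality));
	return 0;
}
#endif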
3195 |
|
3196 |
/* |
3197 |
* Print scheduling while atomic bug: |
3198 |
*/ |
3199 |
static noinline void __schedule_bug(struct task_struct *prev) |
3200 |
{ |
3201 |
if (oops_in_progress) |
3202 |
return; |
3203 |
|
3204 |
printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", |
3205 |
prev->comm, prev->pid, preempt_count()); |
3206 |
|
3207 |
debug_show_held_locks(prev); |
3208 |
print_modules(); |
3209 |
if (irqs_disabled()) |
3210 |
print_irqtrace_events(prev); |
3211 |
dump_stack(); |
3212 |
add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
3213 |
} |
3214 |
|
3215 |
/* |
3216 |
* Various schedule()-time debugging checks and statistics: |
3217 |
*/ |
3218 |
static inline void schedule_debug(struct task_struct *prev) |
3219 |
{ |
3220 |
/* |
3221 |
* Test if we are atomic. Since do_exit() needs to call into |
3222 |
* schedule() atomically, we ignore that path for now. |
3223 |
* Otherwise, whine if we are scheduling when we should not be. |
3224 |
*/ |
3225 |
if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) |
3226 |
__schedule_bug(prev); |
3227 |
rcu_sleep_check(); |
3228 |
|
3229 |
profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3230 |
|
3231 |
schedstat_inc(this_rq(), sched_count); |
3232 |
} |
3233 |
|
3234 |
/* |
3235 |
* The currently running task's information is all stored in rq local data |
3236 |
* which is only modified by the local CPU, thereby allowing the data to be |
3237 |
* changed without grabbing the grq lock. |
3238 |
*/ |
3239 |
static inline void set_rq_task(struct rq *rq, struct task_struct *p) |
3240 |
{ |
3241 |
rq->rq_time_slice = p->time_slice; |
3242 |
rq->rq_deadline = p->deadline; |
3243 |
rq->rq_last_ran = p->last_ran = rq->clock_task; |
3244 |
rq->rq_policy = p->policy; |
3245 |
rq->rq_prio = p->prio; |
3246 |
if (p != rq->idle) |
3247 |
rq->rq_running = true; |
3248 |
else |
3249 |
rq->rq_running = false; |
3250 |
} |
3251 |
|
3252 |
static void reset_rq_task(struct rq *rq, struct task_struct *p) |
3253 |
{ |
3254 |
rq->rq_policy = p->policy; |
3255 |
rq->rq_prio = p->prio; |
3256 |
} |
3257 |
|
3258 |
/* |
3259 |
* schedule() is the main scheduler function. |
3260 |
* |
3261 |
* The main means of driving the scheduler and thus entering this function are: |
3262 |
* |
3263 |
* 1. Explicit blocking: mutex, semaphore, waitqueue, etc. |
3264 |
* |
3265 |
* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return |
3266 |
* paths. For example, see arch/x86/entry_64.S. |
3267 |
* |
3268 |
* To drive preemption between tasks, the scheduler sets the flag in timer |
3269 |
* interrupt handler scheduler_tick(). |
3270 |
* |
3271 |
* 3. Wakeups don't really cause entry into schedule(). They add a |
3272 |
* task to the run-queue and that's it. |
3273 |
* |
3274 |
* Now, if the new task added to the run-queue preempts the current |
3275 |
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets |
3276 |
* called on the nearest possible occasion: |
3277 |
* |
3278 |
* - If the kernel is preemptible (CONFIG_PREEMPT=y): |
3279 |
* |
3280 |
* - in syscall or exception context, at the next outmost |
3281 |
* preempt_enable(). (this might be as soon as the wake_up()'s |
3282 |
* spin_unlock()!) |
3283 |
* |
3284 |
* - in IRQ context, return from interrupt-handler to |
3285 |
* preemptible context |
3286 |
* |
3287 |
* - If the kernel is not preemptible (CONFIG_PREEMPT is not set) |
3288 |
* then at the next: |
3289 |
* |
3290 |
* - cond_resched() call |
3291 |
* - explicit schedule() call |
3292 |
* - return from syscall or exception to user-space |
3293 |
* - return from interrupt-handler to user-space |
3294 |
*/ |
3295 |
asmlinkage void __sched schedule(void) |
3296 |
{ |
3297 |
struct task_struct *prev, *next, *idle; |
3298 |
unsigned long *switch_count; |
3299 |
bool deactivate; |
3300 |
struct rq *rq; |
3301 |
int cpu; |
3302 |
|
3303 |
need_resched: |
3304 |
preempt_disable(); |
3305 |
cpu = smp_processor_id(); |
3306 |
rq = cpu_rq(cpu); |
3307 |
rcu_note_context_switch(cpu); |
3308 |
prev = rq->curr; |
3309 |
|
3310 |
deactivate = false; |
3311 |
schedule_debug(prev); |
3312 |
|
3313 |
/* |
3314 |
* Make sure that signal_pending_state()->signal_pending() below |
3315 |
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) |
3316 |
* done by the caller to avoid the race with signal_wake_up(). |
3317 |
*/ |
3318 |
smp_mb__before_spinlock(); |
3319 |
grq_lock_irq(); |
3320 |
|
3321 |
switch_count = &prev->nivcsw; |
3322 |
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3323 |
if (unlikely(signal_pending_state(prev->state, prev))) { |
3324 |
prev->state = TASK_RUNNING; |
3325 |
} else { |
3326 |
deactivate = true; |
3327 |
/* |
3328 |
* If a worker is going to sleep, notify and |
3329 |
* ask workqueue whether it wants to wake up a |
3330 |
* task to maintain concurrency. If so, wake |
3331 |
* up the task. |
3332 |
*/ |
3333 |
if (prev->flags & PF_WQ_WORKER) { |
3334 |
struct task_struct *to_wakeup; |
3335 |
|
3336 |
to_wakeup = wq_worker_sleeping(prev, cpu); |
3337 |
if (to_wakeup) { |
3338 |
/* This shouldn't happen, but does */ |
3339 |
if (unlikely(to_wakeup == prev)) |
3340 |
deactivate = false; |
3341 |
else |
3342 |
try_to_wake_up_local(to_wakeup); |
3343 |
} |
3344 |
} |
3345 |
} |
3346 |
switch_count = &prev->nvcsw; |
3347 |
} |
3348 |
|
3349 |
/* |
3350 |
* If we are going to sleep and we have plugged IO queued, make |
3351 |
* sure to submit it to avoid deadlocks. |
3352 |
*/ |
3353 |
if (unlikely(deactivate && blk_needs_flush_plug(prev))) { |
3354 |
grq_unlock_irq(); |
3355 |
preempt_enable_no_resched(); |
3356 |
blk_schedule_flush_plug(prev); |
3357 |
goto need_resched; |
3358 |
} |
3359 |
|
3360 |
update_clocks(rq); |
3361 |
update_cpu_clock_switch(rq, prev); |
3362 |
if (rq->clock - rq->last_tick > HALF_JIFFY_NS) |
3363 |
rq->dither = false; |
3364 |
else |
3365 |
rq->dither = true; |
3366 |
|
3367 |
clear_tsk_need_resched(prev); |
3368 |
|
3369 |
idle = rq->idle; |
3370 |
if (idle != prev) { |
3371 |
/* Update all the information stored on struct rq */ |
3372 |
prev->time_slice = rq->rq_time_slice; |
3373 |
prev->deadline = rq->rq_deadline; |
3374 |
check_deadline(prev); |
3375 |
prev->last_ran = rq->clock_task; |
3376 |
|
3377 |
/* Task changed affinity off this CPU */ |
3378 |
if (needs_other_cpu(prev, cpu)) { |
3379 |
if (!deactivate) |
3380 |
resched_suitable_idle(prev); |
3381 |
} else if (!deactivate) { |
3382 |
if (!queued_notrunning()) { |
3383 |
/* |
3384 |
* We now know prev is the only thing that is |
3385 |
* awaiting CPU so we can bypass rechecking for |
3386 |
* the earliest deadline task and just run it |
3387 |
* again. |
3388 |
*/ |
3389 |
set_rq_task(rq, prev); |
3390 |
grq_unlock_irq(); |
3391 |
goto rerun_prev_unlocked; |
3392 |
} else |
3393 |
swap_sticky(rq, cpu, prev); |
3394 |
} |
3395 |
return_task(prev, deactivate); |
3396 |
} |
3397 |
|
3398 |
if (unlikely(!queued_notrunning())) { |
3399 |
/* |
3400 |
* This CPU is now truly idle as opposed to when idle is |
3401 |
* scheduled as a high priority task in its own right. |
3402 |
*/ |
3403 |
next = idle; |
3404 |
schedstat_inc(rq, sched_goidle); |
3405 |
set_cpuidle_map(cpu); |
3406 |
} else { |
3407 |
next = earliest_deadline_task(rq, cpu, idle); |
3408 |
if (likely(next->prio != PRIO_LIMIT)) |
3409 |
clear_cpuidle_map(cpu); |
3410 |
else |
3411 |
set_cpuidle_map(cpu); |
3412 |
} |
3413 |
|
3414 |
if (likely(prev != next)) { |
3415 |
resched_suitable_idle(prev); |
3416 |
/* |
3417 |
* Don't stick tasks when a real time task is going to run as |
3418 |
* they may literally get stuck. |
3419 |
*/ |
3420 |
if (rt_task(next)) |
3421 |
unstick_task(rq, prev); |
3422 |
set_rq_task(rq, next); |
3423 |
grq.nr_switches++; |
3424 |
prev->on_cpu = false; |
3425 |
next->on_cpu = true; |
3426 |
rq->curr = next; |
3427 |
++*switch_count; |
3428 |
|
3429 |
context_switch(rq, prev, next); /* unlocks the grq */ |
3430 |
/* |
3431 |
* The context switch has flipped the stack from under us |
3432 |
* and restored the local variables which were saved when |
3433 |
* this task called schedule() in the past. prev == current |
3434 |
* is still correct, but it can be moved to another cpu/rq. |
3435 |
*/ |
3436 |
cpu = smp_processor_id(); |
3437 |
rq = cpu_rq(cpu); |
3438 |
idle = rq->idle; |
3439 |
} else |
3440 |
grq_unlock_irq(); |
3441 |
|
3442 |
rerun_prev_unlocked: |
3443 |
sched_preempt_enable_no_resched(); |
3444 |
if (unlikely(need_resched())) |
3445 |
goto need_resched; |
3446 |
} |
3447 |
EXPORT_SYMBOL(schedule); |
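/*
 * Illustrative sketch (editorial, not part of the patch): with
 * CONFIG_PREEMPT unset, the comment above means a long-running kernel
 * loop only reschedules at explicit points. cond_resched() is the
 * canonical one; do_unit_of_work() is a hypothetical stand-in.
 */
static int example_long_loop(void)
{
	int i;

	for (i = 0; i < 1000000; i++) {
		do_unit_of_work(i);	/* hypothetical work item */
		cond_resched();		/* voluntary reschedule point */
	}
	return 0;
}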
3448 |
|
3449 |
#ifdef CONFIG_RCU_USER_QS |
3450 |
asmlinkage void __sched schedule_user(void) |
3451 |
{ |
3452 |
/* |
3453 |
* If we come here after a random call to set_need_resched(), |
3454 |
* or we have been woken up remotely but the IPI has not yet arrived, |
3455 |
* we haven't yet exited the RCU idle mode. Do it here manually until |
3456 |
* we find a better solution. |
3457 |
*/ |
3458 |
user_exit(); |
3459 |
schedule(); |
3460 |
user_enter(); |
3461 |
} |
3462 |
#endif |
3463 |
|
3464 |
/** |
3465 |
* schedule_preempt_disabled - called with preemption disabled |
3466 |
* |
3467 |
* Returns with preemption disabled. Note: preempt_count must be 1 |
3468 |
*/ |
3469 |
void __sched schedule_preempt_disabled(void) |
3470 |
{ |
3471 |
sched_preempt_enable_no_resched(); |
3472 |
schedule(); |
3473 |
preempt_disable(); |
3474 |
} |
3475 |
|
3476 |
#ifdef CONFIG_PREEMPT |
3477 |
/* |
3478 |
* this is the entry point to schedule() from in-kernel preemption |
3479 |
* off of preempt_enable. Kernel preemptions off the return-from-interrupt |
3480 |
* path occur there and call schedule() directly. |
3481 |
*/ |
3482 |
asmlinkage void __sched notrace preempt_schedule(void) |
3483 |
{ |
3484 |
struct thread_info *ti = current_thread_info(); |
3485 |
|
3486 |
/* |
3487 |
* If there is a non-zero preempt_count or interrupts are disabled, |
3488 |
* we do not want to preempt the current task. Just return. |
3489 |
*/ |
3490 |
if (likely(ti->preempt_count || irqs_disabled())) |
3491 |
return; |
3492 |
|
3493 |
do { |
3494 |
add_preempt_count_notrace(PREEMPT_ACTIVE); |
3495 |
schedule(); |
3496 |
sub_preempt_count_notrace(PREEMPT_ACTIVE); |
3497 |
|
3498 |
/* |
3499 |
* Check again in case we missed a preemption opportunity |
3500 |
* between schedule and now. |
3501 |
*/ |
3502 |
barrier(); |
3503 |
} while (need_resched()); |
3504 |
} |
3505 |
EXPORT_SYMBOL(preempt_schedule); |
3506 |
|
3507 |
/* |
3508 |
* this is the entry point to schedule() from kernel preemption |
3509 |
* off of irq context. |
3510 |
* Note that this is called and returns with irqs disabled. This |
3511 |
* protects us against recursive calls from irq context. |
3512 |
*/ |
3513 |
asmlinkage void __sched preempt_schedule_irq(void) |
3514 |
{ |
3515 |
struct thread_info *ti = current_thread_info(); |
3516 |
enum ctx_state prev_state; |
3517 |
|
3518 |
/* Catch callers which need to be fixed */ |
3519 |
BUG_ON(ti->preempt_count || !irqs_disabled()); |
3520 |
|
3521 |
prev_state = exception_enter(); |
3522 |
|
3523 |
do { |
3524 |
add_preempt_count(PREEMPT_ACTIVE); |
3525 |
local_irq_enable(); |
3526 |
schedule(); |
3527 |
local_irq_disable(); |
3528 |
sub_preempt_count(PREEMPT_ACTIVE); |
3529 |
|
3530 |
/* |
3531 |
* Check again in case we missed a preemption opportunity |
3532 |
* between schedule and now. |
3533 |
*/ |
3534 |
barrier(); |
3535 |
} while (need_resched()); |
3536 |
|
3537 |
exception_exit(prev_state); |
3538 |
} |
3539 |
|
3540 |
#endif /* CONFIG_PREEMPT */ |
3541 |
|
3542 |
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
3543 |
void *key) |
3544 |
{ |
3545 |
return try_to_wake_up(curr->private, mode, wake_flags); |
3546 |
} |
3547 |
EXPORT_SYMBOL(default_wake_function); |
3548 |
|
3549 |
/* |
3550 |
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just |
3551 |
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve |
3552 |
* number) then we wake all the non-exclusive tasks and one exclusive task. |
3553 |
* |
3554 |
* There are circumstances in which we can try to wake a task which has already |
3555 |
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
3556 |
* zero in this (rare) case, and we handle it by continuing to scan the queue. |
3557 |
*/ |
3558 |
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
3559 |
int nr_exclusive, int wake_flags, void *key) |
3560 |
{ |
3561 |
struct list_head *tmp, *next; |
3562 |
|
3563 |
list_for_each_safe(tmp, next, &q->task_list) { |
3564 |
wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); |
3565 |
unsigned int flags = curr->flags; |
3566 |
|
3567 |
if (curr->func(curr, mode, wake_flags, key) && |
3568 |
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
3569 |
break; |
3570 |
} |
3571 |
} |
3572 |
|
3573 |
/** |
3574 |
* __wake_up - wake up threads blocked on a waitqueue. |
3575 |
* @q: the waitqueue |
3576 |
* @mode: which threads |
3577 |
* @nr_exclusive: how many wake-one or wake-many threads to wake up |
3578 |
* @key: is directly passed to the wakeup function |
3579 |
* |
3580 |
* It may be assumed that this function implies a write memory barrier before |
3581 |
* changing the task state if and only if any tasks are woken up. |
3582 |
*/ |
3583 |
void __wake_up(wait_queue_head_t *q, unsigned int mode, |
3584 |
int nr_exclusive, void *key) |
3585 |
{ |
3586 |
unsigned long flags; |
3587 |
|
3588 |
spin_lock_irqsave(&q->lock, flags); |
3589 |
__wake_up_common(q, mode, nr_exclusive, 0, key); |
3590 |
spin_unlock_irqrestore(&q->lock, flags); |
3591 |
} |
3592 |
EXPORT_SYMBOL(__wake_up); |
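/*
 * Minimal waitqueue sketch (editorial, assuming <linux/wait.h>): the
 * wake_up() macro expands to __wake_up(q, TASK_NORMAL, 1, NULL), waking
 * one exclusive waiter plus all non-exclusive ones.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_cond;

static void example_producer(void)
{
	example_cond = 1;
	wake_up(&example_wq);		/* wakes waiters on example_wq */
}

static int example_consumer(void)
{
	/* sleeps in TASK_INTERRUPTIBLE until example_cond is true */
	return wait_event_interruptible(example_wq, example_cond);
}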
3593 |
|
3594 |
/* |
3595 |
* Same as __wake_up but called with the spinlock in wait_queue_head_t held. |
3596 |
*/ |
3597 |
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) |
3598 |
{ |
3599 |
__wake_up_common(q, mode, nr, 0, NULL); |
3600 |
} |
3601 |
EXPORT_SYMBOL_GPL(__wake_up_locked); |
3602 |
|
3603 |
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) |
3604 |
{ |
3605 |
__wake_up_common(q, mode, 1, 0, key); |
3606 |
} |
3607 |
EXPORT_SYMBOL_GPL(__wake_up_locked_key); |
3608 |
|
3609 |
/** |
3610 |
* __wake_up_sync_key - wake up threads blocked on a waitqueue. |
3611 |
* @q: the waitqueue |
3612 |
* @mode: which threads |
3613 |
* @nr_exclusive: how many wake-one or wake-many threads to wake up |
3614 |
* @key: opaque value to be passed to wakeup targets |
3615 |
* |
3616 |
* The sync wakeup differs in that the waker knows that it will schedule |
3617 |
* away soon, so while the target thread will be woken up, it will not |
3618 |
* be migrated to another CPU - ie. the two threads are 'synchronised' |
3619 |
* with each other. This can prevent needless bouncing between CPUs. |
3620 |
* |
3621 |
* On UP it can prevent extra preemption. |
3622 |
* |
3623 |
* It may be assumed that this function implies a write memory barrier before |
3624 |
* changing the task state if and only if any tasks are woken up. |
3625 |
*/ |
3626 |
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, |
3627 |
int nr_exclusive, void *key) |
3628 |
{ |
3629 |
unsigned long flags; |
3630 |
int wake_flags = WF_SYNC; |
3631 |
|
3632 |
if (unlikely(!q)) |
3633 |
return; |
3634 |
|
3635 |
if (unlikely(!nr_exclusive)) |
3636 |
wake_flags = 0; |
3637 |
|
3638 |
spin_lock_irqsave(&q->lock, flags); |
3639 |
__wake_up_common(q, mode, nr_exclusive, wake_flags, key); |
3640 |
spin_unlock_irqrestore(&q->lock, flags); |
3641 |
} |
3642 |
EXPORT_SYMBOL_GPL(__wake_up_sync_key); |
3643 |
|
3644 |
/** |
3645 |
* __wake_up_sync - wake up threads blocked on a waitqueue. |
3646 |
* @q: the waitqueue |
3647 |
* @mode: which threads |
3648 |
* @nr_exclusive: how many wake-one or wake-many threads to wake up |
3649 |
* |
3650 |
* The sync wakeup differs in that the waker knows that it will schedule |
3651 |
* away soon, so while the target thread will be woken up, it will not |
3652 |
* be migrated to another CPU - ie. the two threads are 'synchronised' |
3653 |
* with each other. This can prevent needless bouncing between CPUs. |
3654 |
* |
3655 |
* On UP it can prevent extra preemption. |
3656 |
*/ |
3657 |
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) |
3658 |
{ |
3659 |
unsigned long flags; |
3660 |
int sync = 1; |
3661 |
|
3662 |
if (unlikely(!q)) |
3663 |
return; |
3664 |
|
3665 |
if (unlikely(!nr_exclusive)) |
3666 |
sync = 0; |
3667 |
|
3668 |
spin_lock_irqsave(&q->lock, flags); |
3669 |
__wake_up_common(q, mode, nr_exclusive, sync, NULL); |
3670 |
spin_unlock_irqrestore(&q->lock, flags); |
3671 |
} |
3672 |
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
3673 |
|
3674 |
/** |
3675 |
* complete: - signals a single thread waiting on this completion |
3676 |
* @x: holds the state of this particular completion |
3677 |
* |
3678 |
* This will wake up a single thread waiting on this completion. Threads will be |
3679 |
* awakened in the same order in which they were queued. |
3680 |
* |
3681 |
* See also complete_all(), wait_for_completion() and related routines. |
3682 |
* |
3683 |
* It may be assumed that this function implies a write memory barrier before |
3684 |
* changing the task state if and only if any tasks are woken up. |
3685 |
*/ |
3686 |
void complete(struct completion *x) |
3687 |
{ |
3688 |
unsigned long flags; |
3689 |
|
3690 |
spin_lock_irqsave(&x->wait.lock, flags); |
3691 |
x->done++; |
3692 |
__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); |
3693 |
spin_unlock_irqrestore(&x->wait.lock, flags); |
3694 |
} |
3695 |
EXPORT_SYMBOL(complete); |
3696 |
|
3697 |
/** |
3698 |
* complete_all: - signals all threads waiting on this completion |
3699 |
* @x: holds the state of this particular completion |
3700 |
* |
3701 |
* This will wake up all threads waiting on this particular completion event. |
3702 |
* |
3703 |
* It may be assumed that this function implies a write memory barrier before |
3704 |
* changing the task state if and only if any tasks are woken up. |
3705 |
*/ |
3706 |
void complete_all(struct completion *x) |
3707 |
{ |
3708 |
unsigned long flags; |
3709 |
|
3710 |
spin_lock_irqsave(&x->wait.lock, flags); |
3711 |
x->done += UINT_MAX/2; |
3712 |
__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); |
3713 |
spin_unlock_irqrestore(&x->wait.lock, flags); |
3714 |
} |
3715 |
EXPORT_SYMBOL(complete_all); |
3716 |
|
3717 |
static inline long __sched |
3718 |
do_wait_for_common(struct completion *x, |
3719 |
long (*action)(long), long timeout, int state) |
3720 |
{ |
3721 |
if (!x->done) { |
3722 |
DECLARE_WAITQUEUE(wait, current); |
3723 |
|
3724 |
__add_wait_queue_tail_exclusive(&x->wait, &wait); |
3725 |
do { |
3726 |
if (signal_pending_state(state, current)) { |
3727 |
timeout = -ERESTARTSYS; |
3728 |
break; |
3729 |
} |
3730 |
__set_current_state(state); |
3731 |
spin_unlock_irq(&x->wait.lock); |
3732 |
timeout = action(timeout); |
3733 |
spin_lock_irq(&x->wait.lock); |
3734 |
} while (!x->done && timeout); |
3735 |
__remove_wait_queue(&x->wait, &wait); |
3736 |
if (!x->done) |
3737 |
return timeout; |
3738 |
} |
3739 |
x->done--; |
3740 |
return timeout ?: 1; |
3741 |
} |
3742 |
|
3743 |
static inline long __sched |
3744 |
__wait_for_common(struct completion *x, |
3745 |
long (*action)(long), long timeout, int state) |
3746 |
{ |
3747 |
might_sleep(); |
3748 |
|
3749 |
spin_lock_irq(&x->wait.lock); |
3750 |
timeout = do_wait_for_common(x, action, timeout, state); |
3751 |
spin_unlock_irq(&x->wait.lock); |
3752 |
return timeout; |
3753 |
} |
3754 |
|
3755 |
static long __sched |
3756 |
wait_for_common(struct completion *x, long timeout, int state) |
3757 |
{ |
3758 |
return __wait_for_common(x, schedule_timeout, timeout, state); |
3759 |
} |
3760 |
|
3761 |
static long __sched |
3762 |
wait_for_common_io(struct completion *x, long timeout, int state) |
3763 |
{ |
3764 |
return __wait_for_common(x, io_schedule_timeout, timeout, state); |
3765 |
} |
3766 |
|
3767 |
/** |
3768 |
* wait_for_completion: - waits for completion of a task |
3769 |
* @x: holds the state of this particular completion |
3770 |
* |
3771 |
* This waits to be signaled for completion of a specific task. It is NOT |
3772 |
* interruptible and there is no timeout. |
3773 |
* |
3774 |
* See also similar routines (i.e. wait_for_completion_timeout()) with timeout |
3775 |
* and interrupt capability. Also see complete(). |
3776 |
*/ |
3777 |
void __sched wait_for_completion(struct completion *x) |
3778 |
{ |
3779 |
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); |
3780 |
} |
3781 |
EXPORT_SYMBOL(wait_for_completion); |
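/*
 * Minimal completion sketch (editorial, assuming <linux/completion.h>):
 * one side blocks in wait_for_completion(), the other signals with
 * complete() once its work is finished.
 */
static DECLARE_COMPLETION(example_done);

static void example_worker_fn(void)
{
	/* ... do the actual work ... */
	complete(&example_done);	/* wake one waiter, in FIFO order */
}

static void example_waiter_fn(void)
{
	wait_for_completion(&example_done);	/* uninterruptible wait */
}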
3782 |
|
3783 |
/** |
3784 |
* wait_for_completion_timeout: - waits for completion of a task (w/timeout) |
3785 |
* @x: holds the state of this particular completion |
3786 |
* @timeout: timeout value in jiffies |
3787 |
* |
3788 |
* This waits for either a completion of a specific task to be signaled or for a |
3789 |
* specified timeout to expire. The timeout is in jiffies. It is not |
3790 |
* interruptible. |
3791 |
* |
3792 |
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left |
3793 |
* till timeout) if completed. |
3794 |
*/ |
3795 |
unsigned long __sched |
3796 |
wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
3797 |
{ |
3798 |
return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); |
3799 |
} |
3800 |
EXPORT_SYMBOL(wait_for_completion_timeout); |
3801 |
|
3802 |
/** |
3803 |
* wait_for_completion_io: - waits for completion of a task |
3804 |
* @x: holds the state of this particular completion |
3805 |
* |
3806 |
* This waits to be signaled for completion of a specific task. It is NOT |
3807 |
* interruptible and there is no timeout. The caller is accounted as waiting |
3808 |
* for IO. |
3809 |
*/ |
3810 |
void __sched wait_for_completion_io(struct completion *x) |
3811 |
{ |
3812 |
wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); |
3813 |
} |
3814 |
EXPORT_SYMBOL(wait_for_completion_io); |
3815 |
|
3816 |
/** |
3817 |
* wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) |
3818 |
* @x: holds the state of this particular completion |
3819 |
* @timeout: timeout value in jiffies |
3820 |
* |
3821 |
* This waits for either a completion of a specific task to be signaled or for a |
3822 |
* specified timeout to expire. The timeout is in jiffies. It is not |
3823 |
* interruptible. The caller is accounted as waiting for IO. |
3824 |
* |
3825 |
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left |
3826 |
* till timeout) if completed. |
3827 |
*/ |
3828 |
unsigned long __sched |
3829 |
wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) |
3830 |
{ |
3831 |
return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); |
3832 |
} |
3833 |
EXPORT_SYMBOL(wait_for_completion_io_timeout); |
3834 |
|
3835 |
/** |
3836 |
* wait_for_completion_interruptible: - waits for completion of a task (w/intr) |
3837 |
* @x: holds the state of this particular completion |
3838 |
* |
3839 |
* This waits for completion of a specific task to be signaled. It is |
3840 |
* interruptible. |
3841 |
* |
3842 |
* Return: -ERESTARTSYS if interrupted, 0 if completed. |
3843 |
*/ |
3844 |
int __sched wait_for_completion_interruptible(struct completion *x) |
3845 |
{ |
3846 |
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
3847 |
if (t == -ERESTARTSYS) |
3848 |
return t; |
3849 |
return 0; |
3850 |
} |
3851 |
EXPORT_SYMBOL(wait_for_completion_interruptible); |
3852 |
|
3853 |
/** |
3854 |
* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) |
3855 |
* @x: holds the state of this particular completion |
3856 |
* @timeout: timeout value in jiffies |
3857 |
* |
3858 |
* This waits for either a completion of a specific task to be signaled or for a |
3859 |
* specified timeout to expire. It is interruptible. The timeout is in jiffies. |
3860 |
* |
3861 |
* Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, |
3862 |
* or number of jiffies left till timeout) if completed. |
3863 |
*/ |
3864 |
long __sched |
3865 |
wait_for_completion_interruptible_timeout(struct completion *x, |
3866 |
unsigned long timeout) |
3867 |
{ |
3868 |
return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); |
3869 |
} |
3870 |
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
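/*
 * Return-value handling sketch (editorial): the three possible outcomes
 * of the interruptible, timed wait documented above. example_done is the
 * completion from the earlier sketch.
 */
static int example_timed_wait(void)
{
	long t = wait_for_completion_interruptible_timeout(&example_done,
							   msecs_to_jiffies(500));
	if (t == -ERESTARTSYS)
		return -EINTR;		/* interrupted by a signal */
	if (t == 0)
		return -ETIMEDOUT;	/* 500ms elapsed, not completed */
	return 0;			/* completed; t jiffies were left */
}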
3871 |
|
3872 |
/** |
3873 |
* wait_for_completion_killable: - waits for completion of a task (killable) |
3874 |
* @x: holds the state of this particular completion |
3875 |
* |
3876 |
* This waits to be signaled for completion of a specific task. It can be |
3877 |
* interrupted by a kill signal. |
3878 |
* |
3879 |
* Return: -ERESTARTSYS if interrupted, 0 if completed. |
3880 |
*/ |
3881 |
int __sched wait_for_completion_killable(struct completion *x) |
3882 |
{ |
3883 |
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); |
3884 |
if (t == -ERESTARTSYS) |
3885 |
return t; |
3886 |
return 0; |
3887 |
} |
3888 |
EXPORT_SYMBOL(wait_for_completion_killable); |
3889 |
|
3890 |
/** |
3891 |
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) |
3892 |
* @x: holds the state of this particular completion |
3893 |
* @timeout: timeout value in jiffies |
3894 |
* |
3895 |
* This waits for either a completion of a specific task to be |
3896 |
* signaled or for a specified timeout to expire. It can be |
3897 |
* interrupted by a kill signal. The timeout is in jiffies. |
3898 |
* |
3899 |
* Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, |
3900 |
* or number of jiffies left till timeout) if completed. |
3901 |
*/ |
3902 |
long __sched |
3903 |
wait_for_completion_killable_timeout(struct completion *x, |
3904 |
unsigned long timeout) |
3905 |
{ |
3906 |
return wait_for_common(x, timeout, TASK_KILLABLE); |
3907 |
} |
3908 |
EXPORT_SYMBOL(wait_for_completion_killable_timeout); |
3909 |
|
3910 |
/** |
3911 |
* try_wait_for_completion - try to decrement a completion without blocking |
3912 |
* @x: completion structure |
3913 |
* |
3914 |
* Return: 0 if a decrement cannot be done without blocking |
3915 |
* 1 if a decrement succeeded. |
3916 |
* |
3917 |
* If a completion is being used as a counting completion, |
3918 |
* attempt to decrement the counter without blocking. This |
3919 |
* enables us to avoid waiting if the resource the completion |
3920 |
* is protecting is not available. |
3921 |
*/ |
3922 |
bool try_wait_for_completion(struct completion *x) |
3923 |
{ |
3924 |
unsigned long flags; |
3925 |
int ret = 1; |
3926 |
|
3927 |
spin_lock_irqsave(&x->wait.lock, flags); |
3928 |
if (!x->done) |
3929 |
ret = 0; |
3930 |
else |
3931 |
x->done--; |
3932 |
spin_unlock_irqrestore(&x->wait.lock, flags); |
3933 |
return ret; |
3934 |
} |
3935 |
EXPORT_SYMBOL(try_wait_for_completion); |
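/*
 * Non-blocking sketch (editorial): using a completion as a counting
 * resource, as the comment above describes. The caller falls back to
 * other work instead of sleeping when no count is available.
 */
static bool example_try_grab(void)
{
	if (try_wait_for_completion(&example_done))
		return true;		/* decremented ->done, got a unit */
	return false;			/* would have blocked; do not wait */
}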
3936 |
|
3937 |
/** |
3938 |
* completion_done - Test to see if a completion has any waiters |
3939 |
* @x: completion structure |
3940 |
* |
3941 |
* Return: 0 if there are waiters (wait_for_completion() in progress) |
3942 |
* 1 if there are no waiters. |
3943 |
* |
3944 |
*/ |
3945 |
bool completion_done(struct completion *x) |
3946 |
{ |
3947 |
unsigned long flags; |
3948 |
int ret = 1; |
3949 |
|
3950 |
spin_lock_irqsave(&x->wait.lock, flags); |
3951 |
if (!x->done) |
3952 |
ret = 0; |
3953 |
spin_unlock_irqrestore(&x->wait.lock, flags); |
3954 |
return ret; |
3955 |
} |
3956 |
EXPORT_SYMBOL(completion_done); |
3957 |
|
3958 |
static long __sched |
3959 |
sleep_on_common(wait_queue_head_t *q, int state, long timeout) |
3960 |
{ |
3961 |
unsigned long flags; |
3962 |
wait_queue_t wait; |
3963 |
|
3964 |
init_waitqueue_entry(&wait, current); |
3965 |
|
3966 |
__set_current_state(state); |
3967 |
|
3968 |
spin_lock_irqsave(&q->lock, flags); |
3969 |
__add_wait_queue(q, &wait); |
3970 |
spin_unlock(&q->lock); |
3971 |
timeout = schedule_timeout(timeout); |
3972 |
spin_lock_irq(&q->lock); |
3973 |
__remove_wait_queue(q, &wait); |
3974 |
spin_unlock_irqrestore(&q->lock, flags); |
3975 |
|
3976 |
return timeout; |
3977 |
} |
3978 |
|
3979 |
void __sched interruptible_sleep_on(wait_queue_head_t *q) |
3980 |
{ |
3981 |
sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
3982 |
} |
3983 |
EXPORT_SYMBOL(interruptible_sleep_on); |
3984 |
|
3985 |
long __sched |
3986 |
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3987 |
{ |
3988 |
return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); |
3989 |
} |
3990 |
EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
3991 |
|
3992 |
void __sched sleep_on(wait_queue_head_t *q) |
3993 |
{ |
3994 |
sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
3995 |
} |
3996 |
EXPORT_SYMBOL(sleep_on); |
3997 |
|
3998 |
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3999 |
{ |
4000 |
return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); |
4001 |
} |
4002 |
EXPORT_SYMBOL(sleep_on_timeout); |
4003 |
|
4004 |
#ifdef CONFIG_RT_MUTEXES |
4005 |
|
4006 |
/* |
4007 |
* rt_mutex_setprio - set the current priority of a task |
4008 |
* @p: task |
4009 |
* @prio: prio value (kernel-internal form) |
4010 |
* |
4011 |
* This function changes the 'effective' priority of a task. It does |
4012 |
* not touch ->normal_prio like __setscheduler(). |
4013 |
* |
4014 |
* Used by the rt_mutex code to implement priority inheritance logic. |
4015 |
*/ |
4016 |
void rt_mutex_setprio(struct task_struct *p, int prio) |
4017 |
{ |
4018 |
unsigned long flags; |
4019 |
int queued, oldprio; |
4020 |
struct rq *rq; |
4021 |
|
4022 |
BUG_ON(prio < 0 || prio > MAX_PRIO); |
4023 |
|
4024 |
rq = task_grq_lock(p, &flags); |
4025 |
|
4026 |
/* |
4027 |
* Idle task boosting is a no-no in general. There is one |
4028 |
* exception, when PREEMPT_RT and NOHZ are active: |
4029 |
* |
4030 |
* The idle task calls get_next_timer_interrupt() and holds |
4031 |
* the timer wheel base->lock on the CPU and another CPU wants |
4032 |
* to access the timer (probably to cancel it). We can safely |
4033 |
* ignore the boosting request, as the idle CPU runs this code |
4034 |
* with interrupts disabled and will complete the lock |
4035 |
* protected section without being interrupted. So there is no |
4036 |
* real need to boost. |
4037 |
*/ |
4038 |
if (unlikely(p == rq->idle)) { |
4039 |
WARN_ON(p != rq->curr); |
4040 |
WARN_ON(p->pi_blocked_on); |
4041 |
goto out_unlock; |
4042 |
} |
4043 |
|
4044 |
trace_sched_pi_setprio(p, prio); |
4045 |
oldprio = p->prio; |
4046 |
queued = task_queued(p); |
4047 |
if (queued) |
4048 |
dequeue_task(p); |
4049 |
p->prio = prio; |
4050 |
if (task_running(p) && prio > oldprio) |
4051 |
resched_task(p); |
4052 |
if (queued) { |
4053 |
enqueue_task(p); |
4054 |
try_preempt(p, rq); |
4055 |
} |
4056 |
|
4057 |
out_unlock: |
4058 |
task_grq_unlock(&flags); |
4059 |
} |
4060 |
|
4061 |
#endif |
4062 |
|
4063 |
/* |
4064 |
* Adjust the deadline for when the priority is to change, before it's |
4065 |
* changed. |
4066 |
*/ |
4067 |
static inline void adjust_deadline(struct task_struct *p, int new_prio) |
4068 |
{ |
4069 |
p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); |
4070 |
} |
4071 |
|
4072 |
void set_user_nice(struct task_struct *p, long nice) |
4073 |
{ |
4074 |
int queued, new_static, old_static; |
4075 |
unsigned long flags; |
4076 |
struct rq *rq; |
4077 |
|
4078 |
if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
4079 |
return; |
4080 |
new_static = NICE_TO_PRIO(nice); |
4081 |
/* |
4082 |
* We have to be careful, if called from sys_setpriority(), |
4083 |
* the task might be in the middle of scheduling on another CPU. |
4084 |
*/ |
4085 |
rq = time_task_grq_lock(p, &flags); |
4086 |
/* |
4087 |
* The RT priorities are set via sched_setscheduler(), but we still |
4088 |
* allow the 'normal' nice value to be set - but as expected |
4089 |
* it won't have any effect on scheduling until the task returns |
4090 |
* to a SCHED_NORMAL/SCHED_BATCH policy: |
4091 |
*/ |
4092 |
if (has_rt_policy(p)) { |
4093 |
p->static_prio = new_static; |
4094 |
goto out_unlock; |
4095 |
} |
4096 |
queued = task_queued(p); |
4097 |
if (queued) |
4098 |
dequeue_task(p); |
4099 |
|
4100 |
adjust_deadline(p, new_static); |
4101 |
old_static = p->static_prio; |
4102 |
p->static_prio = new_static; |
4103 |
p->prio = effective_prio(p); |
4104 |
|
4105 |
if (queued) { |
4106 |
enqueue_task(p); |
4107 |
if (new_static < old_static) |
4108 |
try_preempt(p, rq); |
4109 |
} else if (task_running(p)) { |
4110 |
reset_rq_task(rq, p); |
4111 |
if (old_static < new_static) |
4112 |
resched_task(p); |
4113 |
} |
4114 |
out_unlock: |
4115 |
task_grq_unlock(&flags); |
4116 |
} |
4117 |
EXPORT_SYMBOL(set_user_nice); |
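/*
 * In-kernel sketch (editorial, assuming <linux/kthread.h>): a kernel
 * thread demoting itself to background priority via set_user_nice().
 */
static int example_kthread_fn(void *unused)
{
	set_user_nice(current, 10);	/* nice +10: lower priority */
	while (!kthread_should_stop())
		cond_resched();		/* placeholder for real work */
	return 0;
}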
4118 |
|
4119 |
/* |
4120 |
* can_nice - check if a task can reduce its nice value |
4121 |
* @p: task |
4122 |
* @nice: nice value |
4123 |
*/ |
4124 |
int can_nice(const struct task_struct *p, const int nice) |
4125 |
{ |
4126 |
/* convert nice value [19,-20] to rlimit style value [1,40] */ |
4127 |
int nice_rlim = 20 - nice; |
4128 |
|
4129 |
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
4130 |
capable(CAP_SYS_NICE)); |
4131 |
} |
4132 |
|
4133 |
#ifdef __ARCH_WANT_SYS_NICE |
4134 |
|
4135 |
/* |
4136 |
* sys_nice - change the priority of the current process. |
4137 |
* @increment: priority increment |
4138 |
* |
4139 |
* sys_setpriority is a more generic, but much slower function that |
4140 |
* does similar things. |
4141 |
*/ |
4142 |
SYSCALL_DEFINE1(nice, int, increment) |
4143 |
{ |
4144 |
long nice, retval; |
4145 |
|
4146 |
/* |
4147 |
* Setpriority might change our priority at the same moment. |
4148 |
* We don't have to worry. Conceptually one call occurs first |
4149 |
* and we have a single winner. |
4150 |
*/ |
4151 |
if (increment < -40) |
4152 |
increment = -40; |
4153 |
if (increment > 40) |
4154 |
increment = 40; |
4155 |
|
4156 |
nice = TASK_NICE(current) + increment; |
4157 |
if (nice < -20) |
4158 |
nice = -20; |
4159 |
if (nice > 19) |
4160 |
nice = 19; |
4161 |
|
4162 |
if (increment < 0 && !can_nice(current, nice)) |
4163 |
return -EPERM; |
4164 |
|
4165 |
retval = security_task_setnice(current, nice); |
4166 |
if (retval) |
4167 |
return retval; |
4168 |
|
4169 |
set_user_nice(current, nice); |
4170 |
return 0; |
4171 |
} |
4172 |
|
4173 |
#endif |
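/*
 * Userspace view (editorial, illustrative): nice(2) returns the new
 * nice value; raising it needs no privilege, while lowering it is gated
 * by can_nice() above (RLIMIT_NICE or CAP_SYS_NICE).
 *
 *	#include <unistd.h>
 *	#include <stdio.h>
 *	#include <errno.h>
 *
 *	int main(void)
 *	{
 *		errno = 0;
 *		int n = nice(5);
 *		if (n == -1 && errno)
 *			perror("nice");
 *		else
 *			printf("new nice value: %d\n", n);
 *		return 0;
 *	}
 */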
4174 |
|
4175 |
/** |
4176 |
* task_prio - return the priority value of a given task. |
4177 |
* @p: the task in question. |
4178 |
* |
4179 |
* Return: The priority value as seen by users in /proc. |
4180 |
* RT tasks are offset by -100. Normal tasks are centered around 1, value goes |
4181 |
* from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). |
4182 |
*/ |
4183 |
int task_prio(const struct task_struct *p) |
4184 |
{ |
4185 |
int delta, prio = p->prio - MAX_RT_PRIO; |
4186 |
|
4187 |
/* rt tasks and iso tasks */ |
4188 |
if (prio <= 0) |
4189 |
goto out; |
4190 |
|
4191 |
/* Convert to ms to avoid overflows */ |
4192 |
delta = NS_TO_MS(p->deadline - grq.niffies); |
4193 |
delta = delta * 40 / ms_longest_deadline_diff(); |
4194 |
if (delta > 0 && delta <= 80) |
4195 |
prio += delta; |
4196 |
if (idleprio_task(p)) |
4197 |
prio += 40; |
4198 |
out: |
4199 |
return prio; |
4200 |
} |
4201 |
|
4202 |
/** |
4203 |
* task_nice - return the nice value of a given task. |
4204 |
* @p: the task in question. |
4205 |
* |
4206 |
* Return: The nice value [ -20 ... 0 ... 19 ]. |
4207 |
*/ |
4208 |
int task_nice(const struct task_struct *p) |
4209 |
{ |
4210 |
return TASK_NICE(p); |
4211 |
} |
4212 |
EXPORT_SYMBOL_GPL(task_nice); |
4213 |
|
4214 |
/** |
4215 |
* idle_cpu - is a given cpu idle currently? |
4216 |
* @cpu: the processor in question. |
4217 |
* |
4218 |
* Return: 1 if the CPU is currently idle. 0 otherwise. |
4219 |
*/ |
4220 |
int idle_cpu(int cpu) |
4221 |
{ |
4222 |
#ifdef CONFIG_SMP |
4223 |
struct rq *rq = cpu_rq(cpu); |
4224 |
|
4225 |
if (!llist_empty(&rq->wake_list)) |
4226 |
return 0; |
4227 |
#endif |
4228 |
return cpu_curr(cpu) == cpu_rq(cpu)->idle; |
4229 |
} |
4230 |
|
4231 |
/** |
4232 |
* idle_task - return the idle task for a given cpu. |
4233 |
* @cpu: the processor in question. |
4234 |
* |
4235 |
* Return: The idle task for the cpu @cpu. |
4236 |
*/ |
4237 |
struct task_struct *idle_task(int cpu) |
4238 |
{ |
4239 |
return cpu_rq(cpu)->idle; |
4240 |
} |
4241 |
|
4242 |
/** |
4243 |
* find_process_by_pid - find a process with a matching PID value. |
4244 |
* @pid: the pid in question. |
4245 |
* |
4246 |
* Return: the task of @pid, if found. %NULL otherwise. |
4247 |
*/ |
4248 |
static inline struct task_struct *find_process_by_pid(pid_t pid) |
4249 |
{ |
4250 |
return pid ? find_task_by_vpid(pid) : current; |
4251 |
} |
4252 |
|
4253 |
/* Actually do priority change: must hold grq lock. */ |
4254 |
static void |
4255 |
__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio) |
4256 |
{ |
4257 |
int oldrtprio, oldprio; |
4258 |
|
4259 |
p->policy = policy; |
4260 |
oldrtprio = p->rt_priority; |
4261 |
p->rt_priority = prio; |
4262 |
p->normal_prio = normal_prio(p); |
4263 |
oldprio = p->prio; |
4264 |
/* we are holding p->pi_lock already */ |
4265 |
p->prio = rt_mutex_getprio(p); |
4266 |
if (task_running(p)) { |
4267 |
reset_rq_task(rq, p); |
4268 |
/* Resched only if we might now be preempted */ |
4269 |
if (p->prio > oldprio || p->rt_priority > oldrtprio) |
4270 |
resched_task(p); |
4271 |
} |
4272 |
} |
4273 |
|
4274 |
/* |
4275 |
* Check whether the target process has a UID that matches the current process's. |
4276 |
*/ |
4277 |
static bool check_same_owner(struct task_struct *p) |
4278 |
{ |
4279 |
const struct cred *cred = current_cred(), *pcred; |
4280 |
bool match; |
4281 |
|
4282 |
rcu_read_lock(); |
4283 |
pcred = __task_cred(p); |
4284 |
match = (uid_eq(cred->euid, pcred->euid) || |
4285 |
uid_eq(cred->euid, pcred->uid)); |
4286 |
rcu_read_unlock(); |
4287 |
return match; |
4288 |
} |
4289 |
|
4290 |
static int __sched_setscheduler(struct task_struct *p, int policy, |
4291 |
const struct sched_param *param, bool user) |
4292 |
{ |
4293 |
struct sched_param zero_param = { .sched_priority = 0 }; |
4294 |
int queued, retval, oldpolicy = -1; |
4295 |
unsigned long flags, rlim_rtprio = 0; |
4296 |
int reset_on_fork; |
4297 |
struct rq *rq; |
4298 |
|
4299 |
/* may grab non-irq protected spin_locks */ |
4300 |
BUG_ON(in_interrupt()); |
4301 |
|
4302 |
if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { |
4303 |
unsigned long lflags; |
4304 |
|
4305 |
if (!lock_task_sighand(p, &lflags)) |
4306 |
return -ESRCH; |
4307 |
rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); |
4308 |
unlock_task_sighand(p, &lflags); |
4309 |
if (rlim_rtprio) |
4310 |
goto recheck; |
4311 |
/* |
4312 |
* If the caller requested an RT policy without having the |
4313 |
* necessary rights, we downgrade the policy to SCHED_ISO. |
4314 |
* We also set the parameter to zero to pass the checks. |
4315 |
*/ |
4316 |
policy = SCHED_ISO; |
4317 |
param = &zero_param; |
4318 |
} |
4319 |
recheck: |
4320 |
/* double check policy once rq lock held */ |
4321 |
if (policy < 0) { |
4322 |
reset_on_fork = p->sched_reset_on_fork; |
4323 |
policy = oldpolicy = p->policy; |
4324 |
} else { |
4325 |
reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
4326 |
policy &= ~SCHED_RESET_ON_FORK; |
4327 |
|
4328 |
if (!SCHED_RANGE(policy)) |
4329 |
return -EINVAL; |
4330 |
} |
4331 |
|
4332 |
/* |
4333 |
* Valid priorities for SCHED_FIFO and SCHED_RR are |
4334 |
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and |
4335 |
* SCHED_BATCH is 0. |
4336 |
*/ |
4337 |
if (param->sched_priority < 0 || |
4338 |
(p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || |
4339 |
(!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) |
4340 |
return -EINVAL; |
4341 |
if (is_rt_policy(policy) != (param->sched_priority != 0)) |
4342 |
return -EINVAL; |
4343 |
|
4344 |
/* |
4345 |
* Allow unprivileged RT tasks to decrease priority: |
4346 |
*/ |
4347 |
if (user && !capable(CAP_SYS_NICE)) { |
4348 |
if (is_rt_policy(policy)) { |
4349 |
unsigned long rlim_rtprio = |
4350 |
task_rlimit(p, RLIMIT_RTPRIO); |
4351 |
|
4352 |
/* can't set/change the rt policy */ |
4353 |
if (policy != p->policy && !rlim_rtprio) |
4354 |
return -EPERM; |
4355 |
|
4356 |
/* can't increase priority */ |
4357 |
if (param->sched_priority > p->rt_priority && |
4358 |
param->sched_priority > rlim_rtprio) |
4359 |
return -EPERM; |
4360 |
} else { |
4361 |
switch (p->policy) { |
4362 |
/* |
4363 |
* Can only downgrade policies but not back to |
4364 |
* SCHED_NORMAL |
4365 |
*/ |
4366 |
case SCHED_ISO: |
4367 |
if (policy == SCHED_ISO) |
4368 |
goto out; |
4369 |
if (policy == SCHED_NORMAL) |
4370 |
return -EPERM; |
4371 |
break; |
4372 |
case SCHED_BATCH: |
4373 |
if (policy == SCHED_BATCH) |
4374 |
goto out; |
4375 |
if (policy != SCHED_IDLEPRIO) |
4376 |
return -EPERM; |
4377 |
break; |
4378 |
case SCHED_IDLEPRIO: |
4379 |
if (policy == SCHED_IDLEPRIO) |
4380 |
goto out; |
4381 |
return -EPERM; |
4382 |
default: |
4383 |
break; |
4384 |
} |
4385 |
} |
4386 |
|
4387 |
/* can't change other user's priorities */ |
4388 |
if (!check_same_owner(p)) |
4389 |
return -EPERM; |
4390 |
|
4391 |
/* Normal users shall not reset the sched_reset_on_fork flag */ |
4392 |
if (p->sched_reset_on_fork && !reset_on_fork) |
4393 |
return -EPERM; |
4394 |
} |
4395 |
|
4396 |
if (user) { |
4397 |
retval = security_task_setscheduler(p); |
4398 |
if (retval) |
4399 |
return retval; |
4400 |
} |
4401 |
|
4402 |
/* |
4403 |
* make sure no PI-waiters arrive (or leave) while we are |
4404 |
* changing the priority of the task: |
4405 |
*/ |
4406 |
raw_spin_lock_irqsave(&p->pi_lock, flags); |
4407 |
/* |
4408 |
* To be able to change p->policy safely, the grunqueue lock must be |
4409 |
* held. |
4410 |
*/ |
4411 |
rq = __task_grq_lock(p); |
4412 |
|
4413 |
/* |
4414 |
* Changing the policy of the stop threads is a very bad idea. |
4415 |
*/ |
4416 |
if (p == rq->stop) { |
4417 |
__task_grq_unlock(); |
4418 |
raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4419 |
return -EINVAL; |
4420 |
} |
4421 |
|
4422 |
/* |
4423 |
* If not changing anything there's no need to proceed further: |
4424 |
*/ |
4425 |
if (unlikely(policy == p->policy && (!is_rt_policy(policy) || |
4426 |
param->sched_priority == p->rt_priority))) { |
4427 |
|
4428 |
__task_grq_unlock(); |
4429 |
raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4430 |
return 0; |
4431 |
} |
4432 |
|
4433 |
/* recheck policy now with rq lock held */ |
4434 |
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4435 |
policy = oldpolicy = -1; |
4436 |
__task_grq_unlock(); |
4437 |
raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4438 |
goto recheck; |
4439 |
} |
4440 |
update_clocks(rq); |
4441 |
p->sched_reset_on_fork = reset_on_fork; |
4442 |
|
4443 |
queued = task_queued(p); |
4444 |
if (queued) |
4445 |
dequeue_task(p); |
4446 |
__setscheduler(p, rq, policy, param->sched_priority); |
4447 |
if (queued) { |
4448 |
enqueue_task(p); |
4449 |
try_preempt(p, rq); |
4450 |
} |
4451 |
__task_grq_unlock(); |
4452 |
raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4453 |
|
4454 |
rt_mutex_adjust_pi(p); |
4455 |
out: |
4456 |
return 0; |
4457 |
} |
4458 |
|
4459 |
/** |
4460 |
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
4461 |
* @p: the task in question. |
4462 |
* @policy: new policy. |
4463 |
* @param: structure containing the new RT priority. |
4464 |
* |
4465 |
* Return: 0 on success. An error code otherwise. |
4466 |
* |
4467 |
* NOTE that the task may be already dead. |
4468 |
*/ |
4469 |
int sched_setscheduler(struct task_struct *p, int policy, |
4470 |
const struct sched_param *param) |
4471 |
{ |
4472 |
return __sched_setscheduler(p, policy, param, true); |
4473 |
} |
4474 |
|
4475 |
EXPORT_SYMBOL_GPL(sched_setscheduler); |
4476 |
|
4477 |
/** |
4478 |
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. |
4479 |
* @p: the task in question. |
4480 |
* @policy: new policy. |
4481 |
* @param: structure containing the new RT priority. |
4482 |
* |
4483 |
* Just like sched_setscheduler, only don't bother checking if the |
4484 |
* current context has permission. For example, this is needed in |
4485 |
* stop_machine(): we create temporary high priority worker threads, |
4486 |
* but our caller might not have that capability. |
4487 |
* |
4488 |
* Return: 0 on success. An error code otherwise. |
4489 |
*/ |
4490 |
int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
4491 |
const struct sched_param *param) |
4492 |
{ |
4493 |
return __sched_setscheduler(p, policy, param, false); |
4494 |
} |
4495 |
|
4496 |
static int |
4497 |
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
4498 |
{ |
4499 |
struct sched_param lparam; |
4500 |
struct task_struct *p; |
4501 |
int retval; |
4502 |
|
4503 |
if (!param || pid < 0) |
4504 |
return -EINVAL; |
4505 |
if (copy_from_user(&lparam, param, sizeof(struct sched_param))) |
4506 |
return -EFAULT; |
4507 |
|
4508 |
rcu_read_lock(); |
4509 |
retval = -ESRCH; |
4510 |
p = find_process_by_pid(pid); |
4511 |
if (p != NULL) |
4512 |
retval = sched_setscheduler(p, policy, &lparam); |
4513 |
rcu_read_unlock(); |
4514 |
|
4515 |
return retval; |
4516 |
} |
4517 |
|
4518 |
/** |
4519 |
* sys_sched_setscheduler - set/change the scheduler policy and RT priority |
4520 |
* @pid: the pid in question. |
4521 |
* @policy: new policy. |
4522 |
* @param: structure containing the new RT priority. |
4523 |
* |
4524 |
* Return: 0 on success. An error code otherwise. |
4525 |
*/ |
4526 |
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, |
4527 |
struct sched_param __user *param) |
4528 |
{ |
4529 |
/* negative values for policy are not valid */ |
4530 |
if (policy < 0) |
4531 |
return -EINVAL; |
4532 |
|
4533 |
return do_sched_setscheduler(pid, policy, param); |
4534 |
} |
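/*
 * Userspace view (editorial, illustrative): under this patch, an
 * unprivileged request for an RT policy with a zero RLIMIT_RTPRIO is
 * downgraded to SCHED_ISO by __sched_setscheduler() rather than being
 * rejected outright.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		struct sched_param sp = { .sched_priority = 10 };
 *
 *		if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
 *			perror("sched_setscheduler");
 *		printf("policy now: %d\n", sched_getscheduler(0));
 *		return 0;
 *	}
 */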
4535 |
|
4536 |
/** |
4537 |
* sys_sched_setparam - set/change the RT priority of a thread |
4538 |
* @pid: the pid in question. |
4539 |
* @param: structure containing the new RT priority. |
4540 |
* |
4541 |
* Return: 0 on success. An error code otherwise. |
4542 |
*/ |
4543 |
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) |
4544 |
{ |
4545 |
return do_sched_setscheduler(pid, -1, param); |
4546 |
} |
4547 |
|
4548 |
/** |
4549 |
* sys_sched_getscheduler - get the policy (scheduling class) of a thread |
4550 |
* @pid: the pid in question. |
4551 |
* |
4552 |
* Return: On success, the policy of the thread. Otherwise, a negative error |
4553 |
* code. |
4554 |
*/ |
4555 |
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) |
4556 |
{ |
4557 |
struct task_struct *p; |
4558 |
int retval = -EINVAL; |
4559 |
|
4560 |
if (pid < 0) |
4561 |
goto out_nounlock; |
4562 |
|
4563 |
retval = -ESRCH; |
4564 |
rcu_read_lock(); |
4565 |
p = find_process_by_pid(pid); |
4566 |
if (p) { |
4567 |
retval = security_task_getscheduler(p); |
4568 |
if (!retval) |
4569 |
retval = p->policy; |
4570 |
} |
4571 |
rcu_read_unlock(); |
4572 |
|
4573 |
out_nounlock: |
4574 |
return retval; |
4575 |
} |
4576 |
|
4577 |
/** |
4578 |
* sys_sched_getparam - get the RT priority of a thread |
4579 |
* @pid: the pid in question. |
4580 |
* @param: structure containing the RT priority. |
4581 |
* |
4582 |
* Return: On success, 0 and the RT priority is in @param. Otherwise, an error |
4583 |
* code. |
4584 |
*/ |
4585 |
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) |
4586 |
{ |
4587 |
struct sched_param lp; |
4588 |
struct task_struct *p; |
4589 |
int retval = -EINVAL; |
4590 |
|
4591 |
if (!param || pid < 0) |
4592 |
goto out_nounlock; |
4593 |
|
4594 |
rcu_read_lock(); |
4595 |
p = find_process_by_pid(pid); |
4596 |
retval = -ESRCH; |
4597 |
if (!p) |
4598 |
goto out_unlock; |
4599 |
|
4600 |
retval = security_task_getscheduler(p); |
4601 |
if (retval) |
4602 |
goto out_unlock; |
4603 |
|
4604 |
lp.sched_priority = p->rt_priority; |
4605 |
rcu_read_unlock(); |
4606 |
|
4607 |
/* |
4608 |
* This one might sleep; we cannot do it with a spinlock held... |
4609 |
*/ |
4610 |
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
4611 |
|
4612 |
out_nounlock: |
4613 |
return retval; |
4614 |
|
4615 |
out_unlock: |
4616 |
rcu_read_unlock(); |
4617 |
return retval; |
4618 |
} |
4619 |
|
4620 |
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) |
4621 |
{ |
4622 |
cpumask_var_t cpus_allowed, new_mask; |
4623 |
struct task_struct *p; |
4624 |
int retval; |
4625 |
|
4626 |
get_online_cpus(); |
4627 |
rcu_read_lock(); |
4628 |
|
4629 |
p = find_process_by_pid(pid); |
4630 |
if (!p) { |
4631 |
rcu_read_unlock(); |
4632 |
put_online_cpus(); |
4633 |
return -ESRCH; |
4634 |
} |
4635 |
|
4636 |
/* Prevent p going away */ |
4637 |
get_task_struct(p); |
4638 |
rcu_read_unlock(); |
4639 |
|
4640 |
if (p->flags & PF_NO_SETAFFINITY) { |
4641 |
retval = -EINVAL; |
4642 |
goto out_put_task; |
4643 |
} |
4644 |
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { |
4645 |
retval = -ENOMEM; |
4646 |
goto out_put_task; |
4647 |
} |
4648 |
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { |
4649 |
retval = -ENOMEM; |
4650 |
goto out_free_cpus_allowed; |
4651 |
} |
4652 |
retval = -EPERM; |
4653 |
if (!check_same_owner(p)) { |
4654 |
rcu_read_lock(); |
4655 |
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { |
4656 |
rcu_read_unlock(); |
4657 |
goto out_unlock; |
4658 |
} |
4659 |
rcu_read_unlock(); |
4660 |
} |
4661 |
|
4662 |
retval = security_task_setscheduler(p); |
4663 |
if (retval) |
4664 |
goto out_unlock; |
4665 |
|
4666 |
cpuset_cpus_allowed(p, cpus_allowed); |
4667 |
cpumask_and(new_mask, in_mask, cpus_allowed); |
4668 |
again: |
4669 |
retval = set_cpus_allowed_ptr(p, new_mask); |
4670 |
|
4671 |
if (!retval) { |
4672 |
cpuset_cpus_allowed(p, cpus_allowed); |
4673 |
if (!cpumask_subset(new_mask, cpus_allowed)) { |
4674 |
/* |
4675 |
* We must have raced with a concurrent cpuset |
4676 |
* update. Just reset the cpus_allowed to the |
4677 |
* cpuset's cpus_allowed |
4678 |
*/ |
4679 |
cpumask_copy(new_mask, cpus_allowed); |
4680 |
goto again; |
4681 |
} |
4682 |
} |
4683 |
out_unlock: |
4684 |
free_cpumask_var(new_mask); |
4685 |
out_free_cpus_allowed: |
4686 |
free_cpumask_var(cpus_allowed); |
4687 |
out_put_task: |
4688 |
put_task_struct(p); |
4689 |
put_online_cpus(); |
4690 |
return retval; |
4691 |
} |
4692 |
|
4693 |
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, |
4694 |
cpumask_t *new_mask) |
4695 |
{ |
4696 |
if (len < sizeof(cpumask_t)) { |
4697 |
memset(new_mask, 0, sizeof(cpumask_t)); |
4698 |
} else if (len > sizeof(cpumask_t)) { |
4699 |
len = sizeof(cpumask_t); |
4700 |
} |
4701 |
return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; |
4702 |
} |
4703 |
|
4704 |
|
4705 |
/** |
4706 |
* sys_sched_setaffinity - set the cpu affinity of a process |
4707 |
* @pid: pid of the process |
4708 |
* @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4709 |
* @user_mask_ptr: user-space pointer to the new cpu mask |
4710 |
* |
4711 |
* Return: 0 on success. An error code otherwise. |
4712 |
*/ |
4713 |
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, |
4714 |
unsigned long __user *, user_mask_ptr) |
4715 |
{ |
4716 |
cpumask_var_t new_mask; |
4717 |
int retval; |
4718 |
|
4719 |
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) |
4720 |
return -ENOMEM; |
4721 |
|
4722 |
retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); |
4723 |
if (retval == 0) |
4724 |
retval = sched_setaffinity(pid, new_mask); |
4725 |
free_cpumask_var(new_mask); |
4726 |
return retval; |
4727 |
} |
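/*
 * Userspace view (editorial, illustrative): pinning the calling thread
 * to CPU 0 through the syscall defined above.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		cpu_set_t set;
 *
 *		CPU_ZERO(&set);
 *		CPU_SET(0, &set);
 *		if (sched_setaffinity(0, sizeof(set), &set) == -1)
 *			perror("sched_setaffinity");
 *		return 0;
 *	}
 */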
4728 |
|
4729 |
long sched_getaffinity(pid_t pid, cpumask_t *mask) |
4730 |
{ |
4731 |
struct task_struct *p; |
4732 |
unsigned long flags; |
4733 |
int retval; |
4734 |
|
4735 |
get_online_cpus(); |
4736 |
rcu_read_lock(); |
4737 |
|
4738 |
retval = -ESRCH; |
4739 |
p = find_process_by_pid(pid); |
4740 |
if (!p) |
4741 |
goto out_unlock; |
4742 |
|
4743 |
retval = security_task_getscheduler(p); |
4744 |
if (retval) |
4745 |
goto out_unlock; |
4746 |
|
4747 |
grq_lock_irqsave(&flags); |
4748 |
cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask); |
4749 |
grq_unlock_irqrestore(&flags); |
4750 |
|
4751 |
out_unlock: |
4752 |
rcu_read_unlock(); |
4753 |
put_online_cpus(); |
4754 |
|
4755 |
return retval; |
4756 |
} |
4757 |
|
4758 |
/** |
4759 |
* sys_sched_getaffinity - get the cpu affinity of a process |
4760 |
* @pid: pid of the process |
4761 |
* @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4762 |
* @user_mask_ptr: user-space pointer to hold the current cpu mask |
4763 |
* |
4764 |
* Return: 0 on success. An error code otherwise. |
4765 |
*/ |
4766 |
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, |
4767 |
unsigned long __user *, user_mask_ptr) |
4768 |
{ |
4769 |
int ret; |
4770 |
cpumask_var_t mask; |
4771 |
|
4772 |
if ((len * BITS_PER_BYTE) < nr_cpu_ids) |
4773 |
return -EINVAL; |
4774 |
if (len & (sizeof(unsigned long)-1)) |
4775 |
return -EINVAL; |
4776 |
|
4777 |
if (!alloc_cpumask_var(&mask, GFP_KERNEL)) |
4778 |
return -ENOMEM; |
4779 |
|
4780 |
ret = sched_getaffinity(pid, mask); |
4781 |
if (ret == 0) { |
4782 |
size_t retlen = min_t(size_t, len, cpumask_size()); |
4783 |
|
4784 |
if (copy_to_user(user_mask_ptr, mask, retlen)) |
4785 |
ret = -EFAULT; |
4786 |
else |
4787 |
ret = retlen; |
4788 |
} |
4789 |
free_cpumask_var(mask); |
4790 |
|
4791 |
return ret; |
4792 |
} |
4793 |
|
4794 |
/** |
4795 |
* sys_sched_yield - yield the current processor to other threads. |
4796 |
* |
4797 |
* This function yields the current CPU to other tasks. It does this by |
4798 |
* scheduling away the current task. If it still has the earliest deadline |
4799 |
* it will be scheduled again as the next task. |
4800 |
* |
4801 |
* Return: 0. |
4802 |
*/ |
4803 |
SYSCALL_DEFINE0(sched_yield) |
4804 |
{ |
4805 |
struct task_struct *p; |
4806 |
|
4807 |
p = current; |
4808 |
grq_lock_irq(); |
4809 |
schedstat_inc(task_rq(p), yld_count); |
4810 |
requeue_task(p); |
4811 |
|
4812 |
/* |
4813 |
* Since we are going to call schedule() anyway, there's |
4814 |
* no need to preempt or enable interrupts: |
4815 |
*/ |
4816 |
__release(grq.lock); |
4817 |
spin_release(&grq.lock.dep_map, 1, _THIS_IP_); |
4818 |
do_raw_spin_unlock(&grq.lock); |
4819 |
sched_preempt_enable_no_resched(); |
4820 |
|
4821 |
schedule(); |
4822 |
|
4823 |
return 0; |
4824 |
} |
4825 |
|
4826 |
static inline bool should_resched(void) |
4827 |
{ |
4828 |
return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); |
4829 |
} |
4830 |
|
4831 |
static void __cond_resched(void) |
4832 |
{ |
4833 |
add_preempt_count(PREEMPT_ACTIVE); |
4834 |
schedule(); |
4835 |
sub_preempt_count(PREEMPT_ACTIVE); |
4836 |
} |
4837 |
|
4838 |
int __sched _cond_resched(void) |
4839 |
{ |
4840 |
if (should_resched()) { |
4841 |
__cond_resched(); |
4842 |
return 1; |
4843 |
} |
4844 |
return 0; |
4845 |
} |
4846 |
EXPORT_SYMBOL(_cond_resched); |
4847 |
|
4848 |
/* |
4849 |
* __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
4850 |
* call schedule, and on return reacquire the lock. |
4851 |
* |
4852 |
* This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
4853 |
* operations here to prevent schedule() from being called twice (once via |
4854 |
* spin_unlock(), once by hand). |
4855 |
*/ |
4856 |
int __cond_resched_lock(spinlock_t *lock) |
4857 |
{ |
4858 |
int resched = should_resched(); |
4859 |
int ret = 0; |
4860 |
|
4861 |
lockdep_assert_held(lock); |
4862 |
|
4863 |
if (spin_needbreak(lock) || resched) { |
4864 |
spin_unlock(lock); |
4865 |
if (resched) |
4866 |
__cond_resched(); |
4867 |
else |
4868 |
cpu_relax(); |
4869 |
ret = 1; |
4870 |
spin_lock(lock); |
4871 |
} |
4872 |
return ret; |
4873 |
} |
4874 |
EXPORT_SYMBOL(__cond_resched_lock); |
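/*
 * Usage sketch (editorial): walking a long list under a spinlock while
 * staying preemptible via the cond_resched_lock() wrapper. Names are
 * hypothetical; the key point is that a non-zero return means the lock
 * was dropped and retaken, so state derived under the lock must be
 * revalidated.
 */
static void example_scan(struct list_head *head, spinlock_t *lock)
{
	struct list_head *pos;

	spin_lock(lock);
	list_for_each(pos, head) {
		/* ... examine pos ... */
		if (cond_resched_lock(lock)) {
			/* lock was released; restart the walk
			 * (entries may be revisited) */
			pos = head;
		}
	}
	spin_unlock(lock);
}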
4875 |
|
4876 |
int __sched __cond_resched_softirq(void) |
4877 |
{ |
4878 |
BUG_ON(!in_softirq()); |
4879 |
|
4880 |
if (should_resched()) { |
4881 |
local_bh_enable(); |
4882 |
__cond_resched(); |
4883 |
local_bh_disable(); |
4884 |
return 1; |
4885 |
} |
4886 |
return 0; |
4887 |
} |
4888 |
EXPORT_SYMBOL(__cond_resched_softirq); |
4889 |
|
4890 |
/** |
4891 |
* yield - yield the current processor to other threads. |
4892 |
* |
4893 |
* Do not ever use this function, there's a 99% chance you're doing it wrong. |
4894 |
* |
4895 |
* The scheduler is at all times free to pick the calling task as the most |
4896 |
* eligible task to run; if removing the yield() call from your code breaks |
4897 |
* it, it's already broken. |
4898 |
* |
4899 |
* Typical broken usage is: |
4900 |
* |
4901 |
* while (!event) |
4902 |
* yield(); |
4903 |
* |
4904 |
* where one assumes that yield() will let 'the other' process run that will |
4905 |
* make event true. If the current task is a SCHED_FIFO task that will never |
4906 |
* happen. Never use yield() as a progress guarantee!! |
4907 |
* |
4908 |
* If you want to use yield() to wait for something, use wait_event(). |
4909 |
* If you want to use yield() to be 'nice' for others, use cond_resched(). |
4910 |
* If you still want to use yield(), do not! |
4911 |
*/ |
4912 |
void __sched yield(void) |
4913 |
{ |
4914 |
set_current_state(TASK_RUNNING); |
4915 |
sys_sched_yield(); |
4916 |
} |
4917 |
EXPORT_SYMBOL(yield); |
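/*
 * Editorial sketch: the broken pattern described above, rewritten with a
 * waitqueue as the comment recommends. example_wq/example_cond are the
 * hypothetical pair from the earlier waitqueue sketch.
 */
static void example_wait_for_event(void)
{
	/* instead of: while (!example_cond) yield(); */
	wait_event(example_wq, example_cond);
}

static void example_set_event(void)
{
	example_cond = 1;
	wake_up(&example_wq);
}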
4918 |
|
4919 |
/** |
4920 |
* yield_to - yield the current processor to another thread in |
4921 |
* your thread group, or accelerate that thread toward the |
4922 |
* processor it's on. |
4923 |
* @p: target task |
4924 |
* @preempt: whether task preemption is allowed or not |
4925 |
* |
4926 |
* It's the caller's job to ensure that the target task struct |
4927 |
* can't go away on us before we can do any checks. |
4928 |
* |
4929 |
* Return: |
4930 |
* true (>0) if we indeed boosted the target task. |
4931 |
* false (0) if we failed to boost the target. |
4932 |
* -ESRCH if there's no task to yield to. |
4933 |
*/ |
4934 |
bool __sched yield_to(struct task_struct *p, bool preempt) |
4935 |
{ |
4936 |
unsigned long flags; |
4937 |
int yielded = 0; |
4938 |
struct rq *rq; |
4939 |
|
4940 |
rq = this_rq(); |
4941 |
grq_lock_irqsave(&flags); |
4942 |
if (task_running(p) || p->state) { |
4943 |
yielded = -ESRCH; |
4944 |
goto out_unlock; |
4945 |
} |
4946 |
yielded = 1; |
4947 |
if (p->deadline > rq->rq_deadline) |
4948 |
p->deadline = rq->rq_deadline; |
4949 |
p->time_slice += rq->rq_time_slice; |
4950 |
rq->rq_time_slice = 0; |
4951 |
if (p->time_slice > timeslice()) |
4952 |
p->time_slice = timeslice(); |
4953 |
set_tsk_need_resched(rq->curr); |
4954 |
out_unlock: |
4955 |
grq_unlock_irqrestore(&flags); |
4956 |
|
4957 |
if (yielded > 0) |
4958 |
schedule(); |
4959 |
return yielded; |
4960 |
} |
4961 |
EXPORT_SYMBOL_GPL(yield_to); |
4962 |
|
4963 |
/* |
4964 |
* This task is about to go to sleep on IO. Increment rq->nr_iowait so |
4965 |
* that process accounting knows that this is a task in IO wait state. |
4966 |
* |
4967 |
* But don't do that if it is a deliberate, throttling IO wait (this task |
4968 |
* has set its backing_dev_info: the queue against which it should throttle) |
4969 |
*/ |
4970 |
void __sched io_schedule(void) |
4971 |
{ |
4972 |
struct rq *rq = raw_rq(); |
4973 |
|
4974 |
delayacct_blkio_start(); |
4975 |
atomic_inc(&rq->nr_iowait); |
4976 |
blk_flush_plug(current); |
4977 |
current->in_iowait = 1; |
4978 |
schedule(); |
4979 |
current->in_iowait = 0; |
4980 |
atomic_dec(&rq->nr_iowait); |
4981 |
delayacct_blkio_end(); |
4982 |
} |
4983 |
EXPORT_SYMBOL(io_schedule); |
4984 |
|
4985 |
long __sched io_schedule_timeout(long timeout) |
4986 |
{ |
4987 |
struct rq *rq = raw_rq(); |
4988 |
long ret; |
4989 |
|
4990 |
delayacct_blkio_start(); |
4991 |
atomic_inc(&rq->nr_iowait); |
4992 |
blk_flush_plug(current); |
4993 |
current->in_iowait = 1; |
4994 |
ret = schedule_timeout(timeout); |
4995 |
current->in_iowait = 0; |
4996 |
atomic_dec(&rq->nr_iowait); |
4997 |
delayacct_blkio_end(); |
4998 |
return ret; |
4999 |
} |
5000 |
|
5001 |
/** |
5002 |
* sys_sched_get_priority_max - return maximum RT priority. |
5003 |
* @policy: scheduling class. |
5004 |
* |
5005 |
* Return: On success, this syscall returns the maximum |
5006 |
* rt_priority that can be used by a given scheduling class. |
5007 |
* On failure, a negative error code is returned. |
5008 |
*/ |
5009 |
SYSCALL_DEFINE1(sched_get_priority_max, int, policy) |
5010 |
{ |
5011 |
int ret = -EINVAL; |
5012 |
|
5013 |
switch (policy) { |
5014 |
case SCHED_FIFO: |
5015 |
case SCHED_RR: |
5016 |
ret = MAX_USER_RT_PRIO-1; |
5017 |
break; |
5018 |
case SCHED_NORMAL: |
5019 |
case SCHED_BATCH: |
5020 |
case SCHED_ISO: |
5021 |
case SCHED_IDLEPRIO: |
5022 |
ret = 0; |
5023 |
break; |
5024 |
} |
5025 |
return ret; |
5026 |
} |
5027 |
|
5028 |
/** |
5029 |
* sys_sched_get_priority_min - return minimum RT priority. |
5030 |
* @policy: scheduling class. |
5031 |
* |
5032 |
* Return: On success, this syscall returns the minimum |
5033 |
* rt_priority that can be used by a given scheduling class. |
5034 |
* On failure, a negative error code is returned. |
5035 |
*/ |
5036 |
SYSCALL_DEFINE1(sched_get_priority_min, int, policy) |
5037 |
{ |
5038 |
int ret = -EINVAL; |
5039 |
|
5040 |
switch (policy) { |
5041 |
case SCHED_FIFO: |
5042 |
case SCHED_RR: |
5043 |
ret = 1; |
5044 |
break; |
5045 |
case SCHED_NORMAL: |
5046 |
case SCHED_BATCH: |
5047 |
case SCHED_ISO: |
5048 |
case SCHED_IDLEPRIO: |
5049 |
ret = 0; |
5050 |
break; |
5051 |
} |
5052 |
return ret; |
5053 |
} |
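/*
 * Userspace view (editorial, illustrative): querying the static priority
 * ranges the two syscalls above report.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		printf("SCHED_FIFO: %d..%d\n",
 *		       sched_get_priority_min(SCHED_FIFO),
 *		       sched_get_priority_max(SCHED_FIFO));
 *		printf("SCHED_OTHER: %d..%d\n",
 *		       sched_get_priority_min(SCHED_OTHER),
 *		       sched_get_priority_max(SCHED_OTHER));
 *		return 0;
 *	}
 */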
5054 |
|
5055 |
/** |
5056 |
* sys_sched_rr_get_interval - return the default timeslice of a process. |
5057 |
* @pid: pid of the process. |
5058 |
* @interval: userspace pointer to the timeslice value. |
5059 |
* |
5061 |
* Return: On success, 0 and the timeslice is in @interval. Otherwise, |
5062 |
* an error code. |
5063 |
*/ |
5064 |
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, |
5065 |
struct timespec __user *, interval) |
5066 |
{ |
5067 |
struct task_struct *p; |
5068 |
unsigned int time_slice; |
5069 |
unsigned long flags; |
5070 |
int retval; |
5071 |
struct timespec t; |
5072 |
|
5073 |
if (pid < 0) |
5074 |
return -EINVAL; |
5075 |
|
5076 |
retval = -ESRCH; |
5077 |
rcu_read_lock(); |
5078 |
p = find_process_by_pid(pid); |
5079 |
if (!p) |
5080 |
goto out_unlock; |
5081 |
|
5082 |
retval = security_task_getscheduler(p); |
5083 |
if (retval) |
5084 |
goto out_unlock; |
5085 |
|
5086 |
grq_lock_irqsave(&flags); |
5087 |
time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); |
5088 |
grq_unlock_irqrestore(&flags); |
5089 |
|
5090 |
rcu_read_unlock(); |
5091 |
t = ns_to_timespec(time_slice); |
5092 |
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
5093 |
return retval; |
5094 |
|
5095 |
out_unlock: |
5096 |
rcu_read_unlock(); |
5097 |
return retval; |
5098 |
} |
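/*
 * Userspace view (editorial, illustrative): reading the caller's
 * timeslice; under this patch the value comes from task_timeslice(),
 * and is reported as 0 for SCHED_FIFO tasks.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct timespec ts;
 *
 *		if (sched_rr_get_interval(0, &ts) == 0)
 *			printf("timeslice: %ld.%09ld s\n",
 *			       (long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}
 */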

static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

void sched_show_task(struct task_struct *p)
{
	unsigned long free = 0;
	int ppid;
	unsigned state;

	state = p->state ? __ffs(p->state) + 1 : 0;
	printk(KERN_INFO "%-15.15s %c", p->comm,
		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running  ");
	else
		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
#else
	if (state == TASK_RUNNING)
		printk(KERN_CONT "  running task    ");
	else
		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
	free = stack_not_used(p);
#endif
	rcu_read_lock();
	ppid = task_pid_nr(rcu_dereference(p->real_parent));
	rcu_read_unlock();
	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
		task_pid_nr(p), ppid,
		(unsigned long)task_thread_info(p)->flags);

	print_worker_info(KERN_INFO, p);
	show_stack(p, NULL);
}

void show_state_filter(unsigned long state_filter)
{
	struct task_struct *g, *p;

#if BITS_PER_LONG == 32
	printk(KERN_INFO
		"  task                PC stack   pid father\n");
#else
	printk(KERN_INFO
		"  task                        PC stack   pid father\n");
#endif
	rcu_read_lock();
	do_each_thread(g, p) {
		/*
		 * reset the NMI-timeout, listing all files on a slow
		 * console might take a lot of time:
		 */
		touch_nmi_watchdog();
		if (!state_filter || (p->state & state_filter))
			sched_show_task(p);
	} while_each_thread(g, p);

	touch_all_softlockup_watchdogs();

	rcu_read_unlock();
	/*
	 * Only show locks if all tasks are dumped:
	 */
	if (!state_filter)
		debug_show_all_locks();
}

void dump_cpu_task(int cpu)
{
	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}

#ifdef CONFIG_SMP
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(tsk_cpus_allowed(p), new_mask);
}
#endif

/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
void init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	time_grq_lock(rq, &flags);
	idle->last_ran = rq->clock_task;
	idle->state = TASK_RUNNING;
	/* Setting prio to illegal value shouldn't matter when never queued */
	idle->prio = PRIO_LIMIT;
	set_rq_task(rq, idle);
	do_set_cpus_allowed(idle, &cpumask_of_cpu(cpu));
	/* Silence PROVE_RCU */
	rcu_read_lock();
	set_task_cpu(idle, cpu);
	rcu_read_unlock();
	rq->curr = rq->idle = idle;
	idle->on_cpu = 1;
	grq_unlock_irqrestore(&flags);

	/* Set the preempt count _outside_ the spinlocks! */
	task_thread_info(idle)->preempt_count = 0;

	ftrace_graph_init_idle_task(idle, cpu);
#if defined(CONFIG_SMP)
	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
void nohz_balance_enter_idle(int cpu)
{
}

void select_nohz_load_balancer(int stop_tick)
{
}

void set_cpu_sd_state_idle(void) {}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
 * lowest_flag_domain - Return lowest sched_domain containing flag.
 * @cpu:	The cpu whose lowest level of sched domain is to
 *		be returned.
 * @flag:	The flag to check for the lowest sched_domain
 *		for the given cpu.
 *
 * Returns the lowest sched_domain of a cpu which contains the given flag.
 */
static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
{
	struct sched_domain *sd;

	for_each_domain(cpu, sd)
		if (sd && (sd->flags & flag))
			break;

	return sd;
}

/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		variable holding the value of the power_savings_sd
 *		for cpu.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
 * set, starting from the lowest sched_domain to the highest.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for (sd = lowest_flag_domain(cpu, flag); \
		(sd && (sd->flags & flag)); sd = sd->parent)

#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
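A hypothetical usage sketch of the iterator above; the function name and purpose are illustrative only and not part of the patch. Domain walks of this kind are done under rcu_read_lock(), as elsewhere in this file.

	/* Illustrative only: count this cpu's domains that carry `flag'. */
	static int count_flag_domains(int cpu, int flag)
	{
		struct sched_domain *sd;
		int n = 0;

		rcu_read_lock();
		for_each_flag_domain(cpu, sd, flag)
			n++;
		rcu_read_unlock();

		return n;
	}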

static inline void resched_cpu(int cpu)
{
	unsigned long flags;

	grq_lock_irqsave(&flags);
	resched_task(cpu_curr(cpu));
	grq_unlock_irqrestore(&flags);
}

/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int cpu = smp_processor_id();
	int i;
	struct sched_domain *sd;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
void wake_up_idle_cpu(int cpu)
{
	struct task_struct *idle;
	struct rq *rq;

	if (cpu == smp_processor_id())
		return;

	rq = cpu_rq(cpu);
	idle = rq->idle;

	/*
	 * This is safe, as this function is called with the timer
	 * wheel base lock of (cpu) held. When the CPU is on the way
	 * to idle and has not yet set rq->curr to idle then it will
	 * be serialised on the timer wheel base lock and take the new
	 * timer into account automatically.
	 */
	if (unlikely(rq->curr != idle))
		return;

	/*
	 * We can set TIF_RESCHED on the idle task of the other CPU
	 * lockless. The worst case is that the other CPU runs the
	 * idle task through an additional NOOP schedule()
	 */
	set_tsk_need_resched(idle);

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(idle))
		smp_send_reschedule(cpu);
}

void wake_up_nohz_cpu(int cpu)
{
	wake_up_idle_cpu(cpu);
}
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	bool running_wrong = false;
	bool queued = false;
	unsigned long flags;
	struct rq *rq;
	int ret = 0;

	rq = task_grq_lock(p, &flags);

	if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
		goto out;

	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
		ret = -EINVAL;
		goto out;
	}

	queued = task_queued(p);

	do_set_cpus_allowed(p, new_mask);

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	if (task_running(p)) {
		/* Task is running on the wrong cpu now, reschedule it. */
		if (rq == this_rq()) {
			set_tsk_need_resched(p);
			running_wrong = true;
		} else
			resched_task(p);
	} else
		set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask));

out:
	if (queued)
		try_preempt(p, rq);
	task_grq_unlock(&flags);

	if (running_wrong)
		_cond_resched();

	return ret;
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
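From userspace this path is reached through sched_setaffinity(). A minimal sketch pinning the calling thread to CPU 0, assuming _GNU_SOURCE for the cpu_set_t macros:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(0, &set);

		/* pid 0: the calling thread; the kernel lands in the code above. */
		if (sched_setaffinity(0, sizeof(set), &set))
			perror("sched_setaffinity");
		return 0;
	}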

#ifdef CONFIG_HOTPLUG_CPU
extern struct task_struct *cpu_stopper_task;
/* Run through task list and find tasks affined to just the dead cpu, then
 * allocate a new affinity */
static void break_sole_affinity(int src_cpu, struct task_struct *idle)
{
	struct task_struct *p, *t, *stopper;

	stopper = per_cpu(cpu_stopper_task, src_cpu);
	do_each_thread(t, p) {
		if (p != stopper && p != idle && !online_cpus(p)) {
			cpumask_copy(tsk_cpus_allowed(p), cpu_possible_mask);
			/*
			 * Don't tell them about moving exiting tasks or
			 * kernel threads (both mm NULL), since they never
			 * leave kernel.
			 */
			if (p->mm && printk_ratelimit()) {
				printk(KERN_INFO "process %d (%s) no "
					"longer affine to cpu %d\n",
					task_pid_nr(p), p->comm, src_cpu);
			}
		}
		clear_sticky(p);
	} while_each_thread(t, p);
}

/*
 * Ensures that the idle task is using init_mm right before its cpu goes
 * offline.
 */
void idle_task_exit(void)
{
	struct mm_struct *mm = current->active_mm;

	BUG_ON(cpu_online(smp_processor_id()));

	if (mm != &init_mm)
		switch_mm(mm, &init_mm, current);
	mmdrop(mm);
}
#endif /* CONFIG_HOTPLUG_CPU */
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param stop_param = { .sched_priority = STOP_PRIO };
	struct sched_param start_param = { .sched_priority = 0 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling policy so that
		 * it can die in pieces.
		 */
		sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
	}
}


#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)

static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{}
};

static struct ctl_table sd_ctl_root[] = {
	{
		.procname	= "kernel",
		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{}
};

static struct ctl_table *sd_alloc_ctl_entry(int n)
{
	struct ctl_table *entry =
		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);

	return entry;
}

static void sd_free_ctl_entry(struct ctl_table **tablep)
{
	struct ctl_table *entry;

	/*
	 * In the intermediate directories, both the child directory and
	 * procname are dynamically allocated and could fail but the mode
	 * will always be set. In the lowest directory the names are
	 * static strings and all have proc handlers.
	 */
	for (entry = *tablep; entry->mode; entry++) {
		if (entry->child)
			sd_free_ctl_entry(&entry->child);
		if (entry->proc_handler == NULL)
			kfree(entry->procname);
	}

	kfree(*tablep);
	*tablep = NULL;
}

static void
set_table_entry(struct ctl_table *entry,
		const char *procname, void *data, int maxlen,
		mode_t mode, proc_handler *proc_handler)
{
	entry->procname = procname;
	entry->data = data;
	entry->maxlen = maxlen;
	entry->mode = mode;
	entry->proc_handler = proc_handler;
}

static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
	struct ctl_table *table = sd_alloc_ctl_entry(13);

	if (table == NULL)
		return NULL;

	set_table_entry(&table[0], "min_interval", &sd->min_interval,
		sizeof(long), 0644, proc_doulongvec_minmax);
	set_table_entry(&table[1], "max_interval", &sd->max_interval,
		sizeof(long), 0644, proc_doulongvec_minmax);
	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[9], "cache_nice_tries",
		&sd->cache_nice_tries,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[10], "flags", &sd->flags,
		sizeof(int), 0644, proc_dointvec_minmax);
	set_table_entry(&table[11], "name", sd->name,
		CORENAME_MAX_SIZE, 0444, proc_dostring);
	/* &table[12] is terminator */

	return table;
}

static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
{
	struct ctl_table *entry, *table;
	struct sched_domain *sd;
	int domain_num = 0, i;
	char buf[32];

	for_each_domain(cpu, sd)
		domain_num++;
	entry = table = sd_alloc_ctl_entry(domain_num + 1);
	if (table == NULL)
		return NULL;

	i = 0;
	for_each_domain(cpu, sd) {
		snprintf(buf, 32, "domain%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
		entry->mode = 0555;
		entry->child = sd_alloc_ctl_domain_table(sd);
		entry++;
		i++;
	}
	return table;
}

static struct ctl_table_header *sd_sysctl_header;
static void register_sched_domain_sysctl(void)
{
	int i, cpu_num = num_possible_cpus();
	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
	char buf[32];

	WARN_ON(sd_ctl_dir[0].child);
	sd_ctl_dir[0].child = entry;

	if (entry == NULL)
		return;

	for_each_possible_cpu(i) {
		snprintf(buf, 32, "cpu%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
		entry->mode = 0555;
		entry->child = sd_alloc_ctl_cpu_table(i);
		entry++;
	}

	WARN_ON(sd_sysctl_header);
	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
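Once registered, the table surfaces under /proc/sys/kernel/sched_domain/. A small userspace sketch reading one entry; the exact cpuN/domainN layout depends on the topology that was built, so the path is illustrative.

	#include <stdio.h>

	int main(void)
	{
		char buf[64];
		FILE *f = fopen("/proc/sys/kernel/sched_domain/cpu0/domain0/name", "r");

		if (f) {
			if (fgets(buf, sizeof(buf), f))
				printf("cpu0 domain0 name: %s", buf);
			fclose(f);
		} else {
			perror("fopen");	/* no such domain on this box */
		}
		return 0;
	}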

/* may be called multiple times per register */
static void unregister_sched_domain_sysctl(void)
{
	if (sd_sysctl_header)
		unregister_sysctl_table(sd_sysctl_header);
	sd_sysctl_header = NULL;
	if (sd_ctl_dir[0].child)
		sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#else
static void register_sched_domain_sysctl(void)
{
}
static void unregister_sched_domain_sysctl(void)
{
}
#endif

static void set_rq_online(struct rq *rq)
{
	if (!rq->online) {
		cpumask_set_cpu(cpu_of(rq), rq->rd->online);
		rq->online = true;
	}
}

static void set_rq_offline(struct rq *rq)
{
	if (rq->online) {
		cpumask_clear_cpu(cpu_of(rq), rq->rd->online);
		rq->online = false;
	}
}

/*
 * migration_call - callback that gets triggered when a CPU is added.
 */
static int
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;
	unsigned long flags;
	struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_HOTPLUG_CPU
	struct task_struct *idle = rq->idle;
#endif

	switch (action & ~CPU_TASKS_FROZEN) {

	case CPU_UP_PREPARE:
		break;

	case CPU_ONLINE:
		/* Update our root-domain */
		grq_lock_irqsave(&flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

			set_rq_online(rq);
		}
		grq.noc = num_online_cpus();
		grq_unlock_irqrestore(&flags);
		break;

#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		/* Idle task back to normal (off runqueue, low prio) */
		grq_lock_irq();
		return_task(idle, true);
		idle->static_prio = MAX_PRIO;
		__setscheduler(idle, rq, SCHED_NORMAL, 0);
		idle->prio = PRIO_LIMIT;
		set_rq_task(rq, idle);
		update_clocks(rq);
		grq_unlock_irq();
		break;

	case CPU_DYING:
		sched_ttwu_pending();
		/* Update our root-domain */
		grq_lock_irqsave(&flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
			set_rq_offline(rq);
		}
		break_sole_affinity(cpu, idle);
		grq.noc = num_online_cpus();
		grq_unlock_irqrestore(&flags);
		break;
#endif
	}
	return NOTIFY_OK;
}

/*
 * Register at high priority so that task migration (migrate_all_tasks)
 * happens before everything else. This has to be lower priority than
 * the notifier in the perf_counter subsystem, though.
 */
static struct notifier_block migration_notifier = {
	.notifier_call = migration_call,
	.priority = CPU_PRI_MIGRATION,
};

static int sched_cpu_active(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_STARTING:
	case CPU_DOWN_FAILED:
		set_cpu_active((long)hcpu, true);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int sched_cpu_inactive(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		set_cpu_active((long)hcpu, false);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

int __init migration_init(void)
{
	void *cpu = (void *)(long)smp_processor_id();
	int err;

	/* Initialise migration for the boot CPU */
	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
	BUG_ON(err == NOTIFY_BAD);
	migration_call(&migration_notifier, CPU_ONLINE, cpu);
	register_cpu_notifier(&migration_notifier);

	/* Register cpu active notifiers */
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);

	return 0;
}
early_initcall(migration_init);
#endif

#ifdef CONFIG_SMP

static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */

#ifdef CONFIG_SCHED_DEBUG

static __read_mostly int sched_debug_enabled;

static int __init sched_debug_setup(char *str)
{
	sched_debug_enabled = 1;

	return 0;
}
early_param("sched_debug", sched_debug_setup);

static inline bool sched_debug(void)
{
	return sched_debug_enabled;
}

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				  struct cpumask *groupmask)
{
	char str[256];

	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
	cpumask_clear(groupmask);

	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);

	if (!(sd->flags & SD_LOAD_BALANCE)) {
		printk("does not load-balance\n");
		if (sd->parent)
			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
					" has parent");
		return -1;
	}

	printk(KERN_CONT "span %s level %s\n", str, sd->name);

	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
		printk(KERN_ERR "ERROR: domain->span does not contain "
				"CPU%d\n", cpu);
	}

	printk(KERN_CONT "\n");

	if (!cpumask_equal(sched_domain_span(sd), groupmask))
		printk(KERN_ERR "ERROR: groups don't span domain->span\n");

	if (sd->parent &&
	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
		printk(KERN_ERR "ERROR: parent span is not a superset "
			"of domain->span\n");
	return 0;
}

static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
	int level = 0;

	if (!sched_debug_enabled)
		return;

	if (!sd) {
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
		return;
	}

	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

	for (;;) {
		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
			break;
		level++;
		sd = sd->parent;
		if (!sd)
			break;
	}
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
	return false;
}
#endif /* CONFIG_SCHED_DEBUG */

static int sd_degenerate(struct sched_domain *sd)
{
	if (cpumask_weight(sched_domain_span(sd)) == 1)
		return 1;

	/* Following flags don't use groups */
	if (sd->flags & (SD_WAKE_AFFINE))
		return 0;

	return 1;
}

static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
	unsigned long cflags = sd->flags, pflags = parent->flags;

	if (sd_degenerate(parent))
		return 1;

	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
		return 0;

	if (~cflags & pflags)
		return 0;

	return 1;
}
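The `~cflags & pflags` test above is a bitwise subset check: the parent degenerates only when it sets no flag the child does not already have. A standalone demonstration:

	#include <assert.h>

	int main(void)
	{
		unsigned long cflags = 0x7;	/* child: bits 0-2 */
		unsigned long pflags = 0x3;	/* parent: bits 0-1, a subset */

		assert((~cflags & pflags) == 0);	/* parent adds nothing */

		pflags = 0x9;				/* parent adds bit 3 */
		assert((~cflags & pflags) != 0);	/* no longer degenerate */
		return 0;
	}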

static void free_rootdomain(struct rcu_head *rcu)
{
	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

	cpupri_cleanup(&rd->cpupri);
	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	kfree(rd);
}

static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	grq_lock_irqsave(&flags);

	if (rq->rd) {
		old_rd = rq->rd;

		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we don't want to free the old_rd yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	grq_unlock_irqrestore(&flags);

	if (old_rd)
		call_rcu_sched(&old_rd->rcu, free_rootdomain);
}

static int init_rootdomain(struct root_domain *rd)
{
	memset(rd, 0, sizeof(*rd));

	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
		goto out;
	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
		goto free_span;
	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
		goto free_online;

	if (cpupri_init(&rd->cpupri) != 0)
		goto free_rto_mask;
	return 0;

free_rto_mask:
	free_cpumask_var(rd->rto_mask);
free_online:
	free_cpumask_var(rd->online);
free_span:
	free_cpumask_var(rd->span);
out:
	return -ENOMEM;
}

static void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);

	atomic_set(&def_root_domain.refcount, 1);
}

static struct root_domain *alloc_rootdomain(void)
{
	struct root_domain *rd;

	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
	if (!rd)
		return NULL;

	if (init_rootdomain(rd) != 0) {
		kfree(rd);
		return NULL;
	}

	return rd;
}

static void free_sched_domain(struct rcu_head *rcu)
{
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

	kfree(sd);
}

static void destroy_sched_domain(struct sched_domain *sd, int cpu)
{
	call_rcu(&sd->rcu, free_sched_domain);
}

static void destroy_sched_domains(struct sched_domain *sd, int cpu)
{
	for (; sd; sd = sd->parent)
		destroy_sched_domain(sd, cpu);
}

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
			destroy_sched_domain(parent, cpu);
		} else
			tmp = tmp->parent;
	}

	if (sd && sd_degenerate(sd)) {
		tmp = sd;
		sd = sd->parent;
		destroy_sched_domain(tmp, cpu);
		if (sd)
			sd->child = NULL;
	}

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	tmp = rq->sd;
	rcu_assign_pointer(rq->sd, sd);
	destroy_sched_domains(tmp, cpu);
}

/* cpus with isolated domains */
static cpumask_var_t cpu_isolated_map;

/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
	alloc_bootmem_cpumask_var(&cpu_isolated_map);
	cpulist_parse(str, cpu_isolated_map);
	return 1;
}

__setup("isolcpus=", isolated_cpu_setup);

static const struct cpumask *cpu_cpu_mask(int cpu)
{
	return cpumask_of_node(cpu_to_node(cpu));
}

struct sd_data {
	struct sched_domain **__percpu sd;
};

struct s_data {
	struct sched_domain ** __percpu sd;
	struct root_domain	*rd;
};

enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};

struct sched_domain_topology_level;

typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);

#define SDTL_OVERLAP	0x01

struct sched_domain_topology_level {
	sched_domain_init_f init;
	sched_domain_mask_f mask;
	int		    flags;
	int		    numa_level;
	struct sd_data      data;
};

/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */

#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(sd, type)		sd->name = #type
#else
# define SD_INIT_NAME(sd, type)		do { } while (0)
#endif

#define SD_INIT_FUNC(type)						\
static noinline struct sched_domain *					\
sd_init_##type(struct sched_domain_topology_level *tl, int cpu)	\
{									\
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
	*sd = SD_##type##_INIT;						\
	SD_INIT_NAME(sd, type);						\
	sd->private = &tl->data;					\
	return sd;							\
}

SD_INIT_FUNC(CPU)
#ifdef CONFIG_SCHED_SMT
 SD_INIT_FUNC(SIBLING)
#endif
#ifdef CONFIG_SCHED_MC
 SD_INIT_FUNC(MC)
#endif
#ifdef CONFIG_SCHED_BOOK
 SD_INIT_FUNC(BOOK)
#endif
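For reference, SD_INIT_FUNC(CPU) above expands to roughly the following (SD_INIT_NAME only assigns the name under CONFIG_SCHED_DEBUG):

	static noinline struct sched_domain *
	sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
	{
		struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

		*sd = SD_CPU_INIT;
		sd->name = "CPU";	/* SD_INIT_NAME(sd, CPU) under SCHED_DEBUG */
		sd->private = &tl->data;
		return sd;
	}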

static int default_relax_domain_level = -1;
int sched_domain_level_max;

static int __init setup_relax_domain_level(char *str)
{
	if (kstrtoint(str, 0, &default_relax_domain_level))
		pr_warn("Unable to set relax_domain_level\n");

	return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);

static void set_domain_attribute(struct sched_domain *sd,
				 struct sched_domain_attr *attr)
{
	int request;

	if (!attr || attr->relax_domain_level < 0) {
		if (default_relax_domain_level < 0)
			return;
		else
			request = default_relax_domain_level;
	} else
		request = attr->relax_domain_level;
	if (request < sd->level) {
		/* turn off idle balance on this domain */
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	} else {
		/* turn on idle balance on this domain */
		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	}
}

static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				 const struct cpumask *cpu_map)
{
	switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu); /* fall through */
	case sa_sd:
		free_percpu(d->sd); /* fall through */
	case sa_sd_storage:
		__sdt_free(cpu_map); /* fall through */
	case sa_none:
		break;
	}
}

static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
						   const struct cpumask *cpu_map)
{
	memset(d, 0, sizeof(*d));

	if (__sdt_alloc(cpu_map))
		return sa_sd_storage;
	d->sd = alloc_percpu(struct sched_domain *);
	if (!d->sd)
		return sa_sd_storage;
	d->rd = alloc_rootdomain();
	if (!d->rd)
		return sa_sd;
	return sa_rootdomain;
}

/*
 * NULL the sd_data elements we've used to build the sched_domain
 * structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
static void claim_allocations(int cpu, struct sched_domain *sd)
{
	struct sd_data *sdd = sd->private;

	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
	*per_cpu_ptr(sdd->sd, cpu) = NULL;
}

#ifdef CONFIG_SCHED_SMT
static const struct cpumask *cpu_smt_mask(int cpu)
{
	return topology_thread_cpumask(cpu);
}
#endif

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
	{ sd_init_MC, cpu_coregroup_mask, },
#endif
#ifdef CONFIG_SCHED_BOOK
	{ sd_init_BOOK, cpu_book_mask, },
#endif
	{ sd_init_CPU, cpu_cpu_mask, },
	{ NULL, },
};

static struct sched_domain_topology_level *sched_domain_topology = default_topology;

#define for_each_sd_topology(tl)			\
	for (tl = sched_domain_topology; tl->init; tl++)

#ifdef CONFIG_NUMA

static int sched_domains_numa_levels;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;

static inline int sd_local_flags(int level)
{
	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
		return 0;

	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
}

static struct sched_domain *
sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
{
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
	int level = tl->numa_level;
	int sd_weight = cpumask_weight(
			sched_domains_numa_masks[level][cpu_to_node(cpu)]);

	*sd = (struct sched_domain){
		.min_interval		= sd_weight,
		.max_interval		= 2*sd_weight,
		.busy_factor		= 32,
		.imbalance_pct		= 125,
		.cache_nice_tries	= 2,
		.busy_idx		= 3,
		.idle_idx		= 2,
		.newidle_idx		= 0,
		.wake_idx		= 0,
		.forkexec_idx		= 0,

		.flags			= 1*SD_LOAD_BALANCE
					| 1*SD_BALANCE_NEWIDLE
					| 0*SD_BALANCE_EXEC
					| 0*SD_BALANCE_FORK
					| 0*SD_BALANCE_WAKE
					| 0*SD_WAKE_AFFINE
					| 0*SD_SHARE_CPUPOWER
					| 0*SD_SHARE_PKG_RESOURCES
					| 1*SD_SERIALIZE
					| 0*SD_PREFER_SIBLING
					| sd_local_flags(level)
					,
		.last_balance		= jiffies,
		.balance_interval	= sd_weight,
	};
	SD_INIT_NAME(sd, NUMA);
	sd->private = &tl->data;

	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
	sched_domains_curr_level = tl->numa_level;

	return sd;
}

static const struct cpumask *sd_numa_mask(int cpu)
{
	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
}

static void sched_numa_warn(const char *str)
{
	static int done = false;
	int i,j;

	if (done)
		return;

	done = true;

	printk(KERN_WARNING "ERROR: %s\n\n", str);

	for (i = 0; i < nr_node_ids; i++) {
		printk(KERN_WARNING "  ");
		for (j = 0; j < nr_node_ids; j++)
			printk(KERN_CONT "%02d ", node_distance(i,j));
		printk(KERN_CONT "\n");
	}
	printk(KERN_WARNING "\n");
}

static bool find_numa_distance(int distance)
{
	int i;

	if (distance == node_distance(0, 0))
		return true;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		if (sched_domains_numa_distance[i] == distance)
			return true;
	}

	return false;
}

static void sched_init_numa(void)
{
	int next_distance, curr_distance = node_distance(0, 0);
	struct sched_domain_topology_level *tl;
	int level = 0;
	int i, j, k;

	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
	if (!sched_domains_numa_distance)
		return;

	/*
	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
	 * unique distances in the node_distance() table.
	 *
	 * Assumes node_distance(0,j) includes all distances in
	 * node_distance(i,j) in order to avoid cubic time.
	 */
	next_distance = curr_distance;
	for (i = 0; i < nr_node_ids; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			for (k = 0; k < nr_node_ids; k++) {
				int distance = node_distance(i, k);

				if (distance > curr_distance &&
				    (distance < next_distance ||
				     next_distance == curr_distance))
					next_distance = distance;

				/*
				 * While not a strong assumption it would be nice to know
				 * about cases where if node A is connected to B, B is not
				 * equally connected to A.
				 */
				if (sched_debug() && node_distance(k, i) != distance)
					sched_numa_warn("Node-distance not symmetric");

				if (sched_debug() && i && !find_numa_distance(distance))
					sched_numa_warn("Node-0 not representative");
			}
			if (next_distance != curr_distance) {
				sched_domains_numa_distance[level++] = next_distance;
				sched_domains_numa_levels = level;
				curr_distance = next_distance;
			} else break;
		}

		/*
		 * In case of sched_debug() we verify the above assumption.
		 */
		if (!sched_debug())
			break;
	}
	/*
	 * 'level' contains the number of unique distances, excluding the
	 * identity distance node_distance(i,i).
	 *
	 * The sched_domains_numa_distance[] array includes the actual distance
	 * numbers.
	 */

	/*
	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
	 * the array will contain fewer than 'level' members. This could be
	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
	 * in other functions.
	 *
	 * We reset it to 'level' at the end of this function.
	 */
	sched_domains_numa_levels = 0;

	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
	if (!sched_domains_numa_masks)
		return;

	/*
	 * Now for each level, construct a mask per node which contains all
	 * cpus of nodes that are that many hops away from us.
	 */
	for (i = 0; i < level; i++) {
		sched_domains_numa_masks[i] =
			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
		if (!sched_domains_numa_masks[i])
			return;

		for (j = 0; j < nr_node_ids; j++) {
			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
			if (!mask)
				return;

			sched_domains_numa_masks[i][j] = mask;

			for (k = 0; k < nr_node_ids; k++) {
				if (node_distance(j, k) > sched_domains_numa_distance[i])
					continue;

				cpumask_or(mask, mask, cpumask_of_node(k));
			}
		}
	}

	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
	if (!tl)
		return;

	/*
	 * Copy the default topology bits..
	 */
	for (i = 0; default_topology[i].init; i++)
		tl[i] = default_topology[i];

	/*
	 * .. and append 'j' levels of NUMA goodness.
	 */
	for (j = 0; j < level; i++, j++) {
		tl[i] = (struct sched_domain_topology_level){
			.init = sd_numa_init,
			.mask = sd_numa_mask,
			.flags = SDTL_OVERLAP,
			.numa_level = j,
		};
	}

	sched_domain_topology = tl;

	sched_domains_numa_levels = level;
}
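A standalone sketch of the distance-deduplication loop above, run on a made-up symmetric 4-node table. It prints each unique non-identity distance in increasing order, which is what populates sched_domains_numa_distance[].

	#include <stdio.h>

	#define N 4

	static const int dist[N][N] = {
		{ 10, 20, 20, 30 },
		{ 20, 10, 30, 20 },
		{ 20, 30, 10, 20 },
		{ 30, 20, 20, 10 },
	};

	int main(void)
	{
		int curr = dist[0][0], next = curr;
		int level = 0;

		for (;;) {
			int i, j, found = 0;

			/* smallest distance strictly greater than `curr' */
			for (i = 0; i < N; i++)
				for (j = 0; j < N; j++)
					if (dist[i][j] > curr &&
					    (dist[i][j] < next || !found)) {
						next = dist[i][j];
						found = 1;
					}
			if (!found)
				break;
			printf("level %d: distance %d\n", level++, next);
			curr = next;
		}
		return 0;	/* prints 20, then 30 */
	}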

static void sched_domains_numa_masks_set(int cpu)
{
	int i, j;
	int node = cpu_to_node(cpu);

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			if (node_distance(j, node) <= sched_domains_numa_distance[i])
				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
		}
	}
}

static void sched_domains_numa_masks_clear(int cpu)
{
	int i, j;
	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++)
			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
	}
}

/*
 * Update sched_domains_numa_masks[level][node] array when new cpus
 * are onlined.
 */
static int sched_domains_numa_masks_update(struct notifier_block *nfb,
					   unsigned long action,
					   void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		sched_domains_numa_masks_set(cpu);
		break;

	case CPU_DEAD:
		sched_domains_numa_masks_clear(cpu);
		break;

	default:
		return NOTIFY_DONE;
	}

	return NOTIFY_OK;
}
#else
static inline void sched_init_numa(void)
{
}

static int sched_domains_numa_masks_update(struct notifier_block *nfb,
					   unsigned long action,
					   void *hcpu)
{
	return 0;
}
#endif /* CONFIG_NUMA */

static int __sdt_alloc(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		sdd->sd = alloc_percpu(struct sched_domain *);
		if (!sdd->sd)
			return -ENOMEM;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;

			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sd)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sd, j) = sd;
		}
	}

	return 0;
}

static void __sdt_free(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;

			if (sdd->sd) {
				sd = *per_cpu_ptr(sdd->sd, j);
				kfree(*per_cpu_ptr(sdd->sd, j));
			}
		}
		free_percpu(sdd->sd);
		sdd->sd = NULL;
	}
}

struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
		struct sched_domain *child, int cpu)
{
	struct sched_domain *sd = tl->init(tl, cpu);
	if (!sd)
		return child;

	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
	if (child) {
		sd->level = child->level + 1;
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
		child->parent = sd;
		sd->child = child;
	}
	set_domain_attribute(sd, attr);

	return sd;
}

/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus.
 */
static int build_sched_domains(const struct cpumask *cpu_map,
			       struct sched_domain_attr *attr)
{
	enum s_alloc alloc_state;
	struct sched_domain *sd;
	struct s_data d;
	int i, ret = -ENOMEM;

	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
	if (alloc_state != sa_rootdomain)
		goto error;

	/* Set up domains for cpus specified by the cpu_map. */
	for_each_cpu(i, cpu_map) {
		struct sched_domain_topology_level *tl;

		sd = NULL;
		for_each_sd_topology(tl) {
			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
			if (tl == sched_domain_topology)
				*per_cpu_ptr(d.sd, i) = sd;
			if (tl->flags & SDTL_OVERLAP)
				sd->flags |= SD_OVERLAP;
			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				break;
		}
	}

	/* Calculate CPU power for physical packages and nodes */
	for (i = nr_cpumask_bits-1; i >= 0; i--) {
		if (!cpumask_test_cpu(i, cpu_map))
			continue;

		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			claim_allocations(i, sd);
		}
	}

	/* Attach the domains */
	rcu_read_lock();
	for_each_cpu(i, cpu_map) {
		sd = *per_cpu_ptr(d.sd, i);
		cpu_attach_domain(sd, d.rd, i);
	}
	rcu_read_unlock();

	ret = 0;
error:
	__free_domain_allocs(&d, alloc_state, cpu_map);
	return ret;
}

static cpumask_var_t *doms_cur;	/* current sched domains */
static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
				/* attributes of custom domains in 'doms_cur' */

/*
 * Special case: If a kmalloc of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t fallback_doms;

/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * cpu core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
int __attribute__((weak)) arch_update_cpu_topology(void)
{
	return 0;
}

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
{
	int i;
	cpumask_var_t *doms;

	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
	if (!doms)
		return NULL;
	for (i = 0; i < ndoms; i++) {
		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
			free_sched_domains(doms, i);
			return NULL;
		}
	}
	return doms;
}

void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
	unsigned int i;
	for (i = 0; i < ndoms; i++)
		free_cpumask_var(doms[i]);
	kfree(doms);
}

/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated cpus, but could be used to
 * exclude other special cases in the future.
 */
static int init_sched_domains(const struct cpumask *cpu_map)
{
	int err;

	arch_update_cpu_topology();
	ndoms_cur = 1;
	doms_cur = alloc_sched_domains(ndoms_cur);
	if (!doms_cur)
		doms_cur = &fallback_doms;
	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
	err = build_sched_domains(doms_cur[0], NULL);
	register_sched_domain_sysctl();

	return err;
}

/*
 * Detach sched domains from a group of cpus specified in cpu_map.
 * These cpus will now be attached to the NULL domain.
 */
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
	int i;

	rcu_read_lock();
	for_each_cpu(i, cpu_map)
		cpu_attach_domain(NULL, &def_root_domain, i);
	rcu_read_unlock();
}

/* handle null as "default" */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
			struct sched_domain_attr *new, int idx_new)
{
	struct sched_domain_attr tmp;

	/* fast path */
	if (!new && !cur)
		return 1;

	tmp = SD_ATTR_INIT;
	return !memcmp(cur ? (cur + idx_cur) : &tmp,
			new ? (new + idx_new) : &tmp,
			sizeof(struct sched_domain_attr));
}

/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap.) We should setup one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains. This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock held
 */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
			     struct sched_domain_attr *dattr_new)
{
	int i, j, n;
	int new_topology;

	mutex_lock(&sched_domains_mutex);

	/* always unregister in case we don't destroy any domains */
	unregister_sched_domain_sysctl();

	/* Let architecture update cpu core mappings. */
	new_topology = arch_update_cpu_topology();

	n = doms_new ? ndoms_new : 0;

	/* Destroy deleted domains */
	for (i = 0; i < ndoms_cur; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_cur[i], doms_new[j])
			    && dattrs_equal(dattr_cur, i, dattr_new, j))
				goto match1;
		}
		/* no match - a current sched domain not in new doms_new[] */
		detach_destroy_domains(doms_cur[i]);
match1:
		;
	}

	if (doms_new == NULL) {
		ndoms_cur = 0;
		doms_new = &fallback_doms;
		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
		WARN_ON_ONCE(dattr_new);
	}

	/* Build new domains */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < ndoms_cur && !new_topology; j++) {
			if (cpumask_equal(doms_new[i], doms_cur[j])
			    && dattrs_equal(dattr_new, i, dattr_cur, j))
				goto match2;
		}
		/* no match - add a new doms_new */
		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
		;
	}

	/* Remember the new sched domains */
	if (doms_cur != &fallback_doms)
		free_sched_domains(doms_cur, ndoms_cur);
	kfree(dattr_cur);	/* kfree(NULL) is safe */
	doms_cur = doms_new;
	dattr_cur = dattr_new;
	ndoms_cur = ndoms_new;

	register_sched_domain_sysctl();

	mutex_unlock(&sched_domains_mutex);
}

/*
 * Update cpusets according to cpu_active mask. If cpusets are
 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
 * around partition_sched_domains().
 */
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
			     void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		cpuset_update_active_cpus(true);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
			       void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		cpuset_update_active_cpus(false);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
/*
 * Cheaper version of the below functions in case support for SMT and MC is
 * compiled in but CPUs have no siblings.
 */
static bool sole_cpu_idle(int cpu)
{
	return rq_idle(cpu_rq(cpu));
}
#endif
#ifdef CONFIG_SCHED_SMT
/* All this CPU's SMT siblings are idle */
static bool siblings_cpu_idle(int cpu)
{
	return cpumask_subset(&(cpu_rq(cpu)->smt_siblings),
			      &grq.cpu_idle_map);
}
#endif
#ifdef CONFIG_SCHED_MC
/* All this CPU's shared cache siblings are idle */
static bool cache_cpu_idle(int cpu)
{
	return cpumask_subset(&(cpu_rq(cpu)->cache_siblings),
			      &grq.cpu_idle_map);
}
#endif
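The helpers above reduce to a subset test: a CPU's sibling group counts as fully idle when its sibling mask is contained in grq.cpu_idle_map. A plain-bitmask analogue of what cpumask_subset() computes:

	#include <assert.h>

	int main(void)
	{
		unsigned long siblings = 0x3;	/* CPUs 0 and 1 share a core */
		unsigned long idle_map = 0x7;	/* CPUs 0, 1 and 2 are idle */

		/* subset: no sibling bit falls outside the idle map */
		assert((siblings & ~idle_map) == 0);

		idle_map = 0x5;			/* CPU 1 now busy */
		assert((siblings & ~idle_map) != 0);
		return 0;
	}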
6901 |
|
6902 |
enum sched_domain_level { |
6903 |
SD_LV_NONE = 0, |
6904 |
SD_LV_SIBLING, |
6905 |
SD_LV_MC, |
6906 |
SD_LV_BOOK, |
6907 |
SD_LV_CPU, |
6908 |
SD_LV_NODE, |
6909 |
SD_LV_ALLNODES, |
6910 |
SD_LV_MAX |
6911 |
}; |
6912 |
|
6913 |
void __init sched_init_smp(void) |
6914 |
{ |
6915 |
struct sched_domain *sd; |
6916 |
int cpu; |
6917 |
|
6918 |
cpumask_var_t non_isolated_cpus; |
6919 |
|
6920 |
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
6921 |
alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
6922 |
|
6923 |
sched_init_numa(); |
6924 |
|
6925 |
get_online_cpus(); |
6926 |
mutex_lock(&sched_domains_mutex); |
6927 |
init_sched_domains(cpu_active_mask); |
6928 |
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
6929 |
if (cpumask_empty(non_isolated_cpus)) |
6930 |
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
6931 |
mutex_unlock(&sched_domains_mutex); |
6932 |
put_online_cpus(); |
6933 |
|
6934 |
hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); |
6935 |
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
6936 |
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
6937 |
|
6938 |
/* Move init over to a non-isolated CPU */ |
6939 |
if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) |
6940 |
BUG(); |
6941 |
free_cpumask_var(non_isolated_cpus); |
6942 |
|
6943 |
grq_lock_irq(); |
6944 |
/* |
6945 |
* Set up the relative cache distance of each online cpu from each |
6946 |
* other in a simple array for quick lookup. Locality is determined |
6947 |
* by the closest sched_domain that CPUs are separated by. CPUs with |
6948 |
* shared cache in SMT and MC are treated as local. Separate CPUs |
6949 |
* (within the same package or physically) within the same node are |
6950 |
* treated as not local. CPUs not even in the same domain (different |
6951 |
* nodes) are treated as very distant. |
6952 |
*/ |
6953 |
for_each_online_cpu(cpu) { |
6954 |
struct rq *rq = cpu_rq(cpu); |
6955 |
|
6956 |
mutex_lock(&sched_domains_mutex); |
6957 |
for_each_domain(cpu, sd) { |
6958 |
int locality, other_cpu; |
6959 |
|
6960 |
#ifdef CONFIG_SCHED_SMT |
6961 |
if (sd->level == SD_LV_SIBLING) { |
6962 |
for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) |
6963 |
cpumask_set_cpu(other_cpu, &rq->smt_siblings); |
6964 |
} |
6965 |
#endif |
6966 |
#ifdef CONFIG_SCHED_MC |
6967 |
if (sd->level == SD_LV_MC) { |
6968 |
for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) |
6969 |
cpumask_set_cpu(other_cpu, &rq->cache_siblings); |
6970 |
} |
6971 |
#endif |
6972 |
if (sd->level <= SD_LV_SIBLING) |
6973 |
locality = 1; |
6974 |
else if (sd->level <= SD_LV_MC) |
6975 |
locality = 2; |
6976 |
else if (sd->level <= SD_LV_NODE) |
6977 |
locality = 3; |
6978 |
else |
6979 |
continue; |
6980 |
|
6981 |
for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) { |
6982 |
if (locality < rq->cpu_locality[other_cpu]) |
6983 |
rq->cpu_locality[other_cpu] = locality; |
6984 |
} |
6985 |
} |
6986 |
mutex_unlock(&sched_domains_mutex); |
6987 |
|
6988 |
/* |
6989 |
* Each runqueue has its own function in case it doesn't have |
6990 |
* siblings of its own allowing mixed topologies. |
6991 |
*/ |
6992 |
#ifdef CONFIG_SCHED_SMT |
6993 |
if (cpus_weight(rq->smt_siblings) > 1) |
6994 |
rq->siblings_idle = siblings_cpu_idle; |
6995 |
#endif |
6996 |
#ifdef CONFIG_SCHED_MC |
6997 |
if (cpus_weight(rq->cache_siblings) > 1) |
6998 |
rq->cache_idle = cache_cpu_idle; |
6999 |
#endif |
7000 |
} |
7001 |
grq_unlock_irq(); |
7002 |
} |
7003 |
#else |
7004 |
void __init sched_init_smp(void) |
7005 |
{ |
7006 |
} |
7007 |
#endif /* CONFIG_SMP */ |
7008 |
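
To make the locality mapping above concrete, here is a stand-alone sketch (illustration only, not part of the patch; the topology is invented): the cpu_locality matrix that the defaults from sched_init() plus the domain walk above would produce on a hypothetical 4-CPU box with two SMT sibling pairs (CPUs 0/1 and 2/3) in one package and no shared-cache MC domain.

#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	int locality[NR_CPUS][NR_CPUS], i, j;

	/* sched_init() defaults: self is 0, everything else "very distant" (4) */
	for (i = 0; i < NR_CPUS; i++)
		for (j = 0; j < NR_CPUS; j++)
			locality[i][j] = (i == j) ? 0 : 4;

	/* SD_LV_SIBLING domains span {0,1} and {2,3}: locality 1 */
	locality[0][1] = locality[1][0] = 1;
	locality[2][3] = locality[3][2] = 1;

	/* the package-level domain (<= SD_LV_NODE) spans all four: lower 4 to 3 */
	for (i = 0; i < NR_CPUS; i++)
		for (j = 0; j < NR_CPUS; j++)
			if (locality[i][j] > 3)
				locality[i][j] = 3;

	/* prints rows 0 1 3 3 / 1 0 3 3 / 3 3 0 1 / 3 3 1 0 */
	for (i = 0; i < NR_CPUS; i++, putchar('\n'))
		for (j = 0; j < NR_CPUS; j++)
			printf("%d ", locality[i][j]);
	return 0;
}

Each domain level can only lower a distance, never raise it, so the closest domain two CPUs share determines their final locality value.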

unsigned int sysctl_timer_migration = 1;

int in_sched_functions(unsigned long addr)
{
	return in_lock_functions(addr) ||
		(addr >= (unsigned long)__sched_text_start
		&& addr < (unsigned long)__sched_text_end);
}

void __init sched_init(void)
{
	int i;
	struct rq *rq;

	prio_ratios[0] = 128;
	for (i = 1 ; i < PRIO_RANGE ; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

	raw_spin_lock_init(&grq.lock);
	grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
	grq.niffies = 0;
	grq.last_jiffy = jiffies;
	raw_spin_lock_init(&grq.iso_lock);
	grq.iso_ticks = 0;
	grq.iso_refractory = false;
	grq.noc = 1;
#ifdef CONFIG_SMP
	init_defrootdomain();
	grq.qnr = grq.idle_cpus = 0;
	cpumask_clear(&grq.cpu_idle_map);
#else
	uprq = &per_cpu(runqueues, 0);
#endif
	for_each_possible_cpu(i) {
		rq = cpu_rq(i);
		rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
			      rq->iowait_pc = rq->idle_pc = 0;
		rq->dither = false;
#ifdef CONFIG_SMP
		rq->sticky_task = NULL;
		rq->last_niffy = 0;
		rq->sd = NULL;
		rq->rd = NULL;
		rq->online = false;
		rq->cpu = i;
		rq_attach_root(rq, &def_root_domain);
#endif
		atomic_set(&rq->nr_iowait, 0);
	}

#ifdef CONFIG_SMP
	nr_cpu_ids = i;
	/*
	 * Set the base locality for cpu cache distance calculation to
	 * "very distant" (4). Make sure the distance from a CPU to itself is 0.
	 */
	for_each_possible_cpu(i) {
		int j;

		rq = cpu_rq(i);
#ifdef CONFIG_SCHED_SMT
		cpumask_clear(&rq->smt_siblings);
		cpumask_set_cpu(i, &rq->smt_siblings);
		rq->siblings_idle = sole_cpu_idle;
		cpumask_set_cpu(i, &rq->smt_siblings);
#endif
#ifdef CONFIG_SCHED_MC
		cpumask_clear(&rq->cache_siblings);
		cpumask_set_cpu(i, &rq->cache_siblings);
		rq->cache_idle = sole_cpu_idle;
		cpumask_set_cpu(i, &rq->cache_siblings);
#endif
		rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(int *), GFP_ATOMIC);
		for_each_possible_cpu(j) {
			if (i == j)
				rq->cpu_locality[j] = 0;
			else
				rq->cpu_locality[j] = 4;
		}
	}
#endif

	for (i = 0; i < PRIO_LIMIT; i++)
		INIT_LIST_HEAD(grq.queue + i);
	/* delimiter for bitsearch */
	__set_bit(PRIO_LIMIT, grq.prio_bitmap);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

#ifdef CONFIG_RT_MUTEXES
	plist_head_init(&init_task.pi_waiters);
#endif

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
	init_idle(current, smp_processor_id());

#ifdef CONFIG_SMP
	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
	/* May be allocated at isolcpus cmdline parse time */
	if (cpu_isolated_map == NULL)
		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
	idle_thread_set_boot_cpu();
#endif /* SMP */
}
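
The prio_ratios[] table built at the top of sched_init() is just a truncated geometric series: each of the PRIO_RANGE (40) nice levels costs roughly 10% more deadline weight than the previous one. A stand-alone sketch (illustration only, not part of the patch) reproduces the table:

#include <stdio.h>

#define PRIO_RANGE 40	/* nice -20..19, as in the BFS priority headers */

int main(void)
{
	int prio_ratios[PRIO_RANGE], i;

	prio_ratios[0] = 128;	/* index 0 corresponds to nice -20 */
	for (i = 1; i < PRIO_RANGE; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

	printf("nice -20 -> %d\n", prio_ratios[0]);	/* 128 */
	printf("nice   0 -> %d\n", prio_ratios[20]);	/* 836 */
	printf("nice  19 -> %d\n", prio_ratios[39]);	/* 5089 */
	return 0;
}

So the full nice range spans about a 40:1 ratio (5089/128) in deadline weighting, computed once at boot with pure integer arithmetic.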

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();

	return (nested == preempt_offset);
}

void __might_sleep(const char *file, int line, int preempt_offset)
{
	static unsigned long prev_jiffy;	/* ratelimiting */

	rcu_sleep_check();	/* WARN_ON_ONCE() by default, no rate limit reqd. */
	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
	    system_state != SYSTEM_RUNNING || oops_in_progress)
		return;
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	printk(KERN_ERR
		"BUG: sleeping function called from invalid context at %s:%d\n",
			file, line);
	printk(KERN_ERR
		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
			in_atomic(), irqs_disabled(),
			current->pid, current->comm);

	debug_show_held_locks(current);
	if (irqs_disabled())
		print_irqtrace_events(current);
	dump_stack();
}
EXPORT_SYMBOL(__might_sleep);
#endif

#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
{
	struct task_struct *g, *p;
	unsigned long flags;
	struct rq *rq;
	int queued;

	read_lock_irqsave(&tasklist_lock, flags);

	do_each_thread(g, p) {
		if (!rt_task(p) && !iso_task(p))
			continue;

		raw_spin_lock(&p->pi_lock);
		rq = __task_grq_lock(p);

		queued = task_queued(p);
		if (queued)
			dequeue_task(p);
		__setscheduler(p, rq, SCHED_NORMAL, 0);
		if (queued) {
			enqueue_task(p);
			try_preempt(p, rq);
		}

		__task_grq_unlock();
		raw_spin_unlock(&p->pi_lock);
	} while_each_thread(g, p);

	read_unlock_irqrestore(&tasklist_lock, flags);
}
#endif /* CONFIG_MAGIC_SYSRQ */

#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */
struct task_struct *curr_task(int cpu)
{
	return cpu_curr(cpu);
}

#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */

#ifdef CONFIG_IA64
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner. This function
 * must be called with all CPUs synchronised and interrupts disabled; the
 * caller must save the original value of the current task (see
 * curr_task() above) and restore that value before reenabling interrupts and
 * re-starting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
void set_curr_task(int cpu, struct task_struct *p)
{
	cpu_curr(cpu) = p;
}

#endif

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	*ut = p->utime;
	*st = p->stime;
}

void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}

void vtime_account_system_irqsafe(struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	vtime_account_system(tsk);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);

#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_system(prev);

	vtime_account_user(prev);
	arch_vtime_task_switch(prev);
}
#endif

#else
/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow by
 * losing precision when the numbers are big.
 */
static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime) {
			u64 tmp = rtime; rtime = stime; stime = tmp;
		}

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return (__force cputime_t) scaled;
}
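
The precision-dropping loop above can be exercised outside the kernel. This is a minimal user-space re-implementation (illustration only, not part of the patch: div_u64 is replaced by a plain 64/32 division and u64 by uint64_t) showing the scaling staying accurate on operands whose naive product would overflow 64 bits:

#include <stdio.h>
#include <stdint.h>

static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
	for (;;) {
		/* make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime) {
			uint64_t tmp = rtime; rtime = stime; stime = tmp;
		}
		if (total >> 32)	/* 'total' must fit in 32 bits */
			goto drop_precision;
		if (!(rtime >> 32))	/* rtime (and stime) fit: done */
			break;
		if (stime >> 31)	/* can't rebalance, must drop bits */
			goto drop_precision;
		stime <<= 1;		/* grow stime, shrink rtime */
		rtime >>= 1;
		continue;
drop_precision:
		rtime >>= 1;		/* rtime has more bits than stime */
		total >>= 1;
	}
	/* 32x32->64 multiply followed by a 64/32 divide, as in the kernel */
	return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
}

int main(void)
{
	/* 30% system time of a huge runtime: stime * rtime would overflow,
	 * yet the loop keeps the ratio accurate to within the dropped bits */
	uint64_t st = 3ULL << 40, total = 10ULL << 40, rt = 9ULL << 41;

	printf("%llu (expect ~%llu)\n",
	       (unsigned long long)scale_stime(st, rt, total),
	       (unsigned long long)(rt / 10 * 3));
	return 0;
}

Both printed values come out around 5.4 * 2^40; the small difference is the precision deliberately traded away to keep every intermediate inside 64 bits.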

/*
 * Adjust tick based cputime random precision against scheduler
 * runtime accounting.
 */
static void cputime_adjust(struct task_cputime *curr,
			   struct cputime *prev,
			   cputime_t *ut, cputime_t *st)
{
	cputime_t rtime, stime, utime, total;

	stime = curr->stime;
	total = stime + curr->utime;

	/*
	 * Tick based cputime accounting depends on whether the random
	 * scheduling timeslices of a task happen to be interrupted by the
	 * timer or not. Depending on these circumstances, the number of
	 * these interrupts may be over or under-optimistic, matching the
	 * real user and system cputime with a variable precision.
	 *
	 * Fix this by scaling these tick based values against the total
	 * runtime accounted by the CFS scheduler.
	 */
	rtime = nsecs_to_cputime(curr->sum_exec_runtime);

	/*
	 * Update userspace visible utime/stime values only if actual execution
	 * time is bigger than already exported. Note that it can happen that we
	 * provided bigger values earlier due to scaling inaccuracy on big numbers.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	if (total) {
		stime = scale_stime((__force u64)stime,
				    (__force u64)rtime, (__force u64)total);
		utime = rtime - stime;
	} else {
		stime = rtime;
		utime = 0;
	}

	/*
	 * If the tick based count grows faster than the scheduler one,
	 * the result of the scaling may go backward.
	 * Let's enforce monotonicity.
	 */
	prev->stime = max(prev->stime, stime);
	prev->utime = max(prev->utime, utime);

out:
	*ut = prev->utime;
	*st = prev->stime;
}
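
A simplified stand-alone model of the function above (illustration only, not part of the patch: cputime_t is replaced by uint64_t and the overflow-safe scale_stime() by a plain multiply/divide) demonstrates the monotonicity rule:

#include <stdio.h>
#include <stdint.h>

struct prev_cputime { uint64_t utime, stime; };

static void cputime_adjust(uint64_t utime, uint64_t stime, uint64_t rtime,
			   struct prev_cputime *prev,
			   uint64_t *ut, uint64_t *st)
{
	uint64_t total = stime + utime;

	/* only move forward when real execution time grew */
	if (prev->stime + prev->utime < rtime) {
		/* the kernel uses scale_stime() here to avoid overflow */
		uint64_t s = total ? rtime * stime / total : rtime;
		uint64_t u = rtime - s;

		prev->stime = s > prev->stime ? s : prev->stime;
		prev->utime = u > prev->utime ? u : prev->utime;
	}
	*ut = prev->utime;
	*st = prev->stime;
}

int main(void)
{
	struct prev_cputime prev = { 0, 0 };
	uint64_t ut, st;

	cputime_adjust(30, 70, 110, &prev, &ut, &st);
	printf("ut=%llu st=%llu\n", (unsigned long long)ut,
	       (unsigned long long)st);	/* ut=33 st=77 */

	/* more user ticks landed: raw scaling would now yield st=71 < 77,
	 * but the exported stime never goes backwards */
	cputime_adjust(40, 70, 112, &prev, &ut, &st);
	printf("ut=%llu st=%llu\n", (unsigned long long)ut,
	       (unsigned long long)st);	/* ut=41 st=77 */
	return 0;
}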

void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime = {
		.sum_exec_runtime = tsk_seruntime(p),
	};

	task_cputime(p, &cputime.utime, &cputime.stime);
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}

/*
 * Must be called with siglock held.
 */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif

void init_idle_bootup_task(struct task_struct *idle)
{}

#ifdef CONFIG_SCHED_DEBUG
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{}

void proc_sched_set_task(struct task_struct *p)
{}
#endif

#ifdef CONFIG_SMP
#define SCHED_LOAD_SHIFT	(10)
#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)

unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return SCHED_LOAD_SCALE;
}

unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
	unsigned long weight = cpumask_weight(sched_domain_span(sd));
	unsigned long smt_gain = sd->smt_gain;

	smt_gain /= weight;

	return smt_gain;
}
#endif

++ b/include/uapi/linux/sched.h
Lines 37-44
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
-/* SCHED_ISO: reserved but not implemented yet */
+/* SCHED_ISO: Implemented on BFS only */
 #define SCHED_IDLE		5
+#ifdef CONFIG_SCHED_BFS
+#define SCHED_ISO		4
+#define SCHED_IDLEPRIO		SCHED_IDLE
+#define SCHED_MAX		(SCHED_IDLEPRIO)
+#define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
+#endif
+
 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
 #define SCHED_RESET_ON_FORK	0x40000000
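
From userspace the new policy is requested like any other scheduling policy. A minimal sketch (illustration only, not part of the patch): the value 4 comes from the hunk above, and on a kernel without CONFIG_SCHED_BFS the call is expected to simply fail with EINVAL.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sched.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4	/* from the uapi hunk above */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	/* pid 0 means the calling process */
	if (sched_setscheduler(0, SCHED_ISO, &sp) == -1)
		printf("SCHED_ISO not available: %s\n", strerror(errno));
	else
		printf("now running as SCHED_ISO\n");
	return 0;
}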

-- a/include/linux/sched/rt.h
++ b/include/linux/sched/rt.h
Lines 14-24
  * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
  */

+#ifdef CONFIG_SCHED_BFS
+#define MAX_USER_RT_PRIO	100
+#define MAX_RT_PRIO		(MAX_USER_RT_PRIO + 1)
+#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+
+#define PRIO_RANGE		(40)
+#define MAX_PRIO		(MAX_RT_PRIO + PRIO_RANGE)
+#define ISO_PRIO		(MAX_RT_PRIO)
+#define NORMAL_PRIO		(MAX_RT_PRIO + 1)
+#define IDLE_PRIO		(MAX_RT_PRIO + 2)
+#define PRIO_LIMIT		((IDLE_PRIO) + 1)
+#else /* CONFIG_SCHED_BFS */
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO

 #define MAX_PRIO		(MAX_RT_PRIO + 40)
 #define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+#endif /* CONFIG_SCHED_BFS */

 static inline int rt_prio(int prio)
 {
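
Worked through, the BFS branch above gives MAX_RT_PRIO = 101, ISO_PRIO = 101, NORMAL_PRIO = 102, IDLE_PRIO = 103 and PRIO_LIMIT = 104, which is why sched_init() in bfs.c earlier only initialises PRIO_LIMIT list heads in grq.queue: the realtime levels plus one shared level each for ISO, NORMAL and IDLE tasks. All 40 nice levels (PRIO_RANGE) map onto the single NORMAL_PRIO level and are differentiated by the prio_ratios[] deadline weighting instead of by separate queues; MAX_PRIO = 141 and DEFAULT_PRIO = 121 merely bound and centre that nice range.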

-- a/kernel/stop_machine.c
++ b/kernel/stop_machine.c
Lines 40-46
 };

 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
-static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
+DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);

 static bool stop_machine_initialized = false;

 static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)

-- a/drivers/cpufreq/cpufreq_conservative.c
++ b/drivers/cpufreq/cpufreq_conservative.c
Lines 27-34
 #include "cpufreq_governor.h"

 /* Conservative governor macros */
-#define DEF_FREQUENCY_UP_THRESHOLD		(80)
-#define DEF_FREQUENCY_DOWN_THRESHOLD		(20)
+#define DEF_FREQUENCY_UP_THRESHOLD		(63)
+#define DEF_FREQUENCY_DOWN_THRESHOLD		(26)
 #define DEF_FREQUENCY_STEP			(5)
 #define DEF_SAMPLING_DOWN_FACTOR		(1)
 #define MAX_SAMPLING_DOWN_FACTOR		(10)
35 |
++ b/kernel/time/Kconfig |
Lines 94-100
Link Here
|
94 |
config NO_HZ_FULL |
94 |
config NO_HZ_FULL |
95 |
bool "Full dynticks system (tickless)" |
95 |
bool "Full dynticks system (tickless)" |
96 |
# NO_HZ_COMMON dependency |
96 |
# NO_HZ_COMMON dependency |
97 |
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
97 |
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS |
98 |
# We need at least one periodic CPU for timekeeping |
98 |
# We need at least one periodic CPU for timekeeping |
99 |
depends on SMP |
99 |
depends on SMP |
100 |
# RCU_USER_QS dependency |
100 |
# RCU_USER_QS dependency |
101 |

-- a/kernel/sched/Makefile
++ b/kernel/sched/Makefile
Lines 11-19
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif

+ifdef CONFIG_SCHED_BFS
+obj-y += bfs.o clock.o
+else
 obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
-obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+endif
+obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o

-- /dev/null
++ b/kernel/sched/bfs_sched.h
#include <linux/sched.h>

#ifndef BFS_SCHED_H
#define BFS_SCHED_H

/*
 * This is the main, per-CPU runqueue data structure.
 * This data should only be modified by the local cpu.
 */
struct rq {
	struct task_struct *curr, *idle, *stop;
	struct mm_struct *prev_mm;

	/* Stored data about rq->curr to work outside grq lock */
	u64 rq_deadline;
	unsigned int rq_policy;
	int rq_time_slice;
	u64 rq_last_ran;
	int rq_prio;
	bool rq_running;	/* There is a task running */

	/* Accurate timekeeping data */
	u64 timekeep_clock;
	unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
		      iowait_pc, idle_pc;
	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	int cpu;		/* cpu of this runqueue */
	bool online;
	bool scaling;		/* This CPU is managed by a scaling CPU freq governor */
	struct task_struct *sticky_task;

	struct root_domain *rd;
	struct sched_domain *sd;
	int *cpu_locality;	/* CPU relative cache distance */
#ifdef CONFIG_SCHED_SMT
	bool (*siblings_idle)(int cpu);
	/* See if all smt siblings are idle */
	cpumask_t smt_siblings;
#endif /* CONFIG_SCHED_SMT */
#ifdef CONFIG_SCHED_MC
	bool (*cache_idle)(int cpu);
	/* See if all cache siblings are idle */
	cpumask_t cache_siblings;
#endif /* CONFIG_SCHED_MC */
	u64 last_niffy;		/* Last time this RQ updated grq.niffies */
#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	u64 prev_irq_time;
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_PARAVIRT
	u64 prev_steal_time;
#endif /* CONFIG_PARAVIRT */
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	u64 prev_steal_time_rq;
#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */

	u64 clock, old_clock, last_tick;
	u64 clock_task;
	bool dither;

#ifdef CONFIG_SCHEDSTATS

	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;
#endif /* CONFIG_SCHEDSTATS */

#ifdef CONFIG_SMP
	struct llist_head wake_list;
#endif
};

#ifdef CONFIG_SMP
struct rq *cpu_rq(int cpu);
#endif

static inline u64 rq_clock(struct rq *rq)
{
	return rq->clock;
}

static inline u64 rq_clock_task(struct rq *rq)
{
	return rq->clock_task;
}

#define rcu_dereference_check_sched_domain(p) \
	rcu_dereference_check((p), \
			      lockdep_is_held(&sched_domains_mutex))

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#endif

-- a/kernel/sched/stats.c
++ b/kernel/sched/stats.c
Lines 4-10
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>

+#ifndef CONFIG_SCHED_BFS
 #include "sched.h"
+#else
+#include "bfs_sched.h"
+#endif

 /*
  * bump this up when changing the output format or the meaning of an existing

-- a/include/linux/spinlock.h
++ b/include/linux/spinlock.h |
Lines 117-122
Link Here
|
117 |
#endif /*arch_spin_is_contended*/ |
117 |
#endif /*arch_spin_is_contended*/ |
118 |
#endif |
118 |
#endif |
119 |
|
119 |
|
|
|
120 |
#ifndef smp_mb__before_spinlock |
121 |
#define smp_mb__before_spinlock() smp_wmb() |
122 |
#endif |
123 |
|
120 |
/* The lock does not imply full memory barrier. */ |
124 |
/* The lock does not imply full memory barrier. */ |
121 |
#ifndef ARCH_HAS_SMP_MB_AFTER_LOCK |
125 |
#ifndef ARCH_HAS_SMP_MB_AFTER_LOCK |
122 |
static inline void smp_mb__after_lock(void) { smp_mb(); } |
126 |
static inline void smp_mb__after_lock(void) { smp_mb(); } |