diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4fdb669..89b72a9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -480,6 +480,7 @@ config SCHED_OMIT_FRAME_POINTER menuconfig PARAVIRT_GUEST bool "Paravirtualized guest support" + depends on !IPIPE ---help--- Say Y here to get to see options related to running Linux under various hypervisors. This option alone does not add any kernel code. @@ -531,6 +532,7 @@ source "arch/x86/lguest/Kconfig" config PARAVIRT bool "Enable paravirtualization code" + depends on !IPIPE ---help--- This changes the kernel so it can modify itself when it is run under a hypervisor, potentially improving performance significantly @@ -750,6 +752,8 @@ config SCHED_MC source "kernel/Kconfig.preempt" +source "kernel/ipipe/Kconfig" + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on X86_32 && !SMP && !X86_32_NON_STANDARD diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 474d80d..0b33b55 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -404,7 +404,13 @@ static inline u32 safe_apic_wait_icr_idle(void) } +#ifdef CONFIG_IPIPE +#define ack_APIC_irq() do { } while(0) +static inline void __ack_APIC_irq(void) +#else /* !CONFIG_IPIPE */ +#define __ack_APIC_irq() ack_APIC_irq() static inline void ack_APIC_irq(void) +#endif /* CONFIG_IPIPE */ { #ifdef CONFIG_X86_LOCAL_APIC /* diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 3b62da9..855534f 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -143,6 +143,7 @@ # define MAX_LOCAL_APIC 32768 #endif +#ifndef __ASSEMBLY__ /* * All x86-64 systems are xAPIC compatible. * In the following, "apicid" is a physical APIC ID. @@ -418,4 +419,7 @@ struct local_apic { #else #define BAD_APICID 0xFFFFu #endif + +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index f5693c8..b45303a 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -22,6 +22,7 @@ BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1, smp_invalidate_interrupt) BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2, smp_invalidate_interrupt) +#ifndef CONFIG_IPIPE BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3, smp_invalidate_interrupt) BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4, @@ -32,6 +33,7 @@ BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6, smp_invalidate_interrupt) BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, smp_invalidate_interrupt) +#endif /* !CONFIG_IPIPE */ #endif BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index ba180d9..6a7c6bc 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -35,6 +35,13 @@ extern void spurious_interrupt(void); extern void thermal_interrupt(void); extern void reschedule_interrupt(void); extern void mce_self_interrupt(void); +#ifdef CONFIG_IPIPE +void ipipe_ipi0(void); +void ipipe_ipi1(void); +void ipipe_ipi2(void); +void ipipe_ipi3(void); +void ipipe_ipiX(void); +#endif extern void invalidate_interrupt(void); extern void invalidate_interrupt0(void); @@ -115,6 +122,7 @@ extern void smp_invalidate_interrupt(struct pt_regs *); #else extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif +extern asmlinkage void 
smp_reboot_interrupt(void); #endif extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 0b20bbb..b8a7638 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -289,11 +289,14 @@ static inline void __clear_fpu(struct task_struct *tsk) static inline void kernel_fpu_begin(void) { struct thread_info *me = current_thread_info(); + unsigned long flags; preempt_disable(); + local_irq_save_hw_cond(flags); if (me->status & TS_USEDFPU) __save_init_fpu(me->task); else clts(); + local_irq_restore_hw_cond(flags); } static inline void kernel_fpu_end(void) diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 58d7091..ac8bd15 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -24,7 +24,7 @@ extern unsigned int cached_irq_mask; #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 -extern spinlock_t i8259A_lock; +extern ipipe_spinlock_t i8259A_lock; extern void init_8259A(int auto_eoi); extern void enable_8259A_irq(unsigned int irq); diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 0b72282..6574056 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -68,6 +68,9 @@ __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest * to the APIC. */ unsigned int cfg; + unsigned long flags; + + local_irq_save_hw(flags); /* * Wait for idle. @@ -83,6 +86,8 @@ __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest * Send the IPI. The write to APIC_ICR fires this off. */ native_apic_mem_write(APIC_ICR, cfg); + + local_irq_restore_hw(flags); } /* diff --git a/arch/x86/include/asm/ipipe.h b/arch/x86/include/asm/ipipe.h new file mode 100644 index 0000000..7d29f03 --- /dev/null +++ b/arch/x86/include/asm/ipipe.h @@ -0,0 +1,158 @@ +/* -*- linux-c -*- + * arch/x86/include/asm/ipipe.h + * + * Copyright (C) 2007 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __X86_IPIPE_H +#define __X86_IPIPE_H + +#ifdef CONFIG_IPIPE + +#ifndef IPIPE_ARCH_STRING +#define IPIPE_ARCH_STRING "2.6-03" +#define IPIPE_MAJOR_NUMBER 2 +#define IPIPE_MINOR_NUMBER 6 +#define IPIPE_PATCH_NUMBER 3 +#endif + +DECLARE_PER_CPU(struct pt_regs, __ipipe_tick_regs); + +DECLARE_PER_CPU(unsigned long, __ipipe_cr2); + +static inline unsigned __ipipe_get_irq_vector(int irq) +{ +#ifdef CONFIG_X86_IO_APIC + unsigned __ipipe_get_ioapic_irq_vector(int irq); + return __ipipe_get_ioapic_irq_vector(irq); +#elif defined(CONFIG_X86_LOCAL_APIC) + return irq >= IPIPE_FIRST_APIC_IRQ && irq < IPIPE_NR_XIRQS ? 
+ ipipe_apic_irq_vector(irq) : irq + IRQ0_VECTOR; +#else + return irq + IRQ0_VECTOR; +#endif +} + +#ifdef CONFIG_X86_32 +# include "ipipe_32.h" +#else +# include "ipipe_64.h" +#endif + +/* + * The logical processor id and the current Linux task are read from the PDA, + * so this is always safe, regardless of the underlying stack. + */ +#define ipipe_processor_id() raw_smp_processor_id() +#define ipipe_safe_current() current + +#define prepare_arch_switch(next) \ +do { \ + ipipe_schedule_notify(current, next); \ + local_irq_disable_hw(); \ +} while(0) + +#define task_hijacked(p) \ + ({ int x = __ipipe_root_domain_p; \ + __clear_bit(IPIPE_SYNC_FLAG, &ipipe_root_cpudom_var(status)); \ + if (x) local_irq_enable_hw(); !x; }) + +struct ipipe_domain; + +struct ipipe_sysinfo { + + int ncpus; /* Number of CPUs on board */ + u64 cpufreq; /* CPU frequency (in Hz) */ + + /* Arch-dependent block */ + + struct { + unsigned tmirq; /* Timer tick IRQ */ + u64 tmfreq; /* Timer frequency */ + } archdep; +}; + +/* Private interface -- Internal use only */ + +#define __ipipe_check_platform() do { } while(0) +#define __ipipe_init_platform() do { } while(0) +#define __ipipe_enable_irq(irq) irq_to_desc(irq)->chip->enable(irq) +#define __ipipe_disable_irq(irq) irq_to_desc(irq)->chip->disable(irq) + +#ifdef CONFIG_SMP +void __ipipe_hook_critical_ipi(struct ipipe_domain *ipd); +#else +#define __ipipe_hook_critical_ipi(ipd) do { } while(0) +#endif + +#define __ipipe_disable_irqdesc(ipd, irq) do { } while(0) + +void __ipipe_enable_irqdesc(struct ipipe_domain *ipd, unsigned irq); + +void __ipipe_enable_pipeline(void); + +void __ipipe_do_critical_sync(unsigned irq, void *cookie); + +void __ipipe_serial_debug(const char *fmt, ...); + +extern int __ipipe_tick_irq; + +#ifdef CONFIG_X86_LOCAL_APIC +#define ipipe_update_tick_evtdev(evtdev) \ + do { \ + if (strcmp((evtdev)->name, "lapic") == 0) \ + __ipipe_tick_irq = \ + ipipe_apic_vector_irq(LOCAL_TIMER_VECTOR); \ + else \ + __ipipe_tick_irq = 0; \ + } while (0) +#else +#define ipipe_update_tick_evtdev(evtdev) \ + __ipipe_tick_irq = 0 +#endif + +int __ipipe_check_lapic(void); + +int __ipipe_check_tickdev(const char *devname); + +#define __ipipe_syscall_watched_p(p, sc) \ + (((p)->flags & PF_EVNOTIFY) || (unsigned long)sc >= NR_syscalls) + +#define __ipipe_root_tick_p(regs) ((regs)->flags & X86_EFLAGS_IF) + +#else /* !CONFIG_IPIPE */ + +#define ipipe_update_tick_evtdev(evtdev) do { } while (0) +#define task_hijacked(p) 0 + +#endif /* CONFIG_IPIPE */ + +#if defined(CONFIG_SMP) && defined(CONFIG_IPIPE) +#define __ipipe_move_root_irq(irq) \ + do { \ + if (irq < NR_IRQS) { \ + struct irq_chip *chip = irq_to_desc(irq)->chip; \ + if (chip->move) \ + chip->move(irq); \ + } \ + } while (0) +#else /* !(CONFIG_SMP && CONFIG_IPIPE) */ +#define __ipipe_move_root_irq(irq) do { } while (0) +#endif /* !(CONFIG_SMP && CONFIG_IPIPE) */ + +#endif /* !__X86_IPIPE_H */ diff --git a/arch/x86/include/asm/ipipe_32.h b/arch/x86/include/asm/ipipe_32.h new file mode 100644 index 0000000..8d1f4b5 --- /dev/null +++ b/arch/x86/include/asm/ipipe_32.h @@ -0,0 +1,156 @@ +/* -*- linux-c -*- + * arch/x86/include/asm/ipipe_32.h + * + * Copyright (C) 2002-2005 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __X86_IPIPE_32_H +#define __X86_IPIPE_32_H + +#include +#include +#include +#include +#include + +#define ipipe_read_tsc(t) __asm__ __volatile__("rdtsc" : "=A" (t)) +#define ipipe_cpu_freq() ({ unsigned long long __freq = cpu_has_tsc?(1000LL * cpu_khz):CLOCK_TICK_RATE; __freq; }) + +#define ipipe_tsc2ns(t) \ +({ \ + unsigned long long delta = (t)*1000; \ + do_div(delta, cpu_khz/1000+1); \ + (unsigned long)delta; \ +}) + +#define ipipe_tsc2us(t) \ +({ \ + unsigned long long delta = (t); \ + do_div(delta, cpu_khz/1000+1); \ + (unsigned long)delta; \ +}) + +/* Private interface -- Internal use only */ + +int __ipipe_handle_irq(struct pt_regs *regs); + +static inline unsigned long __ipipe_ffnz(unsigned long ul) +{ + __asm__("bsrl %1, %0":"=r"(ul) + : "r"(ul)); + return ul; +} + +struct irq_desc; + +void __ipipe_ack_edge_irq(unsigned irq, struct irq_desc *desc); + +void __ipipe_end_edge_irq(unsigned irq, struct irq_desc *desc); + +static inline void __ipipe_call_root_xirq_handler(unsigned irq, + ipipe_irq_handler_t handler) +{ + struct pt_regs *regs = &__raw_get_cpu_var(__ipipe_tick_regs); + + regs->orig_ax = ~__ipipe_get_irq_vector(irq); + + __asm__ __volatile__("pushfl\n\t" + "pushl %%cs\n\t" + "pushl $__xirq_end\n\t" + "pushl %%eax\n\t" + "pushl %%gs\n\t" + "pushl %%fs\n\t" + "pushl %%es\n\t" + "pushl %%ds\n\t" + "pushl %%eax\n\t" + "pushl %%ebp\n\t" + "pushl %%edi\n\t" + "pushl %%esi\n\t" + "pushl %%edx\n\t" + "pushl %%ecx\n\t" + "pushl %%ebx\n\t" + "movl %2,%%eax\n\t" + "call *%1\n\t" + "jmp ret_from_intr\n\t" + "__xirq_end: cli\n" + : /* no output */ + : "a" (~irq), "r" (handler), "rm" (regs)); +} + +void irq_enter(void); +void irq_exit(void); + +static inline void __ipipe_call_root_virq_handler(unsigned irq, + ipipe_irq_handler_t handler, + void *cookie) +{ + irq_enter(); + __asm__ __volatile__("pushfl\n\t" + "pushl %%cs\n\t" + "pushl $__virq_end\n\t" + "pushl $-1\n\t" + "pushl %%gs\n\t" + "pushl %%fs\n\t" + "pushl %%es\n\t" + "pushl %%ds\n\t" + "pushl %%eax\n\t" + "pushl %%ebp\n\t" + "pushl %%edi\n\t" + "pushl %%esi\n\t" + "pushl %%edx\n\t" + "pushl %%ecx\n\t" + "pushl %%ebx\n\t" + "pushl %2\n\t" + "pushl %%eax\n\t" + "call *%1\n\t" + "addl $8,%%esp\n" + : /* no output */ + : "a" (irq), "r" (handler), "d" (cookie)); + irq_exit(); + __asm__ __volatile__("jmp ret_from_intr\n\t" + "__virq_end: cli\n" + : /* no output */ + : /* no input */); +} + +/* + * When running handlers, enable hw interrupts for all domains but the + * one heading the pipeline, so that IRQs can never be significantly + * deferred for the latter. 
+ */ +#define __ipipe_run_isr(ipd, irq) \ +do { \ + if (!__ipipe_pipeline_head_p(ipd)) \ + local_irq_enable_hw(); \ + if (ipd == ipipe_root_domain) { \ + if (likely(!ipipe_virtual_irq_p(irq))) \ + __ipipe_call_root_xirq_handler(irq, \ + ipd->irqs[irq].handler); \ + else \ + __ipipe_call_root_virq_handler(irq, \ + ipd->irqs[irq].handler, \ + ipd->irqs[irq].cookie); \ + } else { \ + __clear_bit(IPIPE_SYNC_FLAG, &ipipe_cpudom_var(ipd, status)); \ + ipd->irqs[irq].handler(irq, ipd->irqs[irq].cookie); \ + __set_bit(IPIPE_SYNC_FLAG, &ipipe_cpudom_var(ipd, status)); \ + } \ + local_irq_disable_hw(); \ +} while(0) + +#endif /* !__X86_IPIPE_32_H */ diff --git a/arch/x86/include/asm/ipipe_64.h b/arch/x86/include/asm/ipipe_64.h new file mode 100644 index 0000000..bc427b8 --- /dev/null +++ b/arch/x86/include/asm/ipipe_64.h @@ -0,0 +1,161 @@ +/* -*- linux-c -*- + * arch/x86/include/asm/ipipe_64.h + * + * Copyright (C) 2007 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __X86_IPIPE_64_H +#define __X86_IPIPE_64_H + +#include +#include +#include +#include +#include +#ifdef CONFIG_SMP +#include +#include +#endif + +#define ipipe_read_tsc(t) do { \ + unsigned int __a,__d; \ + asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ + (t) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ +} while(0) + +extern unsigned cpu_khz; +#define ipipe_cpu_freq() ({ unsigned long __freq = (1000UL * cpu_khz); __freq; }) +#define ipipe_tsc2ns(t) (((t) * 1000UL) / (ipipe_cpu_freq() / 1000000UL)) +#define ipipe_tsc2us(t) ((t) / (ipipe_cpu_freq() / 1000000UL)) + +/* Private interface -- Internal use only */ + +int __ipipe_handle_irq(struct pt_regs *regs); + +static inline unsigned long __ipipe_ffnz(unsigned long ul) +{ + __asm__("bsrq %1, %0":"=r"(ul) + : "rm"(ul)); + return ul; +} + +struct irq_desc; + +void __ipipe_ack_edge_irq(unsigned irq, struct irq_desc *desc); + +void __ipipe_end_edge_irq(unsigned irq, struct irq_desc *desc); + +static inline void __ipipe_call_root_xirq_handler(unsigned irq, + void (*handler)(unsigned, void *)) +{ + struct pt_regs *regs = &__raw_get_cpu_var(__ipipe_tick_regs); + + regs->orig_ax = ~__ipipe_get_irq_vector(irq); + + __asm__ __volatile__("movq %%rsp, %%rax\n\t" + "pushq $0\n\t" + "pushq %%rax\n\t" + "pushfq\n\t" + "pushq %[kernel_cs]\n\t" + "pushq $__xirq_end\n\t" + "pushq %[vector]\n\t" + "subq $9*8,%%rsp\n\t" + "movq %%rdi,8*8(%%rsp)\n\t" + "movq %%rsi,7*8(%%rsp)\n\t" + "movq %%rdx,6*8(%%rsp)\n\t" + "movq %%rcx,5*8(%%rsp)\n\t" + "movq %%rax,4*8(%%rsp)\n\t" + "movq %%r8,3*8(%%rsp)\n\t" + "movq %%r9,2*8(%%rsp)\n\t" + "movq %%r10,1*8(%%rsp)\n\t" + "movq %%r11,(%%rsp)\n\t" + "call *%[handler]\n\t" + "cli\n\t" + "jmp exit_intr\n\t" + "__xirq_end: cli\n" + : /* no output */ + : [kernel_cs] "i" (__KERNEL_CS), + [vector] "rm" (regs->orig_ax), + 
[handler] "r" (handler), "D" (regs) + : "rax"); +} + +void irq_enter(void); +void irq_exit(void); + +static inline void __ipipe_call_root_virq_handler(unsigned irq, + void (*handler)(unsigned, void *), + void *cookie) +{ + irq_enter(); + __asm__ __volatile__("movq %%rsp, %%rax\n\t" + "pushq $0\n\t" + "pushq %%rax\n\t" + "pushfq\n\t" + "pushq %[kernel_cs]\n\t" + "pushq $__virq_end\n\t" + "pushq $-1\n\t" + "subq $9*8,%%rsp\n\t" + "movq %%rdi,8*8(%%rsp)\n\t" + "movq %%rsi,7*8(%%rsp)\n\t" + "movq %%rdx,6*8(%%rsp)\n\t" + "movq %%rcx,5*8(%%rsp)\n\t" + "movq %%rax,4*8(%%rsp)\n\t" + "movq %%r8,3*8(%%rsp)\n\t" + "movq %%r9,2*8(%%rsp)\n\t" + "movq %%r10,1*8(%%rsp)\n\t" + "movq %%r11,(%%rsp)\n\t" + "call *%[handler]\n\t" + : /* no output */ + : [kernel_cs] "i" (__KERNEL_CS), + [handler] "r" (handler), "D" (irq), "S" (cookie) + : "rax"); + irq_exit(); + __asm__ __volatile__("cli\n\t" + "jmp exit_intr\n\t" + "__virq_end: cli\n" + : /* no output */ + : /* no input */); +} + +/* + * When running handlers, enable hw interrupts for all domains but the + * one heading the pipeline, so that IRQs can never be significantly + * deferred for the latter. + */ +#define __ipipe_run_isr(ipd, irq) \ + do { \ + if (!__ipipe_pipeline_head_p(ipd)) \ + local_irq_enable_hw(); \ + if (ipd == ipipe_root_domain) { \ + if (likely(!ipipe_virtual_irq_p(irq))) \ + __ipipe_call_root_xirq_handler( \ + irq, (ipd)->irqs[irq].handler); \ + else \ + __ipipe_call_root_virq_handler( \ + irq, (ipd)->irqs[irq].handler, \ + (ipd)->irqs[irq].cookie); \ + } else { \ + __clear_bit(IPIPE_SYNC_FLAG, &ipipe_cpudom_var(ipd, status)); \ + ipd->irqs[irq].handler(irq, ipd->irqs[irq].cookie); \ + __set_bit(IPIPE_SYNC_FLAG, &ipipe_cpudom_var(ipd, status)); \ + } \ + local_irq_disable_hw(); \ + } while(0) + +#endif /* !__X86_IPIPE_64_H */ diff --git a/arch/x86/include/asm/ipipe_base.h b/arch/x86/include/asm/ipipe_base.h new file mode 100644 index 0000000..1098d6f --- /dev/null +++ b/arch/x86/include/asm/ipipe_base.h @@ -0,0 +1,210 @@ +/* -*- linux-c -*- + * arch/x86/include/asm/ipipe_base.h + * + * Copyright (C) 2007-2009 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __X86_IPIPE_BASE_H +#define __X86_IPIPE_BASE_H + +#include +#include +#include + +#ifdef CONFIG_X86_32 +#define IPIPE_NR_FAULTS 33 /* 32 from IDT + iret_error */ +#else +#define IPIPE_NR_FAULTS 32 +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) +/* + * System interrupts are mapped beyond the last defined external IRQ + * number. 
+ */ +#define IPIPE_NR_XIRQS (NR_IRQS + 32) +#define IPIPE_FIRST_APIC_IRQ NR_IRQS +#define IPIPE_SERVICE_VECTOR0 (INVALIDATE_TLB_VECTOR_END + 1) +#define IPIPE_SERVICE_IPI0 ipipe_apic_vector_irq(IPIPE_SERVICE_VECTOR0) +#define IPIPE_SERVICE_VECTOR1 (INVALIDATE_TLB_VECTOR_END + 2) +#define IPIPE_SERVICE_IPI1 ipipe_apic_vector_irq(IPIPE_SERVICE_VECTOR1) +#define IPIPE_SERVICE_VECTOR2 (INVALIDATE_TLB_VECTOR_END + 3) +#define IPIPE_SERVICE_IPI2 ipipe_apic_vector_irq(IPIPE_SERVICE_VECTOR2) +#define IPIPE_SERVICE_VECTOR3 (INVALIDATE_TLB_VECTOR_END + 4) +#define IPIPE_SERVICE_IPI3 ipipe_apic_vector_irq(IPIPE_SERVICE_VECTOR3) +#ifdef CONFIG_SMP +#define IPIPE_CRITICAL_VECTOR (INVALIDATE_TLB_VECTOR_END + 5) +#define IPIPE_CRITICAL_IPI ipipe_apic_vector_irq(IPIPE_CRITICAL_VECTOR) +#endif +#define ipipe_apic_irq_vector(irq) ((irq) - IPIPE_FIRST_APIC_IRQ + FIRST_SYSTEM_VECTOR) +#define ipipe_apic_vector_irq(vec) ((vec) - FIRST_SYSTEM_VECTOR + IPIPE_FIRST_APIC_IRQ) +#else /* !(CONFIG_X86_64 || CONFIG_X86_LOCAL_APIC) */ +#define IPIPE_NR_XIRQS NR_IRQS +#endif /* !(CONFIG_X86_64 || CONFIG_X86_LOCAL_APIC) */ + +/* Pseudo-vectors used for kernel events */ +#define IPIPE_FIRST_EVENT IPIPE_NR_FAULTS +#define IPIPE_EVENT_SYSCALL (IPIPE_FIRST_EVENT) +#define IPIPE_EVENT_SCHEDULE (IPIPE_FIRST_EVENT + 1) +#define IPIPE_EVENT_SIGWAKE (IPIPE_FIRST_EVENT + 2) +#define IPIPE_EVENT_SETSCHED (IPIPE_FIRST_EVENT + 3) +#define IPIPE_EVENT_INIT (IPIPE_FIRST_EVENT + 4) +#define IPIPE_EVENT_EXIT (IPIPE_FIRST_EVENT + 5) +#define IPIPE_EVENT_CLEANUP (IPIPE_FIRST_EVENT + 6) +#define IPIPE_LAST_EVENT IPIPE_EVENT_CLEANUP +#define IPIPE_NR_EVENTS (IPIPE_LAST_EVENT + 1) + +#define ex_do_divide_error 0 +#define ex_do_debug 1 +/* NMI not pipelined. */ +#define ex_do_int3 3 +#define ex_do_overflow 4 +#define ex_do_bounds 5 +#define ex_do_invalid_op 6 +#define ex_do_device_not_available 7 +/* Double fault not pipelined. 
*/ +#define ex_do_coprocessor_segment_overrun 9 +#define ex_do_invalid_TSS 10 +#define ex_do_segment_not_present 11 +#define ex_do_stack_segment 12 +#define ex_do_general_protection 13 +#define ex_do_page_fault 14 +#define ex_do_spurious_interrupt_bug 15 +#define ex_do_coprocessor_error 16 +#define ex_do_alignment_check 17 +#define ex_machine_check_vector 18 +#define ex_reserved ex_machine_check_vector +#define ex_do_simd_coprocessor_error 19 +#define ex_do_iret_error 32 + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_SMP + +#include + +#ifdef CONFIG_X86_32 +#define GET_ROOT_STATUS_ADDR \ + "pushfl; cli;" \ + "movl %%fs:per_cpu__this_cpu_off, %%eax;" \ + "lea per_cpu__ipipe_percpu_darray(%%eax), %%eax;" +#define PUT_ROOT_STATUS_ADDR "popfl;" +#define TEST_AND_SET_ROOT_STATUS \ + "btsl $0,(%%eax);" +#define TEST_ROOT_STATUS \ + "btl $0,(%%eax);" +#define ROOT_TEST_CLOBBER_LIST "eax" +#else /* CONFIG_X86_64 */ +#define GET_ROOT_STATUS_ADDR \ + "pushfq; cli;" \ + "movq %%gs:per_cpu__this_cpu_off, %%rax;" \ + "lea per_cpu__ipipe_percpu_darray(%%rax), %%rax;" +#define PUT_ROOT_STATUS_ADDR "popfq;" +#define TEST_AND_SET_ROOT_STATUS \ + "btsl $0,(%%rax);" +#define TEST_ROOT_STATUS \ + "btl $0,(%%rax);" +#define ROOT_TEST_CLOBBER_LIST "rax" +#endif /* CONFIG_X86_64 */ + +static inline void __ipipe_stall_root(void) +{ + __asm__ __volatile__(GET_ROOT_STATUS_ADDR + LOCK_PREFIX + TEST_AND_SET_ROOT_STATUS + PUT_ROOT_STATUS_ADDR + : : : ROOT_TEST_CLOBBER_LIST, "memory"); +} + +static inline unsigned long __ipipe_test_and_stall_root(void) +{ + int oldbit; + + __asm__ __volatile__(GET_ROOT_STATUS_ADDR + LOCK_PREFIX + TEST_AND_SET_ROOT_STATUS + "sbbl %0,%0;" + PUT_ROOT_STATUS_ADDR + :"=r" (oldbit) + : : ROOT_TEST_CLOBBER_LIST, "memory"); + return oldbit; +} + +static inline unsigned long __ipipe_test_root(void) +{ + int oldbit; + + __asm__ __volatile__(GET_ROOT_STATUS_ADDR + TEST_ROOT_STATUS + "sbbl %0,%0;" + PUT_ROOT_STATUS_ADDR + :"=r" (oldbit) + : : ROOT_TEST_CLOBBER_LIST); + return oldbit; +} + +#else /* !CONFIG_SMP */ + +#if __GNUC__ >= 4 +/* Alias to ipipe_root_cpudom_var(status) */ +extern unsigned long __ipipe_root_status; +#else +extern unsigned long *const __ipipe_root_status_addr; +#define __ipipe_root_status (*__ipipe_root_status_addr) +#endif + +static inline void __ipipe_stall_root(void) +{ + volatile unsigned long *p = &__ipipe_root_status; + __asm__ __volatile__("btsl $0,%0;" + :"+m" (*p) : : "memory"); +} + +static inline unsigned long __ipipe_test_and_stall_root(void) +{ + volatile unsigned long *p = &__ipipe_root_status; + int oldbit; + + __asm__ __volatile__("btsl $0,%1;" + "sbbl %0,%0;" + :"=r" (oldbit), "+m" (*p) + : : "memory"); + return oldbit; +} + +static inline unsigned long __ipipe_test_root(void) +{ + volatile unsigned long *p = &__ipipe_root_status; + int oldbit; + + __asm__ __volatile__("btl $0,%1;" + "sbbl %0,%0;" + :"=r" (oldbit) + :"m" (*p)); + return oldbit; +} + +#endif /* !CONFIG_SMP */ + +void __ipipe_halt_root(void); + +void __ipipe_serial_debug(const char *fmt, ...); + +#endif /* !__ASSEMBLY__ */ + +#endif /* !__X86_IPIPE_BASE_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6e90a04..6178f92 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -91,10 +91,17 @@ #define THRESHOLD_APIC_VECTOR 0xf9 #define REBOOT_VECTOR 0xf8 +#ifdef CONFIG_IPIPE +/* f0-f2 used for TLB flush, f3-f7 reserved for the I-pipe */ +#define INVALIDATE_TLB_VECTOR_END 0xf2 +#define INVALIDATE_TLB_VECTOR_START 0xf0 
+#define NUM_INVALIDATE_TLB_VECTORS 3
+#else /* !CONFIG_IPIPE */
 /* f0-f7 used for spreading out TLB flushes: */
 #define INVALIDATE_TLB_VECTOR_END 0xf7
 #define INVALIDATE_TLB_VECTOR_START 0xf0
 #define NUM_INVALIDATE_TLB_VECTORS 8
+#endif

 /*
  * Local APIC timer IRQ vector is on a different priority level,
@@ -120,6 +127,9 @@
  */
 #define MCE_SELF_VECTOR 0xeb

+/* I-pipe: Lowest number of vectors above */
+#define FIRST_SYSTEM_VECTOR 0xea
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 9e2b952..0d8d5e5 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -4,6 +4,10 @@
 #include

 #ifndef __ASSEMBLY__
+
+#include
+#include
+
 /*
  * Interrupt control:
  */
@@ -12,6 +16,10 @@ static inline unsigned long native_save_fl(void)
 {
	unsigned long flags;

+#ifdef CONFIG_IPIPE
+	flags = (!__ipipe_test_root()) << 9;
+	barrier();
+#else
	/*
	 * "=rm" is safe here, because "pop" adjusts the stack before
	 * it evaluates its effective address -- this is part of the
@@ -22,31 +30,53 @@ static inline unsigned long native_save_fl(void)
		     : "=rm" (flags)
		     : /* no input */
		     : "memory");
+#endif

	return flags;
 }

 static inline void native_restore_fl(unsigned long flags)
 {
+#ifdef CONFIG_IPIPE
+	barrier();
+	__ipipe_restore_root(!(flags & X86_EFLAGS_IF));
+#else
	asm volatile("push %0 ; popf"
		     : /* no output */
		     :"g" (flags)
		     :"memory", "cc");
+#endif
 }

 static inline void native_irq_disable(void)
 {
+#ifdef CONFIG_IPIPE
+	ipipe_check_context(ipipe_root_domain);
+	__ipipe_stall_root();
+	barrier();
+#else
	asm volatile("cli": : :"memory");
+#endif
 }

 static inline void native_irq_enable(void)
 {
+#ifdef CONFIG_IPIPE
+	barrier();
+	__ipipe_unstall_root();
+#else
	asm volatile("sti": : :"memory");
+#endif
 }

 static inline void native_safe_halt(void)
 {
+#ifdef CONFIG_IPIPE
+	barrier();
+	__ipipe_halt_root();
+#else
	asm volatile("sti; hlt": : :"memory");
+#endif
 }

 static inline void native_halt(void)
@@ -71,6 +101,71 @@ static inline void raw_local_irq_restore(unsigned long flags)
	native_restore_fl(flags);
 }
+static inline unsigned long raw_mangle_irq_bits(int virt, unsigned long real)
+{
+	/*
+	 * Merge virtual and real interrupt mask bits into a single
+	 * (32bit) word.
+	 */
+	return (real & ~(1L << 31)) | ((virt != 0) << 31);
+}
+
+static inline int raw_demangle_irq_bits(unsigned long *x)
+{
+	int virt = (*x & (1L << 31)) != 0;
+	*x &= ~(1L << 31);
+	return virt;
+}
+
+#define local_irq_save_hw_notrace(x) \
+	__asm__ __volatile__("pushf ; pop %0 ; cli":"=g" (x): /* no input */ :"memory")
+#define local_irq_restore_hw_notrace(x) \
+	__asm__ __volatile__("push %0 ; popf": /* no output */ :"g" (x):"memory", "cc")
+
+#define local_save_flags_hw(x) __asm__ __volatile__("pushf ; pop %0":"=g" (x): /* no input */)
+
+#define irqs_disabled_hw() \
+	({ \
+		unsigned long x; \
+		local_save_flags_hw(x); \
+		!((x) & X86_EFLAGS_IF); \
+	})
+
+#ifdef CONFIG_IPIPE_TRACE_IRQSOFF
+#define local_irq_disable_hw() do { \
+		if (!irqs_disabled_hw()) { \
+			local_irq_disable_hw_notrace(); \
+			ipipe_trace_begin(0x80000000); \
+		} \
+	} while (0)
+#define local_irq_enable_hw() do { \
+		if (irqs_disabled_hw()) { \
+			ipipe_trace_end(0x80000000); \
+			local_irq_enable_hw_notrace(); \
+		} \
+	} while (0)
+#define local_irq_save_hw(x) do { \
+		local_save_flags_hw(x); \
+		if ((x) & X86_EFLAGS_IF) { \
+			local_irq_disable_hw_notrace(); \
+			ipipe_trace_begin(0x80000001); \
+		} \
+	} while (0)
+#define local_irq_restore_hw(x) do { \
+		if ((x) & X86_EFLAGS_IF) \
+			ipipe_trace_end(0x80000001); \
+		local_irq_restore_hw_notrace(x); \
+	} while (0)
+#else /* !CONFIG_IPIPE_TRACE_IRQSOFF */
+#define local_irq_save_hw(x) local_irq_save_hw_notrace(x)
+#define local_irq_restore_hw(x) local_irq_restore_hw_notrace(x)
+#define local_irq_enable_hw() local_irq_enable_hw_notrace()
+#define local_irq_disable_hw() local_irq_disable_hw_notrace()
+#endif /* CONFIG_IPIPE_TRACE_IRQSOFF */
+
+#define local_irq_disable_hw_notrace() __asm__ __volatile__("cli": : :"memory")
+#define local_irq_enable_hw_notrace() __asm__ __volatile__("sti": : :"memory")
+
 static inline void raw_local_irq_disable(void)
 {
	native_irq_disable();
 }
@@ -104,16 +199,40 @@ static inline void halt(void)
  */
 static inline unsigned long __raw_local_irq_save(void)
 {
+#ifdef CONFIG_IPIPE
+	unsigned long flags = (!__ipipe_test_and_stall_root()) << 9;
+	barrier();
+#else
	unsigned long flags = __raw_local_save_flags();

	raw_local_irq_disable();

+#endif
	return flags;
 }
 #else

-#define ENABLE_INTERRUPTS(x) sti
-#define DISABLE_INTERRUPTS(x) cli
+#ifdef CONFIG_IPIPE
+#ifdef CONFIG_X86_32
+#define DISABLE_INTERRUPTS(clobbers) PER_CPU(ipipe_percpu_darray, %eax); btsl $0,(%eax); sti
+#define ENABLE_INTERRUPTS(clobbers) call __ipipe_unstall_root
+#else /* CONFIG_X86_64 */
+/* Not worth virtualizing in x86_64 mode.
*/ +#define DISABLE_INTERRUPTS(clobbers) cli +#define ENABLE_INTERRUPTS(clobbers) sti +#endif /* CONFIG_X86_64 */ +#define ENABLE_INTERRUPTS_HW_COND sti +#define DISABLE_INTERRUPTS_HW_COND cli +#define DISABLE_INTERRUPTS_HW(clobbers) cli +#define ENABLE_INTERRUPTS_HW(clobbers) sti +#else /* !CONFIG_IPIPE */ +#define ENABLE_INTERRUPTS(x) sti +#define DISABLE_INTERRUPTS(x) cli +#define ENABLE_INTERRUPTS_HW_COND +#define DISABLE_INTERRUPTS_HW_COND +#define DISABLE_INTERRUPTS_HW(clobbers) DISABLE_INTERRUPTS(clobbers) +#define ENABLE_INTERRUPTS_HW(clobbers) ENABLE_INTERRUPTS(clobbers) +#endif /* !CONFIG_IPIPE */ #ifdef CONFIG_X86_64 #define SWAPGS swapgs @@ -156,8 +275,10 @@ static inline unsigned long __raw_local_irq_save(void) #define raw_local_save_flags(flags) \ do { (flags) = __raw_local_save_flags(); } while (0) -#define raw_local_irq_save(flags) \ - do { (flags) = __raw_local_irq_save(); } while (0) +#define raw_local_irq_save(flags) do { \ + ipipe_check_context(ipipe_root_domain); \ + (flags) = __raw_local_irq_save(); \ + } while (0) static inline int raw_irqs_disabled_flags(unsigned long flags) { diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4a2d4e0..1ee45d4 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -30,11 +30,14 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) #endif } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +static inline void __switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { unsigned cpu = smp_processor_id(); +#ifdef CONFIG_IPIPE_DEBUG_INTERNAL + WARN_ON_ONCE(!irqs_disabled_hw()); +#endif if (likely(prev != next)) { /* stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); @@ -70,10 +73,23 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, #endif } +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + local_irq_save_hw_cond(flags); + __switch_mm(prev, next, tsk); + local_irq_restore_hw_cond(flags); +} + +#define ipipe_mm_switch_protect(flags) local_irq_save_hw_cond(flags) +#define ipipe_mm_switch_unprotect(flags) \ + local_irq_restore_hw_cond(flags) + #define activate_mm(prev, next) \ do { \ paravirt_activate_mm((prev), (next)); \ - switch_mm((prev), (next), NULL); \ + __switch_mm((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 139d4c1..3914d19 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -29,7 +29,7 @@ extern void setup_apic_nmi_watchdog(void *); extern void stop_apic_nmi_watchdog(void *); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); -extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason); +extern int (*nmi_watchdog_tick)(struct pt_regs *regs, unsigned reason); extern void cpu_nmi_set_wd_enabled(void); extern atomic_t nmi_active; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 13b1885..3e80c19 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -435,6 +435,7 @@ struct thread_struct { unsigned short ds; unsigned short fsindex; unsigned short gsindex; + unsigned long rip; #endif #ifdef CONFIG_X86_32 unsigned long ip; diff --git a/arch/x86/include/asm/system.h 
b/arch/x86/include/asm/system.h index f08f973..093687e 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -126,8 +126,10 @@ do { \ #define switch_to(prev, next, last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq $thread_return,%P[threadrip](%[prev])\n\t" /* save RIP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ + "pushq %P[threadrip](%[next])\n\t" /* restore RIP */ \ + "jmp __switch_to\n\t" \ ".globl thread_return\n" \ "thread_return:\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ @@ -141,6 +143,7 @@ do { \ __switch_canary_oparam \ : [next] "S" (next), [prev] "D" (prev), \ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [threadrip] "i" (offsetof(struct task_struct, thread.rip)), \ [ti_flags] "i" (offsetof(struct thread_info, flags)), \ [_tif_fork] "i" (_TIF_FORK), \ [thread_info] "i" (offsetof(struct task_struct, stack)), \ @@ -305,8 +308,13 @@ static inline void native_wbinvd(void) #else #define read_cr0() (native_read_cr0()) #define write_cr0(x) (native_write_cr0(x)) +#ifdef CONFIG_IPIPE +#define read_cr2() __raw_get_cpu_var(__ipipe_cr2) +#define write_cr2(x) __raw_get_cpu_var(__ipipe_cr2) = (x) +#else /* !CONFIG_IPIPE */ #define read_cr2() (native_read_cr2()) #define write_cr2(x) (native_write_cr2(x)) +#endif /* !CONFIG_IPIPE */ #define read_cr3() (native_read_cr3()) #define write_cr3(x) (native_write_cr3(x)) #define read_cr4() (native_read_cr4()) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 4da91ad..25e346e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -82,8 +82,8 @@ extern int panic_on_unrecovered_nmi; void math_error(void __user *); void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 -asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif +asmlinkage void smp_thermal_interrupt(void); #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d8e5d0c..847cc01 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_IPIPE) += ipipe.o obj-$(CONFIG_HPET_TIMER) += hpet.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0e69e17..47586ca 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -446,7 +446,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, if (evt->features & CLOCK_EVT_FEAT_DUMMY) return; - local_irq_save(flags); + local_irq_save_hw(flags); switch (mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -466,7 +466,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, break; } - local_irq_restore(flags); + local_irq_restore_hw(flags); } /* @@ -982,7 +982,7 @@ void lapic_shutdown(void) if (!cpu_has_apic && !apic_from_smp_config()) return; - local_irq_save(flags); + local_irq_save_hw(flags); #ifdef CONFIG_X86_32 if (!enabled_via_apicbase) @@ -992,7 +992,7 @@ void lapic_shutdown(void) disable_local_APIC(); - local_irq_restore(flags); + local_irq_restore_hw(flags); } /* @@ -1166,6 +1166,10 @@ static void __cpuinit lapic_setup_esr(void) oldvalue, value); } +int __ipipe_check_lapic(void) +{ + return !(lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY); +} /** * setup_local_APIC - setup the local APIC @@ -1229,7 +1233,7 @@ 
void __cpuinit setup_local_APIC(void) value = apic_read(APIC_ISR + i*0x10); for (j = 31; j >= 0; j--) { if (value & (1<> 1)); if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); + __ack_APIC_irq(); inc_irq_stat(irq_spurious_count); @@ -2004,13 +2008,13 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif - local_irq_save(flags); + local_irq_save_hw(flags); disable_local_APIC(); if (intr_remapping_enabled) disable_intr_remapping(); - local_irq_restore(flags); + local_irq_restore_hw(flags); return 0; } @@ -2025,7 +2029,7 @@ static int lapic_resume(struct sys_device *dev) if (!apic_pm_state.active) return 0; - local_irq_save(flags); + local_irq_save_hw(flags); if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { @@ -2091,7 +2095,7 @@ static int lapic_resume(struct sys_device *dev) free_ioapic_entries(ioapic_entries); } restore: - local_irq_restore(flags); + local_irq_restore_hw(flags); return ret; } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 873f81f..aada533 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -72,9 +72,9 @@ static inline void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; - local_irq_save(flags); + local_irq_save_hw(flags); __default_send_IPI_dest_field(mask, vector, apic->dest_logical); - local_irq_restore(flags); + local_irq_restore_hw(flags); } static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index dc4f486..2ed892b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -75,8 +75,11 @@ */ int sis_apic_bug = -1; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static IPIPE_DEFINE_SPINLOCK(ioapic_lock); +static IPIPE_DEFINE_SPINLOCK(vector_lock); +#ifdef CONFIG_IPIPE +unsigned long bugous_edge_irq_triggers[(NR_IRQS + BITS_PER_LONG - 1) / BITS_PER_LONG]; +#endif /* * # of IRQ routing registers @@ -417,6 +420,8 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } +#if !defined(CONFIG_IPIPE) || defined(CONFIG_SMP) + static bool io_apic_level_ack_pending(struct irq_cfg *cfg) { struct irq_pin_list *entry; @@ -440,6 +445,8 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) return false; } +#endif /* !CONFIG_IPIPE || CONFIG_SMP */ + union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -615,6 +622,7 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) BUG_ON(!cfg); spin_lock_irqsave(&ioapic_lock, flags); + ipipe_irq_lock(desc->irq); __mask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -625,7 +633,13 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); +#ifdef CONFIG_IPIPE + if (test_and_clear_bit(desc->irq, &bugous_edge_irq_triggers[0])) + __unmask_and_level_IO_APIC_irq(cfg); + else +#endif __unmask_IO_APIC_irq(cfg); + ipipe_irq_unlock(desc->irq); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2250,6 +2264,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) } cfg = irq_cfg(irq); __unmask_IO_APIC_irq(cfg); + ipipe_irq_unlock(irq); spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2529,23 +2544,61 @@ static void irq_complete_move(struct irq_desc **descp) 
static inline void irq_complete_move(struct irq_desc **descp) {} #endif +#if defined(CONFIG_IPIPE) && defined(CONFIG_SMP) + +#ifdef CONFIG_INTR_REMAP +static void eoi_ioapic_irq(struct irq_desc *desc); +#else /* !CONFIG_INTR_REMAP */ +static inline void eoi_ioapic_irq(struct irq_desc *desc) {} +#endif /* !CONFIG_INTR_REMAP */ + +static void move_apic_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg; + + if (desc->handle_irq == &handle_edge_irq) { + spin_lock(&desc->lock); + irq_complete_move(&desc); + move_native_irq(irq); + spin_unlock(&desc->lock); + } else if (desc->handle_irq == &handle_fasteoi_irq) { + spin_lock(&desc->lock); + irq_complete_move(&desc); + if (irq_remapped(irq)) + eoi_ioapic_irq(desc); + if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + cfg = desc->chip_data; + if (!io_apic_level_ack_pending(cfg)) + move_masked_irq(irq); + unmask_IO_APIC_irq_desc(desc); + } + spin_unlock(&desc->lock); + } else + WARN_ON_ONCE(1); +} +#endif /* CONFIG_IPIPE && CONFIG_SMP */ + static void ack_apic_edge(unsigned int irq) { +#ifndef CONFIG_IPIPE struct irq_desc *desc = irq_to_desc(irq); irq_complete_move(&desc); move_native_irq(irq); - ack_APIC_irq(); +#endif /* CONFIG_IPIPE */ + __ack_APIC_irq(); } atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long v; int i; struct irq_cfg *cfg; +#ifndef CONFIG_IPIPE + struct irq_desc *desc = irq_to_desc(irq); int do_unmask_irq = 0; irq_complete_move(&desc); @@ -2628,6 +2681,26 @@ static void ack_apic_level(unsigned int irq) __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } +#else /* CONFIG_IPIPE */ + /* + * Prevent low priority IRQs grabbed by high priority domains + * from being delayed, waiting for a high priority interrupt + * handler running in a low priority domain to complete. + */ + cfg = irq_cfg(irq); + i = cfg->vector; + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + spin_lock(&ioapic_lock); + if (unlikely(!(v & (1 << (i & 0x1f))))) { + /* IO-APIC erratum: see comment above. 
*/ + atomic_inc(&irq_mis_count); + __mask_and_edge_IO_APIC_irq(cfg); + set_bit(irq, &bugous_edge_irq_triggers[0]); + } else + __mask_IO_APIC_irq(cfg); + spin_unlock(&ioapic_lock); + __ack_APIC_irq(); +#endif /* CONFIG_IPIPE */ } #ifdef CONFIG_INTR_REMAP @@ -2656,14 +2729,14 @@ eoi_ioapic_irq(struct irq_desc *desc) static void ir_ack_apic_edge(unsigned int irq) { - ack_APIC_irq(); + __ack_APIC_irq(); } static void ir_ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - ack_APIC_irq(); + __ack_APIC_irq(); eoi_ioapic_irq(desc); } #endif /* CONFIG_INTR_REMAP */ @@ -2677,6 +2750,9 @@ static struct irq_chip ioapic_chip __read_mostly = { .eoi = ack_apic_level, #ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity_irq, +#ifdef CONFIG_IPIPE + .move = move_apic_irq, +#endif #endif .retrigger = ioapic_retrigger_irq, }; @@ -2691,6 +2767,9 @@ static struct irq_chip ir_ioapic_chip __read_mostly = { .eoi = ir_ack_apic_level, #ifdef CONFIG_SMP .set_affinity = set_ir_ioapic_affinity_irq, +#ifdef CONFIG_IPIPE + .move = move_apic_irq, +#endif #endif #endif .retrigger = ioapic_retrigger_irq, @@ -2736,23 +2815,29 @@ static inline void init_IO_APIC_traps(void) static void mask_lapic_irq(unsigned int irq) { - unsigned long v; + unsigned long v, flags; + local_irq_save_hw_cond(flags); + ipipe_irq_lock(irq); v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + local_irq_restore_hw_cond(flags); } static void unmask_lapic_irq(unsigned int irq) { - unsigned long v; + unsigned long v, flags; + local_irq_save_hw_cond(flags); v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); + ipipe_irq_unlock(irq); + local_irq_restore_hw_cond(flags); } static void ack_lapic_irq(unsigned int irq) { - ack_APIC_irq(); + __ack_APIC_irq(); } static struct irq_chip lapic_chip __read_mostly = { @@ -2760,6 +2845,9 @@ static struct irq_chip lapic_chip __read_mostly = { .mask = mask_lapic_irq, .unmask = unmask_lapic_irq, .ack = ack_lapic_irq, +#if defined(CONFIG_IPIPE) && defined(CONFIG_SMP) + .move = move_apic_irq, +#endif }; static void lapic_register_intr(int irq, struct irq_desc *desc) @@ -3007,6 +3095,10 @@ static inline void __init check_timer(void) "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0, desc); +#if defined(CONFIG_IPIPE) && defined(CONFIG_X86_64) + irq_to_desc(0)->ipipe_ack = __ipipe_ack_edge_irq; + irq_to_desc(0)->ipipe_end = __ipipe_end_edge_irq; +#endif apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); @@ -3404,6 +3496,9 @@ static struct irq_chip msi_chip = { .ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_msi_irq_affinity, +#ifdef CONFIG_IPIPE + .move = move_apic_irq, +#endif #endif .retrigger = ioapic_retrigger_irq, }; @@ -3416,6 +3511,9 @@ static struct irq_chip msi_ir_chip = { .ack = ir_ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = ir_set_msi_irq_affinity, +#ifdef CONFIG_IPIPE + .move = move_apic_irq, +#endif #endif #endif .retrigger = ioapic_retrigger_irq, @@ -3704,6 +3802,9 @@ static struct irq_chip ht_irq_chip = { .ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_ht_irq_affinity, +#ifdef CONFIG_IPIPE + .move = move_apic_irq, +#endif #endif .retrigger = ioapic_retrigger_irq, }; @@ -4075,6 +4176,14 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) return 0; } +#ifdef CONFIG_IPIPE +unsigned __ipipe_get_ioapic_irq_vector(int irq) +{ + return irq >= IPIPE_FIRST_APIC_IRQ && irq < IPIPE_NR_XIRQS ? 
+ ipipe_apic_irq_vector(irq) : irq_cfg(irq)->vector; +} +#endif /* CONFIG_IPIPE */ + /* * This function currently is only a helper for the i386 smp boot process where * we need to reprogram the ioredtbls to cater for the cpus which have come online diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 08385e0..f5ad117 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -29,12 +29,12 @@ void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) * to an arbitrary mask, so I do a unicast to each CPU instead. * - mbligh */ - local_irq_save(flags); + local_irq_save_hw(flags); for_each_cpu(query_cpu, mask) { __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } - local_irq_restore(flags); + local_irq_restore_hw(flags); } void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, @@ -46,14 +46,14 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, /* See Hack comment above */ - local_irq_save(flags); + local_irq_save_hw(flags); for_each_cpu(query_cpu, mask) { if (query_cpu == this_cpu) continue; __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } - local_irq_restore(flags); + local_irq_restore_hw(flags); } void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, @@ -68,12 +68,12 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, * should be modified to do 1 message per cluster ID - mbligh */ - local_irq_save(flags); + local_irq_save_hw(flags); for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( apic->cpu_to_logical_apicid(query_cpu), vector, apic->dest_logical); - local_irq_restore(flags); + local_irq_restore_hw(flags); } void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, @@ -85,7 +85,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, /* See Hack comment above */ - local_irq_save(flags); + local_irq_save_hw(flags); for_each_cpu(query_cpu, mask) { if (query_cpu == this_cpu) continue; @@ -93,7 +93,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, apic->cpu_to_logical_apicid(query_cpu), vector, apic->dest_logical); } - local_irq_restore(flags); + local_irq_restore_hw(flags); } #ifdef CONFIG_X86_32 @@ -109,10 +109,10 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) if (WARN_ONCE(!mask, "empty IPI mask")) return; - local_irq_save(flags); + local_irq_save_hw(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __default_send_IPI_dest_field(mask, vector, apic->dest_logical); - local_irq_restore(flags); + local_irq_restore_hw(flags); } void default_send_IPI_allbutself(int vector) diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 7ff61d6..a72056e 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -59,6 +59,10 @@ static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); static int endflag __initdata; +static int default_nmi_watchdog_tick(struct pt_regs * regs, unsigned reason); +int (*nmi_watchdog_tick) (struct pt_regs * regs, unsigned reason) = &default_nmi_watchdog_tick; +EXPORT_SYMBOL(nmi_watchdog_tick); + static inline unsigned int get_nmi_count(int cpu) { return per_cpu(irq_stat, cpu).__nmi_count; @@ -387,7 +391,7 @@ void touch_nmi_watchdog(void) EXPORT_SYMBOL(touch_nmi_watchdog); notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) 
+default_nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { /* * Since current_thread_info()-> is always on the stack, and we diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index a5371ec..442f45c 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -61,13 +61,13 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) x2apic_wrmsr_fence(); - local_irq_save(flags); + local_irq_save_hw(flags); for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest( per_cpu(x86_cpu_to_logical_apicid, query_cpu), vector, apic->dest_logical); } - local_irq_restore(flags); + local_irq_restore_hw(flags); } static void @@ -79,7 +79,7 @@ static void x2apic_wrmsr_fence(); - local_irq_save(flags); + local_irq_save_hw(flags); for_each_cpu(query_cpu, mask) { if (query_cpu == this_cpu) continue; @@ -87,7 +87,7 @@ static void per_cpu(x86_cpu_to_logical_apicid, query_cpu), vector, apic->dest_logical); } - local_irq_restore(flags); + local_irq_restore_hw(flags); } static void x2apic_send_IPI_allbutself(int vector) @@ -98,7 +98,7 @@ static void x2apic_send_IPI_allbutself(int vector) x2apic_wrmsr_fence(); - local_irq_save(flags); + local_irq_save_hw(flags); for_each_online_cpu(query_cpu) { if (query_cpu == this_cpu) continue; @@ -106,7 +106,7 @@ static void x2apic_send_IPI_allbutself(int vector) per_cpu(x86_cpu_to_logical_apicid, query_cpu), vector, apic->dest_logical); } - local_irq_restore(flags); + local_irq_restore_hw(flags); } static void x2apic_send_IPI_all(int vector) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a8989aa..fba85fa 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -62,12 +62,12 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) x2apic_wrmsr_fence(); - local_irq_save(flags); + local_irq_save_hw(flags); for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } - local_irq_restore(flags); + local_irq_restore_hw(flags); } static void @@ -79,14 +79,14 @@ static void x2apic_wrmsr_fence(); - local_irq_save(flags); + local_irq_save_hw(flags); for_each_cpu(query_cpu, mask) { if (query_cpu != this_cpu) __x2apic_send_IPI_dest( per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } - local_irq_restore(flags); + local_irq_restore_hw(flags); } static void x2apic_send_IPI_allbutself(int vector) @@ -97,14 +97,14 @@ static void x2apic_send_IPI_allbutself(int vector) x2apic_wrmsr_fence(); - local_irq_save(flags); + local_irq_save_hw(flags); for_each_online_cpu(query_cpu) { if (query_cpu == this_cpu) continue; __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } - local_irq_restore(flags); + local_irq_restore_hw(flags); } static void x2apic_send_IPI_all(int vector) diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 228d982..c249555 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -18,7 +18,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ - local_irq_save(flags); + local_irq_save_hw(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ @@ -28,7 +28,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, rcr = getCx86(CX86_RCR_BASE + reg); setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 
- local_irq_restore(flags); + local_irq_restore_hw(flags); shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; @@ -178,6 +178,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { unsigned char arr, arr_type, arr_size; + unsigned long flags; arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ @@ -221,6 +222,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, } } + local_irq_save_hw(flags); + prepare_set(); base <<= PAGE_SHIFT; @@ -230,6 +233,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, setCx86(CX86_RCR_BASE + reg, arr_type); post_set(); + + local_irq_restore_hw(flags); } typedef struct { @@ -247,8 +252,10 @@ static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 }; static void cyrix_set_all(void) { + unsigned long flags; int i; + local_irq_save_hw(flags); prepare_set(); /* the CCRs are not contiguous */ @@ -263,6 +270,7 @@ static void cyrix_set_all(void) } post_set(); + local_irq_restore_hw(flags); } static struct mtrr_ops cyrix_mtrr_ops = { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 55da0c5..5594a98 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -635,7 +635,7 @@ static void generic_set_all(void) unsigned long mask, count; unsigned long flags; - local_irq_save(flags); + local_irq_save_hw(flags); prepare_set(); /* Actually set the state */ @@ -645,7 +645,7 @@ static void generic_set_all(void) pat_init(); post_set(); - local_irq_restore(flags); + local_irq_restore_hw(flags); /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { @@ -669,12 +669,12 @@ static void generic_set_all(void) static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { - unsigned long flags; + unsigned long flags, _flags; struct mtrr_var_range *vr; vr = &mtrr_state.var_ranges[reg]; - local_irq_save(flags); + local_irq_save_full(flags, _flags); prepare_set(); if (size == 0) { @@ -695,7 +695,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, } post_set(); - local_irq_restore(flags); + local_irq_restore_full(flags, _flags); } int generic_validate_add_page(unsigned long base, unsigned long size, diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2d8a371..8c6afa5 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -327,6 +327,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) local_irq_enable(); do_exit(SIGBUS); } +EXPORT_SYMBOL_GPL(die_nmi); static int __init oops_setup(char *s) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f7dd2a7..37b2338 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -108,6 +108,9 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), current_thread_info(), current, task_thread_info(current)); +#ifdef CONFIG_IPIPE + printk(KERN_EMERG "I-pipe domain %s\n", ipipe_current_domain->name); +#endif /* CONFIG_IPIPE */ /* * When in-kernel, we also print out the stack and code at the * time of the fault.. 
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a071e6b..de2dde2 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -254,6 +254,11 @@ void show_registers(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); __show_regs(regs, 1); +#ifdef CONFIG_IPIPE + if (ipipe_current_domain != ipipe_root_domain) + printk("I-pipe domain %s\n", ipipe_current_domain->name); + else +#endif /* CONFIG_IPIPE */ printk("Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d..5918f48 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,59 @@ #define nr_syscalls ((syscall_table_size)/4) +#ifdef CONFIG_IPIPE +#define EMULATE_ROOT_IRET(bypass) \ + call __ipipe_unstall_iret_root ; \ + TRACE_IRQS_ON ; \ + bypass: \ + movl PT_EAX(%esp),%eax +#define TEST_PREEMPTIBLE(regs) call __ipipe_kpreempt_root ; testl %eax,%eax +#define CATCH_ROOT_SYSCALL(bypass1,bypass2) \ + movl %esp,%eax ; \ + call __ipipe_syscall_root ; \ + testl %eax,%eax ; \ + js bypass1 ; \ + jne bypass2 ; \ + movl PT_ORIG_EAX(%esp),%eax +#define PUSH_XCODE(v) pushl $ ex_ ## v +#define PUSH_XVEC(v) pushl $ ex_ ## v +#define HANDLE_EXCEPTION(code) movl %code,%ecx ; \ + call __ipipe_handle_exception ; \ + testl %eax,%eax ; \ + jnz restore_ret +#define DIVERT_EXCEPTION(code) movl $(__USER_DS), %ecx ; \ + movl %ecx, %ds ; \ + movl %ecx, %es ; \ + movl %esp, %eax ; \ + movl $ex_ ## code,%edx ; \ + call __ipipe_divert_exception ; \ + testl %eax,%eax ; \ + jnz restore_ret + +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF +# define IPIPE_TRACE_IRQ_ENTER \ + lea PT_EIP-4(%esp), %ebp; \ + movl PT_ORIG_EAX(%esp), %eax; \ + call ipipe_trace_begin +# define IPIPE_TRACE_IRQ_EXIT \ + pushl %eax; \ + movl PT_ORIG_EAX+4(%esp), %eax; \ + call ipipe_trace_end; \ + popl %eax +#else /* !CONFIG_IPIPE_TRACE_IRQSOFF */ +#define IPIPE_TRACE_IRQ_ENTER +#define IPIPE_TRACE_IRQ_EXIT +#endif /* CONFIG_IPIPE_TRACE_IRQSOFF */ +#else /* !CONFIG_IPIPE */ +#define EMULATE_ROOT_IRET(bypass) +#define TEST_PREEMPTIBLE(regs) testl $X86_EFLAGS_IF,PT_EFLAGS(regs) +#define CATCH_ROOT_SYSCALL(bypass1,bypass2) +#define PUSH_XCODE(v) pushl $v +#define PUSH_XVEC(v) pushl v +#define HANDLE_EXCEPTION(code) call *%code +#define DIVERT_EXCEPTION(code) +#endif /* CONFIG_IPIPE */ + #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -318,6 +372,7 @@ .endm ENTRY(ret_from_fork) + ENABLE_INTERRUPTS_HW_COND CFI_STARTPROC pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -345,7 +400,7 @@ END(ret_from_fork) RING0_PTREGS_FRAME ret_from_exception: preempt_stop(CLBR_ANY) -ret_from_intr: +ENTRY(ret_from_intr) GET_THREAD_INFO(%ebp) check_userspace: movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS @@ -369,14 +424,14 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) + DISABLE_INTERRUPTS_HW(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_all need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl jz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + TEST_PREEMPTIBLE(%esp) # interrupts off (exception path) ? 
jz restore_all call preempt_schedule_irq jmp need_resched @@ -424,7 +479,7 @@ sysenter_past_esp: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) + ENABLE_INTERRUPTS_HW(CLBR_NONE) /* * Load the potential sixth argument from user stack. @@ -440,6 +495,7 @@ sysenter_past_esp: .previous GET_THREAD_INFO(%ebp) + CATCH_ROOT_SYSCALL(sysenter_tail,sysenter_out) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit @@ -448,6 +504,7 @@ sysenter_do_call: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) +sysenter_tail: LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF @@ -456,10 +513,13 @@ sysenter_do_call: jne sysexit_audit sysenter_exit: /* if something modifies registers it must also disable sysexit */ + EMULATE_ROOT_IRET(sysenter_out) movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp - TRACE_IRQS_ON +#ifndef CONFIG_IPIPE + TRACE_IRQS_ON +#endif 1: mov PT_FS(%esp), %fs PTGS_TO_GS ENABLE_INTERRUPTS_SYSEXIT @@ -520,6 +580,7 @@ ENTRY(system_call) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) + CATCH_ROOT_SYSCALL(syscall_exit,restore_ret) # system call tracing in operation / emulation testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry @@ -552,6 +613,10 @@ restore_all_notrace: CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: +#ifdef CONFIG_IPIPE + call __ipipe_unstall_iret_root +#endif /* CONFIG_IPIPE */ +restore_ret: RESTORE_REGS 4 # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 irq_return: @@ -559,7 +624,7 @@ irq_return: .section .fixup,"ax" ENTRY(iret_exc) pushl $0 # no error code - pushl $do_iret_error + PUSH_XCODE(do_iret_error) jmp error_code .previous .section __ex_table,"a" @@ -613,7 +678,7 @@ ldt_ss: /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ - DISABLE_INTERRUPTS(CLBR_EAX) + DISABLE_INTERRUPTS_HW(CLBR_EAX) lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck @@ -627,6 +692,7 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: + ENABLE_INTERRUPTS_HW_COND call schedule LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt @@ -799,6 +865,48 @@ END(irq_entries_start) END(interrupt) .previous +#ifdef CONFIG_IPIPE + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + SAVE_ALL + IPIPE_TRACE_IRQ_ENTER + movl %esp, %eax + call *ipipe_irq_handler + IPIPE_TRACE_IRQ_EXIT + testl %eax,%eax + jnz ret_from_intr + jmp restore_ret + CFI_ENDPROC + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + pushl $~(nr); \ + CFI_ADJUST_CFA_OFFSET 4; \ + SAVE_ALL; \ + IPIPE_TRACE_IRQ_ENTER; \ + movl %esp, %eax; \ + call *ipipe_irq_handler; \ + IPIPE_TRACE_IRQ_EXIT; \ + testl %eax,%eax; \ + jnz ret_from_intr; \ + jmp restore_ret; \ + CFI_ENDPROC + +#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) + +#ifdef CONFIG_X86_LOCAL_APIC + BUILD_INTERRUPT(ipipe_ipi0,IPIPE_SERVICE_VECTOR0) + BUILD_INTERRUPT(ipipe_ipi1,IPIPE_SERVICE_VECTOR1) + BUILD_INTERRUPT(ipipe_ipi2,IPIPE_SERVICE_VECTOR2) + BUILD_INTERRUPT(ipipe_ipi3,IPIPE_SERVICE_VECTOR3) +#ifdef CONFIG_SMP + BUILD_INTERRUPT(ipipe_ipiX,IPIPE_CRITICAL_VECTOR) +#endif +#endif + +#else /* !CONFIG_IPIPE */ /* * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has 
to follow that: @@ -829,6 +937,8 @@ ENDPROC(name) #define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) +#endif /* !CONFIG_IPIPE */ + /* The include is where all of the SMP etc. interrupts come from */ #include @@ -836,7 +946,7 @@ ENTRY(coprocessor_error) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_error + PUSH_XCODE(do_coprocessor_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -846,7 +956,7 @@ ENTRY(simd_coprocessor_error) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl $do_simd_coprocessor_error + PUSH_XCODE(do_simd_coprocessor_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -856,7 +966,7 @@ ENTRY(device_not_available) RING0_INT_FRAME pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 - pushl $do_device_not_available + PUSH_XCODE(do_device_not_available) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -881,7 +991,7 @@ ENTRY(overflow) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl $do_overflow + PUSH_XCODE(do_overflow) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -891,7 +1001,7 @@ ENTRY(bounds) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl $do_bounds + PUSH_XCODE(do_bounds) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -901,7 +1011,7 @@ ENTRY(invalid_op) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl $do_invalid_op + PUSH_XCODE(do_invalid_op) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -911,7 +1021,7 @@ ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_segment_overrun + PUSH_XCODE(do_coprocessor_segment_overrun) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -919,7 +1029,7 @@ END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME - pushl $do_invalid_TSS + PUSH_XCODE(do_invalid_TSS) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -927,7 +1037,7 @@ END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME - pushl $do_segment_not_present + PUSH_XCODE(do_segment_not_present) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -935,7 +1045,7 @@ END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME - pushl $do_stack_segment + PUSH_XCODE(do_stack_segment) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -943,7 +1053,7 @@ END(stack_segment) ENTRY(alignment_check) RING0_EC_FRAME - pushl $do_alignment_check + PUSH_XCODE(do_alignment_check) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -953,7 +1063,7 @@ ENTRY(divide_error) RING0_INT_FRAME pushl $0 # no error code CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error + PUSH_XCODE(do_divide_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -964,7 +1074,7 @@ ENTRY(machine_check) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl machine_check_vector + PUSH_XVEC(machine_check_vector) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -975,7 +1085,7 @@ ENTRY(spurious_interrupt_bug) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl $do_spurious_interrupt_bug + PUSH_XCODE(do_spurious_interrupt_bug) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -1210,7 +1320,7 @@ syscall_table_size=(.-sys_call_table) ENTRY(page_fault) RING0_EC_FRAME - pushl $do_page_fault + PUSH_XCODE(do_page_fault) CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: @@ -1260,7 +1370,7 @@ error_code: movl %ecx, %es TRACE_IRQS_OFF movl %esp,%eax # pt_regs pointer - call *%edi + HANDLE_EXCEPTION(edi) jmp ret_from_exception CFI_ENDPROC END(page_fault) @@ -1304,6 +1414,7 @@ debug_stack_correct: CFI_ADJUST_CFA_OFFSET 4 
SAVE_ALL TRACE_IRQS_OFF + DIVERT_EXCEPTION(do_debug) xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug @@ -1404,6 +1515,7 @@ ENTRY(int3) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL TRACE_IRQS_OFF + DIVERT_EXCEPTION(do_int3) xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 @@ -1413,7 +1525,7 @@ END(int3) ENTRY(general_protection) RING0_EC_FRAME - pushl $do_general_protection + PUSH_XCODE(do_general_protection) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b5c061f..ce29b45 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,13 @@ #define __AUDIT_ARCH_LE 0x40000000 .code64 + +#ifdef CONFIG_IPIPE +#define PREEMPT_SCHEDULE_IRQ call __ipipe_preempt_schedule_irq +#else /* !CONFIG_IPIPE */ +#define PREEMPT_SCHEDULE_IRQ call preempt_schedule_irq +#endif /* !CONFIG_IPIPE */ + #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -336,7 +344,10 @@ ENTRY(save_args) /* * We entered an interrupt context - irqs are off: */ -2: TRACE_IRQS_OFF +2: +#ifndef CONFIG_IPIPE + TRACE_IRQS_OFF +#endif ret CFI_ENDPROC END(save_args) @@ -402,6 +413,7 @@ ENTRY(ret_from_fork) CFI_ADJUST_CFA_OFFSET 8 popf # reset kernel eflags CFI_ADJUST_CFA_OFFSET -8 + ENABLE_INTERRUPTS_HW_COND call schedule_tail # rdi: 'prev' task parameter @@ -477,6 +489,17 @@ ENTRY(system_call_after_swapgs) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET +#ifdef CONFIG_IPIPE + pushq %rdi + pushq %rax + leaq -(ARGOFFSET-16)(%rsp),%rdi # regs for handler + call __ipipe_syscall_root_thunk + testl %eax, %eax + popq %rax + popq %rdi + js ret_from_sys_call + jnz sysret_fastexit +#endif GET_THREAD_INFO(%rcx) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys @@ -506,6 +529,7 @@ sysret_check: * sysretq will re-enable interrupts: */ TRACE_IRQS_ON +sysret_fastexit: movq RIP-ARGOFFSET(%rsp),%rcx CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 @@ -517,6 +541,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),%edx + jnz ret_from_sys_call_trace bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON @@ -528,6 +554,16 @@ sysret_careful: CFI_ADJUST_CFA_OFFSET -8 jmp sysret_check +ret_from_sys_call_trace: + TRACE_IRQS_ON + sti + SAVE_REST + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + jmp int_ret_from_sys_call + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON @@ -800,7 +836,29 @@ END(interrupt) CFI_ADJUST_CFA_OFFSET 10*8 call save_args PARTIAL_FRAME 0 +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF + pushq %rbp + leaq RIP-8(%rdi), %rbp # make interrupted address show up in trace + pushq %rdi + movq ORIG_RAX(%rdi), %rdi # IRQ number + notq %rdi # ...is inverted, fix up + call ipipe_trace_begin + popq %rdi + popq %rbp + + call \func + + pushq %rbp + pushq %rax + movq 8-ARGOFFSET+ORIG_RAX(%rbp), %rdi + leaq 8-ARGOFFSET+RIP-8(%rbp), %rbp + notq %rdi + call ipipe_trace_end + popq %rax + popq %rbp +#else call \func +#endif .endm /* @@ -809,9 +867,24 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: +#ifdef CONFIG_IPIPE + XCPT_FRAME + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + interrupt *ipipe_irq_handler + testl %eax, %eax + jnz 
ret_from_intr + decl PER_CPU_VAR(irq_count) + leaveq + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 + testl $3,CS-ARGOFFSET(%rsp) + jz restore_args + jmp retint_swapgs_notrace +#else /* !CONFIG_IPIPE */ XCPT_FRAME addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ +#endif /* !CONFIG_IPIPE */ /* 0(%rsp): old_rsp-ARGOFFSET */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) @@ -820,7 +893,7 @@ ret_from_intr: leaveq CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 -exit_intr: +ENTRY(exit_intr) GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) je retint_kernel @@ -840,20 +913,20 @@ retint_check: jnz retint_careful retint_swapgs: /* return to user-space */ + TRACE_IRQS_IRETQ /* * The iretq could re-enable interrupts: */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ +retint_swapgs_notrace: SWAPGS +retint_noswapgs: jmp restore_args retint_restore_args: /* return to kernel space */ - DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ /* * The iretq could re-enable interrupts: */ - TRACE_IRQS_IRETQ restore_args: RESTORE_ARGS 0,8,0 @@ -935,7 +1008,15 @@ ENTRY(retint_kernel) jnc retint_restore_args bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ jnc retint_restore_args - call preempt_schedule_irq +#ifdef CONFIG_IPIPE + /* + * We may have preempted call_softirq before __do_softirq raised or + * after it lowered the preemption counter. + */ + cmpl $0,PER_CPU_VAR(irq_count) + jge retint_restore_args +#endif + PREEMPT_SCHEDULE_IRQ jmp exit_intr #endif @@ -945,16 +1026,31 @@ END(common_interrupt) /* * APIC interrupts. */ -.macro apicinterrupt num sym do_sym + .macro apicinterrupt num sym do_sym ENTRY(\sym) INTR_FRAME pushq $~(\num) CFI_ADJUST_CFA_OFFSET 8 +#ifdef CONFIG_IPIPE + interrupt *ipipe_irq_handler + testl %eax, %eax + jnz ret_from_intr + decl PER_CPU_VAR(irq_count) + leaveq + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 + testl $3,CS-ARGOFFSET(%rsp) + jz restore_args + jmp retint_swapgs_notrace + CFI_ENDPROC + .endm +#else /* !CONFIG_IPIPE */ interrupt \do_sym jmp ret_from_intr CFI_ENDPROC END(\sym) .endm +#endif /* !CONFIG_IPIPE */ #ifdef CONFIG_SMP apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ @@ -979,6 +1075,7 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ invalidate_interrupt1 smp_invalidate_interrupt apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ invalidate_interrupt2 smp_invalidate_interrupt +#ifndef CONFIG_IPIPE apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ invalidate_interrupt3 smp_invalidate_interrupt apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ @@ -989,6 +1086,7 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ invalidate_interrupt6 smp_invalidate_interrupt apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ invalidate_interrupt7 smp_invalidate_interrupt +#endif /* !CONFIG_IPIPE */ #endif apicinterrupt THRESHOLD_APIC_VECTOR \ @@ -1023,7 +1121,7 @@ apicinterrupt LOCAL_PENDING_VECTOR \ /* * Exception entry points. 
*/ -.macro zeroentry sym do_sym +.macro zeroentry sym do_sym ex_code ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1034,13 +1132,26 @@ ENTRY(\sym) DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ +#ifdef CONFIG_IPIPE + movq $\ex_code,%rdx + call __ipipe_handle_exception /* handle(regs, error_code, ex_code) */ + testl %eax, %eax + jz error_exit + movl %ebx,%eax + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + testl %eax,%eax + jne retint_noswapgs + jmp retint_swapgs_notrace +#else /* !CONFIG_IPIPE */ call \do_sym +#endif /* !CONFIG_IPIPE */ jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) .endm -.macro paranoidzeroentry sym do_sym +.macro paranoidzeroentry sym do_sym ex_code=0 ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1050,14 +1161,27 @@ ENTRY(\sym) call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ +#ifdef CONFIG_IPIPE + .if \ex_code + movq $\ex_code,%rsi + call __ipipe_divert_exception /* handle(regs, ex_code) */ + testl %eax,%eax + jnz 1f + movq %rsp,%rdi + .endif +#endif xorl %esi,%esi /* no error code */ call \do_sym +#ifdef CONFIG_IPIPE + xorl %eax,%eax /* tell paranoid_exit to propagate the exception */ +1: +#endif jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) .endm -.macro paranoidzeroentry_ist sym do_sym ist +.macro paranoidzeroentry_ist sym do_sym ist ex_code=0 ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1067,17 +1191,30 @@ ENTRY(\sym) call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ +#ifdef CONFIG_IPIPE + .if \ex_code + movq $\ex_code,%rsi + call __ipipe_divert_exception /* handle(regs, ex_code) */ + testl %eax,%eax + jnz 1f + movq %rsp,%rdi + .endif +#endif xorl %esi,%esi /* no error code */ PER_CPU(init_tss, %rbp) subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) call \do_sym addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) +#ifdef CONFIG_IPIPE + xorl %eax,%eax /* tell paranoid_exit to propagate the exception */ +1: +#endif jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) .endm -.macro errorentry sym do_sym +.macro errorentry sym do_sym ex_code ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1088,14 +1225,27 @@ ENTRY(\sym) movq %rsp,%rdi /* pt_regs pointer */ movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ +#ifdef CONFIG_IPIPE + movq $\ex_code,%rdx + call __ipipe_handle_exception /* handle(regs, error_code, ex_code) */ + testl %eax, %eax + jz error_exit + movl %ebx,%eax + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + testl %eax,%eax + jne retint_noswapgs + jmp retint_swapgs_notrace +#else /* !CONFIG_IPIPE */ call \do_sym +#endif /* !CONFIG_IPIPE */ jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) .endm /* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym +.macro paranoiderrorentry sym do_sym ex_code=0 ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1105,27 +1255,40 @@ ENTRY(\sym) DEFAULT_FRAME 0 TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ +#ifdef CONFIG_IPIPE + .if \ex_code + movq $\ex_code,%rsi + call __ipipe_divert_exception /* handle(regs, ex_code) */ + testl %eax,%eax + jnz 1f + movq %rsp,%rdi + .endif +#endif movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ call \do_sym +#ifdef CONFIG_IPIPE + xorl %eax,%eax /* tell paranoid_exit to propagate the exception */ +1: +#endif jmp paranoid_exit /* %ebx: no swapgs flag */ 
CFI_ENDPROC END(\sym) .endm -zeroentry divide_error do_divide_error -zeroentry overflow do_overflow -zeroentry bounds do_bounds -zeroentry invalid_op do_invalid_op -zeroentry device_not_available do_device_not_available +zeroentry divide_error do_divide_error ex_do_divide_error +zeroentry overflow do_overflow ex_do_overflow +zeroentry bounds do_bounds ex_do_bounds +zeroentry invalid_op do_invalid_op ex_do_invalid_op +zeroentry device_not_available do_device_not_available ex_do_device_not_available paranoiderrorentry double_fault do_double_fault -zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun -errorentry invalid_TSS do_invalid_TSS -errorentry segment_not_present do_segment_not_present -zeroentry spurious_interrupt_bug do_spurious_interrupt_bug -zeroentry coprocessor_error do_coprocessor_error -errorentry alignment_check do_alignment_check -zeroentry simd_coprocessor_error do_simd_coprocessor_error +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun ex_do_coprocessor_segment_overrun +errorentry invalid_TSS do_invalid_TSS ex_do_invalid_TSS +errorentry segment_not_present do_segment_not_present ex_do_segment_not_present +zeroentry spurious_interrupt_bug do_spurious_interrupt_bug ex_do_spurious_interrupt_bug +zeroentry coprocessor_error do_coprocessor_error ex_do_coprocessor_error +errorentry alignment_check do_alignment_check ex_do_alignment_check +zeroentry simd_coprocessor_error do_simd_coprocessor_error ex_do_simd_coprocessor_error /* Reload gs selector with exception handling */ /* edi: new selector */ @@ -1255,14 +1418,18 @@ ENTRY(call_softirq) CFI_REL_OFFSET rbp,0 mov %rsp,%rbp CFI_DEF_CFA_REGISTER rbp + DISABLE_INTERRUPTS_HW_COND incl PER_CPU_VAR(irq_count) cmove PER_CPU_VAR(irq_stack_ptr),%rsp + ENABLE_INTERRUPTS_HW_COND push %rbp # backlink for old unwinder call __do_softirq + DISABLE_INTERRUPTS_HW_COND leaveq CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) + ENABLE_INTERRUPTS_HW_COND ret CFI_ENDPROC END(call_softirq) @@ -1371,16 +1538,16 @@ END(xen_failsafe_callback) */ .pushsection .kprobes.text, "ax" -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +paranoidzeroentry_ist debug do_debug DEBUG_STACK ex_do_debug +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK ex_do_int3 paranoiderrorentry stack_segment do_stack_segment #ifdef CONFIG_XEN zeroentry xen_debug do_debug zeroentry xen_int3 do_int3 errorentry xen_stack_segment do_stack_segment #endif -errorentry general_protection do_general_protection -errorentry page_fault do_page_fault +errorentry general_protection do_general_protection ex_do_general_protection +errorentry page_fault do_page_fault ex_do_page_fault #ifdef CONFIG_X86_MCE paranoidzeroentry machine_check *machine_check_vector(%rip) #endif @@ -1403,8 +1570,13 @@ ENTRY(paranoid_exit) INTR_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF +paranoid_notrace: testl %ebx,%ebx /* swapgs needed? */ jnz paranoid_restore +#ifdef CONFIG_IPIPE + testl %eax,%eax + jnz paranoid_swapgs +#endif testl $3,CS(%rsp) jnz paranoid_userspace paranoid_swapgs: diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 23c1679..1c00022 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,12 @@ static cycle_t pit_read(struct clocksource *cs) int count; u32 jifs; +#ifdef CONFIG_IPIPE + if (!__ipipe_pipeline_head_p(ipipe_root_domain)) + /* We don't really own the PIT. 
*/ + return (cycle_t)(jiffies * LATCH) + (LATCH - 1) - old_count; +#endif /* CONFIG_IPIPE */ + spin_lock_irqsave(&i8253_lock, flags); /* * Although our caller may have the read side of xtime_lock, diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102..cfb29a2 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,7 +32,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +IPIPE_DEFINE_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); struct irq_chip i8259A_chip = { @@ -69,6 +69,7 @@ void disable_8259A_irq(unsigned int irq) unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); + ipipe_irq_lock(irq); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); @@ -79,15 +80,18 @@ void disable_8259A_irq(unsigned int irq) void enable_8259A_irq(unsigned int irq) { - unsigned int mask = ~(1 << irq); + unsigned int mask = (1 << irq); unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); + if (cached_irq_mask & mask) { + cached_irq_mask &= ~mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + ipipe_irq_unlock(irq); + } spin_unlock_irqrestore(&i8259A_lock, flags); } @@ -168,6 +172,18 @@ static void mask_and_ack_8259A(unsigned int irq) */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; +#ifdef CONFIG_IPIPE + if (irq == 0) { + /* + * Fast timer ack -- don't mask (unless supposedly + * spurious). We trace outb's in order to detect + * broken hardware inducing large delays. + */ + outb(0x60, PIC_MASTER_CMD); /* Specific EOI to master. */ + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + } +#endif /* CONFIG_IPIPE */ cached_irq_mask |= irqmask; handle_real_irq: diff --git a/arch/x86/kernel/ipipe.c b/arch/x86/kernel/ipipe.c new file mode 100644 index 0000000..116fee7 --- /dev/null +++ b/arch/x86/kernel/ipipe.c @@ -0,0 +1,1084 @@ +/* -*- linux-c -*- + * linux/arch/x86/kernel/ipipe.c + * + * Copyright (C) 2002-2007 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Architecture-dependent I-PIPE support for x86. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_LOCAL_APIC +#include +#include +#include +#include +#ifdef CONFIG_X86_IO_APIC +#include +#endif /* CONFIG_X86_IO_APIC */ +#include +#endif /* CONFIG_X86_LOCAL_APIC */ +#include + +int __ipipe_tick_irq = 0; /* Legacy timer */ + +DEFINE_PER_CPU(struct pt_regs, __ipipe_tick_regs); + +DEFINE_PER_CPU(unsigned long, __ipipe_cr2); +EXPORT_PER_CPU_SYMBOL_GPL(__ipipe_cr2); + +#ifdef CONFIG_SMP + +static cpumask_t __ipipe_cpu_sync_map; + +static cpumask_t __ipipe_cpu_lock_map; + +static unsigned long __ipipe_critical_lock; + +static IPIPE_DEFINE_SPINLOCK(__ipipe_cpu_barrier); + +static atomic_t __ipipe_critical_count = ATOMIC_INIT(0); + +static void (*__ipipe_cpu_sync) (void); + +#endif /* CONFIG_SMP */ + +/* + * ipipe_trigger_irq() -- Push the interrupt at front of the pipeline + * just like if it has been actually received from a hw source. Also + * works for virtual interrupts. + */ +int ipipe_trigger_irq(unsigned int irq) +{ + struct pt_regs regs; + unsigned long flags; + +#ifdef CONFIG_IPIPE_DEBUG + if (irq >= IPIPE_NR_IRQS) + return -EINVAL; + if (ipipe_virtual_irq_p(irq)) { + if (!test_bit(irq - IPIPE_VIRQ_BASE, + &__ipipe_virtual_irq_map)) + return -EINVAL; + } else if (irq_to_desc(irq) == NULL) + return -EINVAL; +#endif + local_irq_save_hw(flags); + regs.flags = flags; + regs.orig_ax = irq; /* Positive value - IRQ won't be acked */ + regs.cs = __KERNEL_CS; + __ipipe_handle_irq(®s); + local_irq_restore_hw(flags); + + return 1; +} + +int ipipe_get_sysinfo(struct ipipe_sysinfo *info) +{ + info->ncpus = num_online_cpus(); + info->cpufreq = ipipe_cpu_freq(); + info->archdep.tmirq = __ipipe_tick_irq; +#ifdef CONFIG_X86_TSC + info->archdep.tmfreq = ipipe_cpu_freq(); +#else /* !CONFIG_X86_TSC */ + info->archdep.tmfreq = CLOCK_TICK_RATE; +#endif /* CONFIG_X86_TSC */ + + return 0; +} + +#ifdef CONFIG_X86_UV +asmlinkage void uv_bau_message_interrupt(struct pt_regs *regs); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD +asmlinkage void smp_threshold_interrupt(void); +#endif +#ifdef CONFIG_X86_NEW_MCE +asmlinkage void smp_mce_self_interrupt(void); +#endif + +static void __ipipe_ack_irq(unsigned irq, struct irq_desc *desc) +{ + desc->ipipe_ack(irq, desc); +} + +void __ipipe_enable_irqdesc(struct ipipe_domain *ipd, unsigned irq) +{ + irq_to_desc(irq)->status &= ~IRQ_DISABLED; +} + +#ifdef CONFIG_X86_LOCAL_APIC + +static void __ipipe_noack_apic(unsigned irq, struct irq_desc *desc) +{ +} + +static void __ipipe_ack_apic(unsigned irq, struct irq_desc *desc) +{ + __ack_APIC_irq(); +} + +static void __ipipe_null_handler(unsigned irq, void *cookie) +{ +} + +#endif /* CONFIG_X86_LOCAL_APIC */ + +/* __ipipe_enable_pipeline() -- We are running on the boot CPU, hw + interrupts are off, and secondary CPUs are still lost in space. */ + +void __init __ipipe_enable_pipeline(void) +{ + unsigned int vector, irq; + +#ifdef CONFIG_X86_LOCAL_APIC + + /* Map the APIC system vectors. 
*/ + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(LOCAL_TIMER_VECTOR), + (ipipe_irq_handler_t)&smp_apic_timer_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(SPURIOUS_APIC_VECTOR), + (ipipe_irq_handler_t)&smp_spurious_interrupt, + NULL, + &__ipipe_noack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(ERROR_APIC_VECTOR), + (ipipe_irq_handler_t)&smp_error_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(IPIPE_SERVICE_VECTOR0), + &__ipipe_null_handler, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(IPIPE_SERVICE_VECTOR1), + &__ipipe_null_handler, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(IPIPE_SERVICE_VECTOR2), + &__ipipe_null_handler, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(IPIPE_SERVICE_VECTOR3), + &__ipipe_null_handler, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + +#ifdef CONFIG_X86_THERMAL_VECTOR + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(THERMAL_APIC_VECTOR), + (ipipe_irq_handler_t)&smp_thermal_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); +#endif /* CONFIG_X86_THERMAL_VECTOR */ + +#ifdef CONFIG_X86_MCE_THRESHOLD + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(THRESHOLD_APIC_VECTOR), + (ipipe_irq_handler_t)&smp_threshold_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); +#endif /* CONFIG_X86_MCE_THRESHOLD */ + +#ifdef CONFIG_X86_NEW_MCE + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(MCE_SELF_VECTOR), + (ipipe_irq_handler_t)&smp_mce_self_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); +#endif /* CONFIG_X86_MCE_THRESHOLD */ + +#ifdef CONFIG_X86_UV + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(UV_BAU_MESSAGE), + (ipipe_irq_handler_t)&uv_bau_message_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); +#endif /* CONFIG_X86_UV */ + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(GENERIC_INTERRUPT_VECTOR), + (ipipe_irq_handler_t)&smp_generic_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + +#ifdef CONFIG_PERF_COUNTERS + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(LOCAL_PENDING_VECTOR), + (ipipe_irq_handler_t)&perf_pending_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); +#endif /* CONFIG_PERF_COUNTERS */ + +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_SMP + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(RESCHEDULE_VECTOR), + (ipipe_irq_handler_t)&smp_reschedule_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + for (vector = INVALIDATE_TLB_VECTOR_START; + vector <= INVALIDATE_TLB_VECTOR_END; ++vector) + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(vector), + (ipipe_irq_handler_t)&smp_invalidate_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(CALL_FUNCTION_VECTOR), + (ipipe_irq_handler_t)&smp_call_function_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(CALL_FUNCTION_SINGLE_VECTOR), + 
(ipipe_irq_handler_t)&smp_call_function_single_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + IRQ_MOVE_CLEANUP_VECTOR, + (ipipe_irq_handler_t)&smp_irq_move_cleanup_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); + + ipipe_virtualize_irq(ipipe_root_domain, + ipipe_apic_vector_irq(REBOOT_VECTOR), + (ipipe_irq_handler_t)&smp_reboot_interrupt, + NULL, + &__ipipe_ack_apic, + IPIPE_STDROOT_MASK); +#else + (void)vector; +#endif /* CONFIG_SMP */ + + /* Finally, virtualize the remaining ISA and IO-APIC + * interrupts. Interrupts which have already been virtualized + * will just beget a silent -EPERM error since + * IPIPE_SYSTEM_MASK has been passed for them, that's ok. */ + + for (irq = 0; irq < NR_IRQS; irq++) + /* + * Fails for IPIPE_CRITICAL_IPI and IRQ_MOVE_CLEANUP_VECTOR, + * but that's ok. + */ + ipipe_virtualize_irq(ipipe_root_domain, + irq, + (ipipe_irq_handler_t)&do_IRQ, + NULL, + &__ipipe_ack_irq, + IPIPE_STDROOT_MASK); + +#ifdef CONFIG_X86_LOCAL_APIC + /* Eventually allow these vectors to be reprogrammed. */ + ipipe_root_domain->irqs[IPIPE_SERVICE_IPI0].control &= ~IPIPE_SYSTEM_MASK; + ipipe_root_domain->irqs[IPIPE_SERVICE_IPI1].control &= ~IPIPE_SYSTEM_MASK; + ipipe_root_domain->irqs[IPIPE_SERVICE_IPI2].control &= ~IPIPE_SYSTEM_MASK; + ipipe_root_domain->irqs[IPIPE_SERVICE_IPI3].control &= ~IPIPE_SYSTEM_MASK; +#endif /* CONFIG_X86_LOCAL_APIC */ +} + +#ifdef CONFIG_SMP + +cpumask_t __ipipe_set_irq_affinity(unsigned irq, cpumask_t cpumask) +{ + cpumask_t oldmask; + + if (irq_to_desc(irq)->chip->set_affinity == NULL) + return CPU_MASK_NONE; + + if (cpus_empty(cpumask)) + return CPU_MASK_NONE; /* Return mask value -- no change. */ + + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return CPU_MASK_NONE; /* Error -- bad mask value or non-routable IRQ. */ + + cpumask_copy(&oldmask, irq_to_desc(irq)->affinity); + irq_to_desc(irq)->chip->set_affinity(irq, &cpumask); + + return oldmask; +} + +int __ipipe_send_ipi(unsigned ipi, cpumask_t cpumask) +{ + unsigned long flags; + int self; + + if (ipi != IPIPE_SERVICE_IPI0 && + ipi != IPIPE_SERVICE_IPI1 && + ipi != IPIPE_SERVICE_IPI2 && + ipi != IPIPE_SERVICE_IPI3) + return -EINVAL; + + local_irq_save_hw(flags); + + self = cpu_isset(ipipe_processor_id(),cpumask); + cpu_clear(ipipe_processor_id(), cpumask); + + if (!cpus_empty(cpumask)) + apic->send_IPI_mask(&cpumask, ipipe_apic_irq_vector(ipi)); + + if (self) + ipipe_trigger_irq(ipi); + + local_irq_restore_hw(flags); + + return 0; +} + +/* Always called with hw interrupts off. */ + +void __ipipe_do_critical_sync(unsigned irq, void *cookie) +{ + int cpu = ipipe_processor_id(); + + cpu_set(cpu, __ipipe_cpu_sync_map); + + /* Now we are in sync with the lock requestor running on another + CPU. Enter a spinning wait until he releases the global + lock. */ + spin_lock(&__ipipe_cpu_barrier); + + /* Got it. Now get out. */ + + if (__ipipe_cpu_sync) + /* Call the sync routine if any. 
*/ + __ipipe_cpu_sync(); + + spin_unlock(&__ipipe_cpu_barrier); + + cpu_clear(cpu, __ipipe_cpu_sync_map); +} + +void __ipipe_hook_critical_ipi(struct ipipe_domain *ipd) +{ + ipd->irqs[IPIPE_CRITICAL_IPI].acknowledge = &__ipipe_ack_apic; + ipd->irqs[IPIPE_CRITICAL_IPI].handler = &__ipipe_do_critical_sync; + ipd->irqs[IPIPE_CRITICAL_IPI].cookie = NULL; + /* Immediately handle in the current domain but *never* pass */ + ipd->irqs[IPIPE_CRITICAL_IPI].control = + IPIPE_HANDLE_MASK|IPIPE_STICKY_MASK|IPIPE_SYSTEM_MASK; +} + +#endif /* CONFIG_SMP */ + +/* + * ipipe_critical_enter() -- Grab the superlock excluding all CPUs but + * the current one from a critical section. This lock is used when we + * must enforce a global critical section for a single CPU in a + * possibly SMP system whichever context the CPUs are running. + */ +unsigned long ipipe_critical_enter(void (*syncfn) (void)) +{ + unsigned long flags; + + local_irq_save_hw(flags); + +#ifdef CONFIG_SMP + if (unlikely(num_online_cpus() == 1)) + return flags; + + { + int cpu = ipipe_processor_id(); + cpumask_t lock_map; + + if (!cpu_test_and_set(cpu, __ipipe_cpu_lock_map)) { + while (test_and_set_bit(0, &__ipipe_critical_lock)) { + int n = 0; + do { + cpu_relax(); + } while (++n < cpu); + } + + spin_lock(&__ipipe_cpu_barrier); + + __ipipe_cpu_sync = syncfn; + + /* Send the sync IPI to all processors but the current one. */ + apic->send_IPI_allbutself(IPIPE_CRITICAL_VECTOR); + + cpus_andnot(lock_map, cpu_online_map, __ipipe_cpu_lock_map); + + while (!cpus_equal(__ipipe_cpu_sync_map, lock_map)) + cpu_relax(); + } + + atomic_inc(&__ipipe_critical_count); + } +#endif /* CONFIG_SMP */ + + return flags; +} + +/* ipipe_critical_exit() -- Release the superlock. */ + +void ipipe_critical_exit(unsigned long flags) +{ +#ifdef CONFIG_SMP + if (num_online_cpus() == 1) + goto out; + + if (atomic_dec_and_test(&__ipipe_critical_count)) { + spin_unlock(&__ipipe_cpu_barrier); + + while (!cpus_empty(__ipipe_cpu_sync_map)) + cpu_relax(); + + cpu_clear(ipipe_processor_id(), __ipipe_cpu_lock_map); + clear_bit(0, &__ipipe_critical_lock); + smp_mb__after_clear_bit(); + } +out: +#endif /* CONFIG_SMP */ + + local_irq_restore_hw(flags); +} + +static inline void __fixup_if(int s, struct pt_regs *regs) +{ + /* + * Have the saved hw state look like the domain stall bit, so + * that __ipipe_unstall_iret_root() restores the proper + * pipeline state for the root stage upon exit. + */ + if (s) + regs->flags &= ~X86_EFLAGS_IF; + else + regs->flags |= X86_EFLAGS_IF; +} + +#ifdef CONFIG_X86_32 + +/* + * Check the stall bit of the root domain to make sure the existing + * preemption opportunity upon in-kernel resumption could be + * exploited. In case a rescheduling could take place, the root stage + * is stalled before the hw interrupts are re-enabled. This routine + * must be called with hw interrupts off. + */ + +asmlinkage int __ipipe_kpreempt_root(struct pt_regs regs) +{ + if (test_bit(IPIPE_STALL_FLAG, &ipipe_root_cpudom_var(status))) + /* Root stage is stalled: rescheduling denied. */ + return 0; + + __ipipe_stall_root(); + trace_hardirqs_off(); + local_irq_enable_hw_notrace(); + + return 1; /* Ok, may reschedule now. */ +} + +asmlinkage void __ipipe_unstall_iret_root(struct pt_regs regs) +{ + struct ipipe_percpu_domain_data *p; + + /* Emulate IRET's handling of the interrupt flag. */ + + local_irq_disable_hw(); + + p = ipipe_root_cpudom_ptr(); + + /* + * Restore the software state as it used to be on kernel + * entry. 
CAUTION: NMIs must *not* return through this + * emulation. + */ + if (raw_irqs_disabled_flags(regs.flags)) { + if (!__test_and_set_bit(IPIPE_STALL_FLAG, &p->status)) + trace_hardirqs_off(); + regs.flags |= X86_EFLAGS_IF; + } else { + if (test_bit(IPIPE_STALL_FLAG, &p->status)) { + trace_hardirqs_on(); + __clear_bit(IPIPE_STALL_FLAG, &p->status); + } + /* + * We could have received and logged interrupts while + * stalled in the syscall path: play the log now to + * release any pending event. The SYNC_BIT prevents + * infinite recursion in case of flooding. + */ + if (unlikely(__ipipe_ipending_p(p))) + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + } +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF + ipipe_trace_end(0x8000000D); +#endif /* CONFIG_IPIPE_TRACE_IRQSOFF */ +} + +#else /* !CONFIG_X86_32 */ + +#ifdef CONFIG_PREEMPT + +asmlinkage void preempt_schedule_irq(void); + +void __ipipe_preempt_schedule_irq(void) +{ + struct ipipe_percpu_domain_data *p; + unsigned long flags; + /* + * We have no IRQ state fixup on entry to exceptions in + * x86_64, so we have to stall the root stage before + * rescheduling. + */ + BUG_ON(!irqs_disabled_hw()); + local_irq_save(flags); + local_irq_enable_hw(); + preempt_schedule_irq(); /* Ok, may reschedule now. */ + local_irq_disable_hw(); + + /* + * Flush any pending interrupt that may have been logged after + * preempt_schedule_irq() stalled the root stage before + * returning to us, and now. + */ + p = ipipe_root_cpudom_ptr(); + if (unlikely(__ipipe_ipending_p(p))) { + add_preempt_count(PREEMPT_ACTIVE); + trace_hardirqs_on(); + clear_bit(IPIPE_STALL_FLAG, &p->status); + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + sub_preempt_count(PREEMPT_ACTIVE); + } + + __local_irq_restore_nosync(flags); +} + +#endif /* CONFIG_PREEMPT */ + +#endif /* !CONFIG_X86_32 */ + +void __ipipe_halt_root(void) +{ + struct ipipe_percpu_domain_data *p; + + /* Emulate sti+hlt sequence over the root domain. */ + + local_irq_disable_hw(); + + p = ipipe_root_cpudom_ptr(); + + trace_hardirqs_on(); + clear_bit(IPIPE_STALL_FLAG, &p->status); + + if (unlikely(__ipipe_ipending_p(p))) { + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + local_irq_enable_hw(); + } else { +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF + ipipe_trace_end(0x8000000E); +#endif /* CONFIG_IPIPE_TRACE_IRQSOFF */ + asm volatile("sti; hlt": : :"memory"); + } +} + +static void do_machine_check_vector(struct pt_regs *regs, long error_code) +{ +#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_32 + extern void (*machine_check_vector)(struct pt_regs *, long error_code); + machine_check_vector(regs, error_code); +#else + do_machine_check(regs, error_code); +#endif +#endif /* CONFIG_X86_MCE */ +} + +/* Work around genksyms's issue with over-qualification in decls. 
*/ + +typedef void dotraplinkage __ipipe_exhandler(struct pt_regs *, long); + +typedef __ipipe_exhandler *__ipipe_exptr; + +static __ipipe_exptr __ipipe_std_extable[] = { + + [ex_do_divide_error] = &do_divide_error, + [ex_do_overflow] = &do_overflow, + [ex_do_bounds] = &do_bounds, + [ex_do_invalid_op] = &do_invalid_op, + [ex_do_coprocessor_segment_overrun] = &do_coprocessor_segment_overrun, + [ex_do_invalid_TSS] = &do_invalid_TSS, + [ex_do_segment_not_present] = &do_segment_not_present, + [ex_do_stack_segment] = &do_stack_segment, + [ex_do_general_protection] = do_general_protection, + [ex_do_page_fault] = (__ipipe_exptr)&do_page_fault, + [ex_do_spurious_interrupt_bug] = &do_spurious_interrupt_bug, + [ex_do_coprocessor_error] = &do_coprocessor_error, + [ex_do_alignment_check] = &do_alignment_check, + [ex_machine_check_vector] = &do_machine_check_vector, + [ex_do_simd_coprocessor_error] = &do_simd_coprocessor_error, + [ex_do_device_not_available] = &do_device_not_available, +#ifdef CONFIG_X86_32 + [ex_do_iret_error] = &do_iret_error, +#endif +}; + +#ifdef CONFIG_KGDB +#include + +static int __ipipe_xlate_signo[] = { + + [ex_do_divide_error] = SIGFPE, + [ex_do_debug] = SIGTRAP, + [2] = -1, + [ex_do_int3] = SIGTRAP, + [ex_do_overflow] = SIGSEGV, + [ex_do_bounds] = SIGSEGV, + [ex_do_invalid_op] = SIGILL, + [ex_do_device_not_available] = -1, + [8] = -1, + [ex_do_coprocessor_segment_overrun] = SIGFPE, + [ex_do_invalid_TSS] = SIGSEGV, + [ex_do_segment_not_present] = SIGBUS, + [ex_do_stack_segment] = SIGBUS, + [ex_do_general_protection] = SIGSEGV, + [ex_do_page_fault] = SIGSEGV, + [ex_do_spurious_interrupt_bug] = -1, + [ex_do_coprocessor_error] = -1, + [ex_do_alignment_check] = SIGBUS, + [ex_machine_check_vector] = -1, + [ex_do_simd_coprocessor_error] = -1, + [20 ... 31] = -1, +#ifdef CONFIG_X86_32 + [ex_do_iret_error] = SIGSEGV, +#endif +}; +#endif /* CONFIG_KGDB */ + +int __ipipe_handle_exception(struct pt_regs *regs, long error_code, int vector) +{ + bool root_entry = false; + unsigned long flags = 0; + unsigned long cr2 = 0; + + if (ipipe_root_domain_p) { + root_entry = true; + + local_save_flags(flags); + /* + * Replicate hw interrupt state into the virtual mask + * before calling the I-pipe event handler over the + * root domain. Also required later when calling the + * Linux exception handler. + */ + if (irqs_disabled_hw()) + local_irq_disable(); + } +#ifdef CONFIG_KGDB + /* catch exception KGDB is interested in over non-root domains */ + else if (__ipipe_xlate_signo[vector] >= 0 && + !kgdb_handle_exception(vector, __ipipe_xlate_signo[vector], + error_code, regs)) + return 1; +#endif /* CONFIG_KGDB */ + + if (vector == ex_do_page_fault) + cr2 = native_read_cr2(); + + if (unlikely(ipipe_trap_notify(vector, regs))) { + if (root_entry) + local_irq_restore_nosync(flags); + return 1; + } + + if (likely(ipipe_root_domain_p)) { + /* + * In case we faulted in the iret path, regs.flags do not + * match the root domain state. The fault handler or the + * low-level return code may evaluate it. Fix this up, either + * by the root state sampled on entry or, if we migrated to + * root, with the current state. + */ + __fixup_if(root_entry ? raw_irqs_disabled_flags(flags) : + raw_irqs_disabled(), regs); + } else { + /* Detect unhandled faults over non-root domains. */ + struct ipipe_domain *ipd = ipipe_current_domain; + + /* Switch to root so that Linux can handle the fault cleanly. 
*/ + __ipipe_current_domain = ipipe_root_domain; + + ipipe_trace_panic_freeze(); + + /* Always warn about user land and unfixable faults. */ + if ((error_code & 4) || !search_exception_tables(instruction_pointer(regs))) { + printk(KERN_ERR "BUG: Unhandled exception over domain" + " %s at 0x%lx - switching to ROOT\n", + ipd->name, instruction_pointer(regs)); + dump_stack(); + ipipe_trace_panic_dump(); +#ifdef CONFIG_IPIPE_DEBUG + /* Also report fixable ones when debugging is enabled. */ + } else { + printk(KERN_WARNING "WARNING: Fixable exception over " + "domain %s at 0x%lx - switching to ROOT\n", + ipd->name, instruction_pointer(regs)); + dump_stack(); + ipipe_trace_panic_dump(); +#endif /* CONFIG_IPIPE_DEBUG */ + } + } + + if (vector == ex_do_page_fault) + write_cr2(cr2); + + __ipipe_std_extable[vector](regs, error_code); + + /* + * Relevant for 64-bit: Restore root domain state as the low-level + * return code will not align it to regs.flags. + */ + if (root_entry) + local_irq_restore_nosync(flags); + + return 0; +} + +int __ipipe_divert_exception(struct pt_regs *regs, int vector) +{ + bool root_entry = false; + unsigned long flags = 0; + + if (ipipe_root_domain_p) { + root_entry = true; + + local_save_flags(flags); + + if (irqs_disabled_hw()) { + /* + * Same root state handling as in + * __ipipe_handle_exception. + */ + local_irq_disable(); + } + } +#ifdef CONFIG_KGDB + /* catch int1 and int3 over non-root domains */ + else { +#ifdef CONFIG_X86_32 + if (vector != ex_do_device_not_available) +#endif + { + unsigned int condition = 0; + + if (vector == 1) + get_debugreg(condition, 6); + if (!kgdb_handle_exception(vector, SIGTRAP, condition, regs)) + return 1; + } + } +#endif /* CONFIG_KGDB */ + + if (unlikely(ipipe_trap_notify(vector, regs))) { + if (root_entry) + local_irq_restore_nosync(flags); + return 1; + } + + /* see __ipipe_handle_exception */ + if (likely(ipipe_root_domain_p)) + __fixup_if(root_entry ? raw_irqs_disabled_flags(flags) : + raw_irqs_disabled(), regs); + /* + * No need to restore root state in the 64-bit case, the Linux handler + * and the return code will take care of it. + */ + + return 0; +} + +int __ipipe_syscall_root(struct pt_regs *regs) +{ + struct ipipe_percpu_domain_data *p; + unsigned long flags; + int ret; + + /* + * This routine either returns: + * 0 -- if the syscall is to be passed to Linux; + * >0 -- if the syscall should not be passed to Linux, and no + * tail work should be performed; + * <0 -- if the syscall should not be passed to Linux but the + * tail work has to be performed (for handling signals etc). + */ + + if (!__ipipe_syscall_watched_p(current, regs->orig_ax) || + !__ipipe_event_monitored_p(IPIPE_EVENT_SYSCALL)) + return 0; + + ret = __ipipe_dispatch_event(IPIPE_EVENT_SYSCALL, regs); + if (!ipipe_root_domain_p) { +#ifdef CONFIG_X86_64 + local_irq_disable_hw(); +#endif + return 1; + } + + local_irq_save_hw(flags); + p = ipipe_root_cpudom_ptr(); +#ifdef CONFIG_X86_32 + /* + * Fix-up only required on 32-bit as only here the IRET return code + * will evaluate the flags. + */ + __fixup_if(test_bit(IPIPE_STALL_FLAG, &p->status), regs); +#endif + /* + * If allowed, sync pending VIRQs before _TIF_NEED_RESCHED is + * tested. + */ + if (__ipipe_ipending_p(p)) + __ipipe_sync_pipeline(IPIPE_IRQ_DOVIRT); +#ifdef CONFIG_X86_64 + if (!ret) +#endif + local_irq_restore_hw(flags); + + return -ret; +} + +/* + * __ipipe_handle_irq() -- IPIPE's generic IRQ handler. An optimistic + * interrupt protection log is maintained here for each domain. 
Hw + * interrupts are off on entry. + */ +int __ipipe_handle_irq(struct pt_regs *regs) +{ + struct ipipe_domain *this_domain, *next_domain; + unsigned int vector = regs->orig_ax, irq; + struct list_head *head, *pos; + int m_ack; + + if ((long)regs->orig_ax < 0) { + vector = ~vector; +#ifdef CONFIG_X86_LOCAL_APIC + if (vector >= FIRST_SYSTEM_VECTOR) + irq = ipipe_apic_vector_irq(vector); +#ifdef CONFIG_SMP + else if (vector == IRQ_MOVE_CLEANUP_VECTOR) + irq = vector; +#endif /* CONFIG_SMP */ + else +#endif /* CONFIG_X86_LOCAL_APIC */ + irq = __get_cpu_var(vector_irq)[vector]; + m_ack = 0; + } else { /* This is a self-triggered one. */ + irq = vector; + m_ack = 1; + } + + this_domain = ipipe_current_domain; + + if (test_bit(IPIPE_STICKY_FLAG, &this_domain->irqs[irq].control)) + head = &this_domain->p_link; + else { + head = __ipipe_pipeline.next; + next_domain = list_entry(head, struct ipipe_domain, p_link); + if (likely(test_bit(IPIPE_WIRED_FLAG, &next_domain->irqs[irq].control))) { + if (!m_ack && next_domain->irqs[irq].acknowledge) + next_domain->irqs[irq].acknowledge(irq, irq_to_desc(irq)); + __ipipe_dispatch_wired(next_domain, irq); + goto finalize_nosync; + } + } + + /* Ack the interrupt. */ + + pos = head; + + while (pos != &__ipipe_pipeline) { + next_domain = list_entry(pos, struct ipipe_domain, p_link); + if (test_bit(IPIPE_HANDLE_FLAG, &next_domain->irqs[irq].control)) { + __ipipe_set_irq_pending(next_domain, irq); + if (!m_ack && next_domain->irqs[irq].acknowledge) { + next_domain->irqs[irq].acknowledge(irq, irq_to_desc(irq)); + m_ack = 1; + } + } + if (!test_bit(IPIPE_PASS_FLAG, &next_domain->irqs[irq].control)) + break; + pos = next_domain->p_link.next; + } + + /* + * If the interrupt preempted the head domain, then do not + * even try to walk the pipeline, unless an interrupt is + * pending for it. + */ + if (test_bit(IPIPE_AHEAD_FLAG, &this_domain->flags) && + !__ipipe_ipending_p(ipipe_head_cpudom_ptr())) + goto finalize_nosync; + + /* + * Now walk the pipeline, yielding control to the highest + * priority domain that has pending interrupt(s) or + * immediately to the current domain if the interrupt has been + * marked as 'sticky'. This search does not go beyond the + * current domain in the pipeline. + */ + + __ipipe_walk_pipeline(head); + +finalize_nosync: + + /* + * Given our deferred dispatching model for regular IRQs, we + * only record CPU regs for the last timer interrupt, so that + * the timer handler charges CPU times properly. It is assumed + * that other interrupt handlers don't actually care for such + * information. + */ + + if (irq == __ipipe_tick_irq) { + struct pt_regs *tick_regs = &__raw_get_cpu_var(__ipipe_tick_regs); + tick_regs->flags = regs->flags; + tick_regs->cs = regs->cs; + tick_regs->ip = regs->ip; + tick_regs->bp = regs->bp; +#ifdef CONFIG_X86_64 + tick_regs->ss = regs->ss; + tick_regs->sp = regs->sp; +#endif + if (!ipipe_root_domain_p) + tick_regs->flags &= ~X86_EFLAGS_IF; + } + + if (!ipipe_root_domain_p || + test_bit(IPIPE_STALL_FLAG, &ipipe_root_cpudom_var(status))) + return 0; + +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) + /* + * Prevent a spurious rescheduling from being triggered on + * preemptible kernels along the way out through + * ret_from_intr. 
+ */ + if ((long)regs->orig_ax < 0) + __set_bit(IPIPE_STALL_FLAG, &ipipe_root_cpudom_var(status)); +#endif /* CONFIG_SMP */ + + return 1; +} + +int __ipipe_check_tickdev(const char *devname) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!strcmp(devname, "lapic")) + return __ipipe_check_lapic(); +#endif + + return 1; +} + +void *ipipe_irq_handler = __ipipe_handle_irq; +EXPORT_SYMBOL(ipipe_irq_handler); +EXPORT_SYMBOL(io_apic_irqs); +EXPORT_PER_CPU_SYMBOL(__ipipe_tick_regs); +__attribute__((regparm(3))) void do_notify_resume(struct pt_regs *, void *, __u32); +EXPORT_SYMBOL(do_notify_resume); +extern void *sys_call_table; +EXPORT_SYMBOL(sys_call_table); +#ifdef CONFIG_X86_32 +extern void ret_from_intr(void); +EXPORT_SYMBOL(ret_from_intr); +extern spinlock_t i8259A_lock; +extern struct desc_struct idt_table[]; +#else +extern ipipe_spinlock_t i8259A_lock; +extern gate_desc idt_table[]; +#endif +EXPORT_PER_CPU_SYMBOL(vector_irq); +EXPORT_SYMBOL(idt_table); +EXPORT_SYMBOL(i8259A_lock); +EXPORT_SYMBOL(__ipipe_sync_stage); +EXPORT_SYMBOL(kill_proc_info); +EXPORT_SYMBOL(find_task_by_pid_ns); + +EXPORT_SYMBOL(__ipipe_tick_irq); + +EXPORT_SYMBOL_GPL(irq_to_desc); +struct task_struct *__switch_to(struct task_struct *prev_p, + struct task_struct *next_p); +EXPORT_SYMBOL_GPL(__switch_to); +EXPORT_SYMBOL_GPL(show_stack); + +EXPORT_PER_CPU_SYMBOL_GPL(init_tss); +#ifdef CONFIG_SMP +EXPORT_PER_CPU_SYMBOL_GPL(cpu_tlbstate); +#endif /* CONFIG_SMP */ + +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +EXPORT_SYMBOL(tasklist_lock); +#endif /* CONFIG_SMP || CONFIG_DEBUG_SPINLOCK */ + +#if defined(CONFIG_CC_STACKPROTECTOR) && defined(CONFIG_X86_64) +EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); +#endif + +EXPORT_SYMBOL(__ipipe_halt_root); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 04bbd52..76d2fa3 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -38,7 +38,7 @@ void ack_bad_irq(unsigned int irq) * completely. 
* But only ack when the APIC is enabled -AK */ - ack_APIC_irq(); + __ack_APIC_irq(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -231,11 +231,12 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; + irq = __get_cpu_var(vector_irq)[vector]; + __ipipe_move_root_irq(irq); + exit_idle(); irq_enter(); - irq = __get_cpu_var(vector_irq)[vector]; - if (!handle_irq(irq, regs)) { ack_APIC_irq(); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 40f3077..e3604ee 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -159,11 +159,13 @@ static void __init smp_intr_init(void) alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); +#ifndef CONFIG_IPIPE alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); +#endif /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); @@ -178,6 +180,10 @@ static void __init smp_intr_init(void) /* IPI used for rebooting/stopping */ alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); +#if defined(CONFIG_IPIPE) && defined(CONFIG_X86_32) + /* IPI for critical lock */ + alloc_intr_gate(IPIPE_CRITICAL_VECTOR, ipipe_ipiX); +#endif #endif #endif /* CONFIG_SMP */ } @@ -212,6 +218,12 @@ static void __init apic_intr_init(void) alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif +#if defined(CONFIG_IPIPE) && defined(CONFIG_X86_32) + alloc_intr_gate(IPIPE_SERVICE_VECTOR0, ipipe_ipi0); + alloc_intr_gate(IPIPE_SERVICE_VECTOR1, ipipe_ipi1); + alloc_intr_gate(IPIPE_SERVICE_VECTOR2, ipipe_ipi2); + alloc_intr_gate(IPIPE_SERVICE_VECTOR3, ipipe_ipi3); +#endif #endif } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d0ba107..b624d46 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -35,7 +35,15 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return -ENOMEM; WARN_ON((unsigned long)dst->thread.xstate & 15); memcpy(dst->thread.xstate, src->thread.xstate, xstate_size); + } else { +#ifdef CONFIG_IPIPE + dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, + GFP_KERNEL); + if (!dst->thread.xstate) + return -ENOMEM; +#endif } + return 0; } @@ -61,6 +69,10 @@ void arch_task_cache_init(void) kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), SLAB_PANIC | SLAB_NOTRACK, NULL); +#ifdef CONFIG_IPIPE + current->thread.xstate = kmem_cache_alloc(task_xstate_cachep, + GFP_KERNEL); +#endif } /* @@ -309,7 +321,7 @@ EXPORT_SYMBOL(default_idle); void stop_this_cpu(void *dummy) { - local_irq_disable(); + local_irq_disable_hw(); /* * Remove this CPU: */ @@ -534,6 +546,11 @@ static void c1e_idle(void) void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { +#ifdef CONFIG_IPIPE +#define default_to_mwait force_mwait +#else +#define default_to_mwait 1 +#endif #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { printk(KERN_WARNING "WARNING: polling idle and HT enabled," @@ -543,7 +560,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) if (pm_idle) return; 
- if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { + if (default_to_mwait && cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { /* * One CPU supports mwait => All CPUs supports mwait */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4cf7956..fff349c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -305,10 +305,12 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->cs = __USER_CS; regs->ip = new_ip; regs->sp = new_sp; +#ifndef CONFIG_IPIPE /* Lazily handled, init_fpu() will reset the state. */ /* * Free the old FP and other extended state */ free_thread_xstate(current); +#endif } EXPORT_SYMBOL_GPL(start_thread); @@ -345,7 +347,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); bool preload_fpu; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6eabe90..e96b01d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -58,6 +58,8 @@ asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); +asmlinkage extern void thread_return(void); + unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -292,6 +294,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) childregs; p->thread.sp0 = (unsigned long) (childregs+1); p->thread.usersp = me->thread.usersp; + p->thread.rip = (unsigned long) thread_return; set_tsk_thread_flag(p, TIF_FORK); @@ -358,10 +361,12 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->ss = __USER_DS; regs->flags = 0x200; set_fs(USER_DS); +#ifndef CONFIG_IPIPE /* Lazily handled, init_fpu() will reset the state. */ /* * Free the old FP and other extended state */ free_thread_xstate(current); +#endif } EXPORT_SYMBOL_GPL(start_thread); @@ -380,7 +385,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; bool preload_fpu; diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index ec1de97..a3f5bd6 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -184,9 +184,9 @@ static void native_smp_send_stop(void) udelay(1); } - local_irq_save(flags); + local_irq_save_hw(flags); disable_local_APIC(); - local_irq_restore(flags); + local_irq_restore_hw(flags); } /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 28e963d..9eee566 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -266,7 +266,7 @@ static void __cpuinit smp_callin(void) /* * Activate a secondary processor. 
*/ -notrace static void __cpuinit start_secondary(void *unused) +static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too @@ -837,7 +837,7 @@ do_rest: int __cpuinit native_cpu_up(unsigned int cpu) { int apicid = apic->cpu_present_to_apicid(cpu); - unsigned long flags; + unsigned long flags, _flags; int err; WARN_ON(irqs_disabled()); @@ -889,9 +889,9 @@ int __cpuinit native_cpu_up(unsigned int cpu) * Check TSC synchronization with the AP (keep irqs disabled * while doing so): */ - local_irq_save(flags); + local_irq_save_full(flags, _flags); check_tsc_sync_source(cpu); - local_irq_restore(flags); + local_irq_restore_full(flags, _flags); while (!cpu_online(cpu)) { cpu_relax(); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index be25734..2b61ebd 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -70,11 +70,12 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) * manually to deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ - spin_lock(&i8259A_lock); + unsigned long flags; + spin_lock_irqsave_cond(&i8259A_lock,flags); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + spin_unlock_irqrestore_cond(&i8259A_lock,flags); } global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7e37dce..38ff3e2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -805,6 +805,7 @@ void __math_state_restore(void) */ if (unlikely(restore_fpu_checking(tsk))) { stts(); + local_irq_enable_hw_cond(); force_sig(SIGSEGV, tsk); return; } @@ -827,6 +828,7 @@ asmlinkage void math_state_restore(void) { struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; + unsigned long flags; if (!tsk_used_math(tsk)) { local_irq_enable(); @@ -843,9 +845,11 @@ asmlinkage void math_state_restore(void) local_irq_disable(); } + local_irq_save_hw_cond(flags); clts(); /* Allow maths ops (or we recurse) */ __math_state_restore(); + local_irq_restore_hw_cond(flags); } EXPORT_SYMBOL_GPL(math_state_restore); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9c4e625..f0f25ab 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -148,12 +148,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) do_exit(SIGSEGV); } + local_irq_disable_hw_cond(); tss = &per_cpu(init_tss, get_cpu()); current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, ¤t->thread); current->thread.saved_sp0 = 0; put_cpu(); + local_irq_enable_hw_cond(); ret = KVM86->regs32; @@ -324,12 +326,14 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); + local_irq_disable_hw_cond(); tss = &per_cpu(init_tss, get_cpu()); tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); put_cpu(); + local_irq_enable_hw_cond(); tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index c9f2d9b..78d780a 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -30,7 +30,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) void *p; int i; - if (unlikely(in_interrupt())) + if 
(unlikely(!ipipe_root_domain_p || in_interrupt())) return __memcpy(to, from, len); p = to; diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index bf9a7d5..98609ae 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -65,6 +65,10 @@ thunk lockdep_sys_exit_thunk,lockdep_sys_exit #endif +#ifdef CONFIG_IPIPE + thunk_retrax __ipipe_syscall_root_thunk,__ipipe_syscall_root +#endif + /* SAVE_ARGS below is used only for the .cfi directives it contains. */ CFI_STARTPROC SAVE_ARGS diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f4cee90..d678a7c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,3 +1,4 @@ + /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. @@ -323,43 +324,9 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_all(void) -{ - unsigned long address; - - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -} - -/* - * 64-bit: - * - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static noinline int vmalloc_fault(unsigned long address) +static inline int vmalloc_sync_one(pgd_t *pgd, unsigned long address) { - pgd_t *pgd, *pgd_ref; + pgd_t *pgd_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; @@ -373,7 +340,6 @@ static noinline int vmalloc_fault(unsigned long address) * happen within a race in page table update. In the later * case just flush: */ - pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; @@ -421,6 +387,46 @@ static noinline int vmalloc_fault(unsigned long address) return 0; } +void vmalloc_sync_all(void) +{ + unsigned long address; + + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd = pgd = pgd_offset(current->active_mm, address); + return vmalloc_sync_one(pgd, address); +} + static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" @@ -958,6 +964,9 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) /* Get the faulting address: */ address = read_cr2(); + if (!__ipipe_pipeline_head_p(ipipe_root_domain)) + local_irq_enable_hw_cond(); + /* * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. 
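The next hunk introduces __ipipe_pin_range_globally(), which walks pgd_list and syncs the kernel mapping of the given range into every page directory up front. A minimal usage sketch, assuming a co-kernel that wants to touch a vmalloc'ed region from the head domain without ever taking a vmalloc fault; the wrapper below is hypothetical, only __ipipe_pin_range_globally() itself comes from the patch:

static void *sample_vmalloc_pinned(unsigned long len)
{
	void *p = vmalloc(len);		/* <linux/vmalloc.h> */

	if (p)
		/* propagate the fresh kernel mappings to all pgds right away */
		__ipipe_pin_range_globally((unsigned long)p,
					   (unsigned long)p + len);
	return p;
}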
@@ -1137,3 +1146,43 @@ good_area: up_read(&mm->mmap_sem); } + +#ifdef CONFIG_IPIPE +void __ipipe_pin_range_globally(unsigned long start, unsigned long end) +{ +#ifdef CONFIG_X86_32 + unsigned long next, addr = start; + + do { + unsigned long flags; + struct page *page; + + next = pgd_addr_end(addr, end); + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) + vmalloc_sync_one(page_address(page), addr); + spin_unlock_irqrestore(&pgd_lock, flags); + + } while (addr = next, addr != end); +#else + unsigned long next, addr = start; + int ret = 0; + + do { + struct page *page; + + next = pgd_addr_end(addr, end); + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(addr); + ret = vmalloc_sync_one(pgd, addr); + if (ret) + break; + } + spin_unlock(&pgd_lock); + addr = next; + } while (!ret && addr != end); +#endif +} +#endif /* CONFIG_IPIPE */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 36fe08e..32adecd 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -57,11 +57,15 @@ static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; */ void leave_mm(int cpu) { + unsigned long flags; + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); + local_irq_save_hw_cond(flags); cpumask_clear_cpu(cpu, mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); load_cr3(swapper_pg_dir); + local_irq_restore_hw_cond(flags); } EXPORT_SYMBOL_GPL(leave_mm); @@ -192,6 +196,9 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, apic->send_IPI_mask(to_cpumask(f->flush_cpumask), INVALIDATE_TLB_VECTOR_START + sender); +#ifdef CONFIG_IPIPE + WARN_ON_ONCE(irqs_disabled_hw()); +#endif while (!cpumask_empty(to_cpumask(f->flush_cpumask))) cpu_relax(); } diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index 737a1c4..15e81de 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -21,7 +21,7 @@ * With multiple simultaneous hypertransport irq devices it might pay * to make this more fine grained. But start with simple, stupid, and correct. */ -static DEFINE_SPINLOCK(ht_irq_lock); +static IPIPE_DEFINE_SPINLOCK(ht_irq_lock); struct ht_irq_cfg { struct pci_dev *dev; diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index 5ed1b82..d57ad7d 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -3016,6 +3016,53 @@ static int serial8250_resume(struct platform_device *dev) return 0; } +#if defined(CONFIG_IPIPE_DEBUG) && defined(CONFIG_SERIAL_8250_CONSOLE) + +#include + +void __weak __ipipe_serial_debug(const char *fmt, ...) 
+{ + struct uart_8250_port *up = &serial8250_ports[0]; + unsigned int ier, count; + unsigned long flags; + char buf[128]; + va_list ap; + + va_start(ap, fmt); + vsprintf(buf, fmt, ap); + va_end(ap); + count = strlen(buf); + + touch_nmi_watchdog(); + + local_irq_save_hw(flags); + + /* + * First save the IER then disable the interrupts + */ + ier = serial_in(up, UART_IER); + + if (up->capabilities & UART_CAP_UUE) + serial_out(up, UART_IER, UART_IER_UUE); + else + serial_out(up, UART_IER, 0); + + uart_console_write(&up->port, buf, count, serial8250_console_putchar); + + /* + * Finally, wait for transmitter to become empty + * and restore the IER + */ + wait_for_xmitr(up, BOTH_EMPTY); + serial_out(up, UART_IER, ier); + + local_irq_restore_hw(flags); +} + +EXPORT_SYMBOL(__ipipe_serial_debug); + +#endif + static struct platform_driver serial8250_isa_driver = { .probe = serial8250_probe, .remove = __devexit_p(serial8250_remove), diff --git a/fs/exec.c b/fs/exec.c index a2a3944..0708ae7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -715,6 +715,7 @@ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct * old_mm, *active_mm; + unsigned long flags; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -737,8 +738,10 @@ static int exec_mmap(struct mm_struct *mm) task_lock(tsk); active_mm = tsk->active_mm; tsk->mm = mm; + ipipe_mm_switch_protect(flags); tsk->active_mm = mm; activate_mm(active_mm, mm); + ipipe_mm_switch_unprotect(flags); task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index c99c64d..5d01b93 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -60,11 +60,11 @@ static inline int atomic_add_return(int i, atomic_t *v) unsigned long flags; int temp; - local_irq_save(flags); + local_irq_save_hw(flags); temp = v->counter; temp += i; v->counter = temp; - local_irq_restore(flags); + local_irq_restore_hw(flags); return temp; } @@ -82,11 +82,11 @@ static inline int atomic_sub_return(int i, atomic_t *v) unsigned long flags; int temp; - local_irq_save(flags); + local_irq_save_hw(flags); temp = v->counter; temp -= i; v->counter = temp; - local_irq_restore(flags); + local_irq_restore_hw(flags); return temp; } @@ -139,9 +139,9 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) unsigned long flags; mask = ~mask; - local_irq_save(flags); + local_irq_save_hw(flags); *addr &= mask; - local_irq_restore(flags); + local_irq_restore_hw(flags); } #define atomic_xchg(ptr, v) (xchg(&(ptr)->counter, (v))) diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h index c894646..8d42ffe 100644 --- a/include/asm-generic/bitops/atomic.h +++ b/include/asm-generic/bitops/atomic.h @@ -21,20 +21,20 @@ extern raw_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned; * this is the substitute */ #define _atomic_spin_lock_irqsave(l,f) do { \ raw_spinlock_t *s = ATOMIC_HASH(l); \ - local_irq_save(f); \ + local_irq_save_hw(f); \ __raw_spin_lock(s); \ } while(0) #define _atomic_spin_unlock_irqrestore(l,f) do { \ raw_spinlock_t *s = ATOMIC_HASH(l); \ __raw_spin_unlock(s); \ - local_irq_restore(f); \ + local_irq_restore_hw(f); \ } while(0) #else -# define _atomic_spin_lock_irqsave(l,f) do { local_irq_save(f); } while (0) -# define _atomic_spin_unlock_irqrestore(l,f) do { local_irq_restore(f); } while (0) +# define _atomic_spin_lock_irqsave(l,f) do { local_irq_save_hw(f); } while (0) +# define 
_atomic_spin_unlock_irqrestore(l,f) do { local_irq_restore_hw(f); } while (0) #endif /* diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h index b2ba2fc..ed01ab9 100644 --- a/include/asm-generic/cmpxchg-local.h +++ b/include/asm-generic/cmpxchg-local.h @@ -20,7 +20,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, if (size == 8 && sizeof(unsigned long) != 8) wrong_size_cmpxchg(ptr); - local_irq_save(flags); + local_irq_save_hw(flags); switch (size) { case 1: prev = *(u8 *)ptr; if (prev == old) @@ -41,7 +41,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, default: wrong_size_cmpxchg(ptr); } - local_irq_restore(flags); + local_irq_restore_hw(flags); return prev; } @@ -54,11 +54,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr, u64 prev; unsigned long flags; - local_irq_save(flags); + local_irq_save_hw(flags); prev = *(u64 *)ptr; if (prev == old) *(u64 *)ptr = new; - local_irq_restore(flags); + local_irq_restore_hw(flags); return prev; } diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 90079c3..65e872e 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -56,6 +56,20 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define __raw_get_cpu_var(var) \ (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset)) +#ifdef CONFIG_IPIPE +#if defined(CONFIG_IPIPE_DEBUG_INTERNAL) && defined(CONFIG_SMP) +extern int __ipipe_check_percpu_access(void); +#define __ipipe_local_cpu_offset \ + ({ \ + WARN_ON_ONCE(__ipipe_check_percpu_access()); \ + __my_cpu_offset; \ + }) +#else +#define __ipipe_local_cpu_offset __my_cpu_offset +#endif +#define __ipipe_get_cpu_var(var) \ + (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __ipipe_local_cpu_offset)) +#endif /* CONFIG_IPIPE */ #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA extern void setup_per_cpu_areas(void); @@ -66,6 +80,7 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) +#define __ipipe_get_cpu_var(var) __raw_get_cpu_var(var) #endif /* SMP */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 6d527ee..c997ef1 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -183,24 +183,28 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() \ - do { \ - ftrace_nmi_enter(); \ - BUG_ON(in_nmi()); \ - add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ - lockdep_off(); \ - rcu_nmi_enter(); \ - trace_hardirq_enter(); \ +#define nmi_enter() \ + do { \ + if (likely(!ipipe_test_foreign_stack())) { \ + ftrace_nmi_enter(); \ + BUG_ON(in_nmi()); \ + add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + lockdep_off(); \ + rcu_nmi_enter(); \ + trace_hardirq_enter(); \ + } \ } while (0) -#define nmi_exit() \ - do { \ - trace_hardirq_exit(); \ - rcu_nmi_exit(); \ - lockdep_on(); \ - BUG_ON(!in_nmi()); \ - sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ - ftrace_nmi_exit(); \ +#define nmi_exit() \ + do { \ + if (likely(!ipipe_test_foreign_stack())) { \ + trace_hardirq_exit(); \ + rcu_nmi_exit(); \ + lockdep_on(); \ + BUG_ON(!in_nmi()); \ + sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + ftrace_nmi_exit(); \ + } \ } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/ipipe.h b/include/linux/ipipe.h new file mode 100644 index 0000000..1040a2b --- /dev/null +++ b/include/linux/ipipe.h @@ -0,0 +1,690 @@ +/* -*- linux-c -*- 
+ * include/linux/ipipe.h + * + * Copyright (C) 2002-2007 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __LINUX_IPIPE_H +#define __LINUX_IPIPE_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT + +#include +#include + +static inline int ipipe_disable_context_check(int cpu) +{ + return xchg(&per_cpu(ipipe_percpu_context_check, cpu), 0); +} + +static inline void ipipe_restore_context_check(int cpu, int old_state) +{ + per_cpu(ipipe_percpu_context_check, cpu) = old_state; +} + +static inline void ipipe_context_check_off(void) +{ + int cpu; + for_each_online_cpu(cpu) + per_cpu(ipipe_percpu_context_check, cpu) = 0; +} + +#else /* !CONFIG_IPIPE_DEBUG_CONTEXT */ + +static inline int ipipe_disable_context_check(int cpu) +{ + return 0; +} + +static inline void ipipe_restore_context_check(int cpu, int old_state) { } + +static inline void ipipe_context_check_off(void) { } + +#endif /* !CONFIG_IPIPE_DEBUG_CONTEXT */ + +#ifdef CONFIG_IPIPE + +#define IPIPE_VERSION_STRING IPIPE_ARCH_STRING +#define IPIPE_RELEASE_NUMBER ((IPIPE_MAJOR_NUMBER << 16) | \ + (IPIPE_MINOR_NUMBER << 8) | \ + (IPIPE_PATCH_NUMBER)) + +#ifndef BROKEN_BUILTIN_RETURN_ADDRESS +#define __BUILTIN_RETURN_ADDRESS0 ((unsigned long)__builtin_return_address(0)) +#define __BUILTIN_RETURN_ADDRESS1 ((unsigned long)__builtin_return_address(1)) +#endif /* !BUILTIN_RETURN_ADDRESS */ + +#define IPIPE_ROOT_PRIO 100 +#define IPIPE_ROOT_ID 0 +#define IPIPE_ROOT_NPTDKEYS 4 /* Must be <= BITS_PER_LONG */ + +#define IPIPE_RESET_TIMER 0x1 +#define IPIPE_GRAB_TIMER 0x2 + +/* Global domain flags */ +#define IPIPE_SPRINTK_FLAG 0 /* Synchronous printk() allowed */ +#define IPIPE_AHEAD_FLAG 1 /* Domain always heads the pipeline */ + +/* Interrupt control bits */ +#define IPIPE_HANDLE_FLAG 0 +#define IPIPE_PASS_FLAG 1 +#define IPIPE_ENABLE_FLAG 2 +#define IPIPE_DYNAMIC_FLAG IPIPE_HANDLE_FLAG +#define IPIPE_STICKY_FLAG 3 +#define IPIPE_SYSTEM_FLAG 4 +#define IPIPE_LOCK_FLAG 5 +#define IPIPE_WIRED_FLAG 6 +#define IPIPE_EXCLUSIVE_FLAG 7 + +#define IPIPE_HANDLE_MASK (1 << IPIPE_HANDLE_FLAG) +#define IPIPE_PASS_MASK (1 << IPIPE_PASS_FLAG) +#define IPIPE_ENABLE_MASK (1 << IPIPE_ENABLE_FLAG) +#define IPIPE_DYNAMIC_MASK IPIPE_HANDLE_MASK +#define IPIPE_STICKY_MASK (1 << IPIPE_STICKY_FLAG) +#define IPIPE_SYSTEM_MASK (1 << IPIPE_SYSTEM_FLAG) +#define IPIPE_LOCK_MASK (1 << IPIPE_LOCK_FLAG) +#define IPIPE_WIRED_MASK (1 << IPIPE_WIRED_FLAG) +#define IPIPE_EXCLUSIVE_MASK (1 << IPIPE_EXCLUSIVE_FLAG) + +#define IPIPE_DEFAULT_MASK (IPIPE_HANDLE_MASK|IPIPE_PASS_MASK) +#define IPIPE_STDROOT_MASK (IPIPE_HANDLE_MASK|IPIPE_PASS_MASK|IPIPE_SYSTEM_MASK) + +#define IPIPE_EVENT_SELF 0x80000000 + +#define IPIPE_NR_CPUS NR_CPUS + +/* This accessor assumes hw IRQs are 
off on SMP; allows assignment. */ +#define __ipipe_current_domain __ipipe_get_cpu_var(ipipe_percpu_domain) +/* This read-only accessor makes sure that hw IRQs are off on SMP. */ +#define ipipe_current_domain \ + ({ \ + struct ipipe_domain *__ipd__; \ + unsigned long __flags__; \ + local_irq_save_hw_smp(__flags__); \ + __ipd__ = __ipipe_current_domain; \ + local_irq_restore_hw_smp(__flags__); \ + __ipd__; \ + }) + +#define ipipe_virtual_irq_p(irq) ((irq) >= IPIPE_VIRQ_BASE && \ + (irq) < IPIPE_NR_IRQS) + +#define IPIPE_SAME_HANDLER ((ipipe_irq_handler_t)(-1)) + +struct irq_desc; + +typedef void (*ipipe_irq_ackfn_t)(unsigned irq, struct irq_desc *desc); + +typedef int (*ipipe_event_handler_t)(unsigned event, + struct ipipe_domain *from, + void *data); +struct ipipe_domain { + + int slot; /* Slot number in percpu domain data array. */ + struct list_head p_link; /* Link in pipeline */ + ipipe_event_handler_t evhand[IPIPE_NR_EVENTS]; /* Event handlers. */ + unsigned long long evself; /* Self-monitored event bits. */ + + struct irqdesc { + unsigned long control; + ipipe_irq_ackfn_t acknowledge; + ipipe_irq_handler_t handler; + void *cookie; + } ____cacheline_aligned irqs[IPIPE_NR_IRQS]; + + int priority; + void *pdd; + unsigned long flags; + unsigned domid; + const char *name; + struct mutex mutex; +}; + +#define IPIPE_HEAD_PRIORITY (-1) /* For domains always heading the pipeline */ + +struct ipipe_domain_attr { + + unsigned domid; /* Domain identifier -- Magic value set by caller */ + const char *name; /* Domain name -- Warning: won't be dup'ed! */ + int priority; /* Priority in interrupt pipeline */ + void (*entry) (void); /* Domain entry point */ + void *pdd; /* Per-domain (opaque) data pointer */ +}; + +#define __ipipe_irq_cookie(ipd, irq) (ipd)->irqs[irq].cookie +#define __ipipe_irq_handler(ipd, irq) (ipd)->irqs[irq].handler +#define __ipipe_cpudata_irq_hits(ipd, cpu, irq) ipipe_percpudom(ipd, irqall, cpu)[irq] + +extern unsigned __ipipe_printk_virq; + +extern unsigned long __ipipe_virtual_irq_map; + +extern struct list_head __ipipe_pipeline; + +extern int __ipipe_event_monitors[]; + +/* Private interface */ + +void ipipe_init_early(void); + +void ipipe_init(void); + +#ifdef CONFIG_PROC_FS +void ipipe_init_proc(void); + +#ifdef CONFIG_IPIPE_TRACE +void __ipipe_init_tracer(void); +#else /* !CONFIG_IPIPE_TRACE */ +#define __ipipe_init_tracer() do { } while(0) +#endif /* CONFIG_IPIPE_TRACE */ + +#else /* !CONFIG_PROC_FS */ +#define ipipe_init_proc() do { } while(0) +#endif /* CONFIG_PROC_FS */ + +void __ipipe_init_stage(struct ipipe_domain *ipd); + +void __ipipe_cleanup_domain(struct ipipe_domain *ipd); + +void __ipipe_add_domain_proc(struct ipipe_domain *ipd); + +void __ipipe_remove_domain_proc(struct ipipe_domain *ipd); + +void __ipipe_flush_printk(unsigned irq, void *cookie); + +void __ipipe_walk_pipeline(struct list_head *pos); + +void __ipipe_pend_irq(unsigned irq, struct list_head *head); + +int __ipipe_dispatch_event(unsigned event, void *data); + +void __ipipe_dispatch_wired_nocheck(struct ipipe_domain *head, unsigned irq); + +void __ipipe_dispatch_wired(struct ipipe_domain *head, unsigned irq); + +void __ipipe_sync_stage(int dovirt); + +void __ipipe_set_irq_pending(struct ipipe_domain *ipd, unsigned irq); + +void __ipipe_lock_irq(struct ipipe_domain *ipd, int cpu, unsigned irq); + +void __ipipe_unlock_irq(struct ipipe_domain *ipd, unsigned irq); + +void __ipipe_pin_range_globally(unsigned long start, unsigned long end); + +/* Must be called hw IRQs off. 
*/ +static inline void ipipe_irq_lock(unsigned irq) +{ + __ipipe_lock_irq(__ipipe_current_domain, ipipe_processor_id(), irq); +} + +/* Must be called hw IRQs off. */ +static inline void ipipe_irq_unlock(unsigned irq) +{ + __ipipe_unlock_irq(__ipipe_current_domain, irq); +} + +#ifndef __ipipe_sync_pipeline +#define __ipipe_sync_pipeline(dovirt) __ipipe_sync_stage(dovirt) +#endif + +#ifndef __ipipe_run_irqtail +#define __ipipe_run_irqtail() do { } while(0) +#endif + +#define __ipipe_pipeline_head_p(ipd) (&(ipd)->p_link == __ipipe_pipeline.next) + +#define __ipipe_ipending_p(p) ((p)->irqpend_himap != 0) + +/* + * Keep the following as a macro, so that client code could check for + * the support of the invariant pipeline head optimization. + */ +#define __ipipe_pipeline_head() \ + list_entry(__ipipe_pipeline.next, struct ipipe_domain, p_link) + +#define local_irq_enable_hw_cond() local_irq_enable_hw() +#define local_irq_disable_hw_cond() local_irq_disable_hw() +#define local_irq_save_hw_cond(flags) local_irq_save_hw(flags) +#define local_irq_restore_hw_cond(flags) local_irq_restore_hw(flags) + +#ifdef CONFIG_SMP +cpumask_t __ipipe_set_irq_affinity(unsigned irq, cpumask_t cpumask); +int __ipipe_send_ipi(unsigned ipi, cpumask_t cpumask); +#define local_irq_save_hw_smp(flags) local_irq_save_hw(flags) +#define local_irq_restore_hw_smp(flags) local_irq_restore_hw(flags) +#else /* !CONFIG_SMP */ +#define local_irq_save_hw_smp(flags) do { (void)(flags); } while(0) +#define local_irq_restore_hw_smp(flags) do { } while(0) +#endif /* CONFIG_SMP */ + +#define local_irq_save_full(vflags, rflags) \ + do { \ + local_irq_save(vflags); \ + local_irq_save_hw(rflags); \ + } while(0) + +#define local_irq_restore_full(vflags, rflags) \ + do { \ + local_irq_restore_hw(rflags); \ + local_irq_restore(vflags); \ + } while(0) + +static inline void __local_irq_restore_nosync(unsigned long x) +{ + struct ipipe_percpu_domain_data *p = ipipe_root_cpudom_ptr(); + + if (raw_irqs_disabled_flags(x)) { + set_bit(IPIPE_STALL_FLAG, &p->status); + trace_hardirqs_off(); + } else { + trace_hardirqs_on(); + clear_bit(IPIPE_STALL_FLAG, &p->status); + } +} + +static inline void local_irq_restore_nosync(unsigned long x) +{ + unsigned long flags; + local_irq_save_hw_smp(flags); + __local_irq_restore_nosync(x); + local_irq_restore_hw_smp(flags); +} + +#define __ipipe_root_domain_p (__ipipe_current_domain == ipipe_root_domain) +#define ipipe_root_domain_p (ipipe_current_domain == ipipe_root_domain) + +static inline int __ipipe_event_monitored_p(int ev) +{ + if (__ipipe_event_monitors[ev] > 0) + return 1; + + return (ipipe_current_domain->evself & (1LL << ev)) != 0; +} + +#define ipipe_sigwake_notify(p) \ +do { \ + if (((p)->flags & PF_EVNOTIFY) && __ipipe_event_monitored_p(IPIPE_EVENT_SIGWAKE)) \ + __ipipe_dispatch_event(IPIPE_EVENT_SIGWAKE, p); \ +} while(0) + +#define ipipe_exit_notify(p) \ +do { \ + if (((p)->flags & PF_EVNOTIFY) && __ipipe_event_monitored_p(IPIPE_EVENT_EXIT)) \ + __ipipe_dispatch_event(IPIPE_EVENT_EXIT, p); \ +} while(0) + +#define ipipe_setsched_notify(p) \ +do { \ + if (((p)->flags & PF_EVNOTIFY) && __ipipe_event_monitored_p(IPIPE_EVENT_SETSCHED)) \ + __ipipe_dispatch_event(IPIPE_EVENT_SETSCHED, p); \ +} while(0) + +#define ipipe_schedule_notify(prev, next) \ +do { \ + if ((((prev)->flags|(next)->flags) & PF_EVNOTIFY) && \ + __ipipe_event_monitored_p(IPIPE_EVENT_SCHEDULE)) \ + __ipipe_dispatch_event(IPIPE_EVENT_SCHEDULE,next); \ +} while(0) + +#define ipipe_trap_notify(ex, regs) \ +({ \ + unsigned long __flags__; \ 
+ int __ret__ = 0; \ + local_irq_save_hw_smp(__flags__); \ + if ((test_bit(IPIPE_NOSTACK_FLAG, &ipipe_this_cpudom_var(status)) || \ + ((current)->flags & PF_EVNOTIFY)) && \ + __ipipe_event_monitored_p(ex)) { \ + local_irq_restore_hw_smp(__flags__); \ + __ret__ = __ipipe_dispatch_event(ex, regs); \ + } else \ + local_irq_restore_hw_smp(__flags__); \ + __ret__; \ +}) + +static inline void ipipe_init_notify(struct task_struct *p) +{ + if (__ipipe_event_monitored_p(IPIPE_EVENT_INIT)) + __ipipe_dispatch_event(IPIPE_EVENT_INIT, p); +} + +struct mm_struct; + +static inline void ipipe_cleanup_notify(struct mm_struct *mm) +{ + if (__ipipe_event_monitored_p(IPIPE_EVENT_CLEANUP)) + __ipipe_dispatch_event(IPIPE_EVENT_CLEANUP, mm); +} + +/* Public interface */ + +int ipipe_register_domain(struct ipipe_domain *ipd, + struct ipipe_domain_attr *attr); + +int ipipe_unregister_domain(struct ipipe_domain *ipd); + +void ipipe_suspend_domain(void); + +int ipipe_virtualize_irq(struct ipipe_domain *ipd, + unsigned irq, + ipipe_irq_handler_t handler, + void *cookie, + ipipe_irq_ackfn_t acknowledge, + unsigned modemask); + +int ipipe_control_irq(unsigned irq, + unsigned clrmask, + unsigned setmask); + +unsigned ipipe_alloc_virq(void); + +int ipipe_free_virq(unsigned virq); + +int ipipe_trigger_irq(unsigned irq); + +static inline void __ipipe_propagate_irq(unsigned irq) +{ + struct list_head *next = __ipipe_current_domain->p_link.next; + if (next == &ipipe_root.p_link) { + /* Fast path: root must handle all interrupts. */ + __ipipe_set_irq_pending(&ipipe_root, irq); + return; + } + __ipipe_pend_irq(irq, next); +} + +static inline void __ipipe_schedule_irq(unsigned irq) +{ + __ipipe_pend_irq(irq, &__ipipe_current_domain->p_link); +} + +static inline void __ipipe_schedule_irq_head(unsigned irq) +{ + __ipipe_set_irq_pending(__ipipe_pipeline_head(), irq); +} + +static inline void __ipipe_schedule_irq_root(unsigned irq) +{ + __ipipe_set_irq_pending(&ipipe_root, irq); +} + +static inline void ipipe_propagate_irq(unsigned irq) +{ + unsigned long flags; + + local_irq_save_hw(flags); + __ipipe_propagate_irq(irq); + local_irq_restore_hw(flags); +} + +static inline void ipipe_schedule_irq(unsigned irq) +{ + unsigned long flags; + + local_irq_save_hw(flags); + __ipipe_schedule_irq(irq); + local_irq_restore_hw(flags); +} + +static inline void ipipe_schedule_irq_head(unsigned irq) +{ + unsigned long flags; + + local_irq_save_hw(flags); + __ipipe_schedule_irq_head(irq); + local_irq_restore_hw(flags); +} + +static inline void ipipe_schedule_irq_root(unsigned irq) +{ + unsigned long flags; + + local_irq_save_hw(flags); + __ipipe_schedule_irq_root(irq); + local_irq_restore_hw(flags); +} + +void ipipe_stall_pipeline_from(struct ipipe_domain *ipd); + +unsigned long ipipe_test_and_stall_pipeline_from(struct ipipe_domain *ipd); + +unsigned long ipipe_test_and_unstall_pipeline_from(struct ipipe_domain *ipd); + +static inline void ipipe_unstall_pipeline_from(struct ipipe_domain *ipd) +{ + ipipe_test_and_unstall_pipeline_from(ipd); +} + +void ipipe_restore_pipeline_from(struct ipipe_domain *ipd, + unsigned long x); + +static inline unsigned long ipipe_test_pipeline_from(struct ipipe_domain *ipd) +{ + return test_bit(IPIPE_STALL_FLAG, &ipipe_cpudom_var(ipd, status)); +} + +static inline void ipipe_stall_pipeline_head(void) +{ + local_irq_disable_hw(); + __set_bit(IPIPE_STALL_FLAG, &ipipe_head_cpudom_var(status)); +} + +static inline unsigned long ipipe_test_and_stall_pipeline_head(void) +{ + local_irq_disable_hw(); + return 
__test_and_set_bit(IPIPE_STALL_FLAG, &ipipe_head_cpudom_var(status)); +} + +void ipipe_unstall_pipeline_head(void); + +void __ipipe_restore_pipeline_head(unsigned long x); + +static inline void ipipe_restore_pipeline_head(unsigned long x) +{ + /* On some archs, __test_and_set_bit() might return different + * truth value than test_bit(), so we test the exclusive OR of + * both statuses, assuming that the lowest bit is always set in + * the truth value (if this is wrong, the failed optimization will + * be caught in __ipipe_restore_pipeline_head() if + * CONFIG_DEBUG_KERNEL is set). */ + if ((x ^ test_bit(IPIPE_STALL_FLAG, &ipipe_head_cpudom_var(status))) & 1) + __ipipe_restore_pipeline_head(x); +} + +#define ipipe_unstall_pipeline() \ + ipipe_unstall_pipeline_from(ipipe_current_domain) + +#define ipipe_test_and_unstall_pipeline() \ + ipipe_test_and_unstall_pipeline_from(ipipe_current_domain) + +#define ipipe_test_pipeline() \ + ipipe_test_pipeline_from(ipipe_current_domain) + +#define ipipe_test_and_stall_pipeline() \ + ipipe_test_and_stall_pipeline_from(ipipe_current_domain) + +#define ipipe_stall_pipeline() \ + ipipe_stall_pipeline_from(ipipe_current_domain) + +#define ipipe_restore_pipeline(x) \ + ipipe_restore_pipeline_from(ipipe_current_domain, (x)) + +void ipipe_init_attr(struct ipipe_domain_attr *attr); + +int ipipe_get_sysinfo(struct ipipe_sysinfo *sysinfo); + +unsigned long ipipe_critical_enter(void (*syncfn) (void)); + +void ipipe_critical_exit(unsigned long flags); + +static inline void ipipe_set_printk_sync(struct ipipe_domain *ipd) +{ + set_bit(IPIPE_SPRINTK_FLAG, &ipd->flags); +} + +static inline void ipipe_set_printk_async(struct ipipe_domain *ipd) +{ + clear_bit(IPIPE_SPRINTK_FLAG, &ipd->flags); +} + +static inline void ipipe_set_foreign_stack(struct ipipe_domain *ipd) +{ + /* Must be called hw interrupts off. */ + __set_bit(IPIPE_NOSTACK_FLAG, &ipipe_cpudom_var(ipd, status)); +} + +static inline void ipipe_clear_foreign_stack(struct ipipe_domain *ipd) +{ + /* Must be called hw interrupts off. */ + __clear_bit(IPIPE_NOSTACK_FLAG, &ipipe_cpudom_var(ipd, status)); +} + +static inline int ipipe_test_foreign_stack(void) +{ + /* Must be called hw interrupts off. */ + return test_bit(IPIPE_NOSTACK_FLAG, &ipipe_this_cpudom_var(status)); +} + +#ifndef ipipe_safe_current +#define ipipe_safe_current() \ +({ \ + struct task_struct *p; \ + unsigned long flags; \ + local_irq_save_hw_smp(flags); \ + p = ipipe_test_foreign_stack() ? 
&init_task : current; \ + local_irq_restore_hw_smp(flags); \ + p; \ +}) +#endif + +ipipe_event_handler_t ipipe_catch_event(struct ipipe_domain *ipd, + unsigned event, + ipipe_event_handler_t handler); + +cpumask_t ipipe_set_irq_affinity(unsigned irq, + cpumask_t cpumask); + +int ipipe_send_ipi(unsigned ipi, + cpumask_t cpumask); + +int ipipe_setscheduler_root(struct task_struct *p, + int policy, + int prio); + +int ipipe_reenter_root(struct task_struct *prev, + int policy, + int prio); + +int ipipe_alloc_ptdkey(void); + +int ipipe_free_ptdkey(int key); + +int ipipe_set_ptd(int key, + void *value); + +void *ipipe_get_ptd(int key); + +int ipipe_disable_ondemand_mappings(struct task_struct *tsk); + +static inline void ipipe_nmi_enter(void) +{ + int cpu = ipipe_processor_id(); + + per_cpu(ipipe_nmi_saved_root, cpu) = ipipe_root_cpudom_var(status); + __set_bit(IPIPE_STALL_FLAG, &ipipe_root_cpudom_var(status)); + +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT + per_cpu(ipipe_saved_context_check_state, cpu) = + ipipe_disable_context_check(cpu); +#endif /* CONFIG_IPIPE_DEBUG_CONTEXT */ +} + +static inline void ipipe_nmi_exit(void) +{ + int cpu = ipipe_processor_id(); + +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT + ipipe_restore_context_check + (cpu, per_cpu(ipipe_saved_context_check_state, cpu)); +#endif /* CONFIG_IPIPE_DEBUG_CONTEXT */ + + if (!test_bit(IPIPE_STALL_FLAG, &per_cpu(ipipe_nmi_saved_root, cpu))) + __clear_bit(IPIPE_STALL_FLAG, &ipipe_root_cpudom_var(status)); +} + +#else /* !CONFIG_IPIPE */ + +#define ipipe_init_early() do { } while(0) +#define ipipe_init() do { } while(0) +#define ipipe_suspend_domain() do { } while(0) +#define ipipe_sigwake_notify(p) do { } while(0) +#define ipipe_setsched_notify(p) do { } while(0) +#define ipipe_init_notify(p) do { } while(0) +#define ipipe_exit_notify(p) do { } while(0) +#define ipipe_cleanup_notify(mm) do { } while(0) +#define ipipe_trap_notify(t,r) 0 +#define ipipe_init_proc() do { } while(0) + +static inline void __ipipe_pin_range_globally(unsigned long start, + unsigned long end) +{ +} + +static inline int ipipe_test_foreign_stack(void) +{ + return 0; +} + +#define local_irq_enable_hw_cond() do { } while(0) +#define local_irq_disable_hw_cond() do { } while(0) +#define local_irq_save_hw_cond(flags) do { (void)(flags); } while(0) +#define local_irq_restore_hw_cond(flags) do { } while(0) +#define local_irq_save_hw_smp(flags) do { (void)(flags); } while(0) +#define local_irq_restore_hw_smp(flags) do { } while(0) + +#define ipipe_irq_lock(irq) do { } while(0) +#define ipipe_irq_unlock(irq) do { } while(0) + +#define __ipipe_root_domain_p 1 +#define ipipe_root_domain_p 1 +#define ipipe_safe_current current +#define ipipe_processor_id() smp_processor_id() + +#define ipipe_nmi_enter() do { } while (0) +#define ipipe_nmi_exit() do { } while (0) + +#define local_irq_disable_head() local_irq_disable() + +#define local_irq_save_full(vflags, rflags) do { (void)(vflags); local_irq_save(rflags); } while(0) +#define local_irq_restore_full(vflags, rflags) do { (void)(vflags); local_irq_restore(rflags); } while(0) +#define local_irq_restore_nosync(vflags) local_irq_restore(vflags) + +#define __ipipe_pipeline_head_p(ipd) 1 + +#endif /* CONFIG_IPIPE */ + +#endif /* !__LINUX_IPIPE_H */ diff --git a/include/linux/ipipe_base.h b/include/linux/ipipe_base.h new file mode 100644 index 0000000..9853df3 --- /dev/null +++ b/include/linux/ipipe_base.h @@ -0,0 +1,118 @@ +/* -*- linux-c -*- + * include/linux/ipipe_base.h + * + * Copyright (C) 2002-2007 Philippe Gerum. + * 2007 Jan Kiszka. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __LINUX_IPIPE_BASE_H +#define __LINUX_IPIPE_BASE_H + +#ifdef CONFIG_IPIPE + +#include + +#define __bpl_up(x) (((x)+(BITS_PER_LONG-1)) & ~(BITS_PER_LONG-1)) +/* Number of virtual IRQs (must be a multiple of BITS_PER_LONG) */ +#define IPIPE_NR_VIRQS BITS_PER_LONG +/* First virtual IRQ # (must be aligned on BITS_PER_LONG) */ +#define IPIPE_VIRQ_BASE __bpl_up(IPIPE_NR_XIRQS) +/* Total number of IRQ slots */ +#define IPIPE_NR_IRQS (IPIPE_VIRQ_BASE+IPIPE_NR_VIRQS) + +#define IPIPE_IRQ_LOMAPSZ (IPIPE_NR_IRQS / BITS_PER_LONG) +#if IPIPE_IRQ_LOMAPSZ > BITS_PER_LONG +/* + * We need a 3-level mapping. This allows us to handle up to 32k IRQ + * vectors on 32bit machines, 256k on 64bit ones. + */ +#define __IPIPE_3LEVEL_IRQMAP 1 +#define IPIPE_IRQ_MDMAPSZ (__bpl_up(IPIPE_IRQ_LOMAPSZ) / BITS_PER_LONG) +#else +/* + * 2-level mapping is enough. This allows us to handle up to 1024 IRQ + * vectors on 32bit machines, 4096 on 64bit ones. + */ +#define __IPIPE_2LEVEL_IRQMAP 1 +#endif + +#define IPIPE_IRQ_DOALL 0 +#define IPIPE_IRQ_DOVIRT 1 + +/* Per-cpu pipeline status */ +#define IPIPE_STALL_FLAG 0 /* Stalls a pipeline stage -- guaranteed at bit #0 */ +#define IPIPE_SYNC_FLAG 1 /* The interrupt syncer is running for the domain */ +#define IPIPE_NOSTACK_FLAG 2 /* Domain currently runs on a foreign stack */ + +#define IPIPE_STALL_MASK (1L << IPIPE_STALL_FLAG) +#define IPIPE_SYNC_MASK (1L << IPIPE_SYNC_FLAG) +#define IPIPE_NOSTACK_MASK (1L << IPIPE_NOSTACK_FLAG) + +typedef void (*ipipe_irq_handler_t)(unsigned int irq, + void *cookie); + +extern struct ipipe_domain ipipe_root; + +#define ipipe_root_domain (&ipipe_root) + +void __ipipe_unstall_root(void); + +void __ipipe_restore_root(unsigned long x); + +#define ipipe_preempt_disable(flags) \ + do { \ + local_irq_save_hw(flags); \ + if (__ipipe_root_domain_p) \ + preempt_disable(); \ + } while (0) + +#define ipipe_preempt_enable(flags) \ + do { \ + if (__ipipe_root_domain_p) { \ + preempt_enable_no_resched(); \ + local_irq_restore_hw(flags); \ + preempt_check_resched(); \ + } else \ + local_irq_restore_hw(flags); \ + } while (0) + +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT +void ipipe_check_context(struct ipipe_domain *border_ipd); +#else /* !CONFIG_IPIPE_DEBUG_CONTEXT */ +static inline void ipipe_check_context(struct ipipe_domain *border_ipd) { } +#endif /* !CONFIG_IPIPE_DEBUG_CONTEXT */ + +/* Generic features */ + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +#define __IPIPE_FEATURE_REQUEST_TICKDEV 1 +#endif +#define __IPIPE_FEATURE_DELAYED_ATOMICSW 1 +#define __IPIPE_FEATURE_FASTPEND_IRQ 1 +#define __IPIPE_FEATURE_TRACE_EVENT 1 + +#else /* !CONFIG_IPIPE */ +#define ipipe_preempt_disable(flags) do { \ + preempt_disable(); \ + (void)(flags); \ + } while (0) +#define ipipe_preempt_enable(flags) 
preempt_enable() +#define ipipe_check_context(ipd) do { } while(0) +#endif /* CONFIG_IPIPE */ + +#endif /* !__LINUX_IPIPE_BASE_H */ diff --git a/include/linux/ipipe_compat.h b/include/linux/ipipe_compat.h new file mode 100644 index 0000000..50a245c --- /dev/null +++ b/include/linux/ipipe_compat.h @@ -0,0 +1,54 @@ +/* -*- linux-c -*- + * include/linux/ipipe_compat.h + * + * Copyright (C) 2007 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __LINUX_IPIPE_COMPAT_H +#define __LINUX_IPIPE_COMPAT_H + +#ifdef CONFIG_IPIPE_COMPAT +/* + * OBSOLETE: defined only for backward compatibility. Will be removed + * in future releases, please update client code accordingly. + */ + +#ifdef CONFIG_SMP +#define ipipe_declare_cpuid int cpuid +#define ipipe_load_cpuid() do { \ + cpuid = ipipe_processor_id(); \ + } while(0) +#define ipipe_lock_cpu(flags) do { \ + local_irq_save_hw(flags); \ + cpuid = ipipe_processor_id(); \ + } while(0) +#define ipipe_unlock_cpu(flags) local_irq_restore_hw(flags) +#define ipipe_get_cpu(flags) ipipe_lock_cpu(flags) +#define ipipe_put_cpu(flags) ipipe_unlock_cpu(flags) +#else /* !CONFIG_SMP */ +#define ipipe_declare_cpuid const int cpuid = 0 +#define ipipe_load_cpuid() do { } while(0) +#define ipipe_lock_cpu(flags) local_irq_save_hw(flags) +#define ipipe_unlock_cpu(flags) local_irq_restore_hw(flags) +#define ipipe_get_cpu(flags) do { (void)(flags); } while(0) +#define ipipe_put_cpu(flags) do { } while(0) +#endif /* CONFIG_SMP */ + +#endif /* CONFIG_IPIPE_COMPAT */ + +#endif /* !__LINUX_IPIPE_COMPAT_H */ diff --git a/include/linux/ipipe_lock.h b/include/linux/ipipe_lock.h new file mode 100644 index 0000000..b751d54 --- /dev/null +++ b/include/linux/ipipe_lock.h @@ -0,0 +1,144 @@ +/* -*- linux-c -*- + * include/linux/ipipe_lock.h + * + * Copyright (C) 2009 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#ifndef __LINUX_IPIPE_LOCK_H +#define __LINUX_IPIPE_LOCK_H + +typedef struct { + raw_spinlock_t bare_lock; +} __ipipe_spinlock_t; + +#define ipipe_lock_p(lock) \ + __builtin_types_compatible_p(typeof(lock), __ipipe_spinlock_t *) + +#define common_lock_p(lock) \ + __builtin_types_compatible_p(typeof(lock), spinlock_t *) + +#define bare_lock(lock) (&((__ipipe_spinlock_t *)(lock))->bare_lock) +#define std_lock(lock) ((spinlock_t *)(lock)) + +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + +extern int __bad_spinlock_type(void); +#define PICK_SPINLOCK_IRQSAVE(lock, flags) \ + do { \ + if (ipipe_lock_p(lock)) \ + (flags) = __ipipe_spin_lock_irqsave(bare_lock(lock)); \ + else if (common_lock_p(lock)) \ + (flags) = _spin_lock_irqsave(std_lock(lock)); \ + else __bad_spinlock_type(); \ + } while (0) + +#else /* !(CONFIG_SMP || CONFIG_DEBUG_SPINLOCK) */ + +#define PICK_SPINLOCK_IRQSAVE(lock, flags) \ + do { \ + if (ipipe_lock_p(lock)) \ + (flags) = __ipipe_spin_lock_irqsave(bare_lock(lock)); \ + else if (common_lock_p(lock)) \ + _spin_lock_irqsave(std_lock(lock), flags); \ + } while (0) + +#endif /* !(CONFIG_SMP || CONFIG_DEBUG_SPINLOCK) */ + +#define PICK_SPINUNLOCK_IRQRESTORE(lock, flags) \ + do { \ + if (ipipe_lock_p(lock)) \ + __ipipe_spin_unlock_irqrestore(bare_lock(lock), flags); \ + else if (common_lock_p(lock)) \ + _spin_unlock_irqrestore(std_lock(lock), flags); \ + } while (0) + +#define PICK_SPINOP(op, lock) \ + do { \ + if (ipipe_lock_p(lock)) \ + __raw_spin##op(bare_lock(lock)); \ + else if (common_lock_p(lock)) \ + _spin##op(std_lock(lock)); \ + } while (0) + +#define PICK_SPINOP_IRQ(op, lock) \ + do { \ + if (ipipe_lock_p(lock)) \ + __ipipe_spin##op##_irq(bare_lock(lock)); \ + else if (common_lock_p(lock)) \ + _spin##op##_irq(std_lock(lock)); \ + } while (0) + +#define __raw_spin_lock_init(lock) \ + do { \ + IPIPE_DEFINE_SPINLOCK(__lock__); \ + *((ipipe_spinlock_t *)lock) = __lock__; \ + } while (0) + +#ifdef CONFIG_IPIPE + +#define ipipe_spinlock_t __ipipe_spinlock_t +#define IPIPE_DEFINE_SPINLOCK(x) ipipe_spinlock_t x = IPIPE_SPIN_LOCK_UNLOCKED +#define IPIPE_DECLARE_SPINLOCK(x) extern ipipe_spinlock_t x +#define IPIPE_SPIN_LOCK_UNLOCKED \ + (__ipipe_spinlock_t) { .bare_lock = __RAW_SPIN_LOCK_UNLOCKED } + +#define spin_lock_irqsave_cond(lock, flags) \ + spin_lock_irqsave(lock, flags) + +#define spin_unlock_irqrestore_cond(lock, flags) \ + spin_unlock_irqrestore(lock, flags) + +void __ipipe_spin_lock_irq(raw_spinlock_t *lock); + +void __ipipe_spin_unlock_irq(raw_spinlock_t *lock); + +unsigned long __ipipe_spin_lock_irqsave(raw_spinlock_t *lock); + +void __ipipe_spin_unlock_irqrestore(raw_spinlock_t *lock, + unsigned long x); + +void __ipipe_spin_unlock_irqbegin(ipipe_spinlock_t *lock); + +void __ipipe_spin_unlock_irqcomplete(unsigned long x); + +#else /* !CONFIG_IPIPE */ + +#define ipipe_spinlock_t spinlock_t +#define IPIPE_DEFINE_SPINLOCK(x) DEFINE_SPINLOCK(x) +#define IPIPE_DECLARE_SPINLOCK(x) extern spinlock_t x +#define IPIPE_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED + +#define spin_lock_irqsave_cond(lock, flags) \ + do { \ + (void)(flags); \ + spin_lock(lock); \ + } while(0) + +#define spin_unlock_irqrestore_cond(lock, flags) \ + spin_unlock(lock) + +#define __ipipe_spin_lock_irq(lock) do { } while (0) +#define __ipipe_spin_unlock_irq(lock) do { } while (0) +#define __ipipe_spin_lock_irqsave(lock) 0 +#define __ipipe_spin_unlock_irqrestore(lock, x) do { (void)(x); } while (0) +#define __ipipe_spin_unlock_irqbegin(lock) do { } while (0) +#define 
__ipipe_spin_unlock_irqcomplete(x) do { (void)(x); } while (0) + +#endif /* !CONFIG_IPIPE */ + +#endif /* !__LINUX_IPIPE_LOCK_H */ diff --git a/include/linux/ipipe_percpu.h b/include/linux/ipipe_percpu.h new file mode 100644 index 0000000..f6727e3 --- /dev/null +++ b/include/linux/ipipe_percpu.h @@ -0,0 +1,89 @@ +/* -*- linux-c -*- + * include/linux/ipipe_percpu.h + * + * Copyright (C) 2007 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __LINUX_IPIPE_PERCPU_H +#define __LINUX_IPIPE_PERCPU_H + +#include +#include + +struct ipipe_domain; + +struct ipipe_percpu_domain_data { + unsigned long status; /* <= Must be first in struct. */ + unsigned long irqpend_himap; +#ifdef __IPIPE_3LEVEL_IRQMAP + unsigned long irqpend_mdmap[IPIPE_IRQ_MDMAPSZ]; +#endif + unsigned long irqpend_lomap[IPIPE_IRQ_LOMAPSZ]; + unsigned long irqheld_map[IPIPE_IRQ_LOMAPSZ]; + unsigned long irqall[IPIPE_NR_IRQS]; + u64 evsync; +}; + +/* + * CAREFUL: all accessors based on __raw_get_cpu_var() you may find in + * this file should be used only while hw interrupts are off, to + * prevent from CPU migration regardless of the running domain. 
+ */ +#ifdef CONFIG_SMP +#define ipipe_percpudom_ptr(ipd, cpu) \ + (&per_cpu(ipipe_percpu_darray, cpu)[(ipd)->slot]) +#define ipipe_cpudom_ptr(ipd) \ + (&__ipipe_get_cpu_var(ipipe_percpu_darray)[(ipd)->slot]) +#else +DECLARE_PER_CPU(struct ipipe_percpu_domain_data *, ipipe_percpu_daddr[CONFIG_IPIPE_DOMAINS]); +#define ipipe_percpudom_ptr(ipd, cpu) \ + (per_cpu(ipipe_percpu_daddr, cpu)[(ipd)->slot]) +#define ipipe_cpudom_ptr(ipd) \ + (__ipipe_get_cpu_var(ipipe_percpu_daddr)[(ipd)->slot]) +#endif +#define ipipe_percpudom(ipd, var, cpu) (ipipe_percpudom_ptr(ipd, cpu)->var) +#define ipipe_cpudom_var(ipd, var) (ipipe_cpudom_ptr(ipd)->var) + +#define IPIPE_ROOT_SLOT 0 +#define IPIPE_HEAD_SLOT (CONFIG_IPIPE_DOMAINS - 1) + +DECLARE_PER_CPU(struct ipipe_percpu_domain_data, ipipe_percpu_darray[CONFIG_IPIPE_DOMAINS]); + +DECLARE_PER_CPU(struct ipipe_domain *, ipipe_percpu_domain); + +DECLARE_PER_CPU(unsigned long, ipipe_nmi_saved_root); + +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT +DECLARE_PER_CPU(int, ipipe_percpu_context_check); +DECLARE_PER_CPU(int, ipipe_saved_context_check_state); +#endif + +#define ipipe_root_cpudom_ptr(var) \ + (&__ipipe_get_cpu_var(ipipe_percpu_darray)[IPIPE_ROOT_SLOT]) + +#define ipipe_root_cpudom_var(var) ipipe_root_cpudom_ptr()->var + +#define ipipe_this_cpudom_var(var) \ + ipipe_cpudom_var(__ipipe_current_domain, var) + +#define ipipe_head_cpudom_ptr() \ + (&__ipipe_get_cpu_var(ipipe_percpu_darray)[IPIPE_HEAD_SLOT]) + +#define ipipe_head_cpudom_var(var) ipipe_head_cpudom_ptr()->var + +#endif /* !__LINUX_IPIPE_PERCPU_H */ diff --git a/include/linux/ipipe_tickdev.h b/include/linux/ipipe_tickdev.h new file mode 100644 index 0000000..4a1cb1b --- /dev/null +++ b/include/linux/ipipe_tickdev.h @@ -0,0 +1,58 @@ +/* -*- linux-c -*- + * include/linux/ipipe_tickdev.h + * + * Copyright (C) 2007 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#ifndef __LINUX_IPIPE_TICKDEV_H +#define __LINUX_IPIPE_TICKDEV_H + +#if defined(CONFIG_IPIPE) && defined(CONFIG_GENERIC_CLOCKEVENTS) + +#include + +struct tick_device; + +struct ipipe_tick_device { + + void (*emul_set_mode)(enum clock_event_mode, + struct clock_event_device *cdev); + int (*emul_set_tick)(unsigned long delta, + struct clock_event_device *cdev); + void (*real_set_mode)(enum clock_event_mode mode, + struct clock_event_device *cdev); + int (*real_set_tick)(unsigned long delta, + struct clock_event_device *cdev); + struct tick_device *slave; + unsigned long real_max_delta_ns; + unsigned long real_mult; + int real_shift; +}; + +int ipipe_request_tickdev(const char *devname, + void (*emumode)(enum clock_event_mode mode, + struct clock_event_device *cdev), + int (*emutick)(unsigned long evt, + struct clock_event_device *cdev), + int cpu, unsigned long *tmfreq); + +void ipipe_release_tickdev(int cpu); + +#endif /* CONFIG_IPIPE && CONFIG_GENERIC_CLOCKEVENTS */ + +#endif /* !__LINUX_IPIPE_TICKDEV_H */ diff --git a/include/linux/ipipe_trace.h b/include/linux/ipipe_trace.h new file mode 100644 index 0000000..627b354 --- /dev/null +++ b/include/linux/ipipe_trace.h @@ -0,0 +1,72 @@ +/* -*- linux-c -*- + * include/linux/ipipe_trace.h + * + * Copyright (C) 2005 Luotao Fu. + * 2005-2007 Jan Kiszka. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#ifndef _LINUX_IPIPE_TRACE_H +#define _LINUX_IPIPE_TRACE_H + +#ifdef CONFIG_IPIPE_TRACE + +#include + +void ipipe_trace_begin(unsigned long v); +void ipipe_trace_end(unsigned long v); +void ipipe_trace_freeze(unsigned long v); +void ipipe_trace_special(unsigned char special_id, unsigned long v); +void ipipe_trace_pid(pid_t pid, short prio); +void ipipe_trace_event(unsigned char id, unsigned long delay_tsc); +int ipipe_trace_max_reset(void); +int ipipe_trace_frozen_reset(void); + +#else /* !CONFIG_IPIPE_TRACE */ + +#define ipipe_trace_begin(v) do { (void)(v); } while(0) +#define ipipe_trace_end(v) do { (void)(v); } while(0) +#define ipipe_trace_freeze(v) do { (void)(v); } while(0) +#define ipipe_trace_special(id, v) do { (void)(id); (void)(v); } while(0) +#define ipipe_trace_pid(pid, prio) do { (void)(pid); (void)(prio); } while(0) +#define ipipe_trace_event(id, delay_tsc) do { (void)(id); (void)(delay_tsc); } while(0) +#define ipipe_trace_max_reset() do { } while(0) +#define ipipe_trace_frozen_reset() do { } while(0) + +#endif /* !CONFIG_IPIPE_TRACE */ + +#ifdef CONFIG_IPIPE_TRACE_PANIC +void ipipe_trace_panic_freeze(void); +void ipipe_trace_panic_dump(void); +#else +static inline void ipipe_trace_panic_freeze(void) { } +static inline void ipipe_trace_panic_dump(void) { } +#endif + +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF +#define ipipe_trace_irq_entry(irq) ipipe_trace_begin(irq) +#define ipipe_trace_irq_exit(irq) ipipe_trace_end(irq) +#define ipipe_trace_irqsoff() ipipe_trace_begin(0x80000000UL) +#define ipipe_trace_irqson() ipipe_trace_end(0x80000000UL) +#else +#define ipipe_trace_irq_entry(irq) do { (void)(irq);} while(0) +#define ipipe_trace_irq_exit(irq) do { (void)(irq);} while(0) +#define ipipe_trace_irqsoff() do { } while(0) +#define ipipe_trace_irqson() do { } while(0) +#endif + +#endif /* !__LINUX_IPIPE_TRACE_H */ diff --git a/include/linux/irq.h b/include/linux/irq.h index 9e5f45a..85642bc 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -124,6 +124,9 @@ struct irq_chip { void (*end)(unsigned int irq); int (*set_affinity)(unsigned int irq, const struct cpumask *dest); +#ifdef CONFIG_IPIPE + void (*move)(unsigned int irq); +#endif /* CONFIG_IPIPE */ int (*retrigger)(unsigned int irq); int (*set_type)(unsigned int irq, unsigned int flow_type); int (*set_wake)(unsigned int irq, unsigned int on); @@ -173,6 +176,12 @@ struct irq_2_iommu; * @name: flow handler name for /proc/interrupts output */ struct irq_desc { +#ifdef CONFIG_IPIPE + void (*ipipe_ack)(unsigned int irq, + struct irq_desc *desc); + void (*ipipe_end)(unsigned int irq, + struct irq_desc *desc); +#endif /* CONFIG_IPIPE */ unsigned int irq; struct timer_rand_state *timer_rand_state; unsigned int *kstat_irqs; @@ -346,6 +355,10 @@ extern void set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name); +extern irq_flow_handler_t +__fixup_irq_handler(struct irq_desc *desc, irq_flow_handler_t handle, + int is_chained); + extern void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name); @@ -357,6 +370,7 @@ static inline void __set_irq_handler_unlocked(int irq, struct irq_desc *desc; desc = irq_to_desc(irq); + handler = __fixup_irq_handler(desc, handler, 0); desc->handle_irq = handler; } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f4e3184..3b80b7b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include
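[Editor's note - illustrative aside, not part of the patch.] The ipipe_trace.h interface added above is easiest to read with a short usage sketch. The snippet below is hypothetical (function and cookie names invented); it shows how a client would bracket a latency-sensitive path with the tracer entry points. Since the !CONFIG_IPIPE_TRACE stubs expand to empty statements, the instrumentation can stay in place unconditionally.

/* Illustrative sketch only -- not part of the patch. */
#include <linux/ipipe_trace.h>

#define DEMO_TRACE_MARKER 0x1234UL	/* arbitrary cookie echoed in the trace log */

static void demo_timed_section(void)
{
	ipipe_trace_begin(DEMO_TRACE_MARKER);	/* open a trace path */

	/* ... latency-sensitive work to be measured ... */

	ipipe_trace_end(DEMO_TRACE_MARKER);	/* close it again */
}

static void demo_error_path(void)
{
	/* Capture the current trace for later inspection through the
	 * /proc/ipipe/trace interface mentioned in the Kconfig help. */
	ipipe_trace_freeze(DEMO_TRACE_MARKER);
}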
@@ -119,9 +120,12 @@ struct user; #ifdef CONFIG_PREEMPT_VOLUNTARY extern int _cond_resched(void); -# define might_resched() _cond_resched() +# define might_resched() do { \ + ipipe_check_context(ipipe_root_domain); \ + _cond_resched(); \ + } while (0) #else -# define might_resched() do { } while (0) +# define might_resched() ipipe_check_context(ipipe_root_domain) #endif #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 72b1a10..80553be 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -9,13 +9,20 @@ #include #include #include +#include #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else -# define add_preempt_count(val) do { preempt_count() += (val); } while (0) -# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +# define add_preempt_count(val) do { \ + ipipe_check_context(ipipe_root_domain); \ + preempt_count() += (val); \ + } while (0) +# define sub_preempt_count(val) do { \ + ipipe_check_context(ipipe_root_domain); \ + preempt_count() -= (val); \ + } while (0) #endif #define inc_preempt_count() add_preempt_count(1) diff --git a/include/linux/sched.h b/include/linux/sched.h index 70abfd3..efecc7a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -61,6 +61,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -195,6 +196,13 @@ extern unsigned long long time_sync_thresh; #define TASK_DEAD 64 #define TASK_WAKEKILL 128 #define TASK_WAKING 256 +#ifdef CONFIG_IPIPE +#define TASK_ATOMICSWITCH 512 +#define TASK_NOWAKEUP 1024 +#else /* !CONFIG_IPIPE */ +#define TASK_ATOMICSWITCH 0 +#define TASK_NOWAKEUP 0 +#endif /* CONFIG_IPIPE */ /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) @@ -302,6 +310,15 @@ extern void trap_init(void); extern void update_process_times(int user); extern void scheduler_tick(void); +#ifdef CONFIG_IPIPE +void update_root_process_times(struct pt_regs *regs); +#else /* !CONFIG_IPIPE */ +static inline void update_root_process_times(struct pt_regs *regs) +{ + update_process_times(user_mode(regs)); +} +#endif /* CONFIG_IPIPE */ + extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -349,8 +366,8 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); -asmlinkage void __schedule(void); -asmlinkage void schedule(void); +asmlinkage int __schedule(void); +asmlinkage int schedule(void); extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; @@ -475,6 +492,9 @@ extern int get_dumpable(struct mm_struct *mm); #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#ifdef CONFIG_IPIPE +#define MMF_VM_PINNED 31 /* ondemand load up and COW disabled */ +#endif #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) @@ -1496,6 +1516,9 @@ struct task_struct { #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; +#ifdef CONFIG_IPIPE + void *ptd[IPIPE_ROOT_NPTDKEYS]; +#endif /* * cache last used pipe for splice @@ -1736,6 +1759,11 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_EXITING 
0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#ifdef CONFIG_IPIPE +#define PF_EVNOTIFY 0x00000020 /* Notify other domains about internal events */ +#else +#define PF_EVNOTIFY 0 +#endif /* CONFIG_IPIPE */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index f0ca7a7..3096642 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -90,10 +90,12 @@ extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); # include #endif +#include + #ifdef CONFIG_DEBUG_SPINLOCK extern void __spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key); -# define spin_lock_init(lock) \ +# define _spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ @@ -101,10 +103,12 @@ do { \ } while (0) #else -# define spin_lock_init(lock) \ +# define _spin_lock_init(lock) \ do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) #endif +# define spin_lock_init(lock) PICK_SPINOP(_lock_init, lock) + #ifdef CONFIG_DEBUG_SPINLOCK extern void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key); @@ -186,7 +190,7 @@ static inline void smp_mb__after_lock(void) { smp_mb(); } #define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) #define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) -#define spin_lock(lock) _spin_lock(lock) +#define spin_lock(lock) PICK_SPINOP(_lock, lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) @@ -208,7 +212,7 @@ static inline void smp_mb__after_lock(void) { smp_mb(); } #define spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ - flags = _spin_lock_irqsave(lock); \ + PICK_SPINLOCK_IRQSAVE(lock, flags); \ } while (0) #define read_lock_irqsave(lock, flags) \ do { \ @@ -240,7 +244,7 @@ static inline void smp_mb__after_lock(void) { smp_mb(); } #define spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ - _spin_lock_irqsave(lock, flags); \ + PICK_SPINLOCK_IRQSAVE(lock, flags); \ } while (0) #define read_lock_irqsave(lock, flags) \ do { \ @@ -257,23 +261,23 @@ static inline void smp_mb__after_lock(void) { smp_mb(); } #endif -#define spin_lock_irq(lock) _spin_lock_irq(lock) +#define spin_lock_irq(lock) PICK_SPINOP_IRQ(_lock, lock) #define spin_lock_bh(lock) _spin_lock_bh(lock) #define read_lock_irq(lock) _read_lock_irq(lock) #define read_lock_bh(lock) _read_lock_bh(lock) #define write_lock_irq(lock) _write_lock_irq(lock) #define write_lock_bh(lock) _write_lock_bh(lock) -#define spin_unlock(lock) _spin_unlock(lock) +#define spin_unlock(lock) PICK_SPINOP(_unlock, lock) #define read_unlock(lock) _read_unlock(lock) #define write_unlock(lock) _write_unlock(lock) -#define spin_unlock_irq(lock) _spin_unlock_irq(lock) +#define spin_unlock_irq(lock) PICK_SPINOP_IRQ(_unlock, lock) #define read_unlock_irq(lock) _read_unlock_irq(lock) #define write_unlock_irq(lock) _write_unlock_irq(lock) #define spin_unlock_irqrestore(lock, flags) \ do { \ typecheck(unsigned long, flags); \ - _spin_unlock_irqrestore(lock, flags); \ + PICK_SPINUNLOCK_IRQRESTORE(lock, flags); \ } while (0) #define spin_unlock_bh(lock) _spin_unlock_bh(lock) diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 
7a7e18f..190bc0a 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -229,7 +229,9 @@ static inline int __write_trylock(rwlock_t *lock) * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ -#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +#if !defined(CONFIG_GENERIC_LOCKBREAK) || \ + defined(CONFIG_DEBUG_LOCK_ALLOC) || \ + defined(CONFIG_IPIPE) static inline void __read_lock(rwlock_t *lock) { @@ -250,7 +252,7 @@ static inline unsigned long __spin_lock_irqsave(spinlock_t *lock) * _raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_IPIPE) LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); #else _raw_spin_lock_flags(lock, &flags); diff --git a/init/Kconfig b/init/Kconfig index eb4b337..a73e078 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -78,6 +78,7 @@ config INIT_ENV_ARG_LIMIT config LOCALVERSION string "Local version - append to kernel release" + default "-ipipe" help Append an extra string to the end of your kernel version. This will show up when you type uname, for example. diff --git a/init/main.c b/init/main.c index bc109c7..4672e7d 100644 --- a/init/main.c +++ b/init/main.c @@ -530,7 +530,7 @@ asmlinkage void __init start_kernel(void) cgroup_init_early(); - local_irq_disable(); + local_irq_disable_hw(); early_boot_irqs_off(); early_init_irq_lock_class(); @@ -565,6 +565,7 @@ asmlinkage void __init start_kernel(void) pidhash_init(); vfs_caches_init_early(); sort_main_extable(); + ipipe_init_early(); trap_init(); mm_init(); /* @@ -593,6 +594,11 @@ asmlinkage void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); + /* + * We need to wait for the interrupt and time subsystems to be + * initialized before enabling the pipeline. + */ + ipipe_init(); profile_init(); if (!irqs_disabled()) printk(KERN_CRIT "start_kernel(): bug: interrupts were " @@ -774,6 +780,7 @@ static void __init do_basic_setup(void) init_tmpfs(); driver_init(); init_irq_proc(); + ipipe_init_proc(); do_ctors(); do_initcalls(); } diff --git a/kernel/Makefile b/kernel/Makefile index d7c13d2..b6a84ee 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_IPIPE) += ipipe/ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o diff --git a/kernel/exit.c b/kernel/exit.c index f7864ac..f5c3129 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -963,6 +963,7 @@ NORET_TYPE void do_exit(long code) acct_process(); trace_sched_process_exit(tsk); + ipipe_exit_notify(tsk); exit_sem(tsk); exit_files(tsk); exit_fs(tsk); @@ -1766,3 +1767,37 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int } #endif + +void rt_daemonize(void) +{ + sigset_t blocked; + + /* + * We don't want to have TIF_FREEZE set if the system-wide hibernation + * or suspend transition begins right now. 
+ */ + current->flags |= (PF_NOFREEZE | PF_KTHREAD); + + if (current->nsproxy != &init_nsproxy) { + get_nsproxy(&init_nsproxy); + switch_task_namespaces(current, &init_nsproxy); + } + set_special_pids(&init_struct_pid); + proc_clear_tty(current); + + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + /* Become as one with the init task */ + + daemonize_fs_struct(); + exit_files(current); + current->files = init_task.files; + atomic_inc(&current->files->count); + + reparent_to_kthreadd(); +} + +EXPORT_SYMBOL(rt_daemonize); diff --git a/kernel/fork.c b/kernel/fork.c index 166b8c4..dff0f55 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -511,6 +511,7 @@ void mmput(struct mm_struct *mm) exit_aio(mm); ksm_exit(mm); exit_mmap(mm); + ipipe_cleanup_notify(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); @@ -918,7 +919,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_EVNOTIFY); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; @@ -1303,6 +1304,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); +#ifdef CONFIG_IPIPE + memset(p->ptd, 0, sizeof(p->ptd)); +#endif /* CONFIG_IPIPE */ perf_event_fork(p); return p; @@ -1700,11 +1704,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } if (new_mm) { + unsigned long flags; mm = current->mm; active_mm = current->active_mm; current->mm = new_mm; + ipipe_mm_switch_protect(flags); current->active_mm = new_mm; activate_mm(active_mm, new_mm); + ipipe_mm_switch_unprotect(flags); new_mm = mm; } diff --git a/kernel/ipipe/Kconfig b/kernel/ipipe/Kconfig new file mode 100644 index 0000000..de5e6a3 --- /dev/null +++ b/kernel/ipipe/Kconfig @@ -0,0 +1,35 @@ +config IPIPE + bool "Interrupt pipeline" + default y + ---help--- + Activate this option if you want the interrupt pipeline to be + compiled in. + +config IPIPE_DOMAINS + int "Max domains" + depends on IPIPE + default 4 + ---help--- + The maximum number of I-pipe domains to run concurrently. + +config IPIPE_COMPAT + bool "Maintain code compatibility with older releases" + depends on IPIPE + default y + ---help--- + Activate this option if you want the compatibility code to be + defined, so that older I-pipe clients may use obsolete + constructs. WARNING: obsolete code will be eventually + deprecated in future I-pipe releases, and removed from the + compatibility support as time passes. Please fix I-pipe + clients to get rid of such uses as soon as possible. + +config IPIPE_DELAYED_ATOMICSW + bool + depends on IPIPE + default n + +config IPIPE_UNMASKED_CONTEXT_SWITCH + bool + depends on IPIPE + default n diff --git a/kernel/ipipe/Kconfig.debug b/kernel/ipipe/Kconfig.debug new file mode 100644 index 0000000..629c894 --- /dev/null +++ b/kernel/ipipe/Kconfig.debug @@ -0,0 +1,97 @@ +config IPIPE_DEBUG + bool "I-pipe debugging" + depends on IPIPE + +config IPIPE_DEBUG_CONTEXT + bool "Check for illicit cross-domain calls" + depends on IPIPE_DEBUG + default y + ---help--- + Enable this feature to arm checkpoints in the kernel that + verify the correct invocation context. On entry of critical + Linux services a warning is issued if the caller is not + running over the root domain.
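[Editor's note - illustrative aside, not part of the patch.] A hypothetical sketch of the kind of mistake IPIPE_DEBUG_CONTEXT is meant to catch: with the option enabled, the ipipe_check_context() calls wired into the preempt-count and might_resched() paths (see the preempt.h and kernel.h hunks earlier) complain when such root-only Linux services are entered from a non-root domain. All names below are invented.

/* Illustrative sketch only -- not part of the patch. */
static void demo_head_domain_handler(unsigned irq, void *cookie)
{
	/*
	 * This handler runs over a non-root (e.g. real-time head)
	 * domain, where calling regular Linux services is illicit.
	 */
	preempt_disable();	/* add_preempt_count() -> ipipe_check_context() warns here */

	/* ... work that wrongly assumes a root-domain context ... */

	preempt_enable();
}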
+ +config IPIPE_DEBUG_INTERNAL + bool "Enable internal debug checks" + depends on IPIPE_DEBUG + default y + ---help--- + When this feature is enabled, I-pipe will perform internal + consistency checks of its subsystems, e.g. on per-cpu variable + access. + +config IPIPE_TRACE + bool "Latency tracing" + depends on IPIPE_DEBUG + select FRAME_POINTER + select KALLSYMS + select PROC_FS + ---help--- + Activate this option if you want to use per-function tracing of + the kernel. The tracer will collect data via instrumentation + features like the one below or with the help of explicit calls + of ipipe_trace_xxx(). See include/linux/ipipe_trace.h for the + in-kernel tracing API. The collected data and runtime control + is available via /proc/ipipe/trace/*. + +if IPIPE_TRACE + +config IPIPE_TRACE_ENABLE + bool "Enable tracing on boot" + default y + ---help--- + Disable this option if you want to arm the tracer after booting + manually ("echo 1 > /proc/ipipe/tracer/enable"). This can reduce + boot time on slow embedded devices due to the tracer overhead. + +config IPIPE_TRACE_MCOUNT + bool "Instrument function entries" + default y + select FUNCTION_TRACER + select TRACING + select CONTEXT_SWITCH_TRACER + select FTRACE_MCOUNT_RECORD + select DYNAMIC_FTRACE + ---help--- + When enabled, records every kernel function entry in the tracer + log. While this slows down the system noticeably, it provides + the highest level of information about the flow of events. + However, it can be switched off in order to record only explicit + I-pipe trace points. + +config IPIPE_TRACE_IRQSOFF + bool "Trace IRQs-off times" + default y + ---help--- + Activate this option if I-pipe shall trace the longest path + with hard-IRQs switched off. + +config IPIPE_TRACE_SHIFT + int "Depth of trace log (14 => 16Kpoints, 15 => 32Kpoints)" + range 10 18 + default 14 + ---help--- + The number of trace points to hold tracing data for each + trace path, as a power of 2. + +config IPIPE_TRACE_VMALLOC + bool "Use vmalloc'ed trace buffer" + default y if EMBEDDED + ---help--- + Instead of reserving static kernel data, the required buffer + is allocated via vmalloc during boot-up when this option is + enabled. This can help to start systems that are low on memory, + but it slightly degrades overall performance. Try this option + when a traced kernel hangs unexpectedly at boot time. + +config IPIPE_TRACE_PANIC + bool "Enable panic back traces" + default y + ---help--- + Provides services to freeze and dump a back trace on panic + situations. This is used on IPIPE_DEBUG_CONTEXT exceptions + as well as ordinary kernel oopses. You can control the number + of printed back trace points via /proc/ipipe/trace. + +endif diff --git a/kernel/ipipe/Makefile b/kernel/ipipe/Makefile new file mode 100644 index 0000000..6257dfa --- /dev/null +++ b/kernel/ipipe/Makefile @@ -0,0 +1,3 @@ + +obj-$(CONFIG_IPIPE) += core.o +obj-$(CONFIG_IPIPE_TRACE) += tracer.o diff --git a/kernel/ipipe/core.c b/kernel/ipipe/core.c new file mode 100644 index 0000000..63deaf9 --- /dev/null +++ b/kernel/ipipe/core.c @@ -0,0 +1,1955 @@ +/* -*- linux-c -*- + * linux/kernel/ipipe/core.c + * + * Copyright (C) 2002-2005 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Architecture-independent I-PIPE core support. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_PROC_FS +#include +#include +#endif /* CONFIG_PROC_FS */ +#include +#include +#include + +static int __ipipe_ptd_key_count; + +static unsigned long __ipipe_ptd_key_map; + +static unsigned long __ipipe_domain_slot_map; + +struct ipipe_domain ipipe_root; + +#ifndef CONFIG_SMP +/* + * Create an alias to the unique root status, so that arch-dep code + * may get simple and easy access to this percpu variable. We also + * create an array of pointers to the percpu domain data; this tends + * to produce a better code when reaching non-root domains. We make + * sure that the early boot code would be able to dereference the + * pointer to the root domain data safely by statically initializing + * its value (local_irq*() routines depend on this). + */ +#if __GNUC__ >= 4 +extern unsigned long __ipipe_root_status +__attribute__((alias(__stringify(__raw_get_cpu_var(ipipe_percpu_darray))))); +EXPORT_SYMBOL(__ipipe_root_status); +#else /* __GNUC__ < 4 */ +/* + * Work around a GCC 3.x issue making alias symbols unusable as + * constant initializers. + */ +unsigned long *const __ipipe_root_status_addr = + &__raw_get_cpu_var(ipipe_percpu_darray)[IPIPE_ROOT_SLOT].status; +EXPORT_SYMBOL(__ipipe_root_status_addr); +#endif /* __GNUC__ < 4 */ + +DEFINE_PER_CPU(struct ipipe_percpu_domain_data *, ipipe_percpu_daddr[CONFIG_IPIPE_DOMAINS]) = +{ [IPIPE_ROOT_SLOT] = (struct ipipe_percpu_domain_data *)&__raw_get_cpu_var(ipipe_percpu_darray) }; +EXPORT_PER_CPU_SYMBOL(ipipe_percpu_daddr); +#endif /* !CONFIG_SMP */ + +DEFINE_PER_CPU(struct ipipe_percpu_domain_data, ipipe_percpu_darray[CONFIG_IPIPE_DOMAINS]) = +{ [IPIPE_ROOT_SLOT] = { .status = IPIPE_STALL_MASK } }; /* Root domain stalled on each CPU at startup. 
*/ + +DEFINE_PER_CPU(struct ipipe_domain *, ipipe_percpu_domain) = { &ipipe_root }; + +DEFINE_PER_CPU(unsigned long, ipipe_nmi_saved_root); /* Copy of root status during NMI */ + +static IPIPE_DEFINE_SPINLOCK(__ipipe_pipelock); + +LIST_HEAD(__ipipe_pipeline); + +unsigned long __ipipe_virtual_irq_map; + +#ifdef CONFIG_PRINTK +unsigned __ipipe_printk_virq; +#endif /* CONFIG_PRINTK */ + +int __ipipe_event_monitors[IPIPE_NR_EVENTS]; + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + +DECLARE_PER_CPU(struct tick_device, tick_cpu_device); + +static DEFINE_PER_CPU(struct ipipe_tick_device, ipipe_tick_cpu_device); + +int ipipe_request_tickdev(const char *devname, + void (*emumode)(enum clock_event_mode mode, + struct clock_event_device *cdev), + int (*emutick)(unsigned long delta, + struct clock_event_device *cdev), + int cpu, unsigned long *tmfreq) +{ + struct ipipe_tick_device *itd; + struct tick_device *slave; + struct clock_event_device *evtdev; + unsigned long long freq; + unsigned long flags; + int status; + + flags = ipipe_critical_enter(NULL); + + itd = &per_cpu(ipipe_tick_cpu_device, cpu); + + if (itd->slave != NULL) { + status = -EBUSY; + goto out; + } + + slave = &per_cpu(tick_cpu_device, cpu); + + if (strcmp(slave->evtdev->name, devname)) { + /* + * No conflict so far with the current tick device, + * check whether the requested device is sane and has + * been blessed by the kernel. + */ + status = __ipipe_check_tickdev(devname) ? + CLOCK_EVT_MODE_UNUSED : CLOCK_EVT_MODE_SHUTDOWN; + goto out; + } + + /* + * Our caller asks for using the same clock event device for + * ticking than we do, let's create a tick emulation device to + * interpose on the set_next_event() method, so that we may + * both manage the device in oneshot mode. Only the tick + * emulation code will actually program the clockchip hardware + * for the next shot, though. + * + * CAUTION: we still have to grab the tick device even when it + * current runs in periodic mode, since the kernel may switch + * to oneshot dynamically (highres/no_hz tick mode). + */ + + evtdev = slave->evtdev; + status = evtdev->mode; + + if (status == CLOCK_EVT_MODE_SHUTDOWN) + goto out; + + itd->slave = slave; + itd->emul_set_mode = emumode; + itd->emul_set_tick = emutick; + itd->real_set_mode = evtdev->set_mode; + itd->real_set_tick = evtdev->set_next_event; + itd->real_max_delta_ns = evtdev->max_delta_ns; + itd->real_mult = evtdev->mult; + itd->real_shift = evtdev->shift; + freq = (1000000000ULL * evtdev->mult) >> evtdev->shift; + *tmfreq = (unsigned long)freq; + evtdev->set_mode = emumode; + evtdev->set_next_event = emutick; + evtdev->max_delta_ns = ULONG_MAX; + evtdev->mult = 1; + evtdev->shift = 0; +out: + ipipe_critical_exit(flags); + + return status; +} + +void ipipe_release_tickdev(int cpu) +{ + struct ipipe_tick_device *itd; + struct tick_device *slave; + struct clock_event_device *evtdev; + unsigned long flags; + + flags = ipipe_critical_enter(NULL); + + itd = &per_cpu(ipipe_tick_cpu_device, cpu); + + if (itd->slave != NULL) { + slave = &per_cpu(tick_cpu_device, cpu); + evtdev = slave->evtdev; + evtdev->set_mode = itd->real_set_mode; + evtdev->set_next_event = itd->real_set_tick; + evtdev->max_delta_ns = itd->real_max_delta_ns; + evtdev->mult = itd->real_mult; + evtdev->shift = itd->real_shift; + itd->slave = NULL; + } + + ipipe_critical_exit(flags); +} + +#endif /* CONFIG_GENERIC_CLOCKEVENTS */ + +void __init ipipe_init_early(void) +{ + struct ipipe_domain *ipd = &ipipe_root; + + /* + * Do the early init stuff. 
At this point, the kernel does not + * provide much services yet: be careful. + */ + __ipipe_check_platform(); /* Do platform dependent checks first. */ + + /* + * A lightweight registration code for the root domain. We are + * running on the boot CPU, hw interrupts are off, and + * secondary CPUs are still lost in space. + */ + + /* Reserve percpu data slot #0 for the root domain. */ + ipd->slot = 0; + set_bit(0, &__ipipe_domain_slot_map); + + ipd->name = "Linux"; + ipd->domid = IPIPE_ROOT_ID; + ipd->priority = IPIPE_ROOT_PRIO; + + __ipipe_init_stage(ipd); + + list_add_tail(&ipd->p_link, &__ipipe_pipeline); + + __ipipe_init_platform(); + +#ifdef CONFIG_PRINTK + __ipipe_printk_virq = ipipe_alloc_virq(); /* Cannot fail here. */ + ipd->irqs[__ipipe_printk_virq].handler = &__ipipe_flush_printk; + ipd->irqs[__ipipe_printk_virq].cookie = NULL; + ipd->irqs[__ipipe_printk_virq].acknowledge = NULL; + ipd->irqs[__ipipe_printk_virq].control = IPIPE_HANDLE_MASK; +#endif /* CONFIG_PRINTK */ +} + +void __init ipipe_init(void) +{ + /* Now we may engage the pipeline. */ + __ipipe_enable_pipeline(); + + printk(KERN_INFO "I-pipe %s: pipeline enabled.\n", + IPIPE_VERSION_STRING); +} + +void __ipipe_init_stage(struct ipipe_domain *ipd) +{ + struct ipipe_percpu_domain_data *p; + unsigned long status; + int cpu, n; + + for_each_online_cpu(cpu) { + p = ipipe_percpudom_ptr(ipd, cpu); + status = p->status; + memset(p, 0, sizeof(*p)); + p->status = status; + } + + for (n = 0; n < IPIPE_NR_IRQS; n++) { + ipd->irqs[n].acknowledge = NULL; + ipd->irqs[n].handler = NULL; + ipd->irqs[n].control = IPIPE_PASS_MASK; /* Pass but don't handle */ + } + + for (n = 0; n < IPIPE_NR_EVENTS; n++) + ipd->evhand[n] = NULL; + + ipd->evself = 0LL; + mutex_init(&ipd->mutex); + + __ipipe_hook_critical_ipi(ipd); +} + +void __ipipe_cleanup_domain(struct ipipe_domain *ipd) +{ + ipipe_unstall_pipeline_from(ipd); + +#ifdef CONFIG_SMP + { + struct ipipe_percpu_domain_data *p; + int cpu; + + for_each_online_cpu(cpu) { + p = ipipe_percpudom_ptr(ipd, cpu); + while (__ipipe_ipending_p(p)) + cpu_relax(); + } + } +#else + __raw_get_cpu_var(ipipe_percpu_daddr)[ipd->slot] = NULL; +#endif + + clear_bit(ipd->slot, &__ipipe_domain_slot_map); +} + +void __ipipe_unstall_root(void) +{ + struct ipipe_percpu_domain_data *p; + + local_irq_disable_hw(); + +#ifdef CONFIG_IPIPE_DEBUG_INTERNAL + /* This helps catching bad usage from assembly call sites. */ + BUG_ON(!__ipipe_root_domain_p); +#endif + + p = ipipe_root_cpudom_ptr(); + + __clear_bit(IPIPE_STALL_FLAG, &p->status); + + if (unlikely(__ipipe_ipending_p(p))) + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + + local_irq_enable_hw(); +} + +void __ipipe_restore_root(unsigned long x) +{ +#ifdef CONFIG_IPIPE_DEBUG_INTERNAL + BUG_ON(!ipipe_root_domain_p); +#endif + + if (x) + __ipipe_stall_root(); + else + __ipipe_unstall_root(); +} + +void ipipe_stall_pipeline_from(struct ipipe_domain *ipd) +{ + unsigned long flags; + /* + * We have to prevent against race on updating the status + * variable _and_ CPU migration at the same time, so disable + * hw IRQs here. 
+ */ + local_irq_save_hw(flags); + + __set_bit(IPIPE_STALL_FLAG, &ipipe_cpudom_var(ipd, status)); + + if (!__ipipe_pipeline_head_p(ipd)) + local_irq_restore_hw(flags); +} + +unsigned long ipipe_test_and_stall_pipeline_from(struct ipipe_domain *ipd) +{ + unsigned long flags, x; + + /* See ipipe_stall_pipeline_from() */ + local_irq_save_hw(flags); + + x = __test_and_set_bit(IPIPE_STALL_FLAG, &ipipe_cpudom_var(ipd, status)); + + if (!__ipipe_pipeline_head_p(ipd)) + local_irq_restore_hw(flags); + + return x; +} + +unsigned long ipipe_test_and_unstall_pipeline_from(struct ipipe_domain *ipd) +{ + unsigned long flags, x; + struct list_head *pos; + + local_irq_save_hw(flags); + + x = __test_and_clear_bit(IPIPE_STALL_FLAG, &ipipe_cpudom_var(ipd, status)); + + if (ipd == __ipipe_current_domain) + pos = &ipd->p_link; + else + pos = __ipipe_pipeline.next; + + __ipipe_walk_pipeline(pos); + + if (likely(__ipipe_pipeline_head_p(ipd))) + local_irq_enable_hw(); + else + local_irq_restore_hw(flags); + + return x; +} + +void ipipe_restore_pipeline_from(struct ipipe_domain *ipd, + unsigned long x) +{ + if (x) + ipipe_stall_pipeline_from(ipd); + else + ipipe_unstall_pipeline_from(ipd); +} + +void ipipe_unstall_pipeline_head(void) +{ + struct ipipe_percpu_domain_data *p = ipipe_head_cpudom_ptr(); + struct ipipe_domain *head_domain; + + local_irq_disable_hw(); + + __clear_bit(IPIPE_STALL_FLAG, &p->status); + + if (unlikely(__ipipe_ipending_p(p))) { + head_domain = __ipipe_pipeline_head(); + if (likely(head_domain == __ipipe_current_domain)) + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + else + __ipipe_walk_pipeline(&head_domain->p_link); + } + + local_irq_enable_hw(); +} + +void __ipipe_restore_pipeline_head(unsigned long x) +{ + struct ipipe_percpu_domain_data *p = ipipe_head_cpudom_ptr(); + struct ipipe_domain *head_domain; + + local_irq_disable_hw(); + + if (x) { +#ifdef CONFIG_DEBUG_KERNEL + static int warned; + if (!warned && test_and_set_bit(IPIPE_STALL_FLAG, &p->status)) { + /* + * Already stalled albeit ipipe_restore_pipeline_head() + * should have detected it? Send a warning once. 
+ */ + warned = 1; + printk(KERN_WARNING + "I-pipe: ipipe_restore_pipeline_head() optimization failed.\n"); + dump_stack(); + } +#else /* !CONFIG_DEBUG_KERNEL */ + set_bit(IPIPE_STALL_FLAG, &p->status); +#endif /* CONFIG_DEBUG_KERNEL */ + } + else { + __clear_bit(IPIPE_STALL_FLAG, &p->status); + if (unlikely(__ipipe_ipending_p(p))) { + head_domain = __ipipe_pipeline_head(); + if (likely(head_domain == __ipipe_current_domain)) + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + else + __ipipe_walk_pipeline(&head_domain->p_link); + } + local_irq_enable_hw(); + } +} + +void __ipipe_spin_lock_irq(raw_spinlock_t *lock) +{ + local_irq_disable_hw(); + __raw_spin_lock(lock); + __set_bit(IPIPE_STALL_FLAG, &ipipe_this_cpudom_var(status)); +} + +void __ipipe_spin_unlock_irq(raw_spinlock_t *lock) +{ + __raw_spin_unlock(lock); + __clear_bit(IPIPE_STALL_FLAG, &ipipe_this_cpudom_var(status)); + local_irq_enable_hw(); +} + +unsigned long __ipipe_spin_lock_irqsave(raw_spinlock_t *lock) +{ + unsigned long flags; + int s; + + local_irq_save_hw(flags); + __raw_spin_lock(lock); + s = __test_and_set_bit(IPIPE_STALL_FLAG, &ipipe_this_cpudom_var(status)); + + return raw_mangle_irq_bits(s, flags); +} + +void __ipipe_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long x) +{ + __raw_spin_unlock(lock); + if (!raw_demangle_irq_bits(&x)) + __clear_bit(IPIPE_STALL_FLAG, &ipipe_this_cpudom_var(status)); + local_irq_restore_hw(x); +} + +void __ipipe_spin_unlock_irqbegin(ipipe_spinlock_t *lock) +{ + __raw_spin_unlock(&lock->bare_lock); +} + +void __ipipe_spin_unlock_irqcomplete(unsigned long x) +{ + if (!raw_demangle_irq_bits(&x)) + __clear_bit(IPIPE_STALL_FLAG, &ipipe_this_cpudom_var(status)); + local_irq_restore_hw(x); +} + +#ifdef __IPIPE_3LEVEL_IRQMAP + +/* Must be called hw IRQs off. */ +static inline void __ipipe_set_irq_held(struct ipipe_percpu_domain_data *p, + unsigned int irq) +{ + __set_bit(irq, p->irqheld_map); + p->irqall[irq]++; +} + +/* Must be called hw IRQs off. */ +void __ipipe_set_irq_pending(struct ipipe_domain *ipd, unsigned int irq) +{ + struct ipipe_percpu_domain_data *p = ipipe_cpudom_ptr(ipd); + int l0b, l1b; + + l0b = irq / (BITS_PER_LONG * BITS_PER_LONG); + l1b = irq / BITS_PER_LONG; + prefetchw(p); + + if (likely(!test_bit(IPIPE_LOCK_FLAG, &ipd->irqs[irq].control))) { + __set_bit(irq, p->irqpend_lomap); + __set_bit(l1b, p->irqpend_mdmap); + __set_bit(l0b, &p->irqpend_himap); + } else + __set_bit(irq, p->irqheld_map); + + p->irqall[irq]++; +} + +/* Must be called hw IRQs off. */ +void __ipipe_lock_irq(struct ipipe_domain *ipd, int cpu, unsigned int irq) +{ + struct ipipe_percpu_domain_data *p; + int l0b, l1b; + + if (unlikely(test_and_set_bit(IPIPE_LOCK_FLAG, + &ipd->irqs[irq].control))) + return; + + l0b = irq / (BITS_PER_LONG * BITS_PER_LONG); + l1b = irq / BITS_PER_LONG; + + p = ipipe_percpudom_ptr(ipd, cpu); + if (__test_and_clear_bit(irq, p->irqpend_lomap)) { + __set_bit(irq, p->irqheld_map); + if (p->irqpend_lomap[l1b] == 0) { + __clear_bit(l1b, p->irqpend_mdmap); + if (p->irqpend_mdmap[l0b] == 0) + __clear_bit(l0b, &p->irqpend_himap); + } + } +} + +/* Must be called hw IRQs off. 
*/ +void __ipipe_unlock_irq(struct ipipe_domain *ipd, unsigned int irq) +{ + struct ipipe_percpu_domain_data *p; + int l0b, l1b, cpu; + + if (unlikely(!test_and_clear_bit(IPIPE_LOCK_FLAG, + &ipd->irqs[irq].control))) + return; + + l0b = irq / (BITS_PER_LONG * BITS_PER_LONG); + l1b = irq / BITS_PER_LONG; + + for_each_online_cpu(cpu) { + p = ipipe_percpudom_ptr(ipd, cpu); + if (test_and_clear_bit(irq, p->irqheld_map)) { + /* We need atomic ops here: */ + set_bit(irq, p->irqpend_lomap); + set_bit(l1b, p->irqpend_mdmap); + set_bit(l0b, &p->irqpend_himap); + } + } +} + +static inline int __ipipe_next_irq(struct ipipe_percpu_domain_data *p, + int dovirt) +{ + unsigned long l0m, l1m, l2m, himask, mdmask; + int l0b, l1b, l2b, vl0b, vl1b; + unsigned int irq; + + if (dovirt) { + /* + * All virtual IRQs are mapped by a single long word. + * There is exactly BITS_PER_LONG virqs, and they are + * always last in the interrupt map, starting at + * IPIPE_VIRQ_BASE. Therefore, we only need to test a + * single bit within the high and middle maps to check + * whether a virtual IRQ is pending (the computations + * below are constant). + */ + vl0b = IPIPE_VIRQ_BASE / (BITS_PER_LONG * BITS_PER_LONG); + himask = (1L << vl0b); + vl1b = IPIPE_VIRQ_BASE / BITS_PER_LONG; + mdmask = (1L << (vl1b & (BITS_PER_LONG-1))); + } else + himask = mdmask = ~0L; + + l0m = p->irqpend_himap & himask; + if (unlikely(l0m == 0)) + return -1; + + l0b = __ipipe_ffnz(l0m); + l1m = p->irqpend_mdmap[l0b] & mdmask; + if (unlikely(l1m == 0)) + return -1; + + l1b = __ipipe_ffnz(l1m) + l0b * BITS_PER_LONG; + l2m = p->irqpend_lomap[l1b]; + if (unlikely(l2m == 0)) + return -1; + + l2b = __ipipe_ffnz(l2m); + irq = l1b * BITS_PER_LONG + l2b; + + __clear_bit(irq, p->irqpend_lomap); + if (p->irqpend_lomap[l1b] == 0) { + __clear_bit(l1b, p->irqpend_mdmap); + if (p->irqpend_mdmap[l0b] == 0) + __clear_bit(l0b, &p->irqpend_himap); + } + + return irq; +} + +#else /* __IPIPE_2LEVEL_IRQMAP */ + +/* Must be called hw IRQs off. */ +static inline void __ipipe_set_irq_held(struct ipipe_percpu_domain_data *p, + unsigned int irq) +{ + __set_bit(irq, p->irqheld_map); + p->irqall[irq]++; +} + +/* Must be called hw IRQs off. */ +void __ipipe_set_irq_pending(struct ipipe_domain *ipd, unsigned irq) +{ + struct ipipe_percpu_domain_data *p = ipipe_cpudom_ptr(ipd); + int l0b = irq / BITS_PER_LONG; + + prefetchw(p); + + if (likely(!test_bit(IPIPE_LOCK_FLAG, &ipd->irqs[irq].control))) { + __set_bit(irq, p->irqpend_lomap); + __set_bit(l0b, &p->irqpend_himap); + } else + __set_bit(irq, p->irqheld_map); + + p->irqall[irq]++; +} + +/* Must be called hw IRQs off. */ +void __ipipe_lock_irq(struct ipipe_domain *ipd, int cpu, unsigned irq) +{ + struct ipipe_percpu_domain_data *p; + int l0b = irq / BITS_PER_LONG; + + if (unlikely(test_and_set_bit(IPIPE_LOCK_FLAG, + &ipd->irqs[irq].control))) + return; + + p = ipipe_percpudom_ptr(ipd, cpu); + if (__test_and_clear_bit(irq, p->irqpend_lomap)) { + __set_bit(irq, p->irqheld_map); + if (p->irqpend_lomap[l0b] == 0) + __clear_bit(l0b, &p->irqpend_himap); + } +} + +/* Must be called hw IRQs off. 
*/ +void __ipipe_unlock_irq(struct ipipe_domain *ipd, unsigned irq) +{ + struct ipipe_percpu_domain_data *p; + int l0b = irq / BITS_PER_LONG, cpu; + + if (unlikely(!test_and_clear_bit(IPIPE_LOCK_FLAG, + &ipd->irqs[irq].control))) + return; + + for_each_online_cpu(cpu) { + p = ipipe_percpudom_ptr(ipd, cpu); + if (test_and_clear_bit(irq, p->irqheld_map)) { + /* We need atomic ops here: */ + set_bit(irq, p->irqpend_lomap); + set_bit(l0b, &p->irqpend_himap); + } + } +} + +static inline int __ipipe_next_irq(struct ipipe_percpu_domain_data *p, + int dovirt) +{ + unsigned long l0m, l1m, himask = ~0L; + int l0b, l1b; + + himask <<= dovirt ? IPIPE_VIRQ_BASE/BITS_PER_LONG : 0; + + l0m = p->irqpend_himap & himask; + if (unlikely(l0m == 0)) + return -1; + + l0b = __ipipe_ffnz(l0m); + l1m = p->irqpend_lomap[l0b]; + if (unlikely(l1m == 0)) + return -1; + + l1b = __ipipe_ffnz(l1m); + __clear_bit(l1b, &p->irqpend_lomap[l0b]); + if (p->irqpend_lomap[l0b] == 0) + __clear_bit(l0b, &p->irqpend_himap); + + return l0b * BITS_PER_LONG + l1b; +} + +#endif /* __IPIPE_2LEVEL_IRQMAP */ + +/* + * __ipipe_walk_pipeline(): Plays interrupts pending in the log. Must + * be called with local hw interrupts disabled. + */ +void __ipipe_walk_pipeline(struct list_head *pos) +{ + struct ipipe_domain *this_domain = __ipipe_current_domain, *next_domain; + struct ipipe_percpu_domain_data *p, *np; + + p = ipipe_cpudom_ptr(this_domain); + + while (pos != &__ipipe_pipeline) { + + next_domain = list_entry(pos, struct ipipe_domain, p_link); + np = ipipe_cpudom_ptr(next_domain); + + if (test_bit(IPIPE_STALL_FLAG, &np->status)) + break; /* Stalled stage -- do not go further. */ + + if (__ipipe_ipending_p(np)) { + if (next_domain == this_domain) + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + else { + + p->evsync = 0; + __ipipe_current_domain = next_domain; + ipipe_suspend_domain(); /* Sync stage and propagate interrupts. */ + + if (__ipipe_current_domain == next_domain) + __ipipe_current_domain = this_domain; + /* + * Otherwise, something changed the current domain under our + * feet recycling the register set; do not override the new + * domain. + */ + + if (__ipipe_ipending_p(p) && + !test_bit(IPIPE_STALL_FLAG, &p->status)) + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + } + break; + } else if (next_domain == this_domain) + break; + + pos = next_domain->p_link.next; + } +} + +/* + * ipipe_suspend_domain() -- Suspend the current domain, switching to + * the next one which has pending work down the pipeline. + */ +void ipipe_suspend_domain(void) +{ + struct ipipe_domain *this_domain, *next_domain; + struct ipipe_percpu_domain_data *p; + struct list_head *ln; + unsigned long flags; + + local_irq_save_hw(flags); + + this_domain = next_domain = __ipipe_current_domain; + p = ipipe_cpudom_ptr(this_domain); + p->status &= ~(IPIPE_STALL_MASK|IPIPE_SYNC_MASK); + + if (__ipipe_ipending_p(p)) + goto sync_stage; + + for (;;) { + ln = next_domain->p_link.next; + + if (ln == &__ipipe_pipeline) + break; + + next_domain = list_entry(ln, struct ipipe_domain, p_link); + p = ipipe_cpudom_ptr(next_domain); + + if (p->status & IPIPE_STALL_MASK) + break; + + if (!__ipipe_ipending_p(p)) + continue; + + __ipipe_current_domain = next_domain; +sync_stage: + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + + if (__ipipe_current_domain != next_domain) + /* + * Something has changed the current domain under our + * feet, recycling the register set; take note. 
+ */ + this_domain = __ipipe_current_domain; + } + + __ipipe_current_domain = this_domain; + + local_irq_restore_hw(flags); +} + + +/* ipipe_alloc_virq() -- Allocate a pipelined virtual/soft interrupt. + * Virtual interrupts are handled in exactly the same way than their + * hw-generated counterparts wrt pipelining. + */ +unsigned ipipe_alloc_virq(void) +{ + unsigned long flags, irq = 0; + int ipos; + + spin_lock_irqsave(&__ipipe_pipelock, flags); + + if (__ipipe_virtual_irq_map != ~0) { + ipos = ffz(__ipipe_virtual_irq_map); + set_bit(ipos, &__ipipe_virtual_irq_map); + irq = ipos + IPIPE_VIRQ_BASE; + } + + spin_unlock_irqrestore(&__ipipe_pipelock, flags); + + return irq; +} + +/* + * ipipe_control_irq() -- Change modes of a pipelined interrupt for + * the current domain. + */ +int ipipe_virtualize_irq(struct ipipe_domain *ipd, + unsigned irq, + ipipe_irq_handler_t handler, + void *cookie, + ipipe_irq_ackfn_t acknowledge, + unsigned modemask) +{ + ipipe_irq_handler_t old_handler; + struct irq_desc *desc; + unsigned long flags; + int err; + + if (irq >= IPIPE_NR_IRQS) + return -EINVAL; + + if (ipd->irqs[irq].control & IPIPE_SYSTEM_MASK) + return -EPERM; + + if (!test_bit(IPIPE_AHEAD_FLAG, &ipd->flags)) + /* Silently unwire interrupts for non-heading domains. */ + modemask &= ~IPIPE_WIRED_MASK; + + spin_lock_irqsave(&__ipipe_pipelock, flags); + + old_handler = ipd->irqs[irq].handler; + + if (handler != NULL) { + if (handler == IPIPE_SAME_HANDLER) { + handler = old_handler; + cookie = ipd->irqs[irq].cookie; + + if (handler == NULL) { + err = -EINVAL; + goto unlock_and_exit; + } + } else if ((modemask & IPIPE_EXCLUSIVE_MASK) != 0 && + old_handler != NULL) { + err = -EBUSY; + goto unlock_and_exit; + } + + /* Wired interrupts can only be delivered to domains + * always heading the pipeline, and using dynamic + * propagation. */ + + if ((modemask & IPIPE_WIRED_MASK) != 0) { + if ((modemask & (IPIPE_PASS_MASK | IPIPE_STICKY_MASK)) != 0) { + err = -EINVAL; + goto unlock_and_exit; + } + modemask |= (IPIPE_HANDLE_MASK); + } + + if ((modemask & IPIPE_STICKY_MASK) != 0) + modemask |= IPIPE_HANDLE_MASK; + } else + modemask &= + ~(IPIPE_HANDLE_MASK | IPIPE_STICKY_MASK | + IPIPE_EXCLUSIVE_MASK | IPIPE_WIRED_MASK); + + if (acknowledge == NULL && !ipipe_virtual_irq_p(irq)) + /* + * Acknowledge handler unspecified for a hw interrupt: + * use the Linux-defined handler instead. + */ + acknowledge = ipipe_root_domain->irqs[irq].acknowledge; + + ipd->irqs[irq].handler = handler; + ipd->irqs[irq].cookie = cookie; + ipd->irqs[irq].acknowledge = acknowledge; + ipd->irqs[irq].control = modemask; + + if (irq < NR_IRQS && !ipipe_virtual_irq_p(irq)) { + desc = irq_to_desc(irq); + if (handler != NULL) { + if (desc) + __ipipe_enable_irqdesc(ipd, irq); + + if ((modemask & IPIPE_ENABLE_MASK) != 0) { + if (ipd != __ipipe_current_domain) { + /* + * IRQ enable/disable state is domain-sensitive, so we + * may not change it for another domain. What is + * allowed however is forcing some domain to handle an + * interrupt source, by passing the proper 'ipd' + * descriptor which thus may be different from + * __ipipe_current_domain. + */ + err = -EPERM; + goto unlock_and_exit; + } + if (desc) + __ipipe_enable_irq(irq); + } + } else if (old_handler != NULL && desc) + __ipipe_disable_irqdesc(ipd, irq); + } + + err = 0; + + unlock_and_exit: + + spin_unlock_irqrestore(&__ipipe_pipelock, flags); + + return err; +} + +/* ipipe_control_irq() -- Change modes of a pipelined interrupt for + * the current domain. 
*/ + +int ipipe_control_irq(unsigned irq, unsigned clrmask, unsigned setmask) +{ + struct ipipe_domain *ipd; + unsigned long flags; + + if (irq >= IPIPE_NR_IRQS) + return -EINVAL; + + spin_lock_irqsave(&__ipipe_pipelock, flags); + + ipd = __ipipe_current_domain; + + if (ipd->irqs[irq].control & IPIPE_SYSTEM_MASK) { + spin_unlock_irqrestore(&__ipipe_pipelock, flags); + return -EPERM; + } + + if (ipd->irqs[irq].handler == NULL) + setmask &= ~(IPIPE_HANDLE_MASK | IPIPE_STICKY_MASK); + + if ((setmask & IPIPE_STICKY_MASK) != 0) + setmask |= IPIPE_HANDLE_MASK; + + if ((clrmask & (IPIPE_HANDLE_MASK | IPIPE_STICKY_MASK)) != 0) /* If one goes, both go. */ + clrmask |= (IPIPE_HANDLE_MASK | IPIPE_STICKY_MASK); + + ipd->irqs[irq].control &= ~clrmask; + ipd->irqs[irq].control |= setmask; + + if ((setmask & IPIPE_ENABLE_MASK) != 0) + __ipipe_enable_irq(irq); + else if ((clrmask & IPIPE_ENABLE_MASK) != 0) + __ipipe_disable_irq(irq); + + spin_unlock_irqrestore(&__ipipe_pipelock, flags); + + return 0; +} + +/* __ipipe_dispatch_event() -- Low-level event dispatcher. */ + +int __ipipe_dispatch_event (unsigned event, void *data) +{ +extern void *ipipe_irq_handler; void *handler; if (ipipe_irq_handler != __ipipe_handle_irq && (handler = ipipe_root_domain->evhand[event])) { return ((int (*)(unsigned long, void *))handler)(event, data); } else { + struct ipipe_domain *start_domain, *this_domain, *next_domain; + struct ipipe_percpu_domain_data *np; + ipipe_event_handler_t evhand; + struct list_head *pos, *npos; + unsigned long flags; + int propagate = 1; + + local_irq_save_hw(flags); + + start_domain = this_domain = __ipipe_current_domain; + + list_for_each_safe(pos, npos, &__ipipe_pipeline) { + /* + * Note: Domain migration may occur while running + * event or interrupt handlers, in which case the + * current register set is going to be recycled for a + * different domain than the initiating one. We do + * care for that, always tracking the current domain + * descriptor upon return from those handlers. + */ + next_domain = list_entry(pos, struct ipipe_domain, p_link); + np = ipipe_cpudom_ptr(next_domain); + + /* + * Keep a cached copy of the handler's address since + * ipipe_catch_event() may clear it under our feet. + */ + evhand = next_domain->evhand[event]; + + if (evhand != NULL) { + __ipipe_current_domain = next_domain; + np->evsync |= (1LL << event); + local_irq_restore_hw(flags); + propagate = !evhand(event, start_domain, data); + local_irq_save_hw(flags); + /* + * We may have a migration issue here, if the + * current task is migrated to another CPU on + * behalf of the invoked handler, usually when + * a syscall event is processed. However, + * ipipe_catch_event() will make sure that a + * CPU that clears a handler for any given + * event will not attempt to wait for itself + * to clear the evsync bit for that event, + * which practically plugs the hole, without + * resorting to a much more complex strategy. + */ + np->evsync &= ~(1LL << event); + if (__ipipe_current_domain != next_domain) + this_domain = __ipipe_current_domain; + } + + /* NEVER sync the root stage here. 
*/ + if (next_domain != ipipe_root_domain && + __ipipe_ipending_p(np) && + !test_bit(IPIPE_STALL_FLAG, &np->status)) { + __ipipe_current_domain = next_domain; + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + if (__ipipe_current_domain != next_domain) + this_domain = __ipipe_current_domain; + } + + __ipipe_current_domain = this_domain; + + if (next_domain == this_domain || !propagate) + break; + } + + local_irq_restore_hw(flags); + + return !propagate; +} } + +/* + * __ipipe_dispatch_wired -- Wired interrupt dispatcher. Wired + * interrupts are immediately and unconditionally delivered to the + * domain heading the pipeline upon receipt, and such domain must have + * been registered as an invariant head for the system (priority == + * IPIPE_HEAD_PRIORITY). The motivation for using wired interrupts is + * to get an extra-fast dispatching path for those IRQs, by relying on + * a straightforward logic based on assumptions that must always be + * true for invariant head domains. The following assumptions are + * made when dealing with such interrupts: + * + * 1- Wired interrupts are purely dynamic, i.e. the decision to + * propagate them down the pipeline must be done from the head domain + * ISR. + * 2- Wired interrupts cannot be shared or sticky. + * 3- The root domain cannot be an invariant pipeline head, in + * consequence of what the root domain cannot handle wired + * interrupts. + * 4- Wired interrupts must have a valid acknowledge handler for the + * head domain (if needed, see __ipipe_handle_irq). + * + * Called with hw interrupts off. + */ + +void __ipipe_dispatch_wired(struct ipipe_domain *head, unsigned irq) +{ + struct ipipe_percpu_domain_data *p = ipipe_cpudom_ptr(head); + + prefetchw(p); + + if (unlikely(test_bit(IPIPE_LOCK_FLAG, &head->irqs[irq].control))) { + /* + * If we can't process this IRQ right now, we must + * mark it as held, so that it will get played during + * normal log sync when the corresponding interrupt + * source is eventually unlocked. + */ + __ipipe_set_irq_held(p, irq); + return; + } + + if (test_bit(IPIPE_STALL_FLAG, &p->status)) { + __ipipe_set_irq_pending(head, irq); + return; + } + + __ipipe_dispatch_wired_nocheck(head, irq); +} + +void __ipipe_dispatch_wired_nocheck(struct ipipe_domain *head, unsigned irq) /* hw interrupts off */ +{ + struct ipipe_percpu_domain_data *p = ipipe_cpudom_ptr(head); + struct ipipe_domain *old; + + prefetchw(p); + + old = __ipipe_current_domain; + __ipipe_current_domain = head; /* Switch to the head domain. */ + + p->irqall[irq]++; + __set_bit(IPIPE_STALL_FLAG, &p->status); + head->irqs[irq].handler(irq, head->irqs[irq].cookie); /* Call the ISR. */ + __ipipe_run_irqtail(); + __clear_bit(IPIPE_STALL_FLAG, &p->status); + + if (__ipipe_current_domain == head) { + __ipipe_current_domain = old; + if (old == head) { + if (__ipipe_ipending_p(p)) + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + return; + } + } + + __ipipe_walk_pipeline(&head->p_link); +} + +/* + * __ipipe_sync_stage() -- Flush the pending IRQs for the current + * domain (and processor). This routine flushes the interrupt log + * (see "Optimistic interrupt protection" from D. Stodolsky et al. for + * more on the deferred interrupt scheme). Every interrupt that + * occurred while the pipeline was stalled gets played. WARNING: + * callers on SMP boxen should always check for CPU migration on + * return of this routine. + * + * This routine must be called with hw interrupts off. 
+ */ +void __ipipe_sync_stage(int dovirt) +{ + struct ipipe_percpu_domain_data *p; + struct ipipe_domain *ipd; + int cpu, irq; + + ipd = __ipipe_current_domain; + p = ipipe_cpudom_ptr(ipd); + + if (__test_and_set_bit(IPIPE_SYNC_FLAG, &p->status)) { + /* + * Some questionable code in the root domain may enter + * busy waits for IRQs over interrupt context, so we + * unfortunately have to allow piling up IRQs for + * them. Non-root domains are not allowed to do this. + */ + if (ipd != ipipe_root_domain) + return; + } + + cpu = ipipe_processor_id(); + + for (;;) { + irq = __ipipe_next_irq(p, dovirt); + if (irq < 0) + break; + /* + * Make sure the compiler does not reorder + * wrongly, so that all updates to maps are + * done before the handler gets called. + */ + barrier(); + + if (test_bit(IPIPE_LOCK_FLAG, &ipd->irqs[irq].control)) + continue; + + __set_bit(IPIPE_STALL_FLAG, &p->status); + smp_wmb(); + + if (ipd == ipipe_root_domain) + trace_hardirqs_off(); + + __ipipe_run_isr(ipd, irq); + barrier(); + p = ipipe_cpudom_ptr(__ipipe_current_domain); +#ifdef CONFIG_SMP + { + int newcpu = ipipe_processor_id(); + + if (newcpu != cpu) { /* Handle CPU migration. */ + /* + * We expect any domain to clear the SYNC bit each + * time it switches in a new task, so that preemptions + * and/or CPU migrations (in the SMP case) over the + * ISR do not lock out the log syncer for some + * indefinite amount of time. In the Linux case, + * schedule() handles this (see kernel/sched.c). For + * this reason, we don't bother clearing it here for + * the source CPU in the migration handling case, + * since it must have scheduled another task in by + * now. + */ + __set_bit(IPIPE_SYNC_FLAG, &p->status); + cpu = newcpu; + } + } +#endif /* CONFIG_SMP */ +#ifdef CONFIG_TRACE_IRQFLAGS + if (__ipipe_root_domain_p && + test_bit(IPIPE_STALL_FLAG, &p->status)) + trace_hardirqs_on(); +#endif + __clear_bit(IPIPE_STALL_FLAG, &p->status); + } + + __clear_bit(IPIPE_SYNC_FLAG, &p->status); +} + +/* ipipe_register_domain() -- Link a new domain to the pipeline. */ + +int ipipe_register_domain(struct ipipe_domain *ipd, + struct ipipe_domain_attr *attr) +{ + struct ipipe_percpu_domain_data *p; + struct list_head *pos = NULL; + struct ipipe_domain *_ipd; + unsigned long flags; + + if (!ipipe_root_domain_p) { + printk(KERN_WARNING + "I-pipe: Only the root domain may register a new domain.\n"); + return -EPERM; + } + + flags = ipipe_critical_enter(NULL); + + if (attr->priority == IPIPE_HEAD_PRIORITY) { + if (test_bit(IPIPE_HEAD_SLOT, &__ipipe_domain_slot_map)) { + ipipe_critical_exit(flags); + return -EAGAIN; /* Cannot override current head. */ + } + ipd->slot = IPIPE_HEAD_SLOT; + } else + ipd->slot = ffz(__ipipe_domain_slot_map); + + if (ipd->slot < CONFIG_IPIPE_DOMAINS) { + set_bit(ipd->slot, &__ipipe_domain_slot_map); + list_for_each(pos, &__ipipe_pipeline) { + _ipd = list_entry(pos, struct ipipe_domain, p_link); + if (_ipd->domid == attr->domid) + break; + } + } + + ipipe_critical_exit(flags); + + if (pos != &__ipipe_pipeline) { + if (ipd->slot < CONFIG_IPIPE_DOMAINS) + clear_bit(ipd->slot, &__ipipe_domain_slot_map); + return -EBUSY; + } + +#ifndef CONFIG_SMP + /* + * Set up the perdomain pointers for direct access to the + * percpu domain data. This saves a costly multiply each time + * we need to refer to the contents of the percpu domain data + * array. 
+ */ + __raw_get_cpu_var(ipipe_percpu_daddr)[ipd->slot] = &__raw_get_cpu_var(ipipe_percpu_darray)[ipd->slot]; +#endif + + ipd->name = attr->name; + ipd->domid = attr->domid; + ipd->pdd = attr->pdd; + ipd->flags = 0; + + if (attr->priority == IPIPE_HEAD_PRIORITY) { + ipd->priority = INT_MAX; + __set_bit(IPIPE_AHEAD_FLAG,&ipd->flags); + } + else + ipd->priority = attr->priority; + + __ipipe_init_stage(ipd); + + INIT_LIST_HEAD(&ipd->p_link); + +#ifdef CONFIG_PROC_FS + __ipipe_add_domain_proc(ipd); +#endif /* CONFIG_PROC_FS */ + + flags = ipipe_critical_enter(NULL); + + list_for_each(pos, &__ipipe_pipeline) { + _ipd = list_entry(pos, struct ipipe_domain, p_link); + if (ipd->priority > _ipd->priority) + break; + } + + list_add_tail(&ipd->p_link, pos); + + ipipe_critical_exit(flags); + + printk(KERN_INFO "I-pipe: Domain %s registered.\n", ipd->name); + + if (attr->entry == NULL) + return 0; + + /* + * Finally, allow the new domain to perform its initialization + * duties. + */ + local_irq_save_hw_smp(flags); + __ipipe_current_domain = ipd; + local_irq_restore_hw_smp(flags); + attr->entry(); + local_irq_save_hw(flags); + __ipipe_current_domain = ipipe_root_domain; + p = ipipe_root_cpudom_ptr(); + + if (__ipipe_ipending_p(p) && + !test_bit(IPIPE_STALL_FLAG, &p->status)) + __ipipe_sync_pipeline(IPIPE_IRQ_DOALL); + + local_irq_restore_hw(flags); + + return 0; +} + +/* ipipe_unregister_domain() -- Remove a domain from the pipeline. */ + +int ipipe_unregister_domain(struct ipipe_domain *ipd) +{ + unsigned long flags; + + if (!ipipe_root_domain_p) { + printk(KERN_WARNING + "I-pipe: Only the root domain may unregister a domain.\n"); + return -EPERM; + } + + if (ipd == ipipe_root_domain) { + printk(KERN_WARNING + "I-pipe: Cannot unregister the root domain.\n"); + return -EPERM; + } +#ifdef CONFIG_SMP + { + struct ipipe_percpu_domain_data *p; + unsigned int irq; + int cpu; + + /* + * In the SMP case, wait for the logged events to drain on + * other processors before eventually removing the domain + * from the pipeline. + */ + + ipipe_unstall_pipeline_from(ipd); + + flags = ipipe_critical_enter(NULL); + + for (irq = 0; irq < IPIPE_NR_IRQS; irq++) { + clear_bit(IPIPE_HANDLE_FLAG, &ipd->irqs[irq].control); + clear_bit(IPIPE_STICKY_FLAG, &ipd->irqs[irq].control); + set_bit(IPIPE_PASS_FLAG, &ipd->irqs[irq].control); + } + + ipipe_critical_exit(flags); + + for_each_online_cpu(cpu) { + p = ipipe_percpudom_ptr(ipd, cpu); + while (__ipipe_ipending_p(p)) + cpu_relax(); + } + } +#endif /* CONFIG_SMP */ + + mutex_lock(&ipd->mutex); + +#ifdef CONFIG_PROC_FS + __ipipe_remove_domain_proc(ipd); +#endif /* CONFIG_PROC_FS */ + + /* + * Simply remove the domain from the pipeline and we are almost done. + */ + + flags = ipipe_critical_enter(NULL); + list_del_init(&ipd->p_link); + ipipe_critical_exit(flags); + + __ipipe_cleanup_domain(ipd); + + mutex_unlock(&ipd->mutex); + + printk(KERN_INFO "I-pipe: Domain %s unregistered.\n", ipd->name); + + return 0; +} + +/* + * ipipe_propagate_irq() -- Force a given IRQ propagation on behalf of + * a running interrupt handler to the next domain down the pipeline. + * ipipe_schedule_irq() -- Does almost the same as above, but attempts + * to pend the interrupt for the current domain first. + * Must be called hw IRQs off. 
+ */ +void __ipipe_pend_irq(unsigned irq, struct list_head *head) +{ + struct ipipe_domain *ipd; + struct list_head *ln; + +#ifdef CONFIG_IPIPE_DEBUG + BUG_ON(irq >= IPIPE_NR_IRQS || + (ipipe_virtual_irq_p(irq) + && !test_bit(irq - IPIPE_VIRQ_BASE, &__ipipe_virtual_irq_map))); +#endif + for (ln = head; ln != &__ipipe_pipeline; ln = ipd->p_link.next) { + ipd = list_entry(ln, struct ipipe_domain, p_link); + if (test_bit(IPIPE_HANDLE_FLAG, &ipd->irqs[irq].control)) { + __ipipe_set_irq_pending(ipd, irq); + return; + } + } +} + +/* ipipe_free_virq() -- Release a virtual/soft interrupt. */ + +int ipipe_free_virq(unsigned virq) +{ + if (!ipipe_virtual_irq_p(virq)) + return -EINVAL; + + clear_bit(virq - IPIPE_VIRQ_BASE, &__ipipe_virtual_irq_map); + + return 0; +} + +void ipipe_init_attr(struct ipipe_domain_attr *attr) +{ + attr->name = "anon"; + attr->domid = 1; + attr->entry = NULL; + attr->priority = IPIPE_ROOT_PRIO; + attr->pdd = NULL; +} + +/* + * ipipe_catch_event() -- Interpose or remove an event handler for a + * given domain. + */ +ipipe_event_handler_t ipipe_catch_event(struct ipipe_domain *ipd, + unsigned event, + ipipe_event_handler_t handler) +{ + ipipe_event_handler_t old_handler; + unsigned long flags; + int self = 0, cpu; + + if (event & IPIPE_EVENT_SELF) { + event &= ~IPIPE_EVENT_SELF; + self = 1; + } + + if (event >= IPIPE_NR_EVENTS) + return NULL; + + flags = ipipe_critical_enter(NULL); + + if (!(old_handler = xchg(&ipd->evhand[event],handler))) { + if (handler) { + if (self) + ipd->evself |= (1LL << event); + else + __ipipe_event_monitors[event]++; + } + } + else if (!handler) { + if (ipd->evself & (1LL << event)) + ipd->evself &= ~(1LL << event); + else + __ipipe_event_monitors[event]--; + } else if ((ipd->evself & (1LL << event)) && !self) { + __ipipe_event_monitors[event]++; + ipd->evself &= ~(1LL << event); + } else if (!(ipd->evself & (1LL << event)) && self) { + __ipipe_event_monitors[event]--; + ipd->evself |= (1LL << event); + } + + ipipe_critical_exit(flags); + + if (!handler && ipipe_root_domain_p) { + /* + * If we cleared a handler on behalf of the root + * domain, we have to wait for any current invocation + * to drain, since our caller might subsequently unmap + * the target domain. To this aim, this code + * synchronizes with __ipipe_dispatch_event(), + * guaranteeing that either the dispatcher sees a null + * handler in which case it discards the invocation + * (which also prevents from entering a livelock), or + * finds a valid handler and calls it. Symmetrically, + * ipipe_catch_event() ensures that the called code + * won't be unmapped under our feet until the event + * synchronization flag is cleared for the given event + * on all CPUs. + */ + preempt_disable(); + cpu = smp_processor_id(); + /* + * Hack: this solves the potential migration issue + * raised in __ipipe_dispatch_event(). This is a + * work-around which makes the assumption that other + * CPUs will subsequently, either process at least one + * interrupt for the target domain, or call + * __ipipe_dispatch_event() without going through a + * migration while running the handler at least once; + * practically, this is safe on any normally running + * system. 
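
/*
 * Illustration (not part of the patch): installing and removing an event
 * handler with ipipe_catch_event().  The handler prototype is assumed from
 * the usual <linux/ipipe.h> declarations (it is not visible in this hunk),
 * and "event" stands for any valid event number below IPIPE_NR_EVENTS.
 */
static int example_event_handler(unsigned event, struct ipipe_domain *from,
				 void *data)
{
	/* Inspect the event here; the dispatcher interprets the result. */
	return 0;
}

static void example_toggle_handler(struct ipipe_domain *ipd, unsigned event)
{
	/* Install: a non-NULL handler enables monitoring of the event. */
	ipipe_catch_event(ipd, event, example_event_handler);

	/*
	 * OR-ing IPIPE_EVENT_SELF records the event in the domain's evself
	 * mask instead of bumping the global event monitor count, as
	 * handled above.
	 */
	ipipe_catch_event(ipd, event | IPIPE_EVENT_SELF, example_event_handler);

	/*
	 * Remove: a NULL handler, when issued from the root domain, waits
	 * for in-flight invocations to drain on every CPU (the evsync loop
	 * below) before returning the previous handler.
	 */
	ipipe_catch_event(ipd, event, NULL);
}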
+ */ + ipipe_percpudom(ipd, evsync, cpu) &= ~(1LL << event); + preempt_enable(); + + for_each_online_cpu(cpu) { + while (ipipe_percpudom(ipd, evsync, cpu) & (1LL << event)) + schedule_timeout_interruptible(HZ / 50); + } + } + + return old_handler; +} + +cpumask_t ipipe_set_irq_affinity (unsigned irq, cpumask_t cpumask) +{ +#ifdef CONFIG_SMP + if (irq >= NR_IRQS) // if (irq >= IPIPE_NR_XIRQS) + /* Allow changing affinity of external IRQs only. */ + return CPU_MASK_NONE; + + if (num_online_cpus() > 1) + return __ipipe_set_irq_affinity(irq,cpumask); +#endif /* CONFIG_SMP */ + + return CPU_MASK_NONE; +} + +int ipipe_send_ipi (unsigned ipi, cpumask_t cpumask) + +{ +#ifdef CONFIG_SMP + return __ipipe_send_ipi(ipi,cpumask); +#else /* !CONFIG_SMP */ + return -EINVAL; +#endif /* CONFIG_SMP */ +} + +int ipipe_alloc_ptdkey (void) +{ + unsigned long flags; + int key = -1; + + spin_lock_irqsave(&__ipipe_pipelock,flags); + + if (__ipipe_ptd_key_count < IPIPE_ROOT_NPTDKEYS) { + key = ffz(__ipipe_ptd_key_map); + set_bit(key,&__ipipe_ptd_key_map); + __ipipe_ptd_key_count++; + } + + spin_unlock_irqrestore(&__ipipe_pipelock,flags); + + return key; +} + +int ipipe_free_ptdkey (int key) +{ + unsigned long flags; + + if (key < 0 || key >= IPIPE_ROOT_NPTDKEYS) + return -EINVAL; + + spin_lock_irqsave(&__ipipe_pipelock,flags); + + if (test_and_clear_bit(key,&__ipipe_ptd_key_map)) + __ipipe_ptd_key_count--; + + spin_unlock_irqrestore(&__ipipe_pipelock,flags); + + return 0; +} + +int ipipe_set_ptd (int key, void *value) + +{ + if (key < 0 || key >= IPIPE_ROOT_NPTDKEYS) + return -EINVAL; + + current->ptd[key] = value; + + return 0; +} + +void *ipipe_get_ptd (int key) + +{ + if (key < 0 || key >= IPIPE_ROOT_NPTDKEYS) + return NULL; + + return current->ptd[key]; +} + +#ifdef CONFIG_PROC_FS + +struct proc_dir_entry *ipipe_proc_root; + +static int __ipipe_version_info_proc(char *page, + char **start, + off_t off, int count, int *eof, void *data) +{ + int len = sprintf(page, "%s\n", IPIPE_VERSION_STRING); + + len -= off; + + if (len <= off + count) + *eof = 1; + + *start = page + off; + + if(len > count) + len = count; + + if(len < 0) + len = 0; + + return len; +} + +static int __ipipe_common_info_show(struct seq_file *p, void *data) +{ + struct ipipe_domain *ipd = (struct ipipe_domain *)p->private; + char handling, stickiness, lockbit, exclusive, virtuality; + + unsigned long ctlbits; + unsigned irq; + + seq_printf(p, " +----- Handling ([A]ccepted, [G]rabbed, [W]ired, [D]iscarded)\n"); + seq_printf(p, " |+---- Sticky\n"); + seq_printf(p, " ||+--- Locked\n"); + seq_printf(p, " |||+-- Exclusive\n"); + seq_printf(p, " ||||+- Virtual\n"); + seq_printf(p, "[IRQ] |||||\n"); + + mutex_lock(&ipd->mutex); + + for (irq = 0; irq < IPIPE_NR_IRQS; irq++) { + /* Remember to protect against + * ipipe_virtual_irq/ipipe_control_irq if more fields + * get involved. */ + ctlbits = ipd->irqs[irq].control; + + if (irq >= IPIPE_NR_XIRQS && !ipipe_virtual_irq_p(irq)) + /* + * There might be a hole between the last external + * IRQ and the first virtual one; skip it. + */ + continue; + + if (ipipe_virtual_irq_p(irq) + && !test_bit(irq - IPIPE_VIRQ_BASE, &__ipipe_virtual_irq_map)) + /* Non-allocated virtual IRQ; skip it. */ + continue; + + /* + * Statuses are as follows: + * o "accepted" means handled _and_ passed down the pipeline. + * o "grabbed" means handled, but the interrupt might be + * terminated _or_ passed down the pipeline depending on + * what the domain handler asks for to the I-pipe. 
+ * o "wired" is basically the same as "grabbed", except that + * the interrupt is unconditionally delivered to an invariant + * pipeline head domain. + * o "passed" means unhandled by the domain but passed + * down the pipeline. + * o "discarded" means unhandled and _not_ passed down the + * pipeline. The interrupt merely disappears from the + * current domain down to the end of the pipeline. + */ + if (ctlbits & IPIPE_HANDLE_MASK) { + if (ctlbits & IPIPE_PASS_MASK) + handling = 'A'; + else if (ctlbits & IPIPE_WIRED_MASK) + handling = 'W'; + else + handling = 'G'; + } else if (ctlbits & IPIPE_PASS_MASK) + /* Do not output if no major action is taken. */ + continue; + else + handling = 'D'; + + if (ctlbits & IPIPE_STICKY_MASK) + stickiness = 'S'; + else + stickiness = '.'; + + if (ctlbits & IPIPE_LOCK_MASK) + lockbit = 'L'; + else + lockbit = '.'; + + if (ctlbits & IPIPE_EXCLUSIVE_MASK) + exclusive = 'X'; + else + exclusive = '.'; + + if (ipipe_virtual_irq_p(irq)) + virtuality = 'V'; + else + virtuality = '.'; + + seq_printf(p, " %3u: %c%c%c%c%c\n", + irq, handling, stickiness, lockbit, exclusive, virtuality); + } + + seq_printf(p, "[Domain info]\n"); + + seq_printf(p, "id=0x%.8x\n", ipd->domid); + + if (test_bit(IPIPE_AHEAD_FLAG,&ipd->flags)) + seq_printf(p, "priority=topmost\n"); + else + seq_printf(p, "priority=%d\n", ipd->priority); + + mutex_unlock(&ipd->mutex); + + return 0; +} + +static int __ipipe_common_info_open(struct inode *inode, struct file *file) +{ + return single_open(file, __ipipe_common_info_show, PROC_I(inode)->pde->data); +} + +static struct file_operations __ipipe_info_proc_ops = { + .owner = THIS_MODULE, + .open = __ipipe_common_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void __ipipe_add_domain_proc(struct ipipe_domain *ipd) +{ + struct proc_dir_entry *e = create_proc_entry(ipd->name, 0444, ipipe_proc_root); + if (e) { + e->proc_fops = &__ipipe_info_proc_ops; + e->data = (void*) ipd; + } +} + +void __ipipe_remove_domain_proc(struct ipipe_domain *ipd) +{ + remove_proc_entry(ipd->name,ipipe_proc_root); +} + +void __init ipipe_init_proc(void) +{ + ipipe_proc_root = create_proc_entry("ipipe",S_IFDIR, 0); + create_proc_read_entry("version",0444,ipipe_proc_root,&__ipipe_version_info_proc,NULL); + __ipipe_add_domain_proc(ipipe_root_domain); + + __ipipe_init_tracer(); +} + +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT + +DEFINE_PER_CPU(int, ipipe_percpu_context_check) = { 1 }; +DEFINE_PER_CPU(int, ipipe_saved_context_check_state); + +void ipipe_check_context(struct ipipe_domain *border_domain) +{ + struct ipipe_percpu_domain_data *p; + struct ipipe_domain *this_domain; + unsigned long flags; + int cpu; + + local_irq_save_hw_smp(flags); + + this_domain = __ipipe_current_domain; + p = ipipe_head_cpudom_ptr(); + if (likely(this_domain->priority <= border_domain->priority && + !test_bit(IPIPE_STALL_FLAG, &p->status))) { + local_irq_restore_hw_smp(flags); + return; + } + + cpu = ipipe_processor_id(); + if (!per_cpu(ipipe_percpu_context_check, cpu)) { + local_irq_restore_hw_smp(flags); + return; + } + + local_irq_restore_hw_smp(flags); + + ipipe_context_check_off(); + ipipe_trace_panic_freeze(); + ipipe_set_printk_sync(__ipipe_current_domain); + + if (this_domain->priority > border_domain->priority) + printk(KERN_ERR "I-pipe: Detected illicit call from domain " + "'%s'\n" + KERN_ERR " into a service reserved for domain " + "'%s' and below.\n", + this_domain->name, border_domain->name); + else + printk(KERN_ERR 
"I-pipe: Detected stalled topmost domain, " + "probably caused by a bug.\n" + " A critical section may have been " + "left unterminated.\n"); + dump_stack(); + ipipe_trace_panic_dump(); +} + +EXPORT_SYMBOL(ipipe_check_context); + +#endif /* CONFIG_IPIPE_DEBUG_CONTEXT */ + +#if defined(CONFIG_IPIPE_DEBUG_INTERNAL) && defined(CONFIG_SMP) + +int notrace __ipipe_check_percpu_access(void) +{ + struct ipipe_percpu_domain_data *p; + struct ipipe_domain *this_domain; + unsigned long flags; + int ret = 0; + + local_irq_save_hw_notrace(flags); + + this_domain = __raw_get_cpu_var(ipipe_percpu_domain); + + /* + * Only the root domain may implement preemptive CPU migration + * of tasks, so anything above in the pipeline should be fine. + */ + if (this_domain->priority > IPIPE_ROOT_PRIO) + goto out; + + if (raw_irqs_disabled_flags(flags)) + goto out; + + /* + * Last chance: hw interrupts were enabled on entry while + * running over the root domain, but the root stage might be + * currently stalled, in which case preemption would be + * disabled, and no migration could occur. + */ + if (this_domain == ipipe_root_domain) { + p = ipipe_root_cpudom_ptr(); + if (test_bit(IPIPE_STALL_FLAG, &p->status)) + goto out; + } + /* + * Our caller may end up accessing the wrong per-cpu variable + * instance due to CPU migration; tell it to complain about + * this. + */ + ret = 1; +out: + local_irq_restore_hw_notrace(flags); + + return ret; +} + +#endif /* CONFIG_IPIPE_DEBUG_INTERNAL && CONFIG_SMP */ + +EXPORT_SYMBOL(ipipe_virtualize_irq); +EXPORT_SYMBOL(ipipe_control_irq); +EXPORT_SYMBOL(ipipe_suspend_domain); +EXPORT_SYMBOL(ipipe_alloc_virq); +EXPORT_PER_CPU_SYMBOL(ipipe_percpu_domain); +EXPORT_PER_CPU_SYMBOL(ipipe_percpu_darray); +EXPORT_SYMBOL(ipipe_root); +EXPORT_SYMBOL(ipipe_stall_pipeline_from); +EXPORT_SYMBOL(ipipe_test_and_stall_pipeline_from); +EXPORT_SYMBOL(ipipe_test_and_unstall_pipeline_from); +EXPORT_SYMBOL(ipipe_restore_pipeline_from); +EXPORT_SYMBOL(ipipe_unstall_pipeline_head); +EXPORT_SYMBOL(__ipipe_restore_pipeline_head); +EXPORT_SYMBOL(__ipipe_unstall_root); +EXPORT_SYMBOL(__ipipe_restore_root); +EXPORT_SYMBOL(__ipipe_spin_lock_irq); +EXPORT_SYMBOL(__ipipe_spin_unlock_irq); +EXPORT_SYMBOL(__ipipe_spin_lock_irqsave); +EXPORT_SYMBOL(__ipipe_spin_unlock_irqrestore); +EXPORT_SYMBOL(__ipipe_pipeline); +EXPORT_SYMBOL(__ipipe_lock_irq); +EXPORT_SYMBOL(__ipipe_unlock_irq); +EXPORT_SYMBOL(ipipe_register_domain); +EXPORT_SYMBOL(ipipe_unregister_domain); +EXPORT_SYMBOL(ipipe_free_virq); +EXPORT_SYMBOL(ipipe_init_attr); +EXPORT_SYMBOL(ipipe_catch_event); +EXPORT_SYMBOL(ipipe_alloc_ptdkey); +EXPORT_SYMBOL(ipipe_free_ptdkey); +EXPORT_SYMBOL(ipipe_set_ptd); +EXPORT_SYMBOL(ipipe_get_ptd); +EXPORT_SYMBOL(ipipe_set_irq_affinity); +EXPORT_SYMBOL(ipipe_send_ipi); +EXPORT_SYMBOL(__ipipe_pend_irq); +EXPORT_SYMBOL(__ipipe_set_irq_pending); +#if defined(CONFIG_IPIPE_DEBUG_INTERNAL) && defined(CONFIG_SMP) +EXPORT_SYMBOL(__ipipe_check_percpu_access); +#endif +#ifdef CONFIG_GENERIC_CLOCKEVENTS +EXPORT_SYMBOL(ipipe_request_tickdev); +EXPORT_SYMBOL(ipipe_release_tickdev); +#endif + +EXPORT_SYMBOL(ipipe_critical_enter); +EXPORT_SYMBOL(ipipe_critical_exit); +EXPORT_SYMBOL(ipipe_trigger_irq); +EXPORT_SYMBOL(ipipe_get_sysinfo); diff --git a/kernel/ipipe/tracer.c b/kernel/ipipe/tracer.c new file mode 100644 index 0000000..d3c1866 --- /dev/null +++ b/kernel/ipipe/tracer.c @@ -0,0 +1,1441 @@ +/* -*- linux-c -*- + * kernel/ipipe/tracer.c + * + * Copyright (C) 2005 Luotao Fu. + * 2005-2008 Jan Kiszka. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPIPE_TRACE_PATHS 4 /* Do not lower below 3 */ +#define IPIPE_DEFAULT_ACTIVE 0 +#define IPIPE_DEFAULT_MAX 1 +#define IPIPE_DEFAULT_FROZEN 2 + +#define IPIPE_TRACE_POINTS (1 << CONFIG_IPIPE_TRACE_SHIFT) +#define WRAP_POINT_NO(point) ((point) & (IPIPE_TRACE_POINTS-1)) + +#define IPIPE_DEFAULT_PRE_TRACE 10 +#define IPIPE_DEFAULT_POST_TRACE 10 +#define IPIPE_DEFAULT_BACK_TRACE 100 + +#define IPIPE_DELAY_NOTE 1000 /* in nanoseconds */ +#define IPIPE_DELAY_WARN 10000 /* in nanoseconds */ + +#define IPIPE_TFLG_NMI_LOCK 0x0001 +#define IPIPE_TFLG_NMI_HIT 0x0002 +#define IPIPE_TFLG_NMI_FREEZE_REQ 0x0004 + +#define IPIPE_TFLG_HWIRQ_OFF 0x0100 +#define IPIPE_TFLG_FREEZING 0x0200 +#define IPIPE_TFLG_CURRDOM_SHIFT 10 /* bits 10..11: current domain */ +#define IPIPE_TFLG_CURRDOM_MASK 0x0C00 +#define IPIPE_TFLG_DOMSTATE_SHIFT 12 /* bits 12..15: domain stalled? */ +#define IPIPE_TFLG_DOMSTATE_BITS 3 + +#define IPIPE_TFLG_DOMAIN_STALLED(point, n) \ + (point->flags & (1 << (n + IPIPE_TFLG_DOMSTATE_SHIFT))) +#define IPIPE_TFLG_CURRENT_DOMAIN(point) \ + ((point->flags & IPIPE_TFLG_CURRDOM_MASK) >> IPIPE_TFLG_CURRDOM_SHIFT) + +struct ipipe_trace_point { + short type; + short flags; + unsigned long eip; + unsigned long parent_eip; + unsigned long v; + unsigned long long timestamp; +}; + +struct ipipe_trace_path { + volatile int flags; + int dump_lock; /* separated from flags due to cross-cpu access */ + int trace_pos; /* next point to fill */ + int begin, end; /* finalised path begin and end */ + int post_trace; /* non-zero when in post-trace phase */ + unsigned long long length; /* max path length in cycles */ + unsigned long nmi_saved_eip; /* for deferred requests from NMIs */ + unsigned long nmi_saved_parent_eip; + unsigned long nmi_saved_v; + struct ipipe_trace_point point[IPIPE_TRACE_POINTS]; +} ____cacheline_aligned_in_smp; + +enum ipipe_trace_type +{ + IPIPE_TRACE_FUNC = 0, + IPIPE_TRACE_BEGIN, + IPIPE_TRACE_END, + IPIPE_TRACE_FREEZE, + IPIPE_TRACE_SPECIAL, + IPIPE_TRACE_PID, + IPIPE_TRACE_EVENT, +}; + +#define IPIPE_TYPE_MASK 0x0007 +#define IPIPE_TYPE_BITS 3 + +#ifdef CONFIG_IPIPE_TRACE_VMALLOC +static DEFINE_PER_CPU(struct ipipe_trace_path *, trace_path); +#else /* !CONFIG_IPIPE_TRACE_VMALLOC */ +static DEFINE_PER_CPU(struct ipipe_trace_path, trace_path[IPIPE_TRACE_PATHS]) = + { [0 ... 
IPIPE_TRACE_PATHS-1] = { .begin = -1, .end = -1 } }; +#endif /* CONFIG_IPIPE_TRACE_VMALLOC */ + +int ipipe_trace_enable = 0; + +static DEFINE_PER_CPU(int, active_path) = { IPIPE_DEFAULT_ACTIVE }; +static DEFINE_PER_CPU(int, max_path) = { IPIPE_DEFAULT_MAX }; +static DEFINE_PER_CPU(int, frozen_path) = { IPIPE_DEFAULT_FROZEN }; +static IPIPE_DEFINE_SPINLOCK(global_path_lock); +static int pre_trace = IPIPE_DEFAULT_PRE_TRACE; +static int post_trace = IPIPE_DEFAULT_POST_TRACE; +static int back_trace = IPIPE_DEFAULT_BACK_TRACE; +static int verbose_trace = 1; +static unsigned long trace_overhead; + +static unsigned long trigger_begin; +static unsigned long trigger_end; + +static DEFINE_MUTEX(out_mutex); +static struct ipipe_trace_path *print_path; +#ifdef CONFIG_IPIPE_TRACE_PANIC +static struct ipipe_trace_path *panic_path; +#endif /* CONFIG_IPIPE_TRACE_PANIC */ +static int print_pre_trace; +static int print_post_trace; + + +static long __ipipe_signed_tsc2us(long long tsc); +static void +__ipipe_trace_point_type(char *buf, struct ipipe_trace_point *point); +static void __ipipe_print_symname(struct seq_file *m, unsigned long eip); + + +static notrace void +__ipipe_store_domain_states(struct ipipe_trace_point *point) +{ + struct ipipe_domain *ipd; + struct list_head *pos; + int i = 0; + + list_for_each_prev(pos, &__ipipe_pipeline) { + ipd = list_entry(pos, struct ipipe_domain, p_link); + + if (test_bit(IPIPE_STALL_FLAG, &ipipe_cpudom_var(ipd, status))) + point->flags |= 1 << (i + IPIPE_TFLG_DOMSTATE_SHIFT); + + if (ipd == __ipipe_current_domain) + point->flags |= i << IPIPE_TFLG_CURRDOM_SHIFT; + + if (++i > IPIPE_TFLG_DOMSTATE_BITS) + break; + } +} + +static notrace int __ipipe_get_free_trace_path(int old, int cpu) +{ + int new_active = old; + struct ipipe_trace_path *tp; + + do { + if (++new_active == IPIPE_TRACE_PATHS) + new_active = 0; + tp = &per_cpu(trace_path, cpu)[new_active]; + } while (new_active == per_cpu(max_path, cpu) || + new_active == per_cpu(frozen_path, cpu) || + tp->dump_lock); + + return new_active; +} + +static notrace void +__ipipe_migrate_pre_trace(struct ipipe_trace_path *new_tp, + struct ipipe_trace_path *old_tp, int old_pos) +{ + int i; + + new_tp->trace_pos = pre_trace+1; + + for (i = new_tp->trace_pos; i > 0; i--) + memcpy(&new_tp->point[WRAP_POINT_NO(new_tp->trace_pos-i)], + &old_tp->point[WRAP_POINT_NO(old_pos-i)], + sizeof(struct ipipe_trace_point)); + + /* mark the end (i.e. the point before point[0]) invalid */ + new_tp->point[IPIPE_TRACE_POINTS-1].eip = 0; +} + +static notrace struct ipipe_trace_path * +__ipipe_trace_end(int cpu, struct ipipe_trace_path *tp, int pos) +{ + struct ipipe_trace_path *old_tp = tp; + long active = per_cpu(active_path, cpu); + unsigned long long length; + + /* do we have a new worst case? 
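
/*
 * Illustration (not part of the patch): WRAP_POINT_NO() above works because
 * IPIPE_TRACE_POINTS is a power of two, so the bitmask is equivalent to a
 * modulo and also wraps negative offsets (e.g. trace_pos - 1 when trace_pos
 * is 0) back to the last slot of the ring.  The constants below are
 * stand-ins for the real configuration.
 */
#define EXAMPLE_TRACE_SHIFT	10
#define EXAMPLE_TRACE_POINTS	(1 << EXAMPLE_TRACE_SHIFT)
#define EXAMPLE_WRAP(point)	((point) & (EXAMPLE_TRACE_POINTS - 1))

static inline int example_next_slot(int pos)
{
	return EXAMPLE_WRAP(pos + 1);	/* 1023 + 1 wraps to 0 */
}

static inline int example_prev_slot(int pos)
{
	return EXAMPLE_WRAP(pos - 1);	/* 0 - 1 wraps to 1023 */
}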
*/ + length = tp->point[tp->end].timestamp - + tp->point[tp->begin].timestamp; + if (length > per_cpu(trace_path, cpu)[per_cpu(max_path, cpu)].length) { + /* we need protection here against other cpus trying + to start a proc dump */ + spin_lock(&global_path_lock); + + /* active path holds new worst case */ + tp->length = length; + per_cpu(max_path, cpu) = active; + + /* find next unused trace path */ + active = __ipipe_get_free_trace_path(active, cpu); + + spin_unlock(&global_path_lock); + + tp = &per_cpu(trace_path, cpu)[active]; + + /* migrate last entries for pre-tracing */ + __ipipe_migrate_pre_trace(tp, old_tp, pos); + } + + return tp; +} + +static notrace struct ipipe_trace_path * +__ipipe_trace_freeze(int cpu, struct ipipe_trace_path *tp, int pos) +{ + struct ipipe_trace_path *old_tp = tp; + long active = per_cpu(active_path, cpu); + int n; + + /* frozen paths have no core (begin=end) */ + tp->begin = tp->end; + + /* we need protection here against other cpus trying + * to set their frozen path or to start a proc dump */ + spin_lock(&global_path_lock); + + per_cpu(frozen_path, cpu) = active; + + /* find next unused trace path */ + active = __ipipe_get_free_trace_path(active, cpu); + + /* check if this is the first frozen path */ + for_each_possible_cpu(n) { + if (n != cpu && + per_cpu(trace_path, n)[per_cpu(frozen_path, n)].end >= 0) + tp->end = -1; + } + + spin_unlock(&global_path_lock); + + tp = &per_cpu(trace_path, cpu)[active]; + + /* migrate last entries for pre-tracing */ + __ipipe_migrate_pre_trace(tp, old_tp, pos); + + return tp; +} + +void notrace +__ipipe_trace(enum ipipe_trace_type type, unsigned long eip, + unsigned long parent_eip, unsigned long v) +{ + struct ipipe_trace_path *tp, *old_tp; + int pos, next_pos, begin; + struct ipipe_trace_point *point; + unsigned long flags; + int cpu; + + local_irq_save_hw_notrace(flags); + + cpu = ipipe_processor_id(); + restart: + tp = old_tp = &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)]; + + /* here starts a race window with NMIs - catched below */ + + /* check for NMI recursion */ + if (unlikely(tp->flags & IPIPE_TFLG_NMI_LOCK)) { + tp->flags |= IPIPE_TFLG_NMI_HIT; + + /* first freeze request from NMI context? */ + if ((type == IPIPE_TRACE_FREEZE) && + !(tp->flags & IPIPE_TFLG_NMI_FREEZE_REQ)) { + /* save arguments and mark deferred freezing */ + tp->flags |= IPIPE_TFLG_NMI_FREEZE_REQ; + tp->nmi_saved_eip = eip; + tp->nmi_saved_parent_eip = parent_eip; + tp->nmi_saved_v = v; + } + return; /* no need for restoring flags inside IRQ */ + } + + /* clear NMI events and set lock (atomically per cpu) */ + tp->flags = (tp->flags & ~(IPIPE_TFLG_NMI_HIT | + IPIPE_TFLG_NMI_FREEZE_REQ)) + | IPIPE_TFLG_NMI_LOCK; + + /* check active_path again - some nasty NMI may have switched + * it meanwhile */ + if (unlikely(tp != + &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)])) { + /* release lock on wrong path and restart */ + tp->flags &= ~IPIPE_TFLG_NMI_LOCK; + + /* there is no chance that the NMI got deferred + * => no need to check for pending freeze requests */ + goto restart; + } + + /* get the point buffer */ + pos = tp->trace_pos; + point = &tp->point[pos]; + + /* store all trace point data */ + point->type = type; + point->flags = raw_irqs_disabled_flags(flags) ? 
IPIPE_TFLG_HWIRQ_OFF : 0; + point->eip = eip; + point->parent_eip = parent_eip; + point->v = v; + ipipe_read_tsc(point->timestamp); + + __ipipe_store_domain_states(point); + + /* forward to next point buffer */ + next_pos = WRAP_POINT_NO(pos+1); + tp->trace_pos = next_pos; + + /* only mark beginning if we haven't started yet */ + begin = tp->begin; + if (unlikely(type == IPIPE_TRACE_BEGIN) && (begin < 0)) + tp->begin = pos; + + /* end of critical path, start post-trace if not already started */ + if (unlikely(type == IPIPE_TRACE_END) && + (begin >= 0) && !tp->post_trace) + tp->post_trace = post_trace + 1; + + /* freeze only if the slot is free and we are not already freezing */ + if ((unlikely(type == IPIPE_TRACE_FREEZE) || + (unlikely(eip >= trigger_begin && eip <= trigger_end) && + type == IPIPE_TRACE_FUNC)) && + per_cpu(trace_path, cpu)[per_cpu(frozen_path, cpu)].begin < 0 && + !(tp->flags & IPIPE_TFLG_FREEZING)) { + tp->post_trace = post_trace + 1; + tp->flags |= IPIPE_TFLG_FREEZING; + } + + /* enforce end of trace in case of overflow */ + if (unlikely(WRAP_POINT_NO(next_pos + 1) == begin)) { + tp->end = pos; + goto enforce_end; + } + + /* stop tracing this path if we are in post-trace and + * a) that phase is over now or + * b) a new TRACE_BEGIN came in but we are not freezing this path */ + if (unlikely((tp->post_trace > 0) && ((--tp->post_trace == 0) || + ((type == IPIPE_TRACE_BEGIN) && + !(tp->flags & IPIPE_TFLG_FREEZING))))) { + /* store the path's end (i.e. excluding post-trace) */ + tp->end = WRAP_POINT_NO(pos - post_trace + tp->post_trace); + + enforce_end: + if (tp->flags & IPIPE_TFLG_FREEZING) + tp = __ipipe_trace_freeze(cpu, tp, pos); + else + tp = __ipipe_trace_end(cpu, tp, pos); + + /* reset the active path, maybe already start a new one */ + tp->begin = (type == IPIPE_TRACE_BEGIN) ? 
+		WRAP_POINT_NO(tp->trace_pos - 1) : -1;
+		tp->end = -1;
+		tp->post_trace = 0;
+		tp->flags = 0;
+
+		/* update active_path not earlier to avoid races with NMIs */
+		per_cpu(active_path, cpu) = tp - per_cpu(trace_path, cpu);
+	}
+
+	/* we still have old_tp and point,
+	 * let's reset NMI lock and check for catches */
+	old_tp->flags &= ~IPIPE_TFLG_NMI_LOCK;
+	if (unlikely(old_tp->flags & IPIPE_TFLG_NMI_HIT)) {
+		/* well, this late tagging may not immediately be visible to
+		 * other cpus already dumping this path - a minor issue */
+		point->flags |= IPIPE_TFLG_NMI_HIT;
+
+		/* handle deferred freezing from NMI context */
+		if (old_tp->flags & IPIPE_TFLG_NMI_FREEZE_REQ)
+			__ipipe_trace(IPIPE_TRACE_FREEZE, old_tp->nmi_saved_eip,
+				      old_tp->nmi_saved_parent_eip,
+				      old_tp->nmi_saved_v);
+	}
+
+	local_irq_restore_hw_notrace(flags);
+}
+
+static unsigned long __ipipe_global_path_lock(void)
+{
+	unsigned long flags;
+	int cpu;
+	struct ipipe_trace_path *tp;
+
+	spin_lock_irqsave(&global_path_lock, flags);
+
+	cpu = ipipe_processor_id();
+ restart:
+	tp = &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)];
+
+	/* here is a small race window with NMIs - caught below */
+
+	/* clear NMI events and set lock (atomically per cpu) */
+	tp->flags = (tp->flags & ~(IPIPE_TFLG_NMI_HIT |
+				   IPIPE_TFLG_NMI_FREEZE_REQ))
+			       | IPIPE_TFLG_NMI_LOCK;
+
+	/* check active_path again - some nasty NMI may have switched
+	 * it meanwhile */
+	if (tp != &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)]) {
+		/* release lock on wrong path and restart */
+		tp->flags &= ~IPIPE_TFLG_NMI_LOCK;
+
+		/* there is no chance that the NMI got deferred
+		 * => no need to check for pending freeze requests */
+		goto restart;
+	}
+
+	return flags;
+}
+
+static void __ipipe_global_path_unlock(unsigned long flags)
+{
+	int cpu;
+	struct ipipe_trace_path *tp;
+
+	/* release the spinlock first - it's not involved in the NMI issue */
+	__ipipe_spin_unlock_irqbegin(&global_path_lock);
+
+	cpu = ipipe_processor_id();
+	tp = &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)];
+
+	tp->flags &= ~IPIPE_TFLG_NMI_LOCK;
+
+	/* handle deferred freezing from NMI context */
+	if (tp->flags & IPIPE_TFLG_NMI_FREEZE_REQ)
+		__ipipe_trace(IPIPE_TRACE_FREEZE, tp->nmi_saved_eip,
+			      tp->nmi_saved_parent_eip, tp->nmi_saved_v);
+
+	/* See __ipipe_spin_lock_irqsave() and friends.
*/ + __ipipe_spin_unlock_irqcomplete(flags); +} + +void notrace ipipe_trace_begin(unsigned long v) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_BEGIN, __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, v); +} +EXPORT_SYMBOL(ipipe_trace_begin); + +void notrace ipipe_trace_end(unsigned long v) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_END, __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, v); +} +EXPORT_SYMBOL(ipipe_trace_end); + +void notrace ipipe_trace_freeze(unsigned long v) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_FREEZE, __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, v); +} +EXPORT_SYMBOL(ipipe_trace_freeze); + +void notrace ipipe_trace_special(unsigned char id, unsigned long v) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_SPECIAL | (id << IPIPE_TYPE_BITS), + __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, v); +} +EXPORT_SYMBOL(ipipe_trace_special); + +void notrace ipipe_trace_pid(pid_t pid, short prio) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_PID | (prio << IPIPE_TYPE_BITS), + __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, pid); +} +EXPORT_SYMBOL(ipipe_trace_pid); + +void notrace ipipe_trace_event(unsigned char id, unsigned long delay_tsc) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_EVENT | (id << IPIPE_TYPE_BITS), + __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, delay_tsc); +} +EXPORT_SYMBOL(ipipe_trace_event); + +int ipipe_trace_max_reset(void) +{ + int cpu; + unsigned long flags; + struct ipipe_trace_path *path; + int ret = 0; + + flags = __ipipe_global_path_lock(); + + for_each_possible_cpu(cpu) { + path = &per_cpu(trace_path, cpu)[per_cpu(max_path, cpu)]; + + if (path->dump_lock) { + ret = -EBUSY; + break; + } + + path->begin = -1; + path->end = -1; + path->trace_pos = 0; + path->length = 0; + } + + __ipipe_global_path_unlock(flags); + + return ret; +} +EXPORT_SYMBOL(ipipe_trace_max_reset); + +int ipipe_trace_frozen_reset(void) +{ + int cpu; + unsigned long flags; + struct ipipe_trace_path *path; + int ret = 0; + + flags = __ipipe_global_path_lock(); + + for_each_online_cpu(cpu) { + path = &per_cpu(trace_path, cpu)[per_cpu(frozen_path, cpu)]; + + if (path->dump_lock) { + ret = -EBUSY; + break; + } + + path->begin = -1; + path->end = -1; + path->trace_pos = 0; + path->length = 0; + } + + __ipipe_global_path_unlock(flags); + + return ret; +} +EXPORT_SYMBOL(ipipe_trace_frozen_reset); + +static void +__ipipe_get_task_info(char *task_info, struct ipipe_trace_point *point, + int trylock) +{ + struct task_struct *task = NULL; + char buf[8]; + int i; + int locked = 1; + + if (trylock) { + if (!read_trylock(&tasklist_lock)) + locked = 0; + } else + read_lock(&tasklist_lock); + + if (locked) + task = find_task_by_pid_ns((pid_t)point->v, &init_pid_ns); + + if (task) + strncpy(task_info, task->comm, 11); + else + strcpy(task_info, "--"); + + if (locked) + read_unlock(&tasklist_lock); + + for (i = strlen(task_info); i < 11; i++) + task_info[i] = ' '; + + sprintf(buf, " %d ", point->type >> IPIPE_TYPE_BITS); + strcpy(task_info + (11 - strlen(buf)), buf); +} + +static void +__ipipe_get_event_date(char *buf,struct ipipe_trace_path *path, + struct ipipe_trace_point *point) +{ + long time; + int type; + + time = __ipipe_signed_tsc2us(point->timestamp - + path->point[path->begin].timestamp + point->v); + type = point->type >> IPIPE_TYPE_BITS; + + if (type == 0) + /* + * Event type #0 is predefined, stands 
for the next + * timer tick. + */ + sprintf(buf, "tick@%-6ld", time); + else + sprintf(buf, "%3d@%-7ld", type, time); +} + +#ifdef CONFIG_IPIPE_TRACE_PANIC +void ipipe_trace_panic_freeze(void) +{ + unsigned long flags; + int cpu; + + if (!ipipe_trace_enable) + return; + + ipipe_trace_enable = 0; + local_irq_save_hw_notrace(flags); + + cpu = ipipe_processor_id(); + + panic_path = &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)]; + + local_irq_restore_hw(flags); +} +EXPORT_SYMBOL(ipipe_trace_panic_freeze); + +void ipipe_trace_panic_dump(void) +{ + int cnt = back_trace; + int start, pos; + char buf[16]; + + if (!panic_path) + return; + + ipipe_context_check_off(); + + printk("I-pipe tracer log (%d points):\n", cnt); + + start = pos = WRAP_POINT_NO(panic_path->trace_pos-1); + + while (cnt-- > 0) { + struct ipipe_trace_point *point = &panic_path->point[pos]; + long time; + char info[16]; + int i; + + printk(" %c", + (point->flags & IPIPE_TFLG_HWIRQ_OFF) ? '|' : ' '); + + for (i = IPIPE_TFLG_DOMSTATE_BITS; i >= 0; i--) + printk("%c", + (IPIPE_TFLG_CURRENT_DOMAIN(point) == i) ? + (IPIPE_TFLG_DOMAIN_STALLED(point, i) ? + '#' : '+') : + (IPIPE_TFLG_DOMAIN_STALLED(point, i) ? + '*' : ' ')); + + if (!point->eip) + printk("--\n"); + else { + __ipipe_trace_point_type(buf, point); + printk("%s", buf); + + switch (point->type & IPIPE_TYPE_MASK) { + case IPIPE_TRACE_FUNC: + printk(" "); + break; + + case IPIPE_TRACE_PID: + __ipipe_get_task_info(info, + point, 1); + printk("%s", info); + break; + + case IPIPE_TRACE_EVENT: + __ipipe_get_event_date(info, + panic_path, point); + printk("%s", info); + break; + + default: + printk("0x%08lx ", point->v); + } + + time = __ipipe_signed_tsc2us(point->timestamp - + panic_path->point[start].timestamp); + printk(" %5ld ", time); + + __ipipe_print_symname(NULL, point->eip); + printk(" ("); + __ipipe_print_symname(NULL, point->parent_eip); + printk(")\n"); + } + pos = WRAP_POINT_NO(pos - 1); + } + + panic_path = NULL; +} +EXPORT_SYMBOL(ipipe_trace_panic_dump); +#endif /* CONFIG_IPIPE_TRACE_PANIC */ + + +/* --- /proc output --- */ + +static notrace int __ipipe_in_critical_trpath(long point_no) +{ + return ((WRAP_POINT_NO(point_no-print_path->begin) < + WRAP_POINT_NO(print_path->end-print_path->begin)) || + ((print_path->end == print_path->begin) && + (WRAP_POINT_NO(point_no-print_path->end) > + print_post_trace))); +} + +static long __ipipe_signed_tsc2us(long long tsc) +{ + unsigned long long abs_tsc; + long us; + + /* ipipe_tsc2us works on unsigned => handle sign separately */ + abs_tsc = (tsc >= 0) ? 
tsc : -tsc; + us = ipipe_tsc2us(abs_tsc); + if (tsc < 0) + return -us; + else + return us; +} + +static void +__ipipe_trace_point_type(char *buf, struct ipipe_trace_point *point) +{ + switch (point->type & IPIPE_TYPE_MASK) { + case IPIPE_TRACE_FUNC: + strcpy(buf, "func "); + break; + + case IPIPE_TRACE_BEGIN: + strcpy(buf, "begin "); + break; + + case IPIPE_TRACE_END: + strcpy(buf, "end "); + break; + + case IPIPE_TRACE_FREEZE: + strcpy(buf, "freeze "); + break; + + case IPIPE_TRACE_SPECIAL: + sprintf(buf, "(0x%02x) ", + point->type >> IPIPE_TYPE_BITS); + break; + + case IPIPE_TRACE_PID: + sprintf(buf, "[%5d] ", (pid_t)point->v); + break; + + case IPIPE_TRACE_EVENT: + sprintf(buf, "event "); + break; + } +} + +static void +__ipipe_print_pathmark(struct seq_file *m, struct ipipe_trace_point *point) +{ + char mark = ' '; + int point_no = point - print_path->point; + int i; + + if (print_path->end == point_no) + mark = '<'; + else if (print_path->begin == point_no) + mark = '>'; + else if (__ipipe_in_critical_trpath(point_no)) + mark = ':'; + seq_printf(m, "%c%c", mark, + (point->flags & IPIPE_TFLG_HWIRQ_OFF) ? '|' : ' '); + + if (!verbose_trace) + return; + + for (i = IPIPE_TFLG_DOMSTATE_BITS; i >= 0; i--) + seq_printf(m, "%c", + (IPIPE_TFLG_CURRENT_DOMAIN(point) == i) ? + (IPIPE_TFLG_DOMAIN_STALLED(point, i) ? + '#' : '+') : + (IPIPE_TFLG_DOMAIN_STALLED(point, i) ? '*' : ' ')); +} + +static void +__ipipe_print_delay(struct seq_file *m, struct ipipe_trace_point *point) +{ + unsigned long delay = 0; + int next; + char *mark = " "; + + next = WRAP_POINT_NO(point+1 - print_path->point); + + if (next != print_path->trace_pos) + delay = ipipe_tsc2ns(print_path->point[next].timestamp - + point->timestamp); + + if (__ipipe_in_critical_trpath(point - print_path->point)) { + if (delay > IPIPE_DELAY_WARN) + mark = "! "; + else if (delay > IPIPE_DELAY_NOTE) + mark = "+ "; + } + seq_puts(m, mark); + + if (verbose_trace) + seq_printf(m, "%3lu.%03lu%c ", delay/1000, delay%1000, + (point->flags & IPIPE_TFLG_NMI_HIT) ? 'N' : ' '); + else + seq_puts(m, " "); +} + +static void __ipipe_print_symname(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + +#ifdef CONFIG_IPIPE_TRACE_PANIC + if (!m) { + /* panic dump */ + if (sym_name) { + printk("%s+0x%lx", sym_name, offset); + if (modname) + printk(" [%s]", modname); + } + } else +#endif /* CONFIG_IPIPE_TRACE_PANIC */ + { + if (sym_name) { + if (verbose_trace) { + seq_printf(m, "%s+0x%lx", sym_name, offset); + if (modname) + seq_printf(m, " [%s]", modname); + } else + seq_puts(m, sym_name); + } else + seq_printf(m, "<%08lx>", eip); + } +} + +static void __ipipe_print_headline(struct seq_file *m) +{ + seq_printf(m, "Calibrated minimum trace-point overhead: %lu.%03lu " + "us\n\n", trace_overhead/1000, trace_overhead%1000); + + if (verbose_trace) { + const char *name[4] = { [0 ... 3] = "" }; + struct list_head *pos; + int i = 0; + + list_for_each_prev(pos, &__ipipe_pipeline) { + struct ipipe_domain *ipd = + list_entry(pos, struct ipipe_domain, p_link); + + name[i] = ipd->name; + if (++i > 3) + break; + } + + seq_printf(m, + " +----- Hard IRQs ('|': locked)\n" + " |+---- %s\n" + " ||+--- %s\n" + " |||+-- %s\n" + " ||||+- %s%s\n" + " ||||| +---------- " + "Delay flag ('+': > %d us, '!': > %d us)\n" + " ||||| | +- " + "NMI noise ('N')\n" + " ||||| | |\n" + " Type User Val. 
Time Delay Function " + "(Parent)\n", + name[3], name[2], name[1], name[0], + name[0] ? " ('*': domain stalled, '+': current, " + "'#': current+stalled)" : "", + IPIPE_DELAY_NOTE/1000, IPIPE_DELAY_WARN/1000); + } else + seq_printf(m, + " +--------------- Hard IRQs ('|': locked)\n" + " | +- Delay flag " + "('+': > %d us, '!': > %d us)\n" + " | |\n" + " Type Time Function (Parent)\n", + IPIPE_DELAY_NOTE/1000, IPIPE_DELAY_WARN/1000); +} + +static void *__ipipe_max_prtrace_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&out_mutex); + + if (!n) { + struct ipipe_trace_path *tp; + unsigned long length_usecs; + int points, cpu; + unsigned long flags; + + /* protect against max_path/frozen_path updates while we + * haven't locked our target path, also avoid recursively + * taking global_path_lock from NMI context */ + flags = __ipipe_global_path_lock(); + + /* find the longest of all per-cpu paths */ + print_path = NULL; + for_each_online_cpu(cpu) { + tp = &per_cpu(trace_path, cpu)[per_cpu(max_path, cpu)]; + if ((print_path == NULL) || + (tp->length > print_path->length)) { + print_path = tp; + break; + } + } + print_path->dump_lock = 1; + + __ipipe_global_path_unlock(flags); + + /* does this path actually contain data? */ + if (print_path->end == print_path->begin) + return NULL; + + /* number of points inside the critical path */ + points = WRAP_POINT_NO(print_path->end-print_path->begin+1); + + /* pre- and post-tracing length, post-trace length was frozen + in __ipipe_trace, pre-trace may have to be reduced due to + buffer overrun */ + print_pre_trace = pre_trace; + print_post_trace = WRAP_POINT_NO(print_path->trace_pos - + print_path->end - 1); + if (points+pre_trace+print_post_trace > IPIPE_TRACE_POINTS - 1) + print_pre_trace = IPIPE_TRACE_POINTS - 1 - points - + print_post_trace; + + length_usecs = ipipe_tsc2us(print_path->length); + seq_printf(m, "I-pipe worst-case tracing service on %s/ipipe-%s\n" + "------------------------------------------------------------\n", + UTS_RELEASE, IPIPE_ARCH_STRING); + seq_printf(m, "CPU: %d, Begin: %lld cycles, Trace Points: " + "%d (-%d/+%d), Length: %lu us\n", + cpu, print_path->point[print_path->begin].timestamp, + points, print_pre_trace, print_post_trace, length_usecs); + __ipipe_print_headline(m); + } + + /* check if we are inside the trace range */ + if (n >= WRAP_POINT_NO(print_path->end - print_path->begin + 1 + + print_pre_trace + print_post_trace)) + return NULL; + + /* return the next point to be shown */ + return &print_path->point[WRAP_POINT_NO(print_path->begin - + print_pre_trace + n)]; +} + +static void *__ipipe_prtrace_next(struct seq_file *m, void *p, loff_t *pos) +{ + loff_t n = ++*pos; + + /* check if we are inside the trace range with the next entry */ + if (n >= WRAP_POINT_NO(print_path->end - print_path->begin + 1 + + print_pre_trace + print_post_trace)) + return NULL; + + /* return the next point to be shown */ + return &print_path->point[WRAP_POINT_NO(print_path->begin - + print_pre_trace + *pos)]; +} + +static void __ipipe_prtrace_stop(struct seq_file *m, void *p) +{ + if (print_path) + print_path->dump_lock = 0; + mutex_unlock(&out_mutex); +} + +static int __ipipe_prtrace_show(struct seq_file *m, void *p) +{ + long time; + struct ipipe_trace_point *point = p; + char buf[16]; + + if (!point->eip) { + seq_puts(m, "--\n"); + return 0; + } + + __ipipe_print_pathmark(m, point); + __ipipe_trace_point_type(buf, point); + seq_puts(m, buf); + if (verbose_trace) + switch (point->type & IPIPE_TYPE_MASK) { + case 
IPIPE_TRACE_FUNC: + seq_puts(m, " "); + break; + + case IPIPE_TRACE_PID: + __ipipe_get_task_info(buf, point, 0); + seq_puts(m, buf); + break; + + case IPIPE_TRACE_EVENT: + __ipipe_get_event_date(buf, print_path, point); + seq_puts(m, buf); + break; + + default: + seq_printf(m, "0x%08lx ", point->v); + } + + time = __ipipe_signed_tsc2us(point->timestamp - + print_path->point[print_path->begin].timestamp); + seq_printf(m, "%5ld", time); + + __ipipe_print_delay(m, point); + __ipipe_print_symname(m, point->eip); + seq_puts(m, " ("); + __ipipe_print_symname(m, point->parent_eip); + seq_puts(m, ")\n"); + + return 0; +} + +static struct seq_operations __ipipe_max_ptrace_ops = { + .start = __ipipe_max_prtrace_start, + .next = __ipipe_prtrace_next, + .stop = __ipipe_prtrace_stop, + .show = __ipipe_prtrace_show +}; + +static int __ipipe_max_prtrace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &__ipipe_max_ptrace_ops); +} + +static ssize_t +__ipipe_max_reset(struct file *file, const char __user *pbuffer, + size_t count, loff_t *data) +{ + mutex_lock(&out_mutex); + ipipe_trace_max_reset(); + mutex_unlock(&out_mutex); + + return count; +} + +struct file_operations __ipipe_max_prtrace_fops = { + .open = __ipipe_max_prtrace_open, + .read = seq_read, + .write = __ipipe_max_reset, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *__ipipe_frozen_prtrace_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&out_mutex); + + if (!n) { + struct ipipe_trace_path *tp; + int cpu; + unsigned long flags; + + /* protect against max_path/frozen_path updates while we + * haven't locked our target path, also avoid recursively + * taking global_path_lock from NMI context */ + flags = __ipipe_global_path_lock(); + + /* find the first of all per-cpu frozen paths */ + print_path = NULL; + for_each_online_cpu(cpu) { + tp = &per_cpu(trace_path, cpu)[per_cpu(frozen_path, cpu)]; + if (tp->end >= 0) { + print_path = tp; + break; + } + } + if (print_path) + print_path->dump_lock = 1; + + __ipipe_global_path_unlock(flags); + + if (!print_path) + return NULL; + + /* back- and post-tracing length, post-trace length was frozen + in __ipipe_trace, back-trace may have to be reduced due to + buffer overrun */ + print_pre_trace = back_trace-1; /* substract freeze point */ + print_post_trace = WRAP_POINT_NO(print_path->trace_pos - + print_path->end - 1); + if (1+pre_trace+print_post_trace > IPIPE_TRACE_POINTS - 1) + print_pre_trace = IPIPE_TRACE_POINTS - 2 - + print_post_trace; + + seq_printf(m, "I-pipe frozen back-tracing service on %s/ipipe-%s\n" + "------------------------------------------------------" + "------\n", + UTS_RELEASE, IPIPE_ARCH_STRING); + seq_printf(m, "CPU: %d, Freeze: %lld cycles, Trace Points: %d (+%d)\n", + cpu, print_path->point[print_path->begin].timestamp, + print_pre_trace+1, print_post_trace); + __ipipe_print_headline(m); + } + + /* check if we are inside the trace range */ + if (n >= print_pre_trace + 1 + print_post_trace) + return NULL; + + /* return the next point to be shown */ + return &print_path->point[WRAP_POINT_NO(print_path->begin- + print_pre_trace+n)]; +} + +static struct seq_operations __ipipe_frozen_ptrace_ops = { + .start = __ipipe_frozen_prtrace_start, + .next = __ipipe_prtrace_next, + .stop = __ipipe_prtrace_stop, + .show = __ipipe_prtrace_show +}; + +static int __ipipe_frozen_prtrace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &__ipipe_frozen_ptrace_ops); +} + +static ssize_t 
+__ipipe_frozen_ctrl(struct file *file, const char __user *pbuffer, + size_t count, loff_t *data) +{ + char *end, buf[16]; + int val; + int n; + + n = (count > sizeof(buf) - 1) ? sizeof(buf) - 1 : count; + + if (copy_from_user(buf, pbuffer, n)) + return -EFAULT; + + buf[n] = '\0'; + val = simple_strtol(buf, &end, 0); + + if (((*end != '\0') && !isspace(*end)) || (val < 0)) + return -EINVAL; + + mutex_lock(&out_mutex); + ipipe_trace_frozen_reset(); + if (val > 0) + ipipe_trace_freeze(-1); + mutex_unlock(&out_mutex); + + return count; +} + +struct file_operations __ipipe_frozen_prtrace_fops = { + .open = __ipipe_frozen_prtrace_open, + .read = seq_read, + .write = __ipipe_frozen_ctrl, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __ipipe_rd_proc_val(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + len = sprintf(page, "%u\n", *(int *)data); + len -= off; + if (len <= off + count) + *eof = 1; + *start = page + off; + if (len > count) + len = count; + if (len < 0) + len = 0; + + return len; +} + +static int __ipipe_wr_proc_val(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char *end, buf[16]; + int val; + int n; + + n = (count > sizeof(buf) - 1) ? sizeof(buf) - 1 : count; + + if (copy_from_user(buf, buffer, n)) + return -EFAULT; + + buf[n] = '\0'; + val = simple_strtol(buf, &end, 0); + + if (((*end != '\0') && !isspace(*end)) || (val < 0)) + return -EINVAL; + + mutex_lock(&out_mutex); + *(int *)data = val; + mutex_unlock(&out_mutex); + + return count; +} + +static int __ipipe_rd_trigger(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len; + + if (!trigger_begin) + return 0; + + len = sprint_symbol(page, trigger_begin); + page[len++] = '\n'; + + len -= off; + if (len <= off + count) + *eof = 1; + *start = page + off; + if (len > count) + len = count; + if (len < 0) + len = 0; + + return len; +} + +static int __ipipe_wr_trigger(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char buf[KSYM_SYMBOL_LEN]; + unsigned long begin, end; + + if (count > sizeof(buf) - 1) + count = sizeof(buf) - 1; + if (copy_from_user(buf, buffer, count)) + return -EFAULT; + buf[count] = 0; + if (buf[count-1] == '\n') + buf[count-1] = 0; + + begin = kallsyms_lookup_name(buf); + if (!begin || !kallsyms_lookup_size_offset(begin, &end, NULL)) + return -ENOENT; + end += begin - 1; + + mutex_lock(&out_mutex); + /* invalidate the current range before setting a new one */ + trigger_end = 0; + wmb(); + ipipe_trace_frozen_reset(); + + /* set new range */ + trigger_begin = begin; + wmb(); + trigger_end = end; + mutex_unlock(&out_mutex); + + return count; +} + +#ifdef CONFIG_IPIPE_TRACE_MCOUNT +static void notrace +ipipe_trace_function(unsigned long ip, unsigned long parent_ip) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_FUNC, ip, parent_ip, 0); +} + +static struct ftrace_ops ipipe_trace_ops = { + .func = ipipe_trace_function +}; + +static int __ipipe_wr_enable(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char *end, buf[16]; + int val; + int n; + + n = (count > sizeof(buf) - 1) ? 
sizeof(buf) - 1 : count; + + if (copy_from_user(buf, buffer, n)) + return -EFAULT; + + buf[n] = '\0'; + val = simple_strtol(buf, &end, 0); + + if (((*end != '\0') && !isspace(*end)) || (val < 0)) + return -EINVAL; + + mutex_lock(&out_mutex); + + if (ipipe_trace_enable) { + if (!val) + unregister_ftrace_function(&ipipe_trace_ops); + } else if (val) + register_ftrace_function(&ipipe_trace_ops); + + ipipe_trace_enable = val; + + mutex_unlock(&out_mutex); + + return count; +} +#endif /* CONFIG_IPIPE_TRACE_MCOUNT */ + +extern struct proc_dir_entry *ipipe_proc_root; + +static struct proc_dir_entry * __init +__ipipe_create_trace_proc_val(struct proc_dir_entry *trace_dir, + const char *name, int *value_ptr) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry(name, 0644, trace_dir); + if (entry) { + entry->data = value_ptr; + entry->read_proc = __ipipe_rd_proc_val; + entry->write_proc = __ipipe_wr_proc_val; + } + return entry; +} + +void __init __ipipe_init_tracer(void) +{ + struct proc_dir_entry *trace_dir; + struct proc_dir_entry *entry; + unsigned long long start, end, min = ULLONG_MAX; + int i; +#ifdef CONFIG_IPIPE_TRACE_VMALLOC + int cpu, path; + + for_each_possible_cpu(cpu) { + struct ipipe_trace_path *tp_buf; + + tp_buf = vmalloc_node(sizeof(struct ipipe_trace_path) * + IPIPE_TRACE_PATHS, cpu_to_node(cpu)); + if (!tp_buf) { + printk(KERN_ERR "I-pipe: " + "insufficient memory for trace buffer.\n"); + return; + } + memset(tp_buf, 0, + sizeof(struct ipipe_trace_path) * IPIPE_TRACE_PATHS); + for (path = 0; path < IPIPE_TRACE_PATHS; path++) { + tp_buf[path].begin = -1; + tp_buf[path].end = -1; + } + per_cpu(trace_path, cpu) = tp_buf; + } +#endif /* CONFIG_IPIPE_TRACE_VMALLOC */ + + /* Calculate minimum overhead of __ipipe_trace() */ + local_irq_disable_hw(); + for (i = 0; i < 100; i++) { + ipipe_read_tsc(start); + __ipipe_trace(IPIPE_TRACE_FUNC, __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, 0); + ipipe_read_tsc(end); + + end -= start; + if (end < min) + min = end; + } + local_irq_enable_hw(); + trace_overhead = ipipe_tsc2ns(min); + +#ifdef CONFIG_IPIPE_TRACE_ENABLE + ipipe_trace_enable = 1; +#ifdef CONFIG_IPIPE_TRACE_MCOUNT + register_ftrace_function(&ipipe_trace_ops); +#endif /* CONFIG_IPIPE_TRACE_MCOUNT */ +#endif /* CONFIG_IPIPE_TRACE_ENABLE */ + + trace_dir = create_proc_entry("trace", S_IFDIR, ipipe_proc_root); + + entry = create_proc_entry("max", 0644, trace_dir); + if (entry) + entry->proc_fops = &__ipipe_max_prtrace_fops; + + entry = create_proc_entry("frozen", 0644, trace_dir); + if (entry) + entry->proc_fops = &__ipipe_frozen_prtrace_fops; + + entry = create_proc_entry("trigger", 0644, trace_dir); + if (entry) { + entry->read_proc = __ipipe_rd_trigger; + entry->write_proc = __ipipe_wr_trigger; + } + + __ipipe_create_trace_proc_val(trace_dir, "pre_trace_points", + &pre_trace); + __ipipe_create_trace_proc_val(trace_dir, "post_trace_points", + &post_trace); + __ipipe_create_trace_proc_val(trace_dir, "back_trace_points", + &back_trace); + __ipipe_create_trace_proc_val(trace_dir, "verbose", + &verbose_trace); + entry = __ipipe_create_trace_proc_val(trace_dir, "enable", + &ipipe_trace_enable); +#ifdef CONFIG_IPIPE_TRACE_MCOUNT + if (entry) + entry->write_proc = __ipipe_wr_enable; +#endif /* CONFIG_IPIPE_TRACE_MCOUNT */ +} diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e570d19..7cebb6f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "internals.h" @@ -459,7 +460,9 @@ handle_level_irq(unsigned 
int irq, struct irq_desc *desc) irqreturn_t action_ret; spin_lock(&desc->lock); +#ifndef CONFIG_IPIPE mask_ack_irq(desc, irq); +#endif if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -539,8 +542,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; +#ifdef CONFIG_IPIPE + desc->chip->unmask(irq); +out: +#else out: desc->chip->eoi(irq); +#endif spin_unlock(&desc->lock); } @@ -582,8 +590,10 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ +#ifndef CONFIG_IPIPE if (desc->chip->ack) desc->chip->ack(irq); +#endif /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -637,8 +647,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); +#ifndef CONFIG_IPIPE if (desc->chip->ack) desc->chip->ack(irq); +#endif /* CONFIG_IPIPE */ action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -648,6 +660,134 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) desc->chip->eoi(irq); } +#ifdef CONFIG_IPIPE + +void __ipipe_ack_simple_irq(unsigned irq, struct irq_desc *desc) +{ +} + +void __ipipe_end_simple_irq(unsigned irq, struct irq_desc *desc) +{ +} + +void __ipipe_ack_level_irq(unsigned irq, struct irq_desc *desc) +{ + mask_ack_irq(desc, irq); +} + +void __ipipe_end_level_irq(unsigned irq, struct irq_desc *desc) +{ + if (desc->chip->unmask) + desc->chip->unmask(irq); +} + +void __ipipe_ack_fasteoi_irq(unsigned irq, struct irq_desc *desc) +{ + desc->chip->eoi(irq); +} + +void __ipipe_end_fasteoi_irq(unsigned irq, struct irq_desc *desc) +{ + /* + * Non-requestable IRQs should not be masked in EOI handler. + */ + if (!(desc->status & IRQ_NOREQUEST)) + desc->chip->unmask(irq); +} + +void __ipipe_ack_edge_irq(unsigned irq, struct irq_desc *desc) +{ + desc->chip->ack(irq); +} + +void __ipipe_ack_percpu_irq(unsigned irq, struct irq_desc *desc) +{ + if (desc->chip->ack) + desc->chip->ack(irq); +} + +void __ipipe_end_percpu_irq(unsigned irq, struct irq_desc *desc) +{ + if (desc->chip->eoi) + desc->chip->eoi(irq); +} + +void __ipipe_end_edge_irq(unsigned irq, struct irq_desc *desc) +{ +} + +void __ipipe_ack_bad_irq(unsigned irq, struct irq_desc *desc) +{ + static int done; + + handle_bad_irq(irq, desc); + + if (!done) { + printk(KERN_WARNING "%s: unknown flow handler for IRQ %d\n", + __FUNCTION__, irq); + done = 1; + } +} + +void __ipipe_noack_irq(unsigned irq, struct irq_desc *desc) +{ +} + +void __ipipe_noend_irq(unsigned irq, struct irq_desc *desc) +{ +} + +irq_flow_handler_t +__fixup_irq_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained) +{ + if (unlikely(handle == NULL)) { + desc->ipipe_ack = &__ipipe_ack_bad_irq; + desc->ipipe_end = &__ipipe_noend_irq; + } else { + if (is_chained) { + desc->ipipe_ack = handle; + desc->ipipe_end = &__ipipe_noend_irq; + handle = __ipipe_noack_irq; + } else if (handle == &handle_simple_irq) { + desc->ipipe_ack = &__ipipe_ack_simple_irq; + desc->ipipe_end = &__ipipe_end_simple_irq; + } else if (handle == &handle_level_irq) { + desc->ipipe_ack = &__ipipe_ack_level_irq; + desc->ipipe_end = &__ipipe_end_level_irq; + } else if (handle == &handle_edge_irq) { + desc->ipipe_ack = &__ipipe_ack_edge_irq; + desc->ipipe_end = &__ipipe_end_edge_irq; + } else if (handle == &handle_fasteoi_irq) { + desc->ipipe_ack = &__ipipe_ack_fasteoi_irq; + desc->ipipe_end = &__ipipe_end_fasteoi_irq; + } else if (handle == 
&handle_percpu_irq) { + desc->ipipe_ack = &__ipipe_ack_percpu_irq; + desc->ipipe_end = &__ipipe_end_percpu_irq; + } else if (desc->chip == &no_irq_chip) { + desc->ipipe_ack = &__ipipe_noack_irq; + desc->ipipe_end = &__ipipe_noend_irq; + } else { + desc->ipipe_ack = &__ipipe_ack_bad_irq; + desc->ipipe_end = &__ipipe_noend_irq; + } + } + + /* Suppress intermediate trampoline routine. */ + ipipe_root_domain->irqs[desc->irq].acknowledge = desc->ipipe_ack; + + return handle; +} + +#else /* !CONFIG_IPIPE */ + +irq_flow_handler_t +__fixup_irq_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained) +{ + return handle; +} + +#endif /* !CONFIG_IPIPE */ + void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) @@ -679,6 +819,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, chip_bus_lock(irq, desc); spin_lock_irqsave(&desc->lock, flags); + handle = __fixup_irq_handler(desc, handle, is_chained); + /* Uninstall? */ if (handle == handle_bad_irq) { if (desc->chip != &no_irq_chip) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 17c71bb..406f375 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -462,8 +462,10 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ +#ifndef CONFIG_IPIPE if (desc->chip->ack) desc->chip->ack(irq); +#endif if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -474,8 +476,10 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); +#ifndef CONFIG_IPIPE if (desc->chip->ack) desc->chip->ack(irq); +#endif /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 9af5672..fa84d6d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2318,7 +2318,7 @@ void trace_hardirqs_on_caller(unsigned long ip) /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled() && !irqs_disabled_hw())) return; if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; @@ -2361,7 +2361,7 @@ void trace_hardirqs_off_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled() && !irqs_disabled_hw())) return; if (curr->hardirqs_enabled) { @@ -2393,7 +2393,7 @@ void trace_softirqs_on(unsigned long ip) if (unlikely(!debug_locks)) return; - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled() && !irqs_disabled_hw())) return; if (curr->softirqs_enabled) { @@ -2427,7 +2427,7 @@ void trace_softirqs_off(unsigned long ip) if (unlikely(!debug_locks)) return; - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled() && !irqs_disabled_hw())) return; if (curr->softirqs_enabled) { diff --git a/kernel/panic.c b/kernel/panic.c index 96b45d0..63f5b9e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -22,6 +22,7 @@ #include #include #include +#include int panic_on_oops; static unsigned long tainted_mask; @@ -304,6 +305,8 @@ void oops_enter(void) { tracing_off(); /* can't trust the integrity of the kernel anymore: */ + ipipe_trace_panic_freeze(); + ipipe_disable_context_check(ipipe_processor_id()); debug_locks_off(); do_oops_enter_exit(); } diff --git a/kernel/power/hibernate.c 
b/kernel/power/hibernate.c index 04a9e90..49bc6cd 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -238,6 +238,7 @@ static int create_image(int platform_mode) goto Enable_cpus; local_irq_disable(); + local_irq_disable_hw_cond(); error = sysdev_suspend(PMSG_FREEZE); if (error) { @@ -267,6 +268,7 @@ static int create_image(int platform_mode) */ Enable_irqs: + local_irq_enable_hw_cond(); local_irq_enable(); Enable_cpus: @@ -359,6 +361,7 @@ static int resume_target_kernel(bool platform_mode) goto Enable_cpus; local_irq_disable(); + local_irq_disable_hw_cond(); error = sysdev_suspend(PMSG_QUIESCE); if (error) @@ -390,6 +393,7 @@ static int resume_target_kernel(bool platform_mode) sysdev_resume(); Enable_irqs: + local_irq_enable_hw_cond(); local_irq_enable(); Enable_cpus: @@ -471,6 +475,7 @@ int hibernation_platform_enter(void) goto Platform_finish; local_irq_disable(); + local_irq_disable_hw_cond(); sysdev_suspend(PMSG_HIBERNATE); hibernation_ops->enter(); /* We should never get here */ diff --git a/kernel/printk.c b/kernel/printk.c index f38b07f..f3f0057 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -564,6 +564,41 @@ static int have_callable_console(void) return 0; } +#ifdef CONFIG_IPIPE + +static ipipe_spinlock_t __ipipe_printk_lock = IPIPE_SPIN_LOCK_UNLOCKED; + +static int __ipipe_printk_fill; + +static char __ipipe_printk_buf[__LOG_BUF_LEN]; + +void __ipipe_flush_printk (unsigned virq, void *cookie) +{ + char *p = __ipipe_printk_buf; + int len, lmax, out = 0; + unsigned long flags; + + goto start; + + do { + spin_unlock_irqrestore(&__ipipe_printk_lock, flags); + start: + lmax = __ipipe_printk_fill; + while (out < lmax) { + len = strlen(p) + 1; + printk("%s",p); + p += len; + out += len; + } + spin_lock_irqsave(&__ipipe_printk_lock, flags); + } + while (__ipipe_printk_fill != lmax); + + __ipipe_printk_fill = 0; + + spin_unlock_irqrestore(&__ipipe_printk_lock, flags); +} + /** * printk - print a kernel message * @fmt: format string @@ -588,6 +623,65 @@ static int have_callable_console(void) asmlinkage int printk(const char *fmt, ...) { + int r, fbytes, oldcount; + unsigned long flags; + int sprintk = 1; + int cs = -1; + va_list args; + + va_start(args, fmt); + + local_irq_save_hw(flags); + + if (test_bit(IPIPE_SPRINTK_FLAG, &__ipipe_current_domain->flags) || + oops_in_progress) + cs = ipipe_disable_context_check(ipipe_processor_id()); + else if (__ipipe_current_domain == ipipe_root_domain) { + struct ipipe_domain *dom; + + list_for_each_entry(dom, &__ipipe_pipeline, p_link) { + if (dom == ipipe_root_domain) + break; + if (test_bit(IPIPE_STALL_FLAG, + &ipipe_cpudom_var(dom, status))) + sprintk = 0; + } + } else + sprintk = 0; + + local_irq_restore_hw(flags); + + if (sprintk) { + r = vprintk(fmt, args); + if (cs != -1) + ipipe_restore_context_check(ipipe_processor_id(), cs); + goto out; + } + + spin_lock_irqsave(&__ipipe_printk_lock, flags); + + oldcount = __ipipe_printk_fill; + fbytes = __LOG_BUF_LEN - oldcount; + + if (fbytes > 1) { + r = vscnprintf(__ipipe_printk_buf + __ipipe_printk_fill, + fbytes, fmt, args) + 1; /* account for the null byte */ + __ipipe_printk_fill += r; + } else + r = 0; + + spin_unlock_irqrestore(&__ipipe_printk_lock, flags); + + if (oldcount == 0) + ipipe_trigger_irq(__ipipe_printk_virq); +out: + va_end(args); + + return r; +} +#else /* !CONFIG_IPIPE */ +asmlinkage int printk(const char *fmt, ...) +{ va_list args; int r; @@ -597,6 +691,7 @@ asmlinkage int printk(const char *fmt, ...) 
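
/*
 * The printk() rework above stages messages in __ipipe_printk_buf whenever
 * the caller cannot safely take the regular console path, and drains them
 * later from the flush handler attached to the printk virq.  What follows
 * is a minimal user-space model of that staging/drain protocol only: a
 * fixed buffer of NUL-terminated records, a fill index under a lock, and a
 * drain loop that re-checks the fill level after every pass.  All names
 * below are illustrative; this is a sketch, not kernel code and not part
 * of the patch.
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define STAGE_LEN 4096

static pthread_mutex_t stage_lock = PTHREAD_MUTEX_INITIALIZER;
static char stage_buf[STAGE_LEN];
static int stage_fill;

/*
 * Producer: append one NUL-terminated record if it fits, else drop it.
 * Returns nonzero when the buffer went from empty to non-empty, i.e. when
 * the caller should kick the flush side (the virq trigger in the patch).
 */
static int stage_msg(const char *msg)
{
        int len = (int)strlen(msg) + 1;         /* account for the NUL byte */
        int was_empty, queued = 0;

        pthread_mutex_lock(&stage_lock);
        was_empty = (stage_fill == 0);
        if (len <= STAGE_LEN - stage_fill) {
                memcpy(stage_buf + stage_fill, msg, len);
                stage_fill += len;
                queued = 1;
        }
        pthread_mutex_unlock(&stage_lock);

        return queued && was_empty;
}

/*
 * Consumer: drain every staged record, looping until no new record was
 * appended while the lock was dropped for the actual output.
 */
static void flush_stage(void)
{
        char *p = stage_buf;
        int out = 0, lmax;

        pthread_mutex_lock(&stage_lock);
        do {
                lmax = stage_fill;
                pthread_mutex_unlock(&stage_lock);
                while (out < lmax) {
                        int len = (int)strlen(p) + 1;
                        fputs(p, stdout);
                        p += len;
                        out += len;
                }
                pthread_mutex_lock(&stage_lock);
        } while (stage_fill != lmax);
        stage_fill = 0;
        pthread_mutex_unlock(&stage_lock);
}

int main(void)
{
        if (stage_msg("deferred line 1\n"))
                flush_stage();  /* played by the virq handler in the patch */
        return 0;
}
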
return r; } +#endif /* CONFIG_IPIPE */ /* cpu currently holding logbuf_lock */ static volatile unsigned int printk_cpu = UINT_MAX; diff --git a/kernel/sched.c b/kernel/sched.c index ed61192..83937d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2344,6 +2344,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { int cpu, orig_cpu, this_cpu, success = 0; + unsigned int old_state; unsigned long flags; struct rq *rq, *orig_rq; @@ -2355,7 +2356,9 @@ static int try_to_wake_up(struct task_st smp_wmb(); rq = orig_rq = task_rq_lock(p, &flags); update_rq_clock(rq); - if (!(p->state & state)) + old_state = p->state; + if (!(old_state & state) || + (old_state & (TASK_NOWAKEUP|TASK_ATOMICSWITCH))) goto out; if (p->se.on_rq) @@ -2840,22 +2843,29 @@ asmlinkage void schedule_tail(struct tas #endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); + + ipipe_init_notify(current); } /* * context_switch - switch to the new MM and the new * thread's register state. */ -static inline void +int context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, prev, next); - trace_sched_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; + +if (!rq) { + switch_mm(oldmm, next->active_mm, next); + if (!mm) enter_lazy_tlb(oldmm, next); +} else { + prepare_task_switch(rq, prev, next); + trace_sched_switch(rq, prev, next); /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -2883,11 +2893,24 @@ context_switch(struct rq *rq, struct tas #ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif - +} +#ifdef CONFIG_IPIPE + next->ptd[IPIPE_ROOT_NPTDKEYS - 1] = prev; +#endif /* CONFIG_IPIPE */ /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); barrier(); + +if (unlikely(rq)) { +#if 1 // def CONFIG_IPIPE_DELAYED_ATOMICSW + current->state &= ~TASK_ATOMICSWITCH; +#else + prev->state &= ~TASK_ATOMICSWITCH; +#endif + if (task_hijacked(prev)) + return 1; __ipipe_dispatch_event(IPIPE_FIRST_EVENT - 2, 0); + /* * this_rq must be evaluated again because prev may have moved * CPUs since it called schedule(), thus the 'rq' on its stack @@ -2895,6 +2918,10 @@ context_switch(struct rq *rq, struct tas */ finish_task_switch(this_rq(), prev); } + return 0; +} + +EXPORT_SYMBOL(context_switch); /* * nr_running, nr_uninterruptible and nr_context_switches: @@ -5300,6 +5327,7 @@ notrace unsigned long get_parent_ip(unsi void __kprobes add_preempt_count(int val) { + ipipe_check_context(ipipe_root_domain); #ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? @@ -5322,6 +5350,7 @@ EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { + ipipe_check_context(ipipe_root_domain); #ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? @@ -5370,6 +5399,7 @@ static noinline void __schedule_bug(stru */ static inline void schedule_debug(struct task_struct *prev) { + ipipe_check_context(ipipe_root_domain); /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -5448,7 +5478,7 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. 
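
/*
 * The scheduler changes above turn context_switch() and schedule() into
 * functions returning int, so the root kernel can detect when the CPU was
 * handed over ("hijacked") to a higher-priority domain across the switch
 * and must skip its own post-switch bookkeeping.  The toy program below
 * models only that control flow; every toy_* name is an illustrative
 * stand-in, not a kernel or I-pipe symbol.
 */
#include <stdio.h>

struct toy_task { const char *name; int hijacked; };
struct toy_rq { struct toy_task *curr; };

/*
 * Stand-in for switch_to() plus the task_hijacked() test: returns nonzero
 * when a higher domain grabbed the CPU across the switch.
 */
static int toy_context_switch(struct toy_rq *rq, struct toy_task *prev,
                              struct toy_task *next)
{
        rq->curr = next;
        printf("switch %s -> %s\n", prev->name, next->name);
        return next->hijacked;
}

/*
 * Stand-in for schedule(): a nonzero return means "bail out now, do no
 * post-switch accounting here"; the root domain is resumed later through
 * the higher domain's own path.
 */
static int toy_schedule(struct toy_rq *rq, struct toy_task *prev,
                        struct toy_task *next)
{
        if (toy_context_switch(rq, prev, next))
                return 1;
        /* Normal path: finish_task_switch()-style bookkeeping goes here. */
        printf("post-switch bookkeeping for %s\n", next->name);
        return 0;
}

int main(void)
{
        struct toy_task a = { "root/A", 0 }, b = { "root/B", 1 };
        struct toy_rq rq = { &a };

        if (toy_schedule(&rq, &a, &b))
                printf("CPU hijacked by a higher domain; skip bookkeeping\n");
        return 0;
}
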
*/ -asmlinkage void __sched schedule(void) +asmlinkage int __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -5462,6 +5492,9 @@ need_resched: rcu_sched_qs(cpu); prev = rq->curr; switch_count = &prev->nivcsw; + if (unlikely(prev->state & TASK_ATOMICSWITCH)) + /* Pop one disable level -- one still remains. */ + preempt_enable(); release_kernel_lock(prev); need_resched_nonpreemptible: @@ -5499,15 +5532,18 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + if (context_switch(rq, prev, next)) /* unlocks the rq */ + return 1; /* task hijacked by higher domain */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { + prev->state &= ~TASK_ATOMICSWITCH; spin_unlock_irq(&rq->lock); + } post_schedule(rq); @@ -5517,6 +5553,8 @@ need_resched_nonpreemptible: preempt_enable_no_resched(); if (need_resched()) goto need_resched; + + return 0; } EXPORT_SYMBOL(schedule); @@ -5600,7 +5638,8 @@ asmlinkage void __sched preempt_schedule do { add_preempt_count(PREEMPT_ACTIVE); - schedule(); + if (schedule()) + return; sub_preempt_count(PREEMPT_ACTIVE); /* @@ -6371,6 +6410,7 @@ recheck: oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); + ipipe_setsched_notify(p); if (running) p->sched_class->set_curr_task(rq); @@ -7018,6 +7058,7 @@ void __cpuinit init_idle(struct task_str #else task_thread_info(idle)->preempt_count = 0; #endif + ipipe_check_context(ipipe_root_domain); /* * The idle tasks have their own, simple scheduling class: */ @@ -10958,3 +10999,64 @@ void synchronize_sched_expedited(void) EXPORT_SYMBOL_GPL(synchronize_sched_expedited); #endif /* #else #ifndef CONFIG_SMP */ + +#ifdef CONFIG_IPIPE + +int ipipe_setscheduler_root(struct task_struct *p, int policy, int prio) +{ + const struct sched_class *prev_class = p->sched_class; + int oldprio, on_rq, running; + unsigned long flags; + struct rq *rq; + + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); + update_rq_clock(rq); + on_rq = p->se.on_rq; + running = task_current(rq, p); + if (on_rq) + deactivate_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + p->sched_reset_on_fork = 0; + + oldprio = p->prio; + __setscheduler(rq, p, policy, prio); + ipipe_setsched_notify(p); + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) { + activate_task(rq, p, 0); + + check_class_changed(rq, p, prev_class, oldprio, running); + } + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + + return 0; +} +EXPORT_SYMBOL_GPL(ipipe_setscheduler_root); + +int ipipe_reenter_root(struct task_struct *prev, int policy, int prio) +{ + struct rq *rq = this_rq(); + + finish_task_switch(rq, prev); + + post_schedule(rq); + + (void)reacquire_kernel_lock(current); + preempt_enable_no_resched(); + + if (current->policy != policy || current->rt_priority != prio) + return ipipe_setscheduler_root(current, policy, prio); + + return 0; +} +EXPORT_SYMBOL_GPL(ipipe_reenter_root); + +#endif /* CONFIG_IPIPE */ diff --git a/kernel/signal.c b/kernel/signal.c index 4d0658d..a7eac5f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -518,6 +518,7 @@ void signal_wake_up(struct task_struct *t, int resume) unsigned int mask; set_tsk_thread_flag(t, TIF_SIGPENDING); + ipipe_sigwake_notify(t); /* TIF_SIGPENDING must be set first. 
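
/*
 * ipipe_setscheduler_root() and ipipe_reenter_root() above are exported for
 * a co-kernel living in a higher domain.  The fragment below only sketches
 * how such a co-kernel's "migrate back to Linux" path might call them; the
 * last_root_task() helper is a hypothetical placeholder, and the header
 * assumed to declare the exports is named on a best-guess basis.
 */
#include <linux/sched.h>
#include <linux/ipipe.h>        /* assumed to declare the ipipe_*_root() exports */

struct task_struct *last_root_task(void);       /* hypothetical co-kernel helper */

static void relax_to_root(int keep_rt_prio)
{
        struct task_struct *prev = last_root_task();
        int policy = keep_rt_prio ? SCHED_FIFO : SCHED_NORMAL;
        int prio = keep_rt_prio ? current->rt_priority : 0;

        /*
         * Finish the root-domain switch left pending when the head domain
         * grabbed the CPU, then let the task resume under Linux with the
         * scheduling parameters the co-kernel wants preserved.
         */
        ipipe_reenter_root(prev, policy, prio);
}
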
*/ /* * For SIGKILL, we want to wake it up in the stopped/traced/killable diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5ddab73..97cf064 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -50,7 +50,9 @@ EXPORT_SYMBOL(_write_trylock); * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ -#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +#if !defined(CONFIG_GENERIC_LOCKBREAK) || \ + defined(CONFIG_DEBUG_LOCK_ALLOC) || \ + defined(CONFIG_IPIPE) #ifndef _read_lock void __lockfunc _read_lock(rwlock_t *lock) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 83c4417..782a209 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -69,7 +69,7 @@ static void tick_periodic(int cpu) write_sequnlock(&xtime_lock); } - update_process_times(user_mode(get_irq_regs())); + update_root_process_times(get_irq_regs()); profile_tick(CPU_PROFILING); } @@ -177,6 +177,10 @@ static void tick_setup_device(struct tick_device *td, td->evtdev = newdev; + /* I-pipe: derive global tick IRQ from CPU 0 */ + if (cpu == 0) + ipipe_update_tick_evtdev(newdev); + /* * When the device is not per cpu, pin the interrupt to the * current cpu: diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 44320b1..45ec05a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -549,7 +549,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) ts->idle_jiffies++; } - update_process_times(user_mode(regs)); + update_root_process_times(regs); profile_tick(CPU_PROFILING); while (tick_nohz_reprogram(ts, now)) { @@ -700,7 +700,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) touch_softlockup_watchdog(); ts->idle_jiffies++; } - update_process_times(user_mode(regs)); + update_root_process_times(regs); profile_tick(CPU_PROFILING); } diff --git a/kernel/timer.c b/kernel/timer.c index 5db5a8d..1b45eb9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1204,6 +1204,25 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } +#ifdef CONFIG_IPIPE + +void update_root_process_times(struct pt_regs *regs) +{ + int cpu, user_tick = user_mode(regs); + + if (__ipipe_root_tick_p(regs)) { + update_process_times(user_tick); + return; + } + + run_local_timers(); + cpu = smp_processor_id(); + rcu_check_callbacks(cpu, user_tick); + run_posix_cpu_timers(current); +} + +#endif + /* * This function runs timers and the timer-tq in bottom half context. 
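
/*
 * update_root_process_times() above demultiplexes the tick: a tick that
 * belongs to the root domain still goes through the full
 * update_process_times() path, while a tick merely relayed to Linux only
 * runs the reduced per-CPU housekeeping listed in the hunk.  The toy
 * dispatcher below restates that split with illustrative names; it is a
 * reading aid, not kernel code.
 */
#include <stdio.h>

enum tick_origin { TICK_FOR_ROOT, TICK_RELAYED };

static void full_tick(void)
{
        printf("update_process_times(): accounting, scheduler_tick, timers, RCU\n");
}

static void reduced_tick(void)
{
        printf("run_local_timers + rcu_check_callbacks + run_posix_cpu_timers\n");
}

static void tick_dispatch(enum tick_origin origin)
{
        if (origin == TICK_FOR_ROOT)
                full_tick();    /* tick programmed by/for Linux itself */
        else
                reduced_tick(); /* relayed tick: only the per-CPU housekeeping
                                   that the hunk keeps is run */
}

int main(void)
{
        tick_dispatch(TICK_FOR_ROOT);
        tick_dispatch(TICK_RELAYED);
        return 0;
}
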
*/ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0cccb6c..eaba13e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -1142,6 +1143,9 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { +#ifdef CONFIG_IPIPE + unsigned long flags; +#endif /* CONFIG_IPIPE */ int ret; ret = ftrace_arch_code_modify_prepare(); @@ -1149,7 +1153,13 @@ static void ftrace_run_update_code(int command) if (ret) return; +#ifdef CONFIG_IPIPE + flags = ipipe_critical_enter(NULL); + __ftrace_modify_code(&command); + ipipe_critical_exit(flags); +#else /* !CONFIG_IPIPE */ stop_machine(__ftrace_modify_code, &command, NULL); +#endif /* !CONFIG_IPIPE */ ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); @@ -2648,9 +2658,9 @@ static int ftrace_convert_nops(struct module *mod, } /* disable interrupts to prevent kstop machine */ - local_irq_save(flags); + local_irq_save_hw_notrace(flags); ftrace_update_code(mod); - local_irq_restore(flags); + local_irq_restore_hw_notrace(flags); mutex_unlock(&ftrace_lock); return 0; @@ -2729,9 +2739,9 @@ void __init ftrace_init(void) /* Keep the ftrace pointer to the stub */ addr = (unsigned long)ftrace_stub; - local_irq_save(flags); + local_irq_save_hw_notrace(flags); ftrace_dyn_arch_init(&addr); - local_irq_restore(flags); + local_irq_restore_hw_notrace(flags); /* ftrace_dyn_arch_init places the return code in addr */ if (addr) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 234ceb1..faffad9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -136,6 +136,8 @@ config DEBUG_SECTION_MISMATCH - Enable verbose reporting from modpost to help solving the section mismatches reported. +source "kernel/ipipe/Kconfig.debug" + config DEBUG_KERNEL bool "Kernel debugging" help diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c index 9681d54..2dba50c 100644 --- a/lib/bust_spinlocks.c +++ b/lib/bust_spinlocks.c @@ -13,6 +13,7 @@ #include #include #include +#include void __attribute__((weak)) bust_spinlocks(int yes) @@ -24,6 +25,7 @@ void __attribute__((weak)) bust_spinlocks(int yes) unblank_screen(); #endif console_unblank(); + ipipe_trace_panic_dump(); if (--oops_in_progress == 0) wake_up_klogd(); } diff --git a/lib/ioremap.c b/lib/ioremap.c index 14c6078..a275469 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -85,8 +85,8 @@ int ioremap_page_range(unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); - - flush_cache_vmap(start, end); + __ipipe_pin_range_globally(start, end); + flush_cache_vmap(start, end); return err; } diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4689cb0..3d12764 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -12,10 +12,13 @@ notrace unsigned int debug_smp_processor_id(void) unsigned long preempt_count = preempt_count(); int this_cpu = raw_smp_processor_id(); + if (!ipipe_root_domain_p) + goto out; + if (likely(preempt_count)) goto out; - if (irqs_disabled()) + if (irqs_disabled() || irqs_disabled_hw()) goto out; /* diff --git a/mm/memory.c b/mm/memory.c index 4e59455..b8d365d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -566,6 +567,32 @@ out: return pfn_to_page(pfn); } +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. 
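
/*
 * The ftrace hunk above swaps stop_machine() for a pipeline-level critical
 * section (ipipe_critical_enter()/ipipe_critical_exit()) while kernel text
 * is rewritten, since stop_machine() relies on the root scheduler, which a
 * head domain may preempt.  The pthread/atomics program below models only
 * the rendezvous shape -- park every other CPU, patch, release; it is not
 * the I-pipe implementation and all names are illustrative.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NCPUS 4

static atomic_int park_request;                 /* the "sync IPI" flag        */
static atomic_int parked;                       /* CPUs that reached the pen  */
static atomic_int text_word = 0x90;             /* stand-in for patched text  */

static void *cpu_loop(void *arg)
{
        (void)arg;
        for (;;) {
                if (atomic_load(&park_request)) {
                        atomic_fetch_add(&parked, 1);
                        while (atomic_load(&park_request))
                                ;               /* spin, "hard irqs off"      */
                        atomic_fetch_sub(&parked, 1);
                }
                if (atomic_load(&text_word) == 0xcc)
                        break;                  /* saw the new "opcode", done */
        }
        return NULL;
}

static void critical_patch(void)
{
        atomic_store(&park_request, 1);
        while (atomic_load(&parked) != NCPUS - 1)
                ;                               /* wait for all other CPUs    */
        atomic_store(&text_word, 0xcc);         /* safe: nobody executes it   */
        atomic_store(&park_request, 0);
}

int main(void)
{
        pthread_t tid[NCPUS - 1];
        int i;

        for (i = 0; i < NCPUS - 1; i++)
                pthread_create(&tid[i], NULL, cpu_loop, NULL);
        usleep(10000);
        critical_patch();
        for (i = 0; i < NCPUS - 1; i++)
                pthread_join(tid[i], NULL);
        puts("text patched while all other CPUs were parked");
        return 0;
}
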
We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + void __user *uaddr = (void __user *)(va & PAGE_MASK); + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(dst); + } else + copy_user_highpage(dst, src, va, vma); +} + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range @@ -574,8 +601,8 @@ out: static inline void copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr, int *rss, struct page *uncow_page) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; @@ -614,6 +641,21 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * in the parent and the child */ if (is_cow_mapping(vm_flags)) { +#ifdef CONFIG_IPIPE + if (uncow_page) { + struct page *old_page = vm_normal_page(vma, addr, pte); + cow_user_page(uncow_page, old_page, addr, vma); + pte = mk_pte(uncow_page, vma->vm_page_prot); + + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + page_add_new_anon_rmap(uncow_page, vma, addr); + rss[!!PageAnon(uncow_page)]++; + goto out_set_pte; + } +#endif /* CONFIG_IPIPE */ ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } @@ -645,13 +687,27 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; + struct page *uncow_page = NULL; int rss[2]; - +#ifdef CONFIG_IPIPE + int do_cow_break = 0; +again: + if (do_cow_break) { + uncow_page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + if (!uncow_page) + return -ENOMEM; + do_cow_break = 0; + } +#else again: +#endif rss[1] = rss[0] = 0; dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); - if (!dst_pte) + if (!dst_pte) { + if (uncow_page) + page_cache_release(uncow_page); return -ENOMEM; + } src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -674,7 +730,25 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); +#ifdef CONFIG_IPIPE + if (likely(uncow_page == NULL) && likely(pte_present(*src_pte))) { + if (is_cow_mapping(vma->vm_flags) && + test_bit(MMF_VM_PINNED, &src_mm->flags) && + ((vma->vm_flags|src_mm->def_flags) & VM_LOCKED)) { + arch_leave_lazy_mmu_mode(); + spin_unlock(src_ptl); + pte_unmap_nested(src_pte); + add_mm_rss(dst_mm, rss[0], rss[1]); + pte_unmap_unlock(dst_pte, dst_ptl); + cond_resched(); + do_cow_break = 1; + goto again; + } + } +#endif + copy_one_pte(dst_mm, src_mm, dst_pte, + src_pte, vma, addr, rss, uncow_page); + uncow_page = NULL; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -1941,32 +2015,6 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct 
vm_area_struct *vma) -{ - /* - * If the source page was a PFN mapping, we don't have - * a "struct page" for it. We do a best-effort copy by - * just copying from the original user address. If that - * fails, we just zero-fill it. Live with it. - */ - if (unlikely(!src)) { - void *kaddr = kmap_atomic(dst, KM_USER0); - void __user *uaddr = (void __user *)(va & PAGE_MASK); - - /* - * This really shouldn't fail, because the page is there - * in the page tables. But it might just be unreadable, - * in which case we just give up and fill the result with - * zeroes. - */ - if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - memset(kaddr, 0, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(dst); - } else - copy_user_highpage(dst, src, va, vma); -} - /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -3377,3 +3425,111 @@ void might_fault(void) } EXPORT_SYMBOL(might_fault); #endif + +#ifdef CONFIG_IPIPE + +static inline int ipipe_pin_