This is a very small backport of the new HostIF_SetFastClockRate(...) function from the 1.0.0.23 drivers release. There are probably plenty of other useful things that can be pulled too, but the following is enough to make VMware Server work reliably for me on vanilla 2.6.28, with CONFIG_HZ=250. Without the patch, vmmon just crashes for me when HostIF_SetFastClockRate(...) is called with a large "rate" argument, namely, when rate > (HZ + HZ/16). With the patch I get the following errors periodically instead of a crash; the errors are non-fatal, and the underlying problem is caused by the HPET RTC changes recently committed to the mainline kernel.org tree as part of commit 0f4d3fd8ac76122675de900d67a470306647374b. /dev/vmmon[7386]: /dev/rtc enable interrupt failed: -25 I'm guessing that VMware will investigate a proper fix in due course because even new versions of VMware products will be affected by the same warning message on recent kernels from upstream. Be sure to apply this patch on top of the other current patches (see Gentoo bug #227303 for details). Signed-off-by: Tim Yamin diff -urp vmmon-only/include/compat_sched.h vmmon-only.plasmaroo/include/compat_sched.h --- vmmon-only/include/compat_sched.h 2008-10-31 01:17:48.000000000 +0000 +++ vmmon-only.plasmaroo/include/compat_sched.h 2008-12-29 22:07:40.000000000 +0000 @@ -154,4 +154,31 @@ static inline void daemonize(void) { #define compat_set_user_nice(task, n) do { (task)->nice = (n); } while (0) #endif +/* + * Since 2.6.27-rc2 kill_proc() is gone... Replacement (GPL-only!) + * API is available since 2.6.19. Use them from 2.6.27-rc1 up. 
+ */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27) +typedef int compat_pid; +#define compat_find_get_pid(pid) (pid) +#define compat_put_pid(pid) do { } while (0) +#define compat_kill_pid(pid, sig, flag) kill_proc(pid, sig, flag) +#else +typedef struct pid * compat_pid; +#define compat_find_get_pid(pid) find_get_pid(pid) +#define compat_put_pid(pid) put_pid(pid) +#define compat_kill_pid(pid, sig, flag) kill_pid(pid, sig, flag) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 5, 61) +#define compat_allow_signal(signr) do { \ + spin_lock_irq(&current->compat_sigmask_lock); \ + sigdelset(&current->blocked, signr); \ + compat_recalc_sigpending(); \ + spin_unlock_irq(&current->compat_sigmask_lock); \ + } while (0) +#else +#define compat_allow_signal(signr) allow_signal(signr) +#endif + #endif /* __COMPAT_SCHED_H__ */ diff -urp vmmon-only/linux/driver.c vmmon-only.plasmaroo/linux/driver.c --- vmmon-only/linux/driver.c 2008-12-29 22:06:15.000000000 +0000 +++ vmmon-only.plasmaroo/linux/driver.c 2008-12-29 22:07:40.000000000 +0000 @@ -378,12 +378,9 @@ init_module(void) linuxState.pollTimer.data = 0; linuxState.pollTimer.function = LinuxDriverPollTimeout; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24) - linuxState.fastClockTask = NULL; -#else linuxState.fastClockThread = 0; -#endif linuxState.fastClockRate = 0; + linuxState.fastClockPriority = -20; #ifdef POLLSPINLOCK spin_lock_init(&linuxState.pollListLock); @@ -908,12 +905,7 @@ LinuxDriverPoll(struct file *filp, mask = POLLIN; } } else { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24) - if ((linuxState.fastClockTask!=NULL) && - vmLinux->pollTimeoutPtr != NULL) { -#else if (linuxState.fastClockThread && vmLinux->pollTimeoutPtr != NULL) { -#endif struct timeval tv; do_gettimeofday(&tv); poll_wait(filp, &vmLinux->pollQueue, wait); @@ -2383,3 +2375,4 @@ static int LinuxDriverAPMCallback(apm_ev MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Machine Monitor."); +MODULE_LICENSE("GPL v2"); diff -urp 
vmmon-only/linux/driver.h vmmon-only.plasmaroo/linux/driver.h --- vmmon-only/linux/driver.h 2008-12-29 22:06:15.000000000 +0000 +++ vmmon-only.plasmaroo/linux/driver.h 2008-12-29 22:07:40.000000000 +0000 @@ -19,7 +19,7 @@ #include "compat_spinlock.h" #include "compat_wait.h" #include "driver_vmcore.h" - +#include "compat_sched.h" /* * Per-instance driver state @@ -101,12 +101,9 @@ typedef struct VMXLinuxState { spinlock_t pollListLock; #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24) - struct task_struct *fastClockTask; -#else - volatile int fastClockThread; -#endif + volatile compat_pid fastClockThread; unsigned fastClockRate; + long fastClockPriority; } VMXLinuxState; extern VMXLinuxState linuxState; diff -urp vmmon-only/linux/hostif.c vmmon-only.plasmaroo/linux/hostif.c --- vmmon-only/linux/hostif.c 2008-12-29 22:06:15.000000000 +0000 +++ vmmon-only.plasmaroo/linux/hostif.c 2008-12-29 22:07:40.000000000 +0000 @@ -3421,6 +3421,44 @@ HostIF_NumOnlineLogicalCPUs(void) #endif } +/* + *---------------------------------------------------------------------- + * + * HostIFDoIoctl -- + * + * Issue ioctl. Assume kernel is not locked. It is not true now, + * but it makes things easier to understand, and won't surprise us + * later when we get rid of kernel lock from our code. + * + * Results: + * Same as ioctl method. + * + * Side effects: + * none. 
+ * + *---------------------------------------------------------------------- + */ + +static long +HostIFDoIoctl(struct file *filp, + u_int iocmd, + unsigned long ioarg) +{ +#ifdef HAVE_UNLOCKED_IOCTL + if (filp->f_op->unlocked_ioctl) { + return filp->f_op->unlocked_ioctl(filp, iocmd, ioarg); + } +#endif + if (filp->f_op->ioctl) { + long err; + + lock_kernel(); + err = filp->f_op->ioctl(filp->f_dentry->d_inode, filp, iocmd, ioarg); + unlock_kernel(); + return err; + } + return -ENOIOCTLCMD; +} /* *---------------------------------------------------------------------- @@ -3443,23 +3481,21 @@ HostIF_NumOnlineLogicalCPUs(void) */ static int -HostIFFastClockThread(void *data) +HostIFFastClockThread(void *data) // IN: { struct file *filp = (struct file *) data; int res; mm_segment_t oldFS; unsigned rate = 0; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 24) compat_daemonize("vmware-rtc"); -#endif oldFS = get_fs(); set_fs(KERNEL_DS); + compat_allow_signal(SIGKILL); cap_raise(current->cap_effective, CAP_SYS_RESOURCE); - compat_set_user_nice(current, -20); + compat_set_user_nice(current, linuxState.fastClockPriority); - while (linuxState.fastClockRate > HZ + HZ/16 && - !signal_pending(current)) { + while (linuxState.fastClockRate > HZ + HZ/16) { unsigned long buf; loff_t pos = 0; unsigned p2rate; @@ -3473,8 +3509,7 @@ HostIFFastClockThread(void *data) p2rate <<= 1; } - res = filp->f_op->ioctl(filp->f_dentry->d_inode, - filp, RTC_IRQP_SET, p2rate); + res = HostIFDoIoctl(filp, RTC_IRQP_SET, p2rate); if (res < 0) { Warning("/dev/rtc set rate %d failed: %d\n", p2rate, res); goto out; @@ -3542,20 +3577,44 @@ HostIFFastClockThread(void *data) */ int -HostIF_SetFastClockRate(unsigned rate) +HostIF_SetFastClockRate(unsigned int rate) // IN: Frequency in Hz. 
{ ASSERT(MutexIsLocked(&fastClockMutex)); linuxState.fastClockRate = rate; + + /* + * Overview + * -------- + * An SMP Linux kernel programs the 8253 timer (to increment the 'jiffies' + * counter) _and_ all local APICs (to run the scheduler code) to deliver + * interrupts HZ times a second. + * + * Time + * ---- + * The kernel tries very hard to spread all these interrupts evenly over + * time, i.e. on a 1 CPU system, the 1 local APIC phase is shifted by 1/2 + * period compared to the 8253, and on a 2 CPU system, the 2 local APIC + * phases are respectively shifted by 1/3 and 2/3 period compared to the + * 8253. This is done to reduce contention on locks guarding the global task + * queue. + * + * Space + * ----- + * The 8253 interrupts are distributed between physical CPUs, evenly on a P3 + * system, whereas on a P4 system physical CPU 0 gets all of them. + * + * Long story short, unless the monitor requested rate is significantly + * higher than HZ, we don't need to send IPIs or exclusively grab /dev/rtc + * to periodically kick vCPU threads running in the monitor on all physical + * CPUs. 
+ */ + if (rate > HZ + HZ/16) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24) - if (linuxState.fastClockTask==NULL) { - struct task_struct *t; -#else if (!linuxState.fastClockThread) { -#endif struct file *filp; int fsuid, res; Bool cap; + long pid; fsuid = current->fsuid; current->fsuid = 0; @@ -3567,7 +3626,7 @@ HostIF_SetFastClockRate(unsigned rate) } cap = cap_raised(current->cap_effective, CAP_SYS_RESOURCE); cap_raise(current->cap_effective, CAP_SYS_RESOURCE); - res = filp->f_op->ioctl(filp->f_dentry->d_inode, filp, RTC_PIE_ON, 0); + res = HostIFDoIoctl(filp, RTC_PIE_ON, 0); if (!cap) { cap_lower(current->cap_effective, CAP_SYS_RESOURCE); } @@ -3576,38 +3635,32 @@ HostIF_SetFastClockRate(unsigned rate) compat_filp_close(filp, current->files); return -res; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24) - t = kthread_create(HostIFFastClockThread, filp, "vmware-rtc"); - if (IS_ERR(t)) { - compat_filp_close(filp, current->files); - return -PTR_ERR(t); - } - linuxState.fastClockTask=t; - wake_up_process(t); -#else - linuxState.fastClockThread = - kernel_thread(HostIFFastClockThread, filp, 0); -#endif + pid = kernel_thread(HostIFFastClockThread, filp, 0); + if (pid < 0) { + /* + * Ignore ERESTARTNOINTR silently, it occurs when signal is + * pending, and syscall layer automatically reissues operation + * after signal is handled. 
+ */ + if (pid != -ERESTARTNOINTR) { + Warning("/dev/rtc cannot start watch thread: %ld\n", pid); + } + compat_filp_close(filp, current->files); + return -pid; + } + linuxState.fastClockThread = compat_find_get_pid(pid); } } else { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24) - if (linuxState.fastClockTask!=NULL) { - kthread_stop(linuxState.fastClockTask); - linuxState.fastClockTask = NULL; - compat_wait_for_completion(&fastClockExited); - } -#else if (linuxState.fastClockThread) { - kill_proc(linuxState.fastClockThread, SIGKILL, 1); + compat_kill_pid(linuxState.fastClockThread, SIGKILL, 1); + compat_put_pid(linuxState.fastClockThread); linuxState.fastClockThread = 0; compat_wait_for_completion(&fastClockExited); } -#endif } return 0; } - /* *----------------------------------------------------------------------------- *