diff options
-rw-r--r-- | arch/s390/include/asm/pgtable.h | 8 | ||||
-rw-r--r-- | arch/s390/kernel/ptrace.c | 30 | ||||
-rw-r--r-- | arch/s390/kernel/setup.c | 2 | ||||
-rw-r--r-- | arch/s390/kernel/signal.c | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/timer.h | 23 | ||||
-rw-r--r-- | drivers/gpio/Makefile | 2 | ||||
-rw-r--r-- | drivers/s390/cio/chsc.c | 7 | ||||
-rw-r--r-- | drivers/s390/cio/cio.h | 5 | ||||
-rw-r--r-- | drivers/s390/cio/css.c | 104 | ||||
-rw-r--r-- | drivers/s390/cio/device.c | 4 | ||||
-rw-r--r-- | drivers/s390/cio/device_fsm.c | 30 | ||||
-rw-r--r-- | drivers/s390/cio/device_ops.c | 20 | ||||
-rw-r--r-- | drivers/s390/cio/io_sch.h | 5 | ||||
-rw-r--r-- | drivers/s390/crypto/ap_bus.c | 2 | ||||
-rw-r--r-- | include/linux/init_task.h | 4 | ||||
-rw-r--r-- | kernel/irq/manage.c | 5 | ||||
-rw-r--r-- | kernel/sched.c | 17 | ||||
-rw-r--r-- | kernel/sched_fair.c | 159 | ||||
-rw-r--r-- | kernel/sched_features.h | 1 | ||||
-rw-r--r-- | kernel/sched_rt.c | 3 |
20 files changed, 313 insertions, 126 deletions
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 524d23b8610c..4f289ff0b7fe 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -599,10 +599,10 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste) skey = page_get_storage_key(address); bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); /* Clear page changed & referenced bit in the storage key */ - if (bits) { - skey ^= bits; - page_set_storage_key(address, skey, 1); - } + if (bits & _PAGE_CHANGED) + page_set_storage_key(address, skey ^ bits, 1); + else if (bits) + page_reset_referenced(address); /* Transfer page changed & referenced bit to guest bits in pgste */ pgste_val(pgste) |= bits << 48; /* RCP_GR_BIT & RCP_GC_BIT */ /* Get host changed & referenced bits from pgste */ diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 450931a45b68..573bc29551ef 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -296,13 +296,6 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) ((data & PSW_MASK_EA) && !(data & PSW_MASK_BA)))) /* Invalid psw mask. */ return -EINVAL; - if (addr == (addr_t) &dummy->regs.psw.addr) - /* - * The debugger changed the instruction address, - * reset system call restart, see signal.c:do_signal - */ - task_thread_info(child)->system_call = 0; - *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr) = data; } else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) { @@ -614,11 +607,6 @@ static int __poke_user_compat(struct task_struct *child, /* Transfer 31 bit amode bit to psw mask. */ regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) | (__u64)(tmp & PSW32_ADDR_AMODE); - /* - * The debugger changed the instruction address, - * reset system call restart, see signal.c:do_signal - */ - task_thread_info(child)->system_call = 0; } else { /* gpr 0-15 */ *(__u32*)((addr_t) ®s->psw + addr*2 + 4) = tmp; @@ -905,6 +893,14 @@ static int s390_last_break_get(struct task_struct *target, return 0; } +static int s390_last_break_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return 0; +} + #endif static int s390_system_call_get(struct task_struct *target, @@ -951,6 +947,7 @@ static const struct user_regset s390_regsets[] = { .size = sizeof(long), .align = sizeof(long), .get = s390_last_break_get, + .set = s390_last_break_set, }, #endif [REGSET_SYSTEM_CALL] = { @@ -1116,6 +1113,14 @@ static int s390_compat_last_break_get(struct task_struct *target, return 0; } +static int s390_compat_last_break_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return 0; +} + static const struct user_regset s390_compat_regsets[] = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, @@ -1139,6 +1144,7 @@ static const struct user_regset s390_compat_regsets[] = { .size = sizeof(long), .align = sizeof(long), .get = s390_compat_last_break_get, + .set = s390_compat_last_break_set, }, [REGSET_SYSTEM_CALL] = { .core_note_type = NT_S390_SYSTEM_CALL, diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index e58a462949b1..e54c4ff8abaa 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -579,7 +579,7 @@ static unsigned long __init find_crash_base(unsigned long crash_size, *msg = "first memory chunk must be at least crashkernel size"; return 0; } - if (is_kdump_kernel() && (crash_size == OLDMEM_SIZE)) + if (OLDMEM_BASE && crash_size == OLDMEM_SIZE) return OLDMEM_BASE; for (i = MEMORY_CHUNKS - 1; i >= 0; i--) { diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 05a85bc14c98..7f6f9f354545 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -460,9 +460,9 @@ void do_signal(struct pt_regs *regs) regs->svc_code >> 16); break; } - /* No longer in a system call */ - clear_thread_flag(TIF_SYSCALL); } + /* No longer in a system call */ + clear_thread_flag(TIF_SYSCALL); if ((is_compat_task() ? handle_signal32(signr, &ka, &info, oldset, regs) : @@ -486,6 +486,7 @@ void do_signal(struct pt_regs *regs) } /* No handlers present - check for system call restart */ + clear_thread_flag(TIF_SYSCALL); if (current_thread_info()->system_call) { regs->svc_code = current_thread_info()->system_call; switch (regs->gprs[2]) { @@ -500,9 +501,6 @@ void do_signal(struct pt_regs *regs) regs->gprs[2] = regs->orig_gpr2; set_thread_flag(TIF_SYSCALL); break; - default: - clear_thread_flag(TIF_SYSCALL); - break; } } diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index fa7b9176b76c..431793e5d484 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -32,6 +32,22 @@ extern int no_timer_check; * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" + * + * In: + * + * ns = cycles * cyc2ns_scale / SC + * + * Although we may still have enough bits to store the value of ns, + * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, + * leading to an incorrect result. + * + * To avoid this, we can decompose 'cycles' into quotient and remainder + * of division by SC. Then, + * + * ns = (quot * SC + rem) * cyc2ns_scale / SC + * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC + * + * - sqazi@google.com */ DECLARE_PER_CPU(unsigned long, cyc2ns); @@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { + unsigned long long quot; + unsigned long long rem; int cpu = smp_processor_id(); unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; + quot = (cyc >> CYC2NS_SCALE_FACTOR); + rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1); + ns += quot * per_cpu(cyc2ns, cpu) + + ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR); return ns; } diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index dbcb0bcfd8da..4e018d6a7639 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_ARCH_DAVINCI) += gpio-davinci.o obj-$(CONFIG_GPIO_EP93XX) += gpio-ep93xx.o obj-$(CONFIG_GPIO_IT8761E) += gpio-it8761e.o obj-$(CONFIG_GPIO_JANZ_TTL) += gpio-janz-ttl.o -obj-$(CONFIG_MACH_KS8695) += gpio-ks8695.o +obj-$(CONFIG_ARCH_KS8695) += gpio-ks8695.o obj-$(CONFIG_GPIO_LANGWELL) += gpio-langwell.o obj-$(CONFIG_ARCH_LPC32XX) += gpio-lpc32xx.o obj-$(CONFIG_GPIO_MAX730X) += gpio-max730x.o diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 75c3f1f8fd43..a84631a7391d 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -529,10 +529,7 @@ __s390_vary_chpid_on(struct subchannel_id schid, void *data) int chsc_chp_vary(struct chp_id chpid, int on) { struct channel_path *chp = chpid_to_chp(chpid); - struct chp_link link; - memset(&link, 0, sizeof(struct chp_link)); - link.chpid = chpid; /* Wait until previous actions have settled. */ css_wait_for_slow_path(); /* @@ -542,10 +539,10 @@ int chsc_chp_vary(struct chp_id chpid, int on) /* Try to update the channel path descritor. */ chsc_determine_base_channel_path_desc(chpid, &chp->desc); for_each_subchannel_staged(s390_subchannel_vary_chpid_on, - __s390_vary_chpid_on, &link); + __s390_vary_chpid_on, &chpid); } else for_each_subchannel_staged(s390_subchannel_vary_chpid_off, - NULL, &link); + NULL, &chpid); return 0; } diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index 155a82bcb9e5..4a1ff5c2eb88 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -68,8 +68,13 @@ struct schib { __u8 mda[4]; /* model dependent area */ } __attribute__ ((packed,aligned(4))); +/* + * When rescheduled, todo's with higher values will overwrite those + * with lower values. + */ enum sch_todo { SCH_TODO_NOTHING, + SCH_TODO_EVAL, SCH_TODO_UNREG, }; diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 92d7324acb1c..21908e67bf67 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -195,51 +195,6 @@ void css_sch_device_unregister(struct subchannel *sch) } EXPORT_SYMBOL_GPL(css_sch_device_unregister); -static void css_sch_todo(struct work_struct *work) -{ - struct subchannel *sch; - enum sch_todo todo; - - sch = container_of(work, struct subchannel, todo_work); - /* Find out todo. */ - spin_lock_irq(sch->lock); - todo = sch->todo; - CIO_MSG_EVENT(4, "sch_todo: sch=0.%x.%04x, todo=%d\n", sch->schid.ssid, - sch->schid.sch_no, todo); - sch->todo = SCH_TODO_NOTHING; - spin_unlock_irq(sch->lock); - /* Perform todo. */ - if (todo == SCH_TODO_UNREG) - css_sch_device_unregister(sch); - /* Release workqueue ref. */ - put_device(&sch->dev); -} - -/** - * css_sched_sch_todo - schedule a subchannel operation - * @sch: subchannel - * @todo: todo - * - * Schedule the operation identified by @todo to be performed on the slow path - * workqueue. Do nothing if another operation with higher priority is already - * scheduled. Needs to be called with subchannel lock held. - */ -void css_sched_sch_todo(struct subchannel *sch, enum sch_todo todo) -{ - CIO_MSG_EVENT(4, "sch_todo: sched sch=0.%x.%04x todo=%d\n", - sch->schid.ssid, sch->schid.sch_no, todo); - if (sch->todo >= todo) - return; - /* Get workqueue ref. */ - if (!get_device(&sch->dev)) - return; - sch->todo = todo; - if (!queue_work(cio_work_q, &sch->todo_work)) { - /* Already queued, release workqueue ref. */ - put_device(&sch->dev); - } -} - static void ssd_from_pmcw(struct chsc_ssd_info *ssd, struct pmcw *pmcw) { int i; @@ -466,6 +421,65 @@ static void css_evaluate_subchannel(struct subchannel_id schid, int slow) css_schedule_eval(schid); } +/** + * css_sched_sch_todo - schedule a subchannel operation + * @sch: subchannel + * @todo: todo + * + * Schedule the operation identified by @todo to be performed on the slow path + * workqueue. Do nothing if another operation with higher priority is already + * scheduled. Needs to be called with subchannel lock held. + */ +void css_sched_sch_todo(struct subchannel *sch, enum sch_todo todo) +{ + CIO_MSG_EVENT(4, "sch_todo: sched sch=0.%x.%04x todo=%d\n", + sch->schid.ssid, sch->schid.sch_no, todo); + if (sch->todo >= todo) + return; + /* Get workqueue ref. */ + if (!get_device(&sch->dev)) + return; + sch->todo = todo; + if (!queue_work(cio_work_q, &sch->todo_work)) { + /* Already queued, release workqueue ref. */ + put_device(&sch->dev); + } +} + +static void css_sch_todo(struct work_struct *work) +{ + struct subchannel *sch; + enum sch_todo todo; + int ret; + + sch = container_of(work, struct subchannel, todo_work); + /* Find out todo. */ + spin_lock_irq(sch->lock); + todo = sch->todo; + CIO_MSG_EVENT(4, "sch_todo: sch=0.%x.%04x, todo=%d\n", sch->schid.ssid, + sch->schid.sch_no, todo); + sch->todo = SCH_TODO_NOTHING; + spin_unlock_irq(sch->lock); + /* Perform todo. */ + switch (todo) { + case SCH_TODO_NOTHING: + break; + case SCH_TODO_EVAL: + ret = css_evaluate_known_subchannel(sch, 1); + if (ret == -EAGAIN) { + spin_lock_irq(sch->lock); + css_sched_sch_todo(sch, todo); + spin_unlock_irq(sch->lock); + } + break; + case SCH_TODO_UNREG: + css_sch_device_unregister(sch); + break; + } + /* Release workqueue ref. */ + put_device(&sch->dev); +} + static struct idset *slow_subchannel_set; static spinlock_t slow_subchannel_lock; static wait_queue_head_t css_eval_wq; diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index d734f4a0ecac..47269858ecb6 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1868,9 +1868,9 @@ static void __ccw_device_pm_restore(struct ccw_device *cdev) */ cdev->private->flags.resuming = 1; cdev->private->path_new_mask = LPM_ANYPATH; - css_schedule_eval(sch->schid); + css_sched_sch_todo(sch, SCH_TODO_EVAL); spin_unlock_irq(sch->lock); - css_complete_work(); + css_wait_for_slow_path(); /* cdev may have been moved to a different subchannel. */ sch = to_subchannel(cdev->dev.parent); diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 52c233fa2b12..1b853513c891 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -496,8 +496,26 @@ static void ccw_device_reset_path_events(struct ccw_device *cdev) cdev->private->pgid_reset_mask = 0; } -void -ccw_device_verify_done(struct ccw_device *cdev, int err) +static void create_fake_irb(struct irb *irb, int type) +{ + memset(irb, 0, sizeof(*irb)); + if (type == FAKE_CMD_IRB) { + struct cmd_scsw *scsw = &irb->scsw.cmd; + scsw->cc = 1; + scsw->fctl = SCSW_FCTL_START_FUNC; + scsw->actl = SCSW_ACTL_START_PEND; + scsw->stctl = SCSW_STCTL_STATUS_PEND; + } else if (type == FAKE_TM_IRB) { + struct tm_scsw *scsw = &irb->scsw.tm; + scsw->x = 1; + scsw->cc = 1; + scsw->fctl = SCSW_FCTL_START_FUNC; + scsw->actl = SCSW_ACTL_START_PEND; + scsw->stctl = SCSW_STCTL_STATUS_PEND; + } +} + +void ccw_device_verify_done(struct ccw_device *cdev, int err) { struct subchannel *sch; @@ -520,12 +538,8 @@ callback: ccw_device_done(cdev, DEV_STATE_ONLINE); /* Deliver fake irb to device driver, if needed. */ if (cdev->private->flags.fake_irb) { - memset(&cdev->private->irb, 0, sizeof(struct irb)); - cdev->private->irb.scsw.cmd.cc = 1; - cdev->private->irb.scsw.cmd.fctl = SCSW_FCTL_START_FUNC; - cdev->private->irb.scsw.cmd.actl = SCSW_ACTL_START_PEND; - cdev->private->irb.scsw.cmd.stctl = - SCSW_STCTL_STATUS_PEND; + create_fake_irb(&cdev->private->irb, + cdev->private->flags.fake_irb); cdev->private->flags.fake_irb = 0; if (cdev->handler) cdev->handler(cdev, cdev->private->intparm, diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index f98698d5735e..ec7fb6d3b479 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -198,7 +198,7 @@ int ccw_device_start_key(struct ccw_device *cdev, struct ccw1 *cpa, if (cdev->private->state == DEV_STATE_VERIFY) { /* Remember to fake irb when finished. */ if (!cdev->private->flags.fake_irb) { - cdev->private->flags.fake_irb = 1; + cdev->private->flags.fake_irb = FAKE_CMD_IRB; cdev->private->intparm = intparm; return 0; } else @@ -213,9 +213,9 @@ int ccw_device_start_key(struct ccw_device *cdev, struct ccw1 *cpa, ret = cio_set_options (sch, flags); if (ret) return ret; - /* Adjust requested path mask to excluded varied off paths. */ + /* Adjust requested path mask to exclude unusable paths. */ if (lpm) { - lpm &= sch->opm; + lpm &= sch->lpm; if (lpm == 0) return -EACCES; } @@ -605,11 +605,21 @@ int ccw_device_tm_start_key(struct ccw_device *cdev, struct tcw *tcw, sch = to_subchannel(cdev->dev.parent); if (!sch->schib.pmcw.ena) return -EINVAL; + if (cdev->private->state == DEV_STATE_VERIFY) { + /* Remember to fake irb when finished. */ + if (!cdev->private->flags.fake_irb) { + cdev->private->flags.fake_irb = FAKE_TM_IRB; + cdev->private->intparm = intparm; + return 0; + } else + /* There's already a fake I/O around. */ + return -EBUSY; + } if (cdev->private->state != DEV_STATE_ONLINE) return -EIO; - /* Adjust requested path mask to excluded varied off paths. */ + /* Adjust requested path mask to exclude unusable paths. */ if (lpm) { - lpm &= sch->opm; + lpm &= sch->lpm; if (lpm == 0) return -EACCES; } diff --git a/drivers/s390/cio/io_sch.h b/drivers/s390/cio/io_sch.h index 2ebb492a5c17..76253dfcc1be 100644 --- a/drivers/s390/cio/io_sch.h +++ b/drivers/s390/cio/io_sch.h @@ -111,6 +111,9 @@ enum cdev_todo { CDEV_TODO_UNREG_EVAL, }; +#define FAKE_CMD_IRB 1 +#define FAKE_TM_IRB 2 + struct ccw_device_private { struct ccw_device *cdev; struct subchannel *sch; @@ -138,7 +141,7 @@ struct ccw_device_private { unsigned int doverify:1; /* delayed path verification */ unsigned int donotify:1; /* call notify function */ unsigned int recog_done:1; /* dev. recog. complete */ - unsigned int fake_irb:1; /* deliver faked irb */ + unsigned int fake_irb:2; /* deliver faked irb */ unsigned int resuming:1; /* recognition while resume */ unsigned int pgroup:1; /* pathgroup is set up */ unsigned int mpath:1; /* multipathing is set up */ diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index ec94f049e995..96bbe9d12a79 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1552,6 +1552,8 @@ static void ap_reset(struct ap_device *ap_dev) rc = ap_init_queue(ap_dev->qid); if (rc == -ENODEV) ap_dev->unregistered = 1; + else + __ap_schedule_poll_timer(); } static int __ap_poll_device(struct ap_device *ap_dev, unsigned long *flags) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 94b1e356c02a..32574eef9394 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -126,6 +126,8 @@ extern struct cred init_cred; # define INIT_PERF_EVENTS(tsk) #endif +#define INIT_TASK_COMM "swapper" + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -162,7 +164,7 @@ extern struct cred init_cred; .group_leader = &tsk, \ RCU_INIT_POINTER(.real_cred, &init_cred), \ RCU_INIT_POINTER(.cred, &init_cred), \ - .comm = "swapper", \ + .comm = INIT_TASK_COMM, \ .thread = INIT_THREAD, \ .fs = &init_fs, \ .files = &init_files, \ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0e2b179bc7b3..1da999f5e746 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -623,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) static int irq_wait_for_interrupt(struct irqaction *action) { + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { @@ -632,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action) return 0; } schedule(); + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); return -1; } diff --git a/kernel/sched.c b/kernel/sched.c index 0e9344a71be3..d6b149ccf925 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> +#include <linux/init_task.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. It is not * interruptible. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) @@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits for completion of a specific task to be signaled. It is * interruptible. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_interruptible(struct completion *x) { @@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_interruptible_timeout(struct completion *x, @@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); * * This waits to be signaled for completion of a specific task. It can be * interrupted by a kill signal. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_killable(struct completion *x) { @@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable); * This waits for either a completion of a specific task to be * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_killable_timeout(struct completion *x, @@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); +#if defined(CONFIG_SMP) + sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); +#endif } /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5c9e67923b7c..a78ed2736ba7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } +static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +{ + long tg_weight; + + /* + * Use this CPU's actual weight instead of the last load_contribution + * to gain a more accurate current total weight. See + * update_cfs_rq_load_contribution(). + */ + tg_weight = atomic_read(&tg->load_weight); + tg_weight -= cfs_rq->load_contribution; + tg_weight += cfs_rq->load.weight; + + return tg_weight; +} + static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long load_weight, load, shares; + long tg_weight, load, shares; + tg_weight = calc_tg_weight(tg, cfs_rq); load = cfs_rq->load.weight; - load_weight = atomic_read(&tg->load_weight); - load_weight += load; - load_weight -= cfs_rq->load_contribution; - shares = (tg->shares * load); - if (load_weight) - shares /= load_weight; + if (tg_weight) + shares /= tg_weight; if (shares < MIN_SHARES) shares = MIN_SHARES; @@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. + * + * Calculate the effective load difference if @wl is added (subtracted) to @tg + * on this @cpu and results in a total addition (subtraction) of @wg to the + * total group weight. + * + * Given a runqueue weight distribution (rw_i) we can compute a shares + * distribution (s_i) using: + * + * s_i = rw_i / \Sum rw_j (1) + * + * Suppose we have 4 CPUs and our @tg is a direct child of the root group and + * has 7 equal weight tasks, distributed as below (rw_i), with the resulting + * shares distribution (s_i): + * + * rw_i = { 2, 4, 1, 0 } + * s_i = { 2/7, 4/7, 1/7, 0 } + * + * As per wake_affine() we're interested in the load of two CPUs (the CPU the + * task used to run on and the CPU the waker is running on), we need to + * compute the effect of waking a task on either CPU and, in case of a sync + * wakeup, compute the effect of the current task going to sleep. + * + * So for a change of @wl to the local @cpu with an overall group weight change + * of @wl we can compute the new shares distribution (s'_i) using: + * + * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) + * + * Suppose we're interested in CPUs 0 and 1, and want to compute the load + * differences in waking a task to CPU 0. The additional task changes the + * weight and shares distributions like: + * + * rw'_i = { 3, 4, 1, 0 } + * s'_i = { 3/8, 4/8, 1/8, 0 } + * + * We can then compute the difference in effective weight by using: + * + * dw_i = S * (s'_i - s_i) (3) + * + * Where 'S' is the group weight as seen by its parent. + * + * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) + * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - + * 4/7) times the weight of the group. */ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - if (!tg->parent) + if (!tg->parent) /* the trivial, non-cgroup case */ return wl; for_each_sched_entity(se) { - long lw, w; + long w, W; tg = se->my_q->tg; - w = se->my_q->load.weight; - /* use this cpu's instantaneous contribution */ - lw = atomic_read(&tg->load_weight); - lw -= se->my_q->load_contribution; - lw += w + wg; + /* + * W = @wg + \Sum rw_j + */ + W = wg + calc_tg_weight(tg, se->my_q); - wl += w; + /* + * w = rw_i + @wl + */ + w = se->my_q->load.weight + wl; - if (lw > 0 && wl < lw) - wl = (wl * tg->shares) / lw; + /* + * wl = S * s'_i; see (2) + */ + if (W > 0 && w < W) + wl = (w * tg->shares) / W; else wl = tg->shares; - /* zero point is MIN_SHARES */ + /* + * Per the above, wl is the new se->load.weight value; since + * those are clipped to [MIN_SHARES, ...) do so now. See + * calc_cfs_shares(). + */ if (wl < MIN_SHARES) wl = MIN_SHARES; + + /* + * wl = dw_i = S * (s'_i - s_i); see (3) + */ wl -= se->load.weight; + + /* + * Recursively apply this logic to all parent groups to compute + * the final effective load change on the root group. Since + * only the @tg group gets extra weight, all parent groups can + * only redistribute existing shares. @wl is the shift in shares + * resulting from this level per the above. + */ wg = 0; } @@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; - int i; + struct sched_group *sg; + int i, smt = 0; /* * If the task is going to be woken-up on this cpu and if it is @@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target) * Otherwise, iterate the domains and find an elegible idle cpu. */ rcu_read_lock(); +again: for_each_domain(target, sd) { - if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) - break; + if (!smt && (sd->flags & SD_SHARE_CPUPOWER)) + continue; - for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { - if (idle_cpu(i)) { - target = i; - break; + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) { + if (!smt) { + smt = 1; + goto again; } + break; } - /* - * Lets stop looking for an idle sibling when we reached - * the domain that spans the current cpu and prev_cpu. - */ - if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && - cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) - break; + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; + + for_each_cpu(i, sched_group_cpus(sg)) { + if (!idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + goto done; +next: + sg = sg->next; + } while (sg != sd->groups); } +done: rcu_read_unlock(); return target; @@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, } /** - * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu diff --git a/kernel/sched_features.h b/kernel/sched_features.h index efa0a7b75dde..84802245abd2 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1) SCHED_FEAT(TTWU_QUEUE, 1) SCHED_FEAT(FORCE_SD_OVERLAP, 0) +SCHED_FEAT(RT_RUNTIME_SHARE, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 056cbd2e2a27..583a1368afe6 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; + if (!sched_feat(RT_RUNTIME_SHARE)) + return more; + if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); more = do_balance_runtime(rt_rq); |