aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/s390/include/asm/pgtable.h8
-rw-r--r--arch/s390/kernel/ptrace.c30
-rw-r--r--arch/s390/kernel/setup.c2
-rw-r--r--arch/s390/kernel/signal.c8
-rw-r--r--arch/x86/include/asm/timer.h23
-rw-r--r--drivers/gpio/Makefile2
-rw-r--r--drivers/s390/cio/chsc.c7
-rw-r--r--drivers/s390/cio/cio.h5
-rw-r--r--drivers/s390/cio/css.c104
-rw-r--r--drivers/s390/cio/device.c4
-rw-r--r--drivers/s390/cio/device_fsm.c30
-rw-r--r--drivers/s390/cio/device_ops.c20
-rw-r--r--drivers/s390/cio/io_sch.h5
-rw-r--r--drivers/s390/crypto/ap_bus.c2
-rw-r--r--include/linux/init_task.h4
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/sched.c17
-rw-r--r--kernel/sched_fair.c159
-rw-r--r--kernel/sched_features.h1
-rw-r--r--kernel/sched_rt.c3
20 files changed, 313 insertions, 126 deletions
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 524d23b8610c..4f289ff0b7fe 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -599,10 +599,10 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
skey = page_get_storage_key(address);
bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
/* Clear page changed & referenced bit in the storage key */
- if (bits) {
- skey ^= bits;
- page_set_storage_key(address, skey, 1);
- }
+ if (bits & _PAGE_CHANGED)
+ page_set_storage_key(address, skey ^ bits, 1);
+ else if (bits)
+ page_reset_referenced(address);
/* Transfer page changed & referenced bit to guest bits in pgste */
pgste_val(pgste) |= bits << 48; /* RCP_GR_BIT & RCP_GC_BIT */
/* Get host changed & referenced bits from pgste */
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 450931a45b68..573bc29551ef 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -296,13 +296,6 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
((data & PSW_MASK_EA) && !(data & PSW_MASK_BA))))
/* Invalid psw mask. */
return -EINVAL;
- if (addr == (addr_t) &dummy->regs.psw.addr)
- /*
- * The debugger changed the instruction address,
- * reset system call restart, see signal.c:do_signal
- */
- task_thread_info(child)->system_call = 0;
-
*(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr) = data;
} else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) {
@@ -614,11 +607,6 @@ static int __poke_user_compat(struct task_struct *child,
/* Transfer 31 bit amode bit to psw mask. */
regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) |
(__u64)(tmp & PSW32_ADDR_AMODE);
- /*
- * The debugger changed the instruction address,
- * reset system call restart, see signal.c:do_signal
- */
- task_thread_info(child)->system_call = 0;
} else {
/* gpr 0-15 */
*(__u32*)((addr_t) &regs->psw + addr*2 + 4) = tmp;
@@ -905,6 +893,14 @@ static int s390_last_break_get(struct task_struct *target,
return 0;
}
+static int s390_last_break_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ return 0;
+}
+
#endif
static int s390_system_call_get(struct task_struct *target,
@@ -951,6 +947,7 @@ static const struct user_regset s390_regsets[] = {
.size = sizeof(long),
.align = sizeof(long),
.get = s390_last_break_get,
+ .set = s390_last_break_set,
},
#endif
[REGSET_SYSTEM_CALL] = {
@@ -1116,6 +1113,14 @@ static int s390_compat_last_break_get(struct task_struct *target,
return 0;
}
+static int s390_compat_last_break_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ return 0;
+}
+
static const struct user_regset s390_compat_regsets[] = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
@@ -1139,6 +1144,7 @@ static const struct user_regset s390_compat_regsets[] = {
.size = sizeof(long),
.align = sizeof(long),
.get = s390_compat_last_break_get,
+ .set = s390_compat_last_break_set,
},
[REGSET_SYSTEM_CALL] = {
.core_note_type = NT_S390_SYSTEM_CALL,
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index e58a462949b1..e54c4ff8abaa 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -579,7 +579,7 @@ static unsigned long __init find_crash_base(unsigned long crash_size,
*msg = "first memory chunk must be at least crashkernel size";
return 0;
}
- if (is_kdump_kernel() && (crash_size == OLDMEM_SIZE))
+ if (OLDMEM_BASE && crash_size == OLDMEM_SIZE)
return OLDMEM_BASE;
for (i = MEMORY_CHUNKS - 1; i >= 0; i--) {
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 05a85bc14c98..7f6f9f354545 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -460,9 +460,9 @@ void do_signal(struct pt_regs *regs)
regs->svc_code >> 16);
break;
}
- /* No longer in a system call */
- clear_thread_flag(TIF_SYSCALL);
}
+ /* No longer in a system call */
+ clear_thread_flag(TIF_SYSCALL);
if ((is_compat_task() ?
handle_signal32(signr, &ka, &info, oldset, regs) :
@@ -486,6 +486,7 @@ void do_signal(struct pt_regs *regs)
}
/* No handlers present - check for system call restart */
+ clear_thread_flag(TIF_SYSCALL);
if (current_thread_info()->system_call) {
regs->svc_code = current_thread_info()->system_call;
switch (regs->gprs[2]) {
@@ -500,9 +501,6 @@ void do_signal(struct pt_regs *regs)
regs->gprs[2] = regs->orig_gpr2;
set_thread_flag(TIF_SYSCALL);
break;
- default:
- clear_thread_flag(TIF_SYSCALL);
- break;
}
}
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index fa7b9176b76c..431793e5d484 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -32,6 +32,22 @@ extern int no_timer_check;
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ *
+ * In:
+ *
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * Although we may still have enough bits to store the value of ns,
+ * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
+ * leading to an incorrect result.
+ *
+ * To avoid this, we can decompose 'cycles' into quotient and remainder
+ * of division by SC. Then,
+ *
+ * ns = (quot * SC + rem) * cyc2ns_scale / SC
+ * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
+ *
+ * - sqazi@google.com
*/
DECLARE_PER_CPU(unsigned long, cyc2ns);
@@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
+ unsigned long long quot;
+ unsigned long long rem;
int cpu = smp_processor_id();
unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
- ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
+ quot = (cyc >> CYC2NS_SCALE_FACTOR);
+ rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
+ ns += quot * per_cpu(cyc2ns, cpu) +
+ ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
return ns;
}
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index dbcb0bcfd8da..4e018d6a7639 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_ARCH_DAVINCI) += gpio-davinci.o
obj-$(CONFIG_GPIO_EP93XX) += gpio-ep93xx.o
obj-$(CONFIG_GPIO_IT8761E) += gpio-it8761e.o
obj-$(CONFIG_GPIO_JANZ_TTL) += gpio-janz-ttl.o
-obj-$(CONFIG_MACH_KS8695) += gpio-ks8695.o
+obj-$(CONFIG_ARCH_KS8695) += gpio-ks8695.o
obj-$(CONFIG_GPIO_LANGWELL) += gpio-langwell.o
obj-$(CONFIG_ARCH_LPC32XX) += gpio-lpc32xx.o
obj-$(CONFIG_GPIO_MAX730X) += gpio-max730x.o
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index 75c3f1f8fd43..a84631a7391d 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -529,10 +529,7 @@ __s390_vary_chpid_on(struct subchannel_id schid, void *data)
int chsc_chp_vary(struct chp_id chpid, int on)
{
struct channel_path *chp = chpid_to_chp(chpid);
- struct chp_link link;
- memset(&link, 0, sizeof(struct chp_link));
- link.chpid = chpid;
/* Wait until previous actions have settled. */
css_wait_for_slow_path();
/*
@@ -542,10 +539,10 @@ int chsc_chp_vary(struct chp_id chpid, int on)
/* Try to update the channel path descritor. */
chsc_determine_base_channel_path_desc(chpid, &chp->desc);
for_each_subchannel_staged(s390_subchannel_vary_chpid_on,
- __s390_vary_chpid_on, &link);
+ __s390_vary_chpid_on, &chpid);
} else
for_each_subchannel_staged(s390_subchannel_vary_chpid_off,
- NULL, &link);
+ NULL, &chpid);
return 0;
}
diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h
index 155a82bcb9e5..4a1ff5c2eb88 100644
--- a/drivers/s390/cio/cio.h
+++ b/drivers/s390/cio/cio.h
@@ -68,8 +68,13 @@ struct schib {
__u8 mda[4]; /* model dependent area */
} __attribute__ ((packed,aligned(4)));
+/*
+ * When rescheduled, todo's with higher values will overwrite those
+ * with lower values.
+ */
enum sch_todo {
SCH_TODO_NOTHING,
+ SCH_TODO_EVAL,
SCH_TODO_UNREG,
};
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 92d7324acb1c..21908e67bf67 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -195,51 +195,6 @@ void css_sch_device_unregister(struct subchannel *sch)
}
EXPORT_SYMBOL_GPL(css_sch_device_unregister);
-static void css_sch_todo(struct work_struct *work)
-{
- struct subchannel *sch;
- enum sch_todo todo;
-
- sch = container_of(work, struct subchannel, todo_work);
- /* Find out todo. */
- spin_lock_irq(sch->lock);
- todo = sch->todo;
- CIO_MSG_EVENT(4, "sch_todo: sch=0.%x.%04x, todo=%d\n", sch->schid.ssid,
- sch->schid.sch_no, todo);
- sch->todo = SCH_TODO_NOTHING;
- spin_unlock_irq(sch->lock);
- /* Perform todo. */
- if (todo == SCH_TODO_UNREG)
- css_sch_device_unregister(sch);
- /* Release workqueue ref. */
- put_device(&sch->dev);
-}
-
-/**
- * css_sched_sch_todo - schedule a subchannel operation
- * @sch: subchannel
- * @todo: todo
- *
- * Schedule the operation identified by @todo to be performed on the slow path
- * workqueue. Do nothing if another operation with higher priority is already
- * scheduled. Needs to be called with subchannel lock held.
- */
-void css_sched_sch_todo(struct subchannel *sch, enum sch_todo todo)
-{
- CIO_MSG_EVENT(4, "sch_todo: sched sch=0.%x.%04x todo=%d\n",
- sch->schid.ssid, sch->schid.sch_no, todo);
- if (sch->todo >= todo)
- return;
- /* Get workqueue ref. */
- if (!get_device(&sch->dev))
- return;
- sch->todo = todo;
- if (!queue_work(cio_work_q, &sch->todo_work)) {
- /* Already queued, release workqueue ref. */
- put_device(&sch->dev);
- }
-}
-
static void ssd_from_pmcw(struct chsc_ssd_info *ssd, struct pmcw *pmcw)
{
int i;
@@ -466,6 +421,65 @@ static void css_evaluate_subchannel(struct subchannel_id schid, int slow)
css_schedule_eval(schid);
}
+/**
+ * css_sched_sch_todo - schedule a subchannel operation
+ * @sch: subchannel
+ * @todo: todo
+ *
+ * Schedule the operation identified by @todo to be performed on the slow path
+ * workqueue. Do nothing if another operation with higher priority is already
+ * scheduled. Needs to be called with subchannel lock held.
+ */
+void css_sched_sch_todo(struct subchannel *sch, enum sch_todo todo)
+{
+ CIO_MSG_EVENT(4, "sch_todo: sched sch=0.%x.%04x todo=%d\n",
+ sch->schid.ssid, sch->schid.sch_no, todo);
+ if (sch->todo >= todo)
+ return;
+ /* Get workqueue ref. */
+ if (!get_device(&sch->dev))
+ return;
+ sch->todo = todo;
+ if (!queue_work(cio_work_q, &sch->todo_work)) {
+ /* Already queued, release workqueue ref. */
+ put_device(&sch->dev);
+ }
+}
+
+static void css_sch_todo(struct work_struct *work)
+{
+ struct subchannel *sch;
+ enum sch_todo todo;
+ int ret;
+
+ sch = container_of(work, struct subchannel, todo_work);
+ /* Find out todo. */
+ spin_lock_irq(sch->lock);
+ todo = sch->todo;
+ CIO_MSG_EVENT(4, "sch_todo: sch=0.%x.%04x, todo=%d\n", sch->schid.ssid,
+ sch->schid.sch_no, todo);
+ sch->todo = SCH_TODO_NOTHING;
+ spin_unlock_irq(sch->lock);
+ /* Perform todo. */
+ switch (todo) {
+ case SCH_TODO_NOTHING:
+ break;
+ case SCH_TODO_EVAL:
+ ret = css_evaluate_known_subchannel(sch, 1);
+ if (ret == -EAGAIN) {
+ spin_lock_irq(sch->lock);
+ css_sched_sch_todo(sch, todo);
+ spin_unlock_irq(sch->lock);
+ }
+ break;
+ case SCH_TODO_UNREG:
+ css_sch_device_unregister(sch);
+ break;
+ }
+ /* Release workqueue ref. */
+ put_device(&sch->dev);
+}
+
static struct idset *slow_subchannel_set;
static spinlock_t slow_subchannel_lock;
static wait_queue_head_t css_eval_wq;
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index d734f4a0ecac..47269858ecb6 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -1868,9 +1868,9 @@ static void __ccw_device_pm_restore(struct ccw_device *cdev)
*/
cdev->private->flags.resuming = 1;
cdev->private->path_new_mask = LPM_ANYPATH;
- css_schedule_eval(sch->schid);
+ css_sched_sch_todo(sch, SCH_TODO_EVAL);
spin_unlock_irq(sch->lock);
- css_complete_work();
+ css_wait_for_slow_path();
/* cdev may have been moved to a different subchannel. */
sch = to_subchannel(cdev->dev.parent);
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index 52c233fa2b12..1b853513c891 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -496,8 +496,26 @@ static void ccw_device_reset_path_events(struct ccw_device *cdev)
cdev->private->pgid_reset_mask = 0;
}
-void
-ccw_device_verify_done(struct ccw_device *cdev, int err)
+static void create_fake_irb(struct irb *irb, int type)
+{
+ memset(irb, 0, sizeof(*irb));
+ if (type == FAKE_CMD_IRB) {
+ struct cmd_scsw *scsw = &irb->scsw.cmd;
+ scsw->cc = 1;
+ scsw->fctl = SCSW_FCTL_START_FUNC;
+ scsw->actl = SCSW_ACTL_START_PEND;
+ scsw->stctl = SCSW_STCTL_STATUS_PEND;
+ } else if (type == FAKE_TM_IRB) {
+ struct tm_scsw *scsw = &irb->scsw.tm;
+ scsw->x = 1;
+ scsw->cc = 1;
+ scsw->fctl = SCSW_FCTL_START_FUNC;
+ scsw->actl = SCSW_ACTL_START_PEND;
+ scsw->stctl = SCSW_STCTL_STATUS_PEND;
+ }
+}
+
+void ccw_device_verify_done(struct ccw_device *cdev, int err)
{
struct subchannel *sch;
@@ -520,12 +538,8 @@ callback:
ccw_device_done(cdev, DEV_STATE_ONLINE);
/* Deliver fake irb to device driver, if needed. */
if (cdev->private->flags.fake_irb) {
- memset(&cdev->private->irb, 0, sizeof(struct irb));
- cdev->private->irb.scsw.cmd.cc = 1;
- cdev->private->irb.scsw.cmd.fctl = SCSW_FCTL_START_FUNC;
- cdev->private->irb.scsw.cmd.actl = SCSW_ACTL_START_PEND;
- cdev->private->irb.scsw.cmd.stctl =
- SCSW_STCTL_STATUS_PEND;
+ create_fake_irb(&cdev->private->irb,
+ cdev->private->flags.fake_irb);
cdev->private->flags.fake_irb = 0;
if (cdev->handler)
cdev->handler(cdev, cdev->private->intparm,
diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c
index f98698d5735e..ec7fb6d3b479 100644
--- a/drivers/s390/cio/device_ops.c
+++ b/drivers/s390/cio/device_ops.c
@@ -198,7 +198,7 @@ int ccw_device_start_key(struct ccw_device *cdev, struct ccw1 *cpa,
if (cdev->private->state == DEV_STATE_VERIFY) {
/* Remember to fake irb when finished. */
if (!cdev->private->flags.fake_irb) {
- cdev->private->flags.fake_irb = 1;
+ cdev->private->flags.fake_irb = FAKE_CMD_IRB;
cdev->private->intparm = intparm;
return 0;
} else
@@ -213,9 +213,9 @@ int ccw_device_start_key(struct ccw_device *cdev, struct ccw1 *cpa,
ret = cio_set_options (sch, flags);
if (ret)
return ret;
- /* Adjust requested path mask to excluded varied off paths. */
+ /* Adjust requested path mask to exclude unusable paths. */
if (lpm) {
- lpm &= sch->opm;
+ lpm &= sch->lpm;
if (lpm == 0)
return -EACCES;
}
@@ -605,11 +605,21 @@ int ccw_device_tm_start_key(struct ccw_device *cdev, struct tcw *tcw,
sch = to_subchannel(cdev->dev.parent);
if (!sch->schib.pmcw.ena)
return -EINVAL;
+ if (cdev->private->state == DEV_STATE_VERIFY) {
+ /* Remember to fake irb when finished. */
+ if (!cdev->private->flags.fake_irb) {
+ cdev->private->flags.fake_irb = FAKE_TM_IRB;
+ cdev->private->intparm = intparm;
+ return 0;
+ } else
+ /* There's already a fake I/O around. */
+ return -EBUSY;
+ }
if (cdev->private->state != DEV_STATE_ONLINE)
return -EIO;
- /* Adjust requested path mask to excluded varied off paths. */
+ /* Adjust requested path mask to exclude unusable paths. */
if (lpm) {
- lpm &= sch->opm;
+ lpm &= sch->lpm;
if (lpm == 0)
return -EACCES;
}
diff --git a/drivers/s390/cio/io_sch.h b/drivers/s390/cio/io_sch.h
index 2ebb492a5c17..76253dfcc1be 100644
--- a/drivers/s390/cio/io_sch.h
+++ b/drivers/s390/cio/io_sch.h
@@ -111,6 +111,9 @@ enum cdev_todo {
CDEV_TODO_UNREG_EVAL,
};
+#define FAKE_CMD_IRB 1
+#define FAKE_TM_IRB 2
+
struct ccw_device_private {
struct ccw_device *cdev;
struct subchannel *sch;
@@ -138,7 +141,7 @@ struct ccw_device_private {
unsigned int doverify:1; /* delayed path verification */
unsigned int donotify:1; /* call notify function */
unsigned int recog_done:1; /* dev. recog. complete */
- unsigned int fake_irb:1; /* deliver faked irb */
+ unsigned int fake_irb:2; /* deliver faked irb */
unsigned int resuming:1; /* recognition while resume */
unsigned int pgroup:1; /* pathgroup is set up */
unsigned int mpath:1; /* multipathing is set up */
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index ec94f049e995..96bbe9d12a79 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -1552,6 +1552,8 @@ static void ap_reset(struct ap_device *ap_dev)
rc = ap_init_queue(ap_dev->qid);
if (rc == -ENODEV)
ap_dev->unregistered = 1;
+ else
+ __ap_schedule_poll_timer();
}
static int __ap_poll_device(struct ap_device *ap_dev, unsigned long *flags)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 94b1e356c02a..32574eef9394 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -126,6 +126,8 @@ extern struct cred init_cred;
# define INIT_PERF_EVENTS(tsk)
#endif
+#define INIT_TASK_COMM "swapper"
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -162,7 +164,7 @@ extern struct cred init_cred;
.group_leader = &tsk, \
RCU_INIT_POINTER(.real_cred, &init_cred), \
RCU_INIT_POINTER(.cred, &init_cred), \
- .comm = "swapper", \
+ .comm = INIT_TASK_COMM, \
.thread = INIT_THREAD, \
.fs = &init_fs, \
.files = &init_files, \
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0e2b179bc7b3..1da999f5e746 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -623,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
static int irq_wait_for_interrupt(struct irqaction *action)
{
+ set_current_state(TASK_INTERRUPTIBLE);
+
while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
if (test_and_clear_bit(IRQTF_RUNTHREAD,
&action->thread_flags)) {
@@ -632,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)
return 0;
}
schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
}
+ __set_current_state(TASK_RUNNING);
return -1;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e9344a71be3..d6b149ccf925 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
+#include <linux/init_task.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
*
* This waits for completion of a specific task to be signaled. It is
* interruptible.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_interruptible(struct completion *x)
{
@@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
*
* This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_killable(struct completion *x)
{
@@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
* This waits for either a completion of a specific task to be
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_killable_timeout(struct completion *x,
@@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
}
/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c9e67923b7c..a78ed2736ba7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
list_del_leaf_cfs_rq(cfs_rq);
}
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+ long tg_weight;
+
+ /*
+ * Use this CPU's actual weight instead of the last load_contribution
+ * to gain a more accurate current total weight. See
+ * update_cfs_rq_load_contribution().
+ */
+ tg_weight = atomic_read(&tg->load_weight);
+ tg_weight -= cfs_rq->load_contribution;
+ tg_weight += cfs_rq->load.weight;
+
+ return tg_weight;
+}
+
static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long load_weight, load, shares;
+ long tg_weight, load, shares;
+ tg_weight = calc_tg_weight(tg, cfs_rq);
load = cfs_rq->load.weight;
- load_weight = atomic_read(&tg->load_weight);
- load_weight += load;
- load_weight -= cfs_rq->load_contribution;
-
shares = (tg->shares * load);
- if (load_weight)
- shares /= load_weight;
+ if (tg_weight)
+ shares /= tg_weight;
if (shares < MIN_SHARES)
shares = MIN_SHARES;
@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ * s_i = rw_i / \Sum rw_j (1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ * rw_i = { 2, 4, 1, 0 }
+ * s_i = { 2/7, 4/7, 1/7, 0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wl we can compute the new shares distribution (s'_i) using:
+ *
+ * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ * rw'_i = { 3, 4, 1, 0 }
+ * s'_i = { 3/8, 4/8, 1/8, 0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ * dw_i = S * (s'_i - s_i) (3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
*/
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
- if (!tg->parent)
+ if (!tg->parent) /* the trivial, non-cgroup case */
return wl;
for_each_sched_entity(se) {
- long lw, w;
+ long w, W;
tg = se->my_q->tg;
- w = se->my_q->load.weight;
- /* use this cpu's instantaneous contribution */
- lw = atomic_read(&tg->load_weight);
- lw -= se->my_q->load_contribution;
- lw += w + wg;
+ /*
+ * W = @wg + \Sum rw_j
+ */
+ W = wg + calc_tg_weight(tg, se->my_q);
- wl += w;
+ /*
+ * w = rw_i + @wl
+ */
+ w = se->my_q->load.weight + wl;
- if (lw > 0 && wl < lw)
- wl = (wl * tg->shares) / lw;
+ /*
+ * wl = S * s'_i; see (2)
+ */
+ if (W > 0 && w < W)
+ wl = (w * tg->shares) / W;
else
wl = tg->shares;
- /* zero point is MIN_SHARES */
+ /*
+ * Per the above, wl is the new se->load.weight value; since
+ * those are clipped to [MIN_SHARES, ...) do so now. See
+ * calc_cfs_shares().
+ */
if (wl < MIN_SHARES)
wl = MIN_SHARES;
+
+ /*
+ * wl = dw_i = S * (s'_i - s_i); see (3)
+ */
wl -= se->load.weight;
+
+ /*
+ * Recursively apply this logic to all parent groups to compute
+ * the final effective load change on the root group. Since
+ * only the @tg group gets extra weight, all parent groups can
+ * only redistribute existing shares. @wl is the shift in shares
+ * resulting from this level per the above.
+ */
wg = 0;
}
@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- int i;
+ struct sched_group *sg;
+ int i, smt = 0;
/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
rcu_read_lock();
+again:
for_each_domain(target, sd) {
- if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
- break;
+ if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
+ continue;
- for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
- target = i;
- break;
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
+ if (!smt) {
+ smt = 1;
+ goto again;
}
+ break;
}
- /*
- * Lets stop looking for an idle sibling when we reached
- * the domain that spans the current cpu and prev_cpu.
- */
- if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
- cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
- break;
+ sg = sd->groups;
+ do {
+ if (!cpumask_intersects(sched_group_cpus(sg),
+ tsk_cpus_allowed(p)))
+ goto next;
+
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (!idle_cpu(i))
+ goto next;
+ }
+
+ target = cpumask_first_and(sched_group_cpus(sg),
+ tsk_cpus_allowed(p));
+ goto done;
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
}
+done:
rcu_read_unlock();
return target;
@@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
}
/**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index efa0a7b75dde..84802245abd2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
SCHED_FEAT(TTWU_QUEUE, 1)
SCHED_FEAT(FORCE_SD_OVERLAP, 0)
+SCHED_FEAT(RT_RUNTIME_SHARE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 056cbd2e2a27..583a1368afe6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
{
int more = 0;
+ if (!sched_feat(RT_RUNTIME_SHARE))
+ return more;
+
if (rt_rq->rt_time > rt_rq->rt_runtime) {
raw_spin_unlock(&rt_rq->rt_runtime_lock);
more = do_balance_runtime(rt_rq);