From 6d877e6b85691e0b2b22e90aeb9b86c3dafcfc6b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 19 Oct 2012 15:01:46 -0400 Subject: xen/hvm: If we fail to fetch an HVM parameter print out which flag it is. Makes it easier to troubleshoot in the field. Acked-by: Ian Campbell [v1: Use macro per Ian's suggestion] Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/hvm.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/xen/hvm.h b/include/xen/hvm.h index b193fa2f9fdd..13e43e41637d 100644 --- a/include/xen/hvm.h +++ b/include/xen/hvm.h @@ -5,6 +5,36 @@ #include #include +static const char *param_name(int op) +{ +#define PARAM(x) [HVM_PARAM_##x] = #x + static const char *const names[] = { + PARAM(CALLBACK_IRQ), + PARAM(STORE_PFN), + PARAM(STORE_EVTCHN), + PARAM(PAE_ENABLED), + PARAM(IOREQ_PFN), + PARAM(BUFIOREQ_PFN), + PARAM(TIMER_MODE), + PARAM(HPET_ENABLED), + PARAM(IDENT_PT), + PARAM(DM_DOMAIN), + PARAM(ACPI_S_STATE), + PARAM(VM86_TSS), + PARAM(VPT_ALIGN), + PARAM(CONSOLE_PFN), + PARAM(CONSOLE_EVTCHN), + }; +#undef PARAM + + if (op >= ARRAY_SIZE(names)) + return "unknown"; + + if (!names[op]) + return "reserved"; + + return names[op]; +} static inline int hvm_get_parameter(int idx, uint64_t *value) { struct xen_hvm_param xhv; @@ -14,8 +44,8 @@ static inline int hvm_get_parameter(int idx, uint64_t *value) xhv.index = idx; r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv); if (r < 0) { - printk(KERN_ERR "Cannot get hvm parameter %d: %d!\n", - idx, r); + printk(KERN_ERR "Cannot get hvm parameter %s (%d): %d!\n", + param_name(idx), idx, r); return r; } *value = xhv.value; -- cgit v1.2.3-59-g8ed1b From 0133370f93eae5ed3c0f16d9da2b7add7dda6076 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 2 Nov 2012 13:44:46 +0530 Subject: drivers: bus: ocp2scp: add pdata support ocp2scp was not having pdata support which makes *musb* fail for non-dt boot in OMAP platform. 
The pdata will have information about the devices that are connected to ocp2scp. The ocp2scp driver will now make use of this information to create the devices that are attached to ocp2scp. This is needed to fix the MUSB regression caused by commit c9e4412a (arm: omap: phy: remove unused functions from omap-phy-internal.c) Signed-off-by: Kishon Vijay Abraham I Acked-by: Felipe Balbi [tony@atomide.com: updated comments for regression info] Signed-off-by: Tony Lindgren --- drivers/bus/omap-ocp2scp.c | 68 ++++++++++++++++++++++++++++-- include/linux/platform_data/omap_ocp2scp.h | 31 ++++++++++++++ 2 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 include/linux/platform_data/omap_ocp2scp.h (limited to 'include') diff --git a/drivers/bus/omap-ocp2scp.c b/drivers/bus/omap-ocp2scp.c index ff63560b8467..0c48b0e05ed6 100644 --- a/drivers/bus/omap-ocp2scp.c +++ b/drivers/bus/omap-ocp2scp.c @@ -22,6 +22,26 @@ #include #include #include +#include + +/** + * _count_resources - count for the number of resources + * @res: struct resource * + * + * Count and return the number of resources populated for the device that is + * connected to ocp2scp. 
+ */ +static unsigned _count_resources(struct resource *res) +{ + int cnt = 0; + + while (res->start != res->end) { + cnt++; + res++; + } + + return cnt; +} static int ocp2scp_remove_devices(struct device *dev, void *c) { @@ -34,20 +54,62 @@ static int ocp2scp_remove_devices(struct device *dev, void *c) static int __devinit omap_ocp2scp_probe(struct platform_device *pdev) { - int ret; - struct device_node *np = pdev->dev.of_node; + int ret; + unsigned res_cnt, i; + struct device_node *np = pdev->dev.of_node; + struct platform_device *pdev_child; + struct omap_ocp2scp_platform_data *pdata = pdev->dev.platform_data; + struct omap_ocp2scp_dev *dev; if (np) { ret = of_platform_populate(np, NULL, NULL, &pdev->dev); if (ret) { - dev_err(&pdev->dev, "failed to add resources for ocp2scp child\n"); + dev_err(&pdev->dev, + "failed to add resources for ocp2scp child\n"); goto err0; } + } else if (pdata) { + for (i = 0, dev = *pdata->devices; i < pdata->dev_cnt; i++, + dev++) { + res_cnt = _count_resources(dev->res); + + pdev_child = platform_device_alloc(dev->drv_name, + PLATFORM_DEVID_AUTO); + if (!pdev_child) { + dev_err(&pdev->dev, + "failed to allocate mem for ocp2scp child\n"); + goto err0; + } + + ret = platform_device_add_resources(pdev_child, + dev->res, res_cnt); + if (ret) { + dev_err(&pdev->dev, + "failed to add resources for ocp2scp child\n"); + goto err1; + } + + pdev_child->dev.parent = &pdev->dev; + + ret = platform_device_add(pdev_child); + if (ret) { + dev_err(&pdev->dev, + "failed to register ocp2scp child device\n"); + goto err1; + } + } + } else { + dev_err(&pdev->dev, "OCP2SCP initialized without plat data\n"); + return -EINVAL; } + pm_runtime_enable(&pdev->dev); return 0; +err1: + platform_device_put(pdev_child); + err0: device_for_each_child(&pdev->dev, NULL, ocp2scp_remove_devices); diff --git a/include/linux/platform_data/omap_ocp2scp.h b/include/linux/platform_data/omap_ocp2scp.h new file mode 100644 index 000000000000..5c6c3939355f --- /dev/null +++ 
b/include/linux/platform_data/omap_ocp2scp.h @@ -0,0 +1,31 @@ +/* + * omap_ocp2scp.h -- ocp2scp header file + * + * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Author: Kishon Vijay Abraham I + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __DRIVERS_OMAP_OCP2SCP_H +#define __DRIVERS_OMAP_OCP2SCP_H + +struct omap_ocp2scp_dev { + const char *drv_name; + struct resource *res; +}; + +struct omap_ocp2scp_platform_data { + int dev_cnt; + struct omap_ocp2scp_dev **devices; +}; +#endif /* __DRIVERS_OMAP_OCP2SCP_H */ -- cgit v1.2.3-59-g8ed1b From d676188e44680c2f2eb114a24b3b32e56165f079 Mon Sep 17 00:00:00 2001 From: Seungwon Jeon Date: Fri, 28 Sep 2012 14:21:59 +0900 Subject: mmc: dw_mmc: convert the variable type of irq Even though platform_get_irq returns error, 'host->irq' always has an unsigned value. Less-than-zero comparison of an unsigned value is never true. Type of 'unsigned int' will be changed for 'int'. 
Signed-off-by: Seungwon Jeon Acked-by: Will Newton Signed-off-by: Chris Ball --- include/linux/mmc/dw_mmc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index 7c6a1139d8fa..31416760723c 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -186,7 +186,7 @@ struct dw_mci { struct regulator *vmmc; /* Power regulator */ unsigned long irq_flags; /* IRQ flags */ - unsigned int irq; + int irq; }; /* DMA ops for Internal/External DMAC interface */ -- cgit v1.2.3-59-g8ed1b From 63ef5d8c28b2a944f104d854254941e7375c85a3 Mon Sep 17 00:00:00 2001 From: Jerry Huang Date: Thu, 25 Oct 2012 13:47:19 +0800 Subject: mmc: sdhci-of-esdhc: disable CMD23 for some Freescale SoCs CMD23 causes lots of errors in the kernel on some Freescale SoCs (P1020, P1021, P1022, P1024, P1025 and P4080) when an MMC card is used, because these controllers do not support CMD23, even on the SoCs that declare CMD23 as supported. Therefore, we'll not use CMD23. 
Signed-off-by: Jerry Huang Signed-off-by: Shaohui Xie Acked-by: Anton Vorontsov Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci-of-esdhc.c | 11 +++++++++++ drivers/mmc/host/sdhci-pltfm.c | 7 +++++++ drivers/mmc/host/sdhci.c | 3 +++ drivers/mmc/host/sdhci.h | 1 + include/linux/mmc/sdhci.h | 1 + 5 files changed, 23 insertions(+) (limited to 'include') diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index ae5fcbfa1eef..63d219f57cae 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -169,6 +169,16 @@ static void esdhc_of_resume(struct sdhci_host *host) } #endif +static void esdhc_of_platform_init(struct sdhci_host *host) +{ + u32 vvn; + + vvn = in_be32(host->ioaddr + SDHCI_SLOT_INT_STATUS); + vvn = (vvn & SDHCI_VENDOR_VER_MASK) >> SDHCI_VENDOR_VER_SHIFT; + if (vvn == VENDOR_V_22) + host->quirks2 |= SDHCI_QUIRK2_HOST_NO_CMD23; +} + static struct sdhci_ops sdhci_esdhc_ops = { .read_l = esdhc_readl, .read_w = esdhc_readw, @@ -180,6 +190,7 @@ static struct sdhci_ops sdhci_esdhc_ops = { .enable_dma = esdhc_of_enable_dma, .get_max_clock = esdhc_of_get_max_clock, .get_min_clock = esdhc_of_get_min_clock, + .platform_init = esdhc_of_platform_init, #ifdef CONFIG_PM .platform_suspend = esdhc_of_suspend, .platform_resume = esdhc_of_resume, diff --git a/drivers/mmc/host/sdhci-pltfm.c b/drivers/mmc/host/sdhci-pltfm.c index 65551a9709cc..27164457f861 100644 --- a/drivers/mmc/host/sdhci-pltfm.c +++ b/drivers/mmc/host/sdhci-pltfm.c @@ -150,6 +150,13 @@ struct sdhci_host *sdhci_pltfm_init(struct platform_device *pdev, goto err_remap; } + /* + * Some platforms need to probe the controller to be able to + * determine which caps should be used. 
+ */ + if (host->ops && host->ops->platform_init) + host->ops->platform_init(host); + platform_set_drvdata(pdev, host); return host; diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 7922adb42386..f05a37747b3d 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2837,6 +2837,9 @@ int sdhci_add_host(struct sdhci_host *host) if (!(host->quirks & SDHCI_QUIRK_FORCE_1_BIT_DATA)) mmc->caps |= MMC_CAP_4_BIT_DATA; + if (host->quirks2 & SDHCI_QUIRK2_HOST_NO_CMD23) + mmc->caps &= ~MMC_CAP_CMD23; + if (caps[0] & SDHCI_CAN_DO_HISPD) mmc->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED; diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 97653ea8942b..71a4a7ed46c5 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -278,6 +278,7 @@ struct sdhci_ops { void (*hw_reset)(struct sdhci_host *host); void (*platform_suspend)(struct sdhci_host *host); void (*platform_resume)(struct sdhci_host *host); + void (*platform_init)(struct sdhci_host *host); }; #ifdef CONFIG_MMC_SDHCI_IO_ACCESSORS diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index fa8529a859b8..1edcb4dad8c4 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -91,6 +91,7 @@ struct sdhci_host { unsigned int quirks2; /* More deviations from spec. */ #define SDHCI_QUIRK2_HOST_OFF_CARD_ON (1<<0) +#define SDHCI_QUIRK2_HOST_NO_CMD23 (1<<1) int irq; /* Device IRQ */ void __iomem *ioaddr; /* Mapped address */ -- cgit v1.2.3-59-g8ed1b From 8e2b36ea6e3abc613cbbdb41692fbd2f9ee18996 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 6 Nov 2012 22:55:31 +0100 Subject: mmc: dw_mmc: constify dw_mci_idmac_ops in exynos back-end The of_device_id match data is now marked as const and must not be modified. This changes the dw_mmc to mark all pointers passing the dw_mci_drv_data or dw_mci_dma_ops structures as const, and also marks the static definitions as const. 
drivers/mmc/host/dw_mmc-exynos.c: In function 'dw_mci_exynos_probe': drivers/mmc/host/dw_mmc-exynos.c:234:11: warning: assignment discards 'const' qualifier from pointer target type [enabled by default] Signed-off-by: Arnd Bergmann Cc: Thomas Abraham Cc: Will Newton Signed-off-by: Chris Ball --- drivers/mmc/host/dw_mmc-exynos.c | 6 +++--- drivers/mmc/host/dw_mmc-pltfm.c | 2 +- drivers/mmc/host/dw_mmc-pltfm.h | 2 +- drivers/mmc/host/dw_mmc.c | 2 +- include/linux/mmc/dw_mmc.h | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c index 0147ac3aad59..4d50da618166 100644 --- a/drivers/mmc/host/dw_mmc-exynos.c +++ b/drivers/mmc/host/dw_mmc-exynos.c @@ -208,7 +208,7 @@ static unsigned long exynos5250_dwmmc_caps[4] = { MMC_CAP_CMD23, }; -static struct dw_mci_drv_data exynos5250_drv_data = { +static const struct dw_mci_drv_data exynos5250_drv_data = { .caps = exynos5250_dwmmc_caps, .init = dw_mci_exynos_priv_init, .setup_clock = dw_mci_exynos_setup_clock, @@ -220,14 +220,14 @@ static struct dw_mci_drv_data exynos5250_drv_data = { static const struct of_device_id dw_mci_exynos_match[] = { { .compatible = "samsung,exynos5250-dw-mshc", - .data = (void *)&exynos5250_drv_data, }, + .data = &exynos5250_drv_data, }, {}, }; MODULE_DEVICE_TABLE(of, dw_mci_exynos_match); int dw_mci_exynos_probe(struct platform_device *pdev) { - struct dw_mci_drv_data *drv_data; + const struct dw_mci_drv_data *drv_data; const struct of_device_id *match; match = of_match_node(dw_mci_exynos_match, pdev->dev.of_node); diff --git a/drivers/mmc/host/dw_mmc-pltfm.c b/drivers/mmc/host/dw_mmc-pltfm.c index e5957211b171..917936bee5d5 100644 --- a/drivers/mmc/host/dw_mmc-pltfm.c +++ b/drivers/mmc/host/dw_mmc-pltfm.c @@ -24,7 +24,7 @@ #include "dw_mmc.h" int dw_mci_pltfm_register(struct platform_device *pdev, - struct dw_mci_drv_data *drv_data) + const struct dw_mci_drv_data *drv_data) { struct dw_mci 
*host; struct resource *regs; diff --git a/drivers/mmc/host/dw_mmc-pltfm.h b/drivers/mmc/host/dw_mmc-pltfm.h index 301f24541fc2..2ac37b81de4d 100644 --- a/drivers/mmc/host/dw_mmc-pltfm.h +++ b/drivers/mmc/host/dw_mmc-pltfm.h @@ -13,7 +13,7 @@ #define _DW_MMC_PLTFM_H_ extern int dw_mci_pltfm_register(struct platform_device *pdev, - struct dw_mci_drv_data *drv_data); + const struct dw_mci_drv_data *drv_data); extern int __devexit dw_mci_pltfm_remove(struct platform_device *pdev); extern const struct dev_pm_ops dw_mci_pltfm_pmops; diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index b087f66e30c4..c0667c8af2bd 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -435,7 +435,7 @@ static int dw_mci_idmac_init(struct dw_mci *host) return 0; } -static struct dw_mci_dma_ops dw_mci_idmac_ops = { +static const struct dw_mci_dma_ops dw_mci_idmac_ops = { .init = dw_mci_idmac_init, .start = dw_mci_idmac_start_dma, .stop = dw_mci_idmac_stop_dma, diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index 31416760723c..96531664a061 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -137,7 +137,7 @@ struct dw_mci { dma_addr_t sg_dma; void *sg_cpu; - struct dw_mci_dma_ops *dma_ops; + const struct dw_mci_dma_ops *dma_ops; #ifdef CONFIG_MMC_DW_IDMAC unsigned int ring_size; #else @@ -162,7 +162,7 @@ struct dw_mci { u16 data_offset; struct device *dev; struct dw_mci_board *pdata; - struct dw_mci_drv_data *drv_data; + const struct dw_mci_drv_data *drv_data; void *priv; struct clk *biu_clk; struct clk *ciu_clk; -- cgit v1.2.3-59-g8ed1b From a80a6b85b428e6ce12a8363bb1f08d44c50f3252 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 8 Nov 2012 15:53:35 -0800 Subject: revert "epoll: support for disabling items, and a self-test app" Revert commit 03a7beb55b9f ("epoll: support for disabling items, and a self-test app") pending resolution of the issues identified by Michael Kerrisk, copied below. 
We'll revisit this for 3.8. : I've taken a look at this patch as it currently stands in 3.7-rc1, and : done a bit of testing. (By the way, the test program : tools/testing/selftests/epoll/test_epoll.c does not compile...) : : There are one or two places where the behavior seems a little strange, : so I have a question or two at the end of this mail. But other than : that, I want to check my understanding so that the interface can be : correctly documented. : : Just to go through my understanding, the problem is the following : scenario in a multithreaded application: : : 1. Multiple threads are performing epoll_wait() operations, : and maintaining a user-space cache that contains information : corresponding to each file descriptor being monitored by : epoll_wait(). : : 2. At some point, a thread wants to delete (EPOLL_CTL_DEL) : a file descriptor from the epoll interest list, and : delete the corresponding record from the user-space cache. : : 3. The problem with (2) is that some other thread may have : previously done an epoll_wait() that retrieved information : about the fd in question, and may be in the middle of using : information in the cache that relates to that fd. Thus, : there is a potential race. : : 4. The race can't be solved purely in user space, because doing : so would require applying a mutex across the epoll_wait() : call, which would of course blow thread concurrency. : : Right? : : Your solution is the EPOLL_CTL_DISABLE operation. I want to : confirm my understanding about how to use this flag, since : the description that has accompanied the patches so far : has been a bit sparse. : : 0. In the scenario you're concerned about, deleting a file : descriptor means (safely) doing the following: : (a) Deleting the file descriptor from the epoll interest list : using EPOLL_CTL_DEL : (b) Deleting the corresponding record in the user-space cache : : 1. It's only meaningful to use this EPOLL_CTL_DISABLE in : conjunction with EPOLLONESHOT. : : 2. 
Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in : conjunction is a logical error. : : 3. The correct way to code multithreaded applications using : EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows: : : a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should : specify EPOLLONESHOT. : : b. When a thread wants to delete a file descriptor, it : should do the following: : : [1] Call epoll_ctl(EPOLL_CTL_DISABLE) : [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE) : was zero, then the file descriptor can be safely : deleted by the thread that made this call. : [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY, : then the descriptor is in use. In this case, the calling : thread should set a flag in the user-space cache to : indicate that the thread that is using the descriptor : should perform the deletion operation. : : Is all of the above correct? : : The implementation depends on checking whether : (events & ~EP_PRIVATE_BITS) == 0 : This relies on the fact that EPOLL_CTL_ADD and EPOLL_CTL_MOD always : set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT : causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be : cleared. : : A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE : is only useful in conjunction with EPOLLONESHOT. However, as things : stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does : not have EPOLLONESHOT set in 'events'. This results in the following : (slightly surprising) behavior: : : (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0 : (the indicator that the file descriptor can be safely deleted). : (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY. : : This doesn't seem particularly useful, and in fact is probably an : indication that the user made a logic error: they should only be using : epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which : EPOLLONESHOT was set in 'events'. 
If that is correct, then would it : not make sense to return an error to user space for this case? Cc: Michael Kerrisk Cc: "Paton J. Lewis" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 38 +--- include/uapi/linux/eventpoll.h | 1 - tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/epoll/Makefile | 11 - tools/testing/selftests/epoll/test_epoll.c | 344 ----------------------------- 5 files changed, 4 insertions(+), 392 deletions(-) delete mode 100644 tools/testing/selftests/epoll/Makefile delete mode 100644 tools/testing/selftests/epoll/test_epoll.c (limited to 'include') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index da72250ddc1c..cd96649bfe62 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { - return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; + return op != EPOLL_CTL_DEL; } /* Initialize the poll safe wake up structure */ @@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) return 0; } -/* - * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item - * had no event flags set, indicating that another thread may be currently - * handling that item's events (in the case that EPOLLONESHOT was being - * used). Otherwise a zero result indicates that the item has been disabled - * from receiving events. A disabled item may be re-enabled via - * EPOLL_CTL_MOD. Must be called with "mtx" held. 
- */ -static int ep_disable(struct eventpoll *ep, struct epitem *epi) -{ - int result = 0; - unsigned long flags; - - spin_lock_irqsave(&ep->lock, flags); - if (epi->event.events & ~EP_PRIVATE_BITS) { - if (ep_is_linked(&epi->rdllink)) - list_del_init(&epi->rdllink); - /* Ensure ep_poll_callback will not add epi back onto ready - list: */ - epi->event.events &= EP_PRIVATE_BITS; - } - else - result = -EBUSY; - spin_unlock_irqrestore(&ep->lock, flags); - - return result; -} - static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; @@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate @@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else error = -ENOENT; break; - case EPOLL_CTL_DISABLE: - if (epi) - error = ep_disable(ep, epi); - else - error = -ENOENT; - break; } mutex_unlock(&ep->mtx); diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index 8c99ce7202c5..2c267bcbb85c 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -25,7 +25,6 @@ #define EPOLL_CTL_ADD 1 #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 -#define EPOLL_CTL_DISABLE 4 /* * Request the handling of system wakeup events so as to prevent system suspends diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 43480149119e..85baf11e2acd 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,4 @@ -TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug epoll +TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile deleted file mode 100644 index 19806ed62f50..000000000000 --- 
a/tools/testing/selftests/epoll/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# Makefile for epoll selftests - -all: test_epoll -%: %.c - gcc -pthread -g -o $@ $^ - -run_tests: all - ./test_epoll - -clean: - $(RM) test_epoll diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c deleted file mode 100644 index f7525392ce84..000000000000 --- a/tools/testing/selftests/epoll/test_epoll.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - * tools/testing/selftests/epoll/test_epoll.c - * - * Copyright 2012 Adobe Systems Incorporated - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * Paton J. Lewis - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * A pointer to an epoll_item_private structure will be stored in the epoll - * item's event structure so that we can get access to the epoll_item_private - * data after calling epoll_wait: - */ -struct epoll_item_private { - int index; /* Position of this struct within the epoll_items array. */ - int fd; - uint32_t events; - pthread_mutex_t mutex; /* Guards the following variables... */ - int stop; - int status; /* Stores any error encountered while handling item. */ - /* The following variable allows us to test whether we have encountered - a problem while attempting to cancel and delete the associated - event. When the test program exits, 'deleted' should be exactly - one. If it is greater than one, then the failed test reflects a real - world situation where we would have tried to access the epoll item's - private data after deleting it: */ - int deleted; -}; - -struct epoll_item_private *epoll_items; - -/* - * Delete the specified item from the epoll set. 
In a real-world secneario this - * is where we would free the associated data structure, but in this testing - * environment we retain the structure so that we can test for double-deletion: - */ -void delete_item(int index) -{ - __sync_fetch_and_add(&epoll_items[index].deleted, 1); -} - -/* - * A pointer to a read_thread_data structure will be passed as the argument to - * each read thread: - */ -struct read_thread_data { - int stop; - int status; /* Indicates any error encountered by the read thread. */ - int epoll_set; -}; - -/* - * The function executed by the read threads: - */ -void *read_thread_function(void *function_data) -{ - struct read_thread_data *thread_data = - (struct read_thread_data *)function_data; - struct epoll_event event_data; - struct epoll_item_private *item_data; - char socket_data; - - /* Handle events until we encounter an error or this thread's 'stop' - condition is set: */ - while (1) { - int result = epoll_wait(thread_data->epoll_set, - &event_data, - 1, /* Number of desired events */ - 1000); /* Timeout in ms */ - if (result < 0) { - /* Breakpoints signal all threads. 
Ignore that while - debugging: */ - if (errno == EINTR) - continue; - thread_data->status = errno; - return 0; - } else if (thread_data->stop) - return 0; - else if (result == 0) /* Timeout */ - continue; - - /* We need the mutex here because checking for the stop - condition and re-enabling the epoll item need to be done - together as one atomic operation when EPOLL_CTL_DISABLE is - available: */ - item_data = (struct epoll_item_private *)event_data.data.ptr; - pthread_mutex_lock(&item_data->mutex); - - /* Remove the item from the epoll set if we want to stop - handling that event: */ - if (item_data->stop) - delete_item(item_data->index); - else { - /* Clear the data that was written to the other end of - our non-blocking socket: */ - do { - if (read(item_data->fd, &socket_data, 1) < 1) { - if ((errno == EAGAIN) || - (errno == EWOULDBLOCK)) - break; - else - goto error_unlock; - } - } while (item_data->events & EPOLLET); - - /* The item was one-shot, so re-enable it: */ - event_data.events = item_data->events; - if (epoll_ctl(thread_data->epoll_set, - EPOLL_CTL_MOD, - item_data->fd, - &event_data) < 0) - goto error_unlock; - } - - pthread_mutex_unlock(&item_data->mutex); - } - -error_unlock: - thread_data->status = item_data->status = errno; - pthread_mutex_unlock(&item_data->mutex); - return 0; -} - -/* - * A pointer to a write_thread_data structure will be passed as the argument to - * the write thread: - */ -struct write_thread_data { - int stop; - int status; /* Indicates any error encountered by the write thread. */ - int n_fds; - int *fds; -}; - -/* - * The function executed by the write thread. It writes a single byte to each - * socket in turn until the stop condition for this thread is set. If writing to - * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for - * the moment and just move on to the next socket in the list. We don't care - * about the order in which we deliver events to the epoll set. 
In fact we don't - * care about the data we're writing to the pipes at all; we just want to - * trigger epoll events: - */ -void *write_thread_function(void *function_data) -{ - const char data = 'X'; - int index; - struct write_thread_data *thread_data = - (struct write_thread_data *)function_data; - while (!thread_data->stop) - for (index = 0; - !thread_data->stop && (index < thread_data->n_fds); - ++index) - if ((write(thread_data->fds[index], &data, 1) < 1) && - (errno != EAGAIN) && - (errno != EWOULDBLOCK)) { - thread_data->status = errno; - return; - } -} - -/* - * Arguments are currently ignored: - */ -int main(int argc, char **argv) -{ - const int n_read_threads = 100; - const int n_epoll_items = 500; - int index; - int epoll_set = epoll_create1(0); - struct write_thread_data write_thread_data = { - 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int)) - }; - struct read_thread_data *read_thread_data = - malloc(n_read_threads * sizeof(struct read_thread_data)); - pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t)); - pthread_t write_thread; - - printf("-----------------\n"); - printf("Runing test_epoll\n"); - printf("-----------------\n"); - - epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private)); - - if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 || - read_thread_data == 0 || read_threads == 0) - goto error; - - if (sysconf(_SC_NPROCESSORS_ONLN) < 2) { - printf("Error: please run this test on a multi-core system.\n"); - goto error; - } - - /* Create the socket pairs and epoll items: */ - for (index = 0; index < n_epoll_items; ++index) { - int socket_pair[2]; - struct epoll_event event_data; - if (socketpair(AF_UNIX, - SOCK_STREAM | SOCK_NONBLOCK, - 0, - socket_pair) < 0) - goto error; - write_thread_data.fds[index] = socket_pair[0]; - epoll_items[index].index = index; - epoll_items[index].fd = socket_pair[1]; - if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0) - goto error; - /* We 
always use EPOLLONESHOT because this test is currently - structured to demonstrate the need for EPOLL_CTL_DISABLE, - which only produces useful information in the EPOLLONESHOT - case (without EPOLLONESHOT, calling epoll_ctl with - EPOLL_CTL_DISABLE will never return EBUSY). If support for - testing events without EPOLLONESHOT is desired, it should - probably be implemented in a separate unit test. */ - epoll_items[index].events = EPOLLIN | EPOLLONESHOT; - if (index < n_epoll_items / 2) - epoll_items[index].events |= EPOLLET; - epoll_items[index].stop = 0; - epoll_items[index].status = 0; - epoll_items[index].deleted = 0; - event_data.events = epoll_items[index].events; - event_data.data.ptr = &epoll_items[index]; - if (epoll_ctl(epoll_set, - EPOLL_CTL_ADD, - epoll_items[index].fd, - &event_data) < 0) - goto error; - } - - /* Create and start the read threads: */ - for (index = 0; index < n_read_threads; ++index) { - read_thread_data[index].stop = 0; - read_thread_data[index].status = 0; - read_thread_data[index].epoll_set = epoll_set; - if (pthread_create(&read_threads[index], - NULL, - read_thread_function, - &read_thread_data[index]) != 0) - goto error; - } - - if (pthread_create(&write_thread, - NULL, - write_thread_function, - &write_thread_data) != 0) - goto error; - - /* Cancel all event pollers: */ -#ifdef EPOLL_CTL_DISABLE - for (index = 0; index < n_epoll_items; ++index) { - pthread_mutex_lock(&epoll_items[index].mutex); - ++epoll_items[index].stop; - if (epoll_ctl(epoll_set, - EPOLL_CTL_DISABLE, - epoll_items[index].fd, - NULL) == 0) - delete_item(index); - else if (errno != EBUSY) { - pthread_mutex_unlock(&epoll_items[index].mutex); - goto error; - } - /* EBUSY means events were being handled; allow the other thread - to delete the item. 
*/ - pthread_mutex_unlock(&epoll_items[index].mutex); - } -#else - for (index = 0; index < n_epoll_items; ++index) { - pthread_mutex_lock(&epoll_items[index].mutex); - ++epoll_items[index].stop; - pthread_mutex_unlock(&epoll_items[index].mutex); - /* Wait in case a thread running read_thread_function is - currently executing code between epoll_wait and - pthread_mutex_lock with this item. Note that a longer delay - would make double-deletion less likely (at the expense of - performance), but there is no guarantee that any delay would - ever be sufficient. Note also that we delete all event - pollers at once for testing purposes, but in a real-world - environment we are likely to want to be able to cancel event - pollers at arbitrary times. Therefore we can't improve this - situation by just splitting this loop into two loops - (i.e. signal 'stop' for all items, sleep, and then delete all - items). We also can't fix the problem via EPOLL_CTL_DEL - because that command can't prevent the case where some other - thread is executing read_thread_function within the region - mentioned above: */ - usleep(1); - pthread_mutex_lock(&epoll_items[index].mutex); - if (!epoll_items[index].deleted) - delete_item(index); - pthread_mutex_unlock(&epoll_items[index].mutex); - } -#endif - - /* Shut down the read threads: */ - for (index = 0; index < n_read_threads; ++index) - __sync_fetch_and_add(&read_thread_data[index].stop, 1); - for (index = 0; index < n_read_threads; ++index) { - if (pthread_join(read_threads[index], NULL) != 0) - goto error; - if (read_thread_data[index].status) - goto error; - } - - /* Shut down the write thread: */ - __sync_fetch_and_add(&write_thread_data.stop, 1); - if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status) - goto error; - - /* Check for final error conditions: */ - for (index = 0; index < n_epoll_items; ++index) { - if (epoll_items[index].status != 0) - goto error; - if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0) - 
goto error; - } - for (index = 0; index < n_epoll_items; ++index) - if (epoll_items[index].deleted != 1) { - printf("Error: item data deleted %1d times.\n", - epoll_items[index].deleted); - goto error; - } - - printf("[PASS]\n"); - return 0; - - error: - printf("[FAIL]\n"); - return errno; -} -- cgit v1.2.3-59-g8ed1b From 0bce04be442cf4d6e4ba9dac2f0a4c5ee88af5c5 Mon Sep 17 00:00:00 2001 From: Andreas Larsson Date: Tue, 6 Nov 2012 00:12:03 +0000 Subject: of/address: sparc: Declare of_address_to_resource() as an extern function for sparc again This bug-fix makes sure that of_address_to_resource is defined extern for sparc so that the sparc-specific implementation of of_address_to_resource() is once again used when including include/linux/of_address.h in a sparc context. A number of drivers in mainline rely on this function working for sparc. The bug was introduced in a850a7554442f08d3e910c6eeb4ee216868dda1e, "of/address: add empty static inlines for !CONFIG_OF". Contrary to that commit title, the static inlines are added for !CONFIG_OF_ADDRESS, and CONFIG_OF_ADDRESS is never defined for sparc. This is good behavior for the other functions in include/linux/of_address.h, as the extern functions defined in drivers/of/address.c only get linked when OF_ADDRESS is configured. However, for of_address_to_resource there exists a sparc-specific implementation in arch/sparc/kernel/of_device_common.c. Solution suggested by: Sam Ravnborg Signed-off-by: Andreas Larsson Acked-by: Rob Herring Signed-off-by: David S.
Miller --- arch/sparc/include/asm/prom.h | 5 +++++ include/linux/of_address.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h index c28765110706..f93003123bce 100644 --- a/arch/sparc/include/asm/prom.h +++ b/arch/sparc/include/asm/prom.h @@ -63,5 +63,10 @@ extern char *of_console_options; extern void irq_trans_init(struct device_node *dp); extern char *build_path_component(struct device_node *dp); +/* SPARC has a local implementation */ +extern int of_address_to_resource(struct device_node *dev, int index, + struct resource *r); +#define of_address_to_resource of_address_to_resource + #endif /* __KERNEL__ */ #endif /* _SPARC_PROM_H */ diff --git a/include/linux/of_address.h b/include/linux/of_address.h index a1984dd037da..e20e3af68fb6 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -28,11 +28,13 @@ static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; } #endif #else /* CONFIG_OF_ADDRESS */ +#ifndef of_address_to_resource static inline int of_address_to_resource(struct device_node *dev, int index, struct resource *r) { return -EINVAL; } +#endif static inline struct device_node *of_find_matching_node_by_address( struct device_node *from, const struct of_device_id *matches, -- cgit v1.2.3-59-g8ed1b From 93532c8a4890871aa0d84dd91b80dad9f58542e0 Mon Sep 17 00:00:00 2001 From: Igor Mazanov Date: Thu, 15 Nov 2012 21:07:00 +0400 Subject: clk: remove inline usage from clk-provider.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users of GCC 4.7 have reported compiler errors due to having inline applied to function declarations in clk-provider.h. The definitions exist in drivers/clk/clk.c. 
An example error: In file included from arch/arm/mach-omap2/clockdomain.c:25:0: arch/arm/mach-omap2/clockdomain.c: In function ‘clkdm_clk_disable’: include/linux/clk-provider.h:338:12: error: inlining failed in call to always_inline ‘__clk_get_enable_count’: function body not available arch/arm/mach-omap2/clockdomain.c:1001:28: error: called from here make[1]: *** [arch/arm/mach-omap2/clockdomain.o] Error 1 make: *** [arch/arm/mach-omap2] Error 2 This patch removes the use of inline from include/linux/clk-provider.h but keeps the function definitions in drivers/clk/clk.c as inlined since they are one-liners. Signed-off-by: Igor Mazanov Acked-by: Paul Walmsley Signed-off-by: Mike Turquette [mturquette@linaro.org: improved subject, added changelog] --- include/linux/clk-provider.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index c12731582920..f9f5e9eeb9dd 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -335,8 +335,8 @@ const char *__clk_get_name(struct clk *clk); struct clk_hw *__clk_get_hw(struct clk *clk); u8 __clk_get_num_parents(struct clk *clk); struct clk *__clk_get_parent(struct clk *clk); -inline int __clk_get_enable_count(struct clk *clk); -inline int __clk_get_prepare_count(struct clk *clk); +int __clk_get_enable_count(struct clk *clk); +int __clk_get_prepare_count(struct clk *clk); unsigned long __clk_get_rate(struct clk *clk); unsigned long __clk_get_flags(struct clk *clk); int __clk_is_enabled(struct clk *clk); -- cgit v1.2.3-59-g8ed1b From fa0cbbf145aabbf29c6f28f8a11935c0b0fd86fc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 12 Nov 2012 17:53:04 -0800 Subject: mm, oom: reintroduce /proc/pid/oom_adj This is mostly a revert of 01dc52ebdf47 ("oom: remove deprecated oom_adj") from Davidlohr Bueso. It reintroduces /proc/pid/oom_adj for backwards compatibility with earlier kernels. 
It simply scales the value linearly when /proc/pid/oom_score_adj is written. The major difference is that its scheduled removal is no longer included in Documentation/feature-removal-schedule.txt. We do warn users with a single printk, though, to suggest the more powerful and supported /proc/pid/oom_score_adj interface. Reported-by: Artem S. Tashkinov Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 16 ++++-- fs/proc/base.c | 109 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/oom.h | 9 +++ 3 files changed, 130 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a1793d670cd0..3844d21d6ca3 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -33,7 +33,7 @@ Table of Contents 2 Modifying System Parameters 3 Per-Process Parameters - 3.1 /proc/<pid>/oom_score_adj - Adjust the oom-killer + 3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer score 3.2 /proc/<pid>/oom_score - Display current oom-killer score 3.3 /proc/<pid>/io - Display the IO accounting fields @@ -1320,10 +1320,10 @@ of the kernel. CHAPTER 3: PER-PROCESS PARAMETERS ------------------------------------------------------------------------------ -3.1 /proc/<pid>/oom_score_adj- Adjust the oom-killer score +3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj- Adjust the oom-killer score -------------------------------------------------------------------------------- -This file can be used to adjust the badness heuristic used to select which +These files can be used to adjust the badness heuristic used to select which process gets killed in out of memory conditions.
The badness heuristic assigns a value to each candidate task ranging from 0 @@ -1361,6 +1361,12 @@ same system, cpuset, mempolicy, or memory controller resources to use at least equivalent to discounting 50% of the task's allowed memory from being considered as scoring against the task. +For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also +be used to tune the badness score. Its acceptable values range from -16 +(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 +(OOM_DISABLE) to disable oom killing entirely for that task. Its value is +scaled linearly with /proc/<pid>/oom_score_adj. + The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last value set by a CAP_SYS_RESOURCE process. To reduce the value any lower requires CAP_SYS_RESOURCE. @@ -1375,7 +1381,9 @@ minimal amount of work. ------------------------------------------------------------- This file can be used to check the current score used by the oom-killer is for -any given <pid>. +any given <pid>. Use it together with /proc/<pid>/oom_score_adj to tune which +process should be killed in an out-of-memory situation.
+ 3.3 /proc//io - Display the IO accounting fields ------------------------------------------------------- diff --git a/fs/proc/base.c b/fs/proc/base.c index 144a96732dd7..3c231adf8450 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,6 +873,113 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_adj = OOM_ADJUST_MIN; + size_t len; + unsigned long flags; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adj; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adj); + if (err) + goto out; + if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && + oom_adj != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + 
* value is always attainable. + */ + if (oom_adj == OOM_ADJUST_MAX) + oom_adj = OOM_SCORE_ADJ_MAX; + else + oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + + task->signal->oom_score_adj = oom_adj; + trace_oom_score_adj_update(task); +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_adj_operations = { + .read = oom_adj_read, + .write = oom_adj_write, + .llseek = generic_file_llseek, +}; + static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -2598,6 +2705,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -2964,6 +3072,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), diff --git a/include/uapi/linux/oom.h b/include/uapi/linux/oom.h index a49c4afc7060..b29272d621ce 100644 --- 
a/include/uapi/linux/oom.h +++ b/include/uapi/linux/oom.h @@ -8,4 +8,13 @@ #define OOM_SCORE_ADJ_MIN (-1000) #define OOM_SCORE_ADJ_MAX 1000 +/* + * /proc//oom_adj set to -17 protects from the oom killer for legacy + * purposes. + */ +#define OOM_DISABLE (-17) +/* inclusive */ +#define OOM_ADJUST_MIN (-16) +#define OOM_ADJUST_MAX 15 + #endif /* _UAPI__INCLUDE_LINUX_OOM_H */ -- cgit v1.2.3-59-g8ed1b From bea8c150a7efbc0f204e709b7274fe273f55e0d3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 16 Nov 2012 14:14:54 -0800 Subject: memcg: fix hotplugged memory zone oops When MEMCG is configured on (even when it's disabled by boot option), when adding or removing a page to/from its lru list, the zone pointer used for stats updates is nowadays taken from the struct lruvec. (On many configurations, calculating zone from page is slower.) But we have no code to update all the lruvecs (per zone, per memcg) when a memory node is hotadded. Here's an extract from the oops which results when running numactl to bind a program to a newly onlined node: BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60 IP: __mod_zone_page_state+0x9/0x60 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0) Call Trace: __pagevec_lru_add_fn+0xdf/0x140 pagevec_lru_move_fn+0xb1/0x100 __pagevec_lru_add+0x1c/0x30 lru_add_drain_cpu+0xa3/0x130 lru_add_drain+0x2f/0x40 ... The natural solution might be to use a memcg callback whenever memory is hotadded; but that solution has not been scoped out, and it happens that we do have an easy location at which to update lruvec->zone. The lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by mem_cgroup_page_lruvec(), and both of those do know the right zone. 
So check and set lruvec->zone in those; and remove the inadequate attempt to set lruvec->zone from lruvec_init(), which is called before NODE_DATA(node) has been allocated in such cases. Ah, there was one exception. For no particularly good reason, mem_cgroup_force_empty_list() has its own code for deciding lruvec. Change it to use the standard mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too. In fact it was already safe against such an oops (the lru lists in danger could only be empty), but we're better proofed against future changes this way. I've marked this for stable (3.6) since we introduced the problem in 3.5 (now closed to stable); but I have no idea if this is the only fix needed to get memory hotadd working with memcg in 3.6, and received no answer when I enquired twice before. Reported-by: Tang Chen Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Konstantin Khlebnikov Cc: Wen Congyang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/memcontrol.c | 46 +++++++++++++++++++++++++++++++++++----------- mm/mmzone.c | 6 +----- mm/page_alloc.c | 2 +- 4 files changed, 38 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 50aaca81f63d..a23923ba8263 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -752,7 +752,7 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size, enum memmap_context context); -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone); +extern void lruvec_init(struct lruvec *lruvec); static inline struct zone *lruvec_zone(struct lruvec *lruvec) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93a7e36ded89..dd39ba000b31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, struct mem_cgroup *memcg) { struct
mem_cgroup_per_zone *mz; + struct lruvec *lruvec; - if (mem_cgroup_disabled()) - return &zone->lruvec; + if (mem_cgroup_disabled()) { + lruvec = &zone->lruvec; + goto out; + } mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); - return &mz->lruvec; + lruvec = &mz->lruvec; +out: + /* + * Since a node can be onlined after the mem_cgroup was created, + * we have to be prepared to initialize lruvec->zone here; + * and if offlined then reonlined, we need to reinitialize it. + */ + if (unlikely(lruvec->zone != zone)) + lruvec->zone = zone; + return lruvec; } /* @@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) struct mem_cgroup_per_zone *mz; struct mem_cgroup *memcg; struct page_cgroup *pc; + struct lruvec *lruvec; - if (mem_cgroup_disabled()) - return &zone->lruvec; + if (mem_cgroup_disabled()) { + lruvec = &zone->lruvec; + goto out; + } pc = lookup_page_cgroup(page); memcg = pc->mem_cgroup; @@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) pc->mem_cgroup = memcg = root_mem_cgroup; mz = page_cgroup_zoneinfo(memcg, page); - return &mz->lruvec; + lruvec = &mz->lruvec; +out: + /* + * Since a node can be onlined after the mem_cgroup was created, + * we have to be prepared to initialize lruvec->zone here; + * and if offlined then reonlined, we need to reinitialize it. 
+ */ + if (unlikely(lruvec->zone != zone)) + lruvec->zone = zone; + return lruvec; } /** @@ -3697,17 +3721,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { - struct mem_cgroup_per_zone *mz; + struct lruvec *lruvec; unsigned long flags, loop; struct list_head *list; struct page *busy; struct zone *zone; zone = &NODE_DATA(node)->node_zones[zid]; - mz = mem_cgroup_zoneinfo(memcg, node, zid); - list = &mz->lruvec.lists[lru]; + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + list = &lruvec->lists[lru]; - loop = mz->lru_size[lru]; + loop = mem_cgroup_get_lru_size(lruvec, lru); /* give some margin against EBUSY etc...*/ loop += 256; busy = NULL; @@ -4745,7 +4769,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]); + lruvec_init(&mz->lruvec); mz->usage_in_excess = 0; mz->on_tree = false; mz->memcg = memcg; diff --git a/mm/mmzone.c b/mm/mmzone.c index 3cef80f6ac79..4596d81b89b1 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pfn, } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ -void lruvec_init(struct lruvec *lruvec, struct zone *zone) +void lruvec_init(struct lruvec *lruvec) { enum lru_list lru; @@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); - -#ifdef CONFIG_MEMCG - lruvec->zone = zone; -#endif } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b74de6702e0..c91598b1b4c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4505,7 +4505,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - lruvec_init(&zone->lruvec, zone); + lruvec_init(&zone->lruvec); if (!size) 
continue; -- cgit v1.2.3-59-g8ed1b From 2ca3cb50edc351875df13d083524f524cdeb3054 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 16 Nov 2012 14:14:56 -0800 Subject: rapidio: fix kernel-doc warnings Fix rapidio kernel-doc warnings: Warning(drivers/rapidio/rio.c:415): No description found for parameter 'local' Warning(drivers/rapidio/rio.c:415): Excess function parameter 'lstart' description in 'rio_map_inb_region' Warning(include/linux/rio.h:290): No description found for parameter 'switches' Warning(include/linux/rio.h:290): No description found for parameter 'destid_table' Signed-off-by: Randy Dunlap Cc: Matt Porter Acked-by: Alexandre Bounine Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio.c | 2 +- include/linux/rio.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index c17ae22567e0..0c6fcb461faf 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -401,7 +401,7 @@ EXPORT_SYMBOL_GPL(rio_release_inb_pwrite); /** * rio_map_inb_region -- Map inbound memory region. * @mport: Master port. - * @lstart: physical address of memory region to be mapped + * @local: physical address of memory region to be mapped * @rbase: RIO base address assigned to this window * @size: Size of the memory region * @rflags: Flags for mapping. 
diff --git a/include/linux/rio.h b/include/linux/rio.h index 4187da511006..a3e784278667 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -275,9 +275,11 @@ struct rio_id_table { * struct rio_net - RIO network info * @node: Node in global list of RIO networks * @devices: List of devices in this network + * @switches: List of switches in this network * @mports: List of master ports accessing this network * @hport: Default port for accessing this network * @id: RIO network ID + * @destid_table: destID allocation table */ struct rio_net { struct list_head node; /* node in list of networks */ -- cgit v1.2.3-59-g8ed1b From 5576646f3c1abd60d72d19829de6f5d8c2ca8ecf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 16 Nov 2012 14:15:06 -0800 Subject: revert "mm: fix-up zone present pages" Revert commit 7f1290f2f2a4 ("mm: fix-up zone present pages") That patch tried to fix an issue when calculating zone->present_pages, but it caused a regression on 32bit systems with HIGHMEM. With that change, reset_zone_present_pages() resets all zone->present_pages to zero, and fixup_zone_present_pages() is called to recalculate zone->present_pages when the boot allocator frees core memory pages into buddy allocator. Because highmem pages are not freed by bootmem allocator, all highmem zones' present_pages become zero. Various options for improving the situation are being discussed but for now, let's return to the 3.6 code.
Cc: Jianguo Wu Cc: Jiang Liu Cc: Petr Tesarik Cc: "Luck, Tony" Cc: Mel Gorman Cc: Yinghai Lu Cc: Minchan Kim Cc: Johannes Weiner Acked-by: David Rientjes Tested-by: Chris Clayton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 1 - include/linux/mm.h | 4 ---- mm/bootmem.c | 10 +--------- mm/memory_hotplug.c | 7 ------- mm/nobootmem.c | 3 --- mm/page_alloc.c | 34 ---------------------------------- 6 files changed, 1 insertion(+), 58 deletions(-) (limited to 'include') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index acd5b68e8871..082e383c1b6f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -637,7 +637,6 @@ mem_init (void) high_memory = __va(max_low_pfn * PAGE_SIZE); - reset_zone_present_pages(); for_each_online_pgdat(pgdat) if (pgdat->bdata->node_bootmem_map) totalram_pages += free_all_bootmem_node(pgdat); diff --git a/include/linux/mm.h b/include/linux/mm.h index fa0680402738..bcaab4e6fe91 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1684,9 +1684,5 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ -extern void reset_zone_present_pages(void); -extern void fixup_zone_present_pages(int nid, unsigned long start_pfn, - unsigned long end_pfn); - #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/bootmem.c b/mm/bootmem.c index 434be4ae7a04..f468185b3b28 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -198,8 +198,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); - fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), - start, start + BITS_PER_LONG); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { @@ -210,9 +208,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (vec & 1) { page = pfn_to_page(start + 
off); __free_pages_bootmem(page, 0); - fixup_zone_present_pages( - page_to_nid(page), - start + off, start + off + 1); count++; } vec >>= 1; @@ -226,11 +221,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; - while (pages--) { - fixup_zone_present_pages(page_to_nid(page), - page_to_pfn(page), page_to_pfn(page) + 1); + while (pages--) __free_pages_bootmem(page++, 0); - } bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 56b758ae57d2..e4eeacae2b91 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -106,7 +106,6 @@ static void get_page_bootmem(unsigned long info, struct page *page, void __ref put_page_bootmem(struct page *page) { unsigned long type; - struct zone *zone; type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -117,12 +116,6 @@ void __ref put_page_bootmem(struct page *page) set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); - - zone = page_zone(page); - zone_span_writelock(zone); - zone->present_pages++; - zone_span_writeunlock(zone); - totalram_pages++; } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 714d5d650470..bd82f6b31411 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -116,8 +116,6 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return 0; __free_pages_memory(start_pfn, end_pfn); - fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), - start_pfn, end_pfn); return end_pfn - start_pfn; } @@ -128,7 +126,6 @@ unsigned long __init free_low_memory_core_early(int nodeid) phys_addr_t start, end, size; u64 i; - reset_zone_present_pages(); for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c91598b1b4c0..7bb35ac0964a 100644 --- 
a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6098,37 +6098,3 @@ void dump_page(struct page *page) dump_page_flags(page->flags); mem_cgroup_print_bad_page(page); } - -/* reset zone->present_pages */ -void reset_zone_present_pages(void) -{ - struct zone *z; - int i, nid; - - for_each_node_state(nid, N_HIGH_MEMORY) { - for (i = 0; i < MAX_NR_ZONES; i++) { - z = NODE_DATA(nid)->node_zones + i; - z->present_pages = 0; - } - } -} - -/* calculate zone's present pages in buddy system */ -void fixup_zone_present_pages(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - struct zone *z; - unsigned long zone_start_pfn, zone_end_pfn; - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) { - z = NODE_DATA(nid)->node_zones + i; - zone_start_pfn = z->zone_start_pfn; - zone_end_pfn = zone_start_pfn + z->spanned_pages; - - /* if the two regions intersect */ - if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) - z->present_pages += min(end_pfn, zone_end_pfn) - - max(start_pfn, zone_start_pfn); - } -} -- cgit v1.2.3-59-g8ed1b