From 8f0bfc25c907f38e7f9dc498e8f43000d77327ef Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 25 Jan 2021 17:14:09 +0100 Subject: watch_queue: rectify kernel-doc for init_watch() The command './scripts/kernel-doc -none kernel/watch_queue.c' reported a mismatch in the kernel-doc of init_watch(). Rectify the kernel-doc, such that no issues remain for watch_queue.c. Signed-off-by: Lukas Bulwahn Signed-off-by: David Howells --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 0ef8f65bd2d7..9c9eb20dd2c5 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -413,7 +413,7 @@ static void put_watch(struct watch *watch) } /** - * init_watch_queue - Initialise a watch + * init_watch - Initialise a watch * @watch: The watch to initialise. * @wqueue: The queue to assign. * -- cgit v1.3-8-gc7d7 From 9dc00b25eadf2908ae76ac0607b55a9f4e0e0cdc Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 6 Feb 2021 00:33:25 +1300 Subject: dma-mapping: benchmark: pretend DMA is transmitting In a real dma mapping user case, after dma_map is done, data will be transmit. Thus, in multi-threaded user scenario, IOMMU contention should not be that severe. For example, if users enable multiple threads to send network packets through 1G/10G/100Gbps NIC, usually the steps will be: map -> transmission -> unmap. Transmission delay reduces the contention of IOMMU. Here a delay is added to simulate the transmission between map and unmap so that the tested result could be more accurate for TX and simple RX. A typical TX transmission for NIC would be like: map -> TX -> unmap since the socket buffers come from OS. Simple RX model eg. disk driver, is also map -> RX -> unmap, but real RX model in a NIC could be more complicated considering packets can come spontaneously and many drivers are using pre-mapped buffers pool. This is in the TBD list. Signed-off-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 12 +++++++++++- tools/testing/selftests/dma/dma_map_benchmark.c | 21 ++++++++++++++++++--- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index da95df381483..e0e64f8b0739 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -21,6 +21,7 @@ #define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) #define DMA_MAP_MAX_THREADS 1024 #define DMA_MAP_MAX_SECONDS 300 +#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) #define DMA_MAP_BIDIRECTIONAL 0 #define DMA_MAP_TO_DEVICE 1 @@ -36,7 +37,8 @@ struct map_benchmark { __s32 node; /* which numa node this benchmark will run on */ __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ - __u8 expansion[84]; /* For future use */ + __u32 dma_trans_ns; /* time for DMA transmission in ns */ + __u8 expansion[80]; /* For future use */ }; struct map_benchmark_data { @@ -87,6 +89,9 @@ static int map_benchmark_thread(void *data) map_etime = ktime_get(); map_delta = ktime_sub(map_etime, map_stime); + /* Pretend DMA is transmitting */ + ndelay(map->bparam.dma_trans_ns); + unmap_stime = ktime_get(); dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir); unmap_etime = ktime_get(); @@ -218,6 +223,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, return -EINVAL; } + if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) { + pr_err("invalid transmission delay\n"); + return -EINVAL; + } + if (map->bparam.node != NUMA_NO_NODE && !node_possible(map->bparam.node)) { pr_err("invalid numa node\n"); diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index 537d65968c48..fb23ce9617ea 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -12,9 +12,12 @@ #include #include +#define NSEC_PER_MSEC 1000000L + #define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) #define DMA_MAP_MAX_THREADS 1024 #define DMA_MAP_MAX_SECONDS 300 +#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) #define DMA_MAP_BIDIRECTIONAL 0 #define DMA_MAP_TO_DEVICE 1 @@ -36,7 +39,8 @@ struct map_benchmark { __s32 node; /* which numa node this benchmark will run on */ __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ - __u8 expansion[84]; /* For future use */ + __u32 dma_trans_ns; /* time for DMA transmission in ns */ + __u8 expansion[80]; /* For future use */ }; int main(int argc, char **argv) @@ -46,12 +50,12 @@ int main(int argc, char **argv) /* default single thread, run 20 seconds on NUMA_NO_NODE */ int threads = 1, seconds = 20, node = -1; /* default dma mask 32bit, bidirectional DMA */ - int bits = 32, dir = DMA_MAP_BIDIRECTIONAL; + int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; int cmd = DMA_MAP_BENCHMARK; char *p; - while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) { + while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) { switch (opt) { case 't': threads = atoi(optarg); @@ -68,6 +72,9 @@ int main(int argc, char **argv) case 'd': dir = atoi(optarg); break; + case 'x': + xdelay = atoi(optarg); + break; default: return -1; } @@ -85,6 +92,12 @@ int main(int argc, char **argv) exit(1); } + if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) { + fprintf(stderr, "invalid transmit delay, must be in 0-%ld\n", + DMA_MAP_MAX_TRANS_DELAY); + exit(1); + } + /* suppose the mininum DMA zone is 1MB in the world */ if (bits < 20 || bits > 64) { fprintf(stderr, "invalid dma mask bit, must be in 20-64\n"); @@ -109,6 +122,8 @@ int main(int argc, char **argv) map.node = node; map.dma_bits = bits; map.dma_dir = dir; + map.dma_trans_ns = xdelay; + if (ioctl(fd, cmd, &map)) { perror("ioctl"); exit(1); -- cgit v1.3-8-gc7d7 From ed1054a02aa2323f1676093d6e58cde4484d8867 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 5 Feb 2021 14:26:42 -0800 Subject: irqdomain: Mark fwnodes when their irqdomain is added/removed This allows fw_devlink to recognize irqdomain drivers that don't use the device-driver model to initialize the device. fw_devlink will use this information to make sure consumers of such irqdomain aren't indefinitely blocked from probing, waiting for the irqdomain device to appear and bind to a driver. Tested-by: Marek Szyprowski Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20210205222644.2357303-7-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- kernel/irq/irqdomain.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6aacd342cd14..288151393a06 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -205,6 +205,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, } fwnode_handle_get(fwnode); + fwnode_dev_initialized(fwnode, true); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); @@ -253,6 +254,7 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); + fwnode_dev_initialized(domain->fwnode, false); fwnode_handle_put(domain->fwnode); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); -- cgit v1.3-8-gc7d7 From 81d88ce55092edf1a1f928efb373f289c6b90efd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 16:38:40 +0100 Subject: dma-mapping: remove the {alloc,free}_noncoherent methods It turns out allowing non-contigous allocations here was a rather bad idea, as we'll now need to define ways to get the pages for mmaping or dma_buf sharing. Revert this change and stick to the original concept. A different API for the use case of non-contigous allocations will be added back later. Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda :wq --- Documentation/core-api/dma-api.rst | 64 +++++++++++++------------------------- drivers/iommu/dma-iommu.c | 30 ------------------ include/linux/dma-map-ops.h | 5 --- include/linux/dma-mapping.h | 17 +++++++--- kernel/dma/mapping.c | 40 ------------------------ 5 files changed, 35 insertions(+), 121 deletions(-) (limited to 'kernel') diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 75cb757bbff0..e6d23f117308 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -528,16 +528,14 @@ an I/O device, you should not be using this part of the API. :: - void * - dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, enum dma_data_direction dir, - gfp_t gfp) + struct page * + dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, + enum dma_data_direction dir, gfp_t gfp) -This routine allocates a region of bytes of consistent memory. It -returns a pointer to the allocated region (in the processor's virtual address -space) or NULL if the allocation failed. The returned memory may or may not -be in the kernel direct mapping. Drivers must not call virt_to_page on -the returned memory region. +This routine allocates a region of bytes of non-coherent memory. It +returns a pointer to first struct page for the region, or NULL if the +allocation failed. The resulting struct page can be used for everything a +struct page is suitable for. It also returns a which may be cast to an unsigned integer the same width as the bus and given to the device as the DMA address base of @@ -558,51 +556,33 @@ reused. :: void - dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr, + dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) -Free a region of memory previously allocated using dma_alloc_noncoherent(). -dev, size and dma_handle and dir must all be the same as those passed into -dma_alloc_noncoherent(). cpu_addr must be the virtual address returned by -dma_alloc_noncoherent(). +Free a region of memory previously allocated using dma_alloc_pages(). +dev, size, dma_handle and dir must all be the same as those passed into +dma_alloc_pages(). page must be the pointer returned by dma_alloc_pages(). :: - struct page * - dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, - enum dma_data_direction dir, gfp_t gfp) - -This routine allocates a region of bytes of non-coherent memory. It -returns a pointer to first struct page for the region, or NULL if the -allocation failed. The resulting struct page can be used for everything a -struct page is suitable for. - -It also returns a which may be cast to an unsigned integer the -same width as the bus and given to the device as the DMA address base of -the region. - -The dir parameter specified if data is read and/or written by the device, -see dma_map_single() for details. - -The gfp parameter allows the caller to specify the ``GFP_`` flags (see -kmalloc()) for the allocation, but rejects flags used to specify a memory -zone such as GFP_DMA or GFP_HIGHMEM. + void * + dma_alloc_noncoherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, + gfp_t gfp) -Before giving the memory to the device, dma_sync_single_for_device() needs -to be called, and before reading memory written by the device, -dma_sync_single_for_cpu(), just like for streaming DMA mappings that are -reused. +This routine is a convenient wrapper around dma_alloc_pages that returns the +kernel virtual address for the allocated memory instead of the page structure. :: void - dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, enum dma_data_direction dir) -Free a region of memory previously allocated using dma_alloc_pages(). -dev, size and dma_handle and dir must all be the same as those passed into -dma_alloc_noncoherent(). page must be the pointer returned by -dma_alloc_pages(). +Free a region of memory previously allocated using dma_alloc_noncoherent(). +dev, size, dma_handle and dir must all be the same as those passed into +dma_alloc_noncoherent(). cpu_addr must be the virtual address returned by +dma_alloc_noncoherent(). :: diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 4078358ed66e..255533faf905 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1197,34 +1197,6 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, return cpu_addr; } -#ifdef CONFIG_DMA_REMAP -static void *iommu_dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *handle, enum dma_data_direction dir, gfp_t gfp) -{ - if (!gfpflags_allow_blocking(gfp)) { - struct page *page; - - page = dma_common_alloc_pages(dev, size, handle, dir, gfp); - if (!page) - return NULL; - return page_address(page); - } - - return iommu_dma_alloc_remap(dev, size, handle, gfp | __GFP_ZERO, - PAGE_KERNEL, 0); -} - -static void iommu_dma_free_noncoherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle, enum dma_data_direction dir) -{ - __iommu_dma_unmap(dev, handle, size); - __iommu_dma_free(dev, size, cpu_addr); -} -#else -#define iommu_dma_alloc_noncoherent NULL -#define iommu_dma_free_noncoherent NULL -#endif /* CONFIG_DMA_REMAP */ - static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) @@ -1295,8 +1267,6 @@ static const struct dma_map_ops iommu_dma_ops = { .free = iommu_dma_free, .alloc_pages = dma_common_alloc_pages, .free_pages = dma_common_free_pages, - .alloc_noncoherent = iommu_dma_alloc_noncoherent, - .free_noncoherent = iommu_dma_free_noncoherent, .mmap = iommu_dma_mmap, .get_sgtable = iommu_dma_get_sgtable, .map_page = iommu_dma_map_page, diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 70fcd0f610ea..11e02537b9e0 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -22,11 +22,6 @@ struct dma_map_ops { gfp_t gfp); void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir); - void *(*alloc_noncoherent)(struct device *dev, size_t size, - dma_addr_t *dma_handle, enum dma_data_direction dir, - gfp_t gfp); - void (*free_noncoherent)(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, enum dma_data_direction dir); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2e49996a8f39..fbfa3f5abd94 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -263,10 +263,19 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir); -void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); -void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, enum dma_data_direction dir); + +static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + struct page *page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); + return page ? page_address(page) : NULL; +} + +static inline void dma_free_noncoherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir) +{ + dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); +} static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index f87a89d08654..68992e35c8c3 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -515,46 +515,6 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, } EXPORT_SYMBOL_GPL(dma_free_pages); -void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - void *vaddr; - - if (!ops || !ops->alloc_noncoherent) { - struct page *page; - - page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); - if (!page) - return NULL; - return page_address(page); - } - - size = PAGE_ALIGN(size); - vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp); - if (vaddr) - debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir, - *dma_handle); - return vaddr; -} -EXPORT_SYMBOL_GPL(dma_alloc_noncoherent); - -void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (!ops || !ops->free_noncoherent) { - dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); - return; - } - - size = PAGE_ALIGN(size); - debug_dma_unmap_page(dev, dma_handle, size, dir); - ops->free_noncoherent(dev, size, vaddr, dma_handle, dir); -} -EXPORT_SYMBOL_GPL(dma_free_noncoherent); - int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); -- cgit v1.3-8-gc7d7 From db1cc7aede37eb9235759131ddfefd9c0ea5136f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Feb 2021 00:40:53 +0100 Subject: softirq: Move do_softirq_own_stack() to generic asm header To avoid include recursion hell move the do_softirq_own_stack() related content into a generic asm header and include it from all places in arch/ which need the prototype. This allows architectures to provide an inline implementation of do_softirq_own_stack() without introducing a lot of #ifdeffery all over the place. Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20210210002513.289960691@linutronix.de --- arch/parisc/kernel/irq.c | 1 + arch/powerpc/kernel/irq.c | 1 + arch/s390/kernel/irq.c | 1 + arch/sh/kernel/irq.c | 1 + arch/sparc/kernel/irq_64.c | 1 + arch/x86/kernel/irq_32.c | 1 + arch/x86/kernel/irq_64.c | 1 + include/asm-generic/Kbuild | 1 + include/asm-generic/softirq_stack.h | 14 ++++++++++++++ include/linux/interrupt.h | 9 --------- kernel/softirq.c | 2 ++ 11 files changed, 24 insertions(+), 9 deletions(-) create mode 100644 include/asm-generic/softirq_stack.h (limited to 'kernel') diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 49cd6d2caefb..1632d525ad64 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 6b1eca53e36c..96296d3383a7 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -65,6 +65,7 @@ #include #include #include +#include #ifdef CONFIG_PPC64 #include diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index f8a8b9428ae2..a1a2f75f3c09 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index ab5f790b0cd2..ef0f0827cf57 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -20,6 +20,7 @@ #include #include #include +#include atomic_t irq_err_count; diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 3ec9f1402aad..c8848bb681a1 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "entry.h" #include "cpumap.h" diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 0b79efc87be5..044902d5a3c4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -22,6 +22,7 @@ #include #include +#include #ifdef CONFIG_DEBUG_STACKOVERFLOW diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index b88fdb9686e6..f335c3910834 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 267f6dfb8960..bd684188e2dc 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -51,6 +51,7 @@ mandatory-y += sections.h mandatory-y += serial.h mandatory-y += shmparam.h mandatory-y += simd.h +mandatory-y += softirq_stack.h mandatory-y += switch_to.h mandatory-y += timex.h mandatory-y += tlbflush.h diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h new file mode 100644 index 000000000000..eceeecf6a5bd --- /dev/null +++ b/include/asm-generic/softirq_stack.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_GENERIC_SOFTIRQ_STACK_H +#define __ASM_GENERIC_SOFTIRQ_STACK_H + +#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK +void do_softirq_own_stack(void); +#else +static inline void do_softirq_own_stack(void) +{ + __do_softirq(); +} +#endif + +#endif diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f0b918f393a2..967e25767153 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -569,15 +569,6 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); -#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK -void do_softirq_own_stack(void); -#else -static inline void do_softirq_own_stack(void) -{ - __do_softirq(); -} -#endif - extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); diff --git a/kernel/softirq.c b/kernel/softirq.c index 9d71046ea247..9908ec4a9bfe 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -26,6 +26,8 @@ #include #include +#include + #define CREATE_TRACE_POINTS #include -- cgit v1.3-8-gc7d7 From 88a686728b3739d3598851e729c0e81f194e5c53 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 12 Feb 2021 11:29:24 -0500 Subject: kbuild: simplify access to the kernel's version Instead of storing the version in a single integer and having various kernel (and userspace) code how it's constructed, export individual (major, patchlevel, sublevel) components and simplify kernel code that uses it. This should also make it easier on userspace. Signed-off-by: Sasha Levin Acked-by: Greg Kroah-Hartman Signed-off-by: Masahiro Yamada --- Makefile | 5 ++++- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 ++-- drivers/usb/core/hcd.c | 4 ++-- drivers/usb/gadget/udc/aspeed-vhub/hub.c | 4 ++-- include/linux/usb/composite.h | 4 ++-- kernel/sys.c | 2 +- 6 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/Makefile b/Makefile index 12607d389148..1fdd44fe1659 100644 --- a/Makefile +++ b/Makefile @@ -1255,7 +1255,10 @@ define filechk_version.h expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + $(SUBLEVEL)); \ fi; \ echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + \ - ((c) > 255 ? 255 : (c)))' + ((c) > 255 ? 255 : (c)))'; \ + echo \#define LINUX_VERSION_MAJOR $(VERSION); \ + echo \#define LINUX_VERSION_PATCHLEVEL $(PATCHLEVEL); \ + echo \#define LINUX_VERSION_SUBLEVEL $(SUBLEVEL) endef $(version_h): FORCE diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ca6f2fc39ea0..29f886263dc5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -235,8 +235,8 @@ static void mlx5_set_driver_version(struct mlx5_core_dev *dev) remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); snprintf(string + strlen(string), remaining_size, "%u.%u.%u", - (u8)((LINUX_VERSION_CODE >> 16) & 0xff), (u8)((LINUX_VERSION_CODE >> 8) & 0xff), - (u16)(LINUX_VERSION_CODE & 0xffff)); + LINUX_VERSION_MAJOR, LINUX_VERSION_PATCHLEVEL, + LINUX_VERSION_SUBLEVEL); /*Send the command*/ MLX5_SET(set_driver_version_in, in, opcode, diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index ad5a0f405a75..3f0381344221 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -111,8 +111,8 @@ DECLARE_WAIT_QUEUE_HEAD(usb_kill_urb_queue); */ /*-------------------------------------------------------------------------*/ -#define KERNEL_REL bin2bcd(((LINUX_VERSION_CODE >> 16) & 0x0ff)) -#define KERNEL_VER bin2bcd(((LINUX_VERSION_CODE >> 8) & 0x0ff)) +#define KERNEL_REL bin2bcd(LINUX_VERSION_MAJOR) +#define KERNEL_VER bin2bcd(LINUX_VERSION_PATCHLEVEL) /* usb 3.1 root hub device descriptor */ static const u8 usb31_rh_dev_descriptor[18] = { diff --git a/drivers/usb/gadget/udc/aspeed-vhub/hub.c b/drivers/usb/gadget/udc/aspeed-vhub/hub.c index bfd8e77788e2..5c7dea5e0ff1 100644 --- a/drivers/usb/gadget/udc/aspeed-vhub/hub.c +++ b/drivers/usb/gadget/udc/aspeed-vhub/hub.c @@ -46,8 +46,8 @@ * - Make vid/did overridable * - make it look like usb1 if usb1 mode forced */ -#define KERNEL_REL bin2bcd(((LINUX_VERSION_CODE >> 16) & 0x0ff)) -#define KERNEL_VER bin2bcd(((LINUX_VERSION_CODE >> 8) & 0x0ff)) +#define KERNEL_REL bin2bcd(LINUX_VERSION_MAJOR) +#define KERNEL_VER bin2bcd(LINUX_VERSION_PATCHLEVEL) enum { AST_VHUB_STR_INDEX_MAX = 4, diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index a2d229ab63ba..7531ce723374 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -573,8 +573,8 @@ static inline u16 get_default_bcdDevice(void) { u16 bcdDevice; - bcdDevice = bin2bcd((LINUX_VERSION_CODE >> 16 & 0xff)) << 8; - bcdDevice |= bin2bcd((LINUX_VERSION_CODE >> 8 & 0xff)); + bcdDevice = bin2bcd(LINUX_VERSION_MAJOR) << 8; + bcdDevice |= bin2bcd(LINUX_VERSION_PATCHLEVEL); return bcdDevice; } diff --git a/kernel/sys.c b/kernel/sys.c index 51f00fe20e4d..c2225bd405d5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1243,7 +1243,7 @@ static int override_release(char __user *release, size_t len) break; rest++; } - v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60; + v = LINUX_VERSION_PATCHLEVEL + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); -- cgit v1.3-8-gc7d7 From 71f1309f4f5b70aa3f1342a52b1460aa454c39ff Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Thu, 18 Feb 2021 17:01:32 +0800 Subject: cpufreq: schedutil: Remove needless sg_policy parameter from ignore_dl_rate_limit() Since sg_policy is a member of struct sugov_cpu. Also remove the local variable in sugov_update_single_common() to make the code more clean. Signed-off-by: Yue Hu Acked-by: Viresh Kumar [ rjw: Minor subject edits ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6931f0cdeb80..68fd485504a6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -426,23 +426,21 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } * Make sugov_should_update_freq() ignore the rate limit when DL * has increased the utilization. */ -static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) +static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_policy->limits_changed = true; + sg_cpu->sg_policy->limits_changed = true; } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { - struct sugov_policy *sg_policy = sg_cpu->sg_policy; - sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - ignore_dl_rate_limit(sg_cpu, sg_policy); + ignore_dl_rate_limit(sg_cpu); - if (!sugov_should_update_freq(sg_policy, time)) + if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) return false; sugov_get_util(sg_cpu); @@ -557,7 +555,7 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - ignore_dl_rate_limit(sg_cpu, sg_policy); + ignore_dl_rate_limit(sg_cpu); if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); -- cgit v1.3-8-gc7d7 From e209cb51bfcceda7519b8ba1094c8ba41a658ce8 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Thu, 18 Feb 2021 17:37:53 +0800 Subject: cpufreq: schedutil: Remove update_lock comment from struct sugov_policy definition Currently, update_lock is also used in sugov_update_single_freq(). The comment is not helpful anymore. Signed-off-by: Yue Hu Acked-by: Viresh Kumar [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 68fd485504a6..fe7df039ce9e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -26,7 +26,7 @@ struct sugov_policy { struct sugov_tunables *tunables; struct list_head tunables_hook; - raw_spinlock_t update_lock; /* For shared policies */ + raw_spinlock_t update_lock; u64 last_freq_update_time; s64 freq_update_delay_ns; unsigned int next_freq; -- cgit v1.3-8-gc7d7 From b5d7ccb7aac3895c2138fe0980a109116ce15eff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Feb 2021 11:18:40 +0100 Subject: swiotlb: add a IO_TLB_SIZE define Add a new IO_TLB_SIZE define instead open coding it using IO_TLB_SHIFT all over. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 1 + kernel/dma/swiotlb.c | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index d9c9fc9ca5d2..5857a937c637 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,6 +29,7 @@ enum swiotlb_force { * controllable. */ #define IO_TLB_SHIFT 11 +#define IO_TLB_SIZE (1 << IO_TLB_SHIFT) /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 7c42df6e6100..768187d0e17e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -171,7 +171,7 @@ void __init swiotlb_adjust_size(unsigned long new_size) * adjust/expand SWIOTLB size for their use. */ if (!io_tlb_nslabs) { - size = ALIGN(new_size, 1 << IO_TLB_SHIFT); + size = ALIGN(new_size, IO_TLB_SIZE); io_tlb_nslabs = size >> IO_TLB_SHIFT; io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); @@ -491,20 +491,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, tbl_dma_addr &= mask; - offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT; /* * Carefully handle integer overflow which can occur when mask == ~0UL. */ max_slots = mask + 1 - ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT + ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else @@ -598,7 +598,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; @@ -649,7 +649,7 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr == INVALID_PHYS_ADDR) return; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1); switch (target) { case SYNC_FOR_CPU: @@ -707,7 +707,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, size_t swiotlb_max_mapping_size(struct device *dev) { - return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; + return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; } bool is_swiotlb_active(void) -- cgit v1.3-8-gc7d7 From c7fbeca757fe74135d8b6a4c8ddaef76f5775d68 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 10:11:20 +0100 Subject: swiotlb: factor out an io_tlb_offset helper Replace the very genericly named OFFSET macro with a little inline helper that hardcodes the alignment to the only value ever passed. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 768187d0e17e..7705821dcdbd 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -50,9 +50,6 @@ #define CREATE_TRACE_POINTS #include -#define OFFSET(val,align) ((unsigned long) \ - ( (val) & ( (align) - 1))) - #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* @@ -192,6 +189,11 @@ void swiotlb_print_info(void) bytes >> 20); } +static inline unsigned long io_tlb_offset(unsigned long val) +{ + return val & (IO_TLB_SEGSIZE - 1); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -241,7 +243,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) __func__, alloc_size, PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; @@ -375,7 +377,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) goto cleanup4; for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; @@ -546,7 +548,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, for (i = index; i < (int) (index + nslots); i++) io_tlb_list[i] = 0; - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) io_tlb_list[i] = ++count; tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); @@ -632,7 +636,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * Step 2: merge the returned slots with the preceding slots, * if available (non zero) */ - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) io_tlb_list[i] = ++count; io_tlb_used -= nslots; -- cgit v1.3-8-gc7d7 From c32a77fd18780a5192dfb6eec69f239faebf28fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Feb 2021 11:19:34 +0100 Subject: swiotlb: factor out a nr_slots helper Factor out a helper to find the number of slots for a given size. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 7705821dcdbd..9492219b0743 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -194,6 +194,11 @@ static inline unsigned long io_tlb_offset(unsigned long val) return val & (IO_TLB_SEGSIZE - 1); } +static inline unsigned long nr_slots(u64 val) +{ + return DIV_ROUND_UP(val, IO_TLB_SIZE); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -493,20 +498,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, tbl_dma_addr &= mask; - offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT; + offset_slots = nr_slots(tbl_dma_addr); /* * Carefully handle integer overflow which can occur when mask == ~0UL. */ max_slots = mask + 1 - ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT + ? nr_slots(mask + 1) : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; + nslots = nr_slots(alloc_size); if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else @@ -602,7 +607,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; + int i, count, nslots = nr_slots(alloc_size); int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; -- cgit v1.3-8-gc7d7 From ca10d0f8e530600ec63c603dbace2c30927d70b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 10:13:40 +0100 Subject: swiotlb: clean up swiotlb_tbl_unmap_single Remove a layer of pointless indentation, replace a hard to follow ternary expression with a plain if/else. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9492219b0743..b38b1553c466 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -626,28 +626,29 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * with slots below and above the pool being returned. */ spin_lock_irqsave(&io_tlb_lock, flags); - { - count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? - io_tlb_list[index + nslots] : 0); - /* - * Step 1: return the slots to the free list, merging the - * slots with superceeding slots - */ - for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - /* - * Step 2: merge the returned slots with the preceding slots, - * if available (non zero) - */ - for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; + if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) + count = io_tlb_list[index + nslots]; + else + count = 0; - io_tlb_used -= nslots; + /* + * Step 1: return the slots to the free list, merging the slots with + * superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) { + io_tlb_list[i] = ++count; + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } + + /* + * Step 2: merge the returned slots with the preceding slots, if + * available (non zero) + */ + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + io_tlb_used -= nslots; spin_unlock_irqrestore(&io_tlb_lock, flags); } -- cgit v1.3-8-gc7d7 From 6fb8f43cede0e4bd3ead847de78d531424a96be9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Feb 2021 07:01:11 -0700 Subject: kernel: treat PF_IO_WORKER like PF_KTHREAD for ptrace/signals Signed-off-by: Jens Axboe --- kernel/ptrace.c | 2 +- kernel/signal.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 61db50f7ca86..821cf1723814 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request, audit_ptrace(task); retval = -EPERM; - if (unlikely(task->flags & PF_KTHREAD)) + if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER))) goto out; if (same_thread_group(task, current)) goto out; diff --git a/kernel/signal.c b/kernel/signal.c index 5ad8566534e7..ba4d1ef39a9e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) return true; /* Only allow kernel generated signals to this kthread */ - if (unlikely((t->flags & PF_KTHREAD) && + if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) && (handler == SIG_KTHREAD_KERNEL) && !force)) return true; @@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc /* * Skip useless siginfo allocation for SIGKILL and kernel threads. */ - if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) + if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER))) goto out_set; /* -- cgit v1.3-8-gc7d7 From 2596b6ae412be3d29632efc63976a2132032e620 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 19 Feb 2021 14:51:42 -0500 Subject: kexec: move machine_kexec_post_load() to public interface The kernel test robot reports the following compiler warning: | arch/arm64/kernel/machine_kexec.c:62:5: warning: no previous prototype for | function 'machine_kexec_post_load' [-Wmissing-prototypes] | int machine_kexec_post_load(struct kimage *kimage) Fix it by moving the declaration of machine_kexec_post_load() from kexec_internal.h to the public header instead. Reported-by: kernel test robot Link: https://lore.kernel.org/linux-arm-kernel/202102030727.gqTokACH-lkp@intel.com Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210219195142.13571-1-pasha.tatashin@soleen.com Fixes: 4c3c31230c91 ("arm64: kexec: move relocation function setup") Signed-off-by: Will Deacon --- include/linux/kexec.h | 2 ++ kernel/kexec_internal.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9e93bef52968..3671b845cf28 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -309,6 +309,8 @@ extern void machine_kexec_cleanup(struct kimage *image); extern int kernel_kexec(void); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +int machine_kexec_post_load(struct kimage *image); + extern void __crash_kexec(struct pt_regs *); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 39d30ccf8d87..48aaf2ac0d0d 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -13,8 +13,6 @@ void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -int machine_kexec_post_load(struct kimage *image); - extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE -- cgit v1.3-8-gc7d7 From 179d1600723670dc0d6ae8ce572e0e2c44b64763 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 21 Feb 2021 21:29:55 -0800 Subject: block: remove superfluous param in blk_fill_rwbs() The last parameter for the function blk_fill_rwbs() was added in 5782138e47 ("tracing/events: convert block trace points to TRACE_EVENT()") in order to signal read request and use of that parameter was replaced with using switch case REQ_OP_READ with 1b9a9ab78b0 ("blktrace: use op accessors"), but the parameter was never removed. Remove the unused parameter and adjust the respective call sites. Fixes: 1b9a9ab78b0 ("blktrace: use op accessors") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 2 +- include/trace/events/bcache.h | 10 +++++----- include/trace/events/block.h | 16 ++++++++-------- kernel/trace/blktrace.c | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 05556573b896..11484f1d19a1 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -119,7 +119,7 @@ struct compat_blk_user_trace_setup { #endif -extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); +extern void blk_fill_rwbs(char *rwbs, unsigned int op); static inline sector_t blk_rq_trace_sector(struct request *rq) { diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index e41c611d6d3b..899fdacf57b9 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->sector = bio->bi_iter.bi_sector; __entry->orig_sector = bio->bi_iter.bi_sector - 16; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", @@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u", @@ -137,7 +137,7 @@ TRACE_EVENT(bcache_read, __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); __entry->cache_hit = hit; __entry->bypass = bypass; ), @@ -168,7 +168,7 @@ TRACE_EVENT(bcache_write, __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); __entry->writeback = writeback; __entry->bypass = bypass; ), @@ -238,7 +238,7 @@ TRACE_EVENT(bcache_journal_write, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; __entry->nr_keys = keys; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u keys %u", diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 0d782663a005..879cba8bdfca 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -89,7 +89,7 @@ TRACE_EVENT(block_rq_requeue, __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), @@ -133,7 +133,7 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector = nr_bytes >> 9; __entry->error = error; - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), @@ -166,7 +166,7 @@ DECLARE_EVENT_CLASS(block_rq, __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -249,7 +249,7 @@ TRACE_EVENT(block_bio_complete, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = blk_status_to_errno(bio->bi_status); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u [%d]", @@ -276,7 +276,7 @@ DECLARE_EVENT_CLASS(block_bio, __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -433,7 +433,7 @@ TRACE_EVENT(block_split, __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -474,7 +474,7 @@ TRACE_EVENT(block_bio_remap, __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", @@ -518,7 +518,7 @@ TRACE_EVENT(block_rq_remap, __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e9ee4945043..8a2591c7aa41 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1867,7 +1867,7 @@ void blk_trace_remove_sysfs(struct device *dev) #ifdef CONFIG_EVENT_TRACING -void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) +void blk_fill_rwbs(char *rwbs, unsigned int op) { int i = 0; -- cgit v1.3-8-gc7d7 From 1f83bb4b491472310ae7aeca505ed3725149906c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 21 Feb 2021 21:29:56 -0800 Subject: blktrace: add blk_fill_rwbs documentation comment blk_fill_rwbs() is an expoted function, add kernel style documentation comment. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 2 +- kernel/trace/blktrace.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 11484f1d19a1..e17d04abf6a3 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -119,7 +119,7 @@ struct compat_blk_user_trace_setup { #endif -extern void blk_fill_rwbs(char *rwbs, unsigned int op); +void blk_fill_rwbs(char *rwbs, unsigned int op); static inline sector_t blk_rq_trace_sector(struct request *rq) { diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8a2591c7aa41..d7ebef83771c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1867,6 +1867,16 @@ void blk_trace_remove_sysfs(struct device *dev) #ifdef CONFIG_EVENT_TRACING +/** + * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. + * @rwbs buffer to be filled + * @op: REQ_OP_XXX for the tracepoint + * + * Description: + * Maps the REQ_OP_XXX to character and fills the buffer provided by the + * caller with resulting string. + * + **/ void blk_fill_rwbs(char *rwbs, unsigned int op) { int i = 0; -- cgit v1.3-8-gc7d7 From 26a7e094783d482f3e125f09945a5bb1d867b2e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 11:08:35 +0100 Subject: swiotlb: refactor swiotlb_tbl_map_single Split out a bunch of a self-contained helpers to make the function easier to follow. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 179 +++++++++++++++++++++++++-------------------------- 1 file changed, 89 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b38b1553c466..6962cb4efb08 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -468,134 +468,133 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, - size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); - unsigned long flags; - phys_addr_t tlb_addr; - unsigned int nslots, stride, index, wrap; - int i; - unsigned long mask; - unsigned long offset_slots; - unsigned long max_slots; - unsigned long tmp_io_tlb_used; - - if (no_iotlb_memory) - panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); - - if (mem_encrypt_active()) - pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); +#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) - if (mapping_size > alloc_size) { - dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", - mapping_size, alloc_size); - return (phys_addr_t)DMA_MAPPING_ERROR; - } - - mask = dma_get_seg_boundary(hwdev); +/* + * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. + */ +static inline unsigned long get_max_slots(unsigned long boundary_mask) +{ + if (boundary_mask == ~0UL) + return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + return nr_slots(boundary_mask + 1); +} - tbl_dma_addr &= mask; +static unsigned int wrap_index(unsigned int index) +{ + if (index >= io_tlb_nslabs) + return 0; + return index; +} - offset_slots = nr_slots(tbl_dma_addr); +/* + * Find a suitable number of IO TLB entries size that will fit this request and + * allocate a buffer from that IO TLB pool. + */ +static int find_slots(struct device *dev, size_t alloc_size) +{ + unsigned long boundary_mask = dma_get_seg_boundary(dev); + dma_addr_t tbl_dma_addr = + phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; + unsigned long max_slots = get_max_slots(boundary_mask); + unsigned int nslots = nr_slots(alloc_size), stride = 1; + unsigned int index, wrap, count = 0, i; + unsigned long flags; - /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ - max_slots = mask + 1 - ? nr_slots(mask + 1) - : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + BUG_ON(!nslots); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = nr_slots(alloc_size); if (alloc_size >= PAGE_SIZE) - stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); - else - stride = 1; + stride <<= (PAGE_SHIFT - IO_TLB_SHIFT); - BUG_ON(!nslots); - - /* - * Find suitable number of IO TLB entries size that will fit this - * request and allocate a buffer from that IO TLB pool. - */ spin_lock_irqsave(&io_tlb_lock, flags); - if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) goto not_found; - index = ALIGN(io_tlb_index, stride); - if (index >= io_tlb_nslabs) - index = 0; - wrap = index; - + index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); do { - while (iommu_is_span_boundary(index, nslots, offset_slots, - max_slots)) { - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - if (index == wrap) - goto not_found; - } - /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot * and mark the entries as '0' indicating unavailable. */ - if (io_tlb_list[index] >= nslots) { - int count = 0; - - for (i = index; i < (int) (index + nslots); i++) - io_tlb_list[i] = 0; - for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in the next - * round. - */ - io_tlb_index = ((index + nslots) < io_tlb_nslabs - ? (index + nslots) : 0); - - goto found; + if (!iommu_is_span_boundary(index, nslots, + nr_slots(tbl_dma_addr), + max_slots)) { + if (io_tlb_list[index] >= nslots) + goto found; } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; + index = wrap_index(index + stride); } while (index != wrap); not_found: - tmp_io_tlb_used = io_tlb_used; - spin_unlock_irqrestore(&io_tlb_lock, flags); - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, io_tlb_nslabs, tmp_io_tlb_used); - return (phys_addr_t)DMA_MAPPING_ERROR; + return -1; + found: + for (i = index; i < index + nslots; i++) + io_tlb_list[i] = 0; + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + + /* + * Update the indices to avoid searching in the next round. + */ + if (index + nslots < io_tlb_nslabs) + io_tlb_index = index + nslots; + else + io_tlb_index = 0; io_tlb_used += nslots; + spin_unlock_irqrestore(&io_tlb_lock, flags); + return index; +} + +phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) +{ + unsigned int index, i; + phys_addr_t tlb_addr; + + if (no_iotlb_memory) + panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + + if (mem_encrypt_active()) + pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); + + if (mapping_size > alloc_size) { + dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", + mapping_size, alloc_size); + return (phys_addr_t)DMA_MAPPING_ERROR; + } + + index = find_slots(dev, alloc_size); + if (index == -1) { + if (!(attrs & DMA_ATTR_NO_WARN)) + dev_warn_ratelimited(dev, + "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + alloc_size, io_tlb_nslabs, io_tlb_used); + return (phys_addr_t)DMA_MAPPING_ERROR; + } /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + for (i = 0; i < nr_slots(alloc_size); i++) + io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); + + tlb_addr = slot_addr(io_tlb_start, index); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); - return tlb_addr; } -- cgit v1.3-8-gc7d7 From 16fc3cef33a04632ab6b31758abdd77563a20759 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Feb 2021 10:44:16 +0100 Subject: swiotlb: don't modify orig_addr in swiotlb_tbl_sync_single swiotlb_tbl_map_single currently nevers sets a tlb_addr that is not aligned to the tlb bucket size. But we're going to add such a case soon, for which this adjustment would be bogus. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 6962cb4efb08..bf063badda1d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -660,7 +660,6 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr == INVALID_PHYS_ADDR) return; - orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1); switch (target) { case SYNC_FOR_CPU: -- cgit v1.3-8-gc7d7 From b1adbdbda458b2ec69bf5915c4dcdbe2bd5e7bad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Jan 2021 21:36:33 -0500 Subject: audit_alloc_mark(): don't open-code ERR_CAST() Signed-off-by: Al Viro --- kernel/audit_fsnotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 5b3f01da172b..60739d5e3373 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -84,7 +84,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa dentry = kern_path_locked(pathname, &path); if (IS_ERR(dentry)) - return (void *)dentry; /* returning an error */ + return ERR_CAST(dentry); /* returning an error */ inode = path.dentry->d_inode; inode_unlock(inode); -- cgit v1.3-8-gc7d7 From c0ea57608b691d6cde8aff23e11f9858a86b5918 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Feb 2021 16:52:47 +0100 Subject: blktrace: remove debugfs file dentries from struct blk_trace These debugfs dentries do not need to be saved for anything as the whole directory and everything in it is properly cleaned up when the parent directory is removed. So remove them from struct blk_trace and don't save them when created as it's not necessary. Cc: Jens Axboe Cc: Steven Rostedt Cc: Ingo Molnar Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 2 -- kernel/trace/blktrace.c | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index e17d04abf6a3..a083e15df608 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -23,8 +23,6 @@ struct blk_trace { u32 pid; u32 dev; struct dentry *dir; - struct dentry *dropped_file; - struct dentry *msg_file; struct list_head running_list; atomic_t dropped; }; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d7ebef83771c..0c8690d9f6cd 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -311,8 +311,6 @@ record_it: static void blk_trace_free(struct blk_trace *bt) { - debugfs_remove(bt->msg_file); - debugfs_remove(bt->dropped_file); relay_close(bt->rchan); debugfs_remove(bt->dir); free_percpu(bt->sequence); @@ -544,10 +542,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, INIT_LIST_HEAD(&bt->running_list); ret = -EIO; - bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, - &blk_dropped_fops); - - bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); + debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); -- cgit v1.3-8-gc7d7 From 22c8542d7b220ffc53816d47b371b1fe34341d4a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 25 Sep 2020 16:43:53 -0700 Subject: tracing: add support for objtool mcount This change adds build support for using objtool to generate __mcount_loc sections. Signed-off-by: Sami Tolvanen --- Makefile | 7 +++++++ kernel/trace/Kconfig | 13 +++++++++++++ scripts/Makefile.build | 3 +++ 3 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/Makefile b/Makefile index 2233951666f7..3189eacf3185 100644 --- a/Makefile +++ b/Makefile @@ -860,6 +860,9 @@ ifdef CONFIG_FTRACE_MCOUNT_USE_CC endif endif endif +ifdef CONFIG_FTRACE_MCOUNT_USE_OBJTOOL + CC_FLAGS_USING += -DCC_USING_NOP_MCOUNT +endif ifdef CONFIG_FTRACE_MCOUNT_USE_RECORDMCOUNT ifdef CONFIG_HAVE_C_RECORDMCOUNT BUILD_C_RECORDMCOUNT := y @@ -1236,6 +1239,10 @@ uapi-asm-generic: PHONY += prepare-objtool prepare-resolve_btfids prepare-objtool: $(objtool_target) ifeq ($(SKIP_STACK_VALIDATION),1) +ifdef CONFIG_FTRACE_MCOUNT_USE_OBJTOOL + @echo "error: Cannot generate __mcount_loc for CONFIG_DYNAMIC_FTRACE=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel" >&2 + @false +endif ifdef CONFIG_UNWINDER_ORC @echo "error: Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel" >&2 @false diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3985a31a49b7..936ccab1b938 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -60,6 +60,11 @@ config HAVE_NOP_MCOUNT help Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount +config HAVE_OBJTOOL_MCOUNT + bool + help + Arch supports objtool --mcount + config HAVE_C_RECORDMCOUNT bool help @@ -612,10 +617,18 @@ config FTRACE_MCOUNT_USE_CC depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on FTRACE_MCOUNT_RECORD +config FTRACE_MCOUNT_USE_OBJTOOL + def_bool y + depends on HAVE_OBJTOOL_MCOUNT + depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY + depends on !FTRACE_MCOUNT_USE_CC + depends on FTRACE_MCOUNT_RECORD + config FTRACE_MCOUNT_USE_RECORDMCOUNT def_bool y depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC + depends on !FTRACE_MCOUNT_USE_OBJTOOL depends on FTRACE_MCOUNT_RECORD config TRACING_MAP diff --git a/scripts/Makefile.build b/scripts/Makefile.build index d94fc9ab819d..59fabd9685ee 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -238,6 +238,9 @@ endif ifdef CONFIG_X86_SMAP objtool_args += --uaccess endif +ifdef CONFIG_FTRACE_MCOUNT_USE_OBJTOOL + objtool_args += --mcount +endif # 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory # 'OBJECT_FILES_NON_STANDARD_foo.o := 'y': skip objtool checking for a file -- cgit v1.3-8-gc7d7 From 94d4bffdda21baa2c749bc229c41811a7559dd15 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 23 Feb 2021 23:16:23 -0800 Subject: blktrace: fix documentation for blk_fill_rw() Add missing ":" after rwbs function parameter documentation that fixes following warning :- ./kernel/trace/blktrace.c:1877: warning: Function parameter or member 'rwbs' not described in 'blk_fill_rwbs' Reported-by: Stephen Rothwell Fixes: 1f83bb4b4914 ("blktrace: add blk_fill_rwbs documentation comment") Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0c8690d9f6cd..ca6f0ceba09b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1865,7 +1865,7 @@ void blk_trace_remove_sysfs(struct device *dev) /** * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. - * @rwbs buffer to be filled + * @rwbs: buffer to be filled * @op: REQ_OP_XXX for the tracepoint * * Description: -- cgit v1.3-8-gc7d7 From 1f221a0d0dbf0e48ef3a9c62871281d6a7819f05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Feb 2021 14:39:44 -0500 Subject: swiotlb: respect min_align_mask Respect the min_align_mask in struct device_dma_parameters in swiotlb. There are two parts to it: 1) for the lower bits of the alignment inside the io tlb slot, just extent the size of the allocation and leave the start of the slot empty 2) for the high bits ensure we find a slot that matches the high bits of the alignment to avoid wasting too much memory Based on an earlier patch from Jianxiong Gao . Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index bf063badda1d..010d531999a6 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -470,6 +470,14 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, #define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) +/* + * Return the offset into a iotlb slot required to keep the device happy. + */ +static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +{ + return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); +} + /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ @@ -491,24 +499,29 @@ static unsigned int wrap_index(unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. */ -static int find_slots(struct device *dev, size_t alloc_size) +static int find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size) { unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); - unsigned int nslots = nr_slots(alloc_size), stride = 1; + unsigned int iotlb_align_mask = + dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); + unsigned int nslots = nr_slots(alloc_size), stride; unsigned int index, wrap, count = 0, i; unsigned long flags; BUG_ON(!nslots); /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. + * For mappings with an alignment requirement don't bother looping to + * unaligned slots once we found an aligned one. For allocations of + * PAGE_SIZE or larger only look for page aligned allocations. */ + stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; if (alloc_size >= PAGE_SIZE) - stride <<= (PAGE_SHIFT - IO_TLB_SHIFT); + stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); spin_lock_irqsave(&io_tlb_lock, flags); if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) @@ -516,6 +529,12 @@ static int find_slots(struct device *dev, size_t alloc_size) index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); do { + if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != + (orig_addr & iotlb_align_mask)) { + index = wrap_index(index + 1); + continue; + } + /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot @@ -559,6 +578,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { + unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, i; phys_addr_t tlb_addr; @@ -574,7 +594,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } - index = find_slots(dev, alloc_size); + index = find_slots(dev, orig_addr, alloc_size + offset); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, @@ -588,10 +608,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nr_slots(alloc_size); i++) + for (i = 0; i < nr_slots(alloc_size + offset); i++) io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - tlb_addr = slot_addr(io_tlb_start, index); + tlb_addr = slot_addr(io_tlb_start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); @@ -606,8 +626,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = nr_slots(alloc_size); - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); + int i, count, nslots = nr_slots(alloc_size + offset); + int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; /* -- cgit v1.3-8-gc7d7 From daf9514fd5eb098d7d6f3a1247cb8cc48fc94155 Mon Sep 17 00:00:00 2001 From: Martin Radev Date: Tue, 12 Jan 2021 16:07:29 +0100 Subject: swiotlb: Validate bounce size in the sync/unmap path The size of the buffer being bounced is not checked if it happens to be larger than the size of the mapped buffer. Because the size can be controlled by a device, as it's the case with virtio devices, this can lead to memory corruption. This patch saves the remaining buffer memory for each slab and uses that information for validation in the sync/unmap paths before swiotlb_bounce is called. Validating this argument is important under the threat models of AMD SEV-SNP and Intel TDX, where the HV is considered untrusted. Signed-off-by: Martin Radev Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 010d531999a6..c10e855a03bc 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -99,6 +99,11 @@ static unsigned int max_segment; #define INVALID_PHYS_ADDR (~(phys_addr_t)0) static phys_addr_t *io_tlb_orig_addr; +/* + * The mapped buffer's size should be validated during a sync operation. + */ +static size_t *io_tlb_orig_size; + /* * Protect the above data structures in the map and unmap calls */ @@ -247,9 +252,16 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); + io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_orig_size) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -370,7 +382,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) * between io_tlb_start and io_tlb_end. */ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); + get_order(io_tlb_nslabs * sizeof(int))); if (!io_tlb_list) goto cleanup3; @@ -381,9 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!io_tlb_orig_addr) goto cleanup4; + io_tlb_orig_size = (size_t *) + __get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * + sizeof(size_t))); + if (!io_tlb_orig_size) + goto cleanup5; + + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -396,6 +417,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) return 0; +cleanup5: + free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * + sizeof(phys_addr_t))); + cleanup4: free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); @@ -411,6 +436,8 @@ void __init swiotlb_exit(void) return; if (late_alloc) { + free_pages((unsigned long)io_tlb_orig_size, + get_order(io_tlb_nslabs * sizeof(size_t))); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * @@ -420,6 +447,8 @@ void __init swiotlb_exit(void) } else { memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_orig_size), + PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); memblock_free_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); memblock_free_late(io_tlb_start, @@ -608,9 +637,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nr_slots(alloc_size + offset); i++) + for (i = 0; i < nr_slots(alloc_size + offset); i++) { io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - + io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + } tlb_addr = slot_addr(io_tlb_start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) @@ -618,6 +648,17 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return tlb_addr; } +static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size) +{ + if (*size > orig_size) { + /* Warn and truncate mapping_size */ + dev_WARN_ONCE(hwdev, 1, + "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", + orig_size, *size); + *size = orig_size; + } +} + /* * tlb_addr is the physical address of the bounce buffer to unmap. */ @@ -631,6 +672,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; + validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size); + /* * First, sync the memory before unmapping the entry */ @@ -658,6 +701,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, for (i = index + nslots - 1; i >= index; i--) { io_tlb_list[i] = ++count; io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } /* @@ -677,11 +721,14 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_sync_target target) { int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + size_t orig_size = io_tlb_orig_size[index]; phys_addr_t orig_addr = io_tlb_orig_addr[index]; if (orig_addr == INVALID_PHYS_ADDR) return; + validate_sync_size_and_truncate(hwdev, orig_size, &size); + switch (target) { case SYNC_FOR_CPU: if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) -- cgit v1.3-8-gc7d7 From 9c0dee54eb91d48cca048bd7bd2c1f4a166e0252 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:44 -0800 Subject: tracing: add error_report_end trace point Patch series "Add error_report_end tracepoint to KFENCE and KASAN", v3. This patchset adds a tracepoint, error_repor_end, that is to be used by KFENCE, KASAN, and potentially other bug detection tools, when they print an error report. One of the possible use cases is userspace collection of kernel error reports: interested parties can subscribe to the tracing event via tracefs, and get notified when an error report occurs. This patch (of 3): Introduce error_report_end tracepoint. It can be used in debugging tools like KASAN, KFENCE, etc. to provide extensions to the error reporting mechanisms (e.g. allow tests hook into error reporting, ease error report collection from production kernels). Another benefit would be making use of ftrace for debugging or benchmarking the tools themselves. Should we need it, the tracepoint name leaves us with the possibility to introduce a complementary error_report_start tracepoint in the future. Link: https://lkml.kernel.org/r/20210121131915.1331302-1-glider@google.com Link: https://lkml.kernel.org/r/20210121131915.1331302-2-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/error_report.h | 74 +++++++++++++++++++++++++++++++++++++ kernel/trace/Makefile | 1 + kernel/trace/error_report-traces.c | 11 ++++++ 3 files changed, 86 insertions(+) create mode 100644 include/trace/events/error_report.h create mode 100644 kernel/trace/error_report-traces.c (limited to 'kernel') diff --git a/include/trace/events/error_report.h b/include/trace/events/error_report.h new file mode 100644 index 000000000000..96f64bf218b2 --- /dev/null +++ b/include/trace/events/error_report.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Declarations for error reporting tracepoints. + * + * Copyright (C) 2021, Google LLC. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM error_report + +#if !defined(_TRACE_ERROR_REPORT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ERROR_REPORT_H + +#include + +#ifndef __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY +#define __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY + +enum error_detector { + ERROR_DETECTOR_KFENCE, + ERROR_DETECTOR_KASAN +}; + +#endif /* __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY */ + +#define error_detector_list \ + EM(ERROR_DETECTOR_KFENCE, "kfence") \ + EMe(ERROR_DETECTOR_KASAN, "kasan") +/* Always end the list with an EMe. */ + +#undef EM +#undef EMe + +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +error_detector_list + +#undef EM +#undef EMe + +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +#define show_error_detector_list(val) \ + __print_symbolic(val, error_detector_list) + +DECLARE_EVENT_CLASS(error_report_template, + TP_PROTO(enum error_detector error_detector, unsigned long id), + TP_ARGS(error_detector, id), + TP_STRUCT__entry(__field(enum error_detector, error_detector) + __field(unsigned long, id)), + TP_fast_assign(__entry->error_detector = error_detector; + __entry->id = id;), + TP_printk("[%s] %lx", + show_error_detector_list(__entry->error_detector), + __entry->id)); + +/** + * error_report_end - called after printing the error report + * @error_detector: short string describing the error detection tool + * @id: pseudo-unique descriptor identifying the report + * (e.g. the memory access address) + * + * This event occurs right after a debugging tool finishes printing the error + * report. + */ +DEFINE_EVENT(error_report_template, error_report_end, + TP_PROTO(enum error_detector error_detector, unsigned long id), + TP_ARGS(error_detector, id)); + +#endif /* _TRACE_ERROR_REPORT_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 7e44cea89fdc..b28d3e5013cd 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o +obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o diff --git a/kernel/trace/error_report-traces.c b/kernel/trace/error_report-traces.c new file mode 100644 index 000000000000..f89792c25b11 --- /dev/null +++ b/kernel/trace/error_report-traces.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Error reporting trace points. + * + * Copyright (C) 2021, Google LLC. + */ + +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(error_report_end); -- cgit v1.3-8-gc7d7 From 3b3376f222e3ab58367d9dd405cafd09d5e37b7c Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Thu, 25 Feb 2021 17:20:53 -0800 Subject: sysctl.c: fix underflow value setting risk in vm_table Apart from subsystem specific .proc_handler handler, all ctl_tables with extra1 and extra2 members set should use proc_dointvec_minmax instead of proc_dointvec, or the limit set in extra* never work and potentially echo underflow values(negative numbers) is likely make system unstable. Especially vfs_cache_pressure and zone_reclaim_mode, -1 is apparently not a valid value, but we can set to them. And then kernel may crash. # echo -1 > /proc/sys/vm/vfs_cache_pressure Link: https://lkml.kernel.org/r/20201223105535.2875-1-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9fbdd848138..62fbd09b5dc1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2962,7 +2962,7 @@ static struct ctl_table vm_table[] = { .data = &block_dump, .maxlen = sizeof(block_dump), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { @@ -2970,7 +2970,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ @@ -2980,7 +2980,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif @@ -2990,7 +2990,7 @@ static struct ctl_table vm_table[] = { .data = &node_reclaim_mode, .maxlen = sizeof(node_reclaim_mode), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { -- cgit v1.3-8-gc7d7 From e1e014115dfd48ab3e3691ce46f9484ce12e67d4 Mon Sep 17 00:00:00 2001 From: Hubert Jasudowicz Date: Thu, 25 Feb 2021 17:21:07 -0800 Subject: groups: simplify struct group_info allocation Combine kmalloc and vmalloc into a single call. Use struct_size macro instead of direct size calculation. Link: https://lkml.kernel.org/r/ba9ba5beea9a44b7196c41a0d9528abd5f20dd2e.1611620846.git.hubert.jasudowicz@gmail.com Signed-off-by: Hubert Jasudowicz Cc: Gao Xiang Cc: Micah Morton Cc: Michael Kelley Cc: "Peter Zijlstra (Intel)" Cc: Thomas Cedeno Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index fe7e6385530e..787b381c7c00 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -15,12 +15,7 @@ struct group_info *groups_alloc(int gidsetsize) { struct group_info *gi; - unsigned int len; - - len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; - gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); - if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); + gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT); if (!gi) return NULL; -- cgit v1.3-8-gc7d7 From c034f48e99907d5be147ac8f0f3e630a9307c2be Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:21:10 -0800 Subject: kernel: delete repeated words in comments Drop repeated words in kernel/events/. {if, the, that, with, time} Drop repeated words in kernel/locking/. {it, no, the} Drop repeated words in kernel/sched/. {in, not} Link: https://lkml.kernel.org/r/20210127023412.26292-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Will Deacon [kernel/locking/] Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Will Deacon Cc: Mathieu Desnoyers Cc: "Paul E. McKenney" Cc: Juri Lelli Cc: Vincent Guittot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/core.c | 8 ++++---- kernel/events/uprobes.c | 2 +- kernel/locking/rtmutex.c | 4 ++-- kernel/locking/rwsem.c | 2 +- kernel/locking/semaphore.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/membarrier.c | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 129dee540a8b..0aeca5f3c0ac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -269,7 +269,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to - * stabilize the the event->ctx relation. See + * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); @@ -1303,7 +1303,7 @@ static void put_ctx(struct perf_event_context *ctx) * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * - * But remember that that these are parent<->child context relations, and + * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * @@ -1442,7 +1442,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. * - * This has to cope with with the fact that until it is locked, + * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * @@ -2486,7 +2486,7 @@ static void perf_set_shadow_time(struct perf_event *event, * But this is a bit hairy. * * So instead, we have an explicit cgroup call to remain - * within the time time source all along. We believe it + * within the time source all along. We believe it * is cleaner and simpler to understand. */ if (is_cgroup_event(event)) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3ea7f8f92f1d..6addc9780319 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1733,7 +1733,7 @@ void uprobe_free_utask(struct task_struct *t) } /* - * Allocate a uprobe_task object for the task if if necessary. + * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 03b21135313c..48fff6437901 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1420,7 +1420,7 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, } /* - * Performs the wakeup of the the top-waiter and re-enables preemption. + * Performs the wakeup of the top-waiter and re-enables preemption. */ void rt_mutex_postunlock(struct wake_q_head *wake_q) { @@ -1819,7 +1819,7 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) * been started. * @waiter: the pre-initialized rt_mutex_waiter * - * Wait for the the lock acquisition started on our behalf by + * Wait for the lock acquisition started on our behalf by * rt_mutex_start_proxy_lock(). Upon failure, the caller must call * rt_mutex_cleanup_proxy_lock(). * diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ba67600c7b2c..abba5df50006 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1048,7 +1048,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* * If there were already threads queued before us and: - * 1) there are no no active locks, wake the front + * 1) there are no active locks, wake the front * queued process(es) as the handoff bit might be set. * 2) there are no active writers and some readers, the lock * must be read owned; so we try to wake any read lock diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index d9dd94defc0a..9aa855a96c4a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -119,7 +119,7 @@ EXPORT_SYMBOL(down_killable); * @sem: the semaphore to be acquired * * Try to acquire the semaphore atomically. Returns 0 if the semaphore has - * been acquired successfully or 1 if it it cannot be acquired. + * been acquired successfully or 1 if it cannot be acquired. * * NOTE: This return value is inverted from both spin_trylock and * mutex_trylock! Be careful about this when converting code. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8a8bd7b13634..794c2cb945f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5126,7 +5126,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of - * runtime as update_curr() throttling can not not trigger until it's on-rq. + * runtime as update_curr() throttling can not trigger until it's on-rq. */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 08ae45ad9261..acdae625c636 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -454,7 +454,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) /* * For each cpu runqueue, if the task's mm match @mm, ensure that all - * @mm's membarrier state set bits are also set in in the runqueue's + * @mm's membarrier state set bits are also set in the runqueue's * membarrier state. This ensures that a runqueue scheduling * between threads which are users of @mm has its membarrier state * updated. -- cgit v1.3-8-gc7d7 From d54ce6158e354f5358a547b96299ecd7f3725393 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Thu, 25 Feb 2021 17:22:38 -0800 Subject: kgdb: fix to kill breakpoints on initmem after boot Currently breakpoints in kernel .init.text section are not handled correctly while allowing to remove them even after corresponding pages have been freed. Fix it via killing .init.text section breakpoints just prior to initmem pages being freed. Doug: "HW breakpoints aren't handled by this patch but it's probably not such a big deal". Link: https://lkml.kernel.org/r/20210224081652.587785-1-sumit.garg@linaro.org Signed-off-by: Sumit Garg Suggested-by: Doug Anderson Acked-by: Doug Anderson Acked-by: Daniel Thompson Tested-by: Daniel Thompson Cc: Masami Hiramatsu Cc: Steven Rostedt (VMware) Cc: Jason Wessel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kgdb.h | 2 ++ init/main.c | 1 + kernel/debug/debug_core.c | 11 +++++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 0444b44bd156..392a3670944c 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -359,9 +359,11 @@ extern atomic_t kgdb_active; extern bool dbg_is_early; extern void __init dbg_late_init(void); extern void kgdb_panic(const char *msg); +extern void kgdb_free_init_mem(void); #else /* ! CONFIG_KGDB */ #define in_dbg_master() (0) #define dbg_late_init() static inline void kgdb_panic(const char *msg) {} +static inline void kgdb_free_init_mem(void) { } #endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/init/main.c b/init/main.c index 3648c9f94882..53b278845b88 100644 --- a/init/main.c +++ b/init/main.c @@ -1426,6 +1426,7 @@ static int __ref kernel_init(void *unused) async_synchronize_full(); kprobe_free_init_mem(); ftrace_free_init_mem(); + kgdb_free_init_mem(); free_initmem(); mark_readonly(); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b636d517c02c..4708aec492df 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -455,6 +455,17 @@ setundefined: return 0; } +void kgdb_free_init_mem(void) +{ + int i; + + /* Clear init memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (init_section_contains((void *)kgdb_break[i].bpt_addr, 0)) + kgdb_break[i].state = BP_UNDEFINED; + } +} + #ifdef CONFIG_KGDB_KDB void kdb_dump_stack_on_cpu(int cpu) { -- cgit v1.3-8-gc7d7 From 70d443d8463339869f371e77fa594b850f374565 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 4 Mar 2021 11:23:48 +0200 Subject: tracing: Remove duplicate declaration from trace.h A declaration of function "int trace_empty(struct trace_iterator *iter)" shows up twice in the header file kernel/trace/trace.h Link: https://lkml.kernel.org/r/20210304092348.208033-1-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dec13ff66077..a6446c03cfbc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -605,7 +605,6 @@ void trace_graph_function(struct trace_array *tr, void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); -int trace_empty(struct trace_iterator *iter); void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -- cgit v1.3-8-gc7d7 From 69268094a1c16f3f44b369f9da78ce98bab5f244 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Tue, 2 Mar 2021 09:49:28 +0100 Subject: tracing: Fix help text of TRACEPOINT_BENCHMARK in Kconfig It's "cond_resched()" not "cond_sched()". Link: https://lkml.kernel.org/r/1863065.aFVDpXsuPd@devpool47 Signed-off-by: Rolf Eike Beer Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9c266b93cbc0..7fa82778c3e6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -694,7 +694,7 @@ config TRACEPOINT_BENCHMARK help This option creates the tracepoint "benchmark:benchmark_event". When the tracepoint is enabled, it kicks off a kernel thread that - goes into an infinite loop (calling cond_sched() to let other tasks + goes into an infinite loop (calling cond_resched() to let other tasks run), and calls the tracepoint. Each iteration will record the time it took to write to the tracepoint and the next iteration that data will be passed to the tracepoint itself. That is, the tracepoint -- cgit v1.3-8-gc7d7 From 6f6be606e763f2da9fc21de00538c97fe4ca1492 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 3 Mar 2021 18:03:52 -0500 Subject: ring-buffer: Force before_stamp and write_stamp to be different on discard Part of the logic of the new time stamp code depends on the before_stamp and the write_stamp to be different if the write_stamp does not match the last event on the buffer, as it will be used to calculate the delta of the next event written on the buffer. The discard logic depends on this, as the next event to come in needs to inject a full timestamp as it can not rely on the last event timestamp in the buffer because it is unknown due to events after it being discarded. But by changing the write_stamp back to the time before it, it forces the next event to use a full time stamp, instead of relying on it. The issue came when a full time stamp was used for the event, and rb_time_delta() returns zero in that case. The update to the write_stamp (which subtracts delta) made it not change. Then when the event is removed from the buffer, because the before_stamp and write_stamp still match, the next event written would calculate its delta from the write_stamp, but that would be wrong as the write_stamp is of the time of the event that was discarded. In the case that the delta change being made to write_stamp is zero, set the before_stamp to zero as well, and this will force the next event to inject a full timestamp and not use the current write_stamp. Cc: stable@vger.kernel.org Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b9dad3500041..342f49c3ccc5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2814,6 +2814,17 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, write_stamp, write_stamp - delta)) return 0; + /* + * It's possible that the event time delta is zero + * (has the same time stamp as the previous event) + * in which case write_stamp and before_stamp could + * be the same. In such a case, force before_stamp + * to be different than write_stamp. It doesn't + * matter what it is, as long as its different. + */ + if (!delta) + rb_time_set(&cpu_buffer->before_stamp, 0); + /* * If an event were to come in now, it would see that the * write_stamp and the before_stamp are different, and assume -- cgit v1.3-8-gc7d7 From 6549de1fe34162d7ace8b870ae11ca6cae5b8609 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 3 Mar 2021 18:23:40 -0500 Subject: ring-buffer: Add a little more information and a WARN when time stamp going backwards is detected When the CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS is enabled, and the time stamps are detected as not being valid, it reports information about the write stamp, but does not show the before_stamp which is still useful information. Also, it should give a warning once, such that tests detect this happening. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 342f49c3ccc5..68744c51517e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3318,9 +3318,13 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, goto out; } atomic_inc(&cpu_buffer->record_disabled); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld after:%lld\n", - cpu_buffer->cpu, - ts + info->delta, info->ts, info->delta, info->after); + /* There's some cases in boot up that this can happen */ + WARN_ON_ONCE(system_state != SYSTEM_BOOTING); + pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, + info->before, info->after, + full ? " (full)" : ""); dump_buffer_page(bpage, info, tail); atomic_dec(&ts_dump); /* Do not re-enable checking */ -- cgit v1.3-8-gc7d7 From f40fc799afc598b3d130d5a0ada994c9d4fb6cf8 Mon Sep 17 00:00:00 2001 From: Vamshi K Sthambamkadi Date: Thu, 4 Mar 2021 15:15:24 +0530 Subject: tracing: Fix memory leak in __create_synth_event() kmemleak report: unreferenced object 0xc5a6f708 (size 8): comm "ftracetest", pid 1209, jiffies 4294911500 (age 6.816s) hex dump (first 8 bytes): 00 c1 3d 60 14 83 1f 8a ..=`.... backtrace: [] __kmalloc_track_caller+0x2a6/0x460 [<7d3d60a6>] kstrndup+0x37/0x70 [<45a0e739>] argv_split+0x1c/0x120 [] __create_synth_event+0x192/0xb00 [<0708b8a3>] create_synth_event+0xbb/0x150 [<3d1941e1>] create_dyn_event+0x5c/0xb0 [<5cf8b9e3>] trace_parse_run_command+0xa7/0x140 [<04deb2ef>] dyn_event_write+0x10/0x20 [<8779ac95>] vfs_write+0xa9/0x3c0 [] ksys_write+0x89/0xc0 [] __ia32_sys_write+0x15/0x20 [<7ce02d85>] __do_fast_syscall_32+0x45/0x80 [] do_fast_syscall_32+0x29/0x60 [<2467454a>] do_SYSENTER_32+0x15/0x20 [<9beaa61d>] entry_SYSENTER_32+0xa9/0xfc unreferenced object 0xc5a6f078 (size 8): comm "ftracetest", pid 1209, jiffies 4294911500 (age 6.816s) hex dump (first 8 bytes): 08 f7 a6 c5 00 00 00 00 ........ backtrace: [] __kmalloc+0x2b6/0x470 [] argv_split+0x82/0x120 [] __create_synth_event+0x192/0xb00 [<0708b8a3>] create_synth_event+0xbb/0x150 [<3d1941e1>] create_dyn_event+0x5c/0xb0 [<5cf8b9e3>] trace_parse_run_command+0xa7/0x140 [<04deb2ef>] dyn_event_write+0x10/0x20 [<8779ac95>] vfs_write+0xa9/0x3c0 [] ksys_write+0x89/0xc0 [] __ia32_sys_write+0x15/0x20 [<7ce02d85>] __do_fast_syscall_32+0x45/0x80 [] do_fast_syscall_32+0x29/0x60 [<2467454a>] do_SYSENTER_32+0x15/0x20 [<9beaa61d>] entry_SYSENTER_32+0xa9/0xfc In __create_synth_event(), while iterating field/type arguments, the argv_split() will return array of atleast 2 elements even when zero arguments(argc=0) are passed. for e.g. when there is double delimiter or string ends with delimiter To fix call argv_free() even when argc=0. Link: https://lkml.kernel.org/r/20210304094521.GA1826@cosmos Signed-off-by: Vamshi K Sthambamkadi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 2979a96595b4..8d71e6c83f10 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1225,8 +1225,10 @@ static int __create_synth_event(const char *name, const char *raw_fields) goto err; } - if (!argc) + if (!argc) { + argv_free(argv); continue; + } n_fields_this_loop = 0; consumed = 0; -- cgit v1.3-8-gc7d7 From ee666a185558ac9a929e53b902a568442ed62416 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Mar 2021 10:49:35 -0500 Subject: tracing: Skip selftests if tracing is disabled If tracing is disabled for some reason (traceoff_on_warning, command line, etc), the ftrace selftests are guaranteed to fail, as their results are defined by trace data in the ring buffers. If the ring buffers are turned off, the tests will fail, due to lack of data. Because tracing being disabled is for a specific reason (warning, user decided to, etc), it does not make sense to enable tracing to run the self tests, as the test output may corrupt the reason for the tracing to be disabled. Instead, simply skip the self tests and report that they are being skipped due to tracing being disabled. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e295c413580e..eccb4e1187cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1929,6 +1929,12 @@ static int run_tracer_selftest(struct tracer *type) if (!selftests_can_run) return save_selftest(type); + if (!tracing_is_on()) { + pr_warn("Selftest for tracer %s skipped due to tracing disabled\n", + type->name); + return 0; + } + /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current -- cgit v1.3-8-gc7d7 From cc440e8738e5c875297ac0e90316745093be7e28 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Mar 2021 12:21:05 -0700 Subject: kernel: provide create_io_thread() helper Provide a generic helper for setting up an io_uring worker. Returns a task_struct so that the caller can do whatever setup is needed, then call wake_up_new_task() to kick it into gear. Add a kernel_clone_args member, io_thread, which tells copy_process() to mark the task with PF_IO_WORKER. Signed-off-by: Jens Axboe --- include/linux/sched/task.h | 2 ++ kernel/fork.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index c0f71f2e7160..ef02be869cf2 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -31,6 +31,7 @@ struct kernel_clone_args { /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; + int io_thread; struct cgroup *cgrp; struct css_set *cset; }; @@ -82,6 +83,7 @@ extern void exit_files(struct task_struct *); extern void exit_itimers(struct signal_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); +struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/kernel/fork.c b/kernel/fork.c index d66cd1014211..d3171e8e88e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1940,6 +1940,8 @@ static __latent_entropy struct task_struct *copy_process( p = dup_task_struct(current, node); if (!p) goto fork_out; + if (args->io_thread) + p->flags |= PF_IO_WORKER; /* * This _must_ happen before we call free_task(), i.e. before we jump @@ -2410,6 +2412,34 @@ struct mm_struct *copy_init_mm(void) return dup_mm(NULL, &init_mm); } +/* + * This is like kernel_clone(), but shaved down and tailored to just + * creating io_uring workers. It returns a created task, or an error pointer. + * The returned task is inactive, and the caller must fire it up through + * wake_up_new_task(p). All signals are blocked in the created task. + */ +struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) +{ + unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| + CLONE_IO; + struct kernel_clone_args args = { + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .stack = (unsigned long)fn, + .stack_size = (unsigned long)arg, + .io_thread = 1, + }; + struct task_struct *tsk; + + tsk = copy_process(NULL, 0, node, &args); + if (!IS_ERR(tsk)) { + sigfillset(&tsk->blocked); + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + } + return tsk; +} + /* * Ok, this is the main fork-routine. * -- cgit v1.3-8-gc7d7