228 files changed, 2980 insertions, 1801 deletions
diff --git a/Documentation/ABI/stable/sysfs-class-bluetooth b/Documentation/ABI/stable/sysfs-class-bluetooth new file mode 100644 index 000000000000..36be02471174 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-class-bluetooth @@ -0,0 +1,9 @@ +What: /sys/class/bluetooth/hci<index>/reset +Date: 14-Jan-2025 +KernelVersion: 6.13 +Contact: linux-bluetooth@vger.kernel.org +Description: This write-only attribute allows users to trigger the vendor reset + method on the Bluetooth device when arbitrary data is written. + The reset may or may not be done through the device transport + (e.g., UART/USB), and can also be done through an out-of-band + approach such as GPIO. diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 51665a3e6a50..1e0e7358e14a 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -333,6 +333,4 @@ References .. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README -.. [#stefan] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ - .. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ diff --git a/Documentation/devicetree/bindings/net/qcom,ethqos.yaml b/Documentation/devicetree/bindings/net/qcom,ethqos.yaml index f117471fb06f..e7ee0d9efed8 100644 --- a/Documentation/devicetree/bindings/net/qcom,ethqos.yaml +++ b/Documentation/devicetree/bindings/net/qcom,ethqos.yaml @@ -22,12 +22,12 @@ properties: oneOf: - items: - enum: - - qcom,qcs8300-ethqos - - const: qcom,sa8775p-ethqos + - qcom,qcs615-ethqos + - const: qcom,qcs404-ethqos - items: - enum: - - qcom,qcs615-ethqos - - const: qcom,sm8150-ethqos + - qcom,qcs8300-ethqos + - const: qcom,sa8775p-ethqos - enum: - qcom,qcs404-ethqos - qcom,sa8775p-ethqos diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index f5e3676db954..d20a32b77b60 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -17,7 +17,8 @@ dentry_operations prototypes:: - int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_revalidate)(struct inode *, const struct qstr *, + struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, @@ -30,6 +31,8 @@ prototypes:: struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); + bool (*d_unalias_trylock)(const struct dentry *); + void (*d_unalias_unlock)(const struct dentry *); locking rules: @@ -49,6 +52,8 @@ d_dname: no no no no d_automount: no no yes no d_manage: no no yes (ref-walk) maybe d_real no no yes no +d_unalias_trylock yes no no no +d_unalias_unlock yes no no no ================== =========== ======== ============== ======== inode_operations diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index c1c121055204..1639e78e3146 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1141,3 +1141,19 @@ pointer are gone. set_blocksize() takes opened struct file instead of struct block_device now and it *must* be opened exclusive. + +--- + +** mandatory** + +->d_revalidate() gets two extra arguments - inode of parent directory and +name our dentry is expected to have. 
Both are stable (dir is pinned in +non-RCU case and will stay around during the call in RCU case, and name +is guaranteed to stay unchanging). Your instance doesn't have to use +either, but it often helps to avoid a lot of painful boilerplate. +Note that while name->name is stable and NUL-terminated, it may (and +often will) have name->name[name->len] equal to '/' rather than '\0' - +in normal case it points into the pathname being looked up. +NOTE: if you need something like full path from the root of filesystem, +you are still on your own - this assists with simple cases, but it's not +magic. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 0b18af3f954e..31eea688609a 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1251,7 +1251,8 @@ defined: .. code-block:: c struct dentry_operations { - int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_revalidate)(struct inode *, const struct qstr *, + struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, @@ -1264,6 +1265,8 @@ defined: struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); + bool (*d_unalias_trylock)(const struct dentry *); + void (*d_unalias_unlock)(const struct dentry *); }; ``d_revalidate`` @@ -1427,6 +1430,25 @@ defined: For non-regular files, the 'dentry' argument is returned. +``d_unalias_trylock`` + if present, will be called by d_splice_alias() before moving a + preexisting attached alias. Returning false prevents __d_move(), + making d_splice_alias() fail with -ESTALE. + + Rationale: setting FS_RENAME_DOES_D_MOVE will prevent d_move() + and d_exchange() calls from the outside of filesystem methods; + however, it does not guarantee that attached dentries won't + be renamed or moved by d_splice_alias() finding a preexisting + alias for a directory inode. Normally we would not care; + however, something that wants to stabilize the entire path to + root over a blocking operation might need that. See 9p for one + (and hopefully only) example. + +``d_unalias_unlock`` + should be paired with ``d_unalias_trylock``; that one is called after + __d_move() call in __d_unalias(). + + Each dentry has a pointer to its parent dentry, as well as a hash list of child dentries. Child dentries are basically like files in a directory. diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst index 62519d38c58b..b018ce346392 100644 --- a/Documentation/networking/can.rst +++ b/Documentation/networking/can.rst @@ -699,10 +699,10 @@ RAW socket option CAN_RAW_JOIN_FILTERS The CAN_RAW socket can set multiple CAN identifier specific filters that lead to multiple filters in the af_can.c filter processing. These filters -are indenpendent from each other which leads to logical OR'ed filters when +are independent from each other which leads to logical OR'ed filters when applied (see :ref:`socketcan-rawfilter`). -This socket option joines the given CAN filters in the way that only CAN +This socket option joins the given CAN filters in the way that only CAN frames are passed to user space that matched *all* given CAN filters. The semantic for the applied filters is therefore changed to a logical AND. 
diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index dc45c0211353..03e1d3610333 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -41,7 +41,7 @@ blackhole_timeout - INTEGER (seconds) MPTCP is re-enabled and will reset to the initial value when the blackhole issue goes away. - 0 to disable the blackhole detection. + 0 to disable the blackhole detection. This is a per-namespace sysctl. Default: 3600 diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst index 6083210ab2a4..f970a2be271a 100644 --- a/Documentation/networking/napi.rst +++ b/Documentation/networking/napi.rst @@ -362,7 +362,7 @@ It is expected that ``irq-suspend-timeout`` will be set to a value much larger than ``gro_flush_timeout`` as ``irq-suspend-timeout`` should suspend IRQs for the duration of one userland processing cycle. -While it is not stricly necessary to use ``napi_defer_hard_irqs`` and +While it is not strictly necessary to use ``napi_defer_hard_irqs`` and ``gro_flush_timeout`` to use IRQ suspension, their use is strongly recommended. diff --git a/Documentation/power/video.rst b/Documentation/power/video.rst index 337a2ba9f32f..8ab2458d1304 100644 --- a/Documentation/power/video.rst +++ b/Documentation/power/video.rst @@ -190,7 +190,7 @@ Toshiba Portege 3020CT s3_mode (3) Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK) Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK) Toshiba Satellite 4090XCDT ??? [#f1]_ -Toshiba Satellite P10-554 s3_bios,s3_mode (4)[#f3]_ +Toshiba Satellite P10-554 s3_bios,s3_mode (4) [#f3]_ Toshiba M30 (2) xor X with nvidia driver using internal AGP Uniwill 244IIO ??? [#f1]_ =============================== =============================================== diff --git a/MAINTAINERS b/MAINTAINERS index bc8ce7af3303..d1086e53a317 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4102,6 +4102,7 @@ S: Supported W: http://www.bluez.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git +F: Documentation/ABI/stable/sysfs-class-bluetooth F: include/net/bluetooth/ F: net/bluetooth/ @@ -16277,6 +16278,7 @@ F: drivers/scsi/sun3_scsi_vme.c NCSI LIBRARY M: Samuel Mendoza-Jonas <sam@mendozajonas.com> +R: Paul Fertser <fercerpav@gmail.com> S: Maintained F: net/ncsi/ @@ -16611,6 +16613,7 @@ F: tools/testing/selftests/net/mptcp/ NETWORKING [TCP] M: Eric Dumazet <edumazet@google.com> +M: Neal Cardwell <ncardwell@google.com> L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/net_cachelines/tcp_sock.rst diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 6e9545d8b0c7..9c9ec08d78c7 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -52,13 +52,19 @@ config KASAN_SHADOW_OFFSET depends on KASAN default 0x1C000000000000 -config GCC_ASM_FLAG_OUTPUT_BROKEN +config CC_ASM_FLAG_OUTPUT_BROKEN def_bool CC_IS_GCC && GCC_VERSION < 140200 help GCC versions before 14.2.0 may die with an internal compiler error in some configurations if flag output operands are used within inline assemblies. +config CC_HAS_ASM_AOR_FORMAT_FLAGS + def_bool !(CC_IS_CLANG && CLANG_VERSION < 190100) + help + Clang versions before 19.1.0 do not support A, + O, and R inline assembly format flags. 
+ config S390 def_bool y # @@ -72,6 +78,7 @@ config S390 select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 + select ARCH_HAS_CPU_FINALIZE_INIT select ARCH_HAS_CRC32 select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 3f25498dac65..5fae311203c2 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -22,7 +22,7 @@ KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ ifndef CONFIG_AS_IS_LLVM KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) endif -KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack +KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11 KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS_DECOMPRESSOR += -D__DECOMPRESSOR KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float -mbackchain diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c index 11e0c3d5dbc8..79afb5fa7f1f 100644 --- a/arch/s390/boot/als.c +++ b/arch/s390/boot/als.c @@ -46,7 +46,7 @@ void print_missing_facilities(void) * z/VM adds a four character prefix. */ if (strlen(als_str) > 70) { - boot_printk("%s\n", als_str); + boot_emerg("%s\n", als_str); *als_str = '\0'; } u16_to_decimal(val_str, i * BITS_PER_LONG + j); @@ -54,7 +54,7 @@ void print_missing_facilities(void) first = 0; } } - boot_printk("%s\n", als_str); + boot_emerg("%s\n", als_str); } static void facility_mismatch(void) @@ -62,10 +62,10 @@ static void facility_mismatch(void) struct cpuid id; get_cpu_id(&id); - boot_printk("The Linux kernel requires more recent processor hardware\n"); - boot_printk("Detected machine-type number: %4x\n", id.machine); + boot_emerg("The Linux kernel requires more recent processor hardware\n"); + boot_emerg("Detected machine-type number: %4x\n", id.machine); print_missing_facilities(); - boot_printk("See Principles of Operations for facility bits\n"); + boot_emerg("See Principles of Operations for facility bits\n"); disabled_wait(); } diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 56244fe78182..69f261566a64 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -8,6 +8,7 @@ #ifndef __ASSEMBLY__ +#include <linux/printk.h> #include <asm/physmem_info.h> struct machine_info { @@ -47,13 +48,16 @@ void physmem_set_usable_limit(unsigned long limit); void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size); void physmem_free(enum reserved_range_type type); /* for continuous/multiple allocations per type */ -unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size, - unsigned long align); +unsigned long physmem_alloc_or_die(enum reserved_range_type type, unsigned long size, + unsigned long align); +unsigned long physmem_alloc(enum reserved_range_type type, unsigned long size, + unsigned long align, bool die_on_oom); /* for single allocations, 1 per type */ unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size, unsigned long align, unsigned long min, unsigned long max, bool die_on_oom); unsigned long get_physmem_alloc_pos(void); +void dump_physmem_reserved(void); bool ipl_report_certs_intersects(unsigned long addr, unsigned long size, unsigned long *intersection_start); bool is_ipl_block_dump(void); @@ -69,12 +73,28 @@ void print_pgm_check_info(void); unsigned long randomize_within_range(unsigned long size, 
unsigned long align, unsigned long min, unsigned long max); void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit); -void __printf(1, 2) boot_printk(const char *fmt, ...); +int __printf(1, 2) boot_printk(const char *fmt, ...); void print_stacktrace(unsigned long sp); void error(char *m); int get_random(unsigned long limit, unsigned long *value); +void boot_rb_dump(void); + +#ifndef boot_fmt +#define boot_fmt(fmt) fmt +#endif + +#define boot_emerg(fmt, ...) boot_printk(KERN_EMERG boot_fmt(fmt), ##__VA_ARGS__) +#define boot_alert(fmt, ...) boot_printk(KERN_ALERT boot_fmt(fmt), ##__VA_ARGS__) +#define boot_crit(fmt, ...) boot_printk(KERN_CRIT boot_fmt(fmt), ##__VA_ARGS__) +#define boot_err(fmt, ...) boot_printk(KERN_ERR boot_fmt(fmt), ##__VA_ARGS__) +#define boot_warn(fmt, ...) boot_printk(KERN_WARNING boot_fmt(fmt), ##__VA_ARGS__) +#define boot_notice(fmt, ...) boot_printk(KERN_NOTICE boot_fmt(fmt), ##__VA_ARGS__) +#define boot_info(fmt, ...) boot_printk(KERN_INFO boot_fmt(fmt), ##__VA_ARGS__) +#define boot_debug(fmt, ...) boot_printk(KERN_DEBUG boot_fmt(fmt), ##__VA_ARGS__) extern struct machine_info machine; +extern int boot_console_loglevel; +extern bool boot_ignore_loglevel; /* Symbols defined by linker scripts */ extern const char kernel_version[]; diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index f478e8e9cbda..03500b9d9fb9 100644 --- a/arch/s390/boot/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/string.h> +#include <asm/boot_data.h> #include <asm/page.h> #include "decompressor.h" #include "boot.h" @@ -63,6 +64,15 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE; #include "../../../../lib/decompress_unzstd.c" #endif +static void decompress_error(char *m) +{ + if (bootdebug) + boot_rb_dump(); + boot_emerg("Decompression error: %s\n", m); + boot_emerg(" -- System halted\n"); + disabled_wait(); +} + unsigned long mem_safe_offset(void) { return ALIGN(free_mem_end_ptr, PAGE_SIZE); @@ -71,5 +81,5 @@ unsigned long mem_safe_offset(void) void deploy_kernel(void *output) { __decompress(_compressed_start, _compressed_end - _compressed_start, - NULL, NULL, output, vmlinux.image_size, NULL, error); + NULL, NULL, output, vmlinux.image_size, NULL, decompress_error); } diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 557462e62cd7..d3731f2983b7 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -215,7 +215,7 @@ static void check_cleared_facilities(void) for (i = 0; i < ARRAY_SIZE(als); i++) { if ((stfle_fac_list[i] & als[i]) != als[i]) { - boot_printk("Warning: The Linux kernel requires facilities cleared via command line option\n"); + boot_emerg("The Linux kernel requires facilities cleared via command line option\n"); print_missing_facilities(); break; } @@ -313,5 +313,23 @@ void parse_boot_command_line(void) #endif if (!strcmp(param, "relocate_lowcore") && test_facility(193)) relocate_lowcore = 1; + if (!strcmp(param, "earlyprintk")) + boot_earlyprintk = true; + if (!strcmp(param, "debug")) + boot_console_loglevel = CONSOLE_LOGLEVEL_DEBUG; + if (!strcmp(param, "bootdebug")) { + bootdebug = true; + if (val) + strncpy(bootdebug_filter, val, sizeof(bootdebug_filter) - 1); + } + if (!strcmp(param, "quiet")) + boot_console_loglevel = CONSOLE_LOGLEVEL_QUIET; + if (!strcmp(param, "ignore_loglevel")) + boot_ignore_loglevel = true; + if (!strcmp(param, "loglevel")) { + boot_console_loglevel = 
simple_strtoull(val, NULL, 10); + if (boot_console_loglevel < CONSOLE_LOGLEVEL_MIN) + boot_console_loglevel = CONSOLE_LOGLEVEL_MIN; + } } } diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c index d00898852a88..f73cd757a5f7 100644 --- a/arch/s390/boot/ipl_report.c +++ b/arch/s390/boot/ipl_report.c @@ -30,7 +30,6 @@ static unsigned long get_cert_comp_list_size(void) { struct ipl_rb_certificate_entry *cert; struct ipl_rb_component_entry *comp; - size_t size; /* * Find the length for the IPL report boot data @@ -155,7 +154,7 @@ void save_ipl_cert_comp_list(void) return; size = get_cert_comp_list_size(); - early_ipl_comp_list_addr = physmem_alloc_top_down(RR_CERT_COMP_LIST, size, sizeof(int)); + early_ipl_comp_list_addr = physmem_alloc_or_die(RR_CERT_COMP_LIST, size, sizeof(int)); ipl_cert_list_addr = early_ipl_comp_list_addr + early_ipl_comp_list_size; copy_components_bootdata(); diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index f864d2bff775..941f4c9e27cc 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -32,7 +32,7 @@ struct prng_parm { static int check_prng(void) { if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) { - boot_printk("KASLR disabled: CPU has no PRNG\n"); + boot_warn("KASLR disabled: CPU has no PRNG\n"); return 0; } if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) @@ -168,7 +168,7 @@ static unsigned long iterate_valid_positions(unsigned long size, unsigned long a * cannot have chains. * * On the other hand, "dynamic" or "repetitive" allocations are done via - * physmem_alloc_top_down(). These allocations are tightly packed together + * physmem_alloc_or_die(). These allocations are tightly packed together * top down from the end of online memory. physmem_alloc_pos represents * current position where those allocations start. * diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c index 5abe59fb3bc0..633f11600aab 100644 --- a/arch/s390/boot/pgm_check_info.c +++ b/arch/s390/boot/pgm_check_info.c @@ -17,13 +17,14 @@ void print_stacktrace(unsigned long sp) (unsigned long)_stack_end }; bool first = true; - boot_printk("Call Trace:\n"); + boot_emerg("Call Trace:\n"); while (!(sp & 0x7) && on_stack(&boot_stack, sp, sizeof(struct stack_frame))) { struct stack_frame *sf = (struct stack_frame *)sp; - boot_printk(first ? 
"(sp:%016lx [<%016lx>] %pS)\n" : - " sp:%016lx [<%016lx>] %pS\n", - sp, sf->gprs[8], (void *)sf->gprs[8]); + if (first) + boot_emerg("(sp:%016lx [<%016lx>] %pS)\n", sp, sf->gprs[8], (void *)sf->gprs[8]); + else + boot_emerg(" sp:%016lx [<%016lx>] %pS\n", sp, sf->gprs[8], (void *)sf->gprs[8]); if (sf->back_chain <= sp) break; sp = sf->back_chain; @@ -36,30 +37,30 @@ void print_pgm_check_info(void) unsigned long *gpregs = (unsigned long *)get_lowcore()->gpregs_save_area; struct psw_bits *psw = &psw_bits(get_lowcore()->psw_save_area); - boot_printk("Linux version %s\n", kernel_version); + if (bootdebug) + boot_rb_dump(); + boot_emerg("Linux version %s\n", kernel_version); if (!is_prot_virt_guest() && early_command_line[0]) - boot_printk("Kernel command line: %s\n", early_command_line); - boot_printk("Kernel fault: interruption code %04x ilc:%x\n", - get_lowcore()->pgm_code, get_lowcore()->pgm_ilc >> 1); + boot_emerg("Kernel command line: %s\n", early_command_line); + boot_emerg("Kernel fault: interruption code %04x ilc:%d\n", + get_lowcore()->pgm_code, get_lowcore()->pgm_ilc >> 1); if (kaslr_enabled()) { - boot_printk("Kernel random base: %lx\n", __kaslr_offset); - boot_printk("Kernel random base phys: %lx\n", __kaslr_offset_phys); + boot_emerg("Kernel random base: %lx\n", __kaslr_offset); + boot_emerg("Kernel random base phys: %lx\n", __kaslr_offset_phys); } - boot_printk("PSW : %016lx %016lx (%pS)\n", - get_lowcore()->psw_save_area.mask, - get_lowcore()->psw_save_area.addr, - (void *)get_lowcore()->psw_save_area.addr); - boot_printk( - " R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n", - psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck, - psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri, - psw->eaba); - boot_printk("GPRS: %016lx %016lx %016lx %016lx\n", gpregs[0], gpregs[1], gpregs[2], gpregs[3]); - boot_printk(" %016lx %016lx %016lx %016lx\n", gpregs[4], gpregs[5], gpregs[6], gpregs[7]); - boot_printk(" %016lx %016lx %016lx %016lx\n", gpregs[8], gpregs[9], gpregs[10], gpregs[11]); - boot_printk(" %016lx %016lx %016lx %016lx\n", gpregs[12], gpregs[13], gpregs[14], gpregs[15]); + boot_emerg("PSW : %016lx %016lx (%pS)\n", + get_lowcore()->psw_save_area.mask, + get_lowcore()->psw_save_area.addr, + (void *)get_lowcore()->psw_save_area.addr); + boot_emerg(" R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n", + psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck, + psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri, psw->eaba); + boot_emerg("GPRS: %016lx %016lx %016lx %016lx\n", gpregs[0], gpregs[1], gpregs[2], gpregs[3]); + boot_emerg(" %016lx %016lx %016lx %016lx\n", gpregs[4], gpregs[5], gpregs[6], gpregs[7]); + boot_emerg(" %016lx %016lx %016lx %016lx\n", gpregs[8], gpregs[9], gpregs[10], gpregs[11]); + boot_emerg(" %016lx %016lx %016lx %016lx\n", gpregs[12], gpregs[13], gpregs[14], gpregs[15]); print_stacktrace(get_lowcore()->gpregs_save_area[15]); - boot_printk("Last Breaking-Event-Address:\n"); - boot_printk(" [<%016lx>] %pS\n", (unsigned long)get_lowcore()->pgm_last_break, - (void *)get_lowcore()->pgm_last_break); + boot_emerg("Last Breaking-Event-Address:\n"); + boot_emerg(" [<%016lx>] %pS\n", (unsigned long)get_lowcore()->pgm_last_break, + (void *)get_lowcore()->pgm_last_break); } diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c index 7617aa2d2f7e..aa096ef68e8c 100644 --- a/arch/s390/boot/physmem_info.c +++ b/arch/s390/boot/physmem_info.c @@ -1,4 +1,5 @@ // 
SPDX-License-Identifier: GPL-2.0 +#define boot_fmt(fmt) "physmem: " fmt #include <linux/processor.h> #include <linux/errno.h> #include <linux/init.h> @@ -28,7 +29,7 @@ static struct physmem_range *__get_physmem_range_ptr(u32 n) return &physmem_info.online[n]; if (unlikely(!physmem_info.online_extended)) { physmem_info.online_extended = (struct physmem_range *)physmem_alloc_range( - RR_MEM_DETECT_EXTENDED, ENTRIES_EXTENDED_MAX, sizeof(long), 0, + RR_MEM_DETECT_EXT, ENTRIES_EXTENDED_MAX, sizeof(long), 0, physmem_alloc_pos, true); } return &physmem_info.online_extended[n - MEM_INLINED_ENTRIES]; @@ -207,11 +208,16 @@ unsigned long detect_max_physmem_end(void) max_physmem_end = search_mem_end(); physmem_info.info_source = MEM_DETECT_BIN_SEARCH; } + boot_debug("Max physical memory: 0x%016lx (info source: %s)\n", max_physmem_end, + get_physmem_info_source()); return max_physmem_end; } void detect_physmem_online_ranges(unsigned long max_physmem_end) { + unsigned long start, end; + int i; + if (!sclp_early_read_storage_info()) { physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO; } else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) { @@ -226,12 +232,16 @@ void detect_physmem_online_ranges(unsigned long max_physmem_end) } else if (max_physmem_end) { add_physmem_online_range(0, max_physmem_end); } + boot_debug("Online memory ranges (info source: %s):\n", get_physmem_info_source()); + for_each_physmem_online_range(i, &start, &end) + boot_debug(" online [%d]: 0x%016lx-0x%016lx\n", i, start, end); } void physmem_set_usable_limit(unsigned long limit) { physmem_info.usable = limit; physmem_alloc_pos = limit; + boot_debug("Usable memory limit: 0x%016lx\n", limit); } static void die_oom(unsigned long size, unsigned long align, unsigned long min, unsigned long max) @@ -241,38 +251,47 @@ static void die_oom(unsigned long size, unsigned long align, unsigned long min, enum reserved_range_type t; int i; - boot_printk("Linux version %s\n", kernel_version); + boot_emerg("Linux version %s\n", kernel_version); if (!is_prot_virt_guest() && early_command_line[0]) - boot_printk("Kernel command line: %s\n", early_command_line); - boot_printk("Out of memory allocating %lx bytes %lx aligned in range %lx:%lx\n", - size, align, min, max); - boot_printk("Reserved memory ranges:\n"); + boot_emerg("Kernel command line: %s\n", early_command_line); + boot_emerg("Out of memory allocating %lu bytes 0x%lx aligned in range %lx:%lx\n", + size, align, min, max); + boot_emerg("Reserved memory ranges:\n"); for_each_physmem_reserved_range(t, range, &start, &end) { - boot_printk("%016lx %016lx %s\n", start, end, get_rr_type_name(t)); + boot_emerg("%016lx %016lx %s\n", start, end, get_rr_type_name(t)); total_reserved_mem += end - start; } - boot_printk("Usable online memory ranges (info source: %s [%x]):\n", - get_physmem_info_source(), physmem_info.info_source); + boot_emerg("Usable online memory ranges (info source: %s [%d]):\n", + get_physmem_info_source(), physmem_info.info_source); for_each_physmem_usable_range(i, &start, &end) { - boot_printk("%016lx %016lx\n", start, end); + boot_emerg("%016lx %016lx\n", start, end); total_mem += end - start; } - boot_printk("Usable online memory total: %lx Reserved: %lx Free: %lx\n", - total_mem, total_reserved_mem, - total_mem > total_reserved_mem ? total_mem - total_reserved_mem : 0); + boot_emerg("Usable online memory total: %lu Reserved: %lu Free: %lu\n", + total_mem, total_reserved_mem, + total_mem > total_reserved_mem ? 
total_mem - total_reserved_mem : 0); print_stacktrace(current_frame_address()); - boot_printk("\n\n -- System halted\n"); + boot_emerg(" -- System halted\n"); disabled_wait(); } -void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) +static void _physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) { physmem_info.reserved[type].start = addr; physmem_info.reserved[type].end = addr + size; } +void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) +{ + _physmem_reserve(type, addr, size); + boot_debug("%-14s 0x%016lx-0x%016lx %s\n", "Reserve:", addr, addr + size, + get_rr_type_name(type)); +} + void physmem_free(enum reserved_range_type type) { + boot_debug("%-14s 0x%016lx-0x%016lx %s\n", "Free:", physmem_info.reserved[type].start, + physmem_info.reserved[type].end, get_rr_type_name(type)); physmem_info.reserved[type].start = 0; physmem_info.reserved[type].end = 0; } @@ -339,41 +358,73 @@ unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long s max = min(max, physmem_alloc_pos); addr = __physmem_alloc_range(size, align, min, max, 0, NULL, die_on_oom); if (addr) - physmem_reserve(type, addr, size); + _physmem_reserve(type, addr, size); + boot_debug("%-14s 0x%016lx-0x%016lx %s\n", "Alloc range:", addr, addr + size, + get_rr_type_name(type)); return addr; } -unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size, - unsigned long align) +unsigned long physmem_alloc(enum reserved_range_type type, unsigned long size, + unsigned long align, bool die_on_oom) { struct reserved_range *range = &physmem_info.reserved[type]; - struct reserved_range *new_range; + struct reserved_range *new_range = NULL; unsigned int ranges_left; unsigned long addr; addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, physmem_alloc_ranges, - &ranges_left, true); + &ranges_left, die_on_oom); + if (!addr) + return 0; /* if not a consecutive allocation of the same type or first allocation */ if (range->start != addr + size) { if (range->end) { - physmem_alloc_pos = __physmem_alloc_range( - sizeof(struct reserved_range), 0, 0, physmem_alloc_pos, - physmem_alloc_ranges, &ranges_left, true); - new_range = (struct reserved_range *)physmem_alloc_pos; + addr = __physmem_alloc_range(sizeof(struct reserved_range), 0, 0, + physmem_alloc_pos, physmem_alloc_ranges, + &ranges_left, true); + new_range = (struct reserved_range *)addr; + addr = __physmem_alloc_range(size, align, 0, addr, ranges_left, + &ranges_left, die_on_oom); + if (!addr) + return 0; *new_range = *range; range->chain = new_range; - addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, - ranges_left, &ranges_left, true); } range->end = addr + size; } + if (type != RR_VMEM) { + boot_debug("%-14s 0x%016lx-0x%016lx %-20s align 0x%lx split %d\n", "Alloc topdown:", + addr, addr + size, get_rr_type_name(type), align, !!new_range); + } range->start = addr; physmem_alloc_pos = addr; physmem_alloc_ranges = ranges_left; return addr; } +unsigned long physmem_alloc_or_die(enum reserved_range_type type, unsigned long size, + unsigned long align) +{ + return physmem_alloc(type, size, align, true); +} + unsigned long get_physmem_alloc_pos(void) { return physmem_alloc_pos; } + +void dump_physmem_reserved(void) +{ + struct reserved_range *range; + enum reserved_range_type t; + unsigned long start, end; + + boot_debug("Reserved memory ranges:\n"); + for_each_physmem_reserved_range(t, range, &start, &end) { + 
if (end) { + boot_debug("%-14s 0x%016lx-0x%016lx @%012lx chain %012lx\n", + get_rr_type_name(t), start, end, (unsigned long)range, + (unsigned long)range->chain); + } + } +} diff --git a/arch/s390/boot/printk.c b/arch/s390/boot/printk.c index 35f18f2b936e..b4c66fa667d5 100644 --- a/arch/s390/boot/printk.c +++ b/arch/s390/boot/printk.c @@ -5,21 +5,111 @@ #include <linux/ctype.h> #include <asm/stacktrace.h> #include <asm/boot_data.h> +#include <asm/sections.h> #include <asm/lowcore.h> #include <asm/setup.h> #include <asm/sclp.h> #include <asm/uv.h> #include "boot.h" +int boot_console_loglevel = CONFIG_CONSOLE_LOGLEVEL_DEFAULT; +bool boot_ignore_loglevel; +char __bootdata(boot_rb)[PAGE_SIZE * 2]; +bool __bootdata(boot_earlyprintk); +size_t __bootdata(boot_rb_off); +char __bootdata(bootdebug_filter)[128]; +bool __bootdata(bootdebug); + +static void boot_rb_add(const char *str, size_t len) +{ + /* leave double '\0' in the end */ + size_t avail = sizeof(boot_rb) - boot_rb_off - 1; + + /* store strings separated by '\0' */ + if (len + 1 > avail) + boot_rb_off = 0; + strcpy(boot_rb + boot_rb_off, str); + boot_rb_off += len + 1; +} + +static void print_rb_entry(const char *str) +{ + sclp_early_printk(printk_skip_level(str)); +} + +static bool debug_messages_printed(void) +{ + return boot_earlyprintk && (boot_ignore_loglevel || boot_console_loglevel > LOGLEVEL_DEBUG); +} + +void boot_rb_dump(void) +{ + if (debug_messages_printed()) + return; + sclp_early_printk("Boot messages ring buffer:\n"); + boot_rb_foreach(print_rb_entry); +} + const char hex_asc[] = "0123456789abcdef"; static char *as_hex(char *dst, unsigned long val, int pad) { - char *p, *end = p = dst + max(pad, (int)__fls(val | 1) / 4 + 1); + char *p = dst + max(pad, (int)__fls(val | 1) / 4 + 1); - for (*p-- = 0; p >= dst; val >>= 4) + for (*p-- = '\0'; p >= dst; val >>= 4) *p-- = hex_asc[val & 0x0f]; - return end; + return dst; +} + +#define MAX_NUMLEN 21 +static char *as_dec(char *buf, unsigned long val, bool is_signed) +{ + bool negative = false; + char *p = buf + MAX_NUMLEN; + + if (is_signed && (long)val < 0) { + val = (val == LONG_MIN ? LONG_MIN : -(long)val); + negative = true; + } + + *--p = '\0'; + do { + *--p = '0' + (val % 10); + val /= 10; + } while (val); + + if (negative) + *--p = '-'; + return p; +} + +static ssize_t strpad(char *dst, size_t dst_size, const char *src, + int _pad, bool zero_pad, bool decimal) +{ + ssize_t len = strlen(src), pad = _pad; + char *p = dst; + + if (max(len, abs(pad)) >= dst_size) + return -E2BIG; + + if (pad > len) { + if (decimal && zero_pad && *src == '-') { + *p++ = '-'; + src++; + len--; + pad--; + } + memset(p, zero_pad ? 
'0' : ' ', pad - len); + p += pad - len; + } + memcpy(p, src, len); + p += len; + if (pad < 0 && -pad > len) { + memset(p, ' ', -pad - len); + p += -pad - len; + } + *p = '\0'; + return p - dst; } static char *symstart(char *p) @@ -58,35 +148,94 @@ static noinline char *findsym(unsigned long ip, unsigned short *off, unsigned sh return NULL; } -static noinline char *strsym(void *ip) +#define MAX_SYMLEN 64 +static noinline char *strsym(char *buf, void *ip) { - static char buf[64]; unsigned short off; unsigned short len; char *p; p = findsym((unsigned long)ip, &off, &len); if (p) { - strncpy(buf, p, sizeof(buf)); + strncpy(buf, p, MAX_SYMLEN); /* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */ - p = buf + strnlen(buf, sizeof(buf) - 15); + p = buf + strnlen(buf, MAX_SYMLEN - 15); strcpy(p, "+0x"); - p = as_hex(p + 3, off, 0); - strcpy(p, "/0x"); - as_hex(p + 3, len, 0); + as_hex(p + 3, off, 0); + strcat(p, "/0x"); + as_hex(p + strlen(p), len, 0); } else { as_hex(buf, (unsigned long)ip, 16); } return buf; } -void boot_printk(const char *fmt, ...) +static inline int printk_loglevel(const char *buf) +{ + if (buf[0] == KERN_SOH_ASCII && buf[1]) { + switch (buf[1]) { + case '0' ... '7': + return buf[1] - '0'; + } + } + return MESSAGE_LOGLEVEL_DEFAULT; +} + +static void boot_console_earlyprintk(const char *buf) +{ + int level = printk_loglevel(buf); + + /* always print emergency messages */ + if (level > LOGLEVEL_EMERG && !boot_earlyprintk) + return; + buf = printk_skip_level(buf); + /* print debug messages only when bootdebug is enabled */ + if (level == LOGLEVEL_DEBUG && (!bootdebug || !bootdebug_filter_match(skip_timestamp(buf)))) + return; + if (boot_ignore_loglevel || level < boot_console_loglevel) + sclp_early_printk(buf); +} + +static char *add_timestamp(char *buf) +{ +#ifdef CONFIG_PRINTK_TIME + union tod_clock *boot_clock = (union tod_clock *)&get_lowcore()->boot_clock; + unsigned long ns = tod_to_ns(get_tod_clock() - boot_clock->tod); + char ts[MAX_NUMLEN]; + + *buf++ = '['; + buf += strpad(buf, MAX_NUMLEN, as_dec(ts, ns / NSEC_PER_SEC, 0), 5, 0, 0); + *buf++ = '.'; + buf += strpad(buf, MAX_NUMLEN, as_dec(ts, (ns % NSEC_PER_SEC) / NSEC_PER_USEC, 0), 6, 1, 0); + *buf++ = ']'; + *buf++ = ' '; +#endif + return buf; +} + +#define va_arg_len_type(args, lenmod, typemod) \ + ((lenmod == 'l') ? va_arg(args, typemod long) : \ + (lenmod == 'h') ? (typemod short)va_arg(args, typemod int) : \ + (lenmod == 'H') ? (typemod char)va_arg(args, typemod int) : \ + (lenmod == 'z') ? va_arg(args, typemod long) : \ + va_arg(args, typemod int)) + +int boot_printk(const char *fmt, ...) { char buf[1024] = { 0 }; char *end = buf + sizeof(buf) - 1; /* make sure buf is 0 terminated */ - unsigned long pad; - char *p = buf; + bool zero_pad, decimal; + char *strval, *p = buf; + char valbuf[MAX(MAX_SYMLEN, MAX_NUMLEN)]; va_list args; + char lenmod; + ssize_t len; + int pad; + + *p++ = KERN_SOH_ASCII; + *p++ = printk_get_level(fmt) ?: '0' + MESSAGE_LOGLEVEL_DEFAULT; + p = add_timestamp(p); + fmt = printk_skip_level(fmt); va_start(args, fmt); for (; p < end && *fmt; fmt++) { @@ -94,31 +243,56 @@ void boot_printk(const char *fmt, ...) *p++ = *fmt; continue; } - pad = isdigit(*++fmt) ? simple_strtol(fmt, (char **)&fmt, 10) : 0; + if (*++fmt == '%') { + *p++ = '%'; + continue; + } + zero_pad = (*fmt == '0'); + pad = simple_strtol(fmt, (char **)&fmt, 10); + lenmod = (*fmt == 'h' || *fmt == 'l' || *fmt == 'z') ? 
*fmt++ : 0; + if (lenmod == 'h' && *fmt == 'h') { + lenmod = 'H'; + fmt++; + } + decimal = false; switch (*fmt) { case 's': - p = buf + strlcat(buf, va_arg(args, char *), sizeof(buf)); + if (lenmod) + goto out; + strval = va_arg(args, char *); + zero_pad = false; break; case 'p': - if (*++fmt != 'S') + if (*++fmt != 'S' || lenmod) goto out; - p = buf + strlcat(buf, strsym(va_arg(args, void *)), sizeof(buf)); + strval = strsym(valbuf, va_arg(args, void *)); + zero_pad = false; break; - case 'l': - if (*++fmt != 'x' || end - p <= max(sizeof(long) * 2, pad)) - goto out; - p = as_hex(p, va_arg(args, unsigned long), pad); + case 'd': + case 'i': + strval = as_dec(valbuf, va_arg_len_type(args, lenmod, signed), 1); + decimal = true; + break; + case 'u': + strval = as_dec(valbuf, va_arg_len_type(args, lenmod, unsigned), 0); break; case 'x': - if (end - p <= max(sizeof(int) * 2, pad)) - goto out; - p = as_hex(p, va_arg(args, unsigned int), pad); + strval = as_hex(valbuf, va_arg_len_type(args, lenmod, unsigned), 0); break; default: goto out; } + len = strpad(p, end - p, strval, pad, zero_pad, decimal); + if (len == -E2BIG) + break; + p += len; } out: va_end(args); - sclp_early_printk(buf); + len = strlen(buf); + if (len) { + boot_rb_add(buf, len); + boot_console_earlyprintk(buf); + } + return len; } diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index e6b06692ddc8..885bd1dd2c82 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define boot_fmt(fmt) "startup: " fmt #include <linux/string.h> #include <linux/elf.h> #include <asm/page-states.h> @@ -42,7 +43,8 @@ struct machine_info machine; void error(char *x) { - boot_printk("\n\n%s\n\n -- System halted", x); + boot_emerg("%s\n", x); + boot_emerg(" -- System halted\n"); disabled_wait(); } @@ -143,7 +145,7 @@ static void rescue_initrd(unsigned long min, unsigned long max) return; old_addr = addr; physmem_free(RR_INITRD); - addr = physmem_alloc_top_down(RR_INITRD, size, 0); + addr = physmem_alloc_or_die(RR_INITRD, size, 0); memmove((void *)addr, (void *)old_addr, size); } @@ -222,12 +224,16 @@ static void setup_ident_map_size(unsigned long max_physmem_end) if (oldmem_data.start) { __kaslr_enabled = 0; ident_map_size = min(ident_map_size, oldmem_data.size); + boot_debug("kdump memory limit: 0x%016lx\n", oldmem_data.size); } else if (ipl_block_valid && is_ipl_block_dump()) { __kaslr_enabled = 0; - if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size) + if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size) { ident_map_size = min(ident_map_size, hsa_size); + boot_debug("Stand-alone dump limit: 0x%016lx\n", hsa_size); + } } #endif + boot_debug("Identity map size: 0x%016lx\n", ident_map_size); } #define FIXMAP_SIZE round_up(MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore)) @@ -267,6 +273,7 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE)); BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE); vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE); + boot_debug("vmem size estimated: 0x%016lx\n", vsize); if (IS_ENABLED(CONFIG_KASAN) || __NO_KASLR_END_KERNEL > _REGION2_SIZE || (vsize > _REGION2_SIZE && kaslr_enabled())) { asce_limit = _REGION1_SIZE; @@ -290,8 +297,10 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) * otherwise asce_limit and rte_size would have been adjusted. 
*/ vmax = adjust_to_uv_max(asce_limit); + boot_debug("%d level paging 0x%016lx vmax\n", vmax == _REGION1_SIZE ? 4 : 3, vmax); #ifdef CONFIG_KASAN BUILD_BUG_ON(__NO_KASLR_END_KERNEL > KASAN_SHADOW_START); + boot_debug("KASAN shadow area: 0x%016lx-0x%016lx\n", KASAN_SHADOW_START, KASAN_SHADOW_END); /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); #endif @@ -305,19 +314,27 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) pos = 0; kernel_end = vmax - pos * THREAD_SIZE; kernel_start = round_down(kernel_end - kernel_size, THREAD_SIZE); + boot_debug("Randomization range: 0x%016lx-0x%016lx\n", vmax - kaslr_len, vmax); + boot_debug("kernel image: 0x%016lx-0x%016lx (kaslr)\n", kernel_start, + kernel_size + kernel_size); } else if (vmax < __NO_KASLR_END_KERNEL || vsize > __NO_KASLR_END_KERNEL) { kernel_start = round_down(vmax - kernel_size, THREAD_SIZE); - boot_printk("The kernel base address is forced to %lx\n", kernel_start); + boot_debug("kernel image: 0x%016lx-0x%016lx (constrained)\n", kernel_start, + kernel_start + kernel_size); } else { kernel_start = __NO_KASLR_START_KERNEL; + boot_debug("kernel image: 0x%016lx-0x%016lx (nokaslr)\n", kernel_start, + kernel_start + kernel_size); } __kaslr_offset = kernel_start; + boot_debug("__kaslr_offset: 0x%016lx\n", __kaslr_offset); MODULES_END = round_down(kernel_start, _SEGMENT_SIZE); MODULES_VADDR = MODULES_END - MODULES_LEN; VMALLOC_END = MODULES_VADDR; if (IS_ENABLED(CONFIG_KMSAN)) VMALLOC_END -= MODULES_LEN * 2; + boot_debug("modules area: 0x%016lx-0x%016lx\n", MODULES_VADDR, MODULES_END); /* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */ vsize = (VMALLOC_END - FIXMAP_SIZE) / 2; @@ -329,10 +346,15 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) VMALLOC_END -= vmalloc_size * 2; } VMALLOC_START = VMALLOC_END - vmalloc_size; + boot_debug("vmalloc area: 0x%016lx-0x%016lx\n", VMALLOC_START, VMALLOC_END); __memcpy_real_area = round_down(VMALLOC_START - MEMCPY_REAL_SIZE, PAGE_SIZE); + boot_debug("memcpy real area: 0x%016lx-0x%016lx\n", __memcpy_real_area, + __memcpy_real_area + MEMCPY_REAL_SIZE); __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore)); + boot_debug("abs lowcore: 0x%016lx-0x%016lx\n", __abs_lowcore, + __abs_lowcore + ABS_LOWCORE_MAP_SIZE); /* split remaining virtual space between 1:1 mapping & vmemmap array */ pages = __abs_lowcore / (PAGE_SIZE + sizeof(struct page)); @@ -352,8 +374,11 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS)); max_mappable = max(ident_map_size, MAX_DCSS_ADDR); max_mappable = min(max_mappable, vmemmap_start); - if (IS_ENABLED(CONFIG_RANDOMIZE_IDENTITY_BASE)) - __identity_base = round_down(vmemmap_start - max_mappable, rte_size); +#ifdef CONFIG_RANDOMIZE_IDENTITY_BASE + __identity_base = round_down(vmemmap_start - max_mappable, rte_size); +#endif + boot_debug("identity map: 0x%016lx-0x%016lx\n", __identity_base, + __identity_base + ident_map_size); return asce_limit; } @@ -412,6 +437,10 @@ void startup_kernel(void) psw_t psw; setup_lpp(); + store_ipl_parmblock(); + uv_query_info(); + setup_boot_command_line(); + parse_boot_command_line(); /* * Non-randomized kernel physical start address must be _SEGMENT_SIZE @@ -431,12 +460,8 @@ void startup_kernel(void) oldmem_data.start = parmarea.oldmem_base; oldmem_data.size = parmarea.oldmem_size; - store_ipl_parmblock(); 
read_ipl_report(); - uv_query_info(); sclp_early_read_info(); - setup_boot_command_line(); - parse_boot_command_line(); detect_facilities(); cmma_init(); sanitize_prot_virt_host(); @@ -526,6 +551,7 @@ void startup_kernel(void) __kaslr_offset, __kaslr_offset_phys); kaslr_adjust_got(__kaslr_offset); setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit); + dump_physmem_reserved(); copy_bootdata(); __apply_alternatives((struct alt_instr *)_vmlinux_info.alt_instructions, (struct alt_instr *)_vmlinux_info.alt_instructions_end, @@ -542,5 +568,6 @@ void startup_kernel(void) */ psw.addr = __kaslr_offset + vmlinux.entry; psw.mask = PSW_KERNEL_BITS; + boot_debug("Starting kernel at: 0x%016lx\n", psw.addr); __load_psw(psw); } diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 881a1ece422f..cfca94a8eac4 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define boot_fmt(fmt) "vmem: " fmt #include <linux/sched/task.h> #include <linux/pgtable.h> #include <linux/kasan.h> @@ -13,6 +14,7 @@ #include "decompressor.h" #include "boot.h" +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) struct ctlreg __bootdata_preserved(s390_invalid_asce); #ifdef CONFIG_PROC_FS @@ -31,12 +33,42 @@ enum populate_mode { POPULATE_IDENTITY, POPULATE_KERNEL, #ifdef CONFIG_KASAN + /* KASAN modes should be last and grouped together, see is_kasan_populate_mode() */ POPULATE_KASAN_MAP_SHADOW, POPULATE_KASAN_ZERO_SHADOW, POPULATE_KASAN_SHALLOW #endif }; +#define POPULATE_MODE_NAME(t) case POPULATE_ ## t: return #t +static inline const char *get_populate_mode_name(enum populate_mode t) +{ + switch (t) { + POPULATE_MODE_NAME(NONE); + POPULATE_MODE_NAME(DIRECT); + POPULATE_MODE_NAME(LOWCORE); + POPULATE_MODE_NAME(ABS_LOWCORE); + POPULATE_MODE_NAME(IDENTITY); + POPULATE_MODE_NAME(KERNEL); +#ifdef CONFIG_KASAN + POPULATE_MODE_NAME(KASAN_MAP_SHADOW); + POPULATE_MODE_NAME(KASAN_ZERO_SHADOW); + POPULATE_MODE_NAME(KASAN_SHALLOW); +#endif + default: + return "UNKNOWN"; + } +} + +static bool is_kasan_populate_mode(enum populate_mode mode) +{ +#ifdef CONFIG_KASAN + return mode >= POPULATE_KASAN_MAP_SHADOW; +#else + return false; +#endif +} + static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode); #ifdef CONFIG_KASAN @@ -52,9 +84,12 @@ static pte_t pte_z; static inline void kasan_populate(unsigned long start, unsigned long end, enum populate_mode mode) { - start = PAGE_ALIGN_DOWN(__sha(start)); - end = PAGE_ALIGN(__sha(end)); - pgtable_populate(start, end, mode); + unsigned long sha_start = PAGE_ALIGN_DOWN(__sha(start)); + unsigned long sha_end = PAGE_ALIGN(__sha(end)); + + boot_debug("%-17s 0x%016lx-0x%016lx >> 0x%016lx-0x%016lx\n", get_populate_mode_name(mode), + start, end, sha_start, sha_end); + pgtable_populate(sha_start, sha_end, mode); } static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end) @@ -200,7 +235,7 @@ static void *boot_crst_alloc(unsigned long val) unsigned long size = PAGE_SIZE << CRST_ALLOC_ORDER; unsigned long *table; - table = (unsigned long *)physmem_alloc_top_down(RR_VMEM, size, size); + table = (unsigned long *)physmem_alloc_or_die(RR_VMEM, size, size); crst_table_init(table, val); __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; @@ -216,7 +251,7 @@ static pte_t *boot_pte_alloc(void) * during POPULATE_KASAN_MAP_SHADOW when EDAT is off */ if (!pte_leftover) { - pte_leftover = (void *)physmem_alloc_top_down(RR_VMEM, PAGE_SIZE, PAGE_SIZE); + pte_leftover 
= (void *)physmem_alloc_or_die(RR_VMEM, PAGE_SIZE, PAGE_SIZE); pte = pte_leftover + _PAGE_TABLE_SIZE; __arch_set_page_dat(pte, 1); } else { @@ -228,11 +263,12 @@ static pte_t *boot_pte_alloc(void) return pte; } -static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_mode mode) +static unsigned long resolve_pa_may_alloc(unsigned long addr, unsigned long size, + enum populate_mode mode) { switch (mode) { case POPULATE_NONE: - return -1; + return INVALID_PHYS_ADDR; case POPULATE_DIRECT: return addr; case POPULATE_LOWCORE: @@ -245,38 +281,64 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m return __identity_pa(addr); #ifdef CONFIG_KASAN case POPULATE_KASAN_MAP_SHADOW: - addr = physmem_alloc_top_down(RR_VMEM, size, size); - memset((void *)addr, 0, size); - return addr; + /* Allow to fail large page allocations, this will fall back to 1mb/4k pages */ + addr = physmem_alloc(RR_VMEM, size, size, size == PAGE_SIZE); + if (addr) { + memset((void *)addr, 0, size); + return addr; + } + return INVALID_PHYS_ADDR; #endif default: - return -1; + return INVALID_PHYS_ADDR; } } -static bool large_allowed(enum populate_mode mode) +static bool large_page_mapping_allowed(enum populate_mode mode) { - return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY) || (mode == POPULATE_KERNEL); + switch (mode) { + case POPULATE_DIRECT: + case POPULATE_IDENTITY: + case POPULATE_KERNEL: +#ifdef CONFIG_KASAN + case POPULATE_KASAN_MAP_SHADOW: +#endif + return true; + default: + return false; + } } -static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end, - enum populate_mode mode) +static unsigned long try_get_large_pud_pa(pud_t *pu_dir, unsigned long addr, unsigned long end, + enum populate_mode mode) { - unsigned long size = end - addr; + unsigned long pa, size = end - addr; + + if (!machine.has_edat2 || !large_page_mapping_allowed(mode) || + !IS_ALIGNED(addr, PUD_SIZE) || (size < PUD_SIZE)) + return INVALID_PHYS_ADDR; + + pa = resolve_pa_may_alloc(addr, size, mode); + if (!IS_ALIGNED(pa, PUD_SIZE)) + return INVALID_PHYS_ADDR; - return machine.has_edat2 && large_allowed(mode) && - IS_ALIGNED(addr, PUD_SIZE) && (size >= PUD_SIZE) && - IS_ALIGNED(_pa(addr, size, mode), PUD_SIZE); + return pa; } -static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end, - enum populate_mode mode) +static unsigned long try_get_large_pmd_pa(pmd_t *pm_dir, unsigned long addr, unsigned long end, + enum populate_mode mode) { - unsigned long size = end - addr; + unsigned long pa, size = end - addr; - return machine.has_edat1 && large_allowed(mode) && - IS_ALIGNED(addr, PMD_SIZE) && (size >= PMD_SIZE) && - IS_ALIGNED(_pa(addr, size, mode), PMD_SIZE); + if (!machine.has_edat1 || !large_page_mapping_allowed(mode) || + !IS_ALIGNED(addr, PMD_SIZE) || (size < PMD_SIZE)) + return INVALID_PHYS_ADDR; + + pa = resolve_pa_may_alloc(addr, size, mode); + if (!IS_ALIGNED(pa, PMD_SIZE)) + return INVALID_PHYS_ADDR; + + return pa; } static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end, @@ -290,7 +352,7 @@ static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long e if (pte_none(*pte)) { if (kasan_pte_populate_zero_shadow(pte, mode)) continue; - entry = __pte(_pa(addr, PAGE_SIZE, mode)); + entry = __pte(resolve_pa_may_alloc(addr, PAGE_SIZE, mode)); entry = set_pte_bit(entry, PAGE_KERNEL); set_pte(pte, entry); pages++; @@ -303,7 +365,7 @@ static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, 
unsigned long e static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end, enum populate_mode mode) { - unsigned long next, pages = 0; + unsigned long pa, next, pages = 0; pmd_t *pmd, entry; pte_t *pte; @@ -313,8 +375,9 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e if (pmd_none(*pmd)) { if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode)) continue; - if (can_large_pmd(pmd, addr, next, mode)) { - entry = __pmd(_pa(addr, _SEGMENT_SIZE, mode)); + pa = try_get_large_pmd_pa(pmd, addr, next, mode); + if (pa != INVALID_PHYS_ADDR) { + entry = __pmd(pa); entry = set_pmd_bit(entry, SEGMENT_KERNEL); set_pmd(pmd, entry); pages++; @@ -334,7 +397,7 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end, enum populate_mode mode) { - unsigned long next, pages = 0; + unsigned long pa, next, pages = 0; pud_t *pud, entry; pmd_t *pmd; @@ -344,8 +407,9 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e if (pud_none(*pud)) { if (kasan_pud_populate_zero_shadow(pud, addr, next, mode)) continue; - if (can_large_pud(pud, addr, next, mode)) { - entry = __pud(_pa(addr, _REGION3_SIZE, mode)); + pa = try_get_large_pud_pa(pud, addr, next, mode); + if (pa != INVALID_PHYS_ADDR) { + entry = __pud(pa); entry = set_pud_bit(entry, REGION3_KERNEL); set_pud(pud, entry); pages++; @@ -388,6 +452,13 @@ static void pgtable_populate(unsigned long addr, unsigned long end, enum populat pgd_t *pgd; p4d_t *p4d; + if (!is_kasan_populate_mode(mode)) { + boot_debug("%-17s 0x%016lx-0x%016lx -> 0x%016lx-0x%016lx\n", + get_populate_mode_name(mode), addr, end, + resolve_pa_may_alloc(addr, 0, mode), + resolve_pa_may_alloc(end - 1, 0, mode) + 1); + } + pgd = pgd_offset(&init_mm, addr); for (; addr < end; addr = next, pgd++) { next = pgd_addr_end(addr, end); diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h index 4a6b0a8b6412..2e829c16fd8a 100644 --- a/arch/s390/include/asm/asm-extable.h +++ b/arch/s390/include/asm/asm-extable.h @@ -9,11 +9,11 @@ #define EX_TYPE_NONE 0 #define EX_TYPE_FIXUP 1 #define EX_TYPE_BPF 2 -#define EX_TYPE_UA_STORE 3 -#define EX_TYPE_UA_LOAD_MEM 4 +#define EX_TYPE_UA_FAULT 3 #define EX_TYPE_UA_LOAD_REG 5 #define EX_TYPE_UA_LOAD_REGPAIR 6 #define EX_TYPE_ZEROPAD 7 +#define EX_TYPE_FPC 8 #define EX_DATA_REG_ERR_SHIFT 0 #define EX_DATA_REG_ERR GENMASK(3, 0) @@ -69,11 +69,8 @@ #define EX_TABLE_AMODE31(_fault, _target) \ __EX_TABLE(.amode31.ex_table, _fault, _target, EX_TYPE_FIXUP, __stringify(%%r0), __stringify(%%r0), 0) -#define EX_TABLE_UA_STORE(_fault, _target, _regerr) \ - __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_STORE, _regerr, _regerr, 0) - -#define EX_TABLE_UA_LOAD_MEM(_fault, _target, _regerr, _regmem, _len) \ - __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_MEM, _regerr, _regmem, _len) +#define EX_TABLE_UA_FAULT(_fault, _target, _regerr) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_FAULT, _regerr, _regerr, 0) #define EX_TABLE_UA_LOAD_REG(_fault, _target, _regerr, _regzero) \ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REG, _regerr, _regzero, 0) @@ -84,4 +81,7 @@ #define EX_TABLE_ZEROPAD(_fault, _target, _regdata, _regaddr) \ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_ZEROPAD, _regdata, _regaddr, 0) +#define EX_TABLE_FPC(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FPC, __stringify(%%r0), 
__stringify(%%r0), 0) + #endif /* __ASM_EXTABLE_H */ diff --git a/arch/s390/include/asm/asm.h b/arch/s390/include/asm/asm.h index ec011b94af2a..e9062b01e2a2 100644 --- a/arch/s390/include/asm/asm.h +++ b/arch/s390/include/asm/asm.h @@ -28,7 +28,7 @@ * [var] also contains the program mask. CC_TRANSFORM() moves the condition * code to the two least significant bits and sets all other bits to zero. */ -#if defined(__GCC_ASM_FLAG_OUTPUTS__) && !(IS_ENABLED(CONFIG_GCC_ASM_FLAG_OUTPUT_BROKEN)) +#if defined(__GCC_ASM_FLAG_OUTPUTS__) && !(IS_ENABLED(CONFIG_CC_ASM_FLAG_OUTPUT_BROKEN)) #define __HAVE_ASM_FLAG_OUTPUTS__ diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 15aa64e3020e..d5125296ade2 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -60,7 +60,7 @@ static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsig asm volatile( " tm %[addr],%[mask]\n" : "=@cc" (cc) - : [addr] "R" (*addr), [mask] "I" (mask) + : [addr] "Q" (*addr), [mask] "I" (mask) ); return cc == 3; } diff --git a/arch/s390/include/asm/boot_data.h b/arch/s390/include/asm/boot_data.h index f7eed27b3220..f55f8227058e 100644 --- a/arch/s390/include/asm/boot_data.h +++ b/arch/s390/include/asm/boot_data.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_S390_BOOT_DATA_H +#include <linux/string.h> #include <asm/setup.h> #include <asm/ipl.h> @@ -15,4 +16,54 @@ extern unsigned long ipl_cert_list_size; extern unsigned long early_ipl_comp_list_addr; extern unsigned long early_ipl_comp_list_size; +extern char boot_rb[PAGE_SIZE * 2]; +extern bool boot_earlyprintk; +extern size_t boot_rb_off; +extern char bootdebug_filter[128]; +extern bool bootdebug; + +#define boot_rb_foreach(cb) \ + do { \ + size_t off = boot_rb_off + strlen(boot_rb + boot_rb_off) + 1; \ + size_t len; \ + for (; off < sizeof(boot_rb) && (len = strlen(boot_rb + off)); off += len + 1) \ + cb(boot_rb + off); \ + for (off = 0; off < boot_rb_off && (len = strlen(boot_rb + off)); off += len + 1) \ + cb(boot_rb + off); \ + } while (0) + +/* + * bootdebug_filter is a comma separated list of strings, + * where each string can be a prefix of the message. 
+ */ +static inline bool bootdebug_filter_match(const char *buf) +{ + char *p = bootdebug_filter, *s; + char *end; + + if (!*p) + return true; + + end = p + strlen(p); + while (p < end) { + p = skip_spaces(p); + s = memscan(p, ',', end - p); + if (!strncmp(p, buf, s - p)) + return true; + p = s + 1; + } + return false; +} + +static inline const char *skip_timestamp(const char *buf) +{ +#ifdef CONFIG_PRINTK_TIME + const char *p = memchr(buf, ']', strlen(buf)); + + if (p && p[1] == ' ') + return p + 2; +#endif + return buf; +} + #endif /* _ASM_S390_BOOT_DATA_H */ diff --git a/arch/s390/include/asm/fpu-insn.h b/arch/s390/include/asm/fpu-insn.h index de510c9f6efa..f668bffd6dd3 100644 --- a/arch/s390/include/asm/fpu-insn.h +++ b/arch/s390/include/asm/fpu-insn.h @@ -100,19 +100,12 @@ static __always_inline void fpu_lfpc(unsigned int *fpc) */ static inline void fpu_lfpc_safe(unsigned int *fpc) { - u32 tmp; - instrument_read(fpc, sizeof(*fpc)); asm_inline volatile( - "0: lfpc %[fpc]\n" - "1: nopr %%r7\n" - ".pushsection .fixup, \"ax\"\n" - "2: lghi %[tmp],0\n" - " sfpc %[tmp]\n" - " jg 1b\n" - ".popsection\n" - EX_TABLE(1b, 2b) - : [tmp] "=d" (tmp) + " lfpc %[fpc]\n" + "0: nopr %%r7\n" + EX_TABLE_FPC(0b, 0b) + : : [fpc] "Q" (*fpc) : "memory"); } @@ -183,7 +176,19 @@ static __always_inline void fpu_vgfmg(u8 v1, u8 v2, u8 v3) : "memory"); } -#ifdef CONFIG_CC_IS_CLANG +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS + +static __always_inline void fpu_vl(u8 v1, const void *vxr) +{ + instrument_read(vxr, sizeof(__vector128)); + asm volatile("VL %[v1],%O[vxr],,%R[vxr]\n" + : + : [vxr] "Q" (*(__vector128 *)vxr), + [v1] "I" (v1) + : "memory"); +} + +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ static __always_inline void fpu_vl(u8 v1, const void *vxr) { @@ -197,19 +202,7 @@ static __always_inline void fpu_vl(u8 v1, const void *vxr) : "memory", "1"); } -#else /* CONFIG_CC_IS_CLANG */ - -static __always_inline void fpu_vl(u8 v1, const void *vxr) -{ - instrument_read(vxr, sizeof(__vector128)); - asm volatile("VL %[v1],%O[vxr],,%R[vxr]\n" - : - : [vxr] "Q" (*(__vector128 *)vxr), - [v1] "I" (v1) - : "memory"); -} - -#endif /* CONFIG_CC_IS_CLANG */ +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ static __always_inline void fpu_vleib(u8 v, s16 val, u8 index) { @@ -238,7 +231,7 @@ static __always_inline u64 fpu_vlgvf(u8 v, u16 index) return val; } -#ifdef CONFIG_CC_IS_CLANG +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) { @@ -246,17 +239,15 @@ static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) size = min(index + 1, sizeof(__vector128)); instrument_read(vxr, size); - asm volatile( - " la 1,%[vxr]\n" - " VLL %[v1],%[index],0,1\n" - : - : [vxr] "R" (*(u8 *)vxr), - [index] "d" (index), - [v1] "I" (v1) - : "memory", "1"); + asm volatile("VLL %[v1],%[index],%O[vxr],%R[vxr]\n" + : + : [vxr] "Q" (*(u8 *)vxr), + [index] "d" (index), + [v1] "I" (v1) + : "memory"); } -#else /* CONFIG_CC_IS_CLANG */ +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) { @@ -264,17 +255,19 @@ static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) size = min(index + 1, sizeof(__vector128)); instrument_read(vxr, size); - asm volatile("VLL %[v1],%[index],%O[vxr],%R[vxr]\n" - : - : [vxr] "Q" (*(u8 *)vxr), - [index] "d" (index), - [v1] "I" (v1) - : "memory"); + asm volatile( + " la 1,%[vxr]\n" + " VLL %[v1],%[index],0,1\n" + : + : [vxr] "R" (*(u8 *)vxr), + [index] "d" (index), + [v1] 
"I" (v1) + : "memory", "1"); } -#endif /* CONFIG_CC_IS_CLANG */ +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ -#ifdef CONFIG_CC_IS_CLANG +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS #define fpu_vlm(_v1, _v3, _vxrs) \ ({ \ @@ -284,17 +277,15 @@ static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) } *_v = (void *)(_vxrs); \ \ instrument_read(_v, size); \ - asm volatile( \ - " la 1,%[vxrs]\n" \ - " VLM %[v1],%[v3],0,1\n" \ - : \ - : [vxrs] "R" (*_v), \ - [v1] "I" (_v1), [v3] "I" (_v3) \ - : "memory", "1"); \ + asm volatile("VLM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \ + : \ + : [vxrs] "Q" (*_v), \ + [v1] "I" (_v1), [v3] "I" (_v3) \ + : "memory"); \ (_v3) - (_v1) + 1; \ }) -#else /* CONFIG_CC_IS_CLANG */ +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ #define fpu_vlm(_v1, _v3, _vxrs) \ ({ \ @@ -304,15 +295,17 @@ static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) } *_v = (void *)(_vxrs); \ \ instrument_read(_v, size); \ - asm volatile("VLM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \ - : \ - : [vxrs] "Q" (*_v), \ - [v1] "I" (_v1), [v3] "I" (_v3) \ - : "memory"); \ + asm volatile( \ + " la 1,%[vxrs]\n" \ + " VLM %[v1],%[v3],0,1\n" \ + : \ + : [vxrs] "R" (*_v), \ + [v1] "I" (_v1), [v3] "I" (_v3) \ + : "memory", "1"); \ (_v3) - (_v1) + 1; \ }) -#endif /* CONFIG_CC_IS_CLANG */ +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ static __always_inline void fpu_vlr(u8 v1, u8 v2) { @@ -362,7 +355,18 @@ static __always_inline void fpu_vsrlb(u8 v1, u8 v2, u8 v3) : "memory"); } -#ifdef CONFIG_CC_IS_CLANG +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS + +static __always_inline void fpu_vst(u8 v1, const void *vxr) +{ + instrument_write(vxr, sizeof(__vector128)); + asm volatile("VST %[v1],%O[vxr],,%R[vxr]\n" + : [vxr] "=Q" (*(__vector128 *)vxr) + : [v1] "I" (v1) + : "memory"); +} + +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ static __always_inline void fpu_vst(u8 v1, const void *vxr) { @@ -375,20 +379,23 @@ static __always_inline void fpu_vst(u8 v1, const void *vxr) : "memory", "1"); } -#else /* CONFIG_CC_IS_CLANG */ +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ -static __always_inline void fpu_vst(u8 v1, const void *vxr) +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS + +static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) { - instrument_write(vxr, sizeof(__vector128)); - asm volatile("VST %[v1],%O[vxr],,%R[vxr]\n" - : [vxr] "=Q" (*(__vector128 *)vxr) - : [v1] "I" (v1) + unsigned int size; + + size = min(index + 1, sizeof(__vector128)); + instrument_write(vxr, size); + asm volatile("VSTL %[v1],%[index],%O[vxr],%R[vxr]\n" + : [vxr] "=Q" (*(u8 *)vxr) + : [index] "d" (index), [v1] "I" (v1) : "memory"); } -#endif /* CONFIG_CC_IS_CLANG */ - -#ifdef CONFIG_CC_IS_CLANG +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) { @@ -404,23 +411,9 @@ static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) : "memory", "1"); } -#else /* CONFIG_CC_IS_CLANG */ +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ -static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) -{ - unsigned int size; - - size = min(index + 1, sizeof(__vector128)); - instrument_write(vxr, size); - asm volatile("VSTL %[v1],%[index],%O[vxr],%R[vxr]\n" - : [vxr] "=Q" (*(u8 *)vxr) - : [index] "d" (index), [v1] "I" (v1) - : "memory"); -} - -#endif /* CONFIG_CC_IS_CLANG */ - -#ifdef CONFIG_CC_IS_CLANG +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS #define fpu_vstm(_v1, _v3, _vxrs) \ ({ \ @@ -430,16 +423,14 @@ static 
__always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) } *_v = (void *)(_vxrs); \ \ instrument_write(_v, size); \ - asm volatile( \ - " la 1,%[vxrs]\n" \ - " VSTM %[v1],%[v3],0,1\n" \ - : [vxrs] "=R" (*_v) \ - : [v1] "I" (_v1), [v3] "I" (_v3) \ - : "memory", "1"); \ + asm volatile("VSTM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \ + : [vxrs] "=Q" (*_v) \ + : [v1] "I" (_v1), [v3] "I" (_v3) \ + : "memory"); \ (_v3) - (_v1) + 1; \ }) -#else /* CONFIG_CC_IS_CLANG */ +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ #define fpu_vstm(_v1, _v3, _vxrs) \ ({ \ @@ -449,14 +440,16 @@ static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) } *_v = (void *)(_vxrs); \ \ instrument_write(_v, size); \ - asm volatile("VSTM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \ - : [vxrs] "=Q" (*_v) \ - : [v1] "I" (_v1), [v3] "I" (_v3) \ - : "memory"); \ + asm volatile( \ + " la 1,%[vxrs]\n" \ + " VSTM %[v1],%[v3],0,1\n" \ + : [vxrs] "=R" (*_v) \ + : [v1] "I" (_v1), [v3] "I" (_v3) \ + : "memory", "1"); \ (_v3) - (_v1) + 1; \ }) -#endif /* CONFIG_CC_IS_CLANG */ +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ static __always_inline void fpu_vupllf(u8 v1, u8 v2) { diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index a3b73a4f626e..185331e91f83 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -51,6 +51,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; } +#define ftrace_get_symaddr(fentry_ip) ((unsigned long)(fentry_ip)) #include <linux/ftrace_regs.h> diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index 752a2310f0d6..f5781794356b 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -2,80 +2,95 @@ #ifndef _ASM_S390_FUTEX_H #define _ASM_S390_FUTEX_H +#include <linux/instrumented.h> #include <linux/uaccess.h> #include <linux/futex.h> #include <asm/asm-extable.h> #include <asm/mmu_context.h> #include <asm/errno.h> -#define __futex_atomic_op(insn, ret, oldval, newval, uaddr, oparg) \ - asm volatile( \ - " sacf 256\n" \ - "0: l %1,0(%6)\n" \ - "1:"insn \ - "2: cs %1,%2,0(%6)\n" \ - "3: jl 1b\n" \ - " lhi %0,0\n" \ - "4: sacf 768\n" \ - EX_TABLE(0b,4b) EX_TABLE(1b,4b) \ - EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ - : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ - "=m" (*uaddr) \ - : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ - "m" (*uaddr) : "cc"); +#define FUTEX_OP_FUNC(name, insn) \ +static uaccess_kmsan_or_inline int \ +__futex_atomic_##name(int oparg, int *old, u32 __user *uaddr) \ +{ \ + int rc, new; \ + \ + instrument_copy_from_user_before(old, uaddr, sizeof(*old)); \ + asm_inline volatile( \ + " sacf 256\n" \ + "0: l %[old],%[uaddr]\n" \ + "1:"insn \ + "2: cs %[old],%[new],%[uaddr]\n" \ + "3: jl 1b\n" \ + " lhi %[rc],0\n" \ + "4: sacf 768\n" \ + EX_TABLE_UA_FAULT(0b, 4b, %[rc]) \ + EX_TABLE_UA_FAULT(1b, 4b, %[rc]) \ + EX_TABLE_UA_FAULT(2b, 4b, %[rc]) \ + EX_TABLE_UA_FAULT(3b, 4b, %[rc]) \ + : [rc] "=d" (rc), [old] "=&d" (*old), \ + [new] "=&d" (new), [uaddr] "+Q" (*uaddr) \ + : [oparg] "d" (oparg) \ + : "cc"); \ + if (!rc) \ + instrument_copy_from_user_after(old, uaddr, sizeof(*old), 0); \ + return rc; \ +} + +FUTEX_OP_FUNC(set, "lr %[new],%[oparg]\n") +FUTEX_OP_FUNC(add, "lr %[new],%[old]\n ar %[new],%[oparg]\n") +FUTEX_OP_FUNC(or, "lr %[new],%[old]\n or %[new],%[oparg]\n") +FUTEX_OP_FUNC(and, "lr %[new],%[old]\n nr %[new],%[oparg]\n") +FUTEX_OP_FUNC(xor, "lr %[new],%[old]\n xr %[new],%[oparg]\n") -static inline int arch_futex_atomic_op_inuser(int op, int oparg, int 
*oval, - u32 __user *uaddr) +static inline +int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int oldval = 0, newval, ret; + int old, rc; switch (op) { case FUTEX_OP_SET: - __futex_atomic_op("lr %2,%5\n", - ret, oldval, newval, uaddr, oparg); + rc = __futex_atomic_set(oparg, &old, uaddr); break; case FUTEX_OP_ADD: - __futex_atomic_op("lr %2,%1\nar %2,%5\n", - ret, oldval, newval, uaddr, oparg); + rc = __futex_atomic_add(oparg, &old, uaddr); break; case FUTEX_OP_OR: - __futex_atomic_op("lr %2,%1\nor %2,%5\n", - ret, oldval, newval, uaddr, oparg); + rc = __futex_atomic_or(oparg, &old, uaddr); break; case FUTEX_OP_ANDN: - __futex_atomic_op("lr %2,%1\nnr %2,%5\n", - ret, oldval, newval, uaddr, ~oparg); + rc = __futex_atomic_and(~oparg, &old, uaddr); break; case FUTEX_OP_XOR: - __futex_atomic_op("lr %2,%1\nxr %2,%5\n", - ret, oldval, newval, uaddr, oparg); + rc = __futex_atomic_xor(oparg, &old, uaddr); break; default: - ret = -ENOSYS; + rc = -ENOSYS; } - - if (!ret) - *oval = oldval; - - return ret; + if (!rc) + *oval = old; + return rc; } -static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, - u32 oldval, u32 newval) +static uaccess_kmsan_or_inline +int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { - int ret; + int rc; - asm volatile( - " sacf 256\n" - "0: cs %1,%4,0(%5)\n" - "1: la %0,0\n" - "2: sacf 768\n" - EX_TABLE(0b,2b) EX_TABLE(1b,2b) - : "=d" (ret), "+d" (oldval), "=m" (*uaddr) - : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) + instrument_copy_from_user_before(uval, uaddr, sizeof(*uval)); + asm_inline volatile( + " sacf 256\n" + "0: cs %[old],%[new],%[uaddr]\n" + "1: lhi %[rc],0\n" + "2: sacf 768\n" + EX_TABLE_UA_FAULT(0b, 2b, %[rc]) + EX_TABLE_UA_FAULT(1b, 2b, %[rc]) + : [rc] "=d" (rc), [old] "+d" (oldval), [uaddr] "+Q" (*uaddr) + : [new] "d" (newval) : "cc", "memory"); *uval = oldval; - return ret; + instrument_copy_from_user_after(uval, uaddr, sizeof(*uval), 0); + return rc; } #endif /* _ASM_S390_FUTEX_H */ diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 4f43cdd9835b..1ff145f7b52b 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -184,7 +184,11 @@ extern struct vm_layout vm_layout; #define __kaslr_offset vm_layout.kaslr_offset #define __kaslr_offset_phys vm_layout.kaslr_offset_phys +#ifdef CONFIG_RANDOMIZE_IDENTITY_BASE #define __identity_base vm_layout.identity_base +#else +#define __identity_base 0UL +#endif #define ident_map_size vm_layout.identity_size static inline unsigned long kaslr_offset(void) diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h index 51b68a43e195..7ef3bbec98b0 100644 --- a/arch/s390/include/asm/physmem_info.h +++ b/arch/s390/include/asm/physmem_info.h @@ -26,7 +26,7 @@ enum reserved_range_type { RR_AMODE31, RR_IPLREPORT, RR_CERT_COMP_LIST, - RR_MEM_DETECT_EXTENDED, + RR_MEM_DETECT_EXT, RR_VMEM, RR_MAX }; @@ -128,7 +128,7 @@ static inline const char *get_rr_type_name(enum reserved_range_type t) RR_TYPE_NAME(AMODE31); RR_TYPE_NAME(IPLREPORT); RR_TYPE_NAME(CERT_COMP_LIST); - RR_TYPE_NAME(MEM_DETECT_EXTENDED); + RR_TYPE_NAME(MEM_DETECT_EXT); RR_TYPE_NAME(VMEM); default: return "UNKNOWN"; diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 4da3b2956285..18f37dff03c9 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -172,6 +172,7 @@ void sclp_early_printk(const char *s); void __sclp_early_printk(const 
char *s, unsigned int len); void sclp_emergency_printk(const char *s); +int sclp_init(void); int sclp_early_get_memsize(unsigned long *mem); int sclp_early_get_hsa_size(unsigned long *hsa_size); int _sclp_get_core_info(struct sclp_core_info *info); diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index a81f897a81ce..f5920163ee97 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -22,16 +22,117 @@ void debug_user_asce(int exit); -unsigned long __must_check -raw_copy_from_user(void *to, const void __user *from, unsigned long n); +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; -unsigned long __must_check -raw_copy_to_user(void __user *to, const void *from, unsigned long n); +static __always_inline __must_check unsigned long +raw_copy_from_user_key(void *to, const void __user *from, unsigned long size, unsigned long key) +{ + unsigned long rem; + union oac spec = { + .oac2.key = key, + .oac2.as = PSW_BITS_AS_SECONDARY, + .oac2.k = 1, + .oac2.a = 1, + }; -#ifndef CONFIG_KASAN -#define INLINE_COPY_FROM_USER -#define INLINE_COPY_TO_USER -#endif + asm_inline volatile( + " lr %%r0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[from]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[from],%[val]\n" + " slgr %[to],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[from])\n" /* rem = from + 4095 */ + " nr %[rem],%[val]\n" /* rem = (from + 4095) & -4096 */ + " slgr %[rem],%[from]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: lghi %[size],0\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [from] "+&a" (from), [to] "+&a" (to), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [spec] "d" (spec.val) + : "cc", "memory", "0"); + return size; +} + +static __always_inline __must_check unsigned long +raw_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + return raw_copy_from_user_key(to, from, n, 0); +} + +static __always_inline __must_check unsigned long +raw_copy_to_user_key(void __user *to, const void *from, unsigned long size, unsigned long key) +{ + unsigned long rem; + union oac spec = { + .oac1.key = key, + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.k = 1, + .oac1.a = 1, + }; + + asm_inline volatile( + " lr %%r0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[from]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[to],%[val]\n" + " slgr %[from],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ + " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ + " slgr %[rem],%[to]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? 
*/ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: lghi %[size],0\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [to] "+&a" (to), [from] "+&a" (from), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [spec] "d" (spec.val) + : "cc", "memory", "0"); + return size; +} + +static __always_inline __must_check unsigned long +raw_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return raw_copy_to_user_key(to, from, n, 0); +} unsigned long __must_check _copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key); @@ -55,63 +156,71 @@ copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned lo return n; } -union oac { - unsigned int val; - struct { - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac1; - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac2; - }; -}; - int __noreturn __put_user_bad(void); #ifdef CONFIG_KMSAN -#define get_put_user_noinstr_attributes \ - noinline __maybe_unused __no_sanitize_memory +#define uaccess_kmsan_or_inline noinline __maybe_unused __no_sanitize_memory #else -#define get_put_user_noinstr_attributes __always_inline +#define uaccess_kmsan_or_inline __always_inline #endif -#define DEFINE_PUT_USER(type) \ -static get_put_user_noinstr_attributes int \ +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + +#define DEFINE_PUT_USER_NOINSTR(type) \ +static uaccess_kmsan_or_inline int \ +__put_user_##type##_noinstr(unsigned type __user *to, \ + unsigned type *from, \ + unsigned long size) \ +{ \ + asm goto( \ + " llilh %%r0,%[spec]\n" \ + "0: mvcos %[to],%[from],%[size]\n" \ + "1: nopr %%r7\n" \ + EX_TABLE(0b, %l[Efault]) \ + EX_TABLE(1b, %l[Efault]) \ + : [to] "+Q" (*to) \ + : [size] "d" (size), [from] "Q" (*from), \ + [spec] "I" (0x81) \ + : "cc", "0" \ + : Efault \ + ); \ + return 0; \ +Efault: \ + return -EFAULT; \ +} + +#else /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + +#define DEFINE_PUT_USER_NOINSTR(type) \ +static uaccess_kmsan_or_inline int \ __put_user_##type##_noinstr(unsigned type __user *to, \ unsigned type *from, \ unsigned long size) \ { \ - union oac __oac_spec = { \ - .oac1.as = PSW_BITS_AS_SECONDARY, \ - .oac1.a = 1, \ - }; \ int rc; \ \ asm volatile( \ - " lr 0,%[spec]\n" \ - "0: mvcos %[_to],%[_from],%[_size]\n" \ - "1: xr %[rc],%[rc]\n" \ + " llilh %%r0,%[spec]\n" \ + "0: mvcos %[to],%[from],%[size]\n" \ + "1: lhi %[rc],0\n" \ "2:\n" \ - EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ - EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ - : [rc] "=&d" (rc), [_to] "+Q" (*(to)) \ - : [_size] "d" (size), [_from] "Q" (*(from)), \ - [spec] "d" (__oac_spec.val) \ + EX_TABLE_UA_FAULT(0b, 2b, %[rc]) \ + EX_TABLE_UA_FAULT(1b, 2b, %[rc]) \ + : [rc] "=d" (rc), [to] "+Q" (*to) \ + : [size] "d" (size), [from] "Q" (*from), \ + [spec] "I" (0x81) \ : "cc", "0"); \ return rc; \ -} \ - \ +} + +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + +DEFINE_PUT_USER_NOINSTR(char); +DEFINE_PUT_USER_NOINSTR(short); +DEFINE_PUT_USER_NOINSTR(int); +DEFINE_PUT_USER_NOINSTR(long); + +#define DEFINE_PUT_USER(type) \ static __always_inline int \ __put_user_##type(unsigned type __user *to, unsigned type *from, \ unsigned long size) \ @@ -128,69 +237,111 @@ DEFINE_PUT_USER(short); DEFINE_PUT_USER(int); DEFINE_PUT_USER(long); -static 
__always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) -{ - int rc; +#define __put_user(x, ptr) \ +({ \ + __typeof__(*(ptr)) __x = (x); \ + int __prc; \ + \ + __chk_user_ptr(ptr); \ + switch (sizeof(*(ptr))) { \ + case 1: \ + __prc = __put_user_char((unsigned char __user *)(ptr), \ + (unsigned char *)&__x, \ + sizeof(*(ptr))); \ + break; \ + case 2: \ + __prc = __put_user_short((unsigned short __user *)(ptr),\ + (unsigned short *)&__x, \ + sizeof(*(ptr))); \ + break; \ + case 4: \ + __prc = __put_user_int((unsigned int __user *)(ptr), \ + (unsigned int *)&__x, \ + sizeof(*(ptr))); \ + break; \ + case 8: \ + __prc = __put_user_long((unsigned long __user *)(ptr), \ + (unsigned long *)&__x, \ + sizeof(*(ptr))); \ + break; \ + default: \ + __prc = __put_user_bad(); \ + break; \ + } \ + __builtin_expect(__prc, 0); \ +}) - switch (size) { - case 1: - rc = __put_user_char((unsigned char __user *)ptr, - (unsigned char *)x, - size); - break; - case 2: - rc = __put_user_short((unsigned short __user *)ptr, - (unsigned short *)x, - size); - break; - case 4: - rc = __put_user_int((unsigned int __user *)ptr, - (unsigned int *)x, - size); - break; - case 8: - rc = __put_user_long((unsigned long __user *)ptr, - (unsigned long *)x, - size); - break; - default: - __put_user_bad(); - break; - } - return rc; -} +#define put_user(x, ptr) \ +({ \ + might_fault(); \ + __put_user(x, ptr); \ +}) int __noreturn __get_user_bad(void); -#define DEFINE_GET_USER(type) \ -static get_put_user_noinstr_attributes int \ +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + +#define DEFINE_GET_USER_NOINSTR(type) \ +static uaccess_kmsan_or_inline int \ __get_user_##type##_noinstr(unsigned type *to, \ - unsigned type __user *from, \ + const unsigned type __user *from, \ + unsigned long size) \ +{ \ + asm goto( \ + " lhi %%r0,%[spec]\n" \ + "0: mvcos %[to],%[from],%[size]\n" \ + "1: nopr %%r7\n" \ + EX_TABLE(0b, %l[Efault]) \ + EX_TABLE(1b, %l[Efault]) \ + : [to] "=Q" (*to) \ + : [size] "d" (size), [from] "Q" (*from), \ + [spec] "I" (0x81) \ + : "cc", "0" \ + : Efault \ + ); \ + return 0; \ +Efault: \ + *to = 0; \ + return -EFAULT; \ +} + +#else /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + +#define DEFINE_GET_USER_NOINSTR(type) \ +static uaccess_kmsan_or_inline int \ +__get_user_##type##_noinstr(unsigned type *to, \ + const unsigned type __user *from, \ unsigned long size) \ { \ - union oac __oac_spec = { \ - .oac2.as = PSW_BITS_AS_SECONDARY, \ - .oac2.a = 1, \ - }; \ int rc; \ \ asm volatile( \ - " lr 0,%[spec]\n" \ - "0: mvcos 0(%[_to]),%[_from],%[_size]\n" \ - "1: xr %[rc],%[rc]\n" \ + " lhi %%r0,%[spec]\n" \ + "0: mvcos %[to],%[from],%[size]\n" \ + "1: lhi %[rc],0\n" \ "2:\n" \ - EX_TABLE_UA_LOAD_MEM(0b, 2b, %[rc], %[_to], %[_ksize]) \ - EX_TABLE_UA_LOAD_MEM(1b, 2b, %[rc], %[_to], %[_ksize]) \ - : [rc] "=&d" (rc), "=Q" (*(to)) \ - : [_size] "d" (size), [_from] "Q" (*(from)), \ - [spec] "d" (__oac_spec.val), [_to] "a" (to), \ - [_ksize] "K" (size) \ + EX_TABLE_UA_FAULT(0b, 2b, %[rc]) \ + EX_TABLE_UA_FAULT(1b, 2b, %[rc]) \ + : [rc] "=d" (rc), [to] "=Q" (*to) \ + : [size] "d" (size), [from] "Q" (*from), \ + [spec] "I" (0x81) \ : "cc", "0"); \ + if (likely(!rc)) \ + return 0; \ + *to = 0; \ return rc; \ -} \ - \ +} + +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + +DEFINE_GET_USER_NOINSTR(char); +DEFINE_GET_USER_NOINSTR(short); +DEFINE_GET_USER_NOINSTR(int); +DEFINE_GET_USER_NOINSTR(long); + +#define DEFINE_GET_USER(type) \ static __always_inline int \ -__get_user_##type(unsigned type *to, unsigned type __user *from, \ 
+__get_user_##type(unsigned type *to, const unsigned type __user *from, \ unsigned long size) \ { \ int rc; \ @@ -205,107 +356,50 @@ DEFINE_GET_USER(short); DEFINE_GET_USER(int); DEFINE_GET_USER(long); -static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) -{ - int rc; - - switch (size) { - case 1: - rc = __get_user_char((unsigned char *)x, - (unsigned char __user *)ptr, - size); - break; - case 2: - rc = __get_user_short((unsigned short *)x, - (unsigned short __user *)ptr, - size); - break; - case 4: - rc = __get_user_int((unsigned int *)x, - (unsigned int __user *)ptr, - size); - break; - case 8: - rc = __get_user_long((unsigned long *)x, - (unsigned long __user *)ptr, - size); - break; - default: - __get_user_bad(); - break; - } - return rc; -} - -/* - * These are the main single-value transfer routines. They automatically - * use the right size if we just have the right pointer type. - */ -#define __put_user(x, ptr) \ -({ \ - __typeof__(*(ptr)) __x = (x); \ - int __pu_err = -EFAULT; \ - \ - __chk_user_ptr(ptr); \ - switch (sizeof(*(ptr))) { \ - case 1: \ - case 2: \ - case 4: \ - case 8: \ - __pu_err = __put_user_fn(&__x, ptr, sizeof(*(ptr))); \ - break; \ - default: \ - __put_user_bad(); \ - break; \ - } \ - __builtin_expect(__pu_err, 0); \ -}) - -#define put_user(x, ptr) \ -({ \ - might_fault(); \ - __put_user(x, ptr); \ -}) - #define __get_user(x, ptr) \ ({ \ - int __gu_err = -EFAULT; \ + const __user void *____guptr = (ptr); \ + int __grc; \ \ __chk_user_ptr(ptr); \ switch (sizeof(*(ptr))) { \ case 1: { \ + const unsigned char __user *__guptr = ____guptr; \ unsigned char __x; \ \ - __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + __grc = __get_user_char(&__x, __guptr, sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *)&__x; \ break; \ }; \ case 2: { \ + const unsigned short __user *__guptr = ____guptr; \ unsigned short __x; \ \ - __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + __grc = __get_user_short(&__x, __guptr, sizeof(*(ptr)));\ (x) = *(__force __typeof__(*(ptr)) *)&__x; \ break; \ }; \ case 4: { \ + const unsigned int __user *__guptr = ____guptr; \ unsigned int __x; \ \ - __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + __grc = __get_user_int(&__x, __guptr, sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *)&__x; \ break; \ }; \ case 8: { \ + const unsigned long __user *__guptr = ____guptr; \ unsigned long __x; \ \ - __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + __grc = __get_user_long(&__x, __guptr, sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *)&__x; \ break; \ }; \ default: \ - __get_user_bad(); \ + __grc = __get_user_bad(); \ break; \ } \ - __builtin_expect(__gu_err, 0); \ + __builtin_expect(__grc, 0); \ }) #define get_user(x, ptr) \ @@ -341,109 +435,71 @@ static inline void *s390_kernel_write(void *dst, const void *src, size_t size) return __s390_kernel_write(dst, src, size); } -int __noreturn __put_kernel_bad(void); +void __noreturn __mvc_kernel_nofault_bad(void); -#define __put_kernel_asm(val, to, insn) \ -({ \ - int __rc; \ - \ - asm volatile( \ - "0: " insn " %[_val],%[_to]\n" \ - "1: xr %[rc],%[rc]\n" \ - "2:\n" \ - EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ - EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ - : [rc] "=d" (__rc), [_to] "+Q" (*(to)) \ - : [_val] "d" (val) \ - : "cc"); \ - __rc; \ -}) +#if defined(CONFIG_CC_HAS_ASM_GOTO_OUTPUT) && defined(CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS) -#define __put_kernel_nofault(dst, src, type, err_label) \ +#define __mvc_kernel_nofault(dst, src, type, 
err_label) \ do { \ - unsigned long __x = (unsigned long)(*((type *)(src))); \ - int __pk_err; \ - \ switch (sizeof(type)) { \ case 1: \ - __pk_err = __put_kernel_asm(__x, (type *)(dst), "stc"); \ - break; \ case 2: \ - __pk_err = __put_kernel_asm(__x, (type *)(dst), "sth"); \ - break; \ case 4: \ - __pk_err = __put_kernel_asm(__x, (type *)(dst), "st"); \ - break; \ case 8: \ - __pk_err = __put_kernel_asm(__x, (type *)(dst), "stg"); \ + asm goto( \ + "0: mvc %O[_dst](%[_len],%R[_dst]),%[_src]\n" \ + "1: nopr %%r7\n" \ + EX_TABLE(0b, %l[err_label]) \ + EX_TABLE(1b, %l[err_label]) \ + : [_dst] "=Q" (*(type *)dst) \ + : [_src] "Q" (*(type *)(src)), \ + [_len] "I" (sizeof(type)) \ + : \ + : err_label); \ break; \ default: \ - __pk_err = __put_kernel_bad(); \ + __mvc_kernel_nofault_bad(); \ break; \ } \ - if (unlikely(__pk_err)) \ - goto err_label; \ } while (0) -int __noreturn __get_kernel_bad(void); - -#define __get_kernel_asm(val, from, insn) \ -({ \ - int __rc; \ - \ - asm volatile( \ - "0: " insn " %[_val],%[_from]\n" \ - "1: xr %[rc],%[rc]\n" \ - "2:\n" \ - EX_TABLE_UA_LOAD_REG(0b, 2b, %[rc], %[_val]) \ - EX_TABLE_UA_LOAD_REG(1b, 2b, %[rc], %[_val]) \ - : [rc] "=d" (__rc), [_val] "=d" (val) \ - : [_from] "Q" (*(from)) \ - : "cc"); \ - __rc; \ -}) +#else /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT) && CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ -#define __get_kernel_nofault(dst, src, type, err_label) \ +#define __mvc_kernel_nofault(dst, src, type, err_label) \ do { \ - int __gk_err; \ + type *(__dst) = (type *)(dst); \ + int __rc; \ \ switch (sizeof(type)) { \ - case 1: { \ - unsigned char __x; \ - \ - __gk_err = __get_kernel_asm(__x, (type *)(src), "ic"); \ - *((type *)(dst)) = (type)__x; \ - break; \ - }; \ - case 2: { \ - unsigned short __x; \ - \ - __gk_err = __get_kernel_asm(__x, (type *)(src), "lh"); \ - *((type *)(dst)) = (type)__x; \ - break; \ - }; \ - case 4: { \ - unsigned int __x; \ - \ - __gk_err = __get_kernel_asm(__x, (type *)(src), "l"); \ - *((type *)(dst)) = (type)__x; \ - break; \ - }; \ - case 8: { \ - unsigned long __x; \ - \ - __gk_err = __get_kernel_asm(__x, (type *)(src), "lg"); \ - *((type *)(dst)) = (type)__x; \ + case 1: \ + case 2: \ + case 4: \ + case 8: \ + asm_inline volatile( \ + "0: mvc 0(%[_len],%[_dst]),%[_src]\n" \ + "1: lhi %[_rc],0\n" \ + "2:\n" \ + EX_TABLE_UA_FAULT(0b, 2b, %[_rc]) \ + EX_TABLE_UA_FAULT(1b, 2b, %[_rc]) \ + : [_rc] "=d" (__rc), \ + "=m" (*__dst) \ + : [_src] "Q" (*(type *)(src)), \ + [_dst] "a" (__dst), \ + [_len] "I" (sizeof(type))); \ + if (__rc) \ + goto err_label; \ break; \ - }; \ default: \ - __gk_err = __get_kernel_bad(); \ + __mvc_kernel_nofault_bad(); \ break; \ } \ - if (unlikely(__gk_err)) \ - goto err_label; \ } while (0) +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT && CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +#define __get_kernel_nofault __mvc_kernel_nofault +#define __put_kernel_nofault __mvc_kernel_nofault + void __cmpxchg_user_key_called_with_bad_pointer(void); #define CMPXCHG_USER_KEY_MAX_LOOPS 128 diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 62f8f5a750a3..2fa25164df7d 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -50,6 +50,7 @@ decompressor_handled_param(facilities); decompressor_handled_param(nokaslr); decompressor_handled_param(cmma); decompressor_handled_param(relocate_lowcore); +decompressor_handled_param(bootdebug); #if IS_ENABLED(CONFIG_KVM) decompressor_handled_param(prot_virt); #endif @@ -58,7 +59,7 @@ static void __init kasan_early_init(void) { #ifdef CONFIG_KASAN 
init_task.kasan_depth = 0; - sclp_early_printk("KernelAddressSanitizer initialized\n"); + pr_info("KernelAddressSanitizer initialized\n"); #endif } diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c0b2c97efefb..63ba6306632e 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -266,18 +266,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14]; - int bit; if (unlikely(ftrace_graph_is_dead())) return; if (unlikely(atomic_read(&current->tracing_graph_pause))) return; - bit = ftrace_test_recursion_trylock(ip, *parent); - if (bit < 0) - return; if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs)) *parent = (unsigned long)&return_to_handler; - ftrace_test_recursion_unlock(bit); } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 1298f0860733..d78bcfe707b5 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -157,6 +157,12 @@ u64 __bootdata_preserved(stfle_fac_list[16]); EXPORT_SYMBOL(stfle_fac_list); struct oldmem_data __bootdata_preserved(oldmem_data); +char __bootdata(boot_rb)[PAGE_SIZE * 2]; +bool __bootdata(boot_earlyprintk); +size_t __bootdata(boot_rb_off); +char __bootdata(bootdebug_filter)[128]; +bool __bootdata(bootdebug); + unsigned long __bootdata_preserved(VMALLOC_START); EXPORT_SYMBOL(VMALLOC_START); @@ -686,7 +692,7 @@ static void __init reserve_physmem_info(void) { unsigned long addr, size; - if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + if (get_physmem_reserved(RR_MEM_DETECT_EXT, &addr, &size)) memblock_reserve(addr, size); } @@ -694,7 +700,7 @@ static void __init free_physmem_info(void) { unsigned long addr, size; - if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + if (get_physmem_reserved(RR_MEM_DETECT_EXT, &addr, &size)) memblock_phys_free(addr, size); } @@ -724,7 +730,7 @@ static void __init reserve_lowcore(void) void *lowcore_end = lowcore_start + sizeof(struct lowcore); void *start, *end; - if ((void *)__identity_base < lowcore_end) { + if (absolute_pointer(__identity_base) < lowcore_end) { start = max(lowcore_start, (void *)__identity_base); end = min(lowcore_end, (void *)(__identity_base + ident_map_size)); memblock_reserve(__pa(start), __pa(end)); @@ -866,6 +872,23 @@ static void __init log_component_list(void) } /* * Print avoiding interpretation of % in buf and taking bootdebug option * into consideration. */ +static void __init print_rb_entry(const char *buf) +{ + char fmt[] = KERN_SOH "0boot: %s"; + int level = printk_get_level(buf); + + buf = skip_timestamp(printk_skip_level(buf)); + if (level == KERN_DEBUG[1] && (!bootdebug || !bootdebug_filter_match(buf))) + return; + + fmt[1] = level; + printk(fmt, buf); +} + +/* * Setup function called from init/main.c just after the banner * was printed. 
*/ @@ -884,6 +907,9 @@ void __init setup_arch(char **cmdline_p) pr_info("Linux is running natively in 64-bit mode\n"); else pr_info("Linux is running as a guest in 64-bit mode\n"); + /* Print decompressor messages if not already printed */ + if (!boot_earlyprintk) + boot_rb_foreach(print_rb_entry); if (have_relocated_lowcore()) pr_info("Lowcore relocated to 0x%px\n", get_lowcore()); @@ -987,3 +1013,8 @@ void __init setup_arch(char **cmdline_p) /* Add system specific data to the random pool */ setup_randomness(); } + +void __init arch_cpu_finalize_init(void) +{ + sclp_init(); +} diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 377b9aaf8c92..ff1ddba96352 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -52,7 +52,6 @@ SECTIONS SOFTIRQENTRY_TEXT FTRACE_HOTPATCH_TRAMPOLINES_TEXT *(.text.*_indirect_*) - *(.fixup) *(.gnu.warning) . = ALIGN(PAGE_SIZE); _etext = .; /* End of text section */ diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index c7c269d5c491..f977b7c37efc 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -31,51 +31,6 @@ void debug_user_asce(int exit) } #endif /*CONFIG_DEBUG_ENTRY */ -static unsigned long raw_copy_from_user_key(void *to, const void __user *from, - unsigned long size, unsigned long key) -{ - unsigned long rem; - union oac spec = { - .oac2.key = key, - .oac2.as = PSW_BITS_AS_SECONDARY, - .oac2.k = 1, - .oac2.a = 1, - }; - - asm volatile( - " lr 0,%[spec]\n" - "0: mvcos 0(%[to]),0(%[from]),%[size]\n" - "1: jz 5f\n" - " algr %[size],%[val]\n" - " slgr %[from],%[val]\n" - " slgr %[to],%[val]\n" - " j 0b\n" - "2: la %[rem],4095(%[from])\n" /* rem = from + 4095 */ - " nr %[rem],%[val]\n" /* rem = (from + 4095) & -4096 */ - " slgr %[rem],%[from]\n" - " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ - " jnh 6f\n" - "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" - "4: slgr %[size],%[rem]\n" - " j 6f\n" - "5: slgr %[size],%[size]\n" - "6:\n" - EX_TABLE(0b, 2b) - EX_TABLE(1b, 2b) - EX_TABLE(3b, 6b) - EX_TABLE(4b, 6b) - : [size] "+&a" (size), [from] "+&a" (from), [to] "+&a" (to), [rem] "=&a" (rem) - : [val] "a" (-4096UL), [spec] "d" (spec.val) - : "cc", "memory", "0"); - return size; -} - -unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) -{ - return raw_copy_from_user_key(to, from, n, 0); -} -EXPORT_SYMBOL(raw_copy_from_user); - unsigned long _copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key) { @@ -93,51 +48,6 @@ unsigned long _copy_from_user_key(void *to, const void __user *from, } EXPORT_SYMBOL(_copy_from_user_key); -static unsigned long raw_copy_to_user_key(void __user *to, const void *from, - unsigned long size, unsigned long key) -{ - unsigned long rem; - union oac spec = { - .oac1.key = key, - .oac1.as = PSW_BITS_AS_SECONDARY, - .oac1.k = 1, - .oac1.a = 1, - }; - - asm volatile( - " lr 0,%[spec]\n" - "0: mvcos 0(%[to]),0(%[from]),%[size]\n" - "1: jz 5f\n" - " algr %[size],%[val]\n" - " slgr %[to],%[val]\n" - " slgr %[from],%[val]\n" - " j 0b\n" - "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ - " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ - " slgr %[rem],%[to]\n" - " clgr %[size],%[rem]\n" /* copy crosses next page boundary? 
*/ - " jnh 6f\n" - "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" - "4: slgr %[size],%[rem]\n" - " j 6f\n" - "5: slgr %[size],%[size]\n" - "6:\n" - EX_TABLE(0b, 2b) - EX_TABLE(1b, 2b) - EX_TABLE(3b, 6b) - EX_TABLE(4b, 6b) - : [size] "+&a" (size), [to] "+&a" (to), [from] "+&a" (from), [rem] "=&a" (rem) - : [val] "a" (-4096UL), [spec] "d" (spec.val) - : "cc", "memory", "0"); - return size; -} - -unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) -{ - return raw_copy_to_user_key(to, from, n, 0); -} -EXPORT_SYMBOL(raw_copy_to_user); - unsigned long _copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key) { diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c index 0a0738a473af..a046be1715cf 100644 --- a/arch/s390/mm/extable.c +++ b/arch/s390/mm/extable.c @@ -7,6 +7,7 @@ #include <linux/panic.h> #include <asm/asm-extable.h> #include <asm/extable.h> +#include <asm/fpu.h> const struct exception_table_entry *s390_search_extables(unsigned long addr) { @@ -26,7 +27,7 @@ static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_r return true; } -static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_ua_fault(const struct exception_table_entry *ex, struct pt_regs *regs) { unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); @@ -35,18 +36,6 @@ static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct p return true; } -static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs) -{ - unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); - unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); - size_t len = FIELD_GET(EX_DATA_LEN, ex->data); - - regs->gprs[reg_err] = -EFAULT; - memset((void *)regs->gprs[reg_addr], 0, len); - regs->psw.addr = extable_fixup(ex); - return true; -} - static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, bool pair, struct pt_regs *regs) { @@ -77,6 +66,13 @@ static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt return true; } +static bool ex_handler_fpc(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + fpu_sfpc(0); + regs->psw.addr = extable_fixup(ex); + return true; +} + bool fixup_exception(struct pt_regs *regs) { const struct exception_table_entry *ex; @@ -89,16 +85,16 @@ bool fixup_exception(struct pt_regs *regs) return ex_handler_fixup(ex, regs); case EX_TYPE_BPF: return ex_handler_bpf(ex, regs); - case EX_TYPE_UA_STORE: - return ex_handler_ua_store(ex, regs); - case EX_TYPE_UA_LOAD_MEM: - return ex_handler_ua_load_mem(ex, regs); + case EX_TYPE_UA_FAULT: + return ex_handler_ua_fault(ex, regs); case EX_TYPE_UA_LOAD_REG: return ex_handler_ua_load_reg(ex, false, regs); case EX_TYPE_UA_LOAD_REGPAIR: return ex_handler_ua_load_reg(ex, true, regs); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(ex, regs); + case EX_TYPE_FPC: + return ex_handler_fpc(ex, regs); } panic("invalid exception table entry"); } diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 7c684c54e721..8ead999e340b 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -662,7 +662,7 @@ void __init vmem_map_init(void) if (!static_key_enabled(&cpu_has_bear)) set_memory_x(0, 1); if (debug_pagealloc_enabled()) - __set_memory_4k(__va(0), __va(0) + ident_map_size); + __set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size); pr_info("Write protected kernel read-only data: %luk\n", (unsigned 
long)(__end_rodata - _stext) >> 10); } diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index d5ace00d10f0..857afbc4828f 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -171,7 +171,6 @@ void zpci_bus_scan_busses(void) static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev) { return !s390_pci_no_rid && zdev->rid_available && - zpci_is_device_configured(zdev) && !zdev->vfn; } diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index 24eccaa29337..bdcf2a3b6c41 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -13,7 +13,7 @@ CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) -KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes +KBUILD_CFLAGS := -std=gnu11 -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common diff --git a/arch/s390/tools/gen_opcode_table.c b/arch/s390/tools/gen_opcode_table.c index a1bc02b29c81..7d76c417f83f 100644 --- a/arch/s390/tools/gen_opcode_table.c +++ b/arch/s390/tools/gen_opcode_table.c @@ -201,6 +201,17 @@ static int cmp_long_insn(const void *a, const void *b) return strcmp(((struct insn *)a)->name, ((struct insn *)b)->name); } +static void print_insn_name(const char *name) +{ + size_t i, len; + + len = strlen(name); + printf("{"); + for (i = 0; i < len; i++) + printf(" \'%c\',", name[i]); + printf(" }"); +} + static void print_long_insn(struct gen_opcode *desc) { struct insn *insn; @@ -223,7 +234,9 @@ static void print_long_insn(struct gen_opcode *desc) insn = &desc->insn[i]; if (insn->name_len < 6) continue; - printf("\t[LONG_INSN_%s] = \"%s\", \\\n", insn->upper, insn->name); + printf("\t[LONG_INSN_%s] = ", insn->upper); + print_insn_name(insn->name); + printf(", \\\n"); } printf("}\n\n"); } @@ -236,11 +249,13 @@ static void print_opcode(struct insn *insn, int nr) if (insn->type->byte != 0) opcode += 2; printf("\t[%4d] = { .opfrag = 0x%s, .format = INSTR_%s, ", nr, opcode, insn->format); - if (insn->name_len < 6) - printf(".name = \"%s\" ", insn->name); - else - printf(".offset = LONG_INSN_%s ", insn->upper); - printf("}, \\\n"); + if (insn->name_len < 6) { + printf(".name = "); + print_insn_name(insn->name); + } else { + printf(".offset = LONG_INSN_%s", insn->upper); + } + printf(" }, \\\n"); } static void add_to_group(struct gen_opcode *desc, struct insn *insn, int offset) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7c15d6e83c37..dae6a73be40e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -227,6 +227,28 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) } static int __init +acpi_check_lapic(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_madt_local_apic *processor = NULL; + + processor = (struct acpi_madt_local_apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + /* Ignore invalid ID */ + if (processor->id == 0xff) + return 0; + + /* Ignore processors that can not be onlined */ + if (!acpi_is_processor_usable(processor->lapic_flags)) + return 0; + + has_lapic_cpus = true; + return 0; +} + +static int __init acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic *processor = NULL; @@ -257,7 +279,6 @@ 
acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); - has_lapic_cpus = true; return 0; } @@ -1026,6 +1047,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) static int __init acpi_parse_madt_lapic_entries(void) { int count, x2count = 0; + struct acpi_subtable_proc madt_proc[2]; + int ret; if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; @@ -1034,10 +1057,27 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_LOCAL_APIC); - x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_LOCAL_APIC); + /* Check if there are valid LAPIC entries */ + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_check_lapic, MAX_LOCAL_APIC); + + /* + * Enumerate the APIC IDs in the order that they appear in the + * MADT, no matter LAPIC entry or x2APIC entry is used. + */ + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + pr_err("Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { pr_err("No LAPIC entries present\n"); diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c index cb45ef5240da..068c1612660b 100644 --- a/drivers/acpi/x86/utils.c +++ b/drivers/acpi/x86/utils.c @@ -408,6 +408,19 @@ static const struct dmi_system_id acpi_quirk_skip_dmi_ids[] = { ACPI_QUIRK_SKIP_ACPI_AC_AND_BATTERY), }, { + /* Vexia Edu Atla 10 tablet 5V version */ + .matches = { + /* Having all 3 of these not set is somewhat unique */ + DMI_MATCH(DMI_SYS_VENDOR, "To be filled by O.E.M."), + DMI_MATCH(DMI_PRODUCT_NAME, "To be filled by O.E.M."), + DMI_MATCH(DMI_BOARD_NAME, "To be filled by O.E.M."), + /* Above strings are too generic, also match on BIOS date */ + DMI_MATCH(DMI_BIOS_DATE, "05/14/2015"), + }, + .driver_data = (void *)(ACPI_QUIRK_SKIP_I2C_CLIENTS | + ACPI_QUIRK_SKIP_ACPI_AC_AND_BATTERY), + }, + { /* Vexia Edu Atla 10 tablet 9V version */ .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "AMI Corporation"), diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index cbc9a7a75def..d497d448e4b2 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -656,13 +656,15 @@ static void device_resume_noirq(struct device *dev, pm_message_t state, bool asy * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime - * status to "active", but do that only if DPM_FLAG_SMART_SUSPEND is set - * to avoid confusing drivers that don't use it. + * status to "active" unless its power.set_active flag is clear, in + * which case it is not necessary to update its PM-runtime status. 
*/ - if (skip_resume) + if (skip_resume) { pm_runtime_set_suspended(dev); - else if (dev_pm_skip_suspend(dev)) + } else if (dev->power.set_active) { pm_runtime_set_active(dev); + dev->power.set_active = false; + } if (dev->pm_domain) { info = "noirq power domain "; @@ -1189,18 +1191,24 @@ static pm_message_t resume_event(pm_message_t sleep_state) return PMSG_ON; } -static void dpm_superior_set_must_resume(struct device *dev) +static void dpm_superior_set_must_resume(struct device *dev, bool set_active) { struct device_link *link; int idx; - if (dev->parent) + if (dev->parent) { dev->parent->power.must_resume = true; + if (set_active) + dev->parent->power.set_active = true; + } idx = device_links_read_lock(); - list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) + list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { link->supplier->power.must_resume = true; + if (set_active) + link->supplier->power.set_active = true; + } device_links_read_unlock(idx); } @@ -1278,8 +1286,11 @@ Skip: dev->power.may_skip_resume)) dev->power.must_resume = true; - if (dev->power.must_resume) - dpm_superior_set_must_resume(dev); + if (dev->power.must_resume) { + dev->power.set_active = dev->power.set_active || + dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND); + dpm_superior_set_must_resume(dev, dev->power.set_active); + } Complete: complete_all(&dev->power.completion); diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 1230045d78a5..aa5ec1d444a9 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -1381,13 +1381,12 @@ static void btnxpuart_tx_work(struct work_struct *work) while ((skb = nxp_dequeue(nxpdev))) { len = serdev_device_write_buf(serdev, skb->data, skb->len); - serdev_device_wait_until_sent(serdev, 0); hdev->stat.byte_tx += len; skb_pull(skb, len); if (skb->len > 0) { skb_queue_head(&nxpdev->txq, skb); - break; + continue; } switch (hci_skb_pkt_type(skb)) { diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 9aa018d4f6f5..90966dfbd278 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -899,11 +899,6 @@ static void btusb_reset(struct hci_dev *hdev) struct btusb_data *data; int err; - if (hdev->reset) { - hdev->reset(hdev); - return; - } - data = hci_get_drvdata(hdev); /* This is not an unbalanced PM reference since the device will reset */ err = usb_autopm_get_interface(data->intf); @@ -2639,8 +2634,15 @@ static void btusb_mtk_claim_iso_intf(struct btusb_data *data) struct btmtk_data *btmtk_data = hci_get_priv(data->hdev); int err; + /* + * The function usb_driver_claim_interface() is documented to need + * locks held if it's not called from a probe routine. The code here + * is called from the hci_power_on workqueue, so grab the lock. 
+ */ + device_lock(&btmtk_data->isopkt_intf->dev); err = usb_driver_claim_interface(&btusb_driver, btmtk_data->isopkt_intf, data); + device_unlock(&btmtk_data->isopkt_intf->dev); if (err < 0) { btmtk_data->isopkt_intf = NULL; bt_dev_err(data->hdev, "Failed to claim iso interface"); diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 704e84d00639..0ee5c691fb36 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -17,7 +17,7 @@ config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM config ARM_AIROHA_SOC_CPUFREQ tristate "Airoha EN7581 SoC CPUFreq support" - depends on ARCH_AIROHA || COMPILE_TEST + depends on (ARCH_AIROHA && OF) || COMPILE_TEST select PM_OPP default ARCH_AIROHA help diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 302df42d6887..463b69a2dff5 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -909,11 +909,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) pr_warn(FW_WARN "P-state 0 is not max freq\n"); - if (acpi_cpufreq_driver.set_boost) { - set_boost(policy, acpi_cpufreq_driver.boost_enabled); - policy->boost_enabled = acpi_cpufreq_driver.boost_enabled; - } - return result; err_unreg: diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 2486a6c5256a..8f512448382f 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -611,7 +611,8 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) * Section 8.4.7.1.1.5 of ACPI 6.1 spec) */ policy->min = cppc_perf_to_khz(caps, caps->lowest_nonlinear_perf); - policy->max = cppc_perf_to_khz(caps, caps->nominal_perf); + policy->max = cppc_perf_to_khz(caps, policy->boost_enabled ? + caps->highest_perf : caps->nominal_perf); /* * Set cpuinfo.min_freq to Lowest to make the full range of performance @@ -619,7 +620,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) * nonlinear perf */ policy->cpuinfo.min_freq = cppc_perf_to_khz(caps, caps->lowest_perf); - policy->cpuinfo.max_freq = cppc_perf_to_khz(caps, caps->nominal_perf); + policy->cpuinfo.max_freq = policy->max; policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu); policy->shared_type = cpu_data->shared_type; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 1076e37a18ad..e0048856ecee 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1410,10 +1410,6 @@ static int cpufreq_online(unsigned int cpu) goto out_free_policy; } - /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ - if (cpufreq_boost_enabled() && policy_has_boost_freq(policy)) - policy->boost_enabled = true; - /* * The initialization has succeeded and the policy is online. 
* If there is a problem with its frequency table, take it @@ -1476,6 +1472,10 @@ static int cpufreq_online(unsigned int cpu) blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); + } else { + ret = freq_qos_update_request(policy->max_freq_req, policy->max); + if (ret < 0) + goto out_destroy_policy; } if (cpufreq_driver->get && has_target()) { @@ -1570,6 +1570,18 @@ static int cpufreq_online(unsigned int cpu) if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); + /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ + if (policy->boost_enabled != cpufreq_boost_enabled()) { + policy->boost_enabled = cpufreq_boost_enabled(); + ret = cpufreq_driver->set_boost(policy, policy->boost_enabled); + if (ret) { + /* If the set_boost fails, the online operation is not affected */ + pr_info("%s: CPU%d: Cannot %s BOOST\n", __func__, policy->cpu, + policy->boost_enabled ? "enable" : "disable"); + policy->boost_enabled = !policy->boost_enabled; + } + } + pr_debug("initialization complete\n"); return 0; diff --git a/drivers/cpufreq/s3c64xx-cpufreq.c b/drivers/cpufreq/s3c64xx-cpufreq.c index c6bdfc308e99..9cef71528076 100644 --- a/drivers/cpufreq/s3c64xx-cpufreq.c +++ b/drivers/cpufreq/s3c64xx-cpufreq.c @@ -24,6 +24,7 @@ struct s3c64xx_dvfs { unsigned int vddarm_max; }; +#ifdef CONFIG_REGULATOR static struct s3c64xx_dvfs s3c64xx_dvfs_table[] = { [0] = { 1000000, 1150000 }, [1] = { 1050000, 1150000 }, @@ -31,6 +32,7 @@ static struct s3c64xx_dvfs s3c64xx_dvfs_table[] = { [3] = { 1200000, 1350000 }, [4] = { 1300000, 1350000 }, }; +#endif static struct cpufreq_frequency_table s3c64xx_freq_table[] = { { 0, 0, 66000 }, @@ -51,15 +53,16 @@ static struct cpufreq_frequency_table s3c64xx_freq_table[] = { static int s3c64xx_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index) { - struct s3c64xx_dvfs *dvfs; - unsigned int old_freq, new_freq; + unsigned int new_freq = s3c64xx_freq_table[index].frequency; int ret; +#ifdef CONFIG_REGULATOR + struct s3c64xx_dvfs *dvfs; + unsigned int old_freq; + old_freq = clk_get_rate(policy->clk) / 1000; - new_freq = s3c64xx_freq_table[index].frequency; dvfs = &s3c64xx_dvfs_table[s3c64xx_freq_table[index].driver_data]; -#ifdef CONFIG_REGULATOR if (vddarm && new_freq > old_freq) { ret = regulator_set_voltage(vddarm, dvfs->vddarm_min, diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 173ddcac540a..8fe5e1b47ef9 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -41,11 +41,7 @@ * idle state 2, the third bin spans from the target residency of idle state 2 * up to, but not including, the target residency of idle state 3 and so on. * The last bin spans from the target residency of the deepest idle state - * supplied by the driver to the scheduler tick period length or to infinity if - * the tick period length is less than the target residency of that state. In - * the latter case, the governor also counts events with the measured idle - * duration between the tick period length and the target residency of the - * deepest idle state. + * supplied by the driver to infinity. * * Two metrics called "hits" and "intercepts" are associated with each bin. * They are updated every time before selecting an idle state for the given CPU @@ -60,6 +56,10 @@ * into by the sleep length (these events are also referred to as "intercepts" * below). 
* + * The governor also counts "intercepts" with the measured idle duration below + * the tick period length and uses this information when deciding whether or not + * to stop the scheduler tick. + * * In order to select an idle state for a CPU, the governor takes the following * steps (modulo the possible latency constraint that must be taken into account * too): @@ -106,6 +106,12 @@ #include "gov.h" /* + * Idle state exit latency threshold used for deciding whether or not to check + * the time till the closest expected timer event. + */ +#define LATENCY_THRESHOLD_NS (RESIDENCY_THRESHOLD_NS / 2) + +/* * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value * is used for decreasing metrics on a regular basis. */ @@ -124,18 +130,20 @@ struct teo_bin { /** * struct teo_cpu - CPU data used by the TEO cpuidle governor. - * @time_span_ns: Time between idle state selection and post-wakeup update. * @sleep_length_ns: Time till the closest timer event (at the selection time). * @state_bins: Idle state data bins for this CPU. * @total: Grand total of the "intercepts" and "hits" metrics for all bins. - * @tick_hits: Number of "hits" after TICK_NSEC. + * @tick_intercepts: "Intercepts" before TICK_NSEC. + * @short_idles: Wakeups after short idle periods. + * @artificial_wakeup: Set if the wakeup has been triggered by a safety net. */ struct teo_cpu { - s64 time_span_ns; s64 sleep_length_ns; struct teo_bin state_bins[CPUIDLE_STATE_MAX]; unsigned int total; - unsigned int tick_hits; + unsigned int tick_intercepts; + unsigned int short_idles; + bool artificial_wakeup; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -152,23 +160,17 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) s64 target_residency_ns; u64 measured_ns; - if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { + cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; + + if (cpu_data->artificial_wakeup) { /* - * One of the safety nets has triggered or the wakeup was close - * enough to the closest timer event expected at the idle state - * selection time to be discarded. + * If one of the safety nets has triggered, assume that this + * might have been a long sleep. */ measured_ns = U64_MAX; } else { u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; - /* - * The computations below are to determine whether or not the - * (saved) time till the next timer event and the measured idle - * duration fall into the same "bin", so use last_residency_ns - * for that instead of time_span_ns which includes the cpuidle - * overhead. - */ measured_ns = dev->last_residency_ns; /* * The delay between the wakeup and the first instruction @@ -176,14 +178,16 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * time, so take 1/2 of the exit latency as a very rough * approximation of the average of it. 
*/ - if (measured_ns >= lat_ns) + if (measured_ns >= lat_ns) { measured_ns -= lat_ns / 2; - else + if (measured_ns < RESIDENCY_THRESHOLD_NS) + cpu_data->short_idles += PULSE; + } else { measured_ns /= 2; + cpu_data->short_idles += PULSE; + } } - cpu_data->total = 0; - /* * Decay the "hits" and "intercepts" metrics for all of the bins and * find the bins that the sleep length and the measured idle duration @@ -195,8 +199,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) bin->hits -= bin->hits >> DECAY_SHIFT; bin->intercepts -= bin->intercepts >> DECAY_SHIFT; - cpu_data->total += bin->hits + bin->intercepts; - target_residency_ns = drv->states[i].target_residency_ns; if (target_residency_ns <= cpu_data->sleep_length_ns) { @@ -206,38 +208,22 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } - /* - * If the deepest state's target residency is below the tick length, - * make a record of it to help teo_select() decide whether or not - * to stop the tick. This effectively adds an extra hits-only bin - * beyond the last state-related one. - */ - if (target_residency_ns < TICK_NSEC) { - cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT; - - cpu_data->total += cpu_data->tick_hits; - - if (TICK_NSEC <= cpu_data->sleep_length_ns) { - idx_timer = drv->state_count; - if (TICK_NSEC <= measured_ns) { - cpu_data->tick_hits += PULSE; - goto end; - } - } - } - + cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT; /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. * Otherwise, update the "intercepts" metric for the bin fallen into by * the measured idle duration. */ - if (idx_timer == idx_duration) + if (idx_timer == idx_duration) { cpu_data->state_bins[idx_timer].hits += PULSE; - else + } else { cpu_data->state_bins[idx_duration].intercepts += PULSE; + if (TICK_NSEC <= measured_ns) + cpu_data->tick_intercepts += PULSE; + } -end: + cpu_data->total -= cpu_data->total >> DECAY_SHIFT; cpu_data->total += PULSE; } @@ -285,14 +271,12 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ktime_t delta_tick = TICK_NSEC / 2; - unsigned int tick_intercept_sum = 0; unsigned int idx_intercept_sum = 0; unsigned int intercept_sum = 0; unsigned int idx_hit_sum = 0; unsigned int hit_sum = 0; int constraint_idx = 0; int idx0 = 0, idx = -1; - int prev_intercept_idx; s64 duration_ns; int i; @@ -301,10 +285,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, dev->last_state_idx = -1; } - cpu_data->time_span_ns = local_clock(); /* - * Set the expected sleep length to infinity in case of an early - * return. + * Set the sleep length to infinity in case the invocation of + * tick_nohz_get_sleep_length() below is skipped, in which case it won't + * be known whether or not the subsequent wakeup is caused by a timer. + * It is generally fine to count the wakeup as an intercept then, except + * for the cases when the CPU is mostly woken up by timers and there may + * be opportunities to ask for a deeper idle state when no imminent + * timers are scheduled which may be missed. 
*/ cpu_data->sleep_length_ns = KTIME_MAX; @@ -360,17 +348,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto end; } - tick_intercept_sum = intercept_sum + - cpu_data->state_bins[drv->state_count-1].intercepts; - /* * If the sum of the intercepts metric for all of the idle states * shallower than the current candidate one (idx) is greater than the * sum of the intercepts and hits metrics for the candidate state and - * all of the deeper states a shallower idle state is likely to be a + * all of the deeper states, a shallower idle state is likely to be a * better choice. */ - prev_intercept_idx = idx; if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { int first_suitable_idx = idx; @@ -396,41 +380,38 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * first enabled state that is deep enough. */ if (teo_state_ok(i, drv) && - !dev->states_usage[i].disable) + !dev->states_usage[i].disable) { idx = i; - else - idx = first_suitable_idx; - + break; + } + idx = first_suitable_idx; break; } if (dev->states_usage[i].disable) continue; - if (!teo_state_ok(i, drv)) { + if (teo_state_ok(i, drv)) { /* - * The current state is too shallow, but if an - * alternative candidate state has been found, - * it may still turn out to be a better choice. + * The current state is deep enough, but still + * there may be a better one. */ - if (first_suitable_idx != idx) - continue; - - break; + first_suitable_idx = i; + continue; } - first_suitable_idx = i; + /* + * The current state is too shallow, so if no suitable + * states other than the initial candidate have been + * found, give up (the remaining states to check are + * shallower still), but otherwise the first suitable + * state other than the initial candidate may turn out + * to be preferable. + */ + if (first_suitable_idx == idx) + break; } } - if (!idx && prev_intercept_idx) { - /* - * We have to query the sleep length here otherwise we don't - * know after wakeup if our guess was correct. - */ - duration_ns = tick_nohz_get_sleep_length(&delta_tick); - cpu_data->sleep_length_ns = duration_ns; - goto out_tick; - } /* * If there is a latency constraint, it may be necessary to select an @@ -440,24 +421,39 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, idx = constraint_idx; /* - * Skip the timers check if state 0 is the current candidate one, - * because an immediate non-timer wakeup is expected in that case. + * If either the candidate state is state 0 or its target residency is + * low enough, there is basically nothing more to do, but if the sleep + * length is not updated, the subsequent wakeup will be counted as an + * "intercept" which may be problematic in the cases when timer wakeups + * are dominant. Namely, it may effectively prevent deeper idle states + * from being selected at one point even if no imminent timers are + * scheduled. + * + * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one + * CPU are unlikely (user space has a default 50 us slack value for + * hrtimers and there are relatively few timers with a lower deadline + * value in the kernel), and even if they did happen, the potential + * benefit from using a deep idle state in that case would be + * questionable anyway for latency reasons. Thus if the measured idle + * duration falls into that range in the majority of cases, assume + * non-timer wakeups to be dominant and skip updating the sleep length + * to reduce latency. 
+ * + * Also, if the latency constraint is sufficiently low, it will force + * shallow idle states regardless of the wakeup type, so the sleep + * length need not be known in that case. */ - if (!idx) - goto out_tick; - - /* - * If state 0 is a polling one, check if the target residency of - * the current candidate state is low enough and skip the timers - * check in that case too. - */ - if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) && - drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) + if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && + (2 * cpu_data->short_idles >= cpu_data->total || + latency_req < LATENCY_THRESHOLD_NS)) goto out_tick; duration_ns = tick_nohz_get_sleep_length(&delta_tick); cpu_data->sleep_length_ns = duration_ns; + if (!idx) + goto out_tick; + /* * If the closest expected timer is before the target residency of the * candidate state, a shallower one needs to be found. @@ -474,7 +470,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * total wakeup events, do not stop the tick. */ if (drv->states[idx].target_residency_ns < TICK_NSEC && - tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8) + cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8) duration_ns = TICK_NSEC / 2; end: @@ -511,17 +507,16 @@ static void teo_reflect(struct cpuidle_device *dev, int state) struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); dev->last_state_idx = state; - /* - * If the wakeup was not "natural", but triggered by one of the safety - * nets, assume that the CPU might have been idle for the entire sleep - * length time. - */ if (dev->poll_time_limit || (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { + /* + * The wakeup was not "genuine", but triggered by one of the + * safety nets. + */ dev->poll_time_limit = false; - cpu_data->time_span_ns = cpu_data->sleep_length_ns; + cpu_data->artificial_wakeup = true; } else { - cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; + cpu_data->artificial_wakeup = false; } } diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index 4cb455b2bdee..619b6fb9d833 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -490,8 +490,7 @@ static int mxc_gpio_probe(struct platform_device *pdev) port->gc.request = mxc_gpio_request; port->gc.free = mxc_gpio_free; port->gc.to_irq = mxc_gpio_to_irq; - port->gc.base = (pdev->id < 0) ? 
of_alias_get_id(np, "gpio") * 32 : - pdev->id * 32; + port->gc.base = of_alias_get_id(np, "gpio") * 32; err = devm_gpiochip_add_data(&pdev->dev, &port->gc, port); if (err) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 7b78c2bada81..e45bba240cbc 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1538,17 +1538,20 @@ static netdev_features_t bond_fix_features(struct net_device *dev, NETIF_F_HIGHDMA | NETIF_F_LRO) #define BOND_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ - NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE) + NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE | \ + NETIF_F_GSO_PARTIAL) #define BOND_MPLS_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_GSO_SOFTWARE) +#define BOND_GSO_PARTIAL_FEATURES (NETIF_F_GSO_ESP) + static void bond_compute_features(struct bonding *bond) { + netdev_features_t gso_partial_features = BOND_GSO_PARTIAL_FEATURES; unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; - netdev_features_t gso_partial_features = NETIF_F_GSO_ESP; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; #ifdef CONFIG_XFRM_OFFLOAD @@ -1582,8 +1585,9 @@ static void bond_compute_features(struct bonding *bond) BOND_XFRM_FEATURES); #endif /* CONFIG_XFRM_OFFLOAD */ - if (slave->dev->hw_enc_features & NETIF_F_GSO_PARTIAL) - gso_partial_features &= slave->dev->gso_partial_features; + gso_partial_features = netdev_increment_features(gso_partial_features, + slave->dev->gso_partial_features, + BOND_GSO_PARTIAL_FEATURES); mpls_features = netdev_increment_features(mpls_features, slave->dev->mpls_features, @@ -1598,12 +1602,8 @@ static void bond_compute_features(struct bonding *bond) } bond_dev->hard_header_len = max_hard_header_len; - if (gso_partial_features & NETIF_F_GSO_ESP) - bond_dev->gso_partial_features |= NETIF_F_GSO_ESP; - else - bond_dev->gso_partial_features &= ~NETIF_F_GSO_ESP; - done: + bond_dev->gso_partial_features = gso_partial_features; bond_dev->vlan_features = vlan_features; bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | @@ -6046,6 +6046,7 @@ void bond_setup(struct net_device *bond_dev) bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; bond_dev->features |= bond_dev->hw_features; bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; + bond_dev->features |= NETIF_F_GSO_PARTIAL; #ifdef CONFIG_XFRM_OFFLOAD bond_dev->hw_features |= BOND_XFRM_FEATURES; /* Only enable XFRM features if this is an active-backup config */ diff --git a/drivers/net/ethernet/broadcom/bgmac.h b/drivers/net/ethernet/broadcom/bgmac.h index d73ef262991d..6fee9a41839c 100644 --- a/drivers/net/ethernet/broadcom/bgmac.h +++ b/drivers/net/ethernet/broadcom/bgmac.h @@ -328,8 +328,7 @@ #define BGMAC_RX_FRAME_OFFSET 30 /* There are 2 unused bytes between header and real data */ #define BGMAC_RX_BUF_OFFSET (NET_SKB_PAD + NET_IP_ALIGN - \ BGMAC_RX_FRAME_OFFSET) -/* Jumbo frame size with FCS */ -#define BGMAC_RX_MAX_FRAME_SIZE 9724 +#define BGMAC_RX_MAX_FRAME_SIZE 1536 #define BGMAC_RX_BUF_SIZE (BGMAC_RX_FRAME_OFFSET + BGMAC_RX_MAX_FRAME_SIZE) #define BGMAC_RX_ALLOC_SIZE (SKB_DATA_ALIGN(BGMAC_RX_BUF_SIZE + BGMAC_RX_BUF_OFFSET) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 9cc8db10a8d6..1c94bf1db718 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -7424,7 +7424,7 @@ static 
void tg3_napi_enable(struct tg3 *tp) for (i = 0; i < tp->irq_cnt; i++) { tnapi = &tp->napi[i]; - napi_enable(&tnapi->napi); + napi_enable_locked(&tnapi->napi); if (tnapi->tx_buffers) { netif_queue_set_napi(tp->dev, txq_idx, NETDEV_QUEUE_TYPE_TX, @@ -7445,9 +7445,10 @@ static void tg3_napi_init(struct tg3 *tp) int i; for (i = 0; i < tp->irq_cnt; i++) { - netif_napi_add(tp->dev, &tp->napi[i].napi, - i ? tg3_poll_msix : tg3_poll); - netif_napi_set_irq(&tp->napi[i].napi, tp->napi[i].irq_vec); + netif_napi_add_locked(tp->dev, &tp->napi[i].napi, + i ? tg3_poll_msix : tg3_poll); + netif_napi_set_irq_locked(&tp->napi[i].napi, + tp->napi[i].irq_vec); } } @@ -11259,6 +11260,8 @@ static void tg3_timer_stop(struct tg3 *tp) static int tg3_restart_hw(struct tg3 *tp, bool reset_phy) __releases(tp->lock) __acquires(tp->lock) + __releases(tp->dev->lock) + __acquires(tp->dev->lock) { int err; @@ -11271,7 +11274,9 @@ static int tg3_restart_hw(struct tg3 *tp, bool reset_phy) tg3_timer_stop(tp); tp->irq_sync = 0; tg3_napi_enable(tp); + netdev_unlock(tp->dev); dev_close(tp->dev); + netdev_lock(tp->dev); tg3_full_lock(tp, 0); } return err; @@ -11299,6 +11304,7 @@ static void tg3_reset_task(struct work_struct *work) tg3_netif_stop(tp); + netdev_lock(tp->dev); tg3_full_lock(tp, 1); if (tg3_flag(tp, TX_RECOVERY_PENDING)) { @@ -11318,12 +11324,14 @@ static void tg3_reset_task(struct work_struct *work) * call cancel_work_sync() and wait forever. */ tg3_flag_clear(tp, RESET_TASK_PENDING); + netdev_unlock(tp->dev); dev_close(tp->dev); goto out; } tg3_netif_start(tp); tg3_full_unlock(tp); + netdev_unlock(tp->dev); tg3_phy_start(tp); tg3_flag_clear(tp, RESET_TASK_PENDING); out: @@ -11683,9 +11691,11 @@ static int tg3_start(struct tg3 *tp, bool reset_phy, bool test_irq, if (err) goto out_ints_fini; + netdev_lock(dev); tg3_napi_init(tp); tg3_napi_enable(tp); + netdev_unlock(dev); for (i = 0; i < tp->irq_cnt; i++) { err = tg3_request_irq(tp, i); @@ -12569,6 +12579,7 @@ static int tg3_set_ringparam(struct net_device *dev, irq_sync = 1; } + netdev_lock(dev); tg3_full_lock(tp, irq_sync); tp->rx_pending = ering->rx_pending; @@ -12597,6 +12608,7 @@ static int tg3_set_ringparam(struct net_device *dev, } tg3_full_unlock(tp); + netdev_unlock(dev); if (irq_sync && !err) tg3_phy_start(tp); @@ -12678,6 +12690,7 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam irq_sync = 1; } + netdev_lock(dev); tg3_full_lock(tp, irq_sync); if (epause->autoneg) @@ -12707,6 +12720,7 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam } tg3_full_unlock(tp); + netdev_unlock(dev); } tp->phy_flags |= TG3_PHYFLG_USER_CONFIGURED; @@ -13911,6 +13925,7 @@ static void tg3_self_test(struct net_device *dev, struct ethtool_test *etest, data[TG3_INTERRUPT_TEST] = 1; } + netdev_lock(dev); tg3_full_lock(tp, 0); tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); @@ -13922,6 +13937,7 @@ static void tg3_self_test(struct net_device *dev, struct ethtool_test *etest, } tg3_full_unlock(tp); + netdev_unlock(dev); if (irq_sync && !err2) tg3_phy_start(tp); @@ -14365,6 +14381,7 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu) tg3_set_mtu(dev, tp, new_mtu); + netdev_lock(dev); tg3_full_lock(tp, 1); tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); @@ -14384,6 +14401,7 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu) tg3_netif_start(tp); tg3_full_unlock(tp); + netdev_unlock(dev); if (!err) tg3_phy_start(tp); @@ -18164,6 +18182,7 @@ static int tg3_resume(struct device *device) netif_device_attach(dev); + 
netdev_lock(dev); tg3_full_lock(tp, 0); tg3_ape_driver_state_change(tp, RESET_KIND_INIT); @@ -18180,6 +18199,7 @@ static int tg3_resume(struct device *device) out: tg3_full_unlock(tp); + netdev_unlock(dev); if (!err) tg3_phy_start(tp); @@ -18260,7 +18280,9 @@ static pci_ers_result_t tg3_io_error_detected(struct pci_dev *pdev, done: if (state == pci_channel_io_perm_failure) { if (netdev) { + netdev_lock(netdev); tg3_napi_enable(tp); + netdev_unlock(netdev); dev_close(netdev); } err = PCI_ERS_RESULT_DISCONNECT; @@ -18314,7 +18336,9 @@ static pci_ers_result_t tg3_io_slot_reset(struct pci_dev *pdev) done: if (rc != PCI_ERS_RESULT_RECOVERED && netdev && netif_running(netdev)) { + netdev_lock(netdev); tg3_napi_enable(tp); + netdev_unlock(netdev); dev_close(netdev); } rtnl_unlock(); @@ -18340,12 +18364,14 @@ static void tg3_io_resume(struct pci_dev *pdev) if (!netdev || !netif_running(netdev)) goto done; + netdev_lock(netdev); tg3_full_lock(tp, 0); tg3_ape_driver_state_change(tp, RESET_KIND_INIT); tg3_flag_set(tp, INIT_COMPLETE); err = tg3_restart_hw(tp, true); if (err) { tg3_full_unlock(tp); + netdev_unlock(netdev); netdev_err(netdev, "Cannot restart hardware after reset.\n"); goto done; } @@ -18357,6 +18383,7 @@ static void tg3_io_resume(struct pci_dev *pdev) tg3_netif_start(tp); tg3_full_unlock(tp); + netdev_unlock(netdev); tg3_phy_start(tp); diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index 8735e333034c..b87eaf0c250c 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1777,10 +1777,11 @@ static void dm9000_drv_remove(struct platform_device *pdev) unregister_netdev(ndev); dm9000_release_board(pdev, dm); - free_netdev(ndev); /* free device structure */ if (dm->power_supply) regulator_disable(dm->power_supply); + free_netdev(ndev); /* free device structure */ + dev_dbg(&pdev->dev, "released and freed device\n"); } diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 68725506a095..f7c4ce8e9a26 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -840,6 +840,8 @@ static int fec_enet_txq_submit_tso(struct fec_enet_priv_tx_q *txq, struct fec_enet_private *fep = netdev_priv(ndev); int hdr_len, total_len, data_left; struct bufdesc *bdp = txq->bd.cur; + struct bufdesc *tmp_bdp; + struct bufdesc_ex *ebdp; struct tso_t tso; unsigned int index = 0; int ret; @@ -913,7 +915,34 @@ static int fec_enet_txq_submit_tso(struct fec_enet_priv_tx_q *txq, return 0; err_release: - /* TODO: Release all used data descriptors for TSO */ + /* Release all used data descriptors for TSO */ + tmp_bdp = txq->bd.cur; + + while (tmp_bdp != bdp) { + /* Unmap data buffers */ + if (tmp_bdp->cbd_bufaddr && + !IS_TSO_HEADER(txq, fec32_to_cpu(tmp_bdp->cbd_bufaddr))) + dma_unmap_single(&fep->pdev->dev, + fec32_to_cpu(tmp_bdp->cbd_bufaddr), + fec16_to_cpu(tmp_bdp->cbd_datlen), + DMA_TO_DEVICE); + + /* Clear standard buffer descriptor fields */ + tmp_bdp->cbd_sc = 0; + tmp_bdp->cbd_datlen = 0; + tmp_bdp->cbd_bufaddr = 0; + + /* Handle extended descriptor if enabled */ + if (fep->bufdesc_ex) { + ebdp = (struct bufdesc_ex *)tmp_bdp; + ebdp->cbd_esc = 0; + } + + tmp_bdp = fec_enet_get_nextdesc(tmp_bdp, &txq->bd); + } + + dev_kfree_skb_any(skb); + return ret; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c index 9a63fbc69408..b25fb400f476 100644 --- 
a/drivers/net/ethernet/hisilicon/hns3/hnae3.c +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c @@ -40,6 +40,21 @@ EXPORT_SYMBOL(hnae3_unregister_ae_algo_prepare); */ static DEFINE_MUTEX(hnae3_common_lock); +/* ensure the drivers being unloaded one by one */ +static DEFINE_MUTEX(hnae3_unload_lock); + +void hnae3_acquire_unload_lock(void) +{ + mutex_lock(&hnae3_unload_lock); +} +EXPORT_SYMBOL(hnae3_acquire_unload_lock); + +void hnae3_release_unload_lock(void) +{ + mutex_unlock(&hnae3_unload_lock); +} +EXPORT_SYMBOL(hnae3_release_unload_lock); + static bool hnae3_client_match(enum hnae3_client_type client_type) { if (client_type == HNAE3_CLIENT_KNIC || diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 12ba380eb701..4e44f28288f9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -963,4 +963,6 @@ int hnae3_register_client(struct hnae3_client *client); void hnae3_set_client_init_flag(struct hnae3_client *client, struct hnae3_ae_dev *ae_dev, unsigned int inited); +void hnae3_acquire_unload_lock(void); +void hnae3_release_unload_lock(void); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index a7e3b22f641c..9ff797fb36c4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -6002,9 +6002,11 @@ module_init(hns3_init_module); */ static void __exit hns3_exit_module(void) { + hnae3_acquire_unload_lock(); pci_unregister_driver(&hns3_driver); hnae3_unregister_client(&client); hns3_dbg_unregister_debugfs(); + hnae3_release_unload_lock(); } module_exit(hns3_exit_module); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index db7845009252..3f17b3073e50 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -12919,9 +12919,11 @@ static int __init hclge_init(void) static void __exit hclge_exit(void) { + hnae3_acquire_unload_lock(); hnae3_unregister_ae_algo_prepare(&ae_algo); hnae3_unregister_ae_algo(&ae_algo); destroy_workqueue(hclge_wq); + hnae3_release_unload_lock(); } module_init(hclge_init); module_exit(hclge_exit); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 163c6e59ea4c..9ba767740a04 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -3410,8 +3410,10 @@ static int __init hclgevf_init(void) static void __exit hclgevf_exit(void) { + hnae3_acquire_unload_lock(); hnae3_unregister_ae_algo(&ae_algovf); destroy_workqueue(hclgevf_wq); + hnae3_release_unload_lock(); } module_init(hclgevf_init); module_exit(hclgevf_exit); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index cbfaaa5b7d02..2d7a18fcc3be 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -773,6 +773,11 @@ iavf_vlan_filter *iavf_add_vlan(struct iavf_adapter *adapter, f->state = IAVF_VLAN_ADD; adapter->num_vlan_filters++; iavf_schedule_aq_request(adapter, IAVF_FLAG_AQ_ADD_VLAN_FILTER); + } else if (f->state == IAVF_VLAN_REMOVE) { + /* IAVF_VLAN_REMOVE means that VLAN wasn't yet removed. + * We can safely only change the state here. 
+ */ + f->state = IAVF_VLAN_ACTIVE; } clearout: @@ -793,8 +798,18 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, struct iavf_vlan vlan) f = iavf_find_vlan(adapter, vlan); if (f) { - f->state = IAVF_VLAN_REMOVE; - iavf_schedule_aq_request(adapter, IAVF_FLAG_AQ_DEL_VLAN_FILTER); + /* IAVF_ADD_VLAN means that VLAN wasn't even added yet. + * Remove it from the list. + */ + if (f->state == IAVF_VLAN_ADD) { + list_del(&f->list); + kfree(f); + adapter->num_vlan_filters--; + } else { + f->state = IAVF_VLAN_REMOVE; + iavf_schedule_aq_request(adapter, + IAVF_FLAG_AQ_DEL_VLAN_FILTER); + } } spin_unlock_bh(&adapter->mac_vlan_list_lock); diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 01536a382e54..bdee499f991a 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -1498,7 +1498,6 @@ struct ice_aqc_dnl_equa_param { #define ICE_AQC_RX_EQU_POST1 (0x12 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_BFLF (0x13 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_BFHF (0x14 << ICE_AQC_RX_EQU_SHIFT) -#define ICE_AQC_RX_EQU_DRATE (0x15 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_CTLE_GAINHF (0x20 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_CTLE_GAINLF (0x21 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_CTLE_GAINDC (0x22 << ICE_AQC_RX_EQU_SHIFT) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 3072634bf049..f241493a6ac8 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -710,7 +710,6 @@ static int ice_get_tx_rx_equa(struct ice_hw *hw, u8 serdes_num, { ICE_AQC_RX_EQU_POST1, rx, &ptr->rx_equ_post1 }, { ICE_AQC_RX_EQU_BFLF, rx, &ptr->rx_equ_bflf }, { ICE_AQC_RX_EQU_BFHF, rx, &ptr->rx_equ_bfhf }, - { ICE_AQC_RX_EQU_DRATE, rx, &ptr->rx_equ_drate }, { ICE_AQC_RX_EQU_CTLE_GAINHF, rx, &ptr->rx_equ_ctle_gainhf }, { ICE_AQC_RX_EQU_CTLE_GAINLF, rx, &ptr->rx_equ_ctle_gainlf }, { ICE_AQC_RX_EQU_CTLE_GAINDC, rx, &ptr->rx_equ_ctle_gaindc }, diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.h b/drivers/net/ethernet/intel/ice/ice_ethtool.h index 8f2ad1c172c0..23b2cfbc9684 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.h +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.h @@ -15,7 +15,6 @@ struct ice_serdes_equalization_to_ethtool { int rx_equ_post1; int rx_equ_bflf; int rx_equ_bfhf; - int rx_equ_drate; int rx_equ_ctle_gainhf; int rx_equ_ctle_gainlf; int rx_equ_ctle_gaindc; diff --git a/drivers/net/ethernet/intel/ice/ice_parser.h b/drivers/net/ethernet/intel/ice/ice_parser.h index 6509d807627c..4f56d53d56b9 100644 --- a/drivers/net/ethernet/intel/ice/ice_parser.h +++ b/drivers/net/ethernet/intel/ice/ice_parser.h @@ -257,7 +257,6 @@ ice_pg_nm_cam_match(struct ice_pg_nm_cam_item *table, int size, /*** ICE_SID_RXPARSER_BOOST_TCAM and ICE_SID_LBL_RXPARSER_TMEM sections ***/ #define ICE_BST_TCAM_TABLE_SIZE 256 #define ICE_BST_TCAM_KEY_SIZE 20 -#define ICE_BST_KEY_TCAM_SIZE 19 /* Boost TCAM item */ struct ice_bst_tcam_item { @@ -401,7 +400,6 @@ u16 ice_xlt_kb_flag_get(struct ice_xlt_kb *kb, u64 pkt_flag); #define ICE_PARSER_GPR_NUM 128 #define ICE_PARSER_FLG_NUM 64 #define ICE_PARSER_ERR_NUM 16 -#define ICE_BST_KEY_SIZE 10 #define ICE_MARKER_ID_SIZE 9 #define ICE_MARKER_MAX_SIZE \ (ICE_MARKER_ID_SIZE * BITS_PER_BYTE - 1) @@ -431,13 +429,13 @@ struct ice_parser_rt { u8 pkt_buf[ICE_PARSER_MAX_PKT_LEN + ICE_PARSER_PKT_REV]; u16 pkt_len; u16 po; - u8 
bst_key[ICE_BST_KEY_SIZE]; + u8 bst_key[ICE_BST_TCAM_KEY_SIZE]; struct ice_pg_cam_key pg_key; + u8 pg_prio; struct ice_alu *alu0; struct ice_alu *alu1; struct ice_alu *alu2; struct ice_pg_cam_action *action; - u8 pg_prio; struct ice_gpr_pu pu; u8 markers[ICE_MARKER_ID_SIZE]; bool protocols[ICE_PO_PAIR_SIZE]; diff --git a/drivers/net/ethernet/intel/ice/ice_parser_rt.c b/drivers/net/ethernet/intel/ice/ice_parser_rt.c index dedf5e854e4b..3995d662e050 100644 --- a/drivers/net/ethernet/intel/ice/ice_parser_rt.c +++ b/drivers/net/ethernet/intel/ice/ice_parser_rt.c @@ -125,22 +125,20 @@ static void ice_bst_key_init(struct ice_parser_rt *rt, else key[idd] = imem->b_kb.prio; - idd = ICE_BST_KEY_TCAM_SIZE - 1; + idd = ICE_BST_TCAM_KEY_SIZE - 2; for (i = idd; i >= 0; i--) { int j; j = ho + idd - i; if (j < ICE_PARSER_MAX_PKT_LEN) - key[i] = rt->pkt_buf[ho + idd - i]; + key[i] = rt->pkt_buf[j]; else key[i] = 0; } - ice_debug(rt->psr->hw, ICE_DBG_PARSER, "Generated Boost TCAM Key:\n"); - ice_debug(rt->psr->hw, ICE_DBG_PARSER, "%02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n", - key[0], key[1], key[2], key[3], key[4], - key[5], key[6], key[7], key[8], key[9]); - ice_debug(rt->psr->hw, ICE_DBG_PARSER, "\n"); + ice_debug_array_w_prefix(rt->psr->hw, ICE_DBG_PARSER, + KBUILD_MODNAME ": Generated Boost TCAM Key", + key, ICE_BST_TCAM_KEY_SIZE); } static u16 ice_bit_rev_u16(u16 v, int len) diff --git a/drivers/net/ethernet/intel/idpf/idpf_controlq.c b/drivers/net/ethernet/intel/idpf/idpf_controlq.c index 4849590a5591..b28991dd1870 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_controlq.c +++ b/drivers/net/ethernet/intel/idpf/idpf_controlq.c @@ -376,6 +376,9 @@ int idpf_ctlq_clean_sq(struct idpf_ctlq_info *cq, u16 *clean_count, if (!(le16_to_cpu(desc->flags) & IDPF_CTLQ_FLAG_DD)) break; + /* Ensure no other fields are read until DD flag is checked */ + dma_rmb(); + /* strip off FW internal code */ desc_err = le16_to_cpu(desc->ret_val) & 0xff; @@ -563,6 +566,9 @@ int idpf_ctlq_recv(struct idpf_ctlq_info *cq, u16 *num_q_msg, if (!(flags & IDPF_CTLQ_FLAG_DD)) break; + /* Ensure no other fields are read until DD flag is checked */ + dma_rmb(); + q_msg[i].vmvf_type = (flags & (IDPF_CTLQ_FLAG_FTYPE_VM | IDPF_CTLQ_FLAG_FTYPE_PF)) >> diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c index f71d3182580b..b6c515d14cbf 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_main.c +++ b/drivers/net/ethernet/intel/idpf/idpf_main.c @@ -174,7 +174,8 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_master(pdev); pci_set_drvdata(pdev, adapter); - adapter->init_wq = alloc_workqueue("%s-%s-init", 0, 0, + adapter->init_wq = alloc_workqueue("%s-%s-init", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->init_wq) { @@ -183,7 +184,8 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_free; } - adapter->serv_wq = alloc_workqueue("%s-%s-service", 0, 0, + adapter->serv_wq = alloc_workqueue("%s-%s-service", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->serv_wq) { @@ -192,7 +194,8 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_serv_wq_alloc; } - adapter->mbx_wq = alloc_workqueue("%s-%s-mbx", 0, 0, + adapter->mbx_wq = alloc_workqueue("%s-%s-mbx", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->mbx_wq) { @@ -201,7 +204,8 @@ static int idpf_probe(struct 
pci_dev *pdev, const struct pci_device_id *ent) goto err_mbx_wq_alloc; } - adapter->stats_wq = alloc_workqueue("%s-%s-stats", 0, 0, + adapter->stats_wq = alloc_workqueue("%s-%s-stats", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->stats_wq) { @@ -210,7 +214,8 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_stats_wq_alloc; } - adapter->vc_event_wq = alloc_workqueue("%s-%s-vc_event", 0, 0, + adapter->vc_event_wq = alloc_workqueue("%s-%s-vc_event", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->vc_event_wq) { diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c index d46c95f91b0d..3d2413b8684f 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c +++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c @@ -517,8 +517,10 @@ static ssize_t idpf_vc_xn_exec(struct idpf_adapter *adapter, retval = -ENXIO; goto only_unlock; case IDPF_VC_XN_WAITING: - dev_notice_ratelimited(&adapter->pdev->dev, "Transaction timed-out (op %d, %dms)\n", - params->vc_op, params->timeout_ms); + dev_notice_ratelimited(&adapter->pdev->dev, + "Transaction timed-out (op:%d cookie:%04x vc_op:%d salt:%02x timeout:%dms)\n", + params->vc_op, cookie, xn->vc_op, + xn->salt, params->timeout_ms); retval = -ETIME; break; case IDPF_VC_XN_COMPLETED_SUCCESS: @@ -612,14 +614,16 @@ idpf_vc_xn_forward_reply(struct idpf_adapter *adapter, return -EINVAL; } xn = &adapter->vcxn_mngr->ring[xn_idx]; + idpf_vc_xn_lock(xn); salt = FIELD_GET(IDPF_VC_XN_SALT_M, msg_info); if (xn->salt != salt) { - dev_err_ratelimited(&adapter->pdev->dev, "Transaction salt does not match (%02x != %02x)\n", - xn->salt, salt); + dev_err_ratelimited(&adapter->pdev->dev, "Transaction salt does not match (exp:%d@%02x(%d) != got:%d@%02x)\n", + xn->vc_op, xn->salt, xn->state, + ctlq_msg->cookie.mbx.chnl_opcode, salt); + idpf_vc_xn_unlock(xn); return -EINVAL; } - idpf_vc_xn_lock(xn); switch (xn->state) { case IDPF_VC_XN_WAITING: /* success */ @@ -3077,12 +3081,21 @@ init_failed: */ void idpf_vc_core_deinit(struct idpf_adapter *adapter) { + bool remove_in_prog; + if (!test_bit(IDPF_VC_CORE_INIT, adapter->flags)) return; + /* Avoid transaction timeouts when called during reset */ + remove_in_prog = test_bit(IDPF_REMOVE_IN_PROG, adapter->flags); + if (!remove_in_prog) + idpf_vc_xn_shutdown(adapter->vcxn_mngr); + idpf_deinit_task(adapter); idpf_intr_rel(adapter); - idpf_vc_xn_shutdown(adapter->vcxn_mngr); + + if (remove_in_prog) + idpf_vc_xn_shutdown(adapter->vcxn_mngr); cancel_delayed_work_sync(&adapter->serv_task); cancel_delayed_work_sync(&adapter->mbx_task); diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 82f4333fb426..4fe121b9f94b 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4432,6 +4432,7 @@ static int mvneta_cpu_online(unsigned int cpu, struct hlist_node *node) */ if (pp->is_stopped) { spin_unlock(&pp->lock); + netdev_unlock(port->napi.dev); return 0; } netif_tx_stop_all_queues(pp->dev); diff --git a/drivers/net/ethernet/mediatek/airoha_eth.c b/drivers/net/ethernet/mediatek/airoha_eth.c index 415d784de741..09f448f29124 100644 --- a/drivers/net/ethernet/mediatek/airoha_eth.c +++ b/drivers/net/ethernet/mediatek/airoha_eth.c @@ -266,11 +266,11 @@ #define REG_GDM3_FWD_CFG GDM3_BASE #define GDM3_PAD_EN_MASK BIT(28) -#define REG_GDM4_FWD_CFG (GDM4_BASE + 0x100) +#define REG_GDM4_FWD_CFG 
GDM4_BASE #define GDM4_PAD_EN_MASK BIT(28) #define GDM4_SPORT_OFFSET0_MASK GENMASK(11, 8) -#define REG_GDM4_SRC_PORT_SET (GDM4_BASE + 0x33c) +#define REG_GDM4_SRC_PORT_SET (GDM4_BASE + 0x23c) #define GDM4_SPORT_OFF2_MASK GENMASK(19, 16) #define GDM4_SPORT_OFF1_MASK GENMASK(15, 12) #define GDM4_SPORT_OFF0_MASK GENMASK(11, 8) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bd41b75d246e..a814b63ed97e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2087,7 +2087,7 @@ static struct mlx5e_xdpsq *mlx5e_open_xdpredirect_sq(struct mlx5e_channel *c, struct mlx5e_xdpsq *xdpsq; int err; - xdpsq = kvzalloc_node(sizeof(*xdpsq), GFP_KERNEL, c->cpu); + xdpsq = kvzalloc_node(sizeof(*xdpsq), GFP_KERNEL, cpu_to_node(c->cpu)); if (!xdpsq) return ERR_PTR(-ENOMEM); diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index 720f577929db..499e5e39d513 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -1120,20 +1120,6 @@ static void nv_disable_hw_interrupts(struct net_device *dev, u32 mask) } } -static void nv_napi_enable(struct net_device *dev) -{ - struct fe_priv *np = get_nvpriv(dev); - - napi_enable(&np->napi); -} - -static void nv_napi_disable(struct net_device *dev) -{ - struct fe_priv *np = get_nvpriv(dev); - - napi_disable(&np->napi); -} - #define MII_READ (-1) /* mii_rw: read/write a register on the PHY. * @@ -3114,7 +3100,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu) * Changing the MTU is a rare event, it shouldn't matter. */ nv_disable_irq(dev); - nv_napi_disable(dev); + napi_disable(&np->napi); netif_tx_lock_bh(dev); netif_addr_lock(dev); spin_lock(&np->lock); @@ -3143,7 +3129,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu) spin_unlock(&np->lock); netif_addr_unlock(dev); netif_tx_unlock_bh(dev); - nv_napi_enable(dev); + napi_enable(&np->napi); nv_enable_irq(dev); } return 0; @@ -4731,7 +4717,7 @@ static int nv_set_ringparam(struct net_device *dev, if (netif_running(dev)) { nv_disable_irq(dev); - nv_napi_disable(dev); + napi_disable(&np->napi); netif_tx_lock_bh(dev); netif_addr_lock(dev); spin_lock(&np->lock); @@ -4784,7 +4770,7 @@ static int nv_set_ringparam(struct net_device *dev, spin_unlock(&np->lock); netif_addr_unlock(dev); netif_tx_unlock_bh(dev); - nv_napi_enable(dev); + napi_enable(&np->napi); nv_enable_irq(dev); } return 0; @@ -5277,7 +5263,7 @@ static void nv_self_test(struct net_device *dev, struct ethtool_test *test, u64 if (test->flags & ETH_TEST_FL_OFFLINE) { if (netif_running(dev)) { netif_stop_queue(dev); - nv_napi_disable(dev); + napi_disable(&np->napi); netif_tx_lock_bh(dev); netif_addr_lock(dev); spin_lock_irq(&np->lock); @@ -5334,7 +5320,7 @@ static void nv_self_test(struct net_device *dev, struct ethtool_test *test, u64 /* restart rx engine */ nv_start_rxtx(dev); netif_start_queue(dev); - nv_napi_enable(dev); + napi_enable(&np->napi); nv_enable_hw_interrupts(dev, np->irqmask); } } @@ -5576,6 +5562,7 @@ static int nv_open(struct net_device *dev) /* ask for interrupts */ nv_enable_hw_interrupts(dev, np->irqmask); + netdev_lock(dev); spin_lock_irq(&np->lock); writel(NVREG_MCASTADDRA_FORCE, base + NvRegMulticastAddrA); writel(0, base + NvRegMulticastAddrB); @@ -5594,7 +5581,7 @@ static int nv_open(struct net_device *dev) ret = nv_update_linkspeed(dev); nv_start_rxtx(dev); netif_start_queue(dev); - 
nv_napi_enable(dev); + napi_enable_locked(&np->napi); if (ret) { netif_carrier_on(dev); @@ -5611,6 +5598,7 @@ static int nv_open(struct net_device *dev) round_jiffies(jiffies + STATS_INTERVAL)); spin_unlock_irq(&np->lock); + netdev_unlock(dev); /* If the loopback feature was set while the device was down, make sure * that it's set correctly now. @@ -5632,7 +5620,7 @@ static int nv_close(struct net_device *dev) spin_lock_irq(&np->lock); np->in_shutdown = 1; spin_unlock_irq(&np->lock); - nv_napi_disable(dev); + napi_disable(&np->napi); synchronize_irq(np->pci_dev->irq); del_timer_sync(&np->oom_kick); diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index 9ce0e8a64ba8..a73dcaffa8c5 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -1684,6 +1684,7 @@ static void rtl8139_tx_timeout_task (struct work_struct *work) if (tmp8 & CmdTxEnb) RTL_W8 (ChipCmd, CmdRxEnb); + netdev_lock(dev); spin_lock_bh(&tp->rx_lock); /* Disable interrupts by clearing the interrupt mask. */ RTL_W16 (IntrMask, 0x0000); @@ -1694,11 +1695,12 @@ static void rtl8139_tx_timeout_task (struct work_struct *work) spin_unlock_irq(&tp->lock); /* ...and finally, reset everything */ - napi_enable(&tp->napi); + napi_enable_locked(&tp->napi); rtl8139_hw_start(dev); netif_wake_queue(dev); spin_unlock_bh(&tp->rx_lock); + netdev_unlock(dev); } static void rtl8139_tx_timeout(struct net_device *dev, unsigned int txqueue) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index bc395294a32d..c9f4976a3527 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -3217,10 +3217,15 @@ static int ravb_suspend(struct device *dev) netif_device_detach(ndev); - if (priv->wol_enabled) - return ravb_wol_setup(ndev); + rtnl_lock(); + if (priv->wol_enabled) { + ret = ravb_wol_setup(ndev); + rtnl_unlock(); + return ret; + } ret = ravb_close(ndev); + rtnl_unlock(); if (ret) return ret; @@ -3245,19 +3250,20 @@ static int ravb_resume(struct device *dev) if (!netif_running(ndev)) return 0; + rtnl_lock(); /* If WoL is enabled restore the interface. */ - if (priv->wol_enabled) { + if (priv->wol_enabled) ret = ravb_wol_restore(ndev); - if (ret) - return ret; - } else { + else ret = pm_runtime_force_resume(dev); - if (ret) - return ret; + if (ret) { + rtnl_unlock(); + return ret; } /* Reopening the interface will restore the device to the working state. 
*/ ret = ravb_open(ndev); + rtnl_unlock(); if (ret < 0) goto out_rpm_put; diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 8887b8921009..5fc8027c92c7 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3494,10 +3494,12 @@ static int sh_eth_suspend(struct device *dev) netif_device_detach(ndev); + rtnl_lock(); if (mdp->wol_enabled) ret = sh_eth_wol_setup(ndev); else ret = sh_eth_close(ndev); + rtnl_unlock(); return ret; } @@ -3511,10 +3513,12 @@ static int sh_eth_resume(struct device *dev) if (!netif_running(ndev)) return 0; + rtnl_lock(); if (mdp->wol_enabled) ret = sh_eth_wol_restore(ndev); else ret = sh_eth_open(ndev); + rtnl_unlock(); if (ret < 0) return ret; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 5212dc439b1d..d04543e5697b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2424,11 +2424,6 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) u32 chan = 0; u8 qmode = 0; - if (rxfifosz == 0) - rxfifosz = priv->dma_cap.rx_fifo_size; - if (txfifosz == 0) - txfifosz = priv->dma_cap.tx_fifo_size; - /* Split up the shared Tx/Rx FIFO memory on DW QoS Eth and DW XGMAC */ if (priv->plat->has_gmac4 || priv->plat->has_xgmac) { rxfifosz /= rx_channels_count; @@ -2897,11 +2892,6 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode, int rxfifosz = priv->plat->rx_fifo_size; int txfifosz = priv->plat->tx_fifo_size; - if (rxfifosz == 0) - rxfifosz = priv->dma_cap.rx_fifo_size; - if (txfifosz == 0) - txfifosz = priv->dma_cap.tx_fifo_size; - /* Adjust for real per queue fifo size */ rxfifosz /= rx_channels_count; txfifosz /= tx_channels_count; @@ -5878,9 +5868,6 @@ static int stmmac_change_mtu(struct net_device *dev, int new_mtu) const int mtu = new_mtu; int ret; - if (txfifosz == 0) - txfifosz = priv->dma_cap.tx_fifo_size; - txfifosz /= priv->plat->tx_queues_to_use; if (stmmac_xdp_is_enabled(priv) && new_mtu > ETH_DATA_LEN) { @@ -7217,6 +7204,50 @@ static int stmmac_hw_init(struct stmmac_priv *priv) if (priv->dma_cap.tsoen) dev_info(priv->device, "TSO supported\n"); + if (priv->dma_cap.number_rx_queues && + priv->plat->rx_queues_to_use > priv->dma_cap.number_rx_queues) { + dev_warn(priv->device, + "Number of Rx queues (%u) exceeds dma capability\n", + priv->plat->rx_queues_to_use); + priv->plat->rx_queues_to_use = priv->dma_cap.number_rx_queues; + } + if (priv->dma_cap.number_tx_queues && + priv->plat->tx_queues_to_use > priv->dma_cap.number_tx_queues) { + dev_warn(priv->device, + "Number of Tx queues (%u) exceeds dma capability\n", + priv->plat->tx_queues_to_use); + priv->plat->tx_queues_to_use = priv->dma_cap.number_tx_queues; + } + + if (!priv->plat->rx_fifo_size) { + if (priv->dma_cap.rx_fifo_size) { + priv->plat->rx_fifo_size = priv->dma_cap.rx_fifo_size; + } else { + dev_err(priv->device, "Can't specify Rx FIFO size\n"); + return -ENODEV; + } + } else if (priv->dma_cap.rx_fifo_size && + priv->plat->rx_fifo_size > priv->dma_cap.rx_fifo_size) { + dev_warn(priv->device, + "Rx FIFO size (%u) exceeds dma capability\n", + priv->plat->rx_fifo_size); + priv->plat->rx_fifo_size = priv->dma_cap.rx_fifo_size; + } + if (!priv->plat->tx_fifo_size) { + if (priv->dma_cap.tx_fifo_size) { + priv->plat->tx_fifo_size = priv->dma_cap.tx_fifo_size; + } else { + dev_err(priv->device, "Can't specify Tx FIFO size\n"); + return -ENODEV; + 
} + } else if (priv->dma_cap.tx_fifo_size && + priv->plat->tx_fifo_size > priv->dma_cap.tx_fifo_size) { + dev_warn(priv->device, + "Tx FIFO size (%u) exceeds dma capability\n", + priv->plat->tx_fifo_size); + priv->plat->tx_fifo_size = priv->dma_cap.tx_fifo_size; + } + priv->hw->vlan_fail_q_en = (priv->plat->flags & STMMAC_FLAG_VLAN_FAIL_Q_EN); priv->hw->vlan_fail_q = priv->plat->vlan_fail_q; diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index d7459866d24c..72177fea1cfb 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -6086,7 +6086,7 @@ static void niu_enable_napi(struct niu *np) int i; for (i = 0; i < np->num_ldg; i++) - napi_enable(&np->ldg[i].napi); + napi_enable_locked(&np->ldg[i].napi); } static void niu_disable_napi(struct niu *np) @@ -6116,7 +6116,9 @@ static int niu_open(struct net_device *dev) if (err) goto out_free_channels; + netdev_lock(dev); niu_enable_napi(np); + netdev_unlock(dev); spin_lock_irq(&np->lock); @@ -6521,6 +6523,7 @@ static void niu_reset_task(struct work_struct *work) niu_reset_buffers(np); + netdev_lock(np->dev); spin_lock_irqsave(&np->lock, flags); err = niu_init_hw(np); @@ -6531,6 +6534,7 @@ static void niu_reset_task(struct work_struct *work) } spin_unlock_irqrestore(&np->lock, flags); + netdev_unlock(np->dev); } static void niu_tx_timeout(struct net_device *dev, unsigned int txqueue) @@ -6761,7 +6765,9 @@ static int niu_change_mtu(struct net_device *dev, int new_mtu) niu_free_channels(np); + netdev_lock(dev); niu_enable_napi(np); + netdev_unlock(dev); err = niu_alloc_channels(np); if (err) @@ -9937,6 +9943,7 @@ static int __maybe_unused niu_resume(struct device *dev_d) spin_lock_irqsave(&np->lock, flags); + netdev_lock(dev); err = niu_init_hw(np); if (!err) { np->timer.expires = jiffies + HZ; @@ -9945,6 +9952,7 @@ static int __maybe_unused niu_resume(struct device *dev_d) } spin_unlock_irqrestore(&np->lock, flags); + netdev_unlock(dev); return err; } diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index 894911f3d560..e56ebbdd428d 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -1568,7 +1568,7 @@ static void init_registers(struct net_device *dev) if (rp->quirks & rqMgmt) rhine_init_cam_filter(dev); - napi_enable(&rp->napi); + napi_enable_locked(&rp->napi); iowrite16(RHINE_EVENT & 0xffff, ioaddr + IntrEnable); @@ -1696,7 +1696,10 @@ static int rhine_open(struct net_device *dev) rhine_power_init(dev); rhine_chip_reset(dev); rhine_task_enable(rp); + + netdev_lock(dev); init_registers(dev); + netdev_unlock(dev); netif_dbg(rp, ifup, dev, "%s() Done - status %04x MII status: %04x\n", __func__, ioread16(ioaddr + ChipCmd), @@ -1727,6 +1730,8 @@ static void rhine_reset_task(struct work_struct *work) napi_disable(&rp->napi); netif_tx_disable(dev); + + netdev_lock(dev); spin_lock_bh(&rp->lock); /* clear all descriptors */ @@ -1740,6 +1745,7 @@ static void rhine_reset_task(struct work_struct *work) init_registers(dev); spin_unlock_bh(&rp->lock); + netdev_unlock(dev); netif_trans_update(dev); /* prevent tx timeout */ dev->stats.tx_errors++; @@ -2541,9 +2547,12 @@ static int rhine_resume(struct device *device) alloc_tbufs(dev); rhine_reset_rbufs(rp); rhine_task_enable(rp); + + netdev_lock(dev); spin_lock_bh(&rp->lock); init_registers(dev); spin_unlock_bh(&rp->lock); + netdev_unlock(dev); netif_device_attach(dev); diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index 
3b23f3d3ca2b..5c80fbee7913 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -74,7 +74,7 @@ static void nsim_get_ringparam(struct net_device *dev, memcpy(ring, &ns->ethtool.ring, sizeof(ns->ethtool.ring)); kernel_ring->hds_thresh_max = NSIM_HDS_THRESHOLD_MAX; - if (kernel_ring->tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN) + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN) kernel_ring->tcp_data_split = ETHTOOL_TCP_DATA_SPLIT_ENABLED; } diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index dcf073bc4802..96d54c08043d 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -134,6 +134,7 @@ struct netdevsim { u32 sleep; u32 __ports[2][NSIM_UDP_TUNNEL_N_PORTS]; u32 (*ports)[NSIM_UDP_TUNNEL_N_PORTS]; + struct dentry *ddir; struct debugfs_u32_array dfs_ports[2]; } udp_ports; diff --git a/drivers/net/netdevsim/udp_tunnels.c b/drivers/net/netdevsim/udp_tunnels.c index 02dc3123eb6c..640b4983a9a0 100644 --- a/drivers/net/netdevsim/udp_tunnels.c +++ b/drivers/net/netdevsim/udp_tunnels.c @@ -112,9 +112,11 @@ nsim_udp_tunnels_info_reset_write(struct file *file, const char __user *data, struct net_device *dev = file->private_data; struct netdevsim *ns = netdev_priv(dev); - memset(ns->udp_ports.ports, 0, sizeof(ns->udp_ports.__ports)); rtnl_lock(); - udp_tunnel_nic_reset_ntf(dev); + if (dev->reg_state == NETREG_REGISTERED) { + memset(ns->udp_ports.ports, 0, sizeof(ns->udp_ports.__ports)); + udp_tunnel_nic_reset_ntf(dev); + } rtnl_unlock(); return count; @@ -144,23 +146,23 @@ int nsim_udp_tunnels_info_create(struct nsim_dev *nsim_dev, else ns->udp_ports.ports = nsim_dev->udp_ports.__ports; - debugfs_create_u32("udp_ports_inject_error", 0600, - ns->nsim_dev_port->ddir, + ns->udp_ports.ddir = debugfs_create_dir("udp_ports", + ns->nsim_dev_port->ddir); + + debugfs_create_u32("inject_error", 0600, ns->udp_ports.ddir, &ns->udp_ports.inject_error); ns->udp_ports.dfs_ports[0].array = ns->udp_ports.ports[0]; ns->udp_ports.dfs_ports[0].n_elements = NSIM_UDP_TUNNEL_N_PORTS; - debugfs_create_u32_array("udp_ports_table0", 0400, - ns->nsim_dev_port->ddir, + debugfs_create_u32_array("table0", 0400, ns->udp_ports.ddir, &ns->udp_ports.dfs_ports[0]); ns->udp_ports.dfs_ports[1].array = ns->udp_ports.ports[1]; ns->udp_ports.dfs_ports[1].n_elements = NSIM_UDP_TUNNEL_N_PORTS; - debugfs_create_u32_array("udp_ports_table1", 0400, - ns->nsim_dev_port->ddir, + debugfs_create_u32_array("table1", 0400, ns->udp_ports.ddir, &ns->udp_ports.dfs_ports[1]); - debugfs_create_file("udp_ports_reset", 0200, ns->nsim_dev_port->ddir, + debugfs_create_file("reset", 0200, ns->udp_ports.ddir, dev, &nsim_udp_tunnels_info_reset_fops); /* Note: it's not normal to allocate the info struct like this! 
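The netdevsim hunks above (and the destroy-side hunk just below) move the udp_ports knobs under a dedicated debugfs directory and keep its dentry in the new ddir field, so teardown can drop the whole subtree with a single debugfs_remove_recursive() call. A minimal, illustrative sketch of that pairing follows; it is not part of the patch, and the my_ports / my_ports_init / my_ports_exit names are hypothetical stand-ins for the netdevsim ones.

#include <linux/debugfs.h>
#include <linux/kernel.h>

struct my_ports {
	struct dentry *ddir;			/* directory handle kept for teardown */
	u32 inject_error;
	u32 table0[4];
	struct debugfs_u32_array dfs_table0;	/* must outlive the debugfs file */
};

static void my_ports_init(struct my_ports *p, struct dentry *parent)
{
	/* Group every knob under one sub-directory... */
	p->ddir = debugfs_create_dir("udp_ports", parent);

	debugfs_create_u32("inject_error", 0600, p->ddir, &p->inject_error);

	p->dfs_table0.array = p->table0;
	p->dfs_table0.n_elements = ARRAY_SIZE(p->table0);
	debugfs_create_u32_array("table0", 0400, p->ddir, &p->dfs_table0);
}

static void my_ports_exit(struct my_ports *p)
{
	/* ...so teardown is one recursive remove of that directory. */
	debugfs_remove_recursive(p->ddir);
}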
@@ -196,6 +198,9 @@ int nsim_udp_tunnels_info_create(struct nsim_dev *nsim_dev, void nsim_udp_tunnels_info_destroy(struct net_device *dev) { + struct netdevsim *ns = netdev_priv(dev); + + debugfs_remove_recursive(ns->udp_ports.ddir); kfree(dev->udp_tunnel_nic_info); dev->udp_tunnel_nic_info = NULL; } diff --git a/drivers/net/phy/marvell-88q2xxx.c b/drivers/net/phy/marvell-88q2xxx.c index 4494b3e39ce2..a3996471a1c9 100644 --- a/drivers/net/phy/marvell-88q2xxx.c +++ b/drivers/net/phy/marvell-88q2xxx.c @@ -95,6 +95,10 @@ #define MDIO_MMD_PCS_MV_TDR_OFF_CUTOFF 65246 +struct mv88q2xxx_priv { + bool enable_temp; +}; + struct mmd_val { int devad; u32 regnum; @@ -710,17 +714,12 @@ static const struct hwmon_chip_info mv88q2xxx_hwmon_chip_info = { static int mv88q2xxx_hwmon_probe(struct phy_device *phydev) { + struct mv88q2xxx_priv *priv = phydev->priv; struct device *dev = &phydev->mdio.dev; struct device *hwmon; char *hwmon_name; - int ret; - - /* Enable temperature sense */ - ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, MDIO_MMD_PCS_MV_TEMP_SENSOR2, - MDIO_MMD_PCS_MV_TEMP_SENSOR2_DIS_MASK, 0); - if (ret < 0) - return ret; + priv->enable_temp = true; hwmon_name = devm_hwmon_sanitize_name(dev, dev_name(dev)); if (IS_ERR(hwmon_name)) return PTR_ERR(hwmon_name); @@ -743,6 +742,14 @@ static int mv88q2xxx_hwmon_probe(struct phy_device *phydev) static int mv88q2xxx_probe(struct phy_device *phydev) { + struct mv88q2xxx_priv *priv; + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + return mv88q2xxx_hwmon_probe(phydev); } @@ -810,6 +817,18 @@ static int mv88q222x_revb1_revb2_config_init(struct phy_device *phydev) static int mv88q222x_config_init(struct phy_device *phydev) { + struct mv88q2xxx_priv *priv = phydev->priv; + int ret; + + /* Enable temperature sense */ + if (priv->enable_temp) { + ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, + MDIO_MMD_PCS_MV_TEMP_SENSOR2, + MDIO_MMD_PCS_MV_TEMP_SENSOR2_DIS_MASK, 0); + if (ret < 0) + return ret; + } + if (phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] == PHY_ID_88Q2220_REVB0) return mv88q222x_revb0_config_init(phydev); else diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 323717a4821f..34231b5b9175 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -1297,6 +1297,8 @@ static int nxp_c45_soft_reset(struct phy_device *phydev) if (ret) return ret; + usleep_range(2000, 2050); + return phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1, VEND1_DEVICE_CONTROL, ret, !(ret & DEVICE_CONTROL_RESET), 20000, diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 46afb95ffabe..a19789b57190 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -61,7 +61,18 @@ #define IPHETH_USBINTF_PROTO 1 #define IPHETH_IP_ALIGN 2 /* padding at front of URB */ -#define IPHETH_NCM_HEADER_SIZE (12 + 96) /* NCMH + NCM0 */ +/* On iOS devices, NCM headers in RX have a fixed size regardless of DPE count: + * - NTH16 (NCMH): 12 bytes, as per CDC NCM 1.0 spec + * - NDP16 (NCM0): 96 bytes, of which + * - NDP16 fixed header: 8 bytes + * - maximum of 22 DPEs (21 datagrams + trailer), 4 bytes each + */ +#define IPHETH_NDP16_MAX_DPE 22 +#define IPHETH_NDP16_HEADER_SIZE (sizeof(struct usb_cdc_ncm_ndp16) + \ + IPHETH_NDP16_MAX_DPE * \ + sizeof(struct usb_cdc_ncm_dpe16)) +#define IPHETH_NCM_HEADER_SIZE (sizeof(struct usb_cdc_ncm_nth16) + \ + IPHETH_NDP16_HEADER_SIZE) #define IPHETH_TX_BUF_SIZE ETH_FRAME_LEN #define 
IPHETH_RX_BUF_SIZE_LEGACY (IPHETH_IP_ALIGN + ETH_FRAME_LEN) #define IPHETH_RX_BUF_SIZE_NCM 65536 @@ -207,15 +218,23 @@ static int ipheth_rcvbulk_callback_legacy(struct urb *urb) return ipheth_consume_skb(buf, len, dev); } +/* In "NCM mode", the iOS device encapsulates RX (phone->computer) traffic + * in NCM Transfer Blocks (similarly to CDC NCM). However, unlike reverse + * tethering (handled by the `cdc_ncm` driver), regular tethering is not + * compliant with the CDC NCM spec, as the device is missing the necessary + * descriptors, and TX (computer->phone) traffic is not encapsulated + * at all. Thus `ipheth` implements a very limited subset of the spec with + * the sole purpose of parsing RX URBs. + */ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) { struct usb_cdc_ncm_nth16 *ncmh; struct usb_cdc_ncm_ndp16 *ncm0; struct usb_cdc_ncm_dpe16 *dpe; struct ipheth_device *dev; + u16 dg_idx, dg_len; int retval = -EINVAL; char *buf; - int len; dev = urb->context; @@ -226,40 +245,42 @@ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) ncmh = urb->transfer_buffer; if (ncmh->dwSignature != cpu_to_le32(USB_CDC_NCM_NTH16_SIGN) || - le16_to_cpu(ncmh->wNdpIndex) >= urb->actual_length) { - dev->net->stats.rx_errors++; - return retval; - } + /* On iOS, NDP16 directly follows NTH16 */ + ncmh->wNdpIndex != cpu_to_le16(sizeof(struct usb_cdc_ncm_nth16))) + goto rx_error; - ncm0 = urb->transfer_buffer + le16_to_cpu(ncmh->wNdpIndex); - if (ncm0->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN) || - le16_to_cpu(ncmh->wHeaderLength) + le16_to_cpu(ncm0->wLength) >= - urb->actual_length) { - dev->net->stats.rx_errors++; - return retval; - } + ncm0 = urb->transfer_buffer + sizeof(struct usb_cdc_ncm_nth16); + if (ncm0->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)) + goto rx_error; dpe = ncm0->dpe16; - while (le16_to_cpu(dpe->wDatagramIndex) != 0 && - le16_to_cpu(dpe->wDatagramLength) != 0) { - if (le16_to_cpu(dpe->wDatagramIndex) >= urb->actual_length || - le16_to_cpu(dpe->wDatagramIndex) + - le16_to_cpu(dpe->wDatagramLength) > urb->actual_length) { + for (int dpe_i = 0; dpe_i < IPHETH_NDP16_MAX_DPE; ++dpe_i, ++dpe) { + dg_idx = le16_to_cpu(dpe->wDatagramIndex); + dg_len = le16_to_cpu(dpe->wDatagramLength); + + /* Null DPE must be present after last datagram pointer entry + * (3.3.1 USB CDC NCM spec v1.0) + */ + if (dg_idx == 0 && dg_len == 0) + return 0; + + if (dg_idx < IPHETH_NCM_HEADER_SIZE || + dg_idx >= urb->actual_length || + dg_len > urb->actual_length - dg_idx) { dev->net->stats.rx_length_errors++; return retval; } - buf = urb->transfer_buffer + le16_to_cpu(dpe->wDatagramIndex); - len = le16_to_cpu(dpe->wDatagramLength); + buf = urb->transfer_buffer + dg_idx; - retval = ipheth_consume_skb(buf, len, dev); + retval = ipheth_consume_skb(buf, dg_len, dev); if (retval != 0) return retval; - - dpe++; } - return 0; +rx_error: + dev->net->stats.rx_errors++; + return retval; } static void ipheth_rcvbulk_callback(struct urb *urb) diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 01a3b2417a54..ddff6f19ff98 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -71,6 +71,14 @@ #define MSR_SPEED (1<<3) #define MSR_LINK (1<<2) +/* USB endpoints */ +enum rtl8150_usb_ep { + RTL8150_USB_EP_CONTROL = 0, + RTL8150_USB_EP_BULK_IN = 1, + RTL8150_USB_EP_BULK_OUT = 2, + RTL8150_USB_EP_INT_IN = 3, +}; + /* Interrupt pipe data */ #define INT_TSR 0x00 #define INT_RSR 0x01 @@ -867,6 +875,13 @@ static int rtl8150_probe(struct usb_interface *intf, struct 
usb_device *udev = interface_to_usbdev(intf); rtl8150_t *dev; struct net_device *netdev; + static const u8 bulk_ep_addr[] = { + RTL8150_USB_EP_BULK_IN | USB_DIR_IN, + RTL8150_USB_EP_BULK_OUT | USB_DIR_OUT, + 0}; + static const u8 int_ep_addr[] = { + RTL8150_USB_EP_INT_IN | USB_DIR_IN, + 0}; netdev = alloc_etherdev(sizeof(rtl8150_t)); if (!netdev) @@ -880,6 +895,13 @@ static int rtl8150_probe(struct usb_interface *intf, return -ENOMEM; } + /* Verify that all required endpoints are present */ + if (!usb_check_bulk_endpoints(intf, bulk_ep_addr) || + !usb_check_int_endpoints(intf, int_ep_addr)) { + dev_err(&intf->dev, "couldn't find required endpoints\n"); + goto out; + } + tasklet_setup(&dev->tl, rx_fixup); spin_lock_init(&dev->rx_pool_lock); diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index d2023e7131bd..6e6e9f05509a 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -411,6 +411,11 @@ static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb struct tunnel_msg *tmsg; struct net_device *dev; + if (cb->nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct tunnel_msg))) { + NL_SET_ERR_MSG(cb->extack, "Invalid msg length"); + return -EINVAL; + } + tmsg = nlmsg_data(cb->nlh); if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) { diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c index a259f4dd9540..413973d05b43 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c @@ -1479,14 +1479,13 @@ static void mt7603_mac_watchdog_reset(struct mt7603_dev *dev) tasklet_enable(&dev->mt76.pre_tbtt_tasklet); mt7603_beacon_set_timer(dev, -1, beacon_int); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); - napi_schedule(&dev->mt76.tx_napi); - napi_enable(&dev->mt76.napi[0]); - napi_schedule(&dev->mt76.napi[0]); - napi_enable(&dev->mt76.napi[1]); + + local_bh_disable(); + napi_schedule(&dev->mt76.tx_napi); + napi_schedule(&dev->mt76.napi[0]); napi_schedule(&dev->mt76.napi[1]); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/pci.c b/drivers/net/wireless/mediatek/mt76/mt7615/pci.c index 9a278589df4e..68010e27f065 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/pci.c @@ -164,12 +164,16 @@ static int mt7615_pci_resume(struct pci_dev *pdev) dev_err(mdev->dev, "PDMA engine must be reinitialized\n"); mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c index a0ca3bbdfcaf..c2e4e6aabd9f 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c @@ -262,12 +262,14 @@ void mt7615_mac_reset_work(struct work_struct *work) mt76_worker_enable(&dev->mt76.tx_worker); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); - napi_schedule(&dev->mt76.tx_napi); - mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); + } + + local_bh_disable(); + napi_schedule(&dev->mt76.tx_napi); + mt76_for_each_q_rx(&dev->mt76, i) { napi_schedule(&dev->mt76.napi[i]); } 
local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c index 1eb955f3ca13..b456ccd00d58 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c @@ -282,14 +282,16 @@ static int mt76x0e_resume(struct pci_dev *pdev) mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { mt76_queue_rx_reset(dev, i); napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } - napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c index 7d840ad4ae65..a82c75ba26e6 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c @@ -504,12 +504,14 @@ static void mt76x02_watchdog_reset(struct mt76x02_dev *dev) mt76_worker_enable(&dev->mt76.tx_worker); tasklet_enable(&dev->mt76.pre_tbtt_tasklet); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); - napi_schedule(&dev->mt76.tx_napi); - mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); + } + + local_bh_disable(); + napi_schedule(&dev->mt76.tx_napi); + mt76_for_each_q_rx(&dev->mt76, i) { napi_schedule(&dev->mt76.napi[i]); } local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c index 67c9d1caa0bd..727bfdd00b40 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c @@ -151,12 +151,15 @@ mt76x2e_resume(struct pci_dev *pdev) mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index 13bdc0a7174c..2ba6eb3038ce 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -1356,10 +1356,15 @@ mt7915_mac_restart(struct mt7915_dev *dev) mt7915_dma_reset(dev, true); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { if (mdev->q_rx[i].ndesc) { napi_enable(&dev->mt76.napi[i]); + } + } + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + if (mdev->q_rx[i].ndesc) { napi_schedule(&dev->mt76.napi[i]); } } @@ -1419,8 +1424,9 @@ out: if (phy2) clear_bit(MT76_RESET, &phy2->mt76->state); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); @@ -1570,9 +1576,12 @@ void mt7915_mac_reset_work(struct work_struct *work) if (phy2) clear_bit(MT76_RESET, &phy2->mt76->state); - local_bh_disable(); mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); + } + + local_bh_disable(); + mt76_for_each_q_rx(&dev->mt76, i) { napi_schedule(&dev->mt76.napi[i]); } local_bh_enable(); @@ -1581,8 +1590,8 @@ void mt7915_mac_reset_work(struct work_struct *work) mt76_worker_enable(&dev->mt76.tx_worker); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); diff --git 
a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c index ba870e1b05fb..a0c9df3c2cc7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c @@ -523,12 +523,15 @@ static int mt7921_pci_resume(struct device *device) mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c index 2452b1a2d118..881812ba03ff 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c @@ -81,9 +81,12 @@ int mt7921e_mac_reset(struct mt792x_dev *dev) mt792x_wpdma_reset(dev, true); - local_bh_disable(); mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); + } + + local_bh_disable(); + mt76_for_each_q_rx(&dev->mt76, i) { napi_schedule(&dev->mt76.napi[i]); } local_bh_enable(); @@ -115,8 +118,8 @@ int mt7921e_mac_reset(struct mt792x_dev *dev) err = __mt7921_start(&dev->phy); out: - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/pci.c b/drivers/net/wireless/mediatek/mt76/mt7925/pci.c index f36893e20c61..c7b5dc1dbb34 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/pci.c @@ -556,12 +556,15 @@ static int mt7925_pci_resume(struct device *device) mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/pci_mac.c b/drivers/net/wireless/mediatek/mt76/mt7925/pci_mac.c index faedbf766d1a..4578d16bf456 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/pci_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/pci_mac.c @@ -101,12 +101,15 @@ int mt7925e_mac_reset(struct mt792x_dev *dev) mt792x_wpdma_reset(dev, true); - local_bh_disable(); mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); - napi_schedule(&dev->mt76.napi[i]); } napi_enable(&dev->mt76.tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(&dev->mt76, i) { + napi_schedule(&dev->mt76.napi[i]); + } napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c index bc8cba4dca47..019c925ae600 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c @@ -1695,7 +1695,6 @@ mt7996_mac_restart(struct mt7996_dev *dev) mt7996_dma_reset(dev, true); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { if (mtk_wed_device_active(&dev->mt76.mmio.wed) && mt76_queue_is_wed_rro(&mdev->q_rx[i])) @@ -1703,10 +1702,11 @@ mt7996_mac_restart(struct mt7996_dev *dev) if (mdev->q_rx[i].ndesc) { napi_enable(&dev->mt76.napi[i]); + local_bh_disable(); napi_schedule(&dev->mt76.napi[i]); + local_bh_enable(); } } - local_bh_enable(); 
clear_bit(MT76_MCU_RESET, &dev->mphy.state); clear_bit(MT76_STATE_MCU_RUNNING, &dev->mphy.state); @@ -1764,8 +1764,8 @@ out: if (phy3) clear_bit(MT76_RESET, &phy3->mt76->state); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); @@ -1958,23 +1958,23 @@ void mt7996_mac_reset_work(struct work_struct *work) if (phy3) clear_bit(MT76_RESET, &phy3->mt76->state); - local_bh_disable(); mt76_for_each_q_rx(&dev->mt76, i) { if (mtk_wed_device_active(&dev->mt76.mmio.wed) && mt76_queue_is_wed_rro(&dev->mt76.q_rx[i])) continue; napi_enable(&dev->mt76.napi[i]); + local_bh_disable(); napi_schedule(&dev->mt76.napi[i]); + local_bh_enable(); } - local_bh_enable(); tasklet_schedule(&dev->mt76.irq_tasklet); mt76_worker_enable(&dev->mt76.tx_worker); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index ea96a14d72d1..bf6468c56419 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -4,6 +4,7 @@ * * Copyright (C) 2010 OMICRON electronics GmbH */ +#include <linux/compat.h> #include <linux/module.h> #include <linux/posix-clock.h> #include <linux/poll.h> @@ -176,6 +177,9 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, struct timespec64 ts; int enable, err = 0; + if (in_compat_syscall() && cmd != PTP_ENABLE_PPS && cmd != PTP_ENABLE_PPS2) + arg = (unsigned long)compat_ptr(arg); + tsevq = pccontext->private_clkdata; switch (cmd) { diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index b932425ddc6a..35a5994bf64f 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -217,6 +217,11 @@ static int ptp_getcycles64(struct ptp_clock_info *info, struct timespec64 *ts) return info->gettime64(info, ts); } +static int ptp_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *request, int on) +{ + return -EOPNOTSUPP; +} + static void ptp_aux_kworker(struct kthread_work *work) { struct ptp_clock *ptp = container_of(work, struct ptp_clock, @@ -294,6 +299,9 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, ptp->info->getcrosscycles = ptp->info->getcrosststamp; } + if (!ptp->info->enable) + ptp->info->enable = ptp_enable; + if (ptp->info->do_aux_work) { kthread_init_delayed_work(&ptp->aux_work, ptp_aux_kworker); ptp->kworker = kthread_run_worker(0, "ptp%d", ptp->index); diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index fbffd451031f..45bd001206a2 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -245,7 +245,6 @@ static void sclp_request_timeout(bool force_restart); static void sclp_process_queue(void); static void __sclp_make_read_req(void); static int sclp_init_mask(int calculate); -static int sclp_init(void); static void __sclp_queue_read_req(void) @@ -1251,8 +1250,7 @@ static struct platform_driver sclp_pdrv = { /* Initialize SCLP driver. Return zero if driver is operational, non-zero * otherwise. 
*/ -static int -sclp_init(void) +int sclp_init(void) { unsigned long flags; int rc = 0; @@ -1305,13 +1303,7 @@ fail_unlock: static __init int sclp_initcall(void) { - int rc; - - rc = platform_driver_register(&sclp_pdrv); - if (rc) - return rc; - - return sclp_init(); + return platform_driver_register(&sclp_pdrv); } arch_initcall(sclp_initcall); diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index 3dd50ac9c5b0..b2d93a6e36c4 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -123,7 +123,7 @@ static DECLARE_WAIT_QUEUE_HEAD(read_wait_queue); */ static struct vmlogrdr_priv_t sys_ser[] = { - { .system_service = "*LOGREC ", + { .system_service = { '*', 'L', 'O', 'G', 'R', 'E', 'C', ' ' }, .internal_name = "logrec", .recording_name = "EREP", .minor_num = 0, @@ -132,7 +132,7 @@ static struct vmlogrdr_priv_t sys_ser[] = { .autorecording = 1, .autopurge = 1, }, - { .system_service = "*ACCOUNT", + { .system_service = { '*', 'A', 'C', 'C', 'O', 'U', 'N', 'T' }, .internal_name = "account", .recording_name = "ACCOUNT", .minor_num = 1, @@ -141,7 +141,7 @@ static struct vmlogrdr_priv_t sys_ser[] = { .autorecording = 1, .autopurge = 1, }, - { .system_service = "*SYMPTOM", + { .system_service = { '*', 'S', 'Y', 'M', 'P', 'T', 'O', 'M' }, .internal_name = "symptom", .recording_name = "SYMPTOM", .minor_num = 2, @@ -356,7 +356,7 @@ static int vmlogrdr_open (struct inode *inode, struct file *filp) if (connect_rc) { pr_err("vmlogrdr: iucv connection to %s " "failed with rc %i \n", - logptr->system_service, connect_rc); + logptr->internal_name, connect_rc); goto out_path; } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 698c43dd5dc8..f28bc763847a 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -202,7 +202,7 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) return inode->i_sb->s_fs_info; } -static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) +static inline struct v9fs_session_info *v9fs_dentry2v9ses(const struct dentry *dentry) { return dentry->d_sb->s_fs_info; } diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 01338d4c2d9e..5061f192eafd 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -61,7 +61,7 @@ static void v9fs_dentry_release(struct dentry *dentry) p9_fid_put(hlist_entry(p, struct p9_fid, dlist)); } -static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { struct p9_fid *fid; struct inode *inode; @@ -99,14 +99,36 @@ out_valid: return 1; } +static int v9fs_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) +{ + return __v9fs_lookup_revalidate(dentry, flags); +} + +static bool v9fs_dentry_unalias_trylock(const struct dentry *dentry) +{ + struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); + return down_write_trylock(&v9ses->rename_sem); +} + +static void v9fs_dentry_unalias_unlock(const struct dentry *dentry) +{ + struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); + up_write(&v9ses->rename_sem); +} + const struct dentry_operations v9fs_cached_dentry_operations = { .d_revalidate = v9fs_lookup_revalidate, - .d_weak_revalidate = v9fs_lookup_revalidate, + .d_weak_revalidate = __v9fs_lookup_revalidate, .d_delete = v9fs_cached_dentry_delete, .d_release = v9fs_dentry_release, + .d_unalias_trylock = v9fs_dentry_unalias_trylock, + .d_unalias_unlock = v9fs_dentry_unalias_unlock, }; const struct dentry_operations 
v9fs_dentry_operations = { .d_delete = always_delete_dentry, .d_release = v9fs_dentry_release, + .d_unalias_trylock = v9fs_dentry_unalias_trylock, + .d_unalias_unlock = v9fs_dentry_unalias_unlock, }; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index a843c36fc471..02cbf38e1a77 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -23,7 +23,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, struct dir_context *ctx); -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); +static int afs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_iput(struct dentry *dentry, struct inode *inode); static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, @@ -597,19 +598,19 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, * Do a lookup of a single name in a directory * - just returns the FID the dentry name maps to if found */ -static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, +static int afs_do_lookup_one(struct inode *dir, const struct qstr *name, struct afs_fid *fid, afs_dataversion_t *_dir_version) { struct afs_super_info *as = dir->i_sb->s_fs_info; struct afs_lookup_one_cookie cookie = { .ctx.actor = afs_lookup_one_filldir, - .name = dentry->d_name, + .name = *name, .fid.vid = as->volume->vid }; int ret; - _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + _enter("{%lu},{%.*s},", dir->i_ino, name->len, name->name); /* search the directory */ ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version); @@ -1023,21 +1024,12 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, /* * Check the validity of a dentry under RCU conditions. */ -static int afs_d_revalidate_rcu(struct dentry *dentry) +static int afs_d_revalidate_rcu(struct afs_vnode *dvnode, struct dentry *dentry) { - struct afs_vnode *dvnode; - struct dentry *parent; - struct inode *dir; long dir_version, de_version; _enter("%p", dentry); - /* Check the parent directory is still valid first. */ - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - dvnode = AFS_FS_I(dir); if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) return -ECHILD; @@ -1065,11 +1057,11 @@ static int afs_d_revalidate_rcu(struct dentry *dentry) * - NOTE! 
the hit can be a negative hit too, so we can't assume we have an * inode */ -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode, *dir; + struct afs_vnode *vnode, *dir = AFS_FS_I(parent_dir); struct afs_fid fid; - struct dentry *parent; struct inode *inode; struct key *key; afs_dataversion_t dir_version, invalid_before; @@ -1077,7 +1069,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) int ret; if (flags & LOOKUP_RCU) - return afs_d_revalidate_rcu(dentry); + return afs_d_revalidate_rcu(dir, dentry); if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); @@ -1092,14 +1084,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (IS_ERR(key)) key = NULL; - /* Hold the parent dentry so we can peer at it */ - parent = dget_parent(dentry); - dir = AFS_FS_I(d_inode(parent)); - /* validate the parent directory */ ret = afs_validate(dir, key); if (ret == -ERESTARTSYS) { - dput(parent); key_put(key); return ret; } @@ -1127,7 +1114,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, name, &fid, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1171,22 +1158,19 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_valid; default: - _debug("failed to iterate dir %pd: %d", - parent, ret); + _debug("failed to iterate parent %pd2: %d", dentry, ret); goto not_found; } out_valid: dentry->d_fsdata = (void *)(unsigned long)dir_version; out_valid_noupdate: - dput(parent); key_put(key); _leave(" = 1 [valid]"); return 1; not_found: _debug("dropping dentry %pd2", dentry); - dput(parent); key_put(key); _leave(" = 0 [bad]"); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 672ca2c1d37d..ca755e8d1a37 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -24,7 +24,10 @@ do { \ } while (0) const char * const bch2_btree_node_flags[] = { -#define x(f) #f, + "typebit", + "typebit", + "typebit", +#define x(f) [BTREE_NODE_##f] = #f, BTREE_FLAGS() #undef x NULL diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 367231ab1980..5988219c6908 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2239,8 +2239,6 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (!k.k) return k; @@ -2251,6 +2249,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos iter->k = u; k.k = &iter->k; + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); return k; } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 3b62296c3100..c378b97ebeca 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -291,8 +291,10 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, unsigned flags) { - if (flags & BTREE_ITER_cached_nofill) + if (flags & BTREE_ITER_cached_nofill) { + ck_path->l[0].b = NULL; return 
0; + } struct bch_fs *c = trans->c; struct btree_iter iter; diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 6b79b672e0b1..2760dd9569ed 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, flags, trans); } #define JSET_ENTRY_LOG_U64s 4 diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index f99ff1819597..114bf2f3879f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -4,6 +4,7 @@ #include "compress.h" #include "error.h" #include "extents.h" +#include "io_write.h" #include "opts.h" #include "super-io.h" @@ -254,11 +255,14 @@ err: goto out; } -int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - struct bch_extent_crc_unpacked *crc) +int bch2_bio_uncompress_inplace(struct bch_write_op *op, + struct bio *bio) { + struct bch_fs *c = op->c; + struct bch_extent_crc_unpacked *crc = &op->crc; struct bbuf data = { NULL }; size_t dst_len = crc->uncompressed_size << 9; + int ret = 0; /* bio must own its pages: */ BUG_ON(!bio->bi_vcnt); @@ -266,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || crc->compressed_size << 9 > c->opts.encoded_extent_max) { - bch_err(c, "error rewriting existing data: extent too big"); + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: extent too big"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } data = __bounce_alloc(c, dst_len, WRITE); if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) - bch_err(c, "error rewriting existing data: decompression error"); - bio_unmap_or_unbounce(c, data); - return -EIO; + if (!c->opts.no_data_io) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: decompression error"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -EIO; + goto err; } /* @@ -293,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, crc->uncompressed_size = crc->live_size; crc->offset = 0; crc->csum = (struct bch_csum) { 0, 0 }; - +err: bio_unmap_or_unbounce(c, data); - return 0; + return ret; } int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 607fd5e232c9..bec2f05bfd52 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,8 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } -int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, - struct bch_extent_crc_unpacked *); +struct bch_write_op; +int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, struct bvec_iter, struct bch_extent_crc_unpacked); unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 585214931e05..337494facac6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -91,15 +91,28 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct 
moving_context *ctxt, struc return true; } -static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) +static noinline void trace_move_extent_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) { - if (trace_move_extent_finish_enabled()) { - struct printbuf buf = PRINTBUF; + struct bch_fs *c = u->op.c; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_finish(c, buf.buf); - printbuf_exit(&buf); - } + prt_newline(&buf); + + bch2_data_update_to_text(&buf, u); + prt_newline(&buf); + + prt_str_indented(&buf, "new replicas:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + prt_newline(&buf); + + prt_str_indented(&buf, "insert:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_newline(&buf); + + trace_move_extent_finish(c, buf.buf); + printbuf_exit(&buf); } static void trace_move_extent_fail2(struct data_update *m, @@ -372,7 +385,8 @@ restart_drop_extra_replicas: bch2_btree_iter_set_pos(&iter, next_pos); this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); + if (trace_move_extent_finish_enabled()) + trace_move_extent_finish2(m, &new->k_i, insert); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -525,34 +539,38 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, struct data_update_opts *data_opts) { printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:\t"); + + prt_str_indented(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); prt_newline(out); - prt_str(out, "kill ptrs:\t"); + prt_str_indented(out, "kill ptrs:\t"); bch2_prt_u64_base2(out, data_opts->kill_ptrs); prt_newline(out); - prt_str(out, "target:\t"); + prt_str_indented(out, "target:\t"); bch2_target_to_text(out, c, data_opts->target); prt_newline(out); - prt_str(out, "compression:\t"); + prt_str_indented(out, "compression:\t"); bch2_compression_opt_to_text(out, io_opts->background_compression); prt_newline(out); - prt_str(out, "opts.replicas:\t"); + prt_str_indented(out, "opts.replicas:\t"); prt_u64(out, io_opts->data_replicas); + prt_newline(out); - prt_str(out, "extra replicas:\t"); + prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) { - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); - prt_newline(out); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + prt_newline(out); + + prt_str_indented(out, "old key:\t"); + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); } int bch2_extent_drop_ptrs(struct btree_trans *trans, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index b5de52a50d10..55333e82d1fe 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -20,6 +20,7 @@ #include "extents.h" #include "fsck.h" #include "inode.h" +#include "journal_reclaim.h" #include "super.h" #include <linux/console.h> diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 3e71860f66b9..dd508d93e9fc 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -406,7 +406,7 @@ static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, op->flags & BCH_WRITE_MOVE ? 
"(internal move)" : ""); } -static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) { __bch2_write_op_error(out, op, op->pos.offset); } @@ -873,7 +873,7 @@ static enum prep_encoded_ret { if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; - if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + if (bch2_bio_uncompress_inplace(op, bio)) return PREP_ENCODED_ERR; } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 5400ce94ee57..b4626013abc8 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -20,6 +20,8 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); + #define BCH_WRITE_FLAGS() \ x(ALLOC_NOWAIT) \ x(CACHED) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2cd20114b74b..cb2c3722f674 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(p->list); i++) - INIT_LIST_HEAD(&p->list[i]); - INIT_LIST_HEAD(&p->flushed); + for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) + INIT_LIST_HEAD(&p->unflushed[i]); + for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) + INIT_LIST_HEAD(&p->flushed[i]); atomic_set(&p->count, count); p->devs.nr = 0; } @@ -601,6 +600,16 @@ out: : -BCH_ERR_journal_res_get_blocked; } +static unsigned max_dev_latency(struct bch_fs *c) +{ + u64 nsecs = 0; + + for_each_rw_member(c, ca) + nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); + + return nsecs_to_jiffies(nsecs); +} + /* * Essentially the entry function to the journaling code. When bcachefs is doing * a btree insert, it calls this function to get the current journal write. @@ -612,17 +621,31 @@ out: * btree node write locks. */ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned flags) + unsigned flags, + struct btree_trans *trans) { int ret; if (closure_wait_event_timeout(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK), - HZ * 10)) + HZ)) return ret; + if (trans) + bch2_trans_unlock_long(trans); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); + + remaining_wait = max(0, remaining_wait - HZ); + + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + remaining_wait)) + return ret; + struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); bch_err(c, "Journal stuck? 
Waited for 10 seconds...\n%s", @@ -727,7 +750,7 @@ recheck_need_open: * livelock: */ sched_annotate_sleep(); - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; @@ -848,7 +871,7 @@ out: static int __bch2_journal_meta(struct journal *j) { struct journal_res res = {}; - int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; @@ -1602,54 +1625,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) __bch2_journal_debug_to_text(out, j); spin_unlock(&j->lock); } - -bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *pin; - - spin_lock(&j->lock); - if (!test_bit(JOURNAL_running, &j->flags)) { - spin_unlock(&j->lock); - return true; - } - - *seq = max(*seq, j->pin.front); - - if (*seq >= j->pin.back) { - spin_unlock(&j->lock); - return true; - } - - out->atomic++; - - pin_list = journal_seq_pin(j, *seq); - - prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); - printbuf_indent_add(out, 2); - - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) - list_for_each_entry(pin, &pin_list->list[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - if (!list_empty(&pin_list->flushed)) - prt_printf(out, "flushed:\n"); - - list_for_each_entry(pin, &pin_list->flushed, list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - printbuf_indent_sub(out, 2); - - --out->atomic; - spin_unlock(&j->lock); - - return false; -} - -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -{ - u64 seq = 0; - - while (!bch2_journal_seq_pins_to_text(out, j, &seq)) - seq++; -} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index cb0df0663946..dccddd5420ad 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -312,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j, } int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned); + unsigned, struct btree_trans *); /* First bits for BCH_WATERMARK: */ enum journal_res_flags { @@ -368,7 +368,8 @@ static inline int journal_res_get_fast(struct journal *j, } static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s, unsigned flags) + unsigned u64s, unsigned flags, + struct btree_trans *trans) { int ret; @@ -380,7 +381,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re if (journal_res_get_fast(j, res, flags)) goto out; - ret = bch2_journal_res_get_slowpath(j, res, flags); + ret = bch2_journal_res_get_slowpath(j, res, flags, trans); if (ret) return ret; out: @@ -429,8 +430,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7f2efe85a805..11c39e0c34f4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,7 @@ #include "sb-clean.h" #include "trace.h" +#include <linux/ioprio.h> #include 
<linux/string_choices.h> void bch2_journal_pos_from_member_info_set(struct bch_fs *c) @@ -1763,6 +1764,7 @@ static CLOSURE_CALLBACK(journal_write_submit) bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); ca->prev_journal_sector = bio->bi_iter.bi_sector; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3c8242606da7..6a9cefb635d6 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j) popped = true; } - if (popped) + if (popped) { bch2_journal_space_available(j); + __closure_wake_up(&j->reclaim_flush_wait); + } } bool __bch2_journal_pin_put(struct journal *j, u64 seq) @@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j, pin->seq = 0; list_del_init(&pin->list); + if (j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + /* * Unpinning a journal entry may make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: @@ -383,11 +388,11 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || fn == bch2_btree_node_flush1) - return JOURNAL_PIN_btree; + return JOURNAL_PIN_TYPE_btree; else if (fn == bch2_btree_key_cache_journal_flush) - return JOURNAL_PIN_key_cache; + return JOURNAL_PIN_TYPE_key_cache; else - return JOURNAL_PIN_other; + return JOURNAL_PIN_TYPE_other; } static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, @@ -406,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, atomic_inc(&pin_list->count); pin->seq = seq; pin->flush = flush_fn; - list_add(&pin->list, &pin_list->list[type]); + + if (list_empty(&pin_list->unflushed[type]) && + j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + + list_add(&pin->list, &pin_list->unflushed[type]); } void bch2_journal_pin_copy(struct journal *j, @@ -499,16 +509,15 @@ journal_get_next_pin(struct journal *j, { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - unsigned i; fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { if (*seq > seq_to_flush && !allowed_above_seq) break; - for (i = 0; i < JOURNAL_PIN_NR; i++) - if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || - ((1U << i) & allowed_above_seq)) { - ret = list_first_entry_or_null(&pin_list->list[i], + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) || + (BIT(i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->unflushed[i], struct journal_entry_pin, list); if (ret) return ret; @@ -544,8 +553,8 @@ static size_t journal_flush_pins(struct journal *j, } if (min_key_cache) { - allowed_above |= 1U << JOURNAL_PIN_key_cache; - allowed_below |= 1U << JOURNAL_PIN_key_cache; + allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); + allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); } cond_resched(); @@ -553,7 +562,9 @@ static size_t journal_flush_pins(struct journal *j, j->last_flushed = jiffies; spin_lock(&j->lock); - pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); + pin = journal_get_next_pin(j, seq_to_flush, + allowed_below, + allowed_above, &seq); if (pin) { BUG_ON(j->flush_in_progress); j->flush_in_progress = pin; @@ -576,7 +587,7 @@ 
static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -816,10 +827,41 @@ int bch2_journal_reclaim_start(struct journal *j) return 0; } +static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + struct journal_entry_pin_list *pin_list; + u64 seq; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { + if (seq > seq_to_flush) + break; + + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if ((BIT(i) & types) && + (!list_empty(&pin_list->unflushed[i]) || + !list_empty(&pin_list->flushed[i]))) { + spin_unlock(&j->lock); + return true; + } + } + spin_unlock(&j->lock); + + return false; +} + +static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || + journal_pins_still_flushing(j, seq_to_flush, types); +} + static int journal_flush_done(struct journal *j, u64 seq_to_flush, bool *did_work) { - int ret; + int ret = 0; ret = bch2_journal_error(j); if (ret) @@ -827,12 +869,18 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_key_cache)| - (1U << JOURNAL_PIN_other), 0, 0, 0) || - journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_btree), 0, 0, 0)) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, + BIT(JOURNAL_PIN_TYPE_key_cache)| + BIT(JOURNAL_PIN_TYPE_other))) { + *did_work = true; + goto unlock; + } + + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, + BIT(JOURNAL_PIN_TYPE_btree))) { *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); @@ -847,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, !fifo_used(&j->pin); spin_unlock(&j->lock); +unlock: mutex_unlock(&j->reclaim_lock); return ret; @@ -860,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) if (!test_bit(JOURNAL_running, &j->flags)) return false; - closure_wait_event(&j->async_wait, + closure_wait_event(&j->reclaim_flush_wait, journal_flush_done(j, seq_to_flush, &did_work)); return did_work; @@ -926,3 +975,54 @@ err: return ret; } + +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + + spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + + *seq = max(*seq, j->pin.front); + + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + + out->atomic++; + + pin_list = journal_seq_pin(j, *seq); + + prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); + printbuf_indent_add(out, 2); + + prt_printf(out, "unflushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) + list_for_each_entry(pin, &pin_list->unflushed[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); + + prt_printf(out, "flushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) + list_for_each_entry(pin, &pin_list->flushed[i], list) + prt_printf(out, "\t%px 
%ps\n", pin, pin->flush); + + printbuf_indent_sub(out, 2); + + --out->atomic; + spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index ec84c3345281..0a73d7134e1c 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j) int bch2_journal_flush_device_pins(struct journal *, int); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index e9bd716fbb71..3ba433a48eb8 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -53,15 +53,15 @@ struct journal_buf { */ enum journal_pin_type { - JOURNAL_PIN_btree, - JOURNAL_PIN_key_cache, - JOURNAL_PIN_other, - JOURNAL_PIN_NR, + JOURNAL_PIN_TYPE_btree, + JOURNAL_PIN_TYPE_key_cache, + JOURNAL_PIN_TYPE_other, + JOURNAL_PIN_TYPE_NR, }; struct journal_entry_pin_list { - struct list_head list[JOURNAL_PIN_NR]; - struct list_head flushed; + struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; + struct list_head flushed[JOURNAL_PIN_TYPE_NR]; atomic_t count; struct bch_devs_list devs; }; @@ -226,6 +226,7 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; + struct closure_waitlist reclaim_flush_wait; struct delayed_work write_work; struct workqueue_struct *wq; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 85c361e78ba5..21805509ab9e 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -215,7 +215,8 @@ static int bch2_copygc(struct moving_context *ctxt, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; - u64 moved = atomic64_read(&ctxt->stats->sectors_moved); + u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); + u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); @@ -245,7 +246,6 @@ static int bch2_copygc(struct moving_context *ctxt, *did_work = true; } err: - darray_exit(&buckets); /* no entries in LRU btree found, or got to end: */ if (bch2_err_matches(ret, ENOENT)) @@ -254,8 +254,11 @@ err: if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "from bch2_move_data()"); - moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; - trace_and_count(c, copygc, c, moved, 0, 0, 0); + sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; + sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; + trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + + darray_exit(&buckets); return ret; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e763d52e0f38..a182b5d454ba 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -476,13 +476,13 @@ enum fsck_err_opts { NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ x(copygc_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable copygc: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ 
x(rebalance_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: disable for debugging, or to\n"\ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 0b4fe899209b..ea0a18364751 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -57,7 +57,7 @@ enum bch_fsck_flags { x(bset_wrong_sector_offset, 44, 0) \ x(bset_empty, 45, 0) \ x(bset_bad_seq, 46, 0) \ - x(bset_blacklisted_journal_seq, 47, 0) \ + x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ x(btree_node_bad_btree, 49, 0) \ x(btree_node_bad_level, 50, 0) \ diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 8c2c5539de2e..d78451c2a0c6 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -31,11 +31,11 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir } } -static int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) +static noinline int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) { struct qstr old_name = bch2_dirent_get_name(old); struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); @@ -71,11 +71,11 @@ static int fsck_rename_dirent(struct btree_trans *trans, return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); } -static int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) +static noinline int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) { if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) @@ -142,8 +142,8 @@ fsck_err: * All versions of the same inode in different snapshots must have the same hash * seed/type: verify that the hash info we're using matches the root */ -static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, - struct bch_hash_info *hash_info) +static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) { struct bch_fs *c = trans->c; struct btree_iter iter; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 9d40b7d4ea29..56a5a7fbc0fd 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -902,32 +902,30 @@ TRACE_EVENT(evacuate_bucket, TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, - u64 sectors_moved, u64 sectors_not_moved, - u64 buckets_moved, u64 buckets_not_moved), - TP_ARGS(c, - sectors_moved, sectors_not_moved, - buckets_moved, buckets_not_moved), + u64 buckets, + u64 sectors_seen, + u64 sectors_moved), + TP_ARGS(c, buckets, sectors_seen, sectors_moved), TP_STRUCT__entry( __field(dev_t, dev ) + __field(u64, buckets ) + __field(u64, sectors_seen ) __field(u64, sectors_moved ) - __field(u64, sectors_not_moved ) - __field(u64, buckets_moved ) - __field(u64, buckets_not_moved ) ), TP_fast_assign( __entry->dev = c->dev; + __entry->buckets = buckets; + __entry->sectors_seen = sectors_seen; __entry->sectors_moved = sectors_moved; - __entry->sectors_not_moved = sectors_not_moved; - 
__entry->buckets_moved = buckets_moved; - __entry->buckets_not_moved = buckets_moved; ), - TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", + TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->sectors_moved, __entry->sectors_not_moved, - __entry->buckets_moved, __entry->buckets_not_moved) + __entry->buckets, + __entry->sectors_seen, + __entry->sectors_moved) ); TRACE_EVENT(copygc_wait, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0bf388e07a02..62e99e65250d 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1940,29 +1940,19 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, /* * Check if cached dentry can be trusted. */ -static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) +static int ceph_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; int valid = 0; - struct dentry *parent; - struct inode *dir, *inode; + struct inode *inode; - valid = fscrypt_d_revalidate(dentry, flags); + valid = fscrypt_d_revalidate(dir, name, dentry, flags); if (valid <= 0) return valid; - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - inode = d_inode_rcu(dentry); - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - inode = d_inode(dentry); - } + inode = d_inode_rcu(dentry); doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n", dentry, dentry, inode, ceph_dentry(dentry)->offset, @@ -2008,6 +1998,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) req->r_parent = dir; ihold(dir); + req->r_dname = name; + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) mask |= CEPH_CAP_XATTR_SHARED; @@ -2038,9 +2030,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? 
"valid" : "invalid"); if (!valid) ceph_dir_clear_complete(dir); - - if (!(flags & LOOKUP_RCU)) - dput(parent); return valid; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 785fe489ef4b..d7a06b9aaef6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2621,6 +2621,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) { struct inode *dir = req->r_parent; struct dentry *dentry = req->r_dentry; + const struct qstr *name = req->r_dname; u8 *cryptbuf = NULL; u32 len = 0; int ret = 0; @@ -2641,8 +2642,10 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) if (!fscrypt_has_encryption_key(dir)) goto success; - if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX, - &len)) { + if (!name) + name = &dentry->d_name; + + if (!fscrypt_fname_encrypted_size(dir, name->len, NAME_MAX, &len)) { WARN_ON_ONCE(1); return ERR_PTR(-ENAMETOOLONG); } @@ -2657,7 +2660,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) if (!cryptbuf) return ERR_PTR(-ENOMEM); - ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len); + ret = fscrypt_fname_encrypt(dir, name, cryptbuf, len); if (ret) { kfree(cryptbuf); return ERR_PTR(ret); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 38bb7e0d2d79..7c9fee9e80d4 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -299,6 +299,8 @@ struct ceph_mds_request { struct inode *r_target_inode; /* resulting inode */ struct inode *r_new_inode; /* new inode (for creates) */ + const struct qstr *r_dname; /* stable name (for ->d_revalidate) */ + #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ #define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 4e552ba7bd43..a3e2dfeedfbf 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -445,7 +445,8 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) } /* called when a cache lookup succeeds */ -static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) +static int coda_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *de, unsigned int flags) { struct inode *inode; struct coda_inode_info *cii; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 0ad52fbe51c9..010f9c0a4c2f 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -574,11 +574,10 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. */ -int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct dentry *dir; int err; - int valid; /* * Plaintext names are always valid, since fscrypt doesn't support @@ -591,30 +590,21 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) /* * No-key name; valid if the directory's key is still unavailable. * - * Although fscrypt forbids rename() on no-key names, we still must use - * dget_parent() here rather than use ->d_parent directly. That's - * because a corrupted fs image may contain directory hard links, which - * the VFS handles by moving the directory's dentry tree in the dcache - * each time ->lookup() finds the directory and it already has a dentry - * elsewhere. 
Thus ->d_parent can be changing, and we must safely grab - * a reference to some ->d_parent to prevent it from being freed. + * Note in RCU mode we have to bail if we get here - + * fscrypt_get_encryption_info() may block. */ if (flags & LOOKUP_RCU) return -ECHILD; - dir = dget_parent(dentry); /* * Pass allow_unsupported=true, so that files with an unsupported * encryption policy can be deleted. */ - err = fscrypt_get_encryption_info(d_inode(dir), true); - valid = !fscrypt_has_encryption_key(d_inode(dir)); - dput(dir); - + err = fscrypt_get_encryption_info(dir, true); if (err < 0) return err; - return valid; + return !fscrypt_has_encryption_key(dir); } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); diff --git a/fs/dcache.c b/fs/dcache.c index 1cd929f17eec..9cc0d47da321 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -295,12 +295,16 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c return dentry_string_cmp(cs, ct, tcount); } +/* + * long names are allocated separately from dentry and never modified. + * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot() + * for the reason why ->count and ->head can't be combined into a union. + * dentry_string_cmp() relies upon ->name[] being word-aligned. + */ struct external_name { - union { - atomic_t count; - struct rcu_head head; - } u; - unsigned char name[]; + atomic_t count; + struct rcu_head head; + unsigned char name[] __aligned(sizeof(unsigned long)); }; static inline struct external_name *external_name(struct dentry *dentry) @@ -324,31 +328,45 @@ static void __d_free_external(struct rcu_head *head) static inline int dname_external(const struct dentry *dentry) { - return dentry->d_name.name != dentry->d_iname; + return dentry->d_name.name != dentry->d_shortname.string; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { - spin_lock(&dentry->d_lock); - name->name = dentry->d_name; - if (unlikely(dname_external(dentry))) { - atomic_inc(&external_name(dentry)->u.count); + unsigned seq; + const unsigned char *s; + + rcu_read_lock(); +retry: + seq = read_seqcount_begin(&dentry->d_seq); + s = READ_ONCE(dentry->d_name.name); + name->name.hash_len = dentry->d_name.hash_len; + name->name.name = name->inline_name.string; + if (likely(s == dentry->d_shortname.string)) { + name->inline_name = dentry->d_shortname; } else { - memcpy(name->inline_name, dentry->d_iname, - dentry->d_name.len + 1); - name->name.name = name->inline_name; + struct external_name *p; + p = container_of(s, struct external_name, name[0]); + // get a valid reference + if (unlikely(!atomic_inc_not_zero(&p->count))) + goto retry; + name->name.name = s; } - spin_unlock(&dentry->d_lock); + if (read_seqcount_retry(&dentry->d_seq, seq)) { + release_dentry_name_snapshot(name); + goto retry; + } + rcu_read_unlock(); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { - if (unlikely(name->name.name != name->inline_name)) { + if (unlikely(name->name.name != name->inline_name.string)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); - if (unlikely(atomic_dec_and_test(&p->u.count))) - kfree_rcu(p, u.head); + if (unlikely(atomic_dec_and_test(&p->count))) + kfree_rcu(p, head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -386,7 +404,7 @@ static void dentry_free(struct dentry *dentry) WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); - 
if (likely(atomic_dec_and_test(&p->u.count))) { + if (likely(atomic_dec_and_test(&p->count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } @@ -1654,10 +1672,10 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ - dentry->d_iname[DNAME_INLINE_LEN-1] = 0; + dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; - dname = dentry->d_iname; + dname = dentry->d_shortname.string; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, @@ -1667,10 +1685,10 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&p->u.count, 1); + atomic_set(&p->count, 1); dname = p->name; } else { - dname = dentry->d_iname; + dname = dentry->d_shortname.string; } dentry->d_name.len = name->len; @@ -2728,10 +2746,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:internal, target:external. Steal target's * storage and make target internal. */ - memcpy(target->d_iname, dentry->d_name.name, - dentry->d_name.len + 1); dentry->d_name.name = target->d_name.name; - target->d_name.name = target->d_iname; + target->d_shortname = dentry->d_shortname; + target->d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { @@ -2739,20 +2756,16 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); target->d_name.name = dentry->d_name.name; - dentry->d_name.name = dentry->d_iname; + dentry->d_shortname = target->d_shortname; + dentry->d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. 
*/ - unsigned int i; - BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); - for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { - swap(((long *) &dentry->d_iname)[i], - ((long *) &target->d_iname)[i]); - } + for (int i = 0; i < DNAME_INLINE_WORDS; i++) + swap(dentry->d_shortname.words[i], + target->d_shortname.words[i]); } } swap(dentry->d_name.hash_len, target->d_name.hash_len); @@ -2764,16 +2777,15 @@ static void copy_name(struct dentry *dentry, struct dentry *target) if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { - atomic_inc(&external_name(target)->u.count); + atomic_inc(&external_name(target)->count); dentry->d_name = target->d_name; } else { - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); - dentry->d_name.name = dentry->d_iname; + dentry->d_shortname = target->d_shortname; + dentry->d_name.name = dentry->d_shortname.string; dentry->d_name.hash_len = target->d_name.hash_len; } - if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - kfree_rcu(old_name, u.head); + if (old_name && likely(atomic_dec_and_test(&old_name->count))) + kfree_rcu(old_name, head); } /* @@ -2954,7 +2966,12 @@ static int __d_unalias(struct dentry *dentry, struct dentry *alias) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: + if (alias->d_op->d_unalias_trylock && + !alias->d_op->d_unalias_trylock(alias)) + goto out_err; __d_move(alias, dentry, false); + if (alias->d_op->d_unalias_unlock) + alias->d_op->d_unalias_unlock(alias); ret = 0; out_err: if (m2) @@ -3102,12 +3119,12 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; - BUG_ON(dentry->d_name.name != dentry->d_iname || + BUG_ON(dname_external(dentry) || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", + dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); @@ -3195,7 +3212,7 @@ static void __init dcache_init(void) */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, - d_iname); + d_shortname.string); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index acaa0825e9bb..1dfd5b81d831 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -17,7 +17,9 @@ /** * ecryptfs_d_revalidate - revalidate an ecryptfs dentry - * @dentry: The ecryptfs dentry + * @dir: inode of expected parent + * @name: expected name + * @dentry: dentry to revalidate * @flags: lookup flags * * Called when the VFS needs to revalidate a dentry. This @@ -28,7 +30,8 @@ * Returns 1 if valid, 0 otherwise. 
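The __d_unalias() hunk above gives a filesystem a veto over d_splice_alias() moving a preexisting directory alias: returning false from the trylock hook makes the splice fail with -ESTALE instead of moving the dentry. A sketch of how a filesystem that needs a stable path across a blocking operation (as 9p does) might wire the new hooks; examplefs_* and rename_mutex are made-up names, not an existing implementation:

#include <linux/dcache.h>
#include <linux/mutex.h>

struct examplefs_sb_info {
	struct mutex rename_mutex;	/* illustrative: held while a full path must stay stable */
};

static bool examplefs_d_unalias_trylock(const struct dentry *dentry)
{
	struct examplefs_sb_info *sbi = dentry->d_sb->s_fs_info;

	/* Refusing here keeps d_splice_alias() from moving this alias. */
	return mutex_trylock(&sbi->rename_mutex);
}

static void examplefs_d_unalias_unlock(const struct dentry *dentry)
{
	struct examplefs_sb_info *sbi = dentry->d_sb->s_fs_info;

	mutex_unlock(&sbi->rename_mutex);
}

static const struct dentry_operations examplefs_dentry_ops = {
	.d_unalias_trylock	= examplefs_d_unalias_trylock,
	.d_unalias_unlock	= examplefs_d_unalias_unlock,
};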
* */ -static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); int rc = 1; @@ -36,8 +39,15 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) - rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) { + struct inode *lower_dir = ecryptfs_inode_to_lower(dir); + struct name_snapshot n; + + take_dentry_name_snapshot(&n, lower_dentry); + rc = lower_dentry->d_op->d_revalidate(lower_dir, &n.name, + lower_dentry, flags); + release_dentry_name_snapshot(&n); + } if (d_really_is_positive(dentry)) { struct inode *inode = d_inode(dentry); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 099f80645072..691dd77b6ab5 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -31,10 +31,9 @@ static inline void exfat_d_version_set(struct dentry *dentry, * If it happened, the negative dentry isn't actually negative anymore. So, * drop it. */ -static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags) +static int exfat_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - int ret; - if (flags & LOOKUP_RCU) return -ECHILD; @@ -58,11 +57,7 @@ static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - spin_lock(&dentry->d_lock); - ret = inode_eq_iversion(d_inode(dentry->d_parent), - exfat_d_version(dentry)); - spin_unlock(&dentry->d_lock); - return ret; + return inode_eq_iversion(dir, exfat_d_version(dentry)); } /* returns the length of a struct qstr, ignoring trailing dots if necessary */ diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 26c4fc37edcf..da4263a14a20 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -322,9 +322,7 @@ restart: WARN_ON(!list_empty(&ei->i_fc_dilist)); spin_unlock(&sbi->s_fc_lock); - if (fc_dentry->fcd_name.name && - fc_dentry->fcd_name.len > DNAME_INLINE_LEN) - kfree(fc_dentry->fcd_name.name); + release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); return; @@ -449,22 +447,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; - if (dentry->d_name.len > DNAME_INLINE_LEN) { - node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); - if (!node->fcd_name.name) { - kmem_cache_free(ext4_fc_dentry_cachep, node); - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); - mutex_lock(&ei->i_fc_lock); - return -ENOMEM; - } - memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, - dentry->d_name.len); - } else { - memcpy(node->fcd_iname, dentry->d_name.name, - dentry->d_name.len); - node->fcd_name.name = node->fcd_iname; - } - node->fcd_name.len = dentry->d_name.len; + take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); spin_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || @@ -832,7 +815,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; - int dlen = fc_dentry->fcd_name.len; + int dlen = fc_dentry->fcd_name.name.len; u8 *dst = 
ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); @@ -847,7 +830,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); - memcpy(dst, fc_dentry->fcd_name.name, dlen); + memcpy(dst, fc_dentry->fcd_name.name.name, dlen); return true; } @@ -1328,9 +1311,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_del_init(&fc_dentry->fcd_dilist); spin_unlock(&sbi->s_fc_lock); - if (fc_dentry->fcd_name.name && - fc_dentry->fcd_name.len > DNAME_INLINE_LEN) - kfree(fc_dentry->fcd_name.name); + release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); spin_lock(&sbi->s_fc_lock); } diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 2fadb2c4780c..3bd534e4dbbf 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -109,8 +109,7 @@ struct ext4_fc_dentry_update { int fcd_op; /* Type of update create / unlink / link */ int fcd_parent; /* Parent inode number */ int fcd_ino; /* Inode number */ - struct qstr fcd_name; /* Dirent name */ - unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ + struct name_snapshot fcd_name; /* Dirent name */ struct list_head fcd_list; struct list_head fcd_dilist; }; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 15bf32c21ac0..926c26e90ef8 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,17 +43,13 @@ static inline void vfat_d_version_set(struct dentry *dentry, * If it happened, the negative dentry isn't actually negative * anymore. So, drop it. */ -static int vfat_revalidate_shortname(struct dentry *dentry) +static bool vfat_revalidate_shortname(struct dentry *dentry, struct inode *dir) { - int ret = 1; - spin_lock(&dentry->d_lock); - if (!inode_eq_iversion(d_inode(dentry->d_parent), vfat_d_version(dentry))) - ret = 0; - spin_unlock(&dentry->d_lock); - return ret; + return inode_eq_iversion(dir, vfat_d_version(dentry)); } -static int vfat_revalidate(struct dentry *dentry, unsigned int flags) +static int vfat_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -61,10 +57,11 @@ static int vfat_revalidate(struct dentry *dentry, unsigned int flags) /* This is not negative dentry. Always valid. 
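The ext4 fast-commit hunks above drop the hand-rolled inline/kmalloc name copy in favour of a name_snapshot kept inside the tracking structure until cleanup. A generic sketch of that pattern; struct pending_op and the helpers are illustrative, not ext4 code:

#include <linux/dcache.h>
#include <linux/list.h>
#include <linux/slab.h>

struct pending_op {
	struct name_snapshot name;
	struct list_head list;
};

static struct pending_op *record_op(struct dentry *dentry)
{
	struct pending_op *op = kmalloc(sizeof(*op), GFP_NOFS);

	if (!op)
		return NULL;
	take_dentry_name_snapshot(&op->name, dentry);	/* pins inline or external name */
	INIT_LIST_HEAD(&op->list);
	return op;
}

static void finish_op(struct pending_op *op)
{
	release_dentry_name_snapshot(&op->name);	/* drops the pin / frees the copy */
	kfree(op);
}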
*/ if (d_really_is_positive(dentry)) return 1; - return vfat_revalidate_shortname(dentry); + return vfat_revalidate_shortname(dentry, dir); } -static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) +static int vfat_revalidate_ci(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -97,7 +94,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - return vfat_revalidate_shortname(dentry); + return vfat_revalidate_shortname(dentry, dir); } /* returns the length of a struct qstr, ignoring trailing dots */ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index be693a8a1010..198862b086ff 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -175,10 +175,12 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; args->nodeid = nodeid; - args->in_numargs = 2; + args->in_numargs = 3; fuse_set_zero_arg0(args); - args->in_args[1].size = name->len + 1; + args->in_args[1].size = name->len; args->in_args[1].value = name->name; + args->in_args[2].size = 1; + args->in_args[2].value = ""; args->out_numargs = 1; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; @@ -193,10 +195,10 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, * the lookup once more. If the lookup results in the same inode, * then refresh the attributes, timeouts and mark the dentry valid. */ -static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) +static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *entry, unsigned int flags) { struct inode *inode; - struct dentry *parent; struct fuse_mount *fm; struct fuse_inode *fi; int ret; @@ -228,11 +230,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version = fuse_get_attr_version(fm->fc); - parent = dget_parent(entry); - fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), - &entry->d_name, &outarg); + fuse_lookup_init(fm->fc, &args, get_node_id(dir), + name, &outarg); ret = fuse_simple_request(fm, &args); - dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; @@ -266,9 +266,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { - parent = dget_parent(entry); - fuse_advise_use_readdirplus(d_inode(parent)); - dput(parent); + fuse_advise_use_readdirplus(dir); } } ret = 1; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 2e215e8c3c88..95050e719233 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -21,7 +21,9 @@ /** * gfs2_drevalidate - Check directory lookup consistency - * @dentry: the mapping to check + * @dir: expected parent directory inode + * @name: expexted name + * @dentry: dentry to check * @flags: lookup flags * * Check to make sure the lookup necessary to arrive at this inode from its @@ -30,50 +32,43 @@ * Returns: 1 if the dentry is ok, 0 if it isn't */ -static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) +static int gfs2_drevalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct dentry *parent; - struct gfs2_sbd *sdp; - struct gfs2_inode *dip; + struct gfs2_sbd *sdp = 
GFS2_SB(dir); + struct gfs2_inode *dip = GFS2_I(dir); struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; - int error, valid = 0; + int error, valid; int had_lock = 0; if (flags & LOOKUP_RCU) return -ECHILD; - parent = dget_parent(dentry); - sdp = GFS2_SB(d_inode(parent)); - dip = GFS2_I(d_inode(parent)); inode = d_inode(dentry); if (inode) { if (is_bad_inode(inode)) - goto out; + return 0; ip = GFS2_I(inode); } - if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) { - valid = 1; - goto out; - } + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 1; had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) - goto out; + return 0; } - error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip); + error = gfs2_dir_check(dir, name, ip); valid = inode ? !error : (error == -ENOENT); if (!had_lock) gfs2_glock_dq_uninit(&d_gh); -out: - dput(parent); return valid; } diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 76fa02e3835b..ef54fc8093cf 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -13,7 +13,8 @@ /* dentry case-handling: just lowercase everything */ -static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) +static int hfs_revalidate_dentry(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int diff; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index d68a4e6ac345..fc8ede43afde 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1576,7 +1576,8 @@ out: return result; } -static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) +static int jfs_ci_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { /* * This is not negative dentry. Always valid. diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 458519e416fe..5f0f8b95f44c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1109,7 +1109,8 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, return ERR_PTR(rc); } -static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; struct kernfs_root *root; diff --git a/fs/libfs.c b/fs/libfs.c index 5b6120b19e99..8444f5cc4064 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1782,7 +1782,7 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, { const struct dentry *parent; const struct inode *dir; - char strbuf[DNAME_INLINE_LEN]; + union shortname_store strbuf; struct qstr qstr; /* @@ -1802,22 +1802,23 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, if (!dir || !IS_CASEFOLDED(dir)) return 1; + qstr.len = len; + qstr.name = str; /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry * the lookup, so it doesn't matter what ->d_compare() returns. * However, it's unsafe to call utf8_strncasecmp() with an unstable * string. Therefore, we have to copy the name into a temporary buffer. + * As above, len is guaranteed to match str, so the shortname case + * is exactly when str points to ->d_shortname. 
*/ - if (len <= DNAME_INLINE_LEN - 1) { - memcpy(strbuf, str, len); - strbuf[len] = 0; - str = strbuf; + if (qstr.name == dentry->d_shortname.string) { + strbuf = dentry->d_shortname; // NUL is guaranteed to be in there + qstr.name = strbuf.string; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } - qstr.len = len; - qstr.name = str; return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } diff --git a/fs/namei.c b/fs/namei.c index 8c82afddd2ad..3ab9440c5b93 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -921,10 +921,11 @@ out_dput: return false; } -static inline int d_revalidate(struct dentry *dentry, unsigned int flags) +static inline int d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) - return dentry->d_op->d_revalidate(dentry, flags); + return dentry->d_op->d_revalidate(dir, name, dentry, flags); else return 1; } @@ -1652,7 +1653,7 @@ static struct dentry *lookup_dcache(const struct qstr *name, { struct dentry *dentry = d_lookup(dir, name); if (dentry) { - int error = d_revalidate(dentry, flags); + int error = d_revalidate(dir->d_inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); @@ -1737,19 +1738,20 @@ static struct dentry *lookup_fast(struct nameidata *nd) if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, + dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) @@ -1777,7 +1779,7 @@ again: if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { - int error = d_revalidate(dentry, flags); + int error = d_revalidate(inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); @@ -3575,7 +3577,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, if (d_in_lookup(dentry)) break; - error = d_revalidate(dentry, nd->flags); + error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags); if (likely(error > 0)) break; if (error) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 492cffd9d3d8..2b04038b0e40 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1672,7 +1672,7 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, return nfs_lookup_revalidate_done(dir, dentry, inode, 1); } -static int nfs_lookup_revalidate_dentry(struct inode *dir, +static int nfs_lookup_revalidate_dentry(struct inode *dir, const struct qstr *name, struct dentry *dentry, struct inode *inode, unsigned int flags) { @@ -1690,7 +1690,7 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir, goto out; dir_verifier = nfs_save_change_attribute(dir); - ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + ret = NFS_PROTO(dir)->lookup(dir, dentry, name, fhandle, fattr); if (ret < 0) goto out; @@ -1732,8 +1732,8 @@ out: * cached dentry and do a new lookup. 
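The per-filesystem conversions in this series all follow the same shape: ->d_revalidate() now receives the parent directory inode and the expected name directly, both stable for the duration of the call, so the dget_parent()/dput() dance disappears. A minimal sketch under the new signature; examplefs is a made-up filesystem and the trivial body is only a placeholder:

#include <linux/dcache.h>
#include <linux/namei.h>

static int examplefs_d_revalidate(struct inode *dir, const struct qstr *name,
				  struct dentry *dentry, unsigned int flags)
{
	/* Bail out if revalidation might block while in RCU-walk mode. */
	if (flags & LOOKUP_RCU)
		return -ECHILD;

	/*
	 * dir is pinned and name is unchanging here; note that name->name
	 * is not guaranteed to be NUL-terminated at name->len.
	 */
	return 1;	/* treat the dentry as still valid */
}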
*/ static int -nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, - unsigned int flags) +nfs_do_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int error = 0; @@ -1775,7 +1775,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, if (NFS_STALE(inode)) goto out_bad; - return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); + return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); out_valid: return nfs_lookup_revalidate_done(dir, dentry, inode, 1); out_bad: @@ -1785,38 +1785,26 @@ out_bad: } static int -__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, - int (*reval)(struct inode *, struct dentry *, unsigned int)) +__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent; - struct inode *dir; - int ret; - if (flags & LOOKUP_RCU) { if (dentry->d_fsdata == NFS_FSDATA_BLOCKED) return -ECHILD; - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - ret = reval(dir, dentry, flags); - if (parent != READ_ONCE(dentry->d_parent)) - return -ECHILD; } else { /* Wait for unlink to complete - see unblock_revalidate() */ wait_var_event(&dentry->d_fsdata, smp_load_acquire(&dentry->d_fsdata) != NFS_FSDATA_BLOCKED); - parent = dget_parent(dentry); - ret = reval(d_inode(parent), dentry, flags); - dput(parent); } - return ret; + return 0; } -static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int nfs_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + return nfs_do_lookup_revalidate(dir, name, dentry, flags); } static void block_revalidate(struct dentry *dentry) @@ -1982,7 +1970,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); - error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); if (error == -ENOENT) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) dir_verifier = inode_peek_iversion_raw(dir); @@ -2025,7 +2014,8 @@ void nfs_d_prune_case_insensitive_aliases(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); #if IS_ENABLED(CONFIG_NFS_V4) -static int nfs4_lookup_revalidate(struct dentry *, unsigned int); +static int nfs4_lookup_revalidate(struct inode *, const struct qstr *, + struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, @@ -2214,11 +2204,14 @@ no_open: EXPORT_SYMBOL_GPL(nfs_atomic_open); static int -nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, - unsigned int flags) +nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) @@ -2254,16 +2247,10 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); + return 
nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); full_reval: - return nfs_do_lookup_revalidate(dir, dentry, flags); -} - -static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) -{ - return __nfs_lookup_revalidate(dentry, flags, - nfs4_do_lookup_revalidate); + return nfs_do_lookup_revalidate(dir, name, dentry, flags); } #endif /* CONFIG_NFSV4 */ @@ -2319,7 +2306,8 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, d_drop(dentry); if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); if (error) goto out_error; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 2d53574da605..973aed9cc5fe 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -308,7 +308,7 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server) int err; /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, + err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, &dentry->d_name, ctx->mntfh, ctx->clone_data.fattr); dput(parent); if (err != 0) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 7359e1a3bd84..0c3bc98cd999 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -192,7 +192,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len, } static int -nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, +nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { unsigned short task_flags = 0; @@ -202,8 +202,7 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, task_flags |= RPC_TASK_TIMEOUT; dprintk("NFS call lookup %pd2\n", dentry); - return __nfs3_proc_lookup(dir, dentry->d_name.name, - dentry->d_name.len, fhandle, fattr, + return __nfs3_proc_lookup(dir, name->name, name->len, fhandle, fattr, task_flags); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d615d520f8cf..df9669d4ded7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4544,15 +4544,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, .dir_fh = NFS_FH(dir), - .name = &dentry->d_name, + .name = name, }; struct nfs4_lookup_res res = { .server = server, @@ -4594,17 +4594,16 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) } static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, - struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs4_exception exception = { .interruptible = true, }; struct rpc_clnt *client = *clnt; - const struct qstr *name = &dentry->d_name; int err; do { - err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr); + err = _nfs4_proc_lookup(client, dir, dentry, name, fhandle, fattr); trace_nfs4_lookup(dir, name, err); switch (err) { case -NFS4ERR_BADNAME: @@ -4639,13 +4638,13 @@ out: return err; } -static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, +static int 
nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int status; struct rpc_clnt *client = NFS_CLIENT(dir); - status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, dentry, name, fhandle, fattr); if (client != NFS_CLIENT(dir)) { rpc_shutdown_client(client); nfs_fixup_secinfo_attributes(fattr); @@ -4660,7 +4659,8 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry, struct rpc_clnt *client = NFS_CLIENT(dir); int status; - status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, dentry, &dentry->d_name, + fhandle, fattr); if (status < 0) return ERR_PTR(status); return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 6c09cd090c34..77920a2e3cef 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -153,13 +153,13 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, struct dentry *dentry, +nfs_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len + .name = name->name, + .len = name->len }; struct nfs_diropok res = { .fh = fhandle, diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 8d789b017fa9..af94e3737470 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -787,7 +787,8 @@ pack_runs: if (err) goto out; - attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); + attr = mi_find_attr(ni, mi, NULL, type, name, name_len, + &le->id); if (!attr) { err = -EINVAL; goto bad_inode; @@ -1181,7 +1182,7 @@ repack: goto out; } - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id); + attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; @@ -1406,7 +1407,7 @@ int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, */ if (!attr->non_res) { if (vbo[1] + bytes_per_off > le32_to_cpu(attr->res.data_size)) { - ntfs_inode_err(&ni->vfs_inode, "is corrupted"); + _ntfs_bad_inode(&ni->vfs_inode); return -EINVAL; } addr = resident_data(attr); @@ -1796,7 +1797,7 @@ repack: goto out; } - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, + attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; @@ -2041,8 +2042,8 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) } /* Look for required attribute. 
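With the NFS hunks above, rpc_ops->lookup() takes the name to look up as an explicit qstr instead of always reading dentry->d_name, so revalidate paths can forward the stable name they were handed while fresh lookups still pass the dentry's own name. A hedged caller-side sketch; do_lookup is an illustrative wrapper, not an NFS function:

#include <linux/nfs_fs.h>

/* Illustrative wrapper: fall back to the dentry's own name when none is given. */
static int do_lookup(struct inode *dir, struct dentry *dentry,
		     const struct qstr *name,
		     struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
	if (!name)
		name = &dentry->d_name;
	return NFS_PROTO(dir)->lookup(dir, dentry, name, fhandle, fattr);
}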
*/ - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, - 0, &le->id); + attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, + NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; @@ -2587,7 +2588,7 @@ int attr_force_nonresident(struct ntfs_inode *ni) attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) { - ntfs_bad_inode(&ni->vfs_inode, "no data attribute"); + _ntfs_bad_inode(&ni->vfs_inode); return -ENOENT; } diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index fc6a8aa29e3a..b6da80c69ca6 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -512,7 +512,7 @@ out: ctx->pos = pos; } else if (err < 0) { if (err == -EINVAL) - ntfs_inode_err(dir, "directory corrupted"); + _ntfs_bad_inode(dir); ctx->pos = eod; } diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 8b39d0ce5f28..5df6a0b5add9 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -75,7 +75,7 @@ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni) { const struct ATTRIB *attr; - attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) : NULL; } @@ -89,7 +89,7 @@ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni) { const struct ATTRIB *attr; - attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) : NULL; @@ -148,8 +148,10 @@ int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi) goto out; err = mi_get(ni->mi.sbi, rno, &r); - if (err) + if (err) { + _ntfs_bad_inode(&ni->vfs_inode); return err; + } ni_add_mi(ni, r); @@ -201,7 +203,8 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, *mi = &ni->mi; /* Look for required attribute in primary record. */ - return mi_find_attr(&ni->mi, attr, type, name, name_len, NULL); + return mi_find_attr(ni, &ni->mi, attr, type, name, name_len, + NULL); } /* First look for list entry of required type. */ @@ -217,7 +220,7 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, return NULL; /* Look for required attribute. */ - attr = mi_find_attr(m, NULL, type, name, name_len, &le->id); + attr = mi_find_attr(ni, m, NULL, type, name, name_len, &le->id); if (!attr) goto out; @@ -238,8 +241,7 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, return attr; out: - ntfs_inode_err(&ni->vfs_inode, "failed to parse mft record"); - ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(&ni->vfs_inode); return NULL; } @@ -259,7 +261,7 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, if (mi) *mi = &ni->mi; /* Enum attributes in primary record. */ - return mi_enum_attr(&ni->mi, attr); + return mi_enum_attr(ni, &ni->mi, attr); } /* Get next list entry. */ @@ -275,7 +277,7 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, *mi = mi2; /* Find attribute in loaded record. 
*/ - return rec_find_attr_le(mi2, le2); + return rec_find_attr_le(ni, mi2, le2); } /* @@ -293,7 +295,8 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, if (!ni->attr_list.size) { if (pmi) *pmi = &ni->mi; - return mi_find_attr(&ni->mi, NULL, type, name, name_len, NULL); + return mi_find_attr(ni, &ni->mi, NULL, type, name, name_len, + NULL); } le = al_find_ex(ni, NULL, type, name, name_len, NULL); @@ -319,7 +322,7 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, if (pmi) *pmi = mi; - attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); + attr = mi_find_attr(ni, mi, NULL, type, name, name_len, &le->id); if (!attr) return NULL; @@ -330,6 +333,7 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, vcn <= le64_to_cpu(attr->nres.evcn)) return attr; + _ntfs_bad_inode(&ni->vfs_inode); return NULL; } @@ -398,7 +402,8 @@ int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, int diff; if (base_only || type == ATTR_LIST || !ni->attr_list.size) { - attr = mi_find_attr(&ni->mi, NULL, type, name, name_len, id); + attr = mi_find_attr(ni, &ni->mi, NULL, type, name, name_len, + id); if (!attr) return -ENOENT; @@ -437,7 +442,7 @@ next_le2: al_remove_le(ni, le); - attr = mi_find_attr(mi, NULL, type, name, name_len, id); + attr = mi_find_attr(ni, mi, NULL, type, name, name_len, id); if (!attr) return -ENOENT; @@ -485,7 +490,7 @@ ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi, name = le->name; } - attr = mi_insert_attr(mi, type, name, name_len, asize, name_off); + attr = mi_insert_attr(ni, mi, type, name, name_len, asize, name_off); if (!attr) { if (le_added) al_remove_le(ni, le); @@ -673,7 +678,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) if (err) return err; - attr_list = mi_find_attr(&ni->mi, NULL, ATTR_LIST, NULL, 0, NULL); + attr_list = mi_find_attr(ni, &ni->mi, NULL, ATTR_LIST, NULL, 0, NULL); if (!attr_list) return 0; @@ -695,7 +700,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) if (!mi) return 0; - attr = mi_find_attr(mi, NULL, le->type, le_name(le), + attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) return 0; @@ -731,7 +736,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) goto out; } - attr = mi_find_attr(mi, NULL, le->type, le_name(le), + attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) { /* Should never happened, 'cause already checked. */ @@ -740,7 +745,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) asize = le32_to_cpu(attr->size); /* Insert into primary record. 
*/ - attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le), + attr_ins = mi_insert_attr(ni, &ni->mi, le->type, le_name(le), le->name_len, asize, le16_to_cpu(attr->name_off)); if (!attr_ins) { @@ -768,7 +773,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) if (!mi) continue; - attr = mi_find_attr(mi, NULL, le->type, le_name(le), + attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) continue; @@ -831,7 +836,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) free_b = 0; attr = NULL; - for (; (attr = mi_enum_attr(&ni->mi, attr)); le = Add2Ptr(le, sz)) { + for (; (attr = mi_enum_attr(ni, &ni->mi, attr)); le = Add2Ptr(le, sz)) { sz = le_size(attr->name_len); le->type = attr->type; le->size = cpu_to_le16(sz); @@ -886,7 +891,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) u32 asize = le32_to_cpu(b->size); u16 name_off = le16_to_cpu(b->name_off); - attr = mi_insert_attr(mi, b->type, Add2Ptr(b, name_off), + attr = mi_insert_attr(ni, mi, b->type, Add2Ptr(b, name_off), b->name_len, asize, name_off); if (!attr) goto out; @@ -909,7 +914,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) goto out; } - attr = mi_insert_attr(&ni->mi, ATTR_LIST, NULL, 0, + attr = mi_insert_attr(ni, &ni->mi, ATTR_LIST, NULL, 0, lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT); if (!attr) goto out; @@ -993,13 +998,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, mi = rb_entry(node, struct mft_inode, node); if (is_mft_data && - (mi_enum_attr(mi, NULL) || + (mi_enum_attr(ni, mi, NULL) || vbo <= ((u64)mi->rno << sbi->record_bits))) { /* We can't accept this record 'cause MFT's bootstrapping. */ continue; } if (is_mft && - mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, NULL)) { + mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, NULL)) { /* * This child record already has a ATTR_DATA. * So it can't accept any other records. @@ -1008,7 +1013,7 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, } if ((type != ATTR_NAME || name_len) && - mi_find_attr(mi, NULL, type, name, name_len, NULL)) { + mi_find_attr(ni, mi, NULL, type, name, name_len, NULL)) { /* Only indexed attributes can share same record. */ continue; } @@ -1157,7 +1162,7 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, /* Estimate the result of moving all possible attributes away. */ attr = NULL; - while ((attr = mi_enum_attr(&ni->mi, attr))) { + while ((attr = mi_enum_attr(ni, &ni->mi, attr))) { if (attr->type == ATTR_STD) continue; if (attr->type == ATTR_LIST) @@ -1175,7 +1180,7 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, attr = NULL; for (;;) { - attr = mi_enum_attr(&ni->mi, attr); + attr = mi_enum_attr(ni, &ni->mi, attr); if (!attr) { /* We should never be here 'cause we have already check this case. */ err = -EINVAL; @@ -1259,7 +1264,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { mi = rb_entry(node, struct mft_inode, node); - attr = mi_enum_attr(mi, NULL); + attr = mi_enum_attr(ni, mi, NULL); if (!attr) { mft_min = mi->rno; @@ -1280,7 +1285,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) ni_remove_mi(ni, mi_new); } - attr = mi_find_attr(&ni->mi, NULL, ATTR_DATA, NULL, 0, NULL); + attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_DATA, NULL, 0, NULL); if (!attr) { err = -EINVAL; goto out; @@ -1397,7 +1402,7 @@ int ni_expand_list(struct ntfs_inode *ni) continue; /* Find attribute in primary record. 
*/ - attr = rec_find_attr_le(&ni->mi, le); + attr = rec_find_attr_le(ni, &ni->mi, le); if (!attr) { err = -EINVAL; goto out; @@ -1604,8 +1609,8 @@ int ni_delete_all(struct ntfs_inode *ni) roff = le16_to_cpu(attr->nres.run_off); if (roff > asize) { - _ntfs_bad_inode(&ni->vfs_inode); - return -EINVAL; + /* ni_enum_attr_ex checks this case. */ + continue; } /* run==1 means unpack and deallocate. */ @@ -2726,9 +2731,10 @@ int ni_write_frame(struct ntfs_inode *ni, struct page **pages, { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; + struct folio *folio = page_folio(pages[0]); u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; u32 frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT; - u64 frame_vbo = (u64)pages[0]->index << PAGE_SHIFT; + u64 frame_vbo = folio_pos(folio); CLST frame = frame_vbo >> frame_bits; char *frame_ondisk = NULL; struct page **pages_disk = NULL; @@ -3343,7 +3349,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (!mi->dirty) continue; - is_empty = !mi_enum_attr(mi, NULL); + is_empty = !mi_enum_attr(ni, mi, NULL); if (is_empty) clear_rec_inuse(mi->mrec); diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 03471bc9371c..938d351ebac7 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -908,7 +908,11 @@ void ntfs_bad_inode(struct inode *inode, const char *hint) ntfs_inode_err(inode, "%s", hint); make_bad_inode(inode); - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + /* Avoid recursion if bad inode is $Volume. */ + if (inode->i_ino != MFT_REC_VOL && + !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING)) { + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + } } /* diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 9089c58a005c..7eb9fae22f8d 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1094,8 +1094,7 @@ int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, ok: if (!index_buf_check(ib, bytes, &vbn)) { - ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); - ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(&ni->vfs_inode); err = -EINVAL; goto out; } @@ -1117,8 +1116,7 @@ ok: out: if (err == -E_NTFS_CORRUPT) { - ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); - ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(&ni->vfs_inode); err = -EINVAL; } diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index be04d2845bb7..a1e11228dafd 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -410,6 +410,9 @@ end_enum: if (!std5) goto out; + if (is_bad_inode(inode)) + goto out; + if (!is_match && name) { err = -ENOENT; goto out; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index cd8e8374bb5a..382820464dee 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -745,23 +745,24 @@ int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi); void mi_put(struct mft_inode *mi); int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno); int mi_read(struct mft_inode *mi, bool is_mft); -struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr); -// TODO: id? 
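The ni_write_frame() hunk above replaces the open-coded (u64)pages[0]->index << PAGE_SHIFT with folio_pos() on the head page's folio. A small equivalence sketch; frame_offset is illustrative:

#include <linux/mm.h>
#include <linux/pagemap.h>

static u64 frame_offset(struct page *page)
{
	struct folio *folio = page_folio(page);

	/* folio_pos() yields the byte offset of the folio within its file. */
	return folio_pos(folio);
}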
-struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, - enum ATTR_TYPE type, const __le16 *name, - u8 name_len, const __le16 *id); -static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec, +struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr); +struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, const __le16 *id); +static inline struct ATTRIB *rec_find_attr_le(struct ntfs_inode *ni, + struct mft_inode *rec, struct ATTR_LIST_ENTRY *le) { - return mi_find_attr(rec, NULL, le->type, le_name(le), le->name_len, + return mi_find_attr(ni, rec, NULL, le->type, le_name(le), le->name_len, &le->id); } int mi_write(struct mft_inode *mi, int wait); int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, __le16 flags, bool is_mft); -struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, u32 asize, - u16 name_off); +struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, u32 asize, u16 name_off); bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 61d53d39f3b9..714c7ecedca8 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -31,7 +31,7 @@ static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type, * * Return: Unused attribute id that is less than mrec->next_attr_id. */ -static __le16 mi_new_attt_id(struct mft_inode *mi) +static __le16 mi_new_attt_id(struct ntfs_inode *ni, struct mft_inode *mi) { u16 free_id, max_id, t16; struct MFT_REC *rec = mi->mrec; @@ -52,7 +52,7 @@ static __le16 mi_new_attt_id(struct mft_inode *mi) attr = NULL; for (;;) { - attr = mi_enum_attr(mi, attr); + attr = mi_enum_attr(ni, mi, attr); if (!attr) { rec->next_attr_id = cpu_to_le16(max_id + 1); mi->dirty = true; @@ -195,7 +195,8 @@ out: * NOTE: mi->mrec - memory of size sbi->record_size * here we sure that mi->mrec->total == sbi->record_size (see mi_read) */ -struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) +struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr) { const struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); @@ -209,11 +210,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) off = le16_to_cpu(rec->attr_off); if (used > total) - return NULL; + goto out; if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 || !IS_ALIGNED(off, 8)) { - return NULL; + goto out; } /* Skip non-resident records. */ @@ -243,7 +244,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) */ if (off + 8 > used) { static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8); - return NULL; + goto out; } if (attr->type == ATTR_END) { @@ -254,112 +255,116 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* 0x100 is last known attribute for now. */ t32 = le32_to_cpu(attr->type); if (!t32 || (t32 & 0xf) || (t32 > 0x100)) - return NULL; + goto out; /* attributes in record must be ordered by type */ if (t32 < prev_type) - return NULL; + goto out; asize = le32_to_cpu(attr->size); if (!IS_ALIGNED(asize, 8)) - return NULL; + goto out; /* Check overflow and boundary. */ if (off + asize < off || off + asize > used) - return NULL; + goto out; /* Can we use the field attr->non_res. 
*/ if (off + 9 > used) - return NULL; + goto out; /* Check size of attribute. */ if (!attr->non_res) { /* Check resident fields. */ if (asize < SIZEOF_RESIDENT) - return NULL; + goto out; t16 = le16_to_cpu(attr->res.data_off); if (t16 > asize) - return NULL; + goto out; if (le32_to_cpu(attr->res.data_size) > asize - t16) - return NULL; + goto out; t32 = sizeof(short) * attr->name_len; if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) - return NULL; + goto out; return attr; } /* Check nonresident fields. */ if (attr->non_res != 1) - return NULL; + goto out; /* Can we use memory including attr->nres.valid_size? */ if (asize < SIZEOF_NONRESIDENT) - return NULL; + goto out; t16 = le16_to_cpu(attr->nres.run_off); if (t16 > asize) - return NULL; + goto out; t32 = sizeof(short) * attr->name_len; if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) - return NULL; + goto out; /* Check start/end vcn. */ if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1) - return NULL; + goto out; data_size = le64_to_cpu(attr->nres.data_size); if (le64_to_cpu(attr->nres.valid_size) > data_size) - return NULL; + goto out; alloc_size = le64_to_cpu(attr->nres.alloc_size); if (data_size > alloc_size) - return NULL; + goto out; t32 = mi->sbi->cluster_mask; if (alloc_size & t32) - return NULL; + goto out; if (!attr->nres.svcn && is_attr_ext(attr)) { /* First segment of sparse/compressed attribute */ /* Can we use memory including attr->nres.total_size? */ if (asize < SIZEOF_NONRESIDENT_EX) - return NULL; + goto out; tot_size = le64_to_cpu(attr->nres.total_size); if (tot_size & t32) - return NULL; + goto out; if (tot_size > alloc_size) - return NULL; + goto out; } else { if (attr->nres.c_unit) - return NULL; + goto out; if (alloc_size > mi->sbi->volume.size) - return NULL; + goto out; } return attr; + +out: + _ntfs_bad_inode(&ni->vfs_inode); + return NULL; } /* * mi_find_attr - Find the attribute by type and name and id. */ -struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, - enum ATTR_TYPE type, const __le16 *name, - u8 name_len, const __le16 *id) +struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, const __le16 *id) { u32 type_in = le32_to_cpu(type); u32 atype; next_attr: - attr = mi_enum_attr(mi, attr); + attr = mi_enum_attr(ni, mi, attr); if (!attr) return NULL; @@ -467,9 +472,9 @@ int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, * * Return: Not full constructed attribute or NULL if not possible to create. */ -struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, u32 asize, - u16 name_off) +struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, u32 asize, u16 name_off) { size_t tail; struct ATTRIB *attr; @@ -488,7 +493,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, * at which we should insert it. 
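Since mi_enum_attr() now takes the owning ntfs_inode and marks it bad itself when it hits a malformed record, callers can treat a NULL return uniformly as "no more attributes". A hedged sketch of a caller under that assumption; count_attrs is made up and assumes the fs/ntfs3 in-tree context:

#include "ntfs_fs.h"	/* fs/ntfs3 local header */

static unsigned int count_attrs(struct ntfs_inode *ni)
{
	struct ATTRIB *attr = NULL;
	unsigned int n = 0;

	while ((attr = mi_enum_attr(ni, &ni->mi, attr)))
		n++;

	/* On a corrupt record the loop simply ends early; the inode is already bad. */
	return n;
}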
*/ attr = NULL; - while ((attr = mi_enum_attr(mi, attr))) { + while ((attr = mi_enum_attr(ni, mi, attr))) { int diff = compare_attr(attr, type, name, name_len, upcase); if (diff < 0) @@ -508,7 +513,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, tail = used - PtrOffset(rec, attr); } - id = mi_new_attt_id(mi); + id = mi_new_attt_id(ni, mi); memmove(Add2Ptr(attr, asize), attr, tail); memset(attr, 0, asize); diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index a9b8688aaf30..1873bbbb7e5b 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -32,7 +32,8 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) } -static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ocfs2_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int ret = 0; /* if all else fails, just return false */ @@ -44,8 +45,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) inode = d_inode(dentry); osb = OCFS2_SB(dentry->d_sb); - trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, - dentry->d_name.name); + trace_ocfs2_dentry_revalidate(dentry, name->len, name->name); /* For a negative dentry - * check the generation number of the parent and compare with the @@ -53,12 +53,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) */ if (inode == NULL) { unsigned long gen = (unsigned long) dentry->d_fsdata; - unsigned long pgen; - spin_lock(&dentry->d_lock); - pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; - spin_unlock(&dentry->d_lock); - trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, - dentry->d_name.name, + unsigned long pgen = OCFS2_I(dir)->ip_dir_lock_gen; + trace_ocfs2_dentry_revalidate_negative(name->len, name->name, pgen, gen); if (gen != pgen) goto bail; diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 395a00ed8ac7..a19d1ad705db 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -13,10 +13,9 @@ #include "orangefs-kernel.h" /* Returns 1 if dentry can still be trusted, else 0. 
*/ -static int orangefs_revalidate_lookup(struct dentry *dentry) +static int orangefs_revalidate_lookup(struct inode *parent_inode, const struct qstr *name, + struct dentry *dentry) { - struct dentry *parent_dentry = dget_parent(dentry); - struct inode *parent_inode = parent_dentry->d_inode; struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode); struct inode *inode = dentry->d_inode; struct orangefs_kernel_op_s *new_op; @@ -26,14 +25,14 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__); new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP); - if (!new_op) { - ret = -ENOMEM; - goto out_put_parent; - } + if (!new_op) + return -ENOMEM; new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW; new_op->upcall.req.lookup.parent_refn = parent->refn; - strscpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name); + /* op_alloc() leaves ->upcall zeroed */ + memcpy(new_op->upcall.req.lookup.d_name, name->name, + min(name->len, ORANGEFS_NAME_MAX - 1)); gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d interrupt flag [%d]\n", @@ -78,8 +77,6 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) ret = 1; out_release_op: op_release(new_op); -out_put_parent: - dput(parent_dentry); return ret; out_drop: gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n", @@ -92,7 +89,8 @@ out_drop: * * Should return 1 if dentry can still be trusted, else 0. */ -static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int orangefs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { int ret; unsigned long time = (unsigned long) dentry->d_fsdata; @@ -114,7 +112,7 @@ static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) * If this passes, the positive dentry still exists or the negative * dentry still does not exist. */ - if (!orangefs_revalidate_lookup(dentry)) + if (!orangefs_revalidate_lookup(dir, name, dentry)) return 0; /* We do not need to continue with negative dentries. 
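The orangefs hunk above switches from strscpy() to a length-bounded memcpy() because the qstr handed to ->d_revalidate() is not guaranteed to be NUL-terminated at name->len (it typically points into the pathname being walked). A generic sketch of that copy with a made-up 64-byte limit standing in for the protocol's own maximum:

#include <linux/dcache.h>
#include <linux/minmax.h>
#include <linux/string.h>

#define EXAMPLE_NAME_MAX 64	/* illustrative; stands in for a protocol limit */

static void copy_qstr_name(char dst[EXAMPLE_NAME_MAX], const struct qstr *name)
{
	size_t n = min_t(size_t, name->len, EXAMPLE_NAME_MAX - 1);

	memset(dst, 0, EXAMPLE_NAME_MAX);	/* guarantees NUL termination */
	memcpy(dst, name->name, n);
}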
*/ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index fe511192f83c..86ae6f6da36b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -91,7 +91,24 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) ret = d->d_op->d_weak_revalidate(d, flags); } else if (d->d_flags & DCACHE_OP_REVALIDATE) { - ret = d->d_op->d_revalidate(d, flags); + struct dentry *parent; + struct inode *dir; + struct name_snapshot n; + + if (flags & LOOKUP_RCU) { + parent = READ_ONCE(d->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + } else { + parent = dget_parent(d); + dir = d_inode(parent); + } + take_dentry_name_snapshot(&n, d); + ret = d->d_op->d_revalidate(dir, &n.name, d, flags); + release_dentry_name_snapshot(&n); + if (!(flags & LOOKUP_RCU)) + dput(parent); if (!ret) { if (!(flags & LOOKUP_RCU)) d_invalidate(d); @@ -127,7 +144,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, return ret; } -static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ovl_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return ovl_dentry_revalidate_common(dentry, flags, false); } diff --git a/fs/proc/base.c b/fs/proc/base.c index a50b222a5917..cd89e956c322 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2058,7 +2058,8 @@ void pid_update_inode(struct task_struct *task, struct inode *inode) * performed a setuid(), etc. * */ -static int pid_revalidate(struct dentry *dentry, unsigned int flags) +static int pid_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; @@ -2191,7 +2192,8 @@ static int dname_to_vma_addr(struct dentry *dentry, return 0; } -static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) +static int map_files_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 24baf23e864f..37aa778d1af7 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -140,7 +140,8 @@ static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, security_task_to_inode(task, inode); } -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +static int tid_fd_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct task_struct *task; struct inode *inode; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index dbe82cf23ee4..8ec90826a49e 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -216,7 +216,8 @@ void proc_free_inum(unsigned int inum) ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } -static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -343,7 +344,8 @@ static const struct file_operations proc_dir_operations = { .iterate_shared = proc_readdir, }; -static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return 0; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 27a283d85a6e..cc9d74a06ff0 100644 --- 
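Both the ecryptfs and overlayfs hunks above have to call a lower dentry's ->d_revalidate(), which now wants that dentry's parent inode and a stable name; they pin both with dget_parent() and a name snapshot. A sketch of the common non-RCU shape; revalidate_lower is illustrative and not a helper either filesystem defines:

#include <linux/dcache.h>
#include <linux/fs.h>

static int revalidate_lower(struct dentry *lower, unsigned int flags)
{
	struct dentry *parent;
	struct name_snapshot n;
	int ret;

	if (!(lower->d_flags & DCACHE_OP_REVALIDATE))
		return 1;

	parent = dget_parent(lower);			/* pin the lower parent */
	take_dentry_name_snapshot(&n, lower);		/* pin the lower name */
	ret = lower->d_op->d_revalidate(d_inode(parent), &n.name, lower, flags);
	release_dentry_name_snapshot(&n);
	dput(parent);
	return ret;
}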
a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -884,7 +884,8 @@ static const struct inode_operations proc_sys_dir_operations = { .getattr = proc_sys_getattr, }; -static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_sys_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 1822493dd084..d1e95632ac54 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -737,7 +737,8 @@ again: } static int -cifs_d_revalidate(struct dentry *direntry, unsigned int flags) +cifs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *direntry, unsigned int flags) { struct inode *inode; int rc; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index cfc614c638da..53214499e384 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -457,7 +457,8 @@ static void tracefs_d_release(struct dentry *dentry) eventfs_d_release(dentry); } -static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int tracefs_d_revalidate(struct inode *inode, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct eventfs_inode *ei = dentry->d_fsdata; diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 5f1a14d5b927..a859ac9b74ba 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -192,7 +192,8 @@ const struct file_operations vboxsf_dir_fops = { * This is called during name resolution/lookup to check if the @dentry in * the cache is still valid. the job is handled by vboxsf_inode_revalidate. */ -static int vboxsf_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int vboxsf_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bff956f7b2b9..9a1a30857763 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -68,16 +68,24 @@ extern const struct qstr dotdot_name; * large memory footprint increase). 
*/ #ifdef CONFIG_64BIT -# define DNAME_INLINE_LEN 40 /* 192 bytes */ +# define DNAME_INLINE_WORDS 5 /* 192 bytes */ #else # ifdef CONFIG_SMP -# define DNAME_INLINE_LEN 36 /* 128 bytes */ +# define DNAME_INLINE_WORDS 9 /* 128 bytes */ # else -# define DNAME_INLINE_LEN 44 /* 128 bytes */ +# define DNAME_INLINE_WORDS 11 /* 128 bytes */ # endif #endif +#define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) + +union shortname_store { + unsigned char string[DNAME_INLINE_LEN]; + unsigned long words[DNAME_INLINE_WORDS]; +}; + #define d_lock d_lockref.lock +#define d_iname d_shortname.string struct dentry { /* RCU lookup touched fields */ @@ -88,7 +96,7 @@ struct dentry { struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ - unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ + union shortname_store d_shortname; /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ @@ -136,7 +144,8 @@ enum d_real_type { }; struct dentry_operations { - int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_revalidate)(struct inode *, const struct qstr *, + struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, @@ -150,6 +159,8 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); + bool (*d_unalias_trylock)(const struct dentry *); + void (*d_unalias_unlock)(const struct dentry *); } ____cacheline_aligned; /* @@ -589,7 +600,7 @@ static inline struct inode *d_real_inode(const struct dentry *dentry) struct name_snapshot { struct qstr name; - unsigned char inline_name[DNAME_INLINE_LEN]; + union shortname_store inline_name; }; void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 772f822dc6b8..18855cb44b1c 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -192,7 +192,8 @@ struct fscrypt_operations { unsigned int *num_devs); }; -int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); +int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags); static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) @@ -711,8 +712,8 @@ static inline u64 fscrypt_fname_siphash(const struct inode *dir, return 0; } -static inline int fscrypt_d_revalidate(struct dentry *dentry, - unsigned int flags) +static inline int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return 1; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8da4c61f97b9..2a59034a5fa2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1085,8 +1085,8 @@ struct netdev_net_notifier { * * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Old-style ioctl entry point. This is used internally by the - * appletalk and ieee802154 subsystems but is no longer called by - * the device ioctl handler. + * ieee802154 subsystem but is no longer called by the device + * ioctl handler. 
* * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); * Used by the bonding driver for its device specific ioctls: diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 162b7c0c3555..9155a6ffc370 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1781,7 +1781,7 @@ struct nfs_rpc_ops { struct nfs_fattr *, struct inode *); int (*setattr) (struct dentry *, struct nfs_fattr *, struct iattr *); - int (*lookup) (struct inode *, struct dentry *, + int (*lookup) (struct inode *, struct dentry *, const struct qstr *, struct nfs_fh *, struct nfs_fattr *); int (*lookupp) (struct inode *, struct nfs_fh *, struct nfs_fattr *); diff --git a/include/linux/pm.h b/include/linux/pm.h index 08c37b83fea8..78855d794342 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -384,12 +384,8 @@ const struct dev_pm_ops name = { \ #ifdef CONFIG_PM #define _EXPORT_DEV_PM_OPS(name, license, ns) _EXPORT_PM_OPS(name, license, ns) -#define EXPORT_PM_FN_GPL(name) EXPORT_SYMBOL_GPL(name) -#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, "ns") #else #define _EXPORT_DEV_PM_OPS(name, license, ns) _DISCARD_PM_OPS(name, license, ns) -#define EXPORT_PM_FN_GPL(name) -#define EXPORT_PM_FN_NS_GPL(name, ns) #endif #ifdef CONFIG_PM_SLEEP @@ -684,6 +680,7 @@ struct dev_pm_info { bool no_pm_callbacks:1; /* Owned by the PM core */ bool async_in_progress:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ + bool set_active:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ #else bool should_wakeup:1; diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index ed4cd114180a..7f405672b089 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -237,7 +237,6 @@ struct page_pool { struct { struct hlist_node list; u64 detach_time; - u32 napi_id; u32 id; } user; }; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 4b0677e48190..ed4b83696c77 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1268,9 +1268,19 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, if (xo) { x = xfrm_input_state(skb); - if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) - return (xo->flags & CRYPTO_DONE) && - (xo->status & CRYPTO_SUCCESS); + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { + bool check = (xo->flags & CRYPTO_DONE) && + (xo->status & CRYPTO_SUCCESS); + + /* The packets here are plain ones and secpath was + * needed to indicate that hardware already handled + * them and there is no need to do nothing in addition. + * + * Consume secpath which was set by drivers. 
+ */ + secpath_reset(skb); + return check; + } } return __xfrm_check_nopolicy(net, skb, dir) || diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f87aa01ba44..10a01af63a80 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -608,7 +608,11 @@ int hibernation_platform_enter(void) local_irq_disable(); system_state = SYSTEM_SUSPEND; - syscore_suspend(); + + error = syscore_suspend(); + if (error) + goto Enable_irqs; + if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; @@ -620,6 +624,7 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + Enable_irqs: system_state = SYSTEM_RUNNING; local_irq_enable(); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a2a29e3fffca..1a19d69b91ed 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -666,7 +666,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + if (policy->dvfs_possible_from_any_cpu) + set_cpus_allowed_ptr(thread, policy->related_cpus); + else + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 149e2c8036d3..456d339be98f 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -1130,6 +1130,13 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) return 0; /* + * The special/sugov task isn't part of regular bandwidth/admission + * control so let userspace change affinities. + */ + if (dl_entity_is_special(&p->dl)) + return 0; + + /* * Since bandwidth control happens on root_domain basis, * if admission test is enabled, we only admit -deadline * tasks allowed to run on all the CPUs in the task's diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 49f97d4138ea..46ea0bee2259 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -710,12 +710,12 @@ static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu) { switch (chan->scid) { case L2CAP_CID_ATT: - if (mtu < L2CAP_LE_MIN_MTU) + if (mtu && mtu < L2CAP_LE_MIN_MTU) return false; break; default: - if (mtu < L2CAP_DEFAULT_MIN_MTU) + if (mtu && mtu < L2CAP_DEFAULT_MIN_MTU) return false; } diff --git a/net/core/dev.c b/net/core/dev.c index afa2282f2604..c0021cbd28fc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6708,7 +6708,7 @@ void napi_resume_irqs(unsigned int napi_id) static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { - napi->napi_id = napi_id; + WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } @@ -9924,6 +9924,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } + if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { + NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); + return -EINVAL; + } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; @@ -10260,37 +10264,14 @@ static bool from_cleanup_net(void) #endif } -static void rtnl_drop_if_cleanup_net(void) -{ - if (from_cleanup_net()) - __rtnl_unlock(); -} - -static void rtnl_acquire_if_cleanup_net(void) 
-{ - if (from_cleanup_net()) - rtnl_lock(); -} - /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); -static LIST_HEAD(net_todo_list_for_cleanup_net); - -/* TODO: net_todo_list/net_todo_list_for_cleanup_net should probably - * be provided by callers, instead of being static, rtnl protected. - */ -static struct list_head *todo_list(void) -{ - return from_cleanup_net() ? &net_todo_list_for_cleanup_net : - &net_todo_list; -} - DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { - list_add_tail(&dev->todo_list, todo_list()); + list_add_tail(&dev->todo_list, &net_todo_list); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -11140,7 +11121,7 @@ void netdev_run_todo(void) #endif /* Snapshot list, allow later requests */ - list_replace_init(todo_list(), &list); + list_replace_init(&net_todo_list, &list); __rtnl_unlock(); @@ -11785,11 +11766,9 @@ void unregister_netdevice_many_notify(struct list_head *head, WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); netdev_unlock(dev); } - - rtnl_drop_if_cleanup_net(); flush_all_backlogs(); + synchronize_net(); - rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; @@ -11849,9 +11828,7 @@ void unregister_netdevice_many_notify(struct list_head *head, #endif } - rtnl_drop_if_cleanup_net(); synchronize_net(); - rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1906c62dee85..f5e908c9e7ad 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -1146,7 +1146,9 @@ void page_pool_disable_direct_recycling(struct page_pool *pool) WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state)); WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1); + mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); + mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_disable_direct_recycling); diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h index 57439787b9c2..2fb06d5f6d55 100644 --- a/net/core/page_pool_priv.h +++ b/net/core/page_pool_priv.h @@ -7,6 +7,8 @@ #include "netmem_priv.h" +extern struct mutex page_pools_lock; + s32 page_pool_inflight(const struct page_pool *pool, bool strict); int page_pool_list(struct page_pool *pool); diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 48335766c1bf..6677e0c2e256 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -3,6 +3,7 @@ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/xarray.h> +#include <net/busy_poll.h> #include <net/net_debug.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> @@ -14,10 +15,11 @@ #include "netdev-genl-gen.h" static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); -/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user. +/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev, + * pool->user. * Ordering: inside rtnl_lock */ -static DEFINE_MUTEX(page_pools_lock); +DEFINE_MUTEX(page_pools_lock); /* Page pools are only reachable from user space (via netlink) if they are * linked to a netdev at creation time. 
Following page pool "visibility" @@ -216,6 +218,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, { struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; + unsigned int napi_id; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -229,8 +232,10 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex)) goto err_cancel; - if (pool->user.napi_id && - nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) + + napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0; + if (napi_id >= MIN_NAPI_ID && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) goto err_cancel; inflight = page_pool_inflight(pool, false); @@ -319,8 +324,6 @@ int page_pool_list(struct page_pool *pool) if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); - pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0; - netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); } diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7bb94875a7ec..34bee42e1247 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -998,7 +998,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, ethtool_get_flow_spec_ring(info.fs.ring_cookie)) return -EINVAL; - if (ops->get_rxfh) { + if (cmd == ETHTOOL_SRXFH && ops->get_rxfh) { struct ethtool_rxfh_param rxfh = {}; rc = ops->get_rxfh(dev, &rxfh); diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 87bb3a91598e..a4bacf198555 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -700,9 +700,12 @@ static int fill_frame_info(struct hsr_frame_info *frame, frame->is_vlan = true; if (frame->is_vlan) { - if (skb->mac_len < offsetofend(struct hsr_vlan_ethhdr, vlanhdr)) + /* Note: skb->mac_len might be wrong here. */ + if (!pskb_may_pull(skb, + skb_mac_offset(skb) + + offsetofend(struct hsr_vlan_ethhdr, vlanhdr))) return -EINVAL; - vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; + vlan_hdr = (struct hsr_vlan_ethhdr *)skb_mac_header(skb); proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b0fbf804bbba..0e4076866c0a 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -279,7 +279,7 @@ static void esp_output_done(void *data, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 03b6eee407a2..28d77d454d44 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -330,9 +330,6 @@ next_entry: list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) goto next_entry2; - if (filter->dev && - !mr_mfc_uses_dev(mrt, mfc, filter->dev)) - goto next_entry2; err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0e5b9a654254..bc95d2a5924f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -265,11 +265,14 @@ static u16 tcp_select_window(struct sock *sk) u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we - * are out of memory. The window is temporary, so we don't store - * it on the socket. + * are out of memory. 
*/ - if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) + if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) { + tp->pred_flags = 0; + tp->rcv_wnd = 0; + tp->rcv_wup = tp->rcv_nxt; return 0; + } cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 5f3d0cc1555a..9e73944e3b53 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -315,7 +315,7 @@ static void esp_output_done(void *data, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5f7b1fdbffe6..b3d5d1f266ee 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -82,14 +82,14 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) toobig = skb->len > mtu && !skb_is_gso(skb); - if (toobig && xfrm6_local_dontfrag(skb->sk)) { + if (toobig && xfrm6_local_dontfrag(sk)) { xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; } else if (toobig && xfrm6_noneed_fragment(skb)) { skb->ignore_df = 1; goto skip_frag; - } else if (!skb->ignore_df && toobig && skb->sk) { + } else if (!skb->ignore_df && toobig && sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); return -EMSGSIZE; diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 3999e0ba2c35..2dd81e6c26bd 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -418,9 +418,9 @@ void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); subflow->mpc_drop = 1; mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); - } else { - subflow->mpc_drop = 0; } + } else if (ssk->sk_state == TCP_SYN_SENT) { + subflow->mpc_drop = 0; } } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 123f3f297284..fd2de185bc93 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -108,7 +108,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; - mp_opt->use_ack = 0; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } @@ -157,11 +156,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("DSS\n"); ptr++; - /* we must clear 'mpc_map' be able to detect MP_CAPABLE - * map vs DSS map in mptcp_incoming_options(), and reconstruct - * map info accordingly - */ - mp_opt->mpc_map = 0; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; @@ -369,8 +363,11 @@ void mptcp_get_options(const struct sk_buff *skb, const unsigned char *ptr; int length; - /* initialize option status */ - mp_opt->suboptions = 0; + /* Ensure that casting the whole status to u32 is efficient and safe */ + BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status), + sizeof(u32))); + *(u32 *)&mp_opt->status = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 98ac73938bd8..572d160edca3 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -2020,7 +2020,8 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && - (entry->flags & 
MPTCP_PM_ADDR_FLAG_SIGNAL)) { + (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | + MPTCP_PM_ADDR_FLAG_IMPLICIT))) { spin_unlock_bh(&pernet->lock); GENL_SET_ERR_MSG(info, "invalid addr flags"); return -EINVAL; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c44c89ecaca6..6bd819047470 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1767,8 +1767,10 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, * see mptcp_disconnect(). * Attempt it again outside the problematic scope. */ - if (!mptcp_disconnect(sk, 0)) + if (!mptcp_disconnect(sk, 0)) { + sk->sk_disconnects++; sk->sk_socket->state = SS_UNCONNECTED; + } } inet_clear_bit(DEFER_CONNECT, sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0174a5aad279..f6a207958459 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -149,22 +149,24 @@ struct mptcp_options_received { u32 subflow_seq; u16 data_len; __sum16 csum; - u16 suboptions; + struct_group(status, + u16 suboptions; + u16 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + mpc_map:1, + reset_reason:4, + reset_transient:1, + echo:1, + backup:1, + deny_join_id0:1, + __unused:2; + ); + u8 join_id; u32 token; u32 nonce; - u16 use_map:1, - dsn64:1, - data_fin:1, - use_ack:1, - ack64:1, - mpc_map:1, - reset_reason:4, - reset_transient:1, - echo:1, - backup:1, - deny_join_id0:1, - __unused:2; - u8 join_id; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index bf276eaf9330..7891a537bddd 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1385,6 +1385,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_package; break; case ncsi_dev_state_probe_package: + if (ndp->package_probe_id >= 8) { + /* Last package probed, finishing */ + ndp->flags |= NCSI_DEV_PROBED; + break; + } + ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_SP; @@ -1501,13 +1507,8 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) if (ret) goto error; - /* Probe next package */ + /* Probe next package after receiving response */ ndp->package_probe_id++; - if (ndp->package_probe_id >= 8) { - /* Probe finished */ - ndp->flags |= NCSI_DEV_PROBED; - break; - } nd->state = ncsi_dev_state_probe_package; ndp->active_package = NULL; break; diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 14bd66909ca4..4a8ce2949fae 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1089,14 +1089,12 @@ static int ncsi_rsp_handler_netlink(struct ncsi_request *nr) static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) { struct ncsi_dev_priv *ndp = nr->ndp; + struct sockaddr *saddr = &ndp->pending_mac; struct net_device *ndev = ndp->ndev.dev; struct ncsi_rsp_gmcma_pkt *rsp; - struct sockaddr saddr; - int ret = -1; int i; rsp = (struct ncsi_rsp_gmcma_pkt *)skb_network_header(nr->rsp); - saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev_info(ndev, "NCSI: Received %d provisioned MAC addresses\n", @@ -1108,20 +1106,20 @@ static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) rsp->addresses[i][4], rsp->addresses[i][5]); } + saddr->sa_family = ndev->type; for (i = 0; i < rsp->address_count; i++) { - memcpy(saddr.sa_data, &rsp->addresses[i], ETH_ALEN); - ret = ndev->netdev_ops->ndo_set_mac_address(ndev, &saddr); - if (ret < 0) { + if (!is_valid_ether_addr(rsp->addresses[i])) { netdev_warn(ndev, "NCSI: Unable to assign %pM to device\n", - saddr.sa_data); + 
rsp->addresses[i]); continue; } - netdev_warn(ndev, "NCSI: Set MAC address to %pM\n", saddr.sa_data); + memcpy(saddr->sa_data, rsp->addresses[i], ETH_ALEN); + netdev_warn(ndev, "NCSI: Will set MAC address to %pM\n", saddr->sa_data); break; } - ndp->gma_flag = ret == 0; - return ret; + ndp->gma_flag = 1; + return 0; } static struct ncsi_rsp_handler { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 667459256e4c..a34de9c17cf1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5078,7 +5078,7 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { - u32 num_regs = 0, key_num_regs = 0; + u32 len = 0, num_regs; struct nlattr *attr; int rem, err, i; @@ -5092,12 +5092,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, } for (i = 0; i < desc->field_count; i++) - num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32)); + len += round_up(desc->field_len[i], sizeof(u32)); - key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); - if (key_num_regs != num_regs) + if (len != desc->klen) return -EINVAL; + num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); if (num_regs > NFT_REG32_COUNT) return -E2BIG; diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index de175318a3a0..082ab66f120b 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -542,6 +542,8 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, pr_debug("pipe created=%d\n", pipe); + if (pipe >= NCI_HCI_MAX_PIPES) + pipe = NCI_HCI_INVALID_PIPE; return pipe; } diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index f06ddbed3fed..1525773e94aa 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -122,6 +122,10 @@ static void rose_heartbeat_expiry(struct timer_list *t) struct rose_sock *rose = rose_sk(sk); bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ/20); + goto out; + } switch (rose->state) { case ROSE_STATE_0: /* Magic here: If we listen() and a new link dies before it @@ -152,6 +156,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) } rose_start_heartbeat(sk); +out: bh_unlock_sock(sk); sock_put(sk); } @@ -162,6 +167,10 @@ static void rose_timer_expiry(struct timer_list *t) struct sock *sk = &rose->sock; bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &rose->timer, jiffies + HZ/20); + goto out; + } switch (rose->state) { case ROSE_STATE_1: /* T1 */ case ROSE_STATE_4: /* T2 */ @@ -182,6 +191,7 @@ static void rose_timer_expiry(struct timer_list *t) } break; } +out: bh_unlock_sock(sk); sock_put(sk); } @@ -192,6 +202,10 @@ static void rose_idletimer_expiry(struct timer_list *t) struct sock *sk = &rose->sock; bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &rose->idletimer, jiffies + HZ/20); + goto out; + } rose_clear_queues(sk); rose_write_internal(sk, ROSE_CLEAR_REQUEST); @@ -207,6 +221,7 @@ static void rose_idletimer_expiry(struct timer_list *t) sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } +out: bh_unlock_sock(sk); sock_put(sk); } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index d82e44a3901b..e874c31fa901 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -246,7 +246,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, bool use; int slot; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); while (!list_empty(collector)) { peer = 
list_entry(collector->next, @@ -257,7 +257,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, continue; use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (use) { keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; @@ -277,17 +277,17 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, */ slot += cursor; slot &= mask; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } rxrpc_put_peer(peer, rxrpc_peer_put_keepalive); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); } /* @@ -317,7 +317,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) * second; the bucket at cursor + 1 goes at now + 1s and so * on... */ - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_splice_init(&rxnet->peer_keepalive_new, &collector); stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive); @@ -329,7 +329,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) } base = now; - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxnet->peer_keepalive_base = base; rxnet->peer_keepalive_cursor = cursor; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index e1c63129586b..0fcc87f0409f 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -325,10 +325,10 @@ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) hash_key = rxrpc_peer_hash_key(local, &peer->srx); rxrpc_init_peer(local, peer, hash_key); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); } /* @@ -360,7 +360,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -373,7 +373,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, &rxnet->peer_keepalive_new); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); @@ -423,10 +423,10 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index f80bc05d4c5a..516038a44163 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -91,6 +91,8 @@ ets_class_from_arg(struct Qdisc *sch, unsigned long arg) { struct ets_sched *q = qdisc_priv(sch); + if (arg == 0 || arg > q->nbands) + return NULL; return &q->classes[arg - 1]; } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index fa9d1b49599b..075695173648 100644 
--- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -337,7 +337,10 @@ EXPORT_SYMBOL_GPL(vsock_find_connected_socket); void vsock_remove_sock(struct vsock_sock *vsk) { - vsock_remove_bound(vsk); + /* Transport reassignment must not remove the binding. */ + if (sock_flag(sk_vsock(vsk), SOCK_DEAD)) + vsock_remove_bound(vsk); + vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); @@ -821,12 +824,13 @@ static void __vsock_release(struct sock *sk, int level) */ lock_sock_nested(sk, level); + sock_orphan(sk); + if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); - sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; skb_queue_purge(&sk->sk_receive_queue); @@ -1519,6 +1523,11 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, if (err < 0) goto out; + /* sk_err might have been set as a result of an earlier + * (failed) connect attempt. + */ + sk->sk_err = 0; + /* Mark sock as connecting and set the error code to in * progress in case this is a non-blocking connect. */ diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 98f1e2b67c76..c397eb99d867 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -506,7 +506,7 @@ xmit: skb_dst_set(skb, dst); skb->dev = tdev; - err = dst_output(xi->net, skb->sk, skb); + err = dst_output(xi->net, skb_to_full_sk(skb), skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index b5025cf6136e..f7abd42c077d 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -802,7 +802,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { skb->protocol = htons(ETH_P_IP); - if (skb->sk) + if (skb->sk && sk_fullsock(skb->sk)) xfrm_local_error(skb, mtu); else icmp_send(skb, ICMP_DEST_UNREACH, @@ -838,6 +838,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; struct dst_entry *dst = skb_dst(skb); + struct sock *sk = skb_to_full_sk(skb); if (skb->ignore_df) goto out; @@ -852,9 +853,9 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); - if (xfrm6_local_dontfrag(skb->sk)) + if (xfrm6_local_dontfrag(sk)) ipv6_stub->xfrm6_local_rxpmtu(skb, mtu); - else if (skb->sk) + else if (sk) xfrm_local_error(skb, mtu); else icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9e510021ee91..6551e588fe52 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2964,7 +2964,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) skb_dst_drop(skb); skb_dst_set(skb, dst); - dst_output(net, skb->sk, skb); + dst_output(net, skb_to_full_sk(skb), skb); } out: diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index e500aebbad22..dbdf8a39dffe 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -714,10 +714,12 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff oseq += skb_shinfo(skb)->gso_segs; } - if (unlikely(xo->seq.low < replay_esn->oseq)) { - XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; - xo->seq.hi = oseq_hi; - replay_esn->oseq_hi = oseq_hi; + if (unlikely(oseq < replay_esn->oseq)) { + replay_esn->oseq_hi = ++oseq_hi; + if (xo->seq.low < replay_esn->oseq) { + XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi; + xo->seq.hi = oseq_hi; + } 
if (replay_esn->oseq_hi == 0) { replay_esn->oseq--; replay_esn->oseq_hi--; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 34067cb8a479..ad2202fa82f3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -34,6 +34,8 @@ #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) +#define xfrm_state_deref_check(table, net) \ + rcu_dereference_check((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) static void xfrm_state_gc_task(struct work_struct *work); @@ -62,6 +64,8 @@ static inline unsigned int xfrm_dst_hash(struct net *net, u32 reqid, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask); } @@ -70,6 +74,8 @@ static inline unsigned int xfrm_src_hash(struct net *net, const xfrm_address_t *saddr, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask); } @@ -77,11 +83,15 @@ static inline unsigned int xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); } static unsigned int xfrm_seq_hash(struct net *net, u32 seq) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_seq_hash(seq, net->xfrm.state_hmask); } @@ -1108,16 +1118,38 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, x->props.family = tmpl->encap_family; } -static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, +struct xfrm_hash_state_ptrs { + const struct hlist_head *bydst; + const struct hlist_head *bysrc; + const struct hlist_head *byspi; + unsigned int hmask; +}; + +static void xfrm_hash_ptrs_get(const struct net *net, struct xfrm_hash_state_ptrs *ptrs) +{ + unsigned int sequence; + + do { + sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); + + ptrs->bydst = xfrm_state_deref_check(net->xfrm.state_bydst, net); + ptrs->bysrc = xfrm_state_deref_check(net->xfrm.state_bysrc, net); + ptrs->byspi = xfrm_state_deref_check(net->xfrm.state_byspi, net); + ptrs->hmask = net->xfrm.state_hmask; + } while (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence)); +} + +static struct xfrm_state *__xfrm_state_lookup_all(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, struct xfrm_dev_offload *xdo) { - unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { + hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { #ifdef CONFIG_XFRM_OFFLOAD if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1151,15 +1183,16 @@ static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, return NULL; } -static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, +static struct xfrm_state *__xfrm_state_lookup(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { - unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, 
state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { + hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || @@ -1181,11 +1214,11 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, __be32 spi, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct hlist_head *state_cache_input; struct xfrm_state *x = NULL; - int cpu = get_cpu(); - state_cache_input = per_cpu_ptr(net->xfrm.state_cache_input, cpu); + state_cache_input = raw_cpu_ptr(net->xfrm.state_cache_input); rcu_read_lock(); hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) { @@ -1202,7 +1235,9 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, goto out; } - x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); if (x && x->km.state == XFRM_STATE_VALID) { spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -1217,20 +1252,20 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, out: rcu_read_unlock(); - put_cpu(); return x; } EXPORT_SYMBOL(xfrm_input_state_lookup); -static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, +static struct xfrm_state *__xfrm_state_lookup_byaddr(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { - unsigned int h = xfrm_src_hash(net, daddr, saddr, family); + unsigned int h = __xfrm_src_hash(daddr, saddr, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) { + hlist_for_each_entry_rcu(x, state_ptrs->bysrc + h, bysrc) { if (x->props.family != family || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family) || @@ -1250,14 +1285,17 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, static inline struct xfrm_state * __xfrm_state_locate(struct xfrm_state *x, int use_spi, int family) { + struct xfrm_hash_state_ptrs state_ptrs; struct net *net = xs_net(x); u32 mark = x->mark.v & x->mark.m; + xfrm_hash_ptrs_get(net, &state_ptrs); + if (use_spi) - return __xfrm_state_lookup(net, mark, &x->id.daddr, + return __xfrm_state_lookup(&state_ptrs, mark, &x->id.daddr, x->id.spi, x->id.proto, family); else - return __xfrm_state_lookup_byaddr(net, mark, + return __xfrm_state_lookup_byaddr(&state_ptrs, mark, &x->id.daddr, &x->props.saddr, x->id.proto, family); @@ -1331,6 +1369,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; + struct xfrm_hash_state_ptrs state_ptrs; struct net *net = xp_net(pol); unsigned int h, h_wildcard; struct xfrm_state *x, *x0, *to_put; @@ -1395,8 +1434,10 @@ cached: else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ WARN_ON(1); - h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); - hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { + xfrm_hash_ptrs_get(net, &state_ptrs); + + h = __xfrm_dst_hash(daddr, saddr, tmpl->reqid, encap_family, state_ptrs.hmask); + hlist_for_each_entry_rcu(x, state_ptrs.bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1429,8 +1470,9 @@ cached: if 
(best || acquire_in_progress) goto found; - h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); - hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { + h_wildcard = __xfrm_dst_hash(daddr, &saddr_wildcard, tmpl->reqid, + encap_family, state_ptrs.hmask); + hlist_for_each_entry_rcu(x, state_ptrs.bydst + h_wildcard, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1468,7 +1510,7 @@ found: if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && - (x0 = __xfrm_state_lookup_all(net, mark, daddr, + (x0 = __xfrm_state_lookup_all(&state_ptrs, mark, daddr, tmpl->id.spi, tmpl->id.proto, encap_family, &pol->xdo)) != NULL) { @@ -2253,10 +2295,13 @@ struct xfrm_state * xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; rcu_read_lock(); - x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); rcu_read_unlock(); return x; } @@ -2267,10 +2312,14 @@ xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family); + + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup_byaddr(&state_ptrs, mark, daddr, saddr, proto, family); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index e16cef160bc2..ce32cb35007d 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -95,7 +95,7 @@ ynl_err_walk(struct ynl_sock *ys, void *start, void *end, unsigned int off, ynl_attr_for_each_payload(start, data_len, attr) { astart_off = (char *)attr - (char *)start; - aend_off = astart_off + ynl_attr_data_len(attr); + aend_off = (char *)ynl_attr_data_end(attr) - (char *)start; if (aend_off <= off) continue; diff --git a/tools/testing/selftests/bpf/progs/find_vma.c b/tools/testing/selftests/bpf/progs/find_vma.c index 38034fb82530..02b82774469c 100644 --- a/tools/testing/selftests/bpf/progs/find_vma.c +++ b/tools/testing/selftests/bpf/progs/find_vma.c @@ -25,7 +25,7 @@ static long check_vma(struct task_struct *task, struct vm_area_struct *vma, { if (vma->vm_file) bpf_probe_read_kernel_str(d_iname, DNAME_INLINE_LEN - 1, - vma->vm_file->f_path.dentry->d_iname); + vma->vm_file->f_path.dentry->d_shortname.string); /* check for VM_EXEC */ if (vma->vm_flags & VM_EXEC) diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh index 384cfa3d38a6..92c2f0376c08 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh @@ -142,7 +142,7 @@ function pre_ethtool { } function check_table { - local path=$NSIM_DEV_DFS/ports/$port/udp_ports_table$1 + local path=$NSIM_DEV_DFS/ports/$port/udp_ports/table$1 local -n expected=$2 local last=$3 @@ -212,7 +212,7 @@ function check_tables { } function print_table { - local path=$NSIM_DEV_DFS/ports/$port/udp_ports_table$1 + local path=$NSIM_DEV_DFS/ports/$port/udp_ports/table$1 read -a have < $path 
tree $NSIM_DEV_DFS/ @@ -641,7 +641,7 @@ for port in 0 1; do NSIM_NETDEV=`get_netdev_name old_netdevs` ip link set dev $NSIM_NETDEV up - echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error + echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports/inject_error msg="1 - create VxLANs v6" exp0=( 0 0 0 0 ) @@ -663,7 +663,7 @@ for port in 0 1; do new_geneve gnv0 20000 msg="2 - destroy GENEVE" - echo 2 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error + echo 2 > $NSIM_DEV_DFS/ports/$port/udp_ports/inject_error exp1=( `mke 20000 2` 0 0 0 ) del_dev gnv0 @@ -764,7 +764,7 @@ for port in 0 1; do msg="create VxLANs v4" new_vxlan vxlan0 10000 $NSIM_NETDEV - echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset + echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports/reset check_tables msg="NIC device goes down" @@ -775,7 +775,7 @@ for port in 0 1; do fi check_tables - echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset + echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports/reset check_tables msg="NIC device goes up again" @@ -789,7 +789,7 @@ for port in 0 1; do del_dev vxlan0 check_tables - echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset + echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports/reset check_tables msg="destroy NIC" @@ -896,7 +896,7 @@ msg="vacate VxLAN in overflow table" exp0=( `mke 10000 1` `mke 10004 1` 0 `mke 10003 1` ) del_dev vxlan2 -echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset +echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports/reset check_tables msg="tunnels destroyed 2" diff --git a/tools/testing/selftests/gpio/gpio-sim.sh b/tools/testing/selftests/gpio/gpio-sim.sh index 6fb66a687f17..bbc29ed9c60a 100755 --- a/tools/testing/selftests/gpio/gpio-sim.sh +++ b/tools/testing/selftests/gpio/gpio-sim.sh @@ -46,12 +46,6 @@ remove_chip() { rmdir $CONFIGFS_DIR/$CHIP || fail "Unable to remove the chip" } -configfs_cleanup() { - for CHIP in `ls $CONFIGFS_DIR/`; do - remove_chip $CHIP - done -} - create_chip() { local CHIP=$1 @@ -105,6 +99,13 @@ disable_chip() { echo 0 > $CONFIGFS_DIR/$CHIP/live || fail "Unable to disable the chip" } +configfs_cleanup() { + for CHIP in `ls $CONFIGFS_DIR/`; do + disable_chip $CHIP + remove_chip $CHIP + done +} + configfs_chip_name() { local CHIP=$1 local BANK=$2 @@ -181,6 +182,7 @@ create_chip chip create_bank chip bank enable_chip chip test -n `cat $CONFIGFS_DIR/chip/bank/chip_name` || fail "chip_name doesn't work" +disable_chip chip remove_chip chip echo "1.2. chip_name returns 'none' if the chip is still pending" @@ -195,6 +197,7 @@ create_chip chip create_bank chip bank enable_chip chip test -n `cat $CONFIGFS_DIR/chip/dev_name` || fail "dev_name doesn't work" +disable_chip chip remove_chip chip echo "2. Creating and configuring simulated chips" @@ -204,6 +207,7 @@ create_chip chip create_bank chip bank enable_chip chip test "`get_chip_num_lines chip bank`" = "1" || fail "default number of lines is not 1" +disable_chip chip remove_chip chip echo "2.2. Number of lines can be specified" @@ -212,6 +216,7 @@ create_bank chip bank set_num_lines chip bank 16 enable_chip chip test "`get_chip_num_lines chip bank`" = "16" || fail "number of lines is not 16" +disable_chip chip remove_chip chip echo "2.3. Label can be set" @@ -220,6 +225,7 @@ create_bank chip bank set_label chip bank foobar enable_chip chip test "`get_chip_label chip bank`" = "foobar" || fail "label is incorrect" +disable_chip chip remove_chip chip echo "2.4. 
Label can be left empty" @@ -227,6 +233,7 @@ create_chip chip create_bank chip bank enable_chip chip test -z "`cat $CONFIGFS_DIR/chip/bank/label`" || fail "label is not empty" +disable_chip chip remove_chip chip echo "2.5. Line names can be configured" @@ -238,6 +245,7 @@ set_line_name chip bank 2 bar enable_chip chip test "`get_line_name chip bank 0`" = "foo" || fail "line name is incorrect" test "`get_line_name chip bank 2`" = "bar" || fail "line name is incorrect" +disable_chip chip remove_chip chip echo "2.6. Line config can remain unused if offset is greater than number of lines" @@ -248,6 +256,7 @@ set_line_name chip bank 5 foobar enable_chip chip test "`get_line_name chip bank 0`" = "" || fail "line name is incorrect" test "`get_line_name chip bank 1`" = "" || fail "line name is incorrect" +disable_chip chip remove_chip chip echo "2.7. Line configfs directory names are sanitized" @@ -267,6 +276,7 @@ for CHIP in $CHIPS; do enable_chip $CHIP done for CHIP in $CHIPS; do + disable_chip $CHIP remove_chip $CHIP done @@ -278,6 +288,7 @@ echo foobar > $CONFIGFS_DIR/chip/bank/label 2> /dev/null && \ fail "Setting label of a live chip should fail" echo 8 > $CONFIGFS_DIR/chip/bank/num_lines 2> /dev/null && \ fail "Setting number of lines of a live chip should fail" +disable_chip chip remove_chip chip echo "2.10. Can't create line items when chip is live" @@ -285,6 +296,7 @@ create_chip chip create_bank chip bank enable_chip chip mkdir $CONFIGFS_DIR/chip/bank/line0 2> /dev/null && fail "Creating line item should fail" +disable_chip chip remove_chip chip echo "2.11. Probe errors are propagated to user-space" @@ -316,6 +328,7 @@ mkdir -p $CONFIGFS_DIR/chip/bank/line4/hog enable_chip chip $BASE_DIR/gpio-mockup-cdev -s 1 /dev/`configfs_chip_name chip bank` 4 2> /dev/null && \ fail "Setting the value of a hogged line shouldn't succeed" +disable_chip chip remove_chip chip echo "3. Controlling simulated chips" @@ -331,6 +344,7 @@ test "$?" = "1" || fail "pull set incorrectly" sysfs_set_pull chip bank 0 pull-down $BASE_DIR/gpio-mockup-cdev /dev/`configfs_chip_name chip bank` 1 test "$?" = "0" || fail "pull set incorrectly" +disable_chip chip remove_chip chip echo "3.2. Pull can be read from sysfs" @@ -344,6 +358,7 @@ SYSFS_PATH=/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio0/pull test `cat $SYSFS_PATH` = "pull-down" || fail "reading the pull failed" sysfs_set_pull chip bank 0 pull-up test `cat $SYSFS_PATH` = "pull-up" || fail "reading the pull failed" +disable_chip chip remove_chip chip echo "3.3. Incorrect input in sysfs is rejected" @@ -355,6 +370,7 @@ DEVNAME=`configfs_dev_name chip` CHIPNAME=`configfs_chip_name chip bank` SYSFS_PATH="/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio0/pull" echo foobar > $SYSFS_PATH 2> /dev/null && fail "invalid input not detected" +disable_chip chip remove_chip chip echo "3.4. Can't write to value" @@ -365,6 +381,7 @@ DEVNAME=`configfs_dev_name chip` CHIPNAME=`configfs_chip_name chip bank` SYSFS_PATH="/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio0/value" echo 1 > $SYSFS_PATH 2> /dev/null && fail "writing to 'value' succeeded unexpectedly" +disable_chip chip remove_chip chip echo "4. Simulated GPIO chips are functional" @@ -382,6 +399,7 @@ $BASE_DIR/gpio-mockup-cdev -s 1 /dev/`configfs_chip_name chip bank` 0 & sleep 0.1 # FIXME Any better way? test `cat $SYSFS_PATH` = "1" || fail "incorrect value read from sysfs" kill $! +disable_chip chip remove_chip chip echo "4.2. 
Bias settings work correctly" @@ -394,6 +412,7 @@ CHIPNAME=`configfs_chip_name chip bank` SYSFS_PATH="/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio0/value" $BASE_DIR/gpio-mockup-cdev -b pull-up /dev/`configfs_chip_name chip bank` 0 test `cat $SYSFS_PATH` = "1" || fail "bias setting does not work" +disable_chip chip remove_chip chip echo "GPIO $MODULE test PASS" diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py index d10f420e4ef6..fd0d959914e4 100755 --- a/tools/testing/selftests/net/bpf_offload.py +++ b/tools/testing/selftests/net/bpf_offload.py @@ -215,12 +215,14 @@ def bpftool_map_list_wait(expected=0, n_retry=20, ns=""): raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None, - fail=True, include_stderr=False): + fail=True, include_stderr=False, dev_bind=None): args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name) if prog_type is not None: args += " type " + prog_type if dev is not None: args += " dev " + dev + elif dev_bind is not None: + args += " xdpmeta_dev " + dev_bind if len(maps): args += " map " + " map ".join(maps) @@ -980,6 +982,16 @@ try: rm("/sys/fs/bpf/offload") sim.wait_for_flush() + bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/devbound", + dev_bind=sim['ifname']) + devbound = bpf_pinned("/sys/fs/bpf/devbound") + start_test("Test dev-bound program in generic mode...") + ret, _, err = sim.set_xdp(devbound, "generic", fail=False, include_stderr=True) + fail(ret == 0, "devbound program in generic mode allowed") + check_extack(err, "Can't attach device-bound programs in generic mode.", args) + rm("/sys/fs/bpf/devbound") + sim.wait_for_flush() + start_test("Test XDP load failure...") sim.dfs["dev/bpf_bind_verifier_accept"] = 0 ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile index 18b9443454a9..bc6b6762baf3 100644 --- a/tools/testing/selftests/net/lib/Makefile +++ b/tools/testing/selftests/net/lib/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS += -Wall -Wl,--no-as-needed -O2 -g CFLAGS += -I../../../../../usr/include/ $(KHDR_INCLUDES) # Additional include paths needed by kselftest.h CFLAGS += -I../../ diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 8e3fc05a5397..c76525fe2b84 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -2,7 +2,7 @@ top_srcdir = ../../../../.. -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) +CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ simult_flows.sh mptcp_sockopt.sh userspace_pm.sh diff --git a/tools/testing/selftests/net/openvswitch/Makefile b/tools/testing/selftests/net/openvswitch/Makefile index 2f1508abc826..3fd1da2ec07d 100644 --- a/tools/testing/selftests/net/openvswitch/Makefile +++ b/tools/testing/selftests/net/openvswitch/Makefile @@ -2,7 +2,7 @@ top_srcdir = ../../../../.. 
-CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) +CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) TEST_PROGS := openvswitch.sh diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh index e15c43b7359b..ef8b25a606d8 100755 --- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -39,11 +39,13 @@ if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then # xfail tests that are known flaky with dbg config, not fixable. # still run them for coverage (and expect 100% pass without dbg). declare -ar xfail_list=( + "tcp_eor_no-coalesce-retrans.pkt" "tcp_fast_recovery_prr-ss.*.pkt" + "tcp_slow_start_slow-start-after-win-update.pkt" "tcp_timestamping.*.pkt" "tcp_user_timeout_user-timeout-probe.pkt" "tcp_zerocopy_epoll_.*.pkt" - "tcp_tcp_info_tcp-info-*-limited.pkt" + "tcp_tcp_info_tcp-info-.*-limited.pkt" ) readonly xfail_regex="^($(printf '%s|' "${xfail_list[@]}"))$" [[ "$script" =~ ${xfail_regex} ]] && failfunc=ktap_test_xfail diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 7058dc614c25..de25892f865f 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -96,41 +96,57 @@ void vsock_wait_remote_close(int fd) close(epollfd); } -/* Bind to <bind_port>, connect to <cid, port> and return the file descriptor. */ -int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type) +/* Create socket <type>, bind to <cid, port> and return the file descriptor. */ +int vsock_bind(unsigned int cid, unsigned int port, int type) { - struct sockaddr_vm sa_client = { - .svm_family = AF_VSOCK, - .svm_cid = VMADDR_CID_ANY, - .svm_port = bind_port, - }; - struct sockaddr_vm sa_server = { + struct sockaddr_vm sa = { .svm_family = AF_VSOCK, .svm_cid = cid, .svm_port = port, }; + int fd; - int client_fd, ret; - - client_fd = socket(AF_VSOCK, type, 0); - if (client_fd < 0) { + fd = socket(AF_VSOCK, type, 0); + if (fd < 0) { perror("socket"); exit(EXIT_FAILURE); } - if (bind(client_fd, (struct sockaddr *)&sa_client, sizeof(sa_client))) { + if (bind(fd, (struct sockaddr *)&sa, sizeof(sa))) { perror("bind"); exit(EXIT_FAILURE); } + return fd; +} + +int vsock_connect_fd(int fd, unsigned int cid, unsigned int port) +{ + struct sockaddr_vm sa = { + .svm_family = AF_VSOCK, + .svm_cid = cid, + .svm_port = port, + }; + int ret; + timeout_begin(TIMEOUT); do { - ret = connect(client_fd, (struct sockaddr *)&sa_server, sizeof(sa_server)); + ret = connect(fd, (struct sockaddr *)&sa, sizeof(sa)); timeout_check("connect"); } while (ret < 0 && errno == EINTR); timeout_end(); - if (ret < 0) { + return ret; +} + +/* Bind to <bind_port>, connect to <cid, port> and return the file descriptor. */ +int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type) +{ + int client_fd; + + client_fd = vsock_bind(VMADDR_CID_ANY, bind_port, type); + + if (vsock_connect_fd(client_fd, cid, port)) { perror("connect"); exit(EXIT_FAILURE); } @@ -141,17 +157,6 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po /* Connect to <cid, port> and return the file descriptor. 
 int vsock_connect(unsigned int cid, unsigned int port, int type)
 {
-	union {
-		struct sockaddr sa;
-		struct sockaddr_vm svm;
-	} addr = {
-		.svm = {
-			.svm_family = AF_VSOCK,
-			.svm_port = port,
-			.svm_cid = cid,
-		},
-	};
-	int ret;
 	int fd;
 
 	control_expectln("LISTENING");
@@ -162,20 +167,14 @@ int vsock_connect(unsigned int cid, unsigned int port, int type)
 		exit(EXIT_FAILURE);
 	}
 
-	timeout_begin(TIMEOUT);
-	do {
-		ret = connect(fd, &addr.sa, sizeof(addr.svm));
-		timeout_check("connect");
-	} while (ret < 0 && errno == EINTR);
-	timeout_end();
-
-	if (ret < 0) {
+	if (vsock_connect_fd(fd, cid, port)) {
 		int old_errno = errno;
 
 		close(fd);
 		fd = -1;
 		errno = old_errno;
 	}
+
 	return fd;
 }
 
@@ -192,28 +191,9 @@ int vsock_seqpacket_connect(unsigned int cid, unsigned int port)
 /* Listen on <cid, port> and return the file descriptor. */
 static int vsock_listen(unsigned int cid, unsigned int port, int type)
 {
-	union {
-		struct sockaddr sa;
-		struct sockaddr_vm svm;
-	} addr = {
-		.svm = {
-			.svm_family = AF_VSOCK,
-			.svm_port = port,
-			.svm_cid = cid,
-		},
-	};
 	int fd;
 
-	fd = socket(AF_VSOCK, type, 0);
-	if (fd < 0) {
-		perror("socket");
-		exit(EXIT_FAILURE);
-	}
-
-	if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) {
-		perror("bind");
-		exit(EXIT_FAILURE);
-	}
+	fd = vsock_bind(cid, port, type);
 
 	if (listen(fd, 1) < 0) {
 		perror("listen");
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index e62f46b2b92a..d1f765ce3eee 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -39,10 +39,12 @@ struct test_case {
 void init_signals(void);
 unsigned int parse_cid(const char *str);
 unsigned int parse_port(const char *str);
+int vsock_connect_fd(int fd, unsigned int cid, unsigned int port);
 int vsock_connect(unsigned int cid, unsigned int port, int type);
 int vsock_accept(unsigned int cid, unsigned int port,
 		 struct sockaddr_vm *clientaddrp, int type);
 int vsock_stream_connect(unsigned int cid, unsigned int port);
+int vsock_bind(unsigned int cid, unsigned int port, int type);
 int vsock_bind_connect(unsigned int cid, unsigned int port,
 		       unsigned int bind_port, int type);
 int vsock_seqpacket_connect(unsigned int cid, unsigned int port);
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 1eebbc0d5f61..dfff8b288265 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -113,24 +113,9 @@ static void test_stream_bind_only_client(const struct test_opts *opts)
 
 static void test_stream_bind_only_server(const struct test_opts *opts)
 {
-	union {
-		struct sockaddr sa;
-		struct sockaddr_vm svm;
-	} addr = {
-		.svm = {
-			.svm_family = AF_VSOCK,
-			.svm_port = opts->peer_port,
-			.svm_cid = VMADDR_CID_ANY,
-		},
-	};
 	int fd;
 
-	fd = socket(AF_VSOCK, SOCK_STREAM, 0);
-
-	if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) {
-		perror("bind");
-		exit(EXIT_FAILURE);
-	}
+	fd = vsock_bind(VMADDR_CID_ANY, opts->peer_port, SOCK_STREAM);
 
 	/* Notify the client that the server is ready */
 	control_writeln("BIND");
@@ -1708,6 +1693,101 @@ static void test_stream_msgzcopy_leak_zcskb_server(const struct test_opts *opts)
 	close(fd);
 }
 
+#define MAX_PORT_RETRIES	24 /* net/vmw_vsock/af_vsock.c */
+
+/* Test attempts to trigger a transport release for an unbound socket. This can
+ * lead to a reference count mishandling.
+ */
+static void test_stream_transport_uaf_client(const struct test_opts *opts)
+{
+	int sockets[MAX_PORT_RETRIES];
+	struct sockaddr_vm addr;
+	int fd, i, alen;
+
+	fd = vsock_bind(VMADDR_CID_ANY, VMADDR_PORT_ANY, SOCK_STREAM);
+
+	alen = sizeof(addr);
+	if (getsockname(fd, (struct sockaddr *)&addr, &alen)) {
+		perror("getsockname");
+		exit(EXIT_FAILURE);
+	}
+
+	for (i = 0; i < MAX_PORT_RETRIES; ++i)
+		sockets[i] = vsock_bind(VMADDR_CID_ANY, ++addr.svm_port,
+					SOCK_STREAM);
+
+	close(fd);
+	fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+	if (fd < 0) {
+		perror("socket");
+		exit(EXIT_FAILURE);
+	}
+
+	if (!vsock_connect_fd(fd, addr.svm_cid, addr.svm_port)) {
+		perror("Unexpected connect() #1 success");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Vulnerable system may crash now. */
+	if (!vsock_connect_fd(fd, VMADDR_CID_HOST, VMADDR_PORT_ANY)) {
+		perror("Unexpected connect() #2 success");
+		exit(EXIT_FAILURE);
+	}
+
+	close(fd);
+	while (i--)
+		close(sockets[i]);
+
+	control_writeln("DONE");
+}
+
+static void test_stream_transport_uaf_server(const struct test_opts *opts)
+{
+	control_expectln("DONE");
+}
+
+static void test_stream_connect_retry_client(const struct test_opts *opts)
+{
+	int fd;
+
+	fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+	if (fd < 0) {
+		perror("socket");
+		exit(EXIT_FAILURE);
+	}
+
+	if (!vsock_connect_fd(fd, opts->peer_cid, opts->peer_port)) {
+		fprintf(stderr, "Unexpected connect() #1 success\n");
+		exit(EXIT_FAILURE);
+	}
+
+	control_writeln("LISTEN");
+	control_expectln("LISTENING");
+
+	if (vsock_connect_fd(fd, opts->peer_cid, opts->peer_port)) {
+		perror("connect() #2");
+		exit(EXIT_FAILURE);
+	}
+
+	close(fd);
+}
+
+static void test_stream_connect_retry_server(const struct test_opts *opts)
+{
+	int fd;
+
+	control_expectln("LISTEN");
+
+	fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
+	if (fd < 0) {
+		perror("accept");
+		exit(EXIT_FAILURE);
+	}
+
+	vsock_wait_remote_close(fd);
+	close(fd);
+}
+
 static struct test_case test_cases[] = {
 	{
 		.name = "SOCK_STREAM connection reset",
@@ -1853,6 +1933,16 @@ static struct test_case test_cases[] = {
 		.run_client = test_stream_msgzcopy_leak_zcskb_client,
 		.run_server = test_stream_msgzcopy_leak_zcskb_server,
 	},
+	{
+		.name = "SOCK_STREAM transport release use-after-free",
+		.run_client = test_stream_transport_uaf_client,
+		.run_server = test_stream_transport_uaf_server,
+	},
+	{
+		.name = "SOCK_STREAM retry failed connect()",
+		.run_client = test_stream_connect_retry_client,
+		.run_server = test_stream_connect_retry_server,
+	},
 	{},
 };